Last active
February 7, 2018 18:08
-
-
Save JasonKessler/1e28a8f4f0208653634d9f295a5c9a58 to your computer and use it in GitHub Desktop.
Formatting scraped ICLR Reviews
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download every ICLR 2018 blind submission from OpenReview, then fetch the
# discussion forum attached to each paper.
import time

import pandas as pd
import requests

url = 'https://openreview.net/notes?invitation=ICLR.cc%2F2018%2FConference%2F-%2FBlind_Submission&offset=0&limit=1000'

# Each row in this data frame is a paper.
response = requests.get(url)
response.raise_for_status()  # fail here rather than on a confusing KeyError below
df = pd.DataFrame(response.json()['notes'])

# One request per paper. Each forum payload has a 'notes' list holding the
# reviews, comments, and acceptance decision posted about that paper.
forum_content = []
for forum_id in df['forum']:
    forum_response = requests.get(
        'https://openreview.net/notes?forum={}&trash=true'.format(forum_id))
    forum_response.raise_for_status()
    forum_content.append(forum_response.json())
    time.sleep(.3)  # presumably a courtesy pause between requests

# forum_content was built in row order, so assign it positionally instead of
# going through a Series (which would align on the index).
df['forumContent'] = forum_content
def _first_decision(forum):
    """Return the 'decision' text of the first note in `forum` that has one.

    Raises IndexError when no note in the forum carries a decision.
    """
    decisions = [note['content']['decision']
                 for note in forum['notes']
                 if 'decision' in note['content']]
    return decisions[0]


# Lift the fields of interest out of the nested JSON into flat columns.
df['title'] = df['content'].apply(lambda content: content['title'])
df['authors'] = df['content'].apply(lambda content: content['authors'])
df['decision_raw'] = df['forumContent'].apply(_first_decision)
'''
>>> df['decision_raw'].value_counts()
Reject 504
Accept (Poster) 313
Invite to Workshop Track 90
Accept (Oral) 23
Name: decision_raw, dtype: int64
'''
def _review_frame(forum):
    """Build a data frame with one row per review note found in `forum`.

    A note counts as a review when it has a 'content' dict containing a
    'review' key; all other notes are skipped.
    """
    rows = []
    for note in forum['notes']:
        if 'content' not in note or 'review' not in note['content']:
            continue
        rows.append({'review': note['content']['review'],
                     'rating': note['content']['rating'],
                     'confidence': note['content']['confidence'],
                     'forum': note['forum']})
    return pd.DataFrame(rows)


# Create a data frame of reviews from the forum content. Note that the column
# "forum" is also present in df, which is what the merge below joins on.
only_reviews_df = pd.concat([_review_frame(forum) for forum in df['forumContent']])

# Attach the paper and decision information to every review.
paper_info = df[['title', 'authors', 'decision_raw', 'forum']]
reviews_df = paper_info.merge(only_reviews_df, on='forum')
'''
title Improving Discriminator-Generator Balance in G...
authors [Simen Selseng and Björn Gambäck]
decision_raw Reject
forum SyBPtQfAZ
confidence 4: The reviewer is confident but not absolutel...
rating 3: Clear rejection
review The paper proposes a variety of modifications ...
Name: 0, dtype: object
Note: there are 2806 reviews
'''
# Bin each review's rating as Positive (> 6), Neutral (5 or 6), or
# Negative (< 5), and bin each acceptance decision as Accept (oral or poster),
# Workshop, or Reject. Concatenate the two into a single category such as
# "Accept, Neutral".


def _simplify_decision(decision_raw):
    """Collapse a raw decision string into 'Reject', 'Accept', or 'Workshop'."""
    if decision_raw == 'Reject':
        return 'Reject'
    if decision_raw.startswith('Accept'):
        return 'Accept'
    # Anything that is neither a rejection nor an acceptance is treated as a
    # workshop invitation.
    return 'Workshop'


def _bin_rating(rating):
    """Map a rating string such as '3: Clear rejection' to a sentiment bin.

    The integer before the first ':' is the numeric score.
    """
    score = int(rating.split(':')[0].strip())
    if score < 5:
        return 'Negative'
    if score > 6:
        return 'Positive'
    return 'Neutral'


reviews_df['decision'] = reviews_df['decision_raw'].apply(_simplify_decision)
reviews_df['rating_bin'] = reviews_df['rating'].apply(_bin_rating)
reviews_df['category'] = reviews_df['decision'] + ', ' + reviews_df['rating_bin']
reviews_df.to_csv('iclr2018_reviews.csv.bz2', index=False, compression='bz2')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment