Skip to content

Instantly share code, notes, and snippets.

@JasonKessler
Last active February 7, 2018 18:08
Show Gist options
  • Save JasonKessler/1e28a8f4f0208653634d9f295a5c9a58 to your computer and use it in GitHub Desktop.
Save JasonKessler/1e28a8f4f0208653634d9f295a5c9a58 to your computer and use it in GitHub Desktop.
Formatting scraped ICLR Reviews
import time

import pandas as pd
import requests

# NOTE: the original snippet used `time`, `pd` and `requests` without importing
# them; the imports above make this section runnable on its own.

# Fail instead of hanging forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30


def _get_json(address):
    """Fetch *address* and return its decoded JSON body.

    Raises requests.HTTPError on a non-2xx response, so a failed request is
    reported where it happens rather than as a KeyError further down.
    """
    response = requests.get(address, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()


def _decision_of(forum):
    """Return the first 'decision' value found among the notes of *forum*.

    Notes without a 'content' entry are skipped. Raises IndexError when no
    note in the forum carries a decision.
    """
    return [note['content']['decision']
            for note in forum['notes']
            if 'decision' in note.get('content', {})][0]


url = 'https://openreview.net/notes?invitation=ICLR.cc%2F2018%2FConference%2F-%2FBlind_Submission&offset=0&limit=1000'
df = pd.DataFrame(_get_json(url)['notes'])  # Each row in this data frame is a paper.

# Each forum_id identifies one paper's thread; the thread's notes hold the
# reviews, comments, and acceptance decision about that paper.
forum_content = []
for forum_id in df.forum:
    forum_content.append(
        _get_json('https://openreview.net/notes?forum={}&trash=true'.format(forum_id)))
    time.sleep(.3)  # pause between requests (presumably to go easy on the server)

# Align explicitly on df's own index so each thread lands on its paper's row.
df['forumContent'] = pd.Series(forum_content, index=df.index)
df['title'] = df.content.apply(lambda x: x['title'])
df['authors'] = df.content.apply(lambda x: x['authors'])
df['decision_raw'] = df.forumContent.apply(_decision_of)
'''
>>> df['decision_raw'].value_counts()
Reject 504
Accept (Poster) 313
Invite to Workshop Track 90
Accept (Oral) 23
Name: decision_raw, dtype: int64
'''
# Create a data frame of reviews from forum content. Note that the column "forum" is also present in df


def _review_records(forum_contents):
    """Yield one flat dict per review note found in *forum_contents*.

    A note counts as a review when it has a 'content' entry containing a
    'review' key; every other note (comments, decisions, ...) is skipped.
    """
    for forum in forum_contents:
        for note in forum['notes']:
            if 'content' in note and 'review' in note['content']:
                yield {'review': note['content']['review'],
                       'rating': note['content']['rating'],
                       'confidence': note['content']['confidence'],
                       'forum': note['forum']}


# Build a single frame from the flat records instead of concatenating one small
# frame per paper: the index is a unique 0..N-1 range, and naming the columns
# keeps the 'forum' merge key present even when no reviews were found.
only_reviews_df = pd.DataFrame(list(_review_records(df.forumContent)),
                               columns=['review', 'rating', 'confidence', 'forum'])
# merge it with the paper and decision information
reviews_df = pd.merge(df[['title', 'authors', 'decision_raw', 'forum']], only_reviews_df, on='forum')
'''
title Improving Discriminator-Generator Balance in G...
authors [Simen Selseng and Björn Gambäck]
decision_raw Reject
forum SyBPtQfAZ
confidence 4: The reviewer is confident but not absolutel...
rating 3: Clear rejection
review The paper proposes a variety of modifications ...
Name: 0, dtype: object
Note: there are 2806 reviews
'''
# Bin each review's rating as Positive (>6), Neutral (5 or 6), or Negative (<5),
# and collapse the acceptance decisions into Accept (oral or poster), Workshop, or Reject.
# Concatenate the two into a single category, e.g. "Accept, Positive".
# NOTE(review): the previous comment described the bins as Accept (>6),
# Neutral (4, 5, and 6) and Reject (<4); the code has always used the
# thresholds stated above -- confirm which was intended.


def _simplify_decision(decision_raw):
    """Map a raw decision string to 'Reject', 'Accept', or 'Workshop'.

    Exactly 'Reject' stays 'Reject'; anything starting with 'Accept' becomes
    'Accept'; every other value is treated as 'Workshop'.
    """
    if decision_raw == 'Reject':
        return 'Reject'
    if decision_raw.startswith('Accept'):
        return 'Accept'
    return 'Workshop'


def _bin_rating(rating):
    """Bin a rating string such as '3: Clear rejection' by its leading integer.

    Raises ValueError when the text before the first ':' is not an integer.
    """
    score = int(rating.split(':')[0].strip())
    if score < 5:
        return 'Negative'
    if score > 6:
        return 'Positive'
    return 'Neutral'


reviews_df['decision'] = reviews_df['decision_raw'].apply(_simplify_decision)
reviews_df['rating_bin'] = reviews_df['rating'].apply(_bin_rating)
reviews_df['category'] = reviews_df['decision'] + ', ' + reviews_df['rating_bin']
reviews_df.to_csv('iclr2018_reviews.csv.bz2', index=False, compression='bz2')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment