JasonKessler · February 7, 2018 18:08
diff --git a/format_scraped_iclr_reviews.py b/format_scraped_iclr_reviews.py
 # Download the ICLR 2018 blind submissions.
 url = 'https://openreview.net/notes?invitation=ICLR.cc%2F2018%2FConference%2F-%2FBlind_Submission&offset=0&limit=1000'
 submissions_response = requests.get(url)
 submissions_response.raise_for_status()  # stop on an HTTP error instead of parsing an error payload
 df = pd.DataFrame(submissions_response.json()['notes']) # Each row in this data frame is a paper.

 # Download every paper's forum.  The notes of a forum are read below to find
 # the paper's reviews and its acceptance decision.
 forum_content = []
 for forum_id in df.forum:
     forum_response = requests.get('https://openreview.net/notes?forum={}&trash=true'.format(forum_id))
     forum_response.raise_for_status()
     forum_content.append(forum_response.json())
     time.sleep(.3)  # pause 0.3 s between consecutive requests
 df['forumContent'] = pd.Series(forum_content, index=df.index)

 df['title'] = df.content.apply(lambda x: x['title'])
 df['authors'] = df.content.apply(lambda x: x['authors'])
 # Use the first note that carries a decision.  A forum without such a note
 # yields None instead of raising IndexError and aborting the whole run.
 df['decision_raw'] = df.forumContent.apply(
     lambda x: next((n['content']['decision'] for n in x['notes']
                     if 'decision' in n.get('content', {})), None))
 '''
 >>> df['decision_raw'].value_counts()
 Reject                      504
 Accept (Poster)             313
 Invite to Workshop Track     90
 Accept (Oral)                23
 Name: decision_raw, dtype: int64
 '''

 # Create a data frame of reviews from forum content.  Note that the column "forum" is also present in df
 only_reviews_df = pd.concat(df.forumContent.apply(lambda c: pd.DataFrame([
     {'review': n['content']['review'],
      'rating': n['content']['rating'],
      'confidence': n['content']['confidence'],
      'forum': n['forum']}
     for n in c['notes']
     if 'content' in n and 'review' in n['content']
 ])).tolist())

 # Merge it with the paper and decision information.  Papers without a decision
 # are left out, because their reviews cannot be given a decision category below.
 decided_df = df.dropna(subset=['decision_raw'])
 reviews_df = pd.merge(decided_df[['title', 'authors', 'decision_raw', 'forum']], only_reviews_df, on='forum')
 '''
 title           Improving Discriminator-Generator Balance in G...
 authors                         [Simen Selseng and Björn Gambäck]
 decision_raw                                               Reject
 forum                                                   SyBPtQfAZ
 confidence      4: The reviewer is confident but not absolutel...
 rating                                         3: Clear rejection
 review          The paper proposes a variety of modifications ...
 Name: 0, dtype: object
 Note: there are 2806 reviews
 '''


 def _decision_group(decision_raw):
     """Collapse a raw decision into 'Reject', 'Accept' or 'Workshop'."""
     if decision_raw == 'Reject':
         return 'Reject'
     return 'Accept' if decision_raw.startswith('Accept') else 'Workshop'


 def _rating_bin(rating):
     """Map a rating such as '3: Clear rejection' to 'Negative', 'Neutral' or 'Positive'."""
     # The numeric score is the text in front of the first colon.
     score = int(rating.split(':')[0].strip())
     if score < 5:
         return 'Negative'
     return 'Positive' if score > 6 else 'Neutral'


 # Bin each review's rating as Positive (> 6), Neutral (5 or 6), or Negative (< 5),
 # and each acceptance decision as Accept (oral or poster), Workshop, or Reject.
 # Concatenate them together into a single category.
 reviews_df['decision'] = reviews_df['decision_raw'].apply(_decision_group)
 reviews_df['rating_bin'] = reviews_df['rating'].apply(_rating_bin)
 reviews_df['category'] = reviews_df['decision'] + ', ' + reviews_df['rating_bin']
 reviews_df.to_csv('iclr2018_reviews.csv.bz2', index=False, compression='bz2')
	# Download the ICLR 2018 blind submissions.
	url = 'https://openreview.net/notes?invitation=ICLR.cc%2F2018%2FConference%2F-%2FBlind_Submission&offset=0&limit=1000'
	submissions_response = requests.get(url)
	submissions_response.raise_for_status()  # stop on an HTTP error instead of parsing an error payload
	df = pd.DataFrame(submissions_response.json()['notes']) # Each row in this data frame is a paper.

	# Download every paper's forum. The notes of a forum are read below to find
	# the paper's reviews and its acceptance decision.
	forum_content = []
	for forum_id in df.forum:
		forum_response = requests.get('https://openreview.net/notes?forum={}&trash=true'.format(forum_id))
		forum_response.raise_for_status()
		forum_content.append(forum_response.json())
		time.sleep(.3)  # pause 0.3 s between consecutive requests
	df['forumContent'] = pd.Series(forum_content, index=df.index)

	df['title'] = df.content.apply(lambda x: x['title'])
	df['authors'] = df.content.apply(lambda x: x['authors'])
	# Use the first note that carries a decision. A forum without such a note
	# yields None instead of raising IndexError and aborting the whole run.
	df['decision_raw'] = df.forumContent.apply(
		lambda x: next((n['content']['decision'] for n in x['notes']
			if 'decision' in n.get('content', {})), None))
	'''
	>>> df['decision_raw'].value_counts()
	Reject 504
	Accept (Poster) 313
	Invite to Workshop Track 90
	Accept (Oral) 23
	Name: decision_raw, dtype: int64
	'''

	# Create a data frame of reviews from forum content. Note that the column "forum" is also present in df
	only_reviews_df = pd.concat(df.forumContent.apply(lambda c: pd.DataFrame([
		{'review': n['content']['review'],
		'rating': n['content']['rating'],
		'confidence': n['content']['confidence'],
		'forum': n['forum']}
		for n in c['notes']
		if 'content' in n and 'review' in n['content']
	])).tolist())

	# Merge it with the paper and decision information. Papers without a decision
	# are left out, because their reviews cannot be given a decision category below.
	decided_df = df.dropna(subset=['decision_raw'])
	reviews_df = pd.merge(decided_df[['title', 'authors', 'decision_raw', 'forum']], only_reviews_df, on='forum')
	'''
	title Improving Discriminator-Generator Balance in G...
	authors [Simen Selseng and Björn Gambäck]
	decision_raw Reject
	forum SyBPtQfAZ
	confidence 4: The reviewer is confident but not absolutel...
	rating 3: Clear rejection
	review The paper proposes a variety of modifications ...
	Name: 0, dtype: object
	Note: there are 2806 reviews
	'''


	def _decision_group(decision_raw):
		"""Collapse a raw decision into 'Reject', 'Accept' or 'Workshop'."""
		if decision_raw == 'Reject':
			return 'Reject'
		return 'Accept' if decision_raw.startswith('Accept') else 'Workshop'


	def _rating_bin(rating):
		"""Map a rating such as '3: Clear rejection' to 'Negative', 'Neutral' or 'Positive'."""
		# The numeric score is the text in front of the first colon.
		score = int(rating.split(':')[0].strip())
		if score < 5:
			return 'Negative'
		return 'Positive' if score > 6 else 'Neutral'


	# Bin each review's rating as Positive (> 6), Neutral (5 or 6), or Negative (< 5),
	# and each acceptance decision as Accept (oral or poster), Workshop, or Reject.
	# Concatenate them together into a single category.
	reviews_df['decision'] = reviews_df['decision_raw'].apply(_decision_group)
	reviews_df['rating_bin'] = reviews_df['rating'].apply(_rating_bin)
	reviews_df['category'] = reviews_df['decision'] + ', ' + reviews_df['rating_bin']
	reviews_df.to_csv('iclr2018_reviews.csv.bz2', index=False, compression='bz2')