Created
July 22, 2020 19:56
-
-
Save csarron/2b37586e2d5518ab34fd2cd160eb8702 to your computer and use it in GitHub Desktop.
Get openness statistics of conferences from DBLP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Get openness statistics of top conferences, motivated by
http://s3.eurecom.fr/~balzarot/notes/inbreeding/inbreeding.html
install: pip install requests matplotlib
usage example:
python calc_openness.py MobiCom MobiSys SenSys --plot --save_dir mobile
python calc_openness.py SOSP OSDI EuroSys USENIX FAST ASPLOS NSDI --plot
python calc_openness.py ACL EMNLP NAACL --plot --save_dir nlp
python calc_openness.py CVPR ICCV ECCV --plot --save_dir cv
"""
__author__ = "Qingqing Cao, https://awk.ai/, Twitter@sysnlp" | |
__copyright__ = "Copyright 2020, MIT LICENSE" | |
import argparse | |
import json | |
import os | |
from collections import defaultdict | |
import requests | |
# Base URL of the DBLP publication search API.
_API_BASE = 'https://dblp.org/search/publ/api?'
# Query template: the first slot takes the lower-cased conference name, the
# second an optional year. Results are requested as JSON; h=1000 asks for up
# to 1000 hits per request (per the DBLP search API), so larger result sets
# are truncated.
_API_TEMPLATE = _API_BASE + 'q=/conf/{}/{}&format=json&h=1000'
# author pid -> {conference name -> set of years with a record by that
# author}; filled in as a side effect of gen_author_flags().
author_record = defaultdict(dict)


# conference names can be found at https://dblp.org/db/conf/
def gen_author_flags(conf, year_str=''):
    """Fetch DBLP records for one conference and index their authors.

    Queries the DBLP publication search API for ``/conf/<conf>/<year_str>``.
    As a side effect, every author of every returned record that has both
    an author list and a venue gets the record's year added to
    ``author_record[pid][conf]`` -- this happens even when the record's
    venue does not match ``conf``.

    Args:
        conf: Conference name. It is lower-cased for the query and matched
            case-sensitively (substring test) against each record's venue.
        year_str: Optional year (str or int) appended to the query path; the
            default empty string queries the conference without a year.

    Returns:
        A ``(years, paper_authors_map)`` tuple: the set of publication years
        seen in the response, and a dict mapping DBLP paper key to the set
        of author pids, restricted to papers whose venue contains ``conf``.
        When the request fails, a message is printed and an empty
        ``(set(), {})`` is returned so callers can still unpack the result.
    """
    years = set()
    paper_authors_map = {}
    api = _API_TEMPLATE.format(conf.lower(), year_str)
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(api, timeout=60)
        request_ok = response.ok
    except requests.RequestException:
        request_ok = False
    if not request_ok:
        print('request failed, try again or check api endpoint!')
        return years, paper_authors_map

    data = json.loads(response.content)
    # DBLP leaves out the 'hit' list when a query matches nothing, so fall
    # back to an empty list instead of raising KeyError.
    all_hits = data['result']['hits'].get('hit', [])
    for item in all_hits:
        info = item['info']
        authors = info.get('authors')
        venue = info.get('venue')
        if authors is None or venue is None:
            # skip records without an author list or without a venue
            continue
        year = int(info['year'])
        years.add(year)

        # A single-author record carries a dict instead of a list of dicts.
        all_authors = authors['author']
        if isinstance(all_authors, dict):
            record_authors = [all_authors]
        else:
            record_authors = all_authors

        paper_authors = set()
        for author in record_authors:
            author_id = author['@pid']
            paper_authors.add(author_id)
            author_record[author_id].setdefault(conf, set()).add(year)

        if isinstance(venue, list):
            venue = venue[0]
        if conf in venue:  # only consider conf venue
            paper_authors_map[info['key']] = paper_authors
    return years, paper_authors_map
def _save_fraction_plot(plt, fig_num, years, fractions, title, out_path):
    """Draw one new-author-fraction curve on figure ``fig_num`` and save it."""
    plt.figure(fig_num)
    plt.plot(years, fractions, '-o')
    plt.xticks(years, rotation=45)
    plt.yticks([i / 10 for i in range(0, 11)])
    plt.ylim(0, 1)
    plt.grid(True)
    plt.title(title)
    plt.savefig(out_path)
    # The Agg backend is non-interactive, so the figure is only written to
    # disk and then released.
    plt.close(fig_num)


def main(args):
    """Print (and optionally plot) the fraction of new-author papers.

    For every conference in ``args.conferences`` and every year that is not
    earlier than ``args.after_years``, a paper counts as an "old-author"
    paper when at least one of its authors has a record for that conference
    in an earlier year (according to ``author_record``); all other papers
    are "new-author" papers. Per-conference counts and the fraction over all
    conferences combined are printed, newest year first.

    Args:
        args: Parsed command-line namespace providing ``conferences``,
            ``after_years``, ``plot`` and ``save_dir``.

    When ``args.plot`` is set, one ``<conference>.png`` per conference and a
    ``combined.png`` are written into ``args.save_dir`` (created if needed).
    """
    conf_old_papers = defaultdict(dict)
    conf_new_papers = defaultdict(dict)
    all_old = defaultdict(dict)
    all_new = defaultdict(dict)

    for conf in args.conferences:
        years, _ = gen_author_flags(conf)
        conf_years = sorted(
            (year for year in years if year >= args.after_years),
            reverse=True)

        # Fetch every selected year before classifying any paper. Each fetch
        # extends author_record, and classifying the newest year first
        # (before the older years are loaded) would miss earlier
        # publications that the initial query did not return.
        year_papers = {}
        for conf_year in conf_years:
            _, year_papers[conf_year] = gen_author_flags(conf, conf_year)

        for conf_year in conf_years:
            paper_authors_map = year_papers[conf_year]
            if not paper_authors_map:
                continue
            # if one of the authors published in previous years,
            # it is an old-author paper
            num_old_author_paper = sum(
                1 for paper_authors in paper_authors_map.values()
                if any(min(author_record[author][conf]) < conf_year
                       for author in paper_authors))
            num_new_author_papers = (
                len(paper_authors_map) - num_old_author_paper)
            conf_old_papers[conf][conf_year] = num_old_author_paper
            conf_new_papers[conf][conf_year] = num_new_author_papers
            print(conf, conf_year, num_old_author_paper,
                  num_new_author_papers)
            all_old[conf_year][conf] = num_old_author_paper
            all_new[conf_year][conf] = num_new_author_papers

    # conf stats for all years
    print('all combined:')
    combined_years = []
    combined_fracs = []
    for year in sorted(all_old, reverse=True):
        num_old = sum(all_old[year].values())
        num_new = sum(all_new[year].values())
        # Years without papers are never stored, so the total is non-zero.
        frac = num_new / (num_old + num_new)
        combined_years.append(year)
        combined_fracs.append(frac)
        print(year, frac, num_new, num_old)

    if not args.plot:
        return

    # matplotlib is imported lazily so it is only required with --plot.
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt

    save_dir = args.save_dir
    os.makedirs(save_dir, exist_ok=True)
    for fig_num, conf in enumerate(conf_new_papers):
        old_data = conf_old_papers[conf]
        new_data = conf_new_papers[conf]
        years = sorted(old_data, reverse=True)
        fractions = [new_data[year] / (new_data[year] + old_data[year])
                     for year in years]
        _save_fraction_plot(
            plt, fig_num, years, fractions,
            "{} Fraction of Papers from New Authors".format(conf),
            os.path.join(save_dir, '{}.png'.format(conf)))

    # The combined curve uses its own series; it must not reuse the
    # per-conference lists, which hold the last conference's data.
    _save_fraction_plot(
        plt, len(conf_new_papers), combined_years, combined_fracs,
        "Fraction of Papers from New Authors",
        os.path.join(save_dir, 'combined.png'))
# Script entry point: parse the command line and hand the namespace to main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('conferences', type=str, nargs='+',
                        help='list of conferences separate by space')
    parser.add_argument("--plot", action="store_true",
                        help="if true, plot stats")
    # The two help fragments are concatenated, so the first one needs a
    # trailing space to keep the words apart.
    parser.add_argument("--after_years", type=int, default=2000,
                        help="get data after the year, set to 0 "
                             "to get all available dblp data")
    parser.add_argument("--save_dir", type=str, default='.',
                        help="dir to save plot image")
    main(parser.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment