Scrape papers (page html plus figures) from the Semantic Scholar reader; the input file the script expects is sketched just below.
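The script reads a JSON file (--file_name, default viewer-ids.json) listing papers by arXiv id and looks up the key 'arxiv_id' on each record. A minimal sketch of producing such a file follows; the ids and the script filename in the comment are placeholders, not real entries.

import json

# Assumed input format, inferred from the main loop: a list of records,
# each carrying the paper's id under the key 'arxiv_id'.
# The ids below are placeholders, not known reader entries.
records = [
    {"arxiv_id": "1234.56789"},
    {"arxiv_id": "2345.67890"},
]

with open("viewer-ids.json", "w") as f:
    json.dump(records, f, indent=2)

# Then run the scraper (use whatever name you saved this gist's script under):
#   python <script>.py --file_name viewer-ids.json --save_path /tmp/semanticscholar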
import os
import json
import requests
import argparse
from bs4 import BeautifulSoup as bs

def fetch(url):
    """Download a page and return it parsed with BeautifulSoup."""
    page = requests.get(url)
    return bs(page.content, "html.parser")


def save(page, path, name):
    """Write the (possibly edited) html to <path>/<name>/index.html."""
    base_dir = os.path.join(path, name)
    os.makedirs(base_dir, exist_ok=True)
    save_name = os.path.join(base_dir, 'index.html')
    with open(save_name, 'wt') as f:
        f.write(str(page))
    return page

def save_assets(page, path, name, source_path):
    """Download the page's .png figures into <path>/<name>/figures/ and
    rewrite the img src attributes to point at that local directory."""
    asset_dir = os.path.join(path, name, 'figures')
    os.makedirs(asset_dir, exist_ok=True)
    for im in page.find_all('img'):
        if im['src'].endswith('.png'):
            # note: the image URL is built with os.path.join, which assumes a
            # POSIX-style '/' separator
            img_url = os.path.join(source_path, name, im['src'])
            # slightly edit the html to standardise paths to the images
            if 'figures' not in im['src']:
                im['src'] = os.path.join('figures', im['src'])
            save_name = os.path.join(asset_dir, im['src'].split('/')[-1])
            response = requests.get(img_url).content
            with open(save_name, 'wb') as f:
                f.write(response)
    return page

parser = argparse.ArgumentParser()
parser.add_argument("--file_name", type=str, default='viewer-ids.json')
parser.add_argument("--source_url", type=str, default='https://reader.semanticscholar.org/')
parser.add_argument("--save_path", type=str, default='/tmp/semanticscholar')
parser.add_argument("--start", type=int, default=0)

if __name__ == '__main__':
    args = parser.parse_args()
    with open(args.file_name) as f:
        data = json.load(f)
    for i, d in enumerate(data[args.start:]):
        # i counts from the slice start, so offset it for the progress counter
        print('\rNumber: {} out of {}'.format(i + args.start, len(data)), end='', flush=True)
        name = d['arxiv_id']
        url = os.path.join(args.source_url, name, 'index.html')
        # fetch the reader page, pull down its figures, then save the html
        p = save(save_assets(fetch(url),
                             args.save_path,
                             name,
                             args.source_url),
                 args.save_path,
                 name)
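
For a quick check on a single paper, the helper functions can be called directly instead of going through viewer-ids.json. A minimal sketch, assuming the functions above are in scope and using a placeholder arXiv id:

# Mirror one (placeholder) paper: fetch the reader page, pull its figures,
# then write the html next to them.
source_url = 'https://reader.semanticscholar.org/'
save_path = '/tmp/semanticscholar'
arxiv_id = '1234.56789'  # placeholder id, not a known reader entry

page = fetch(os.path.join(source_url, arxiv_id, 'index.html'))
page = save_assets(page, save_path, arxiv_id, source_url)
save(page, save_path, arxiv_id)
# result: /tmp/semanticscholar/1234.56789/index.html plus a figures/ directory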