Python script to grab data from the Internet Archive via the CDX API server; uses a function from Sawood Alam's CDXSummary tool.
# grab-cdx.py
from requests import Session
from rich.console import Console
from urllib.parse import urlencode

URIR = "https://www.cnn.com/"
FROM = "20150424"
TO = "20220923"
# collapse on the 8-digit timestamp prefix (YYYYMMDD) for only one entry per day; keep only 200 OK responses
OTHER_PARAMS = "&from=" + FROM + "&to=" + TO + "&collapse=timestamp:8&filter=statuscode:200"

REQSESSION = Session()
errprint = Console(stderr=True, style="red", highlight=False).print

# HELPFUL FUNCTION FROM CDXSUMMARY
# https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/__main__.py
def get_stream_from_api(url):
    # ask the CDX server how many pages of results there are, then stream each page
    pages = int(REQSESSION.get(f"{url}&showNumPages=true").text)
    for page in range(pages):
        pageurl = f"{url}&page={page}"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        r = REQSESSION.get(pageurl, stream=True)
        if r.ok:
            r.raw.decode_content = True
            for line in r.raw:
                yield line

def write_cdx(urir, cdxapi, params, outfile):
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    input_stream = get_stream_from_api(url)
    with open(outfile, "w") as f:
        for line in input_stream:
            f.write(line.decode())
    # close the generator in case it is still holding an open response
    try:
        input_stream.close()
    except Exception:
        pass

# MAIN
cdxapi = "https://web.archive.org/cdx/search"
params = "matchType=exact" + OTHER_PARAMS
outfile = "cnn-" + FROM + "-" + TO + "-day.cdx"
write_cdx(URIR, cdxapi, params, outfile)
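
Running the script writes one CDX record per line to cnn-20150424-20220923-day.cdx. As a minimal sketch of reading that file back, assuming the CDX server's default seven-field output (urlkey, timestamp, original, mimetype, statuscode, digest, length); the parsing code below is illustrative and not part of the original script:

# parse-cdx.py (sketch): read the file written by grab-cdx.py
# assumes the default 7-field CDX output; adjust if output fields are changed
with open("cnn-20150424-20220923-day.cdx") as f:
    for line in f:
        urlkey, timestamp, original, mimetype, statuscode, digest, length = line.split()
        print(timestamp, original)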
Politeness to avoid being blocked:

from random import randint
from time import sleep

Then add sleep(randint(8, 11)) as the last line of the for loop in get_stream_from_api.

Can also get more pages at once (suggestion from Sawood):

for page in range(startPage, pages, 5):
    pageurl = f"{url}&page={page}&pageSize=5"