Skip to content

Instantly share code, notes, and snippets.

@weiglemc
Created July 4, 2023 14:03
Show Gist options
  • Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Python script to grab data from the Internet Archive via the CDX API server. It uses a function from Sawood Alam's CDXSummary tool.
# grab-cdx.py
from requests import Session
from rich.console import Console
from urllib.parse import urlencode
# Query settings: the URL to look up and the capture-date window.
URIR = "https://www.cnn.com/"
FROM = "20150424"
TO = "20220923"
# Extra CDX query parameters: collapse to one entry per day and keep only 200 OK captures.
OTHER_PARAMS = f"&from={FROM}&to={TO}&collapse=timestamp:8&filter=statuscode:200"
# One HTTP session shared by every request the script makes.
REQSESSION = Session()
# Progress/diagnostic printer: writes to stderr in red with highlighting disabled.
errprint = Console(
    stderr=True,
    style="red",
    highlight=False,
).print
# HELPFUL FUNCTION FROM CDXSUMMARY
# https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/__main__.py
def get_stream_from_api(url):
    """Yield the raw lines of every result page of a CDX API query.

    The number of pages is requested first by appending ``showNumPages=true``;
    each page is then fetched in order by appending ``page=<n>`` and streamed
    line by line.

    Args:
        url: Query URL that already contains a query string; extra parameters
            are appended to it with ``&``.

    Yields:
        bytes: One line at a time from each page that was fetched successfully.

    Raises:
        ValueError: If the body of the page-count response is not an integer.
    """
    pages = int(REQSESSION.get(f"{url}&showNumPages=true").text)
    for page in range(pages):
        pageurl = f"{url}&page={page}"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        # Use the response as a context manager so the streamed connection is
        # released even when the body is never read (failed page) or the
        # consumer stops iterating early.
        with REQSESSION.get(pageurl, stream=True) as r:
            if not r.ok:
                # Still best-effort (keep going), but say so: a silently
                # dropped page would leave the output incomplete unnoticed.
                errprint(f"Skipping page {page + 1}/{pages}: HTTP status {r.status_code}")
                continue
            # Let urllib3 undo any Content-Encoding (gzip/deflate) while streaming.
            r.raw.decode_content = True
            yield from r.raw
def write_cdx(urir, cdxapi, params, outfile):
    """Download the CDX records for *urir* and write them to *outfile*.

    Args:
        urir: URL to look up; it is URL-encoded and sent as the ``url``
            query parameter.
        cdxapi: Base URL of the CDX API endpoint (without a query string).
        params: Pre-built query string placed directly after ``?``.
        outfile: Path of the file to create or overwrite with the results.
    """
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    input_stream = get_stream_from_api(url)
    try:
        # Lines are decoded as UTF-8 (bytes.decode default), so write them
        # back as UTF-8 too instead of relying on the locale's encoding.
        with open(outfile, "w", encoding="utf-8") as f:
            f.writelines(line.decode() for line in input_stream)
    finally:
        # Closing the generator also when a download or write fails lets it
        # release whatever response it is still holding.
        input_stream.close()
# MAIN
# Wayback Machine CDX search endpoint.
cdxapi = "https://web.archive.org/cdx/search"
# Exact-URL match plus the date-range / collapse / status filters defined above.
params = f"matchType=exact{OTHER_PARAMS}"
# Output file name records the date range that was queried.
outfile = f"cnn-{FROM}-{TO}-day.cdx"
write_cdx(urir=URIR, cdxapi=cdxapi, params=params, outfile=outfile)
@lesleyodu
Copy link

lesleyodu commented Mar 17, 2025

Politeness to avoid being blocked:

from random import randint
from time import sleep
last line of get_stream_from_api (inside for loop): sleep(randint(8,11))

Can also get more pages at once: (suggestion from Sawood)

for page in range(startPage, pages, 5):
pageurl = f"{url}&page={page}&pageSize=5"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment