Last active
February 2, 2025 23:51
-
-
Save harryposner/41313082be84213b33ea6947f5475e4e to your computer and use it in GitHub Desktop.
Scrape NYT crossword puzzles in AcrossLite format from xwordinfo.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Scrape NYT crossword puzzles in AcrossLite format from xwordinfo.com

Usage: ./get_nyt_xword.py [puzzle-iso-date] filename
"""
import datetime
import re
import sys

import puz
import requests
from bs4 import BeautifulSoup
# Address of the XWord Info page that displays a single puzzle.
URL = "https://www.xwordinfo.com/Crossword"


def get_soup(puzzle_date=None, timeout=30):
    """Download a puzzle page from XWord Info and parse it.

    Args:
        puzzle_date: ``datetime.date`` of the puzzle to fetch.  When omitted,
            no date is sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    query = {}
    if puzzle_date is not None:
        query["date"] = puzzle_date.strftime("%m/%d/%Y")
    # A GET request carries its arguments in the query string (``params``);
    # ``data`` would place them in a request body instead.
    resp = requests.get(URL, params=query, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, features="html.parser")
def xword_info(soup):
    """Gather everything scraped from a puzzle page into one dict.

    The entries returned by ``read_stats`` come first, followed by the
    ``author``, ``title``, ``solution``, ``clues`` and ``extensions`` keys.
    """
    return {
        **read_stats(soup),
        "author": read_author(soup),
        "title": read_title(soup),
        "solution": read_solution(soup),
        "clues": read_clues(soup),
        "extensions": read_extensions(soup),
    }
def read_stats(soup):
    """Read the integer statistics shown in the page's stats block.

    Every ``span`` inside the ``div`` with id ``CPHContent_StatsData`` whose
    text ends in a digit is parsed as comma-separated ``name: integer``
    pairs.  The names ``Rows`` and ``Columns`` are stored as ``height`` and
    ``width``; all other names are kept as they appear on the page.

    Returns:
        A dict mapping statistic name to its integer value.

    Raises:
        ValueError: If the stats block is missing, or a pair's value is not
            an integer.
    """
    stats_div = soup.find("div", id="CPHContent_StatsData")
    if stats_div is None:
        raise ValueError("Couldn't find the puzzle statistics block")
    renames = {"Rows": "height", "Columns": "width"}
    stats = {}
    for element in stats_div.find_all("span"):
        text = element.text
        # Skip non-numeric spans; the emptiness test keeps text[-1] from
        # raising IndexError on a blank span.
        if not text or not text[-1].isdigit():
            continue
        for pair in text.split(","):
            key, __, value = pair.partition(":")
            key = key.strip()
            stats[renames.get(key, key)] = int(value.strip())
    return stats
def read_title(soup):
    """Return the text of the page's ``h1`` with id ``PuzTitle``.

    Surrounding whitespace is removed so it does not end up in the saved
    puzzle's title.

    Raises:
        ValueError: If the page has no such heading.
    """
    heading = soup.find("h1", id="PuzTitle")
    if heading is None:
        raise ValueError("Couldn't find the puzzle title")
    return heading.text.strip()
def read_author(soup):
    """Build the puzzle's credit line from the author/editor box.

    The text of the ``div`` with id ``CPHContent_AEGrid`` is scanned line by
    line; the value after the first colon on a line containing ``Author`` or
    ``Editor`` is taken as that person's name.  If a label occurs on several
    lines, the last one wins.

    Returns:
        ``"<author> / <editor>"`` when both are present, or just the one
        name that was found.

    Raises:
        ValueError: If the box is missing or names neither an author nor an
            editor.
    """
    box = soup.find("div", id="CPHContent_AEGrid")
    if box is None:
        raise ValueError("Couldn't find the author/editor block")
    # Start from None so a page lacking one of the lines cannot hit an
    # unbound local variable.
    author = editor = None
    for line in box.text.strip().splitlines():
        # Split on the first colon only, so a colon inside a name is kept.
        __, __, value = line.partition(":")
        if "Author" in line:
            author = value.strip()
        elif "Editor" in line:
            editor = value.strip()
    names = [name for name in (author, editor) if name]
    if not names:
        raise ValueError("Couldn't find an author or editor for the puzzle")
    return " / ".join(names)
def puzzle_squares(soup):
    """Yield one dict per ``td`` of the puzzle grid, in document order.

    Each dict has two keys:

    * ``"val"``: ``puz.BLACKSQUARE`` for a cell whose class list is exactly
      ``["black"]``; otherwise the text of the cell's ``div.letter``, or
      failing that of its ``div.subst2`` / ``div.subst``.
    * ``"markup"``: ``None`` for a black cell; otherwise ``True`` when the
      cell has any class or contains a ``div`` with a ``style`` attribute,
      else ``False``.

    Raises:
        ValueError: If the grid table (id ``PuzTable``) is missing, or a
            non-black cell holds none of the expected ``div`` elements.
    """
    table = soup.find("table", id="PuzTable")
    if table is None:
        raise ValueError("Couldn't find the puzzle grid")
    for tag in table.find_all("td"):
        classes = tag.attrs.get("class")
        if classes == ["black"]:
            yield {"val": puz.BLACKSQUARE, "markup": None}
            continue
        # Black cells never get this far, so a class here always marks a
        # highlighted white cell.
        markup = bool(classes or tag.find("div", style=True))
        letter_tag = tag.find("div", class_="letter")
        if letter_tag is not None:
            yield {"val": letter_tag.text, "markup": markup}
            continue
        rebus_tag = (tag.find("div", class_="subst2")
                     or tag.find("div", class_="subst"))
        if rebus_tag is not None:
            yield {"val": rebus_tag.text, "markup": markup}
            continue
        raise ValueError(f"Couldn't read puzzle tag: {tag}")
def read_solution(soup):
    """Return the solution grid as one string, one character per square.

    Only the first character of each square's value is used, so a
    multi-letter (rebus) square contributes just its initial letter.
    """
    first_letters = []
    for square in puzzle_squares(soup):
        first_letters.append(square["val"][0])
    return "".join(first_letters)
# Per-square bytes used in the extension grids built below.
EMPTY = b"\x00"
MARKUP = b"\x80"


def read_extensions(soup):
    """Build the rebus and markup extension sections for the puzzle.

    Every square whose value is longer than one character is recorded as a
    rebus entry.  Entries are numbered from 1 in the solutions table, and the
    byte written at the square's grid position is that number plus one.

    Returns:
        A dict keyed by ``puz.Extensions`` members.  The rebus grid and the
        rebus solutions table are present only when at least one rebus square
        exists; the markup grid is present only when some, but not all,
        squares are flagged.
    """
    rebus_words = []
    rebus_grid = bytearray()
    markup_grid = bytearray()
    for square in puzzle_squares(soup):
        value = square["val"]
        if len(value) > 1:
            rebus_words.append(value)
            rebus_grid += (len(rebus_words) + 1).to_bytes(1, "big")
        else:
            rebus_grid += EMPTY
        markup_grid += MARKUP if square["markup"] else EMPTY

    extensions = {}
    if rebus_words:
        extensions[puz.Extensions.Rebus] = bytes(rebus_grid)
        table = "".join(
            f"{number:2}:{word};"
            for number, word in enumerate(rebus_words, start=1)
        )
        extensions[puz.Extensions.RebusSolutions] = table.encode("ascii")
    # Emit the markup grid only when it mixes flagged and unflagged squares.
    if MARKUP in markup_grid and EMPTY in markup_grid:
        extensions[puz.Extensions.Markup] = bytes(markup_grid)
    return extensions
def read_clues(soup):
    """Return all clue texts ordered by clue number, across before down.

    The page is expected to hold exactly two ``div.numclue`` panels (across
    first, then down) whose children alternate between a clue number and the
    clue itself.  Only the first direct text node of each child is read, and
    trailing spaces and colons are removed from the clue.

    Raises:
        ValueError: If there are not exactly two panels, if a panel has a
            number without a matching clue, or if a number is not an integer.
    """
    def extract_clues(panel):
        # Pair the children explicitly.  Calling next() on a shared iterator
        # would leak StopIteration on an odd child count and surface as a
        # misleading unpacking error further down.
        texts = [t.find(string=True, recursive=False) for t in panel.children]
        if len(texts) % 2:
            raise ValueError("Clue panel has a number without a matching clue")
        return [
            (int(number), clue.rstrip(" :"))
            for number, clue in zip(texts[::2], texts[1::2])
        ]

    panels = soup.find_all("div", class_="numclue")
    if len(panels) != 2:
        raise ValueError(
            f"Expected an across and a down clue panel, found {len(panels)}"
        )
    across, down = map(extract_clues, panels)
    # sorted() is stable, so across comes before down when clues share a
    # number.
    all_clues = sorted(across + down, key=lambda num_and_clue: num_and_clue[0])
    return [clue for __, clue in all_clues]
def validate_scrape(info):
    """Cross-check the scraped grid and clues against the page's statistics.

    Checks that the solution has ``height * width`` squares, that its number
    of black squares equals ``Blocks``, and that the number of clues equals
    ``Words``.

    Raises:
        AssertionError: If any check fails.  The exception is raised
            explicitly rather than through ``assert`` statements so the
            checks still run under ``python -O``.
    """
    solution = info["solution"]
    expected_squares = info["height"] * info["width"]
    if len(solution) != expected_squares:
        raise AssertionError(
            f"solution has {len(solution)} squares, "
            f"expected {expected_squares}"
        )
    blocks = solution.count(puz.BLACKSQUARE)
    if blocks != info["Blocks"]:
        raise AssertionError(
            f"solution has {blocks} black squares, expected {info['Blocks']}"
        )
    if len(info["clues"]) != info["Words"]:
        raise AssertionError(
            f"found {len(info['clues'])} clues, expected {info['Words']}"
        )
def get_puzzle(puzzle_date=None):
    """Scrape one puzzle and return it as a ``puz.Puzzle``.

    Args:
        puzzle_date: ``datetime.date`` of the puzzle, or ``None`` to request
            the page without a date.

    Returns:
        A ``puz.Puzzle`` whose attributes are filled from the scraped data
        and whose player grid (``fill``) is blank.

    Raises:
        AssertionError: If the scraped data fails ``validate_scrape``.
    """
    puzzle = puz.Puzzle()
    soup = get_soup(puzzle_date)
    info = xword_info(soup)
    validate_scrape(info)
    # Copy only the scraped values that match an attribute the Puzzle
    # instance already has; statistics such as "Words" are dropped here.
    known_attrs = vars(puzzle)
    for key, value in info.items():
        if key in known_attrs:
            setattr(puzzle, key, value)
    # Blank every non-black square.  Matching only [A-Z] would leave digits,
    # lowercase letters or symbols from the solution visible.
    puzzle.fill = "".join(
        square if square == puz.BLACKSQUARE else "-"
        for square in puzzle.solution
    )
    # This relies on dictionaries preserving insertion order
    puzzle._extensions_order = list(puzzle.extensions)
    return puzzle
def main(*argv):
    """Command-line entry point: download a puzzle and save it to a file.

    Args:
        *argv: Program name followed by ``[puzzle-iso-date] file``.

    Returns:
        ``0`` on success, or ``2`` after printing a usage message to stderr
        when the arguments are missing, too many, or the date is not a valid
        ISO date.
    """
    prog = argv[0] if argv else "get_nyt_xword.py"
    args = argv[1:]

    def usage_error(message):
        # Report a bad command line and give the status code to exit with.
        sys.stderr.write(f"usage: {prog} [puzzle-iso-date] file\n")
        sys.stderr.write(f"{prog}: error: {message}\n")
        return 2

    if not args:
        return usage_error("the file argument is required")
    if len(args) > 2:
        extras = " ".join(args[2:])
        return usage_error(f"unrecognized arguments: {extras}")

    puzzle_date = None
    if len(args) == 2:
        try:
            puzzle_date = datetime.date.fromisoformat(args[0])
        except ValueError:
            return usage_error(f"invalid ISO date: {args[0]!r}")
    fname_out = args[-1]

    puzzle = get_puzzle(puzzle_date)
    puzzle.save(fname_out)
    print(f"Saved {puzzle.title} to {fname_out}")
    return 0
def get_unreadable_puzzles(start=datetime.date(1993, 11, 21), delay=1):
    """Try every puzzle from ``start`` up to (not including) today.

    Each date whose puzzle raises any ``Exception`` while being scraped is
    printed and collected.  The function sleeps ``delay`` seconds after every
    attempt, successful or not.

    Returns:
        The list of dates that could not be read, in chronological order.
    """
    import time

    one_day = datetime.timedelta(1)
    today = datetime.date.today()
    failures = []
    day = start
    while day < today:
        try:
            get_puzzle(day)
        except Exception:
            # Any failure marks the date as unreadable; keep going.
            failures.append(day)
            print(day)
        finally:
            time.sleep(delay)
        day += one_day
    return failures
# Run as a script: pass the command line to main() and exit with its status.
if __name__ == "__main__":
    exit_status = main(*sys.argv)
    sys.exit(exit_status)
It doesn't, sorry.
On September 22, they used en-dashes instead of regular dashes, which broke the script with "UnicodeEncodeError: 'latin-1' codec can't encode character '\u2013'". That seems to be in the puz library, not in your code.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does this one work on the pre-shortz solutions? Thanks!