Last active
July 9, 2025 10:25
-
-
Save yell0wsuit/c25632f7c863d194edb2ded6d22a3cc3 to your computer and use it in GitHub Desktop.
Python script to extract Apple Dictionary entries from AppleDict source XML format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Script to extract Apple Dictionary entries from AppleDict source XML format to HTML files.
You will need to use pyglossary to extract the dictionary's Body.data to XML first.

Usage:
    python extractentries.py input.xml output_folder/
"""
import html
import os
import re
import sys

from tqdm import tqdm
def sanitize_filename(title: str) -> str:
    """Turn a `<d:title>` value into a string usable as a file name.

    Surrounding whitespace is stripped first; every remaining character that
    is not a word character, hyphen, underscore, or dot is replaced with an
    underscore.
    """
    trimmed = title.strip()
    unsafe_chars = re.compile(r"[^\w\-_.]")
    return unsafe_chars.sub("_", trimmed)
def extract_entries(xml_path: str, output_dir: str) -> int:
    """Split an AppleDict source XML file into one HTML file per entry.

    The input is read whole and cut at every ``<d:entry`` opening tag. Each
    piece is trimmed to its first ``</d:entry>`` and written to
    ``output_dir`` as ``NNNNN_<title>.html``, where ``NNNNN`` is the 1-based
    position among the entries actually written and ``<title>`` is the
    sanitized ``d:title`` attribute. Pieces with no closing tag are skipped
    and reported on stderr.

    Args:
        xml_path: Path to the UTF-8 encoded XML input.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        The number of entries written.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(xml_path, "r", encoding="utf-8") as f:
        xml = f.read()

    open_tag = "<d:entry"
    close_tag = "</d:entry>"
    # Compiled once here rather than looked up on every iteration.
    title_pattern = re.compile(r'd:title="(.*?)"')

    # Everything before the first opening tag is preamble, not an entry.
    chunks = xml.split(open_tag)[1:]
    # split() copied the text into the chunks; drop the original so a large
    # dictionary is not held in memory twice for the whole loop.
    del xml

    entry_count = 0
    skipped = 0
    for chunk in tqdm(chunks, desc="Extracting entries", unit="entry"):
        closing_tag_index = chunk.find(close_tag)
        if closing_tag_index == -1:
            skipped += 1  # malformed entry: no closing tag before the next one
            continue
        full_entry = open_tag + chunk[: closing_tag_index + len(close_tag)]

        title_match = title_pattern.search(full_entry)
        if title_match:
            # Attribute values are XML-escaped; decode them so a title like
            # "R&amp;B" yields the file name "R_B" instead of "R_amp_B".
            title = html.unescape(title_match.group(1))
        else:
            title = f"entry{entry_count}"

        safe_title = sanitize_filename(title)
        filename = f"{entry_count + 1:05d}_{safe_title}.html"
        with open(
            os.path.join(output_dir, filename), "w", encoding="utf-8"
        ) as out_file:
            out_file.write(full_entry)
        entry_count += 1

    print(f"✅ Extracted {entry_count} entries to: {output_dir}")
    if skipped:
        print(
            f"Skipped {skipped} malformed entries (missing {close_tag})",
            file=sys.stderr,
        )
    return entry_count
if __name__ == "__main__":
    # Exactly two positional arguments are expected: input XML and output folder.
    if len(sys.argv) != 3:
        # Take the script name from argv so the hint stays correct if the file
        # is renamed; usage errors belong on stderr, not stdout.
        script_name = (
            os.path.basename(sys.argv[0])
            if sys.argv and sys.argv[0]
            else "extract_appledict_entries.py"
        )
        print(
            f"Usage: python {script_name} input.xml output_folder/",
            file=sys.stderr,
        )
        sys.exit(1)
    input_xml = sys.argv[1]
    output_folder = sys.argv[2]
    extract_entries(input_xml, output_folder)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment