Skip to content

Instantly share code, notes, and snippets.

@yell0wsuit
Last active July 9, 2025 10:25
Show Gist options
  • Save yell0wsuit/c25632f7c863d194edb2ded6d22a3cc3 to your computer and use it in GitHub Desktop.
Save yell0wsuit/c25632f7c863d194edb2ded6d22a3cc3 to your computer and use it in GitHub Desktop.
Python script to extract Apple Dictionary entries from AppleDict source XML format.
"""
Extract Apple Dictionary entries from AppleDict source XML into individual HTML files.
Before running this script, use PyGlossary to convert the dictionary's Body.data file to XML.
Usage:
    python extractentries.py input.xml output_folder/
"""
import os
import re
import sys
from tqdm import tqdm
def sanitize_filename(title: str, max_bytes: int = 200) -> str:
    """Make a `<d:title>` value safe to use as part of a filename.

    Surrounding whitespace is stripped and every character that is not a
    word character, hyphen, underscore or dot is replaced with ``_``. The
    result is then capped at ``max_bytes`` UTF-8 bytes, because common
    filesystems reject name components longer than 255 bytes and the caller
    still adds a prefix and an extension.

    Args:
        title: Raw title text.
        max_bytes: Upper bound on the UTF-8 encoded size of the result.

    Returns:
        The sanitized, possibly truncated, title. May be an empty string.
    """
    safe = re.sub(r"[^\w\-_.]", "_", title.strip())
    encoded = safe.encode("utf-8")
    if len(encoded) <= max_bytes:
        return safe
    # Cut on the byte budget, then let the decoder drop a multi-byte
    # character that the cut may have split in half.
    return encoded[:max_bytes].decode("utf-8", errors="ignore")
def extract_entries(xml_path: str, output_dir: str):
    """Split an AppleDict source XML file into one HTML file per entry.

    The whole input is read into memory and cut at every ``<d:entry``
    marker. Each piece is trimmed to its first ``</d:entry>`` and written to
    ``output_dir`` as ``NNNNN_<title>.html``, where ``NNNNN`` is the 1-based
    position among the entries written so far and ``<title>`` is the
    sanitized value of the first ``d:title="..."`` attribute found in the
    entry. Entries without a usable title are named ``entry<N>`` instead.

    Pieces that have no closing tag are skipped; the number skipped is
    reported at the end rather than being dropped silently.

    Args:
        xml_path: Path of the UTF-8 encoded XML file to read.
        output_dir: Directory to write the HTML files into; created if it
            does not exist.
    """
    opening_marker = "<d:entry"
    closing_tag = "</d:entry>"
    # Compiled once here instead of being looked up for every entry.
    title_pattern = re.compile(r'd:title="(.*?)"')

    os.makedirs(output_dir, exist_ok=True)
    with open(xml_path, "r", encoding="utf-8") as f:
        xml = f.read()

    # Whatever precedes the first marker is not an entry, so drop it.
    chunks = xml.split(opening_marker)[1:]

    entry_count = 0
    skipped_count = 0
    for chunk in tqdm(chunks, desc="Extracting entries", unit="entry"):
        full_entry = opening_marker + chunk
        closing_tag_index = full_entry.find(closing_tag)
        if closing_tag_index == -1:
            skipped_count += 1  # malformed entry: no closing tag
            continue
        full_entry = full_entry[: closing_tag_index + len(closing_tag)]

        fallback_title = f"entry{entry_count}"
        title_match = title_pattern.search(full_entry)
        title = title_match.group(1) if title_match else fallback_title
        # A blank or whitespace-only title sanitizes to "", which would
        # leave the file with nothing but its number; use the fallback.
        safe_title = sanitize_filename(title) or fallback_title

        filename = f"{entry_count + 1:05d}_{safe_title}.html"
        with open(
            os.path.join(output_dir, filename), "w", encoding="utf-8"
        ) as out_file:
            out_file.write(full_entry)
        entry_count += 1

    print(f"✅ Extracted {entry_count} entries to: {output_dir}")
    if skipped_count:
        print(f"⚠️ Skipped {skipped_count} malformed entries (no {closing_tag})")
if __name__ == "__main__":
    # Exactly two arguments are required: the input XML and the output folder.
    if len(sys.argv) != 3:
        # Show the script under the name it was actually invoked with.
        # sys.exit(str) writes the message to stderr and exits with status 1.
        script_name = os.path.basename(sys.argv[0])
        sys.exit(f"Usage: python {script_name} input.xml output_folder/")
    extract_entries(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment