Created
April 21, 2025 17:22
-
-
Save dejurin/721b70f5cf1dbc2fa8c5f6aac22bb646 to your computer and use it in GitHub Desktop.
Get all Unicode characters that have names, including emoji.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
import csv | |
import unicodedata2 | |
import urllib.request | |
# Location of the emoji sequence list that main() downloads.
EMOJI_TEST_URL = 'https://unicode.org/Public/emoji/latest/emoji-test.txt'

# Code point window handed to collect_singletons(): it is iterated with
# range(MIN_CHAR, MAX_CHAR), so the lower bound is inclusive and the upper
# bound is exclusive.
MIN_CHAR = 0x0020
MAX_CHAR = 0x10_0000

# File name of the CSV written by merge_and_write().
OUTPUT_CSV = 'unicode.csv'
# Matches emoji-test.txt data lines such as:
#   "1F469 200D 1F4BB ; fully-qualified # 👩‍💻 E2.0 woman technologist"
# Captured groups, in order:
#   1. the space-separated upper-case hex code points (trailing spaces included)
#   2. the qualification keyword
#   3. the rendered emoji (first run of non-whitespace after '#')
#   4. the name that follows the "E<major>.<minor>" version tag
# Lines with any other status keyword, comments and blank lines do not match.
_EMOJI_LINE_PATTERN = (
    r'^([0-9A-F ]+)\s*;\s*'
    r'(fully-qualified|minimally-qualified|unqualified)\s*#\s*'
    r'(\S+)\s+E\d+\.\d+\s*(.+)$'
)
LINE_RE = re.compile(_EMOJI_LINE_PATTERN)
def _no_progress(x, **kwargs):
    """Return *x* untouched; keyword options such as ``desc`` are ignored."""
    return x


# Progress bars are optional: use the real tqdm when it can be imported,
# otherwise fall back to the pass-through above so call sites stay the same.
try:
    from tqdm import tqdm
except ImportError:
    tqdm = _no_progress
def fetch_emoji_test(url):
    """Fetch the document at *url* and return it as a list of text lines.

    The response is read line by line and each raw line is decoded as
    UTF-8; line terminators are left in place. Progress messages are
    printed before and after the download.
    """
    print('Downloading emoji-test.txt…')
    lines = []
    with urllib.request.urlopen(url) as response:
        for raw_line in response:
            lines.append(raw_line.decode('utf-8'))
    print(f'Downloaded {len(lines)} lines')
    return lines
# A single letter that closes a word right after an in-word apostrophe
# (ASCII ' or U+2019), e.g. the "S" in "Woman’S". Longer tails such as the
# "Clock" in "O’Clock" are deliberately not matched.
_APOSTROPHE_SUFFIX_RE = re.compile(r"(?<=[^\W\d_])(['\u2019])([^\W\d_])\b")
# A run of letters glued to a preceding digit, e.g. the "St" in "1St".
_DIGIT_SUFFIX_RE = re.compile(r"(?<=\d)[^\W\d_]+")


def _title_case(name):
    """Title-case *name* without the possessive/ordinal artifacts of str.title().

    ``str.title()`` treats every non-letter as a word boundary, so it turns
    "woman’s hat" into "Woman’S Hat" and "1st place medal" into
    "1St Place Medal". Those two cases are lowered again here; everything
    else is exactly what ``str.title()`` produces.
    """
    titled = name.title()
    titled = _APOSTROPHE_SUFFIX_RE.sub(
        lambda m: m.group(1) + m.group(2).lower(), titled
    )
    return _DIGIT_SUFFIX_RE.sub(lambda m: m.group(0).lower(), titled)


def parse_sequences(lines):
    """
    Parse every line that matches LINE_RE (fully-qualified,
    minimally-qualified and unqualified entries); other lines are skipped.

    The sequence id is the line's code points, lower-cased and joined
    with '_' (e.g. '1f469_200d_1f4bb').

    Returns dict: seq_id -> (char, name, qualification), where name is
    title-cased.
    """
    seqs = {}
    for line in tqdm(lines, desc='Parsing sequences'):
        m = LINE_RE.match(line)
        if not m:
            continue
        cps_str, qual, char, name = m.groups()
        seq_id = '_'.join(cp.lower() for cp in cps_str.split())
        # If an id repeats, a stored fully-qualified entry is never replaced;
        # any other stored entry is overwritten by the later line.
        existing = seqs.get(seq_id)
        if existing is None or existing[2] != 'fully-qualified':
            seqs[seq_id] = (char, _title_case(name), qual)
    return seqs
def collect_singletons(min_cp, max_cp):
    """
    Scan the code points in range(min_cp, max_cp) and keep those for
    which unicodedata2.name() returns a name.

    The id is the code point in lower-case hex, zero-padded to at least
    four digits. emoji-test.txt writes code points the same way
    (e.g. '00A9'), so a single-code-point entry from parse_sequences()
    gets the same key as the singleton and replaces it when the two
    dicts are merged, instead of producing a second row.

    Returns dict: seq_id -> (char, name, ''), where name is title-cased.
    """
    singles = {}
    for cp in tqdm(range(min_cp, max_cp), desc='Scanning codepoints'):
        try:
            ch = chr(cp)
            name = unicodedata2.name(ch)
        except ValueError:
            # Raised for code points without a name (and by chr() for
            # values outside the Unicode range); such entries are skipped.
            continue
        singles[format(cp, '04x')] = (ch, name.title(), '')
    return singles
def _sequence_sort_key(seq_id):
    """Order ids by code point count, then by numeric code point values."""
    parts = seq_id.split('_')
    try:
        code_points = [int(part, 16) for part in parts]
    except ValueError:
        # Not a hex id: sort it on the raw string within its length group.
        code_points = []
    # The raw id is the final tiebreak so that ids with equal numeric value
    # but different padding ('a9' vs '00a9') still order deterministically.
    return (len(parts), code_points, seq_id)


def merge_and_write(seqs, singles, out_csv):
    """
    Merge the two dicts and write them to out_csv as UTF-8 CSV.

    Entries in seqs replace entries in singles that share an id. After
    the header row 'Emoji, Name, Qualification, Sequence', rows are
    ordered by number of code points and then by code point value, so
    U+0020 precedes U+0100 (a plain string sort on variable-width hex
    would not guarantee that).

    Returns count of merged entries.
    """
    merged = {**singles, **seqs}  # sequences overwrite singletons
    with open(out_csv, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Emoji', 'Name', 'Qualification', 'Sequence'])
        ordered_ids = sorted(merged, key=_sequence_sort_key)
        for seq_id in tqdm(ordered_ids, desc='Writing CSV'):
            ch, name, qual = merged[seq_id]
            writer.writerow([ch, name, qual, seq_id])
    return len(merged)
def main():
    """Download the emoji list, scan named code points, and write the CSV.

    Runs the pipeline end to end using the module-level settings, then
    prints how many rows were written.
    """
    emoji_sequences = parse_sequences(fetch_emoji_test(EMOJI_TEST_URL))
    named_code_points = collect_singletons(MIN_CHAR, MAX_CHAR)
    count = merge_and_write(emoji_sequences, named_code_points, OUTPUT_CSV)
    print(f'Wrote {count} rows into {OUTPUT_CSV}')


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment