Skip to content

Instantly share code, notes, and snippets.

@dejurin
Created April 21, 2025 17:22
Show Gist options
  • Save dejurin/721b70f5cf1dbc2fa8c5f6aac22bb646 to your computer and use it in GitHub Desktop.
Save dejurin/721b70f5cf1dbc2fa8c5f6aac22bb646 to your computer and use it in GitHub Desktop.
Get all named Unicode characters, including emoji sequences, as a CSV file.
#!/usr/bin/env python3
import re
import csv
import unicodedata2
import urllib.request
# URL that main() hands to fetch_emoji_test() to download the emoji list.
EMOJI_TEST_URL = 'https://unicode.org/Public/emoji/latest/emoji-test.txt'
# Bounds of the code point scan: main() passes them to collect_singletons(),
# which iterates range(MIN_CHAR, MAX_CHAR) -- MIN_CHAR is included,
# MAX_CHAR itself is not.
# NOTE(review): with MAX_CHAR = 0x100000 the code points U+100000..U+10FFFF
# are never visited -- confirm that is intended.
MIN_CHAR = 0x20
MAX_CHAR = 0x100000
# Path of the CSV file that main() asks merge_and_write() to create.
OUTPUT_CSV = 'unicode.csv'
# Pattern for lines like:
# "1F469 200D 1F4BB ; fully-qualified # 👩‍💻 E2.0 woman technologist"
# Capture groups, in order:
#   1. the run of uppercase hex digits and spaces before the ';'
#      (may carry trailing spaces; parse_sequences() splits on whitespace)
#   2. the status keyword: fully-qualified, minimally-qualified or unqualified
#   3. the first run of non-whitespace after the '#' (the rendered emoji)
#   4. the rest of the line after the "E<digits>.<digits>" version tag (the name)
# Lines carrying any other status keyword, comment lines and blank lines do
# not match this pattern.
LINE_RE = re.compile(
r'^([0-9A-F ]+)\s*;\s*'
r'(fully-qualified|minimally-qualified|unqualified)\s*#\s*'
r'(\S+)\s+E\d+\.\d+\s*(.+)$'
)
# Progress bars are optional: prefer the real tqdm when it is installed and
# otherwise fall back to a pass-through with a compatible call shape.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kwargs):
        """Return *x* unchanged, ignoring any progress-bar options."""
        return x
def fetch_emoji_test(url, timeout=30):
    """Download emoji-test.txt and return its lines.

    Args:
        url: Location of the emoji-test.txt file.
        timeout: Seconds to wait on a blocking network operation before
            giving up, so that a stalled server cannot hang the script
            forever.

    Returns:
        A list with the UTF-8 decoded lines of the response, line endings
        included.

    Raises:
        OSError: If the download fails or times out (urllib.error.URLError
            and timeout errors are both OSError subclasses).
        UnicodeDecodeError: If the response is not valid UTF-8.
    """
    print('Downloading emoji-test.txt…')
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        lines = [line.decode('utf-8') for line in resp]
    print(f'Downloaded {len(lines)} lines')
    return lines
# A possessive "S" that str.title() capitalised because it treats the
# preceding apostrophe (straight or typographic) as a word boundary.
_POSSESSIVE_S_RE = re.compile(r"(?<=[^\W\d_])([’'])S\b")


def _title_case(name):
    """Title-case *name* while keeping a possessive "’s" in lower case."""
    return _POSSESSIVE_S_RE.sub(r'\1s', name.strip().title())


def parse_sequences(lines):
    """Parse all qualified and unqualified sequences.

    Args:
        lines: Iterable of text lines in the emoji-test.txt format. Lines
            that do not match LINE_RE (comments, blanks, other statuses)
            are skipped.

    Returns:
        dict: seq_id -> (char, name, qualification), where seq_id is the
        lowercase hex code points joined by '_' and name is title-cased
        (e.g. "woman’s clothes" -> "Woman’s Clothes").
    """
    seqs = {}
    for line in tqdm(lines, desc='Parsing sequences'):
        m = LINE_RE.match(line)
        if not m:
            continue
        cps_str, qual, char, name = m.groups()
        seq_id = '_'.join(cp.lower() for cp in cps_str.split())
        # keep fully-qualified if duplicates exist
        if seq_id not in seqs or seqs[seq_id][2] != 'fully-qualified':
            # Plain str.title() would turn "woman’s" into "Woman’S".
            seqs[seq_id] = (char, _title_case(name), qual)
    return seqs
def collect_singletons(min_cp, max_cp):
    """Gather every code point in range(min_cp, max_cp) that has a name.

    Code points for which chr() or the name lookup raises ValueError are
    skipped.

    Returns:
        dict: seq_id -> (char, name, ''), where seq_id is the lowercase hex
        code point, name is title-cased and the qualification is empty.
    """
    named = {}
    for codepoint in tqdm(range(min_cp, max_cp), desc='Scanning codepoints'):
        try:
            symbol = chr(codepoint)
            label = unicodedata2.name(symbol)
        except ValueError:
            # No character or no name for this code point: nothing to record.
            continue
        named[f'{codepoint:x}'] = (symbol, label.title(), '')
    return named
def _sequence_sort_key(seq_id):
    """Return a key ordering ids by length, then by numeric code points."""
    codepoints = [int(cp, 16) for cp in seq_id.split('_')]
    return len(codepoints), codepoints


def merge_and_write(seqs, singles, out_csv):
    """Merge two dicts and write to CSV sorted by sequence id.

    Rows are ordered by the number of code points in the sequence and then
    by the numeric value of those code points, so U+0020 precedes U+0100
    (a plain string comparison would put '100' before '20').

    Args:
        seqs: dict seq_id -> (char, name, qualification); on a key clash
            these entries win over ``singles``.
        singles: dict seq_id -> (char, name, qualification).
        out_csv: Path of the CSV file to create or overwrite.

    Returns:
        Count of merged entries (data rows written, header excluded).

    Raises:
        ValueError: If a sequence id is not '_'-joined hexadecimal.
    """
    merged = {**singles, **seqs}  # sequences overwrite singletons
    ordered_ids = sorted(merged, key=_sequence_sort_key)
    with open(out_csv, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Emoji', 'Name', 'Qualification', 'Sequence'])
        for seq_id in tqdm(ordered_ids, desc='Writing CSV'):
            ch, name, qual = merged[seq_id]
            writer.writerow([ch, name, qual, seq_id])
    return len(merged)
def main():
    """Run the pipeline: download, parse, scan code points, write the CSV."""
    emoji_sequences = parse_sequences(fetch_emoji_test(EMOJI_TEST_URL))
    named_codepoints = collect_singletons(MIN_CHAR, MAX_CHAR)
    count = merge_and_write(emoji_sequences, named_codepoints, OUTPUT_CSV)
    print(f'Wrote {count} rows into {OUTPUT_CSV}')


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment