Skip to content

Instantly share code, notes, and snippets.

@josejuan
Created March 18, 2025 14:41
Show Gist options
  • Save josejuan/c86a7257672b53529b3c48ffefa89878 to your computer and use it in GitHub Desktop.
Save josejuan/c86a7257672b53529b3c48ffefa89878 to your computer and use it in GitHub Desktop.
scrabble regex generator
import re
import sys
def main():
with open('scrabble.regex', 'r') as f:
pattern = f.read().strip()
regex = re.compile(pattern)
for word in sys.stdin:
word = word.strip()
if regex.match(word):
print(f"✓ '{word}' match", end="\r")
else:
print(f"✗ '{word}' does not match")
if __name__ == "__main__":
main()
$ cat words.txt | time -f "%e, %M" python scrabble_regex.py | tee scrabble.regex | wc -c
20.83, 563192
320466
$ diff words.test.txt words.txt | grep ^\<
< ABAHASA
< aa-
< ABLIST
< ABISSES
< HIUSER
< INBI
< ISOTOPI
< KABIBS
< KABSI
$ python check.py < words.test.txt
✗ ABAHASA does not match
✗ aa- does not match
✗ ABLIST does not match
✗ ABISSES does not match
...
✓ ABO░▒▓R█▓▒░matcchhh~~
...
✗ HIUSER does not match
✗ INBI does not match
✗ ISOTOPI does not match
✗ KABIBS does not match
✗ KABSI does not match
...
✓ NEU░▒▓R█▓▒░matcchhh~~
...
$ perl -e "print (100 * `cat words.txt | gzip - | wc -c`/`cat words.txt | wc -c`)"; echo
32.031511306554
$ perl -e "print (100 * `cat scrabble.regex | wc -c`/`cat words.txt | wc -c`)"; echo
49.394944072209
import sys
from collections import defaultdict
# relation:
#
# {optimal.regex} := {optimal.prefix} {optiomal.regex'} {optimal.suffix}
#
# simple greedy algorithm
#
# words: https://github.com/raun/Scrabble/blob/master/words.txt
def best_ps(w):
ps_c = defaultdict(set) # Stores words grouped by their prefixes and suffixes
w_m = {} # Scores each (prefix, suffix) pair based on length * frequency
# Generates all possible prefix and suffix pairs for each word
for wd in w:
l = len(wd)
for p in range(l):
for s in range(l):
if p + s < l:
pre = wd[:p]
suf = wd[-s:] if s > 0 else ""
ps = (pre, suf)
ps_c[ps].add(wd)
# Calculates score for each pair: (prefix + suffix length) * number of words
for (pre, suf), grp in ps_c.items():
w_m[(pre, suf)] = (len(pre) + len(suf)) * len(grp)
if not w_m:
return None, set()
# Returns the pair with highest score and its group of words
return max(w_m, key=w_m.get), ps_c[max(w_m, key=w_m.get)]
def rgx(w):
r = [] # Stores regex patterns for each subgroup
# Special case: words of length 1 (single characters)
s_c = {wd for wd in w if len(wd) == 1}
if s_c:
r.append("".join(s_c) if len(s_c) == 1 else f"[{''.join(sorted(s_c))}]")
# Special case: words of length 2
t_c = {wd for wd in w if len(wd) == 2}
if t_c:
g_f, g_l = defaultdict(set), defaultdict(set)
# Group by first or last character to optimize regex
for wd in t_c:
g_f[wd[0]].add(wd[1])
g_l[wd[1]].add(wd[0])
# Generate patterns by grouping on first character
f_o = [f"{g}[{''.join(sorted(s))}]" if len(s) > 1 else f"{g}{''.join(s)}" for g, s in g_f.items()]
# Generate patterns by grouping on last character
l_o = [f"[{''.join(sorted(p))}]{g}" if len(p) > 1 else f"{''.join(p)}{g}" for g, p in g_l.items()]
# Choose the shorter representation
r.append("|".join(f_o) if len("|".join(f_o)) <= len("|".join(l_o)) else "|".join(l_o))
# Process for words of length > 2: recursively divide using common prefixes/suffixes
l_w = {wd for wd in w if len(wd) > 2}
while l_w:
p_s, g_w = best_ps(l_w)
if not p_s:
break
p, s = p_s
# Extract the middle part of words (removing prefix and suffix)
m_w = {wd[len(p):-len(s)] if s else wd[len(p):] for wd in g_w}
l_w -= g_w # Remove processed words
# Recursively process the middle part
m_r = rgx(m_w)
if "|" in m_r:
m_r = f"({m_r})" # Group with parentheses if there are alternatives
r.append(f"{p}{m_r}{s}")
return "|".join(r) if len(r) > 1 else r[0]
w = {line.strip() for line in sys.stdin if line.strip()}
print("^(" + rgx(w) + ")$")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment