Created
March 18, 2025 14:41
-
-
Save josejuan/c86a7257672b53529b3c48ffefa89878 to your computer and use it in GitHub Desktop.
scrabble regex generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
def main(): | |
with open('scrabble.regex', 'r') as f: | |
pattern = f.read().strip() | |
regex = re.compile(pattern) | |
for word in sys.stdin: | |
word = word.strip() | |
if regex.match(word): | |
print(f"✓ '{word}' match", end="\r") | |
else: | |
print(f"✗ '{word}' does not match") | |
if __name__ == "__main__": | |
main() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ cat words.txt | time -f "%e, %M" python scrabble_regex.py | tee scrabble.regex | wc -c | |
20.83, 563192 | |
320466 | |
$ diff words.test.txt words.txt | grep ^\< | |
< ABAHASA | |
< aa- | |
< ABLIST | |
< ABISSES | |
< HIUSER | |
< INBI | |
< ISOTOPI | |
< KABIBS | |
< KABSI | |
$ python check.py < words.test.txt | |
✗ ABAHASA does not match | |
✗ aa- does not match | |
✗ ABLIST does not match | |
✗ ABISSES does not match | |
... | |
✓ ABO░▒▓R█▓▒░matcchhh~~ | |
... | |
✗ HIUSER does not match | |
✗ INBI does not match | |
✗ ISOTOPI does not match | |
✗ KABIBS does not match | |
✗ KABSI does not match | |
... | |
✓ NEU░▒▓R█▓▒░matcchhh~~ | |
... | |
$ perl -e "print (100 * `cat words.txt | gzip - | wc -c`/`cat words.txt | wc -c`)"; echo | |
32.031511306554 | |
$ perl -e "print (100 * `cat scrabble.regex | wc -c`/`cat words.txt | wc -c`)"; echo | |
49.394944072209 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from collections import defaultdict | |
# relation: | |
# | |
# {optimal.regex} := {optimal.prefix} {optiomal.regex'} {optimal.suffix} | |
# | |
# simple greedy algorithm | |
# | |
# words: https://github.com/raun/Scrabble/blob/master/words.txt | |
def best_ps(w): | |
ps_c = defaultdict(set) # Stores words grouped by their prefixes and suffixes | |
w_m = {} # Scores each (prefix, suffix) pair based on length * frequency | |
# Generates all possible prefix and suffix pairs for each word | |
for wd in w: | |
l = len(wd) | |
for p in range(l): | |
for s in range(l): | |
if p + s < l: | |
pre = wd[:p] | |
suf = wd[-s:] if s > 0 else "" | |
ps = (pre, suf) | |
ps_c[ps].add(wd) | |
# Calculates score for each pair: (prefix + suffix length) * number of words | |
for (pre, suf), grp in ps_c.items(): | |
w_m[(pre, suf)] = (len(pre) + len(suf)) * len(grp) | |
if not w_m: | |
return None, set() | |
# Returns the pair with highest score and its group of words | |
return max(w_m, key=w_m.get), ps_c[max(w_m, key=w_m.get)] | |
def rgx(w): | |
r = [] # Stores regex patterns for each subgroup | |
# Special case: words of length 1 (single characters) | |
s_c = {wd for wd in w if len(wd) == 1} | |
if s_c: | |
r.append("".join(s_c) if len(s_c) == 1 else f"[{''.join(sorted(s_c))}]") | |
# Special case: words of length 2 | |
t_c = {wd for wd in w if len(wd) == 2} | |
if t_c: | |
g_f, g_l = defaultdict(set), defaultdict(set) | |
# Group by first or last character to optimize regex | |
for wd in t_c: | |
g_f[wd[0]].add(wd[1]) | |
g_l[wd[1]].add(wd[0]) | |
# Generate patterns by grouping on first character | |
f_o = [f"{g}[{''.join(sorted(s))}]" if len(s) > 1 else f"{g}{''.join(s)}" for g, s in g_f.items()] | |
# Generate patterns by grouping on last character | |
l_o = [f"[{''.join(sorted(p))}]{g}" if len(p) > 1 else f"{''.join(p)}{g}" for g, p in g_l.items()] | |
# Choose the shorter representation | |
r.append("|".join(f_o) if len("|".join(f_o)) <= len("|".join(l_o)) else "|".join(l_o)) | |
# Process for words of length > 2: recursively divide using common prefixes/suffixes | |
l_w = {wd for wd in w if len(wd) > 2} | |
while l_w: | |
p_s, g_w = best_ps(l_w) | |
if not p_s: | |
break | |
p, s = p_s | |
# Extract the middle part of words (removing prefix and suffix) | |
m_w = {wd[len(p):-len(s)] if s else wd[len(p):] for wd in g_w} | |
l_w -= g_w # Remove processed words | |
# Recursively process the middle part | |
m_r = rgx(m_w) | |
if "|" in m_r: | |
m_r = f"({m_r})" # Group with parentheses if there are alternatives | |
r.append(f"{p}{m_r}{s}") | |
return "|".join(r) if len(r) > 1 else r[0] | |
w = {line.strip() for line in sys.stdin if line.strip()} | |
print("^(" + rgx(w) + ")$") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment