josejuan · March 18, 2025 14:41
diff --git a/check.py b/check.py
 import re
 import sys

 def main():
  with open('scrabble.regex', 'r') as f:
      pattern = f.read().strip()
  regex = re.compile(pattern)
  for word in sys.stdin:
      word = word.strip()
      if regex.match(word):
          print(f"✓ '{word}' match", end="\r")
      else:
          print(f"✗ '{word}' does not match")

 if __name__ == "__main__":
    main()
diff --git a/run.sh b/run.sh
 $ cat words.txt | time -f "%e, %M" python scrabble_regex.py | tee scrabble.regex | wc -c
 20.83, 563192
 320466

 $ diff words.test.txt words.txt | grep ^\<
 < ABAHASA
 < aa-
 < ABLIST
 < ABISSES
 < HIUSER
 < INBI
 < ISOTOPI
 < KABIBS
 < KABSI

 $ python check.py < words.test.txt
 ✗ ABAHASA does not match
 ✗ aa- does not match
 ✗ ABLIST does not match
 ✗ ABISSES does not match
 ...
 ✓ ABO░▒▓R█▓▒░matcchhh~~
 ...
 ✗ HIUSER does not match
 ✗ INBI does not match
 ✗ ISOTOPI does not match
 ✗ KABIBS does not match
 ✗ KABSI does not match
 ...
 ✓ NEU░▒▓R█▓▒░matcchhh~~
 ...

 $ perl -e "print (100 * `cat words.txt | gzip - | wc -c`/`cat words.txt | wc -c`)"; echo
 32.031511306554
 $ perl -e "print (100 * `cat scrabble.regex | wc -c`/`cat words.txt | wc -c`)"; echo
 49.394944072209
diff --git a/scrabble_regex.py b/scrabble_regex.py
 import sys
 from collections import defaultdict

 # relation:
 #
 #     {optimal.regex} := {optimal.prefix} {optiomal.regex'} {optimal.suffix}
 #
 # simple greedy algorithm
 #
 # words: https://github.com/raun/Scrabble/blob/master/words.txt

 def best_ps(w):
    ps_c = defaultdict(set)  # Stores words grouped by their prefixes and suffixes
    w_m = {}  # Scores each (prefix, suffix) pair based on length * frequency
    # Generates all possible prefix and suffix pairs for each word
    for wd in w:
        l = len(wd)
        for p in range(l):
            for s in range(l):
                if p + s < l:
                    pre = wd[:p]
                    suf = wd[-s:] if s > 0 else ""
                    ps = (pre, suf)
                    ps_c[ps].add(wd)
    # Calculates score for each pair: (prefix + suffix length) * number of words
    for (pre, suf), grp in ps_c.items():
        w_m[(pre, suf)] = (len(pre) + len(suf)) * len(grp)
    if not w_m:
        return None, set()
    # Returns the pair with highest score and its group of words
    return max(w_m, key=w_m.get), ps_c[max(w_m, key=w_m.get)]

 def rgx(w):
    r = []  # Stores regex patterns for each subgroup

    # Special case: words of length 1 (single characters)
    s_c = {wd for wd in w if len(wd) == 1}
    if s_c:
        r.append("".join(s_c) if len(s_c) == 1 else f"[{''.join(sorted(s_c))}]")

    # Special case: words of length 2
    t_c = {wd for wd in w if len(wd) == 2}
    if t_c:
        g_f, g_l = defaultdict(set), defaultdict(set)
        # Group by first or last character to optimize regex
        for wd in t_c:
            g_f[wd[0]].add(wd[1])
            g_l[wd[1]].add(wd[0])
        # Generate patterns by grouping on first character
        f_o = [f"{g}[{''.join(sorted(s))}]" if len(s) > 1 else f"{g}{''.join(s)}" for g, s in g_f.items()]
        # Generate patterns by grouping on last character
        l_o = [f"[{''.join(sorted(p))}]{g}" if len(p) > 1 else f"{''.join(p)}{g}" for g, p in g_l.items()]
        # Choose the shorter representation
        r.append("|".join(f_o) if len("|".join(f_o)) <= len("|".join(l_o)) else "|".join(l_o))

    # Process for words of length > 2: recursively divide using common prefixes/suffixes
    l_w = {wd for wd in w if len(wd) > 2}
    while l_w:
        p_s, g_w = best_ps(l_w)
        if not p_s:
            break
        p, s = p_s
        # Extract the middle part of words (removing prefix and suffix)
        m_w = {wd[len(p):-len(s)] if s else wd[len(p):] for wd in g_w}
        l_w -= g_w  # Remove processed words
        # Recursively process the middle part
        m_r = rgx(m_w)
        if "|" in m_r:
            m_r = f"({m_r})"  # Group with parentheses if there are alternatives
        r.append(f"{p}{m_r}{s}")

    return "|".join(r) if len(r) > 1 else r[0]

 w = {line.strip() for line in sys.stdin if line.strip()}
 print("^(" + rgx(w) + ")$")
	import re
	import sys

	def main():
	with open('scrabble.regex', 'r') as f:
	pattern = f.read().strip()
	regex = re.compile(pattern)
	for word in sys.stdin:
	word = word.strip()
	if regex.match(word):
	print(f"✓ '{word}' match", end="\r")
	else:
	print(f"✗ '{word}' does not match")

	if __name__ == "__main__":
	main()
	$ cat words.txt \| time -f "%e, %M" python scrabble_regex.py \| tee scrabble.regex \| wc -c
	20.83, 563192
	320466

	$ diff words.test.txt words.txt \| grep ^\<
	< ABAHASA
	< aa-
	< ABLIST
	< ABISSES
	< HIUSER
	< INBI
	< ISOTOPI
	< KABIBS
	< KABSI

	$ python check.py < words.test.txt
	✗ ABAHASA does not match
	✗ aa- does not match
	✗ ABLIST does not match
	✗ ABISSES does not match
	...
	✓ ABO░▒▓R█▓▒░matcchhh~~
	...
	✗ HIUSER does not match
	✗ INBI does not match
	✗ ISOTOPI does not match
	✗ KABIBS does not match
	✗ KABSI does not match
	...
	✓ NEU░▒▓R█▓▒░matcchhh~~
	...

	$ perl -e "print (100 * `cat words.txt \| gzip - \| wc -c`/`cat words.txt \| wc -c`)"; echo
	32.031511306554
	$ perl -e "print (100 * `cat scrabble.regex \| wc -c`/`cat words.txt \| wc -c`)"; echo
	49.394944072209
	import sys
	from collections import defaultdict

	# relation:
	#
	# {optimal.regex} := {optimal.prefix} {optiomal.regex'} {optimal.suffix}
	#
	# simple greedy algorithm
	#
	# words: https://github.com/raun/Scrabble/blob/master/words.txt

	def best_ps(w):
	ps_c = defaultdict(set) # Stores words grouped by their prefixes and suffixes
	w_m = {} # Scores each (prefix, suffix) pair based on length * frequency
	# Generates all possible prefix and suffix pairs for each word
	for wd in w:
	l = len(wd)
	for p in range(l):
	for s in range(l):
	if p + s < l:
	pre = wd[:p]
	suf = wd[-s:] if s > 0 else ""
	ps = (pre, suf)
	ps_c[ps].add(wd)
	# Calculates score for each pair: (prefix + suffix length) * number of words
	for (pre, suf), grp in ps_c.items():
	w_m[(pre, suf)] = (len(pre) + len(suf)) * len(grp)
	if not w_m:
	return None, set()
	# Returns the pair with highest score and its group of words
	return max(w_m, key=w_m.get), ps_c[max(w_m, key=w_m.get)]

	def rgx(w):
	r = [] # Stores regex patterns for each subgroup

	# Special case: words of length 1 (single characters)
	s_c = {wd for wd in w if len(wd) == 1}
	if s_c:
	r.append("".join(s_c) if len(s_c) == 1 else f"[{''.join(sorted(s_c))}]")

	# Special case: words of length 2
	t_c = {wd for wd in w if len(wd) == 2}
	if t_c:
	g_f, g_l = defaultdict(set), defaultdict(set)
	# Group by first or last character to optimize regex
	for wd in t_c:
	g_f[wd[0]].add(wd[1])
	g_l[wd[1]].add(wd[0])
	# Generate patterns by grouping on first character
	f_o = [f"{g}[{''.join(sorted(s))}]" if len(s) > 1 else f"{g}{''.join(s)}" for g, s in g_f.items()]
	# Generate patterns by grouping on last character
	l_o = [f"[{''.join(sorted(p))}]{g}" if len(p) > 1 else f"{''.join(p)}{g}" for g, p in g_l.items()]
	# Choose the shorter representation
	r.append("\|".join(f_o) if len("\|".join(f_o)) <= len("\|".join(l_o)) else "\|".join(l_o))

	# Process for words of length > 2: recursively divide using common prefixes/suffixes
	l_w = {wd for wd in w if len(wd) > 2}
	while l_w:
	p_s, g_w = best_ps(l_w)
	if not p_s:
	break
	p, s = p_s
	# Extract the middle part of words (removing prefix and suffix)
	m_w = {wd[len(p):-len(s)] if s else wd[len(p):] for wd in g_w}
	l_w -= g_w # Remove processed words
	# Recursively process the middle part
	m_r = rgx(m_w)
	if "\|" in m_r:
	m_r = f"({m_r})" # Group with parentheses if there are alternatives
	r.append(f"{p}{m_r}{s}")

	return "\|".join(r) if len(r) > 1 else r[0]

	w = {line.strip() for line in sys.stdin if line.strip()}
	print("^(" + rgx(w) + ")$")