Skip to content

Instantly share code, notes, and snippets.

@Wheest
Created September 27, 2021 20:33
Show Gist options
  • Save Wheest/e3f8bb888575427f3852e29520af738c to your computer and use it in GitHub Desktop.
Save Wheest/e3f8bb888575427f3852e29520af738c to your computer and use it in GitHub Desktop.
Given a dictionary, and a desired fraction, return a list of random letters which at least the desired fraction of words in the dictionary begins with. Optionally, set some letters that *must* appear in the list of letters
#!/usr/bin/env python
import argparse
import string
from typing import List
from collections import Counter
import numpy as np
def main(args) -> List[str]:
    """Pick random letters that together begin at least a given fraction of words.

    Reads the dictionary file, computes for every lower case ASCII letter the
    fraction of (lower case initial) words that begin with it, and then adds
    randomly chosen letters to the required ones until the accumulated
    fraction reaches ``args.wordlist_volume``. Letters are drawn with
    probability proportional to how many words begin with them. The
    intermediate and the sorted selection are printed to stdout.

    :param args: namespace with the attributes ``dictionary_path`` (path of a
        newline separated word list), ``letters`` (letters that must be part
        of the result) and ``wordlist_volume`` (minimum fraction of the word
        list to cover, between 0 and 1). ``args.letters`` is not modified.
    :returns: List[str] chosen letters, sorted and without duplicates
    :raises ValueError: if ``wordlist_volume`` is greater than 1, or if no
        word in the dictionary begins with a lower case ASCII letter
    :raises KeyError: if a required letter is not a lower case ASCII letter
    """
    target = args.wordlist_volume
    if target > 1:
        # No set of letters can begin more than 100% of the words; the
        # selection loop could never finish.
        raise ValueError(
            f"wordlist_volume must be a fraction no greater than 1, got {target}"
        )

    # Count how many words begin with each character. Only the first
    # character matters, so undecodable bytes elsewhere must not abort the run.
    with open(args.dictionary_path, encoding="utf-8", errors="replace") as f:
        first_chars = Counter(line[:1] for line in f)

    # we only care about lower case ASCII characters
    letters = string.ascii_lowercase
    total = sum(first_chars[ch] for ch in letters)
    if total == 0:
        raise ValueError(
            "no word in the dictionary begins with a lower case ASCII letter"
        )

    # probability distribution of words starting with a given letter
    p_dist = {ch: first_chars[ch] / total for ch in letters}

    # Start from the required letters. Work on a de-duplicated copy so that a
    # repeated letter is not counted twice and the caller's list stays intact.
    my_letters = list(dict.fromkeys(args.letters))
    used_p = sum(p_dist[ch] for ch in my_letters)  # probability used so far

    if used_p < target:
        # Only letters that are still unused and actually begin some word can
        # contribute; sampling them without replacement guarantees termination.
        candidates = [
            ch for ch in letters if ch not in my_letters and p_dist[ch] > 0
        ]
        if candidates:
            weights = np.array([p_dist[ch] for ch in candidates])
            weights /= weights.sum()
            order = np.random.choice(
                len(candidates), size=len(candidates), replace=False, p=weights
            )
            for idx in order:
                if used_p >= target:
                    break
                ch = candidates[idx]
                my_letters.append(ch)
                used_p += p_dist[ch]

    print(my_letters, used_p)
    my_letters.sort()
    print("Sorted:", my_letters)
    return my_letters
if __name__ == "__main__":
    # Command line entry point: parse the options and run the letter selection.
    parser = argparse.ArgumentParser(
        description="""Given a dictionary, and a desired fraction, return a list of random letters
which at least the desired fraction of words in the dictionary begins with.
Optionally, set some letters that *must* appear in the list of letters"""
    )
    # Required letters are optional, as the description says: "*" accepts zero
    # or more values and yields an empty list when none are given.
    parser.add_argument("letters", nargs="*", help="Letters that should be included")
    parser.add_argument(
        "--wordlist_volume",
        # The volume is a fraction (default 0.60); parsing it as int would
        # reject every fractional value passed on the command line.
        type=float,
        default=0.60,
        help="Minimum amount of the wordlist to include",
    )
    parser.add_argument(
        "--dictionary_path",
        type=str,
        default="/usr/share/dict/words",
        help="Path to dictionary file (must be newline separated text file)",
    )
    args = parser.parse_args()
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment