Created
September 27, 2021 20:33
-
-
Save Wheest/e3f8bb888575427f3852e29520af738c to your computer and use it in GitHub Desktop.
Given a dictionary and a desired fraction, return a random list of letters such that at least the desired fraction of the words in the dictionary begin with one of them. Optionally, specify some letters that *must* appear in the list of letters.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import string | |
from typing import List | |
from collections import Counter | |
import numpy as np | |
def main(args) -> List[str]:
    """Pick random letters that together start a given fraction of the dictionary.

    Counts how many lines of ``args.dictionary_path`` start with each lowercase
    ASCII letter, seeds the selection with the required ``args.letters``, then
    keeps drawing further letters at random (weighted by how many words start
    with them) until the chosen letters account for at least
    ``args.wordlist_volume`` of the counted words, or until no letter with a
    non-zero share is left to draw.

    :param args: namespace providing ``dictionary_path`` (newline separated
        word list), ``letters`` (letters that must be included; the list is
        not modified) and ``wordlist_volume`` (target fraction of words)
    :returns: List[str] chosen letters, sorted and without duplicates
    :raises ValueError: if no line of the dictionary starts with a lowercase
        ASCII letter
    :raises KeyError: if a required letter is not a single lowercase ASCII
        letter
    """
    # we only care about lower case ASCII characters
    letters = string.ascii_lowercase

    # open the dictionary, and count how many words start with each letter
    with open(args.dictionary_path) as f:
        first_letter_counts = Counter(line[:1] for line in f)
    counts = {letter: first_letter_counts[letter] for letter in letters}

    # create a probability distribution of words starting with a given letter
    total = sum(counts.values())
    if total == 0:
        raise ValueError(
            f"no word in {args.dictionary_path!r} starts with a lowercase ASCII letter"
        )
    p_dist = {letter: count / total for letter, count in counts.items()}

    # Ensure our required letters are in the list. Work on a de-duplicated
    # copy so the caller's list is left alone and no letter is counted twice.
    my_letters = list(dict.fromkeys(args.letters))
    used_p = sum(p_dist[letter] for letter in my_letters)  # probability used so far

    # Randomly sample from the other letters, and stop once we have enough
    # likelihood. Drawing only from unused letters with a non-zero share is
    # equivalent to redrawing on repeats, but is guaranteed to terminate even
    # when the requested fraction cannot be reached.
    while used_p < args.wordlist_volume:
        candidates = [
            letter
            for letter in letters
            if letter not in my_letters and p_dist[letter] > 0
        ]
        if not candidates:
            break
        weights = np.array([p_dist[letter] for letter in candidates])
        # str() turns the numpy string scalar into a plain Python string
        picked = str(np.random.choice(candidates, p=weights / weights.sum()))
        my_letters.append(picked)
        used_p += p_dist[picked]

    print(my_letters, used_p)
    my_letters.sort()
    print("Sorted:", my_letters)
    return my_letters
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the letter selection.
    parser = argparse.ArgumentParser(
        description="""Given a dictionary, and a desired fraction, return a list of random letters
which at least the desired fraction of words in the dictionary begins with.
Optionally, set some letters that *must* appear in the list of letters"""
    )
    # Required letters are optional, so accept zero or more of them.
    parser.add_argument("letters", nargs="*", help="Letters that should be included")
    parser.add_argument(
        "--wordlist_volume",
        # The volume is a fraction such as 0.60, so it must be parsed as a
        # float; an int type rejects every fractional value.
        type=float,
        default=0.60,
        help="Minimum amount of the wordlist to include",
    )
    parser.add_argument(
        "--dictionary_path",
        type=str,
        default="/usr/share/dict/words",
        help="Path to dictionary file (must be newline separated text file)",
    )
    args = parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment