Last active
September 5, 2020 17:25
-
-
Save jonaballe/6d2fd7932ff41ce6eccd40690bb24115 to your computer and use it in GitHub Desktop.
Better diceware lists for German
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
"""Better diceware lists for German

Diceware (http://world.std.com/~reinhold/diceware.html) is a great way to
generate random, yet memorizable passphrases.

This Python script is designed to read a "Grundformliste" (list of base forms)
of German words in the format published by the Institut für deutsche Sprache
(Institute for German Language). It parses the list, selects the best words, and
outputs a word list compatible with diceware.

The "best" words are defined as words that are at the same time short and
common, so that they are easy to remember but also easy to type. You can
trade off between shortness and commonality using the `--difficulty` parameter.

To make a list, you can download a published Grundformliste from the Institute's
web site (http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html) and
unzip it. By default, the script expects an input file in the working directory
named derewo-v-ww-bll-320000g-2012-12-31-1.0.txt and outputs to wordlist_de.txt.
"""
import argparse | |
import collections | |
import io | |
import math | |
import re | |
import sys | |
# One candidate word: its spelling, its frequency class as read from the input
# list, and the combined score used to rank it against other candidates.
Word = collections.namedtuple("Word", ["word", "freq_class", "score"])
class Formatter(argparse.ArgumentDefaultsHelpFormatter,
                argparse.RawDescriptionHelpFormatter):
    """Help formatter combining argparse's defaults and raw-description mixins.

    It adds no behavior of its own; it only merges the two stock argparse
    formatter classes so both apply to the same parser.
    """
def parse_words(file, weight, to_ascii):
    """Parse word/frequency-class lines and keep one entry per spelling.

    Args:
      file: Iterable of text lines. A line is used only if, after
        normalization, it starts with a word, a single space, and an integer
        frequency class; every other line is skipped.
      weight: Mixing factor of the score, which is computed as
        `weight * len(word) + (1 - weight) * freq_class`.
      to_ascii: If true, ä, ö, ü, ß are rewritten as ae, oe, ue, ss before the
        line is matched.

    Returns:
      A view over `Word` tuples, one per distinct lower-cased spelling. When a
      spelling occurs more than once, the occurrence with the lowest frequency
      class is kept.
    """
    # Only German letters (Latin letters, umlauts, and ß) plus hyphens are
    # accepted, so that every resulting word can be typed on a German keyboard.
    entry_pattern = re.compile(r"([a-zäöüß\-]+) (\d+)")
    # Parenthesized alternatives collapse to the first one listed, e.g.
    # "ein(e)" -> "eine" and "welch(er,e,es)" -> "welcher".
    variants_pattern = re.compile(r"\(([a-zäöüß]+)[a-zäöüß,]*?\)")
    ascii_map = str.maketrans({
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "ß": "ss",
    })

    best_by_spelling = {}
    for raw_line in file:
        # Case is dropped entirely: distinguishing upper and lower case would
        # make a word ambiguous to remember.
        text = raw_line.lower()
        if to_ascii:
            text = text.translate(ascii_map)
        text = variants_pattern.sub(r"\1", text)

        # The pattern has to match at the very start of the line.
        hit = entry_pattern.match(text)
        if hit is None:
            continue

        spelling = hit.group(1)
        rank = int(hit.group(2))

        # An already-known spelling is only replaced by a more common one.
        known = best_by_spelling.get(spelling)
        if known is not None and known.freq_class <= rank:
            continue

        best_by_spelling[spelling] = Word(
            spelling,
            rank,
            weight * len(spelling) + (1 - weight) * rank,
        )

    return best_by_spelling.values()
def dice_rolls(index, rolls):
    """Render `index` as a string of `rolls` die faces, each from 1 to 6.

    The index is read as a base-6 number with the most significant digit
    first, and every digit is shifted up by one: with five rolls, index 0
    gives "11111" and index 7775 gives "66666".

    Args:
      index: Zero-based position of the word in the list.
      rolls: Number of die faces to produce.

    Returns:
      A string with one face character per roll.
    """
    faces = []
    remaining = index
    # Digits come out least significant first, so they are reversed at the end.
    for _ in range(rolls):
        remaining, digit = divmod(remaining, 6)
        faces.append(str(digit + 1))
    return "".join(reversed(faces))
def main(args):
    """Select the best-scoring words from the input and write the word list.

    Reads `args.input` using `args.encoding`, scores every word through
    `parse_words`, keeps the lowest-scoring ones, and writes them in
    alphabetical order to `args.output` as UTF-8. A one-line summary is
    printed to stderr.

    Args:
      args: Parsed command-line namespace. The attributes read are `count`,
        `rolls`, `input`, `encoding`, `difficulty`, `ascii` and `output`.
        If `count` is positive, a plain list of that many words is written;
        otherwise a diceware list with `6 ** rolls` entries is written, each
        line holding the dice sequence, a tab, and the word.
    """
    count = args.count if args.count > 0 else 6 ** args.rolls

    with io.open(args.input, "r", encoding=args.encoding) as file:
        words = parse_words(file, args.difficulty, args.ascii)

    # Lowest score first; the selection is then put in alphabetical order.
    words = sorted(words, key=lambda w: w.score)
    best = sorted(words[:count], key=lambda w: w.word)

    # The input may hold fewer usable words than requested. Report and average
    # over what was actually selected instead of the requested count.
    selected = len(best)
    if selected < count:
        print(
            "Warning: only {} usable words found, but {} were requested; "
            "the output list will be incomplete.".format(selected, count),
            file=sys.stderr)

    # Avoid dividing by zero when nothing at all was selected.
    divisor = max(selected, 1)
    print(
        "Selected the best {} out of {} words. "
        "Average length: {:0.3}, average frequency class: {:0.3}."
        "".format(
            selected, len(words),
            sum(len(w.word) for w in best) / divisor,
            sum(w.freq_class for w in best) / divisor),
        file=sys.stderr)

    with io.open(args.output, "w", encoding="utf-8") as file:
        if args.count > 0:
            for word in best:
                print(word.word, file=file)
        else:
            for i, word in enumerate(best):
                print(
                    "{}\t{}".format(dice_rolls(i, args.rolls), word.word),
                    file=file)
# Script entry point: define the command-line interface, validate the parsed
# values, and hand them to main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=Formatter,
        description=__doc__)
    parser.add_argument(
        "--input", "-i", type=str,
        default="derewo-v-ww-bll-320000g-2012-12-31-1.0.txt",
        help="Input filename.")
    parser.add_argument(
        "--output", "-o", type=str,
        default="wordlist_de.txt",
        help="Output filename.")
    parser.add_argument(
        "--rolls", "-r", type=int, default=5,
        help="Output diceware list for this number of dice rolls.")
    parser.add_argument(
        "--count", "-c", type=int, default=0,
        help="Output plain word list with this number of words. If > 0, "
             "overrides --rolls.")
    parser.add_argument(
        "--difficulty", "-d", type=float, default=.4,
        help="Difficulty of the vocabulary in the range of 0 (uses the most "
             "common words) to 1 (uses the shortest words).")
    parser.add_argument(
        "--ascii", "-a", action="store_true",
        help="Convert ä, ö, ü, ß to ae, oe, ue, ss.")
    parser.add_argument(
        "--encoding", "-e", type=str, default="iso-8859-15",
        help="Input file encoding. Output file will have utf-8 encoding.")
    args = parser.parse_args()

    # Invalid values are user errors, so they go through parser.error(), which
    # prints the usage text with the message and exits, instead of surfacing as
    # an uncaught exception with a traceback.
    if not (args.count > 0 or args.rolls > 0):
        parser.error("Need --count or --rolls.")
    if not 0 <= args.difficulty <= 1:
        parser.error("--difficulty needs to be between 0 and 1, inclusive.")

    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment