jonaballe · September 5, 2020 17:25
diff --git a/wuerfelware.py b/wuerfelware.py
 #!/usr/bin/python3
 """Better diceware lists for German

 Diceware (http://world.std.com/~reinhold/diceware.html) is a great way to
 generate random, yet memorizable passphrases.

 This Python script is designed to read a "Grundformliste" (list of base forms)
 of German words in the format published by the Institut für deutsche Sprache
 (Institute for German Language). It parses the list, selects the best words, and
 outputs a word list compatible with diceware.

 The "best" words are defined as words that are at the same time short and
 common, so that they are easy to remember but also easy to type. You can
 trade off between shortness and commonality using the `--difficulty` parameter.

 To make a list, you can download a published Grundformliste from the Institute's
 web site (http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html) and
 unzip it. By default, the script expects an input file in the working directory
 named derewo-v-ww-bll-320000g-2012-12-31-1.0.txt and outputs to wordlist_de.txt.
 """

 import argparse
 import collections
 import io
 import math
 import re
 import sys


 # Record for one parsed list entry: the spelling, its frequency class, and the
 # score computed from both.
 Word = collections.namedtuple("Word", ["word", "freq_class", "score"])


 class Formatter(argparse.ArgumentDefaultsHelpFormatter,
                 argparse.RawDescriptionHelpFormatter):
   """Help formatter that shows defaults and keeps the description verbatim."""


 def parse_words(file, weight, to_ascii):
   """Parses word/frequency-class lines into one `Word` per distinct spelling.

   Args:
     file: Iterable of text lines, each expected to start with a word, a single
       space, and an integer frequency class. Lines that do not fit are skipped.
     weight: Factor applied to the word length in the score; the frequency
       class is weighted by `1 - weight`.
     to_ascii: If true, ä, ö, ü, ß are replaced by ae, oe, ue, ss before
       matching, so spellings and lengths refer to the converted word.

   Returns:
     The values view of a dict holding the retained `Word` tuples.
   """
   # Only Latin letters, umlauts, ß, and hyphens are accepted, so that every
   # listed word can be typed on a German keyboard.
   entry_pattern = re.compile(r"([a-zäöüß\-]+) (\d+)")
   # Parenthesized alternatives such as "(e)" or "(er,e,es)".
   paren_pattern = re.compile(r"\(([a-zäöüß]+)[a-zäöüß,]*?\)")
   ascii_map = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

   best_by_spelling = {}
   for raw in file:
     # Case is dropped entirely: distinguishing it would make a remembered
     # word ambiguous.
     text = raw.lower()
     if to_ascii:
       text = text.translate(ascii_map)
     # Keep only the first alternative: "ein(e)" -> "eine",
     # "welch(er,e,es)" -> "welcher".
     text = paren_pattern.sub(r"\1", text)
     # The entry has to start at the beginning of the line.
     found = entry_pattern.match(text)
     if found is None:
       continue
     spelling, rank = found.group(1), int(found.group(2))
     # A spelling seen before is only replaced by a strictly more common one
     # (lower frequency class).
     known = best_by_spelling.get(spelling)
     if known is not None and known.freq_class <= rank:
       continue
     score = weight * len(spelling) + (1 - weight) * rank
     best_by_spelling[spelling] = Word(spelling, rank, score)
   return best_by_spelling.values()


 def dice_rolls(index, rolls):
   """Writes `index` as `rolls` base-6 digits using the die faces 1 to 6.

   The most significant digit comes first; anything in `index` beyond what
   `rolls` digits can hold is dropped.
   """
   faces = []
   for _ in range(rolls):
     # Peel off the least significant base-6 digit; faces start at 1, not 0.
     index, digit = divmod(index, 6)
     faces.append(str(digit + 1))
   return "".join(reversed(faces))


 def main(args):
   """Selects the best-scoring words and writes the word list to `args.output`.

   Progress statistics (and a warning if the input yields fewer words than
   requested) are printed to stderr.

   Args:
     args: Parsed command-line namespace providing `input`, `output`,
       `encoding`, `difficulty`, `ascii`, `count`, and `rolls`.
   """
   # A positive --count requests a plain list; otherwise one word is needed for
   # every possible outcome of `rolls` dice.
   plain_list = args.count > 0
   count = args.count if plain_list else 6 ** args.rolls

   with io.open(args.input, "r", encoding=args.encoding) as file:
     words = parse_words(file, args.difficulty, args.ascii)

   # Lowest score first, then put the selection into alphabetical order.
   words = sorted(words, key=lambda w: w.score)
   best = sorted(words[:count], key=lambda w: w.word)

   # The input may contain fewer usable words than requested, so report and
   # average over what was actually selected rather than over `count`.
   selected = len(best)
   if selected < count:
     print(
         "Warning: only {} usable words found, but {} were requested; "
         "the output list is incomplete.".format(selected, count),
         file=sys.stderr)
   divisor = max(selected, 1)  # Avoids dividing by zero on an empty selection.

   print(
       "Selected the best {} out of {} words. "
       "Average length: {:0.3}, average frequency class: {:0.3}."
       "".format(
           selected, len(words),
           sum(len(w.word) for w in best) / divisor,
           sum(w.freq_class for w in best) / divisor),
       file=sys.stderr)

   with io.open(args.output, "w", encoding="utf-8") as file:
     if plain_list:
       for word in best:
         print(word.word, file=file)
     else:
       # Diceware format: dice sequence, tab, word.
       for i, word in enumerate(best):
         print("{}\t{}".format(dice_rolls(i, args.rolls), word.word), file=file)


 if __name__ == "__main__":
   # Command-line options as (flags, add_argument keywords), in help order.
   option_specs = (
       (("--input", "-i"), dict(
           type=str,
           default="derewo-v-ww-bll-320000g-2012-12-31-1.0.txt",
           help="Input filename.")),
       (("--output", "-o"), dict(
           type=str,
           default="wordlist_de.txt",
           help="Output filename.")),
       (("--rolls", "-r"), dict(
           type=int, default=5,
           help="Output diceware list for this number of dice rolls.")),
       (("--count", "-c"), dict(
           type=int, default=0,
           help="Output plain word list with this number of words. If > 0, "
                "overrides --rolls.")),
       (("--difficulty", "-d"), dict(
           type=float, default=0.4,
           help="Difficulty of the vocabulary in the range of 0 (uses the most "
                "common words) to 1 (uses the shortest words).")),
       (("--ascii", "-a"), dict(
           action="store_true",
           help="Convert ä, ö, ü, ß to ae, oe, ue, ss.")),
       (("--encoding", "-e"), dict(
           type=str, default="iso-8859-15",
           help="Input file encoding. Output file will have utf-8 encoding.")),
   )

   parser = argparse.ArgumentParser(
       description=__doc__, formatter_class=Formatter)
   for flags, options in option_specs:
     parser.add_argument(*flags, **options)

   args = parser.parse_args()
   # At least one of the two list sizes has to be positive.
   if args.count <= 0 and args.rolls <= 0:
     raise ValueError("Need --count or --rolls.")
   if not 0 <= args.difficulty <= 1:
     raise ValueError("--difficulty needs to be between 0 and 1, inclusive.")
   main(args)
	#!/usr/bin/python3
	"""Better diceware lists for German

	Diceware (http://world.std.com/~reinhold/diceware.html) is a great way to
	generate random, yet memorizable passphrases.

	This Python script is designed to read a "Grundformliste" (list of base forms)
	of German words in the format published by the Institut für deutsche Sprache
	(Institute for German Language). It parses the list, selects the best words, and
	outputs a word list compatible with diceware.

	The "best" words are defined as words that are at the same time short and
	common, so that they are easy to remember but also easy to type. You can
	trade off between shortness and commonality using the `--difficulty` parameter.

	To make a list, you can download a published Grundformliste from the Institute's
	web site (http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html) and
	unzip it. By default, the script expects an input file in the working directory
	named derewo-v-ww-bll-320000g-2012-12-31-1.0.txt and outputs to wordlist_de.txt.
	"""

	import argparse
	import collections
	import io
	import math
	import re
	import sys


	# Record for one parsed list entry: the spelling, its frequency class, and the
	# score computed from both.
	Word = collections.namedtuple("Word", ["word", "freq_class", "score"])


	class Formatter(argparse.ArgumentDefaultsHelpFormatter,
	                argparse.RawDescriptionHelpFormatter):
	  """Help formatter that shows defaults and keeps the description verbatim."""


	def parse_words(file, weight, to_ascii):
	  """Reads text lines and keeps one `Word` per distinct spelling.

	  Args:
	    file: Iterable of text lines, each expected to start with a word, a single
	      space, and an integer frequency class. Lines that do not fit are skipped.
	    weight: Factor applied to the word length in the score; the frequency
	      class is weighted by `1 - weight`.
	    to_ascii: If true, ä, ö, ü, ß are replaced by ae, oe, ue, ss before
	      matching, so spellings and lengths refer to the converted word.

	  Returns:
	    The values view of a dict holding the retained `Word` tuples.
	  """

	  # Regex that matches word and frequency class. Note: we only allow German
	  # letters (Latin letters, umlauts, and ß) plus hyphens to make sure any
	  # potential user will know how to type the word on a German keyboard.
	  line_regex = re.compile(r"([a-zäöüß\-]+) (\d+)")

	  umlaut_table = str.maketrans({
	      "ä": "ae",
	      "ö": "oe",
	      "ü": "ue",
	      "ß": "ss",
	  })
	  alternatives_regex = re.compile(r"\(([a-zäöüß]+)[a-zäöüß,]*?\)")

	  words = {}
	  for line in file:
	    # First, bring everything to lower case. We don't want to distinguish
	    # between upper and lower case, because it can cause ambiguities when
	    # remembering the word.
	    line = line.lower()
	    # If ASCII conversion is requested, map according to umlaut_table.
	    if to_ascii:
	      line = line.translate(umlaut_table)
	    # Then, if the line contains several alternatives in parentheses, simply
	    # pick the first one. For example: "ein(e) -> eine", "welch(er,e,es) ->
	    # welcher".
	    line = alternatives_regex.sub(r"\1", line)
	    # Match at the beginning of the line.
	    match = line_regex.match(line)
	    if not match:
	      continue
	    word = match.group(1)
	    freq_class = int(match.group(2))
	    # If the spelling already exists, only update it if the new word is more
	    # common.
	    if word not in words or words[word].freq_class > freq_class:
	      words[word] = Word(
	          word,
	          freq_class,
	          weight * len(word) + (1 - weight) * freq_class,
	      )
	  return words.values()


	def dice_rolls(index, rolls):
	  """Writes `index` as `rolls` base-6 digits using the die faces 1 to 6.

	  The most significant digit comes first; anything in `index` beyond what
	  `rolls` digits can hold is dropped.
	  """
	  string = [None] * rolls
	  # Fill from the last position backwards, least significant digit first.
	  for roll in range(rolls - 1, -1, -1):
	    string[roll] = str(index % 6 + 1)
	    index //= 6
	  return "".join(string)


	def main(args):
	  """Selects the best-scoring words and writes the word list to `args.output`.

	  Progress statistics (and a warning if the input yields fewer words than
	  requested) are printed to stderr.

	  Args:
	    args: Parsed command-line namespace providing `input`, `output`,
	      `encoding`, `difficulty`, `ascii`, `count`, and `rolls`.
	  """
	  # A positive --count requests a plain list; otherwise one word is needed for
	  # every possible outcome of `rolls` dice.
	  if args.count > 0:
	    count = args.count
	  else:
	    count = 6 ** args.rolls

	  with io.open(args.input, "r", encoding=args.encoding) as file:
	    words = parse_words(file, args.difficulty, args.ascii)

	  # Lowest score first, then put the selection into alphabetical order.
	  words = sorted(words, key=lambda w: w.score)
	  best = sorted(words[:count], key=lambda w: w.word)

	  # The input may contain fewer usable words than requested, so report and
	  # average over what was actually selected rather than over `count`.
	  selected = len(best)
	  if selected < count:
	    print(
	        "Warning: only {} usable words found, but {} were requested; "
	        "the output list is incomplete.".format(selected, count),
	        file=sys.stderr)
	  divisor = max(selected, 1)  # Avoids dividing by zero on an empty selection.

	  print(
	      "Selected the best {} out of {} words. "
	      "Average length: {:0.3}, average frequency class: {:0.3}."
	      "".format(
	          selected, len(words),
	          sum(len(w.word) for w in best) / divisor,
	          sum(w.freq_class for w in best) / divisor),
	      file=sys.stderr)

	  with io.open(args.output, "w", encoding="utf-8") as file:
	    if args.count > 0:
	      for word in best:
	        print(word.word, file=file)
	    else:
	      # Diceware format: dice sequence, tab, word.
	      for i, word in enumerate(best):
	        print("{}\t{}".format(dice_rolls(i, args.rolls), word.word), file=file)


	if __name__ == "__main__":
	  # The module docstring doubles as the --help description.
	  parser = argparse.ArgumentParser(
	      formatter_class=Formatter,
	      description=__doc__)

	  parser.add_argument(
	      "--input", "-i", type=str,
	      default="derewo-v-ww-bll-320000g-2012-12-31-1.0.txt",
	      help="Input filename.")
	  parser.add_argument(
	      "--output", "-o", type=str,
	      default="wordlist_de.txt",
	      help="Output filename.")
	  parser.add_argument(
	      "--rolls", "-r", type=int, default=5,
	      help="Output diceware list for this number of dice rolls.")
	  parser.add_argument(
	      "--count", "-c", type=int, default=0,
	      help="Output plain word list with this number of words. If > 0, "
	           "overrides --rolls.")
	  parser.add_argument(
	      "--difficulty", "-d", type=float, default=.4,
	      help="Difficulty of the vocabulary in the range of 0 (uses the most "
	           "common words) to 1 (uses the shortest words).")
	  parser.add_argument(
	      "--ascii", "-a", action="store_true",
	      help="Convert ä, ö, ü, ß to ae, oe, ue, ss.")
	  parser.add_argument(
	      "--encoding", "-e", type=str, default="iso-8859-15",
	      help="Input file encoding. Output file will have utf-8 encoding.")

	  args = parser.parse_args()
	  # At least one of the two list sizes has to be positive.
	  if not (args.count > 0 or args.rolls > 0):
	    raise ValueError("Need --count or --rolls.")
	  if not 0 <= args.difficulty <= 1:
	    raise ValueError("--difficulty needs to be between 0 and 1, inclusive.")
	  main(args)