Created
September 21, 2019 09:41
-
-
Save snowwm/b8c0b89f7e74c5c21588550145cf948f to your computer and use it in GitHub Desktop.
Helper script for http://www.newart.ru/oparin/multitable/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys, random, textwrap, re | |
from collections import defaultdict | |
# Source vocabulary file parsed by read_dict().
SRC_FILE = './multitab_vcb.src.ini'
# Output file produced by write_dict().
DST_FILE = './multitab_vcb.ini'
# Text encoding used when writing DST_FILE.
DST_ENCODING = 'cp1251'
# Upper bound on the number of translations stored for a single word.
MAX_TRANSLATIONS = 2
# Upper bound on the number of entries written to DST_FILE.
MAX_ENTRIES = 25  # max 100
# Words longer than this are not used as dictionary keys.
MAX_WORDLEN = 11
class Dictionary(defaultdict):
    """Mapping of word -> list of translations, plus a weight per word.

    The mapping itself is a ``defaultdict(list)``; ``self.weights`` is a
    second mapping (word -> number) whose missing keys read as 1.
    """

    def __init__(self):
        super().__init__(list)
        # A word that was never given an explicit weight counts as weight 1.
        self.weights = defaultdict(lambda: 1)

    def add_entry(self, word, translations, weight=1):
        """Append *translations* to *word* and raise its weight if needed.

        The stored weight is the maximum of the current weight and *weight*.

        Raises:
            AssertionError: if the word would end up with more than
                MAX_TRANSLATIONS translations. The check runs before any
                mutation, so a rejected call leaves the dictionary unchanged,
                and it is raised explicitly so it is not stripped by ``-O``.
        """
        translations = list(translations)
        # .get() does not trigger the default factory, so a rejected word
        # is never inserted as an empty cell.
        current = len(self.get(word, ()))
        if current + len(translations) > MAX_TRANSLATIONS:
            raise AssertionError(f'MAX_TRANSLATIONS exceeded for "{word}"')
        self[word].extend(translations)
        self.weights[word] = max(self.weights[word], weight)

    def del_entry(self, word):
        """Remove *word* together with its weight.

        Raises:
            KeyError: if *word* is not in the dictionary.
        """
        self.pop(word)
        # The weight may be absent if the cell was created by a plain
        # lookup rather than add_entry(); that is not an error.
        self.weights.pop(word, None)
def read_dict():
    """Parse SRC_FILE into an English->Russian and a Russian->English dictionary.

    Line handling:
      * everything from the first ';' onwards is a comment and is dropped;
      * a line starting with '[' is a section header; when it begins with
        '[<digits> ' that number becomes the weight of the following
        entries, otherwise the weight resets to 1;
      * any other line is read as 'en1, en2 = ru1, ru2'.

    Words longer than MAX_WORDLEN are not used as keys (they may still
    appear as translations). Empty words and lines lacking either side of
    the '=' are skipped.

    Returns:
        A tuple ``(e2r, r2e)`` of Dictionary objects.

    Raises:
        AssertionError: propagated from Dictionary.add_entry when a word
            collects more than MAX_TRANSLATIONS translations.
    """
    weight_header = re.compile(r'\[(\d+) ')

    def split_words(text):
        # Comma-separated list; tokens that are blank after stripping are dropped.
        return [w for w in (part.strip() for part in text.split(',')) if w]

    e2r = Dictionary()  # Eng to Rus
    r2e = Dictionary()  # Rus to Eng
    cur_weight = 1
    # NOTE(review): encoding made explicit so parsing does not depend on the
    # platform locale; UTF-8 is assumed for the source file - confirm.
    with open(SRC_FILE, 'r', encoding='utf-8') as src:
        for line in src:
            line = line.partition(';')[0].strip()  # ignore comments
            if not line:
                continue
            if line.startswith('['):
                match = weight_header.match(line)
                cur_weight = int(match.group(1)) if match else 1
                continue
            left, _, right = line.partition('=')
            en = split_words(left)
            ru = split_words(right)
            if not en or not ru:
                # Malformed entry (no '=' or one side empty): nothing to pair up.
                continue
            for ew in en:
                if len(ew) <= MAX_WORDLEN:
                    e2r.add_entry(ew, ru, cur_weight)
            for rw in ru:
                if len(rw) <= MAX_WORDLEN:
                    r2e.add_entry(rw, en, cur_weight)
    return e2r, r2e
def write_dict(d):
    """Write up to MAX_ENTRIES randomly chosen entries of *d* to DST_FILE.

    Words are drawn without replacement, with probability proportional to
    their weight. Each entry is written as ``word = tr1,tr2`` with the
    translations lower-cased. The file is encoded as DST_ENCODING and every
    '\\n' written is emitted as '\\r\\n'.

    Note: every chosen word is removed from *d* via ``d.del_entry``, so the
    passed dictionary is modified.
    """
    with open(DST_FILE, 'w', encoding=DST_ENCODING, newline='\r\n') as dst:
        for _ in range(min(MAX_ENTRIES, len(d))):
            words = list(d)
            # Look each weight up by word instead of relying on d and
            # d.weights iterating in the same order with the same length.
            weights = [d.weights[w] for w in words]
            word = random.choices(words, weights)[0]
            translations = ','.join(w.lower() for w in d[word])
            dst.write(f'{word} = {translations}\n')
            # Drop the word so it cannot be drawn twice.
            d.del_entry(word)
def main():
    """Build the dictionaries, pick a direction at random and write the output.

    The optional first command-line argument is the probability (a float)
    of writing the rus->eng dictionary; it defaults to 0.5.
    """
    # r2e probability
    try:
        threshold = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    except ValueError:
        # Exit with a readable message instead of a traceback.
        sys.exit(f'Invalid rus->eng probability: {sys.argv[1]!r} (expected a number)')

    e2r, r2e = read_dict()
    use_r2e = random.random() < threshold

    print(textwrap.dedent(f'''\
        Dictionary Statistics:
        English words: {len(e2r)}
        Russian words: {len(r2e)}
        Writing {'rus->eng' if use_r2e else 'eng->rus'} translation'''))

    write_dict(r2e if use_r2e else e2r)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment