wordwise
import os
import re
import zipfile
from collections import defaultdict

import bs4


def check_path(path, exist=True):
    # Expand and absolutise a path; require that it exists (exist=True) or
    # that it does not exist yet (exist=False).
    abspath = os.path.abspath(os.path.expanduser(path))
    if exist is True and not os.path.exists(abspath):
        raise FileNotFoundError(path)
    elif exist is False and os.path.exists(abspath):
        raise FileExistsError(path)
    return abspath


# Words that should never be annotated: prepositions, pronouns, modals,
# auxiliaries, articles, and a few HTML tokens.
stop_words = {
    # Prepositions
    'aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid',
    'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below',
    'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by',
    'concerning', 'considering', 'despite', 'down', 'during', 'except',
    'excepting', 'excluding', 'following', 'for', 'from', 'in', 'inside',
    'into', 'like', 'minus', 'near', 'of', 'off', 'on', 'onto', 'opposite',
    'outside', 'over', 'past', 'per', 'plus', 'regarding', 'round', 'save',
    'since', 'than', 'through', 'to', 'toward', 'towards', 'under',
    'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'with',
    'within', 'without',
    # Pronouns
    'i', 'he', 'she', 'we', 'they', 'this', 'that', 'these', 'those',
    # Modals
    'can', 'could', 'able', 'may', 'might', 'shall', 'should', 'must', 'will',
    'would',
    # Auxiliaries
    'have', 'be', 'been', 'is', 'are', 'was', 'were',
    # Articles
    'a', 'an', 'the',
    # HTML
    'span', 'id', 'content', 'calibre', 'table', 'of',
}

stop_words = stop_words.union({
    'address', 'alt', 'apos', 'area', 'around', 'at', 'At', 'be', 'block',
    'blockquote', 'body', 'br', 'building', 'by', 'calibre_link', 'calibre',
    'chapter', 'charset', 'Chart', 'cite', 'class', 'collection', 'colspan',
    'combined', 'come', 'content', 'Content', 'css', 'dancing', 'div', 'east',
    'equiv', 'es', 'few', 'figures', 'five', 'flags', 'font', 'for', 'from',
    'gate', 'gif', 'God', 'h1', 'h2', 'h3', 'h4', 'head', 'header', 'headline',
    'holding', 'holm', 'hr', 'href', 'html', 'http', 'id', 'If', 'images',
    'img', 'item', 'labeled', 'learn', 'letter', 'li', 'link', 'list',
    'location', 'man', 'many', 'may', 'men', 'meta', 'more', 'much',
    'newspaper', 'of', 'ol', 'on', 'only', 'ordered', 'paper', 'Paper', 'part',
    'Picture', 'plain', 'postscript', 'preface', 'quarter', 'reads', 'rel',
    'room', 'rooms', 'rp', 'rt', 'ruby', 'sake', 'Scrap', 'scrawls', 'sender',
    'several', 'Several', 'sherlock', 'showing', 'signature', 'single',
    'Sketch', 'some', 'span', 'src', 'Stand', 'standalone', 'story', 'style',
    'stylesheet', 'subheader', 'subject', 'sup', 'surprise', 'surrounding',
    'table', 'td', 'telegram', 'text', 'texts', 'th', 'the', 'three', 'title',
    'to', 'tr', 'twelve', 'type', 'Type', 'ul', 'unreadable', 'us', 'utf',
    'value', 'various', 'very', 'what', 'which', 'will', 'words', 'you',
})

stop_words = stop_words.union({
    # easy words
    'color', 'colour', 'coloured', 'done', 'gone', 'paid', 'tragedy', 'work',
    'working',
})
stop_words = {i.lower() for i in stop_words}

# Tags whose contents are not descended into when adding annotations.
skipped_tags = {
    'table', 'title', 'h1', 'h2', 'li', 'ruby', 'code', 'pre', 'form',
    'select', 'label', 'ul', 'ol', 'script', 'a',
}


def refer_to(key, dictionary):
    # Look the word up under a few progressively normalised variants (as-is,
    # lowercased, hyphens stripped, hyphens replaced by spaces, spaces
    # removed). Return the [meaning, times] entry of the first match and bump
    # its counter, or None for stop words and unknown words.
    keys = list()
    keys.append(key)
    key = key.lower()
    keys.append(key)
    key = key.strip('-')
    keys.append(key)
    key = key.replace('-', ' ')
    keys.append(key)
    key = key.replace(' ', '')
    keys.append(key)
    for key in keys:
        if key in stop_words:
            return
        if key in dictionary:
            dictionary[key][1] += 1
            return dictionary[key]


def process_html_words(string, dictionary, max_times):
    # Split a line of raw HTML bytes on word boundaries (hyphenated words stay
    # whole) and wrap dictionary words in <ruby> annotations, skipping a word
    # once its lookup count reaches max_times.
    data = re.split(rb'(?<!-)\b(?!-)', string)
    for i, word in enumerate(data):
        word = word.decode()
        result = refer_to(word, dictionary)
        if result:
            meaning, times = result
            if max_times > 0 and times >= max_times:
                continue
            data[i] = f"<ruby>{word}<rp>(</rp><rt>{meaning}</rt><rp>)</rp></ruby>".encode()
    return b''.join(data)
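
# Illustration only (the dictionary entry below is made up): with
# dictionary = {'abbey': ['a monastery', 0]} and max_times=3,
#   process_html_words(b'the abbey wall', dictionary, 3)
# yields b'the <ruby>abbey<rp>(</rp><rt>a monastery</rt><rp>)</rp></ruby> wall';
# readers with <ruby> support show the gloss above the word, others fall back
# to "abbey(a monastery)".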


def add_annotations(html_data, dictionary, max_times):
    # Regex-only annotator that works line by line on raw bytes; an
    # alternative to the BeautifulSoup-based add_annotations2 used by main().
    return b'\n'.join([process_html_words(line, dictionary, max_times=max_times) for line in html_data.split(b'\n')])


def process_html_strs(soup, string, dictionary, max_times):
    # Same word splitting as process_html_words, but on a soup text node:
    # returns a list of strings and newly built <ruby> Tag elements.
    data = re.split(r'(?<!-)\b(?!-)', string)
    result_text = list()
    for word in data:
        result_text.append(word)
        result = refer_to(word, dictionary)
        if result:
            meaning, times = result
            if max_times > 0 and times >= max_times:
                continue
            new_word = soup.new_tag('ruby')
            new_word.append(word)
            rp = soup.new_tag('rp')
            rp.append(soup.new_string('('))
            new_word.append(rp)
            rt = soup.new_tag('rt')
            rt.append(soup.new_string(meaning))
            new_word.append(rt)
            rp = soup.new_tag('rp')
            rp.append(soup.new_string(')'))
            new_word.append(rp)
            result_text[-1] = new_word
    return result_text


def dfs(soup, root, dictionary, max_times):
    # Depth-first walk of the parse tree: text nodes are rebuilt with
    # annotations, and the contents of skipped_tags are left untouched.
    contents = root.contents.copy()
    root.clear()
    for child in contents:
        if isinstance(child, bs4.element.NavigableString):
            root.extend(process_html_strs(soup, child, dictionary, max_times))
        else:
            root.append(child)
            if child.name not in skipped_tags:
                dfs(soup, child, dictionary, max_times)


def add_annotations2(data, dictionary, max_times):
    # BeautifulSoup-based annotator used by main(): parse, annotate in place,
    # and serialise back to bytes.
    soup = bs4.BeautifulSoup(data.decode(), 'html.parser')
    dfs(soup, soup, dictionary, max_times)
    result = str(soup)
    return result.encode()
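
# Illustration only (hypothetical call):
#   add_annotations2(b'<p>the abbey</p>', {'abbey': ['a monastery', 0]}, 3)
# returns the same markup with "abbey" wrapped in a <ruby> element, while text
# inside skipped tags such as <h1>, <a> or <table> stays unannotated.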


def refresh_dict(dictionary):
    # Return a copy of the dictionary with every annotation counter reset to 0.
    return {
        key: [meaning, 0]
        for key, (meaning, times) in dictionary.items()
    }


def process_dict(file, sep=None):
    # Load one dictionary file into {word: [meaning, annotation_count]}.
    # The separator defaults to a space for .txt files and a comma for .csv.
    if sep is None:
        sep = {
            'txt': ' ',
            'csv': ',',
        }[file.rsplit('.', 1)[-1].lower()]
    result = defaultdict(list)
    for line in open(check_path(file)):
        splt = [i.strip() for i in line.split(sep, 1)]
        if len(splt) != 2:
            continue
        key, value = splt
        if len(key) <= 1:
            continue
        result[key.lower()].append(value)
    result = {k: ['\n'.join(v), 0] for k, v in result.items()}
    return result
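
# Assumed dictionary format (the word lists themselves are not part of this
# gist): one entry per line, the word first, then the separator, then its
# meaning, e.g. in a .txt file
#   abandon to give up completely
#   abbey a monastery of monks
# and "word,meaning" in a .csv file. Lines that do not split into two parts
# and single-character keys are ignored; repeated keys get their meanings
# joined with newlines.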


def get_dicts(path_list):
    # Merge several dictionary files; earlier paths take precedence over
    # later ones.
    result = dict()
    for path in reversed(path_list):
        result.update(process_dict(path))
    return result


def main(book_path, save_path, *dict_list, max_times=3, refresh_per_html=False):
    '''
    Add <ruby> annotations to the English words of zip-based ebooks such as
    htmlz or epub.

    Dictionaries can be found at https://github.com/mahavivo/english-wordlists
    A zip-based ebook can be converted from epub, mobi, azw3, ... with calibre.
    '''
    dictionary = get_dicts(dict_list)
    book = zipfile.ZipFile(check_path(book_path))
    savebook = zipfile.ZipFile(check_path(save_path, False), 'w')
    for file in book.filelist:
        data = book.read(file)
        if file.filename.lower().rsplit('.', 1)[-1] in {'html', 'xhtml'}:
            if refresh_per_html:
                # refresh_dict returns a new dict, so keep the assignment for
                # the per-file counter reset to take effect.
                dictionary = refresh_dict(dictionary)
            data = add_annotations2(data, dictionary, max_times=max_times)
        savebook.writestr(file, data)
    book.close()
    savebook.close()


if __name__ == '__main__':
    import fire
    fire.Fire(main)
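
# Example invocation (a sketch; assumes this file is saved as wordwise.py,
# that the fire and beautifulsoup4 packages are installed, and that the
# word-list file names below are placeholders):
#   python wordwise.py book.htmlz book_annotated.htmlz dict1.txt dict2.csv --max_times=3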