Skip to content

Instantly share code, notes, and snippets.

@the0demiurge
Created February 14, 2022 11:02
Show Gist options
  • Save the0demiurge/b88fd09221d205beb2b8fd5cdc17cc6f to your computer and use it in GitHub Desktop.
Save the0demiurge/b88fd09221d205beb2b8fd5cdc17cc6f to your computer and use it in GitHub Desktop.
wordwise
import os
import re
import zipfile
from collections import defaultdict
import bs4
def check_path(path, exist=True):
    """Resolve *path* to an absolute path, optionally checking its existence.

    ``~`` is expanded before the path is made absolute.

    Args:
        path: The path to resolve.
        exist: ``True`` requires the path to exist, ``False`` requires it
            not to exist; any other value disables the check.

    Returns:
        The absolute, user-expanded path.

    Raises:
        FileNotFoundError: ``exist`` is ``True`` and the path is missing.
        FileExistsError: ``exist`` is ``False`` and the path is present.
    """
    resolved = os.path.abspath(os.path.expanduser(path))
    if exist is True:
        if not os.path.exists(resolved):
            raise FileNotFoundError(path)
    elif exist is False:
        if os.path.exists(resolved):
            raise FileExistsError(path)
    return resolved
# Words that must never receive an annotation.  The set is assembled from
# three groups and finally normalized to lower case, because lookups try the
# lower-cased spelling of each word.
#
# NOTE: every entry needs its own trailing comma.  Two adjacent string
# literals without a comma are silently concatenated by Python, which drops
# both words from the set and adds one meaningless entry instead.
stop_words = {
    # Prepositions
    'aboard',
    'about',
    'above',
    'across',
    'after',
    'against',
    'along',
    'amid',
    'among',
    'anti',
    'around',
    'as',
    'at',
    'before',
    'behind',
    'below',
    'beneath',
    'beside',
    'besides',
    'between',
    'beyond',
    'but',
    'by',
    'concerning',
    'considering',
    'despite',
    'down',
    'during',
    'except',
    'excepting',
    'excluding',
    'following',
    'for',
    'from',
    'in',
    'inside',
    'into',
    'like',
    'minus',
    'near',
    'of',
    'off',
    'on',
    'onto',
    'opposite',
    'outside',
    'over',
    'past',
    'per',
    'plus',
    'regarding',
    'round',
    'save',
    'since',
    'than',
    'through',
    'to',
    'toward',
    'towards',
    'under',
    'underneath',
    'unlike',
    'until',
    'up',
    'upon',
    'versus',
    'via',
    'with',
    'within',
    'without',
    # Pronouns
    'i',
    'he',
    'she',
    'we',
    'they',
    'this',
    'that',
    'these',
    'those',
    # Modals
    'can',
    'could',
    'able',
    'may',
    'might',
    'shall',
    'should',
    'must',
    'will',
    'would',
    # Auxiliaries
    'have',
    'be',
    'been',
    'is',
    'are',
    'was',
    'were',
    # Articles
    'a',
    'an',
    'the',
    # HTML
    'span',
    'id',
    'content',
    'calibre',
    'table',
}

# HTML/XHTML markup vocabulary (tag and attribute names) plus assorted extra
# words that should stay unannotated.
stop_words |= {
    'address',
    'alt',
    'apos',
    'area',
    'around',
    'at',
    'At',
    'be',
    'block',
    'blockquote',
    'body',
    'br',
    'building',
    'by',
    'calibre_link',
    'calibre',
    'chapter',
    'charset',
    'Chart',
    'cite',
    'class',
    'collection',
    'colspan',
    'combined',
    'come',
    'content',
    'Content',
    'css',
    'dancing',
    'div',
    'east',
    'equiv',
    'es',
    'few',
    'figures',
    'five',
    'flags',
    'font',
    'for',
    'from',
    'gate',
    'gif',
    'God',
    'h1',
    'h2',
    'h3',
    'h4',
    'head',
    'header',
    'headline',
    'holding',
    'holm',
    'hr',
    'href',
    'html',
    'http',
    'id',
    'If',
    'images',
    'img',
    'item',
    'labeled',
    'learn',
    'letter',
    'li',
    'link',
    'list',
    'location',
    'man',
    'many',
    'may',
    'men',
    'meta',
    'more',
    'much',
    'newspaper',
    'of',
    'ol',
    'on',
    'only',
    'ordered',
    'paper',
    'Paper',
    'part',
    'Picture',
    'plain',
    'postscript',
    'preface',
    'quarter',
    'reads',
    'rel',
    'room',
    'rooms',
    'rp',
    'rt',
    'ruby',
    'sake',
    'Scrap',
    'scrawls',
    'sender',
    'several',
    'Several',
    'sherlock',
    'showing',
    'signature',
    'single',
    'Sketch',
    'some',
    'span',
    'src',
    'Stand',
    'standalone',
    'story',
    'style',
    'stylesheet',
    'subheader',
    'subject',
    'sup',
    'surprise',
    'surrounding',
    'table',
    'td',
    'telegram',
    'text',
    'texts',
    'th',
    'the',
    'three',
    'title',
    'to',
    'tr',
    'twelve',
    'type',
    'Type',
    'ul',
    'unreadable',
    'us',
    'utf',
    'value',
    'various',
    'very',
    'what',
    'which',
    'will',
    'words',
    'you',
}

stop_words |= {
    # easy words
    'color',
    'colour',
    'coloured',
    'done',
    'gone',
    'paid',
    'tragedy',
    'work',
    'working',
}

# Some entries above are capitalized; store everything in lower case.
stop_words = {word.lower() for word in stop_words}
# Tags whose contents are left untouched: the tree walk does not descend
# into an element whose name is listed here.
skipped_tags = {
    'table',
    'title',
    'h1',
    'h2',
    'li',
    'ruby',
    'code',
    'pre',
    'form',
    'select',
    'label',
    'ul',
    'ol',
    'script',
    # CSS is not prose either; annotating it would corrupt the stylesheet.
    'style',
    'a',
}
def refer_to(key, dictionary):
    """Look up *key* in *dictionary*, trying progressively normalized spellings.

    The spellings are tried in this order: the key as given, lower-cased,
    with leading/trailing hyphens stripped, with the remaining hyphens
    replaced by spaces, and finally with all spaces removed.

    Args:
        key: The word to look up.
        dictionary: Mapping of word -> mutable ``[meaning, count]`` entry.

    Returns:
        The matching entry (the very object stored in *dictionary*, after
        its count at index 1 has been incremented), or ``None`` if one of
        the spellings is a stop word or no spelling is found.
    """
    lowered = key.lower()
    trimmed = lowered.strip('-')
    spaced = trimmed.replace('-', ' ')
    joined = spaced.replace(' ', '')
    for candidate in (key, lowered, trimmed, spaced, joined):
        # A stop word ends the search even if a later spelling would match.
        if candidate in stop_words:
            return None
        if candidate in dictionary:
            entry = dictionary[candidate]
            entry[1] += 1
            return entry
    return None
def process_html_words(string, dictionary, max_times):
    """Annotate dictionary words in raw HTML bytes with ``<ruby>`` markup.

    The input is split at word boundaries that are not adjacent to a
    hyphen, so hyphenated words stay in one piece.  Every piece found by
    ``refer_to`` is wrapped in a ruby element carrying its meaning.

    Args:
        string: A chunk of HTML as ``bytes``.
        dictionary: Mapping of word -> ``[meaning, count]``; counts are
            updated by ``refer_to``.
        max_times: Maximum number of annotations per word; a value of zero
            or less removes the limit.

    Returns:
        The annotated HTML as ``bytes``.
    """
    from html import escape  # local import keeps the block self-contained

    pieces = re.split(rb'(?<!-)\b(?!-)', string)
    for index, raw in enumerate(pieces):
        word = raw.decode()
        result = refer_to(word, dictionary)
        if not result:
            continue
        meaning, times = result
        # refer_to() has already counted this occurrence, so ``times`` is 1
        # for the first sighting; only occurrences beyond the limit are
        # skipped.
        if max_times > 0 and times > max_times:
            continue
        # The meaning comes from a user-supplied word list; escape it so
        # characters such as '<' or '&' cannot break the surrounding markup.
        gloss = escape(meaning, quote=False)
        pieces[index] = f"<ruby>{word}<rp>(</rp><rt>{gloss}</rt><rp>)</rp></ruby>".encode()
    return b''.join(pieces)
def add_annotations(html_data, dictionary, max_times):
    """Annotate raw HTML bytes line by line.

    Each ``\\n``-separated line of *html_data* is passed through
    ``process_html_words`` and the results are joined with ``\\n`` again.
    """
    annotated_lines = []
    for line in html_data.split(b'\n'):
        annotated_lines.append(
            process_html_words(line, dictionary, max_times=max_times)
        )
    return b'\n'.join(annotated_lines)
def _build_ruby_tag(soup, word, meaning):
    """Return ``<ruby>word<rp>(</rp><rt>meaning</rt><rp>)</rp></ruby>`` as a new tag."""
    ruby = soup.new_tag('ruby')
    ruby.append(word)
    for tag_name, text in (('rp', '('), ('rt', meaning), ('rp', ')')):
        part = soup.new_tag(tag_name)
        part.append(soup.new_string(text))
        ruby.append(part)
    return ruby


def process_html_strs(soup, string, dictionary, max_times):
    """Split a text node into pieces, replacing known words by ruby tags.

    The text is split at word boundaries that are not adjacent to a hyphen,
    so hyphenated words stay in one piece.

    Args:
        soup: The ``BeautifulSoup`` object used to create new tags/strings.
        string: The text to process.
        dictionary: Mapping of word -> ``[meaning, count]``; counts are
            updated by ``refer_to``.
        max_times: Maximum number of annotations per word; a value of zero
            or less removes the limit.

    Returns:
        A list whose items are either plain strings (unchanged pieces) or
        newly created ``<ruby>`` tags, in document order.
    """
    pieces = []
    for word in re.split(r'(?<!-)\b(?!-)', string):
        result = refer_to(word, dictionary)
        if result:
            meaning, times = result
            # refer_to() has already counted this occurrence, so ``times``
            # is 1 for the first sighting; only occurrences beyond the
            # limit are left unannotated.
            if max_times <= 0 or times <= max_times:
                pieces.append(_build_ruby_tag(soup, word, meaning))
                continue
        pieces.append(word)
    return pieces
def dfs(soup, root, dictionary, max_times):
    """Walk the tree below *root* depth-first and annotate its text nodes.

    The children of *root* are detached and re-attached one by one: plain
    text nodes are replaced by the output of ``process_html_strs``, element
    nodes are re-attached as they are and then visited recursively unless
    their tag name is in ``skipped_tags``.

    Args:
        soup: The ``BeautifulSoup`` object used to create new nodes.
        root: The node whose children are processed in place.
        dictionary: Mapping of word -> ``[meaning, count]``.
        max_times: Per-word annotation limit, forwarded unchanged.
    """
    children = root.contents.copy()
    root.clear()
    for child in children:
        if isinstance(child, bs4.element.NavigableString):
            if type(child) is bs4.element.NavigableString:
                root.extend(process_html_strs(soup, child, dictionary, max_times))
            else:
                # Comments, doctypes, CDATA sections, processing
                # instructions and similar special strings are subclasses
                # of NavigableString.  Splitting them and re-adding the
                # pieces as plain text would destroy them in the output,
                # so they are kept verbatim.
                root.append(child)
        else:
            root.append(child)
            if child.name not in skipped_tags:
                dfs(soup, child, dictionary, max_times)
def add_annotations2(data, dictionary, max_times):
    """Annotate an HTML document by walking its parsed tree.

    Args:
        data: The document as ``bytes``.
        dictionary: Mapping of word -> ``[meaning, count]``.
        max_times: Per-word annotation limit, forwarded to ``dfs``.

    Returns:
        The annotated document, serialized and encoded back to ``bytes``.
    """
    markup = data.decode()
    soup = bs4.BeautifulSoup(markup, 'html.parser')
    # The soup object serves both as node factory and as root of the walk.
    dfs(soup, soup, dictionary, max_times)
    return str(soup).encode()
def refresh_dict(dictionary):
    """Return a copy of *dictionary* with every occurrence count reset to 0.

    Each value must be a two-item ``(meaning, count)`` pair.  The input
    mapping is not modified; the result holds fresh ``[meaning, 0]`` lists.
    """
    fresh = {}
    for word, entry in dictionary.items():
        meaning, _count = entry
        fresh[word] = [meaning, 0]
    return fresh
def process_dict(file, sep=None):
    """Load a word list file into a ``{word: [meaning, 0]}`` dictionary.

    Each line is split once at *sep* into a word and its meaning.  Lines
    without a separator and words of a single character are ignored.  Words
    are stored in lower case; if a word occurs on several lines, its
    meanings are joined with newlines.

    Args:
        file: Path of the word list.
        sep: Separator between word and meaning.  When ``None`` it is
            derived from the file extension: a space for ``.txt`` and a
            comma for ``.csv``.

    Returns:
        A dict mapping each word to ``[meaning, 0]``, the zero being the
        initial occurrence count.

    Raises:
        KeyError: *sep* is ``None`` and the extension is neither ``txt``
            nor ``csv``.
        FileNotFoundError: *file* does not exist.
    """
    if sep is None:
        extension = file.rsplit('.', 1)[-1].lower()
        sep = {
            'txt': ' ',
            'csv': ',',
        }[extension]
    meanings = defaultdict(list)
    # NOTE(review): the file is read with the platform default encoding, as
    # before -- confirm whether the word lists should be read as UTF-8.
    with open(check_path(file)) as handle:  # closed even if parsing fails
        for line in handle:
            parts = [part.strip() for part in line.split(sep, 1)]
            if len(parts) != 2:
                continue
            key, value = parts
            if len(key) <= 1:
                continue
            meanings[key.lower()].append(value)
    return {word: ['\n'.join(values), 0] for word, values in meanings.items()}
def get_dicts(path_list):
    """Merge several word lists into one dictionary.

    The paths are loaded with ``process_dict`` in reverse order, so for a
    word present in more than one list the entry from the list that comes
    first in *path_list* wins.
    """
    merged = {}
    for dict_path in reversed(path_list):
        for word, entry in process_dict(dict_path).items():
            merged[word] = entry
    return merged
def main(book_path, save_path, *dict_list, max_times=3, refresh_per_html=False):
    '''
    add annotations for english words of zip-based ebooks such as htmlz, epub, ...

    dictionaries can be found at
    https://github.com/mahavivo/english-wordlists
    zip-based ebook can be converted from epub, mobi, azw3, ... by calibre.

    Args:
        book_path: path of the existing zip-based ebook to read.
        save_path: path of the annotated ebook to write; must not exist yet.
        *dict_list: paths of the word lists (.txt or .csv); when a word is
            in several lists, the list given first wins.
        max_times: per-word annotation limit passed to the annotator.
        refresh_per_html: when true, the per-word occurrence counts are
            reset before each html/xhtml file is processed.

    Raises:
        FileNotFoundError: book_path or a word list does not exist.
        FileExistsError: save_path already exists.
    '''
    dictionary = get_dicts(dict_list)
    # Context managers close both archives even if processing fails.
    with zipfile.ZipFile(check_path(book_path)) as book:
        with zipfile.ZipFile(check_path(save_path, False), 'w') as savebook:
            for file in book.filelist:
                data = book.read(file)
                if file.filename.lower().rsplit('.', 1)[-1] in {'html', 'xhtml'}:
                    if refresh_per_html:
                        # refresh_dict() returns a new dictionary; it has to
                        # be rebound for the reset to take effect.
                        dictionary = refresh_dict(dictionary)
                    data = add_annotations2(data, dictionary, max_times=max_times)
                # Files that are not html/xhtml are copied unchanged.
                savebook.writestr(file, data)
if __name__ == '__main__':
    # Expose main() as a command-line interface; fire is only needed when
    # the file is run as a script, hence the import at this point.
    from fire import Fire

    Fire(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment