wordwise
import os
import re
import zipfile
from collections import defaultdict

import bs4


def check_path(path, exist=True):
    # Expand and absolutise a path; require that it exists (exist=True) or
    # that it does not exist yet (exist=False).
    abspath = os.path.abspath(os.path.expanduser(path))
    if exist is True and not os.path.exists(abspath):
        raise FileNotFoundError(path)
    elif exist is False and os.path.exists(abspath):
        raise FileExistsError(path)
    return abspath


# Words that should never be annotated: prepositions, pronouns, modals,
# auxiliaries, articles, and a few HTML tokens.
stop_words = {
    # Prepositions
    'aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid',
    'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below',
    'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by',
    'concerning', 'considering', 'despite', 'down', 'during', 'except',
    'excepting', 'excluding', 'following', 'for', 'from', 'in', 'inside',
    'into', 'like', 'minus', 'near', 'of', 'off', 'on', 'onto', 'opposite',
    'outside', 'over', 'past', 'per', 'plus', 'regarding', 'round', 'save',
    'since', 'than', 'through', 'to', 'toward', 'towards', 'under',
    'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'with',
    'within', 'without',
    # Pronouns
    'i', 'he', 'she', 'we', 'they', 'this', 'that', 'these', 'those',
    # Modals
    'can', 'could', 'able', 'may', 'might', 'shall', 'should', 'must', 'will',
    'would',
    # Auxiliaries
    'have', 'be', 'been', 'is', 'are', 'was', 'were',
    # Articles
    'a', 'an', 'the',
    # HTML
    'span', 'id', 'content', 'calibre', 'table', 'of',
}

stop_words = stop_words.union({
    'address', 'alt', 'apos', 'area', 'around', 'at', 'At', 'be', 'block',
    'blockquote', 'body', 'br', 'building', 'by', 'calibre_link', 'calibre',
    'chapter', 'charset', 'Chart', 'cite', 'class', 'collection', 'colspan',
    'combined', 'come', 'content', 'Content', 'css', 'dancing', 'div', 'east',
    'equiv', 'es', 'few', 'figures', 'five', 'flags', 'font', 'for', 'from',
    'gate', 'gif', 'God', 'h1', 'h2', 'h3', 'h4', 'head', 'header', 'headline',
    'holding', 'holm', 'hr', 'href', 'html', 'http', 'id', 'If', 'images',
    'img', 'item', 'labeled', 'learn', 'letter', 'li', 'link', 'list',
    'location', 'man', 'many', 'may', 'men', 'meta', 'more', 'much',
    'newspaper', 'of', 'ol', 'on', 'only', 'ordered', 'paper', 'Paper', 'part',
    'Picture', 'plain', 'postscript', 'preface', 'quarter', 'reads', 'rel',
    'room', 'rooms', 'rp', 'rt', 'ruby', 'sake', 'Scrap', 'scrawls', 'sender',
    'several', 'Several', 'sherlock', 'showing', 'signature', 'single',
    'Sketch', 'some', 'span', 'src', 'Stand', 'standalone', 'story', 'style',
    'stylesheet', 'subheader', 'subject', 'sup', 'surprise', 'surrounding',
    'table', 'td', 'telegram', 'text', 'texts', 'th', 'the', 'three', 'title',
    'to', 'tr', 'twelve', 'type', 'Type', 'ul', 'unreadable', 'us', 'utf',
    'value', 'various', 'very', 'what', 'which', 'will', 'words', 'you',
})

stop_words = stop_words.union({
    # easy words
    'color', 'colour', 'coloured', 'done', 'gone', 'paid', 'tragedy', 'work',
    'working',
})
stop_words = {i.lower() for i in stop_words}

# Tags whose contents are not descended into when adding annotations.
skipped_tags = {
    'table', 'title', 'h1', 'h2', 'li', 'ruby', 'code', 'pre', 'form',
    'select', 'label', 'ul', 'ol', 'script', 'a',
}


def refer_to(key, dictionary):
    # Look the word up under a few progressively normalised variants (as-is,
    # lowercased, hyphens stripped, hyphens replaced by spaces, spaces
    # removed). Return the [meaning, times] entry of the first match and bump
    # its counter, or None for stop words and unknown words.
    keys = list()
    keys.append(key)
    key = key.lower()
    keys.append(key)
    key = key.strip('-')
    keys.append(key)
    key = key.replace('-', ' ')
    keys.append(key)
    key = key.replace(' ', '')
    keys.append(key)
    for key in keys:
        if key in stop_words:
            return
        if key in dictionary:
            dictionary[key][1] += 1
            return dictionary[key]


def process_html_words(string, dictionary, max_times):
    # Split a line of raw HTML bytes on word boundaries (hyphenated words stay
    # whole) and wrap dictionary words in <ruby> annotations, skipping a word
    # once its lookup count reaches max_times.
    data = re.split(rb'(?<!-)\b(?!-)', string)
    for i, word in enumerate(data):
        word = word.decode()
        result = refer_to(word, dictionary)
        if result:
            meaning, times = result
            if max_times > 0 and times >= max_times:
                continue
            data[i] = f"<ruby>{word}<rp>(</rp><rt>{meaning}</rt><rp>)</rp></ruby>".encode()
    return b''.join(data)
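
# Illustration only (the dictionary entry below is made up): with
# dictionary = {'abbey': ['a monastery', 0]} and max_times=3,
#   process_html_words(b'the abbey wall', dictionary, 3)
# yields b'the <ruby>abbey<rp>(</rp><rt>a monastery</rt><rp>)</rp></ruby> wall';
# readers with <ruby> support show the gloss above the word, others fall back
# to "abbey(a monastery)".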


def add_annotations(html_data, dictionary, max_times):
    # Regex-only annotator that works line by line on raw bytes; an
    # alternative to the BeautifulSoup-based add_annotations2 used by main().
    return b'\n'.join([process_html_words(line, dictionary, max_times=max_times) for line in html_data.split(b'\n')])


def process_html_strs(soup, string, dictionary, max_times):
    # Same word splitting as process_html_words, but on a soup text node:
    # returns a list of strings and newly built <ruby> Tag elements.
    data = re.split(r'(?<!-)\b(?!-)', string)
    result_text = list()
    for word in data:
        result_text.append(word)
        result = refer_to(word, dictionary)
        if result:
            meaning, times = result
            if max_times > 0 and times >= max_times:
                continue
            new_word = soup.new_tag('ruby')
            new_word.append(word)
            rp = soup.new_tag('rp')
            rp.append(soup.new_string('('))
            new_word.append(rp)
            rt = soup.new_tag('rt')
            rt.append(soup.new_string(meaning))
            new_word.append(rt)
            rp = soup.new_tag('rp')
            rp.append(soup.new_string(')'))
            new_word.append(rp)
            result_text[-1] = new_word
    return result_text


def dfs(soup, root, dictionary, max_times):
    # Depth-first walk of the parse tree: text nodes are rebuilt with
    # annotations, and the contents of skipped_tags are left untouched.
    contents = root.contents.copy()
    root.clear()
    for child in contents:
        if isinstance(child, bs4.element.NavigableString):
            root.extend(process_html_strs(soup, child, dictionary, max_times))
        else:
            root.append(child)
            if child.name not in skipped_tags:
                dfs(soup, child, dictionary, max_times)


def add_annotations2(data, dictionary, max_times):
    # BeautifulSoup-based annotator used by main(): parse, annotate in place,
    # and serialise back to bytes.
    soup = bs4.BeautifulSoup(data.decode(), 'html.parser')
    dfs(soup, soup, dictionary, max_times)
    result = str(soup)
    return result.encode()
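
# Illustration only (hypothetical call):
#   add_annotations2(b'<p>the abbey</p>', {'abbey': ['a monastery', 0]}, 3)
# returns the same markup with "abbey" wrapped in a <ruby> element, while text
# inside skipped tags such as <h1>, <a> or <table> stays unannotated.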


def refresh_dict(dictionary):
    # Return a copy of the dictionary with every annotation counter reset to 0.
    return {
        key: [meaning, 0]
        for key, (meaning, times) in dictionary.items()
    }


def process_dict(file, sep=None):
    # Load one dictionary file into {word: [meaning, annotation_count]}.
    # The separator defaults to a space for .txt files and a comma for .csv.
    if sep is None:
        sep = {
            'txt': ' ',
            'csv': ',',
        }[file.rsplit('.', 1)[-1].lower()]
    result = defaultdict(list)
    for line in open(check_path(file)):
        splt = [i.strip() for i in line.split(sep, 1)]
        if len(splt) != 2:
            continue
        key, value = splt
        if len(key) <= 1:
            continue
        result[key.lower()].append(value)
    result = {k: ['\n'.join(v), 0] for k, v in result.items()}
    return result
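
# Assumed dictionary format (the word lists themselves are not part of this
# gist): one entry per line, the word first, then the separator, then its
# meaning, e.g. in a .txt file
#   abandon to give up completely
#   abbey a monastery of monks
# and "word,meaning" in a .csv file. Lines that do not split into two parts
# and single-character keys are ignored; repeated keys get their meanings
# joined with newlines.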


def get_dicts(path_list):
    # Merge several dictionary files; earlier paths take precedence over
    # later ones.
    result = dict()
    for path in reversed(path_list):
        result.update(process_dict(path))
    return result


def main(book_path, save_path, *dict_list, max_times=3, refresh_per_html=False):
    '''
    Add <ruby> annotations to the English words of zip-based ebooks such as
    htmlz or epub.

    Dictionaries can be found at https://github.com/mahavivo/english-wordlists
    A zip-based ebook can be converted from epub, mobi, azw3, ... with calibre.
    '''
    dictionary = get_dicts(dict_list)
    book = zipfile.ZipFile(check_path(book_path))
    savebook = zipfile.ZipFile(check_path(save_path, False), 'w')
    for file in book.filelist:
        data = book.read(file)
        if file.filename.lower().rsplit('.', 1)[-1] in {'html', 'xhtml'}:
            if refresh_per_html:
                # refresh_dict returns a new dict, so keep the assignment for
                # the per-file counter reset to take effect.
                dictionary = refresh_dict(dictionary)
            data = add_annotations2(data, dictionary, max_times=max_times)
        savebook.writestr(file, data)
    book.close()
    savebook.close()


if __name__ == '__main__':
    import fire
    fire.Fire(main)
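
# Example invocation (a sketch; assumes this file is saved as wordwise.py,
# that the fire and beautifulsoup4 packages are installed, and that the
# word-list file names below are placeholders):
#   python wordwise.py book.htmlz book_annotated.htmlz dict1.txt dict2.csv --max_times=3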