Last active
July 26, 2020 23:11
-
-
Save leogao2/8d4662dfb8e58e8c58ef94df5d46413d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import fasttext | |
import re | |
# Load the fastText language-identification model once at import time.
# The path is relative, so "lid.176.bin" is looked up from the current
# working directory.
_lid_model_path = "lid.176.bin"
lid = fasttext.load_model(_lid_model_path)
def listdir(x):
    """Return the entries of directory ``x``, each prefixed with ``x + '/'``.

    The order is whatever ``os.listdir`` yields.
    """
    prefixed = []
    for entry in os.listdir(x):
        prefixed.append(x + '/' + entry)
    return prefixed
def id(x):
    """Identity function: hand back ``x`` unchanged.

    NOTE: defining this at module level shadows the builtin ``id`` for the
    rest of the module.
    """
    same = x
    return same
def mean(x):
    """Return the arithmetic mean of the iterable ``x``, or 0 if it is empty.

    ``x`` is materialized into a list first, so one-shot iterators such as
    ``map`` or ``filter`` objects are accepted.
    """
    values = list(x)
    count = len(values)
    return sum(values) / count if count else 0
def nonzero(x):
    """Return a lazy iterator over the truthy items of ``x``.

    Falsy items (``0``, ``''``, ``None``, empty containers, ...) are dropped.
    ``filter(None, ...)`` is used so the function does not depend on a
    module-level identity helper shadowing the builtin ``id``; with the
    builtin ``id`` every item would be kept, because object ids are truthy.
    """
    return filter(None, x)
def is_letter(x):
    """Return True if ``x`` is an ASCII letter (A-Z or a-z).

    This is a substring test against the alphabet string, so it is meant to
    be called with single characters; non-ASCII letters return False.
    """
    ascii_letters = (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
    )
    return x in ascii_letters
def all_equal(x):
    """Return True if every item of ``x`` compares equal to ``x[0]``.

    ``x`` must support indexing. An empty ``x`` yields True, because
    ``x[0]`` is only read while iterating over the items.
    """
    for item in x:
        if not item == x[0]:
            return False
    return True
# Clean every extracted book under txt/0: keep English books whose lines are
# paragraph-sized, drop noisy paragraphs, strip repeated prefixes, and print
# whatever survives.
for f in listdir('txt/0'):
    # Only the read needs the file handle; close it before processing.
    with open(f) as fin:
        contents = fin.read()

    # Language-identify the first 1024 characters, with newlines flattened to
    # spaces, and skip anything not labelled English.
    if lid.predict(contents.replace('\n', ' ')[:1024])[0][0] != '__label__en':
        continue

    mean_line_len = mean(nonzero(map(len, contents.split('\n'))))

    # remove all books where each line is a line and not a paragraph/page
    # in libgen half the pdfs are one line per paragraph and the others are one line per actual line
    # and the rest of the code assumes one line per paragraph to make life easier
    # TODO: for pdfs where 1 line != 1 paragraph, convert it into that format
    if mean_line_len < 200:
        continue

    # remove hyphenated words (they look like: hyph- hyphenated)
    # The replacement MUST be a raw string: a plain '\1' is the control
    # character U+0001, not a backreference, which would replace every match
    # with that character and cause the paragraph to be discarded below.
    contents = re.sub(r'(\w+)- \1', r'\1', contents)

    res = []
    for para in nonzero(contents.split('\n')):
        # remove short paras and those which contain a U+0001 control character
        if len(para) < 100 or '\u0001' in para:
            continue

        # remove leading and trailing numbers (i.e. page nums)
        para = para.strip()
        para = re.sub(r'^(\d+)', '', para)
        para = re.sub(r'(\d+)$', '', para)
        para = para.strip()

        # not enough letters (i.e. math, tables, etc)
        if mean(map(is_letter, para)) < 0.80:
            continue

        res.append(para)

    for i in range(len(res) - 2):
        # remove any prefix (longer than 3 chars) that appears in both this and
        # either of the next two paragraphs, to try and get rid of headers.
        # Order matters: compare against i+2 first, then against i+1 using the
        # already-stripped paragraph.
        # TODO: spin this out into a fully fledged header cleaner
        for offset in (2, 1):
            commpref = os.path.commonprefix([
                res[i],
                res[i + offset],
            ])
            if len(commpref) > 3:
                res[i] = res[i][len(commpref):]

    print('\n\n===============================================\n\n', '\n\n'.join(res), '\n\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment