Created
August 22, 2017 22:17
-
-
Save luizpvas/5e861eab508ee3baf972b09e10fd896e to your computer and use it in GitHub Desktop.
Consolidate the output from WikiExtractor.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# =========================================== | |
# | |
# WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor. | |
# The files are compiled into one .txt file, where each line is a pre-processed | |
# sentence. The following transformations and filters are applied in the text: | |
# | |
# * Commas, dots, colons, semicolons and parentheses are removed, as are
# dashes surrounded by spaces. Question and exclamation marks are kept, but
# with spaces between them and words. For example:
# "how are you?" becomes "how are you ?"
# | |
# * Text is converted to lower case. Not sure if this is a good thing, though. | |
# Let's A/B test in the future! | |
import os | |
import bz2 | |
# Output file; it is opened in append mode and receives the pre-processed
# sentences, one per line.
consolidated_file = "./ptbrwiki_consolidated.txt"

# 'consolidated_doc' buffers the sentences that have not been written to
# 'consolidated_file' yet; 'total_docs' counts the documents processed so far.
consolidated_doc, total_docs = "", 0

# Root directory of the WikiExtractor output. Every entry listed here is later
# joined onto 'extract_dir' and handled as a directory of .bz2 files.
extract_dir = "/home/luiz/Documents/extracted"
bz2_dirs = os.listdir(extract_dir)
def consolidate_bz2_dir(dir):
    """Consolidate every bz2 file found in one WikiExtractor output directory.

    Each file is scanned line by line. The lines found between a line that
    starts with ``<doc`` and the next line containing ``</doc>`` form one
    document, which is handed to ``consolidate_document`` as raw (undecoded)
    bytes. After each file the pending output is flushed with
    ``write_to_consolidated_file`` and a progress line is printed.

    Args:
        dir: Path of the directory whose entries are bz2-compressed files.
            (The name shadows the builtin but is kept for compatibility.)
    """
    print("Consolidating directory: [{}]".format(dir))
    # Sorted so the order of the consolidated output is reproducible.
    filenames = sorted(os.listdir(dir))
    print("There are {} files in the directory".format(len(filenames)))

    for filename in filenames:
        print("Consolidating file {}".format(filename))
        doc_lines = []
        # The context manager closes the archive even when a document fails,
        # and iterating the file avoids loading every line into memory.
        with bz2.BZ2File(os.path.join(dir, filename)) as bz_file:
            for line in bz_file:
                # BZ2File yields bytes, so the markers are byte literals.
                if line.startswith(b'<doc'):
                    doc_lines = []
                elif b'</doc>' in line:
                    consolidate_document(b"".join(doc_lines))
                    # Never let a stray second '</doc>' re-emit this document.
                    doc_lines = []
                else:
                    doc_lines.append(line)

        write_to_consolidated_file()
        print("---> {} consolidated documents so far".format(total_docs))
def consolidate_document(doc):
    """Pre-process one document and append its sentences to the output buffer.

    Every line of *doc* is treated as one sentence. A sentence is lower-cased,
    stripped of ``, . ( ) : ;``, has each dash surrounded by spaces replaced by
    a single space, and gets ``?`` and ``!`` separated from the neighbouring
    words. Sentences with fewer than 10 whitespace-separated tokens are
    dropped. The surviving sentences are appended to the module-level
    ``consolidated_doc`` buffer, one per line, with their tokens separated by
    exactly one space. ``total_docs`` is incremented once per call.

    Args:
        doc: Body of one document, either UTF-8 encoded bytes or text.
    """
    global consolidated_doc
    global total_docs

    total_docs += 1

    # Decode once at the boundary instead of once per sentence; text input is
    # accepted unchanged.
    if isinstance(doc, bytes):
        doc = doc.decode('utf-8')

    # Maps each unwanted character to None so translate() deletes it.
    unwanted = dict.fromkeys(map(ord, u",.():;"))

    kept = []
    for sentence in doc.split(u'\n'):
        sentence = sentence.lower().translate(unwanted)

        # A free-standing dash separates two words: replace it with a space so
        # that "foo - bar" does not collapse into "foobar".
        sentence = sentence.replace(u" - ", u" ")

        # Pad the punctuation we keep, so "there?" and "there" share a token.
        sentence = sentence.replace(u"?", u" ? ").replace(u"!", u" ! ")

        words = sentence.split()
        if len(words) < 10:
            continue

        # Re-joining the tokens removes the doubled and trailing spaces that
        # the padding above introduces.
        kept.append(u" ".join(words) + u"\n")

    # A single concatenation per document keeps the buffer growth cheap.
    consolidated_doc += u"".join(kept)
# Writes to the consolidated file the contents of the 'consolidated_doc'
# variable
def write_to_consolidated_file():
    """Append the buffered text to the consolidated file and clear the buffer.

    The content of the module-level ``consolidated_doc`` buffer is encoded as
    UTF-8 and appended to the file named by ``consolidated_file``. The buffer
    is reset to an empty string once the write has completed.
    """
    global consolidated_doc

    # Binary append mode: the text is encoded explicitly, so the file must
    # accept bytes. Text mode would reject them on Python 3 and could rewrite
    # the newlines on some platforms.
    with open(consolidated_file, "ab") as f:
        f.write(consolidated_doc.encode('utf-8'))

    consolidated_doc = ""
# Script body: consolidate each entry listed under 'extract_dir', in the order
# in which the entries were listed.
for bz2_dir in bz2_dirs:
    bz2_dir_path = os.path.join(extract_dir, bz2_dir)
    consolidate_bz2_dir(bz2_dir_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment