luizpvas · August 22, 2017 22:17
diff --git a/WikiConsolidator.py b/WikiConsolidator.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # ===========================================
 #
 # WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
 # The files are compiled into one .txt file, where each line is a pre-processed
 # sentence. The following transformations and filters are applied in the text:
 #
 # * Commas, dots, quotes and parentheses are removed. Question and exclamation
 #   marks are kept, but with spaces between them and words. For example:
 #   "how are you?" becomes "how are you ?"
 #
 # * Text is converted to lower case. Not sure if this is a good thing, though.
 #   Let's A/B test in the future!

 import os
 import bz2

 # Root directory holding the sub-directories of .bz2 files to consolidate.
 extract_dir = "/home/luiz/Documents/extracted"
 # os.listdir returns entries in arbitrary order; sorting keeps the order in
 # which directories are processed (and so the output file) reproducible.
 bz2_dirs = sorted(os.listdir(extract_dir))
 # Output file; processed sentences are appended to it, one per line.
 consolidated_file = "./ptbrwiki_consolidated.txt"
 # Buffer of processed sentences that have not been written to disk yet.
 consolidated_doc = ""
 # Running count of documents handed to consolidate_document.
 total_docs = 0

 def consolidate_bz2_dir(dir):
  """Consolidate every bz2 archive found directly inside ``dir``.

  Each entry of ``dir`` is opened as a bz2 file and streamed line by line.
  A line starting with ``<doc`` begins a new document, a line containing
  ``</doc>`` hands the text collected so far to ``consolidate_document``,
  and any other line is added to the current document. The buffered
  sentences are flushed with ``write_to_consolidated_file`` after each
  archive, followed by a progress message.

  dir -- path of the directory to process.
  """
  print("Consolidating directory: [{}]".format(dir))
  # os.listdir gives no ordering guarantee; sort for a reproducible output.
  file_names = sorted(os.listdir(dir))
  print("There are {} files in the directory".format(len(file_names)))
  for file_name in file_names:
    print("Consolidating file {}".format(file_name))
    # Collect the lines in a list and join once per document instead of
    # growing a string with += for every line.
    doc_lines = []
    # The context manager closes the archive even if processing fails, and
    # iterating the handle avoids loading the whole file with readlines().
    with bz2.BZ2File(os.path.join(dir, file_name)) as bz_file:
      for line in bz_file:
        # BZ2File yields raw bytes, hence the byte-string markers.
        if line.startswith(b'<doc'):
          doc_lines = []
        elif b'</doc>' in line:
          consolidate_document(b"".join(doc_lines))
        else:
          doc_lines.append(line)
    write_to_consolidated_file()
    print("---> {} consolidated documents so far".format(total_docs))

 # translate() table that deletes the unwanted punctuation in a single pass.
 _REMOVED_CHARS = dict.fromkeys(map(ord, u",.():;"))

 def _normalize_sentence(sentence):
  """Return ``sentence`` lower-cased, without punctuation, with ? and ! spaced out."""
  sentence = sentence.lower().translate(_REMOVED_CHARS)
  # A free-standing dash becomes a single space; replacing it with nothing
  # would glue the neighbouring words together ("a - b" -> "ab").
  sentence = sentence.replace(u" - ", u" ")
  # Put spaces around ? and ! so that "there?" and "there" are not treated
  # as two separate entities.
  return sentence.replace(u"?", u" ? ").replace(u"!", u" ! ")

 def consolidate_document(doc):
  """Pre-process one document and append its sentences to the global buffer.

  The document is split on newlines; every line is normalized and kept only
  if it has at least 10 whitespace-separated words. Kept lines are appended
  to ``consolidated_doc`` (one per line) and ``total_docs`` is incremented.

  doc -- the document text, either UTF-8 encoded bytes or already-decoded
         text.

  Raises UnicodeDecodeError if ``doc`` is bytes that are not valid UTF-8.

  NOTE(review): the file header says quotes are removed as well, but no
  quote character is stripped here -- confirm which one is intended.
  """
  global consolidated_doc
  global total_docs

  total_docs += 1
  # Decode once at the boundary instead of once per line.
  if isinstance(doc, bytes):
    doc = doc.decode('utf-8')

  kept = []
  for sentence in doc.split(u'\n'):
    sentence = _normalize_sentence(sentence)
    if len(sentence.split()) >= 10:
      kept.append(sentence + u"\n")

  # Single append to the global buffer rather than one per sentence.
  consolidated_doc += u"".join(kept)

 # Writes to the consolidated file the contents of the 'consolidated_doc' 
 # variable
 def write_to_consolidated_file():
  """Append the buffered sentences to ``consolidated_file`` and empty the buffer.

  The buffer is encoded to UTF-8 before writing, so the file is opened in
  binary append mode: a text-mode handle rejects bytes on Python 3 and
  would translate newlines on Windows.
  """
  global consolidated_doc
  with open(consolidated_file, "ab") as f:
    f.write(consolidated_doc.encode('utf-8'))
  # Start the next batch from an empty buffer.
  consolidated_doc = ""

 # Script entry point: consolidate each entry of extract_dir in turn.
 for bz2_dir in bz2_dirs:
  bz2_dir_path = os.path.join(extract_dir, bz2_dir)
  consolidate_bz2_dir(bz2_dir_path)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# ===========================================
	#
	# WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
	# The files are compiled into one .txt file, where each line is a pre-processed
	# sentence. The following transformations and filters are applied in the text:
	#
	# * Commas, dots, quotes and parentheses are removed. Question and exclamation
	# marks are kept, but with spaces between them and words. For example:
	# "how are you?" becomes "how are you ?"
	#
	# * Text is converted to lower case. Not sure if this is a good thing, though.
	# Let's A/B test in the future!

	import os
	import bz2

	# Root directory holding the sub-directories of .bz2 files to consolidate.
	extract_dir = "/home/luiz/Documents/extracted"
	# os.listdir returns entries in arbitrary order; sorting keeps the order in
	# which directories are processed (and so the output file) reproducible.
	bz2_dirs = sorted(os.listdir(extract_dir))
	# Output file; processed sentences are appended to it, one per line.
	consolidated_file = "./ptbrwiki_consolidated.txt"
	# Buffer of processed sentences that have not been written to disk yet.
	consolidated_doc = ""
	# Running count of documents handed to consolidate_document.
	total_docs = 0

	def consolidate_bz2_dir(dir):
		"""Consolidate every bz2 archive found directly inside ``dir``.

		Each entry of ``dir`` is opened as a bz2 file and streamed line by line.
		A line starting with ``<doc`` begins a new document, a line containing
		``</doc>`` hands the text collected so far to ``consolidate_document``,
		and any other line is added to the current document. The buffered
		sentences are flushed with ``write_to_consolidated_file`` after each
		archive, followed by a progress message.

		dir -- path of the directory to process.
		"""
		print("Consolidating directory: [{}]".format(dir))
		# os.listdir gives no ordering guarantee; sort for a reproducible output.
		file_names = sorted(os.listdir(dir))
		print("There are {} files in the directory".format(len(file_names)))
		for file_name in file_names:
			print("Consolidating file {}".format(file_name))
			# Collect the lines in a list and join once per document instead of
			# growing a string with += for every line.
			doc_lines = []
			# The context manager closes the archive even if processing fails, and
			# iterating the handle avoids loading the whole file with readlines().
			with bz2.BZ2File(os.path.join(dir, file_name)) as bz_file:
				for line in bz_file:
					# BZ2File yields raw bytes, hence the byte-string markers.
					if line.startswith(b'<doc'):
						doc_lines = []
					elif b'</doc>' in line:
						consolidate_document(b"".join(doc_lines))
					else:
						doc_lines.append(line)
			write_to_consolidated_file()
			print("---> {} consolidated documents so far".format(total_docs))

	# translate() table that deletes the unwanted punctuation in a single pass.
	_REMOVED_CHARS = dict.fromkeys(map(ord, u",.():;"))

	def _normalize_sentence(sentence):
		"""Return ``sentence`` lower-cased, without punctuation, with ? and ! spaced out."""
		sentence = sentence.lower().translate(_REMOVED_CHARS)
		# A free-standing dash becomes a single space; replacing it with nothing
		# would glue the neighbouring words together ("a - b" -> "ab").
		sentence = sentence.replace(u" - ", u" ")
		# Put spaces around ? and ! so that "there?" and "there" are not treated
		# as two separate entities.
		return sentence.replace(u"?", u" ? ").replace(u"!", u" ! ")

	def consolidate_document(doc):
		"""Pre-process one document and append its sentences to the global buffer.

		The document is split on newlines; every line is normalized and kept only
		if it has at least 10 whitespace-separated words. Kept lines are appended
		to ``consolidated_doc`` (one per line) and ``total_docs`` is incremented.

		doc -- the document text, either UTF-8 encoded bytes or already-decoded
		       text.

		Raises UnicodeDecodeError if ``doc`` is bytes that are not valid UTF-8.

		NOTE(review): the file header says quotes are removed as well, but no
		quote character is stripped here -- confirm which one is intended.
		"""
		global consolidated_doc
		global total_docs

		total_docs += 1
		# Decode once at the boundary instead of once per line.
		if isinstance(doc, bytes):
			doc = doc.decode('utf-8')

		kept = []
		for sentence in doc.split(u'\n'):
			sentence = _normalize_sentence(sentence)
			if len(sentence.split()) >= 10:
				kept.append(sentence + u"\n")

		# Single append to the global buffer rather than one per sentence.
		consolidated_doc += u"".join(kept)

	# Writes to the consolidated file the contents of the 'consolidated_doc'
	# variable
	def write_to_consolidated_file():
		"""Append the buffered sentences to ``consolidated_file`` and empty the buffer.

		The buffer is encoded to UTF-8 before writing, so the file is opened in
		binary append mode: a text-mode handle rejects bytes on Python 3 and
		would translate newlines on Windows.
		"""
		global consolidated_doc
		with open(consolidated_file, "ab") as f:
			f.write(consolidated_doc.encode('utf-8'))
		# Start the next batch from an empty buffer.
		consolidated_doc = ""

	# Script entry point: consolidate each entry of extract_dir in turn.
	for bz2_dir in bz2_dirs:
		bz2_dir_path = os.path.join(extract_dir, bz2_dir)
		consolidate_bz2_dir(bz2_dir_path)
No results found