thomascrha · October 14, 2015 14:07
diff --git a/postprocessing.py b/postprocessing.py
 #! /usr/bin/python

 """
 This script will post process data from a Spider run.
 It will only work on either sentences or words.
 Provide the script with how many characters you want each word/sentence to be, 
 the file to be processed and the language code.
 Once you're happy with the output, redirect it to an output file, as shown below:

 ./postprocessing.py -f FILE -l LANG -c CHARACTER > OUTFILE.txt

 The script supports 97 languages (ISO 639-1 codes given):

 af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, 
 es, et, eu, fa, fi, fo, fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, 
 jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, 
 nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq, 
 sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu

 """
 import sys, codecs, re
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 from langid import classify

 """
 TODO:
 - Add feature to process whole directory
 - Add feature to save to file or directory
 - Add feature to check against multiple languages
 - Add feature to be able to enter threshold of accuracy you require
 """
 # Replace sys.stdout and sys.stderr with UTF-8 codecs stream writers so that
 # text written to either stream is encoded as UTF-8.
 sys.stdout = codecs.getwriter("utf8")(sys.stdout)
 sys.stderr = codecs.getwriter("utf8")(sys.stderr)

 #read the input file line by line and print the lines in the wanted language
 def process(file, character, language):
 	"""Print the lines of a file that are long enough and in the wanted language.

 	file      -- path of the text file to read; it is decoded as UTF-8 and
 	             undecodable bytes are replaced rather than raising
 	character -- a line is considered only when len(line), trailing newline
 	             included, is greater than this number
 	language  -- language code to keep; it is used as a regular expression
 	             that must match the whole detected code (e.g. 'en', 'en|fr')

 	Each selected line is printed with surrounding whitespace stripped.
 	"""
 	# Local import: io is not among this file's top-level imports.
 	import io

 	# Anchor the pattern to the complete language code so that it can neither
 	# match a fragment of another code nor anything outside the code itself.
 	wanted = re.compile('(?:' + language + r')\Z')

 	# sys.stdout is replaced by a UTF-8 codecs writer at module level; under
 	# Python 2 that writer raises UnicodeDecodeError when handed a byte string
 	# containing non-ASCII bytes, so the lines are decoded to text here. This
 	# also makes len() count characters instead of bytes.
 	with io.open(file, 'r', encoding='utf-8', errors='replace') as handle:
 		for line in handle:
 			#skip lines that are not longer than the requested size
 			if len(line) <= character:
 				continue
 			#classify returns a (language code, confidence) pair; only the
 			#code is compared against the requested language
 			detected = classify(line)[0]
 			if wanted.match(detected):
 				print(line.strip())

 def main(argv=None):
 	"""Parse the command-line options and run process() with them.

 	argv -- optional list of argument strings; when None, argparse falls back
 	        to the arguments of the running program (sys.argv[1:])
 	"""
 	parser = ArgumentParser(description = __doc__, formatter_class = RawDescriptionHelpFormatter)

 	parser.add_argument('-f', '--file', required = True, type = str, help = 'file to be processed')
 	parser.add_argument('-c', '--character', required = False, type = int, default=65, help = 'minimum number of characters that sentence/word to be')
 	# Not marked as required: a required option never falls back to its
 	# default, which made the declared default of 'en' unreachable.
 	parser.add_argument('-l', '--language', required = False, type = str, default='en', help = 'language to be extracted')

 	opts = parser.parse_args(argv)

 	process(opts.file, opts.character, opts.language)

 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == '__main__':
 	main()
	#! /usr/bin/python

	"""
	This script will post process data from a Spider run.
	It will only work on either sentences or words.
	Provide the script with how many characters you want each word/sentence to be,
	the file to be processed and the language code.
	Once you're happy with the output, redirect it to an output file, as shown below:

	./postprocessing.py -f FILE -l LANG -c CHARACTER > OUTFILE.txt

	The script supports 97 languages (ISO 639-1 codes given):

	af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo,
	es, et, eu, fa, fi, fo, fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja,
	jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt,
	nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq,
	sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu

	"""
	import sys, codecs, re
	from argparse import ArgumentParser, RawDescriptionHelpFormatter
	from langid import classify

	"""
	TODO:
	- Add feature to process whole directory
	- Add feature to save to file or directory
	- Add feature to check against multiple languages
	- Add feature to be able to enter threshold of accuracy you require
	"""
	# Replace sys.stdout and sys.stderr with UTF-8 codecs stream writers so that
	# text written to either stream is encoded as UTF-8.
	sys.stdout = codecs.getwriter("utf8")(sys.stdout)
	sys.stderr = codecs.getwriter("utf8")(sys.stderr)

	#read the input file line by line and print the lines in the wanted language
	def process(file, character, language):
		"""Print the lines of a file that are long enough and in the wanted language.

		file      -- path of the text file to read; it is decoded as UTF-8 and
		             undecodable bytes are replaced rather than raising
		character -- a line is considered only when len(line), trailing newline
		             included, is greater than this number
		language  -- language code to keep; it is used as a regular expression
		             that must match the whole detected code (e.g. 'en', 'en|fr')

		Each selected line is printed with surrounding whitespace stripped.
		"""
		# Local import: io is not among this file's top-level imports.
		import io

		# Anchor the pattern to the complete language code so that it can neither
		# match a fragment of another code nor anything outside the code itself.
		wanted = re.compile('(?:' + language + r')\Z')

		# sys.stdout is replaced by a UTF-8 codecs writer at module level; under
		# Python 2 that writer raises UnicodeDecodeError when handed a byte string
		# containing non-ASCII bytes, so the lines are decoded to text here. This
		# also makes len() count characters instead of bytes.
		with io.open(file, 'r', encoding='utf-8', errors='replace') as handle:
			for line in handle:
				#skip lines that are not longer than the requested size
				if len(line) <= character:
					continue
				#classify returns a (language code, confidence) pair; only the
				#code is compared against the requested language
				detected = classify(line)[0]
				if wanted.match(detected):
					print(line.strip())

	def main(argv=None):
		"""Parse the command-line options and run process() with them.

		argv -- optional list of argument strings; when None, argparse falls back
		        to the arguments of the running program (sys.argv[1:])
		"""
		parser = ArgumentParser(description = __doc__, formatter_class = RawDescriptionHelpFormatter)

		parser.add_argument('-f', '--file', required = True, type = str, help = 'file to be processed')
		parser.add_argument('-c', '--character', required = False, type = int, default=65, help = 'minimum number of characters that sentence/word to be')
		# Not marked as required: a required option never falls back to its
		# default, which made the declared default of 'en' unreachable.
		parser.add_argument('-l', '--language', required = False, type = str, default='en', help = 'language to be extracted')

		opts = parser.parse_args(argv)

		process(opts.file, opts.character, opts.language)

	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module. The call is indented under
	# the guard; the flattened form left the `if` without a body.
	if __name__ == '__main__':
		main()