Created
October 14, 2015 14:07
-
-
Save thomascrha/2cb70b9fc78b25b43264 to your computer and use it in GitHub Desktop.
Process word and sentence lists to determine language.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python | |
""" | |
This script will post process data from a Spider run. | |
It will only work on either sentences or words. | |
Provide the script with how many characters you want each word/sentence to be, | |
the file to be processed and the language code. | |
Once you're happy with the output, redirect it to an output file as shown below:
./postprocessing.py -f FILE -l LANG -c CHARACTER > OUTFILE.txt
The script supports 97 languages (ISO 639-1 codes given): | |
af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, | |
es, et, eu, fa, fi, fo, fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, | |
jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, | |
nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq, | |
sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu | |
""" | |
import sys, codecs, re | |
from argparse import ArgumentParser, RawDescriptionHelpFormatter | |
from langid import classify | |
""" | |
TODO: | |
- Add feature to process whole directory | |
- Add feature to save to file or directory | |
- Add feature to check against multiple languages | |
- Add feature to be able to enter threshold of accuracy you require | |
""" | |
# Replace both standard streams with codec writers that encode to UTF-8,
# so everything printed or reported by this script is emitted as UTF-8.
sys.stdout, sys.stderr = (
    codecs.getwriter("utf8")(stream) for stream in (sys.stdout, sys.stderr)
)
def process(file, character, language):
    """Print every line of *file* that is long enough and written in *language*.

    Each line is stripped of surrounding whitespace and skipped unless the
    stripped text is longer than *character* characters. The remaining lines
    are classified with ``langid.classify`` and printed to standard output
    when the detected language code equals *language*.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
        character: Length threshold. Only lines strictly longer than this
            many characters (after stripping) are considered.
        language: Language code to keep (an ISO 639-1 code such as ``'is'``).

    Raises:
        IOError: If *file* cannot be opened.
        UnicodeDecodeError: If *file* is not valid UTF-8.
    """
    # Decode at the input boundary: under Python 2 the UTF-8 stream writer
    # installed on sys.stdout raises UnicodeDecodeError when it is handed a
    # byte string containing non-ASCII bytes, so lines must be unicode text.
    with codecs.open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            text = line.strip()
            # Measure what will actually be printed, so the trailing newline
            # and surrounding whitespace do not count towards the threshold.
            if len(text) <= character:
                continue
            # classify() returns a (language_code, score) pair. Compare the
            # code itself instead of searching the repr of the whole pair.
            detected = classify(line)[0]
            if detected == language:
                print(text)
def main():
    """Parse the command-line options and run process() on the given file.

    Options:
        -f / --file       path of the file to process (required)
        -c / --character  length threshold handed to process() (default 65)
        -l / --language   language code to extract (default 'en')
    """
    parser = ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument('-f', '--file', required=True, type=str, help='file to be processed')
    parser.add_argument('-c', '--character', type=int, default=65, help='minimum number of characters that sentence/word to be')
    # Optional on purpose: while the flag was marked required, the 'en'
    # default could never take effect. Explicit -l values behave as before.
    parser.add_argument('-l', '--language', default='en', help='language to be extracted')
    opts = parser.parse_args()
    process(opts.file, opts.character, opts.language)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment