Created
December 15, 2019 16:35
-
-
Save isoboroff/03b8f11cc2315a2ec6a8a97a53977789 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
if __name__ == "__main__": | |
import json | |
import argparse | |
import spacy | |
import dateparser | |
import signal | |
from contextlib import contextmanager | |
from tqdm import tqdm | |
# Command-line interface: one positional argument naming the input file.
# The file is read as plain text with one JSON document per line (it is not
# unzipped anywhere in this script), so the help text says so.
parser = argparse.ArgumentParser(description='Preprocess web news text with Spacy')
parser.add_argument('bundle', help='Bundle to preprocess (JSON lines file, one document per line)')
args = parser.parse_args()

# Load the spaCy pipeline once, up front; it is reused for every document.
nlp = spacy.load('en_core_web_lg')
# Cool trick from https://www.jujens.eu/posts/en/2018/Jun/02/python-timeout-function/ | |
# Use a context manager to timeout functions by wrapping them in with.. clauses. | |
@contextmanager
def timeout(time):
    """Abort the body of a ``with`` block after ``time`` seconds.

    A SIGALRM-based timer is armed on entry.  If it fires while the body is
    still running, ``raise_timeout`` raises ``TimeoutError`` inside the body;
    if that exception reaches this context manager it is swallowed, so the
    ``with`` statement simply ends early.

    On exit the timer is cancelled and the SIGALRM handler that was active
    before entry is put back, so no stray alarm or handler outlives the block.

    Args:
        time: Timeout in seconds.  Whole and fractional values are accepted;
            ``0`` arms no timer.

    Note:
        Relies on ``signal.setitimer``/SIGALRM, so it only works on Unix and
        only in the main thread.
    """
    previous_handler = signal.signal(signal.SIGALRM, raise_timeout)
    if previous_handler is None:
        # The earlier handler was not installed from Python and cannot be
        # re-installed; fall back to ignoring the signal afterwards.
        previous_handler = signal.SIG_IGN
    # ITIMER_REAL delivers SIGALRM like signal.alarm(), but accepts floats.
    signal.setitimer(signal.ITIMER_REAL, time)
    try:
        yield
    except TimeoutError:
        # The timer fired: leave the with-block quietly.
        pass
    finally:
        # Cancel the timer first so it cannot fire once the handler is gone.
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, previous_handler)


def raise_timeout(signum, frame):
    """Signal handler used by ``timeout``: turn SIGALRM into ``TimeoutError``.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutError: Always.
    """
    raise TimeoutError
# The file is JSON lines... | |
def process(docstring):
    """Annotate one JSON-lines record with the entities found in its text.

    The record is decoded from ``docstring``, its ``'text'`` field is run
    through the module-level ``nlp`` pipeline, and for every entity found the
    entity text is stored under a key named after the entity label.  Repeated
    labels are accumulated into a single space-separated string.

    There is no proper date metadata, so the first DATE entity that
    ``dateparser`` can parse is also recorded as ``'first_date'`` (ISO
    format) and ``'first_stamp'`` (integer epoch seconds, as a string).

    Args:
        docstring: One line of the input file, holding a JSON object with a
            ``'text'`` key.

    Returns:
        The decoded dict, extended with the entity and date fields.
    """
    record = json.loads(docstring)
    for entity in nlp(record['text']).ents:
        label, text = entity.label_, entity.text

        # Keep trying DATE entities until one of them actually parses.
        if label == 'DATE' and 'first_date' not in record:
            parsed = dateparser.parse(text)
            if parsed is not None:
                record['first_date'] = parsed.isoformat()
                record['first_stamp'] = str(int(parsed.timestamp()))

        # Collect every mention of a label under one key.
        if label in record:
            record[label] += " " + text
        else:
            record[label] = text
    return record
# First pass: count the lines so the progress bar has a meaningful total.
with open(args.bundle, 'r') as bundle:
    linecount = sum(1 for _ in bundle)

# Second pass: annotate each document and write it to stdout as JSON.
with open(args.bundle, 'r') as bundle:
    for line in tqdm(bundle, total=linecount):
        # Give each document at most 10 seconds of processing time.
        with timeout(10):
            try:
                print(json.dumps(process(line)))
            except Exception:
                # Best-effort fallback: if the document cannot be processed
                # (bad JSON, missing 'text', or the timeout firing, since
                # TimeoutError is an Exception), emit the raw line as a JSON
                # string instead.  Catching Exception rather than using a
                # bare except lets KeyboardInterrupt and SystemExit stop
                # the run.
                print(json.dumps(line))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment