Created
May 27, 2018 10:13
-
-
Save DataTurks/97ff613967e8139e57091f9299c3a104 to your computer and use it in GitHub Desktop.
Creates NER training data in Spacy format from JSON downloaded from Dataturks.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import sys | |
import os | |
import json | |
import logging | |
import pickle | |
############################################ NOTE ######################################################## | |
# | |
# Creates NER training data in Spacy format from JSON downloaded from Dataturks. | |
# | |
# Outputs the Spacy training data as a pickle file which can be used during Spacy training. | |
# | |
# Run: python Dataturks_to_Spacy.py <dataturks_JSON_FilePath> <training_output_FilePath> | |
# | |
# | |
############################################################################################################ | |
# Let INFO-level progress messages through the root logger.
logging.getLogger().setLevel(level=logging.INFO)
def convertSingleItem(dataturks_labeled_item):
    """Convert one Dataturks JSON line into a Spacy NER training example.

    Args:
        dataturks_labeled_item: One line of the Dataturks export: a JSON
            document with a 'content' string and an 'annotation' list whose
            entries carry a 'label' (a single label or a list of labels) and
            'points' (dicts with inclusive 'start'/'end' character offsets).

    Returns:
        A ``(text, {"entities": [(start, end, label), ...]})`` tuple whose
        offsets are end-exclusive, or None when the item cannot be converted.
        Failures are logged together with their traceback, never raised.
    """
    try:
        data = json.loads(dataturks_labeled_item)
        text = data['content']
        entities = []
        for annotation in data['annotation']:
            # Only the first point of each annotation is used.
            point = annotation['points'][0]
            labels = annotation['label']
            # Handle both a list of labels and a single label.
            if not isinstance(labels, list):
                labels = [labels]
            # Dataturks indices are inclusive [start, end]; Spacy expects
            # the half-open range [start, end).
            entities.extend(
                (point['start'], point['end'] + 1, label) for label in labels)
        return (text, {"entities": entities})
    except Exception as e:
        # Lazy %-formatting instead of string concatenation: the item may not
        # be a str (e.g. None or bytes), and concatenating it here would raise
        # a second error from inside the handler instead of returning None.
        logging.exception(
            "Unable to process item %s\nerror = %s", dataturks_labeled_item, e)
        return None
def main(dataturks_JSON_FilePath, training_output_FilePath):
    """Convert a Dataturks NER export into a pickled Spacy training set.

    Reads the export line by line (one JSON item per line), converts every
    item with ``convertSingleItem`` and pickles the list of successfully
    converted examples to the output path.

    Args:
        dataturks_JSON_FilePath: Path to the JSON file downloaded from
            Dataturks.
        training_output_FilePath: Path of the pickle file to write. A bare
            file name is written to the current directory.

    Returns:
        None. Invalid paths and an empty input are reported through logging
        and abort the run without creating the output file.
    """
    # Make sure everything is set up.
    if not os.path.exists(dataturks_JSON_FilePath):
        logging.error(
            "Please specify a valid path to dataturks JSON output file, "
            "%s doesn't exist", dataturks_JSON_FilePath)
        return

    # os.path.dirname() returns '' for a bare file name; that means the
    # current directory, not a missing one.
    output_dir = os.path.dirname(training_output_FilePath) or os.curdir
    if not os.path.isdir(output_dir):
        logging.error(
            "Please specify a valid path to output file, "
            "%s directory doesn't exist", output_dir)
        return

    logging.info("Converting %s ...", dataturks_JSON_FilePath)
    # JSON exports are UTF-8; do not depend on the platform default encoding,
    # which would also shift the character offsets of the annotations.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as source:
        # Blank lines carry no item, so they are not counted as failures.
        lines = [line for line in source if line.strip()]

    if not lines:
        logging.error(
            "Please specify a valid path to dataturks JSON output file, "
            "%s is empty", dataturks_JSON_FilePath)
        return

    training_data = []
    for count, line in enumerate(lines, start=1):
        result = convertSingleItem(line)
        if result:
            training_data.append(result)
        if count % 100 == 0:
            logging.info("%d items done ...", count)

    # The output file is only created once there is data to store.
    with open(training_output_FilePath, 'wb') as output:
        pickle.dump(training_data, output, pickle.HIGHEST_PROTOCOL)

    success = len(training_data)
    logging.info(
        "Completed: %d items done, %d items ignored due to errors",
        success, len(lines) - success)
def create_arg_parser():
    """Build the command-line parser: two positional file-path arguments."""
    positional_arguments = (
        ('dataturks_JSON_FilePath',
         'Path to the JSON file downloaded from Dataturks.'),
        ('training_output_FilePath',
         'Path to the file where Spacy training data will be stored as pickle output.'),
    )
    cli = argparse.ArgumentParser(
        description='Converts Dataturks NER output JSON file to Spacy training file format.')
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, help=argument_help)
    return cli
if __name__ == '__main__':
    # Script entry point: parse the two positional paths and run the conversion.
    cli_options = create_arg_parser().parse_args(sys.argv[1:])
    main(cli_options.dataturks_JSON_FilePath,
         cli_options.training_output_FilePath)
Aravind3939
commented
Apr 2, 2020
via email
What I did was convert the annotated data to Stanford NLP
format. Afterwards I converted the Stanford NLP data to spaCy. It is working now.
…On Thu, 2 Apr 2020 at 10:19 PM, rahulsarkar906 ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
Same error I am getting
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<https://gist.github.com/97ff613967e8139e57091f9299c3a104#gistcomment-3237861>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AMI6GWUWGG6Y72NPUDNYOXTRKS623ANCNFSM4LFDL5QQ>
.
Can you please share some sample code of doing the conversion?
You can find the code by searching Google for "stanfordtonlp code".
…On Fri, 3 Apr 2020 at 2:40 AM, rahulsarkar906 ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
Can you please share some sample code of doing the conversion?
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<https://gist.github.com/97ff613967e8139e57091f9299c3a104#gistcomment-3238128>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AMI6GWVDYAZEU3GZR6NTH73RKT5LZANCNFSM4LFDL5QQ>
.
https://medium.com/@dataturks/convert-stanford-corenlp-training-data-to-dataturks-ner-json-output-eeaab60fb7b6
First download the Stanford NER format from the Dataturks website, then convert
the Stanford NER data to Dataturks JSON.
Then pass the converted Dataturks JSON to your code.
On Fri, 3 Apr 2020 at 9:04 PM, Ramidi Aravind Reddy <
[email protected]> wrote:
… You can find the code by searching Google for "stanfordtonlp code".
On Fri, 3 Apr 2020 at 2:40 AM, rahulsarkar906 ***@***.***>
wrote:
> ***@***.**** commented on this gist.
> ------------------------------
>
> Can you please share some sample code of doing the conversion?
>
> —
> You are receiving this because you commented.
> Reply to this email directly, view it on GitHub
> <https://gist.github.com/97ff613967e8139e57091f9299c3a104#gistcomment-3238128>,
> or unsubscribe
> <https://github.com/notifications/unsubscribe-auth/AMI6GWVDYAZEU3GZR6NTH73RKT5LZANCNFSM4LFDL5QQ>
> .
>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment