-
-
Save kaustumbh7/6dc0b909dbdfea4ae2428fb77e18273f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf8
# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function

# Standard library
import pickle
import random
from pathlib import Path

# Third-party
import plac
import spacy
from spacy.util import minibatch, compounding
# New entity labels
# Specify the new entity labels which you want to add here.
# Every label that occurs in the training annotations must be listed,
# because main() registers exactly these labels with the entity recognizer.
LABEL = [
    'I-geo', 'B-geo', 'I-art', 'B-art', 'B-tim', 'B-nat', 'B-eve', 'O',
    'I-per', 'I-tim', 'I-nat', 'I-eve', 'B-per', 'I-org', 'B-gpe', 'B-org',
    'I-gpe',
]

# Tag suffix legend:
#   geo = Geographical Entity
#   org = Organization
#   per = Person
#   gpe = Geopolitical Entity
#   tim = Time indicator
#   art = Artifact
#   eve = Event
#   nat = Natural Phenomenon
# Loading training data
# The pickle is read once at import time; main() unpacks each item as a
# (text, annotations) pair.
# NOTE: unpickling can execute arbitrary code -- only load a corpus file that
# you created yourself or otherwise trust.
with open('Data/ner_corpus_260', 'rb') as fp:
    TRAIN_DATA = pickle.load(fp)
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='new_model', output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entities.

    Args:
        model: Name or path of an existing spaCy model to continue training.
            When None, a blank 'en' pipeline is created instead.
        new_model_name: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to. When None,
            nothing is saved.
        n_iter: Number of passes over the training data.

    The training examples come from the module-level ``TRAIN_DATA`` and the
    entity labels from the module-level ``LABEL``.
    """
    # NOTE(review): create_pipe / nlp.entity look like the spaCy 2.x API --
    # confirm the installed spaCy version before running.
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's entity recognizer if it has one, else add one.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for label in LABEL:
        ner.add_label(label)  # Add new entity labels to entity recognizer

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Examples with an empty text have been reported to make nlp.update fail
    # with "TypeError: object of type 'NoneType' has no len()", so drop them.
    # Working on a copy also keeps the module-level list from being reordered.
    train_data = [example for example in TRAIN_DATA if example[0]]
    skipped = len(TRAIN_DATA) - len(train_data)
    if skipped:
        print("Skipped %d training example(s) with empty text" % skipped)

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Test the trained model
    test_text = 'Gianni Infantino is the president of FIFA.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail
            output_dir.mkdir(parents=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
if __name__ == '__main__':
    # plac builds the command-line interface from main's annotations.
    plac.call(main)
Hi, please guide me. I have a text file; how can I convert that text file into XML and find my required entities? Please help me, I have to complete my project ASAP as the deadline is very near. I would be really grateful if you could provide some help.
…
On Thu, 4 Jul 2019 at 6:52 PM, Kaustumbh Jaiswal @.***> wrote: hi @kaustumbh7 https://github.com/kaustumbh7 .. basicaly i have annoted data in xml format so what i have to do first ? convert that into what? json? or something else. also one other thing i have to find out family member names like father,mother.son etc so where i have to put my own label name 'FamilyMember' ? In order to use the above script for training your NER model, you first need to convert your xml file to json format. Then convert it into the form required by spacy (which is nothing but a list of tuples as shown here https://spacy.io/usage/training) as mentioned before. You can refer this https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718 article for a better understanding. If you want to predict family member relationships, you can tag your data accordingly by adding a new entity 'FamilyMember'. — You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://gist.github.com/6dc0b909dbdfea4ae2428fb77e18273f?email_source=notifications&email_token=AMO4XRROXTBXBSIJHTQNQZTP5X6CTA5CNFSM4H4CF342YY3PNVWWK3TUL52HS4DFVNDWS43UINXW23LFNZ2KUY3PNVWWK3TUL5UWJTQAFUZEU#gistcomment-2961994, or mute the thread https://github.com/notifications/unsubscribe-auth/AMO4XRRWUEUICRWE3MUNAOTP5X6CTANCNFSM4H4CF34Q .
Hello Farah,
Can you please share your text file?
Hey Kaustumbh! Thanks for sharing!
I am getting "Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. (Shape: (0,0))". Would you happen to have a clue?
Hey @farahsalman23 ,
I strongly suggest that you go through this article. There you will learn what format your data must be in to run this script.
Warning: Unnamed vectors
Hey David,
I guess that might be an issue with the spaCy version. Please install spacy==2.0.18 and try again.
@kaustumbh7 I followed the process you outlined, and when I run this script I get :
TypeError: object of type 'NoneType' has no len()
@Z-e-e Hello, make sure that you have initialized the LABEL list correctly.
@kaustumbh7 I am trying to understand the process of training and I am using the exact same dataset that you have specified here and I am still getting this error "TypeError: object of type 'NoneType' has no len()".
Can you help me out in where I am going wrong ?
@kaustumbh7 I am trying to understand the process of training and I am using the exact same dataset that you have specified here and I am still getting this error "TypeError: object of type 'NoneType' has no len()".
Can you help me out in where I am going wrong ?
Hi, the issue was that I had empty text values in the training batch. I filtered them out while creating the data in spaCy format, and it works like a gem now. Thanks anyway. This is really helpful code :)
hi,
while running this code I am getting error as- 'KeyError: "[E022] Could not find a transition with the name 'U-Tag' in the NER model."'
Could you please let me know solution for this or what could be the problem for this error. I have followed your steps for creating the training dataset. I am using the same dataset as yours.
Thank you.
hi,
while running this code I am getting error as- 'KeyError: "[E022] Could not find a transition with the name 'U-Tag' in the NER model."'
Could you please let me know solution for this or what could be the problem for this error. I have followed your steps for creating the training dataset. I am using the same dataset as yours.
Thank you.
Hi @mgrove6,
You will have to include all the labels that appear in your training data in the LABEL list (line 16 of the code). The error is saying that your training data uses a label that has not been added to the model. To make the label (U-Tag) a valid one, add it to the LABEL list on line 16 of the code above.
Hey, this article was very helpful for beginners. I am now able to train on the data. Thanks!
But how do I test without training every time?
hi,
while running this code I am getting error as- 'KeyError: "[E022] Could not find a transition with the name 'U-Tag' in the NER model."'
Could you please let me know solution for this or what could be the problem for this error. I have followed your steps for creating the training dataset. I am using the same dataset as yours.
Thank you.
Hi @mgrove6,
You will have to include all the labels that appear in your training data in the LABEL list (line 16 of the code). The error is saying that your training data uses a label that has not been added to the model. To make the label (U-Tag) a valid one, add it to the LABEL list on line 16 of the code above.
Hi, I was able to fix this. I added
ner.add_label("Tag")
@ line 51
In order to use the above script for training your NER model, you first need to convert your xml file to json format. Then convert it into the form required by spacy (which is nothing but a list of tuples as shown here) as mentioned before. You can refer to this article for a better understanding. If you want to predict family member relationships, you can tag your data accordingly by adding a new entity 'FamilyMember'.