Skip to content

Instantly share code, notes, and snippets.

@wilmeragsgh
Created November 3, 2022 13:19
Show Gist options
  • Save wilmeragsgh/588590da2fff4a87bb021aa0ae7792f6 to your computer and use it in GitHub Desktop.
Save wilmeragsgh/588590da2fff4a87bb021aa0ae7792f6 to your computer and use it in GitHub Desktop.
Generate training data for spaCy from a keyword list
# Assumes `training_data` is a list of [raw_text, [(start_char, end_char, "LABEL"), ...]] items,
# where the inner list may hold several labelled spans for the same raw_text.
# `lang` and `train_size` are also expected to be defined before this code runs.
import spacy
from spacy.tokens import DocBin
def _build_doc_bin(nlp, examples, errors):
    """Convert character-offset annotations into a spaCy ``DocBin``.

    Args:
        nlp: spaCy pipeline used to tokenize each text.
        examples: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is an iterable of
            ``(start_char, end_char, label)`` tuples.
        errors: List that receives every text that was skipped because at
            least one of its annotations could not be turned into a span.

    Returns:
        A ``DocBin`` with one ``Doc`` per successfully converted text.
    """
    doc_bin = DocBin()
    for text, annotations in examples:
        doc = nlp(text)
        spans = [
            doc.char_span(start, end, label=label)
            for start, end, label in annotations
        ]
        if any(span is None for span in spans):
            # char_span() yields None when the offsets do not line up with
            # token boundaries. Skip the whole text (and report it) rather
            # than store a doc with a silently missing entity.
            errors.append(text)
            continue
        # doc.ents rejects overlapping spans; filter_spans keeps a
        # non-overlapping subset instead of discarding all but one entity.
        doc.ents = spacy.util.filter_spans(spans)
        doc_bin.add(doc)
    return doc_bin


nlp = spacy.blank(lang)  # lang refers to a spaCy language code, e.g. "en", "es", ...
errors = []  # texts skipped in either split because a span could not be aligned

# Training split: the first `train_size` examples.
db = _build_doc_bin(nlp, training_data[:train_size], errors)
db.to_disk("./train.spacy")

# Dev split: the remaining examples, tokenized with the same `lang` pipeline
# as the training split so both sets are segmented consistently.
db = _build_doc_bin(nlp, training_data[train_size:], errors)
db.to_disk("./dev.spacy")
%%sh
# Download a pretrained pipeline package.
python -m spacy download en_core_web_lg # or the spacy language model of preference
# Write an NER-only training config to config.cfg.
# NOTE(review): --lang is fixed to "en" here; keep it in sync with the language
# used when train.spacy / dev.spacy were built.
python -m spacy init config config.cfg \
    --lang en \
    --pipeline ner \
    --optimize accuracy
# Train using the generated config; output goes to ./trained_model.
python -m spacy train config.cfg \
    --paths.train ./train.spacy \
    --paths.dev ./dev.spacy \
    --output ./trained_model
%%sh
# Evaluate trained_model/model-best against dev.spacy.
python -m spacy evaluate \
    trained_model/model-best \
    dev.spacy
### loading the model
import spacy

nlp = spacy.load("./trained_model/model-best")
# nlp.pipe() is lazy: it returns a generator and processes nothing until it is
# consumed. Materialize it so inference actually runs and the results are kept.
docs = list(nlp.pipe(documents))  # inference over new documents
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment