Created
November 3, 2022 13:19
-
-
Save wilmeragsgh/588590da2fff4a87bb021aa0ae7792f6 to your computer and use it in GitHub Desktop.
Generate training data for spaCy from a keyword list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the spaCy training corpus (train.spacy) from character-offset annotations.
#
# Assumes `training_data` is a list of [raw_text, [(start_char, end_char, "LABEL"), ...]]
# where the inner list may hold multiple labels per raw_text, and that `lang`
# and `train_size` are defined upstream — TODO confirm both exist before running.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank(lang)  # `lang` is a spaCy language code, e.g. "en", "es", ...
db = DocBin()
errors = []  # raw texts whose annotations could not be fully converted
for text, annotations in training_data[:train_size]:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations:
        span = doc.char_span(start, end, label=label)
        if span is None:
            # Character offsets do not align with token boundaries.
            # The original code appended None here and later caught the
            # resulting TypeError, silently dropping the whole document;
            # instead, skip just this span and record the text.
            errors.append(text)
        else:
            ents.append(span)
    try:
        doc.ents = ents
        db.add(doc)
    except ValueError:
        # Presumably overlapping spans — spaCy rejects overlapping entities.
        # Keep only the last span, matching the original fallback behavior.
        doc.ents = [ents[-1]] if ents else []
        db.add(doc)
db.to_disk("./train.spacy")
### Build the spaCy dev corpus (dev.spacy) from the remainder of training_data.
# Mirrors the training-set cell above: `training_data`, `train_size`, and the
# `errors` list from the earlier cell must already be defined — TODO confirm.
nlp = spacy.blank("en")
db = DocBin()
for text, annotations in training_data[train_size:]:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations:
        span = doc.char_span(start, end, label=label)
        if span is None:
            # Offsets that don't align with token boundaries: skip just this
            # span and record the text, rather than letting `doc.ents = ents`
            # fail with TypeError and dropping the whole document.
            errors.append(text)
        else:
            ents.append(span)
    try:
        doc.ents = ents
        db.add(doc)
    except ValueError:
        # Presumably overlapping spans; keep only the last one, as the
        # original fallback did.
        doc.ents = [ents[-1]] if ents else []
        db.add(doc)
db.to_disk("./dev.spacy")
%%sh
# Download a pretrained model, generate a NER training config, and train.
# (Trailing "| |" table residue from the page scrape removed — it made
# each line an invalid shell pipeline.)
python -m spacy download en_core_web_lg  # or the spaCy language model of preference
python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output ./trained_model
%%sh
# Score the best trained checkpoint against the held-out dev set.
# (Trailing "| |" scrape residue removed — it broke the command line.)
python -m spacy evaluate trained_model/model-best dev.spacy
### loading the model
import spacy

nlp = spacy.load("./trained_model/model-best")
# nlp.pipe returns a lazy generator: the original discarded it, so no
# inference actually ran. Materialize it so the documents are processed.
# `documents` is assumed to be an iterable of str — TODO confirm upstream.
docs = list(nlp.pipe(documents))  # inference over new documents
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment