Modifying a Document in spaCy Using Python
#I won't go through the installation process in this article; I want to go straight to the point,
#but there are a few things I would like you to always keep in mind.
#In this part I will be talking about how to remove text from a Doc object.
#Doc - Document
#spaCy's 'Doc' object works with word and sentence objects (tokens and spans).
#The Doc object's text is immutable (it can't be changed), so we can't just go around modifying
#any text in the doc without following some standard procedure, i.e. a rule-based approach (I know you are building an expert system, smiles).
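#A minimal sketch of that immutability (assumes the "en_core_web_sm" model is installed;
#the exact error message may vary by spaCy version):
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello World")
try:
    doc[0].text = "Goodbye"  #Token.text is a read-only property
except AttributeError as err:
    print(err)  #e.g. attribute 'text' of 'spacy.tokens.token.Token' objects is not writable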
#Here are some approaches to follow:
#1. Add your custom component so it runs before statistical pipeline components like the ner or tagger:
nlp = spacy.load("en_core_web_sm")
doc = nlp("your text")
#Create your function
def custom_component(doc):
    # Do something to the doc here
    return doc
#Add the function to the pipeline
nlp.add_pipe(custom_component)  #this will add the function to the end by default
#(Note: this is the spaCy v2 API; in v3 you register the function with @Language.component and pass its string name.)
#To add the function first in the pipeline, right after the tokenizer:
nlp.add_pipe(custom_component, first=True)
#Here are some other placement options:
#Argument   Description             Example
#last       If True, add last       nlp.add_pipe(component, last=True)
#first      If True, add first      nlp.add_pipe(component, first=True)
#before     Add before component    nlp.add_pipe(component, before="ner")
#after      Add after component     nlp.add_pipe(component, after="tagger")
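#For instance, slotting a second (hypothetical) component in relative to an existing one;
#a sketch using the same v2 API, where "ner" assumes a pretrained pipeline like en_core_web_sm:
def another_component(doc):
    # placeholder: do something to the doc here
    return doc
nlp.add_pipe(another_component, before="ner")  #run just before named entity recognition
print(nlp.pipe_names)  #verify where the component landed in the pipeline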
#2. Modifying text, i.e. removing words
#There are different ways of removing text:
#Note: remember that after every modification you must always create a new Doc, i.e. a new data structure.
#i. Using a custom extension
from spacy.tokens import Token, Doc, Span

#Using a method extension (the token is passed in automatically; extra arguments are yours)
def get_excluded(token, span: list):
    return token.text in span

Token.set_extension('excluded', method=get_excluded)

def remove_excluded(doc, span: list) -> Doc:
    #keep only the tokens that are not in the excluded list
    kept = [word for word in doc if not word._.excluded(span=span)]
    spaces = []
    for word in kept:
        if word.whitespace_:
            spaces.append(True)
        #if the removed word separated two kept words, add a space
        elif (word.i + 1) < len(doc) and word.nbor(1)._.excluded(span=span):
            spaces.append(True)
        else:
            spaces.append(False)
    #after every modification, build a fresh Doc
    return Doc(doc.vocab, words=[word.text for word in kept], spaces=spaces)

doc = nlp("text")
new_doc = remove_excluded(doc, span=['', '', ''])  #custom words you would like to exclude
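#A quick check of the result (a sketch assuming the code above; the tokenization here is unambiguous):
doc = nlp("Hello cruel World")
print(remove_excluded(doc, span=["cruel"]).text)  # -> "Hello World"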
#ii. Using the Matcher
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
#one dict per token; check spaCy's rule-based matching token attributes to guide you
pattern = [{"TEXT": "Hello"}, {"TEXT": "World"}]
matcher.add("EXCLUDED", [pattern])  #register the pattern with the matcher (spaCy >=2.3 signature)

def remove_matches(doc) -> Doc:
    #the Matcher returns token indices, so doc[start:end] is a span of tokens,
    #i.e. Hello World = doc[0:2]; character offsets are only needed with regex (approach iii)
    matches = matcher(doc)
    excluded = set()
    for match_id, start, end in matches:
        span = doc[start:end]
        excluded.update(word.text for word in span)
    new_words = [word for word in doc if word.text not in excluded]
    spaces = []
    for word in new_words:
        if word.whitespace_:
            spaces.append(True)
        #if the removed word separated two kept words, add a space
        elif (word.i + 1) < len(doc) and word.nbor(1).text in excluded:
            spaces.append(True)
        else:
            spaces.append(False)
    return Doc(doc.vocab, words=[word.text for word in new_words], spaces=spaces)

doc = nlp("Hello World Africa")  #example; you can also loop over text loaded from a txt or csv file
new_doc = remove_matches(doc)
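#Checking the result (assumes the code above; the matched "Hello World" tokens are dropped):
print(new_doc.text)  # -> "Africa"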
#iii. Regex, Doc.retokenize, and a new data structure
#I prefer this method because of the flexibility of modifying any doc, span, or token inside the Doc's retokenize context.
import spacy
import re
from spacy.tokens import Doc

#Matching over the full text: use Python regex
notify_pattern = r'@(\w+|\W+)[:]?'
hashtags_pattern = r'#\w*[a-zA-Z]+\w*'
retweet_pattern = r'(RT)+[:]?'
#combine the three patterns so a single pass catches all of them
pattern = '|'.join([notify_pattern, hashtags_pattern, retweet_pattern])
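#A quick look at what these patterns capture on a sample tweet (pure re, no spaCy;
#the sample text is made up for illustration):
sample = "RT @user: loving the #spacy library"
print(re.findall(notify_pattern, sample))    # -> ['user'] (findall returns the captured group)
print(re.findall(hashtags_pattern, sample))  # -> ['#spacy']
print(re.findall(retweet_pattern, sample))   # -> ['RT']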
def extraction(doc, span) -> Doc:
    #keep the words from the merged doc that are not among the matched span words
    real_data = [word for word in doc if word.text not in span]
    spaces = []
    for word in real_data:
        if word.whitespace_:
            spaces.append(True)
        #still checking the merged doc: if the neighbour's text is one of the span words
        elif (word.i + 1) < len(doc) and word.nbor(1).text in span:
            #if the removed word separated two words, add a space
            spaces.append(True)
        else:
            spaces.append(False)
    #return a new doc
    return Doc(doc.vocab, words=[word.text for word in real_data], spaces=spaces)
def clean(doc: Doc) -> Doc:
    #merge every span matching the patterns, so each match becomes a single token
    with doc.retokenize() as retokenizer:
        for match in re.finditer(pattern, doc.text):
            start, end = match.span()  #character offsets, i.e. "Hello world" => doc.text[0:11]
            #char_span maps character offsets back to tokens; it returns None
            #if the offsets don't line up with token boundaries
            char_span = doc.char_span(start, end)
            if char_span is not None:
                retokenizer.merge(char_span)
    #after combining the words, extract all matched words so as to remove them
    return extraction(doc, span=[match.group() for match in re.finditer(pattern, doc.text)])

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(clean, first=True)  #clean runs right after the tokenizer, before the statistical components
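#A usage sketch for the whole pipeline (assumes the code above, the spaCy v2 add_pipe API,
#and the default English tokenizer; the tweet text is made up for illustration):
doc = nlp("RT @user: loving the #spacy library")
print(doc.text)  # -> "loving the library" (mention, hashtag, and RT marker stripped)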
#This approach helps you modify the doc easily.