# Modifying a Document in spaCy Using Python
# I won't go through the installation process in this article; I want to go straight
# to the point, but there are a few things I would like you to always keep in mind.
# In this part, I will be talking about how to remove text from a Doc object.
# Doc - Document
# A spaCy document ('Doc') works with word and sentence objects.
# A Doc object's text is immutable (it can't be changed), so we can't just go around
# modifying any text in the doc without following a standard procedure, i.e. a
# rule-based approach (I know you are building an expert system, smiles).
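# A minimal sketch of that immutability (assumed example text; no trained model needed):
import spacy
nlp = spacy.blank("en")
doc = nlp("Hello World")
try:
    doc[0].text = "Goodbye"  # Token.text is read-only, so this raises
except AttributeError as err:
    print(err)  # the 'text' attribute of Token objects is not writable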
# Here are some approaches to follow:
# 1. Run your code before the statistical pipeline components like ner and tagger:
nlp = spacy.load("en_core_web_sm")
doc = nlp("your text")
# Create your function
def custom_component(doc):
    # Do something to the doc here
    return doc
# Add the function to the pipeline (spaCy v2 API: pass the function itself;
# in v3 you register it with @Language.component and add it by name)
nlp.add_pipe(custom_component)  # this will add the function at the end by default
# To add the function as the first pipe after the tokenizer
# (a distinct name avoids a duplicate-component error):
nlp.add_pipe(custom_component, name="custom_first", first=True)
# Here are some other tips:
# Argument   Description              Example
# last       If True, add last        nlp.add_pipe(component, last=True)
# first      If True, add first       nlp.add_pipe(component, first=True)
# before     Add before component     nlp.add_pipe(component, before="ner")
# after      Add after component      nlp.add_pipe(component, after="tagger")
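# A quick sketch to check placement (assumes the custom_component defined above):
# nlp.pipe_names lists the pipeline component names in order.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(custom_component, before="ner")
print(nlp.pipe_names)  # e.g. ['tagger', 'parser', 'custom_component', 'ner']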
# 2. Modifying text, i.e. removing it
# There are different ways of removing text.
# Note: remember that after every modification you must always create a new Doc,
# i.e. a new data structure.
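# A minimal sketch of building that new Doc data structure by hand: `words` are
# the token texts, and `spaces` says whether each token is followed by a space.
from spacy.tokens import Doc
new_doc = Doc(nlp.vocab, words=["Hello", "Africa"], spaces=[True, False])
print(new_doc.text)  # "Hello Africa"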
# i. Using a custom extension
from spacy.tokens import Token, Doc
# Using a getter extension. A getter only receives the token, so the words to
# exclude live in a module-level list rather than being passed as an argument.
EXCLUDED = ['', '', '']  # custom words you would like to remove
def get_excluded(token):
    return token.text in EXCLUDED
Token.set_extension('excluded', getter=get_excluded)
doc = nlp("text")
# if a token is excluded, don't add it to the array
kept = [token for token in doc if not token._.excluded]
spaces = []
for token in kept:
    if token.whitespace_:
        spaces.append(True)
    elif (token.i + 1) < len(doc) and token.nbor()._.excluded:
        # the removed word separated two words, so keep a space
        spaces.append(True)
    else:
        spaces.append(False)
new_doc = Doc(doc.vocab, words=[token.text for token in kept], spaces=spaces)
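# Usage sketch (hypothetical exclusion list): with EXCLUDED = ["World"],
# nlp("Hello World Africa") yields a new doc whose text is "Hello Africa".
print(new_doc.text)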
# ii. Using the Matcher
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# check spaCy's rule-based matching token attributes to guide you
pattern = [{"TEXT": "Hello"}, {"TEXT": "World"}]
matcher.add("EXCLUDED", [pattern])
doc = nlp("Hello World Africa")  # example; in practice, loop over lines from a txt/csv
matches = matcher(doc)  # run the matcher over the doc to find each match
excluded = set()
for match_id, start, end in matches:
    span = doc[start:end]  # a span of tokens, i.e. Hello World = doc[0:2]
    # (for character offsets, use doc.char_span(start_char, end_char) instead)
    excluded.update(token.i for token in span)
kept = [token for token in doc if token.i not in excluded]
spaces = []
for token in kept:
    if token.whitespace_:
        spaces.append(True)
    elif (token.i + 1) < len(doc) and token.nbor().i in excluded:
        # the removed span separated two words, so keep a space
        spaces.append(True)
    else:
        spaces.append(False)
new_doc = Doc(doc.vocab, words=[token.text for token in kept], spaces=spaces)
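# Usage sketch: the matched "Hello World" span is dropped, leaving only "Africa".
print(new_doc.text)  # Africa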
# iii. Regex, Doc.retokenize().merge and the Doc data structure
# I prefer this method because of the flexibility of modifying any doc, span or
# token inside the Doc's retokenize context.
import re
import spacy
from spacy.tokens import Doc
# Matching full text: use Python regex
notify_pattern = r'@(\w+|\W+)[:]?'
hashtags_pattern = r'#\w*[a-zA-Z]+\w*'
retweet_pattern = r'(RT)+[:]?'
# combine the three patterns into a single alternation
pattern = "|".join([notify_pattern, hashtags_pattern, retweet_pattern])
def extraction(doc, span) -> Doc:
    # keep only the words from the merged doc that are not among the span words
    real_data = [word for word in doc if word.text not in span]
    spaces = []
    for word in real_data:
        if word.whitespace_:
            spaces.append(True)
        # still checking the merged doc: is the neighbour's text a span word?
        elif (word.i + 1) < len(doc) and word.nbor(1).text in span:
            # if the removed word separated two words, add a space
            spaces.append(True)
        else:
            spaces.append(False)
    # return a new doc
    return Doc(doc.vocab, words=[word.text for word in real_data], spaces=spaces)
def clean(doc: Doc) -> Doc:
    # merge all span patterns, so each match becomes a single token
    with doc.retokenize() as retokenizer:
        for match in re.finditer(pattern, doc.text):
            start, end = match.span()  # character offsets, not token indices
            span = doc.char_span(start, end)  # i.e. "Hello world" => doc.char_span(0, 11)
            if span is not None:
                retokenizer.merge(span)
    # after combining the words: extract all matched words, so as to remove them
    return extraction(doc, span=[match.group() for match in re.finditer(pattern, doc.text)])
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(clean, first=True)  # run before the statistical components (spaCy v2 API)
doc = nlp("your text")  # in practice, loop over rows from a txt/csv
# This approach helps you modify the doc easily.
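# Usage sketch on a hypothetical tweet (exact output depends on the tokenizer,
# but the RT marker, mention and hashtag should all be stripped):
cleaned = nlp("RT @myksao: loving #spacy today")
print(cleaned.text)  # expected: "loving today"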