Modifying a Document in spaCy Using Python
#I won't go through the installation process in this article; I want to go straight to the point,
#but there are a few things I would like you to always keep in mind.
#In this part I will be talking about how to remove text from a Doc object.
#Doc - Document
#spaCy's 'Doc' object works with word and sentence objects (tokens and spans).
#The Doc object's text is immutable (it can't be changed), so we can't just go around modifying
#any text in the doc without following some standard procedure, i.e. a rule-based approach (I know you are building an expert system, smiles).
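#A minimal sketch of that immutability (assumes the "en_core_web_sm" model is installed;
#the exact error message may vary by spaCy version):
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello World")
try:
    doc[0].text = "Goodbye"  #Token.text is a read-only property
except AttributeError as err:
    print(err)  #e.g. attribute 'text' of 'spacy.tokens.token.Token' objects is not writable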
#Here are some approaches to follow:
#1. Add your custom component so it runs before statistical pipeline components like the ner or tagger:
nlp = spacy.load("en_core_web_sm")
doc = nlp("your text")
#Create your function
def custom_component(doc):
    # Do something to the doc here
    return doc
#Add the function to the pipeline
nlp.add_pipe(custom_component)  #this will add the function to the end by default
#(Note: this is the spaCy v2 API; in v3 you register the function with @Language.component and pass its string name.)
#To add the function first in the pipeline, right after the tokenizer:
nlp.add_pipe(custom_component, first=True)
#Here are some other placement options:
#Argument   Description             Example
#last       If True, add last       nlp.add_pipe(component, last=True)
#first      If True, add first      nlp.add_pipe(component, first=True)
#before     Add before component    nlp.add_pipe(component, before="ner")
#after      Add after component     nlp.add_pipe(component, after="tagger")
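#For instance, slotting a second (hypothetical) component in relative to an existing one;
#a sketch using the same v2 API, where "ner" assumes a pretrained pipeline like en_core_web_sm:
def another_component(doc):
    # placeholder: do something to the doc here
    return doc
nlp.add_pipe(another_component, before="ner")  #run just before named entity recognition
print(nlp.pipe_names)  #verify where the component landed in the pipeline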
#2. Modifying text, i.e. removing words
#There are different ways of removing text:
#Note: remember that after every modification you must always create a new Doc, i.e. a new data structure.
#i. Using a custom extension
from spacy.tokens import Token, Doc, Span

#Using a method extension (the token is passed in automatically; extra arguments are yours)
def get_excluded(token, span: list):
    return token.text in span

Token.set_extension('excluded', method=get_excluded)

def remove_excluded(doc, span: list) -> Doc:
    #keep only the tokens that are not in the excluded list
    kept = [word for word in doc if not word._.excluded(span=span)]
    spaces = []
    for word in kept:
        if word.whitespace_:
            spaces.append(True)
        #if the removed word separated two kept words, add a space
        elif (word.i + 1) < len(doc) and word.nbor(1)._.excluded(span=span):
            spaces.append(True)
        else:
            spaces.append(False)
    #after every modification, build a fresh Doc
    return Doc(doc.vocab, words=[word.text for word in kept], spaces=spaces)

doc = nlp("text")
new_doc = remove_excluded(doc, span=['', '', ''])  #custom words you would like to exclude
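#A quick check of the result (a sketch assuming the code above; the tokenization here is unambiguous):
doc = nlp("Hello cruel World")
print(remove_excluded(doc, span=["cruel"]).text)  # -> "Hello World"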
#ii. Using the Matcher
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
#one dict per token; check spaCy's rule-based matching token attributes to guide you
pattern = [{"TEXT": "Hello"}, {"TEXT": "World"}]
matcher.add("EXCLUDED", [pattern])  #register the pattern with the matcher (spaCy >=2.3 signature)

def remove_matches(doc) -> Doc:
    #the Matcher returns token indices, so doc[start:end] is a span of tokens,
    #i.e. Hello World = doc[0:2]; character offsets are only needed with regex (approach iii)
    matches = matcher(doc)
    excluded = set()
    for match_id, start, end in matches:
        span = doc[start:end]
        excluded.update(word.text for word in span)
    new_words = [word for word in doc if word.text not in excluded]
    spaces = []
    for word in new_words:
        if word.whitespace_:
            spaces.append(True)
        #if the removed word separated two kept words, add a space
        elif (word.i + 1) < len(doc) and word.nbor(1).text in excluded:
            spaces.append(True)
        else:
            spaces.append(False)
    return Doc(doc.vocab, words=[word.text for word in new_words], spaces=spaces)

doc = nlp("Hello World Africa")  #example; you can also loop over text loaded from a txt or csv file
new_doc = remove_matches(doc)
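#Checking the result (assumes the code above; the matched "Hello World" tokens are dropped):
print(new_doc.text)  # -> "Africa"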
#iii. Regex, Doc.retokenize, and a new data structure
#I prefer this method because of the flexibility of modifying any doc, span, or token inside the Doc's retokenize context.
import spacy
import re
from spacy.tokens import Doc

#Matching over the full text: use Python regex
notify_pattern = r'@(\w+|\W+)[:]?'
hashtags_pattern = r'#\w*[a-zA-Z]+\w*'
retweet_pattern = r'(RT)+[:]?'
#combine the three patterns so a single pass catches all of them
pattern = '|'.join([notify_pattern, hashtags_pattern, retweet_pattern])
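#A quick look at what these patterns capture on a sample tweet (pure re, no spaCy;
#the sample text is made up for illustration):
sample = "RT @user: loving the #spacy library"
print(re.findall(notify_pattern, sample))    # -> ['user'] (findall returns the captured group)
print(re.findall(hashtags_pattern, sample))  # -> ['#spacy']
print(re.findall(retweet_pattern, sample))   # -> ['RT']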
def extraction(doc, span) -> Doc:
    #keep the words from the merged doc that are not among the matched span words
    real_data = [word for word in doc if word.text not in span]
    spaces = []
    for word in real_data:
        if word.whitespace_:
            spaces.append(True)
        #still checking the merged doc: if the neighbour's text is one of the span words
        elif (word.i + 1) < len(doc) and word.nbor(1).text in span:
            #if the removed word separated two words, add a space
            spaces.append(True)
        else:
            spaces.append(False)
    #return a new doc
    return Doc(doc.vocab, words=[word.text for word in real_data], spaces=spaces)
def clean(doc: Doc) -> Doc:
    #merge every span matching the patterns, so each match becomes a single token
    with doc.retokenize() as retokenizer:
        for match in re.finditer(pattern, doc.text):
            start, end = match.span()  #character offsets, i.e. "Hello world" => doc.text[0:11]
            #char_span maps character offsets back to tokens; it returns None
            #if the offsets don't line up with token boundaries
            char_span = doc.char_span(start, end)
            if char_span is not None:
                retokenizer.merge(char_span)
    #after combining the words, extract all matched words so as to remove them
    return extraction(doc, span=[match.group() for match in re.finditer(pattern, doc.text)])

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(clean, first=True)  #clean runs right after the tokenizer, before the statistical components
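#A usage sketch for the whole pipeline (assumes the code above, the spaCy v2 add_pipe API,
#and the default English tokenizer; the tweet text is made up for illustration):
doc = nlp("RT @user: loving the #spacy library")
print(doc.text)  # -> "loving the library" (mention, hashtag, and RT marker stripped)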
#This approach helps you modify the doc easily.