gregjan · April 18, 2023 19:13
diff --git a/sentences.py b/sentences.py
 from bs4 import BeautifulSoup
 import spacy
 import stanza
 import re
 # English Stanza pipeline; the constituency processor is loaded from the
 # 'ptb3_bert' package.
 nlp = stanza.Pipeline(
     lang='en',
     processors='tokenize,pos,lemma,depparse,constituency',
     package={'constituency': 'ptb3_bert'},
 )

 # Single hard-coded input document for now.
 # Change to a loop through of whatever the document storage is.
 HTMLFile = open("AgentOrange.html", "r")
 # Change this to whatever string search you're using.
 x = "problems"   # FIXME: Not currently used..

 # Captures each leaf token of a bracketed parse string such as
 # "(NP (DT the) (NN dog))": a space, the word characters, then ")".
 # Written as a raw string so "\w" and "\)" reach the regex engine directly
 # instead of relying on Python passing unknown string escapes through
 # (which triggers an invalid-escape warning).
 # NOTE: only tokens made up entirely of word characters are captured; a
 # token such as "well-known" or "." produces no match.
 word_pattern = re.compile(r" (\w*)\)")

 # Constituent labels to report -- whatever labels as appropriate.
 # See http://surdeanu.cs.arizona.edu/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html
 phrase_labels = ["VP", "NP", "S"]

 def generatePhrases(tree):
     """Yield the words of every subtree of ``tree`` whose label is in ``phrase_labels``.

     The tree is walked depth-first, parent before children, so an enclosing
     phrase is yielded before the phrases nested inside it.

     Args:
         tree: a parse-tree node exposing ``label`` and ``children``
             (the caller passes ``sentence.constituency``).

     Yields:
         str: the tokens found by ``word_pattern`` in ``str(subtree)``,
         joined with single spaces.
     """
     children = list(tree.children)
     if not children:  # leaf node: no phrase below this point
         return
     # Test this node's own label. Looking at children[0].label instead only
     # reported a subtree when its first child happened to be a phrase, so a
     # tree like (NP (DT the) (NN dog)) was never yielded.
     if str(tree.label) in phrase_labels:
         yield " ".join(word_pattern.findall(str(tree)))
     # Keep descending after a match so nested phrases are reported as well.
     for child in children:
         yield from generatePhrases(child)

 # Reading the file
 doc = BeautifulSoup(HTMLFile, 'html.parser')
 # The markup has been parsed into `doc`, so release the file handle now
 # rather than leaving it open for the rest of the run.
 HTMLFile.close()

 # Parse each <p> element, print every sentence's constituency tree and then
 # the phrases extracted from it.
 for pidx, para in enumerate(doc.find_all("p")):
     data = nlp(para.text)
     for sidx, sentence in enumerate(data.sentences):
         tree = sentence.constituency
         print(tree)
         for phrase in generatePhrases(tree):
             print("para %s sent %s phrase: " % (pidx, sidx) + str(phrase))
     # Stop once the paragraph with index 7 (the eighth) has been processed.
     if pidx > 6: break
	from bs4 import BeautifulSoup
	import spacy
	import stanza
	import re
	# English Stanza pipeline; the constituency processor is loaded from the
	# 'ptb3_bert' package.
	nlp = stanza.Pipeline(
	    lang='en',
	    processors='tokenize,pos,lemma,depparse,constituency',
	    package={'constituency': 'ptb3_bert'},
	)

	# Single hard-coded input document for now.
	# Change to a loop through of whatever the document storage is.
	HTMLFile = open("AgentOrange.html", "r")
	# Change this to whatever string search you're using.
	x = "problems"   # FIXME: Not currently used..

	# Captures each leaf token of a bracketed parse string such as
	# "(NP (DT the) (NN dog))": a space, the word characters, then ")".
	# Written as a raw string so "\w" and "\)" reach the regex engine directly
	# instead of relying on Python passing unknown string escapes through
	# (which triggers an invalid-escape warning).
	# NOTE: only tokens made up entirely of word characters are captured; a
	# token such as "well-known" or "." produces no match.
	word_pattern = re.compile(r" (\w*)\)")

	# Constituent labels to report -- whatever labels as appropriate.
	# See http://surdeanu.cs.arizona.edu/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html
	phrase_labels = ["VP", "NP", "S"]

	def generatePhrases(tree):
	    """Yield the words of every subtree of ``tree`` whose label is in ``phrase_labels``.

	    The tree is walked depth-first, parent before children, so an enclosing
	    phrase is yielded before the phrases nested inside it.

	    Args:
	        tree: a parse-tree node exposing ``label`` and ``children``
	            (the caller passes ``sentence.constituency``).

	    Yields:
	        str: the tokens found by ``word_pattern`` in ``str(subtree)``,
	        joined with single spaces.
	    """
	    children = list(tree.children)
	    if not children:  # leaf node: no phrase below this point
	        return
	    # Test this node's own label. Looking at children[0].label instead only
	    # reported a subtree when its first child happened to be a phrase, so a
	    # tree like (NP (DT the) (NN dog)) was never yielded.
	    if str(tree.label) in phrase_labels:
	        yield " ".join(word_pattern.findall(str(tree)))
	    # Keep descending after a match so nested phrases are reported as well.
	    for child in children:
	        yield from generatePhrases(child)

	# Reading the file
	doc = BeautifulSoup(HTMLFile, 'html.parser')
	# The markup has been parsed into `doc`, so release the file handle now
	# rather than leaving it open for the rest of the run.
	HTMLFile.close()

	# Parse each <p> element, print every sentence's constituency tree and then
	# the phrases extracted from it.
	for pidx, para in enumerate(doc.find_all("p")):
	    data = nlp(para.text)
	    for sidx, sentence in enumerate(data.sentences):
	        tree = sentence.constituency
	        print(tree)
	        for phrase in generatePhrases(tree):
	            print("para %s sent %s phrase: " % (pidx, sidx) + str(phrase))
	    # Stop once the paragraph with index 7 (the eighth) has been processed.
	    if pidx > 6: break