Created
May 31, 2022 09:56
-
-
Save karthikavijayanexpts/f40515465653405e256a042dc2605f56 to your computer and use it in GitHub Desktop.
Splitting compound sentences into a set of simple sentences using the spaCy package
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy

# Load the 'en_core_web_md' spaCy pipeline once at import time so that
# compound_to_simple() can reuse the same object on every call.
nlp = spacy.load('en_core_web_md')
def compound_to_simple(sentence):
    """Split a compound sentence into the text of its verb-centred clauses.

    The sentence is parsed with the module-level ``nlp`` pipeline. The
    dependency ROOT token plus every other ``VERB`` token that has fewer
    than three ancestors are treated as clause heads. For each head, a
    clause is the token range bounded by the head's leftmost and rightmost
    direct children, ignoring children that are themselves clause heads.

    Args:
        sentence: Text to analyse.

    Returns:
        A list of clause strings ordered by where each clause starts in
        the sentence. The list is empty when the parse contains no ROOT
        token (for example, for an empty string).
    """
    doc = nlp(sentence)

    # If several tokens are labelled ROOT, the last one is used.
    root_token = None
    for token in doc:
        if token.dep_ == "ROOT":
            root_token = token

    # Without a ROOT there is no clause head to start from. Bail out here
    # instead of putting None into the head list and failing on
    # ``None.children`` further down.
    if root_token is None:
        return []

    # Shallow verbs only: a verb nested three or more levels below the
    # root is not treated as the head of its own clause.
    other_verbs = [
        token
        for token in doc
        if token.pos_ == "VERB"
        and len(list(token.ancestors)) < 3
        and token != root_token
    ]
    all_verbs = [root_token] + other_verbs

    # For each head, record the indices of its outermost non-head children.
    # The defaults reproduce the "no usable children" sentinel values
    # (len(doc), 0), which the start < end filter below rejects.
    token_spans = []
    for verb in all_verbs:
        child_indices = [
            child.i for child in verb.children if child not in all_verbs
        ]
        first_token_index = min(child_indices, default=len(doc))
        last_token_index = max(child_indices, default=0)
        token_spans.append((first_token_index, last_token_index))

    # The slice is end-exclusive, so the rightmost child token itself is
    # not part of the clause text, and a head with fewer than two distinct
    # child positions yields no clause at all.
    # NOTE(review): unclear whether dropping the rightmost child is
    # intentional; kept as is so existing output does not change.
    sentence_clauses = [
        doc[start:end] for start, end in token_spans if start < end
    ]

    # Present the clauses in reading order.
    sentence_clauses.sort(key=lambda span: span.start)
    return [clause.text for clause in sentence_clauses]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment