Skip to content

Instantly share code, notes, and snippets.

@karthikavijayanexpts
Created May 31, 2022 09:56
Show Gist options
  • Save karthikavijayanexpts/f40515465653405e256a042dc2605f56 to your computer and use it in GitHub Desktop.
Save karthikavijayanexpts/f40515465653405e256a042dc2605f56 to your computer and use it in GitHub Desktop.
Splitting compound sentences into a set of simple sentences using the spaCy package
import spacy
# Load the spaCy pipeline named 'en_core_web_md' once, at import time;
# compound_to_simple() reuses this shared object on every call.
nlp = spacy.load('en_core_web_md')
def compound_to_simple(sentence):
    """Split a compound sentence into a list of simpler clause strings.

    The sentence is parsed with the module-level ``nlp`` pipeline. One clause
    is built around the dependency ROOT and around every other token tagged
    ``VERB`` that has fewer than three ancestors in the parse tree. For each
    of those verbs the clause runs from its left-most direct child up to, but
    not including, its right-most direct child (children that are themselves
    clause verbs are ignored).

    Args:
        sentence: Text to parse and split.

    Returns:
        A list of clause texts ordered by their position in the sentence.
        The list is empty when the parse contains no ROOT token (for example
        an empty string) or when no verb has at least two qualifying
        children at different positions.
    """
    doc = nlp(sentence)

    # When the parse has several ROOT tokens, the last one is used.
    roots = [token for token in doc if token.dep_ == "ROOT"]
    if not roots:
        # Nothing to anchor a clause on (e.g. empty input); without this
        # guard the code below would dereference None.
        return []
    root_token = roots[-1]

    # Additional clause heads: verbs close to the top of the dependency tree.
    other_verbs = [
        token
        for token in doc
        if token.pos_ == "VERB"
        and token != root_token
        and len(list(token.ancestors)) < 3
    ]

    clause_verbs = [root_token] + other_verbs
    clause_verb_indices = {verb.i for verb in clause_verbs}

    clauses = []
    for verb in clause_verbs:
        # Only direct children are considered, and children that head their
        # own clause are skipped so clauses do not swallow each other.
        child_indices = [
            child.i
            for child in verb.children
            if child.i not in clause_verb_indices
        ]
        if not child_indices:
            continue
        start = min(child_indices)
        end = max(child_indices)
        if start < end:
            # The slice end is exclusive, so the right-most child itself is
            # left out of the clause.
            # NOTE(review): this looks intended to drop a trailing
            # conjunction or punctuation mark -- confirm before changing.
            clauses.append(doc[start:end])

    # Restore sentence order; clauses were collected root-first.
    clauses.sort(key=lambda span: span.start)
    return [span.text for span in clauses]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment