Created
January 24, 2025 16:35
-
-
Save ninapavlich/78ce925656a3e5f4e28a251f86e618e8 to your computer and use it in GitHub Desktop.
Splits paragraphs into sentences, handling certain edge cases
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Based off snippet from: https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences | |
/* | |
Example outputs: | |
Handles ambiguous abbreviation and sentence ending:
console.log(splitParagraphIntoSentences('This is a sentence. Mr. Smith went to N.Y.C. He said, "Hello!"')) | |
[ | |
"This is a sentence.", | |
"Mr. Smith went to N.Y.C.", | |
'He said, "Hello!"', | |
] | |
Handles short quote: | |
console.log(splitParagraphIntoSentences('And then he said "whoa!" because he was shocked.')) | |
[ | |
'And then he said "whoa!"', | |
"because he was shocked." | |
] | |
Handles numbers within a sentence: | |
console.log(splitParagraphIntoSentences("This is a long string with some numbers [125.000,55 and 140.000] and an end. This is another sentence.")) | |
[ | |
"This is a long string with some numbers [125.000,55 and 140.000] and an end.", | |
"This is another sentence." | |
] | |
Handles no punctuation:
console.log(splitParagraphIntoSentences("Just some text with no punctuation"))
[
"Just some text with no punctuation"
]
Splits title with abbreviation: | |
console.log(splitParagraphIntoSentences("Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience")) | |
[ | |
"Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience" | |
] | |
Splits multiple paragraphs with quotes: | |
console.log(splitParagraphIntoSentences(`“I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,” she said. “I was sleeping in the dining room so the kids could have the bedrooms. I never had any privacy at all. I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent.”`)) | |
[ | |
'"I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,"', | |
"she said.", | |
'"I was sleeping in the dining room so the kids could have the bedrooms."', | |
'"I never had any privacy at all."', | |
'"I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent."', | |
] | |
*/ | |
/**
 * Splits a paragraph into an array of trimmed sentences.
 *
 * The paragraph is first annotated by `markSentenceBreaks`, then cut at every
 * `<stop>` marker. Each piece is trimmed, a single empty trailing piece is
 * dropped, and any piece that is exactly `".` is discarded.
 *
 * @param paragraph - The text to split.
 * @returns The sentences in their original order.
 */
export const splitParagraphIntoSentences = (paragraph: string): string[] => {
  const pieces = markSentenceBreaks(paragraph)
    .split("<stop>")
    .map((piece) => piece.trim());

  // Only the final piece is checked for emptiness; empty pieces elsewhere
  // are kept as they are.
  const endsWithBlank =
    pieces.length > 0 && pieces[pieces.length - 1] === "";
  const kept = endsWithBlank ? pieces.slice(0, -1) : pieces;

  // Drop pieces that consist solely of a straight quote followed by a period.
  return kept.filter((piece) => piece !== '".');
};
/**
 * Annotates `paragraph` with a `<stop>` marker after every detected sentence
 * ending.
 *
 * Periods that do not end a sentence (titles such as "Mr.", web domains,
 * decimal numbers, ellipses, initials, acronyms, company suffixes) are
 * temporarily rewritten to `<prd>`. Every remaining `.`, `?` and `!` then gets
 * `<stop>` appended, and finally `<prd>` is turned back into `.`.
 *
 * Quoted passages (straight or curly double quotes) are replaced by a
 * `<quote>` placeholder before any of that happens. Each one is processed
 * recursively afterwards, and every sentence found inside it is emitted
 * wrapped in straight double quotes and followed by `<stop>`.
 *
 * The returned string is padded with one leading and one trailing space, and
 * newlines are replaced by spaces.
 *
 * Limitation: the literal strings `<quote>`, `<prd>` and `<stop>` are used as
 * in-band markers, so input that already contains them is not handled
 * faithfully. A `<quote>` placeholder with no captured quotation left to
 * substitute is passed through unchanged instead of throwing.
 *
 * @param paragraph - The text to annotate.
 * @returns The annotated text, ready to be split on `<stop>`.
 */
const markSentenceBreaks = (paragraph: string): string => {
  const alphabets = "([A-Za-z])";
  const prefixes = "(Mr|St|Mrs|Ms|Dr)[.]";
  const suffixes = "(Inc|Ltd|Jr|Sr|Co)";
  const starters =
    "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)";
  const acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)";
  const websites = "[.](com|net|org|io|gov|edu|me)";
  const digits = "([0-9])";
  const multipleDots = "\\.{2,}";
  const insideQuotation = /(?:"|“)((?:\\.|[^"\\])*?)(?:"|”)/gm;

  // Swap every quoted passage for a placeholder and remember its inner text
  // in order of appearance; the passages are handled recursively at the end.
  const quotedSegments: string[] = [];
  let text = paragraph.replace(insideQuotation, (_whole, inner: string) => {
    quotedSegments.push(inner);
    return "<quote>";
  });

  // Pad so that patterns anchored on a surrounding space also match at the
  // very start and end of the text.
  text = " " + text + " ";
  // Replace newlines
  text = text.replace(/\n/g, " ");
  // Flag periods from common prefixes
  text = text.replace(new RegExp(prefixes, "g"), "$1<prd>");
  // Flag periods from websites
  text = text.replace(new RegExp(websites, "g"), "<prd>$1");
  // Flag periods from numeric values
  text = text.replace(new RegExp(digits + "[.]" + digits, "g"), "$1<prd>$2");
  // Flag every period of a run of two or more dots, then end the sentence
  text = text.replace(
    new RegExp(multipleDots, "g"),
    (dots) => "<prd>".repeat(dots.length) + "<stop>",
  );
  // Uncommon prefix
  text = text.replace(/Ph\.D\./g, "Ph<prd>D<prd>");
  // Single-letter initial surrounded by whitespace, e.g. " J. "
  text = text.replace(new RegExp("\\s" + alphabets + "[.] ", "g"), " $1<prd> ");
  // An acronym followed by a typical sentence starter ends the sentence
  text = text.replace(
    new RegExp(acronyms + " " + starters, "g"),
    "$1<stop> $2",
  );
  // Three-letter, then two-letter dotted abbreviations
  text = text.replace(
    new RegExp(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "g"),
    "$1<prd>$2<prd>$3<prd>",
  );
  text = text.replace(
    new RegExp(alphabets + "[.]" + alphabets + "[.]", "g"),
    "$1<prd>$2<prd>",
  );
  // A company/name suffix followed by a sentence starter ends the sentence;
  // any other suffix period is flagged as a non-terminal period.
  text = text.replace(
    new RegExp(" " + suffixes + "[.] " + starters, "g"),
    " $1<stop> $2",
  );
  text = text.replace(new RegExp(" " + suffixes + "[.]", "g"), " $1<prd>");
  // Remaining single-letter abbreviation preceded by a space
  text = text.replace(new RegExp(" " + alphabets + "[.]", "g"), " $1<prd>");

  // Quote characters still present here were not consumed as part of a
  // quoted passage above. Append a period so they terminate a sentence.
  text = text.replace(/”/g, "”.");
  text = text.replace(/"(?:\s|$)/g, '".');
  // NOTE(review): the previous replacement rewrites every straight quote that
  // is followed by whitespace, so these two patterns cannot match at this
  // point. They are kept so the output stays identical; confirm the intended
  // order before changing them.
  text = text.replace(/!"\s/g, '"!');
  text = text.replace(/\?"\s/g, '"?');

  // Mark every remaining terminator, then restore the protected periods.
  text = text.replace(/\./g, ".<stop>");
  text = text.replace(/\?/g, "?<stop>");
  text = text.replace(/!/g, "!<stop>");
  text = text.replace(/<prd>/g, ".");

  // Recursively mark sentences inside each quoted passage.
  let nextQuote = 0;
  return text.replace(/<quote>/g, (placeholder) => {
    const inner = quotedSegments[nextQuote++];
    // A placeholder without a captured passage came from the input itself.
    if (inner === undefined) return placeholder;
    return markSentenceBreaks(inner)
      .split("<stop>")
      .filter((sentence) => !!sentence.trim())
      .map((sentence) => `"${sentence.trim()}"<stop>`)
      .join("");
  });
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment