Skip to content

Instantly share code, notes, and snippets.

@ninapavlich
Created January 24, 2025 16:35
Show Gist options
  • Save ninapavlich/78ce925656a3e5f4e28a251f86e618e8 to your computer and use it in GitHub Desktop.
Save ninapavlich/78ce925656a3e5f4e28a251f86e618e8 to your computer and use it in GitHub Desktop.
Splits paragraphs into sentences, handling certain edge cases
// Based off snippet from: https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
/*
Example outputs:
Handles ambiguous abbreviation and sentence ending:
console.log(splitParagraphIntoSentences('This is a sentence. Mr. Smith went to N.Y.C. He said, "Hello!"'))
[
"This is a sentence.",
"Mr. Smith went to N.Y.C.",
'He said, "Hello!"',
]
Handles short quote:
console.log(splitParagraphIntoSentences('And then he said "whoa!" because he was shocked.'))
[
'And then he said "whoa!"',
"because he was shocked."
]
Handles numbers within a sentence:
console.log(splitParagraphIntoSentences("This is a long string with some numbers [125.000,55 and 140.000] and an end. This is another sentence."))
[
"This is a long string with some numbers [125.000,55 and 140.000] and an end.",
"This is another sentence."
]
Handles no punctuation:
console.log(splitParagraphIntoSentences("Just some text with no punctuation"))
[
"Just some text with no punctuation"
]
Keeps a title containing an abbreviation as a single sentence:
console.log(splitParagraphIntoSentences("Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience"))
[
"Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience"
]
Splits multiple paragraphs with quotes:
console.log(splitParagraphIntoSentences(`“I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,” she said. “I was sleeping in the dining room so the kids could have the bedrooms. I never had any privacy at all. I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent.”`))
[
'"I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,"',
"she said.",
'"I was sleeping in the dining room so the kids could have the bedrooms."',
'"I never had any privacy at all."',
'"I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent."',
]
*/
/**
 * Breaks a paragraph into an array of trimmed sentences.
 *
 * The paragraph is first annotated by `markSentenceBreaks`, then cut at every
 * `<stop>` marker. A trailing empty piece is dropped, and any piece that is
 * exactly `".` is discarded.
 *
 * @param paragraph Text to split.
 * @returns The sentences, in order of appearance.
 */
export const splitParagraphIntoSentences = (paragraph: string): string[] => {
  const pieces = markSentenceBreaks(paragraph)
    .split("<stop>")
    .map((piece) => piece.trim());

  // Only the final piece is checked here: drop it when it is empty.
  if (pieces.length > 0 && pieces[pieces.length - 1] === "") {
    pieces.pop();
  }

  return pieces.filter((piece) => piece !== '".');
};
/**
 * Inserts `<stop>` markers at the sentence boundaries of `paragraph`.
 *
 * Periods that do not end a sentence (titles, single initials, acronyms,
 * decimals, domain suffixes, ellipses) are temporarily rewritten to `<prd>`,
 * so that the final pass can treat every remaining `.`, `?` and `!` as a
 * sentence end. Quoted passages are lifted out first, processed recursively,
 * and spliced back as one straight-quoted chunk per inner sentence.
 *
 * @param paragraph Raw text; may contain newlines and straight or curly
 *   double quotes.
 * @returns The text padded with one leading and one trailing space, newlines
 *   replaced by spaces, and `<stop>` appended after each detected sentence.
 */
const markSentenceBreaks = (paragraph: string): string => {
  const alphabets = "([A-Za-z])";
  const prefixes = "(Mr|St|Mrs|Ms|Dr)[.]";
  const suffixes = "(Inc|Ltd|Jr|Sr|Co)";
  const starters =
    "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)";
  const acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)";
  const websites = "[.](com|net|org|io|gov|edu|me)";
  const digits = "([0-9])";
  const multipleDots = "\\.{2,}";
  // An opening " or \u201C, the shortest run of non-quote characters (a
  // backslash escapes the next character), then a closing " or \u201D.
  // The curly quotes are written as escapes so they cannot be lost by tooling;
  // an empty alternative here would match everywhere and recurse forever.
  const insideQuotation = /(?:"|\u201C)((?:\\.|[^"\\])*?)(?:"|\u201D)/g;

  // Lift quoted passages out; they are handled separately, recursively.
  const quotedSegments: string[] = [];
  let text = paragraph.replace(insideQuotation, (_match, inner: string) => {
    quotedSegments.push(inner);
    return "<quote>";
  });

  // Pad so that patterns anchored on a surrounding space also work at the ends.
  text = " " + text + " ";
  // Replace newlines
  text = text.replace(/\n/g, " ");
  // Flag periods from common prefixes
  text = text.replace(new RegExp(prefixes, "g"), "$1<prd>");
  // Flag periods from websites
  text = text.replace(new RegExp(websites, "g"), "<prd>$1");
  // Flag periods from numeric values
  text = text.replace(new RegExp(digits + "[.]" + digits, "g"), "$1<prd>$2");
  // An ellipsis keeps all of its dots and ends the sentence
  text = text.replace(
    new RegExp(multipleDots, "g"),
    (dots) => "<prd>".repeat(dots.length) + "<stop>",
  );
  // Uncommon prefix
  text = text.replace(/Ph\.D\./g, "Ph<prd>D<prd>");
  // Single-letter initials such as " J. "
  text = text.replace(new RegExp("\\s" + alphabets + "[.] ", "g"), " $1<prd> ");
  // An acronym directly followed by a typical sentence starter ends a sentence
  text = text.replace(
    new RegExp(acronyms + " " + starters, "g"),
    "$1<stop> $2",
  );
  // Remaining three- and two-letter dotted acronyms are not sentence ends
  text = text.replace(
    new RegExp(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "g"),
    "$1<prd>$2<prd>$3<prd>",
  );
  text = text.replace(
    new RegExp(alphabets + "[.]" + alphabets + "[.]", "g"),
    "$1<prd>$2<prd>",
  );
  // A company/name suffix followed by a sentence starter ends a sentence.
  // The suffix keeps its period (as <prd>) instead of losing it.
  text = text.replace(
    new RegExp(" " + suffixes + "[.] " + starters, "g"),
    " $1<prd><stop> $2",
  );
  text = text.replace(new RegExp(" " + suffixes + "[.]", "g"), " $1<prd>");
  text = text.replace(new RegExp(" " + alphabets + "[.]", "g"), " $1<prd>");

  // Quote characters still present here were not part of a matched pair.
  // NOTE(review): the pattern for the curly-quote case was missing from the
  // source; appending a period after each stray \u201D mirrors the
  // straight-quote rule below - confirm against the author's intent.
  text = text.replace(/\u201D/g, "\u201D.");
  text = text.replace(/"(?:\s|$)/g, '".');
  text = text.replace(/!"\s/g, '"!');
  text = text.replace(/\?"\s/g, '"?');

  // Every period, question mark and exclamation mark left is a sentence end
  text = text.replace(/\./g, ".<stop>");
  text = text.replace(/\?/g, "?<stop>");
  text = text.replace(/!/g, "!<stop>");
  // Restore the protected periods
  text = text.replace(/<prd>/g, ".");

  // Recursively mark sentences inside each quote and splice them back in,
  // one straight-quoted chunk per inner sentence.
  let nextQuote = 0;
  text = text.replace(/<quote>/g, (placeholder) => {
    const inner = quotedSegments[nextQuote++];
    // More placeholders than captured quotes means the input itself contained
    // the text "<quote>"; leave it untouched rather than throwing.
    if (inner === undefined) return placeholder;
    return markSentenceBreaks(inner)
      .split("<stop>")
      .filter((sentence) => sentence.trim() !== "")
      .map((sentence) => `"${sentence.trim()}"<stop>`)
      .join("");
  });
  return text;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment