ninapavlich · January 24, 2025 16:35
diff --git a/splitParagraphIntoSentences.ts b/splitParagraphIntoSentences.ts
 // Based off snippet from: https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences

 /*
    
 Example outputs:

 Handles ambiguous abbreviation and sentence ending:
 console.log(splitParagraphIntoSentences('This is a sentence. Mr. Smith went to N.Y.C. He said, "Hello!"'))
 [
  "This is a sentence.",
  "Mr. Smith went to N.Y.C.",
  'He said, "Hello!"',
 ]

 Handles short quote:
 console.log(splitParagraphIntoSentences('And then he said "whoa!" because he was shocked.'))
 [
  'And then he said "whoa!"', 
  "because he was shocked."
 ]

 Handles numbers within a sentence:
 console.log(splitParagraphIntoSentences("This is a long string with some numbers [125.000,55 and 140.000] and an end. This is another sentence."))
 [
  "This is a long string with some numbers [125.000,55 and 140.000] and an end.",
   "This is another sentence."
 ]

 Handles no punctuation:
 console.log(splitParagraphIntoSentences("Just some text with no punctuation"))
 [
  "Just some text with no punctuation"
 ]

 Keeps a title containing an abbreviation as a single sentence:
 console.log(splitParagraphIntoSentences("Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience"))
 [
  "Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience"
 ]

 Splits multiple paragraphs with quotes:
 console.log(splitParagraphIntoSentences(`“I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,” she said. “I was sleeping in the dining room so the kids could have the bedrooms. I never had any privacy at all. I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent.”`))
 [
  '"I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,"',
  "she said.",
  '"I was sleeping in the dining room so the kids could have the bedrooms."',
  '"I never had any privacy at all."',
  '"I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent."',
 ]

 */

 /**
  * Splits a paragraph into its individual sentences.
  *
  * The paragraph is first annotated by `markSentenceBreaks`; the annotated
  * text is then cut at every `<stop>` marker and each piece is trimmed.
  *
  * @param paragraph - Text to split.
  * @returns The trimmed pieces in their original order, without a trailing
  *   empty piece and without any piece that is exactly `".`.
  */
 export const splitParagraphIntoSentences = (paragraph: string): string[] => {
   const pieces = markSentenceBreaks(paragraph)
     .split("<stop>")
     .map((piece) => piece.trim());

   // Whatever follows the last marker is dropped when it trimmed down to nothing.
   if (pieces.length > 0 && !pieces[pieces.length - 1]) {
     pieces.pop();
   }

   // Pieces that consist solely of `".` carry no sentence text; discard them.
   return pieces.filter((piece) => piece !== '".');
 };
    
 /**
  * Annotates `paragraph` with `<stop>` markers at sentence boundaries.
  *
  * Periods that do not end a sentence (titles such as "Mr.", domain suffixes,
  * decimal/dotted numbers, ellipses, single-letter initials, dotted acronyms,
  * company suffixes such as "Inc.") are temporarily rewritten to `<prd>` so
  * that the final "every remaining `.`, `?` or `!` ends a sentence" pass skips
  * them; they are turned back into `.` before returning.
  *
  * Quoted passages (straight or curly double quotes) are lifted out before the
  * rules run, marked up recursively on their own, and re-inserted with every
  * inner sentence wrapped in straight double quotes and followed by `<stop>`.
  *
  * The result is padded with one leading and two trailing spaces; callers are
  * expected to split on `<stop>` and trim the pieces.
  *
  * @param paragraph - Raw text. Literal `<prd>` / `<stop>` sequences in the
  *   input are indistinguishable from the internal markers.
  * @returns The text with `<stop>` inserted after each detected sentence.
  */
 const markSentenceBreaks = (paragraph: string): string => {
   const alphabets = "([A-Za-z])";
   const prefixes = "(Mr|St|Mrs|Ms|Dr)[.]";
   const suffixes = "(Inc|Ltd|Jr|Sr|Co)";
   const starters =
     "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)";
   const acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)";
   const websites = "[.](com|net|org|io|gov|edu|me)";
   const digits = "([0-9])";
   const multipleDots = "\\.{2,}";

   // Matches either a literal "<quote>" already present in the input or a
   // quoted passage (capture group 1 = the text between the quote marks).
   const quoteOrPlaceholder = /<quote>|(?:"|“)((?:\\.|[^"\\])*?)(?:"|”)/g;

   // One entry per "<quote>" placeholder in `text`, in order of appearance.
   // `null` records a literal "<quote>" from the input so that it is put back
   // untouched instead of being paired with somebody else's quotation.
   const quotedSegments: Array<string | null> = [];

   // Lift quotations out; they are handled separately, recursively, below.
   let text = paragraph.replace(
     quoteOrPlaceholder,
     (_whole: string, inner?: string) => {
       quotedSegments.push(inner === undefined ? null : inner);
       return "<quote>";
     },
   );
   text = " " + text + "  ";
   // Replace newlines
   text = text.replace(/\n/g, " ");
   // Flag periods from common prefixes
   text = text.replace(new RegExp(prefixes, "g"), "$1<prd>");
   // Flag periods from websites
   text = text.replace(new RegExp(websites, "g"), "<prd>$1");
   // Flag periods between digits. The right-hand digit is only looked at, not
   // consumed, so chains such as "1.2.3" are protected at every dot.
   text = text.replace(new RegExp(digits + "[.](?=[0-9])", "g"), "$1<prd>");
   // Runs of two or more dots are kept verbatim and end the sentence.
   text = text.replace(
     new RegExp(multipleDots, "g"),
     (dots) => "<prd>".repeat(dots.length) + "<stop>",
   );
   // Uncommon prefix
   text = text.replace(/Ph\.D\./g, "Ph<prd>D<prd>");
   // Single-letter initials surrounded by whitespace
   text = text.replace(new RegExp("\\s" + alphabets + "[.] ", "g"), " $1<prd> ");
   // An acronym directly followed by a typical sentence starter ends a sentence.
   text = text.replace(
     new RegExp(acronyms + " " + starters, "g"),
     "$1<stop> $2",
   );
   // Dotted acronyms of three, then two, letters
   text = text.replace(
     new RegExp(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "g"),
     "$1<prd>$2<prd>$3<prd>",
   );
   text = text.replace(
     new RegExp(alphabets + "[.]" + alphabets + "[.]", "g"),
     "$1<prd>$2<prd>",
   );
   // A company/name suffix followed by a sentence starter ends a sentence;
   // the suffix keeps its period ("Inc." rather than "Inc").
   text = text.replace(
     new RegExp(" " + suffixes + "[.] " + starters, "g"),
     " $1<prd><stop> $2",
   );
   text = text.replace(new RegExp(" " + suffixes + "[.]", "g"), " $1<prd>");
   text = text.replace(new RegExp(" " + alphabets + "[.]", "g"), " $1<prd>");
   // Stray quote marks that were not part of a matched quotation
   text = text.replace(/”/g, "”.");
   text = text.replace(/"(?:\s|$)/g, '".');
   text = text.replace(/!"\s/g, '"!');
   text = text.replace(/\?"\s/g, '"?');
   // Every terminator that is still unprotected ends a sentence.
   text = text.replace(/\./g, ".<stop>");
   text = text.replace(/\?/g, "?<stop>");
   text = text.replace(/!/g, "!<stop>");
   text = text.replace(/<prd>/g, ".");

   // Put the quotations back, marking the sentences inside each one.
   let nextSegment = 0;
   text = text.replace(/<quote>/g, () => {
     const inner = quotedSegments[nextSegment++];
     if (inner === undefined || inner === null) return "<quote>";
     return markSentenceBreaks(inner)
       .split("<stop>")
       .filter((sentence) => sentence.trim() !== "")
       .map((sentence) => `"${sentence.trim()}"<stop>`)
       .join("");
   });

   return text;
 };
	// Based off snippet from: https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences

	/*

	Example outputs:

	Handles ambiguous abbreviation and sentence ending:
	console.log(splitParagraphIntoSentences('This is a sentence. Mr. Smith went to N.Y.C. He said, "Hello!"'))
	[
	"This is a sentence.",
	"Mr. Smith went to N.Y.C.",
	'He said, "Hello!"',
	]

	Handles short quote:
	console.log(splitParagraphIntoSentences('And then he said "whoa!" because he was shocked.'))
	[
	'And then he said "whoa!"',
	"because he was shocked."
	]

	Handles numbers within a sentence:
	console.log(splitParagraphIntoSentences("This is a long string with some numbers [125.000,55 and 140.000] and an end. This is another sentence."))
	[
	"This is a long string with some numbers [125.000,55 and 140.000] and an end.",
	"This is another sentence."
	]

	Handles no punctuation:
	console.log(splitParagraphIntoSentences("Just some text with no punctuation"))
	[
	"Just some text with no punctuation"
	]

	Keeps a title containing an abbreviation as a single sentence:
	console.log(splitParagraphIntoSentences("Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience"))
	[
	"Lee Zeldin, Trump's E.P.A. Nominee, Is Short on Environmental Experience"
	]

	Splits multiple paragraphs with quotes:
	console.log(splitParagraphIntoSentences(`“I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,” she said. “I was sleeping in the dining room so the kids could have the bedrooms. I never had any privacy at all. I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent.”`))
	[
	'"I got my twins back when they were 8 months old and got my son back after I lived in my apartment for a year,"',
	"she said.",
	'"I was sleeping in the dining room so the kids could have the bedrooms."',
	'"I never had any privacy at all."',
	'"I had this dream of buying a house someday, and the head of ABLE introduced me to her friend Jessica, a real estate agent."',
	]

	*/

	/**
	 * Breaks a paragraph into a list of sentences.
	 *
	 * `markSentenceBreaks` annotates the paragraph first; the annotated text is
	 * then cut at each `<stop>` marker and every fragment is trimmed.
	 *
	 * @param paragraph - Text to split.
	 * @returns The trimmed fragments in order. A trailing empty fragment is
	 *   removed, as is any fragment that is exactly `".`.
	 */
	export const splitParagraphIntoSentences = (paragraph: string): string[] => {
		const marked = markSentenceBreaks(paragraph);
		const fragments: string[] = [];
		for (const raw of marked.split("<stop>")) {
			fragments.push(raw.trim());
		}

		// Only the final fragment is checked for emptiness.
		const lastIndex = fragments.length - 1;
		if (fragments.length && !fragments[lastIndex]) {
			fragments.pop();
		}

		// Fragments that are nothing but `".` hold no sentence text.
		return fragments.filter((fragment) => fragment !== '".');
	};

	/**
	 * Annotates `paragraph` with `<stop>` markers at sentence boundaries.
	 *
	 * Periods that do not end a sentence (titles such as "Mr.", domain suffixes,
	 * decimal/dotted numbers, ellipses, single-letter initials, dotted acronyms,
	 * company suffixes such as "Inc.") are temporarily rewritten to `<prd>` so
	 * that the final "every remaining `.`, `?` or `!` ends a sentence" pass skips
	 * them; they are turned back into `.` before returning.
	 *
	 * Quoted passages (straight or curly double quotes) are lifted out before the
	 * rules run, marked up recursively on their own, and re-inserted with every
	 * inner sentence wrapped in straight double quotes and followed by `<stop>`.
	 *
	 * The result is padded with one leading and two trailing spaces; callers are
	 * expected to split on `<stop>` and trim the pieces.
	 *
	 * @param paragraph - Raw text. Literal `<prd>` / `<stop>` sequences in the
	 *   input are indistinguishable from the internal markers.
	 * @returns The text with `<stop>` inserted after each detected sentence.
	 */
	const markSentenceBreaks = (paragraph: string): string => {
		const alphabets = "([A-Za-z])";
		const prefixes = "(Mr|St|Mrs|Ms|Dr)[.]";
		const suffixes = "(Inc|Ltd|Jr|Sr|Co)";
		const starters =
			"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)";
		const acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)";
		const websites = "[.](com|net|org|io|gov|edu|me)";
		const digits = "([0-9])";
		const multipleDots = "\\.{2,}";

		// Matches either a literal "<quote>" already present in the input or a
		// quoted passage (capture group 1 = the text between the quote marks).
		// The alternation bars must be unescaped: `\|` would match a literal pipe.
		const quoteOrPlaceholder = /<quote>|(?:"|“)((?:\\.|[^"\\])*?)(?:"|”)/g;

		// One entry per "<quote>" placeholder in `text`, in order of appearance.
		// `null` records a literal "<quote>" from the input so that it is put back
		// untouched instead of being paired with somebody else's quotation.
		const quotedSegments: Array<string | null> = [];

		// Lift quotations out; they are handled separately, recursively, below.
		let text = paragraph.replace(
			quoteOrPlaceholder,
			(_whole: string, inner?: string) => {
				quotedSegments.push(inner === undefined ? null : inner);
				return "<quote>";
			},
		);
		text = " " + text + "  ";
		// Replace newlines
		text = text.replace(/\n/g, " ");
		// Flag periods from common prefixes
		text = text.replace(new RegExp(prefixes, "g"), "$1<prd>");
		// Flag periods from websites
		text = text.replace(new RegExp(websites, "g"), "<prd>$1");
		// Flag periods between digits. The right-hand digit is only looked at, not
		// consumed, so chains such as "1.2.3" are protected at every dot.
		text = text.replace(new RegExp(digits + "[.](?=[0-9])", "g"), "$1<prd>");
		// Runs of two or more dots are kept verbatim and end the sentence.
		text = text.replace(
			new RegExp(multipleDots, "g"),
			(dots) => "<prd>".repeat(dots.length) + "<stop>",
		);
		// Uncommon prefix
		text = text.replace(/Ph\.D\./g, "Ph<prd>D<prd>");
		// Single-letter initials surrounded by whitespace
		text = text.replace(new RegExp("\\s" + alphabets + "[.] ", "g"), " $1<prd> ");
		// An acronym directly followed by a typical sentence starter ends a sentence.
		text = text.replace(
			new RegExp(acronyms + " " + starters, "g"),
			"$1<stop> $2",
		);
		// Dotted acronyms of three, then two, letters
		text = text.replace(
			new RegExp(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "g"),
			"$1<prd>$2<prd>$3<prd>",
		);
		text = text.replace(
			new RegExp(alphabets + "[.]" + alphabets + "[.]", "g"),
			"$1<prd>$2<prd>",
		);
		// A company/name suffix followed by a sentence starter ends a sentence;
		// the suffix keeps its period ("Inc." rather than "Inc").
		text = text.replace(
			new RegExp(" " + suffixes + "[.] " + starters, "g"),
			" $1<prd><stop> $2",
		);
		text = text.replace(new RegExp(" " + suffixes + "[.]", "g"), " $1<prd>");
		text = text.replace(new RegExp(" " + alphabets + "[.]", "g"), " $1<prd>");
		// Stray quote marks that were not part of a matched quotation
		text = text.replace(/”/g, "”.");
		text = text.replace(/"(?:\s|$)/g, '".');
		text = text.replace(/!"\s/g, '"!');
		text = text.replace(/\?"\s/g, '"?');
		// Every terminator that is still unprotected ends a sentence.
		text = text.replace(/\./g, ".<stop>");
		text = text.replace(/\?/g, "?<stop>");
		text = text.replace(/!/g, "!<stop>");
		text = text.replace(/<prd>/g, ".");

		// Put the quotations back, marking the sentences inside each one.
		let nextSegment = 0;
		text = text.replace(/<quote>/g, () => {
			const inner = quotedSegments[nextSegment++];
			if (inner === undefined || inner === null) return "<quote>";
			return markSentenceBreaks(inner)
				.split("<stop>")
				.filter((sentence) => sentence.trim() !== "")
				.map((sentence) => `"${sentence.trim()}"<stop>`)
				.join("");
		});

		return text;
	};