Skip to content

Instantly share code, notes, and snippets.

@diatche
Created January 10, 2025 20:37
Show Gist options
  • Save diatche/ee49c7cad7b093bbc569793fe0eb0c56 to your computer and use it in GitHub Desktop.
Save diatche/ee49c7cad7b093bbc569793fe0eb0c56 to your computer and use it in GitHub Desktop.
Split Transcript into Paragraphs
"""
Script: Split Transcript into Paragraphs
Description:
This script reads a transcript from a plain text file and splits it into paragraphs.
A paragraph is closed once it holds at least 80 words and the next sentence would
push it past 120 words, so every break falls at the end of a sentence. The processed
output is saved to the same directory as the input file, with a suffix added to the
base name.
Usage:
python script.py <input_file>
Arguments:
<input_file> Path to the transcript file (must be a plain text file).
Output:
A new file is created in the same directory as the input file, with '_formatted'
added to the base name and the same extension.
"""
import re
import os
import sys
def _group_sentences(sentences, min_words, max_words):
    """Greedily pack sentences into paragraphs bounded by word count.

    Args:
        sentences: Iterable of sentence strings, in document order.
        min_words: A paragraph is never closed before it holds this many words.
        max_words: A paragraph is closed before a sentence that would push it
            past this many words (provided ``min_words`` has been reached).

    Returns:
        A list of paragraph strings, each made of whole sentences joined by a
        single space.
    """
    paragraphs = []
    current_paragraph = []
    word_count = 0
    for sentence in sentences:
        words_in_sentence = len(sentence.split())
        # Break only between sentences, and only once the paragraph is long
        # enough; a single very long sentence may therefore exceed max_words.
        if word_count + words_in_sentence > max_words and word_count >= min_words:
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = []
            word_count = 0
        current_paragraph.append(sentence)
        word_count += words_in_sentence
    # Flush whatever is left after the last sentence.
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))
    return paragraphs


def split_into_paragraphs(input_file, min_words=80, max_words=120, suffix="_formatted"):
    """Split the transcript in ``input_file`` into paragraphs and save the result.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace, and the sentences are grouped into paragraphs of roughly
    ``min_words`` to ``max_words`` words. Each paragraph is written followed by
    a blank line to ``<base><suffix><ext>`` next to the input file.

    Args:
        input_file: Path to the transcript (UTF-8 plain text).
        min_words: Minimum words a paragraph must hold before it may be closed.
        max_words: Word count a paragraph should not exceed when it can be
            closed at a sentence boundary.
        suffix: Text inserted between the input's base name and its extension
            to form the output file name.

    Raises:
        SystemExit: With status 1 if ``input_file`` is not an existing file.
    """
    # Ensure the input file exists
    if not os.path.isfile(input_file):
        print(f"Error: File '{input_file}' not found.")
        sys.exit(1)

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (which is not UTF-8 on every system).
    with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()

    # Strip surrounding whitespace first: a trailing newline would otherwise
    # yield an empty final "sentence" and a trailing space in the last
    # paragraph. The filter drops the single '' produced by empty input so no
    # blank paragraph is written.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]

    paragraphs = _group_sentences(sentences, min_words, max_words)

    # Determine the output file name
    base_name, ext = os.path.splitext(input_file)
    output_file = f"{base_name}{suffix}{ext}"

    # Write paragraphs to the output file, each followed by a blank line
    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(f"{paragraph}\n\n" for paragraph in paragraphs)

    print(f"Processed text saved to {output_file}")
# Command-line entry point: expects exactly one argument, the transcript path.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        # Wrong number of arguments: show usage and exit with a failure status.
        print("Usage: python script.py <input_file>")
        sys.exit(1)
    split_into_paragraphs(cli_args[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment