diatche · January 10, 2025 20:37
diff --git a/format_transcript.py b/format_transcript.py
 """
 Script: Split Transcript into Paragraphs

 Description:
 This script processes a transcript from a text file and splits it into paragraphs.
 It uses a word count threshold to determine where paragraphs should end, ensuring
 natural breaks at the end of sentences. The processed output is saved to the same
 directory as the input file, with a suffix added to the base name.

 Usage:
    python script.py <input_file>

 Arguments:
    <input_file>  Path to the transcript file (must be a plain text file).

 Output:
    A new file is created in the same directory as the input file, with '_formatted'
    added to the base name and the same extension.
 """

 import re
 import os
 import sys


 def split_into_paragraphs(input_file, min_words=80, max_words=120, suffix="_formatted"):
     """Split the transcript in *input_file* into paragraphs and save the result.

     The text is cut into sentences at whitespace that follows '.', '!' or '?'.
     Sentences are accumulated into a paragraph; a paragraph is closed before a
     sentence that would push it past ``max_words``, but only once it already
     holds at least ``min_words`` words. Paragraphs are written separated by a
     blank line.

     Args:
         input_file: Path to the transcript text file (read as UTF-8).
         min_words: Minimum number of words a paragraph must contain before it
             may be closed.
         max_words: Word count a paragraph should not exceed when a break is
             possible.
         suffix: Text inserted between the input's base name and its extension
             to form the output file name.

     Returns:
         The path of the output file that was written.

     Exits the process with status 1 (after printing an error) if
     ``input_file`` does not exist.
     """
     # Ensure the input file exists
     if not os.path.isfile(input_file):
         print(f"Error: File '{input_file}' not found.")
         sys.exit(1)

     # Read with an explicit encoding so the result does not depend on the
     # platform's locale default.
     with open(input_file, "r", encoding="utf-8") as file:
         text = file.read()

     # Strip surrounding whitespace first: otherwise leading blanks stick to the
     # first sentence and a trailing newline yields an empty final "sentence".
     # The filter also covers a completely empty file, which splits to [""].
     sentences = [
         sentence
         for sentence in re.split(r"(?<=[.!?])\s+", text.strip())
         if sentence
     ]

     # Group sentences into paragraphs
     paragraphs = []
     current_paragraph = []
     word_count = 0

     for sentence in sentences:
         words_in_sentence = len(sentence.split())
         # Close the running paragraph only when it is long enough and this
         # sentence would overflow it; breaks always fall between sentences.
         if word_count + words_in_sentence > max_words and word_count >= min_words:
             paragraphs.append(" ".join(current_paragraph))
             current_paragraph = []
             word_count = 0

         current_paragraph.append(sentence)
         word_count += words_in_sentence

     # Flush whatever is left as the final paragraph
     if current_paragraph:
         paragraphs.append(" ".join(current_paragraph))

     # Output lives next to the input: <base><suffix><ext>
     base_name, ext = os.path.splitext(input_file)
     output_file = f"{base_name}{suffix}{ext}"

     with open(output_file, "w", encoding="utf-8") as file:
         file.writelines(paragraph + "\n\n" for paragraph in paragraphs)

     print(f"Processed text saved to {output_file}")
     return output_file


 # Script entry point: expects exactly one command-line argument, the path of
 # the transcript to format; anything else prints the usage line and exits 1.
 if __name__ == "__main__":
     if len(sys.argv) == 2:
         input_file = sys.argv[1]
         split_into_paragraphs(input_file)
     else:
         print("Usage: python script.py <input_file>")
         sys.exit(1)
	"""
	Script: Split Transcript into Paragraphs

	Description:
	This script processes a transcript from a text file and splits it into paragraphs.
	It uses a word count threshold to determine where paragraphs should end, ensuring
	natural breaks at the end of sentences. The processed output is saved to the same
	directory as the input file, with a suffix added to the base name.

	Usage:
	python script.py <input_file>

	Arguments:
	<input_file> Path to the transcript file (must be a plain text file).

	Output:
	A new file is created in the same directory as the input file, with '_formatted'
	added to the base name and the same extension.
	"""

	import re
	import os
	import sys


	def split_into_paragraphs(input_file, min_words=80, max_words=120, suffix="_formatted"):
	    """Split the transcript in *input_file* into paragraphs and save the result.

	    The text is cut into sentences at whitespace that follows '.', '!' or '?'.
	    Sentences are accumulated into a paragraph; a paragraph is closed before a
	    sentence that would push it past ``max_words``, but only once it already
	    holds at least ``min_words`` words. Paragraphs are written separated by a
	    blank line.

	    Args:
	        input_file: Path to the transcript text file (read as UTF-8).
	        min_words: Minimum number of words a paragraph must contain before it
	            may be closed.
	        max_words: Word count a paragraph should not exceed when a break is
	            possible.
	        suffix: Text inserted between the input's base name and its extension
	            to form the output file name.

	    Returns:
	        The path of the output file that was written.

	    Exits the process with status 1 (after printing an error) if
	    ``input_file`` does not exist.
	    """
	    # Ensure the input file exists
	    if not os.path.isfile(input_file):
	        print(f"Error: File '{input_file}' not found.")
	        sys.exit(1)

	    # Read with an explicit encoding so the result does not depend on the
	    # platform's locale default.
	    with open(input_file, "r", encoding="utf-8") as file:
	        text = file.read()

	    # Strip surrounding whitespace first: otherwise leading blanks stick to the
	    # first sentence and a trailing newline yields an empty final "sentence".
	    # The filter also covers a completely empty file, which splits to [""].
	    sentences = [
	        sentence
	        for sentence in re.split(r"(?<=[.!?])\s+", text.strip())
	        if sentence
	    ]

	    # Group sentences into paragraphs
	    paragraphs = []
	    current_paragraph = []
	    word_count = 0

	    for sentence in sentences:
	        words_in_sentence = len(sentence.split())
	        # Close the running paragraph only when it is long enough and this
	        # sentence would overflow it; breaks always fall between sentences.
	        if word_count + words_in_sentence > max_words and word_count >= min_words:
	            paragraphs.append(" ".join(current_paragraph))
	            current_paragraph = []
	            word_count = 0

	        current_paragraph.append(sentence)
	        word_count += words_in_sentence

	    # Flush whatever is left as the final paragraph
	    if current_paragraph:
	        paragraphs.append(" ".join(current_paragraph))

	    # Output lives next to the input: <base><suffix><ext>
	    base_name, ext = os.path.splitext(input_file)
	    output_file = f"{base_name}{suffix}{ext}"

	    with open(output_file, "w", encoding="utf-8") as file:
	        file.writelines(paragraph + "\n\n" for paragraph in paragraphs)

	    print(f"Processed text saved to {output_file}")
	    return output_file


	# Script entry point: expects exactly one command-line argument, the path of
	# the transcript to format; anything else prints the usage line and exits 1.
	if __name__ == "__main__":
	    if len(sys.argv) != 2:
	        print("Usage: python script.py <input_file>")
	        sys.exit(1)

	    input_file = sys.argv[1]
	    split_into_paragraphs(input_file)