Created
January 10, 2025 20:37
-
-
Save diatche/ee49c7cad7b093bbc569793fe0eb0c56 to your computer and use it in GitHub Desktop.
Split Transcript into Paragraphs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script: Split Transcript into Paragraphs | |
Description: | |
This script processes a transcript from a text file and splits it into paragraphs. | |
It uses a word count threshold to determine where paragraphs should end, ensuring | |
natural breaks at the end of sentences. The processed output is saved to the same | |
directory as the input file, with a suffix added to the base name. | |
Usage: | |
python script.py <input_file> | |
Arguments: | |
<input_file> Path to the transcript file (must be a plain text file). | |
Output: | |
A new file is created in the same directory as the input file, with '_formatted' | |
added to the base name and the same extension. | |
""" | |
import re | |
import os | |
import sys | |
def split_into_paragraphs(input_file, min_words=80, max_words=120, suffix="_formatted"):
    """Split the transcript in *input_file* into paragraphs and save the result.

    The text is cut into sentences after '.', '!' or '?' followed by
    whitespace. Sentences are accumulated into a paragraph; the paragraph is
    closed when it already holds at least ``min_words`` words and adding the
    next sentence would push it past ``max_words``. A single sentence longer
    than ``max_words`` is kept whole, so paragraphs may exceed the maximum.

    Args:
        input_file: Path to the transcript (read as UTF-8 text).
        min_words: Minimum number of words a paragraph must contain before it
            may be closed.
        max_words: Word count that triggers closing the current paragraph
            when the next sentence would exceed it.
        suffix: Text inserted between the input's base name and extension to
            form the output file name.

    Returns:
        None. The paragraphs are written to ``<base><suffix><ext>`` next to
        the input file, each followed by a blank line, and the output path is
        printed.

    Raises:
        SystemExit: With status 1 (after printing an error) if *input_file*
            is not an existing file.
    """
    # Ensure the input file exists
    if not os.path.isfile(input_file):
        print(f"Error: File '{input_file}' not found.")
        sys.exit(1)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()

    # Split after sentence-ending punctuation. Stripping first and dropping
    # empty fragments keeps a trailing newline (or an empty file) from
    # producing a blank "sentence" that would leave stray whitespace behind.
    sentences = [
        sentence
        for sentence in re.split(r"(?<=[.!?])\s+", text.strip())
        if sentence
    ]

    # Create paragraphs
    paragraphs = []
    current_paragraph = []
    word_count = 0
    for sentence in sentences:
        words_in_sentence = len(sentence.split())
        would_overflow = word_count + words_in_sentence > max_words
        # Only close a paragraph that actually has content; otherwise a long
        # first sentence combined with min_words=0 would emit an empty one.
        if current_paragraph and would_overflow and word_count >= min_words:
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = []
            word_count = 0
        current_paragraph.append(sentence)
        word_count += words_in_sentence

    # Append the last paragraph if it exists
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    # Determine the output file name
    base_name, ext = os.path.splitext(input_file)
    output_file = f"{base_name}{suffix}{ext}"

    # Write paragraphs to the output file, separated by blank lines
    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(f"{paragraph}\n\n" for paragraph in paragraphs)

    print(f"Processed text saved to {output_file}")
# Script entry point: expects exactly one command-line argument, the path
# to the transcript file.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python script.py <input_file>")
        sys.exit(1)
    split_into_paragraphs(cli_args[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment