@greg-randall
Created July 13, 2025 14:21
This Python script converts a large text file into a series of audio files. You can run it from the command line by passing in your text file and choosing a voice with the --voice flag (e.g., --voice bm_lewis or the default --voice bf_emma).
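As a minimal usage sketch, assuming the gist is saved locally as text_to_audio.py (that filename is illustrative, not part of the gist) and the imported packages are installed under their usual PyPI names (e.g. pip install kokoro soundfile nltk tqdm), a run could look like:

python text_to_audio.py my_book.txt --voice bm_lewis

The optional --chunk-size flag (default 600) caps the character length of each text chunk, and the numbered .wav files are written to a timestamped folder named after the input file in the current working directory.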
"""
A script to convert a large text file into multiple speech audio files
by splitting the text into chunks and processing them sequentially.
"""
import argparse
import datetime
import re
import warnings
from pathlib import Path
from typing import List

import nltk
import soundfile as sf
from kokoro import KPipeline
from tqdm import tqdm

# --- Suppress specific library warnings ---
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn")
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.nn.utils.weight_norm")

# --- Constants ---
DEFAULT_VOICE = 'bf_emma'
DEFAULT_CHUNK_SIZE = 600
AUDIO_SAMPLE_RATE = 24000

# Download the sentence tokenizer model from NLTK.
nltk.download('punkt', quiet=True)


def generate_audio_files(pipeline: KPipeline, filename_prefix: Path, text: str, voice: str, speed: float = 1.0) -> None:
    """
    Generates one or more TTS audio files from the given text using a pre-initialized Kokoro pipeline.
    """
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'')
    # The pipeline yields (graphemes, phonemes, audio) tuples; only the audio is written out.
    for i, (gs, ps, audio) in enumerate(generator):
        output_filename = filename_prefix.with_name(f"{filename_prefix.name}_{i}.wav")
        sf.write(output_filename, audio, AUDIO_SAMPLE_RATE)


def split_text(text: str, max_length: int) -> List[str]:
    """
    Splits text into chunks smaller than `max_length`, respecting sentence boundaries.
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # A single sentence longer than max_length is split further on word boundaries.
        if len(sentence) > max_length:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            words = sentence.split()
            sub_chunk = ""
            for word in words:
                if len(sub_chunk) + len(word) + 1 <= max_length:
                    sub_chunk += f" {word}" if sub_chunk else word
                else:
                    chunks.append(sub_chunk)
                    sub_chunk = word
            if sub_chunk:
                chunks.append(sub_chunk)
            continue
        # Otherwise, keep appending sentences to the current chunk until it is full.
        if len(current_chunk) + len(sentence) + 1 <= max_length:
            current_chunk += f" {sentence}" if current_chunk else sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def create_output_folder(input_file: Path) -> Path:
    """Creates a uniquely named folder for the output audio files."""
    base_filename = input_file.stem
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    dir_name = Path(f"{base_filename}_audio_{timestamp}")
    dir_name.mkdir(exist_ok=True)
    return dir_name


def verify_chunks(original_text: str, chunks: List[str]) -> bool:
    """
    Performs a simple verification to ensure no content was lost during chunking.
    """
    # Strip all non-alphanumeric characters and compare the remaining content case-insensitively.
    pattern = re.compile(r'[\W_]+')
    original_processed = pattern.sub('', original_text).lower()
    chunks_processed = pattern.sub('', "".join(chunks)).lower()
    return original_processed == chunks_processed


def main():
    """Main function to orchestrate the text-to-speech processing."""
    parser = argparse.ArgumentParser(
        description="Reads a text file, splits it into chunks, and converts each chunk to speech.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("input_file", type=Path, help="The text file to be processed.")
    parser.add_argument("--voice", type=str, default=DEFAULT_VOICE, help="The voice to use for text-to-speech.")
    parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help="The maximum character length of each text chunk.")
    args = parser.parse_args()

    if not args.input_file.is_file():
        print(f"Error: Input file not found at {args.input_file}")
        return

    # --- Step 1: Read and Split Text ---
    print(f"Reading '{args.input_file.name}' and splitting into chunks of ~{args.chunk_size} characters...")
    try:
        original_text = args.input_file.read_text(encoding='utf-8')
        chunks = split_text(original_text, args.chunk_size)
        print(f"Successfully split text into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error reading or splitting file: {e}")
        return

    # --- Step 2: Verify Integrity ---
    print("\nVerifying that no content was lost during chunking...")
    if verify_chunks(original_text, chunks):
        print("Verification successful: Chunked text matches original.")
    else:
        print("Error: Verification failed. The chunked text does not match the original. Aborting.")
        return

    # --- Step 3: Create Output Directory ---
    output_folder = create_output_folder(args.input_file)
    print(f"\nCreated output folder: {output_folder}")

    # --- Step 4: Process Chunks Sequentially ---
    print(f"\nProcessing {len(chunks)} chunks...")
    # Initialize the TTS pipeline once, before the loop starts.
    pipeline = KPipeline(lang_code='a')
    # Process each chunk one by one with a progress bar.
    for i, chunk in enumerate(tqdm(chunks, desc="Generating Audio"), start=1):
        try:
            speech_file_path = output_folder / f'{i:06d}'
            generate_audio_files(pipeline, speech_file_path, chunk, voice=args.voice)
        except Exception as e:
            tqdm.write(f"Error processing chunk {i}: {e}")

    print("\nProcessing complete.")


if __name__ == "__main__":
    main()