@greg-randall
Created July 13, 2025 14:21
This Python script converts a large text file into a series of audio files. You can run it from the command line by passing in your text file and choosing a voice with the --voice flag (e.g., --voice bm_lewis or the default --voice bf_emma).
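As a minimal usage sketch, assuming the gist is saved locally as text_to_audio.py (that filename is illustrative, not part of the gist) and the imported packages are installed under their usual PyPI names (e.g. pip install kokoro soundfile nltk tqdm), a run could look like:

python text_to_audio.py my_book.txt --voice bm_lewis

The optional --chunk-size flag (default 600) caps the character length of each text chunk, and the numbered .wav files are written to a timestamped folder named after the input file in the current working directory.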
"""
A script to convert a large text file into multiple speech audio files
by splitting the text into chunks and processing them sequentially.
"""
import argparse
import datetime
import re
import warnings
from pathlib import Path
from typing import List

import nltk
import soundfile as sf
from kokoro import KPipeline
from tqdm import tqdm

# --- Suppress specific library warnings ---
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn")
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.nn.utils.weight_norm")

# --- Constants ---
DEFAULT_VOICE = 'bf_emma'
DEFAULT_CHUNK_SIZE = 600
AUDIO_SAMPLE_RATE = 24000

# Download the sentence tokenizer model from NLTK.
nltk.download('punkt', quiet=True)


def generate_audio_files(pipeline: KPipeline, filename_prefix: Path, text: str, voice: str, speed: float = 1.0) -> None:
    """
    Generates one or more TTS audio files from the given text using a pre-initialized Kokoro pipeline.
    """
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'')
    # The pipeline yields (graphemes, phonemes, audio) tuples; only the audio is written out.
    for i, (gs, ps, audio) in enumerate(generator):
        output_filename = filename_prefix.with_name(f"{filename_prefix.name}_{i}.wav")
        sf.write(output_filename, audio, AUDIO_SAMPLE_RATE)


def split_text(text: str, max_length: int) -> List[str]:
    """
    Splits text into chunks smaller than `max_length`, respecting sentence boundaries.
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # A single sentence longer than max_length is split further on word boundaries.
        if len(sentence) > max_length:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            words = sentence.split()
            sub_chunk = ""
            for word in words:
                if len(sub_chunk) + len(word) + 1 <= max_length:
                    sub_chunk += f" {word}" if sub_chunk else word
                else:
                    chunks.append(sub_chunk)
                    sub_chunk = word
            if sub_chunk:
                chunks.append(sub_chunk)
            continue
        # Otherwise, keep appending sentences to the current chunk until it is full.
        if len(current_chunk) + len(sentence) + 1 <= max_length:
            current_chunk += f" {sentence}" if current_chunk else sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def create_output_folder(input_file: Path) -> Path:
    """Creates a uniquely named folder for the output audio files."""
    base_filename = input_file.stem
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    dir_name = Path(f"{base_filename}_audio_{timestamp}")
    dir_name.mkdir(exist_ok=True)
    return dir_name


def verify_chunks(original_text: str, chunks: List[str]) -> bool:
    """
    Performs a simple verification to ensure no content was lost during chunking.
    """
    # Strip all non-alphanumeric characters and compare the remaining content case-insensitively.
    pattern = re.compile(r'[\W_]+')
    original_processed = pattern.sub('', original_text).lower()
    chunks_processed = pattern.sub('', "".join(chunks)).lower()
    return original_processed == chunks_processed


def main():
    """Main function to orchestrate the text-to-speech processing."""
    parser = argparse.ArgumentParser(
        description="Reads a text file, splits it into chunks, and converts each chunk to speech.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("input_file", type=Path, help="The text file to be processed.")
    parser.add_argument("--voice", type=str, default=DEFAULT_VOICE, help="The voice to use for text-to-speech.")
    parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help="The maximum character length of each text chunk.")
    args = parser.parse_args()

    if not args.input_file.is_file():
        print(f"Error: Input file not found at {args.input_file}")
        return

    # --- Step 1: Read and Split Text ---
    print(f"Reading '{args.input_file.name}' and splitting into chunks of ~{args.chunk_size} characters...")
    try:
        original_text = args.input_file.read_text(encoding='utf-8')
        chunks = split_text(original_text, args.chunk_size)
        print(f"Successfully split text into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error reading or splitting file: {e}")
        return

    # --- Step 2: Verify Integrity ---
    print("\nVerifying that no content was lost during chunking...")
    if verify_chunks(original_text, chunks):
        print("Verification successful: Chunked text matches original.")
    else:
        print("Error: Verification failed. The chunked text does not match the original. Aborting.")
        return

    # --- Step 3: Create Output Directory ---
    output_folder = create_output_folder(args.input_file)
    print(f"\nCreated output folder: {output_folder}")

    # --- Step 4: Process Chunks Sequentially ---
    print(f"\nProcessing {len(chunks)} chunks...")
    # Initialize the TTS pipeline once, before the loop starts.
    pipeline = KPipeline(lang_code='a')
    # Process each chunk one by one with a progress bar.
    for i, chunk in enumerate(tqdm(chunks, desc="Generating Audio"), start=1):
        try:
            speech_file_path = output_folder / f'{i:06d}'
            generate_audio_files(pipeline, speech_file_path, chunk, voice=args.voice)
        except Exception as e:
            tqdm.write(f"Error processing chunk {i}: {e}")

    print("\nProcessing complete.")


if __name__ == "__main__":
    main()