Skip to content

Instantly share code, notes, and snippets.

@thelabcat
Last active June 17, 2026 16:23
Show Gist options
  • Select an option

  • Save thelabcat/1bfbc93a2a76dc03882a09b7d6e8ccf9 to your computer and use it in GitHub Desktop.

Select an option

Save thelabcat/1bfbc93a2a76dc03882a09b7d6e8ccf9 to your computer and use it in GitHub Desktop.
Chatterkey - CLI script for calling Chatterbox TTS
#!/usr/bin/env python3
"""Chatterkey
CLI script for calling Chatterbox TTS
Copyright 2026 Wilbur Jaywright dba Marswide BGL
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
S.D.G."""
import argparse
import io
import sys
import time
from warnings import warn
from chatterbox.tts import ChatterboxTTS
from chatterbox.tts_turbo import ChatterboxTurboTTS
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
from pygame import mixer
import torch
from torch import cuda
from torch.backends import mps
from torchcodec.encoders import AudioEncoder
# Compute device defaults to Mac Metal Performance Shaders if we have that, then NVIDIA CUDA secondarily, and last CPU
DEVICE_DEFAULT = "mps" if mps.is_available() else "cuda" if cuda.is_available() else "cpu"

# Get all the command line arguments stuff done first
parser = argparse.ArgumentParser(
    prog='Chatterkey',
    description='CLI script for calling Chatterbox TTS',
    epilog='S.D.G.',
)
parser.add_argument(
    "-d", "--device",
    default=DEVICE_DEFAULT,
    choices=("cuda", "mps", "cpu"),
    help=f"What device to use for compute, defaults to {DEVICE_DEFAULT} on your system",
)
parser.add_argument(
    "-t", "--turbo",
    action=argparse.BooleanOptionalAction,
    help="Use the turbo model (English only)",
)
parser.add_argument(
    "-m", "--multilingual",
    help="Use the multilingual model, with the provided language ID",
)
parser.add_argument(
    "-p", "--audio-prompt",
    help="Base synthesis on this voice clip (specified by path)",
)
parser.add_argument(
    "-o", "--output-file",
    help="Write the output to a file",
)
parser.add_argument(
    "text",
    nargs="*",
    type=str,
    help="What to say, defaults to stdin",
)
args = parser.parse_args()

# Turbo and multilingual are two different models, so only one may be requested.
# This is checked by hand rather than with an argparse mutually exclusive group,
# because a group would also reject an explicit --no-turbo alongside --multilingual.
# parser.error() replaces the former assert: asserts are stripped under
# "python -O", and this prints the usage text and exits with status 2
# instead of showing a traceback.
if args.turbo and args.multilingual:
    parser.error("Cannot do both turbo and multilingual: They are two different models!")
# --- Patch for Torch.load() by Noobies ---
# Every torch.load() call made from here on gets the user-selected device as
# its map_location unless the caller chose one itself (presumably so that
# checkpoints saved for another device still load here -- TODO confirm).
map_location = torch.device(args.device)
torch_load_original = torch.load


def patched_torch_load(*load_args, **load_kwargs):
    """Call the original torch.load, defaulting map_location to the chosen device.

    A map_location supplied by the caller, whether by keyword or positionally,
    is passed through untouched.
    """
    # The variadic parameters are deliberately not called "args" so they do
    # not shadow the module-level argparse namespace of that name.
    # map_location is torch.load's second positional parameter; adding the
    # keyword when it was already given positionally would raise a TypeError.
    if len(load_args) < 2:
        load_kwargs.setdefault('map_location', map_location)
    return torch_load_original(*load_args, **load_kwargs)


torch.load = patched_torch_load
# --------
# Pick the model matching the user's flags and load it onto the chosen device
if args.turbo:
    model = ChatterboxTurboTTS.from_pretrained(device=args.device)
elif args.multilingual:
    model = ChatterboxMultilingualTTS.from_pretrained(device=args.device)
else:
    # Neither flag given: default to the English-only model
    model = ChatterboxTTS.from_pretrained(device=args.device)
print("=== Models loaded! ===")
def generate_wav_encoder(text: str):
    """Run the loaded model on the given text and wrap the result in an AudioEncoder"""
    generate_options = {"audio_prompt_path": args.audio_prompt}
    if args.multilingual:
        # Only the multilingual call is given the extra language ID keyword
        generate_options["language_id"] = args.multilingual
    waveform = model.generate(text, **generate_options)
    return AudioEncoder(waveform, sample_rate=model.sr)
def synth_and_play(text: str):
    """Speak the given text right away, returning only once playback has ended"""
    # Encode the synthesized audio as WAV into an in-memory buffer
    wav_buffer = io.BytesIO()
    encoder = generate_wav_encoder(text)
    encoder.to_file_like(wav_buffer, format="wav")
    wav_buffer.seek(0)  # Rewind so the mixer reads from the start

    # Technically this code is AI generated, but I wrote it down and I understand how it works
    sound = mixer.Sound(wav_buffer)
    playback = sound.play()
    # Poll the channel until the sound is done
    while True:
        if not playback.get_busy():
            break
        time.sleep(0.1)
# No output file specified, play using PyGame Mixer
if not args.output_file:
    mixer.init()
    # try/finally so the mixer is shut down even when synthesis fails or the
    # user interrupts stdin mode (e.g. with Ctrl+C)
    try:
        # Text was specified, just play it and we're done
        if args.text:
            synth_and_play(" ".join(args.text))
        # We are in stdin mode: speak each line as it arrives
        else:
            for line in sys.stdin:
                # Whitespace-only lines contain nothing to say, so skip them
                if line.strip():
                    synth_and_play(line)
    finally:
        mixer.quit()
# Output file was specified, write to file
else:
    # Regardless of stdin mode, acquire ALL text and synthesize it at once
    text = " ".join(args.text) if args.text else sys.stdin.read()
    generate_wav_encoder(text).to_file(args.output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment