Last active
June 17, 2026 16:23
-
-
Save thelabcat/1bfbc93a2a76dc03882a09b7d6e8ccf9 to your computer and use it in GitHub Desktop.
Chatterkey - CLI script for calling Chatterbox TTS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Chatterkey | |
| CLI script for calling Chatterbox TTS | |
| Copyright 2026 Wilbur Jaywright dba Marswide BGL | |
| Licensed under the Apache License, Version 2.0 (the "License"); | |
| you may not use this file except in compliance with the License. | |
| You may obtain a copy of the License at | |
| http://www.apache.org/licenses/LICENSE-2.0 | |
| Unless required by applicable law or agreed to in writing, software | |
| distributed under the License is distributed on an "AS IS" BASIS, | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| See the License for the specific language governing permissions and | |
| limitations under the License. | |
| S.D.G.""" | |
| import argparse | |
| import io | |
| import sys | |
| import time | |
| from warnings import warn | |
| from chatterbox.tts import ChatterboxTTS | |
| from chatterbox.tts_turbo import ChatterboxTurboTTS | |
| from chatterbox.mtl_tts import ChatterboxMultilingualTTS | |
| from pygame import mixer | |
| import torch | |
| from torch import cuda | |
| from torch.backends import mps | |
| from torchcodec.encoders import AudioEncoder | |
# Pick the default compute device: Apple Metal Performance Shaders (MPS) when
# available, NVIDIA CUDA as the second choice, and plain CPU as the fallback.
DEVICE_DEFAULT = "mps" if mps.is_available() else "cuda" if cuda.is_available() else "cpu"

# Get all the command line arguments stuff done first
parser = argparse.ArgumentParser(
    prog='Chatterkey',
    description='CLI script for calling Chatterbox TTS',
    epilog='S.D.G.',
)
parser.add_argument(
    "-d", "--device",
    default=DEVICE_DEFAULT,
    choices=("cuda", "mps", "cpu"),
    help=f"What device to use for compute, defaults to {DEVICE_DEFAULT} on your system",
)
parser.add_argument(
    "-t", "--turbo",
    action=argparse.BooleanOptionalAction,
    help="Use the turbo model (English only)",
)
parser.add_argument(
    "-m", "--multilingual",
    help="Use the multilingual model, with the provided language ID",
)
parser.add_argument(
    "-p", "--audio-prompt",
    help="Base synthesis on this voice clip (specified by path)",
)
parser.add_argument(
    "-o", "--output-file",
    help="Write the output to a file",
)
parser.add_argument("text", nargs="*", type=str, help="What to say, defaults to stdin")
args = parser.parse_args()

# Turbo and multilingual are two different models, so reject the combination.
# This is reported through parser.error() rather than an assert: asserts are
# stripped when Python runs with -O, and a usage message with exit status 2 is
# friendlier than an AssertionError traceback.
# NOTE: an argparse mutually exclusive group is deliberately not used here,
# because it would also reject the harmless "--no-turbo --multilingual ID".
if args.turbo and args.multilingual:
    parser.error("Cannot do both turbo and multilingual: They are two different models!")
# --- Patch for Torch.load() by Noobies ---
# Make every torch.load() call default to the compute device the user picked,
# unless the caller already chose a map_location of its own.
map_location = torch.device(args.device)
torch_load_original = torch.load


def patched_torch_load(*load_args, **load_kwargs):
    """Call the original ``torch.load`` with a default ``map_location``.

    All positional and keyword arguments are forwarded untouched. The
    module-level ``map_location`` is only injected when the caller did not
    supply one, either by keyword or positionally.

    Returns:
        Whatever the original ``torch.load`` returns.
    """
    # map_location is torch.load's second positional parameter, so two or more
    # positional arguments mean the caller already provided it; injecting the
    # keyword as well would raise a "multiple values" TypeError.
    if len(load_args) < 2:
        load_kwargs.setdefault('map_location', map_location)
    return torch_load_original(*load_args, **load_kwargs)


torch.load = patched_torch_load
# --------
# Choose the model class the user asked for, then load its pretrained weights
if args.turbo:
    model_class = ChatterboxTurboTTS
elif args.multilingual:
    model_class = ChatterboxMultilingualTTS
else:
    # Neither flag was given: default to the English-only model
    model_class = ChatterboxTTS
model = model_class.from_pretrained(device=args.device)
print("=== Models loaded! ===")
def generate_wav_encoder(text: str):
    """Synthesize ``text`` with the loaded model and wrap the result in an AudioEncoder.

    The voice clip given by ``--audio-prompt`` (if any) is always passed along.
    The language ID is only passed when ``--multilingual`` was requested.
    """
    generate_kwargs = {"audio_prompt_path": args.audio_prompt}
    if args.multilingual:
        # This keyword argument is special to the multilingual model
        generate_kwargs["language_id"] = args.multilingual
    waveform = model.generate(text, **generate_kwargs)
    return AudioEncoder(waveform, sample_rate=model.sr)
def synth_and_play(text: str):
    """Synthesize ``text`` and play it right away, blocking until playback ends."""
    # Encode the synthesized audio as WAV into an in-memory buffer
    wav_buffer = io.BytesIO()
    encoder = generate_wav_encoder(text)
    encoder.to_file_like(wav_buffer, format="wav")
    wav_buffer.seek(0)  # Rewind so the mixer reads from the start

    # Start playback, then poll the channel until the sound has finished
    # Technically this code is AI generated, but I wrote it down and I understand how it works
    sound = mixer.Sound(wav_buffer)
    playback = sound.play()
    while playback.get_busy():
        time.sleep(0.1)
# No output file specified, play using PyGame Mixer
if not args.output_file:
    mixer.init()
    try:
        # Text was specified, just play it and we're done
        if args.text:
            synth_and_play(" ".join(args.text))
        # We are in stdin mode: speak each line as it arrives
        else:
            for line in sys.stdin:
                # A blank line carries nothing to say, so don't synthesize it
                if not line.strip():
                    continue
                synth_and_play(line)
    finally:
        # Shut the mixer down even if synthesis fails or the user interrupts
        mixer.quit()
# Output file was specified, write to file
else:
    # Regardless of stdin mode, acquire ALL text and synthesize it at once
    text = " ".join(args.text) if args.text else sys.stdin.read()
    generate_wav_encoder(text).to_file(args.output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment