thelabcat · June 17, 2026 16:23
diff --git a/chatterkey.py b/chatterkey.py
 #!/usr/bin/env python3
 """Chatterkey

 CLI script for calling Chatterbox TTS

 Copyright 2026 Wilbur Jaywright dba Marswide BGL

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

 S.D.G."""

 import argparse
 import io
 import sys
 import time
 from warnings import warn
 from chatterbox.tts import ChatterboxTTS
 from chatterbox.tts_turbo import ChatterboxTurboTTS
 from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 from pygame import mixer
 import torch
 from torch import cuda
 from torch.backends import mps
 from torchcodec.encoders import AudioEncoder

 # Pick the default compute device: Mac Metal Performance Shaders (MPS) first,
 # then NVidia CUDA secondarily, and last the CPU
 DEVICE_DEFAULT = "mps" if mps.is_available() else "cuda" if cuda.is_available() else "cpu"

 # Get all the command line arguments stuff done first
 parser = argparse.ArgumentParser(
                    prog='Chatterkey',
                    description='CLI script for calling Chatterbox TTS',
                    epilog='S.D.G.')

 parser.add_argument("-d", "--device", default=DEVICE_DEFAULT, choices=("cuda", "mps", "cpu"), help=f"What device to use for compute, defaults to {DEVICE_DEFAULT} on your system")
 parser.add_argument("-t", "--turbo", action=argparse.BooleanOptionalAction, help="Use the turbo model (English only)")
 parser.add_argument("-m", "--multilingual", help="Use the multilingual model, with the provided language ID")
 parser.add_argument("-p", "--audio-prompt", help="Base synthesis on this voice clip (specified by path)")
 parser.add_argument("-o", "--output-file", help="Write the output to a file")
 parser.add_argument("text", nargs="*", type=str, help="What to say, defaults to stdin")
 args = parser.parse_args()

 # Turbo and multilingual are two different models, so refuse the combination.
 # parser.error() is used instead of assert: assertions are stripped when Python
 # runs with -O, and parser.error() prints the usage text and exits with status 2.
 # An argparse mutually exclusive group is not used here because it would also
 # reject "--no-turbo" given together with "--multilingual".
 if args.turbo and args.multilingual:
    parser.error("Cannot do both turbo and multilingual: They are two different models!")


 # --- Patch for Torch.load() by Noobies ---
 # Any torch.load() call that does not choose its own map_location is given
 # the device selected on the command line
 map_location = torch.device(args.device)
 torch_load_original = torch.load


 def patched_torch_load(*positional, **options):
    """Forward to the original torch.load, filling in map_location when the caller omitted it"""
    options.setdefault('map_location', map_location)
    return torch_load_original(*positional, **options)


 torch.load = patched_torch_load
 # --------

 # Choose the model class the user asked for, then load its pretrained weights
 # onto the selected device
 model = (
    ChatterboxTurboTTS if args.turbo
    else ChatterboxMultilingualTTS if args.multilingual
    else ChatterboxTTS  # Default to the English-only model
 ).from_pretrained(device=args.device)
 print("=== Models loaded! ===")


 def generate_wav_encoder(text: str):
    """Run the loaded model on the text, and wrap the resulting audio in an AudioEncoder"""
    generate_options = {"audio_prompt_path": args.audio_prompt}

    # The language ID keyword is only passed when the multilingual model is in use
    if args.multilingual:
        generate_options["language_id"] = args.multilingual

    waveform = model.generate(text, **generate_options)
    return AudioEncoder(waveform, sample_rate=model.sr)


 def synth_and_play(text: str):
    """Speak the given text right away, returning only once playback has ended"""
    # Encode the synthesized audio as WAV into memory, then rewind for reading
    wav_buffer = io.BytesIO()
    encoder = generate_wav_encoder(text)
    encoder.to_file_like(wav_buffer, format="wav")
    wav_buffer.seek(0)

    # Start playback, then poll the channel until it is no longer busy
    playback = mixer.Sound(wav_buffer).play()
    while playback.get_busy():
        time.sleep(0.1)


 # No output file specified, play using PyGame Mixer
 if not args.output_file:
    mixer.init()
    try:
        # Text was specified, just play it and we're done
        if args.text:
            synth_and_play(" ".join(args.text))

        # We are in stdin mode: speak each line as it is read
        else:
            for line in sys.stdin:
                # Whitespace-only lines hold nothing to say, so skip them
                if line.strip():
                    synth_and_play(line)
    finally:
        # Shut the mixer down even if synthesis fails or the user interrupts
        mixer.quit()

 # Output file was specified, write to file
 else:
    # Regardless of stdin mode, acquire ALL text and synthesize it at once
    text = " ".join(args.text) if args.text else sys.stdin.read()
    generate_wav_encoder(text).to_file(args.output_file)
	#!/usr/bin/env python3
	"""Chatterkey

	CLI script for calling Chatterbox TTS

	Copyright 2026 Wilbur Jaywright dba Marswide BGL

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.

	S.D.G."""

	import argparse
	import io
	import sys
	import time
	from warnings import warn
	from chatterbox.tts import ChatterboxTTS
	from chatterbox.tts_turbo import ChatterboxTurboTTS
	from chatterbox.mtl_tts import ChatterboxMultilingualTTS
	from pygame import mixer
	import torch
	from torch import cuda
	from torch.backends import mps
	from torchcodec.encoders import AudioEncoder

	# Pick the default compute device: Mac Metal Performance Shaders (MPS) first,
	# then NVidia CUDA secondarily, and last the CPU
	DEVICE_DEFAULT = "mps" if mps.is_available() else "cuda" if cuda.is_available() else "cpu"

	# Get all the command line arguments stuff done first
	parser = argparse.ArgumentParser(
	    prog='Chatterkey',
	    description='CLI script for calling Chatterbox TTS',
	    epilog='S.D.G.')

	parser.add_argument("-d", "--device", default=DEVICE_DEFAULT, choices=("cuda", "mps", "cpu"), help=f"What device to use for compute, defaults to {DEVICE_DEFAULT} on your system")
	parser.add_argument("-t", "--turbo", action=argparse.BooleanOptionalAction, help="Use the turbo model (English only)")
	parser.add_argument("-m", "--multilingual", help="Use the multilingual model, with the provided language ID")
	parser.add_argument("-p", "--audio-prompt", help="Base synthesis on this voice clip (specified by path)")
	parser.add_argument("-o", "--output-file", help="Write the output to a file")
	parser.add_argument("text", nargs="*", type=str, help="What to say, defaults to stdin")
	args = parser.parse_args()

	# Turbo and multilingual are two different models, so refuse the combination.
	# parser.error() is used instead of assert: assertions are stripped when Python
	# runs with -O, and parser.error() prints the usage text and exits with status 2.
	# An argparse mutually exclusive group is not used here because it would also
	# reject "--no-turbo" given together with "--multilingual".
	if args.turbo and args.multilingual:
	    parser.error("Cannot do both turbo and multilingual: They are two different models!")


	# --- Patch for Torch.load() by Noobies ---
	# Any torch.load() call that does not choose its own map_location is given
	# the device selected on the command line
	map_location = torch.device(args.device)
	torch_load_original = torch.load


	def patched_torch_load(*args, **kwargs):
	    """Forward to the original torch.load, filling in map_location when the caller omitted it"""
	    # kwargs must be a real keyword dict (**kwargs) for this lookup and
	    # assignment to work, and for callers to be able to pass keywords at all
	    if 'map_location' not in kwargs:
	        kwargs['map_location'] = map_location
	    return torch_load_original(*args, **kwargs)


	torch.load = patched_torch_load
	# --------

	# Choose the model class the user asked for, then load its pretrained weights
	# onto the selected device
	model = (
	    ChatterboxTurboTTS if args.turbo
	    else ChatterboxMultilingualTTS if args.multilingual
	    else ChatterboxTTS  # Default to the English-only model
	).from_pretrained(device=args.device)
	print("=== Models loaded! ===")


	def generate_wav_encoder(text: str):
	    """Synthesize the given text, and return an audio encoder around it"""
	    generate_options = {"audio_prompt_path": args.audio_prompt}

	    # The language ID keyword is only passed when the multilingual model is in use
	    if args.multilingual:
	        generate_options["language_id"] = args.multilingual

	    wav = model.generate(text, **generate_options)
	    return AudioEncoder(wav, sample_rate=model.sr)


	def synth_and_play(text: str):
	    """Synthesize the given text, and play it immediately (blocking)"""
	    # Encode the synthesized audio as WAV into memory, then rewind for reading
	    b = io.BytesIO()
	    generate_wav_encoder(text).to_file_like(b, format="wav")
	    b.seek(0)

	    # Wait for sound to finish playing by polling the channel until it is idle
	    channel = mixer.Sound(b).play()
	    while channel.get_busy():
	        time.sleep(0.1)


	# No output file specified, play using PyGame Mixer
	if not args.output_file:
	    mixer.init()
	    try:
	        # Text was specified, just play it and we're done
	        if args.text:
	            synth_and_play(" ".join(args.text))

	        # We are in stdin mode: speak each line as it is read
	        else:
	            for line in sys.stdin:
	                # Whitespace-only lines hold nothing to say, so skip them
	                if line.strip():
	                    synth_and_play(line)
	    finally:
	        # Shut the mixer down even if synthesis fails or the user interrupts
	        mixer.quit()

	# Output file was specified, write to file
	else:
	    # Regardless of stdin mode, acquire ALL text and synthesize it at once
	    text = " ".join(args.text) if args.text else sys.stdin.read()
	    generate_wav_encoder(text).to_file(args.output_file)
No results found