Skip to content

Instantly share code, notes, and snippets.

@luiscape
Last active May 13, 2025 02:42
Show Gist options
  • Save luiscape/1a33a78540e3ae0d321f88b3c51280f8 to your computer and use it in GitHub Desktop.
Save luiscape/1a33a78540e3ae0d321f88b3c51280f8 to your computer and use it in GitHub Desktop.
parakeet_modal.py
# Run NVIDIA Parakeet in Modal.
import wave
import numpy as np
import modal
# Container image: CUDA 12.8 devel base with Python 3.12, with NeMo's ASR
# stack installed via uv. HF_HUB_ENABLE_HF_TRANSFER accelerates model
# downloads; CXX/CC are set so native extensions build inside the image
# (presumably required by NeMo's compiled components — confirm).
image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04", add_python="3.12"
    )
    .pip_install("uv")
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",
            "HF_HOME": "/cache",
            "DEBIAN_FRONTEND": "noninteractive",
            "CXX": "g++",
            "CC": "g++",
        }
    )
    .apt_install("ffmpeg")
    .run_commands(
        "uv pip install --system hf_transfer huggingface_hub[hf-xet] nemo_toolkit[asr] cuda-python>=12.3",
        # Pin NumPy below 2.0: the installed stack is incompatible with 2.x.
        "uv pip install --system 'numpy<2.0'",
    )
    .entrypoint([])
)

app = modal.App("parakeet", image=image)

# Persistent volume mounted at /cache (see HF_HOME above) so downloaded
# model weights survive across container runs.
model_cache = modal.Volume.from_name("parakeet-model-cache", create_if_missing=True)
# Defer heavyweight third-party imports to inside the container image so the
# local client can run this script without NeMo installed.
# NOTE: the pasted source had lost the body's indentation (SyntaxError as
# written); restored here.
with image.imports():
    import wave

    import nemo.collections.asr as nemo_asr
    import numpy as np
@app.cls(
    volumes={
        # HF_HOME points at /cache, so checkpoints persist across runs.
        "/cache": model_cache,
    },
    gpu="a10g",
)
class Parakeet:
    """Serves NVIDIA's Parakeet TDT 0.6B v2 ASR model on a Modal GPU.

    The pasted source had lost the method bodies' indentation (SyntaxError
    as written); structure restored here.
    """

    @modal.enter()
    def load(self):
        """Load the pretrained model once, at container start."""
        self.model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    @modal.method()
    def transcribe(self, audio_data: np.ndarray) -> str:
        """Transcribe an audio array and return the recognized text.

        Args:
            audio_data: 1-D sample array; callers here supply mono 16 kHz
                audio produced by convert_to_mono_16khz.

        Returns:
            The transcription text of the first (only) item in the batch.
        """
        output = self.model.transcribe([audio_data])
        transcription = output[0].text
        return transcription
def convert_to_mono_16khz(input_path: str) -> np.ndarray:
    """Read a PCM WAV file and return its samples as mono audio at 16 kHz.

    Multi-channel audio is downmixed by averaging the channels; any other
    sample rate is resampled with linear interpolation. The source file's
    integer sample dtype is preserved in the result.

    (The pasted source had lost the body's indentation — SyntaxError as
    written; structure restored here.)

    Args:
        input_path: Path to a WAV file with 8-, 16-, or 32-bit samples.

    Returns:
        1-D numpy array of mono samples at 16 kHz.

    Raises:
        ValueError: If the file's sample width is not 1, 2, or 4 bytes.
    """
    with wave.open(input_path, "rb") as wav_in:
        n_channels = wav_in.getnchannels()
        sample_width = wav_in.getsampwidth()
        frame_rate = wav_in.getframerate()
        n_frames = wav_in.getnframes()
        frames = wav_in.readframes(n_frames)

    # Map the PCM sample width to the matching numpy dtype.
    if sample_width == 1:
        dtype = np.uint8
    elif sample_width == 2:
        dtype = np.int16
    elif sample_width == 4:
        dtype = np.int32
    else:
        raise ValueError(f"Unsupported sample width: {sample_width}")

    audio_data = np.frombuffer(frames, dtype=dtype)

    # Downmix interleaved channels to mono by averaging them.
    if n_channels > 1:
        audio_data = audio_data.reshape(-1, n_channels)
        audio_data = audio_data.mean(axis=1).astype(dtype)

    # Naive linear-interpolation resample to 16 kHz. Adequate for speech,
    # but note there is no anti-aliasing filter when downsampling.
    if frame_rate != 16000:
        ratio = 16000 / frame_rate
        new_length = int(len(audio_data) * ratio)
        indices = np.linspace(0, len(audio_data) - 1, new_length)
        audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data).astype(dtype)

    return audio_data
# Sample file:
#
# "A Dream Within a Dream" by Edgar Allan Poe
# wget -O dream_within_a_dream.wav https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav
@app.local_entrypoint()
def main(file: str):
    """Transcribe a local WAV file using Parakeet running remotely on Modal.

    (The pasted source had lost the body's indentation — SyntaxError as
    written; structure restored here.)

    Args:
        file: Path to a local WAV file; it is converted to mono 16 kHz
            locally before being sent to the remote GPU container.
    """
    parakeet = Parakeet()
    audio_data = convert_to_mono_16khz(file)
    transcription = parakeet.transcribe.remote(audio_data)
    print(transcription)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment