Skip to content

Instantly share code, notes, and snippets.

@luiscape
Last active May 13, 2025 02:42
Show Gist options
  • Save luiscape/1a33a78540e3ae0d321f88b3c51280f8 to your computer and use it in GitHub Desktop.
Save luiscape/1a33a78540e3ae0d321f88b3c51280f8 to your computer and use it in GitHub Desktop.
parakeet_modal.py
# Run NVIDIA Parakeet in Modal.
import wave
import numpy as np
import modal
# Container image: CUDA 12.8 devel base with Python 3.12, with NeMo's ASR
# stack installed via uv. HF_HUB_ENABLE_HF_TRANSFER accelerates model
# downloads; CXX/CC are set so native extensions build inside the image
# (presumably required by NeMo's compiled components — confirm).
image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04", add_python="3.12"
    )
    .pip_install("uv")
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",
            "HF_HOME": "/cache",
            "DEBIAN_FRONTEND": "noninteractive",
            "CXX": "g++",
            "CC": "g++",
        }
    )
    .apt_install("ffmpeg")
    .run_commands(
        "uv pip install --system hf_transfer huggingface_hub[hf-xet] nemo_toolkit[asr] cuda-python>=12.3",
        # Pin NumPy below 2.0: the installed stack is incompatible with 2.x.
        "uv pip install --system 'numpy<2.0'",
    )
    .entrypoint([])
)

app = modal.App("parakeet", image=image)

# Persistent volume mounted at /cache (see HF_HOME above) so downloaded
# model weights survive across container runs.
model_cache = modal.Volume.from_name("parakeet-model-cache", create_if_missing=True)
# Defer heavyweight third-party imports to inside the container image so the
# local client can run this script without NeMo installed.
# NOTE: the pasted source had lost the body's indentation (SyntaxError as
# written); restored here.
with image.imports():
    import wave

    import nemo.collections.asr as nemo_asr
    import numpy as np
@app.cls(
    volumes={
        # HF_HOME points at /cache, so checkpoints persist across runs.
        "/cache": model_cache,
    },
    gpu="a10g",
)
class Parakeet:
    """Serves NVIDIA's Parakeet TDT 0.6B v2 ASR model on a Modal GPU.

    The pasted source had lost the method bodies' indentation (SyntaxError
    as written); structure restored here.
    """

    @modal.enter()
    def load(self):
        """Load the pretrained model once, at container start."""
        self.model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    @modal.method()
    def transcribe(self, audio_data: np.ndarray) -> str:
        """Transcribe an audio array and return the recognized text.

        Args:
            audio_data: 1-D sample array; callers here supply mono 16 kHz
                audio produced by convert_to_mono_16khz.

        Returns:
            The transcription text of the first (only) item in the batch.
        """
        output = self.model.transcribe([audio_data])
        transcription = output[0].text
        return transcription
def convert_to_mono_16khz(input_path: str) -> np.ndarray:
    """Read a PCM WAV file and return its samples as mono audio at 16 kHz.

    Multi-channel audio is downmixed by averaging the channels; any other
    sample rate is resampled with linear interpolation. The source file's
    integer sample dtype is preserved in the result.

    (The pasted source had lost the body's indentation — SyntaxError as
    written; structure restored here.)

    Args:
        input_path: Path to a WAV file with 8-, 16-, or 32-bit samples.

    Returns:
        1-D numpy array of mono samples at 16 kHz.

    Raises:
        ValueError: If the file's sample width is not 1, 2, or 4 bytes.
    """
    with wave.open(input_path, "rb") as wav_in:
        n_channels = wav_in.getnchannels()
        sample_width = wav_in.getsampwidth()
        frame_rate = wav_in.getframerate()
        n_frames = wav_in.getnframes()
        frames = wav_in.readframes(n_frames)

    # Map the PCM sample width to the matching numpy dtype.
    if sample_width == 1:
        dtype = np.uint8
    elif sample_width == 2:
        dtype = np.int16
    elif sample_width == 4:
        dtype = np.int32
    else:
        raise ValueError(f"Unsupported sample width: {sample_width}")

    audio_data = np.frombuffer(frames, dtype=dtype)

    # Downmix interleaved channels to mono by averaging them.
    if n_channels > 1:
        audio_data = audio_data.reshape(-1, n_channels)
        audio_data = audio_data.mean(axis=1).astype(dtype)

    # Naive linear-interpolation resample to 16 kHz. Adequate for speech,
    # but note there is no anti-aliasing filter when downsampling.
    if frame_rate != 16000:
        ratio = 16000 / frame_rate
        new_length = int(len(audio_data) * ratio)
        indices = np.linspace(0, len(audio_data) - 1, new_length)
        audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data).astype(dtype)

    return audio_data
# Sample file:
#
# "A Dream Within a Dream" by Edgar Allan Poe
# wget -O dream_within_a_dream.wav https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav
@app.local_entrypoint()
def main(file: str):
    """Transcribe a local WAV file using Parakeet running remotely on Modal.

    (The pasted source had lost the body's indentation — SyntaxError as
    written; structure restored here.)

    Args:
        file: Path to a local WAV file; it is converted to mono 16 kHz
            locally before being sent to the remote GPU container.
    """
    parakeet = Parakeet()
    audio_data = convert_to_mono_16khz(file)
    transcription = parakeet.transcribe.remote(audio_data)
    print(transcription)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment