Last active
May 13, 2025 02:42
-
-
Save luiscape/1a33a78540e3ae0d321f88b3c51280f8 to your computer and use it in GitHub Desktop.
parakeet_modal.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run NVIDIA Parakeet in Modal. | |
import wave | |
import numpy as np | |
import modal | |
# Container image for the Parakeet service: an NVIDIA CUDA 12.8 + cuDNN devel
# base with Python 3.12 added, ffmpeg from apt, and the NeMo ASR toolkit
# installed through uv.
image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04", add_python="3.12"
    )
    .pip_install("uv")
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",
            # Same path the model-cache volume is mounted at, so Hugging Face
            # downloads land on the volume.
            "HF_HOME": "/cache",
            "DEBIAN_FRONTEND": "noninteractive",
            "CXX": "g++",
            # NOTE(review): CC points at the C++ compiler as well; kept as in
            # the original -- confirm this is intentional.
            "CC": "g++",
        }
    )
    .apt_install("ffmpeg")
    .run_commands(
        # Each command string is handed to a shell. Requirement specifiers
        # containing `>`, `<` or `[...]` must be quoted: an unquoted
        # `cuda-python>=12.3` is parsed as "install cuda-python and redirect
        # stdout to a file named '=12.3'", silently dropping the constraint.
        "uv pip install --system hf_transfer 'huggingface_hub[hf-xet]' "
        "'nemo_toolkit[asr]' 'cuda-python>=12.3'",
        # downgrade numpy; incompatible current version
        "uv pip install --system 'numpy<2.0'",
    )
    .entrypoint([])
)
# Named, persistent volume used as the model cache; it is created on first
# use if it does not exist yet.
model_cache = modal.Volume.from_name(
    "parakeet-model-cache",
    create_if_missing=True,
)

# The Modal application; `image` is passed as the app-level image.
app = modal.App("parakeet", image=image)

# Imports wrapped in the image's import context: NeMo is installed by the
# image build and is presumably not available on the local machine.
with image.imports():
    import nemo.collections.asr as nemo_asr
    import numpy as np
    import wave
@app.cls(
    gpu="a10g",
    volumes={"/cache": model_cache},
)
class Parakeet:
    """Modal class wrapping the NVIDIA Parakeet TDT 0.6B v2 ASR model.

    Runs on an A10G GPU with the model-cache volume mounted at ``/cache``.
    """

    @modal.enter()
    def load(self):
        """Load the pretrained model and keep it on the instance."""
        self.model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    @modal.method()
    def transcribe(self, audio_data: np.ndarray) -> str:
        """Transcribe one audio clip and return its text.

        Args:
            audio_data: Audio samples for a single clip. The local entrypoint
                passes mono 16 kHz samples.

        Returns:
            The ``text`` attribute of the first (and only) result.
        """
        # The model is called with a batch; wrap the single clip in a list
        # and unwrap the single result.
        results = self.model.transcribe([audio_data])
        return results[0].text
def convert_to_mono_16khz(input_path: str) -> np.ndarray:
    """Load a PCM WAV file as a mono, 16 kHz array of samples.

    Multi-channel audio is down-mixed by averaging the channels. Audio at any
    other sample rate is resampled with plain linear interpolation (no
    low-pass filtering is applied). The returned array keeps the integer
    dtype that matches the file's sample width.

    Args:
        input_path: Path to a WAV file readable by the ``wave`` module.

    Returns:
        A 1-D array of dtype ``uint8``, ``int16`` or ``int32`` (for 1-, 2- and
        4-byte samples respectively). A file with no frames yields an empty
        array.

    Raises:
        ValueError: If the sample width is not 1, 2 or 4 bytes.
    """
    target_rate = 16000
    dtype_by_width = {1: np.uint8, 2: np.int16, 4: np.int32}

    with wave.open(input_path, "rb") as wav_in:
        n_channels = wav_in.getnchannels()
        sample_width = wav_in.getsampwidth()
        frame_rate = wav_in.getframerate()
        frames = wav_in.readframes(wav_in.getnframes())

    if sample_width not in dtype_by_width:
        raise ValueError(f"Unsupported sample width: {sample_width}")
    dtype = dtype_by_width[sample_width]

    audio_data = np.frombuffer(frames, dtype=dtype)

    # Nothing to down-mix or resample. Returning early also avoids the
    # failures an empty buffer triggers below: `reshape(-1, n)` is ambiguous
    # for size 0 and `np.interp` rejects empty sample points.
    if audio_data.size == 0:
        return audio_data

    # Frames are interleaved per channel; fold them into (n_frames,
    # n_channels) and average across channels to get mono.
    if n_channels > 1:
        audio_data = audio_data.reshape(-1, n_channels)
        audio_data = audio_data.mean(axis=1).astype(dtype)

    if frame_rate != target_rate:
        ratio = target_rate / frame_rate
        new_length = int(len(audio_data) * ratio)
        # Sample the original signal at evenly spaced fractional positions
        # and interpolate linearly between neighbouring samples.
        indices = np.linspace(0, len(audio_data) - 1, new_length)
        audio_data = np.interp(
            indices, np.arange(len(audio_data)), audio_data
        ).astype(dtype)

    return audio_data
# Sample file: | |
# | |
# "A Dream Within a Dream" by Edgar Allan Poe
# wget -O dream_within_a_dream.wav https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav | |
@app.local_entrypoint()
def main(file: str):
    """Transcribe a local WAV file with the remote Parakeet service.

    Args:
        file: Path to the WAV file to transcribe.
    """
    service = Parakeet()
    # Decode, down-mix and resample locally; only the sample array is sent
    # to the remote method.
    samples = convert_to_mono_16khz(file)
    print(service.transcribe.remote(samples))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment