Run Whisper speech-to-text in Google Colab
# Convert the uploaded .m4a recording to .wav (Colab shell cell).
!ffmpeg -i file.m4a file.wav

import os

# Use expandable CUDA memory segments to reduce fragmentation and out-of-memory errors.
# This must be set before torch is imported.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# torch must be loaded after the env setup
import torch

torch.cuda.empty_cache()

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Run on GPU in half precision when available, otherwise fall back to CPU and float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the Whisper model and its processor (tokenizer + feature extractor).
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Build the automatic-speech-recognition pipeline around the loaded model.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Transcribe the converted WAV file; force Persian decoding and keep timestamps.
result = pipe("file.wav", return_timestamps=True, generate_kwargs={"language": "persian"})
print(result["text"])
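Because the call above passes return_timestamps=True, the pipeline output also carries segment-level timestamps under a "chunks" key alongside the plain text. The short sketch below (assuming the result object produced above) shows one way to print each timestamped segment:

# Minimal sketch: print each transcribed segment with its (start, end) timestamps.
# Assumes `result` comes from the pipe(...) call above with return_timestamps=True;
# the end timestamp of the final chunk may be None.
for chunk in result.get("chunks", []):
    start, end = chunk["timestamp"]
    print(f"[{start} - {end}] {chunk['text']}")

For recordings much longer than 30 seconds, the same pipeline call can usually also take chunk_length_s (and optionally batch_size) to transcribe the audio in overlapping chunks; that is a standard transformers pipeline option, not something this gist sets.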