#!/usr/bin/env python3
"""
transcribe_video_to_srt.py
Transcribe a video or audio file into SRT subtitles using OpenAI Whisper.
Dependencies & Install:
------------------------------------
# 1. Create & activate a virtual environment (optional but recommended):
# python3 -m venv venv
# source venv/bin/activate
#
# 2. Install FFmpeg (must be in your PATH):
# Homebrew: brew install ffmpeg
# On macOS, run this once to avoid CERTIFICATE_VERIFY_FAILED errors: /Applications/Python\ 3.12/Install\ Certificates.command
#
# 3. Install Whisper (and its dependencies):
# pip install git+https://github.com/openai/whisper.git
#
# 4. (Optional) For CUDA GPU acceleration, install matching torch wheels:
# pip install torch torchvision torchaudio \
#     --index-url https://download.pytorch.org/whl/cu117
#
# Usage: ./transcribe_video_to_srt.py input_video.mp4 --model small --language en --max-words 8 --output subtitles.srt
"""
import argparse
import os
import sys
from datetime import timedelta
import whisper


def format_timestamp(seconds: float) -> str:
    """
    Convert seconds (float) to SRT timestamp format: HH:MM:SS,mmm
    """
    td = timedelta(seconds=seconds)
    total_seconds = int(td.total_seconds())
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    secs = total_seconds % 60
    milliseconds = int(td.microseconds / 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"


def segments_to_srt(segments: list[dict], max_words: int) -> str:
    """
    Given Whisper segments, build a string in .srt format.
    If a segment has more than max_words words, split it into smaller chunks.
    """
    lines: list[str] = []
    idx = 1
    for seg in segments:
        words = seg["text"].strip().split()
        start_time = seg["start"]
        end_time = seg["end"]
        duration = end_time - start_time
        # If short enough, emit as-is
        if len(words) <= max_words:
            lines.append(str(idx))
            lines.append(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}")
            lines.append(" ".join(words))
            lines.append("")
            idx += 1
        else:
            # Split into sub-segments of up to max_words
            word_duration = duration / len(words)
            for i in range(0, len(words), max_words):
                chunk = words[i : i + max_words]
                sub_start = start_time + i * word_duration
                sub_end = sub_start + len(chunk) * word_duration
                lines.append(str(idx))
                lines.append(f"{format_timestamp(sub_start)} --> {format_timestamp(sub_end)}")
                lines.append(" ".join(chunk))
                lines.append("")
                idx += 1
    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(
        description="Transcribe video/audio to SRT using OpenAI Whisper"
    )
    parser.add_argument(
        "input",
        help="Path to input video or audio file (e.g. .mp4, .wav)",
    )
    parser.add_argument(
        "-m", "--model",
        default="small",
        help="Whisper model to use (tiny, base, small, medium, large)",
    )
    parser.add_argument(
        "-l", "--language",
        default=None,
        help="Force language code (e.g. en, es). If omitted, will auto-detect.",
    )
    parser.add_argument(
        "-w", "--max-words",
        type=int,
        default=10,
        help="Maximum words per SRT caption chunk (default: 10)",
    )
    parser.add_argument(
        "-o", "--output",
        help="Path to output .srt file (default: <input>.<model>.srt)",
    )
    args = parser.parse_args()

    input_path = args.input
    if not os.path.isfile(input_path):
        print(f"Error: file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    model_size = args.model
    output_path = args.output or f"{os.path.splitext(input_path)[0]}.{model_size}.srt"

    print(f"[1/3] Loading Whisper model '{model_size}'…")
    model = whisper.load_model(model_size)

    print(f"[2/3] Transcribing '{input_path}'…")
    result = model.transcribe(input_path, language=args.language)
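    # Note: on CPU-only machines Whisper falls back to FP32 and prints an
    # "FP16 is not supported on CPU" warning; this is expected and harmless.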
    segments = result.get("segments", [])
    if not segments:
        print("No speech segments detected. Exiting.", file=sys.stderr)
        sys.exit(1)

    print(f"[3/3] Converting {len(segments)} segments into SRT (max {args.max_words} words each)…")
    srt_content = segments_to_srt(segments, args.max_words)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(srt_content)

    print(f"✅ Done! Subtitles written to '{output_path}'")


if __name__ == "__main__":
    main()