@diegocostares
Last active May 3, 2024 18:53
AudioTranscriberTool: audio transcription tool built on WhisperX that accepts multiple input sources, including audio files, video files, and social media links. It then produces a transcript of the content in several formats (TXT, SRT, JSON).
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/diegocostares/881974b593bffcf1641f048be04f56c9/audiotranscribertool.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "3eb38cc7",
"metadata": {
"id": "3eb38cc7"
},
"source": [
"# ✨**AudioTranscriberTool con WhisperX** 🎙️\n",
"\n",
"---\n",
"Herramienta de transcripción de audio con WisperX que admite múltiples fuentes de entrada, incluidos archivos de audio, video y enlaces de redes sociales. Luego produce una transcripción del contenido en varios formatos (TXT, SRT, JSON).\n",
"\n",
"_Nota: Para ejecutar solo has click en el boton de reproduccion a la izquierda (▶️)_"
]
},
{
"cell_type": "markdown",
"source": [
"## Instalación de dependencias 📦"
],
"metadata": {
"id": "_cvbIZu8kqYX"
},
"id": "_cvbIZu8kqYX"
},
{
"cell_type": "code",
"execution_count": null,
"id": "839c047c",
"metadata": {
"id": "839c047c"
},
"outputs": [],
"source": [
"!pip install git+https://github.com/m-bain/whisperx.git --upgrade\n",
"!pip install -U yt-dlp"
]
},
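{
"cell_type": "markdown",
"id": "gpu-check-note",
"metadata": {},
"source": [
"_Optional: the `Transcriber` below defaults to `device=\"cuda\"` and `compute_type=\"float16\"`, so a GPU runtime is assumed. The next cell is a minimal sketch (using `torch`, which WhisperX installs as a dependency) that only confirms CUDA is visible before the model is loaded._"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "gpu-check",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: the Transcriber defaults to device=\"cuda\",\n",
"# so confirm the Colab runtime actually has a GPU attached.\n",
"import torch\n",
"\n",
"if torch.cuda.is_available():\n",
"    print(\"GPU detected:\", torch.cuda.get_device_name(0))\n",
"else:\n",
"    print(\"No GPU detected: switch the runtime to GPU, or create the\")\n",
"    print(\"Transcriber with device='cpu' and a CPU-friendly compute_type.\")\n"
]
},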
{
"cell_type": "markdown",
"source": [
"# Programa principal"
],
"metadata": {
"id": "TwtSHHeDRbb3"
},
"id": "TwtSHHeDRbb3"
},
{
"cell_type": "code",
"source": [
"import os\n",
"from google.colab import files\n",
"import whisperx\n",
"import json\n",
"import moviepy.editor as mp\n",
"import yt_dlp\n",
"import re\n",
"\n",
"\n",
"class Transcriber:\n",
" def __init__(self, device=\"cuda\", batch_size=5, compute_type=\"float16\"):\n",
" self.device = device\n",
" self.batch_size = batch_size\n",
" self.compute_type = compute_type\n",
" self.file_path = None\n",
" self.transcription = None\n",
"\n",
" def clear_previous_audio(self):\n",
" if self.file_path and os.path.exists(self.file_path):\n",
" os.remove(self.file_path)\n",
"\n",
" def upload_file(self, source_type):\n",
" self.clear_previous_audio()\n",
"\n",
" if source_type == \"social_media\":\n",
" url = input(f\"Ingrese la URL de la red social: \")\n",
" self.download_audio_from_url(url)\n",
" elif source_type in {\"audio\", \"video\"}:\n",
" self.file_path = self.upload_file_and_get_path()\n",
" else:\n",
" print(\"Tipo de fuente no válido.\")\n",
" return\n",
"\n",
" if source_type == \"video\":\n",
" self.extract_audio_from_video()\n",
"\n",
" def upload_file_and_get_path(self):\n",
" uploaded = files.upload()\n",
" return list(uploaded.keys())[0]\n",
"\n",
" def download_audio_from_url(self, url):\n",
" ydl_opts = {\n",
" \"format\": \"bestaudio/best\",\n",
" \"postprocessors\": [\n",
" {\n",
" \"key\": \"FFmpegExtractAudio\",\n",
" \"preferredcodec\": \"wav\",\n",
" \"preferredquality\": \"192\",\n",
" }\n",
" ],\n",
" \"outtmpl\": f\"extracted_audio.%(ext)s\",\n",
" \"quiet\": True,\n",
" }\n",
" with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
" ydl.download([url])\n",
" self.file_path = f\"extracted_audio.wav\"\n",
"\n",
" def extract_audio_from_video(self):\n",
" clip = mp.VideoFileClip(self.file_path)\n",
" clip.audio.write_audiofile(\"extracted_audio.wav\")\n",
" self.file_path = \"extracted_audio.wav\"\n",
"\n",
" def transcribe_audio(self, output_format):\n",
" self.output_format = output_format\n",
" model = whisperx.load_model(\n",
" \"large-v2\", self.device, compute_type=self.compute_type\n",
" )\n",
" audio = whisperx.load_audio(self.file_path)\n",
" self.transcription = model.transcribe(audio, batch_size=self.batch_size)\n",
"\n",
" model_a, metadata = whisperx.load_align_model(\n",
" language_code=self.transcription[\"language\"], device=self.device\n",
" )\n",
" self.transcription = whisperx.align(\n",
" self.transcription[\"segments\"],\n",
" model_a,\n",
" metadata,\n",
" audio,\n",
" self.device,\n",
" return_char_alignments=False,\n",
" )\n",
"\n",
" self.save_transcription()\n",
"\n",
" def save_transcription(self):\n",
" if self.output_format == \"txt\":\n",
" transcription_text = \"\\n\".join(\n",
" [segment[\"text\"] for segment in self.transcription[\"segments\"]]\n",
" )\n",
" with open(\"transcription.txt\", \"w\", encoding=\"utf8\") as file:\n",
" file.write(transcription_text)\n",
" files.download(\"transcription.txt\")\n",
"\n",
" elif self.output_format == \"srt\":\n",
" words_per_entry = int(\n",
" input(\n",
" f\"\\n\\nIngrese la cantidad de palabras por entrada (recommended 3-4): \"\n",
" )\n",
" )\n",
" srt_content = self.create_srt_content(self.transcription, words_per_entry)\n",
" with open(\"transcription.srt\", \"w\", encoding=\"utf8\") as file:\n",
" file.write(srt_content)\n",
" files.download(\"transcription.srt\")\n",
"\n",
" elif self.output_format == \"json\":\n",
" with open(\"transcription.json\", \"w\", encoding=\"utf8\") as file:\n",
" json.dump(self.transcription, file, indent=4, ensure_ascii=False)\n",
" files.download(\"transcription.json\")\n",
" else:\n",
" print(\"Formato no soportado.\")\n",
"\n",
" def create_srt_content(self, data, words_per_entry=5):\n",
" srt_content = \"\"\n",
" global_idx = 1\n",
"\n",
" for segment in data[\"segments\"]:\n",
" words = segment[\"text\"].split()\n",
" total_words = len(words)\n",
" segment_duration = segment[\"end\"] - segment[\"start\"]\n",
"\n",
" entry_words = []\n",
" current_word_count = 0\n",
" is_break_point = False\n",
"\n",
" for word in words:\n",
" if bool(re.search(r\"[,.?]\", word)):\n",
" is_break_point = True\n",
" entry_words.append(word)\n",
" current_word_count += 1\n",
"\n",
" if current_word_count >= words_per_entry or is_break_point:\n",
" entry_duration = (\n",
" current_word_count / total_words\n",
" ) * segment_duration\n",
"\n",
" start_time = self.seconds_to_srt_time(segment[\"start\"])\n",
" end_time = self.seconds_to_srt_time(\n",
" segment[\"start\"] + entry_duration\n",
" )\n",
"\n",
" srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n",
" global_idx += 1\n",
"\n",
" segment[\"start\"] += entry_duration\n",
" entry_words = []\n",
" current_word_count = 0\n",
" is_break_point = False\n",
"\n",
" if entry_words:\n",
" start_time = self.seconds_to_srt_time(segment[\"start\"])\n",
" end_time = self.seconds_to_srt_time(segment[\"end\"])\n",
"\n",
" srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n",
" global_idx += 1\n",
"\n",
" return srt_content\n",
"\n",
" @staticmethod\n",
" def seconds_to_srt_time(seconds):\n",
" hours, remainder = divmod(seconds, 3600)\n",
" minutes, remainder = divmod(remainder, 60)\n",
" seconds, milliseconds = divmod(remainder, 1)\n",
" return \"{:02}:{:02}:{:02},{:03}\".format(\n",
" int(hours), int(minutes), int(seconds), int(milliseconds * 1000)\n",
" )\n",
"\n",
"\n",
"# @title Transcriptor de Audio { vertical-output: true, display-mode: \"form\" }\n",
"source_type = \"social_media\" # @param [\"audio\", \"video\", \"social_media\"]\n",
"output_format = \"txt\" # @param [\"txt\", \"srt\", \"json\"]\n",
"\n",
"\n",
"transcriber = Transcriber()\n",
"transcriber.upload_file(source_type=source_type)\n",
"transcriber.transcribe_audio(output_format=output_format)\n"
],
"metadata": {
"id": "6rkpcEtpnURd"
},
"id": "6rkpcEtpnURd",
"execution_count": null,
"outputs": []
}
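,
{
"cell_type": "markdown",
"id": "srt-split-note",
"metadata": {},
"source": [
"_Optional: `create_srt_content` breaks each aligned segment into entries of roughly `words_per_entry` words (ending an entry early when a word contains `,`, `.` or `?`) and gives each entry a share of the segment's duration proportional to its word count. The sketch below runs it on a small hand-made segment dict, so it needs no GPU or audio, only the `Transcriber` class defined in the cell above; the sample text and timings are made up for illustration._"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "srt-split-demo",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: a hand-made dict in the same shape whisperx.align returns\n",
"# ({\"segments\": [{\"text\", \"start\", \"end\"}, ...]}), used to preview how\n",
"# create_srt_content splits a segment into timed SRT entries.\n",
"sample = {\n",
"    \"segments\": [\n",
"        {\n",
"            \"text\": \"Hello everyone, welcome to the show. Today we talk about audio.\",\n",
"            \"start\": 0.0,\n",
"            \"end\": 6.0,\n",
"        }\n",
"    ]\n",
"}\n",
"\n",
"demo = Transcriber()  # no model is loaded here; we only use the SRT helper\n",
"print(demo.create_srt_content(sample, words_per_entry=4))\n"
]
}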
],
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [
"_cvbIZu8kqYX"
],
"gpuType": "T4",
"include_colab_link": true
},
"language_info": {
"name": "python"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"nbformat": 4,
"nbformat_minor": 5
}