Last active
May 3, 2024 18:53
-
-
Save diegocostares/881974b593bffcf1641f048be04f56c9 to your computer and use it in GitHub Desktop.
AudioTranscriberTool: Herramienta de transcripción de audio con WhisperX que admite múltiples fuentes de entrada, incluidos archivos de audio, video y enlaces de redes sociales. Luego produce una transcripción del contenido en varios formatos (TXT, SRT, JSON).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/diegocostares/881974b593bffcf1641f048be04f56c9/audiotranscribertool.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3eb38cc7", | |
"metadata": { | |
"id": "3eb38cc7" | |
}, | |
"source": [ | |
"# ✨**AudioTranscriberTool con WhisperX** 🎙️\n", | |
"\n", | |
"---\n", | |
"Herramienta de transcripción de audio con WhisperX que admite múltiples fuentes de entrada, incluidos archivos de audio, video y enlaces de redes sociales. Luego produce una transcripción del contenido en varios formatos (TXT, SRT, JSON).\n", | |
"\n", | |
"_Nota: Para ejecutar, solo haz clic en el botón de reproducción a la izquierda (▶️)_" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Instalación de dependencias 📦" | |
], | |
"metadata": { | |
"id": "_cvbIZu8kqYX" | |
}, | |
"id": "_cvbIZu8kqYX" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "839c047c", | |
"metadata": { | |
"id": "839c047c" | |
}, | |
"outputs": [], | |
"source": [ | |
"!pip install git+https://github.com/m-bain/whisperx.git --upgrade\n", | |
"!pip install -U yt-dlp" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Programa principal" | |
], | |
"metadata": { | |
"id": "TwtSHHeDRbb3" | |
}, | |
"id": "TwtSHHeDRbb3" | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import os\n", | |
"from google.colab import files\n", | |
"import whisperx\n", | |
"import json\n", | |
"import moviepy.editor as mp\n", | |
"import yt_dlp\n", | |
"import re\n", | |
"\n", | |
"\n", | |
"class Transcriber:\n", | |
" def __init__(self, device=\"cuda\", batch_size=5, compute_type=\"float16\"):\n", | |
" self.device = device\n", | |
" self.batch_size = batch_size\n", | |
" self.compute_type = compute_type\n", | |
" self.file_path = None\n", | |
" self.transcription = None\n", | |
"\n", | |
" def clear_previous_audio(self):\n", | |
" if self.file_path and os.path.exists(self.file_path):\n", | |
" os.remove(self.file_path)\n", | |
"\n", | |
" def upload_file(self, source_type):\n", | |
" self.clear_previous_audio()\n", | |
"\n", | |
" if source_type == \"social_media\":\n", | |
" url = input(f\"Ingrese la URL de la red social: \")\n", | |
" self.download_audio_from_url(url)\n", | |
" elif source_type in {\"audio\", \"video\"}:\n", | |
" self.file_path = self.upload_file_and_get_path()\n", | |
" else:\n", | |
" print(\"Tipo de fuente no válido.\")\n", | |
" return\n", | |
"\n", | |
" if source_type == \"video\":\n", | |
" self.extract_audio_from_video()\n", | |
"\n", | |
" def upload_file_and_get_path(self):\n", | |
" uploaded = files.upload()\n", | |
" return list(uploaded.keys())[0]\n", | |
"\n", | |
" def download_audio_from_url(self, url):\n", | |
" ydl_opts = {\n", | |
" \"format\": \"bestaudio/best\",\n", | |
" \"postprocessors\": [\n", | |
" {\n", | |
" \"key\": \"FFmpegExtractAudio\",\n", | |
" \"preferredcodec\": \"wav\",\n", | |
" \"preferredquality\": \"192\",\n", | |
" }\n", | |
" ],\n", | |
" \"outtmpl\": f\"extracted_audio.%(ext)s\",\n", | |
" \"quiet\": True,\n", | |
" }\n", | |
" with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n", | |
" ydl.download([url])\n", | |
" self.file_path = f\"extracted_audio.wav\"\n", | |
"\n", | |
" def extract_audio_from_video(self):\n", | |
" clip = mp.VideoFileClip(self.file_path)\n", | |
" clip.audio.write_audiofile(\"extracted_audio.wav\")\n", | |
" self.file_path = \"extracted_audio.wav\"\n", | |
"\n", | |
" def transcribe_audio(self, output_format):\n", | |
" self.output_format = output_format\n", | |
" model = whisperx.load_model(\n", | |
" \"large-v2\", self.device, compute_type=self.compute_type\n", | |
" )\n", | |
" audio = whisperx.load_audio(self.file_path)\n", | |
" self.transcription = model.transcribe(audio, batch_size=self.batch_size)\n", | |
"\n", | |
" model_a, metadata = whisperx.load_align_model(\n", | |
" language_code=self.transcription[\"language\"], device=self.device\n", | |
" )\n", | |
" self.transcription = whisperx.align(\n", | |
" self.transcription[\"segments\"],\n", | |
" model_a,\n", | |
" metadata,\n", | |
" audio,\n", | |
" self.device,\n", | |
" return_char_alignments=False,\n", | |
" )\n", | |
"\n", | |
" self.save_transcription()\n", | |
"\n", | |
" def save_transcription(self):\n", | |
" if self.output_format == \"txt\":\n", | |
" transcription_text = \"\\n\".join(\n", | |
" [segment[\"text\"] for segment in self.transcription[\"segments\"]]\n", | |
" )\n", | |
" with open(\"transcription.txt\", \"w\", encoding=\"utf8\") as file:\n", | |
" file.write(transcription_text)\n", | |
" files.download(\"transcription.txt\")\n", | |
"\n", | |
" elif self.output_format == \"srt\":\n", | |
" words_per_entry = int(\n", | |
" input(\n", | |
" f\"\\n\\nIngrese la cantidad de palabras por entrada (recommended 3-4): \"\n", | |
" )\n", | |
" )\n", | |
" srt_content = self.create_srt_content(self.transcription, words_per_entry)\n", | |
" with open(\"transcription.srt\", \"w\", encoding=\"utf8\") as file:\n", | |
" file.write(srt_content)\n", | |
" files.download(\"transcription.srt\")\n", | |
"\n", | |
" elif self.output_format == \"json\":\n", | |
" with open(\"transcription.json\", \"w\", encoding=\"utf8\") as file:\n", | |
" json.dump(self.transcription, file, indent=4, ensure_ascii=False)\n", | |
" files.download(\"transcription.json\")\n", | |
" else:\n", | |
" print(\"Formato no soportado.\")\n", | |
"\n", | |
" def create_srt_content(self, data, words_per_entry=5):\n", | |
" srt_content = \"\"\n", | |
" global_idx = 1\n", | |
"\n", | |
" for segment in data[\"segments\"]:\n", | |
" words = segment[\"text\"].split()\n", | |
" total_words = len(words)\n", | |
" segment_duration = segment[\"end\"] - segment[\"start\"]\n", | |
"\n", | |
" entry_words = []\n", | |
" current_word_count = 0\n", | |
" is_break_point = False\n", | |
"\n", | |
" for word in words:\n", | |
" if bool(re.search(r\"[,.?]\", word)):\n", | |
" is_break_point = True\n", | |
" entry_words.append(word)\n", | |
" current_word_count += 1\n", | |
"\n", | |
" if current_word_count >= words_per_entry or is_break_point:\n", | |
" entry_duration = (\n", | |
" current_word_count / total_words\n", | |
" ) * segment_duration\n", | |
"\n", | |
" start_time = self.seconds_to_srt_time(segment[\"start\"])\n", | |
" end_time = self.seconds_to_srt_time(\n", | |
" segment[\"start\"] + entry_duration\n", | |
" )\n", | |
"\n", | |
" srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n", | |
" global_idx += 1\n", | |
"\n", | |
" segment[\"start\"] += entry_duration\n", | |
" entry_words = []\n", | |
" current_word_count = 0\n", | |
" is_break_point = False\n", | |
"\n", | |
" if entry_words:\n", | |
" start_time = self.seconds_to_srt_time(segment[\"start\"])\n", | |
" end_time = self.seconds_to_srt_time(segment[\"end\"])\n", | |
"\n", | |
" srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n", | |
" global_idx += 1\n", | |
"\n", | |
" return srt_content\n", | |
"\n", | |
" @staticmethod\n", | |
" def seconds_to_srt_time(seconds):\n", | |
" hours, remainder = divmod(seconds, 3600)\n", | |
" minutes, remainder = divmod(remainder, 60)\n", | |
" seconds, milliseconds = divmod(remainder, 1)\n", | |
" return \"{:02}:{:02}:{:02},{:03}\".format(\n", | |
" int(hours), int(minutes), int(seconds), int(milliseconds * 1000)\n", | |
" )\n", | |
"\n", | |
"\n", | |
"# @title Transcriptor de Audio { vertical-output: true, display-mode: \"form\" }\n", | |
"source_type = \"social_media\" # @param [\"audio\", \"video\", \"social_media\"]\n", | |
"output_format = \"txt\" # @param [\"txt\", \"srt\", \"json\"]\n", | |
"\n", | |
"\n", | |
"transcriber = Transcriber()\n", | |
"transcriber.upload_file(source_type=source_type)\n", | |
"transcriber.transcribe_audio(output_format=output_format)\n" | |
], | |
"metadata": { | |
"id": "6rkpcEtpnURd" | |
}, | |
"id": "6rkpcEtpnURd", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [ | |
"_cvbIZu8kqYX" | |
], | |
"gpuType": "T4", | |
"include_colab_link": true | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU" | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment