diegocostares · May 3, 2024 18:53
diff --git a/audiotranscribertool.ipynb b/audiotranscribertool.ipynb
 {
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/diegocostares/881974b593bffcf1641f048be04f56c9/audiotranscribertool.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3eb38cc7",
      "metadata": {
        "id": "3eb38cc7"
      },
      "source": [
        "# ✨**AudioTranscriberTool con WhisperX** 🎙️\n",
        "\n",
        "---\n",
        "Herramienta de transcripción de audio con WhisperX que admite múltiples fuentes de entrada, incluidos archivos de audio, video y enlaces de redes sociales. Luego produce una transcripción del contenido en varios formatos (TXT, SRT, JSON).\n",
        "\n",
        "_Nota: Para ejecutar, solo haz clic en el botón de reproducción a la izquierda (▶️)_"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Instalación de dependencias 📦"
      ],
      "metadata": {
        "id": "_cvbIZu8kqYX"
      },
      "id": "_cvbIZu8kqYX"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "839c047c",
      "metadata": {
        "id": "839c047c"
      },
      "outputs": [],
      "source": [
        "!pip install git+https://github.com/m-bain/whisperx.git --upgrade\n",
        "!pip install -U yt-dlp"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Programa principal"
      ],
      "metadata": {
        "id": "TwtSHHeDRbb3"
      },
      "id": "TwtSHHeDRbb3"
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from google.colab import files\n",
        "import whisperx\n",
        "import json\n",
        "import moviepy.editor as mp\n",
        "import yt_dlp\n",
        "import re\n",
        "\n",
        "\n",
        "class Transcriber:\n",
        "    def __init__(self, device=\"cuda\", batch_size=5, compute_type=\"float16\"):\n",
        "        self.device = device\n",
        "        self.batch_size = batch_size\n",
        "        self.compute_type = compute_type\n",
        "        self.file_path = None\n",
        "        self.transcription = None\n",
        "\n",
        "    def clear_previous_audio(self):\n",
        "        if self.file_path and os.path.exists(self.file_path):\n",
        "            os.remove(self.file_path)\n",
        "\n",
        "    def upload_file(self, source_type):\n",
        "        self.clear_previous_audio()\n",
        "\n",
        "        if source_type == \"social_media\":\n",
        "            url = input(f\"Ingrese la URL de la red social: \")\n",
        "            self.download_audio_from_url(url)\n",
        "        elif source_type in {\"audio\", \"video\"}:\n",
        "            self.file_path = self.upload_file_and_get_path()\n",
        "        else:\n",
        "            print(\"Tipo de fuente no válido.\")\n",
        "            return\n",
        "\n",
        "        if source_type == \"video\":\n",
        "            self.extract_audio_from_video()\n",
        "\n",
        "    def upload_file_and_get_path(self):\n",
        "        uploaded = files.upload()\n",
        "        return list(uploaded.keys())[0]\n",
        "\n",
        "    def download_audio_from_url(self, url):\n",
        "        ydl_opts = {\n",
        "            \"format\": \"bestaudio/best\",\n",
        "            \"postprocessors\": [\n",
        "                {\n",
        "                    \"key\": \"FFmpegExtractAudio\",\n",
        "                    \"preferredcodec\": \"wav\",\n",
        "                    \"preferredquality\": \"192\",\n",
        "                }\n",
        "            ],\n",
        "            \"outtmpl\": f\"extracted_audio.%(ext)s\",\n",
        "            \"quiet\": True,\n",
        "        }\n",
        "        with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
        "            ydl.download([url])\n",
        "        self.file_path = f\"extracted_audio.wav\"\n",
        "\n",
        "    def extract_audio_from_video(self):\n",
        "        clip = mp.VideoFileClip(self.file_path)\n",
        "        clip.audio.write_audiofile(\"extracted_audio.wav\")\n",
        "        self.file_path = \"extracted_audio.wav\"\n",
        "\n",
        "    def transcribe_audio(self, output_format):\n",
        "        self.output_format = output_format\n",
        "        model = whisperx.load_model(\n",
        "            \"large-v2\", self.device, compute_type=self.compute_type\n",
        "        )\n",
        "        audio = whisperx.load_audio(self.file_path)\n",
        "        self.transcription = model.transcribe(audio, batch_size=self.batch_size)\n",
        "\n",
        "        model_a, metadata = whisperx.load_align_model(\n",
        "            language_code=self.transcription[\"language\"], device=self.device\n",
        "        )\n",
        "        self.transcription = whisperx.align(\n",
        "            self.transcription[\"segments\"],\n",
        "            model_a,\n",
        "            metadata,\n",
        "            audio,\n",
        "            self.device,\n",
        "            return_char_alignments=False,\n",
        "        )\n",
        "\n",
        "        self.save_transcription()\n",
        "\n",
        "    def save_transcription(self):\n",
        "        if self.output_format == \"txt\":\n",
        "            transcription_text = \"\\n\".join(\n",
        "                [segment[\"text\"] for segment in self.transcription[\"segments\"]]\n",
        "            )\n",
        "            with open(\"transcription.txt\", \"w\", encoding=\"utf8\") as file:\n",
        "                file.write(transcription_text)\n",
        "            files.download(\"transcription.txt\")\n",
        "\n",
        "        elif self.output_format == \"srt\":\n",
        "            words_per_entry = int(\n",
        "                input(\n",
        "                    f\"\\n\\nIngrese la cantidad de palabras por entrada (recommended 3-4): \"\n",
        "                )\n",
        "            )\n",
        "            srt_content = self.create_srt_content(self.transcription, words_per_entry)\n",
        "            with open(\"transcription.srt\", \"w\", encoding=\"utf8\") as file:\n",
        "                file.write(srt_content)\n",
        "            files.download(\"transcription.srt\")\n",
        "\n",
        "        elif self.output_format == \"json\":\n",
        "            with open(\"transcription.json\", \"w\", encoding=\"utf8\") as file:\n",
        "                json.dump(self.transcription, file, indent=4, ensure_ascii=False)\n",
        "            files.download(\"transcription.json\")\n",
        "        else:\n",
        "            print(\"Formato no soportado.\")\n",
        "\n",
        "    def create_srt_content(self, data, words_per_entry=5):\n",
        "        srt_content = \"\"\n",
        "        global_idx = 1\n",
        "\n",
        "        for segment in data[\"segments\"]:\n",
        "            words = segment[\"text\"].split()\n",
        "            total_words = len(words)\n",
        "            segment_duration = segment[\"end\"] - segment[\"start\"]\n",
        "\n",
        "            entry_words = []\n",
        "            current_word_count = 0\n",
        "            is_break_point = False\n",
        "\n",
        "            for word in words:\n",
        "                if bool(re.search(r\"[,.?]\", word)):\n",
        "                    is_break_point = True\n",
        "                entry_words.append(word)\n",
        "                current_word_count += 1\n",
        "\n",
        "                if current_word_count >= words_per_entry or is_break_point:\n",
        "                    entry_duration = (\n",
        "                        current_word_count / total_words\n",
        "                    ) * segment_duration\n",
        "\n",
        "                    start_time = self.seconds_to_srt_time(segment[\"start\"])\n",
        "                    end_time = self.seconds_to_srt_time(\n",
        "                        segment[\"start\"] + entry_duration\n",
        "                    )\n",
        "\n",
        "                    srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n",
        "                    global_idx += 1\n",
        "\n",
        "                    segment[\"start\"] += entry_duration\n",
        "                    entry_words = []\n",
        "                    current_word_count = 0\n",
        "                    is_break_point = False\n",
        "\n",
        "            if entry_words:\n",
        "                start_time = self.seconds_to_srt_time(segment[\"start\"])\n",
        "                end_time = self.seconds_to_srt_time(segment[\"end\"])\n",
        "\n",
        "                srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n",
        "                global_idx += 1\n",
        "\n",
        "        return srt_content\n",
        "\n",
        "    @staticmethod\n",
        "    def seconds_to_srt_time(seconds):\n",
        "        hours, remainder = divmod(seconds, 3600)\n",
        "        minutes, remainder = divmod(remainder, 60)\n",
        "        seconds, milliseconds = divmod(remainder, 1)\n",
        "        return \"{:02}:{:02}:{:02},{:03}\".format(\n",
        "            int(hours), int(minutes), int(seconds), int(milliseconds * 1000)\n",
        "        )\n",
        "\n",
        "\n",
        "# @title Transcriptor de Audio { vertical-output: true, display-mode: \"form\" }\n",
        "source_type = \"social_media\"  # @param [\"audio\", \"video\", \"social_media\"]\n",
        "output_format = \"txt\"  # @param [\"txt\", \"srt\", \"json\"]\n",
        "\n",
        "\n",
        "transcriber = Transcriber()\n",
        "transcriber.upload_file(source_type=source_type)\n",
        "transcriber.transcribe_audio(output_format=output_format)\n"
      ],
      "metadata": {
        "id": "6rkpcEtpnURd"
      },
      "id": "6rkpcEtpnURd",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "_cvbIZu8kqYX"
      ],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "language_info": {
      "name": "python"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/diegocostares/881974b593bffcf1641f048be04f56c9/audiotranscribertool.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"id": "3eb38cc7",
	"metadata": {
	"id": "3eb38cc7"
	},
	"source": [
	"# ✨AudioTranscriberTool con WhisperX 🎙️\n",
	"\n",
	"---\n",
	"Herramienta de transcripción de audio con WhisperX que admite múltiples fuentes de entrada, incluidos archivos de audio, video y enlaces de redes sociales. Luego produce una transcripción del contenido en varios formatos (TXT, SRT, JSON).\n",
	"\n",
	"_Nota: Para ejecutar, solo haz clic en el botón de reproducción a la izquierda (▶️)_"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"## Instalación de dependencias 📦"
	],
	"metadata": {
	"id": "_cvbIZu8kqYX"
	},
	"id": "_cvbIZu8kqYX"
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "839c047c",
	"metadata": {
	"id": "839c047c"
	},
	"outputs": [],
	"source": [
	"!pip install git+https://github.com/m-bain/whisperx.git --upgrade\n",
	"!pip install -U yt-dlp"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Programa principal"
	],
	"metadata": {
	"id": "TwtSHHeDRbb3"
	},
	"id": "TwtSHHeDRbb3"
	},
	{
	"cell_type": "code",
	"source": [
	"import os\n",
	"from google.colab import files\n",
	"import whisperx\n",
	"import json\n",
	"import moviepy.editor as mp\n",
	"import yt_dlp\n",
	"import re\n",
	"\n",
	"\n",
	"class Transcriber:\n",
	" def __init__(self, device=\"cuda\", batch_size=5, compute_type=\"float16\"):\n",
	" self.device = device\n",
	" self.batch_size = batch_size\n",
	" self.compute_type = compute_type\n",
	" self.file_path = None\n",
	" self.transcription = None\n",
	"\n",
	" def clear_previous_audio(self):\n",
	" if self.file_path and os.path.exists(self.file_path):\n",
	" os.remove(self.file_path)\n",
	"\n",
	" def upload_file(self, source_type):\n",
	" self.clear_previous_audio()\n",
	"\n",
	" if source_type == \"social_media\":\n",
	" url = input(f\"Ingrese la URL de la red social: \")\n",
	" self.download_audio_from_url(url)\n",
	" elif source_type in {\"audio\", \"video\"}:\n",
	" self.file_path = self.upload_file_and_get_path()\n",
	" else:\n",
	" print(\"Tipo de fuente no válido.\")\n",
	" return\n",
	"\n",
	" if source_type == \"video\":\n",
	" self.extract_audio_from_video()\n",
	"\n",
	" def upload_file_and_get_path(self):\n",
	" uploaded = files.upload()\n",
	" return list(uploaded.keys())[0]\n",
	"\n",
	" def download_audio_from_url(self, url):\n",
	" ydl_opts = {\n",
	" \"format\": \"bestaudio/best\",\n",
	" \"postprocessors\": [\n",
	" {\n",
	" \"key\": \"FFmpegExtractAudio\",\n",
	" \"preferredcodec\": \"wav\",\n",
	" \"preferredquality\": \"192\",\n",
	" }\n",
	" ],\n",
	" \"outtmpl\": f\"extracted_audio.%(ext)s\",\n",
	" \"quiet\": True,\n",
	" }\n",
	" with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
	" ydl.download([url])\n",
	" self.file_path = f\"extracted_audio.wav\"\n",
	"\n",
	" def extract_audio_from_video(self):\n",
	" clip = mp.VideoFileClip(self.file_path)\n",
	" clip.audio.write_audiofile(\"extracted_audio.wav\")\n",
	" self.file_path = \"extracted_audio.wav\"\n",
	"\n",
	" def transcribe_audio(self, output_format):\n",
	" self.output_format = output_format\n",
	" model = whisperx.load_model(\n",
	" \"large-v2\", self.device, compute_type=self.compute_type\n",
	" )\n",
	" audio = whisperx.load_audio(self.file_path)\n",
	" self.transcription = model.transcribe(audio, batch_size=self.batch_size)\n",
	"\n",
	" model_a, metadata = whisperx.load_align_model(\n",
	" language_code=self.transcription[\"language\"], device=self.device\n",
	" )\n",
	" self.transcription = whisperx.align(\n",
	" self.transcription[\"segments\"],\n",
	" model_a,\n",
	" metadata,\n",
	" audio,\n",
	" self.device,\n",
	" return_char_alignments=False,\n",
	" )\n",
	"\n",
	" self.save_transcription()\n",
	"\n",
	" def save_transcription(self):\n",
	" if self.output_format == \"txt\":\n",
	" transcription_text = \"\\n\".join(\n",
	" [segment[\"text\"] for segment in self.transcription[\"segments\"]]\n",
	" )\n",
	" with open(\"transcription.txt\", \"w\", encoding=\"utf8\") as file:\n",
	" file.write(transcription_text)\n",
	" files.download(\"transcription.txt\")\n",
	"\n",
	" elif self.output_format == \"srt\":\n",
	" words_per_entry = int(\n",
	" input(\n",
	" f\"\\n\\nIngrese la cantidad de palabras por entrada (recommended 3-4): \"\n",
	" )\n",
	" )\n",
	" srt_content = self.create_srt_content(self.transcription, words_per_entry)\n",
	" with open(\"transcription.srt\", \"w\", encoding=\"utf8\") as file:\n",
	" file.write(srt_content)\n",
	" files.download(\"transcription.srt\")\n",
	"\n",
	" elif self.output_format == \"json\":\n",
	" with open(\"transcription.json\", \"w\", encoding=\"utf8\") as file:\n",
	" json.dump(self.transcription, file, indent=4, ensure_ascii=False)\n",
	" files.download(\"transcription.json\")\n",
	" else:\n",
	" print(\"Formato no soportado.\")\n",
	"\n",
	" def create_srt_content(self, data, words_per_entry=5):\n",
	" srt_content = \"\"\n",
	" global_idx = 1\n",
	"\n",
	" for segment in data[\"segments\"]:\n",
	" words = segment[\"text\"].split()\n",
	" total_words = len(words)\n",
	" segment_duration = segment[\"end\"] - segment[\"start\"]\n",
	"\n",
	" entry_words = []\n",
	" current_word_count = 0\n",
	" is_break_point = False\n",
	"\n",
	" for word in words:\n",
	" if bool(re.search(r\"[,.?]\", word)):\n",
	" is_break_point = True\n",
	" entry_words.append(word)\n",
	" current_word_count += 1\n",
	"\n",
	" if current_word_count >= words_per_entry or is_break_point:\n",
	" entry_duration = (\n",
	" current_word_count / total_words\n",
	" ) * segment_duration\n",
	"\n",
	" start_time = self.seconds_to_srt_time(segment[\"start\"])\n",
	" end_time = self.seconds_to_srt_time(\n",
	" segment[\"start\"] + entry_duration\n",
	" )\n",
	"\n",
	" srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n",
	" global_idx += 1\n",
	"\n",
	" segment[\"start\"] += entry_duration\n",
	" entry_words = []\n",
	" current_word_count = 0\n",
	" is_break_point = False\n",
	"\n",
	" if entry_words:\n",
	" start_time = self.seconds_to_srt_time(segment[\"start\"])\n",
	" end_time = self.seconds_to_srt_time(segment[\"end\"])\n",
	"\n",
	" srt_content += f\"{global_idx}\\n{start_time} --> {end_time}\\n{' '.join(entry_words)}\\n\\n\"\n",
	" global_idx += 1\n",
	"\n",
	" return srt_content\n",
	"\n",
	" @staticmethod\n",
	" def seconds_to_srt_time(seconds):\n",
	" hours, remainder = divmod(seconds, 3600)\n",
	" minutes, remainder = divmod(remainder, 60)\n",
	" seconds, milliseconds = divmod(remainder, 1)\n",
	" return \"{:02}:{:02}:{:02},{:03}\".format(\n",
	" int(hours), int(minutes), int(seconds), int(milliseconds * 1000)\n",
	" )\n",
	"\n",
	"\n",
	"# @title Transcriptor de Audio { vertical-output: true, display-mode: \"form\" }\n",
	"source_type = \"social_media\" # @param [\"audio\", \"video\", \"social_media\"]\n",
	"output_format = \"txt\" # @param [\"txt\", \"srt\", \"json\"]\n",
	"\n",
	"\n",
	"transcriber = Transcriber()\n",
	"transcriber.upload_file(source_type=source_type)\n",
	"transcriber.transcribe_audio(output_format=output_format)\n"
	],
	"metadata": {
	"id": "6rkpcEtpnURd"
	},
	"id": "6rkpcEtpnURd",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"colab": {
	"provenance": [],
	"collapsed_sections": [
	"_cvbIZu8kqYX"
	],
	"gpuType": "T4",
	"include_colab_link": true
	},
	"language_info": {
	"name": "python"
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"accelerator": "GPU"
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}