whisper+stable-ts.ipynb
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "gpuType": "T4", | |
| "authorship_tag": "ABX9TyPoeaafoGqd8OkS5JbP32yG", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| }, | |
| "accelerator": "GPU" | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/chottokun/f653cfdb9144f325ca878adeb6bd967d/whisper-stable-ts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "08b4c072" | |
| }, | |
| "source": [ | |
| "# `stable-ts` と `faster-whisper` を用いた音声認識環境\n", | |
| "\n", | |
| "ご提示いただいた設定に基づき、`stable-ts` と `faster-whisper` を用いた音声認識環境を構築します。\n", | |
| "\n", | |
| "**⚠️ 重要: 実行前に [ランタイム] > [ランタイムのタイプを変更] からハードウェアアクセラレータを「T4 GPU」に設定してください。**" | |
| ] | |
| }, | |
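| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Optional sanity check: the cell below lists the GPU assigned to this session with `nvidia-smi` (available on Colab GPU runtimes). If no device appears, revisit [Runtime] > [Change runtime type] before continuing." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": {}, | |
| "source": [ | |
| "# @title 1a. (Optional) Confirm the assigned GPU\n", | |
| "# Lists the NVIDIA driver version and the GPU visible to this session\n", | |
| "# (a Tesla T4 when the runtime is configured as described above).\n", | |
| "!nvidia-smi" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |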
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "11901012", | |
| "outputId": "62a6e2ef-a837-4f01-a0a1-6b48a3ac43ee" | |
| }, | |
| "source": [ | |
| "# @title 1. ライブラリのインストール\n", | |
| "# stable-ts と faster-whisper をインストールします\n", | |
| "!pip install stable-ts faster-whisper\n", | |
| "\n", | |
| "import torch\n", | |
| "import stable_whisper\n", | |
| "from google.colab import files\n", | |
| "\n", | |
| "# GPUが使えるか確認\n", | |
| "if torch.cuda.is_available():\n", | |
| " print(\"\\n✅ GPU (CUDA) が利用可能です。高速推論モードで動作します。\")\n", | |
| "else:\n", | |
| " print(\"\\n⚠️ GPUが検出されませんでした。ランタイムの設定からGPUを有効にしてください。CPUでは非常に遅くなります。\")" | |
| ], | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Requirement already satisfied: stable-ts in /usr/local/lib/python3.12/dist-packages (2.19.1)\n", | |
| "Requirement already satisfied: faster-whisper in /usr/local/lib/python3.12/dist-packages (1.2.1)\n", | |
| "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from stable-ts) (2.0.2)\n", | |
| "Requirement already satisfied: torch in /usr/local/lib/python3.12/dist-packages (from stable-ts) (2.9.0+cu126)\n", | |
| "Requirement already satisfied: torchaudio in /usr/local/lib/python3.12/dist-packages (from stable-ts) (2.9.0+cu126)\n", | |
| "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from stable-ts) (4.67.1)\n", | |
| "Requirement already satisfied: openai-whisper<=20250625,>=20230314 in /usr/local/lib/python3.12/dist-packages (from stable-ts) (20250625)\n", | |
| "Requirement already satisfied: ctranslate2<5,>=4.0 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (4.6.2)\n", | |
| "Requirement already satisfied: huggingface-hub>=0.21 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (0.36.0)\n", | |
| "Requirement already satisfied: tokenizers<1,>=0.13 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (0.22.1)\n", | |
| "Requirement already satisfied: onnxruntime<2,>=1.14 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (1.23.2)\n", | |
| "Requirement already satisfied: av>=11 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (16.0.1)\n", | |
| "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from ctranslate2<5,>=4.0->faster-whisper) (75.2.0)\n", | |
| "Requirement already satisfied: pyyaml<7,>=5.3 in /usr/local/lib/python3.12/dist-packages (from ctranslate2<5,>=4.0->faster-whisper) (6.0.3)\n", | |
| "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (3.20.0)\n", | |
| "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (2025.3.0)\n", | |
| "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (25.0)\n", | |
| "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (2.32.4)\n", | |
| "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (4.15.0)\n", | |
| "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (1.2.0)\n", | |
| "Requirement already satisfied: coloredlogs in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (15.0.1)\n", | |
| "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (25.9.23)\n", | |
| "Requirement already satisfied: protobuf in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (5.29.5)\n", | |
| "Requirement already satisfied: sympy in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (1.14.0)\n", | |
| "Requirement already satisfied: more-itertools in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (10.8.0)\n", | |
| "Requirement already satisfied: numba in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (0.60.0)\n", | |
| "Requirement already satisfied: tiktoken in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (0.12.0)\n", | |
| "Requirement already satisfied: triton>=2 in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (3.5.0)\n", | |
| "Requirement already satisfied: networkx>=2.5.1 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (3.6.1)\n", | |
| "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (3.1.6)\n", | |
| "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.77)\n", | |
| "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.77)\n", | |
| "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.80)\n", | |
| "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (9.10.2.21)\n", | |
| "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.4.1)\n", | |
| "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (11.3.0.4)\n", | |
| "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (10.3.7.77)\n", | |
| "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (11.7.1.2)\n", | |
| "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.5.4.2)\n", | |
| "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (0.7.1)\n", | |
| "Requirement already satisfied: nvidia-nccl-cu12==2.27.5 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (2.27.5)\n", | |
| "Requirement already satisfied: nvidia-nvshmem-cu12==3.3.20 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (3.3.20)\n", | |
| "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.77)\n", | |
| "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.85)\n", | |
| "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (1.11.1.6)\n", | |
| "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy->onnxruntime<2,>=1.14->faster-whisper) (1.3.0)\n", | |
| "Requirement already satisfied: humanfriendly>=9.1 in /usr/local/lib/python3.12/dist-packages (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper) (10.0)\n", | |
| "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch->stable-ts) (3.0.3)\n", | |
| "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba->openai-whisper<=20250625,>=20230314->stable-ts) (0.43.0)\n", | |
| "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (3.4.4)\n", | |
| "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (3.11)\n", | |
| "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (2.5.0)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (2025.11.12)\n", | |
| "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken->openai-whisper<=20250625,>=20230314->stable-ts) (2025.11.3)\n", | |
| "\n", | |
| "✅ GPU (CUDA) が利用可能です。高速推論モードで動作します。\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1000 | |
| }, | |
| "id": "d1242fbe", | |
| "outputId": "1922227e-ec58-4cf4-a5cb-0fb65b91fca5" | |
| }, | |
| "source": [ | |
| "# @title 2. 音声ファイルをアップロードして実行\n", | |
| "# モデルのロード(初回はダウンロードに時間がかかります)\n", | |
| "print(\"⏳ モデル(large-v3-turbo)をロード中...\")\n", | |
| "model = stable_whisper.load_faster_whisper(\n", | |
| " \"large-v3-turbo\",\n", | |
| " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n", | |
| " compute_type=\"float16\" if torch.cuda.is_available() else \"int8\"\n", | |
| ")\n", | |
| "print(\"✅ モデルロード完了\")\n", | |
| "\n", | |
| "# 音声ファイルのアップロード\n", | |
| "print(\"\\n📂 音声ファイル(mp3, wav, m4a等)をアップロードしてください:\")\n", | |
| "uploaded = files.upload()\n", | |
| "\n", | |
| "for filename in uploaded.keys():\n", | |
| " print(f\"\\n🎙️ 解析中: {filename} ...\")\n", | |
| "\n", | |
| " #\n", | |
| " result = model.transcribe(\n", | |
| " filename,\n", | |
| " language=\"ja\",\n", | |
| "\n", | |
| " #\n", | |
| " vad=True, # VAD(音声区間検出)を有効化\n", | |
| " condition_on_previous_text=False,# 前の文脈に依存しない(ループ対策の要)\n", | |
| " word_timestamps=False, # 単語ごとのタイムスタンプを使わない(安定化)\n", | |
| " beam_size=5 # 探索幅(精度向上のため)\n", | |
| " )\n", | |
| "\n", | |
| " # 結果の表示\n", | |
| " print(\"\\n\" + \"=\"*30 + \" 認識結果 \" + \"=\"*30)\n", | |
| " for segment in result.segments:\n", | |
| " # 秒数を整形して表示\n", | |
| " start = f\"{segment.start:.2f}\"\n", | |
| " end = f\"{segment.end:.2f}\"\n", | |
| " print(f\"[{start}s -> {end}s] {segment.text}\")\n", | |
| " print(\"=\"*68)\n", | |
| "\n", | |
| " # SRTファイル(字幕ファイル)として保存・ダウンロード\n", | |
| " srt_filename = filename.rsplit('.', 1)[0] + \".srt\"\n", | |
| " result.to_srt_vtt(srt_filename)\n", | |
| " files.download(srt_filename)\n", | |
| " print(f\"\\n💾 字幕ファイル({srt_filename})をダウンロードしました。\")" | |
| ], | |
| "execution_count": 8, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "⏳ モデル(large-v3-turbo)をロード中...\n", | |
| "✅ モデルロード完了\n", | |
| "\n", | |
| "📂 音声ファイル(mp3, wav, m4a等)をアップロードしてください:\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ], | |
| "text/html": [ | |
| "\n", | |
| " <input type=\"file\" id=\"files-89316340-2eac-4148-a9fc-dbf16a097068\" name=\"files[]\" multiple disabled\n", | |
| " style=\"border:none\" />\n", | |
| " <output id=\"result-89316340-2eac-4148-a9fc-dbf16a097068\">\n", | |
| " Upload widget is only available when the cell has been executed in the\n", | |
| " current browser session. Please rerun this cell to enable.\n", | |
| " </output>\n", | |
| " <script>// Copyright 2017 Google LLC\n", | |
| "//\n", | |
| "// Licensed under the Apache License, Version 2.0 (the \"License\");\n", | |
| "// you may not use this file except in compliance with the License.\n", | |
| "// You may obtain a copy of the License at\n", | |
| "//\n", | |
| "// http://www.apache.org/licenses/LICENSE-2.0\n", | |
| "//\n", | |
| "// Unless required by applicable law or agreed to in writing, software\n", | |
| "// distributed under the License is distributed on an \"AS IS\" BASIS,\n", | |
| "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", | |
| "// See the License for the specific language governing permissions and\n", | |
| "// limitations under the License.\n", | |
| "\n", | |
| "/**\n", | |
| " * @fileoverview Helpers for google.colab Python module.\n", | |
| " */\n", | |
| "(function(scope) {\n", | |
| "function span(text, styleAttributes = {}) {\n", | |
| " const element = document.createElement('span');\n", | |
| " element.textContent = text;\n", | |
| " for (const key of Object.keys(styleAttributes)) {\n", | |
| " element.style[key] = styleAttributes[key];\n", | |
| " }\n", | |
| " return element;\n", | |
| "}\n", | |
| "\n", | |
| "// Max number of bytes which will be uploaded at a time.\n", | |
| "const MAX_PAYLOAD_SIZE = 100 * 1024;\n", | |
| "\n", | |
| "function _uploadFiles(inputId, outputId) {\n", | |
| " const steps = uploadFilesStep(inputId, outputId);\n", | |
| " const outputElement = document.getElementById(outputId);\n", | |
| " // Cache steps on the outputElement to make it available for the next call\n", | |
| " // to uploadFilesContinue from Python.\n", | |
| " outputElement.steps = steps;\n", | |
| "\n", | |
| " return _uploadFilesContinue(outputId);\n", | |
| "}\n", | |
| "\n", | |
| "// This is roughly an async generator (not supported in the browser yet),\n", | |
| "// where there are multiple asynchronous steps and the Python side is going\n", | |
| "// to poll for completion of each step.\n", | |
| "// This uses a Promise to block the python side on completion of each step,\n", | |
| "// then passes the result of the previous step as the input to the next step.\n", | |
| "function _uploadFilesContinue(outputId) {\n", | |
| " const outputElement = document.getElementById(outputId);\n", | |
| " const steps = outputElement.steps;\n", | |
| "\n", | |
| " const next = steps.next(outputElement.lastPromiseValue);\n", | |
| " return Promise.resolve(next.value.promise).then((value) => {\n", | |
| " // Cache the last promise value to make it available to the next\n", | |
| " // step of the generator.\n", | |
| " outputElement.lastPromiseValue = value;\n", | |
| " return next.value.response;\n", | |
| " });\n", | |
| "}\n", | |
| "\n", | |
| "/**\n", | |
| " * Generator function which is called between each async step of the upload\n", | |
| " * process.\n", | |
| " * @param {string} inputId Element ID of the input file picker element.\n", | |
| " * @param {string} outputId Element ID of the output display.\n", | |
| " * @return {!Iterable<!Object>} Iterable of next steps.\n", | |
| " */\n", | |
| "function* uploadFilesStep(inputId, outputId) {\n", | |
| " const inputElement = document.getElementById(inputId);\n", | |
| " inputElement.disabled = false;\n", | |
| "\n", | |
| " const outputElement = document.getElementById(outputId);\n", | |
| " outputElement.innerHTML = '';\n", | |
| "\n", | |
| " const pickedPromise = new Promise((resolve) => {\n", | |
| " inputElement.addEventListener('change', (e) => {\n", | |
| " resolve(e.target.files);\n", | |
| " });\n", | |
| " });\n", | |
| "\n", | |
| " const cancel = document.createElement('button');\n", | |
| " inputElement.parentElement.appendChild(cancel);\n", | |
| " cancel.textContent = 'Cancel upload';\n", | |
| " const cancelPromise = new Promise((resolve) => {\n", | |
| " cancel.onclick = () => {\n", | |
| " resolve(null);\n", | |
| " };\n", | |
| " });\n", | |
| "\n", | |
| " // Wait for the user to pick the files.\n", | |
| " const files = yield {\n", | |
| " promise: Promise.race([pickedPromise, cancelPromise]),\n", | |
| " response: {\n", | |
| " action: 'starting',\n", | |
| " }\n", | |
| " };\n", | |
| "\n", | |
| " cancel.remove();\n", | |
| "\n", | |
| " // Disable the input element since further picks are not allowed.\n", | |
| " inputElement.disabled = true;\n", | |
| "\n", | |
| " if (!files) {\n", | |
| " return {\n", | |
| " response: {\n", | |
| " action: 'complete',\n", | |
| " }\n", | |
| " };\n", | |
| " }\n", | |
| "\n", | |
| " for (const file of files) {\n", | |
| " const li = document.createElement('li');\n", | |
| " li.append(span(file.name, {fontWeight: 'bold'}));\n", | |
| " li.append(span(\n", | |
| " `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n", | |
| " `last modified: ${\n", | |
| " file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n", | |
| " 'n/a'} - `));\n", | |
| " const percent = span('0% done');\n", | |
| " li.appendChild(percent);\n", | |
| "\n", | |
| " outputElement.appendChild(li);\n", | |
| "\n", | |
| " const fileDataPromise = new Promise((resolve) => {\n", | |
| " const reader = new FileReader();\n", | |
| " reader.onload = (e) => {\n", | |
| " resolve(e.target.result);\n", | |
| " };\n", | |
| " reader.readAsArrayBuffer(file);\n", | |
| " });\n", | |
| " // Wait for the data to be ready.\n", | |
| " let fileData = yield {\n", | |
| " promise: fileDataPromise,\n", | |
| " response: {\n", | |
| " action: 'continue',\n", | |
| " }\n", | |
| " };\n", | |
| "\n", | |
| " // Use a chunked sending to avoid message size limits. See b/62115660.\n", | |
| " let position = 0;\n", | |
| " do {\n", | |
| " const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n", | |
| " const chunk = new Uint8Array(fileData, position, length);\n", | |
| " position += length;\n", | |
| "\n", | |
| " const base64 = btoa(String.fromCharCode.apply(null, chunk));\n", | |
| " yield {\n", | |
| " response: {\n", | |
| " action: 'append',\n", | |
| " file: file.name,\n", | |
| " data: base64,\n", | |
| " },\n", | |
| " };\n", | |
| "\n", | |
| " let percentDone = fileData.byteLength === 0 ?\n", | |
| " 100 :\n", | |
| " Math.round((position / fileData.byteLength) * 100);\n", | |
| " percent.textContent = `${percentDone}% done`;\n", | |
| "\n", | |
| " } while (position < fileData.byteLength);\n", | |
| " }\n", | |
| "\n", | |
| " // All done.\n", | |
| " yield {\n", | |
| " response: {\n", | |
| " action: 'complete',\n", | |
| " }\n", | |
| " };\n", | |
| "}\n", | |
| "\n", | |
| "scope.google = scope.google || {};\n", | |
| "scope.google.colab = scope.google.colab || {};\n", | |
| "scope.google.colab._files = {\n", | |
| " _uploadFiles,\n", | |
| " _uploadFilesContinue,\n", | |
| "};\n", | |
| "})(self);\n", | |
| "</script> " | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Saving n0001a-The-Ant-and-Grasshopper.mp3 to n0001a-The-Ant-and-Grasshopper (1).mp3\n", | |
| "\n", | |
| "🎙️ 解析中: n0001a-The-Ant-and-Grasshopper (1).mp3 ...\n", | |
| "Detected Language: japanese\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "Transcribe: 100%|██████████| 165.88/165.88 [00:04<00:00, 39.68sec/s]\n", | |
| "VAD: 100%|██████████| 165.88/165.88 [00:02<00:00, 62.77sec/s]\n", | |
| "Adjustment: 100%|██████████| 194.0/194.0 [00:00<00:00, 107503.63sec/s]" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\n", | |
| "============================== 認識結果 ==============================\n", | |
| "[2.46s -> 6.22s] 日本語多読ブックス レベルゼロ\n", | |
| "[6.22s -> 14.00s] アリとキリギリス イソップ物語より\n", | |
| "[15.68s -> 17.88s] NPO 多言語多読\n", | |
| "[22.43s -> 23.18s] 夏です\n", | |
| "[23.18s -> 27.52s] キリギリスが木の下にいます\n", | |
| "[29.66s -> 31.90s] キリギリスはうたをうたいます\n", | |
| "[33.60s -> 34.58s] ラララララ\n", | |
| "[38.11s -> 39.26s] アリがきました\n", | |
| "[40.96s -> 43.54s] アリはたべものをはこびます\n", | |
| "[45.47s -> 47.18s] キリギリスがいいました\n", | |
| "[47.18s -> 52.84s] アリさん、いっしょにうたをうたいましょう\n", | |
| "[55.30s -> 56.62s] アリはいいました\n", | |
| "[56.62s -> 60.90s] いいえ、うたいません\n", | |
| "[62.62s -> 64.78s] わたしたちははたらきます\n", | |
| "[68.03s -> 69.88s] キリギリスはききました\n", | |
| "[71.84s -> 73.02s] どうしてですか\n", | |
| "[73.02s -> 78.38s] ふゆはたべものがありませんから\n", | |
| "[80.03s -> 83.38s] え、いまはなつですよ\n", | |
| "[85.25s -> 87.10s] キリギリスは笑いました\n", | |
| "[89.95s -> 91.56s] それから毎日\n", | |
| "[91.56s -> 95.86s] キリギリスは歌を歌いました\n", | |
| "[98.43s -> 99.96s] 働きませんでした\n", | |
| "[103.90s -> 104.60s] 冬です\n", | |
| "[106.14s -> 107.38s] 雪が降ります\n", | |
| "[109.31s -> 110.72s] とても寒いです\n", | |
| "[110.72s -> 116.78s] キリギリスの家には食べ物がありません\n", | |
| "[116.78s -> 125.52s] アリの家には食べ物がたくさんあります\n", | |
| "[125.52s -> 131.12s] キリギリスはアリの家へ行きました\n", | |
| "[134.75s -> 136.46s] キリギリスは言いました\n", | |
| "[136.46s -> 144.54s] アリさん、お願いです。食べ物をください\n", | |
| "[147.33s -> 148.64s] アリは言いました\n", | |
| "[148.64s -> 154.26s] 私たちは夏働きました\n", | |
| "[154.26s -> 158.58s] だから食べ物があります\n", | |
| "[160.54s -> 164.02s] あなたは夏何をしましたか\n", | |
| "[165.88s -> 194.00s] ご視聴ありがとうございました\n", | |
| "====================================================================\n", | |
| "Saved: /content/n0001a-The-Ant-and-Grasshopper (1).srt\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "\n", | |
| "/usr/local/lib/python3.12/dist-packages/stable_whisper/text_output.py:183: UserWarning: Result is missing word timestamps. Word-level timing cannot be exported. Use ``word_level=False`` to avoid this warning\n", | |
| " warnings.warn('Result is missing word timestamps. Word-level timing cannot be exported. '\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "\n", | |
| " async function download(id, filename, size) {\n", | |
| " if (!google.colab.kernel.accessAllowed) {\n", | |
| " return;\n", | |
| " }\n", | |
| " const div = document.createElement('div');\n", | |
| " const label = document.createElement('label');\n", | |
| " label.textContent = `Downloading \"${filename}\": `;\n", | |
| " div.appendChild(label);\n", | |
| " const progress = document.createElement('progress');\n", | |
| " progress.max = size;\n", | |
| " div.appendChild(progress);\n", | |
| " document.body.appendChild(div);\n", | |
| "\n", | |
| " const buffers = [];\n", | |
| " let downloaded = 0;\n", | |
| "\n", | |
| " const channel = await google.colab.kernel.comms.open(id);\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| "\n", | |
| " for await (const message of channel.messages) {\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| " if (message.buffers) {\n", | |
| " for (const buffer of message.buffers) {\n", | |
| " buffers.push(buffer);\n", | |
| " downloaded += buffer.byteLength;\n", | |
| " progress.value = downloaded;\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
| " const a = document.createElement('a');\n", | |
| " a.href = window.URL.createObjectURL(blob);\n", | |
| " a.download = filename;\n", | |
| " div.appendChild(a);\n", | |
| " a.click();\n", | |
| " div.remove();\n", | |
| " }\n", | |
| " " | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "download(\"download_1811b4a0-ba20-4fa6-8ecd-601b4484dde4\", \"n0001a-The-Ant-and-Grasshopper (1).srt\", 2375)" | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\n", | |
| "💾 字幕ファイル(n0001a-The-Ant-and-Grasshopper (1).srt)をダウンロードしました。\n" | |
| ] | |
| } | |
| ] | |
| }, | |
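| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Optional: word-level timestamps\n", | |
| "\n", | |
| "Cell 2 sets `word_timestamps=False` for stability, which is why the SRT export warns that word-level timing cannot be exported. The sketch below assumes cell 2 has already been run (so `model` and `filename` are defined), re-runs the transcription with `word_timestamps=True`, and exports an SRT in which each word is timed individually. Treat it as an optional variant rather than part of the main workflow." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": {}, | |
| "source": [ | |
| "# @title 2a. (Optional) Word-level timestamps - a minimal sketch\n", | |
| "# Assumes `model` and `filename` are already defined by cell 2 above.\n", | |
| "word_result = model.transcribe(\n", | |
| "    filename,\n", | |
| "    language=\"ja\",\n", | |
| "    vad=True,\n", | |
| "    condition_on_previous_text=False,\n", | |
| "    word_timestamps=True,  # keep per-word timing this time\n", | |
| "    beam_size=5\n", | |
| ")\n", | |
| "\n", | |
| "# Export an SRT in which every word carries its own timestamp.\n", | |
| "word_srt = filename.rsplit('.', 1)[0] + \"_words.srt\"\n", | |
| "word_result.to_srt_vtt(word_srt, word_level=True)\n", | |
| "files.download(word_srt)\n", | |
| "print(f\"Saved word-level subtitles to {word_srt}\")" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |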
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "DDnKDNqlNn5H" | |
| }, | |
| "execution_count": 8, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "210e4e37" | |
| }, | |
| "source": [ | |
| "# @title 3. サンプル音声でのテスト\n", | |
| "# 指定されたURLから音声と正解テキストをダウンロードして文字起こしを行います。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1000 | |
| }, | |
| "id": "50542827", | |
| "outputId": "51668824-2c31-4358-82fb-1657f218f709" | |
| }, | |
| "source": [ | |
| "import os\n", | |
| "\n", | |
| "# ファイルのダウンロード\n", | |
| "audio_url = \"https://clrd.ninjal.ac.jp/csj/sound-f/interview_aps-smp.mp3\"\n", | |
| "text_url = \"https://clrd.ninjal.ac.jp/csj/trans-f/interview_aps-smp.txt\"\n", | |
| "audio_file = \"interview_aps-smp.mp3\"\n", | |
| "text_file = \"interview_aps-smp.txt\"\n", | |
| "\n", | |
| "!wget -q -O {audio_file} {audio_url}\n", | |
| "!wget -q -O {text_file} {text_url}\n", | |
| "\n", | |
| "print(f\"✅ ダウンロード完了: {audio_file}, {text_file}\")\n", | |
| "\n", | |
| "# モデルがロードされていない場合はロードする(前のセルを実行していない場合用)\n", | |
| "if 'model' not in locals():\n", | |
| " print(\"⏳ モデル(large-v3-turbo)をロード中...\")\n", | |
| " model = stable_whisper.load_faster_whisper(\n", | |
| " \"large-v3-turbo\",\n", | |
| " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n", | |
| " compute_type=\"float16\" if torch.cuda.is_available() else \"int8\"\n", | |
| " )\n", | |
| "\n", | |
| "# 文字起こし実行\n", | |
| "print(f\"\\n🎙️ 解析中: {audio_file} ...\")\n", | |
| "result = model.transcribe(\n", | |
| " audio_file,\n", | |
| " language=\"ja\",\n", | |
| " vad=True,\n", | |
| " condition_on_previous_text=False,\n", | |
| " word_timestamps=False,\n", | |
| " beam_size=5\n", | |
| ")\n", | |
| "\n", | |
| "# 結果表示\n", | |
| "print(\"\\n\" + \"=\"*30 + \" Whisper 認識結果 \" + \"=\"*30)\n", | |
| "for segment in result.segments:\n", | |
| " print(f\"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\")\n", | |
| "\n", | |
| "print(\"\\n\" + \"=\"*30 + \" 正解データ (参考) \" + \"=\"*30)\n", | |
| "# エンコーディングを自動判別して表示(Shift_JISなどの可能性があるため)\n", | |
| "encodings = ['utf-8', 'shift_jis', 'euc-jp']\n", | |
| "reference_text = \"\"\n", | |
| "for enc in encodings:\n", | |
| " try:\n", | |
| " with open(text_file, 'r', encoding=enc) as f:\n", | |
| " reference_text = f.read()\n", | |
| " break\n", | |
| " except UnicodeDecodeError:\n", | |
| " continue\n", | |
| "\n", | |
| "print(reference_text[:2000] + (\"...\" if len(reference_text) > 2000 else \"\")) # 長すぎる場合は省略\n", | |
| "\n", | |
| "# SRT保存\n", | |
| "srt_filename = audio_file.rsplit('.', 1)[0] + \".srt\"\n", | |
| "result.to_srt_vtt(srt_filename)\n", | |
| "files.download(srt_filename)\n", | |
| "print(f\"\\n💾 字幕ファイル({srt_filename})をダウンロードしました。\")" | |
| ], | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "✅ ダウンロード完了: interview_aps-smp.mp3, interview_aps-smp.txt\n", | |
| "\n", | |
| "🎙️ 解析中: interview_aps-smp.mp3 ...\n", | |
| "Detected Language: japanese\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "Transcribe: 100%|██████████| 71.75/71.75 [00:02<00:00, 32.79sec/s] \n", | |
| "VAD: 100%|██████████| 71.75/71.75 [00:01<00:00, 64.04sec/s]\n", | |
| "Adjustment: 100%|██████████| 71.14/71.14 [00:00<00:00, 238324.91sec/s]" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\n", | |
| "============================== Whisper 認識結果 ==============================\n", | |
| "[0.35s -> 12.24s] 質問させていただきます。読んだんですけれども、理解には遠く及ばずという感じで、言葉の意味からお聞きしたいと思うんですけど、\n", | |
| "[12.80s -> 20.68s] パラ言語情報っていう言葉と、あとフォルマント、あと聴音動っていうことについてまずはじめに聞かせてください。\n", | |
| "[20.68s -> 25.56s] 3つね。言語ってのはわかりますよね。言葉ですよね。\n", | |
| "[25.96s -> 33.34s] 言語情報っていうのはね、簡単に言えば単語の意味。辞書に書いてありますよね。\n", | |
| "[33.34s -> 44.18s] それから単語がくっついたときに、くっついた文を作ったりするときに、名詞に助詞がくっついて、動詞があって、最後に助動詞があって、\n", | |
| "[44.70s -> 49.88s] そういう普通に言語学の教科書に書いてあるような、それが言語情報ですね。\n", | |
| "[49.88s -> 60.90s] で、パラって言葉はね、語源的にはね、その、なんとかの横に、とかね、隣に、とかね、そういう意味なんですよ。\n", | |
| "[62.50s -> 71.14s] で、つまり言語からちょっとずれたところにある情報、だけど、だけど、その実際には存在している情報っていうことですね。\n", | |
| "\n", | |
| "============================== 正解データ (参考) ==============================\n", | |
| "%講演ID:D04M0041\n", | |
| "%\n", | |
| "%<SOT>\n", | |
| "%%【略】\n", | |
| "0003 00008.805-00012.085 L:\n", | |
| "質問させていただきます & シツモンサセテイタダキマス \n", | |
| "(F あの) & (F アノ) \n", | |
| "読んだんですけれども & ヨンダンデスケレドモ \n", | |
| "0004 00009.417-00009.838 R:\n", | |
| "(F うん) & (F <VN>) \n", | |
| "0005 00011.770-00012.901 R:\n", | |
| "(F うん) & (F <VN>) \n", | |
| "分からなかった & ワカラナカッタ \n", | |
| "0006 00012.536-00013.221 L:\n", | |
| "大抵の & タイ(笑 テーノ) \n", | |
| "0007 00013.250-00014.315 R:<笑>\n", | |
| "0008 00013.698-00016.817 L:\n", | |
| "(F あのー) & (F アノー) \n", | |
| "理解には & リカイニワ \n", | |
| "遠く & トーク \n", | |
| "及ばずという & オヨバズトユー \n", | |
| "感じで & カンジデ \n", | |
| "0009 00017.156-00018.411 L:\n", | |
| "(F あのー) & (F アノー) \n", | |
| "言葉の & コトバノ \n", | |
| "意味 & イミ \n", | |
| "0010 00018.861-00020.915 L:\n", | |
| "から & カラ \n", | |
| "お聞きしたいと & オキキシタイト \n", | |
| "思うんですけど & オモウンデスケド \n", | |
| "0011 00019.775-00020.137 R:\n", | |
| "(F はい)(F はい) & (F (? ハ)イ)(F ハイ) \n", | |
| "0012 00020.603-00020.823 R:\n", | |
| "(F うん) & (F <VN>) ...\n", | |
| "Saved: /content/interview_aps-smp.srt\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "\n", | |
| "/usr/local/lib/python3.12/dist-packages/stable_whisper/text_output.py:183: UserWarning: Result is missing word timestamps. Word-level timing cannot be exported. Use ``word_level=False`` to avoid this warning\n", | |
| " warnings.warn('Result is missing word timestamps. Word-level timing cannot be exported. '\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "\n", | |
| " async function download(id, filename, size) {\n", | |
| " if (!google.colab.kernel.accessAllowed) {\n", | |
| " return;\n", | |
| " }\n", | |
| " const div = document.createElement('div');\n", | |
| " const label = document.createElement('label');\n", | |
| " label.textContent = `Downloading \"${filename}\": `;\n", | |
| " div.appendChild(label);\n", | |
| " const progress = document.createElement('progress');\n", | |
| " progress.max = size;\n", | |
| " div.appendChild(progress);\n", | |
| " document.body.appendChild(div);\n", | |
| "\n", | |
| " const buffers = [];\n", | |
| " let downloaded = 0;\n", | |
| "\n", | |
| " const channel = await google.colab.kernel.comms.open(id);\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| "\n", | |
| " for await (const message of channel.messages) {\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| " if (message.buffers) {\n", | |
| " for (const buffer of message.buffers) {\n", | |
| " buffers.push(buffer);\n", | |
| " downloaded += buffer.byteLength;\n", | |
| " progress.value = downloaded;\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
| " const a = document.createElement('a');\n", | |
| " a.href = window.URL.createObjectURL(blob);\n", | |
| " a.download = filename;\n", | |
| " div.appendChild(a);\n", | |
| " a.click();\n", | |
| " div.remove();\n", | |
| " }\n", | |
| " " | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "download(\"download_d2e992ee-ba9d-402d-8c8d-c58b12a53443\", \"interview_aps-smp.srt\", 1417)" | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\n", | |
| "💾 字幕ファイル(interview_aps-smp.srt)をダウンロードしました。\n" | |
| ] | |
| } | |
| ] | |
| }, | |
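| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Optional: rough error rate against the reference transcript\n", | |
| "\n", | |
| "Cell 3 only prints the reference transcript next to the Whisper output. The sketch below turns that visual comparison into an approximate character error rate (CER). It assumes cell 3 has been run (so `result` and `text_file` exist) and heuristically strips the CSJ annotation markup from the reference (readings after `&`, tags such as `(F ...)`, timing and `%` metadata lines) before comparing it with the concatenated Whisper segments, so the number is indicative only." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": {}, | |
| "source": [ | |
| "# @title 4. (Optional) Approximate CER against the reference transcript\n", | |
| "# A heuristic sketch, assuming `result` and `text_file` from cell 3 exist.\n", | |
| "import re\n", | |
| "\n", | |
| "def load_reference(path):\n", | |
| "    # Read the transcript, trying the same encodings as cell 3.\n", | |
| "    raw = ''\n", | |
| "    for enc in ('utf-8', 'shift_jis', 'euc-jp'):\n", | |
| "        try:\n", | |
| "            with open(path, encoding=enc) as f:\n", | |
| "                raw = f.read()\n", | |
| "            break\n", | |
| "        except UnicodeDecodeError:\n", | |
| "            continue\n", | |
| "    chunks = []\n", | |
| "    for line in raw.splitlines():\n", | |
| "        # Keep only transcription lines of the form '<surface> & <reading>'.\n", | |
| "        if line.startswith('%') or '&' not in line:\n", | |
| "            continue\n", | |
| "        surface = line.split('&', 1)[0]\n", | |
| "        # Strip annotation markup such as '(F ', '(笑 ', ')' and '<VN>'.\n", | |
| "        surface = re.sub(r'\\([A-Z?笑]*\\s*|\\)|<[^>]*>', '', surface)\n", | |
| "        chunks.append(re.sub(r'\\s+', '', surface))\n", | |
| "    return ''.join(chunks)\n", | |
| "\n", | |
| "def cer(ref, hyp):\n", | |
| "    # Character-level Levenshtein distance normalised by the reference length.\n", | |
| "    prev = list(range(len(hyp) + 1))\n", | |
| "    for i, r in enumerate(ref, 1):\n", | |
| "        cur = [i]\n", | |
| "        for j, h in enumerate(hyp, 1):\n", | |
| "            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (r != h)))\n", | |
| "        prev = cur\n", | |
| "    return prev[-1] / max(len(ref), 1)\n", | |
| "\n", | |
| "def strip_punct(s):\n", | |
| "    # Drop whitespace and basic punctuation before comparing.\n", | |
| "    return re.sub(r'[\\s、。!?,.]', '', s)\n", | |
| "\n", | |
| "reference = strip_punct(load_reference(text_file))\n", | |
| "hypothesis = strip_punct(''.join(seg.text for seg in result.segments))\n", | |
| "\n", | |
| "print(f'Reference characters: {len(reference)}')\n", | |
| "print(f'Hypothesis characters: {len(hypothesis)}')\n", | |
| "print(f'Approximate CER: {cer(reference, hypothesis):.1%}')" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |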
| ] | |
| } |