chottokun · December 24, 2025 12:07
diff --git a/whisper-stable-ts.ipynb b/whisper-stable-ts.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyPoeaafoGqd8OkS5JbP32yG",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/chottokun/f653cfdb9144f325ca878adeb6bd967d/whisper-stable-ts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "08b4c072"
      },
      "source": [
        "# `stable-ts` と `faster-whisper` を用いた音声認識環境\n",
        "\n",
        "Google Colab 上で `stable-ts` と `faster-whisper`（モデル: large-v3-turbo）を使い、日本語音声の文字起こしと SRT 字幕ファイルの生成を行います。\n",
        "\n",
        "**⚠️ 重要: 実行前に [ランタイム] > [ランタイムのタイプを変更] からハードウェアアクセラレータを「T4 GPU」に設定してください。**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "11901012",
        "outputId": "62a6e2ef-a837-4f01-a0a1-6b48a3ac43ee"
      },
      "source": [
        "# @title 1. ライブラリのインストール\n",
        "# stable-ts と faster-whisper をインストールします\n",
        "!pip install stable-ts faster-whisper\n",
        "\n",
        "import torch\n",
        "import stable_whisper\n",
        "from google.colab import files\n",
        "\n",
        "# GPUが使えるか確認\n",
        "if torch.cuda.is_available():\n",
        "    print(\"\\n✅ GPU (CUDA) が利用可能です。高速推論モードで動作します。\")\n",
        "else:\n",
        "    print(\"\\n⚠️ GPUが検出されませんでした。ランタイムの設定からGPUを有効にしてください。CPUでは非常に遅くなります。\")"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: stable-ts in /usr/local/lib/python3.12/dist-packages (2.19.1)\n",
            "Requirement already satisfied: faster-whisper in /usr/local/lib/python3.12/dist-packages (1.2.1)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from stable-ts) (2.0.2)\n",
            "Requirement already satisfied: torch in /usr/local/lib/python3.12/dist-packages (from stable-ts) (2.9.0+cu126)\n",
            "Requirement already satisfied: torchaudio in /usr/local/lib/python3.12/dist-packages (from stable-ts) (2.9.0+cu126)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from stable-ts) (4.67.1)\n",
            "Requirement already satisfied: openai-whisper<=20250625,>=20230314 in /usr/local/lib/python3.12/dist-packages (from stable-ts) (20250625)\n",
            "Requirement already satisfied: ctranslate2<5,>=4.0 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (4.6.2)\n",
            "Requirement already satisfied: huggingface-hub>=0.21 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (0.36.0)\n",
            "Requirement already satisfied: tokenizers<1,>=0.13 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (0.22.1)\n",
            "Requirement already satisfied: onnxruntime<2,>=1.14 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (1.23.2)\n",
            "Requirement already satisfied: av>=11 in /usr/local/lib/python3.12/dist-packages (from faster-whisper) (16.0.1)\n",
            "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from ctranslate2<5,>=4.0->faster-whisper) (75.2.0)\n",
            "Requirement already satisfied: pyyaml<7,>=5.3 in /usr/local/lib/python3.12/dist-packages (from ctranslate2<5,>=4.0->faster-whisper) (6.0.3)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (3.20.0)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (2025.3.0)\n",
            "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (25.0)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (2.32.4)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (4.15.0)\n",
            "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.21->faster-whisper) (1.2.0)\n",
            "Requirement already satisfied: coloredlogs in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (15.0.1)\n",
            "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (25.9.23)\n",
            "Requirement already satisfied: protobuf in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (5.29.5)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.12/dist-packages (from onnxruntime<2,>=1.14->faster-whisper) (1.14.0)\n",
            "Requirement already satisfied: more-itertools in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (10.8.0)\n",
            "Requirement already satisfied: numba in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (0.60.0)\n",
            "Requirement already satisfied: tiktoken in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (0.12.0)\n",
            "Requirement already satisfied: triton>=2 in /usr/local/lib/python3.12/dist-packages (from openai-whisper<=20250625,>=20230314->stable-ts) (3.5.0)\n",
            "Requirement already satisfied: networkx>=2.5.1 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (3.6.1)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (3.1.6)\n",
            "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.77)\n",
            "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.77)\n",
            "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.80)\n",
            "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (9.10.2.21)\n",
            "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.4.1)\n",
            "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (11.3.0.4)\n",
            "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (10.3.7.77)\n",
            "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (11.7.1.2)\n",
            "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.5.4.2)\n",
            "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (0.7.1)\n",
            "Requirement already satisfied: nvidia-nccl-cu12==2.27.5 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (2.27.5)\n",
            "Requirement already satisfied: nvidia-nvshmem-cu12==3.3.20 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (3.3.20)\n",
            "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.77)\n",
            "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (12.6.85)\n",
            "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch->stable-ts) (1.11.1.6)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy->onnxruntime<2,>=1.14->faster-whisper) (1.3.0)\n",
            "Requirement already satisfied: humanfriendly>=9.1 in /usr/local/lib/python3.12/dist-packages (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper) (10.0)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch->stable-ts) (3.0.3)\n",
            "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba->openai-whisper<=20250625,>=20230314->stable-ts) (0.43.0)\n",
            "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (3.4.4)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (3.11)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (2.5.0)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.21->faster-whisper) (2025.11.12)\n",
            "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken->openai-whisper<=20250625,>=20230314->stable-ts) (2025.11.3)\n",
            "\n",
            "✅ GPU (CUDA) が利用可能です。高速推論モードで動作します。\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "d1242fbe",
        "outputId": "1922227e-ec58-4cf4-a5cb-0fb65b91fca5"
      },
      "source": [
        "# @title 2. 音声ファイルをアップロードして実行\n",
        "# モデルのロード（初回はダウンロードに時間がかかります）\n",
        "print(\"⏳ モデル(large-v3-turbo)をロード中...\")\n",
        "model = stable_whisper.load_faster_whisper(\n",
        "    \"large-v3-turbo\",\n",
        "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
        "    compute_type=\"float16\" if torch.cuda.is_available() else \"int8\"\n",
        ")\n",
        "print(\"✅ モデルロード完了\")\n",
        "\n",
        "# 音声ファイルのアップロード\n",
        "print(\"\\n📂 音声ファイル(mp3, wav, m4a等)をアップロードしてください:\")\n",
        "uploaded = files.upload()\n",
        "\n",
        "for filename in uploaded.keys():\n",
        "    print(f\"\\n🎙️ 解析中: {filename} ...\")\n",
        "\n",
        "    #\n",
        "    result = model.transcribe(\n",
        "        filename,\n",
        "        language=\"ja\",\n",
        "\n",
        "        #\n",
        "        vad=True,                        # VAD(音声区間検出)を有効化\n",
        "        condition_on_previous_text=False,# 前の文脈に依存しない（ループ対策の要）\n",
        "        word_timestamps=False,           # 単語ごとのタイムスタンプを使わない（安定化）\n",
        "        beam_size=5                      # 探索幅（精度向上のため）\n",
        "    )\n",
        "\n",
        "    # 結果の表示\n",
        "    print(\"\\n\" + \"=\"*30 + \" 認識結果 \" + \"=\"*30)\n",
        "    for segment in result.segments:\n",
        "        # 秒数を整形して表示\n",
        "        start = f\"{segment.start:.2f}\"\n",
        "        end = f\"{segment.end:.2f}\"\n",
        "        print(f\"[{start}s -> {end}s] {segment.text}\")\n",
        "    print(\"=\"*68)\n",
        "\n",
        "    # SRTファイル（字幕ファイル）として保存・ダウンロード\n",
        "    srt_filename = filename.rsplit('.', 1)[0] + \".srt\"\n",
        "    result.to_srt_vtt(srt_filename)\n",
        "    files.download(srt_filename)\n",
        "    print(f\"\\n💾 字幕ファイル({srt_filename})をダウンロードしました。\")"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "⏳ モデル(large-v3-turbo)をロード中...\n",
            "✅ モデルロード完了\n",
            "\n",
            "📂 音声ファイル(mp3, wav, m4a等)をアップロードしてください:\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "     <input type=\"file\" id=\"files-89316340-2eac-4148-a9fc-dbf16a097068\" name=\"files[]\" multiple disabled\n",
              "        style=\"border:none\" />\n",
              "     <output id=\"result-89316340-2eac-4148-a9fc-dbf16a097068\">\n",
              "      Upload widget is only available when the cell has been executed in the\n",
              "      current browser session. Please rerun this cell to enable.\n",
              "      </output>\n",
              "      <script>// Copyright 2017 Google LLC\n",
              "//\n",
              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
              "// you may not use this file except in compliance with the License.\n",
              "// You may obtain a copy of the License at\n",
              "//\n",
              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
              "//\n",
              "// Unless required by applicable law or agreed to in writing, software\n",
              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
              "// See the License for the specific language governing permissions and\n",
              "// limitations under the License.\n",
              "\n",
              "/**\n",
              " * @fileoverview Helpers for google.colab Python module.\n",
              " */\n",
              "(function(scope) {\n",
              "function span(text, styleAttributes = {}) {\n",
              "  const element = document.createElement('span');\n",
              "  element.textContent = text;\n",
              "  for (const key of Object.keys(styleAttributes)) {\n",
              "    element.style[key] = styleAttributes[key];\n",
              "  }\n",
              "  return element;\n",
              "}\n",
              "\n",
              "// Max number of bytes which will be uploaded at a time.\n",
              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
              "\n",
              "function _uploadFiles(inputId, outputId) {\n",
              "  const steps = uploadFilesStep(inputId, outputId);\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  // Cache steps on the outputElement to make it available for the next call\n",
              "  // to uploadFilesContinue from Python.\n",
              "  outputElement.steps = steps;\n",
              "\n",
              "  return _uploadFilesContinue(outputId);\n",
              "}\n",
              "\n",
              "// This is roughly an async generator (not supported in the browser yet),\n",
              "// where there are multiple asynchronous steps and the Python side is going\n",
              "// to poll for completion of each step.\n",
              "// This uses a Promise to block the python side on completion of each step,\n",
              "// then passes the result of the previous step as the input to the next step.\n",
              "function _uploadFilesContinue(outputId) {\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  const steps = outputElement.steps;\n",
              "\n",
              "  const next = steps.next(outputElement.lastPromiseValue);\n",
              "  return Promise.resolve(next.value.promise).then((value) => {\n",
              "    // Cache the last promise value to make it available to the next\n",
              "    // step of the generator.\n",
              "    outputElement.lastPromiseValue = value;\n",
              "    return next.value.response;\n",
              "  });\n",
              "}\n",
              "\n",
              "/**\n",
              " * Generator function which is called between each async step of the upload\n",
              " * process.\n",
              " * @param {string} inputId Element ID of the input file picker element.\n",
              " * @param {string} outputId Element ID of the output display.\n",
              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
              " */\n",
              "function* uploadFilesStep(inputId, outputId) {\n",
              "  const inputElement = document.getElementById(inputId);\n",
              "  inputElement.disabled = false;\n",
              "\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  outputElement.innerHTML = '';\n",
              "\n",
              "  const pickedPromise = new Promise((resolve) => {\n",
              "    inputElement.addEventListener('change', (e) => {\n",
              "      resolve(e.target.files);\n",
              "    });\n",
              "  });\n",
              "\n",
              "  const cancel = document.createElement('button');\n",
              "  inputElement.parentElement.appendChild(cancel);\n",
              "  cancel.textContent = 'Cancel upload';\n",
              "  const cancelPromise = new Promise((resolve) => {\n",
              "    cancel.onclick = () => {\n",
              "      resolve(null);\n",
              "    };\n",
              "  });\n",
              "\n",
              "  // Wait for the user to pick the files.\n",
              "  const files = yield {\n",
              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
              "    response: {\n",
              "      action: 'starting',\n",
              "    }\n",
              "  };\n",
              "\n",
              "  cancel.remove();\n",
              "\n",
              "  // Disable the input element since further picks are not allowed.\n",
              "  inputElement.disabled = true;\n",
              "\n",
              "  if (!files) {\n",
              "    return {\n",
              "      response: {\n",
              "        action: 'complete',\n",
              "      }\n",
              "    };\n",
              "  }\n",
              "\n",
              "  for (const file of files) {\n",
              "    const li = document.createElement('li');\n",
              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
              "    li.append(span(\n",
              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
              "        `last modified: ${\n",
              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
              "                                    'n/a'} - `));\n",
              "    const percent = span('0% done');\n",
              "    li.appendChild(percent);\n",
              "\n",
              "    outputElement.appendChild(li);\n",
              "\n",
              "    const fileDataPromise = new Promise((resolve) => {\n",
              "      const reader = new FileReader();\n",
              "      reader.onload = (e) => {\n",
              "        resolve(e.target.result);\n",
              "      };\n",
              "      reader.readAsArrayBuffer(file);\n",
              "    });\n",
              "    // Wait for the data to be ready.\n",
              "    let fileData = yield {\n",
              "      promise: fileDataPromise,\n",
              "      response: {\n",
              "        action: 'continue',\n",
              "      }\n",
              "    };\n",
              "\n",
              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
              "    let position = 0;\n",
              "    do {\n",
              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
              "      const chunk = new Uint8Array(fileData, position, length);\n",
              "      position += length;\n",
              "\n",
              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
              "      yield {\n",
              "        response: {\n",
              "          action: 'append',\n",
              "          file: file.name,\n",
              "          data: base64,\n",
              "        },\n",
              "      };\n",
              "\n",
              "      let percentDone = fileData.byteLength === 0 ?\n",
              "          100 :\n",
              "          Math.round((position / fileData.byteLength) * 100);\n",
              "      percent.textContent = `${percentDone}% done`;\n",
              "\n",
              "    } while (position < fileData.byteLength);\n",
              "  }\n",
              "\n",
              "  // All done.\n",
              "  yield {\n",
              "    response: {\n",
              "      action: 'complete',\n",
              "    }\n",
              "  };\n",
              "}\n",
              "\n",
              "scope.google = scope.google || {};\n",
              "scope.google.colab = scope.google.colab || {};\n",
              "scope.google.colab._files = {\n",
              "  _uploadFiles,\n",
              "  _uploadFilesContinue,\n",
              "};\n",
              "})(self);\n",
              "</script> "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saving n0001a-The-Ant-and-Grasshopper.mp3 to n0001a-The-Ant-and-Grasshopper (1).mp3\n",
            "\n",
            "🎙️ 解析中: n0001a-The-Ant-and-Grasshopper (1).mp3 ...\n",
            "Detected Language: japanese\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Transcribe: 100%|██████████| 165.88/165.88 [00:04<00:00, 39.68sec/s]\n",
            "VAD: 100%|██████████| 165.88/165.88 [00:02<00:00, 62.77sec/s]\n",
            "Adjustment: 100%|██████████| 194.0/194.0 [00:00<00:00, 107503.63sec/s]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "============================== 認識結果 ==============================\n",
            "[2.46s -> 6.22s] 日本語多読ブックス レベルゼロ\n",
            "[6.22s -> 14.00s] アリとキリギリス イソップ物語より\n",
            "[15.68s -> 17.88s] NPO 多言語多読\n",
            "[22.43s -> 23.18s] 夏です\n",
            "[23.18s -> 27.52s] キリギリスが木の下にいます\n",
            "[29.66s -> 31.90s] キリギリスはうたをうたいます\n",
            "[33.60s -> 34.58s] ラララララ\n",
            "[38.11s -> 39.26s] アリがきました\n",
            "[40.96s -> 43.54s] アリはたべものをはこびます\n",
            "[45.47s -> 47.18s] キリギリスがいいました\n",
            "[47.18s -> 52.84s] アリさん、いっしょにうたをうたいましょう\n",
            "[55.30s -> 56.62s] アリはいいました\n",
            "[56.62s -> 60.90s] いいえ、うたいません\n",
            "[62.62s -> 64.78s] わたしたちははたらきます\n",
            "[68.03s -> 69.88s] キリギリスはききました\n",
            "[71.84s -> 73.02s] どうしてですか\n",
            "[73.02s -> 78.38s] ふゆはたべものがありませんから\n",
            "[80.03s -> 83.38s] え、いまはなつですよ\n",
            "[85.25s -> 87.10s] キリギリスは笑いました\n",
            "[89.95s -> 91.56s] それから毎日\n",
            "[91.56s -> 95.86s] キリギリスは歌を歌いました\n",
            "[98.43s -> 99.96s] 働きませんでした\n",
            "[103.90s -> 104.60s] 冬です\n",
            "[106.14s -> 107.38s] 雪が降ります\n",
            "[109.31s -> 110.72s] とても寒いです\n",
            "[110.72s -> 116.78s] キリギリスの家には食べ物がありません\n",
            "[116.78s -> 125.52s] アリの家には食べ物がたくさんあります\n",
            "[125.52s -> 131.12s] キリギリスはアリの家へ行きました\n",
            "[134.75s -> 136.46s] キリギリスは言いました\n",
            "[136.46s -> 144.54s] アリさん、お願いです。食べ物をください\n",
            "[147.33s -> 148.64s] アリは言いました\n",
            "[148.64s -> 154.26s] 私たちは夏働きました\n",
            "[154.26s -> 158.58s] だから食べ物があります\n",
            "[160.54s -> 164.02s] あなたは夏何をしましたか\n",
            "[165.88s -> 194.00s] ご視聴ありがとうございました\n",
            "====================================================================\n",
            "Saved: /content/n0001a-The-Ant-and-Grasshopper (1).srt\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n",
            "/usr/local/lib/python3.12/dist-packages/stable_whisper/text_output.py:183: UserWarning: Result is missing word timestamps. Word-level timing cannot be exported. Use ``word_level=False`` to avoid this warning\n",
            "  warnings.warn('Result is missing word timestamps. Word-level timing cannot be exported. '\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_1811b4a0-ba20-4fa6-8ecd-601b4484dde4\", \"n0001a-The-Ant-and-Grasshopper (1).srt\", 2375)"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "💾 字幕ファイル(n0001a-The-Ant-and-Grasshopper (1).srt)をダウンロードしました。\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "DDnKDNqlNn5H"
      },
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "210e4e37"
      },
      "source": [
        "## 3. サンプル音声でのテスト\n",
        "\n",
        "指定されたURLから音声と正解テキストをダウンロードして文字起こしを行います。"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "50542827",
        "outputId": "51668824-2c31-4358-82fb-1657f218f709"
      },
      "source": [
        "import os\n",
        "\n",
        "# ファイルのダウンロード\n",
        "audio_url = \"https://clrd.ninjal.ac.jp/csj/sound-f/interview_aps-smp.mp3\"\n",
        "text_url = \"https://clrd.ninjal.ac.jp/csj/trans-f/interview_aps-smp.txt\"\n",
        "audio_file = \"interview_aps-smp.mp3\"\n",
        "text_file = \"interview_aps-smp.txt\"\n",
        "\n",
        "!wget -q -O {audio_file} {audio_url}\n",
        "!wget -q -O {text_file} {text_url}\n",
        "\n",
        "print(f\"✅ ダウンロード完了: {audio_file}, {text_file}\")\n",
        "\n",
        "# モデルがロードされていない場合はロードする（前のセルを実行していない場合用）\n",
        "if 'model' not in locals():\n",
        "    print(\"⏳ モデル(large-v3-turbo)をロード中...\")\n",
        "    model = stable_whisper.load_faster_whisper(\n",
        "        \"large-v3-turbo\",\n",
        "        device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
        "        compute_type=\"float16\" if torch.cuda.is_available() else \"int8\"\n",
        "    )\n",
        "\n",
        "# 文字起こし実行\n",
        "print(f\"\\n🎙️ 解析中: {audio_file} ...\")\n",
        "result = model.transcribe(\n",
        "    audio_file,\n",
        "    language=\"ja\",\n",
        "    vad=True,\n",
        "    condition_on_previous_text=False,\n",
        "    word_timestamps=False,\n",
        "    beam_size=5\n",
        ")\n",
        "\n",
        "# 結果表示\n",
        "print(\"\\n\" + \"=\"*30 + \" Whisper 認識結果 \" + \"=\"*30)\n",
        "for segment in result.segments:\n",
        "    print(f\"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\")\n",
        "\n",
        "print(\"\\n\" + \"=\"*30 + \" 正解データ (参考) \" + \"=\"*30)\n",
        "# エンコーディングを自動判別して表示（Shift_JISなどの可能性があるため）\n",
        "encodings = ['utf-8', 'shift_jis', 'euc-jp']\n",
        "reference_text = \"\"\n",
        "for enc in encodings:\n",
        "    try:\n",
        "        with open(text_file, 'r', encoding=enc) as f:\n",
        "            reference_text = f.read()\n",
        "        break\n",
        "    except UnicodeDecodeError:\n",
        "        continue\n",
        "\n",
        "print(reference_text[:2000] + (\"...\" if len(reference_text) > 2000 else \"\")) # 長すぎる場合は省略\n",
        "\n",
        "# SRT保存\n",
        "srt_filename = audio_file.rsplit('.', 1)[0] + \".srt\"\n",
        "result.to_srt_vtt(srt_filename)\n",
        "files.download(srt_filename)\n",
        "print(f\"\\n💾 字幕ファイル({srt_filename})をダウンロードしました。\")"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ ダウンロード完了: interview_aps-smp.mp3, interview_aps-smp.txt\n",
            "\n",
            "🎙️ 解析中: interview_aps-smp.mp3 ...\n",
            "Detected Language: japanese\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Transcribe: 100%|██████████| 71.75/71.75 [00:02<00:00, 32.79sec/s]             \n",
            "VAD: 100%|██████████| 71.75/71.75 [00:01<00:00, 64.04sec/s]\n",
            "Adjustment: 100%|██████████| 71.14/71.14 [00:00<00:00, 238324.91sec/s]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "============================== Whisper 認識結果 ==============================\n",
            "[0.35s -> 12.24s] 質問させていただきます。読んだんですけれども、理解には遠く及ばずという感じで、言葉の意味からお聞きしたいと思うんですけど、\n",
            "[12.80s -> 20.68s] パラ言語情報っていう言葉と、あとフォルマント、あと聴音動っていうことについてまずはじめに聞かせてください。\n",
            "[20.68s -> 25.56s] 3つね。言語ってのはわかりますよね。言葉ですよね。\n",
            "[25.96s -> 33.34s] 言語情報っていうのはね、簡単に言えば単語の意味。辞書に書いてありますよね。\n",
            "[33.34s -> 44.18s] それから単語がくっついたときに、くっついた文を作ったりするときに、名詞に助詞がくっついて、動詞があって、最後に助動詞があって、\n",
            "[44.70s -> 49.88s] そういう普通に言語学の教科書に書いてあるような、それが言語情報ですね。\n",
            "[49.88s -> 60.90s] で、パラって言葉はね、語源的にはね、その、なんとかの横に、とかね、隣に、とかね、そういう意味なんですよ。\n",
            "[62.50s -> 71.14s] で、つまり言語からちょっとずれたところにある情報、だけど、だけど、その実際には存在している情報っていうことですね。\n",
            "\n",
            "============================== 正解データ (参考) ==============================\n",
            "%講演ID:D04M0041\n",
            "%\n",
            "%<SOT>\n",
            "%%【略】\n",
            "0003 00008.805-00012.085 L:\n",
            "質問させていただきます                        & シツモンサセテイタダキマス                   \n",
            "(F あの)                                      & (F アノ)                                     \n",
            "読んだんですけれども                          & ヨンダンデスケレドモ                         \n",
            "0004 00009.417-00009.838 R:\n",
            "(F うん)                                      & (F <VN>)                                     \n",
            "0005 00011.770-00012.901 R:\n",
            "(F うん)                                      & (F <VN>)                                     \n",
            "分からなかった                                & ワカラナカッタ                               \n",
            "0006 00012.536-00013.221 L:\n",
            "大抵の                                        & タイ(笑 テーノ)                              \n",
            "0007 00013.250-00014.315 R:<笑>\n",
            "0008 00013.698-00016.817 L:\n",
            "(F あのー)                                    & (F アノー)                                   \n",
            "理解には                                      & リカイニワ                                   \n",
            "遠く                                          & トーク                                       \n",
            "及ばずという                                  & オヨバズトユー                               \n",
            "感じで                                        & カンジデ                                     \n",
            "0009 00017.156-00018.411 L:\n",
            "(F あのー)                                    & (F アノー)                                   \n",
            "言葉の                                        & コトバノ                                     \n",
            "意味                                          & イミ                                         \n",
            "0010 00018.861-00020.915 L:\n",
            "から                                          & カラ                                         \n",
            "お聞きしたいと                                & オキキシタイト                               \n",
            "思うんですけど                                & オモウンデスケド                             \n",
            "0011 00019.775-00020.137 R:\n",
            "(F はい)(F はい)                              & (F (? ハ)イ)(F ハイ)                         \n",
            "0012 00020.603-00020.823 R:\n",
            "(F うん)                                      & (F <VN>)                    ...\n",
            "Saved: /content/interview_aps-smp.srt\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n",
            "/usr/local/lib/python3.12/dist-packages/stable_whisper/text_output.py:183: UserWarning: Result is missing word timestamps. Word-level timing cannot be exported. Use ``word_level=False`` to avoid this warning\n",
            "  warnings.warn('Result is missing word timestamps. Word-level timing cannot be exported. '\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_d2e992ee-ba9d-402d-8c8d-c58b12a53443\", \"interview_aps-smp.srt\", 1417)"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "💾 字幕ファイル(interview_aps-smp.srt)をダウンロードしました。\n"
          ]
        }
      ]
    }
  ]
 }
No results found