chottokun · January 23, 2026 13:26
diff --git a/qwen3-tts.ipynb b/qwen3-tts.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyPIG2feBIPfrpMF9/Xujsvc",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/chottokun/39a800222132745c81c13e6cdc00ba70/qwen3-tts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bb4aaed8"
      },
      "source": [
        "!nvidia-smi\n",
        "import os\n",
        "if not os.path.exists(\"Qwen3-TTS\"):\n",
        "    !git clone https://github.com/QwenLM/Qwen3-TTS\n",
        "%cd Qwen3-TTS"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "36a01e17"
      },
      "source": [
        "import os\n",
        "\n",
        "if os.path.exists('requirements.txt'):\n",
        "    !pip install -r requirements.txt\n",
        "else:\n",
        "    print(\"No requirements.txt found. Installing standard libraries...\")\n",
        "    !pip install torch torchaudio transformers accelerate"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8bca68f1"
      },
      "source": [
        "!pip install -e ."
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6d970d14"
      },
      "source": [
        "import time\n",
        "import torch\n",
        "import soundfile as sf\n",
        "from qwen_tts import Qwen3TTSModel\n",
        "\n",
        "# Define device\n",
        "device = \"cuda:0\"\n",
        "# Model ID\n",
        "MODEL_PATH = \"Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign\"\n",
        "\n",
        "print(f\"Loading model from {MODEL_PATH}...\")\n",
        "# Tesla T4 does not support Flash Attention 2. Using 'sdpa' instead.\n",
        "tts = Qwen3TTSModel.from_pretrained(\n",
        "    MODEL_PATH,\n",
        "    device_map=device,\n",
        "    dtype=torch.bfloat16,\n",
        "    attn_implementation=\"sdpa\",\n",
        ")\n",
        "\n",
        "# Anime Character Scenarios\n",
        "scenarios = [\n",
        "    {\n",
        "        \"role\": \"Tsundere (ツンデレ)\",\n",
        "        \"text\": \"べ、別にあんたのために作ったわけじゃないんだからね！勘違いしないでよね！\",\n",
        "        \"instruct\": \"10代の少女の声。ツンデレな口調。最初は怒ったように早口で、後半は少し照れたように声が小さくなる。高めのトーン。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Tsundere (ツンデレ)\",\n",
        "        \"text\": \"べ、別にあんたのために作ったわけじゃないんだからね！勘違いしないでよね！\",\n",
        "        \"instruct\": \"10代の少女の声。ツンデレな口調。高めのトーン。最初は怒ったように早口で、「べ、別にあんたのために作ったわけじゃないんだからね！」後半は少し照れたように声が小さくなって「勘違いしないでよね！」\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Onee-san (お姉さん)\",\n",
        "        \"text\": \"あらあら、迷子になっちゃったの？お姉さんが案内してあげるわ。\",\n",
        "        \"instruct\": \"20代女性の声。落ち着いた、包容力のあるお姉さんボイス。ゆっくりとした話し方で、語尾に少し息が混じるような色気を含ませる。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Chuunibyou (中二病)\",\n",
        "        \"text\": \"クックック…我が右腕に封印されし黒龍が暴れだす…！鎮まれ、俺の血よ！\",\n",
        "        \"instruct\": \"10代男性の声。中二病特有の大げさな演技がかった口調。低音でカッコつけて喋るが、時折素の声が混じるような不安定さを出す。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Genki Girl (元気っ子)\",\n",
        "        \"text\": \"おっはよー！今日もいい天気だね！これなら洗濯物もすぐ乾きそう！\",\n",
        "        \"instruct\": \"10代少女の声。明るく元気でハキハキとした口調。声量が大きく、抑揚がはっきりしている。聞いているだけで元気が出るような声。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Cool_Kuudere (クール・クーデレ)\",\n",
        "        \"text\": \"…任務完了。次の指示を。…何？私の顔に何かついている？\",\n",
        "        \"instruct\": \"10代後半の女性の声。感情の起伏が少ない、冷静沈着なトーン。淡々としているが、冷たすぎず、少しミステリアスな雰囲気。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Samurai (侍・古風)\",\n",
        "        \"text\": \"拙者は流浪の侍。名はまだ無い。この刀一本で乱世を生き抜くだけでござる。\",\n",
        "        \"instruct\": \"20代男性の声。時代劇のような古風な話し方。腹から声を出すような、芯のある凛とした声色。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Villainess (悪役令嬢)\",\n",
        "        \"text\": \"オーホッホッホ！このわたくしに逆らおうなんて、100年早くてよ！\",\n",
        "        \"instruct\": \"高飛車な自信家の女性の声。高笑いが似合う、少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Villainess (悪役令嬢)\",\n",
        "        \"text\": \"オーホッホッホ！このわたくしに逆らおうなんて、100年早くてよ！\",\n",
        "        \"instruct\": \"高飛車な自信家の女性の声。少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。「オーホッホッホ！」は高笑いです。「100年早くてよ！」は威圧的に。\"\n",
        "    },\n",
        "    {\n",
        "        \"role\": \"Shota (少年・ショタ)\",\n",
        "        \"text\": \"おねえちゃん、これすごいよ！見て見て！僕が見つけたんだ！\",\n",
        "        \"instruct\": \"10歳くらいの少年の声。声変わり前の高めの声。無邪気で好奇心旺盛な感じ。少し舌足らずな可愛さを出す。\"\n",
        "    }\n",
        "]\n",
        "\n",
        "print(f\"\\nGenerating {len(scenarios)} anime character samples with timing measurements...\\n\")\n",
        "print(\"-\" * 60)\n",
        "\n",
        "# 1. Measure Individual Generation Time\n",
        "for i, s in enumerate(scenarios):\n",
        "    text = s[\"text\"]\n",
        "    instruct = s[\"instruct\"]\n",
        "\n",
        "    # Use Japanese for all as requested\n",
        "    language = \"Japanese\"\n",
        "\n",
        "    torch.cuda.synchronize()\n",
        "    start_time = time.time()\n",
        "\n",
        "    wavs, sr = tts.generate_voice_design(\n",
        "        text=text,\n",
        "        language=language,\n",
        "        instruct=instruct,\n",
        "        max_new_tokens=2048,\n",
        "    )\n",
        "\n",
        "    torch.cuda.synchronize()\n",
        "    end_time = time.time()\n",
        "    elapsed = end_time - start_time\n",
        "\n",
        "    role_key = s['role'].replace(' ', '_').replace('(', '').replace(')', '').replace('、', '')\n",
        "    filename = f\"anime_sample_{i:02d}_{role_key}.wav\"\n",
        "    sf.write(filename, wavs[0], sr)\n",
        "\n",
        "    print(f\"[{s['role']}]\\n  Text Len: {len(text)} chars | Time: {elapsed:.3f}s | Saved: {filename}\")\n",
        "\n",
        "print(\"-\" * 60)\n",
        "\n",
        "# 2. Measure Batch Generation Time (for comparison)\n",
        "print(\"\\nMeasuring Batch Generation Time (All 8 samples at once)...\")\n",
        "texts = [s[\"text\"] for s in scenarios]\n",
        "instructs = [s[\"instruct\"] for s in scenarios]\n",
        "languages = [\"Japanese\"] * len(scenarios)\n",
        "\n",
        "torch.cuda.synchronize()\n",
        "start_time = time.time()\n",
        "\n",
        "_ = tts.generate_voice_design(\n",
        "    text=texts,\n",
        "    language=languages,\n",
        "    instruct=instructs,\n",
        "    max_new_tokens=2048,\n",
        ")\n",
        "\n",
        "torch.cuda.synchronize()\n",
        "end_time = time.time()\n",
        "elapsed = end_time - start_time\n",
        "print(f\"[Batch Total] Count: {len(texts)} | Time: {elapsed:.3f}s | Avg per sample: {elapsed/len(texts):.3f}s\")\n",
        "print(\"-\" * 60)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5f91638b"
      },
      "source": [
        "import IPython.display as ipd\n",
        "import glob\n",
        "import os\n",
        "from IPython.display import display, Markdown\n",
        "\n",
        "# Re-define scenarios to map back if needed, or just list files\n",
        "# Ideally we want to show the text with the audio.\n",
        "\n",
        "wav_files = sorted(glob.glob(\"anime_sample_*.wav\"))\n",
        "print(f\"Found {len(wav_files)} generated files.\\n\")\n",
        "\n",
        "for wav_file in wav_files:\n",
        "    # Try to extract info from filename or just display it\n",
        "    # filename format: anime_sample_{i:02d}_{role_key}.wav\n",
        "    try:\n",
        "        idx = int(wav_file.split('_')[2])\n",
        "        # Access scenarios from the previous cell context\n",
        "        if 'scenarios' in locals() and idx < len(scenarios):\n",
        "            s = scenarios[idx]\n",
        "            display(Markdown(f\"### {s['role']}\\n**Text:** `{s['text']}`\\n\\n**Instruct:** *{s['instruct']}*\"))\n",
        "        else:\n",
        "             display(Markdown(f\"### File: {wav_file}\"))\n",
        "    except:\n",
        "        display(Markdown(f\"### File: {wav_file}\"))\n",
        "\n",
        "    ipd.display(ipd.Audio(wav_file))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!nvidia-smi"
      ],
      "metadata": {
        "id": "TgnCHzod6vZt"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"gpuType": "T4",
	"authorship_tag": "ABX9TyPIG2feBIPfrpMF9/Xujsvc",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/chottokun/39a800222132745c81c13e6cdc00ba70/qwen3-tts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "bb4aaed8"
	},
	"source": [
	"!nvidia-smi\n",
	"import os\n",
	"if not os.path.exists(\"Qwen3-TTS\"):\n",
	" !git clone https://github.com/QwenLM/Qwen3-TTS\n",
	"%cd Qwen3-TTS"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "36a01e17"
	},
	"source": [
	"import os\n",
	"\n",
	"if os.path.exists('requirements.txt'):\n",
	" !pip install -r requirements.txt\n",
	"else:\n",
	" print(\"No requirements.txt found. Installing standard libraries...\")\n",
	" !pip install torch torchaudio transformers accelerate"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "8bca68f1"
	},
	"source": [
	"!pip install -e ."
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "6d970d14"
	},
	"source": [
	"import time\n",
	"import torch\n",
	"import soundfile as sf\n",
	"from qwen_tts import Qwen3TTSModel\n",
	"\n",
	"# Define device\n",
	"device = \"cuda:0\"\n",
	"# Model ID\n",
	"MODEL_PATH = \"Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign\"\n",
	"\n",
	"print(f\"Loading model from {MODEL_PATH}...\")\n",
	"# Tesla T4 does not support Flash Attention 2. Using 'sdpa' instead.\n",
	"tts = Qwen3TTSModel.from_pretrained(\n",
	" MODEL_PATH,\n",
	" device_map=device,\n",
	" dtype=torch.bfloat16,\n",
	" attn_implementation=\"sdpa\",\n",
	")\n",
	"\n",
	"# Anime Character Scenarios\n",
	"scenarios = [\n",
	" {\n",
	" \"role\": \"Tsundere (ツンデレ)\",\n",
	" \"text\": \"べ、別にあんたのために作ったわけじゃないんだからね！勘違いしないでよね！\",\n",
	" \"instruct\": \"10代の少女の声。ツンデレな口調。最初は怒ったように早口で、後半は少し照れたように声が小さくなる。高めのトーン。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Tsundere (ツンデレ)\",\n",
	" \"text\": \"べ、別にあんたのために作ったわけじゃないんだからね！勘違いしないでよね！\",\n",
	" \"instruct\": \"10代の少女の声。ツンデレな口調。高めのトーン。最初は怒ったように早口で、「べ、別にあんたのために作ったわけじゃないんだからね！」後半は少し照れたように声が小さくなって「勘違いしないでよね！」\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Onee-san (お姉さん)\",\n",
	" \"text\": \"あらあら、迷子になっちゃったの？お姉さんが案内してあげるわ。\",\n",
	" \"instruct\": \"20代女性の声。落ち着いた、包容力のあるお姉さんボイス。ゆっくりとした話し方で、語尾に少し息が混じるような色気を含ませる。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Chuunibyou (中二病)\",\n",
	" \"text\": \"クックック…我が右腕に封印されし黒龍が暴れだす…！鎮まれ、俺の血よ！\",\n",
	" \"instruct\": \"10代男性の声。中二病特有の大げさな演技がかった口調。低音でカッコつけて喋るが、時折素の声が混じるような不安定さを出す。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Genki Girl (元気っ子)\",\n",
	" \"text\": \"おっはよー！今日もいい天気だね！これなら洗濯物もすぐ乾きそう！\",\n",
	" \"instruct\": \"10代少女の声。明るく元気でハキハキとした口調。声量が大きく、抑揚がはっきりしている。聞いているだけで元気が出るような声。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Cool_Kuudere (クール・クーデレ)\",\n",
	" \"text\": \"…任務完了。次の指示を。…何？私の顔に何かついている？\",\n",
	" \"instruct\": \"10代後半の女性の声。感情の起伏が少ない、冷静沈着なトーン。淡々としているが、冷たすぎず、少しミステリアスな雰囲気。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Samurai (侍・古風)\",\n",
	" \"text\": \"拙者は流浪の侍。名はまだ無い。この刀一本で乱世を生き抜くだけでござる。\",\n",
	" \"instruct\": \"20代男性の声。時代劇のような古風な話し方。腹から声を出すような、芯のある凛とした声色。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Villainess (悪役令嬢)\",\n",
	" \"text\": \"オーホッホッホ！このわたくしに逆らおうなんて、100年早くてよ！\",\n",
	" \"instruct\": \"高飛車な自信家の女性の声。高笑いが似合う、少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Villainess (悪役令嬢)\",\n",
	" \"text\": \"オーホッホッホ！このわたくしに逆らおうなんて、100年早くてよ！\",\n",
	" \"instruct\": \"高飛車な自信家の女性の声。少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。「オーホッホッホ！」は高笑いです。「100年早くてよ！」は威圧的に。\"\n",
	" },\n",
	" {\n",
	" \"role\": \"Shota (少年・ショタ)\",\n",
	" \"text\": \"おねえちゃん、これすごいよ！見て見て！僕が見つけたんだ！\",\n",
	" \"instruct\": \"10歳くらいの少年の声。声変わり前の高めの声。無邪気で好奇心旺盛な感じ。少し舌足らずな可愛さを出す。\"\n",
	" }\n",
	"]\n",
	"\n",
	"print(f\"\\nGenerating {len(scenarios)} anime character samples with timing measurements...\\n\")\n",
	"print(\"-\" * 60)\n",
	"\n",
	"# 1. Measure Individual Generation Time\n",
	"for i, s in enumerate(scenarios):\n",
	" text = s[\"text\"]\n",
	" instruct = s[\"instruct\"]\n",
	"\n",
	" # Use Japanese for all as requested\n",
	" language = \"Japanese\"\n",
	"\n",
	" torch.cuda.synchronize()\n",
	" start_time = time.time()\n",
	"\n",
	" wavs, sr = tts.generate_voice_design(\n",
	" text=text,\n",
	" language=language,\n",
	" instruct=instruct,\n",
	" max_new_tokens=2048,\n",
	" )\n",
	"\n",
	" torch.cuda.synchronize()\n",
	" end_time = time.time()\n",
	" elapsed = end_time - start_time\n",
	"\n",
	" role_key = s['role'].replace(' ', '_').replace('(', '').replace(')', '').replace('、', '')\n",
	" filename = f\"anime_sample_{i:02d}_{role_key}.wav\"\n",
	" sf.write(filename, wavs[0], sr)\n",
	"\n",
	" print(f\"[{s['role']}]\\n Text Len: {len(text)} chars \| Time: {elapsed:.3f}s \| Saved: {filename}\")\n",
	"\n",
	"print(\"-\" * 60)\n",
	"\n",
	"# 2. Measure Batch Generation Time (for comparison)\n",
	"print(\"\\nMeasuring Batch Generation Time (All 8 samples at once)...\")\n",
	"texts = [s[\"text\"] for s in scenarios]\n",
	"instructs = [s[\"instruct\"] for s in scenarios]\n",
	"languages = [\"Japanese\"] * len(scenarios)\n",
	"\n",
	"torch.cuda.synchronize()\n",
	"start_time = time.time()\n",
	"\n",
	"_ = tts.generate_voice_design(\n",
	" text=texts,\n",
	" language=languages,\n",
	" instruct=instructs,\n",
	" max_new_tokens=2048,\n",
	")\n",
	"\n",
	"torch.cuda.synchronize()\n",
	"end_time = time.time()\n",
	"elapsed = end_time - start_time\n",
	"print(f\"[Batch Total] Count: {len(texts)} \| Time: {elapsed:.3f}s \| Avg per sample: {elapsed/len(texts):.3f}s\")\n",
	"print(\"-\" * 60)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "5f91638b"
	},
	"source": [
	"import IPython.display as ipd\n",
	"import glob\n",
	"import os\n",
	"from IPython.display import display, Markdown\n",
	"\n",
	"# Re-define scenarios to map back if needed, or just list files\n",
	"# Ideally we want to show the text with the audio.\n",
	"\n",
	"wav_files = sorted(glob.glob(\"anime_sample_*.wav\"))\n",
	"print(f\"Found {len(wav_files)} generated files.\\n\")\n",
	"\n",
	"for wav_file in wav_files:\n",
	" # Try to extract info from filename or just display it\n",
	" # filename format: anime_sample_{i:02d}_{role_key}.wav\n",
	" try:\n",
	" idx = int(wav_file.split('_')[2])\n",
	" # Access scenarios from the previous cell context\n",
	" if 'scenarios' in locals() and idx < len(scenarios):\n",
	" s = scenarios[idx]\n",
	" display(Markdown(f\"### {s['role']}\\nText: `{s['text']}`\\n\\nInstruct: {s['instruct']}\"))\n",
	" else:\n",
	" display(Markdown(f\"### File: {wav_file}\"))\n",
	" except:\n",
	" display(Markdown(f\"### File: {wav_file}\"))\n",
	"\n",
	" ipd.display(ipd.Audio(wav_file))"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"!nvidia-smi"
	],
	"metadata": {
	"id": "TgnCHzod6vZt"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}
No results found