Created
January 23, 2026 13:26
-
-
Save chottokun/39a800222132745c81c13e6cdc00ba70 to your computer and use it in GitHub Desktop.
Qwen3-TTS.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "gpuType": "T4", | |
| "authorship_tag": "ABX9TyPIG2feBIPfrpMF9/Xujsvc", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| }, | |
| "accelerator": "GPU" | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/chottokun/39a800222132745c81c13e6cdc00ba70/qwen3-tts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "bb4aaed8" | |
| }, | |
| "source": [ | |
| "!nvidia-smi\n", | |
| "import os\n", | |
| "if not os.path.exists(\"Qwen3-TTS\"):\n", | |
| " !git clone https://github.com/QwenLM/Qwen3-TTS\n", | |
| "%cd Qwen3-TTS" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "36a01e17" | |
| }, | |
| "source": [ | |
| "import os\n", | |
| "\n", | |
| "if os.path.exists('requirements.txt'):\n", | |
| " !pip install -r requirements.txt\n", | |
| "else:\n", | |
| " print(\"No requirements.txt found. Installing standard libraries...\")\n", | |
| " !pip install torch torchaudio transformers accelerate" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "8bca68f1" | |
| }, | |
| "source": [ | |
| "!pip install -e ." | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "6d970d14" | |
| }, | |
| "source": [ | |
| "import time\n", | |
| "import torch\n", | |
| "import soundfile as sf\n", | |
| "from qwen_tts import Qwen3TTSModel\n", | |
| "\n", | |
| "# Define device\n", | |
| "device = \"cuda:0\"\n", | |
| "# Model ID\n", | |
| "MODEL_PATH = \"Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign\"\n", | |
| "\n", | |
| "print(f\"Loading model from {MODEL_PATH}...\")\n", | |
| "# Tesla T4 does not support Flash Attention 2. Using 'sdpa' instead.\n", | |
| "tts = Qwen3TTSModel.from_pretrained(\n", | |
| " MODEL_PATH,\n", | |
| " device_map=device,\n", | |
| " dtype=torch.bfloat16,\n", | |
| " attn_implementation=\"sdpa\",\n", | |
| ")\n", | |
| "\n", | |
| "# Anime Character Scenarios\n", | |
| "# Each entry holds a display `role`, the `text` passed to the generator, and\n", | |
| "# an `instruct` string describing the desired voice.\n", | |
| "# The Tsundere and Villainess roles each appear twice with identical text but\n", | |
| "# a different `instruct`.\n", | |
| "scenarios = [\n", | |
| " {\n", | |
| " \"role\": \"Tsundere (ツンデレ)\",\n", | |
| " \"text\": \"べ、別にあんたのために作ったわけじゃないんだからね!勘違いしないでよね!\",\n", | |
| " \"instruct\": \"10代の少女の声。ツンデレな口調。最初は怒ったように早口で、後半は少し照れたように声が小さくなる。高めのトーン。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Tsundere (ツンデレ)\",\n", | |
| " \"text\": \"べ、別にあんたのために作ったわけじゃないんだからね!勘違いしないでよね!\",\n", | |
| " \"instruct\": \"10代の少女の声。ツンデレな口調。高めのトーン。最初は怒ったように早口で、「べ、別にあんたのために作ったわけじゃないんだからね!」後半は少し照れたように声が小さくなって「勘違いしないでよね!」\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Onee-san (お姉さん)\",\n", | |
| " \"text\": \"あらあら、迷子になっちゃったの?お姉さんが案内してあげるわ。\",\n", | |
| " \"instruct\": \"20代女性の声。落ち着いた、包容力のあるお姉さんボイス。ゆっくりとした話し方で、語尾に少し息が混じるような色気を含ませる。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Chuunibyou (中二病)\",\n", | |
| " \"text\": \"クックック…我が右腕に封印されし黒龍が暴れだす…!鎮まれ、俺の血よ!\",\n", | |
| " \"instruct\": \"10代男性の声。中二病特有の大げさな演技がかった口調。低音でカッコつけて喋るが、時折素の声が混じるような不安定さを出す。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Genki Girl (元気っ子)\",\n", | |
| " \"text\": \"おっはよー!今日もいい天気だね!これなら洗濯物もすぐ乾きそう!\",\n", | |
| " \"instruct\": \"10代少女の声。明るく元気でハキハキとした口調。声量が大きく、抑揚がはっきりしている。聞いているだけで元気が出るような声。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Cool_Kuudere (クール・クーデレ)\",\n", | |
| " \"text\": \"…任務完了。次の指示を。…何?私の顔に何かついている?\",\n", | |
| " \"instruct\": \"10代後半の女性の声。感情の起伏が少ない、冷静沈着なトーン。淡々としているが、冷たすぎず、少しミステリアスな雰囲気。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Samurai (侍・古風)\",\n", | |
| " \"text\": \"拙者は流浪の侍。名はまだ無い。この刀一本で乱世を生き抜くだけでござる。\",\n", | |
| " \"instruct\": \"20代男性の声。時代劇のような古風な話し方。腹から声を出すような、芯のある凛とした声色。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Villainess (悪役令嬢)\",\n", | |
| " \"text\": \"オーホッホッホ!このわたくしに逆らおうなんて、100年早くてよ!\",\n", | |
| " \"instruct\": \"高飛車な自信家の女性の声。高笑いが似合う、少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Villainess (悪役令嬢)\",\n", | |
| " \"text\": \"オーホッホッホ!このわたくしに逆らおうなんて、100年早くてよ!\",\n", | |
| " \"instruct\": \"高飛車な自信家の女性の声。少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。「オーホッホッホ!」は高笑いです。「100年早くてよ!」は威圧的に。\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"role\": \"Shota (少年・ショタ)\",\n", | |
| " \"text\": \"おねえちゃん、これすごいよ!見て見て!僕が見つけたんだ!\",\n", | |
| " \"instruct\": \"10歳くらいの少年の声。声変わり前の高めの声。無邪気で好奇心旺盛な感じ。少し舌足らずな可愛さを出す。\"\n", | |
| " }\n", | |
| "]\n", | |
| "\n", | |
| "print(f\"\\nGenerating {len(scenarios)} anime character samples with timing measurements...\\n\")\n", | |
| "print(\"-\" * 60)\n", | |
| "\n", | |
| "# 1. Measure Individual Generation Time\n", | |
| "for i, s in enumerate(scenarios):\n", | |
| " text = s[\"text\"]\n", | |
| " instruct = s[\"instruct\"]\n", | |
| "\n", | |
| " # Use Japanese for all as requested\n", | |
| " language = \"Japanese\"\n", | |
| "\n", | |
| " torch.cuda.synchronize()\n", | |
| " start_time = time.time()\n", | |
| "\n", | |
| " wavs, sr = tts.generate_voice_design(\n", | |
| " text=text,\n", | |
| " language=language,\n", | |
| " instruct=instruct,\n", | |
| " max_new_tokens=2048,\n", | |
| " )\n", | |
| "\n", | |
| " torch.cuda.synchronize()\n", | |
| " end_time = time.time()\n", | |
| " elapsed = end_time - start_time\n", | |
| "\n", | |
| " role_key = s['role'].replace(' ', '_').replace('(', '').replace(')', '').replace('、', '')\n", | |
| " filename = f\"anime_sample_{i:02d}_{role_key}.wav\"\n", | |
| " sf.write(filename, wavs[0], sr)\n", | |
| "\n", | |
| " print(f\"[{s['role']}]\\n Text Len: {len(text)} chars | Time: {elapsed:.3f}s | Saved: {filename}\")\n", | |
| "\n", | |
| "print(\"-\" * 60)\n", | |
| "\n", | |
| "# 2. Measure Batch Generation Time (for comparison)\n", | |
| "print(\"\\nMeasuring Batch Generation Time (All 8 samples at once)...\")\n", | |
| "texts = [s[\"text\"] for s in scenarios]\n", | |
| "instructs = [s[\"instruct\"] for s in scenarios]\n", | |
| "languages = [\"Japanese\"] * len(scenarios)\n", | |
| "\n", | |
| "torch.cuda.synchronize()\n", | |
| "start_time = time.time()\n", | |
| "\n", | |
| "_ = tts.generate_voice_design(\n", | |
| " text=texts,\n", | |
| " language=languages,\n", | |
| " instruct=instructs,\n", | |
| " max_new_tokens=2048,\n", | |
| ")\n", | |
| "\n", | |
| "torch.cuda.synchronize()\n", | |
| "end_time = time.time()\n", | |
| "elapsed = end_time - start_time\n", | |
| "print(f\"[Batch Total] Count: {len(texts)} | Time: {elapsed:.3f}s | Avg per sample: {elapsed/len(texts):.3f}s\")\n", | |
| "print(\"-\" * 60)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "5f91638b" | |
| }, | |
| "source": [ | |
| "import IPython.display as ipd\n", | |
| "import glob\n", | |
| "import os\n", | |
| "from IPython.display import display, Markdown\n", | |
| "\n", | |
| "# Re-define scenarios to map back if needed, or just list files\n", | |
| "# Ideally we want to show the text with the audio.\n", | |
| "\n", | |
| "wav_files = sorted(glob.glob(\"anime_sample_*.wav\"))\n", | |
| "print(f\"Found {len(wav_files)} generated files.\\n\")\n", | |
| "\n", | |
| "for wav_file in wav_files:\n", | |
| " # Try to extract info from filename or just display it\n", | |
| " # filename format: anime_sample_{i:02d}_{role_key}.wav\n", | |
| " try:\n", | |
| " idx = int(wav_file.split('_')[2])\n", | |
| " # Access scenarios from the previous cell context\n", | |
| " if 'scenarios' in locals() and idx < len(scenarios):\n", | |
| " s = scenarios[idx]\n", | |
| " display(Markdown(f\"### {s['role']}\\n**Text:** `{s['text']}`\\n\\n**Instruct:** *{s['instruct']}*\"))\n", | |
| " else:\n", | |
| " display(Markdown(f\"### File: {wav_file}\"))\n", | |
| " except:\n", | |
| " display(Markdown(f\"### File: {wav_file}\"))\n", | |
| "\n", | |
| " ipd.display(ipd.Audio(wav_file))" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Report GPU status again; the first cell ran the same command before the model was loaded.\n", | |
| "!nvidia-smi" | |
| ], | |
| "metadata": { | |
| "id": "TgnCHzod6vZt" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment