Skip to content

Instantly share code, notes, and snippets.

@chottokun
Created January 23, 2026 13:26
Show Gist options
  • Select an option

  • Save chottokun/39a800222132745c81c13e6cdc00ba70 to your computer and use it in GitHub Desktop.

Select an option

Save chottokun/39a800222132745c81c13e6cdc00ba70 to your computer and use it in GitHub Desktop.
Qwen3-TTS.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyPIG2feBIPfrpMF9/Xujsvc",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/chottokun/39a800222132745c81c13e6cdc00ba70/qwen3-tts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "bb4aaed8"
},
"source": [
"# Show the attached GPU, then fetch the Qwen3-TTS sources and work from inside the checkout.\n",
"!nvidia-smi\n",
"import os\n",
"\n",
"# Re-running this cell after the %cd below leaves the kernel inside the checkout;\n",
"# skip clone/cd in that case so a nested Qwen3-TTS/Qwen3-TTS copy is never created.\n",
"if os.path.basename(os.getcwd()) != \"Qwen3-TTS\":\n",
"    if not os.path.exists(\"Qwen3-TTS\"):\n",
"        !git clone https://github.com/QwenLM/Qwen3-TTS\n",
"    %cd Qwen3-TTS"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "36a01e17"
},
"source": [
"import os\n",
"\n",
"# Install the repository's declared dependencies when it ships a requirements file;\n",
"# otherwise fall back to a baseline package set (soundfile is included because the\n",
"# generation cell imports it).\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"if os.path.exists('requirements.txt'):\n",
"    %pip install -r requirements.txt\n",
"else:\n",
"    print(\"No requirements.txt found. Installing standard libraries...\")\n",
"    %pip install torch torchaudio transformers accelerate soundfile"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8bca68f1"
},
"source": [
"# Editable install of the current directory (the cloned repository after the earlier %cd);\n",
"# presumably this provides the `qwen_tts` package imported in the generation cell.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install -e ."
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "6d970d14"
},
"source": [
"import time\n",
"import torch\n",
"import soundfile as sf\n",
"from qwen_tts import Qwen3TTSModel\n",
"\n",
"# Checkpoint to load and the GPU it is placed on\n",
"MODEL_PATH = \"Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign\"\n",
"device = \"cuda:0\"\n",
"\n",
"print(f\"Loading model from {MODEL_PATH}...\")\n",
"# The Tesla T4 cannot run Flash Attention 2, so the 'sdpa' attention implementation is requested instead.\n",
"tts = Qwen3TTSModel.from_pretrained(\n",
"    MODEL_PATH, device_map=device, dtype=torch.bfloat16, attn_implementation=\"sdpa\"\n",
")\n",
"\n",
"# Anime Character Scenarios\n",
"# Each entry holds a display label (role), the line to be spoken (text) and the\n",
"# voice-design prompt (instruct). The Tsundere and Villainess roles appear twice:\n",
"# same text, differently worded instruct prompts.\n",
"scenarios = [\n",
"    dict(\n",
"        role=\"Tsundere (ツンデレ)\",\n",
"        text=\"べ、別にあんたのために作ったわけじゃないんだからね!勘違いしないでよね!\",\n",
"        instruct=\"10代の少女の声。ツンデレな口調。最初は怒ったように早口で、後半は少し照れたように声が小さくなる。高めのトーン。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Tsundere (ツンデレ)\",\n",
"        text=\"べ、別にあんたのために作ったわけじゃないんだからね!勘違いしないでよね!\",\n",
"        instruct=\"10代の少女の声。ツンデレな口調。高めのトーン。最初は怒ったように早口で、「べ、別にあんたのために作ったわけじゃないんだからね!」後半は少し照れたように声が小さくなって「勘違いしないでよね!」\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Onee-san (お姉さん)\",\n",
"        text=\"あらあら、迷子になっちゃったの?お姉さんが案内してあげるわ。\",\n",
"        instruct=\"20代女性の声。落ち着いた、包容力のあるお姉さんボイス。ゆっくりとした話し方で、語尾に少し息が混じるような色気を含ませる。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Chuunibyou (中二病)\",\n",
"        text=\"クックック…我が右腕に封印されし黒龍が暴れだす…!鎮まれ、俺の血よ!\",\n",
"        instruct=\"10代男性の声。中二病特有の大げさな演技がかった口調。低音でカッコつけて喋るが、時折素の声が混じるような不安定さを出す。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Genki Girl (元気っ子)\",\n",
"        text=\"おっはよー!今日もいい天気だね!これなら洗濯物もすぐ乾きそう!\",\n",
"        instruct=\"10代少女の声。明るく元気でハキハキとした口調。声量が大きく、抑揚がはっきりしている。聞いているだけで元気が出るような声。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Cool_Kuudere (クール・クーデレ)\",\n",
"        text=\"…任務完了。次の指示を。…何?私の顔に何かついている?\",\n",
"        instruct=\"10代後半の女性の声。感情の起伏が少ない、冷静沈着なトーン。淡々としているが、冷たすぎず、少しミステリアスな雰囲気。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Samurai (侍・古風)\",\n",
"        text=\"拙者は流浪の侍。名はまだ無い。この刀一本で乱世を生き抜くだけでござる。\",\n",
"        instruct=\"20代男性の声。時代劇のような古風な話し方。腹から声を出すような、芯のある凛とした声色。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Villainess (悪役令嬢)\",\n",
"        text=\"オーホッホッホ!このわたくしに逆らおうなんて、100年早くてよ!\",\n",
"        instruct=\"高飛車な自信家の女性の声。高笑いが似合う、少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Villainess (悪役令嬢)\",\n",
"        text=\"オーホッホッホ!このわたくしに逆らおうなんて、100年早くてよ!\",\n",
"        instruct=\"高飛車な自信家の女性の声。少し甲高いトーン。相手を見下すような威圧感と、どこか憎めない愛嬌を混ぜる。「オーホッホッホ!」は高笑いです。「100年早くてよ!」は威圧的に。\",\n",
"    ),\n",
"    dict(\n",
"        role=\"Shota (少年・ショタ)\",\n",
"        text=\"おねえちゃん、これすごいよ!見て見て!僕が見つけたんだ!\",\n",
"        instruct=\"10歳くらいの少年の声。声変わり前の高めの声。無邪気で好奇心旺盛な感じ。少し舌足らずな可愛さを出す。\",\n",
"    ),\n",
"]\n",
"\n",
"print(f\"\\nGenerating {len(scenarios)} anime character samples with timing measurements...\\n\")\n",
"print(\"-\" * 60)\n",
"\n",
"# 1. Measure Individual Generation Time\n",
"# Each scenario is synthesized on its own, timed, and written to a WAV file.\n",
"# Every sample uses Japanese, so the language is set once outside the loop.\n",
"language = \"Japanese\"\n",
"\n",
"for i, s in enumerate(scenarios):\n",
"    text = s[\"text\"]\n",
"    instruct = s[\"instruct\"]\n",
"\n",
"    # Wait for pending GPU work before starting and before stopping the timer,\n",
"    # so the measured interval covers this generation call only.\n",
"    torch.cuda.synchronize()\n",
"    # perf_counter() is monotonic; time.time() can jump when the wall clock is adjusted.\n",
"    start_time = time.perf_counter()\n",
"\n",
"    wavs, sr = tts.generate_voice_design(\n",
"        text=text,\n",
"        language=language,\n",
"        instruct=instruct,\n",
"        max_new_tokens=2048,\n",
"    )\n",
"\n",
"    torch.cuda.synchronize()\n",
"    end_time = time.perf_counter()\n",
"    elapsed = end_time - start_time\n",
"\n",
"    # Build a file-name-friendly key from the role label (spaces -> '_', parentheses and '、' removed).\n",
"    role_key = s['role'].replace(' ', '_').replace('(', '').replace(')', '').replace('、', '')\n",
"    # The zero-padded index prefix is what the playback cell parses to find the scenario again.\n",
"    filename = f\"anime_sample_{i:02d}_{role_key}.wav\"\n",
"    sf.write(filename, wavs[0], sr)\n",
"\n",
"    print(f\"[{s['role']}]\\n Text Len: {len(text)} chars | Time: {elapsed:.3f}s | Saved: {filename}\")\n",
"\n",
"print(\"-\" * 60)\n",
"\n",
"# 2. Measure Batch Generation Time (for comparison)\n",
"# All scenarios are passed to a single generate_voice_design call as parallel lists.\n",
"texts = [s[\"text\"] for s in scenarios]\n",
"instructs = [s[\"instruct\"] for s in scenarios]\n",
"languages = [\"Japanese\"] * len(scenarios)\n",
"# Report the actual batch size rather than a hard-coded count.\n",
"print(f\"\\nMeasuring Batch Generation Time (All {len(texts)} samples at once)...\")\n",
"\n",
"# Synchronize around the call so the interval covers the whole batch and nothing else.\n",
"torch.cuda.synchronize()\n",
"# perf_counter() is monotonic; time.time() can jump when the wall clock is adjusted.\n",
"start_time = time.perf_counter()\n",
"\n",
"# The generated audio is discarded; only the elapsed time is reported.\n",
"_ = tts.generate_voice_design(\n",
"    text=texts,\n",
"    language=languages,\n",
"    instruct=instructs,\n",
"    max_new_tokens=2048,\n",
")\n",
"\n",
"torch.cuda.synchronize()\n",
"end_time = time.perf_counter()\n",
"elapsed = end_time - start_time\n",
"print(f\"[Batch Total] Count: {len(texts)} | Time: {elapsed:.3f}s | Avg per sample: {elapsed/len(texts):.3f}s\")\n",
"print(\"-\" * 60)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5f91638b"
},
"source": [
"import IPython.display as ipd\n",
"import glob\n",
"import os\n",
"from IPython.display import display, Markdown\n",
"\n",
"# List the generated WAV files in name order and render an audio player for each,\n",
"# preceded by the scenario's role/text/instruct when it can be looked up.\n",
"wav_files = sorted(glob.glob(\"anime_sample_*.wav\"))\n",
"print(f\"Found {len(wav_files)} generated files.\\n\")\n",
"\n",
"# `scenarios` comes from the generation cell; use an empty list when this cell runs\n",
"# in a kernel where that cell has not been executed, so only file names are shown.\n",
"known_scenarios = globals().get(\"scenarios\", [])\n",
"\n",
"for wav_file in wav_files:\n",
"    # filename format: anime_sample_{i:02d}_{role_key}.wav\n",
"    try:\n",
"        idx = int(wav_file.split('_')[2])\n",
"    except (IndexError, ValueError):\n",
"        # The name does not carry a numeric index; fall back to the plain file heading.\n",
"        idx = -1\n",
"\n",
"    if 0 <= idx < len(known_scenarios):\n",
"        s = known_scenarios[idx]\n",
"        display(Markdown(f\"### {s['role']}\\n**Text:** `{s['text']}`\\n\\n**Instruct:** *{s['instruct']}*\"))\n",
"    else:\n",
"        display(Markdown(f\"### File: {wav_file}\"))\n",
"\n",
"    ipd.display(ipd.Audio(wav_file))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Print GPU status once more now that generation has run (the first cell printed it before the model was loaded).\n",
"!nvidia-smi"
],
"metadata": {
"id": "TgnCHzod6vZt"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment