Created
December 19, 2018 11:50
-
-
Save erogol/8f39174c3f0475221c8978aeb10d4fdc to your computer and use it in GitHub Desktop.
TTS_example.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "TTS_example.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/erogol/8f39174c3f0475221c8978aeb10d4fdc/tts_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "6uMCom74Ft81", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/drive/')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "9wqjz3lIGXZd", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# get TTS to your local\n", | |
"!git clone https://github.com/mozilla/TTS" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "TzjnO4pjGePs", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# download LJSpeech dataset\n", | |
"!wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n", | |
"# decompress \n", | |
"!tar -xvjf LJSpeech-1.1.tar.bz2" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "__k0BrbfLQ-F", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# create train-val splits\n", | |
"!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv\n", | |
"!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv\n", | |
"!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "G1OnsNyJJtem", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# install TTS requirements\n", | |
"%cd TTS\n", | |
"!pip install -r requirements.txt" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "J1XOWu_oKfdv", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# check the default TTS config.json. It is necessary for all your training \n", | |
"# settings\n", | |
"!cat config.json" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "y7_Xao7uNOvX", | |
"colab_type": "code", | |
"outputId": "6400488b-c1f4-45d1-dc40-a9a72543b1f8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 36 | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"%%writefile config.json\n", | |
"// set data fields for LJSpeech\n", | |
"\n", | |
"{\n", | |
" \"model_name\": \"TTS-master\",\n", | |
" \"model_description\": \"Higher dropout rate for stopnet and disabled custom initialization, pull current mel prediction to stopnet.\",\n", | |
"\n", | |
" \"audio\":{\n", | |
" \"audio_processor\": \"audio\", // selects which audio processor to use, if several are available.\n", | |
" // Audio processing parameters\n", | |
" \"num_mels\": 80, // size of the mel spec frame. \n", | |
" \"num_freq\": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.\n", | |
" \"sample_rate\": 22050, // wav sample-rate. If different than the original data, it is resampled.\n", | |
" \"frame_length_ms\": 50, // stft window length in ms.\n", | |
" \"frame_shift_ms\": 12.5, // stft window hop-length in ms.\n", | |
" \"preemphasis\": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.\n", | |
" \"min_level_db\": -100, // normalization range\n", | |
" \"ref_level_db\": 20, // reference level db, theoretically 20db is the sound of air.\n", | |
" \"power\": 1.5, // value to sharpen wav signals after GL algorithm.\n", | |
" \"griffin_lim_iters\": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.\n", | |
" // Normalization parameters\n", | |
" \"signal_norm\": true, // normalize the spec values in range [0, 1]\n", | |
" \"symmetric_norm\": false, // move normalization to range [-1, 1]\n", | |
" \"max_norm\": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]\n", | |
" \"clip_norm\": true, // clip normalized values into the range.\n", | |
" \"mel_fmin\": null, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!\n", | |
" \"mel_fmax\": null, // maximum freq level for mel-spec. Tune for dataset!!\n", | |
" \"do_trim_silence\": true // enable trimming of silence from audio as you load it.\n", | |
" },\n", | |
"\n", | |
" \"embedding_size\": 256, \n", | |
" \"text_cleaner\": \"english_cleaners\",\n", | |
" \"epochs\": 1000,\n", | |
" \n", | |
" \"lr\": 0.0001,\n", | |
" \"lr_decay\": false,\n", | |
" \"warmup_steps\": 4000,\n", | |
"\n", | |
" \"batch_size\": 32,\n", | |
" \"eval_batch_size\":32,\n", | |
" \"r\": 5,\n", | |
" \"wd\": 0.000001,\n", | |
" \"checkpoint\": true,\n", | |
" \"save_step\": 5000,\n", | |
" \"print_step\": 10,\n", | |
"\n", | |
" \"run_eval\": true,\n", | |
" \"data_path\": \"../../Data/LJSpeech-1.1/\", // can be overwritten from command argument\n", | |
" \"meta_file_train\": \"metadata_train.csv\", // metafile for training dataloader\n", | |
" \"meta_file_val\": \"metadata_val.csv\", // metafile for validation dataloader\n", | |
" \"data_loader\": \"TTSDataset\", // dataloader, [\"TTSDataset\", \"TTSDatasetCached\", \"TTSDatasetMemory\"]\n", | |
" \"dataset\": \"ljspeech\", // one of TTS.dataset.preprocessors, only valid if dataloader == \"TTSDataset\", rest uses \"tts_cache\" by default.\n", | |
" \"min_seq_len\": 0,\n", | |
" \"output_path\": \"../keep/\",\n", | |
" \"num_loader_workers\": 2,\n", | |
" \"num_val_loader_workers\": 2\n", | |
"}" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Overwriting config.json\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "1Z7PR2pBLHxq", | |
"colab_type": "code", | |
"outputId": "e285d179-469c-445a-948f-dc706e51e1f4", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1123 | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# pull the trigger\n", | |
"!python train.py --config_path config.json --data_path ../LJSpeech-1.1/ | tee training.log" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" > Using CUDA: True\n", | |
" > Number of GPUs: 1\n", | |
" > Git Hash: 20ee7c0\n", | |
" > Experiment folder: /content/TTS/../keep/December-19-2018_11+32AM-TTS-master-20ee7c0\n", | |
" > Setting up Audio Processor...\n", | |
" | > fft size: 2048, hop length: 275, win length: 1102\n", | |
" | > Audio Processor attributes.\n", | |
" | > bits:None\n", | |
" | > sample_rate:22050\n", | |
" | > num_mels:80\n", | |
" | > min_level_db:-100\n", | |
" | > frame_shift_ms:12.5\n", | |
" | > frame_length_ms:50\n", | |
" | > ref_level_db:20\n", | |
" | > num_freq:1025\n", | |
" | > power:1.5\n", | |
" | > preemphasis:0.97\n", | |
" | > griffin_lim_iters:60\n", | |
" | > signal_norm:True\n", | |
" | > symmetric_norm:False\n", | |
" | > mel_fmin:0\n", | |
" | > mel_fmax:None\n", | |
" | > max_norm:1.0\n", | |
" | > clip_norm:True\n", | |
" | > do_trim_silence:True\n", | |
" | > n_fft:2048\n", | |
" | > hop_length:275\n", | |
" | > win_length:1102\n", | |
" | > Number of characters : 149\n", | |
" | > Num output units : 1025\n", | |
"\n", | |
" > Starting a new training\n", | |
" | > Model has 7083650 parameters\n", | |
" > Reading LJSpeech from - ../LJSpeech-1.1/\n", | |
" | > Number of instances : 12000\n", | |
" | > Max length sequence 187\n", | |
" | > Min length sequence 5\n", | |
" | > Avg length sequence 98.28775\n", | |
" | > 0 instances are ignored by min_seq_len (0)\n", | |
" | > Batch group shuffling is active.\n", | |
" | > Epoch 0/1000\n", | |
" | > Step:9/375 GlobalStep:10 TotalLoss:0.33402 LinearLoss:0.14762 MelLoss:0.18641 StopLoss:0.73977 GradNorm:0.04653 GradNormST:1.01440 AvgTextLen:36.2 AvgSpecLen:172.2 StepTime:2.21 LR:0.000100\n", | |
" | > Step:19/375 GlobalStep:20 TotalLoss:0.29765 LinearLoss:0.14310 MelLoss:0.15455 StopLoss:0.66492 GradNorm:0.06941 GradNormST:0.48356 AvgTextLen:44.8 AvgSpecLen:222.0 StepTime:2.41 LR:0.000100\n", | |
" | > Step:29/375 GlobalStep:30 TotalLoss:0.28474 LinearLoss:0.14459 MelLoss:0.14015 StopLoss:0.72310 GradNorm:0.06081 GradNormST:0.92077 AvgTextLen:48.8 AvgSpecLen:246.3 StepTime:3.07 LR:0.000100\n", | |
" | > Step:39/375 GlobalStep:40 TotalLoss:0.27690 LinearLoss:0.14139 MelLoss:0.13551 StopLoss:0.73683 GradNorm:0.05845 GradNormST:2.58913 AvgTextLen:51.6 AvgSpecLen:254.2 StepTime:3.22 LR:0.000100\n", | |
" | > Step:49/375 GlobalStep:50 TotalLoss:0.26876 LinearLoss:0.13891 MelLoss:0.12986 StopLoss:0.72490 GradNorm:0.03796 GradNormST:3.11640 AvgTextLen:60.6 AvgSpecLen:313.0 StepTime:3.56 LR:0.000100\n", | |
" | > Step:59/375 GlobalStep:60 TotalLoss:0.25832 LinearLoss:0.13163 MelLoss:0.12669 StopLoss:0.74020 GradNorm:0.05046 GradNormST:1.47773 AvgTextLen:65.6 AvgSpecLen:326.4 StepTime:3.71 LR:0.000100\n", | |
" | > Step:69/375 GlobalStep:70 TotalLoss:0.25436 LinearLoss:0.12792 MelLoss:0.12644 StopLoss:0.67741 GradNorm:0.06263 GradNormST:2.02652 AvgTextLen:66.6 AvgSpecLen:321.7 StepTime:3.73 LR:0.000100\n", | |
" | > Step:79/375 GlobalStep:80 TotalLoss:0.25033 LinearLoss:0.12247 MelLoss:0.12785 StopLoss:0.77042 GradNorm:0.05292 GradNormST:0.97249 AvgTextLen:72.3 AvgSpecLen:363.9 StepTime:4.67 LR:0.000100\n", | |
" | > Step:89/375 GlobalStep:90 TotalLoss:0.24469 LinearLoss:0.11755 MelLoss:0.12714 StopLoss:0.67230 GradNorm:0.08123 GradNormST:0.90303 AvgTextLen:79.5 AvgSpecLen:389.7 StepTime:4.24 LR:0.000100\n", | |
" | > Step:99/375 GlobalStep:100 TotalLoss:0.24387 LinearLoss:0.11588 MelLoss:0.12799 StopLoss:0.56327 GradNorm:0.05469 GradNormST:2.77339 AvgTextLen:78.1 AvgSpecLen:393.7 StepTime:3.59 LR:0.000100\n", | |
" | > Step:109/375 GlobalStep:110 TotalLoss:0.23841 LinearLoss:0.11188 MelLoss:0.12652 StopLoss:0.62725 GradNorm:0.11853 GradNormST:1.29791 AvgTextLen:81.9 AvgSpecLen:400.1 StepTime:4.18 LR:0.000100\n", | |
" | > Step:119/375 GlobalStep:120 TotalLoss:0.23700 LinearLoss:0.10967 MelLoss:0.12733 StopLoss:0.60695 GradNorm:0.02969 GradNormST:1.44895 AvgTextLen:82.7 AvgSpecLen:423.1 StepTime:4.22 LR:0.000100\n", | |
" | > Step:129/375 GlobalStep:130 TotalLoss:0.23633 LinearLoss:0.10984 MelLoss:0.12650 StopLoss:0.73151 GradNorm:0.10447 GradNormST:1.45647 AvgTextLen:91.1 AvgSpecLen:462.0 StepTime:5.03 LR:0.000100\n", | |
" | > Step:139/375 GlobalStep:140 TotalLoss:0.23326 LinearLoss:0.10804 MelLoss:0.12522 StopLoss:0.56982 GradNorm:0.04163 GradNormST:1.32453 AvgTextLen:95.1 AvgSpecLen:470.1 StepTime:4.71 LR:0.000100\n", | |
" | > Step:149/375 GlobalStep:150 TotalLoss:0.23332 LinearLoss:0.10898 MelLoss:0.12434 StopLoss:0.64671 GradNorm:0.05904 GradNormST:0.94457 AvgTextLen:93.3 AvgSpecLen:470.4 StepTime:4.87 LR:0.000100\n", | |
" | > Step:159/375 GlobalStep:160 TotalLoss:0.23035 LinearLoss:0.10733 MelLoss:0.12302 StopLoss:0.70120 GradNorm:0.03810 GradNormST:1.59297 AvgTextLen:97.2 AvgSpecLen:487.0 StepTime:5.40 LR:0.000100\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
First command is invalid.