Last active
August 1, 2021 13:06
-
-
Save NTT123/c99b5a391af56e0cb8f7b190d3d7f0ee to your computer and use it in GitHub Desktop.
InfoRe MFA Example.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "InfoRe MFA Example.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"mount_file_id": "1APtblq540manNHFWUBz0XhfgcClDDEQB", | |
"authorship_tag": "ABX9TyPtpFafXIyvqYnGZerqgb2A", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/NTT123/c99b5a391af56e0cb8f7b190d3d7f0ee/infore-mfa-example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IPkicKwU8IWj" | |
}, | |
"source": [ | |
"# Install system packages and gdown (used below to fetch data from Google Drive).\n",
"# %pip (rather than !pip3) guarantees the package lands in the running kernel's environment.\n",
"!apt update -y\n",
"%pip install -q gdown"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "G6Z-aDd08hfk" | |
}, | |
"source": [ | |
"%%bash\n",
"# Download the transcript file (scripts.csv) and the InfoRe wav corpus into $data_root.\n",
"data_root=\"./train_data\" # modify this\n",
"pushd .\n",
"mkdir -p \"$data_root\"\n",
"gdown --id 1p4dqtkb4N9WLzggMtPzGB7WnVSOCaIFq -O scripts.csv\n",
"cd \"$data_root\"\n",
"gdown --id 1Pe-5lKT_lZsliv2WxQDai2mjhI9ZMFlj -O infore.zip\n",
"unzip infore.zip\n",
"popd"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "180fmSWhP_Wq", | |
"outputId": "2bde1a36-513b-42eb-a411-acac7f931540" | |
}, | |
"source": [ | |
"%%writefile install_mfa.sh\n",
"#!/bin/bash\n",
"\n",
"## a script to install Montreal Forced Aligner (MFA)\n",
"\n",
"set -e  # abort on the first failed step instead of continuing with a broken install\n",
"\n",
"root_dir=${1:-/tmp/mfa}\n",
"mkdir -p \"$root_dir\"\n",
"cd \"$root_dir\"\n",
"\n",
"# download miniconda3\n",
"wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
"bash Miniconda3-latest-Linux-x86_64.sh -b -p \"$root_dir/miniconda3\" -f\n",
"\n",
"# create py38 env with MFA's conda-forge dependencies\n",
"\"$root_dir/miniconda3/bin/conda\" create -n aligner -c conda-forge openblas python=3.8 openfst pynini ngram baumwelch -y\n",
"source \"$root_dir/miniconda3/bin/activate\" aligner\n",
"\n",
"# install mfa, download kaldi\n",
"pip install montreal-forced-aligner\n",
"mfa thirdparty download\n",
"\n",
"echo -e \"\\n======== DONE ==========\"\n",
"echo -e \"\\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner\"\n",
"echo -e \"\\nTo delete MFA, run: rm -rf $root_dir\"\n",
"echo -e \"\\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA\""
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Writing install_mfa.sh\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UDtqhang9IHT" | |
}, | |
"source": [ | |
"# install MFA, then sanity-check the install by printing the aligner's help text\n",
"!bash ./install_mfa.sh /tmp/mfa # path to install directory\n",
"!source /tmp/mfa/miniconda3/bin/activate aligner; mfa align --help"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1MGHIiU_97LC" | |
}, | |
"source": [ | |
"!cat train_data/lexicon.txt | cut -f 1 > /content/words.txt" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "FfMSocEUf79K", | |
"outputId": "12094e1e-9c5c-4773-a560-71a3a156cd88" | |
}, | |
"source": [ | |
"# paper: https://www.aclweb.org/anthology/W16-5207.pdf\n",
"# title: A non-expert Kaldi recipe for Vietnamese Speech Recognition System\n",
"\n",
"# we are not using the `consonants` as described here\n",
"consonants = [\n",
"    'ngh',\n",
"    'ch', 'gh', 'gi', 'kh', 'ng', 'nh', 'ph', 'qu', 'tr', 'th',\n",
"    'b', 'c', 'd', 'đ', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x'\n",
"]\n",
"vowels = (\n",
"    ['a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y'] +\n",
"    ['á', 'ắ', 'ấ', 'é', 'ế', 'í', 'ó', 'ố', 'ớ', 'ú', 'ứ', 'ý'] +\n",
"    ['à', 'ằ', 'ầ', 'è', 'ề', 'ì', 'ò', 'ồ', 'ờ', 'ù', 'ừ', 'ỳ'] +\n",
"    ['ả', 'ẳ', 'ẩ', 'ẻ', 'ể', 'ỉ', 'ỏ', 'ổ', 'ở', 'ủ', 'ử', 'ỷ'] +\n",
"    ['ã', 'ẵ', 'ẫ', 'ẽ', 'ễ', 'ĩ', 'õ', 'ỗ', 'ỡ', 'ũ', 'ữ', 'ỹ'] +\n",
"    ['ạ', 'ặ', 'ậ', 'ẹ', 'ệ', 'ị', 'ọ', 'ộ', 'ợ', 'ụ', 'ự', 'ỵ']\n",
")\n",
"\n",
"punctuations = ['.', '?', '\"', '\\'', ',', '-', '–', '!', ':', ';', '(', ')', '[', ']', '\\n' ]\n",
"\n",
"alphabet = sorted(set(''.join(consonants + vowels)))\n",
"print(alphabet)\n",
"# phonemes = sorted(consonants + vowels, key=len, reverse=True)\n",
"phonemes = consonants + vowels\n",
"print(phonemes)\n",
"\n",
"# sets give O(1) membership tests inside the tokenizer loop (list lookup is O(n))\n",
"phoneme_set = set(phonemes)\n",
"punctuation_set = set(punctuations)\n",
"\n",
"import unicodedata\n",
"\n",
"def text_to_phonemes(text, keep_punctuation=False):\n",
"    \"\"\"Greedily segment Vietnamese text into phoneme tokens (longest match first).\n",
"\n",
"    The text is NFKC-normalized and lowercased, then scanned left to right,\n",
"    trying 3-, 2- and 1-character matches against the phoneme inventory.\n",
"    Spaces are kept as word separators; punctuation is kept only when\n",
"    `keep_punctuation` is True; any other unmatched character is dropped.\n",
"    \"\"\"\n",
"    text = unicodedata.normalize('NFKC', text.strip().lower())\n",
"    idx = 0\n",
"    out = []\n",
"    while idx < len(text):\n",
"        # try the longest phoneme first: 'ngh' (3), digraphs (2), single letters (1)\n",
"        for l in [3, 2, 1]:\n",
"            if idx + l <= len(text) and text[idx: (idx+l)] in phoneme_set:\n",
"                out.append(text[idx: (idx+l)])\n",
"                idx = idx + l\n",
"                break\n",
"        else:\n",
"            # no phoneme starts at this position: keep separators/punctuation, drop the rest\n",
"            # (the loop condition already guarantees idx < len(text))\n",
"            if keep_punctuation and text[idx] in punctuation_set:\n",
"                out.append(text[idx])\n",
"            if text[idx] == ' ':\n",
"                out.append(text[idx])\n",
"            idx = idx + 1\n",
"    return out"
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'à', 'á', 'â', 'ã', 'è', 'é', 'ê', 'ì', 'í', 'ò', 'ó', 'ô', 'õ', 'ù', 'ú', 'ý', 'ă', 'đ', 'ĩ', 'ũ', 'ơ', 'ư', 'ạ', 'ả', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'ẹ', 'ẻ', 'ẽ', 'ế', 'ề', 'ể', 'ễ', 'ệ', 'ỉ', 'ị', 'ọ', 'ỏ', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'ụ', 'ủ', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'ỳ', 'ỵ', 'ỷ', 'ỹ']\n", | |
"['ngh', 'ch', 'gh', 'gi', 'kh', 'ng', 'nh', 'ph', 'qu', 'tr', 'th', 'b', 'c', 'd', 'đ', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x', 'a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y', 'á', 'ắ', 'ấ', 'é', 'ế', 'í', 'ó', 'ố', 'ớ', 'ú', 'ứ', 'ý', 'à', 'ằ', 'ầ', 'è', 'ề', 'ì', 'ò', 'ồ', 'ờ', 'ù', 'ừ', 'ỳ', 'ả', 'ẳ', 'ẩ', 'ẻ', 'ể', 'ỉ', 'ỏ', 'ổ', 'ở', 'ủ', 'ử', 'ỷ', 'ã', 'ẵ', 'ẫ', 'ẽ', 'ễ', 'ĩ', 'õ', 'ỗ', 'ỡ', 'ũ', 'ữ', 'ỹ', 'ạ', 'ặ', 'ậ', 'ẹ', 'ệ', 'ị', 'ọ', 'ộ', 'ợ', 'ụ', 'ự', 'ỵ']\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ioh4U9iKf_Lx" | |
}, | |
"source": [ | |
"# convert each word in words.txt into its space-separated phoneme sequence;\n",
"# context managers ensure both file handles are closed (the input was leaked before)\n",
"with open('/content/words.txt', 'r') as fin, open('/content/phonemes.txt', 'w') as fout:\n",
"    for line in fin:\n",
"        t = ' '.join(text_to_phonemes(line))\n",
"        fout.write(t + '\\n')"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "b3nMwfzK_g0B" | |
}, | |
"source": [ | |
"# build the MFA lexicon: one 'word<TAB>phonemes' entry per line\n",
"with open('/content/words.txt') as wf, open('/content/phonemes.txt') as pf, \\\n",
"        open('/content/lexicon.txt', 'w') as f:\n",
"    for w, p in zip(wf, pf):\n",
"        w = w.strip()\n",
"        p = p.strip()\n",
"\n",
"        # this is a hack to match phoneme set in the vietTTS repo:\n",
"        # split every multi-character phoneme into its individual letters\n",
"        p = ' '.join(' '.join(list(x)) for x in p.split())\n",
"        # hack ends\n",
"\n",
"        if w == 'q':\n",
"            # special-case pronunciation for the bare letter 'q'\n",
"            p = 'qu i'\n",
"        f.write(f'{w}\\t{p}\\n')"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "TaT_fa_bAhn7" | |
}, | |
"source": [ | |
"# collect all training wav files into a flat /content/wavs folder for MFA\n",
"!mkdir -p /content/wavs\n",
"!cp /content/train_data/*.wav /content/wavs"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "w5k2kXmOBD9q" | |
}, | |
"source": [ | |
"from pathlib import Path\n",
"\n",
"# write one transcript .txt next to each wav (MFA pairs audio and text by file stem);\n",
"# scripts.csv is pipe-delimited: filename|transcript|<third field, unused here>\n",
"with open('scripts.csv') as f:\n",
"    for line in f:\n",
"        fn, txt, t = line.strip().split('|')\n",
"        stem = Path(fn).stem\n",
"        with open(f'/content/wavs/{stem}.txt', 'w') as out:\n",
"            out.write(txt + '\\n')"
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "uwlgRSd19lbK", | |
"outputId": "93d9e0cb-0e25-499a-aafa-bff84469086d" | |
}, | |
"source": [ | |
"!source /tmp/mfa/miniconda3/bin/activate aligner; mfa train --clean -C /content/wavs /content/lexicon.txt /content/InfoRe_Tg" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"All required kaldi binaries were found!\n", | |
"/root/Documents/MFA/wavs/train_and_align.log\n", | |
"INFO - Setting up corpus information...\n", | |
"INFO - Number of speakers in corpus: 1, average number of utterances per speaker: 14935.0\n", | |
"INFO - Parsing dictionary without pronunciation probabilities without silence probabilities\n", | |
"INFO - Creating dictionary information...\n", | |
"INFO - Setting up training data...\n", | |
"Generating base features (mfcc)...\n", | |
"Calculating CMVN...\n", | |
"INFO - Initializing training for mono...\n", | |
"INFO - Initialization complete!\n", | |
"100% 39/39 [21:54<00:00, 33.70s/it]\n", | |
"INFO - Training complete!\n", | |
"INFO - Generating alignments using mono models using 5000 utterances...\n", | |
"INFO - Initializing training for tri...\n", | |
"INFO - Initialization complete!\n", | |
"100% 34/34 [13:54<00:00, 24.54s/it]\n", | |
"INFO - Training complete!\n", | |
"INFO - Generating alignments using tri models using 10000 utterances...\n", | |
"INFO - Initializing training for lda...\n", | |
"INFO - Initialization complete!\n", | |
"100% 35/35 [47:09<00:00, 80.84s/it]\n", | |
"INFO - Training complete!\n", | |
"INFO - Generating alignments using lda models using 10000 utterances...\n", | |
"INFO - Initializing training for sat1...\n", | |
"INFO - Initializing speaker-adapted triphone training...\n", | |
"INFO - Initialization complete!\n", | |
"100% 34/34 [1:10:06<00:00, 123.73s/it]\n", | |
"INFO - Training complete!\n", | |
"INFO - Generating alignments using sat1 models for the whole corpus...\n", | |
"INFO - Initializing training for sat2...\n", | |
"INFO - Initializing speaker-adapted triphone training...\n", | |
"INFO - Initialization complete!\n", | |
"100% 34/34 [1:51:03<00:00, 195.99s/it]\n", | |
"INFO - Training complete!\n", | |
"INFO - Generating alignments using sat2 models for the whole corpus...\n", | |
"WARNING - There were 17 segments/files not aligned. Please see /content/InfoRe_Tg/unaligned.txt for more details on why alignment failed for these files.\n", | |
"INFO - All done!\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VN2uiuDA-6ft" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment