From 6cdb105afcd34597ee864126f0cb7cab491417c2 Mon Sep 17 00:00:00 2001 From: "Isaque C. Silva" <152451006+isaquecsilva@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:23:15 -0300 Subject: [PATCH 1/3] Allowing users to continue previous training by downloading last checkpoint file from google drive storage. --- ...piper_multilingual_training_notebook.ipynb | 1098 ++++++++++------- 1 file changed, 635 insertions(+), 463 deletions(-) diff --git a/notebooks/piper_multilingual_training_notebook.ipynb b/notebooks/piper_multilingual_training_notebook.ipynb index d4c385fd..0f98f654 100644 --- a/notebooks/piper_multilingual_training_notebook.ipynb +++ b/notebooks/piper_multilingual_training_notebook.ipynb @@ -1,465 +1,637 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eK3nmYDB6C1a" - }, - "source": [ - "# **[Piper](https://github.com/rhasspy/piper) training notebook.**\n", - "## ![Piper logo](https://contribute.rhasspy.org/img/logo.png)\n", - "\n", - "---\n", - "\n", - "- Notebook made by [rmcpantoja](http://github.com/rmcpantoja)\n", - "- Collaborator: [Xx_Nessu_xX](https://fakeyou.com/profile/Xx_Nessu_xX)\n", - "\n", - "---\n", - "\n", - "# Notes:\n", - "\n", - "- **Things in orange mean that they are important.**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AICh6p5OJybj" - }, - "source": [ - "# 🔧 ***First steps.*** 🔧" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "qyxSMuzjfQrz" - }, - "outputs": [], - "source": [ - "#@markdown ## **Google Colab Anti-Disconnect.** 🔌\n", - "#@markdown ---\n", - "#@markdown #### Avoid automatic disconnection. 
Still, it will disconnect after **6 to 12 hours**.\n", - "\n", - "import IPython\n", - "js_code = '''\n", - "function ClickConnect(){\n", - "console.log(\"Working\");\n", - "document.querySelector(\"colab-toolbar-button#connect\").click()\n", - "}\n", - "setInterval(ClickConnect,60000)\n", - "'''\n", - "display(IPython.display.Javascript(js_code))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ygxzp-xHTC7T" - }, - "outputs": [], - "source": [ - "#@markdown ## **Check GPU type.** 👁️\n", - "#@markdown ---\n", - "#@markdown #### A higher capable GPU can lead to faster training speeds. By default, you will have a **Tesla T4**.\n", - "!nvidia-smi" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "sUNjId07JfAK" - }, - "outputs": [], - "source": [ - "#@markdown # **Mount Google Drive.** 📂\n", - "from google.colab import drive\n", - "drive.mount('/content/drive', force_remount=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "_XwmTVlcUgCh" - }, - "outputs": [], - "source": [ - "#@markdown # **Install software.** 📦\n", - "\n", - "#@markdown ####In this cell the synthesizer and its necessary dependencies to execute the training will be installed. 
(this may take a while)\n", - "\n", - "#@markdown #### **Do you want to use the patch?**\n", - "#@markdown The patch provides the ability to export audio files to the output folder and save a single model while training.\n", - "usepatch = True #@param {type:\"boolean\"}\n", - "#@markdown ---\n", - "# clone:\n", - "!git clone -q https://github.com/rmcpantoja/piper\n", - "%cd /content/piper/src/python\n", - "!wget -q \"https://raw.githubusercontent.com/coqui-ai/TTS/dev/TTS/bin/resample.py\"\n", - "#!pip install -q -r requirements.txt\n", - "!pip install -q cython>=0.29.0 piper-phonemize==1.1.0 librosa>=0.9.2 numpy>=1.19.0 onnxruntime>=1.11.0 pytorch-lightning==1.7.0 torch==1.11.0\n", - "!pip install -q torchtext==0.12.0 torchvision==0.12.0\n", - "# fixing recent compativility isswes:\n", - "!pip install -q torchaudio==0.11.0 torchmetrics==0.11.4\n", - "!bash build_monotonic_align.sh\n", - "!apt-get install -q espeak-ng\n", - "# download patches:\n", - "if usepatch:\n", - " print(\"Downloading patch...\")\n", - " !gdown -q \"1EWEb7amo1rgFGpBFfRD4BKX3pkjVK1I-\" -O \"/content/piper/src/python/patch.zip\"\n", - " !unzip -o -q \"patch.zip\"\n", - "%cd /content" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A3bMzEE0V5Ma" - }, - "source": [ - "# 🤖 ***Training.*** 🤖" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "SvEGjf0aV8eg" - }, - "outputs": [], - "source": [ - "#@markdown # **1. Extract dataset.** 📥\n", - "#@markdown ####Important: the audios must be in **wav format, (16000 or 22050hz, 16-bits, mono), and, for convenience, numbered. 
Example:**\n", - "\n", - "#@markdown * **1.wav**\n", - "#@markdown * **2.wav**\n", - "#@markdown * **3.wav**\n", - "#@markdown * **.....**\n", - "\n", - "#@markdown ---\n", - "\n", - "%cd /content\n", - "!mkdir /content/dataset\n", - "%cd /content/dataset\n", - "!mkdir /content/dataset/wavs\n", - "#@markdown ### Audio dataset path to unzip:\n", - "zip_path = \"/content/drive/MyDrive/Wavs.zip\" #@param {type:\"string\"}\n", - "!unzip \"{zip_path}\" -d /content/dataset/wavs\n", - "#@markdown ---" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "E0W0OCvXXvue" - }, - "outputs": [], - "source": [ - "#@markdown # **2. Upload the transcript file.** 📝\n", - "#@markdown ---\n", - "#@markdown ####**Important: the transcription means writing what the character says in each of the audios, and it must have the following structure:**\n", - "\n", - "#@markdown ##### For a single-speaker dataset:\n", - "#@markdown * wavs/1.wav|This is what my character says in audio 1.\n", - "#@markdown * wavs/2.wav|This, the text that the character says in audio 2.\n", - "#@markdown * ...\n", - "\n", - "#@markdown ##### For a multi-speaker dataset:\n", - "\n", - "#@markdown * wavs/speaker1audio1.wav|speaker1|This is what the first speaker says.\n", - "#@markdown * wavs/speaker1audio2.wav|speaker1|This is another audio of the first speaker.\n", - "#@markdown * wavs/speaker2audio1.wav|speaker2|This is what the second speaker says in the first audio.\n", - "#@markdown * wavs/speaker2audio2.wav|speaker2|This is another audio of the second speaker.\n", - "#@markdown * ...\n", - "\n", - "#@markdown And so on. In addition, the transcript must be in a **.csv format. 
(UTF-8 without BOM)**\n", - "\n", - "#@markdown ---\n", - "%cd /content/dataset\n", - "from google.colab import files\n", - "!rm /content/dataset/metadata.csv\n", - "listfn, length = files.upload().popitem()\n", - "if listfn != \"metadata.csv\":\n", - " !mv \"$listfn\" metadata.csv\n", - "%cd .." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "dOyx9Y6JYvRF" - }, - "outputs": [], - "source": [ - "#@markdown # **3. Preprocess dataset.** 🔄\n", - "\n", - "import os\n", - "#@markdown ### First of all, select the language of your dataset.\n", - "language = \"English (U.S.)\" #@param [\"Català\", \"Dansk\", \"Deutsch\", \"Ελληνικά\", \"English (British)\", \"English (U.S.)\", \"Español\", \"Español (latinoamericano)\", \"Suomi\", \"Français\", \"Magyar\", \"Icelandic\", \"Italiano\", \"ქართული\", \"қазақша\", \"Lëtzebuergesch\", \"नेपाली\", \"Nederlands\", \"Norsk\", \"Polski\", \"Português (Brasil)\", \"Română\", \"Русский\", \"Српски\", \"Svenska\", \"Kiswahili\", \"Türkçe\", \"украї́нська\", \"Tiếng Việt\", \"简体中文\"]\n", - "#@markdown ---\n", - "# language definition:\n", - "languages = {\n", - " \"Català\": \"ca\",\n", - " \"Dansk\": \"da\",\n", - " \"Deutsch\": \"de\",\n", - " \"Ελληνικά\": \"grc\",\n", - " \"English (British)\": \"en\",\n", - " \"English (U.S.)\": \"en-us\",\n", - " \"Español\": \"es\",\n", - " \"Español (latinoamericano)\": \"es-419\",\n", - " \"Suomi\": \"fi\",\n", - " \"Français\": \"fr\",\n", - " \"Magyar\": \"hu\",\n", - " \"Icelandic\": \"is\",\n", - " \"Italiano\": \"it\",\n", - " \"ქართული\": \"ka\",\n", - " \"қазақша\": \"kk\",\n", - " \"Lëtzebuergesch\": \"lb\",\n", - " \"नेपाली\": \"ne\",\n", - " \"Nederlands\": \"nl\",\n", - " \"Norsk\": \"nb\",\n", - " \"Polski\": \"pl\",\n", - " \"Português (Brasil)\": \"pt-br\",\n", - " \"Română\": \"ro\",\n", - " \"Русский\": \"ru\",\n", - " \"Српски\": \"sr\",\n", - " \"Svenska\": \"sv\",\n", - " \"Kiswahili\": \"sw\",\n", - " \"Türkçe\": 
\"tr\",\n", - " \"украї́нська\": \"uk\",\n", - " \"Tiếng Việt\": \"vi\",\n", - " \"简体中文\": \"zh\"\n", - "}\n", - "\n", - "def _get_language(code):\n", - " return languages[code]\n", - "\n", - "final_language = _get_language(language)\n", - "#@markdown ### Choose a name for your model:\n", - "model_name = \"Test\" #@param {type:\"string\"}\n", - "#@markdown ---\n", - "# output:\n", - "#@markdown ### Choose the working folder: (recommended to save to Drive)\n", - "\n", - "#@markdown The working folder will be used in preprocessing, but also in training the model.\n", - "output_path = \"/content/drive/MyDrive/colab/piper\" #@param {type:\"string\"}\n", - "output_dir = output_path+\"/\"+model_name\n", - "if not os.path.exists(output_dir):\n", - " os.makedirs(output_dir)\n", - "#@markdown ---\n", - "#@markdown ### Choose dataset format:\n", - "dataset_format = \"ljspeech\" #@param [\"ljspeech\", \"mycroft\"]\n", - "#@markdown ---\n", - "#@markdown ### Is this a single speaker dataset? Otherwise, uncheck:\n", - "single_speaker = True #@param {type:\"boolean\"}\n", - "if single_speaker:\n", - " force_sp = \" --single-speaker\"\n", - "else:\n", - " force_sp = \"\"\n", - "#@markdown ---\n", - "#@markdown ### Select the sample rate of the dataset:\n", - "sample_rate = \"22050\" #@param [\"16000\", \"22050\"]\n", - "#@markdown ---\n", - "%cd /content/piper/src/python\n", - "#@markdown ### Do you want to train using this sample rate, but your audios don't have it?\n", - "#@markdown The resampler helps you do it quickly!\n", - "resample = False #@param {type:\"boolean\"}\n", - "if resample:\n", - " !python resample.py --input_dir \"/content/dataset/wavs\" --output_dir \"/content/dataset/wavs_resampled\" --output_sr {sample_rate} --file_ext \"wav\"\n", - " !mv /content/dataset/wavs_resampled/* /content/dataset/wavs\n", - "#@markdown ---\n", - "\n", - "!python -m piper_train.preprocess \\\n", - " --language {final_language} \\\n", - " --input-dir /content/dataset \\\n", - " 
--output-dir \"{output_dir}\" \\\n", - " --dataset-format {dataset_format} \\\n", - " --sample-rate {sample_rate} \\\n", - " {force_sp}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ickQlOCRjkBL" - }, - "outputs": [], - "source": [ - "#@markdown # **4. Settings.** 🧰\n", - "import json\n", - "import ipywidgets as widgets\n", - "from IPython.display import display\n", - "from google.colab import output\n", - "import os\n", - "#@markdown ### Select the action to train this dataset:\n", - "\n", - "#@markdown * The option to continue a training is self-explanatory. If you've previously trained a model with free colab, your time is up and you're considering training it some more, this is ideal for you. You just have to set the same settings that you set when you first trained this model.\n", - "#@markdown * The option to convert a single-speaker model to a multi-speaker model is self-explanatory, and for this it is important that you have processed a dataset that contains text and audio from all possible speakers that you want to train in your model.\n", - "#@markdown * The finetune option is used to train a dataset using a pretrained model, that is, train on that data. This option is ideal if you want to train a very small dataset (more than five minutes recommended).\n", - "#@markdown * The train from scratch option builds features such as dictionary and speech form from scratch, and this may take longer to converge. 
For this, hours of audio (8 at least) are recommended, which have a large collection of phonemes.\n", - "\n", - "action = \"finetune\" #@param [\"Continue training\", \"convert single-speaker to multi-speaker model\", \"finetune\", \"train from scratch\"]\n", - "#@markdown ---\n", - "if action == \"Continue training\":\n", - " if os.path.exists(f\"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\"):\n", - " ft_command = f'--resume_from_checkpoint \"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\" '\n", - " print(f\"Continuing {model_name}'s training at: {output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\")\n", - " else:\n", - " raise Exception(\"Training cannot be continued as there is no checkpoint to continue at.\")\n", - "elif action == \"finetune\":\n", - " if os.path.exists(f\"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\"):\n", - " raise Exception(\"Oh no! You have already trained this model before, you cannot choose this option since your progress will be lost, and then your previous time will not count. 
Please select the option to continue a training.\")\n", - " else:\n", - " ft_command = '--resume_from_checkpoint \"/content/pretrained.ckpt\" '\n", - "elif action == \"convert single-speaker to multi-speaker model\":\n", - " if not single_speaker:\n", - " ft_command = '--resume_from_single_speaker_checkpoint \"/content/pretrained.ckpt\" '\n", - " else:\n", - " raise Exception(\"This dataset is not a multi-speaker dataset!\")\n", - "else:\n", - " ft_command = \"\"\n", - "if action== \"convert single-speaker to multi-speaker model\" or action == \"finetune\":\n", - " try:\n", - " with open('/content/piper/notebooks/pretrained_models.json') as f:\n", - " pretrained_models = json.load(f)\n", - " if final_language in pretrained_models:\n", - " models = pretrained_models[final_language]\n", - " model_options = [(model_name, model_name) for model_name, model_url in models.items()]\n", - " model_dropdown = widgets.Dropdown(description = \"Choose pretrained model\", options=model_options)\n", - " download_button = widgets.Button(description=\"Download\")\n", - " def download_model(btn):\n", - " model_name = model_dropdown.value\n", - " model_url = pretrained_models[final_language][model_name]\n", - " print(\"Downloading pretrained model...\")\n", - " if model_url.startswith(\"1\"):\n", - " !gdown -q \"{model_url}\" -O \"/content/pretrained.ckpt\"\n", - " elif model_url.startswith(\"https://drive.google.com/file/d/\"):\n", - " !gdown -q \"{model_url}\" -O \"/content/pretrained.ckpt\" --fuzzy\n", - " else:\n", - " !wget -q \"{model_url}\" -O \"/content/pretrained.ckpt\"\n", - " model_dropdown.close()\n", - " download_button.close()\n", - " output.clear()\n", - " if os.path.exists(\"/content/pretrained.ckpt\"):\n", - " print(\"Model downloaded!\")\n", - " else:\n", - " raise Exception(\"Couldn't download the pretrained model!\")\n", - " download_button.on_click(download_model)\n", - " display(model_dropdown, download_button)\n", - " else:\n", - " raise Exception(f\"There are 
no pretrained models available for the language {final_language}\")\n", - " except FileNotFoundError:\n", - " raise Exception(\"The pretrained_models.json file was not found.\")\n", - "else:\n", - " print(\"Warning: this model will be trained from scratch. You need at least 8 hours of data for everything to work decent. Good luck!\")\n", - "#@markdown ### Choose batch size based on this dataset:\n", - "batch_size = 12 #@param {type:\"integer\"}\n", - "#@markdown ---\n", - "validation_split = 0.01\n", - "#@markdown ### Choose the quality for this model:\n", - "\n", - "#@markdown * x-low - 16Khz audio, 5-7M params\n", - "#@markdown * medium - 22.05Khz audio, 15-20 params\n", - "#@markdown * high - 22.05Khz audio, 28-32M params\n", - "quality = \"medium\" #@param [\"high\", \"x-low\", \"medium\"]\n", - "#@markdown ---\n", - "#@markdown ### For how many epochs to save training checkpoints?\n", - "#@markdown The larger your dataset, you should set this saving interval to a smaller value, as epochs can progress longer time.\n", - "checkpoint_epochs = 5 #@param {type:\"integer\"}\n", - "#@markdown ---\n", - "#@markdown ### Step interval to generate model samples:\n", - "log_every_n_steps = 1000 #@param {type:\"integer\"}\n", - "#@markdown ---\n", - "#@markdown ### Training epochs:\n", - "max_epochs = 10000 #@param {type:\"integer\"}\n", - "#@markdown ---" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "colab": { - "background_save": true - }, - "id": "X4zbSjXg2J3N" - }, - "outputs": [], - "source": [ - "#@markdown # **5. Train.** 🏋️‍♂️\n", - "#@markdown Run this cell to train your final model! 
If possible, some audio samples will be saved during training in the output folder.\n", - "\n", - "get_ipython().system(f'''\n", - "python -m piper_train \\\n", - "--dataset-dir \"{output_dir}\" \\\n", - "--accelerator 'gpu' \\\n", - "--devices 1 \\\n", - "--batch-size {batch_size} \\\n", - "--validation-split {validation_split} \\\n", - "--num-test-examples 2 \\\n", - "--quality {quality} \\\n", - "--checkpoint-epochs {checkpoint_epochs} \\\n", - "--log_every_n_steps {log_every_n_steps} \\\n", - "--max_epochs {max_epochs} \\\n", - "{ft_command}\\\n", - "--precision 32\n", - "''')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6ISG085SYn85" - }, - "source": [ - "# Have you finished training and want to test the model?\n", - "\n", - "* If you want to run this model in any software that Piper integrates or the same Piper app, export your model using the [model exporter notebook](https://colab.research.google.com/github/rmcpantoja/piper/blob/master/notebooks/piper_model_exporter.ipynb)!\n", - "* Wait! I want to test this right now before exporting it to the supported format for Piper. Test your generated last.ckpt with [this notebook](https://colab.research.google.com/github/rmcpantoja/piper/blob/master/notebooks/piper_inference_(ckpt).ipynb)!" 
- ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "provenance": [], - "include_colab_link": true - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "eK3nmYDB6C1a" + }, + "source": [ + "# **[Piper](https://github.com/rhasspy/piper) training notebook.**\n", + "## ![Piper logo](https://contribute.rhasspy.org/img/logo.png)\n", + "\n", + "---\n", + "\n", + "- Notebook made by [rmcpantoja](http://github.com/rmcpantoja)\n", + "- Collaborator: [Xx_Nessu_xX](http://github.com/Xx_Nessu_xX)\n", + "\n", + "---\n", + "\n", + "# Notes:\n", + "\n", + "- **Things in orange mean that they are important.**\n", + "\n", + "# Credits:\n", + "\n", + "* [Feanix-Fyre fork](https://github.com/Feanix-Fyre/piper) with some improvements.\n", + "* [Tacotron2 NVIDIA training notebook](https://github.com/justinjohn0306/FakeYou-Tacotron2-Notebook) - Dataset duration snippet.\n", + "* [🐸TTS](https://github.com/coqui-ai/TTS) - Resampler and XTTS formatter demo." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AICh6p5OJybj" + }, + "source": [ + "# 🔧 ***First steps.*** 🔧" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "qyxSMuzjfQrz" + }, + "outputs": [], + "source": [ + "#@markdown ## **Google Colab Anti-Disconnect.** 🔌\n", + "#@markdown ---\n", + "#@markdown #### Avoid automatic disconnection. 
Still, it will disconnect after **6 to 12 hours**.\n", + "\n", + "import IPython\n", + "js_code = '''\n", + "function ClickConnect(){\n", + "console.log(\"Working\");\n", + "document.querySelector(\"colab-toolbar-button#connect\").click()\n", + "}\n", + "setInterval(ClickConnect,60000)\n", + "'''\n", + "display(IPython.display.Javascript(js_code))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ygxzp-xHTC7T" + }, + "outputs": [], + "source": [ + "#@markdown ## **Check GPU type.** 👁️\n", + "#@markdown ---\n", + "#@markdown #### A higher capable GPU can lead to faster training speeds. By default, you will have a **Tesla T4**.\n", + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "sUNjId07JfAK" + }, + "outputs": [], + "source": [ + "#@markdown # **Mount Google Drive.** 📂\n", + "#@markdown ---\n", + "from google.colab import drive\n", + "drive.mount('/content/drive', force_remount=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "_XwmTVlcUgCh" + }, + "outputs": [], + "source": [ + "#@markdown # **Install software.** 📦\n", + "#@markdown ---\n", + "#@markdown ####In this cell the synthesizer and its necessary dependencies to execute the training will be installed. 
(this may take a while)\n", + "\n", + "# clone:\n", + "!git clone -q https://github.com/rmcpantoja/piper\n", + "%cd /content/piper/src/python\n", + "!wget -q \"https://raw.githubusercontent.com/coqui-ai/TTS/dev/TTS/bin/resample.py\"\n", + "!pip install pip==24.0\n", + "#!pip install -q -r requirements.txt\n", + "!pip install -q cython>=0.29.0 piper-phonemize==1.1.0 librosa>=0.9.2 numpy==1.24 onnxruntime>=1.11.0 pytorch-lightning==1.7.7 torch==1.13.0+cu117 --extra-index-url https://download.pytorch.org/whl/cu117\n", + "!pip install -q torchtext==0.14.0 torchvision==0.14.0\n", + "# fixing recent compatibility issues:\n", + "!pip install -q torchaudio==0.13.0 torchmetrics==0.11.4 faster_whisper\n", + "!pip install --upgrade gdown transformers\n", + "!bash build_monotonic_align.sh\n", + "# Useful vars:\n", + "use_whisper = True\n", + "print(\"Done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A3bMzEE0V5Ma" + }, + "source": [ + "# 🤖 ***Training.*** 🤖" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "SvEGjf0aV8eg" + }, + "outputs": [], + "source": [ + "#@markdown # **1. Extract dataset.** 📥\n", + "#@markdown ---\n", + "#@markdown ####Important: the audios must be in **wav format, (16000 or 22050hz, 16-bits, mono), and, for convenience, numbered. 
Example:**\n", + "\n", + "#@markdown * **1.wav**\n", + "#@markdown * **2.wav**\n", + "#@markdown * **3.wav**\n", + "#@markdown * **.....**\n", + "\n", + "#@markdown ---\n", + "import os\n", + "import wave\n", + "import zipfile\n", + "import datetime\n", + "\n", + "def get_dataset_duration(wav_path):\n", + " totalduration = 0\n", + " for file_name in [x for x in os.listdir(wav_path) if os.path.isfile(x) and \".wav\" in x]:\n", + " with wave.open(file_name, \"rb\") as wave_file:\n", + " frames = wave_file.getnframes()\n", + " rate = wave_file.getframerate()\n", + " duration = frames / float(rate)\n", + " totalduration += duration\n", + " wav_count = len(os.listdir(wav_path))\n", + " duration_str = str(datetime.timedelta(seconds=round(totalduration, 0)))\n", + " return wav_count, duration_str\n", + "\n", + "%cd /content\n", + "if not os.path.exists(\"/content/dataset\"):\n", + " os.makedirs(\"/content/dataset\")\n", + " os.makedirs(\"/content/dataset/wavs\")\n", + "%cd /content/dataset\n", + "#@markdown ### Audio dataset path to unzip:\n", + "zip_path = \"/content/drive/MyDrive/wavs.zip\" #@param {type:\"string\"}\n", + "zip_path = zip_path.strip()\n", + "if zip_path:\n", + " if os.path.exists(zip_path):\n", + " if zipfile.is_zipfile(zip_path):\n", + " print(\"Unzipping audio content...\")\n", + " !unzip -q -j \"{zip_path}\" -d /content/dataset/wavs\n", + " else:\n", + " print(\"Copying audio contents of this folder...\")\n", + " fp = zip_path + \"/.\"\n", + " !cp -a \"$fp\" \"/content/dataset/wavs\"\n", + " else:\n", + " raise Exception(\"The path provided to the wavs is not correct. 
Please set a valid path.\")\n", + "else:\n", + " raise Exception(\"You must provide with a path to the wavs.\")\n", + "if os.path.exists(\"/content/dataset/wavs/wavs\"):\n", + " for file in os.listdir(\"/content/dataset/wavs/wavs\"):\n", + " !mv /content/dataset/wavs/wavs/\"$file\" /content/dataset/wavs/\"$file\"\n", + " !rm -r /content/dataset/wavs/*.txt\n", + " !rm -r /content/dataset/wavs/*.csv\n", + "%cd /content/dataset/wavs\n", + "audio_count, dataset_dur = get_dataset_duration(\"/content/dataset/wavs\")\n", + "print(f\"Opened dataset with {audio_count} wavs with duration {dataset_dur}.\")\n", + "%cd ..\n", + "#@markdown ---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "E0W0OCvXXvue" + }, + "outputs": [], + "source": [ + "#@markdown # **2. Upload the transcript file.** 📝\n", + "#@markdown ---\n", + "#@markdown ####**Important: the transcription means writing what the character says in each of the audios, and it must have the following structure:**\n", + "\n", + "#@markdown ##### For a single-speaker dataset:\n", + "#@markdown * wavs/1.wav|This is what my character says in audio 1.\n", + "#@markdown * wavs/2.wav|This, the text that the character says in audio 2.\n", + "#@markdown * ...\n", + "\n", + "#@markdown ##### For a multi-speaker dataset:\n", + "\n", + "#@markdown * wavs/speaker1audio1.wav|speaker1|This is what the first speaker says.\n", + "#@markdown * wavs/speaker1audio2.wav|speaker1|This is another audio of the first speaker.\n", + "#@markdown * wavs/speaker2audio1.wav|speaker2|This is what the second speaker says in the first audio.\n", + "#@markdown * wavs/speaker2audio2.wav|speaker2|This is another audio of the second speaker.\n", + "#@markdown * ...\n", + "\n", + "#@markdown And so on. In addition, the transcript must be in a **.csv or .txt format. 
(UTF-8 without BOM)**\n", + "\n", + "#@markdown ## ![NEW!](https://web.archive.org/web/20060506120010/http://hk.geocities.com:80/man_minghk/image/new.gif) Auto-transcribe with whisper if transcription is not provided.\n", + "\n", + "#@markdown **Note: If you don't upload any transcription files, the wavs will be transcribed using the whisper tool when you execute the next step. Then, the notebook will continue with the rest of the preprocessing if there are no errors. Although the Whisper tool has good transcription results, in my experience I recommend transcribing manually and uploading it from this cell, since a good TTS voice needs to be optimized to give even better results. For example, when transcribing manually you will be able to observe every detail that the speaker makes (such as punctuation, sounds, etc.), and capture them in the transcription according to the speaker's intonations.**\n", + "\n", + "\n", + "#@markdown However, if you want to transcribe and review this transcription, you can use the individual notebooks:\n", + "\n", + "#@markdown * [English](http://colab.research.google.com/github/rmcpantoja/My-Colab-Notebooks/blob/main/notebooks/OpenAI_Whisper_-_DotCSV_(Speech_dataset_multi-transcryption_support)en.ipynb)\n", + "#@markdown * [French](http://colab.research.google.com/github/rmcpantoja/My-Colab-Notebooks/blob/main/notebooks/OpenAI_Whisper_-_DotCSV_(Speech_dataset_multi-transcryption_support)fr.ipynb)\n", + "#@markdown * [Spanish](http://colab.research.google.com/github/rmcpantoja/My-Colab-Notebooks/blob/main/notebooks/OpenAI_Whisper_-_DotCSV_(Speech_dataset_multi-transcryption_support)es.ipynb)\n", + "\n", + "#@markdown ---\n", + "%cd /content/dataset\n", + "from google.colab import files\n", + "!rm /content/dataset/metadata.csv\n", + "\n", + "if os.path.exists(\"/content/dataset/wavs/_transcription.txt\"):\n", + " !mv \"/content/dataset/wavs/_transcription.txt\" metadata.csv\n", + "else:\n", + " listfn, length = 
files.upload().popitem()\n", + " if listfn != \"metadata.csv\":\n", + " !mv \"$listfn\" metadata.csv\n", + "\n", + "use_whisper = False\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "dOyx9Y6JYvRF" + }, + "outputs": [], + "source": [ + "#@markdown # **3. Preprocess dataset.** 🔄\n", + "#@markdown ---\n", + "import os\n", + "if use_whisper:\n", + " import torch\n", + " from faster_whisper import WhisperModel\n", + " from tqdm import tqdm\n", + " from google import colab\n", + "\n", + " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + " print(f\"Using device: {device}\")\n", + "\n", + " def make_dataset(path, language):\n", + " metadata = \"\"\n", + " text = \"\"\n", + " files = [f for f in os.listdir(path) if f.endswith(\".wav\")]\n", + " assert len(files) > 0, \"You don't have wavs uploaded either! Please upload at least one zip with the wavs in step 2.\"\n", + " metadata_file = open(f\"{path}/../metadata.csv\", \"w\")\n", + " whisper = WhisperModel(\"large-v3\", device=device, compute_type=\"float16\")\n", + " for audio_file in tqdm(files):\n", + " full_path = os.path.join(path, audio_file)\n", + " segments, _ = whisper.transcribe(full_path, word_timestamps=False, language=language)\n", + " for segment in segments:\n", + " text += segment.text\n", + " text = text.strip()\n", + " text = text.replace('\\n', ' ')\n", + " metadata = f\"{audio_file}|{text}\\n\"\n", + " metadata_file.write(metadata)\n", + " text = \"\"\n", + " colab.files.download(f\"{path}/../metadata.csv\")\n", + " del whisper\n", + " return True\n", + "\n", + "#@markdown ### First of all, select the language of your dataset.\n", + "language = \"English (U.S.)\" #@param [\"ألعَرَبِي\", \"Català\", \"čeština\", \"Dansk\", \"Deutsch\", \"Ελληνικά\", \"English (British)\", \"English (U.S.)\", \"Español (Castellano)\", \"Español (Latinoamericano)\", \"Suomi\", \"Français\", \"Magyar\", \"Icelandic\", \"Italiano\", 
\"ქართული\", \"қазақша\", \"Lëtzebuergesch\", \"नेपाली\", \"Nederlands\", \"Norsk\", \"Polski\", \"Português (Brasil)\", \"Português (Portugal)\", \"Română\", \"Русский\", \"Српски\", \"Svenska\", \"Kiswahili\", \"Türkçe\", \"украї́нська\", \"Tiếng Việt\", \"简体中文\"]\n", + "#@markdown ---\n", + "# language definition:\n", + "languages = {\n", + " \"ألعَرَبِي\": \"ar\",\n", + " \"Català\": \"ca\",\n", + " \"čeština\": \"cs\",\n", + " \"Dansk\": \"da\",\n", + " \"Deutsch\": \"de\",\n", + " \"Ελληνικά\": \"el\",\n", + " \"English (British)\": \"en\",\n", + " \"English (U.S.)\": \"en-us\",\n", + " \"Español (Castellano)\": \"es\",\n", + " \"Español (Latinoamericano)\": \"es-419\",\n", + " \"Suomi\": \"fi\",\n", + " \"Français\": \"fr\",\n", + " \"Magyar\": \"hu\",\n", + " \"Icelandic\": \"is\",\n", + " \"Italiano\": \"it\",\n", + " \"ქართული\": \"ka\",\n", + " \"қазақша\": \"kk\",\n", + " \"Lëtzebuergesch\": \"lb\",\n", + " \"नेपाली\": \"ne\",\n", + " \"Nederlands\": \"nl\",\n", + " \"Norsk\": \"nb\",\n", + " \"Polski\": \"pl\",\n", + " \"Português (Brasil)\": \"pt-br\",\n", + " \"Português (Portugal)\": \"pt-pt\",\n", + " \"Română\": \"ro\",\n", + " \"Русский\": \"ru\",\n", + " \"Српски\": \"sr\",\n", + " \"Svenska\": \"sv\",\n", + " \"Kiswahili\": \"sw\",\n", + " \"Türkçe\": \"tr\",\n", + " \"украї́нська\": \"uk\",\n", + " \"Tiếng Việt\": \"vi\",\n", + " \"简体中文\": \"zh\"\n", + "}\n", + "\n", + "def _get_language(code):\n", + " return languages[code]\n", + "\n", + "final_language = _get_language(language)\n", + "#@markdown ### Choose a name for your model:\n", + "model_name = \"Test\" #@param {type:\"string\"}\n", + "#@markdown ---\n", + "# output:\n", + "#@markdown ### Choose the working folder: (recommended to save to Drive)\n", + "\n", + "#@markdown The working folder will be used in preprocessing, but also in training the model.\n", + "output_path = \"/content/drive/MyDrive/colab/piper\" #@param {type:\"string\"}\n", + "output_dir = 
output_path+\"/\"+model_name\n", + "if not os.path.exists(output_dir):\n", + " os.makedirs(output_dir)\n", + "#@markdown ---\n", + "#@markdown ### Choose dataset format:\n", + "dataset_format = \"ljspeech\" #@param [\"ljspeech\", \"mycroft\"]\n", + "#@markdown ---\n", + "#@markdown ### Is this a single speaker dataset? Otherwise, uncheck:\n", + "single_speaker = True #@param {type:\"boolean\"}\n", + "if single_speaker:\n", + " force_sp = \" --single-speaker\"\n", + "else:\n", + " force_sp = \"\"\n", + "#@markdown ---\n", + "#@markdown ### Select the sample rate of the dataset:\n", + "sample_rate = \"22050\" #@param [\"16000\", \"22050\"]\n", + "#@markdown ---\n", + "# creating paths:\n", + "if not os.path.exists(\"/content/audio_cache\"):\n", + " os.makedirs(\"/content/audio_cache\")\n", + "%cd /content/piper/src/python\n", + "#@markdown ### Do you want to train using this sample rate, but your audios don't have it?\n", + "#@markdown The resampler helps you do it quickly!\n", + "resample = False #@param {type:\"boolean\"}\n", + "if resample:\n", + " !python resample.py --input_dir \"/content/dataset/wavs\" --output_dir \"/content/dataset/wavs_resampled\" --output_sr {sample_rate} --file_ext \"wav\"\n", + " !mv /content/dataset/wavs_resampled/* /content/dataset/wavs\n", + "#@markdown ---\n", + "# check transcription:\n", + "if use_whisper:\n", + " print(\"Transcript file hasn't been uploaded. Transcribing these audios using Whisper...\")\n", + " make_dataset(\"/content/dataset/wavs\", final_language[:2])\n", + " print(\"Transcription done! 
Pre-processing...\")\n", + "!python -m piper_train.preprocess \\\n", + " --language {final_language} \\\n", + " --input-dir /content/dataset \\\n", + " --cache-dir \"/content/audio_cache\" \\\n", + " --output-dir \"{output_dir}\" \\\n", + " --dataset-name \"{model_name}\" \\\n", + " --dataset-format {dataset_format} \\\n", + " --sample-rate {sample_rate} \\\n", + " {force_sp}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ickQlOCRjkBL" + }, + "outputs": [], + "source": [ + "#@markdown # **4. Settings.** 🧰\n", + "#@markdown ---\n", + "import json\n", + "import ipywidgets as widgets\n", + "from IPython.display import display\n", + "from google.colab import output\n", + "import os\n", + "import re\n", + "import glob\n", + "#@markdown ### **Select the action to train this dataset: (READ CAREFULLY)**\n", + "\n", + "#@markdown * The option to continue a training is self-explanatory. If you've previously trained a model with free colab, your time is up and you're considering training it some more, this is ideal for you. You just have to set the same settings that you set when you first trained this model.\n", + "#@markdown * The option to convert a single-speaker model to a multi-speaker model is self-explanatory, and for this it is important that you have processed a dataset that contains text and audio from all possible speakers that you want to train in your model.\n", + "#@markdown * The finetune option is used to train a dataset using a pretrained model, that is, train on that data. This option is ideal if you want to train a very small dataset (more than five minutes recommended).\n", + "#@markdown * The train from scratch option builds features such as dictionary and speech form from scratch, and this may take longer to converge. 
For this, hours of audio (8 at least) are recommended, which have a large collection of phonemes.\n", + "\n", + "action = \"finetune\" #@param [\"Continue training\", \"convert single-speaker to multi-speaker model\", \"finetune\", \"train from scratch\"]\n", + "#@markdown ---\n", + "if action == \"Continue training\":\n", + " checkpoints = glob.glob(f\"{output_dir}/lightning_logs/**/checkpoints/last.ckpt\", recursive=True)\n", + " if len(checkpoints):\n", + " last_checkpoint = sorted(checkpoints, key=lambda x: int(re.findall(r'version_(\\d+)', x)[0]))[-1]\n", + " ft_command = f'--resume_from_checkpoint \"{last_checkpoint}\" '\n", + " print(f\"\\033[93mContinuing {model_name}'s training at: {last_checkpoint}\")\n", + " else:\n", + " raise Exception(\"Training cannot be continued as there is no checkpoint to continue at.\")\n", + "elif action == \"finetune\":\n", + " if os.path.exists(f\"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\"):\n", + " raise Exception(\"Oh no! You have already trained this model before, you cannot choose this option since your progress will be lost, and then your previous time will not count. 
Please select the option to continue a training.\")\n", + " else:\n", + " ft_command = '--resume_from_checkpoint \"/content/pretrained.ckpt\" '\n", + "elif action == \"convert single-speaker to multi-speaker model\":\n", + " if not single_speaker:\n", + " ft_command = '--resume_from_single_speaker_checkpoint \"/content/pretrained.ckpt\" '\n", + " else:\n", + " raise Exception(\"This dataset is not a multi-speaker dataset!\")\n", + "else:\n", + " ft_command = \"\"\n", + "if action== \"convert single-speaker to multi-speaker model\" or action == \"finetune\":\n", + " try:\n", + " with open('/content/piper/notebooks/pretrained_models.json') as f:\n", + " pretrained_models = json.load(f)\n", + " if final_language in pretrained_models:\n", + " models = pretrained_models[final_language]\n", + " model_options = [(model_name, model_name) for model_name, model_url in models.items()]\n", + " model_dropdown = widgets.Dropdown(description = \"Choose pretrained model\", options=model_options)\n", + " download_button = widgets.Button(description=\"Download\")\n", + " def download_model(btn):\n", + " model_name = model_dropdown.value\n", + " model_url = pretrained_models[final_language][model_name]\n", + " print(\"\\033[93mDownloading pretrained model...\")\n", + " if model_url.startswith(\"1\"):\n", + " !gdown -q \"{model_url}\" -O \"/content/pretrained.ckpt\"\n", + " elif model_url.startswith(\"https://drive.google.com/file/d/\"):\n", + " !gdown -q \"{model_url}\" -O \"/content/pretrained.ckpt\" --fuzzy\n", + " else:\n", + " !wget -q \"{model_url}\" -O \"/content/pretrained.ckpt\"\n", + " model_dropdown.close()\n", + " download_button.close()\n", + " output.clear()\n", + " if os.path.exists(\"/content/pretrained.ckpt\"):\n", + " print(\"\\033[93mModel downloaded!\")\n", + " else:\n", + " raise Exception(\"Couldn't download the pretrained model!\")\n", + " download_button.on_click(download_model)\n", + " display(model_dropdown, download_button)\n", + " else:\n", + " raise 
Exception(f\"There are no pretrained models available for the language {final_language}\")\n", + " except FileNotFoundError:\n", + " raise Exception(\"The pretrained_models.json file was not found.\")\n", + "else:\n", + " print(\"\\033[93mWarning: this model will be trained from scratch. You need at least 8 hours of data for everything to work decent. Good luck!\")\n", + "#@markdown ### Choose batch size based on this dataset:\n", + "batch_size = 12 #@param {type:\"integer\"}\n", + "#@markdown ---\n", + "\n", + "#@markdown ### Choose the quality for this model:\n", + "\n", + "#@markdown * x-low - 16Khz audio, 5-7M params\n", + "#@markdown * medium - 22.05Khz audio, 15-20M params\n", + "#@markdown * high - 22.05Khz audio, 28-32M params\n", + "quality = \"medium\" #@param [\"high\", \"x-low\", \"medium\"]\n", + "#@markdown ---\n", + "#@markdown ### For how many epochs to save training checkpoints?\n", + "#@markdown The larger your dataset, the smaller you should set this saving interval, since each epoch takes longer.\n", + "checkpoint_epochs = 5 #@param {type:\"integer\"}\n", + "#@markdown ---\n", + "#@markdown ### Interval to save best k models:\n", + "#@markdown Set to 0 if you want to disable saving multiple models. If this is the case, check the checkbox below. If set to 1, models will be saved with the file name epoch=xx-step=xx.ckpt, so you will need to empty Drive's trash every so often.\n", + "num_ckpt = 0 #@param {type:\"integer\"}\n", + "#@markdown ---\n", + "#@markdown ### Save latest model:\n", + "#@markdown This checkbox must be checked if you want to save a single model (last.ckpt). Saving a single model is applied only if num_ckpt is equal to 0. If so, the interval parameter of epochs to save is ignored, since the last model per epoch is saved; also, you won't have to worry about storage. 
If num_ckpt is equal to 1, last.ckpt will be saved along with another model (model_vVersion.ckpt, which follows the epoch interval you set), so you would have to empty the trash often.\n", + "\n", + "#@markdown **It's not recommended to use this option in extremely small datasets, since by saving the last model each epoch, this process will be very fast and the trainer will not be able to save the complete model, which would result in a corrupt last.ckpt.**\n", + "save_last = True # @param {type:\"boolean\"}\n", + "#@markdown ---\n", + "#@markdown ### Step interval to generate model samples:\n", + "log_every_n_steps = 1000 #@param {type:\"integer\"}\n", + "#@markdown ---\n", + "#@markdown ### Training epochs:\n", + "max_epochs = 10000 #@param {type:\"integer\"}\n", + "#@markdown ---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "MpKDfhAHjHJ3" + }, + "outputs": [], + "source": [ + "#@markdown # **5. Run the TensorBoard extension.** 📈\n", + "#@markdown ---\n", + "#@markdown TensorBoard is used to visualize the model's results, such as audio samples and losses, while it is being trained.\n", + "\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir {output_dir}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "X4zbSjXg2J3N" + }, + "outputs": [], + "source": [ + "#@markdown # **6. Train.** 🏋️‍♂️\n", + "#@markdown ---\n", + "#@markdown ### Run this cell to train your final model!\n", + "\n", + "#@markdown ---\n", + "#@markdown ### **Disable validation?**\n", + "#@markdown Unchecking this checkbox trains on the full dataset, without holding out any audio files or examples as a validation set. As a result, no audio samples can be generated in TensorBoard during training. 
It is recommended to disable validation on extremely small datasets.\n", + "\n", + "validation = True #@param {type:\"boolean\"}\n", + "\n", + "#@markdown ---\n", + "\n", + "#@markdown ### **Continue previous training?**\n", + "#@markdown Want to continue training from previous checkpoint file? Fill the input below with a _Google Drive_ share link to your checkpoint file. No previous checkpoint? Just leave it blank.\n", + "\n", + "last_checkpoint = '' #@param {type:\"string\"}\n", + "\n", + "if validation:\n", + " validation_split = 0.01\n", + " num_test_examples = 1\n", + "else:\n", + " validation_split = 0\n", + " num_test_examples = 0\n", + "if not save_last:\n", + " save_last_command = \"\"\n", + "else:\n", + " save_last_command = \"--save_last True \"\n", + "\n", + "resume_from_checkpoint = ''\n", + "\n", + "if not last_checkpoint == '':\n", + " if not os.path.exists('/content/previous-checkpoint'):\n", + " os.makedirs('/content/previous-checkpoint')\n", + "\n", + " # setting the path where to store the previous checkpoint file\n", + " previous_checkpoint_path = '/content/previous-checkpoint/checkpoint.ckpt'\n", + "\n", + " # downloading previous checkpoint from google drive share link\n", + " !gdown -q \"{last_checkpoint}\" -O \"/content/previous-checkpoint/checkpoint.ckpt\" --fuzzy\n", + "\n", + " # defining command arg that will start the training from the last checkpoint\n", + " resume_from_checkpoint = '--resume_from_checkpoint {previous_checkpoint_path}' if single_speaker else '--resume_from_single_speaker_checkpoint {previous_checkpoint_path}'\n", + "\n", + "get_ipython().system(f'''\n", + "python -m piper_train \\\n", + "--dataset-dir \"{output_dir}\" \\\n", + "{resume_from_checkpoint} \\\n", + "--accelerator 'gpu' \\\n", + "--devices 1 \\\n", + "--batch-size {batch_size} \\\n", + "--validation-split {validation_split} \\\n", + "--num-test-examples {num_test_examples} \\\n", + "--quality {quality} \\\n", + "--checkpoint-epochs {checkpoint_epochs} 
\\\n", + "--num_ckpt {num_ckpt} \\\n", + "{save_last_command}\\\n", + "--log_every_n_steps {log_every_n_steps} \\\n", + "--max_epochs {max_epochs} \\\n", + "{ft_command}\\\n", + "--precision 32\n", + "''')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6ISG085SYn85" + }, + "source": [ + "# **Have you finished training and want to test the model?**\n", + "\n", + "* If you want to run this model in any software that Piper integrates or the same Piper app, export your model using the [model exporter notebook](https://colab.research.google.com/github/pitekantrop/piper/blob/master/notebooks/piper_model_exporter.ipynb)!\n", + "* Wait! I want to test this right now before exporting it to the supported format for Piper. Test your generated last.ckpt with [this notebook](https://colab.research.google.com/github/pitekantrop/piper/blob/master/notebooks/piper_inference_(ckpt).ipynb)!" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 30cc9513a127705378066b4b1ed20cacf2415e83 Mon Sep 17 00:00:00 2001 From: "Isaque C. Silva" <152451006+isaquecsilva@users.noreply.github.com> Date: Thu, 31 Oct 2024 19:27:02 -0300 Subject: [PATCH 2/3] bugfix: it was not working before due to bad strings formatting, now fixed. 
--- notebooks/piper_multilingual_training_notebook.ipynb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/notebooks/piper_multilingual_training_notebook.ipynb b/notebooks/piper_multilingual_training_notebook.ipynb index 0f98f654..192982d2 100644 --- a/notebooks/piper_multilingual_training_notebook.ipynb +++ b/notebooks/piper_multilingual_training_notebook.ipynb @@ -579,15 +579,16 @@ " previous_checkpoint_path = '/content/previous-checkpoint/checkpoint.ckpt'\n", "\n", " # downloading previous checkpoint from google drive share link\n", - " !gdown -q \"{last_checkpoint}\" -O \"/content/previous-checkpoint/checkpoint.ckpt\" --fuzzy\n", + " print(f'Downloading previous checkpoint from: {last_checkpoint}')\n", + "\n", + " !gdown -q \"{last_checkpoint}\" -O \"{previous_checkpoint_path}\" --fuzzy\n", "\n", " # defining command arg that will start the training from the last checkpoint\n", - " resume_from_checkpoint = '--resume_from_checkpoint {previous_checkpoint_path}' if single_speaker else '--resume_from_single_speaker_checkpoint {previous_checkpoint_path}'\n", + " resume_from_checkpoint = f'--resume_from_checkpoint=\"{previous_checkpoint_path}\"' if single_speaker else f'--resume_from_single_speaker_checkpoint=\"{previous_checkpoint_path}\"' # NOTE(review): the Continue training action in the Settings cell resumes with --resume_from_checkpoint regardless of single_speaker; confirm the single-speaker flag is intended when continuing a multi-speaker run\n", "\n", "get_ipython().system(f'''\n", "python -m piper_train \\\n", "--dataset-dir \"{output_dir}\" \\\n", - "{resume_from_checkpoint} \\\n", "--accelerator 'gpu' \\\n", "--devices 1 \\\n", "--batch-size {batch_size} \\\n", @@ -600,7 +601,8 @@ "--log_every_n_steps {log_every_n_steps} \\\n", "--max_epochs {max_epochs} \\\n", "{ft_command}\\\n", - "--precision 32\n", + "--precision 32 \\\n", + "{resume_from_checkpoint}\n", "''')\n" ] }, From 638cd51110c985cc601bfc13b3b40f2d6be14860 Mon Sep 17 00:00:00 2001 From: "Isaque C. Silva" <152451006+isaquecsilva@users.noreply.github.com> Date: Thu, 31 Oct 2024 19:35:15 -0300 Subject: [PATCH 3/3] Open In Colab link fixed to match correct branch. 
--- notebooks/piper_multilingual_training_notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/piper_multilingual_training_notebook.ipynb b/notebooks/piper_multilingual_training_notebook.ipynb index 192982d2..c311a438 100644 --- a/notebooks/piper_multilingual_training_notebook.ipynb +++ b/notebooks/piper_multilingual_training_notebook.ipynb @@ -7,7 +7,7 @@ "id": "view-in-github" }, "source": [ - "\"Open" + "\"Open" ] }, {