From 06db40706f9aaab367630314a2804a27c0a5282a Mon Sep 17 00:00:00 2001 From: zzdoog Date: Wed, 31 Jul 2024 13:06:17 +0800 Subject: [PATCH] Fix weird pulse at the end of the model --- Colab/StyleTTS2_Demo_LibriTTS.ipynb | 155 ++++++++------- Colab/StyleTTS2_Finetune_Demo.ipynb | 294 ++++++++++++++-------------- Demo/Inference_LibriTTS.ipynb | 11 +- 3 files changed, 240 insertions(+), 220 deletions(-) diff --git a/Colab/StyleTTS2_Demo_LibriTTS.ipynb b/Colab/StyleTTS2_Demo_LibriTTS.ipynb index be125469..839fbd2e 100644 --- a/Colab/StyleTTS2_Demo_LibriTTS.ipynb +++ b/Colab/StyleTTS2_Demo_LibriTTS.ipynb @@ -3,8 +3,8 @@ { "cell_type": "markdown", "metadata": { - "id": "view-in-github", - "colab_type": "text" + "colab_type": "text", + "id": "view-in-github" }, "source": [ "\"Open" @@ -31,11 +31,9 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": [ - - ] + "output_type": "stream", + "text": [] } ], "source": [ @@ -237,6 +235,9 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", @@ -314,6 +315,9 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", @@ -397,6 +401,9 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", @@ -994,16 +1001,21 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "hPKg9eYpL00f" + }, "source": [ "#### Extreme setting (`alpha = 1, beta=1`)\n", "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker." - ], - "metadata": { - "id": "hPKg9eYpL00f" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ei-7JOccL0bF" + }, + "outputs": [], "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", @@ -1013,25 +1025,25 @@ "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" - ], - "metadata": { - "id": "Ei-7JOccL0bF" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "FVMPc3bhL3eL" + }, "source": [ "#### No variation (`alpha = 0, beta=0`)\n", "This setting uses 100% of the reference timbre and prosody and do not use the diffusion model at all. This makes the speaker very similar to the reference speaker, but there is no variation." - ], - "metadata": { - "id": "FVMPc3bhL3eL" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yh1QZ7uhL4wM" + }, + "outputs": [], "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", @@ -1041,35 +1053,35 @@ "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" - ], - "metadata": { - "id": "yh1QZ7uhL4wM" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "T0EvkWrAMBDB" + }, "source": [ "### Extra fun!\n", "\n", "You can record your own voice and clone it using pre-trained StyleTTS 2 model here." - ], - "metadata": { - "id": "T0EvkWrAMBDB" - } + ] }, { "cell_type": "markdown", - "source": [ - "#### Run the following cell to record your voice for 5 seconds. Please keep speaking to have the best effect." - ], "metadata": { "id": "R985j5QONY8I" - } + }, + "source": [ + "#### Run the following cell to record your voice for 5 seconds. Please keep speaking to have the best effect." + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MWrFs0KWMBpz" + }, + "outputs": [], "source": [ "# all imports\n", "from IPython.display import Javascript\n", @@ -1106,71 +1118,71 @@ " with open('audio.wav','wb') as f:\n", " f.write(b)\n", " return 'audio.wav' # or webm ?" - ], - "metadata": { - "id": "MWrFs0KWMBpz" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "#### Please run this cell and speak:" - ], "metadata": { "id": "z35qXwM0Nhx1" - } + }, + "source": [ + "#### Please run this cell and speak:" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KUEoFyQBMR-8" + }, + "outputs": [], "source": [ "print('Speak now for 5 seconds.')\n", "audio = record(sec=5)\n", "import IPython.display as ipd\n", "display(ipd.Audio(audio, rate=24000, normalize=False))" - ], - "metadata": { - "id": "KUEoFyQBMR-8" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "#### Synthesize in your own voice" - ], "metadata": { "id": "OQS_7IBpNmM1" - } + }, + "source": [ + "#### Synthesize in your own voice" + ] }, { "cell_type": "code", - "source": [ - "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n" - ], + "execution_count": null, "metadata": { "cellView": "form", "id": "c0I3LY7vM8Ta" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n" + ] }, { "cell_type": "code", - "source": [ - "reference_dicts = {}\n", - "reference_dicts['You'] = audio" - ], + "execution_count": null, "metadata": { "id": "80eW-pwxNCxu" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "reference_dicts = {}\n", + "reference_dicts['You'] = audio" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yIga6MTuNJaN" + }, + "outputs": [], "source": [ "start = time.time()\n", "noise = torch.randn(1,1,256).to(device)\n", @@ -1185,25 +1197,20 @@ " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print('Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" - ], - "metadata": { - "id": "yIga6MTuNJaN" - }, - "execution_count": null, - "outputs": [] + ] } ], "metadata": { "accelerator": "GPU", "colab": { - "provenance": [], + "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+", "collapsed_sections": [ "aAGQPfgYIR23", "eJdB_nCOIVIN", "R985j5QONY8I" ], - "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+", - "include_colab_link": true + "include_colab_link": true, + "provenance": [] }, "kernelspec": { "display_name": "Python 3", diff --git a/Colab/StyleTTS2_Finetune_Demo.ipynb b/Colab/StyleTTS2_Finetune_Demo.ipynb index facfdadd..de06a154 100644 --- a/Colab/StyleTTS2_Finetune_Demo.ipynb +++ b/Colab/StyleTTS2_Finetune_Demo.ipynb @@ -1,28 +1,10 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4", - "authorship_tag": "ABX9TyNiDU9ykIeYxO86Lmuid+ph", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "markdown", "metadata": { - "id": "view-in-github", - "colab_type": "text" + "colab_type": "text", + "id": "view-in-github" }, "source": [ "\"Open" @@ -30,15 +12,20 @@ }, { "cell_type": "markdown", - "source": [ - "### Install packages and download models" - ], "metadata": { "id": "yLqBa4uYPrqE" - } + }, + "source": [ + "### Install packages and download models" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H72WF06ZPrTF" + }, + "outputs": [], "source": [ "%%shell\n", "git clone https://github.com/yl4579/StyleTTS2.git\n", @@ -47,75 +34,75 @@ "sudo apt-get install espeak-ng\n", "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS\n", "mv StyleTTS2-LibriTTS/Models ." - ], - "metadata": { - "id": "H72WF06ZPrTF" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "G398sL8wPzTB" + }, "source": [ "### Download dataset (LJSpeech, 200 samples, ~15 minutes of data)\n", "\n", "You can definitely do it with fewer samples. This is just a proof of concept with 200 smaples." - ], - "metadata": { - "id": "G398sL8wPzTB" - } + ] }, { "cell_type": "code", - "source": [ - "%cd StyleTTS2\n", - "!rm -rf Data" - ], + "execution_count": null, "metadata": { "id": "kJuQUBrEPy5C" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "%cd StyleTTS2\n", + "!rm -rf Data" + ] }, { "cell_type": "code", - "source": [ - "!gdown --id 1vqz26D3yn7OXS2vbfYxfSnpLS6m6tOFP\n", - "!unzip Data.zip" - ], + "execution_count": null, "metadata": { "id": "mDXW8ZZePuSb" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!gdown --id 1vqz26D3yn7OXS2vbfYxfSnpLS6m6tOFP\n", + "!unzip Data.zip" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "_AlBQREWU8ud" + }, "source": [ "### Change the finetuning config\n", "\n", "Depending on the GPU you got, you may want to change the bacth size, max audio length, epiochs and so on." - ], - "metadata": { - "id": "_AlBQREWU8ud" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7uEITi0hU4I2" + }, + "outputs": [], "source": [ "config_path = \"Configs/config_ft.yml\"\n", "\n", "import yaml\n", "config = yaml.safe_load(open(config_path))" - ], - "metadata": { - "id": "7uEITi0hU4I2" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TPTRgOKSVT4K" + }, + "outputs": [], "source": [ "config['data_params']['root_path'] = \"Data/wavs\"\n", "\n", @@ -125,58 +112,58 @@ "\n", "with open(config_path, 'w') as outfile:\n", " yaml.dump(config, outfile, default_flow_style=True)" - ], - "metadata": { - "id": "TPTRgOKSVT4K" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Start finetuning\n" - ], "metadata": { "id": "uUuB_19NWj2Y" - } + }, + "source": [ + "### Start finetuning\n" + ] }, { "cell_type": "code", - "source": [ - "!python train_finetune.py --config_path ./Configs/config_ft.yml" - ], + "execution_count": null, "metadata": { "id": "HZVAD5GKWm-O" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!python train_finetune.py --config_path ./Configs/config_ft.yml" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "I0_7wsGkXGfc" + }, "source": [ "### Test the model quality\n", "\n", "Note that this mainly serves as a proof of concept due to RAM limitation of free Colab instances. A lot of settings are suboptimal. In the future when DDP works for train_second.py, we will also add mixed precision finetuning to save time and RAM. You can also add SLM adversarial training run if you have paid Colab services (such as A100 with 40G of RAM)." - ], - "metadata": { - "id": "I0_7wsGkXGfc" - } + ] }, { "cell_type": "code", - "source": [ - "import nltk\n", - "nltk.download('punkt')" - ], + "execution_count": null, "metadata": { "id": "OPLphjbncE7p" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "import nltk\n", + "nltk.download('punkt')" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jIIAoDACXJL0" + }, + "outputs": [], "source": [ "import torch\n", "torch.manual_seed(0)\n", @@ -263,39 +250,39 @@ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n", "_ = [model[key].eval() for key in model]\n", "_ = [model[key].to(device) for key in model]" - ], - "metadata": { - "id": "jIIAoDACXJL0" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "files = [f for f in os.listdir(\"Models/LJSpeech/\") if f.endswith('.pth')]\n", - "sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))" - ], + "execution_count": null, "metadata": { "id": "eKXRAyyzcMpQ" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "files = [f for f in os.listdir(\"Models/LJSpeech/\") if f.endswith('.pth')]\n", + "sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))" + ] }, { "cell_type": "code", - "source": [ - "params_whole = torch.load(\"Models/LJSpeech/\" + sorted_files[-1], map_location='cpu')\n", - "params = params_whole['net']" - ], + "execution_count": null, "metadata": { "id": "ULuU9-VDb9Pk" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "params_whole = torch.load(\"Models/LJSpeech/\" + sorted_files[-1], map_location='cpu')\n", + "params = params_whole['net']" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J-U29yIYc2ea" + }, + "outputs": [], "source": [ "for key in model:\n", " if key in params:\n", @@ -314,26 +301,26 @@ "# except:\n", "# _load(params[key], model[key])\n", "_ = [model[key].eval() for key in model]" - ], - "metadata": { - "id": "J-U29yIYc2ea" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule" - ], + "execution_count": null, "metadata": { "id": "jrPQ_Yrwc3n6" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n2CWYNoqc455" + }, + "outputs": [], "source": [ "sampler = DiffusionSampler(\n", " model.diffusion.diffusion,\n", @@ -341,15 +328,15 @@ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n", " clamp=False\n", ")" - ], - "metadata": { - "id": "n2CWYNoqc455" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2x5kVb3nc_eY" + }, + "outputs": [], "source": [ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n", " text = text.strip()\n", @@ -390,6 +377,10 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", + "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", " for i in range(pred_aln_trg.size(0)):\n", @@ -418,50 +409,50 @@ "\n", "\n", " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later" - ], - "metadata": { - "id": "2x5kVb3nc_eY" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Synthesize speech" - ], "metadata": { "id": "O159JnwCc6CC" - } + }, + "source": [ + "### Synthesize speech" + ] }, { "cell_type": "code", - "source": [ - "text = '''Maltby and Company would issue warrants on them deliverable to the importer, and the goods were then passed to be stored in neighboring warehouses.\n", - "'''" - ], + "execution_count": null, "metadata": { "id": "ThciXQ6rc9Eq" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "text = '''Maltby and Company would issue warrants on them deliverable to the importer, and the goods were then passed to be stored in neighboring warehouses.\n", + "'''" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jldPkJyCc83a" + }, + "outputs": [], "source": [ "# get a random reference in the training set, note that it doesn't matter which one you use\n", "path = \"Data/wavs/LJ001-0110.wav\"\n", "# this style vector ref_s can be saved as a parameter together with the model weights\n", "ref_s = compute_style(path)" - ], - "metadata": { - "id": "jldPkJyCc83a" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_mIU0jqDdQ-c" + }, + "outputs": [], "source": [ "start = time.time()\n", "wav = inference(text, ref_s, alpha=0.9, beta=0.9, diffusion_steps=10, embedding_scale=1)\n", @@ -469,12 +460,25 @@ "print(f\"RTF = {rtf:5f}\")\n", "import IPython.display as ipd\n", "display(ipd.Audio(wav, rate=24000, normalize=False))" - ], - "metadata": { - "id": "_mIU0jqDdQ-c" - }, - "execution_count": null, - "outputs": [] + ] } - ] + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyNiDU9ykIeYxO86Lmuid+ph", + "gpuType": "T4", + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/Demo/Inference_LibriTTS.ipynb b/Demo/Inference_LibriTTS.ipynb index 4b85bf5f..29a17ce3 100644 --- a/Demo/Inference_LibriTTS.ipynb +++ b/Demo/Inference_LibriTTS.ipynb @@ -294,7 +294,10 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", - "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", + " \n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", " for i in range(pred_aln_trg.size(0)):\n", @@ -730,6 +733,9 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", @@ -862,6 +868,9 @@ " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", + " # Eliminate potential noise at the end of the audio during generation.\n", + " if not text[-1].isalnum():\n", + " pred_dur[-1] = 1\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n",