yl4579 · ZZDoog · Jul 31, 2024
diff --git a/Colab/StyleTTS2_Demo_LibriTTS.ipynb b/Colab/StyleTTS2_Demo_LibriTTS.ipynb
@@ -3,8 +3,8 @@
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "view-in-github"
       },
       "source": [
         "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LibriTTS.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@@ -31,11 +31,9 @@
       },
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
-          "text": [
-
-          ]
+          "output_type": "stream",
+          "text": []
         }
       ],
       "source": [
@@ -237,6 +235,9 @@
         "        duration = torch.sigmoid(duration).sum(axis=-1)\n",
         "        pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
         "\n",
+        "        # Eliminate potential noise at the end of the audio during generation.\n",
+        "        if not text[-1].isalnum():\n",
+        "            pred_dur[-1] = 1\n",
         "\n",
         "        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
         "        c_frame = 0\n",
@@ -314,6 +315,9 @@
         "      duration = torch.sigmoid(duration).sum(axis=-1)\n",
         "      pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
         "\n",
+        "      # Eliminate potential noise at the end of the audio during generation.\n",
+        "      if not text[-1].isalnum():\n",
+        "          pred_dur[-1] = 1\n",
         "\n",
         "      pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
         "      c_frame = 0\n",
@@ -397,6 +401,9 @@
         "        duration = torch.sigmoid(duration).sum(axis=-1)\n",
         "        pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
         "\n",
+        "        # Eliminate potential noise at the end of the audio during generation.\n",
+        "        if not text[-1].isalnum():\n",
+        "            pred_dur[-1] = 1\n",
         "\n",
         "        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
         "        c_frame = 0\n",
@@ -994,16 +1001,21 @@
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "hPKg9eYpL00f"
+      },
       "source": [
         "#### Extreme setting (`alpha = 1, beta=1`)\n",
         "This setting uses 0% of the reference timbre and prosody and uses the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker."
-      ],
-      "metadata": {
-        "id": "hPKg9eYpL00f"
-      }
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Ei-7JOccL0bF"
+      },
+      "outputs": [],
       "source": [
         "# unseen speaker\n",
         "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
@@ -1013,25 +1025,25 @@
         "for _ in range(5):\n",
         "    wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
         "    display(ipd.Audio(wav, rate=24000, normalize=False))"
-      ],
-      "metadata": {
-        "id": "Ei-7JOccL0bF"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "FVMPc3bhL3eL"
+      },
       "source": [
         "#### No variation (`alpha = 0, beta=0`)\n",
         "This setting uses 100% of the reference timbre and prosody and does not use the diffusion model at all. This makes the speaker very similar to the reference speaker, but there is no variation."
-      ],
-      "metadata": {
-        "id": "FVMPc3bhL3eL"
-      }
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yh1QZ7uhL4wM"
+      },
+      "outputs": [],
       "source": [
         "# unseen speaker\n",
         "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
@@ -1041,35 +1053,35 @@
         "for _ in range(5):\n",
         "    wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
         "    display(ipd.Audio(wav, rate=24000, normalize=False))"
-      ],
-      "metadata": {
-        "id": "yh1QZ7uhL4wM"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "T0EvkWrAMBDB"
+      },
       "source": [
         "### Extra fun!\n",
         "\n",
         "You can record your own voice and clone it using the pre-trained StyleTTS 2 model here."
-      ],
-      "metadata": {
-        "id": "T0EvkWrAMBDB"
-      }
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "#### Run the following cell to record your voice for 5 seconds. Please keep speaking to have the best effect."
-      ],
       "metadata": {
         "id": "R985j5QONY8I"
-      }
+      },
+      "source": [
+        "#### Run the following cell to record your voice for 5 seconds. Please keep speaking to have the best effect."
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "MWrFs0KWMBpz"
+      },
+      "outputs": [],
       "source": [
         "# all imports\n",
         "from IPython.display import Javascript\n",
@@ -1106,71 +1118,71 @@
         "  with open('audio.wav','wb') as f:\n",
         "    f.write(b)\n",
         "  return 'audio.wav'  # or webm ?"
-      ],
-      "metadata": {
-        "id": "MWrFs0KWMBpz"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "#### Please run this cell and speak:"
-      ],
       "metadata": {
         "id": "z35qXwM0Nhx1"
-      }
+      },
+      "source": [
+        "#### Please run this cell and speak:"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "KUEoFyQBMR-8"
+      },
+      "outputs": [],
       "source": [
         "print('Speak now for 5 seconds.')\n",
         "audio = record(sec=5)\n",
         "import IPython.display as ipd\n",
         "display(ipd.Audio(audio, rate=24000, normalize=False))"
-      ],
-      "metadata": {
-        "id": "KUEoFyQBMR-8"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "#### Synthesize in your own voice"
-      ],
       "metadata": {
         "id": "OQS_7IBpNmM1"
-      }
+      },
+      "source": [
+        "#### Synthesize in your own voice"
+      ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n"
-      ],
+      "execution_count": null,
       "metadata": {
         "cellView": "form",
         "id": "c0I3LY7vM8Ta"
       },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n"
+      ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "reference_dicts = {}\n",
-        "reference_dicts['You'] = audio"
-      ],
+      "execution_count": null,
       "metadata": {
         "id": "80eW-pwxNCxu"
       },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "reference_dicts = {}\n",
+        "reference_dicts['You'] = audio"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yIga6MTuNJaN"
+      },
+      "outputs": [],
       "source": [
         "start = time.time()\n",
         "noise = torch.randn(1,1,256).to(device)\n",
@@ -1185,25 +1197,20 @@
         "    display(ipd.Audio(wav, rate=24000, normalize=False))\n",
         "    print('Reference:')\n",
         "    display(ipd.Audio(path, rate=24000, normalize=False))"
-      ],
-      "metadata": {
-        "id": "yIga6MTuNJaN"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     }
   ],
   "metadata": {
     "accelerator": "GPU",
     "colab": {
-      "provenance": [],
+      "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+",
       "collapsed_sections": [
         "aAGQPfgYIR23",
         "eJdB_nCOIVIN",
         "R985j5QONY8I"
       ],
-      "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+",
-      "include_colab_link": true
+      "include_colab_link": true,
+      "provenance": []
     },
     "kernelspec": {
       "display_name": "Python 3",