From 998c1d2d869c94370d6c1ed5c49524f4a8a93bd5 Mon Sep 17 00:00:00 2001 From: Blaise Date: Mon, 22 Jul 2024 12:57:51 +0200 Subject: [PATCH] Initial refactor - not finished --- .github/workflows/unittest.yml | 4 +- .gitignore | 7 + README.md | 424 +---- api.py | 93 - rvc.py | 1468 ---------------- rvc/configs/v1/32000.json | 2 +- rvc/configs/v1/40000.json | 2 +- rvc/configs/v1/48000.json | 2 +- rvc/configs/v2/32000.json | 2 +- rvc/configs/v2/40000.json | 2 +- rvc/configs/v2/48000.json | 2 +- rvc/infer/infer.py | 146 +- rvc/infer/pipeline.py | 31 +- rvc/lib/FCPEF0Predictor.py | 46 +- rvc/lib/algorithm/generators.py | 4 +- rvc/lib/algorithm/nsf.py | 8 +- rvc/lib/infer_pack/models.py | 14 +- .../modules/F0Predictor/DioF0Predictor.py | 16 +- .../modules/F0Predictor/HarvestF0Predictor.py | 14 +- .../modules/F0Predictor/PMF0Predictor.py | 12 +- rvc/lib/predictor/Dio.py | 16 +- rvc/lib/predictor/FCPE.py | 46 +- rvc/lib/predictor/Harvest.py | 14 +- rvc/lib/predictor/PM.py | 12 +- rvc/lib/predictor/RMVPE.py | 6 +- rvc/lib/predictors/Dio.py | 16 +- rvc/lib/predictors/FCPE.py | 46 +- rvc/lib/predictors/Harvest.py | 14 +- rvc/lib/predictors/PM.py | 12 +- rvc/lib/predictors/RMVPE.py | 8 +- rvc/lib/rmvpe.py | 6 +- rvc/lib/tools/prerequisites_download.py | 16 +- rvc/lib/tools/pretrained_selector.py | 28 +- rvc/lib/tools/tts.py | 7 +- rvc/lib/utils.py | 5 +- rvc/train/data_utils.py | 20 +- rvc/train/extract/extract_f0_print.py | 4 +- rvc/train/extract/extract_feature_print.py | 16 +- rvc/train/extract/preparing_files.py | 14 +- rvc/train/mel_processing.py | 12 +- rvc/train/preprocess/preprocess.py | 10 +- rvc/train/process/extract_index.py | 5 +- rvc/train/process/extract_model.py | 6 +- rvc/train/process/extract_small_model.py | 4 +- rvc/train/train.py | 60 +- rvc/train/utils.py | 116 +- rvc_cli.py | 1540 +++++++++++++++++ uvr.py => uvr_cli.py | 0 48 files changed, 2010 insertions(+), 2348 deletions(-) delete mode 100644 api.py delete mode 100644 rvc.py create mode 100644 rvc_cli.py rename uvr.py => uvr_cli.py (100%) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index b751876..ff1282e 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -27,7 +27,7 @@ jobs: python core.py prerequisites --models "True" - name: Test Preprocess run: | - python core.py preprocess --model_name "Evaluate" --dataset_path "logs/mute/sliced_audios" --sampling_rate "48000" + python core.py preprocess --model_name "Evaluate" --dataset_path "logs/mute/sliced_audios" --sample_rate "48000" - name: Test Extract run: | - python core.py extract --model_name "Evaluate" --sampling_rate "48000" \ No newline at end of file + python core.py extract --model_name "Evaluate" --sample_rate "48000" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5d4b6ac..f196171 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +#temp +docs + +# Ignore logs folder +logs + # Ignore compiled executables *.exe @@ -9,6 +15,7 @@ *.bin *.ckpt *.yaml +*.txt # Ignore Python bytecode files *.pyc diff --git a/README.md b/README.md index 218218c..3d1b2e7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -## RVC_CLI: Retrieval-based Voice Conversion Command Line Interface +## RVC_CLI: 🚀 RVC + UVR = A perfect set of tools for voice cloning, easily and free! + +> [!NOTE] +> The documentation is currently under construction. 
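While the documentation is rebuilt, note that the preprocess and extract subcommands now expect `--sample_rate` rather than the old `--sampling_rate` flag. A minimal sketch, simply mirroring the invocations used by the updated CI workflow above (the `Evaluate` model name and the `logs/mute/sliced_audios` dataset path are the workflow's own test values, not required names):

```bash
python core.py preprocess --model_name "Evaluate" --dataset_path "logs/mute/sliced_audios" --sample_rate "48000"
python core.py extract --model_name "Evaluate" --sample_rate "48000"
```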
[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/iahispano/applio/blob/master/assets/Applio_NoUI.ipynb) @@ -7,13 +10,7 @@ 1. [Installation](#installation) - [Windows](#windows) - [Linux](#linux) -2. [Getting Started](#getting-started) - - [Inference](#inference) - - [Training](#training) - - [UVR](#uvr) - - [Additional Features](#additional-features) -3. [API](#api) -4. [Credits](#credits) +2. [References](#references) ### Installation @@ -21,7 +18,7 @@ Ensure that you have the necessary Python packages installed by following these #### Windows -Execute the [install.bat](./install.bat) file to activate a Conda environment. Afterward, launch the application using `env/python.exe rvc.py` instead of the conventional `python rvc.py` command. +Execute the [install.bat](./install.bat) file to activate a Conda environment. Afterward, launch the application using `env/python.exe cli.py` instead of the conventional `python cli.py` command. #### Linux @@ -32,389 +29,50 @@ chmod +x install.sh ### Getting Started -Download the necessary models and executables by running the following command: - -```bash -python rvc.py prerequisites -``` - -_More information about the prerequisites command [here](#prerequisites-download)_ - For detailed information and command-line options, refer to the help command: ```bash -python rvc.py -h +python rvc_cli.py -h +python uvr_cli.py -h ``` This command provides a clear overview of the available modes and their corresponding parameters, facilitating effective utilization of the RVC CLI. -### Inference - -#### Single Inference - -```bash -python rvc.py infer --f0up_key "f0up_key" --filter_radius "filter_radius" --index_rate "index_rate" --hop_length "hop_length" --rms_mix_rate "rms_mix_rate" --protect "protect" --f0autotune "f0autotune" --f0method "f0method" --input_path "input_path" --output_path "output_path" --pth_path "pth_path" --index_path "index_path" --split_audio "split_audio" --clean_audio "clean_audio" --clean_strength "clean_strength" --export_format "export_format" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| ---------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `f0up_key` | No | 0 | -24 to +24 | Set the pitch of the audio, the higher the value, thehigher the pitch. | -| `filter_radius` | No | 3 | 0 to 10 | If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration. | -| `index_rate` | No | 0.3 | 0.0 to 1.0 | Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio. | -| `hop_length` | No | 128 | 1 to 512 | Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy. | -| `rms_mix_rate` | No | 1 | 0 to 1 | Substitute or blend with the volume envelope of the output. 
The closer the ratio is to 1, the more the output envelope is employed. | -| `protect` | No | 0.33 | 0 to 0.5 | Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect. | -| `f0autotune` | No | False | True or False | Apply a soft autotune to your inferences, recommended for singing conversions. | -| `f0method` | No | rmvpe | pm, harvest, dio, crepe, crepe-tiny, rmvpe, fcpe, hybrid[crepe+rmvpe], hybrid[crepe+fcpe], hybrid[rmvpe+fcpe], hybrid[crepe+rmvpe+fcpe] | Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases. | -| `input_path` | Yes | None | Full path to the input audio file | Full path to the input audio file | -| `output_path` | Yes | None | Full path to the output audio file | Full path to the output audio file | -| `pth_path` | Yes | None | Full path to the pth file | Full path to the pth file | -| `index_path` | Yes | None | Full index file path | Full index file path | -| `split_audio` | No | False | True or False | Split the audio into chunks for inference to obtain better results in some cases. | -| `clean_audio` | No | False | True or False | Clean your audio output using noise detection algorithms, recommended for speaking audios. | -| `clean_strength` | No | 0.7 | 0.0 to 1.0 | Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed. | -| `export_format` | No | WAV | WAV, MP3, FLAC, OGG, M4A | File audio format | -| `embedder_model` | No | hubert | hubert or contentvec | Embedder model to use for the audio conversion. The default model is hubert, which is recommended for most cases. | -| `upscale_audio` | No | False | True or False | Upscale the audio to 48kHz for better results. | - -_Refer to `python rvc.py infer -h` for additional help._ - -#### Batch Inference - -```bash -python rvc.py batch_infer --f0up_key "f0up_key" --filter_radius "filter_radius" --index_rate "index_rate" --hop_length "hop_length" --rms_mix_rate "rms_mix_rate" --protect "protect" --f0autotune "f0autotune" --f0method "f0method" --input_folder_path "input_folder_path" --output_folder_path "output_folder_path" --pth_path "pth_path" --index_path "index_path" --split_audio "split_audio" --clean_audio "clean_audio" --clean_strength "clean_strength" --export_format "export_format" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `f0up_key` | No | 0 | -24 to +24 | Set the pitch of the audio, the higher the value, thehigher the pitch. | -| `filter_radius` | No | 3 | 0 to 10 | If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration. 
| -| `index_rate` | No | 0.3 | 0.0 to 1.0 | Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio. | -| `hop_length` | No | 128 | 1 to 512 | Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy. | -| `rms_mix_rate` | No | 1 | 0 to 1 | Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed. | -| `protect` | No | 0.33 | 0 to 0.5 | Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect. | -| `f0autotune` | No | False | True or False | Apply a soft autotune to your inferences, recommended for singing conversions. | -| `f0method` | No | rmvpe | pm, harvest, dio, crepe, crepe-tiny, rmvpe, fcpe, hybrid[crepe+rmvpe], hybrid[crepe+fcpe], hybrid[rmvpe+fcpe], hybrid[crepe+rmvpe+fcpe] | Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases. | -| `input_folder_path` | Yes | None | Full path to the input audio folder (The folder may only contain audio files) | Full path to the input audio folder | -| `output_folder_path` | Yes | None | Full path to the output audio folder | Full path to the output audio folder | -| `pth_path` | Yes | None | Full path to the pth file | Full path to the pth file | -| `index_path` | Yes | None | Full path to the index file | Full path to the index file | -| `split_audio` | No | False | True or False | Split the audio into chunks for inference to obtain better results in some cases. | -| `clean_audio` | No | False | True or False | Clean your audio output using noise detection algorithms, recommended for speaking audios. | -| `clean_strength` | No | 0.7 | 0.0 to 1.0 | Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed. | -| `export_format` | No | WAV | WAV, MP3, FLAC, OGG, M4A | File audio format | -| `embedder_model` | No | hubert | hubert or contentvec | Embedder model to use for the audio conversion. The default model is hubert, which is recommended for most cases. | -| `upscale_audio` | No | False | True or False | Upscale the audio to 48kHz for better results. 
| - -_Refer to `python rvc.py batch_infer -h` for additional help._ - -#### TTS Inference - -```bash -python rvc.py tts_infer --tts_text "tts_text" --tts_voice "tts_voice" --f0up_key "f0up_key" --filter_radius "filter_radius" --index_rate "index_rate" --hop_length "hop_length" --rms_mix_rate "rms_mix_rate" --protect "protect" --f0autotune "f0autotune" --f0method "f0method" --output_tts_path "output_tts_path" --output_rvc_path "output_rvc_path" --pth_path "pth_path" --index_path "index_path"--split_audio "split_audio" --clean_audio "clean_audio" --clean_strength "clean_strength" --export_format "export_format" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| ----------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tts_text` | Yes | None | Text for TTS synthesis | Text for TTS synthesis | -| `tts_voice` | Yes | None | Voice for TTS synthesis | Voice for TTS synthesis | -| `f0up_key` | No | 0 | -24 to +24 | Set the pitch of the audio, the higher the value, thehigher the pitch. | -| `filter_radius` | No | 3 | 0 to 10 | If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration. | -| `index_rate` | No | 0.3 | 0.0 to 1.0 | Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio. | -| `hop_length` | No | 128 | 1 to 512 | Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy. | -| `rms_mix_rate` | No | 1 | 0 to 1 | Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed. | -| `protect` | No | 0.33 | 0 to 0.5 | Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect. | -| `f0autotune` | No | False | True or False | Apply a soft autotune to your inferences, recommended for singing conversions. | -| `f0method` | No | rmvpe | pm, harvest, dio, crepe, crepe-tiny, rmvpe, fcpe, hybrid[crepe+rmvpe], hybrid[crepe+fcpe], hybrid[rmvpe+fcpe], hybrid[crepe+rmvpe+fcpe] | Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases. 
| -| `output_tts_path` | Yes | None | Full path to the output TTS audio file | Full path to the output TTS audio file | -| `output_rvc_path` | Yes | None | Full path to the input RVC audio file | Full path to the input RVC audio file | -| `pth_path` | Yes | None | Full path to the pth file | Full path to the pth file | -| `index_path` | Yes | None | Full path to the index file | Full path to the index file | -| `split_audio` | No | False | True or False | Split the audio into chunks for inference to obtain better results in some cases. | -| `clean_audio` | No | False | True or False | Clean your audio output using noise detection algorithms, recommended for speaking audios. | -| `clean_strength` | No | 0.7 | 0.0 to 1.0 | Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed. | -| `export_format` | No | WAV | WAV, MP3, FLAC, OGG, M4A | File audio format | -| `embedder_model` | No | hubert | hubert or contentvec | Embedder model to use for the audio conversion. The default model is hubert, which is recommended for most cases. | -| `upscale_audio` | No | False | True or False | Upscale the audio to 48kHz for better results. | - -_Refer to `python rvc.py tts_infer -h` for additional help._ - -### Training - -#### Preprocess Dataset - -```bash -python rvc.py preprocess --model_name "model_name" --dataset_path "dataset_path" --sampling_rate "sampling_rate" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| --------------- | -------- | ------- | ------------------------------------------------------------------------- | ------------------------------- | -| `model_name` | Yes | None | Name of the model | Name of the model | -| `dataset_path` | Yes | None | Full path to the dataset folder (The folder may only contain audio files) | Full path to the dataset folder | -| `sampling_rate` | Yes | None | 32000, 40000, or 48000 | Sampling rate of the audio data | - -_Refer to `python rvc.py preprocess -h` for additional help._ - -#### Extract Features - -```bash -python rvc.py extract --model_name "model_name" --rvc_version "rvc_version" --pitch_guidance "pitch_guidance" --hop_length "hop_length" --sampling_rate "sampling_rate" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| ---------------- | -------- | ------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model_name` | Yes | None | Name of the model | Name of the model | -| `rvc_version` | No | v2 | v1 or v2 | Version of the model | -| `pitch_guidance` | No | True | True or False | By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential. | -| `hop_length` | No | 128 | 1 to 512 | Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy. | -| `sampling_rate` | Yes | None | 32000, 40000, or 48000 | Sampling rate of the audio data | -| `embedder_model` | No | hubert | hubert or contentvec | Embedder model to use for the audio conversion. 
The default model is hubert, which is recommended for most cases. | - -#### Start Training - -```bash -python rvc.py train --model_name "model_name" --rvc_version "rvc_version" --save_every_epoch "save_every_epoch" --save_only_latest "save_only_latest" --save_every_weights "save_every_weights" --total_epoch "total_epoch" --sampling_rate "sampling_rate" --batch_size "batch_size" --gpu "gpu" --pitch_guidance "pitch_guidance" --overtraining_detector "overtraining_detector" --overtraining_threshold "overtraining_threshold" --sync_graph "sync_graph" --pretrained "pretrained" --custom_pretrained "custom_pretrained" [--g_pretrained "g_pretrained"] [--d_pretrained "d_pretrained"] -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| ------------------------ | -------- | ------- | ----------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model_name` | Yes | None | Name of the model | Name of the model | -| `rvc_version` | No | v2 | v1 or v2 | Version of the model | -| `save_every_epoch` | Yes | None | 1 to 50 | Determine at how many epochs the model will saved at. | -| `save_only_latest` | No | False | True or False | Enabling this setting will result in the G and D files saving only their most recent versions, effectively conserving storage space. | -| `save_every_weights` | No | True | True or False | This setting enables you to save the weights of the model at the conclusion of each epoch. | -| `total_epoch` | No | 1000 | 1 to 10000 | Specifies the overall quantity of epochs for the model training process. | -| `sampling_rate` | Yes | None | 32000, 40000, or 48000 | Sampling rate of the audio data | -| `batch_size` | No | 8 | 1 to 50 | It's advisable to align it with the available VRAM of your GPU. A setting of 4 offers improved accuracy but slower processing, while 8 provides faster and standard results. | -| `gpu` | No | 0 | 0 to ∞ separated by - | Specify the number of GPUs you wish to utilize for training by entering them separated by hyphens (-). | -| `pitch_guidance` | No | True | True or False | By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential. | -| `overtraining_detector` | No | False | True or False | Utilize the overtraining detector to prevent overfitting. This feature is particularly valuable for scenarios where the model is at risk of overfitting. | -| `overtraining_threshold` | No | 50 | 1 to 100 | Set the threshold for the overtraining detector. The lower the value, the more sensitive the detector will be. | -| `pretrained` | No | True | True or False | Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality. | -| `custom_pretrained` | No | False | True or False | Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance. 
| -| `g_pretrained` | No | None | Full path to pretrained file G, only if you have used custom_pretrained | Full path to pretrained file G | -| `d_pretrained` | No | None | Full path to pretrained file D, only if you have used custom_pretrained | Full path to pretrained file D | -| `sync_graph` | No | False | True or False | Synchronize the graph of the tensorbaord. Only enable this setting if you are training a new model. | - -_Refer to `python rvc.py train -h` for additional help._ - -#### Generate Index File - -```bash -python rvc.py index --model_name "model_name" --rvc_version "rvc_version" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------- | -------- | ------- | ----------------- | -------------------- | -| `model_name` | Yes | None | Name of the model | Name of the model | -| `rvc_version` | Yes | None | v1 or v2 | Version of the model | - -_Refer to `python rvc.py index -h` for additional help._ - -### UVR - -```bash -python uvr.py [audio_file] [options] -``` - -#### Info and Debugging - -| Parameter Name | Required | Default | Valid Options | Description | -| --------------------- | -------- | ------- | ------------------------- | ---------------------------------------------------------------------- | -| `audio_file` | Yes | None | Any valid audio file path | The path to the audio file you want to separate, in any common format. | -| `-d`, `--debug` | No | False | | Enable debug logging. | -| `-e`, `--env_info` | No | False | | Print environment information and exit. | -| `-l`, `--list_models` | No | False | | List all supported models and exit. | -| `--log_level` | No | info | info, debug, warning | Log level. | - -#### Separation I/O Params - -| Parameter Name | Required | Default | Valid Options | Description | -| ------------------------ | -------- | ---------------------------- | ------------------------- | ---------------------------------- | -| `-m`, `--model_filename` | No | UVR-MDX-NET-Inst_HQ_3.onnx | Any valid model file path | Model to use for separation. | -| `--output_format` | No | WAV | Any common audio format | Output format for separated files. | -| `--output_dir` | No | None | Any valid directory path | Directory to write output files. | -| `--model_file_dir` | No | /tmp/audio-separator-models/ | Any valid directory path | Model files directory. | - -#### Common Separation Parameters - -| Parameter Name | Required | Default | Valid Options | Description | -| ----------------- | -------- | ------- | ------------------------------------------------------- | ---------------------------------------------------------- | -| `--invert_spect` | No | False | | Invert secondary stem using spectrogram. | -| `--normalization` | No | 0.9 | Any float value | Max peak amplitude to normalize input and output audio to. | -| `--single_stem` | No | None | Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other | Output only a single stem. | -| `--sample_rate` | No | 44100 | Any integer value | Modify the sample rate of the output audio. | - -#### MDXC Architecture Parameters - -| Parameter Name | Required | Default | Valid Options | Description | -| ------------------------------------ | -------- | ------- | ----------------- | ---------------------------------------------------------------------------------- | -| `--mdxc_segment_size` | No | 256 | Any integer value | Size of segments for MDXC architecture. 
| -| `--mdxc_override_model_segment_size` | No | False | | Opverride model default segment size instead of using the model default value. | -| `--mdxc_overlap` | No | 8 | 2 to 50 | Amount of overlap between prediction windows for MDXC architecture. | -| `--mdxc_batch_size` | No | 1 | Any integer value | Batch size for MDXC architecture. | -| `--mdxc_pitch_shift` | No | 0 | Any integer value | Shift audio pitch by a number of semitones while processing for MDXC architecture. | - -#### MDX Architecture Parameters - -| Parameter Name | Required | Default | Valid Options | Description | -| ---------------------- | -------- | ------- | ----------------- | ------------------------------------------------------------------ | -| `--mdx_segment_size` | No | 256 | Any integer value | Size of segments for MDX architecture. | -| `--mdx_overlap` | No | 0.25 | 0.001 to 0.999 | Amount of overlap between prediction windows for MDX architecture. | -| `--mdx_batch_size` | No | 1 | Any integer value | Batch size for MDX architecture. | -| `--mdx_hop_length` | No | 1024 | Any integer value | Hop length for MDX architecture. | -| `--mdx_enable_denoise` | No | False | | Enable denoising during separation for MDX architecture. | - -#### Demucs Architecture Parameters - -| Parameter Name | Required | Default | Valid Options | Description | -| --------------------------- | -------- | ------- | ----------------- | ----------------------------------------------------------------- | -| `--demucs_segment_size` | No | Default | Any integer value | Size of segments for Demucs architecture. | -| `--demucs_shifts` | No | 2 | Any integer value | Number of predictions with random shifts for Demucs architecture. | -| `--demucs_overlap` | No | 0.25 | 0.001 to 0.999 | Overlap between prediction windows for Demucs architecture. | -| `--demucs_segments_enabled` | No | True | | Enable segment-wise processing for Demucs architecture. | - -#### VR Architecture Parameters - -| Parameter Name | Required | Default | Valid Options | Description | -| ----------------------------- | -------- | ------- | ----------------- | --------------------------------------------------------------------- | -| `--vr_batch_size` | No | 4 | Any integer value | Batch size for VR architecture. | -| `--vr_window_size` | No | 512 | Any integer value | Window size for VR architecture. | -| `--vr_aggression` | No | 5 | -100 to 100 | Intensity of primary stem extraction for VR architecture. | -| `--vr_enable_tta` | No | False | | Enable Test-Time-Augmentation for VR architecture. | -| `--vr_high_end_process` | No | False | | Mirror the missing frequency range of the output for VR architecture. | -| `--vr_enable_post_process` | No | False | | Identify leftover artifacts within vocal output for VR architecture. | -| `--vr_post_process_threshold` | No | 0.2 | 0.1 to 0.3 | Threshold for post-process feature for VR architecture. 
| - -### Additional Features - -#### Model Extract - -```bash -python rvc.py model_extract --pth_path "pth_path" --model_name "model_name" --sampling_rate "sampling_rate" --pitch_guidance "pitch_guidance" --rvc_version "rvc_version" --epoch "epoch" --step "step" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| ---------------- | -------- | ------- | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `pth_path` | Yes | None | Path to the pth file | Full path to the pth file | -| `model_name` | Yes | None | Name of the model | Name of the model | -| `sampling_rate` | Yes | None | 32000, 40000, or 48000 | Sampling rate of the audio data | -| `pitch_guidance` | Yes | None | True or False | By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential. | -| `rvc_version` | Yes | None | v1 or v2 | Version of the model | -| `epoch` | Yes | None | 1 to 10000 | Specifies the overall quantity of epochs for the model training process. | -| `step` | Yes | None | 1 to ∞ | Specifies the overall quantity of steps for the model training process. | - -#### Model Information - -```bash -python rvc.py model_information --pth_path "pth_path" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------- | -------- | ------- | -------------------- | ------------------------- | -| `pth_path` | Yes | None | Path to the pth file | Full path to the pth file | - -#### Model Blender - -```bash -python rvc.py model_blender --model_name "model_name" --pth_path_1 "pth_path_1" --pth_path_2 "pth_path_2" --ratio "ratio" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------- | -------- | ------- | --------------------------- | -------------------------------- | -| `model_name` | Yes | None | Name of the model | Name of the model | -| `pth_path_1` | Yes | None | Path to the first pth file | Full path to the first pth file | -| `pth_path_2` | Yes | None | Path to the second pth file | Full path to the second pth file | -| `ratio` | No | 0.5 | 0.0 to 1 | Value for blender ratio | - -#### Launch TensorBoard - -```bash -python rvc.py tensorboard -``` - -#### Download Models - -Run the download script with the following command: - -```bash -python rvc.py download --model_link "model_link" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------- | -------- | ------- | --------------------------------------------------------------------------- | ----------------- | -| `model_link` | Yes | None | Link of the model (enclosed in double quotes; Google Drive or Hugging Face) | Link of the model | - -_Refer to `python rvc.py download -h` for additional help._ - -#### Audio Analyzer - -```bash -python rvc.py audio_analyzer --input_path "input_path" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------- | -------- | ------- | --------------------------------- | --------------------------------- | -| `input_path` | Yes | None | Full path to the input audio file | Full path to the input audio file | - -_Refer to `python rvc.py 
audio_analyzer -h` for additional help._ - -#### Prerequisites Download - -```bash -python rvc.py prerequisites --pretraineds_v1 "pretraineds_v1" --pretraineds_v2 "--pretraineds_v2" --models "models" --exe "exe" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| ---------------- | -------- | ------- | ------------- | --------------------------------------------------------------------------------------------- | -| `pretraineds_v1` | No | True | True or False | Download pretrained models for v1 | -| `pretraineds_v2` | No | True | True or False | Download pretrained models for v2 | -| `models` | No | True | True or False | Download models for v1 and v2 | -| `exe` | No | True | True or False | Download the necessary executable files for the CLI to function properly (FFmpeg and FFprobe) | - -### API - -```bash -python rvc.py api --host "host" --port "port" -``` - -| Parameter Name | Required | Default | Valid Options | Description | -| -------------- | -------- | --------- | --------------------- | --------------------- | -| `host` | No | 127.0.0.1 | Value for host IP | Value for host IP | -| `port` | No | 8000 | Value for port number | Value for port number | - -To use the RVC CLI via the API, utilize the provided script. Make API requests to the following endpoints: - -- **Docs**: `/docs` -- **Ping**: `/ping` -- **Infer**: `/infer` -- **Batch Infer**: `/batch_infer` -- **TTS**: `/tts` -- **Preprocess**: `/preprocess` -- **Extract**: `/extract` -- **Train**: `/train` -- **Index**: `/index` -- **Model Information**: `/model_information` -- **Model Fusion**: `/model_fusion` -- **Download**: `/download` - -Make POST requests to these endpoints with the same required parameters as in CLI mode. - -### Credits +### References The RVC CLI builds upon the foundations of the following projects: -- [ContentVec](https://github.com/auspicious3000/contentvec/) by auspicious3000 -- [HIFIGAN](https://github.com/jik876/hifi-gan) by jik876 -- [audio-slicer](https://github.com/openvpi/audio-slicer) by openvpi -- [python-audio-separator](https://github.com/karaokenerds/python-audio-separator) by karaokenerds -- [RMVPE](https://github.com/Dream-High/RMVPE) by Dream-High -- [FCPE](https://github.com/CNChTu/FCPE) by CNChTu -- [VITS](https://github.com/jaywalnut310/vits) by jaywalnut310 -- [So-Vits-SVC](https://github.com/svc-develop-team/so-vits-svc) by svc-develop-team -- [Harmonify](https://huggingface.co/Eempostor/Harmonify) by Eempostor -- [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) by RVC-Project -- [Mangio-RVC-Fork](https://github.com/Mangio621/Mangio-RVC-Fork) by Mangio621 -- [anyf0](https://github.com/SoulMelody/anyf0) by SoulMelody +- **Vocoders:** + + - [HiFi-GAN](https://github.com/jik876/hifi-gan) by jik876 + - [Vocos](https://github.com/gemelo-ai/vocos) by gemelo-ai + - [BigVGAN](https://github.com/NVIDIA/BigVGAN) by NVIDIA + - [BigVSAN](https://github.com/sony/bigvsan) by sony + - [vocoders](https://github.com/reppy4620/vocoders) by reppy4620 + - [vocoder](https://github.com/fishaudio/vocoder) by fishaudio + +- **VC Clients:** + + - [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) by RVC-Project + - [So-Vits-SVC](https://github.com/svc-develop-team/so-vits-svc) by svc-develop-team + - [Mangio-RVC-Fork](https://github.com/Mangio621/Mangio-RVC-Fork) by Mangio621 + - [VITS](https://github.com/jaywalnut310/vits) by jaywalnut310 + - 
[Harmonify](https://huggingface.co/Eempostor/Harmonify) by Eempostor + - [rvc-trainer](https://github.com/thepowerfuldeez/rvc-trainer) by thepowerfuldeez + +- **Pitch Extractors:** + + - [RMVPE](https://github.com/Dream-High/RMVPE) by Dream-High + - [torchfcpe](https://github.com/CNChTu/FCPE) by CNChTu + - [torchcrepe](https://github.com/maxrmorrison/torchcrepe) by maxrmorrison + - [anyf0](https://github.com/SoulMelody/anyf0) by SoulMelody + +- **Other:** + - [FAIRSEQ](https://github.com/facebookresearch/fairseq) by facebookresearch + - [FAISS](https://github.com/facebookresearch/faiss) by facebookresearch + - [ContentVec](https://github.com/auspicious3000/contentvec/) by auspicious3000 + - [audio-slicer](https://github.com/openvpi/audio-slicer) by openvpi + - [python-audio-separator](https://github.com/karaokenerds/python-audio-separator) by karaokenerds + - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) by Anjok07 We acknowledge and appreciate the contributions of the respective authors and communities involved in these projects. diff --git a/api.py b/api.py deleted file mode 100644 index bcaae79..0000000 --- a/api.py +++ /dev/null @@ -1,93 +0,0 @@ -from fastapi import FastAPI, Request -import subprocess -import time - -app = FastAPI() - - -# Helper function to execute commands -def execute_command(command): - try: - result = subprocess.run(command, capture_output=True, text=True) - return {"output": result.stdout, "error": result.stderr} - except Exception as e: - return {"error": str(e)} - - -# Infer -@app.post("/infer") -async def infer(request: Request): - command = ["python", "rvc.py", "infer"] + await request.json() - return execute_command(command) - - -# Batch Infer -@app.post("/batch_infer") -async def batch_infer(request: Request): - command = ["python", "rvc.py", "batch_infer"] + await request.json() - return execute_command(command) - - -# TTS -@app.post("/tts") -async def tts(request: Request): - command = ["python", "rvc.py", "tts"] + await request.json() - return execute_command(command) - - -# Preprocess -@app.post("/preprocess") -async def preprocess(request: Request): - command = ["python", "rvc.py", "preprocess"] + await request.json() - return execute_command(command) - - -# Extract -@app.post("/extract") -async def extract(request: Request): - command = ["python", "rvc.py", "extract"] + await request.json() - return execute_command(command) - - -# Train -@app.post("/train") -async def train(request: Request): - command = ["python", "rvc.py", "train"] + await request.json() - return execute_command(command) - - -# Index -@app.post("/index") -async def index(request: Request): - command = ["python", "rvc.py", "index"] + await request.json() - return execute_command(command) - - -# Model Information -@app.post("/model_information") -async def model_information(request: Request): - command = ["python", "rvc.py", "model_information"] + await request.json() - return execute_command(command) - - -# Model Fusion -@app.post("/model_fusion") -async def model_fusion(request: Request): - command = ["python", "rvc.py", "model_fusion"] + await request.json() - return execute_command(command) - - -# Download -@app.post("/download") -async def download(request: Request): - command = ["python", "rvc.py", "download"] + await request.json() - return execute_command(command) - - -# Ping endpoint to check latency -@app.get("/ping") -async def ping(): - start_time = time.time() - end_time = time.time() - latency = end_time - start_time - return {"ping": 
"pong", "latency": latency} diff --git a/rvc.py b/rvc.py deleted file mode 100644 index bda832e..0000000 --- a/rvc.py +++ /dev/null @@ -1,1468 +0,0 @@ -import os -import sys -import json -import argparse -import subprocess - -now_dir = os.getcwd() -sys.path.append(now_dir) - -from rvc.configs.config import Config - -from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline -from rvc.train.extract.preparing_files import generate_config, generate_filelist -from rvc.lib.tools.pretrained_selector import pretrained_selector - -from rvc.train.process.model_blender import model_blender -from rvc.train.process.model_information import model_information -from rvc.train.process.extract_small_model import extract_small_model - -from rvc.infer.infer import VoiceConverter - -infer_pipeline = VoiceConverter() - -from rvc.lib.tools.analyzer import analyze_audio - -from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline - -from rvc.lib.tools.model_download import model_download_pipeline - -config = Config() -current_script_directory = os.path.dirname(os.path.realpath(__file__)) -logs_path = os.path.join(current_script_directory, "logs") - -# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4 -with open(os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r") as f: - voices_data = json.load(f) - - -locales = list({voice["Locale"] for voice in voices_data}) -python = sys.executable - - -# Infer -def run_infer_script( - f0_up_key, - filter_radius, - index_rate, - rms_mix_rate, - protect, - hop_length, - f0_method, - input_path, - output_path, - pth_path, - index_path, - split_audio, - f0_autotune, - clean_audio, - clean_strength, - export_format, - embedder_model, - embedder_model_custom, - upscale_audio, - f0_file, -): - f0_autotune = "True" if str(f0_autotune) == "True" else "False" - clean_audio = "True" if str(clean_audio) == "True" else "False" - upscale_audio = "True" if str(upscale_audio) == "True" else "False" - infer_pipeline.convert_audio( - f0_up_key=f0_up_key, - filter_radius=filter_radius, - index_rate=index_rate, - rms_mix_rate=rms_mix_rate, - protect=protect, - hop_length=hop_length, - f0_method=f0_method, - audio_input_path=input_path, - audio_output_path=output_path, - model_path=pth_path, - index_path=index_path, - split_audio=split_audio, - f0_autotune=f0_autotune, - clean_audio=clean_audio, - clean_strength=clean_strength, - export_format=export_format, - embedder_model=embedder_model, - embedder_model_custom=embedder_model_custom, - upscale_audio=upscale_audio, - f0_file=f0_file, - ) - return f"File {input_path} inferred successfully.", output_path.replace( - ".wav", f".{export_format.lower()}" - ) - - -# Batch infer -def run_batch_infer_script( - f0_up_key, - filter_radius, - index_rate, - rms_mix_rate, - protect, - hop_length, - f0_method, - input_folder, - output_folder, - pth_path, - index_path, - split_audio, - f0_autotune, - clean_audio, - clean_strength, - export_format, - embedder_model, - embedder_model_custom, - upscale_audio, - f0_file, -): - f0_autotune = "True" if str(f0_autotune) == "True" else "False" - clean_audio = "True" if str(clean_audio) == "True" else "False" - upscale_audio = "True" if str(upscale_audio) == "True" else "False" - audio_files = [ - f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac")) - ] - print(f"Detected {len(audio_files)} audio files for inference.") - - for audio_file in 
audio_files: - if "_output" in audio_file: - pass - else: - input_path = os.path.join(input_folder, audio_file) - output_file_name = os.path.splitext(os.path.basename(audio_file))[0] - output_path = os.path.join( - output_folder, - f"{output_file_name}_output{os.path.splitext(audio_file)[1]}", - ) - print(f"Inferring {input_path}...") - - infer_pipeline.convert_audio( - f0_up_key=f0_up_key, - filter_radius=filter_radius, - index_rate=index_rate, - rms_mix_rate=rms_mix_rate, - protect=protect, - hop_length=hop_length, - f0_method=f0_method, - audio_input_path=input_path, - audio_output_path=output_path, - model_path=pth_path, - index_path=index_path, - split_audio=split_audio, - f0_autotune=f0_autotune, - clean_audio=clean_audio, - clean_strength=clean_strength, - export_format=export_format, - embedder_model=embedder_model, - embedder_model_custom=embedder_model_custom, - upscale_audio=upscale_audio, - f0_file=f0_file, - ) - - return f"Files from {input_folder} inferred successfully." - - -# TTS -def run_tts_script( - tts_text, - tts_voice, - tts_rate, - f0_up_key, - filter_radius, - index_rate, - rms_mix_rate, - protect, - hop_length, - f0_method, - output_tts_path, - output_rvc_path, - pth_path, - index_path, - split_audio, - f0_autotune, - clean_audio, - clean_strength, - export_format, - embedder_model, - embedder_model_custom, - upscale_audio, - f0_file, -): - f0_autotune = "True" if str(f0_autotune) == "True" else "False" - clean_audio = "True" if str(clean_audio) == "True" else "False" - upscale_audio = "True" if str(upscale_audio) == "True" else "False" - tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py") - - if os.path.exists(output_tts_path): - os.remove(output_tts_path) - - command_tts = [ - python, - tts_script_path, - tts_text, - tts_voice, - str(tts_rate), - output_tts_path, - ] - subprocess.run(command_tts) - - infer_pipeline.convert_audio( - f0_up_key=f0_up_key, - filter_radius=filter_radius, - index_rate=index_rate, - rms_mix_rate=rms_mix_rate, - protect=protect, - hop_length=hop_length, - f0_method=f0_method, - audio_input_path=output_tts_path, - audio_output_path=output_rvc_path, - model_path=pth_path, - index_path=index_path, - split_audio=split_audio, - f0_autotune=f0_autotune, - clean_audio=clean_audio, - clean_strength=clean_strength, - export_format=export_format, - embedder_model=embedder_model, - embedder_model_custom=embedder_model_custom, - upscale_audio=upscale_audio, - f0_file=f0_file, - ) - - return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace( - ".wav", f".{export_format.lower()}" - ) - - -# Preprocess -def run_preprocess_script(model_name, dataset_path, sampling_rate, cpu_cores): - per = 3.0 if config.is_half else 3.7 - preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py") - command = [ - python, - preprocess_script_path, - *map( - str, - [ - os.path.join(logs_path, model_name), - dataset_path, - sampling_rate, - per, - cpu_cores, - ], - ), - ] - - os.makedirs(os.path.join(logs_path, model_name), exist_ok=True) - subprocess.run(command) - return f"Model {model_name} preprocessed successfully." 
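# Illustrative only: a hypothetical direct call to the preprocess wrapper above.
# The model name, dataset path and core count are made-up example values, not
# repository defaults:
#
#   run_preprocess_script(
#       model_name="MyVoice",
#       dataset_path="datasets/my_voice",
#       sampling_rate="48000",
#       cpu_cores="4",
#   )
#
# The wrapper only chooses `per` (3.0 with half precision, 3.7 otherwise,
# apparently the segment length in seconds) and then launches preprocess.py as a
# subprocess with those arguments.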
- - -# Extract -def run_extract_script( - model_name, - rvc_version, - f0_method, - pitch_guidance, - hop_length, - cpu_cores, - sampling_rate, - embedder_model, - embedder_model_custom, -): - model_path = os.path.join(logs_path, model_name) - extract_f0_script_path = os.path.join( - "rvc", "train", "extract", "extract_f0_print.py" - ) - extract_feature_script_path = os.path.join( - "rvc", "train", "extract", "extract_feature_print.py" - ) - - command_1 = [ - python, - extract_f0_script_path, - *map( - str, - [ - model_path, - f0_method, - hop_length, - cpu_cores, - ], - ), - ] - command_2 = [ - python, - extract_feature_script_path, - *map( - str, - [ - config.device, - "1", - "0", - "0", - model_path, - rvc_version, - "True", - embedder_model, - embedder_model_custom, - ], - ), - ] - subprocess.run(command_1) - subprocess.run(command_2) - - f0 = 1 if str(pitch_guidance) == "True" else 0 - generate_config(rvc_version, sampling_rate, model_path) - generate_filelist(f0, model_path, rvc_version, sampling_rate) - return f"Model {model_name} extracted successfully." - - -# Train -def run_train_script( - model_name, - rvc_version, - save_every_epoch, - save_only_latest, - save_every_weights, - total_epoch, - sampling_rate, - batch_size, - gpu, - pitch_guidance, - overtraining_detector, - overtraining_threshold, - pretrained, - custom_pretrained, - sync_graph, - cache_data_in_gpu, - g_pretrained_path=None, - d_pretrained_path=None, -): - f0 = 1 if str(pitch_guidance) == "True" else 0 - latest = 1 if str(save_only_latest) == "True" else 0 - save_every = 1 if str(save_every_weights) == "True" else 0 - detector = 1 if str(overtraining_detector) == "True" else 0 - sync = 1 if str(sync_graph) == "True" else 0 - cache_data = 1 if str(cache_data_in_gpu) == "True" else 0 - - if str(pretrained) == "True": - if str(custom_pretrained) == "False": - pg, pd = pretrained_selector(f0)[rvc_version][sampling_rate] - else: - if g_pretrained_path is None or d_pretrained_path is None: - raise ValueError( - "Please provide the path to the pretrained G and D models." - ) - pg, pd = g_pretrained_path, d_pretrained_path - else: - pg, pd = "", "" - - train_script_path = os.path.join("rvc", "train", "train.py") - command = [ - python, - train_script_path, - *map( - str, - [ - "-se", - save_every_epoch, - "-te", - total_epoch, - "-pg", - pg, - "-pd", - pd, - "-sr", - sampling_rate, - "-bs", - batch_size, - "-g", - gpu, - "-e", - os.path.join(logs_path, model_name), - "-v", - rvc_version, - "-l", - latest, - "-c", - cache_data, - "-sw", - save_every, - "-f0", - f0, - "-od", - detector, - "-ot", - overtraining_threshold, - "-sg", - sync, - ], - ), - ] - - subprocess.run(command) - run_index_script(model_name, rvc_version) - return f"Model {model_name} trained successfully." - - -# Index -def run_index_script(model_name, rvc_version): - index_script_path = os.path.join("rvc", "train", "process", "extract_index.py") - command = [ - python, - index_script_path, - os.path.join(logs_path, model_name), - rvc_version, - ] - - subprocess.run(command) - return f"Index file for {model_name} generated successfully." - - -# Model extract -def run_model_extract_script( - pth_path, model_name, sampling_rate, pitch_guidance, rvc_version, epoch, step -): - f0 = 1 if str(pitch_guidance) == "True" else 0 - extract_small_model( - pth_path, model_name, sampling_rate, f0, rvc_version, epoch, step - ) - return f"Model {model_name} extracted successfully." 
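# Illustrative only: a hypothetical call to the small-model extraction wrapper
# above; every argument value here is a made-up example, not a repo default:
#
#   run_model_extract_script(
#       pth_path="logs/MyVoice/G_latest.pth",
#       model_name="MyVoice",
#       sampling_rate="48000",
#       pitch_guidance="True",
#       rvc_version="v2",
#       epoch=200,
#       step=50000,
#   )
#
# Internally the wrapper maps pitch_guidance ("True"/"False") to 1/0 and
# delegates the actual work to extract_small_model().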
- - -# Model information -def run_model_information_script(pth_path): - print(model_information(pth_path)) - - -# Model blender -def run_model_blender_script(model_name, pth_path_1, pth_path_2, ratio): - message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio) - return message, model_blended - - -# Tensorboard -def run_tensorboard_script(): - launch_tensorboard_pipeline() - - -# Download -def run_download_script(model_link): - model_download_pipeline(model_link) - return f"Model downloaded successfully." - - -# Prerequisites -def run_prerequisites_script(pretraineds_v1, pretraineds_v2, models, exe): - prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe) - return "Prerequisites installed successfully." - - -# Audio analyzer -def run_audio_analyzer_script(input_path, save_plot_path="logs/audio_analysis.png"): - audio_info, plot_path = analyze_audio(input_path, save_plot_path) - print( - f"Audio info of {input_path}: {audio_info}", - f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}", - ) - return audio_info, plot_path - - -# API -def run_api_script(ip, port): - command = [ - "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn", - "api:app", - "--host", - ip, - "--port", - port, - ] - subprocess.run(command) - - -# Parse arguments -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Run the main.py script with specific parameters." - ) - subparsers = parser.add_subparsers( - title="subcommands", dest="mode", help="Choose a mode" - ) - - # Parser for 'infer' mode - infer_parser = subparsers.add_parser("infer", help="Run inference") - infer_parser.add_argument( - "--f0_up_key", - type=str, - help="Value for f0_up_key", - choices=[str(i) for i in range(-24, 25)], - default="0", - ) - infer_parser.add_argument( - "--filter_radius", - type=str, - help="Value for filter_radius", - choices=[str(i) for i in range(11)], - default="3", - ) - infer_parser.add_argument( - "--index_rate", - type=str, - help="Value for index_rate", - choices=[str(i / 10) for i in range(11)], - default="0.3", - ) - infer_parser.add_argument( - "--rms_mix_rate", - type=str, - help="Value for rms_mix_rate", - choices=[str(i / 10) for i in range(11)], - default="1", - ) - infer_parser.add_argument( - "--protect", - type=str, - help="Value for protect", - choices=[str(i / 10) for i in range(6)], - default="0.33", - ) - infer_parser.add_argument( - "--hop_length", - type=str, - help="Value for hop_length", - choices=[str(i) for i in range(1, 513)], - default="128", - ) - infer_parser.add_argument( - "--f0_method", - type=str, - help="Value for f0_method", - choices=[ - "crepe", - "crepe-tiny", - "rmvpe", - "fcpe", - "hybrid[crepe+rmvpe]", - "hybrid[crepe+fcpe]", - "hybrid[rmvpe+fcpe]", - "hybrid[crepe+rmvpe+fcpe]", - ], - default="rmvpe", - ) - infer_parser.add_argument("--input_path", type=str, help="Input path") - infer_parser.add_argument("--output_path", type=str, help="Output path") - infer_parser.add_argument("--pth_path", type=str, help="Path to the .pth file") - infer_parser.add_argument( - "--index_path", - type=str, - help="Path to the .index file", - ) - infer_parser.add_argument( - "--split_audio", - type=str, - help="Enable split audio", - choices=["True", "False"], - default="False", - ) - infer_parser.add_argument( - "--f0_autotune", - type=str, - help="Enable autotune", - choices=["True", "False"], - default="False", - ) - infer_parser.add_argument( - "--clean_audio", - type=str, - help="Enable clean audio", - 
choices=["True", "False"], - default="False", - ) - infer_parser.add_argument( - "--clean_strength", - type=str, - help="Value for clean_strength", - choices=[str(i / 10) for i in range(11)], - default="0.7", - ) - infer_parser.add_argument( - "--export_format", - type=str, - help="Export format", - choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], - default="WAV", - ) - infer_parser.add_argument( - "--embedder_model", - type=str, - help="Embedder model", - choices=[ - "contentvec", - "japanese-hubert-base", - "chinese-hubert-large", - "custom", - ], - default="contentvec", - ) - infer_parser.add_argument( - "--embedder_model_custom", - type=str, - help="Custom Embedder model", - default=None, - ) - infer_parser.add_argument( - "--upscale_audio", - type=str, - help="Enable audio upscaling", - choices=["True", "False"], - default="False", - ) - infer_parser.add_argument( - "--f0_file", - type=str, - help="Path to the f0 file", - default=None, - ) - - # Parser for 'batch_infer' mode - batch_infer_parser = subparsers.add_parser( - "batch_infer", help="Run batch inference" - ) - batch_infer_parser.add_argument( - "--f0_up_key", - type=str, - help="Value for f0_up_key", - choices=[str(i) for i in range(-24, 25)], - default="0", - ) - batch_infer_parser.add_argument( - "--filter_radius", - type=str, - help="Value for filter_radius", - choices=[str(i) for i in range(11)], - default="3", - ) - batch_infer_parser.add_argument( - "--index_rate", - type=str, - help="Value for index_rate", - choices=[str(i / 10) for i in range(11)], - default="0.3", - ) - batch_infer_parser.add_argument( - "--rms_mix_rate", - type=str, - help="Value for rms_mix_rate", - choices=[str(i / 10) for i in range(11)], - default="1", - ) - batch_infer_parser.add_argument( - "--protect", - type=str, - help="Value for protect", - choices=[str(i / 10) for i in range(6)], - default="0.33", - ) - batch_infer_parser.add_argument( - "--hop_length", - type=str, - help="Value for hop_length", - choices=[str(i) for i in range(1, 513)], - default="128", - ) - batch_infer_parser.add_argument( - "--f0_method", - type=str, - help="Value for f0_method", - choices=[ - "crepe", - "crepe-tiny", - "rmvpe", - "fcpe", - "hybrid[crepe+rmvpe]", - "hybrid[crepe+fcpe]", - "hybrid[rmvpe+fcpe]", - "hybrid[crepe+rmvpe+fcpe]", - ], - default="rmvpe", - ) - batch_infer_parser.add_argument("--input_folder", type=str, help="Input folder") - batch_infer_parser.add_argument("--output_folder", type=str, help="Output folder") - batch_infer_parser.add_argument( - "--pth_path", type=str, help="Path to the .pth file" - ) - batch_infer_parser.add_argument( - "--index_path", - type=str, - help="Path to the .index file", - ) - batch_infer_parser.add_argument( - "--split_audio", - type=str, - help="Enable split audio", - choices=["True", "False"], - default="False", - ) - batch_infer_parser.add_argument( - "--f0_autotune", - type=str, - help="Enable autotune", - choices=["True", "False"], - default="False", - ) - batch_infer_parser.add_argument( - "--clean_audio", - type=str, - help="Enable clean audio", - choices=["True", "False"], - default="False", - ) - batch_infer_parser.add_argument( - "--clean_strength", - type=str, - help="Value for clean_strength", - choices=[str(i / 10) for i in range(11)], - default="0.7", - ) - batch_infer_parser.add_argument( - "--export_format", - type=str, - help="Export format", - choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], - default="WAV", - ) - batch_infer_parser.add_argument( - "--embedder_model", - type=str, - help="Embedder model", 
- choices=[ - "contentvec", - "japanese-hubert-base", - "chinese-hubert-large", - "custom", - ], - default="contentvec", - ) - batch_infer_parser.add_argument( - "--embedder_model_custom", - type=str, - help="Custom Embedder model", - default=None, - ) - batch_infer_parser.add_argument( - "--upscale_audio", - type=str, - help="Enable audio upscaling", - choices=["True", "False"], - default="False", - ) - batch_infer_parser.add_argument( - "--f0_file", - type=str, - help="Path to the f0 file", - default=None, - ) - - # Parser for 'tts' mode - tts_parser = subparsers.add_parser("tts", help="Run TTS") - tts_parser.add_argument( - "--tts_text", - type=str, - help="Text to be synthesized", - ) - tts_parser.add_argument( - "--tts_voice", - type=str, - help="Voice to be used", - choices=locales, - ) - tts_parser.add_argument( - "--tts_rate", - type=str, - help="Increase or decrease TTS speed", - choices=[str(i) for i in range(-100, 100)], - default="0", - ) - tts_parser.add_argument( - "--f0_up_key", - type=str, - help="Value for f0_up_key", - choices=[str(i) for i in range(-24, 25)], - default="0", - ) - tts_parser.add_argument( - "--filter_radius", - type=str, - help="Value for filter_radius", - choices=[str(i) for i in range(11)], - default="3", - ) - tts_parser.add_argument( - "--index_rate", - type=str, - help="Value for index_rate", - choices=[str(i / 10) for i in range(11)], - default="0.3", - ) - tts_parser.add_argument( - "--rms_mix_rate", - type=str, - help="Value for rms_mix_rate", - choices=[str(i / 10) for i in range(11)], - default="1", - ) - tts_parser.add_argument( - "--protect", - type=str, - help="Value for protect", - choices=[str(i / 10) for i in range(6)], - default="0.33", - ) - tts_parser.add_argument( - "--hop_length", - type=str, - help="Value for hop_length", - choices=[str(i) for i in range(1, 513)], - default="128", - ) - tts_parser.add_argument( - "--f0_method", - type=str, - help="Value for f0_method", - choices=[ - "crepe", - "crepe-tiny", - "rmvpe", - "fcpe", - "hybrid[crepe+rmvpe]", - "hybrid[crepe+fcpe]", - "hybrid[rmvpe+fcpe]", - "hybrid[crepe+rmvpe+fcpe]", - ], - default="rmvpe", - ) - tts_parser.add_argument("--output_tts_path", type=str, help="Output tts path") - tts_parser.add_argument("--output_rvc_path", type=str, help="Output rvc path") - tts_parser.add_argument("--pth_path", type=str, help="Path to the .pth file") - tts_parser.add_argument( - "--index_path", - type=str, - help="Path to the .index file", - ) - tts_parser.add_argument( - "--split_audio", - type=str, - help="Enable split audio", - choices=["True", "False"], - default="False", - ) - tts_parser.add_argument( - "--f0_autotune", - type=str, - help="Enable autotune", - choices=["True", "False"], - default="False", - ) - tts_parser.add_argument( - "--clean_audio", - type=str, - help="Enable clean audio", - choices=["True", "False"], - default="False", - ) - tts_parser.add_argument( - "--clean_strength", - type=str, - help="Value for clean_strength", - choices=[str(i / 10) for i in range(11)], - default="0.7", - ) - tts_parser.add_argument( - "--export_format", - type=str, - help="Export format", - choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], - default="WAV", - ) - tts_parser.add_argument( - "--embedder_model", - type=str, - help="Embedder model", - choices=[ - "contentvec", - "japanese-hubert-base", - "chinese-hubert-large", - "custom", - ], - default="contentvec", - ) - tts_parser.add_argument( - "--embedder_model_custom", - type=str, - help="Custom Embedder model", - default=None, - ) - 
tts_parser.add_argument( - "--upscale_audio", - type=str, - help="Enable audio upscaling", - choices=["True", "False"], - default="False", - ) - tts_parser.add_argument( - "--f0_file", - type=str, - help="Path to the f0 file", - default=None, - ) - - # Parser for 'preprocess' mode - preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing") - preprocess_parser.add_argument("--model_name", type=str, help="Name of the model") - preprocess_parser.add_argument( - "--dataset_path", - type=str, - help="Path to the dataset", - ) - preprocess_parser.add_argument( - "--sampling_rate", - type=str, - help="Sampling rate", - choices=["32000", "40000", "48000"], - ) - preprocess_parser.add_argument( - "--cpu_cores", - type=str, - help="Number of CPU cores to use", - choices=[str(i) for i in range(1, 64)], - default=None, - ) - - # Parser for 'extract' mode - extract_parser = subparsers.add_parser("extract", help="Run extract") - extract_parser.add_argument( - "--model_name", - type=str, - help="Name of the model", - ) - extract_parser.add_argument( - "--rvc_version", - type=str, - help="Version of the model", - choices=["v1", "v2"], - default="v2", - ) - extract_parser.add_argument( - "--f0_method", - type=str, - help="Value for f0_method", - choices=[ - "crepe", - "crepe-tiny", - "rmvpe", - ], - default="rmvpe", - ) - extract_parser.add_argument( - "--pitch_guidance", - type=str, - help="Pitch guidance", - choices=["True", "False"], - default="True", - ) - extract_parser.add_argument( - "--hop_length", - type=str, - help="Value for hop_length", - choices=[str(i) for i in range(1, 513)], - default="128", - ) - extract_parser.add_argument( - "--cpu_cores", - type=str, - help="Number of CPU cores to use", - choices=[str(i) for i in range(1, 64)], - default=None, - ) - extract_parser.add_argument( - "--sampling_rate", - type=str, - help="Sampling rate", - choices=["32000", "40000", "48000"], - ) - extract_parser.add_argument( - "--embedder_model", - type=str, - help="Embedder model", - choices=[ - "contentvec", - "japanese-hubert-base", - "chinese-hubert-large", - "custom", - ], - default="contentvec", - ) - extract_parser.add_argument( - "--embedder_model_custom", - type=str, - help="Custom Embedder model", - default=None, - ) - - # Parser for 'train' mode - train_parser = subparsers.add_parser("train", help="Run training") - train_parser.add_argument( - "--model_name", - type=str, - help="Name of the model", - ) - train_parser.add_argument( - "--rvc_version", - type=str, - help="Version of the model", - choices=["v1", "v2"], - default="v2", - ) - train_parser.add_argument( - "--save_every_epoch", - type=str, - help="Save every epoch", - choices=[str(i) for i in range(1, 101)], - ) - train_parser.add_argument( - "--save_only_latest", - type=str, - help="Save weight only at last epoch", - choices=["True", "False"], - default="False", - ) - train_parser.add_argument( - "--save_every_weights", - type=str, - help="Save weight every epoch", - choices=["True", "False"], - default="True", - ) - train_parser.add_argument( - "--total_epoch", - type=str, - help="Total epoch", - choices=[str(i) for i in range(1, 10001)], - default="1000", - ) - train_parser.add_argument( - "--sampling_rate", - type=str, - help="Sampling rate", - choices=["32000", "40000", "48000"], - ) - train_parser.add_argument( - "--batch_size", - type=str, - help="Batch size", - choices=[str(i) for i in range(1, 51)], - default="8", - ) - train_parser.add_argument( - "--gpu", - type=str, - help="GPU number", - 
default="0", - ) - train_parser.add_argument( - "--pitch_guidance", - type=str, - help="Pitch guidance", - choices=["True", "False"], - default="True", - ) - train_parser.add_argument( - "--pretrained", - type=str, - help="Pretrained", - choices=["True", "False"], - default="True", - ) - train_parser.add_argument( - "--custom_pretrained", - type=str, - help="Custom pretrained", - choices=["True", "False"], - default="False", - ) - train_parser.add_argument( - "--g_pretrained_path", - type=str, - nargs="?", - default=None, - help="Path to the pretrained G file", - ) - train_parser.add_argument( - "--d_pretrained_path", - type=str, - nargs="?", - default=None, - help="Path to the pretrained D file", - ) - train_parser.add_argument( - "--overtraining_detector", - type=str, - help="Overtraining detector", - choices=["True", "False"], - default="False", - ) - train_parser.add_argument( - "--overtraining_threshold", - type=str, - help="Overtraining threshold", - choices=[str(i) for i in range(1, 101)], - default="50", - ) - train_parser.add_argument( - "--sync_graph", - type=str, - help="Sync graph", - choices=["True", "False"], - default="False", - ) - train_parser.add_argument( - "--cache_data_in_gpu", - type=str, - help="Cache data in GPU", - choices=["True", "False"], - default="False", - ) - - # Parser for 'index' mode - index_parser = subparsers.add_parser("index", help="Generate index file") - index_parser.add_argument( - "--model_name", - type=str, - help="Name of the model", - ) - index_parser.add_argument( - "--rvc_version", - type=str, - help="Version of the model", - choices=["v1", "v2"], - default="v2", - ) - - # Parser for 'model_extract' mode - model_extract_parser = subparsers.add_parser("model_extract", help="Extract model") - model_extract_parser.add_argument( - "--pth_path", - type=str, - help="Path to the .pth file", - ) - model_extract_parser.add_argument( - "--model_name", - type=str, - help="Name of the model", - ) - model_extract_parser.add_argument( - "--sampling_rate", - type=str, - help="Sampling rate", - choices=["40000", "48000"], - ) - model_extract_parser.add_argument( - "--pitch_guidance", - type=str, - help="Pitch guidance", - choices=["True", "False"], - ) - model_extract_parser.add_argument( - "--rvc_version", - type=str, - help="Version of the model", - choices=["v1", "v2"], - default="v2", - ) - model_extract_parser.add_argument( - "--epoch", - type=str, - help="Epochs of the model", - choices=[str(i) for i in range(1, 10001)], - ) - model_extract_parser.add_argument( - "--step", - type=str, - help="Steps of the model", - ) - - # Parser for 'model_information' mode - model_information_parser = subparsers.add_parser( - "model_information", help="Print model information" - ) - model_information_parser.add_argument( - "--pth_path", - type=str, - help="Path to the .pth file", - ) - - # Parser for 'model_blender' mode - model_blender_parser = subparsers.add_parser( - "model_blender", help="Fuse two models" - ) - model_blender_parser.add_argument( - "--model_name", - type=str, - help="Name of the model", - ) - model_blender_parser.add_argument( - "--pth_path_1", - type=str, - help="Path to the first .pth file", - ) - model_blender_parser.add_argument( - "--pth_path_2", - type=str, - help="Path to the second .pth file", - ) - model_blender_parser.add_argument( - "--ratio", - type=str, - help="Value for blender ratio", - choices=[str(i / 10) for i in range(11)], - default="0.5", - ) - - # Parser for 'tensorboard' mode - subparsers.add_parser("tensorboard", help="Run 
tensorboard") - - # Parser for 'download' mode - download_parser = subparsers.add_parser("download", help="Download models") - download_parser.add_argument( - "--model_link", - type=str, - help="Link of the model", - ) - - # Parser for 'prerequisites' mode - prerequisites_parser = subparsers.add_parser( - "prerequisites", help="Install prerequisites" - ) - prerequisites_parser.add_argument( - "--pretraineds_v1", - type=str, - choices=["True", "False"], - default="True", - help="Download pretrained models for v1", - ) - prerequisites_parser.add_argument( - "--pretraineds_v2", - type=str, - choices=["True", "False"], - default="True", - help="Download pretrained models for v2", - ) - prerequisites_parser.add_argument( - "--models", - type=str, - choices=["True", "False"], - default="True", - help="Donwload models", - ) - prerequisites_parser.add_argument( - "--exe", - type=str, - choices=["True", "False"], - default="True", - help="Download executables", - ) - - # Parser for 'audio_analyzer' mode - audio_analyzer = subparsers.add_parser("audio_analyzer", help="Run audio analyzer") - audio_analyzer.add_argument( - "--input_path", - type=str, - help="Path to the input audio file", - ) - - # Parser for 'api' mode - api_parser = subparsers.add_parser("api", help="Run the API") - api_parser.add_argument( - "--host", type=str, help="Host address", default="127.0.0.1" - ) - api_parser.add_argument("--port", type=str, help="Port", default="8000") - - return parser.parse_args() - - -def main(): - if len(sys.argv) == 1: - print("Please run the script with '-h' for more information.") - sys.exit(1) - - args = parse_arguments() - - try: - if args.mode == "infer": - run_infer_script( - str(args.f0_up_key), - str(args.filter_radius), - str(args.index_rate), - str(args.rms_mix_rate), - str(args.protect), - str(args.hop_length), - str(args.f0_method), - str(args.input_path), - str(args.output_path), - str(args.pth_path), - str(args.index_path), - str(args.split_audio), - str(args.f0_autotune), - str(args.clean_audio), - str(args.clean_strength), - str(args.export_format), - str(args.embedder_model), - str(args.embedder_model_custom), - str(args.upscale_audio), - str(args.f0_file), - ) - elif args.mode == "batch_infer": - run_batch_infer_script( - str(args.f0_up_key), - str(args.filter_radius), - str(args.index_rate), - str(args.rms_mix_rate), - str(args.protect), - str(args.hop_length), - str(args.f0_method), - str(args.input_folder), - str(args.output_folder), - str(args.pth_path), - str(args.index_path), - str(args.split_audio), - str(args.f0_autotune), - str(args.clean_audio), - str(args.clean_strength), - str(args.export_format), - str(args.embedder_model), - str(args.embedder_model_custom), - str(args.upscale_audio), - str(args.f0_file), - ) - elif args.mode == "tts": - run_tts_script( - str(args.tts_text), - str(args.tts_voice), - str(args.tts_rate), - str(args.f0_up_key), - str(args.filter_radius), - str(args.index_rate), - str(args.rms_mix_rate), - str(args.protect), - str(args.hop_length), - str(args.f0_method), - str(args.output_tts_path), - str(args.output_rvc_path), - str(args.pth_path), - str(args.index_path), - str(args.split_audio), - str(args.f0_autotune), - str(args.clean_audio), - str(args.clean_strength), - str(args.export_format), - str(args.embedder_model), - str(args.embedder_model_custom), - str(args.upscale_audio), - str(args.f0_file), - ) - elif args.mode == "preprocess": - run_preprocess_script( - str(args.model_name), - str(args.dataset_path), - str(args.sampling_rate), - 
str(args.cpu_cores), - ) - elif args.mode == "extract": - run_extract_script( - str(args.model_name), - str(args.rvc_version), - str(args.f0_method), - str(args.pitch_guidance), - str(args.hop_length), - str(args.cpu_cores), - str(args.sampling_rate), - str(args.embedder_model), - str(args.embedder_model_custom), - ) - elif args.mode == "train": - run_train_script( - str(args.model_name), - str(args.rvc_version), - str(args.save_every_epoch), - str(args.save_only_latest), - str(args.save_every_weights), - str(args.total_epoch), - str(args.sampling_rate), - str(args.batch_size), - str(args.gpu), - str(args.pitch_guidance), - str(args.overtraining_detector), - str(args.overtraining_threshold), - str(args.pretrained), - str(args.custom_pretrained), - str(args.sync_graph), - str(args.cache_data_in_gpu), - str(args.g_pretrained_path), - str(args.d_pretrained_path), - ) - elif args.mode == "index": - run_index_script( - str(args.model_name), - str(args.rvc_version), - ) - elif args.mode == "model_extract": - run_model_extract_script( - str(args.pth_path), - str(args.model_name), - str(args.sampling_rate), - str(args.pitch_guidance), - str(args.rvc_version), - str(args.epoch), - str(args.step), - ) - elif args.mode == "model_information": - run_model_information_script( - str(args.pth_path), - ) - elif args.mode == "model_blender": - run_model_blender_script( - str(args.model_name), - str(args.pth_path_1), - str(args.pth_path_2), - str(args.ratio), - ) - elif args.mode == "tensorboard": - run_tensorboard_script() - elif args.mode == "download": - run_download_script( - str(args.model_link), - ) - elif args.mode == "prerequisites": - run_prerequisites_script( - str(args.pretraineds_v1), - str(args.pretraineds_v2), - str(args.models), - str(args.exe), - ) - elif args.mode == "audio_analyzer": - run_audio_analyzer_script( - str(args.input_path), - ) - elif args.mode == "api": - run_api_script( - str(args.host), - str(args.port), - ) - except Exception as error: - print(f"Error: {error}") - - -if __name__ == "__main__": - main() diff --git a/rvc/configs/v1/32000.json b/rvc/configs/v1/32000.json index e985327..2f28f4f 100644 --- a/rvc/configs/v1/32000.json +++ b/rvc/configs/v1/32000.json @@ -17,7 +17,7 @@ }, "data": { "max_wav_value": 32768.0, - "sampling_rate": 32000, + "sample_rate": 32000, "filter_length": 1024, "hop_length": 320, "win_length": 1024, diff --git a/rvc/configs/v1/40000.json b/rvc/configs/v1/40000.json index a101b18..3961ddb 100644 --- a/rvc/configs/v1/40000.json +++ b/rvc/configs/v1/40000.json @@ -17,7 +17,7 @@ }, "data": { "max_wav_value": 32768.0, - "sampling_rate": 40000, + "sample_rate": 40000, "filter_length": 2048, "hop_length": 400, "win_length": 2048, diff --git a/rvc/configs/v1/48000.json b/rvc/configs/v1/48000.json index d4cb93a..41ea3b6 100644 --- a/rvc/configs/v1/48000.json +++ b/rvc/configs/v1/48000.json @@ -17,7 +17,7 @@ }, "data": { "max_wav_value": 32768.0, - "sampling_rate": 48000, + "sample_rate": 48000, "filter_length": 2048, "hop_length": 480, "win_length": 2048, diff --git a/rvc/configs/v2/32000.json b/rvc/configs/v2/32000.json index ee493c3..eabab7b 100644 --- a/rvc/configs/v2/32000.json +++ b/rvc/configs/v2/32000.json @@ -13,7 +13,7 @@ }, "data": { "max_wav_value": 32768.0, - "sampling_rate": 32000, + "sample_rate": 32000, "filter_length": 1024, "hop_length": 320, "win_length": 1024, diff --git a/rvc/configs/v2/40000.json b/rvc/configs/v2/40000.json index d865016..e1ba44a 100644 --- a/rvc/configs/v2/40000.json +++ b/rvc/configs/v2/40000.json @@ -13,7 +13,7 @@ 
}, "data": { "max_wav_value": 32768.0, - "sampling_rate": 40000, + "sample_rate": 40000, "filter_length": 2048, "hop_length": 400, "win_length": 2048, diff --git a/rvc/configs/v2/48000.json b/rvc/configs/v2/48000.json index b80a1e6..1a4da9f 100644 --- a/rvc/configs/v2/48000.json +++ b/rvc/configs/v2/48000.json @@ -13,7 +13,7 @@ }, "data": { "max_wav_value": 32768.0, - "sampling_rate": 48000, + "sample_rate": 48000, "filter_length": 2048, "hop_length": 480, "win_length": 2048, diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py index 8d431c2..715675f 100644 --- a/rvc/infer/infer.py +++ b/rvc/infer/infer.py @@ -4,6 +4,7 @@ import torch import librosa import logging +import traceback import numpy as np import soundfile as sf import noisereduce as nr @@ -34,7 +35,9 @@ def __init__(self): Initializes the VoiceConverter with default configuration, and sets up models and parameters. """ self.config = Config() # Load RVC configuration - self.hubert_model = None # Initialize the Hubert model (for embedding extraction) + self.hubert_model = ( + None # Initialize the Hubert model (for embedding extraction) + ) self.tgt_sr = None # Target sampling rate for the output audio self.net_g = None # Generator network for voice conversion self.vc = None # Voice conversion pipeline instance @@ -43,7 +46,7 @@ def __init__(self): self.n_spk = None # Number of speakers in the model self.use_f0 = None # Whether the model uses F0 - def load_hubert(self, embedder_model, embedder_model_custom): + def load_hubert(self, embedder_model: str, embedder_model_custom: str = None): """ Loads the HuBERT model for speaker embedding extraction. @@ -115,28 +118,28 @@ def convert_audio_format(input_path, output_path, output_format): def convert_audio( self, - audio_input_path, - audio_output_path, - model_path, - index_path, - sid=0, - f0_up_key=None, - f0_file=None, - f0_method=None, - index_rate=None, - resample_sr=0, - rms_mix_rate=None, - protect=None, - hop_length=None, - split_audio=False, - f0_autotune=False, - filter_radius=None, - embedder_model=None, - embedder_model_custom=None, - clean_audio=False, - clean_strength=0.7, - export_format="WAV", - upscale_audio=False, + audio_input_path: str, + audio_output_path: str, + model_path: str, + index_path: str, + embedder_model: str, + pitch: int, + f0_file: str, + f0_method: str, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + split_audio: bool, + f0_autotune: bool, + filter_radius: int, + embedder_model_custom: str, + clean_audio: bool, + clean_strength: float, + export_format: str, + upscale_audio: bool, + resample_sr: int = 0, + sid: int = 0, ): """ Performs voice conversion on the input audio. @@ -147,12 +150,12 @@ def convert_audio( model_path (str): Path to the voice conversion model. index_path (str): Path to the index file. sid (int, optional): Speaker ID. Default is 0. - f0_up_key (str, optional): Key for F0 up-sampling. Default is None. + pitch (str, optional): Key for F0 up-sampling. Default is None. f0_file (str, optional): Path to the F0 file. Default is None. f0_method (str, optional): Method for F0 extraction. Default is None. index_rate (float, optional): Rate for index matching. Default is None. resample_sr (int, optional): Resample sampling rate. Default is 0. - rms_mix_rate (float, optional): RMS mix rate. Default is None. + volume_envelope (float, optional): RMS mix rate. Default is None. protect (float, optional): Protection rate for certain audio segments. Default is None. 
hop_length (int, optional): Hop length for audio processing. Default is None. split_audio (bool, optional): Whether to split the audio for processing. Default is False. @@ -164,6 +167,7 @@ def convert_audio( clean_strength (float, optional): Strength of the audio cleaning. Default is 0.7. export_format (str, optional): Format for exporting the audio. Default is "WAV". upscale_audio (bool, optional): Whether to upscale the audio. Default is False. + """ self.get_vc(model_path, sid) @@ -171,7 +175,7 @@ def convert_audio( start_time = time.time() print(f"Converting audio '{audio_input_path}'...") - if upscale_audio == "True": + if upscale_audio == True: upscale(audio_input_path, audio_input_path) audio = load_audio(audio_input_path, 16000) @@ -195,7 +199,7 @@ def convert_audio( if self.tgt_sr != resample_sr >= 16000: self.tgt_sr = resample_sr - if split_audio == "True": + if split_audio: result, new_dir_path = process_audio(audio_input_path) if result == "Error": return "Error with Split Audio", None @@ -213,31 +217,32 @@ def convert_audio( try: for path in paths: self.convert_audio( - path, - path, - model_path, - index_path, - sid, - f0_up_key, - None, - f0_method, - index_rate, - resample_sr, - rms_mix_rate, - protect, - hop_length, - False, - f0_autotune, - filter_radius, - embedder_model, - embedder_model_custom, - clean_audio, - clean_strength, - export_format, - upscale_audio, + audio_input_path=path, + audio_output_path=path, + model_path=model_path, + index_path=index_path, + sid=sid, + pitch=pitch, + f0_file=None, + f0_method=f0_method, + index_rate=index_rate, + resample_sr=resample_sr, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + split_audio=False, + f0_autotune=f0_autotune, + filter_radius=filter_radius, + embedder_model=embedder_model, + embedder_model_custom=embedder_model_custom, + clean_audio=clean_audio, + clean_strength=clean_strength, + export_format=export_format, + upscale_audio=upscale_audio, ) except Exception as error: - print(error) + print(f"Error in processing split audio segment: {error}") + print(traceback.format_exc()) return f"Error {error}" print("Finished processing segmented audio, now merging audio...") merge_timestamps_file = os.path.join( @@ -248,31 +253,31 @@ def convert_audio( os.remove(merge_timestamps_file) else: audio_opt = self.vc.pipeline( - self.hubert_model, - self.net_g, - sid, - audio, - audio_input_path, - f0_up_key, - f0_method, - file_index, - index_rate, - self.use_f0, - filter_radius, - self.tgt_sr, - resample_sr, - rms_mix_rate, - self.version, - protect, - hop_length, - f0_autotune, + model=self.hubert_model, + net_g=self.net_g, + sid=sid, + audio=audio, + input_audio_path=audio_input_path, + pitch=pitch, + f0_method=f0_method, + file_index=file_index, + index_rate=index_rate, + pitch_guidance=self.use_f0, + filter_radius=filter_radius, + tgt_sr=self.tgt_sr, + resample_sr=resample_sr, + volume_envelope=volume_envelope, + version=self.version, + protect=protect, + hop_length=hop_length, + f0_autotune=f0_autotune, f0_file=f0_file, ) if audio_output_path: sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV") - if clean_audio == "True": + if clean_audio: cleaned_audio = self.remove_audio_noise( audio_output_path, clean_strength ) @@ -295,6 +300,7 @@ def convert_audio( except Exception as error: print(f"Voice conversion failed: {error}") + print(traceback.format_exc()) def get_vc(self, weight_root, sid): """ diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py index 1631cc4..80dd6bb 100644 
--- a/rvc/infer/pipeline.py +++ b/rvc/infer/pipeline.py @@ -274,7 +274,7 @@ def get_f0_hybrid( f0_max=int(f0_max), dtype=torch.float32, device=self.device, - sampling_rate=self.sample_rate, + sample_rate=self.sample_rate, threshold=0.03, ) f0 = self.model_fcpe.compute_f0(x, p_len=p_len) @@ -295,7 +295,7 @@ def get_f0( input_audio_path, x, p_len, - f0_up_key, + pitch, f0_method, filter_radius, hop_length, @@ -309,7 +309,7 @@ def get_f0( input_audio_path: Path to the input audio file. x: The input audio signal as a NumPy array. p_len: Desired length of the F0 output. - f0_up_key: Key to adjust the pitch of the F0 contour. + pitch: Key to adjust the pitch of the F0 contour. f0_method: Method to use for F0 estimation (e.g., "crepe"). filter_radius: Radius for median filtering the F0 contour. hop_length: Hop length for F0 estimation methods. @@ -337,7 +337,7 @@ def get_f0( f0_max=int(self.f0_max), dtype=torch.float32, device=self.device, - sampling_rate=self.sample_rate, + sample_rate=self.sample_rate, threshold=0.03, ) f0 = self.model_fcpe.compute_f0(x, p_len=p_len) @@ -357,7 +357,7 @@ def get_f0( if f0_autotune == "True": f0 = Autotune.autotune_f0(self, f0) - f0 *= pow(2, f0_up_key / 12) + f0 *= pow(2, pitch / 12) tf0 = self.sample_rate // self.window if inp_f0 is not None: delta_t = np.round( @@ -497,7 +497,7 @@ def pipeline( sid, audio, input_audio_path, - f0_up_key, + pitch, f0_method, file_index, index_rate, @@ -505,7 +505,7 @@ def pipeline( filter_radius, tgt_sr, resample_sr, - rms_mix_rate, + volume_envelope, version, protect, hop_length, @@ -521,7 +521,7 @@ def pipeline( sid: Speaker ID for the target voice. audio: The input audio signal. input_audio_path: Path to the input audio file. - f0_up_key: Key to adjust the pitch of the F0 contour. + pitch: Key to adjust the pitch of the F0 contour. f0_method: Method to use for F0 estimation. file_index: Path to the FAISS index file for speaker embedding retrieval. index_rate: Blending rate for speaker embedding retrieval. @@ -529,7 +529,7 @@ def pipeline( filter_radius: Radius for median filtering the F0 contour. tgt_sr: Target sampling rate for the output audio. resample_sr: Resampling rate for the output audio. - rms_mix_rate: Blending rate for adjusting the RMS level of the output audio. + volume_envelope: Blending rate for adjusting the RMS level of the output audio. version: Model version. protect: Protection level for preserving the original pitch. hop_length: Hop length for F0 estimation methods. 
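# Illustrative sketch (hypothetical, not taken from this patch): the renamed
# `pitch` argument of get_f0 is a semitone offset that scales the F0 contour
# via f0 *= pow(2, pitch / 12). A minimal standalone check of that scaling,
# assuming only NumPy is available and using made-up example values:
import numpy as np

f0 = np.array([110.0, 220.0, 440.0])   # example F0 contour in Hz
pitch = 12                             # shift up by one octave (12 semitones)
shifted_f0 = f0 * pow(2, pitch / 12)   # doubles every value: [220., 440., 880.]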
@@ -578,13 +578,12 @@ def pipeline( except Exception as error: print(error) sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - pitch, pitchf = None, None - if pitch_guidance == 1: + if pitch_guidance == True: pitch, pitchf = self.get_f0( input_audio_path, audio_pad, p_len, - f0_up_key, + pitch, f0_method, filter_radius, hop_length, @@ -599,7 +598,7 @@ def pipeline( pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() for t in opt_ts: t = t // self.window * self.window - if pitch_guidance == 1: + if pitch_guidance == True: audio_opt.append( self.voice_conversion( model, @@ -632,7 +631,7 @@ def pipeline( )[self.t_pad_tgt : -self.t_pad_tgt] ) s = t - if pitch_guidance == 1: + if pitch_guidance == True: audio_opt.append( self.voice_conversion( model, @@ -665,9 +664,9 @@ def pipeline( )[self.t_pad_tgt : -self.t_pad_tgt] ) audio_opt = np.concatenate(audio_opt) - if rms_mix_rate != 1: + if volume_envelope != 1: audio_opt = AudioProcessor.change_rms( - audio, self.sample_rate, audio_opt, tgt_sr, rms_mix_rate + audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope ) if resample_sr >= self.sample_rate and tgt_sr != resample_sr: audio_opt = librosa.resample( diff --git a/rvc/lib/FCPEF0Predictor.py b/rvc/lib/FCPEF0Predictor.py index 08541bb..02c7519 100644 --- a/rvc/lib/FCPEF0Predictor.py +++ b/rvc/lib/FCPEF0Predictor.py @@ -22,13 +22,13 @@ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): - sampling_rate = None + sample_rate = None try: - data, sampling_rate = sf.read(full_path, always_2d=True) # than soundfile. + data, sample_rate = sf.read(full_path, always_2d=True) # than soundfile. except Exception as error: print(f"'{full_path}' failed to load with {error}") if return_empty_on_exception: - return [], sampling_rate or target_sr or 48000 + return [], sample_rate or target_sr or 48000 else: raise Exception(error) @@ -55,16 +55,16 @@ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False if ( torch.isinf(data) | torch.isnan(data) ).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs. 
return_empty_on_exception will return empty arr instead of except - return [], sampling_rate or target_sr or 48000 - if target_sr is not None and sampling_rate != target_sr: + return [], sample_rate or target_sr or 48000 + if target_sr is not None and sample_rate != target_sr: data = torch.from_numpy( librosa.core.resample( - data.numpy(), orig_sr=sampling_rate, target_sr=target_sr + data.numpy(), orig_sr=sample_rate, target_sr=target_sr ) ) - sampling_rate = target_sr + sample_rate = target_sr - return data, sampling_rate + return data, sample_rate def dynamic_range_compression(x, C=1, clip_val=1e-5): @@ -108,7 +108,7 @@ def __init__( self.hann_window = {} def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - sampling_rate = self.target_sr + sample_rate = self.target_sr n_mels = self.n_mels n_fft = self.n_fft win_size = self.win_size @@ -131,7 +131,7 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): mel_basis_key = str(fmax) + "_" + str(y.device) if mel_basis_key not in mel_basis: mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax + sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax ) mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) @@ -842,14 +842,14 @@ class Wav2Mel: def __init__(self, args, device=None, dtype=torch.float32): # self.args = args - self.sampling_rate = args.mel.sampling_rate + self.sample_rate = args.mel.sample_rate self.hop_size = args.mel.hop_size if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" self.device = device self.dtype = dtype self.stft = STFT( - args.mel.sampling_rate, + args.mel.sample_rate, args.mel.num_mels, args.mel.n_fft, args.mel.win_size, @@ -868,13 +868,13 @@ def extract_nvstft(self, audio, keyshift=0, train=False): def extract_mel(self, audio, sample_rate, keyshift=0, train=False): audio = audio.to(self.dtype).to(self.device) # resample - if sample_rate == self.sampling_rate: + if sample_rate == self.sample_rate: audio_res = audio else: key_str = str(sample_rate) if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample( - sample_rate, self.sampling_rate, lowpass_filter_width=128 + sample_rate, self.sample_rate, lowpass_filter_width=128 ) self.resample_kernel[key_str] = ( self.resample_kernel[key_str].to(self.dtype).to(self.device) @@ -932,7 +932,7 @@ def __init__( f0_max=1100, dtype=torch.float32, device=None, - sampling_rate=44100, + sample_rate=44100, threshold=0.05, ): self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) @@ -944,7 +944,7 @@ def __init__( else: self.device = device self.threshold = threshold - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate self.dtype = dtype self.name = "fcpe" @@ -977,7 +977,7 @@ def repeat_expand( elif ndim == 2: return results[0] - def post_process(self, x, sampling_rate, f0, pad_to): + def post_process(self, x, sample_rate, f0, pad_to): if isinstance(f0, np.ndarray): f0 = torch.from_numpy(f0).float().to(x.device) @@ -993,8 +993,8 @@ def post_process(self, x, sampling_rate, f0, pad_to): # 去掉0频率, 并线性插值 nzindex = torch.nonzero(f0).squeeze() f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() - time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() - time_frame = np.arange(pad_to) * self.hop_length / sampling_rate + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate vuv_vector = F.interpolate(vuv_vector[None, None, :], 
size=pad_to)[0][0] @@ -1019,18 +1019,18 @@ def compute_f0(self, wav, p_len=None): if p_len is None: print("fcpe p_len is None") p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0] + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] if torch.all(f0 == 0): rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len)[0] + return self.post_process(x, self.sample_rate, f0, p_len)[0] def compute_f0_uv(self, wav, p_len=None): x = torch.FloatTensor(wav).to(self.dtype).to(self.device) if p_len is None: p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0] + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] if torch.all(f0 == 0): rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len) + return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/rvc/lib/algorithm/generators.py b/rvc/lib/algorithm/generators.py index 6c84d1d..98dc91b 100644 --- a/rvc/lib/algorithm/generators.py +++ b/rvc/lib/algorithm/generators.py @@ -142,7 +142,7 @@ def __init__( self.noise_std = noise_std self.harmonic_num = harmonic_num self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate + self.sample_rate = samp_rate self.voiced_threshold = voiced_threshold def _f02uv(self, f0): @@ -172,7 +172,7 @@ def forward(self, f0: torch.Tensor, upp: int): f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( idx + 2 ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / float(self.sampling_rate)) % 1 + rad_values = (f0_buf / float(self.sample_rate)) % 1 rand_ini = torch.rand( f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device ) diff --git a/rvc/lib/algorithm/nsf.py b/rvc/lib/algorithm/nsf.py index cd2931b..a7eaac7 100644 --- a/rvc/lib/algorithm/nsf.py +++ b/rvc/lib/algorithm/nsf.py @@ -14,7 +14,7 @@ class SourceModuleHnNSF(torch.nn.Module): Source Module for harmonic-plus-noise excitation. Args: - sampling_rate (int): Sampling rate in Hz. + sample_rate (int): Sampling rate in Hz. harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. 
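# Illustrative sketch (hypothetical, not taken from this patch): with the
# sampling_rate -> sample_rate rename, callers construct the source module with
# the new keyword, mirroring the HnNSF generator call further down in this file.
# Assumes the class is importable as rvc.lib.algorithm.nsf.SourceModuleHnNSF and
# that 40000 Hz is only an example rate:
from rvc.lib.algorithm.nsf import SourceModuleHnNSF

source_module = SourceModuleHnNSF(sample_rate=40000, harmonic_num=0, is_half=False)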
@@ -24,7 +24,7 @@ class SourceModuleHnNSF(torch.nn.Module): def __init__( self, - sampling_rate, + sample_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, @@ -38,7 +38,7 @@ def __init__( self.is_half = is_half self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod ) self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() @@ -86,7 +86,7 @@ def __init__( self.num_upsamples = len(upsample_rates) self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sampling_rate=sr, harmonic_num=0, is_half=is_half + sample_rate=sr, harmonic_num=0, is_half=is_half ) self.conv_pre = torch.nn.Conv1d( diff --git a/rvc/lib/infer_pack/models.py b/rvc/lib/infer_pack/models.py index dcd0a57..77a2b36 100644 --- a/rvc/lib/infer_pack/models.py +++ b/rvc/lib/infer_pack/models.py @@ -370,7 +370,7 @@ def __init__( self.noise_std = noise_std self.harmonic_num = harmonic_num self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate + self.sample_rate = samp_rate self.voiced_threshold = voiced_threshold def _f02uv(self, f0): @@ -397,7 +397,7 @@ def forward(self, f0: torch.Tensor, upp: int): f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( idx + 2 ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / float(self.sampling_rate)) % 1 + rad_values = (f0_buf / float(self.sample_rate)) % 1 rand_ini = torch.rand( f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device ) @@ -436,9 +436,9 @@ def forward(self, f0: torch.Tensor, upp: int): class SourceModuleHnNSF(torch.nn.Module): """SourceModule for hn-nsf - SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + SourceModule(sample_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0) - sampling_rate: sampling_rate in Hz + sample_rate: sample_rate in Hz harmonic_num: number of harmonic above F0 (default: 0) sine_amp: amplitude of sine source signal (default: 0.1) add_noise_std: std of additive Gaussian noise (default: 0.003) @@ -454,7 +454,7 @@ class SourceModuleHnNSF(torch.nn.Module): def __init__( self, - sampling_rate, + sample_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, @@ -468,7 +468,7 @@ def __init__( self.is_half = is_half # to produce sine waveforms self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod ) # to merge source harmonics into a single excitation @@ -511,7 +511,7 @@ def __init__( self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sampling_rate=sr, harmonic_num=0, is_half=is_half + sample_rate=sr, harmonic_num=0, is_half=is_half ) self.noise_convs = nn.ModuleList() self.conv_pre = Conv1d( diff --git a/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py index 6d241b4..e95c0e9 100644 --- a/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +++ b/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -4,11 +4,11 @@ class DioF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def 
interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -60,12 +60,12 @@ def compute_f0(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.dio( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return self.interpolate_f0(self.resize_f0(f0, p_len))[0] @@ -75,12 +75,12 @@ def compute_f0_uv(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.dio( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py index e533f28..f30f61b 100644 --- a/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ b/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -4,11 +4,11 @@ class HarvestF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -60,10 +60,10 @@ def compute_f0(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) return self.interpolate_f0(self.resize_f0(f0, p_len))[0] @@ -73,10 +73,10 @@ def compute_f0_uv(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py index 0fe1c74..1b81a9f 100644 --- a/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +++ b/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -4,11 +4,11 @@ class PMF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, 
sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -50,9 +50,9 @@ def compute_f0(self, wav, p_len=None): p_len = x.shape[0] // self.hop_length else: assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 + time_step = self.hop_length / self.sample_rate * 1000 f0 = ( - parselmouth.Sound(x, self.sampling_rate) + parselmouth.Sound(x, self.sample_rate) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, @@ -74,9 +74,9 @@ def compute_f0_uv(self, wav, p_len=None): p_len = x.shape[0] // self.hop_length else: assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 + time_step = self.hop_length / self.sample_rate * 1000 f0 = ( - parselmouth.Sound(x, self.sampling_rate) + parselmouth.Sound(x, self.sample_rate) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, diff --git a/rvc/lib/predictor/Dio.py b/rvc/lib/predictor/Dio.py index 0ac3b0b..a69bb47 100644 --- a/rvc/lib/predictor/Dio.py +++ b/rvc/lib/predictor/Dio.py @@ -4,11 +4,11 @@ class DioF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -60,12 +60,12 @@ def compute_f0(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.dio( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return self.interpolate_f0(self.resize_f0(f0, p_len))[0] @@ -75,12 +75,12 @@ def compute_f0_uv(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.dio( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/lib/predictor/FCPE.py b/rvc/lib/predictor/FCPE.py index 08541bb..02c7519 100644 --- a/rvc/lib/predictor/FCPE.py +++ b/rvc/lib/predictor/FCPE.py @@ -22,13 +22,13 @@ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): - sampling_rate = None + sample_rate = None try: - data, sampling_rate = sf.read(full_path, always_2d=True) # than soundfile. + data, sample_rate = sf.read(full_path, always_2d=True) # than soundfile. 
except Exception as error: print(f"'{full_path}' failed to load with {error}") if return_empty_on_exception: - return [], sampling_rate or target_sr or 48000 + return [], sample_rate or target_sr or 48000 else: raise Exception(error) @@ -55,16 +55,16 @@ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False if ( torch.isinf(data) | torch.isnan(data) ).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except - return [], sampling_rate or target_sr or 48000 - if target_sr is not None and sampling_rate != target_sr: + return [], sample_rate or target_sr or 48000 + if target_sr is not None and sample_rate != target_sr: data = torch.from_numpy( librosa.core.resample( - data.numpy(), orig_sr=sampling_rate, target_sr=target_sr + data.numpy(), orig_sr=sample_rate, target_sr=target_sr ) ) - sampling_rate = target_sr + sample_rate = target_sr - return data, sampling_rate + return data, sample_rate def dynamic_range_compression(x, C=1, clip_val=1e-5): @@ -108,7 +108,7 @@ def __init__( self.hann_window = {} def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - sampling_rate = self.target_sr + sample_rate = self.target_sr n_mels = self.n_mels n_fft = self.n_fft win_size = self.win_size @@ -131,7 +131,7 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): mel_basis_key = str(fmax) + "_" + str(y.device) if mel_basis_key not in mel_basis: mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax + sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax ) mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) @@ -842,14 +842,14 @@ class Wav2Mel: def __init__(self, args, device=None, dtype=torch.float32): # self.args = args - self.sampling_rate = args.mel.sampling_rate + self.sample_rate = args.mel.sample_rate self.hop_size = args.mel.hop_size if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" self.device = device self.dtype = dtype self.stft = STFT( - args.mel.sampling_rate, + args.mel.sample_rate, args.mel.num_mels, args.mel.n_fft, args.mel.win_size, @@ -868,13 +868,13 @@ def extract_nvstft(self, audio, keyshift=0, train=False): def extract_mel(self, audio, sample_rate, keyshift=0, train=False): audio = audio.to(self.dtype).to(self.device) # resample - if sample_rate == self.sampling_rate: + if sample_rate == self.sample_rate: audio_res = audio else: key_str = str(sample_rate) if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample( - sample_rate, self.sampling_rate, lowpass_filter_width=128 + sample_rate, self.sample_rate, lowpass_filter_width=128 ) self.resample_kernel[key_str] = ( self.resample_kernel[key_str].to(self.dtype).to(self.device) @@ -932,7 +932,7 @@ def __init__( f0_max=1100, dtype=torch.float32, device=None, - sampling_rate=44100, + sample_rate=44100, threshold=0.05, ): self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) @@ -944,7 +944,7 @@ def __init__( else: self.device = device self.threshold = threshold - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate self.dtype = dtype self.name = "fcpe" @@ -977,7 +977,7 @@ def repeat_expand( elif ndim == 2: return results[0] - def post_process(self, x, sampling_rate, f0, pad_to): + def post_process(self, x, sample_rate, f0, pad_to): if isinstance(f0, np.ndarray): f0 = torch.from_numpy(f0).float().to(x.device) @@ -993,8 +993,8 @@ def post_process(self, x, 
sampling_rate, f0, pad_to): # 去掉0频率, 并线性插值 nzindex = torch.nonzero(f0).squeeze() f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() - time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() - time_frame = np.arange(pad_to) * self.hop_length / sampling_rate + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] @@ -1019,18 +1019,18 @@ def compute_f0(self, wav, p_len=None): if p_len is None: print("fcpe p_len is None") p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0] + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] if torch.all(f0 == 0): rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len)[0] + return self.post_process(x, self.sample_rate, f0, p_len)[0] def compute_f0_uv(self, wav, p_len=None): x = torch.FloatTensor(wav).to(self.dtype).to(self.device) if p_len is None: p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0] + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] if torch.all(f0 == 0): rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len) + return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/rvc/lib/predictor/Harvest.py b/rvc/lib/predictor/Harvest.py index dba3f9e..42044a2 100644 --- a/rvc/lib/predictor/Harvest.py +++ b/rvc/lib/predictor/Harvest.py @@ -4,11 +4,11 @@ class HarvestF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -60,10 +60,10 @@ def compute_f0(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) return self.interpolate_f0(self.resize_f0(f0, p_len))[0] @@ -73,10 +73,10 @@ def compute_f0_uv(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/lib/predictor/PM.py b/rvc/lib/predictor/PM.py index f8446ab..30cde5a 100644 --- a/rvc/lib/predictor/PM.py +++ b/rvc/lib/predictor/PM.py @@ -4,11 +4,11 @@ class PMF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): 
self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -50,9 +50,9 @@ def compute_f0(self, wav, p_len=None): p_len = x.shape[0] // self.hop_length else: assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 + time_step = self.hop_length / self.sample_rate * 1000 f0 = ( - parselmouth.Sound(x, self.sampling_rate) + parselmouth.Sound(x, self.sample_rate) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, @@ -74,9 +74,9 @@ def compute_f0_uv(self, wav, p_len=None): p_len = x.shape[0] // self.hop_length else: assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 + time_step = self.hop_length / self.sample_rate * 1000 f0 = ( - parselmouth.Sound(x, self.sampling_rate) + parselmouth.Sound(x, self.sample_rate) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, diff --git a/rvc/lib/predictor/RMVPE.py b/rvc/lib/predictor/RMVPE.py index 2f50d08..89b984b 100644 --- a/rvc/lib/predictor/RMVPE.py +++ b/rvc/lib/predictor/RMVPE.py @@ -258,7 +258,7 @@ def __init__( self, is_half, n_mel_channels, - sampling_rate, + sample_rate, win_length, hop_length, n_fft=None, @@ -270,7 +270,7 @@ def __init__( n_fft = win_length if n_fft is None else n_fft self.hann_window = {} mel_basis = mel( - sr=sampling_rate, + sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, @@ -282,7 +282,7 @@ def __init__( self.n_fft = win_length if n_fft is None else n_fft self.hop_length = hop_length self.win_length = win_length - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate self.n_mel_channels = n_mel_channels self.clamp = clamp self.is_half = is_half diff --git a/rvc/lib/predictors/Dio.py b/rvc/lib/predictors/Dio.py index 5f33c82..859ba5b 100644 --- a/rvc/lib/predictors/Dio.py +++ b/rvc/lib/predictors/Dio.py @@ -4,11 +4,11 @@ class DioF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -60,12 +60,12 @@ def compute_f0(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.dio( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return self.interpolate_f0(self.resize_f0(f0, p_len))[0] @@ -75,12 +75,12 @@ def compute_f0_uv(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.dio( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = 
pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/lib/predictors/FCPE.py b/rvc/lib/predictors/FCPE.py index 08541bb..02c7519 100644 --- a/rvc/lib/predictors/FCPE.py +++ b/rvc/lib/predictors/FCPE.py @@ -22,13 +22,13 @@ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): - sampling_rate = None + sample_rate = None try: - data, sampling_rate = sf.read(full_path, always_2d=True) # than soundfile. + data, sample_rate = sf.read(full_path, always_2d=True) # than soundfile. except Exception as error: print(f"'{full_path}' failed to load with {error}") if return_empty_on_exception: - return [], sampling_rate or target_sr or 48000 + return [], sample_rate or target_sr or 48000 else: raise Exception(error) @@ -55,16 +55,16 @@ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False if ( torch.isinf(data) | torch.isnan(data) ).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except - return [], sampling_rate or target_sr or 48000 - if target_sr is not None and sampling_rate != target_sr: + return [], sample_rate or target_sr or 48000 + if target_sr is not None and sample_rate != target_sr: data = torch.from_numpy( librosa.core.resample( - data.numpy(), orig_sr=sampling_rate, target_sr=target_sr + data.numpy(), orig_sr=sample_rate, target_sr=target_sr ) ) - sampling_rate = target_sr + sample_rate = target_sr - return data, sampling_rate + return data, sample_rate def dynamic_range_compression(x, C=1, clip_val=1e-5): @@ -108,7 +108,7 @@ def __init__( self.hann_window = {} def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - sampling_rate = self.target_sr + sample_rate = self.target_sr n_mels = self.n_mels n_fft = self.n_fft win_size = self.win_size @@ -131,7 +131,7 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): mel_basis_key = str(fmax) + "_" + str(y.device) if mel_basis_key not in mel_basis: mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax + sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax ) mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) @@ -842,14 +842,14 @@ class Wav2Mel: def __init__(self, args, device=None, dtype=torch.float32): # self.args = args - self.sampling_rate = args.mel.sampling_rate + self.sample_rate = args.mel.sample_rate self.hop_size = args.mel.hop_size if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" self.device = device self.dtype = dtype self.stft = STFT( - args.mel.sampling_rate, + args.mel.sample_rate, args.mel.num_mels, args.mel.n_fft, args.mel.win_size, @@ -868,13 +868,13 @@ def extract_nvstft(self, audio, keyshift=0, train=False): def extract_mel(self, audio, sample_rate, keyshift=0, train=False): audio = audio.to(self.dtype).to(self.device) # resample - if sample_rate == self.sampling_rate: + if sample_rate == self.sample_rate: audio_res = audio else: key_str = str(sample_rate) if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample( - sample_rate, self.sampling_rate, lowpass_filter_width=128 + sample_rate, self.sample_rate, lowpass_filter_width=128 ) self.resample_kernel[key_str] = ( self.resample_kernel[key_str].to(self.dtype).to(self.device) @@ -932,7 +932,7 @@ def __init__( f0_max=1100, dtype=torch.float32, 
device=None, - sampling_rate=44100, + sample_rate=44100, threshold=0.05, ): self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) @@ -944,7 +944,7 @@ def __init__( else: self.device = device self.threshold = threshold - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate self.dtype = dtype self.name = "fcpe" @@ -977,7 +977,7 @@ def repeat_expand( elif ndim == 2: return results[0] - def post_process(self, x, sampling_rate, f0, pad_to): + def post_process(self, x, sample_rate, f0, pad_to): if isinstance(f0, np.ndarray): f0 = torch.from_numpy(f0).float().to(x.device) @@ -993,8 +993,8 @@ def post_process(self, x, sampling_rate, f0, pad_to): # 去掉0频率, 并线性插值 nzindex = torch.nonzero(f0).squeeze() f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() - time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() - time_frame = np.arange(pad_to) * self.hop_length / sampling_rate + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] @@ -1019,18 +1019,18 @@ def compute_f0(self, wav, p_len=None): if p_len is None: print("fcpe p_len is None") p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0] + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] if torch.all(f0 == 0): rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len)[0] + return self.post_process(x, self.sample_rate, f0, p_len)[0] def compute_f0_uv(self, wav, p_len=None): x = torch.FloatTensor(wav).to(self.dtype).to(self.device) if p_len is None: p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0] + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] if torch.all(f0 == 0): rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len) + return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/rvc/lib/predictors/Harvest.py b/rvc/lib/predictors/Harvest.py index 1d82a13..f56d35e 100644 --- a/rvc/lib/predictors/Harvest.py +++ b/rvc/lib/predictors/Harvest.py @@ -4,11 +4,11 @@ class HarvestF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -60,10 +60,10 @@ def compute_f0(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, - frame_period=1000 * self.hop_length / self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) return self.interpolate_f0(self.resize_f0(f0, p_len))[0] @@ -73,10 +73,10 @@ def compute_f0_uv(self, wav, p_len=None): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.sampling_rate, + fs=self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / 
self.sampling_rate, + frame_period=1000 * self.hop_length / self.sample_rate, ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate) return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/lib/predictors/PM.py b/rvc/lib/predictors/PM.py index dda600c..0243784 100644 --- a/rvc/lib/predictors/PM.py +++ b/rvc/lib/predictors/PM.py @@ -4,11 +4,11 @@ class PMF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate def interpolate_f0(self, f0): data = np.reshape(f0, (f0.size, 1)) @@ -50,9 +50,9 @@ def compute_f0(self, wav, p_len=None): p_len = x.shape[0] // self.hop_length else: assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 + time_step = self.hop_length / self.sample_rate * 1000 f0 = ( - parselmouth.Sound(x, self.sampling_rate) + parselmouth.Sound(x, self.sample_rate) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, @@ -74,9 +74,9 @@ def compute_f0_uv(self, wav, p_len=None): p_len = x.shape[0] // self.hop_length else: assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 + time_step = self.hop_length / self.sample_rate * 1000 f0 = ( - parselmouth.Sound(x, self.sampling_rate) + parselmouth.Sound(x, self.sample_rate) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py index a3705bb..f4d9175 100644 --- a/rvc/lib/predictors/RMVPE.py +++ b/rvc/lib/predictors/RMVPE.py @@ -356,7 +356,7 @@ class MelSpectrogram(torch.nn.Module): Args: is_half (bool): Whether to use half-precision floating-point numbers. n_mel_channels (int): Number of Mel-frequency bands. - sampling_rate (int): Sampling rate of the audio. + sample_rate (int): Sampling rate of the audio. win_length (int): Length of the window function in samples. hop_length (int): Hop size between frames in samples. n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length. 
@@ -369,7 +369,7 @@ def __init__( self, is_half, n_mel_channels, - sampling_rate, + sample_rate, win_length, hop_length, n_fft=None, @@ -381,7 +381,7 @@ def __init__( n_fft = win_length if n_fft is None else n_fft self.hann_window = {} mel_basis = mel( - sr=sampling_rate, + sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, @@ -393,7 +393,7 @@ def __init__( self.n_fft = win_length if n_fft is None else n_fft self.hop_length = hop_length self.win_length = win_length - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate self.n_mel_channels = n_mel_channels self.clamp = clamp self.is_half = is_half diff --git a/rvc/lib/rmvpe.py b/rvc/lib/rmvpe.py index 2f50d08..89b984b 100644 --- a/rvc/lib/rmvpe.py +++ b/rvc/lib/rmvpe.py @@ -258,7 +258,7 @@ def __init__( self, is_half, n_mel_channels, - sampling_rate, + sample_rate, win_length, hop_length, n_fft=None, @@ -270,7 +270,7 @@ def __init__( n_fft = win_length if n_fft is None else n_fft self.hann_window = {} mel_basis = mel( - sr=sampling_rate, + sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, @@ -282,7 +282,7 @@ def __init__( self.n_fft = win_length if n_fft is None else n_fft self.hop_length = hop_length self.win_length = win_length - self.sampling_rate = sampling_rate + self.sample_rate = sample_rate self.n_mel_channels = n_mel_channels self.clamp = clamp self.is_half = is_half diff --git a/rvc/lib/tools/prerequisites_download.py b/rvc/lib/tools/prerequisites_download.py index c1b4cba..fd99dab 100644 --- a/rvc/lib/tools/prerequisites_download.py +++ b/rvc/lib/tools/prerequisites_download.py @@ -120,20 +120,24 @@ def download_mapping_files(list): def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe): - if models == "True": + if models == True: download_mapping_files(models_list) download_mapping_files(embedders_list) - if exe == "True" and os.name == "nt": - download_files(executables_list) + if exe == True: + if os.name == "nt": + download_files(executables_list) + else: + print("Executable files are only available for Windows") - if pretraineds_v1 == "True": + if pretraineds_v1 == True: download_mapping_files(pretraineds_v1_list) - if pretraineds_v2 == "True": + if pretraineds_v2 == True: download_mapping_files(pretraineds_v2_list) - clear_console() # Clear the console after all downloads are completed + # Clear the console after all downloads are completed + clear_console() def clear_console(): diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index c0e7043..e982fac 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -1,61 +1,61 @@ def pretrained_selector(pitch_guidance): - if pitch_guidance: + if pitch_guidance == True: return { "v1": { - "32000": ( + 32000: ( "rvc/models/pretraineds/pretrained_v1/f0G32k.pth", "rvc/models/pretraineds/pretrained_v1/f0D32k.pth", ), - "40000": ( + 40000: ( "rvc/models/pretraineds/pretrained_v1/f0G40k.pth", "rvc/models/pretraineds/pretrained_v1/f0D40k.pth", ), - "48000": ( + 48000: ( "rvc/models/pretraineds/pretrained_v1/f0G48k.pth", "rvc/models/pretraineds/pretrained_v1/f0D48k.pth", ), }, "v2": { - "32000": ( + 32000: ( "rvc/models/pretraineds/pretrained_v2/f0G32k.pth", "rvc/models/pretraineds/pretrained_v2/f0D32k.pth", ), - "40000": ( + 40000: ( "rvc/models/pretraineds/pretrained_v2/f0G40k.pth", "rvc/models/pretraineds/pretrained_v2/f0D40k.pth", ), - "48000": ( + 48000: ( "rvc/models/pretraineds/pretrained_v2/f0G48k.pth", 
"rvc/models/pretraineds/pretrained_v2/f0D48k.pth", ), }, } - else: + elif pitch_guidance == False: return { "v1": { - "32000": ( + 32000: ( "rvc/models/pretraineds/pretrained_v1/G32k.pth", "rvc/models/pretraineds/pretrained_v1/D32k.pth", ), - "40000": ( + 40000: ( "rvc/models/pretraineds/pretrained_v1/G40k.pth", "rvc/models/pretraineds/pretrained_v1/D40k.pth", ), - "48000": ( + 48000: ( "rvc/models/pretraineds/pretrained_v1/G48k.pth", "rvc/models/pretraineds/pretrained_v1/D48k.pth", ), }, "v2": { - "32000": ( + 32000: ( "rvc/models/pretraineds/pretrained_v2/G32k.pth", "rvc/models/pretraineds/pretrained_v2/D32k.pth", ), - "40000": ( + 40000: ( "rvc/models/pretraineds/pretrained_v2/G40k.pth", "rvc/models/pretraineds/pretrained_v2/D40k.pth", ), - "48000": ( + 48000: ( "rvc/models/pretraineds/pretrained_v2/G48k.pth", "rvc/models/pretraineds/pretrained_v2/D48k.pth", ), diff --git a/rvc/lib/tools/tts.py b/rvc/lib/tools/tts.py index 7d8afab..a9994db 100644 --- a/rvc/lib/tools/tts.py +++ b/rvc/lib/tools/tts.py @@ -4,10 +4,11 @@ async def main(): - text = sys.argv[1] - voice = sys.argv[2] + # Parse command line arguments + text = str(sys.argv[1]) + voice = str(sys.argv[2]) rate = int(sys.argv[3]) - output_file = sys.argv[4] + output_file = str(sys.argv[4]) rates = f"+{rate}%" if rate >= 0 else f"{rate}%" diff --git a/rvc/lib/utils.py b/rvc/lib/utils.py index e95f068..9c89c42 100644 --- a/rvc/lib/utils.py +++ b/rvc/lib/utils.py @@ -14,12 +14,12 @@ sys.path.append(now_dir) -def load_audio(file, sampling_rate): +def load_audio(file, sample_rate): try: file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") out, _ = ( ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sampling_rate) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sample_rate) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) except Exception as error: @@ -39,7 +39,6 @@ def format_title(title): def load_embedding(embedder_model, custom_embedder=None): - embedder_root = os.path.join(now_dir, "rvc", "models", "embedders") embedding_list = { "contentvec": os.path.join(embedder_root, "contentvec_base.pt"), diff --git a/rvc/train/data_utils.py b/rvc/train/data_utils.py index 7830815..8ccb3bb 100644 --- a/rvc/train/data_utils.py +++ b/rvc/train/data_utils.py @@ -18,11 +18,11 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): def __init__(self, hparams): self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files) self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate + self.sample_rate = hparams.sample_rate self.filter_length = hparams.filter_length self.hop_length = hparams.hop_length self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate + self.sample_rate = hparams.sample_rate self.min_text_len = getattr(hparams, "min_text_len", 1) self.max_text_len = getattr(hparams, "max_text_len", 5000) self._filter() @@ -115,11 +115,11 @@ def get_audio(self, filename): Args: filename (str): Path to audio file. 
""" - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: + audio, sample_rate = load_wav_to_torch(filename) + if sample_rate != self.sample_rate: raise ValueError( "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate + sample_rate, self.sample_rate ) ) audio_norm = audio @@ -256,11 +256,11 @@ class TextAudioLoader(torch.utils.data.Dataset): def __init__(self, hparams): self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files) self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate + self.sample_rate = hparams.sample_rate self.filter_length = hparams.filter_length self.hop_length = hparams.hop_length self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate + self.sample_rate = hparams.sample_rate self.min_text_len = getattr(hparams, "min_text_len", 1) self.max_text_len = getattr(hparams, "max_text_len", 5000) self._filter() @@ -341,11 +341,11 @@ def get_audio(self, filename): Args: filename (str): Path to audio file. """ - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: + audio, sample_rate = load_wav_to_torch(filename) + if sample_rate != self.sample_rate: raise ValueError( "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate + sample_rate, self.sample_rate ) ) audio_norm = audio diff --git a/rvc/train/extract/extract_f0_print.py b/rvc/train/extract/extract_f0_print.py index b2a8544..ccaaa87 100644 --- a/rvc/train/extract/extract_f0_print.py +++ b/rvc/train/extract/extract_f0_print.py @@ -14,8 +14,8 @@ from rvc.lib.predictors.RMVPE import RMVPE0Predictor # Parse command line arguments -exp_dir = sys.argv[1] -f0_method = sys.argv[2] +exp_dir = str(sys.argv[1]) +f0_method = str(sys.argv[2]) hop_length = int(sys.argv[3]) num_processes = int(sys.argv[4]) diff --git a/rvc/train/extract/extract_feature_print.py b/rvc/train/extract/extract_feature_print.py index 2e8b833..45fceb7 100644 --- a/rvc/train/extract/extract_feature_print.py +++ b/rvc/train/extract/extract_feature_print.py @@ -11,20 +11,22 @@ sys.path.append(now_dir) from rvc.lib.utils import load_embedding -device = sys.argv[1] +# Parse command line arguments +device = str(sys.argv[1]) n_parts = int(sys.argv[2]) i_part = int(sys.argv[3]) -i_gpu = sys.argv[4] -exp_dir = sys.argv[5] -os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) -version = sys.argv[6] +i_gpu = int(sys.argv[4]) +exp_dir = str(sys.argv[5]) +version = str(sys.argv[6]) is_half = bool(sys.argv[7]) -embedder_model = sys.argv[8] +embedder_model = str(sys.argv[8]) + try: - embedder_model_custom = sys.argv[9] + embedder_model_custom = str(sys.argv[9]) except: embedder_model_custom = None +os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) wav_path = f"{exp_dir}/sliced_audios_16k" out_path = f"{exp_dir}/v1_extracted" if version == "v1" else f"{exp_dir}/v2_extracted" diff --git a/rvc/train/extract/preparing_files.py b/rvc/train/extract/preparing_files.py index 9b5d336..a73bf77 100644 --- a/rvc/train/extract/preparing_files.py +++ b/rvc/train/extract/preparing_files.py @@ -8,21 +8,21 @@ current_directory = os.getcwd() -def generate_config(rvc_version, sampling_rate, model_path): - config_path = os.path.join("rvc", "configs", rvc_version, f"{sampling_rate}.json") +def generate_config(rvc_version: str, sample_rate: int, model_path: str): + config_path = os.path.join("rvc", "configs", rvc_version, f"{sample_rate}.json") config_save_path = os.path.join(model_path, "config.json") 
if not os.path.exists(config_save_path): shutil.copyfile(config_path, config_save_path) -def generate_filelist(pitch_guidance, model_path, rvc_version, sampling_rate): +def generate_filelist(pitch_guidance: bool, model_path: str, rvc_version: str, sample_rate: int): gt_wavs_dir = f"{model_path}/sliced_audios" feature_dir = ( f"{model_path}/v1_extracted" if rvc_version == "v1" else f"{model_path}/v2_extracted" ) - if pitch_guidance == 1: + if pitch_guidance == True: f0_dir = f"{model_path}/f0" f0nsf_dir = f"{model_path}/f0_voiced" names = ( @@ -31,7 +31,7 @@ def generate_filelist(pitch_guidance, model_path, rvc_version, sampling_rate): & set([name.split(".")[0] for name in os.listdir(f0_dir)]) & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) ) - else: + elif pitch_guidance == False: names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( [name.split(".")[0] for name in os.listdir(feature_dir)] ) @@ -46,12 +46,12 @@ def generate_filelist(pitch_guidance, model_path, rvc_version, sampling_rate): if pitch_guidance == 1: for _ in range(2): options.append( - f"{current_directory}/logs/mute/sliced_audios/mute{sampling_rate}.wav|{current_directory}/logs/mute/{rvc_version}_extracted/mute.npy|{current_directory}/logs/mute/f0/mute.wav.npy|{current_directory}/logs/mute/f0_voiced/mute.wav.npy|0" + f"{current_directory}/logs/mute/sliced_audios/mute{sample_rate}.wav|{current_directory}/logs/mute/{rvc_version}_extracted/mute.npy|{current_directory}/logs/mute/f0/mute.wav.npy|{current_directory}/logs/mute/f0_voiced/mute.wav.npy|0" ) else: for _ in range(2): options.append( - f"{current_directory}/logs/mute/sliced_audios/mute{sampling_rate}.wav|{current_directory}/logs/mute/{rvc_version}_extracted/mute.npy|0" + f"{current_directory}/logs/mute/sliced_audios/mute{sample_rate}.wav|{current_directory}/logs/mute/{rvc_version}_extracted/mute.npy|0" ) shuffle(options) with open(f"{model_path}/filelist.txt", "w") as f: diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index 6f7af3c..ecad648 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -94,7 +94,7 @@ def spectrogram_torch(y, n_fft, hop_size, win_size, center=False): return spec -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): +def spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax): """ Convert a spectrogram to a mel-spectrogram. @@ -102,7 +102,7 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): spec (torch.Tensor): Magnitude spectrogram. n_fft (int): FFT window size. num_mels (int): Number of mel frequency bins. - sampling_rate (int): Sampling rate of the audio signal. + sample_rate (int): Sampling rate of the audio signal. fmin (float): Minimum frequency. fmax (float): Maximum frequency. 
""" @@ -111,7 +111,7 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): fmax_dtype_device = str(fmax) + "_" + dtype_device if fmax_dtype_device not in mel_basis: mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax ) mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( dtype=spec.dtype, device=spec.device @@ -123,7 +123,7 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): def mel_spectrogram_torch( - y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False + y, n_fft, num_mels, sample_rate, hop_size, win_size, fmin, fmax, center=False ): """ Compute the mel-spectrogram of a signal. @@ -132,7 +132,7 @@ def mel_spectrogram_torch( y (torch.Tensor): Input signal. n_fft (int): FFT window size. num_mels (int): Number of mel frequency bins. - sampling_rate (int): Sampling rate of the audio signal. + sample_rate (int): Sampling rate of the audio signal. hop_size (int): Hop size between frames. win_size (int): Window size. fmin (float): Minimum frequency. @@ -141,6 +141,6 @@ def mel_spectrogram_torch( """ spec = spectrogram_torch(y, n_fft, hop_size, win_size, center) - melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax) return melspec diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py index 0857645..6820459 100644 --- a/rvc/train/preprocess/preprocess.py +++ b/rvc/train/preprocess/preprocess.py @@ -15,10 +15,10 @@ from rvc.lib.utils import load_audio from rvc.train.slicer import Slicer -# Load command line arguments -experiment_directory = sys.argv[1] -input_root = sys.argv[2] -sampling_rate = int(sys.argv[3]) +# Parse command line arguments +experiment_directory = str(sys.argv[1]) +input_root = str(sys.argv[2]) +sample_rate = int(sys.argv[3]) percentage = float(sys.argv[4]) num_processes = int(sys.argv[5]) if len(sys.argv) > 5 else cpu_count() @@ -135,5 +135,5 @@ def preprocess_training_set( if __name__ == "__main__": preprocess_training_set( - input_root, sampling_rate, num_processes, experiment_directory, percentage + input_root, sample_rate, num_processes, experiment_directory, percentage ) diff --git a/rvc/train/process/extract_index.py b/rvc/train/process/extract_index.py index e118259..a57a620 100644 --- a/rvc/train/process/extract_index.py +++ b/rvc/train/process/extract_index.py @@ -5,8 +5,9 @@ from sklearn.cluster import MiniBatchKMeans from multiprocessing import cpu_count -exp_dir = sys.argv[1] -version = sys.argv[2] +# Parse command line arguments +exp_dir = str(sys.argv[1]) +version = str(sys.argv[2]) try: feature_dir = os.path.join(exp_dir, f"{version}_extracted") diff --git a/rvc/train/process/extract_model.py b/rvc/train/process/extract_model.py index 8ff6714..6413868 100644 --- a/rvc/train/process/extract_model.py +++ b/rvc/train/process/extract_model.py @@ -18,7 +18,7 @@ def replace_keys_in_dict(d, old_key_part, new_key_part): return updated_dict -def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, step, version, hps): +def extract_model(ckpt, sr, pitch_guidance, name, model_dir, epoch, step, version, hps): try: print(f"Saved model '{model_dir}' (epoch {epoch} and step {step})") pth_file = f"{name}_{epoch}e_{step}s.pth" @@ -48,13 +48,13 @@ def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, step, version, hps): hps.model.upsample_kernel_sizes, 
hps.model.spk_embed_dim, hps.model.gin_channels, - hps.data.sampling_rate, + hps.data.sample_rate, ] opt["epoch"] = epoch opt["step"] = step opt["sr"] = sr - opt["f0"] = if_f0 + opt["f0"] = pitch_guidance opt["version"] = version opt["creation_date"] = datetime.datetime.now().isoformat() diff --git a/rvc/train/process/extract_small_model.py b/rvc/train/process/extract_small_model.py index cd755b9..79aa936 100644 --- a/rvc/train/process/extract_small_model.py +++ b/rvc/train/process/extract_small_model.py @@ -21,7 +21,7 @@ def replace_keys_in_dict(d, old_key_part, new_key_part): return updated_dict -def extract_small_model(path, name, sr, if_f0, version, epoch, step): +def extract_small_model(path: str, name: str, sr: int, pitch_guidance: bool, version: str, epoch: int, step: int): try: ckpt = torch.load(path, map_location="cpu") pth_file = f"{name}.pth" @@ -150,7 +150,7 @@ def extract_small_model(path, name, sr, if_f0, version, epoch, step): opt["epoch"] = epoch opt["step"] = step opt["sr"] = sr - opt["f0"] = int(if_f0) + opt["f0"] = int(pitch_guidance) opt["version"] = version opt["creation_date"] = datetime.datetime.now().isoformat() diff --git a/rvc/train/train.py b/rvc/train/train.py index 10c84db..938d695 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -125,12 +125,13 @@ def start(): print("GPU not detected, reverting to CPU (not recommended)") n_gpus = 1 - if hps.sync_graph == 1: + print(f"Value of sg {hps.sync_graph}") + if hps.sync_graph == True: print( "Sync graph is now activated! With sync graph enabled, the model undergoes a single epoch of training. Once the graphs are synchronized, training proceeds for the previously specified number of epochs." ) hps.custom_total_epoch = 1 - hps.custom_save_every_weights = "1" + hps.custom_save_every_weights = True start() # Synchronize graphs by modifying config files @@ -237,10 +238,12 @@ def run( torch.cuda.set_device(rank) # Create datasets and dataloaders - if hps.if_f0 == 1: + if hps.pitch_guidance == True: train_dataset = TextAudioLoaderMultiNSFsid(hps.data) - else: + elif hps.pitch_guidance == False: train_dataset = TextAudioLoader(hps.data) + else: + raise ValueError(f"Unexpected value for hps.pitch_guidance: {hps.pitch_guidance}") train_sampler = DistributedBucketSampler( train_dataset, @@ -251,10 +254,11 @@ def run( shuffle=True, ) - if hps.if_f0 == 1: + if hps.pitch_guidance == True: collate_fn = TextAudioCollateMultiNSFsid() - else: + elif hps.pitch_guidance == False: collate_fn = TextAudioCollate() + train_loader = DataLoader( train_dataset, num_workers=4, @@ -271,7 +275,7 @@ def run( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model, - use_f0=hps.if_f0 == 1, + use_f0=hps.pitch_guidance == True, is_half=hps.train.fp16_run, sr=hps.sample_rate, ) @@ -422,7 +426,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, data_iterator = cache if cache == []: for batch_idx, info in enumerate(train_loader): - if hps.if_f0 == 1: + if hps.pitch_guidance == True: ( phone, phone_lengths, @@ -434,7 +438,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, wave_lengths, sid, ) = info - else: + elif hps.pitch_guidance == False: ( phone, phone_lengths, @@ -447,7 +451,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, if torch.cuda.is_available(): phone = phone.cuda(rank, non_blocking=True) phone_lengths = phone_lengths.cuda(rank, non_blocking=True) - if hps.if_f0 == 1: + if hps.pitch_guidance 
== True: pitch = pitch.cuda(rank, non_blocking=True) pitchf = pitchf.cuda(rank, non_blocking=True) sid = sid.cuda(rank, non_blocking=True) @@ -455,7 +459,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, spec_lengths = spec_lengths.cuda(rank, non_blocking=True) wave = wave.cuda(rank, non_blocking=True) wave_lengths = wave_lengths.cuda(rank, non_blocking=True) - if hps.if_f0 == 1: + if hps.pitch_guidance == True: cache.append( ( batch_idx, @@ -472,7 +476,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, ), ) ) - else: + elif hps.pitch_guidance == False: cache.append( ( batch_idx, @@ -495,7 +499,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, epoch_recorder = EpochRecorder() with tqdm(total=len(train_loader), leave=False) as pbar: for batch_idx, info in data_iterator: - if hps.if_f0 == 1: + if hps.pitch_guidance == True: ( phone, phone_lengths, @@ -507,12 +511,12 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, wave_lengths, sid, ) = info - else: + elif hps.pitch_guidance == False: phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available(): phone = phone.cuda(rank, non_blocking=True) phone_lengths = phone_lengths.cuda(rank, non_blocking=True) - if hps.if_f0 == 1: + if hps.pitch_guidance == True: pitch = pitch.cuda(rank, non_blocking=True) pitchf = pitchf.cuda(rank, non_blocking=True) sid = sid.cuda(rank, non_blocking=True) @@ -522,7 +526,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, # Forward pass with autocast(enabled=hps.train.fp16_run): - if hps.if_f0 == 1: + if hps.pitch_guidance == True: ( y_hat, ids_slice, @@ -532,7 +536,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, ) = net_g( phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid ) - else: + elif hps.pitch_guidance == False: ( y_hat, ids_slice, @@ -544,7 +548,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, spec, hps.data.filter_length, hps.data.n_mel_channels, - hps.data.sampling_rate, + hps.data.sample_rate, hps.data.mel_fmin, hps.data.mel_fmax, ) @@ -556,7 +560,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, y_hat.float().squeeze(1), hps.data.filter_length, hps.data.n_mel_channels, - hps.data.sampling_rate, + hps.data.sample_rate, hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, @@ -669,9 +673,9 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, pbar.update(1) # Save checkpoint - if epoch % hps.save_every_epoch == 0 and rank == 0: + if epoch % hps.save_every_epoch == False and rank == 0: checkpoint_suffix = "{}.pth".format( - global_step if hps.if_latest == 0 else 2333333 + global_step if hps.if_latest == False else 2333333 ) save_checkpoint( net_g, @@ -688,7 +692,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, os.path.join(hps.model_dir, "D_" + checkpoint_suffix), ) - if rank == 0 and hps.custom_save_every_weights == "1": + if rank == 0 and hps.custom_save_every_weights == True: if hasattr(net_g, "module"): ckpt = net_g.module.state_dict() else: @@ -696,7 +700,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, extract_model( ckpt, hps.sample_rate, - hps.if_f0 == 1, + hps.pitch_guidance == True, hps.name, os.path.join( hps.model_dir, 
"{}_{}e_{}s.pth".format(hps.name, epoch, global_step) @@ -708,7 +712,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, ) # Overtraining detection and best model saving - if hps.overtraining_detector == 1: + if hps.overtraining_detector == True: if epoch >= (lowest_value["epoch"] + hps.overtraining_threshold): print( "Stopping training due to possible overtraining. Lowest generator loss: {} at epoch {}, step {}".format( @@ -737,7 +741,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, extract_model( ckpt, hps.sample_rate, - hps.if_f0 == 1, + hps.pitch_guidance == True, hps.name, os.path.join( hps.model_dir, @@ -756,11 +760,11 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, lowest_value_rounded, 3 ) # Round to 3 decimal place - if epoch > 1 and hps.overtraining_detector == 1: + if epoch > 1 and hps.overtraining_detector == True: print( f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | lowest_value={lowest_value_rounded} (epoch {lowest_value['epoch']} and step {lowest_value['step']}) | Number of epochs remaining for overtraining: {lowest_value['epoch'] + hps.overtraining_threshold - epoch}" ) - elif epoch > 1 and hps.overtraining_detector == 0: + elif epoch > 1 and hps.overtraining_detector == False: print( f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | lowest_value={lowest_value_rounded} (epoch {lowest_value['epoch']} and step {lowest_value['step']})" ) @@ -794,7 +798,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, extract_model( ckpt, hps.sample_rate, - hps.if_f0 == 1, + hps.pitch_guidance == True, hps.name, os.path.join( hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step) diff --git a/rvc/train/utils.py b/rvc/train/utils.py index 6e2f1fa..17defa0 100644 --- a/rvc/train/utils.py +++ b/rvc/train/utils.py @@ -151,7 +151,7 @@ def summarize( histograms={}, images={}, audios={}, - audio_sampling_rate=22050, + audio_sample_rate=22050, ): """ Summarizes training statistics and logs them to a TensorBoard writer. @@ -163,7 +163,7 @@ def summarize( histograms (dict, optional): Dictionary of histogram values to log. Defaults to {}. images (dict, optional): Dictionary of image values to log. Defaults to {}. audios (dict, optional): Dictionary of audio values to log. Defaults to {}. - audio_sampling_rate (int, optional): Sampling rate of the audio data. Defaults to 22050. + audio_sample_rate (int, optional): Sampling rate of the audio data. Defaults to 22050. """ for k, v in scalars.items(): writer.add_scalar(k, v, global_step) @@ -172,7 +172,7 @@ def summarize( for k, v in images.items(): writer.add_image(k, v, global_step, dataformats="HWC") for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) + writer.add_audio(k, v, global_step, audio_sample_rate) def latest_checkpoint_path(dir_path, regex="G_*.pth"): @@ -224,8 +224,8 @@ def load_wav_to_torch(full_path): Args: full_path (str): The path to the WAV file. 
""" - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate + sample_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sample_rate def load_filepaths_and_text(filename, split="|"): @@ -247,83 +247,84 @@ def get_hparams(): """ parser = argparse.ArgumentParser() parser.add_argument( - "-se", "--save_every_epoch", - type=int, - required=True, - help="checkpoint save frequency (epoch)", + type=str, + help="Frequency (in epochs) at which checkpoints are saved.", ) parser.add_argument( - "-te", "--total_epoch", type=int, required=True, help="total_epoch" + "--total_epoch", + type=str, + help="Total number of training epochs.", ) parser.add_argument( - "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path" + "--pretrainG", + type=str, + help="Path to the pretrained Generator model.", ) parser.add_argument( - "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path" + "--pretrainD", + type=str, + help="Path to the pretrained Discriminator model.", ) - parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") parser.add_argument( - "-bs", "--batch_size", type=int, required=True, help="batch size" + "--gpus", + type=str, + help="Hyphen-separated list of GPU device IDs to use (e.g., '0-1-2').", ) parser.add_argument( - "-e", "--experiment_dir", type=str, required=True, help="experiment dir" + "--batch_size", + type=str, + help="Batch size for training.", + ) + parser.add_argument( + "--experiment_dir", + type=str, + help="Directory to store experiment outputs.", ) parser.add_argument( - "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" + "--sample_rate", + type=str, + help="Sample rate to use.", ) parser.add_argument( - "-sw", "--save_every_weights", type=str, - default="0", - help="save the extracted model in weights directory when saving checkpoints", + help="Save the model weights in the weights directory when saving checkpoints.", ) parser.add_argument( - "-v", "--version", type=str, required=True, help="model version" + "--version", + type=str, + help="Model version identifier.", ) parser.add_argument( - "-f0", - "--if_f0", - type=int, - required=True, - help="use f0 as one of the inputs of the model, 1 or 0", + "--pitch_guidance", + type=str, + help="Use pitch (f0) as one of the inputs to the model (True or False).", ) parser.add_argument( - "-l", "--if_latest", - type=int, - required=True, - help="if only save the latest G/D pth file, 1 or 0", + type=str, + help="Only save the latest Generator/Discriminator model files (True or False).", ) parser.add_argument( - "-c", "--if_cache_data_in_gpu", - type=int, - required=True, - help="if caching the dataset in GPU memory, 1 or 0", + type=str, + help="Cache the dataset in GPU memory (True or False).", ) - parser.add_argument( - "-od", "--overtraining_detector", - type=int, - required=True, - help="Detect overtraining or not, 1 or 0", + type=str, + help="Detect overtraining (True or False).", ) parser.add_argument( - "-ot", "--overtraining_threshold", - type=int, - default=50, - help="overtraining_threshold", + type=str, + help="Threshold for overtraining detection.", ) parser.add_argument( - "-sg", - "--sync-graph", - type=int, - required=True, - help="Sync graph or not, 1 or 0", + "--sync_graph", + type=str, + help="Synchronize graph (True or False).", ) args = parser.parse_args() @@ -334,23 +335,24 @@ def get_hparams(): config = json.load(f) hparams = HParams(**config) 
hparams.model_dir = hparams.experiment_dir = experiment_dir - hparams.save_every_epoch = args.save_every_epoch + hparams.save_every_epoch = int(args.save_every_epoch) hparams.name = name - hparams.total_epoch = args.total_epoch + hparams.total_epoch = int(args.total_epoch) hparams.pretrainG = args.pretrainG hparams.pretrainD = args.pretrainD hparams.version = args.version hparams.gpus = args.gpus - hparams.batch_size = args.batch_size - hparams.sample_rate = args.sample_rate - hparams.if_f0 = args.if_f0 - hparams.if_latest = args.if_latest - hparams.save_every_weights = args.save_every_weights - hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu + hparams.batch_size = int(args.batch_size) + hparams.sample_rate = int(args.sample_rate) + hparams.pitch_guidance = args.pitch_guidance + hparams.if_latest = bool(args.if_latest) + hparams.save_every_weights = bool(args.save_every_weights) + hparams.if_cache_data_in_gpu = bool(args.if_cache_data_in_gpu) hparams.data.training_files = f"{experiment_dir}/filelist.txt" - hparams.overtraining_detector = args.overtraining_detector - hparams.overtraining_threshold = args.overtraining_threshold + hparams.overtraining_detector = bool(args.overtraining_detector) + hparams.overtraining_threshold = int(args.overtraining_threshold) hparams.sync_graph = args.sync_graph + print(hparams) return hparams diff --git a/rvc_cli.py b/rvc_cli.py new file mode 100644 index 0000000..5895b77 --- /dev/null +++ b/rvc_cli.py @@ -0,0 +1,1540 @@ +import os +import sys +import json +import argparse +import subprocess +from functools import lru_cache + +now_dir = os.getcwd() +sys.path.append(now_dir) + +current_script_directory = os.path.dirname(os.path.realpath(__file__)) +logs_path = os.path.join(current_script_directory, "logs") + +from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline +from rvc.train.extract.preparing_files import generate_config, generate_filelist +from rvc.train.process.model_blender import model_blender +from rvc.train.process.model_information import model_information +from rvc.train.process.extract_small_model import extract_small_model +from rvc.lib.tools.analyzer import analyze_audio +from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline +from rvc.lib.tools.model_download import model_download_pipeline + +python = sys.executable + + +# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4 +@lru_cache(maxsize=1) # Cache only one result since the file is static +def load_voices_data(): + with open(os.path.join("rvc", "lib", "tools", "tts_voices.json")) as f: + return json.load(f) + + +voices_data = load_voices_data() +locales = list({voice["Locale"] for voice in voices_data}) + + +@lru_cache(maxsize=None) +def import_voice_converter(): + from rvc.infer.infer import VoiceConverter + + return VoiceConverter() + + +@lru_cache(maxsize=1) +def get_config(): + from rvc.configs.config import Config + + return Config() + + +# Infer +def run_infer_script( + pitch: int, + filter_radius: int, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + f0_method: str, + input_path: str, + output_path: str, + pth_path: str, + index_path: str, + split_audio: bool, + f0_autotune: bool, + clean_audio: bool, + clean_strength: float, + export_format: str, + upscale_audio: bool, + f0_file: str, + embedder_model: str, + embedder_model_custom: str = None, +): + infer_pipeline = import_voice_converter() + 
infer_pipeline.convert_audio( + pitch=pitch, + filter_radius=filter_radius, + index_rate=index_rate, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + f0_method=f0_method, + audio_input_path=input_path, + audio_output_path=output_path, + model_path=pth_path, + index_path=index_path, + split_audio=split_audio, + f0_autotune=f0_autotune, + clean_audio=clean_audio, + clean_strength=clean_strength, + export_format=export_format, + upscale_audio=upscale_audio, + f0_file=f0_file, + embedder_model=embedder_model, + embedder_model_custom=embedder_model_custom, + ) + return f"File {input_path} inferred successfully.", output_path.replace( + ".wav", f".{export_format.lower()}" + ) + + +# Batch infer +def run_batch_infer_script( + pitch: int, + filter_radius: int, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + f0_method: str, + input_folder: str, + output_folder: str, + pth_path: str, + index_path: str, + split_audio: bool, + f0_autotune: bool, + clean_audio: bool, + clean_strength: float, + export_format: str, + upscale_audio: bool, + f0_file: str, + embedder_model: str, + embedder_model_custom: str = None, +): + audio_files = [ + f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac")) + ] + print(f"Detected {len(audio_files)} audio files for inference.") + + for audio_file in audio_files: + if "_output" in audio_file: + pass + else: + input_path = os.path.join(input_folder, audio_file) + output_file_name = os.path.splitext(os.path.basename(audio_file))[0] + output_path = os.path.join( + output_folder, + f"{output_file_name}_output{os.path.splitext(audio_file)[1]}", + ) + infer_pipeline = import_voice_converter() + print(f"Inferring {input_path}...") + infer_pipeline.convert_audio( + pitch=pitch, + filter_radius=filter_radius, + index_rate=index_rate, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + f0_method=f0_method, + audio_input_path=input_path, + audio_output_path=output_path, + model_path=pth_path, + index_path=index_path, + split_audio=split_audio, + f0_autotune=f0_autotune, + clean_audio=clean_audio, + clean_strength=clean_strength, + export_format=export_format, + upscale_audio=upscale_audio, + f0_file=f0_file, + embedder_model=embedder_model, + embedder_model_custom=embedder_model_custom, + ) + + return f"Files from {input_folder} inferred successfully." 
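
For orientation, the two helpers above are what the `infer` and `batch_infer` subcommands (defined further down in `parse_arguments`) ultimately call. A minimal sketch of driving the single-file path directly from Python with the same defaults the CLI exposes; the model name and audio paths here are hypothetical placeholders, not part of the patch:

```python
# Illustrative sketch only: "my_model" and the audio paths are hypothetical.
from rvc_cli import run_infer_script

message, converted_path = run_infer_script(
    pitch=0,
    filter_radius=3,
    index_rate=0.3,
    volume_envelope=1,
    protect=0.33,
    hop_length=128,
    f0_method="rmvpe",
    input_path="input.wav",
    output_path="output.wav",
    pth_path="logs/my_model/my_model.pth",
    index_path="logs/my_model/my_model.index",
    split_audio=False,
    f0_autotune=False,
    clean_audio=False,
    clean_strength=0.7,
    export_format="WAV",
    upscale_audio=False,
    f0_file=None,
    embedder_model="contentvec",
)
print(message, converted_path)  # returns a status string and the converted file path
```
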
+ + +# TTS +def run_tts_script( + tts_text: str, + tts_voice: str, + tts_rate: int, + pitch: int, + filter_radius: int, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + f0_method: str, + output_tts_path: str, + output_rvc_path: str, + pth_path: str, + index_path: str, + split_audio: bool, + f0_autotune: bool, + clean_audio: bool, + clean_strength: float, + export_format: str, + upscale_audio: bool, + f0_file: str, + embedder_model: str, + embedder_model_custom: str = None, +): + + tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py") + + if os.path.exists(output_tts_path): + os.remove(output_tts_path) + + command_tts = [ + python, + tts_script_path, + tts_text, + tts_voice, + tts_rate, + output_tts_path, + ] + subprocess.run(command_tts) + infer_pipeline = import_voice_converter() + infer_pipeline.convert_audio( + pitch=pitch, + filter_radius=filter_radius, + index_rate=index_rate, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + f0_method=f0_method, + audio_input_path=output_tts_path, + audio_output_path=output_rvc_path, + model_path=pth_path, + index_path=index_path, + split_audio=split_audio, + f0_autotune=f0_autotune, + clean_audio=clean_audio, + clean_strength=clean_strength, + export_format=export_format, + upscale_audio=upscale_audio, + f0_file=f0_file, + embedder_model=embedder_model, + embedder_model_custom=embedder_model_custom, + ) + + return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace( + ".wav", f".{export_format.lower()}" + ) + + +# Preprocess +def run_preprocess_script( + model_name: str, dataset_path: str, sample_rate: int, cpu_cores: int +): + config = get_config() + per = 3.0 if config.is_half else 3.7 + preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py") + command = [ + python, + preprocess_script_path, + *map( + str, + [ + os.path.join(logs_path, model_name), + dataset_path, + sample_rate, + per, + cpu_cores, + ], + ), + ] + os.makedirs(os.path.join(logs_path, model_name), exist_ok=True) + subprocess.run(command) + return f"Model {model_name} preprocessed successfully." + + +# Extract +def run_extract_script( + model_name: str, + rvc_version: str, + f0_method: str, + pitch_guidance: bool, + hop_length: int, + cpu_cores: int, + sample_rate: int, + embedder_model: str, + embedder_model_custom: str = None, +): + config = get_config() + model_path = os.path.join(logs_path, model_name) + extract_f0_script_path = os.path.join( + "rvc", "train", "extract", "extract_f0_print.py" + ) + extract_feature_script_path = os.path.join( + "rvc", "train", "extract", "extract_feature_print.py" + ) + + command_1 = [ + python, + extract_f0_script_path, + *map( + str, + [ + model_path, + f0_method, + hop_length, + cpu_cores, + ], + ), + ] + + command_2 = [ + python, + extract_feature_script_path, + *map( + str, + [ + config.device, + 1, + 0, + 0, + model_path, + rvc_version, + config.is_half, + embedder_model, + embedder_model_custom, + ], + ), + ] + subprocess.run(command_1) + subprocess.run(command_2) + + generate_config(rvc_version, sample_rate, model_path) + generate_filelist(pitch_guidance, model_path, rvc_version, sample_rate) + return f"Model {model_name} extracted successfully." 
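
The preprocess and extract helpers above launch the training-preparation scripts as subprocesses, and extraction finishes by writing the experiment's config.json and filelist.txt through generate_config/generate_filelist. A hedged sketch of the intended order of operations; the experiment name and dataset path are hypothetical:

```python
# Hypothetical dataset preparation; "my_model" and the dataset path are placeholders.
from rvc_cli import run_preprocess_script, run_extract_script

run_preprocess_script(
    model_name="my_model",
    dataset_path="datasets/my_voice",
    sample_rate=48000,  # should match the sample rate used later for training
    cpu_cores=4,
)
run_extract_script(
    model_name="my_model",
    rvc_version="v2",
    f0_method="rmvpe",
    pitch_guidance=True,
    hop_length=128,
    cpu_cores=4,
    sample_rate=48000,
    embedder_model="contentvec",
)
```
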
+ + +# Train +def run_train_script( + model_name: str, + rvc_version: str, + save_every_epoch: int, + save_only_latest: bool, + save_every_weights: bool, + total_epoch: int, + sample_rate: int, + batch_size: int, + gpu: int, + pitch_guidance: bool, + overtraining_detector: bool, + overtraining_threshold: int, + pretrained: bool, + sync_graph: bool, + cache_data_in_gpu: bool, + custom_pretrained: bool = False, + g_pretrained_path: str = None, + d_pretrained_path: str = None, +): + + if pretrained == True: + from rvc.lib.tools.pretrained_selector import pretrained_selector + + if custom_pretrained == False: + pg, pd = pretrained_selector(pitch_guidance)[rvc_version][sample_rate] + else: + if g_pretrained_path is None or d_pretrained_path is None: + raise ValueError( + "Please provide the path to the pretrained G and D models." + ) + pg, pd = g_pretrained_path, d_pretrained_path + else: + pg, pd = "", "" + + train_script_path = os.path.join("rvc", "train", "train.py") + command = [ + python, + train_script_path, + *map( + str, + [ + "--sync_graph", + sync_graph, + "--save_every_epoch", + save_every_epoch, + "--total_epoch", + total_epoch, + "--pretrainG", + pg, + "--pretrainD", + pd, + "--sample_rate", + sample_rate, + "--batch_size", + batch_size, + "--gpus", + gpu, + "--experiment_dir", + os.path.join(logs_path, model_name), + "--version", + rvc_version, + "--if_latest", + save_only_latest, + "--if_cache_data_in_gpu", + cache_data_in_gpu, + "--save_every_weights", + save_every_weights, + "--pitch_guidance", + pitch_guidance, + "--overtraining_detector", + overtraining_detector, + "--overtraining_threshold", + overtraining_threshold, + ], + ), + ] + print(command) + subprocess.run(command) + run_index_script(model_name, rvc_version) + return f"Model {model_name} trained successfully." + + +# Index +def run_index_script(model_name: str, rvc_version: str): + index_script_path = os.path.join("rvc", "train", "process", "extract_index.py") + command = [ + python, + index_script_path, + os.path.join(logs_path, model_name), + rvc_version, + ] + + subprocess.run(command) + return f"Index file for {model_name} generated successfully." + + +# Model extract +def run_model_extract_script( + pth_path: str, + model_name: str, + sample_rate: int, + pitch_guidance: bool, + rvc_version: str, + epoch: int, + step: int, +): + extract_small_model( + pth_path, model_name, sample_rate, pitch_guidance, rvc_version, epoch, step + ) + return f"Model {model_name} extracted successfully." + + +# Model information +def run_model_information_script(pth_path: str): + print(model_information(pth_path)) + + +# Model blender +def run_model_blender_script( + model_name: str, pth_path_1: str, pth_path_2: str, ratio: float +): + message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio) + return message, model_blended + + +# Tensorboard +def run_tensorboard_script(): + launch_tensorboard_pipeline() + + +# Download +def run_download_script(model_link: str): + model_download_pipeline(model_link) + return f"Model downloaded successfully." + + +# Prerequisites +def run_prerequisites_script( + pretraineds_v1: bool, pretraineds_v2: bool, models: bool, exe: bool +): + prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe) + return "Prerequisites installed successfully." 
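
Worth noting: run_train_script above resolves the pretrained Generator/Discriminator paths via pretrained_selector, which this same patch changes to key on integer sample rates, so the sample_rate passed in has to stay an int (a string key such as "48000" would now raise a KeyError). A minimal sketch of that lookup:

```python
from rvc.lib.tools.pretrained_selector import pretrained_selector

# After this patch the inner keys are ints (48000), not strings ("48000").
pg, pd = pretrained_selector(True)["v2"][48000]
# pg == "rvc/models/pretraineds/pretrained_v2/f0G48k.pth"
# pd == "rvc/models/pretraineds/pretrained_v2/f0D48k.pth"
```
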
+ + +# Audio analyzer +def run_audio_analyzer_script( + input_path: str, save_plot_path: str = "logs/audio_analysis.png" +): + audio_info, plot_path = analyze_audio(input_path, save_plot_path) + print( + f"Audio info of {input_path}: {audio_info}", + f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}", + ) + return audio_info, plot_path + + +# API +def run_api_script(ip: str, port: int): + command = [ + "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn", + "api:app", + "--host", + ip, + "--port", + port, + ] + subprocess.run(command) + + +# Parse arguments +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Run the main.py script with specific parameters." + ) + subparsers = parser.add_subparsers( + title="subcommands", dest="mode", help="Choose a mode" + ) + + # Parser for 'infer' mode + infer_parser = subparsers.add_parser("infer", help="Run inference") + pitch_description = ( + "Set the pitch of the audio. Higher values result in a higher pitch." + ) + infer_parser.add_argument( + "--pitch", + type=int, + help=pitch_description, + choices=range(-24, 25), + default=0, + ) + filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio." + infer_parser.add_argument( + "--filter_radius", + type=int, + help=filter_radius_description, + choices=range(11), + default=3, + ) + index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning." + infer_parser.add_argument( + "--index_rate", + type=float, + help=index_rate_description, + choices=[(i / 10) for i in range(11)], + default=0.3, + ) + volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used." + infer_parser.add_argument( + "--volume_envelope", + type=float, + help=volume_envelope_description, + choices=[(i / 10) for i in range(11)], + default=1, + ) + protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect." + infer_parser.add_argument( + "--protect", + type=float, + help=protect_description, + choices=[(i / 10) for i in range(6)], + default=0.33, + ) + hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy." + infer_parser.add_argument( + "--hop_length", + type=int, + help=hop_length_description, + choices=range(1, 513), + default=128, + ) + f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended." 
+ infer_parser.add_argument( + "--f0_method", + type=str, + help=f0_method_description, + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + "fcpe", + "hybrid[crepe+rmvpe]", + "hybrid[crepe+fcpe]", + "hybrid[rmvpe+fcpe]", + "hybrid[crepe+rmvpe+fcpe]", + ], + default="rmvpe", + ) + infer_parser.add_argument( + "--input_path", + type=str, + help="Full path to the input audio file.", + required=True, + ) + infer_parser.add_argument( + "--output_path", + type=str, + help="Full path to the output audio file.", + required=True, + ) + pth_path_description = "Full path to the RVC model file (.pth)." + infer_parser.add_argument( + "--pth_path", type=str, help=pth_path_description, required=True + ) + index_path_description = "Full path to the index file (.index)." + infer_parser.add_argument( + "--index_path", type=str, help=index_path_description, required=True + ) + split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files." + infer_parser.add_argument( + "--split_audio", + type=bool, + choices=[True, False], + help=split_audio_description, + default=False, + ) + f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions." + infer_parser.add_argument( + "--f0_autotune", + type=bool, + choices=[True, False], + help=f0_autotune_description, + default=False, + ) + clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions." + infer_parser.add_argument( + "--clean_audio", + type=bool, + choices=[True, False], + help=clean_audio_description, + default=False, + ) + clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound." + infer_parser.add_argument( + "--clean_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=0.7, + ) + export_format_description = "Select the desired output audio format." + infer_parser.add_argument( + "--export_format", + type=str, + help=export_format_description, + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + default="WAV", + ) + embedder_model_description = ( + "Choose the model used for generating speaker embeddings." + ) + infer_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "japanese-hubert-base", + "chinese-hubert-large", + "custom", + ], + default="contentvec", + ) + embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'." + infer_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + upscale_audio_description = "Upscale the input audio to a higher quality before processing. This can improve the overall quality of the output, especially for low-quality input audio." + infer_parser.add_argument( + "--upscale_audio", + type=bool, + choices=[True, False], + help=upscale_audio_description, + default=False, + ) + f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio." 
+ infer_parser.add_argument( + "--f0_file", + type=str, + help=f0_file_description, + default=None, + ) + + # Parser for 'batch_infer' mode + batch_infer_parser = subparsers.add_parser( + "batch_infer", + help="Run batch inference", + ) + batch_infer_parser.add_argument( + "--pitch", + type=int, + help=pitch_description, + choices=range(-24, 25), + default=0, + ) + batch_infer_parser.add_argument( + "--filter_radius", + type=int, + help=filter_radius_description, + choices=range(11), + default=3, + ) + batch_infer_parser.add_argument( + "--index_rate", + type=float, + help=index_rate_description, + choices=[(i / 10) for i in range(11)], + default=0.3, + ) + batch_infer_parser.add_argument( + "--volume_envelope", + type=float, + help=volume_envelope_description, + choices=[(i / 10) for i in range(11)], + default=1, + ) + batch_infer_parser.add_argument( + "--protect", + type=float, + help=protect_description, + choices=[(i / 10) for i in range(6)], + default=0.33, + ) + batch_infer_parser.add_argument( + "--hop_length", + type=int, + help=hop_length_description, + choices=range(1, 513), + default=128, + ) + batch_infer_parser.add_argument( + "--f0_method", + type=str, + help=f0_method_description, + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + "fcpe", + "hybrid[crepe+rmvpe]", + "hybrid[crepe+fcpe]", + "hybrid[rmvpe+fcpe]", + "hybrid[crepe+rmvpe+fcpe]", + ], + default="rmvpe", + ) + batch_infer_parser.add_argument( + "--input_folder", + type=str, + help="Path to the folder containing input audio files.", + required=True, + ) + batch_infer_parser.add_argument( + "--output_folder", + type=str, + help="Path to the folder for saving output audio files.", + required=True, + ) + batch_infer_parser.add_argument( + "--pth_path", type=str, help=pth_path_description, required=True + ) + batch_infer_parser.add_argument( + "--index_path", type=str, help=index_path_description, required=True + ) + batch_infer_parser.add_argument( + "--split_audio", + type=bool, + choices=[True, False], + help=split_audio_description, + default=False, + ) + batch_infer_parser.add_argument( + "--f0_autotune", + type=bool, + choices=[True, False], + help=f0_autotune_description, + default=False, + ) + batch_infer_parser.add_argument( + "--clean_audio", + type=bool, + choices=[True, False], + help=clean_audio_description, + default=False, + ) + batch_infer_parser.add_argument( + "--clean_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=0.7, + ) + batch_infer_parser.add_argument( + "--export_format", + type=str, + help=export_format_description, + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + default="WAV", + ) + batch_infer_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "japanese-hubert-base", + "chinese-hubert-large", + "custom", + ], + default="contentvec", + ) + batch_infer_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + batch_infer_parser.add_argument( + "--upscale_audio", + type=bool, + choices=[True, False], + help=upscale_audio_description, + default=False, + ) + batch_infer_parser.add_argument( + "--f0_file", + type=str, + help=f0_file_description, + default=None, + ) + + # Parser for 'tts' mode + tts_parser = subparsers.add_parser("tts", help="Run TTS inference") + tts_parser.add_argument( + "--tts_text", type=str, help="Text to be synthesized", required=True + ) + tts_parser.add_argument( + 
"--tts_voice", + type=str, + help="Voice to be used for TTS synthesis.", + choices=locales, + required=True, + ) + tts_parser.add_argument( + "--tts_rate", + type=int, + help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).", + choices=range(-100, 101), + default=0, + ) + tts_parser.add_argument( + "--pitch", + type=int, + help=pitch_description, + choices=range(-24, 25), + default=0, + ) + tts_parser.add_argument( + "--filter_radius", + type=int, + help=filter_radius_description, + choices=range(11), + default=3, + ) + tts_parser.add_argument( + "--index_rate", + type=float, + help=index_rate_description, + choices=[(i / 10) for i in range(11)], + default=0.3, + ) + tts_parser.add_argument( + "--volume_envelope", + type=float, + help=volume_envelope_description, + choices=[(i / 10) for i in range(11)], + default=1, + ) + tts_parser.add_argument( + "--protect", + type=float, + help=protect_description, + choices=[(i / 10) for i in range(6)], + default=0.33, + ) + tts_parser.add_argument( + "--hop_length", + type=int, + help=hop_length_description, + choices=range(1, 513), + default=128, + ) + tts_parser.add_argument( + "--f0_method", + type=str, + help=f0_method_description, + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + "fcpe", + "hybrid[crepe+rmvpe]", + "hybrid[crepe+fcpe]", + "hybrid[rmvpe+fcpe]", + "hybrid[crepe+rmvpe+fcpe]", + ], + default="rmvpe", + ) + tts_parser.add_argument( + "--output_tts_path", + type=str, + help="Full path to save the synthesized TTS audio.", + required=True, + ) + tts_parser.add_argument( + "--output_rvc_path", + type=str, + help="Full path to save the voice-converted audio using the synthesized TTS.", + required=True, + ) + tts_parser.add_argument( + "--pth_path", type=str, help=pth_path_description, required=True + ) + tts_parser.add_argument( + "--index_path", type=str, help=index_path_description, required=True + ) + tts_parser.add_argument( + "--split_audio", + type=bool, + choices=[True, False], + help=split_audio_description, + default=False, + ) + tts_parser.add_argument( + "--f0_autotune", + type=bool, + choices=[True, False], + help=f0_autotune_description, + default=False, + ) + tts_parser.add_argument( + "--clean_audio", + type=bool, + choices=[True, False], + help=clean_audio_description, + default=False, + ) + tts_parser.add_argument( + "--clean_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=0.7, + ) + tts_parser.add_argument( + "--export_format", + type=str, + help=export_format_description, + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + default="WAV", + ) + tts_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "japanese-hubert-base", + "chinese-hubert-large", + "custom", + ], + default="contentvec", + ) + tts_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + tts_parser.add_argument( + "--upscale_audio", + type=bool, + choices=[True, False], + help=upscale_audio_description, + default=False, + ) + tts_parser.add_argument( + "--f0_file", + type=str, + help=f0_file_description, + default=None, + ) + + # Parser for 'preprocess' mode + preprocess_parser = subparsers.add_parser( + "preprocess", help="Preprocess a dataset for training." 
+ ) + preprocess_parser.add_argument( + "--model_name", type=str, help="Name of the model to be trained.", required=True + ) + preprocess_parser.add_argument( + "--dataset_path", type=str, help="Path to the dataset directory.", required=True + ) + preprocess_parser.add_argument( + "--sample_rate", + type=int, + help="Target sampling rate for the audio data.", + choices=[32000, 40000, 48000], + required=True, + ) + preprocess_parser.add_argument( + "--cpu_cores", + type=int, + help="Number of CPU cores to use for preprocessing.", + choices=range(1, 65), + ) + + # Parser for 'extract' mode + extract_parser = subparsers.add_parser( + "extract", help="Extract features from a dataset." + ) + extract_parser.add_argument( + "--model_name", type=str, help="Name of the model.", required=True + ) + extract_parser.add_argument( + "--rvc_version", + type=str, + help="Version of the RVC model ('v1' or 'v2').", + choices=["v1", "v2"], + default="v2", + ) + extract_parser.add_argument( + "--f0_method", + type=str, + help="Pitch extraction method to use.", + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + ], + default="rmvpe", + ) + extract_parser.add_argument( + "--pitch_guidance", + type=bool, + choices=[True, False], + help="Enable or disable pitch guidance during feature extraction.", + default=True, + ) + extract_parser.add_argument( + "--hop_length", + type=int, + help="Hop length for feature extraction. Only applicable for Crepe pitch extraction.", + choices=range(1, 513), + default=128, + ) + extract_parser.add_argument( + "--cpu_cores", + type=int, + help="Number of CPU cores to use for feature extraction (optional).", + choices=range(1, 65), + default=None, + ) + extract_parser.add_argument( + "--sample_rate", + type=int, + help="Target sampling rate for the audio data.", + choices=[32000, 40000, 48000], + required=True, + ) + extract_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "japanese-hubert-base", + "chinese-hubert-large", + "custom", + ], + default="contentvec", + ) + extract_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + + # Parser for 'train' mode + train_parser = subparsers.add_parser("train", help="Train an RVC model.") + train_parser.add_argument( + "--model_name", type=str, help="Name of the model to be trained.", required=True + ) + train_parser.add_argument( + "--rvc_version", + type=str, + help="Version of the RVC model to train ('v1' or 'v2').", + choices=["v1", "v2"], + default="v2", + ) + train_parser.add_argument( + "--save_every_epoch", + type=int, + help="Save the model every specified number of epochs.", + choices=range(1, 101), + required=True, + ) + train_parser.add_argument( + "--save_only_latest", + type=bool, + choices=[True, False], + help="Save only the latest model checkpoint.", + default=False, + ) + train_parser.add_argument( + "--save_every_weights", + type=bool, + choices=[True, False], + help="Save model weights every epoch.", + default=True, + ) + train_parser.add_argument( + "--total_epoch", + type=int, + help="Total number of epochs to train for.", + choices=range(1, 10001), + default=1000, + ) + train_parser.add_argument( + "--sample_rate", + type=int, + help="Sampling rate of the training data.", + choices=[32000, 40000, 48000], + required=True, + ) + train_parser.add_argument( + "--batch_size", + type=int, + help="Batch size for training.", + choices=range(1, 51), + default=8, + ) + 
train_parser.add_argument( + "--gpu", + type=str, + help="GPU device to use for training (e.g., '0').", + default="0", + ) + train_parser.add_argument( + "--pitch_guidance", + type=bool, + choices=[True, False], + help="Enable or disable pitch guidance during training.", + default=True, + ) + train_parser.add_argument( + "--pretrained", + type=bool, + choices=[True, False], + help="Use a pretrained model for initialization.", + default=True, + ) + train_parser.add_argument( + "--custom_pretrained", + type=bool, + choices=[True, False], + help="Use a custom pretrained model.", + default=False, + ) + train_parser.add_argument( + "--g_pretrained_path", + type=str, + nargs="?", + default=None, + help="Path to the pretrained generator model file.", + ) + train_parser.add_argument( + "--d_pretrained_path", + type=str, + nargs="?", + default=None, + help="Path to the pretrained discriminator model file.", + ) + train_parser.add_argument( + "--overtraining_detector", + type=bool, + choices=[True, False], + help="Enable overtraining detection.", + default=False, + ) + train_parser.add_argument( + "--overtraining_threshold", + type=int, + help="Threshold for overtraining detection.", + choices=range(1, 101), + default=50, + ) + train_parser.add_argument( + "--sync_graph", + type=bool, + choices=[True, False], + help="Enable graph synchronization for distributed training.", + default=False, + ) + train_parser.add_argument( + "--cache_data_in_gpu", + type=bool, + choices=[True, False], + help="Cache training data in GPU memory.", + default=False, + ) + + # Parser for 'index' mode + index_parser = subparsers.add_parser( + "index", help="Generate an index file for an RVC model." + ) + index_parser.add_argument( + "--model_name", type=str, help="Name of the model.", required=True + ) + index_parser.add_argument( + "--rvc_version", + type=str, + help="Version of the RVC model ('v1' or 'v2').", + choices=["v1", "v2"], + default="v2", + ) + + # Parser for 'model_extract' mode + model_extract_parser = subparsers.add_parser( + "model_extract", help="Extract a specific epoch from a trained model." + ) + model_extract_parser.add_argument( + "--pth_path", type=str, help="Path to the main .pth model file.", required=True + ) + model_extract_parser.add_argument( + "--model_name", type=str, help="Name of the model.", required=True + ) + model_extract_parser.add_argument( + "--sample_rate", + type=int, + help="Sampling rate of the extracted model.", + choices=[32000, 40000, 48000], + required=True, + ) + model_extract_parser.add_argument( + "--pitch_guidance", + type=bool, + choices=[True, False], + help="Enable or disable pitch guidance for the extracted model.", + required=True, + ) + model_extract_parser.add_argument( + "--rvc_version", + type=str, + help="Version of the extracted RVC model ('v1' or 'v2').", + choices=["v1", "v2"], + default="v2", + ) + model_extract_parser.add_argument( + "--epoch", + type=int, + help="Epoch number to extract from the model.", + choices=range(1, 10001), + required=True, + ) + model_extract_parser.add_argument( + "--step", + type=str, + help="Step number to extract from the model (optional).", + required=False, + ) + + # Parser for 'model_information' mode + model_information_parser = subparsers.add_parser( + "model_information", help="Display information about a trained model." 
+ ) + model_information_parser.add_argument( + "--pth_path", type=str, help="Path to the .pth model file.", required=True + ) + + # Parser for 'model_blender' mode + model_blender_parser = subparsers.add_parser( + "model_blender", help="Fuse two RVC models together." + ) + model_blender_parser.add_argument( + "--model_name", type=str, help="Name of the new fused model.", required=True + ) + model_blender_parser.add_argument( + "--pth_path_1", + type=str, + help="Path to the first .pth model file.", + required=True, + ) + model_blender_parser.add_argument( + "--pth_path_2", + type=str, + help="Path to the second .pth model file.", + required=True, + ) + model_blender_parser.add_argument( + "--ratio", + type=float, + help="Ratio for blending the two models (0.0 to 1.0).", + choices=[(i / 10) for i in range(11)], + default=0.5, + ) + + # Parser for 'tensorboard' mode + subparsers.add_parser( + "tensorboard", help="Launch TensorBoard for monitoring training progress." + ) + + # Parser for 'download' mode + download_parser = subparsers.add_parser( + "download", help="Download a model from a provided link." + ) + download_parser.add_argument( + "--model_link", type=str, help="Direct link to the model file.", required=True + ) + + # Parser for 'prerequisites' mode + prerequisites_parser = subparsers.add_parser( + "prerequisites", help="Install prerequisites for RVC." + ) + prerequisites_parser.add_argument( + "--pretraineds_v1", + type=bool, + choices=[True, False], + default=True, + help="Download pretrained models for RVC v1.", + ) + prerequisites_parser.add_argument( + "--pretraineds_v2", + type=bool, + choices=[True, False], + default=True, + help="Download pretrained models for RVC v2.", + ) + prerequisites_parser.add_argument( + "--models", + type=bool, + choices=[True, False], + default=True, + help="Download additional models.", + ) + prerequisites_parser.add_argument( + "--exe", + type=bool, + choices=[True, False], + default=True, + help="Download required executables.", + ) + + # Parser for 'audio_analyzer' mode + audio_analyzer = subparsers.add_parser( + "audio_analyzer", help="Analyze an audio file." 
+    )
+    audio_analyzer.add_argument(
+        "--input_path", type=str, help="Path to the input audio file.", required=True
+    )
+
+    # Parser for 'api' mode
+    api_parser = subparsers.add_parser("api", help="Start the RVC API server.")
+    api_parser.add_argument(
+        "--host", type=str, help="Host address for the API server.", default="127.0.0.1"
+    )
+    api_parser.add_argument(
+        "--port", type=int, help="Port for the API server.", default=8000
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    if len(sys.argv) == 1:
+        print("Please run the script with '-h' for more information.")
+        sys.exit(1)
+
+    args = parse_arguments()
+
+    try:
+        if args.mode == "infer":
+            run_infer_script(
+                pitch=args.pitch,
+                filter_radius=args.filter_radius,
+                index_rate=args.index_rate,
+                volume_envelope=args.volume_envelope,
+                protect=args.protect,
+                hop_length=args.hop_length,
+                f0_method=args.f0_method,
+                input_path=args.input_path,
+                output_path=args.output_path,
+                pth_path=args.pth_path,
+                index_path=args.index_path,
+                split_audio=args.split_audio,
+                f0_autotune=args.f0_autotune,
+                clean_audio=args.clean_audio,
+                clean_strength=args.clean_strength,
+                export_format=args.export_format,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+                upscale_audio=args.upscale_audio,
+                f0_file=args.f0_file,
+            )
+        elif args.mode == "batch_infer":
+            run_batch_infer_script(
+                pitch=args.pitch,
+                filter_radius=args.filter_radius,
+                index_rate=args.index_rate,
+                volume_envelope=args.volume_envelope,
+                protect=args.protect,
+                hop_length=args.hop_length,
+                f0_method=args.f0_method,
+                input_folder=args.input_folder,
+                output_folder=args.output_folder,
+                pth_path=args.pth_path,
+                index_path=args.index_path,
+                split_audio=args.split_audio,
+                f0_autotune=args.f0_autotune,
+                clean_audio=args.clean_audio,
+                clean_strength=args.clean_strength,
+                export_format=args.export_format,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+                upscale_audio=args.upscale_audio,
+                f0_file=args.f0_file,
+            )
+        elif args.mode == "tts":
+            run_tts_script(
+                tts_text=args.tts_text,
+                tts_voice=args.tts_voice,
+                tts_rate=args.tts_rate,
+                pitch=args.pitch,
+                filter_radius=args.filter_radius,
+                index_rate=args.index_rate,
+                volume_envelope=args.volume_envelope,
+                protect=args.protect,
+                hop_length=args.hop_length,
+                f0_method=args.f0_method,
+                output_tts_path=args.output_tts_path,
+                output_rvc_path=args.output_rvc_path,
+                pth_path=args.pth_path,
+                index_path=args.index_path,
+                split_audio=args.split_audio,
+                f0_autotune=args.f0_autotune,
+                clean_audio=args.clean_audio,
+                clean_strength=args.clean_strength,
+                export_format=args.export_format,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+                upscale_audio=args.upscale_audio,
+                f0_file=args.f0_file,
+            )
+        elif args.mode == "preprocess":
+            run_preprocess_script(
+                model_name=args.model_name,
+                dataset_path=args.dataset_path,
+                sample_rate=args.sample_rate,
+                cpu_cores=args.cpu_cores,
+            )
+        elif args.mode == "extract":
+            run_extract_script(
+                model_name=args.model_name,
+                rvc_version=args.rvc_version,
+                f0_method=args.f0_method,
+                pitch_guidance=args.pitch_guidance,
+                hop_length=args.hop_length,
+                cpu_cores=args.cpu_cores,
+                sample_rate=args.sample_rate,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+            )
+        elif args.mode == "train":
+            run_train_script(
+                model_name=args.model_name,
+                rvc_version=args.rvc_version,
+                save_every_epoch=args.save_every_epoch,
+                save_only_latest=args.save_only_latest,
+                save_every_weights=args.save_every_weights,
+                total_epoch=args.total_epoch,
+                sample_rate=args.sample_rate,
+                batch_size=args.batch_size,
+                gpu=args.gpu,
+                pitch_guidance=args.pitch_guidance,
+                overtraining_detector=args.overtraining_detector,
+                overtraining_threshold=args.overtraining_threshold,
+                pretrained=args.pretrained,
+                custom_pretrained=args.custom_pretrained,
+                sync_graph=args.sync_graph,
+                cache_data_in_gpu=args.cache_data_in_gpu,
+                g_pretrained_path=args.g_pretrained_path,
+                d_pretrained_path=args.d_pretrained_path,
+            )
+        elif args.mode == "index":
+            run_index_script(
+                model_name=args.model_name,
+                rvc_version=args.rvc_version,
+            )
+        elif args.mode == "model_extract":
+            run_model_extract_script(
+                pth_path=args.pth_path,
+                model_name=args.model_name,
+                sample_rate=args.sample_rate,
+                pitch_guidance=args.pitch_guidance,
+                rvc_version=args.rvc_version,
+                epoch=args.epoch,
+                step=args.step,
+            )
+        elif args.mode == "model_information":
+            run_model_information_script(
+                pth_path=args.pth_path,
+            )
+        elif args.mode == "model_blender":
+            run_model_blender_script(
+                model_name=args.model_name,
+                pth_path_1=args.pth_path_1,
+                pth_path_2=args.pth_path_2,
+                ratio=args.ratio,
+            )
+        elif args.mode == "tensorboard":
+            run_tensorboard_script()
+        elif args.mode == "download":
+            run_download_script(
+                model_link=args.model_link,
+            )
+        elif args.mode == "prerequisites":
+            run_prerequisites_script(
+                pretraineds_v1=args.pretraineds_v1,
+                pretraineds_v2=args.pretraineds_v2,
+                models=args.models,
+                exe=args.exe,
+            )
+        elif args.mode == "audio_analyzer":
+            run_audio_analyzer_script(
+                input_path=args.input_path,
+            )
+        elif args.mode == "api":
+            run_api_script(
+                ip=args.host,
+                port=args.port,
+            )
+    except Exception as error:
+        print(f"Error: {error}")
+
+        import traceback
+
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/uvr.py b/uvr_cli.py
similarity index 100%
rename from uvr.py
rename to uvr_cli.py
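
One behavioral note on the argument definitions added in `rvc_cli.py`: the on/off switches (`--split_audio`, `--f0_autotune`, `--clean_audio`, `--pitch_guidance`, `--save_only_latest`, and the other boolean flags) are declared with `type=bool`. Python's `argparse` passes the raw command-line string through `bool()`, so any non-empty value, including the literal string `False`, parses as `True` (and still satisfies `choices=[True, False]`). If strict true/false parsing is wanted, a converter along the lines of the sketch below could be supplied as the `type`; the helper name `str_to_bool` is illustrative and is not part of this patch.

```python
import argparse


def str_to_bool(value: str) -> bool:
    """Strictly parse common true/false spellings for argparse flags."""
    normalized = value.strip().lower()
    if normalized in ("true", "1", "yes"):
        return True
    if normalized in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Illustrative usage with one of the flags defined above:
parser = argparse.ArgumentParser()
parser.add_argument("--split_audio", type=str_to_bool, default=False)
print(parser.parse_args(["--split_audio", "False"]).split_audio)  # -> False
```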
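A related caveat applies to the float options such as `--index_rate`, `--volume_envelope`, `--clean_strength`, and `--ratio`: `choices=[(i / 10) for i in range(11)]` restricts them to exact tenths, so a value like `0.25` is rejected even though the help text implies a continuous 0.0 to 1.0 range. If intermediate values should be accepted, a bounded-float converter could replace the `choices` list; the sketch below is illustrative (the name `float_0_to_1` is not part of the patch), and `--protect` would need the same idea with a 0.0 to 0.5 bound.

```python
import argparse


def float_0_to_1(value: str) -> float:
    """Accept any float in [0.0, 1.0] instead of only exact tenths."""
    number = float(value)
    if not 0.0 <= number <= 1.0:
        raise argparse.ArgumentTypeError(f"{value} is not in the range 0.0-1.0")
    return number


# Illustrative usage:
parser = argparse.ArgumentParser()
parser.add_argument("--index_rate", type=float_0_to_1, default=0.3)
print(parser.parse_args(["--index_rate", "0.25"]).index_rate)  # -> 0.25
```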