diff --git a/.env b/.env
new file mode 100644
index 0000000..04bf864
--- /dev/null
+++ b/.env
@@ -0,0 +1,9 @@
+OPENBLAS_NUM_THREADS = 1
+no_proxy = localhost, 127.0.0.1, ::1
+
+# You can change the location of the model, etc. by changing here
+weight_root = assets/weights
+weight_uvr5_root = assets/uvr5_weights
+index_root = output
+TEMP = tmp
+
diff --git a/.github/workflows/genlocale.yml b/.github/workflows/genlocale.yml
index ebed03a..96a29e8 100644
--- a/.github/workflows/genlocale.yml
+++ b/.github/workflows/genlocale.yml
@@ -13,8 +13,9 @@ jobs:
 
       - name: Run locale generation
         run: |
-          python3 lib/i18n/scan_i18n.py
-          cd lib/i18n && python3 locale_diff.py
+          python3 i18n/scan_i18n.py
+          cd i18n
+          python3 locale_diff.py
 
       - name: Commit back
         if: ${{ !github.head_ref }}
diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml
index 879df3f..eca7f70 100644
--- a/.github/workflows/unitest.yml
+++ b/.github/workflows/unitest.yml
@@ -30,7 +30,7 @@ jobs:
         run: |
           mkdir -p logs/mi-test
           touch logs/mi-test/preprocess.log
-          python trainset_preprocess_pipeline_print.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True
+          python infer/modules/train/preprocess.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True
           touch logs/mi-test/extract_f0_feature.log
-          python extract_f0_print.py logs/mi-test $(nproc) pm
-          python extract_feature_print.py cpu 1 0 0 logs/mi-test v1
+          python infer/modules/train/extract/extract_f0_print.py logs/mi-test $(nproc) pm
+          python infer/modules/train/extract_feature_print.py cpu 1 0 0 logs/mi-test v1
diff --git a/.gitignore b/.gitignore
index 630c32e..22e9bf8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@ __pycache__
 hubert_base.pt
 /logs
 .venv
+/opt
+tools/aria2c/
+tools/flag.txt
diff --git a/Dockerfile b/Dockerfile
index 65ffbc1..fc1ee26 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,17 +12,17 @@ RUN apt update && apt install -y -qq ffmpeg aria2
 
 RUN pip3 install -r requirements.txt
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d pretrained_v2/ -o D40k.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d pretrained_v2/ -o G40k.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d pretrained_v2/ -o f0D40k.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d pretrained_v2/ -o f0G40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d assets/pretrained_v2/ -o D40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d assets/pretrained_v2/ -o G40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d assets/pretrained_v2/ -o f0D40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d assets/pretrained_v2/ -o f0G40k.pth
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d uvr5_weights/ -o HP2-人声vocals+非人声instrumentals.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d uvr5_weights/ -o HP5-主旋律人声vocals+其他instrumentals.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d assets/uvr5_weights/ -o HP2-人声vocals+非人声instrumentals.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d assets/uvr5_weights/ -o HP5-主旋律人声vocals+其他instrumentals.pth
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -o hubert_base.pt
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d assets/hubert -o hubert_base.pt
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -o rmvpe.pt
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d assets/hubert -o rmvpe.pt
 
 VOLUME [ "/app/weights", "/app/opt" ]
 
diff --git a/README.md b/README.md
index 8e39559..795a8bb 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
 
 ------
 
-[**English**](./docs/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/README.ja.md) | [**한국어**](./docs/README.ko.md) ([**韓國語**](./docs/README.ko.han.md)) | [**Türkçe**](./docs/README.tr.md)
+[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md)
 
 点此查看我们的[演示视频](https://www.bilibili.com/video/BV1pm4y1z7Gm/) !
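The `.env` file added at the top of this patch is what lets the model directories move under `assets/`: `infer-web.py` (further down in this diff) now calls `load_dotenv()` and reads `weight_root`, `weight_uvr5_root`, and `index_root` from the environment instead of hard-coding `weights/` and `uvr5_weights/`. A minimal sketch of that pattern, assuming the repository root as the working directory (the snippet is illustrative, not a file added by the patch):

```python
import os

from dotenv import load_dotenv  # same import the patched infer-web.py uses

# Reads .env from the current working directory and exports its keys into
# os.environ without overriding values that are already set there.
load_dotenv()

weight_root = os.getenv("weight_root")            # assets/weights
weight_uvr5_root = os.getenv("weight_uvr5_root")  # assets/uvr5_weights
index_root = os.getenv("index_root")              # output

# The web UI builds its voice list the same way, just from the new location.
names = [f for f in os.listdir(weight_root) if f.endswith(".pth")]
print(names)
```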
@@ -89,15 +89,15 @@ RVC需要其他一些预模型来推理和训练。 以下是一份清单,包括了所有RVC所需的预模型和其他文件的名称: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights 想测试v2版本模型的话,需要额外下载 -./pretrained_v2 +./assets/pretrained_v2 如果你正在使用Windows,则你可能需要这个文件,若ffmpeg和ffprobe已安装则跳过; ubuntu/debian 用户可以通过apt install ffmpeg来安装这2个库, Mac 用户则可以通过brew install ffmpeg来安装 (需要预先安装brew) diff --git a/app.py b/app.py deleted file mode 100644 index 69bb617..0000000 --- a/app.py +++ /dev/null @@ -1,317 +0,0 @@ -import os -import torch - -# os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt") -import gradio as gr -import librosa -import numpy as np -import logging -from fairseq import checkpoint_utils -from lib.train.vc_infer_pipeline import VC -import traceback -from config import defaultconfig as config -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from i18n import I18nAuto - -logging.getLogger("numba").setLevel(logging.WARNING) -logging.getLogger("markdown_it").setLevel(logging.WARNING) -logging.getLogger("urllib3").setLevel(logging.WARNING) -logging.getLogger("matplotlib").setLevel(logging.WARNING) - -i18n = I18nAuto() -i18n.print() - -weight_root = "weights" -weight_uvr5_root = "uvr5_weights" -index_root = "logs" -names = [] -hubert_model = None -for name in os.listdir(weight_root): - if name.endswith(".pth"): - names.append(name) -index_paths = [] -for root, dirs, files in os.walk(index_root, topdown=False): - for name in files: - if name.endswith(".index") and "trained" not in name: - index_paths.append("%s/%s" % (root, name)) - - -def get_vc(sid): - global n_spk, tgt_sr, net_g, vc, cpt, version - if sid == "" or sid == []: - global hubert_model - if hubert_model != None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 - print("clean_empty_cache") - del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt - hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - ###楼下不这么折腾清理不干净 - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g, cpt - if torch.cuda.is_available(): - torch.cuda.empty_cache() - cpt = None - return {"visible": False, "__type__": "update"} - person = "%s/%s" % (weight_root, sid) - print("loading %s" % person) - cpt = torch.load(person, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(config.device) - if config.is_half: - net_g = net_g.half() - else: - net_g = 
net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - return {"visible": True, "maximum": n_spk, "__type__": "update"} - - -def load_hubert(): - global hubert_model - models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single( - sid, - input_audio_path, - f0_up_key, - f0_file, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, -): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 - global tgt_sr, net_g, vc, hubert_model, version - if input_audio_path is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - try: - audio = input_audio_path[1] / 32768.0 - if len(audio.shape) == 2: - audio = np.mean(audio, -1) - audio = librosa.resample(audio, orig_sr=input_audio_path[0], target_sr=16000) - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - if hubert_model == None: - load_hubert() - if_f0 = cpt.get("f0", 1) - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) # 防止小白写错,自动帮他替换掉 - # file_big_npy = ( - # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - # ) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - if resample_sr >= 16000 and tgt_sr != resample_sr: - tgt_sr = resample_sr - index_info = ( - "Using index:%s." % file_index - if os.path.exists(file_index) - else "Index not used." - ) - return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( - index_info, - times[0], - times[1], - times[2], - ), (tgt_sr, audio_opt) - except: - info = traceback.format_exc() - print(info) - return info, (None, None) - - -app = gr.Blocks() -with app: - with gr.Tabs(): - with gr.TabItem("在线demo"): - gr.Markdown( - value=""" - RVC 在线demo - """ - ) - sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) - with gr.Column(): - spk_item = gr.Slider( - minimum=0, - maximum=2333, - step=1, - label=i18n("请选择说话人id"), - value=0, - visible=False, - interactive=True, - ) - sid.change( - fn=get_vc, - inputs=[sid], - outputs=[spk_item], - ) - gr.Markdown( - value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
") - ) - vc_input3 = gr.Audio(label="上传音频(长度小于90秒)") - vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0) - f0method0 = gr.Radio( - label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"), - choices=["pm", "harvest", "crepe", "rmvpe"], - value="pm", - interactive=True, - ) - filter_radius0 = gr.Slider( - minimum=0, - maximum=7, - label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), - value=3, - step=1, - interactive=True, - ) - with gr.Column(): - file_index1 = gr.Textbox( - label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), - value="", - interactive=False, - visible=False, - ) - file_index2 = gr.Dropdown( - label=i18n("自动检测index路径,下拉式选择(dropdown)"), - choices=sorted(index_paths), - interactive=True, - ) - index_rate1 = gr.Slider( - minimum=0, - maximum=1, - label=i18n("检索特征占比"), - value=0.88, - interactive=True, - ) - resample_sr0 = gr.Slider( - minimum=0, - maximum=48000, - label=i18n("后处理重采样至最终采样率,0为不进行重采样"), - value=0, - step=1, - interactive=True, - ) - rms_mix_rate0 = gr.Slider( - minimum=0, - maximum=1, - label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"), - value=1, - interactive=True, - ) - protect0 = gr.Slider( - minimum=0, - maximum=0.5, - label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"), - value=0.33, - step=0.01, - interactive=True, - ) - f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) - but0 = gr.Button(i18n("转换"), variant="primary") - vc_output1 = gr.Textbox(label=i18n("输出信息")) - vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) - but0.click( - vc_single, - [ - spk_item, - vc_input3, - vc_transform0, - f0_file, - f0method0, - file_index1, - file_index2, - # file_big_npy1, - index_rate1, - filter_radius0, - resample_sr0, - rms_mix_rate0, - protect0, - ], - [vc_output1, vc_output2], - ) - - -app.launch() diff --git a/pretrained/.gitignore b/assets/hubert/.gitignore similarity index 100% rename from pretrained/.gitignore rename to assets/hubert/.gitignore diff --git a/pretrained_v2/.gitignore b/assets/pretrained/.gitignore similarity index 100% rename from pretrained_v2/.gitignore rename to assets/pretrained/.gitignore diff --git a/uvr5_weights/.gitignore b/assets/pretrained_v2/.gitignore similarity index 100% rename from uvr5_weights/.gitignore rename to assets/pretrained_v2/.gitignore diff --git a/weights/.gitignore b/assets/rmvpe/.gitignore similarity index 100% rename from weights/.gitignore rename to assets/rmvpe/.gitignore diff --git a/assets/uvr5_weights/.gitignore b/assets/uvr5_weights/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/uvr5_weights/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/assets/weights/.gitignore b/assets/weights/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/weights/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/config.py b/configs/config.py similarity index 93% rename from config.py rename to configs/config.py index 6bb117e..90b9bc0 100644 --- a/config.py +++ b/configs/config.py @@ -1,25 +1,26 @@ -import os import argparse +import os import sys -import torch from multiprocessing import cpu_count +import torch + def use_fp32_config(): for config_file in [ - "32k.json", - "40k.json", - "48k.json", - "48k_v2.json", - "32k_v2.json", + "v1/32k.json", + "v1/40k.json", + "v1/48k.json", + "v2/48k.json", + "v2/32k.json", ]: with open(f"configs/{config_file}", "r") as f: strr = f.read().replace("true", "false") with open(f"configs/{config_file}", "w") as f: f.write(strr) - with 
open("trainset_preprocess_pipeline_print.py", "r") as f: + with open("infer/modules/train/preprocess.py", "r") as f: strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: + with open("infer/modules/train/preprocess.py", "w") as f: f.write(strr) @@ -111,9 +112,9 @@ class Config: + 0.4 ) if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: + with open("infer/modules/train/preprocess.py", "r") as f: strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: + with open("infer/modules/train/preprocess.py", "w") as f: f.write(strr) elif self.has_mps(): print("No supported Nvidia GPU found") @@ -198,6 +199,3 @@ class Config: except: pass return x_pad, x_query, x_center, x_max - - -defaultconfig = Config() diff --git a/configs/32k.json b/configs/v1/32k.json similarity index 100% rename from configs/32k.json rename to configs/v1/32k.json diff --git a/configs/40k.json b/configs/v1/40k.json similarity index 100% rename from configs/40k.json rename to configs/v1/40k.json diff --git a/configs/48k.json b/configs/v1/48k.json similarity index 100% rename from configs/48k.json rename to configs/v1/48k.json diff --git a/configs/32k_v2.json b/configs/v2/32k.json similarity index 100% rename from configs/32k_v2.json rename to configs/v2/32k.json diff --git a/configs/48k_v2.json b/configs/v2/48k.json similarity index 100% rename from configs/48k_v2.json rename to configs/v2/48k.json diff --git a/docker-compose.yml b/docker-compose.yml index 4b40ec5..a5db88d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: dockerfile: Dockerfile container_name: rvc volumes: - - ./weights:/app/weights + - ./weights:/app/assets/weights - ./opt:/app/opt # - ./dataset:/app/dataset # you can use this folder in order to provide your dataset for model training ports: diff --git a/docs/Changelog_CN.md b/docs/cn/Changelog_CN.md similarity index 100% rename from docs/Changelog_CN.md rename to docs/cn/Changelog_CN.md diff --git a/docs/faq.md b/docs/cn/faq.md similarity index 100% rename from docs/faq.md rename to docs/cn/faq.md diff --git a/docs/Changelog_EN.md b/docs/en/Changelog_EN.md similarity index 100% rename from docs/Changelog_EN.md rename to docs/en/Changelog_EN.md diff --git a/docs/README.en.md b/docs/en/README.en.md similarity index 94% rename from docs/README.en.md rename to docs/en/README.en.md index 09cc102..806c15d 100644 --- a/docs/README.en.md +++ b/docs/en/README.en.md @@ -18,7 +18,7 @@ An easy-to-use Voice Conversion framework based on VITS.

------ [**Changelog**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_EN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) Check our [Demo Video](https://www.bilibili.com/video/BV1pm4y1z7Gm/) here! @@ -91,11 +91,15 @@ You need to download them from our [Huggingface space](https://huggingface.co/lj Here's a list of Pre-models and other files that RVC needs: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +Additional downloads are required if you want to test the v2 version of the model. + +./assets/pretrained_v2 If you want to test the v2 version model (the v2 version model has changed the input from the 256 dimensional feature of 9-layer Hubert+final_proj to the 768 dimensional feature of 12-layer Hubert, and has added 3 period discriminators), you will need to download additional features diff --git a/docs/faiss_tips_en.md b/docs/en/faiss_tips_en.md similarity index 100% rename from docs/faiss_tips_en.md rename to docs/en/faiss_tips_en.md diff --git a/docs/faq_en.md b/docs/en/faq_en.md similarity index 100% rename from docs/faq_en.md rename to docs/en/faq_en.md diff --git a/docs/training_tips_en.md b/docs/en/training_tips_en.md similarity index 100% rename from docs/training_tips_en.md rename to docs/en/training_tips_en.md diff --git a/docs/README.ja.md b/docs/jp/README.ja.md similarity index 92% rename from docs/README.ja.md rename to docs/jp/README.ja.md index 5bb2ba2..6200fda 100644 --- a/docs/README.ja.md +++ b/docs/jp/README.ja.md @@ -19,7 +19,7 @@ VITSに基づく使いやすい音声変換(voice changer)framework

[**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_CN.md) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) > デモ動画は[こちら](https://www.bilibili.com/video/BV1pm4y1z7Gm/)でご覧ください。 @@ -72,11 +72,15 @@ modelsは[Hugging Face space](https://huggingface.co/lj1995/VoiceConversionWebUI 以下は、RVCに必要な基底モデルやその他のファイルの一覧です。 ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +V2のモデルを使用するには、追加でファイルをダウンロードする必要があります + +./assets/pretrained_v2 # ffmpegがすでにinstallされている場合は省略 ./ffmpeg diff --git a/docs/faiss_tips_ja.md b/docs/jp/faiss_tips_ja.md similarity index 100% rename from docs/faiss_tips_ja.md rename to docs/jp/faiss_tips_ja.md diff --git a/docs/training_tips_ja.md b/docs/jp/training_tips_ja.md similarity index 100% rename from docs/training_tips_ja.md rename to docs/jp/training_tips_ja.md diff --git a/docs/Changelog_KO.md b/docs/kr/Changelog_KO.md similarity index 100% rename from docs/Changelog_KO.md rename to docs/kr/Changelog_KO.md diff --git a/docs/README.ko.han.md b/docs/kr/README.ko.han.md similarity index 92% rename from docs/README.ko.han.md rename to docs/kr/README.ko.han.md index 78ceaac..78c3c47 100644 --- a/docs/README.ko.han.md +++ b/docs/kr/README.ko.han.md @@ -18,7 +18,7 @@ VITS基盤의 簡單하고使用하기 쉬운音聲變換틀

------ [**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_KO.md) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) > [示範映像](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 確認해 보세요! @@ -69,11 +69,15 @@ RVC 모델은 推論과訓練을 依하여 다른 預備모델이 必要합니 다음은 RVC에 必要한 預備모델 및 其他 파일 目錄입니다: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +V2 버전 모델을 테스트하려면 추가 다운로드가 필요합니다. + +./assets/pretrained_v2 # Windows를 使用하는境遇 이 사전도 必要할 수 있습니다. FFmpeg가 設置되어 있으면 건너뛰어도 됩니다. ffmpeg.exe diff --git a/docs/README.ko.md b/docs/kr/README.ko.md similarity index 92% rename from docs/README.ko.md rename to docs/kr/README.ko.md index 0689688..5ea73e0 100644 --- a/docs/README.ko.md +++ b/docs/kr/README.ko.md @@ -19,7 +19,7 @@ VITS 기반의 간단하고 사용하기 쉬운 음성 변환 프레임워크. [데모 영상](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 확인해 보세요! @@ -77,11 +77,15 @@ RVC 모델은 추론과 훈련을 위하여 다른 사전 모델이 필요합니 다음은 RVC에 필요한 사전 모델 및 기타 파일 목록입니다: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +V2 버전 모델을 테스트하려면 추가 다운로드가 필요합니다. + +./assets/pretrained_v2 # Windows를 사용하는 경우 이 사전도 필요할 수 있습니다. FFmpeg가 설치되어 있으면 건너뛰어도 됩니다. ffmpeg.exe diff --git a/docs/faiss_tips_ko.md b/docs/kr/faiss_tips_ko.md similarity index 100% rename from docs/faiss_tips_ko.md rename to docs/kr/faiss_tips_ko.md diff --git a/docs/training_tips_ko.md b/docs/kr/training_tips_ko.md similarity index 100% rename from docs/training_tips_ko.md rename to docs/kr/training_tips_ko.md diff --git a/docs/Changelog_TR.md b/docs/tr/Changelog_TR.md similarity index 100% rename from docs/Changelog_TR.md rename to docs/tr/Changelog_TR.md diff --git a/docs/README.tr.md b/docs/tr/README.tr.md similarity index 95% rename from docs/README.tr.md rename to docs/tr/README.tr.md index 8c04cd2..8c0c2b1 100644 --- a/docs/README.tr.md +++ b/docs/tr/README.tr.md @@ -20,7 +20,7 @@ Kolay kullanılabilen VITS tabanlı bir Ses Dönüşümü çerçevesi.

------ [**Değişiklik Kaydı**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_TR.md) | [**SSS (Sıkça Sorulan Sorular)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) Demo Videosu için [buraya](https://www.bilibili.com/video/BV1pm4y1z7Gm/) bakın! @@ -88,15 +88,15 @@ Onları [Huggingface alanımızdan](https://huggingface.co/lj1995/VoiceConversio İşte RVC'nin ihtiyaç duyduğu Diğer Ön-Modellerin ve diğer dosyaların listesi: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights V2 sürümü modelini test etmek istiyorsanız (v2 sürümü modeli girişi 256 boyutlu 9 katmanlı Hubert+final_proj'dan 768 boyutlu 12 katmanlı Hubert'ın özelliğine ve 3 dönem ayrımına değiştirilmiştir), ek özellikleri indirmeniz gerekecektir. -./pretrained_v2 +./assets/pretrained_v2 #Eğer Windows kullanıyorsanız, FFmpeg yüklü değilse bu dictionariyaya da ihtiyacınız olabilir, FFmpeg yüklüyse atlayın ffmpeg.exe diff --git a/docs/faiss_tips_tr.md b/docs/tr/faiss_tips_tr.md similarity index 100% rename from docs/faiss_tips_tr.md rename to docs/tr/faiss_tips_tr.md diff --git a/docs/faq_tr.md b/docs/tr/faq_tr.md similarity index 100% rename from docs/faq_tr.md rename to docs/tr/faq_tr.md diff --git a/docs/training_tips_tr.md b/docs/tr/training_tips_tr.md similarity index 100% rename from docs/training_tips_tr.md rename to docs/tr/training_tips_tr.md diff --git a/gui_v1.py b/gui_v1.py index 9486508..16ae1b3 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -1,4 +1,6 @@ -import os, sys, pdb +import os +import pdb +import sys os.environ["OMP_NUM_THREADS"] = "2" if sys.platform == "darwin": @@ -16,7 +18,8 @@ class Harvest(multiprocessing.Process): self.opt_q = opt_q def run(self): - import numpy as np, pyworld + import numpy as np + import pyworld while 1: idx, x, res_f0, n_cpu, ts = self.inp_q.get() @@ -33,21 +36,26 @@ class Harvest(multiprocessing.Process): if __name__ == "__main__": - from multiprocessing import Queue - from queue import Empty - import numpy as np - import multiprocessing - import traceback, re import json + import multiprocessing + import re + import threading + import time + import traceback + from multiprocessing import Queue, cpu_count + from queue import Empty + + import librosa + import noisereduce as nr + import numpy as np import PySimpleGUI as sg import sounddevice as sd - import noisereduce as nr - from multiprocessing import cpu_count - import librosa, torch, time, threading + import torch import torch.nn.functional as F import torchaudio.transforms as tat - from i18n import I18nAuto - import rvc_for_realtime + + import tools.rvc_for_realtime as rvc_for_realtime + from i18n.i18n import I18nAuto i18n = I18nAuto() device = rvc_for_realtime.config.device @@ -131,7 +139,9 @@ if __name__ == "__main__": ), sg.FileBrowse( i18n("选择.pth文件"), - initial_folder=os.path.join(os.getcwd(), "weights"), + initial_folder=os.path.join( + os.getcwd(), "assets/weights" + ), file_types=((". 
pth"),), ), ], diff --git a/i18n.py b/i18n/i18n.py similarity index 82% rename from i18n.py rename to i18n/i18n.py index d64f2ea..a64ee23 100644 --- a/i18n.py +++ b/i18n/i18n.py @@ -1,10 +1,10 @@ -import locale import json +import locale import os def load_language_list(language): - with open(f"./lib/i18n/{language}.json", "r", encoding="utf-8") as f: + with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: language_list = json.load(f) return language_list @@ -15,7 +15,7 @@ class I18nAuto: language = locale.getdefaultlocale()[ 0 ] # getlocale can't identify the system's language ((None, None)) - if not os.path.exists(f"./lib/i18n/{language}.json"): + if not os.path.exists(f"./i18n/locale/{language}.json"): language = "en_US" self.language = language # print("Use Language:", language) diff --git a/lib/i18n/en_US.json b/i18n/locale/en_US.json similarity index 99% rename from lib/i18n/en_US.json rename to i18n/locale/en_US.json index c734dea..ef5fbd9 100644 --- a/lib/i18n/en_US.json +++ b/i18n/locale/en_US.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Enter the GPU index(es) separated by '-', e.g., 0-0-1 to use 2 processes in GPU0 and 1 process in GPU1", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Step 1: Fill in the experimental configuration. Experimental data is stored in the 'logs' folder, with each experiment having a separate folder. Manually enter the experiment name path, which contains the experimental configuration, logs, and trained model files.", "step1:正在处理数据": "Step 1: Processing data", + "step2:正在提取音高&正在提取特征": "step2:Pitch extraction & feature extraction", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.", "step2a:无需提取音高": "Step 2a: Skipping pitch extraction", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Step 2b: Use CPU to extract pitch (if the model has pitch), use GPU to extract features (select GPU index):", diff --git a/lib/i18n/es_ES.json b/i18n/locale/es_ES.json similarity index 99% rename from lib/i18n/es_ES.json rename to i18n/locale/es_ES.json index 6083ef9..ebcb860 100644 --- a/lib/i18n/es_ES.json +++ b/i18n/locale/es_ES.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "paso 1: Complete la configuración del experimento. Los datos del experimento se almacenan en el directorio 'logs', con cada experimento en una carpeta separada. La ruta del nombre del experimento debe ingresarse manualmente y debe contener la configuración del experimento, los registros y los archivos del modelo entrenado.", "step1:正在处理数据": "Paso 1: Procesando datos", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "paso 2a: recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. 
Actualmente, solo se admite la capacitación de una sola persona.", "step2a:无需提取音高": "Paso 2a: No es necesario extraer el tono", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "paso 2b: use la CPU para extraer el tono (si el modelo tiene guía de tono) y la GPU para extraer características (seleccione el número de tarjeta).", diff --git a/lib/i18n/it_IT.json b/i18n/locale/it_IT.json similarity index 99% rename from lib/i18n/it_IT.json rename to i18n/locale/it_IT.json index e94f6b3..26736c9 100644 --- a/lib/i18n/it_IT.json +++ b/i18n/locale/it_IT.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. ", "step1:正在处理数据": "Passaggio 1: elaborazione dei dati", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. ", "step2a:无需提取音高": "Step 2a: Saltare l'estrazione del tono", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):", diff --git a/lib/i18n/ja_JP.json b/i18n/locale/ja_JP.json similarity index 99% rename from lib/i18n/ja_JP.json rename to i18n/locale/ja_JP.json index 92bd344..12647ec 100644 --- a/lib/i18n/ja_JP.json +++ b/i18n/locale/ja_JP.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpeカード番号設定:異なるプロセスに使用するカード番号を入力する。例えば、0-0-1でカード0に2つのプロセス、カード1に1つのプロセスを実行する。", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "ステップ1:実験設定を入力します。実験データはlogsに保存され、各実験にはフォルダーがあります。実験名のパスを手動で入力する必要があり、実験設定、ログ、トレーニングされたモデルファイルが含まれます。", "step1:正在处理数据": "step1:処理中のデータ", + "step2:正在提取音高&正在提取特征": "step2:ピッチ抽出と特徴抽出", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。", "step2a:无需提取音高": "step2a:ピッチの抽出は不要", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "ステップ2b: CPUを使用して音高を抽出する(モデルに音高がある場合)、GPUを使用して特徴を抽出する(GPUの番号を選択する)", diff --git a/lib/i18n/ru_RU.json b/i18n/locale/ru_RU.json similarity index 99% rename from lib/i18n/ru_RU.json rename to i18n/locale/ru_RU.json index 5dc4b27..d94216b 100644 --- a/lib/i18n/ru_RU.json +++ b/i18n/locale/ru_RU.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Введите номера графических процессоров, разделенные символом «-», например, 0-0-1, чтобы запустить два процесса на GPU 0 и один процесс на GPU 1:", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1. Конфигурирование модели. Данные обучения модели сохраняются в папку 'logs', и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.", "step1:正在处理数据": "Шаг 1. Переработка данных", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2А. Автоматическая обработка исходных аудиозаписей для обучения и выполнение нормализации среза. Создаст 2 папки wav в папке модели. 
В данный момент поддерживается обучение только на одноголосных записях.", "step2a:无需提取音高": "Шаг 2А. Пропуск извлечения тональности", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2Б. Оценка и извлечение тональности в аудиофайлах с помощью процессора (если включена поддержка изменения высоты звука), извлечение черт с помощью GPU (выберите номер GPU):", diff --git a/lib/i18n/tr_TR.json b/i18n/locale/tr_TR.json similarity index 99% rename from lib/i18n/tr_TR.json rename to i18n/locale/tr_TR.json index 8e847c2..3b1b0eb 100644 --- a/lib/i18n/tr_TR.json +++ b/i18n/locale/tr_TR.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.", "step1:正在处理数据": "Adım 1: Veri işleme", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.", "step2a:无需提取音高": "Adım 2a: Pitch çıkartma adımını atlama", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):", diff --git a/lib/i18n/zh_CN.json b/i18n/locale/zh_CN.json similarity index 99% rename from lib/i18n/zh_CN.json rename to i18n/locale/zh_CN.json index 12f8738..96ca25b 100644 --- a/lib/i18n/zh_CN.json +++ b/i18n/locale/zh_CN.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", diff --git a/lib/i18n/zh_HK.json b/i18n/locale/zh_HK.json similarity index 99% rename from lib/i18n/zh_HK.json rename to i18n/locale/zh_HK.json index 52e0b40..a4ebff1 100644 --- a/lib/i18n/zh_HK.json +++ b/i18n/locale/zh_HK.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. 
": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", diff --git a/lib/i18n/zh_SG.json b/i18n/locale/zh_SG.json similarity index 99% rename from lib/i18n/zh_SG.json rename to i18n/locale/zh_SG.json index 52e0b40..a4ebff1 100644 --- a/lib/i18n/zh_SG.json +++ b/i18n/locale/zh_SG.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", diff --git a/lib/i18n/zh_TW.json b/i18n/locale/zh_TW.json similarity index 99% rename from lib/i18n/zh_TW.json rename to i18n/locale/zh_TW.json index 52e0b40..a4ebff1 100644 --- a/lib/i18n/zh_TW.json +++ b/i18n/locale/zh_TW.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", diff --git a/lib/i18n/locale_diff.py b/i18n/locale_diff.py similarity index 88% rename from lib/i18n/locale_diff.py rename to i18n/locale_diff.py index 65bb929..674f7dd 100644 --- a/lib/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -3,12 +3,14 @@ import os from collections import OrderedDict # Define the standard file name -standard_file = "zh_CN.json" +standard_file = "locale/zh_CN.json" # Find all JSON files in the directory -dir_path = "./" +dir_path = "locale/" languages = [ - f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file + os.path.join(dir_path, f) + for f in os.listdir(dir_path) + if f.endswith(".json") and f != standard_file ] # Load the standard file diff --git a/lib/i18n/scan_i18n.py b/i18n/scan_i18n.py similarity index 91% rename from lib/i18n/scan_i18n.py rename to i18n/scan_i18n.py index ce875c9..f3e52cf 100644 --- a/lib/i18n/scan_i18n.py +++ b/i18n/scan_i18n.py @@ -1,7 +1,6 @@ import ast import glob import json - from collections import OrderedDict @@ -50,8 +49,8 @@ print() print("Total unique:", len(code_keys)) -standard_file = "zh_CN.json" -with open(f"lib/i18n/{standard_file}", "r", encoding="utf-8") as f: +standard_file = "i18n/locale/zh_CN.json" +with open(standard_file, "r", encoding="utf-8") as f: standard_data = json.load(f, object_pairs_hook=OrderedDict) standard_keys = set(standard_data.keys()) @@ -71,6 +70,6 @@ for s in strings: code_keys_dict[s] = s # write back -with open(f"lib/i18n/{standard_file}", "w", encoding="utf-8") as f: +with open(standard_file, "w", encoding="utf-8") as f: json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 
f.write("\n") diff --git a/infer-web.py b/infer-web.py index d8971aa..b75b8f8 100644 --- a/infer-web.py +++ b/infer-web.py @@ -1,43 +1,33 @@ +import logging import os import shutil -import sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -import traceback, pdb -import warnings - -import numpy as np -import torch - -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" -import logging import threading +import traceback +import warnings from random import shuffle from subprocess import Popen from time import sleep -import faiss -import ffmpeg -import gradio as gr -import soundfile as sf -from config import defaultconfig as config import fairseq -from i18n import I18nAuto -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM -from infer_uvr5 import _audio_pre_, _audio_pre_new -from lib.audio import load_audio -from lib.train.process_ckpt import change_info, extract_small_model, merge, show_info -from lib.train.vc_infer_pipeline import VC +import faiss +import gradio as gr +import numpy as np +import torch +from dotenv import load_dotenv from sklearn.cluster import MiniBatchKMeans +from configs.config import Config +from i18n.i18n import I18nAuto +from infer.lib.train.process_ckpt import ( + change_info, + extract_small_model, + merge, + show_info, +) +from infer.modules.onnx.export import export_onnx +from infer.modules.uvr5.modules import uvr +from infer.modules.vc.modules import VC + logging.getLogger("numba").setLevel(logging.WARNING) now_dir = os.getcwd() @@ -48,11 +38,15 @@ shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_error os.makedirs(tmp, exist_ok=True) os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True) os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True) -os.environ["TEMP"] = tmp warnings.filterwarnings("ignore") torch.manual_seed(114514) +load_dotenv() +config = Config() +vc = VC(config) + + if config.dml == True: def forward_dml(ctx, x, scale): @@ -126,27 +120,10 @@ class ToolButton(gr.Button, gr.components.FormComponent): return "button" -hubert_model = None - - -def load_hubert(): - global hubert_model - models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -weight_root = "weights" -weight_uvr5_root = "uvr5_weights" +weight_root = os.getenv("weight_root") +weight_uvr5_root = os.getenv("weight_uvr5_root") index_root = "logs" + names = [] for name in os.listdir(weight_root): if name.endswith(".pth"): @@ -161,364 +138,6 @@ for name in os.listdir(weight_uvr5_root): if name.endswith(".pth") or "onnx" in name: uvr5_names.append(name.replace(".pth", "")) -cpt = None - - -def vc_single( - sid, - input_audio_path, - f0_up_key, - f0_file, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, -): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 - global tgt_sr, net_g, vc, hubert_model, version, cpt - if input_audio_path is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - try: - audio = load_audio(input_audio_path, 16000) - audio_max = np.abs(audio).max() 
/ 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - if not hubert_model: - load_hubert() - if_f0 = cpt.get("f0", 1) - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) # 防止小白写错,自动帮他替换掉 - # file_big_npy = ( - # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - # ) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - index_info = ( - "Using index:%s." % file_index - if os.path.exists(file_index) - else "Index not used." - ) - return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( - index_info, - times[0], - times[1], - times[2], - ), ( - resample_sr if resample_sr >= 16000 and tgt_sr != resample_sr else tgt_sr, - audio_opt, - ) - except: - info = traceback.format_exc() - print(info) - return info, (None, None) - - -def vc_multi( - sid, - dir_path, - opt_root, - paths, - f0_up_key, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - format1, -): - try: - dir_path = ( - dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - os.makedirs(opt_root, exist_ok=True) - try: - if dir_path != "": - paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)] - else: - paths = [path.name for path in paths] - except: - traceback.print_exc() - paths = [path.name for path in paths] - infos = [] - for path in paths: - info, opt = vc_single( - sid, - path, - f0_up_key, - None, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - ) - if "Success" in info: - try: - tgt_sr, audio_opt = opt - if format1 in ["wav", "flac"]: - sf.write( - "%s/%s.%s" % (opt_root, os.path.basename(path), format1), - audio_opt, - tgt_sr, - ) - else: - path = "%s/%s.wav" % (opt_root, os.path.basename(path)) - sf.write( - path, - audio_opt, - tgt_sr, - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format1) - ) - except: - info += traceback.format_exc() - infos.append("%s->%s" % (os.path.basename(path), info)) - yield "\n".join(infos) - yield "\n".join(infos) - except: - yield traceback.format_exc() - - -def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): - infos = [] - try: - inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_vocal = ( - save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - save_root_ins = ( - save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - if model_name == "onnx_dereverb_By_FoxJoy": - from MDXNet import MDXNetDereverb - - pre_fun = MDXNetDereverb(15) - else: - func = _audio_pre_ if "DeEcho" not in model_name else _audio_pre_new - pre_fun = func( - agg=int(agg), - model_path=os.path.join(weight_uvr5_root, model_name + ".pth"), - device=config.device, - is_half=config.is_half, - ) - if inp_root != "": - paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] - else: - paths = [path.name for path in paths] - for path in paths: - inp_path = 
os.path.join(inp_root, path) - need_reformat = 1 - done = 0 - try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % (tmp, os.path.basename(inp_path)) - os.system( - "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" - % (inp_path, tmp_path) - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) - except: - infos.append(traceback.format_exc()) - yield "\n".join(infos) - finally: - try: - if model_name == "onnx_dereverb_By_FoxJoy": - del pre_fun.pred.model - del pre_fun.pred.model_ - else: - del pre_fun.model - del pre_fun - except: - traceback.print_exc() - print("clean_empty_cache") - if torch.cuda.is_available(): - torch.cuda.empty_cache() - yield "\n".join(infos) - - -def get_index_path_from_model(sid): - sel_index_path = "" - name = os.path.join("logs", sid.split(".")[0], "") - # print(name) - for f in index_paths: - if name in f: - # print("selected index path:", f) - sel_index_path = f - break - return sel_index_path - - -# 一个选项卡全局只能有一个音色 -def get_vc(sid, to_return_protect0, to_return_protect1): - global n_spk, tgt_sr, net_g, vc, cpt, version - if sid == "" or sid == []: - global hubert_model - if hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 - print("clean_empty_cache") - del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt - hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - ###楼下不这么折腾清理不干净 - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g, cpt - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return ( - {"visible": False, "__type__": "update"}, - { - "visible": True, - "value": to_return_protect0, - "__type__": "update", - }, - { - "visible": True, - "value": to_return_protect1, - "__type__": "update", - }, - "", - "", - ) - person = "%s/%s" % (weight_root, sid) - print("loading %s" % person) - - cpt = torch.load(person, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - if if_f0 == 0: - to_return_protect0 = to_return_protect1 = { - "visible": False, - "value": 0.33, - "__type__": "update", - } - else: - to_return_protect0 = { - "visible": True, - "value": to_return_protect0, - "__type__": "update", - } - to_return_protect1 = { - "visible": True, - "value": to_return_protect1, - "__type__": "update", - } - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = 
SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(config.device) - if config.is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - index = {"value": get_index_path_from_model(sid), "__type__": "update"} - return ( - {"visible": True, "maximum": n_spk, "__type__": "update"}, - to_return_protect0, - to_return_protect1, - index, - index, - ) - def change_choices(): names = [] @@ -582,7 +201,7 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): f.close() cmd = ( get_quoted_python_cmd() - + ' trainset_preprocess_pipeline_print.py "%s" %s %s "%s/logs/%s" ' + + ' infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" ' % (trainset_dir, sr, n_p, now_dir, exp_dir) + str(config.noparallel) ) @@ -619,7 +238,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp if f0method != "rmvpe_gpu": cmd = ( get_quoted_python_cmd() - + ' extract_f0_print.py "%s/logs/%s" %s %s' + + ' infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' % ( now_dir, exp_dir, @@ -646,10 +265,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp leng = len(gpus_rmvpe) ps = [] for idx, n_g in enumerate(gpus_rmvpe): - cmd = ( - get_quoted_python_cmd() - + ' extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' - % (leng, idx, n_g, now_dir, exp_dir, config.is_half) + cmd = get_quoted_python_cmd() + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' % ( + leng, + idx, + n_g, + now_dir, + exp_dir, + config.is_half, ) print(cmd) p = Popen( @@ -666,9 +288,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp ), ).start() else: - cmd = config.python_cmd + ' extract_f0_rmvpe_dml.py "%s/logs/%s" ' % ( - now_dir, - exp_dir, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" ' + % ( + now_dir, + exp_dir, + ) ) print(cmd) p = Popen( @@ -699,18 +325,14 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp leng = len(gpus) ps = [] for idx, n_g in enumerate(gpus): - cmd = ( - get_quoted_python_cmd() - + ' extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' - % ( - config.device, - leng, - idx, - n_g, - now_dir, - exp_dir, - version19, - ) + cmd = get_quoted_python_cmd() + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' % ( + config.device, + leng, + idx, + n_g, + now_dir, + exp_dir, + version19, ) print(cmd) p = Popen( @@ -742,26 +364,26 @@ def change_sr2(sr2, if_f0_3, version19): path_str = "" if version19 == "v1" else "_v2" f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - 
"pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if if_pretrained_discriminator_exist else "", ) @@ -778,26 +400,26 @@ def change_version19(sr2, if_f0_3, version19): ) f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if if_pretrained_discriminator_exist else "", to_return_sr2, @@ -807,37 +429,37 @@ def change_version19(sr2, if_f0_3, version19): def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 path_str = "" if version19 == "v1" else "_v2" if_pretrained_generator_exist = os.access( - "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/f0G%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/f0D%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if if_f0_3: return ( {"visible": True, "__type__": "update"}, - "pretrained%s/f0G%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/f0D%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2) if if_pretrained_discriminator_exist else "", ) return ( {"visible": False, "__type__": "update"}, - ("pretrained%s/G%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/G%s.pth" % (path_str, sr2)) if if_pretrained_generator_exist else "", - ("pretrained%s/D%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/D%s.pth" % (path_str, sr2)) if if_pretrained_discriminator_exist else "", ) @@ -935,7 +557,7 @@ def click_train( if pretrained_D15 == "": print("no pretrained Discriminator") if gpus16: - cmd = get_quoted_python_cmd() + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % 
( + cmd = get_quoted_python_cmd() + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, 1 if if_f0_3 else 0, @@ -953,7 +575,7 @@ def click_train( else: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -1083,280 +705,41 @@ def train1key( infos.append(strr) return "\n".join(infos) - model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1) - preprocess_log_path = "%s/preprocess.log" % model_log_dir - extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir - gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir - feature_dir = ( - "%s/3_feature256" % model_log_dir - if version19 == "v1" - else "%s/3_feature768" % model_log_dir - ) - - os.makedirs(model_log_dir, exist_ok=True) - #########step1:处理数据 - open(preprocess_log_path, "w").close() - cmd = ( - get_quoted_python_cmd() - + ' trainset_preprocess_pipeline_print.py "%s" %s %s "%s" ' - % (trainset_dir4, sr_dict[sr2], np7, model_log_dir) - + str(config.noparallel) - ) + ####### step1:处理数据 yield get_info_str(i18n("step1:正在处理数据")) - yield get_info_str(cmd) - p = Popen(cmd, shell=True) - p.wait() - with open(preprocess_log_path, "r") as f: - print(f.read()) - #########step2a:提取音高 - open(extract_f0_feature_log_path, "w") - if if_f0_3: - yield get_info_str("step2a:正在提取音高") - if f0method8 != "rmvpe_gpu": - cmd = config.python_cmd + ' extract_f0_print.py "%s" %s %s' % ( - model_log_dir, - np7, - f0method8, - ) - yield get_info_str(cmd) - p = Popen(cmd, shell=True, cwd=now_dir) - p.wait() - else: - if gpus_rmvpe != "-": - gpus_rmvpe = gpus_rmvpe.split("-") - leng = len(gpus_rmvpe) - ps = [] - for idx, n_g in enumerate(gpus_rmvpe): - cmd = ( - get_quoted_python_cmd() - + ' extract_f0_rmvpe.py %s %s %s "%s" %s ' - % ( - leng, - idx, - n_g, - model_log_dir, - config.is_half, - ) - ) - yield get_info_str(cmd) - p = Popen( - cmd, shell=True, cwd=now_dir - ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir - ps.append(p) - for p in ps: - p.wait() - else: # dml - cmd = config.python_cmd + ' extract_f0_rmvpe_dml.py "%s" ' % ( - model_log_dir - ) - yield get_info_str(cmd) - p = Popen( - cmd, shell=True, cwd=now_dir - ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir - p.wait() - with open(extract_f0_feature_log_path, "r") as f: - print(f.read()) - else: - yield get_info_str(i18n("step2a:无需提取音高")) - #######step2b:提取特征 - yield get_info_str(i18n("step2b:正在提取特征")) - gpus = gpus16.split("-") - leng = len(gpus) - ps = [] - for idx, n_g in enumerate(gpus): - cmd = ( - get_quoted_python_cmd() - + ' extract_feature_print.py %s %s %s %s "%s" %s' - % ( - config.device, - leng, - idx, - n_g, - model_log_dir, - version19, - ) + [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)] + + ####### step2a:提取音高 + yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) + [ + get_info_str(_) + for _ in extract_f0_feature( + gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe ) - yield get_info_str(cmd) - p = Popen( - cmd, shell=True, cwd=now_dir - ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir - ps.append(p) - for p in ps: - p.wait() - with open(extract_f0_feature_log_path, "r") as f: - print(f.read()) - #######step3a:训练模型 + ] + + ####### step3a:训练模型 yield 
get_info_str(i18n("step3a:正在训练模型")) - # 生成filelist - if if_f0_3: - f0_dir = "%s/2a_f0" % model_log_dir - f0nsf_dir = "%s/2b-f0nsf" % model_log_dir - names = ( - set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) - & set([name.split(".")[0] for name in os.listdir(feature_dir)]) - & set([name.split(".")[0] for name in os.listdir(f0_dir)]) - & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) - ) - else: - names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( - [name.split(".")[0] for name in os.listdir(feature_dir)] - ) - opt = [] - for name in names: - if if_f0_3: - opt.append( - "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" - % ( - gt_wavs_dir.replace("\\", "\\\\"), - name, - feature_dir.replace("\\", "\\\\"), - name, - f0_dir.replace("\\", "\\\\"), - name, - f0nsf_dir.replace("\\", "\\\\"), - name, - spk_id5, - ) - ) - else: - opt.append( - "%s/%s.wav|%s/%s.npy|%s" - % ( - gt_wavs_dir.replace("\\", "\\\\"), - name, - feature_dir.replace("\\", "\\\\"), - name, - spk_id5, - ) - ) - fea_dim = 256 if version19 == "v1" else 768 - if if_f0_3: - for _ in range(2): - opt.append( - "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" - % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5) - ) - else: - for _ in range(2): - opt.append( - "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s" - % (now_dir, sr2, now_dir, fea_dim, spk_id5) - ) - shuffle(opt) - with open("%s/filelist.txt" % model_log_dir, "w") as f: - f.write("\n".join(opt)) - yield get_info_str("write filelist done") - if gpus16: - cmd = get_quoted_python_cmd() + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - gpus16, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) - else: - cmd = ( - config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' - % ( - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) - ) - yield get_info_str(cmd) - p = Popen(cmd, shell=True, cwd=now_dir) - p.wait() + click_train( + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, + ) yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")) - #######step3b:训练索引 - npys = [] - listdir_res = list(os.listdir(feature_dir)) - for name in sorted(listdir_res): - phone = np.load("%s/%s" % (feature_dir, name)) - npys.append(phone) - big_npy = np.concatenate(npys, 0) - big_npy_idx = np.arange(big_npy.shape[0]) - np.random.shuffle(big_npy_idx) - big_npy = big_npy[big_npy_idx] - - if big_npy.shape[0] > 2e5: - # if(1): - info = "Trying doing kmeans %s shape to 10k 
centers." % big_npy.shape[0] - print(info) - yield get_info_str(info) - try: - big_npy = ( - MiniBatchKMeans( - n_clusters=10000, - verbose=True, - batch_size=256 * config.n_cpu, - compute_labels=False, - init="random", - ) - .fit(big_npy) - .cluster_centers_ - ) - except: - info = traceback.format_exc() - print(info) - yield get_info_str(info) - - np.save("%s/total_fea.npy" % model_log_dir, big_npy) - n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) - yield get_info_str("%s,%s" % (big_npy.shape, n_ivf)) - index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf) - yield get_info_str("training index") - index_ivf = faiss.extract_index_ivf(index) # - index_ivf.nprobe = 1 - index.train(big_npy) - faiss.write_index( - index, - "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index" - % ( - model_log_dir.replace(now_dir + "/", ""), - n_ivf, - index_ivf.nprobe, - exp_dir1, - version19, - ), - ) - yield get_info_str("adding index") - batch_size_add = 8192 - for i in range(0, big_npy.shape[0], batch_size_add): - index.add(big_npy[i : i + batch_size_add]) - faiss.write_index( - index, - "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % ( - model_log_dir.replace(now_dir + "/", ""), - n_ivf, - index_ivf.nprobe, - exp_dir1, - version19, - ), - ) - yield get_info_str( - "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % (n_ivf, index_ivf.nprobe, exp_dir1, version19) - ) + ####### step3b:训练索引 + [get_info_str(_) for _ in train_index(exp_dir1, version19)] yield get_info_str(i18n("全流程结束!")) @@ -1388,56 +771,6 @@ def change_f0_method(f0method8): return {"visible": visible, "__type__": "update"} -def export_onnx(ModelPath, ExportedPath): - global cpt - cpt = torch.load(ModelPath, map_location="cpu") - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] - vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 - - test_phone = torch.rand(1, 200, vec_channels) # hidden unit - test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) - test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) - test_pitchf = torch.rand(1, 200) # nsf基频 - test_ds = torch.LongTensor([0]) # 说话人ID - test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) - - device = "cpu" # 导出时设备(不影响使用模型) - - net_g = SynthesizerTrnMsNSFsidM( - *cpt["config"], is_half=False, version=cpt.get("version", "v1") - ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) - net_g.load_state_dict(cpt["weight"], strict=False) - input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] - output_names = [ - "audio", - ] - # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 - torch.onnx.export( - net_g, - ( - test_phone.to(device), - test_phone_lengths.to(device), - test_pitch.to(device), - test_pitchf.to(device), - test_ds.to(device), - test_rnd.to(device), - ), - ExportedPath, - dynamic_axes={ - "phone": [1], - "pitch": [1], - "pitchf": [1], - "rnd": [2], - }, - do_constant_folding=False, - opset_version=13, - verbose=False, - input_names=input_names, - output_names=output_names, - ) - return "Finished" - - with gr.Blocks(title="RVC WebUI") as app: gr.Markdown( value=i18n( @@ -1554,7 +887,7 @@ with gr.Blocks(title="RVC WebUI") as app: vc_output1 = gr.Textbox(label=i18n("输出信息")) vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) but0.click( - vc_single, + vc.vc_single, [ spk_item, input_audio0, @@ -1674,7 +1007,7 @@ with gr.Blocks(title="RVC WebUI") as app: but1 = gr.Button(i18n("转换"), variant="primary") vc_output3 = gr.Textbox(label=i18n("输出信息")) but1.click( - vc_multi, + 
vc.vc_multi, [ spk_item, dir_input, @@ -1696,7 +1029,7 @@ with gr.Blocks(title="RVC WebUI") as app: api_name="infer_convert_batch", ) sid0.change( - fn=get_vc, + fn=vc.get_vc, inputs=[sid0, protect0, protect1], outputs=[spk_item, protect0, protect1, file_index2, file_index4], ) @@ -1917,12 +1250,12 @@ with gr.Blocks(title="RVC WebUI") as app: with gr.Row(): pretrained_G14 = gr.Textbox( label=i18n("加载预训练底模G路径"), - value="pretrained_v2/f0G40k.pth", + value="assets/pretrained_v2/f0G40k.pth", interactive=True, ) pretrained_D15 = gr.Textbox( label=i18n("加载预训练底模D路径"), - value="pretrained_v2/f0D40k.pth", + value="assets/pretrained_v2/f0D40k.pth", interactive=True, ) sr2.change( diff --git a/lib/audio.py b/infer/lib/audio.py similarity index 72% rename from lib/audio.py rename to infer/lib/audio.py index 776939d..045055c 100644 --- a/lib/audio.py +++ b/infer/lib/audio.py @@ -1,4 +1,5 @@ import ffmpeg +import librosa import numpy as np @@ -15,7 +16,13 @@ def load_audio(file, sr): .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) + return np.frombuffer(out, np.float32).flatten() + + except AttributeError: + audio = file[1] / 32768.0 + if len(audio.shape) == 2: + audio = np.mean(audio, -1) + return librosa.resample(audio, orig_sr=file[0], target_sr=16000) + except Exception as e: raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() diff --git a/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py similarity index 96% rename from lib/infer_pack/attentions.py rename to infer/lib/infer_pack/attentions.py index 84d5c87..2b6060c 100644 --- a/lib/infer_pack/attentions.py +++ b/infer/lib/infer_pack/attentions.py @@ -1,13 +1,13 @@ import copy import math + import numpy as np import torch from torch import nn from torch.nn import functional as F -from lib.infer_pack import commons -from lib.infer_pack import modules -from lib.infer_pack.modules import LayerNorm +from infer.lib.infer_pack import commons, modules +from infer.lib.infer_pack.modules import LayerNorm class Encoder(nn.Module): diff --git a/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py similarity index 96% rename from lib/infer_pack/commons.py rename to infer/lib/infer_pack/commons.py index 4937729..7ba7d21 100644 --- a/lib/infer_pack/commons.py +++ b/infer/lib/infer_pack/commons.py @@ -1,4 +1,5 @@ import math + import numpy as np import torch from torch import nn diff --git a/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py similarity index 98% rename from lib/infer_pack/models.py rename to infer/lib/infer_pack/models.py index 4749738..9878048 100644 --- a/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -1,17 +1,17 @@ -import math, pdb, os +import math +import os +import pdb from time import time as ttime + +import numpy as np import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from lib.infer_pack import modules -from lib.infer_pack import attentions -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from lib.infer_pack.commons import init_weights -import numpy as np -from lib.infer_pack import commons +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from 
infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights class TextEncoder256(nn.Module): diff --git a/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py similarity index 98% rename from lib/infer_pack/models_onnx.py rename to infer/lib/infer_pack/models_onnx.py index 963e67b..4642a90 100644 --- a/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -1,17 +1,17 @@ -import math, pdb, os +import math +import os +import pdb from time import time as ttime + +import numpy as np import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from lib.infer_pack import modules -from lib.infer_pack import attentions -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from lib.infer_pack.commons import init_weights -import numpy as np -from lib.infer_pack import commons +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights class TextEncoder256(nn.Module): diff --git a/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py similarity index 95% rename from lib/infer_pack/modules.py rename to infer/lib/infer_pack/modules.py index b54dc47..edf2207 100644 --- a/lib/infer_pack/modules.py +++ b/infer/lib/infer_pack/modules.py @@ -1,18 +1,17 @@ import copy import math + import numpy as np import scipy import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm - -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding -from lib.infer_pack.transforms import piecewise_rational_quadratic_transform - +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import get_padding, init_weights +from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py similarity index 94% rename from lib/infer_pack/modules/F0Predictor/DioF0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py index b5a8e3e..e69a603 100644 --- a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -1,6 +1,7 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import pyworld import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor class DioF0Predictor(F0Predictor): diff --git a/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py similarity index 100% rename from lib/infer_pack/modules/F0Predictor/F0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py diff --git a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py similarity 
index 94% rename from lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py index f8dae30..27f3356 100644 --- a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -1,6 +1,7 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import pyworld import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor class HarvestF0Predictor(F0Predictor): diff --git a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py similarity index 94% rename from lib/infer_pack/modules/F0Predictor/PMF0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py index b70de29..957ec46 100644 --- a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -1,6 +1,7 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import parselmouth import numpy as np +import parselmouth + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor class PMF0Predictor(F0Predictor): diff --git a/lib/infer_pack/modules/F0Predictor/__init__.py b/infer/lib/infer_pack/modules/F0Predictor/__init__.py similarity index 100% rename from lib/infer_pack/modules/F0Predictor/__init__.py rename to infer/lib/infer_pack/modules/F0Predictor/__init__.py diff --git a/lib/infer_pack/onnx_inference.py b/infer/lib/infer_pack/onnx_inference.py similarity index 97% rename from lib/infer_pack/onnx_inference.py rename to infer/lib/infer_pack/onnx_inference.py index b4aba75..3901d76 100644 --- a/lib/infer_pack/onnx_inference.py +++ b/infer/lib/infer_pack/onnx_inference.py @@ -1,6 +1,6 @@ -import onnxruntime import librosa import numpy as np +import onnxruntime import soundfile diff --git a/lib/infer_pack/transforms.py b/infer/lib/infer_pack/transforms.py similarity index 97% rename from lib/infer_pack/transforms.py rename to infer/lib/infer_pack/transforms.py index 7d93c48..6d07b3b 100644 --- a/lib/infer_pack/transforms.py +++ b/infer/lib/infer_pack/transforms.py @@ -1,9 +1,7 @@ +import numpy as np import torch from torch.nn import functional as F -import numpy as np - - DEFAULT_MIN_BIN_WIDTH = 1e-3 DEFAULT_MIN_BIN_HEIGHT = 1e-3 DEFAULT_MIN_DERIVATIVE = 1e-3 diff --git a/lib/rmvpe.py b/infer/lib/rmvpe.py similarity index 96% rename from lib/rmvpe.py rename to infer/lib/rmvpe.py index e5fa613..0c288b2 100644 --- a/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -1,11 +1,11 @@ -import torch, numpy as np, pdb +import pdb + +import numpy as np +import torch import torch.nn as nn import torch.nn.functional as F -import torch, pdb -import numpy as np -import torch.nn.functional as F +from librosa.util import normalize, pad_center, tiny from scipy.signal import get_window -from librosa.util import pad_center, tiny, normalize ###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py @@ -670,7 +670,8 @@ class RMVPE: if __name__ == "__main__": - import soundfile as sf, librosa + import librosa + import soundfile as sf audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") if len(audio.shape) > 1: diff --git a/lib/slicer2.py b/infer/lib/slicer2.py similarity index 100% rename from lib/slicer2.py rename to infer/lib/slicer2.py diff --git a/lib/train/data_utils.py b/infer/lib/train/data_utils.py similarity index 96% rename 
from lib/train/data_utils.py rename to infer/lib/train/data_utils.py index 3437e24..db4e78f 100644 --- a/lib/train/data_utils.py +++ b/infer/lib/train/data_utils.py @@ -1,10 +1,12 @@ -import os, traceback +import os +import traceback + import numpy as np import torch import torch.utils.data -from lib.train.mel_processing import spectrogram_torch -from lib.train.utils import load_wav_to_torch, load_filepaths_and_text +from infer.lib.train.mel_processing import spectrogram_torch +from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): diff --git a/lib/train/losses.py b/infer/lib/train/losses.py similarity index 100% rename from lib/train/losses.py rename to infer/lib/train/losses.py diff --git a/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py similarity index 96% rename from lib/train/mel_processing.py rename to infer/lib/train/mel_processing.py index 3cc3687..85342c4 100644 --- a/lib/train/mel_processing.py +++ b/infer/lib/train/mel_processing.py @@ -2,7 +2,6 @@ import torch import torch.utils.data from librosa.filters import mel as librosa_mel_fn - MAX_WAV_VALUE = 32768.0 diff --git a/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py similarity index 96% rename from lib/train/process_ckpt.py rename to infer/lib/train/process_ckpt.py index 324d5a5..887dc71 100644 --- a/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -1,9 +1,11 @@ -import torch, traceback, os, sys - -now_dir = os.getcwd() -sys.path.append(now_dir) +import os +import sys +import traceback from collections import OrderedDict -from i18n import I18nAuto + +import torch + +from i18n.i18n import I18nAuto i18n = I18nAuto() @@ -40,7 +42,7 @@ def savee(ckpt, sr, if_f0, name, epoch, version, hps): opt["sr"] = sr opt["f0"] = if_f0 opt["version"] = version - torch.save(opt, "weights/%s.pth" % name) + torch.save(opt, "assets/weights/%s.pth" % name) return "Success." except: return traceback.format_exc() @@ -183,7 +185,7 @@ def extract_small_model(path, name, sr, if_f0, info, version): opt["version"] = version opt["sr"] = sr opt["f0"] = int(if_f0) - torch.save(opt, "weights/%s.pth" % name) + torch.save(opt, "assets/weights/%s.pth" % name) return "Success." except: return traceback.format_exc() @@ -253,7 +255,7 @@ def merge(path1, path2, alpha1, sr, f0, info, name, version): opt["f0"] = 1 if f0 == i18n("是") else 0 opt["version"] = version opt["info"] = info - torch.save(opt, "weights/%s.pth" % name) + torch.save(opt, "assets/weights/%s.pth" % name) return "Success." 
except: return traceback.format_exc() diff --git a/lib/train/utils.py b/infer/lib/train/utils.py similarity index 96% rename from lib/train/utils.py rename to infer/lib/train/utils.py index 9c0fb5c..314eee7 100644 --- a/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -1,13 +1,15 @@ -import os, traceback -import glob -import sys import argparse -import logging +import glob import json +import logging +import os import subprocess +import sys +import traceback + import numpy as np -from scipy.io.wavfile import read import torch +from scipy.io.wavfile import read MATPLOTLIB_FLAG = False @@ -362,9 +364,9 @@ def get_hparams(init=True): os.makedirs(experiment_dir) if args.version == "v1" or args.sample_rate == "40k": - config_path = "configs/%s.json" % args.sample_rate + config_path = "configs/v1/%s.json" % args.sample_rate else: - config_path = "configs/%s_v2.json" % args.sample_rate + config_path = "configs/v2/%s.json" % args.sample_rate config_save_path = os.path.join(experiment_dir, "config.json") if init: with open(config_path, "r") as f: diff --git a/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py similarity index 100% rename from lib/uvr5_pack/lib_v5/dataset.py rename to infer/lib/uvr5_pack/lib_v5/dataset.py diff --git a/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers.py rename to infer/lib/uvr5_pack/lib_v5/layers.py index b82f06b..4fc1b5c 100644 --- a/lib/uvr5_pack/lib_v5/layers.py +++ b/infer/lib/uvr5_pack/lib_v5/layers.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_123812KB .py rename to infer/lib/uvr5_pack/lib_v5/layers_123812KB .py index b82f06b..4fc1b5c 100644 --- a/lib/uvr5_pack/lib_v5/layers_123812KB .py +++ b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_123821KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_123821KB.py index b82f06b..4fc1b5c 100644 --- a/lib/uvr5_pack/lib_v5/layers_123821KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_33966KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_33966KB.py index a38b7bb..9b127bc 100644 --- a/lib/uvr5_pack/lib_v5/layers_33966KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . 
import spec_utils diff --git a/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_537227KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_537227KB.py index a38b7bb..9b127bc 100644 --- a/lib/uvr5_pack/lib_v5/layers_537227KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_537238KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_537238KB.py index a38b7bb..9b127bc 100644 --- a/lib/uvr5_pack/lib_v5/layers_537238KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_new.py rename to infer/lib/uvr5_pack/lib_v5/layers_new.py index 0c13e60..44153b6 100644 --- a/lib/uvr5_pack/lib_v5/layers_new.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_new.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/model_param_init.py b/infer/lib/uvr5_pack/lib_v5/model_param_init.py similarity index 100% rename from lib/uvr5_pack/lib_v5/model_param_init.py rename to infer/lib/uvr5_pack/lib_v5/model_param_init.py diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json diff --git 
a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/2band_32000.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/2band_48000.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/3band_44100.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json 
b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_v2.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_v3.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/ensemble.json b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/ensemble.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json diff --git a/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets.py rename to infer/lib/uvr5_pack/lib_v5/nets.py index db4c5e3..5da3948 100644 --- a/lib/uvr5_pack/lib_v5/nets.py +++ b/infer/lib/uvr5_pack/lib_v5/nets.py @@ -1,8 +1,8 @@ -import torch -from torch import nn -import torch.nn.functional as F - import layers +import torch +import torch.nn.functional as F +from torch import nn + from . import spec_utils diff --git a/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_123812KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_123812KB.py index becbfae..167d4cb 100644 --- a/lib/uvr5_pack/lib_v5/nets_123812KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import layers_123821KB as layers diff --git a/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_123821KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_123821KB.py index becbfae..167d4cb 100644 --- a/lib/uvr5_pack/lib_v5/nets_123821KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import layers_123821KB as layers diff --git a/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_33966KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_33966KB.py index b8986f9..73a5b83 100644 --- a/lib/uvr5_pack/lib_v5/nets_33966KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . 
import layers_33966KB as layers diff --git a/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_537227KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_537227KB.py index a1bb530..823b44f 100644 --- a/lib/uvr5_pack/lib_v5/nets_537227KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py @@ -1,7 +1,7 @@ -import torch import numpy as np -from torch import nn +import torch import torch.nn.functional as F +from torch import nn from . import layers_537238KB as layers diff --git a/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_537238KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_537238KB.py index a1bb530..823b44f 100644 --- a/lib/uvr5_pack/lib_v5/nets_537238KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py @@ -1,7 +1,7 @@ -import torch import numpy as np -from torch import nn +import torch import torch.nn.functional as F +from torch import nn from . import layers_537238KB as layers diff --git a/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_61968KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_61968KB.py index becbfae..167d4cb 100644 --- a/lib/uvr5_pack/lib_v5/nets_61968KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import layers_123821KB as layers diff --git a/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py similarity index 99% rename from lib/uvr5_pack/lib_v5/nets_new.py rename to infer/lib/uvr5_pack/lib_v5/nets_new.py index bfaf72e..1c0f4fa 100644 --- a/lib/uvr5_pack/lib_v5/nets_new.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_new.py @@ -1,6 +1,7 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn + from . 
import layers_new diff --git a/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py similarity index 99% rename from lib/uvr5_pack/lib_v5/spec_utils.py rename to infer/lib/uvr5_pack/lib_v5/spec_utils.py index a3fd46d..a9634fd 100644 --- a/lib/uvr5_pack/lib_v5/spec_utils.py +++ b/infer/lib/uvr5_pack/lib_v5/spec_utils.py @@ -1,8 +1,12 @@ -import os, librosa +import hashlib +import json +import math +import os + +import librosa import numpy as np import soundfile as sf from tqdm import tqdm -import json, math, hashlib def crop_center(h1, h2): @@ -519,10 +523,11 @@ def istft(spec, hl): if __name__ == "__main__": - import cv2 + import argparse import sys import time - import argparse + + import cv2 from model_param_init import ModelParameters p = argparse.ArgumentParser() diff --git a/lib/uvr5_pack/name_params.json b/infer/lib/uvr5_pack/name_params.json similarity index 61% rename from lib/uvr5_pack/name_params.json rename to infer/lib/uvr5_pack/name_params.json index 950adcf..8ed51a6 100644 --- a/lib/uvr5_pack/name_params.json +++ b/infer/lib/uvr5_pack/name_params.json @@ -4,92 +4,92 @@ "model_hash_name" : [ { "hash_name": "47939caf0cfe52a0e81442b85b971dfd", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "a82f14e75892e55e994376edbf0c8435", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "08611fb99bd59eaa79ad27c58d137727", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "5c7bbca45a187e81abbbd351606164e5", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" }, { "hash_name": "d6b2cb685a058a091e5e7098192d3233", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" }, { "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "c3448ec923fa0edf3d03a19e633faa53", - "model_params": 
"lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "68aa2c8093d0080704b200d140f59e54", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", "param_name": "3band_44100" }, { "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid.json" }, { "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid.json" }, { "hash_name": "52fdca89576f06cf4340b74a4730ee5f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100.json" }, { "hash_name": "41191165b05d38fc77f072fa9e8e8a30", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100.json" }, { "hash_name": "89e83b511ad474592689e562d5b1f80e", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000.json" }, { "hash_name": "0b954da81d453b716b114d6d7c95177f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000.json" } @@ -97,47 +97,47 @@ "v4 Models": [ { "hash_name": "6a00461c51c2920fd68937d4609ed6c8", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "0ab504864d20f1bd378fe9c81ef37140", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "80ab74d65e515caa3622728d2de07d23", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "edc115e7fc523245062200c00caa847f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "b58090534c52cbc3e9b5104bad666ef2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": 
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "ae702fed0238afb5346db8356fe25f13", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", "param_name": "1band_sr44100_hl1024" } ] @@ -148,113 +148,113 @@ "1 Band": [ { "hash_name": "1band_sr16000_hl512", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "1band_sr32000_hl512", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "1band_sr33075_hl384", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "1band_sr44100_hl256", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", "param_name": "1band_sr44100_hl256" }, { "hash_name": "1band_sr44100_hl512", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "1band_sr44100_hl1024", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", "param_name": "1band_sr44100_hl1024" } ], "2 Band": [ { "hash_name": "2band_44100_lofi", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", "param_name": "2band_44100_lofi" }, { "hash_name": "2band_32000", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000" }, { "hash_name": "2band_48000", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", "param_name": "2band_48000" } ], "3 Band": [ { "hash_name": "3band_44100", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", "param_name": "3band_44100" }, { "hash_name": "3band_44100_mid", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid" }, { "hash_name": "3band_44100_msb2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" } ], "4 Band": [ { "hash_name": "4band_44100", - "model_params": 
"lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "4band_44100_mid", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", "param_name": "4band_44100_mid" }, { "hash_name": "4band_44100_msb", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", "param_name": "4band_44100_msb" }, { "hash_name": "4band_44100_msb2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", "param_name": "4band_44100_msb2" }, { "hash_name": "4band_44100_reverse", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", "param_name": "4band_44100_reverse" }, { "hash_name": "4band_44100_sw", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", "param_name": "4band_44100_sw" }, { "hash_name": "4band_v2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "4band_v2_sn", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "tmodelparam", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", "param_name": "User Model Param Set" } ] diff --git a/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py similarity index 97% rename from lib/uvr5_pack/utils.py rename to infer/lib/uvr5_pack/utils.py index 0fafe87..f4805cd 100644 --- a/lib/uvr5_pack/utils.py +++ b/infer/lib/uvr5_pack/utils.py @@ -1,10 +1,11 @@ -import torch -import numpy as np -from tqdm import tqdm import json +import numpy as np +import torch +from tqdm import tqdm -def load_data(file_name: str = "./lib/uvr5_pack/name_params.json") -> dict: + +def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: with open(file_name, "r") as f: data = json.load(f) diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py new file mode 100644 index 0000000..ed4a416 --- /dev/null +++ b/infer/modules/onnx/export.py @@ -0,0 +1,52 @@ +import torch + +from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM + + +def export_onnx(ModelPath, ExportedPath): + cpt = torch.load(ModelPath, map_location="cpu") + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 + + test_phone = torch.rand(1, 200, vec_channels) # hidden unit + test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) + test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) + test_pitchf = torch.rand(1, 200) # nsf基频 + test_ds = torch.LongTensor([0]) # 说话人ID + test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) + + device = "cpu" # 导出时设备(不影响使用模型) + + net_g = SynthesizerTrnMsNSFsidM( + *cpt["config"], is_half=False, version=cpt.get("version", "v1") + ) # 
fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] + output_names = [ + "audio", + ] + # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=13, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + return "Finished" diff --git a/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py similarity index 91% rename from extract_f0_print.py rename to infer/modules/train/extract/extract_f0_print.py index 4f6c806..6949f1c 100644 --- a/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -1,10 +1,17 @@ -import os, traceback, sys, parselmouth +import os +import sys +import traceback + +import parselmouth now_dir = os.getcwd() sys.path.append(now_dir) -from lib.audio import load_audio +import logging + +import numpy as np import pyworld -import numpy as np, logging + +from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) from multiprocessing import Process @@ -76,10 +83,12 @@ class FeatureInput(object): f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) elif f0_method == "rmvpe": if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cpu") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py similarity index 89% rename from extract_f0_rmvpe.py rename to infer/modules/train/extract/extract_f0_rmvpe.py index 00ca16c..52d7492 100644 --- a/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -1,10 +1,17 @@ -import os, traceback, sys, parselmouth +import os +import sys +import traceback + +import parselmouth now_dir = os.getcwd() sys.path.append(now_dir) -from lib.audio import load_audio +import logging + +import numpy as np import pyworld -import numpy as np, logging + +from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) @@ -39,10 +46,12 @@ class FeatureInput(object): # p_len = x.shape[0] // self.hop if f0_method == "rmvpe": if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=is_half, device="cuda") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py similarity index 89% rename from extract_f0_rmvpe_dml.py rename to infer/modules/train/extract/extract_f0_rmvpe_dml.py index 0de50c5..2d812ab 100644 --- a/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -1,10 +1,17 @@ -import os, traceback, sys, parselmouth +import os +import sys +import traceback + +import parselmouth now_dir = os.getcwd() sys.path.append(now_dir) 
-from lib.audio import load_audio +import logging + +import numpy as np import pyworld -import numpy as np, logging + +from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) @@ -37,10 +44,12 @@ class FeatureInput(object): # p_len = x.shape[0] // self.hop if f0_method == "rmvpe": if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device=device) + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device=device + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_feature_print.py b/infer/modules/train/extract_feature_print.py similarity index 94% rename from extract_feature_print.py rename to infer/modules/train/extract_feature_print.py index e613de4..f8bfc2a 100644 --- a/extract_feature_print.py +++ b/infer/modules/train/extract_feature_print.py @@ -1,4 +1,6 @@ -import os, sys, traceback +import os +import sys +import traceback os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" @@ -14,11 +16,11 @@ else: exp_dir = sys.argv[5] os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) version = sys.argv[6] +import fairseq +import numpy as np +import soundfile as sf import torch import torch.nn.functional as F -import soundfile as sf -import numpy as np -import fairseq if "privateuseone" not in device: device = "cpu" @@ -48,7 +50,7 @@ def printt(strr): printt(sys.argv) -model_path = "hubert_base.pt" +model_path = "assets/hubert/hubert_base.pt" printt(exp_dir) wavPath = "%s/1_16k_wavs" % exp_dir diff --git a/trainset_preprocess_pipeline_print.py b/infer/modules/train/preprocess.py similarity index 95% rename from trainset_preprocess_pipeline_print.py rename to infer/modules/train/preprocess.py index 62671ba..c57b5dc 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/infer/modules/train/preprocess.py @@ -1,4 +1,7 @@ -import sys, os, multiprocessing +import multiprocessing +import os +import sys + from scipy import signal now_dir = os.getcwd() @@ -9,12 +12,16 @@ sr = int(sys.argv[2]) n_p = int(sys.argv[3]) exp_dir = sys.argv[4] noparallel = sys.argv[5] == "True" -import numpy as np, os, traceback -from lib.slicer2 import Slicer -import librosa, traceback -from scipy.io import wavfile import multiprocessing -from lib.audio import load_audio +import os +import traceback + +import librosa +import numpy as np +from scipy.io import wavfile + +from infer.lib.audio import load_audio +from infer.lib.slicer2 import Slicer mutex = multiprocessing.Lock() f = open("%s/preprocess.log" % exp_dir, "a+") diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/infer/modules/train/train.py similarity index 96% rename from train_nsf_sim_cache_sid_load_pretrain.py rename to infer/modules/train/train.py index c1bdf11..ac52cf6 100644 --- a/train_nsf_sim_cache_sid_load_pretrain.py +++ b/infer/modules/train/train.py @@ -1,53 +1,63 @@ -import os, sys +import os +import sys now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) -from lib.train import utils import datetime +from infer.lib.train import utils + hps = utils.get_hparams() os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",") n_gpus = len(hps.gpus.split("-")) -from random import shuffle, randint +from random import randint, shuffle import torch torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False -from torch.nn import functional as F -from 
torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler -from lib.infer_pack import commons from time import sleep from time import time as ttime -from lib.train.data_utils import ( - TextAudioLoaderMultiNSFsid, - TextAudioLoader, - TextAudioCollateMultiNSFsid, - TextAudioCollate, + +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from infer.lib.infer_pack import commons +from infer.lib.train.data_utils import ( DistributedBucketSampler, + TextAudioCollate, + TextAudioCollateMultiNSFsid, + TextAudioLoader, + TextAudioLoaderMultiNSFsid, ) if hps.version == "v1": - from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid as RVC_Model_f0, + from infer.lib.infer_pack.models import MultiPeriodDiscriminator + from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0 + from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, - MultiPeriodDiscriminator, ) else: - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs768NSFsid as RVC_Model_f0, SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, ) -from lib.train.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from lib.train.process_ckpt import savee + +from infer.lib.train.losses import ( + discriminator_loss, + feature_loss, + generator_loss, + kl_loss, +) +from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from infer.lib.train.process_ckpt import savee global_step = 0 diff --git a/MDXNet.py b/infer/modules/uvr5/mdxnet.py similarity index 83% rename from MDXNet.py rename to infer/modules/uvr5/mdxnet.py index c519e25..4a70469 100644 --- a/MDXNet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -1,285 +1,243 @@ -import soundfile as sf -import torch, pdb, os, warnings, librosa -import numpy as np -from tqdm import tqdm -import torch - -dim_c = 4 - - -class Conv_TDF_net_trim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): - super(Conv_TDF_net_trim, self).__init__() - - self.dim_f = dim_f - self.dim_t = 2**dim_t - self.n_fft = n_fft - self.hop = hop - self.n_bins = self.n_fft // 2 + 1 - self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) - self.target_name = target_name - self.blender = "blender" in model_name - - out_c = dim_c * 4 if target_name == "*" else dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) - - self.n = L // 2 - - def stft(self, x): - x = x.reshape([-1, self.chunk_size]) - x = torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop, - window=self.window, - center=True, - return_complex=True, - ) - x = torch.view_as_real(x) - x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, dim_c, self.n_bins, self.dim_t] - ) - return x[:, :, : self.dim_f] - - def 
istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) - x = torch.cat([x, freq_pad], -2) - c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) - x = x.permute([0, 2, 3, 1]) - x = x.contiguous() - x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) - return x.reshape([-1, c, self.chunk_size]) - - -def get_models(device, dim_f, dim_t, n_fft): - return Conv_TDF_net_trim( - device=device, - model_name="Conv-TDF", - target_name="vocals", - L=11, - dim_f=dim_f, - dim_t=dim_t, - n_fft=n_fft, - ) - - -warnings.filterwarnings("ignore") -import sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -from config import Config - -cpu = torch.device("cpu") -device = Config().device -# if torch.cuda.is_available(): -# device = torch.device("cuda:0") -# elif torch.backends.mps.is_available(): -# device = torch.device("mps") -# else: -# device = torch.device("cpu") - - -class Predictor: - def __init__(self, args): - self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) - import onnxruntime as ort - - print(ort.get_available_providers()) - self.model = ort.InferenceSession( - os.path.join(args.onnx, self.model_.target_name + ".onnx"), - providers=[ - "CUDAExecutionProvider", - "DmlExecutionProvider", - "CPUExecutionProvider", - ], - ) - print("onnx load done") - - def demix(self, mix): - samples = mix.shape[-1] - margin = self.args.margin - chunk_size = self.args.chunks * 44100 - assert not margin == 0, "margin cannot be zero!" - if margin > chunk_size: - margin = chunk_size - - segmented_mix = {} - - if self.args.chunks == 0 or samples < chunk_size: - chunk_size = samples - - counter = -1 - for skip in range(0, samples, chunk_size): - counter += 1 - - s_margin = 0 if counter == 0 else margin - end = min(skip + chunk_size + margin, samples) - - start = skip - s_margin - - segmented_mix[skip] = mix[:, start:end].copy() - if end == samples: - break - - sources = self.demix_base(segmented_mix, margin_size=margin) - """ - mix:(2,big_sample) - segmented_mix:offset->(2,small_sample) - sources:(1,2,big_sample) - """ - return sources - - def demix_base(self, mixes, margin_size): - chunked_sources = [] - progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") - for mix in mixes: - cmix = mixes[mix] - sources = [] - n_sample = cmix.shape[1] - model = self.model_ - trim = model.n_fft // 2 - gen_size = model.chunk_size - 2 * trim - pad = gen_size - n_sample % gen_size - mix_p = np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) - mix_waves = [] - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + model.chunk_size]) - mix_waves.append(waves) - i += gen_size - mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) - with torch.no_grad(): - _ort = self.model - spek = model.stft(mix_waves) - if self.args.denoise: - spec_pred = ( - -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 - + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 - ) - tar_waves = model.istft(torch.tensor(spec_pred)) - else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) - - start = 
0 if mix == 0 else margin_size - end = None if mix == list(mixes.keys())[::-1][0] else -margin_size - if margin_size == 0: - end = None - sources.append(tar_signal[:, start:end]) - - progress_bar.update(1) - - chunked_sources.append(sources) - _sources = np.concatenate(chunked_sources, axis=-1) - # del self.model - progress_bar.close() - return _sources - - def prediction(self, m, vocal_root, others_root, format): - os.makedirs(vocal_root, exist_ok=True) - os.makedirs(others_root, exist_ok=True) - basename = os.path.basename(m) - mix, rate = librosa.load(m, mono=False, sr=44100) - if mix.ndim == 1: - mix = np.asfortranarray([mix, mix]) - mix = mix.T - sources = self.demix(mix.T) - opt = sources[0].T - if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) - sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) - else: - path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) - path_other = "%s/%s_others.wav" % (others_root, basename) - sf.write(path_vocal, mix - opt, rate) - sf.write(path_other, opt, rate) - if os.path.exists(path_vocal): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path_vocal, path_vocal[:-4] + ".%s" % format) - ) - if os.path.exists(path_other): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path_other, path_other[:-4] + ".%s" % format) - ) - - -class MDXNetDereverb: - def __init__(self, chunks): - self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy" - self.shifts = 10 #'Predict with randomised equivariant stabilisation' - self.mixing = "min_mag" # ['default','min_mag','max_mag'] - self.chunks = chunks - self.margin = 44100 - self.dim_t = 9 - self.dim_f = 3072 - self.n_fft = 6144 - self.denoise = True - self.pred = Predictor(self) - - def _path_audio_(self, input, vocal_root, others_root, format): - self.pred.prediction(input, vocal_root, others_root, format) - - -if __name__ == "__main__": - dereverb = MDXNetDereverb(15) - from time import time as ttime - - t0 = ttime() - dereverb._path_audio_( - "雪雪伴奏对消HP5.wav", - "vocal", - "others", - ) - t1 = ttime() - print(t1 - t0) - - -""" - -runtime\python.exe MDXNet.py - -6G: -15/9:0.8G->6.8G -14:0.8G->6.5G -25:炸 - -half15:0.7G->6.6G,22.69s -fp32-15:0.7G->6.6G,20.85s - -""" +import os +import warnings + +import librosa +import numpy as np +import onnxruntime as ort +import soundfile as sf +import torch +from tqdm import tqdm + +cpu = torch.device("cpu") + + +class ConvTDFNetTrim: + def __init__( + self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 + ): + super(ConvTDFNetTrim, self).__init__() + + self.dim_f = dim_f + self.dim_t = 2**dim_t + self.n_fft = n_fft + self.hop = hop + self.n_bins = self.n_fft // 2 + 1 + self.chunk_size = hop * (self.dim_t - 1) + self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( + device + ) + self.target_name = target_name + self.blender = "blender" in model_name + + self.dim_c = 4 + out_c = self.dim_c * 4 if target_name == "*" else self.dim_c + self.freq_pad = torch.zeros( + [1, out_c, self.n_bins - self.dim_f, self.dim_t] + ).to(device) + + self.n = L // 2 + + def stft(self, x): + x = x.reshape([-1, self.chunk_size]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop, + window=self.window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( + [-1, self.dim_c, self.n_bins, self.dim_t] + ) + return x[:, :, : self.dim_f] + + def 
istft(self, x, freq_pad=None): + freq_pad = ( + self.freq_pad.repeat([x.shape[0], 1, 1, 1]) + if freq_pad is None + else freq_pad + ) + x = torch.cat([x, freq_pad], -2) + c = 4 * 2 if self.target_name == "*" else 2 + x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( + [-1, 2, self.n_bins, self.dim_t] + ) + x = x.permute([0, 2, 3, 1]) + x = x.contiguous() + x = torch.view_as_complex(x) + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True + ) + return x.reshape([-1, c, self.chunk_size]) + + +def get_models(device, dim_f, dim_t, n_fft): + return ConvTDFNetTrim( + device=device, + model_name="Conv-TDF", + target_name="vocals", + L=11, + dim_f=dim_f, + dim_t=dim_t, + n_fft=n_fft, + ) + + +class Predictor: + def __init__(self, args): + print(ort.get_available_providers()) + self.args = args + self.model_ = get_models( + device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft + ) + self.model = ort.InferenceSession( + os.path.join(args.onnx, self.model_.target_name + ".onnx"), + providers=[ + "CUDAExecutionProvider", + "DmlExecutionProvider", + "CPUExecutionProvider", + ], + ) + print("onnx load done") + + def demix(self, mix): + samples = mix.shape[-1] + margin = self.args.margin + chunk_size = self.args.chunks * 44100 + assert not margin == 0, "margin cannot be zero!" + if margin > chunk_size: + margin = chunk_size + + segmented_mix = {} + + if self.args.chunks == 0 or samples < chunk_size: + chunk_size = samples + + counter = -1 + for skip in range(0, samples, chunk_size): + counter += 1 + + s_margin = 0 if counter == 0 else margin + end = min(skip + chunk_size + margin, samples) + + start = skip - s_margin + + segmented_mix[skip] = mix[:, start:end].copy() + if end == samples: + break + + sources = self.demix_base(segmented_mix, margin_size=margin) + """ + mix:(2,big_sample) + segmented_mix:offset->(2,small_sample) + sources:(1,2,big_sample) + """ + return sources + + def demix_base(self, mixes, margin_size): + chunked_sources = [] + progress_bar = tqdm(total=len(mixes)) + progress_bar.set_description("Processing") + for mix in mixes: + cmix = mixes[mix] + sources = [] + n_sample = cmix.shape[1] + model = self.model_ + trim = model.n_fft // 2 + gen_size = model.chunk_size - 2 * trim + pad = gen_size - n_sample % gen_size + mix_p = np.concatenate( + (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 + ) + mix_waves = [] + i = 0 + while i < n_sample + pad: + waves = np.array(mix_p[:, i : i + model.chunk_size]) + mix_waves.append(waves) + i += gen_size + mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) + with torch.no_grad(): + _ort = self.model + spek = model.stft(mix_waves) + if self.args.denoise: + spec_pred = ( + -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 + + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 + ) + tar_waves = model.istft(torch.tensor(spec_pred)) + else: + tar_waves = model.istft( + torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) + ) + tar_signal = ( + tar_waves[:, :, trim:-trim] + .transpose(0, 1) + .reshape(2, -1) + .numpy()[:, :-pad] + ) + + start = 0 if mix == 0 else margin_size + end = None if mix == list(mixes.keys())[::-1][0] else -margin_size + if margin_size == 0: + end = None + sources.append(tar_signal[:, start:end]) + + progress_bar.update(1) + + chunked_sources.append(sources) + _sources = np.concatenate(chunked_sources, axis=-1) + # del self.model + progress_bar.close() + return _sources + + def prediction(self, m, vocal_root, 
others_root, format): + os.makedirs(vocal_root, exist_ok=True) + os.makedirs(others_root, exist_ok=True) + basename = os.path.basename(m) + mix, rate = librosa.load(m, mono=False, sr=44100) + if mix.ndim == 1: + mix = np.asfortranarray([mix, mix]) + mix = mix.T + sources = self.demix(mix.T) + opt = sources[0].T + if format in ["wav", "flac"]: + sf.write( + "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate + ) + sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) + else: + path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) + path_other = "%s/%s_others.wav" % (others_root, basename) + sf.write(path_vocal, mix - opt, rate) + sf.write(path_other, opt, rate) + if os.path.exists(path_vocal): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path_vocal, path_vocal[:-4] + ".%s" % format) + ) + if os.path.exists(path_other): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path_other, path_other[:-4] + ".%s" % format) + ) + + +class MDXNetDereverb: + def __init__(self, chunks, device): + self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy" + self.shifts = 10 # 'Predict with randomised equivariant stabilisation' + self.mixing = "min_mag" # ['default','min_mag','max_mag'] + self.chunks = chunks + self.margin = 44100 + self.dim_t = 9 + self.dim_f = 3072 + self.n_fft = 6144 + self.denoise = True + self.pred = Predictor(self) + self.device = device + + def path_audio(self, input, vocal_root, others_root, format): + self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py new file mode 100644 index 0000000..16ad0a9 --- /dev/null +++ b/infer/modules/uvr5/modules.py @@ -0,0 +1,96 @@ +import os +import traceback + +import ffmpeg +import torch + +from configs.config import Config +from infer.modules.uvr5.mdxnet import MDXNetDereverb +from infer.modules.uvr5.preprocess import AudioPre, AudioPreDeEcho + +config = Config() + + +def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): + infos = [] + try: + inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + save_root_vocal = ( + save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + save_root_ins = ( + save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + if model_name == "onnx_dereverb_By_FoxJoy": + pre_fun = MDXNetDereverb(15, config.device) + else: + func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho + pre_fun = func( + agg=int(agg), + model_path=os.path.join( + os.getenv("weight_uvr5_root"), model_name + ".pth" + ), + device=config.device, + is_half=config.is_half, + ) + if inp_root != "": + paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else: + paths = [path.name for path in paths] + for path in paths: + inp_path = os.path.join(inp_root, path) + need_reformat = 1 + done = 0 + try: + info = ffmpeg.probe(inp_path, cmd="ffprobe") + if ( + info["streams"][0]["channels"] == 2 + and info["streams"][0]["sample_rate"] == "44100" + ): + need_reformat = 0 + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + done = 1 + except: + need_reformat = 1 + traceback.print_exc() + if need_reformat == 1: + tmp_path = "%s/%s.reformatted.wav" % ( + os.path.join("tmp"), + os.path.basename(inp_path), + ) + os.system( + "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" + % (inp_path, tmp_path) + ) + inp_path = tmp_path + try: + if done == 0: + pre_fun.path_audio( + inp_path, 
save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + yield "\n".join(infos) + except: + infos.append( + "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) + ) + yield "\n".join(infos) + except: + infos.append(traceback.format_exc()) + yield "\n".join(infos) + finally: + try: + if model_name == "onnx_dereverb_By_FoxJoy": + del pre_fun.pred.model + del pre_fun.pred.model_ + else: + del pre_fun.model + del pre_fun + except: + traceback.print_exc() + print("clean_empty_cache") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + yield "\n".join(infos) diff --git a/infer_uvr5.py b/infer/modules/uvr5/preprocess.py similarity index 91% rename from infer_uvr5.py rename to infer/modules/uvr5/preprocess.py index 0ffdb5d..26aeada 100644 --- a/infer_uvr5.py +++ b/infer/modules/uvr5/preprocess.py @@ -1,24 +1,18 @@ -import os, sys, torch, warnings, pdb +import os -now_dir = os.getcwd() -sys.path.append(now_dir) -from json import load as ll - -warnings.filterwarnings("ignore") import librosa -import importlib import numpy as np -import hashlib, math -from tqdm import tqdm -from lib.uvr5_pack.lib_v5 import spec_utils -from lib.uvr5_pack.utils import _get_name_params, inference -from lib.uvr5_pack.lib_v5.model_param_init import ModelParameters import soundfile as sf -from lib.uvr5_pack.lib_v5.nets_new import CascadedNet -from lib.uvr5_pack.lib_v5 import nets_61968KB as nets +import torch + +from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets +from infer.lib.uvr5_pack.lib_v5 import spec_utils +from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters +from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.utils import inference -class _audio_pre_: +class AudioPre: def __init__(self, agg, model_path, device, is_half): self.model_path = model_path self.device = device @@ -31,8 +25,8 @@ class _audio_pre_: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") - model = nets.CascadedASPPNet(mp.param["bins"] * 2) + mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") + model = Nets.CascadedASPPNet(mp.param["bins"] * 2) cpk = torch.load(model_path, map_location="cpu") model.load_state_dict(cpk) model.eval() @@ -182,7 +176,7 @@ class _audio_pre_: ) -class _audio_pre_new: +class AudioPreDeEcho: def __init__(self, agg, model_path, device, is_half): self.model_path = model_path self.device = device @@ -347,17 +341,3 @@ class _audio_pre_new: "ffmpeg -i %s -vn %s -q:a 2 -y" % (path, path[:-4] + ".%s" % format) ) - - -if __name__ == "__main__": - device = "cuda" - is_half = True - # model_path = "uvr5_weights/2_HP-UVR.pth" - # model_path = "uvr5_weights/VR-DeEchoDeReverb.pth" - # model_path = "uvr5_weights/VR-DeEchoNormal.pth" - model_path = "uvr5_weights/DeEchoNormal.pth" - # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True,agg=10) - pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=True, agg=10) - audio_path = "雪雪伴奏对消HP5.wav" - save_path = "opt" - pre_fun._path_audio_(audio_path, save_path, save_path) diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000..ac37f44 --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,248 @@ +import traceback + +import numpy as np +import soundfile as sf 
+import torch + +from infer.lib.audio import load_audio +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * + + +class VC: + def __init__(self, config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, *to_return_protect): + person = f'{os.getenv("weight_root")}/{sid}' + print(f"loading {person}") + + self.cpt = torch.load(person, map_location="cpu") + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": to_return_protect[0] + if self.if_f0 != 0 and to_return_protect + else 0.5, + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": to_return_protect[1] + if self.if_f0 != 0 and to_return_protect + else 0.33, + "__type__": "update", + } + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, + } + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + index = {"value": get_index_path_from_model(sid), "__type__": "update"} + + return ( + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} + ) + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): + if input_audio_path is None: + return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + try: + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + times = [0, 0, 0] + + if self.hubert_model is None: + self.hubert_model = load_hubert(self.config) + + file_index = ( + ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + if file_index != "" + else file_index2 + ) # 防止小白写错,自动帮他替换掉 + + audio_opt = self.pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + self.tgt_sr = resample_sr + index_info = ( + "Using index:%s." % file_index + if os.path.exists(file_index) + else "Index not used." 
+ ) + return ( + f"Success.\n {index_info}\nTime:\n npy:{times[0]}s, f0:{times[1]}s, infer:{times[2]}s", + (self.tgt_sr, audio_opt), + ) + except: + info = traceback.format_exc() + print(info) + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.wav" % (opt_root, os.path.basename(path)) + sf.write( + path, + audio_opt, + tgt_sr, + ) + if os.path.exists(path): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path, path[:-4] + ".%s" % format1) + ) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() diff --git a/lib/train/vc_infer_pipeline.py b/infer/modules/vc/pipeline.py similarity index 90% rename from lib/train/vc_infer_pipeline.py rename to infer/modules/vc/pipeline.py index 980fc21..31e5399 100644 --- a/lib/train/vc_infer_pipeline.py +++ b/infer/modules/vc/pipeline.py @@ -1,449 +1,458 @@ -import numpy as np, parselmouth, torch, pdb, sys, os -from time import time as ttime -import torch.nn.functional as F -import scipy.signal as signal -import pyworld, os, traceback, faiss, librosa, torchcrepe -from scipy import signal -from functools import lru_cache - -now_dir = os.getcwd() -sys.path.append(now_dir) - -bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) - -input_audio_path2wav = {} - - -@lru_cache -def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): - audio = input_audio_path2wav[input_audio_path] - f0, t = pyworld.harvest( - audio, - fs=fs, - f0_ceil=f0max, - f0_floor=f0min, - frame_period=frame_period, - ) - f0 = pyworld.stonemask(audio, f0, t, fs) - return f0 - - -def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 - # print(data1.max(),data2.max()) - rms1 = librosa.feature.rms( - y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 - ) # 每半秒一个点 - rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) - rms1 = torch.from_numpy(rms1) - rms1 = F.interpolate( - rms1.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.from_numpy(rms2) - rms2 = F.interpolate( - rms2.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) - data2 *= ( - torch.pow(rms1, torch.tensor(1 - rate)) - * torch.pow(rms2, torch.tensor(rate - 1)) - ).numpy() - return data2 - - -class VC(object): - def __init__(self, tgt_sr, config): - 
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( - config.x_pad, - config.x_query, - config.x_center, - config.x_max, - config.is_half, - ) - self.sr = 16000 # hubert输入采样率 - self.window = 160 # 每帧点数 - self.t_pad = self.sr * self.x_pad # 每条前后pad时间 - self.t_pad_tgt = tgt_sr * self.x_pad - self.t_pad2 = self.t_pad * 2 - self.t_query = self.sr * self.x_query # 查询切点前后查询时间 - self.t_center = self.sr * self.x_center # 查询切点位置 - self.t_max = self.sr * self.x_max # 免查询时长阈值 - self.device = config.device - - def get_f0( - self, - input_audio_path, - x, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0=None, - ): - global input_audio_path2wav - time_step = self.window / self.sr * 1000 - f0_min = 50 - f0_max = 1100 - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - if f0_method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] - ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) - elif f0_method == "harvest": - input_audio_path2wav[input_audio_path] = x.astype(np.double) - f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) - if filter_radius > 2: - f0 = signal.medfilt(f0, 3) - elif f0_method == "crepe": - model = "full" - # Pick a batch size that doesn't cause memory errors on your gpu - batch_size = 512 - # Compute pitch using first gpu - audio = torch.tensor(np.copy(x))[None].float() - f0, pd = torchcrepe.predict( - audio, - self.sr, - self.window, - f0_min, - f0_max, - model, - batch_size=batch_size, - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - elif f0_method == "rmvpe": - if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE - - print("loading rmvpe model") - self.model_rmvpe = RMVPE( - "rmvpe.pt", is_half=self.is_half, device=self.device - ) - - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - if "privateuseone" in str(self.device): # clean ortruntime memory - del self.model_rmvpe.model - del self.model_rmvpe - print("cleaning ortruntime memory") - - f0 *= pow(2, f0_up_key / 12) - # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - tf0 = self.sr // self.window # 每秒f0点数 - if inp_f0 is not None: - delta_t = np.round( - (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 - ).astype("int16") - replace_f0 = np.interp( - list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] - ) - shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] - f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ - :shape - ] - # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int32) - return f0_coarse, f0bak # 1-0 - - def vc( - self, - model, - net_g, - sid, - audio0, - pitch, - pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - ): # ,file_index,file_big_npy - feats = torch.from_numpy(audio0) - if self.is_half: - feats = 
feats.half() - else: - feats = feats.float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - feats = feats.view(1, -1) - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - - inputs = { - "source": feats.to(self.device), - "padding_mask": padding_mask, - "output_layer": 9 if version == "v1" else 12, - } - t0 = ttime() - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = model.final_proj(logits[0]) if version == "v1" else logits[0] - if protect < 0.5 and pitch != None and pitchf != None: - feats0 = feats.clone() - if ( - isinstance(index, type(None)) == False - and isinstance(big_npy, type(None)) == False - and index_rate != 0 - ): - npy = feats[0].cpu().numpy() - if self.is_half: - npy = npy.astype("float32") - - # _, I = index.search(npy, 1) - # npy = big_npy[I.squeeze()] - - score, ix = index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - - if self.is_half: - npy = npy.astype("float16") - feats = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate - + (1 - index_rate) * feats - ) - - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - if protect < 0.5 and pitch != None and pitchf != None: - feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( - 0, 2, 1 - ) - t1 = ttime() - p_len = audio0.shape[0] // self.window - if feats.shape[1] < p_len: - p_len = feats.shape[1] - if pitch != None and pitchf != None: - pitch = pitch[:, :p_len] - pitchf = pitchf[:, :p_len] - - if protect < 0.5 and pitch != None and pitchf != None: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - p_len = torch.tensor([p_len], device=self.device).long() - with torch.no_grad(): - if pitch != None and pitchf != None: - audio1 = ( - (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) - .data.cpu() - .float() - .numpy() - ) - else: - audio1 = ( - (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() - ) - del feats, p_len, padding_mask - if torch.cuda.is_available(): - torch.cuda.empty_cache() - t2 = ttime() - times[0] += t1 - t0 - times[2] += t2 - t1 - return audio1 - - def pipeline( - self, - model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=None, - ): - if ( - file_index != "" - # and file_big_npy != "" - # and os.path.exists(file_big_npy) == True - and os.path.exists(file_index) == True - and index_rate != 0 - ): - try: - index = faiss.read_index(file_index) - # big_npy = np.load(file_big_npy) - big_npy = index.reconstruct_n(0, index.ntotal) - except: - traceback.print_exc() - index = big_npy = None - else: - index = big_npy = None - audio = signal.filtfilt(bh, ah, audio) - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") - opt_ts = [] - if audio_pad.shape[0] > self.t_max: - audio_sum = np.zeros_like(audio) - for i in range(self.window): - audio_sum += audio_pad[i : i - self.window] - for t in range(self.t_center, audio.shape[0], self.t_center): - opt_ts.append( - t - - self.t_query - + np.where( - np.abs(audio_sum[t - self.t_query : t + self.t_query]) - 
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() - )[0][0] - ) - s = 0 - audio_opt = [] - t = None - t1 = ttime() - audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") - p_len = audio_pad.shape[0] // self.window - inp_f0 = None - if hasattr(f0_file, "name") == True: - try: - with open(f0_file.name, "r") as f: - lines = f.read().strip("\n").split("\n") - inp_f0 = [] - for line in lines: - inp_f0.append([float(i) for i in line.split(",")]) - inp_f0 = np.array(inp_f0, dtype="float32") - except: - traceback.print_exc() - sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - pitch, pitchf = None, None - if if_f0 == 1: - pitch, pitchf = self.get_f0( - input_audio_path, - audio_pad, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0, - ) - pitch = pitch[:p_len] - pitchf = pitchf[:p_len] - if self.device == "mps": - pitchf = pitchf.astype(np.float32) - pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() - pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() - t2 = ttime() - times[1] += t2 - t1 - for t in opt_ts: - t = t // self.window * self.window - if if_f0 == 1: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[s : t + self.t_pad2 + self.window], - pitch[:, s // self.window : (t + self.t_pad2) // self.window], - pitchf[:, s // self.window : (t + self.t_pad2) // self.window], - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - else: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[s : t + self.t_pad2 + self.window], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - s = t - if if_f0 == 1: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[t:], - pitch[:, t // self.window :] if t is not None else pitch, - pitchf[:, t // self.window :] if t is not None else pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - else: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[t:], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - audio_opt = np.concatenate(audio_opt) - if rms_mix_rate != 1: - audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) - if resample_sr >= 16000 and tgt_sr != resample_sr: - audio_opt = librosa.resample( - audio_opt, orig_sr=tgt_sr, target_sr=resample_sr - ) - audio_max = np.abs(audio_opt).max() / 0.99 - max_int16 = 32768 - if audio_max > 1: - max_int16 /= audio_max - audio_opt = (audio_opt * max_int16).astype(np.int16) - del pitch, pitchf, sid - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return audio_opt +import os +import sys +import traceback +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import parselmouth +import pyworld +import torch +import torch.nn.functional as F +import torchcrepe +from scipy import signal + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + 
+def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class Pipeline(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + config.is_half, + ) + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = config.device + + self.model_rmvpe = None + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif f0_method == "rmvpe": + if not hasattr(self, "model_rmvpe"): + from infer.lib.rmvpe import RMVPE + + print("loading rmvpe model") + self.model_rmvpe = RMVPE( + "rmvpe.pt", is_half=self.is_half, device=self.device + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + print("cleaning ortruntime memory") + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), 
inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch is not None and pitchf is not None: + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + else: + audio1 = ( + (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() + ) + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if ( + file_index 
!= "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if tgt_sr != resample_sr >= 16000: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if 
torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000..98497e2 --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,33 @@ +import os + +from fairseq import checkpoint_utils + + +def get_index_path_from_model(sid): + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, dirs, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) + + +def load_hubert(config): + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + return hubert_model.eval() diff --git a/infer_batch_rvc.py b/infer_batch_rvc.py deleted file mode 100644 index 3fc9a05..0000000 --- a/infer_batch_rvc.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -v1 -runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\logs\mi-test\added_IVF677_Flat_nprobe_7.index" harvest "E:\codes\py39\RVC-beta\output" "E:\codes\py39\test-20230416b\weights\mi-test.pth" 0.66 cuda:0 True 3 0 1 0.33 -v2 -runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\test-20230416b\logs\mi-test-v2\aadded_IVF677_Flat_nprobe_1_v2.index" harvest "E:\codes\py39\RVC-beta\output_v2" "E:\codes\py39\test-20230416b\weights\mi-test-v2.pth" 0.66 cuda:0 True 3 0 1 0.33 -""" -import os, sys, pdb, torch - -now_dir = os.getcwd() -sys.path.append(now_dir) -import sys -import torch -import tqdm as tq -from multiprocessing import cpu_count - - -class Config: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 0 - self.gpu_name = None - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if torch.cuda.is_available(): - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - if ( - ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) - or "P40" in self.gpu_name.upper() - or "1060" in self.gpu_name - or "1070" in self.gpu_name - or "1080" in self.gpu_name - ): - print("16系/10系显卡和P40强制单精度") - self.is_half = False - for config_file in ["32k.json", "40k.json", "48k.json"]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - else: - self.gpu_name = None - self.gpu_mem = int( - torch.cuda.get_device_properties(i_device).total_memory - / 1024 - / 1024 - / 1024 - + 0.4 - ) - if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - elif torch.backends.mps.is_available(): - print("没有发现支持的N卡, 使用MPS进行推理") - self.device = "mps" - else: - print("没有发现支持的N卡, 使用CPU进行推理") - self.device = "cpu" - self.is_half = True - - if self.n_cpu == 0: - self.n_cpu = cpu_count() - - if self.is_half: - # 6G显存配置 - x_pad = 3 - x_query = 10 
- x_center = 60 - x_max = 65 - else: - # 5G显存配置 - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 - - if self.gpu_mem != None and self.gpu_mem <= 4: - x_pad = 1 - x_query = 5 - x_center = 30 - x_max = 32 - - return x_pad, x_query, x_center, x_max - - -f0up_key = sys.argv[1] -input_path = sys.argv[2] -index_path = sys.argv[3] -f0method = sys.argv[4] # harvest or pm -opt_path = sys.argv[5] -model_path = sys.argv[6] -index_rate = float(sys.argv[7]) -device = sys.argv[8] -is_half = sys.argv[9].lower() != "false" -filter_radius = int(sys.argv[10]) -resample_sr = int(sys.argv[11]) -rms_mix_rate = float(sys.argv[12]) -protect = float(sys.argv[13]) -print(sys.argv) -config = Config(device, is_half) -now_dir = os.getcwd() -sys.path.append(now_dir) -from lib.train.vc_infer_pipeline import VC -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.audio import load_audio -from fairseq import checkpoint_utils -from scipy.io import wavfile - -hubert_model = None - - -def load_hubert(hubert_model_path="hubert_base.pt"): - global hubert_model - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( - [hubert_model_path], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(device) - if is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single(sid, input_audio, f0_up_key, f0_file, f0_method, file_index, index_rate): - global tgt_sr, net_g, vc, hubert_model, version - if input_audio is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - audio = load_audio(input_audio, 16000) - times = [0, 0, 0] - if hubert_model == None: - load_hubert() - if_f0 = cpt.get("f0", 1) - # audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio, - times, - f0_up_key, - f0_method, - file_index, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - print(times) - return audio_opt - - -def get_vc(model_path): - global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version - print("loading pth %s" % model_path) - cpt = torch.load(model_path, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: # - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) # 不加这一行清不干净,真奇葩 - net_g.eval().to(device) - if is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - # return {"visible": True,"maximum": n_spk, "__type__": "update"} - - -if __name__ == "__main__": - get_vc(model_path) - audios = os.listdir(input_path) - for file in tq.tqdm(audios): - if file.endswith(".wav"): - file_path = os.path.join(input_path, file) - wav_opt = vc_single( - 0, file_path, f0up_key, None, f0method, index_path, 
index_rate - ) - out_path = os.path.join(opt_path, file) - wavfile.write(out_path, tgt_sr, wav_opt) diff --git a/infer_cli.py b/infer_cli.py deleted file mode 100644 index 59f246b..0000000 --- a/infer_cli.py +++ /dev/null @@ -1,272 +0,0 @@ -from scipy.io import wavfile -from fairseq import checkpoint_utils -from lib.audio import load_audio -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.train.vc_infer_pipeline import VC -from multiprocessing import cpu_count -import numpy as np -import torch -import sys -import glob -import argparse -import os -import sys -import pdb -import torch - -now_dir = os.getcwd() -sys.path.append(now_dir) - -#### -# USAGE -# -# In your Terminal or CMD or whatever -# python infer_cli.py [TRANSPOSE_VALUE] "[INPUT_PATH]" "[OUTPUT_PATH]" "[MODEL_PATH]" "[INDEX_FILE_PATH]" "[INFERENCE_DEVICE]" "[METHOD]" - -using_cli = False -device = "cuda:0" -is_half = False - -if len(sys.argv) > 0: - f0_up_key = int(sys.argv[1]) # transpose value - input_path = sys.argv[2] - output_path = sys.argv[3] - model_path = sys.argv[4] - file_index = sys.argv[5] # .index file - device = sys.argv[6] - f0_method = sys.argv[7] # pm or harvest or crepe - - using_cli = True - - # file_index2=sys.argv[8] - # index_rate=float(sys.argv[10]) #search feature ratio - # filter_radius=float(sys.argv[11]) #median filter - # resample_sr=float(sys.argv[12]) #resample audio in post processing - # rms_mix_rate=float(sys.argv[13]) #search feature - print(sys.argv) - - -class Config: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 0 - self.gpu_name = None - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if torch.cuda.is_available() and device != "cpu": - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - if ( - ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) - or "P40" in self.gpu_name.upper() - or "1060" in self.gpu_name - or "1070" in self.gpu_name - or "1080" in self.gpu_name - ): - print("16系/10系显卡和P40强制单精度") - self.is_half = False - for config_file in ["32k.json", "40k.json", "48k.json"]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - else: - self.gpu_name = None - self.gpu_mem = int( - torch.cuda.get_device_properties(i_device).total_memory - / 1024 - / 1024 - / 1024 - + 0.4 - ) - if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - elif torch.backends.mps.is_available(): - print("没有发现支持的N卡, 使用MPS进行推理") - self.device = "mps" - else: - print("没有发现支持的N卡, 使用CPU进行推理") - self.device = "cpu" - self.is_half = False - - if self.n_cpu == 0: - self.n_cpu = cpu_count() - - if self.is_half: - # 6G显存配置 - x_pad = 3 - x_query = 10 - x_center = 60 - x_max = 65 - else: - # 5G显存配置 - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 - - if self.gpu_mem != None and self.gpu_mem <= 4: - x_pad = 1 - x_query = 5 - x_center = 30 - x_max = 32 - - return 
x_pad, x_query, x_center, x_max - - -config = Config(device, is_half) -now_dir = os.getcwd() -sys.path.append(now_dir) - -hubert_model = None - - -def load_hubert(): - global hubert_model - models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single( - sid=0, - input_audio_path=None, - f0_up_key=0, - f0_file=None, - f0_method="pm", - file_index="", # .index file - file_index2="", - # file_big_npy, - index_rate=1.0, - filter_radius=3, - resample_sr=0, - rms_mix_rate=1.0, - model_path="", - output_path="", - protect=0.33, -): - global tgt_sr, net_g, vc, hubert_model, version - get_vc(model_path) - if input_audio_path is None: - return "You need to upload an audio file", None - - f0_up_key = int(f0_up_key) - audio = load_audio(input_audio_path, 16000) - audio_max = np.abs(audio).max() / 0.95 - - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - - if hubert_model == None: - load_hubert() - - if_f0 = cpt.get("f0", 1) - - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) - - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - f0_file=f0_file, - protect=protect, - ) - wavfile.write(output_path, tgt_sr, audio_opt) - return "processed" - - -def get_vc(model_path): - global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version - print("loading pth %s" % model_path) - cpt = torch.load(model_path, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(device) - if is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - # return {"visible": True,"maximum": n_spk, "__type__": "update"} - - -if using_cli: - vc_single( - sid=0, - input_audio_path=input_path, - f0_up_key=f0_up_key, - f0_file=None, - f0_method=f0_method, - file_index=file_index, - file_index2="", - index_rate=1, - filter_radius=3, - resample_sr=0, - rms_mix_rate=0, - model_path=model_path, - output_path=output_path, - ) diff --git a/lib/train/cmd.txt b/lib/train/cmd.txt deleted file mode 100644 index e4b895e..0000000 --- a/lib/train/cmd.txt +++ /dev/null @@ -1 +0,0 @@ -python train_nsf_sim_cache_sid.py -c configs/mi_mix40k_nsf_co256_cs1sid_ms2048.json -m ft-mi \ No newline at end of file diff --git a/tools/app.py b/tools/app.py new file mode 100644 index 0000000..76a9a83 --- /dev/null +++ b/tools/app.py @@ -0,0 +1,146 @@ +import logging +import os + +# os.system("wget -P cvec/ 
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt") +import gradio as gr +from dotenv import load_dotenv + +from configs.config import Config +from i18n.i18n import I18nAuto +from infer.modules.vc.modules import VC + +logging.getLogger("numba").setLevel(logging.WARNING) +logging.getLogger("markdown_it").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) +logging.getLogger("matplotlib").setLevel(logging.WARNING) + +i18n = I18nAuto() +i18n.print() + +load_dotenv() +config = Config() +vc = VC(config) + +weight_root = os.getenv("weight_root") +weight_uvr5_root = os.getenv("weight_uvr5_root") +index_root = "logs" +names = [] +hubert_model = None +for name in os.listdir(weight_root): + if name.endswith(".pth"): + names.append(name) +index_paths = [] +for root, dirs, files in os.walk(index_root, topdown=False): + for name in files: + if name.endswith(".index") and "trained" not in name: + index_paths.append("%s/%s" % (root, name)) + + +app = gr.Blocks() +with app: + with gr.Tabs(): + with gr.TabItem("在线demo"): + gr.Markdown( + value=""" + RVC 在线demo + """ + ) + sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) + with gr.Column(): + spk_item = gr.Slider( + minimum=0, + maximum=2333, + step=1, + label=i18n("请选择说话人id"), + value=0, + visible=False, + interactive=True, + ) + sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item]) + gr.Markdown( + value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ") + ) + vc_input3 = gr.Audio(label="上传音频(长度小于90秒)") + vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0) + f0method0 = gr.Radio( + label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"), + choices=["pm", "harvest", "crepe", "rmvpe"], + value="pm", + interactive=True, + ) + filter_radius0 = gr.Slider( + minimum=0, + maximum=7, + label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), + value=3, + step=1, + interactive=True, + ) + with gr.Column(): + file_index1 = gr.Textbox( + label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), + value="", + interactive=False, + visible=False, + ) + file_index2 = gr.Dropdown( + label=i18n("自动检测index路径,下拉式选择(dropdown)"), + choices=sorted(index_paths), + interactive=True, + ) + index_rate1 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("检索特征占比"), + value=0.88, + interactive=True, + ) + resample_sr0 = gr.Slider( + minimum=0, + maximum=48000, + label=i18n("后处理重采样至最终采样率,0为不进行重采样"), + value=0, + step=1, + interactive=True, + ) + rms_mix_rate0 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"), + value=1, + interactive=True, + ) + protect0 = gr.Slider( + minimum=0, + maximum=0.5, + label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"), + value=0.33, + step=0.01, + interactive=True, + ) + f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) + but0 = gr.Button(i18n("转换"), variant="primary") + vc_output1 = gr.Textbox(label=i18n("输出信息")) + vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) + but0.click( + vc.vc_single, + [ + spk_item, + vc_input3, + vc_transform0, + f0_file, + f0method0, + file_index1, + file_index2, + # file_big_npy1, + index_rate1, + filter_radius0, + resample_sr0, + rms_mix_rate0, + protect0, + ], + [vc_output1, vc_output2], + ) + + +app.launch() diff --git a/tools/calc_rvc_model_similarity.py b/tools/calc_rvc_model_similarity.py index edc1cf8..3f74ca5 100644 --- a/tools/calc_rvc_model_similarity.py +++ b/tools/calc_rvc_model_similarity.py @@ -1,6 +1,8 @@ # This code references 
https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py # Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models. -import sys, os +import os +import sys + import torch import torch.nn as nn import torch.nn.functional as F diff --git a/tools/export_onnx.py b/tools/export_onnx.py index 2d334a6..822e09e 100644 --- a/tools/export_onnx.py +++ b/tools/export_onnx.py @@ -1,5 +1,5 @@ -from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM import torch +from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM if __name__ == "__main__": MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用 diff --git a/tools/infer/infer-pm-index256.py b/tools/infer/infer-pm-index256.py index 2ab44e1..efaaa81 100644 --- a/tools/infer/infer-pm-index256.py +++ b/tools/infer/infer-pm-index256.py @@ -2,34 +2,36 @@ 对源特征进行检索 """ -import torch, pdb, os, parselmouth +import os +import pdb + +import parselmouth +import torch os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# import torchcrepe +from time import time as ttime + +# import pyworld +import librosa import numpy as np +import scipy.signal as signal import soundfile as sf +import torch.nn.functional as F +from fairseq import checkpoint_utils # from models import SynthesizerTrn256#hifigan_nonsf # from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf -from lib.infer_pack.models import ( +from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as SynthesizerTrn256, ) # hifigan_nsf +from scipy.io import wavfile # from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf -from scipy.io import wavfile -from fairseq import checkpoint_utils - -# import pyworld -import librosa -import torch.nn.functional as F -import scipy.signal as signal - -# import torchcrepe -from time import time as ttime - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" # print("load model(s) from {}".format(model_path)) diff --git a/tools/infer/train-index-v2.py b/tools/infer/train-index-v2.py index 77dfa0b..e72ffe7 100644 --- a/tools/infer/train-index-v2.py +++ b/tools/infer/train-index-v2.py @@ -1,11 +1,14 @@ """ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 """ -import faiss, numpy as np, os -from sklearn.cluster import MiniBatchKMeans +import os import traceback from multiprocessing import cpu_count +import faiss +import numpy as np +from sklearn.cluster import MiniBatchKMeans + # ###########如果是原始特征要先写save n_cpu = 0 if n_cpu == 0: diff --git a/tools/infer/train-index.py b/tools/infer/train-index.py index c49f24b..2446e4c 100644 --- a/tools/infer/train-index.py +++ b/tools/infer/train-index.py @@ -1,7 +1,10 @@ """ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 """ -import faiss, numpy as np, os +import os + +import faiss +import numpy as np # ###########如果是原始特征要先写save inp_root = r"E:\codes\py39\dataset\mi\2-co256" diff --git a/tools/infer/trans_weights.py b/tools/infer/trans_weights.py index e0f7f0c..a8ff3b0 100644 --- a/tools/infer/trans_weights.py +++ b/tools/infer/trans_weights.py @@ -1,4 +1,6 @@ -import torch, pdb +import pdb + +import torch # 
a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# diff --git a/tools/infer_batch_rvc.py b/tools/infer_batch_rvc.py new file mode 100644 index 0000000..763d17f --- /dev/null +++ b/tools/infer_batch_rvc.py @@ -0,0 +1,72 @@ +import argparse +import os +import sys + +print("Command-line arguments:", sys.argv) + +now_dir = os.getcwd() +sys.path.append(now_dir) +import sys + +import tqdm as tq +from dotenv import load_dotenv +from scipy.io import wavfile + +from configs.config import Config +from infer.modules.vc.modules import VC + + +def arg_parse() -> tuple: + parser = argparse.ArgumentParser() + parser.add_argument("--f0up_key", type=int, default=0) + parser.add_argument("--input_path", type=str, help="input path") + parser.add_argument("--index_path", type=str, help="index path") + parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") + parser.add_argument("--opt_path", type=str, help="opt path") + parser.add_argument("--model_name", type=str, help="store in assets/weight_root") + parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") + parser.add_argument("--device", type=str, help="device") + parser.add_argument("--is_half", type=bool, help="use half -> True") + parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") + parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") + parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") + parser.add_argument("--protect", type=float, default=0.33, help="protect") + + args = parser.parse_args() + sys.argv = sys.argv[:1] + + return args + + +def main(): + load_dotenv() + args = arg_parse() + config = Config() + config.device = args.device if args.device else config.device + config.is_half = args.is_half if args.is_half else config.is_half + vc = VC(config) + vc.get_vc(args.model_name) + audios = os.listdir(args.input_path) + for file in tq.tqdm(audios): + if file.endswith(".wav"): + file_path = os.path.join(args.input_path, file) + _, wav_opt = vc.vc_single( + 0, + file_path, + args.f0up_key, + None, + args.f0method, + args.index_path, + None, + args.index_rate, + args.filter_radius, + args.resample_sr, + args.rms_mix_rate, + args.protect, + ) + out_path = os.path.join(args.opt_path, file) + wavfile.write(out_path, wav_opt[0], wav_opt[1]) + + +if __name__ == "__main__": + main() diff --git a/tools/infer_cli.py b/tools/infer_cli.py new file mode 100644 index 0000000..bbe0a53 --- /dev/null +++ b/tools/infer_cli.py @@ -0,0 +1,67 @@ +import argparse +import os +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +from dotenv import load_dotenv +from scipy.io import wavfile + +from configs.config import Config +from infer.modules.vc.modules import VC + +#### +# USAGE +# +# In your Terminal or CMD or whatever + + +def arg_parse() -> tuple: + parser = argparse.ArgumentParser() + parser.add_argument("--f0up_key", type=int, default=0) + parser.add_argument("--input_path", type=str, help="input path") + parser.add_argument("--index_path", type=str, help="index path") + parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") + parser.add_argument("--opt_path", type=str, help="opt path") + parser.add_argument("--model_name", type=str, help="store in assets/weight_root") + parser.add_argument("--index_rate", type=float, default=0.66, 
help="index rate") + parser.add_argument("--device", type=str, help="device") + parser.add_argument("--is_half", type=bool, help="use half -> True") + parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") + parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") + parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") + parser.add_argument("--protect", type=float, default=0.33, help="protect") + + args = parser.parse_args() + sys.argv = sys.argv[:1] + + return args + + +def main(): + load_dotenv() + args = arg_parse() + config = Config() + config.device = args.device if args.device else config.device + config.is_half = args.is_half if args.is_half else config.is_half + vc = VC(config) + vc.get_vc(args.model_name) + _, wav_opt = vc.vc_single( + 0, + args.input_path, + args.f0up_key, + None, + args.f0method, + args.index_path, + None, + args.index_rate, + args.filter_radius, + args.resample_sr, + args.rms_mix_rate, + args.protect, + ) + wavfile.write(args.opt_path, wav_opt[0], wav_opt[1]) + + +if __name__ == "__main__": + main() diff --git a/tools/onnx_inference_demo.py b/tools/onnx_inference_demo.py index a4a9490..bd9ef1c 100644 --- a/tools/onnx_inference_demo.py +++ b/tools/onnx_inference_demo.py @@ -1,5 +1,6 @@ import soundfile -from ..lib.infer_pack.onnx_inference import OnnxRVC + +from ..infer.lib.infer_pack.onnx_inference import OnnxRVC hop_size = 512 sampling_rate = 40000 # 采样率 diff --git a/rvc_for_realtime.py b/tools/rvc_for_realtime.py similarity index 96% rename from rvc_for_realtime.py rename to tools/rvc_for_realtime.py index bc0004d..396b384 100644 --- a/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -1,21 +1,34 @@ -import os, sys -import faiss, torch, traceback, parselmouth, numpy as np, torchcrepe, torch.nn as nn, pyworld +import os +import sys +import traceback +from time import time as ttime + import fairseq -from lib.infer_pack.models import ( +import faiss +import numpy as np +import parselmouth +import pyworld +import scipy.signal as signal +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchcrepe + +from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono, ) -from time import time as ttime -import torch.nn.functional as F -import scipy.signal as signal now_dir = os.getcwd() sys.path.append(now_dir) -from config import defaultconfig as config from multiprocessing import Manager as M +from configs.config import Config + +config = Config() + mm = M() if config.dml == True: @@ -57,7 +70,7 @@ class RVC: print("index search enabled") self.index_rate = index_rate models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], + ["assets/hubert/hubert_base.pt"], suffix="", ) hubert_model = models[0] @@ -211,14 +224,14 @@ class RVC: def get_f0_rmvpe(self, x, f0_up_key): if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") self.model_rmvpe = RMVPE( # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 # "rmvpe.pt", is_half=False, device=self.device####dml配置 # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 - "rmvpe.pt", + "assets/rmvpe/rmvpe.pt", is_half=self.is_half, device=self.device, ####正常逻辑 )