Mirror of https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git (synced 2025-01-31 02:32:51 +08:00)
Commit 5083f7fddc
.env (new file, 9 lines added)
@@ -0,0 +1,9 @@
+OPENBLAS_NUM_THREADS = 1
+no_proxy = localhost, 127.0.0.1, ::1
+
+# You can change the location of the model, etc. by changing here
+weight_root = assets/weights
+weight_uvr5_root = assets/uvr5_weights
+index_root = output
+TEMP = tmp
+
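The new .env replaces paths that the deleted app.py further down hardcodes (`weight_root = "weights"`, `index_root = "logs"`). A minimal sketch of how these values could be consumed, assuming the python-dotenv package (the loader itself is not shown in this diff):

```python
import os

from dotenv import load_dotenv  # assumption: python-dotenv is installed

load_dotenv()  # populates os.environ from .env in the working directory

# Defaults mirror the values the new .env ships with.
weight_root = os.getenv("weight_root", "assets/weights")
weight_uvr5_root = os.getenv("weight_uvr5_root", "assets/uvr5_weights")
index_root = os.getenv("index_root", "output")
tmp = os.getenv("TEMP", "tmp")
```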
.github/workflows/genlocale.yml (5 lines changed)
@@ -13,8 +13,9 @@ jobs:
 
       - name: Run locale generation
         run: |
-          python3 lib/i18n/scan_i18n.py
-          cd lib/i18n && python3 locale_diff.py
+          python3 i18n/scan_i18n.py
+          cd i18n
+          python3 locale_diff.py
 
       - name: Commit back
         if: ${{ !github.head_ref }}
 
.github/workflows/unitest.yml (6 lines changed)
@@ -30,7 +30,7 @@ jobs:
        run: |
          mkdir -p logs/mi-test
          touch logs/mi-test/preprocess.log
-         python trainset_preprocess_pipeline_print.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True
+         python infer/modules/train/preprocess.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True
          touch logs/mi-test/extract_f0_feature.log
-         python extract_f0_print.py logs/mi-test $(nproc) pm
-         python extract_feature_print.py cpu 1 0 0 logs/mi-test v1
+         python infer/modules/train/extract/extract_f0_print.py logs/mi-test $(nproc) pm
+         python infer/modules/train/extract_feature_print.py cpu 1 0 0 logs/mi-test v1
 
.gitignore (3 lines added)
@@ -5,3 +5,6 @@ __pycache__
 hubert_base.pt
 /logs
 .venv
+/opt
+tools/aria2c/
+tools/flag.txt
Dockerfile (16 lines changed)
@@ -12,17 +12,17 @@ RUN apt update && apt install -y -qq ffmpeg aria2
 
 RUN pip3 install -r requirements.txt
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d pretrained_v2/ -o D40k.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d pretrained_v2/ -o G40k.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d pretrained_v2/ -o f0D40k.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d pretrained_v2/ -o f0G40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d assets/pretrained_v2/ -o D40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d assets/pretrained_v2/ -o G40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d assets/pretrained_v2/ -o f0D40k.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d assets/pretrained_v2/ -o f0G40k.pth
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d uvr5_weights/ -o HP2-人声vocals+非人声instrumentals.pth
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d uvr5_weights/ -o HP5-主旋律人声vocals+其他instrumentals.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d assets/uvr5_weights/ -o HP2-人声vocals+非人声instrumentals.pth
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d assets/uvr5_weights/ -o HP5-主旋律人声vocals+其他instrumentals.pth
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -o hubert_base.pt
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d assets/hubert -o hubert_base.pt
 
-RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -o rmvpe.pt
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d assets/hubert -o rmvpe.pt
 
 VOLUME [ "/app/weights", "/app/opt" ]
 
README.md (10 lines changed)
@@ -20,7 +20,7 @@
 
 ------
 
-[**English**](./docs/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/README.ja.md) | [**한국어**](./docs/README.ko.md) ([**韓國語**](./docs/README.ko.han.md)) | [**Türkçe**](./docs/README.tr.md)
+[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md)
 
 点此查看我们的[演示视频](https://www.bilibili.com/video/BV1pm4y1z7Gm/) !
 
@@ -89,15 +89,15 @@ RVC需要其他一些预模型来推理和训练。
 
 以下是一份清单,包括了所有RVC所需的预模型和其他文件的名称:
 ```bash
-hubert_base.pt
+./assets/hubert/hubert_base.pt
 
-./pretrained
+./assets/pretrained
 
-./uvr5_weights
+./assets/uvr5_weights
 
 想测试v2版本模型的话,需要额外下载
 
-./pretrained_v2
+./assets/pretrained_v2
 
 如果你正在使用Windows,则你可能需要这个文件,若ffmpeg和ffprobe已安装则跳过; ubuntu/debian 用户可以通过apt install ffmpeg来安装这2个库, Mac 用户则可以通过brew install ffmpeg来安装 (需要预先安装brew)
 
app.py (deleted, 317 lines)
@@ -1,317 +0,0 @@
-import os
-import torch
-
-# os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
-import gradio as gr
-import librosa
-import numpy as np
-import logging
-from fairseq import checkpoint_utils
-from lib.train.vc_infer_pipeline import VC
-import traceback
-from config import defaultconfig as config
-from lib.infer_pack.models import (
-    SynthesizerTrnMs256NSFsid,
-    SynthesizerTrnMs256NSFsid_nono,
-    SynthesizerTrnMs768NSFsid,
-    SynthesizerTrnMs768NSFsid_nono,
-)
-from i18n import I18nAuto
-
-logging.getLogger("numba").setLevel(logging.WARNING)
-logging.getLogger("markdown_it").setLevel(logging.WARNING)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-logging.getLogger("matplotlib").setLevel(logging.WARNING)
-
-i18n = I18nAuto()
-i18n.print()
-
-weight_root = "weights"
-weight_uvr5_root = "uvr5_weights"
-index_root = "logs"
-names = []
-hubert_model = None
-for name in os.listdir(weight_root):
-    if name.endswith(".pth"):
-        names.append(name)
-index_paths = []
-for root, dirs, files in os.walk(index_root, topdown=False):
-    for name in files:
-        if name.endswith(".index") and "trained" not in name:
-            index_paths.append("%s/%s" % (root, name))
-
-
-def get_vc(sid):
-    global n_spk, tgt_sr, net_g, vc, cpt, version
-    if sid == "" or sid == []:
-        global hubert_model
-        if hubert_model != None:  # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
-            print("clean_empty_cache")
-            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
-            hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            ###楼下不这么折腾清理不干净
-            if_f0 = cpt.get("f0", 1)
-            version = cpt.get("version", "v1")
-            if version == "v1":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs256NSFsid(
-                        *cpt["config"], is_half=config.is_half
-                    )
-                else:
-                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-            elif version == "v2":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs768NSFsid(
-                        *cpt["config"], is_half=config.is_half
-                    )
-                else:
-                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-            del net_g, cpt
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            cpt = None
-        return {"visible": False, "__type__": "update"}
-    person = "%s/%s" % (weight_root, sid)
-    print("loading %s" % person)
-    cpt = torch.load(person, map_location="cpu")
-    tgt_sr = cpt["config"][-1]
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-    if_f0 = cpt.get("f0", 1)
-    version = cpt.get("version", "v1")
-    if version == "v1":
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
-        else:
-            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-    elif version == "v2":
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
-        else:
-            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-    del net_g.enc_q
-    print(net_g.load_state_dict(cpt["weight"], strict=False))
-    net_g.eval().to(config.device)
-    if config.is_half:
-        net_g = net_g.half()
-    else:
-        net_g = net_g.float()
-    vc = VC(tgt_sr, config)
-    n_spk = cpt["config"][-3]
-    return {"visible": True, "maximum": n_spk, "__type__": "update"}
-
-
-def load_hubert():
-    global hubert_model
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-        ["hubert_base.pt"],
-        suffix="",
-    )
-    hubert_model = models[0]
-    hubert_model = hubert_model.to(config.device)
-    if config.is_half:
-        hubert_model = hubert_model.half()
-    else:
-        hubert_model = hubert_model.float()
-    hubert_model.eval()
-
-
-def vc_single(
-    sid,
-    input_audio_path,
-    f0_up_key,
-    f0_file,
-    f0_method,
-    file_index,
-    file_index2,
-    # file_big_npy,
-    index_rate,
-    filter_radius,
-    resample_sr,
-    rms_mix_rate,
-    protect,
-):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
-    global tgt_sr, net_g, vc, hubert_model, version
-    if input_audio_path is None:
-        return "You need to upload an audio", None
-    f0_up_key = int(f0_up_key)
-    try:
-        audio = input_audio_path[1] / 32768.0
-        if len(audio.shape) == 2:
-            audio = np.mean(audio, -1)
-        audio = librosa.resample(audio, orig_sr=input_audio_path[0], target_sr=16000)
-        audio_max = np.abs(audio).max() / 0.95
-        if audio_max > 1:
-            audio /= audio_max
-        times = [0, 0, 0]
-        if hubert_model == None:
-            load_hubert()
-        if_f0 = cpt.get("f0", 1)
-        file_index = (
-            (
-                file_index.strip(" ")
-                .strip('"')
-                .strip("\n")
-                .strip('"')
-                .strip(" ")
-                .replace("trained", "added")
-            )
-            if file_index != ""
-            else file_index2
-        )  # 防止小白写错,自动帮他替换掉
-        # file_big_npy = (
-        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
-        # )
-        audio_opt = vc.pipeline(
-            hubert_model,
-            net_g,
-            sid,
-            audio,
-            input_audio_path,
-            times,
-            f0_up_key,
-            f0_method,
-            file_index,
-            # file_big_npy,
-            index_rate,
-            if_f0,
-            filter_radius,
-            tgt_sr,
-            resample_sr,
-            rms_mix_rate,
-            version,
-            protect,
-            f0_file=f0_file,
-        )
-        if resample_sr >= 16000 and tgt_sr != resample_sr:
-            tgt_sr = resample_sr
-        index_info = (
-            "Using index:%s." % file_index
-            if os.path.exists(file_index)
-            else "Index not used."
-        )
-        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
-            index_info,
-            times[0],
-            times[1],
-            times[2],
-        ), (tgt_sr, audio_opt)
-    except:
-        info = traceback.format_exc()
-        print(info)
-        return info, (None, None)
-
-
-app = gr.Blocks()
-with app:
-    with gr.Tabs():
-        with gr.TabItem("在线demo"):
-            gr.Markdown(
-                value="""
-                RVC 在线demo
-                """
-            )
-            sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
-            with gr.Column():
-                spk_item = gr.Slider(
-                    minimum=0,
-                    maximum=2333,
-                    step=1,
-                    label=i18n("请选择说话人id"),
-                    value=0,
-                    visible=False,
-                    interactive=True,
-                )
-            sid.change(
-                fn=get_vc,
-                inputs=[sid],
-                outputs=[spk_item],
-            )
-            gr.Markdown(
-                value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
-            )
-            vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
-            vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0)
-            f0method0 = gr.Radio(
-                label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
-                choices=["pm", "harvest", "crepe", "rmvpe"],
-                value="pm",
-                interactive=True,
-            )
-            filter_radius0 = gr.Slider(
-                minimum=0,
-                maximum=7,
-                label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
-                value=3,
-                step=1,
-                interactive=True,
-            )
-            with gr.Column():
-                file_index1 = gr.Textbox(
-                    label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
-                    value="",
-                    interactive=False,
-                    visible=False,
-                )
-                file_index2 = gr.Dropdown(
-                    label=i18n("自动检测index路径,下拉式选择(dropdown)"),
-                    choices=sorted(index_paths),
-                    interactive=True,
-                )
-                index_rate1 = gr.Slider(
-                    minimum=0,
-                    maximum=1,
-                    label=i18n("检索特征占比"),
-                    value=0.88,
-                    interactive=True,
-                )
-            resample_sr0 = gr.Slider(
-                minimum=0,
-                maximum=48000,
-                label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
-                value=0,
-                step=1,
-                interactive=True,
-            )
-            rms_mix_rate0 = gr.Slider(
-                minimum=0,
-                maximum=1,
-                label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
-                value=1,
-                interactive=True,
-            )
-            protect0 = gr.Slider(
-                minimum=0,
-                maximum=0.5,
-                label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
-                value=0.33,
-                step=0.01,
-                interactive=True,
-            )
-            f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
-            but0 = gr.Button(i18n("转换"), variant="primary")
-            vc_output1 = gr.Textbox(label=i18n("输出信息"))
-            vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
-            but0.click(
-                vc_single,
-                [
-                    spk_item,
-                    vc_input3,
-                    vc_transform0,
-                    f0_file,
-                    f0method0,
-                    file_index1,
-                    file_index2,
-                    # file_big_npy1,
-                    index_rate1,
-                    filter_radius0,
-                    resample_sr0,
-                    rms_mix_rate0,
-                    protect0,
-                ],
-                [vc_output1, vc_output2],
-            )
-
-
-app.launch()
assets/uvr5_weights/.gitignore (new file, 2 lines added)
@@ -0,0 +1,2 @@
+*
+!.gitignore
assets/weights/.gitignore (new file, 2 lines added)
@@ -0,0 +1,2 @@
+*
+!.gitignore
@@ -1,25 +1,26 @@
-import os
 import argparse
+import os
 import sys
-import torch
 from multiprocessing import cpu_count
+
+import torch
+
 
 def use_fp32_config():
     for config_file in [
-        "32k.json",
-        "40k.json",
-        "48k.json",
-        "48k_v2.json",
-        "32k_v2.json",
+        "v1/32k.json",
+        "v1/40k.json",
+        "v1/48k.json",
+        "v2/48k.json",
+        "v2/32k.json",
     ]:
         with open(f"configs/{config_file}", "r") as f:
             strr = f.read().replace("true", "false")
         with open(f"configs/{config_file}", "w") as f:
             f.write(strr)
-    with open("trainset_preprocess_pipeline_print.py", "r") as f:
+    with open("infer/modules/train/preprocess.py", "r") as f:
         strr = f.read().replace("3.7", "3.0")
-    with open("trainset_preprocess_pipeline_print.py", "w") as f:
+    with open("infer/modules/train/preprocess.py", "w") as f:
         f.write(strr)
 
 
@@ -111,9 +112,9 @@ class Config:
             + 0.4
         )
         if self.gpu_mem <= 4:
-            with open("trainset_preprocess_pipeline_print.py", "r") as f:
+            with open("infer/modules/train/preprocess.py", "r") as f:
                 strr = f.read().replace("3.7", "3.0")
-            with open("trainset_preprocess_pipeline_print.py", "w") as f:
+            with open("infer/modules/train/preprocess.py", "w") as f:
                 f.write(strr)
         elif self.has_mps():
             print("No supported Nvidia GPU found")
@@ -198,6 +199,3 @@ class Config:
         except:
             pass
         return x_pad, x_query, x_center, x_max
-
-
-defaultconfig = Config()
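For context, the relocated use_fp32_config() still works by blind string replacement: every JSON "true" in the listed config files becomes "false". A hypothetical check of its effect on one file, assuming (as the function itself implicitly does) that the only booleans in these configs are half-precision flags such as train.fp16_run:

```python
import json

with open("configs/v1/40k.json", "r") as f:
    cfg = json.load(f)

# Before use_fp32_config() runs, this flag is true; afterwards every JSON
# boolean in the file, including this one, has been rewritten to false.
print(cfg["train"]["fp16_run"])  # assumption: the flag lives under "train"
```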
@@ -6,7 +6,7 @@ services:
       dockerfile: Dockerfile
     container_name: rvc
     volumes:
-      - ./weights:/app/weights
+      - ./weights:/app/assets/weights
       - ./opt:/app/opt
       # - ./dataset:/app/dataset # you can use this folder in order to provide your dataset for model training
     ports:
@@ -18,7 +18,7 @@ An easy-to-use Voice Conversion framework based on VITS.<br><br>
 ------
 [**Changelog**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_EN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions))
 
-[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md)
+[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md)
 
 
 Check our [Demo Video](https://www.bilibili.com/video/BV1pm4y1z7Gm/) here!
 
@@ -91,11 +91,15 @@ You need to download them from our [Huggingface space](https://huggingface.co/lj
 
 Here's a list of Pre-models and other files that RVC needs:
 ```bash
-hubert_base.pt
+./assets/hubert/hubert_base.pt
 
-./pretrained
+./assets/pretrained
 
-./uvr5_weights
+./assets/uvr5_weights
 
+Additional downloads are required if you want to test the v2 version of the model.
+
+./assets/pretrained_v2
+
 If you want to test the v2 version model (the v2 version model has changed the input from the 256 dimensional feature of 9-layer Hubert+final_proj to the 768 dimensional feature of 12-layer Hubert, and has added 3 period discriminators), you will need to download additional features
 
@@ -19,7 +19,7 @@ VITSに基づく使いやすい音声変換(voice changer)framework<br><br>
 
 [**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_CN.md)
 
-[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md)
+[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md)
 
 > デモ動画は[こちら](https://www.bilibili.com/video/BV1pm4y1z7Gm/)でご覧ください。
 
@@ -72,11 +72,15 @@ modelsは[Hugging Face space](https://huggingface.co/lj1995/VoiceConversionWebUI
 
 以下は、RVCに必要な基底モデルやその他のファイルの一覧です。
 ```bash
-hubert_base.pt
+./assets/hubert/hubert_base.pt
 
-./pretrained
+./assets/pretrained
 
-./uvr5_weights
+./assets/uvr5_weights
 
+V2のモデルを使用するには、追加でファイルをダウンロードする必要があります
+
+./assets/pretrained_v2
+
 # ffmpegがすでにinstallされている場合は省略
 ./ffmpeg
@@ -18,7 +18,7 @@ VITS基盤의 簡單하고使用하기 쉬운音聲變換틀<br><br>
 ------
 [**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_KO.md)
 
-[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md)
+[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md)
 
 > [示範映像](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 確認해 보세요!
 
@@ -69,11 +69,15 @@ RVC 모델은 推論과訓練을 依하여 다른 預備모델이 必要합니
 
 다음은 RVC에 必要한 預備모델 및 其他 파일 目錄입니다:
 ```bash
-hubert_base.pt
+./assets/hubert/hubert_base.pt
 
-./pretrained
+./assets/pretrained
 
-./uvr5_weights
+./assets/uvr5_weights
 
+V2 버전 모델을 테스트하려면 추가 다운로드가 필요합니다.
+
+./assets/pretrained_v2
+
 # Windows를 使用하는境遇 이 사전도 必要할 수 있습니다. FFmpeg가 設置되어 있으면 건너뛰어도 됩니다.
 ffmpeg.exe
@@ -19,7 +19,7 @@ VITS 기반의 간단하고 사용하기 쉬운 음성 변환 프레임워크.<b
 
 [**업데이트 로그**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_KO.md)
 
-[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md)
+[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md)
 
 > [데모 영상](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 확인해 보세요!
 
@@ -77,11 +77,15 @@ RVC 모델은 추론과 훈련을 위하여 다른 사전 모델이 필요합니
 다음은 RVC에 필요한 사전 모델 및 기타 파일 목록입니다:
 
 ```bash
-hubert_base.pt
+./assets/hubert/hubert_base.pt
 
-./pretrained
+./assets/pretrained
 
-./uvr5_weights
+./assets/uvr5_weights
 
+V2 버전 모델을 테스트하려면 추가 다운로드가 필요합니다.
+
+./assets/pretrained_v2
+
 # Windows를 사용하는 경우 이 사전도 필요할 수 있습니다. FFmpeg가 설치되어 있으면 건너뛰어도 됩니다.
 ffmpeg.exe
@@ -20,7 +20,7 @@ Kolay kullanılabilen VITS tabanlı bir Ses Dönüşümü çerçevesi.<br><br>
 ------
 [**Değişiklik Kaydı**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_TR.md) | [**SSS (Sıkça Sorulan Sorular)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions))
 
-[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md)
+[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md)
 
 Demo Videosu için [buraya](https://www.bilibili.com/video/BV1pm4y1z7Gm/) bakın!
 
@@ -88,15 +88,15 @@ Onları [Huggingface alanımızdan](https://huggingface.co/lj1995/VoiceConversio
 
 İşte RVC'nin ihtiyaç duyduğu Diğer Ön-Modellerin ve diğer dosyaların listesi:
 ```bash
-hubert_base.pt
+./assets/hubert/hubert_base.pt
 
-./pretrained
+./assets/pretrained
 
-./uvr5_weights
+./assets/uvr5_weights
 
 V2 sürümü modelini test etmek istiyorsanız (v2 sürümü modeli girişi 256 boyutlu 9 katmanlı Hubert+final_proj'dan 768 boyutlu 12 katmanlı Hubert'ın özelliğine ve 3 dönem ayrımına değiştirilmiştir), ek özellikleri indirmeniz gerekecektir.
 
-./pretrained_v2
+./assets/pretrained_v2
 
 #Eğer Windows kullanıyorsanız, FFmpeg yüklü değilse bu dictionariyaya da ihtiyacınız olabilir, FFmpeg yüklüyse atlayın
 ffmpeg.exe
gui_v1.py (36 lines changed)
@@ -1,4 +1,6 @@
-import os, sys, pdb
+import os
+import pdb
+import sys
 
 os.environ["OMP_NUM_THREADS"] = "2"
 if sys.platform == "darwin":
@@ -16,7 +18,8 @@ class Harvest(multiprocessing.Process):
         self.opt_q = opt_q
 
     def run(self):
-        import numpy as np, pyworld
+        import numpy as np
+        import pyworld
 
         while 1:
             idx, x, res_f0, n_cpu, ts = self.inp_q.get()
@@ -33,21 +36,26 @@ class Harvest(multiprocessing.Process):
 
 
 if __name__ == "__main__":
-    from multiprocessing import Queue
-    from queue import Empty
-    import numpy as np
-    import multiprocessing
-    import traceback, re
     import json
+    import multiprocessing
+    import re
+    import threading
+    import time
+    import traceback
+    from multiprocessing import Queue, cpu_count
+    from queue import Empty
+
+    import librosa
+    import noisereduce as nr
+    import numpy as np
     import PySimpleGUI as sg
     import sounddevice as sd
-    import noisereduce as nr
-    from multiprocessing import cpu_count
-    import librosa, torch, time, threading
+    import torch
     import torch.nn.functional as F
     import torchaudio.transforms as tat
-    from i18n import I18nAuto
-    import rvc_for_realtime
+
+    import tools.rvc_for_realtime as rvc_for_realtime
+    from i18n.i18n import I18nAuto
 
     i18n = I18nAuto()
     device = rvc_for_realtime.config.device
@@ -131,7 +139,9 @@ if __name__ == "__main__":
                         ),
                         sg.FileBrowse(
                             i18n("选择.pth文件"),
-                            initial_folder=os.path.join(os.getcwd(), "weights"),
+                            initial_folder=os.path.join(
+                                os.getcwd(), "assets/weights"
+                            ),
                             file_types=((". pth"),),
                         ),
                     ],
 
@@ -1,10 +1,10 @@
-import locale
 import json
+import locale
 import os
 
 
 def load_language_list(language):
-    with open(f"./lib/i18n/{language}.json", "r", encoding="utf-8") as f:
+    with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f:
         language_list = json.load(f)
     return language_list
 
@@ -15,7 +15,7 @@ class I18nAuto:
             language = locale.getdefaultlocale()[
                 0
             ]  # getlocale can't identify the system's language ((None, None))
-        if not os.path.exists(f"./lib/i18n/{language}.json"):
+        if not os.path.exists(f"./i18n/locale/{language}.json"):
             language = "en_US"
         self.language = language
         # print("Use Language:", language)
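Only the two file-path call sites change here: translation tables now live under i18n/locale/. A small sketch of the lookup flow after this move (translate() is a hypothetical helper; the real class is the I18nAuto shown above):

```python
import json
import locale
import os


def translate(key: str) -> str:
    language = locale.getdefaultlocale()[0]  # e.g. "zh_CN" or "en_US"
    if not os.path.exists(f"./i18n/locale/{language}.json"):
        language = "en_US"  # same fallback as I18nAuto
    with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f:
        table = json.load(f)
    return table.get(key, key)  # unknown keys fall through untranslated
```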
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Enter the GPU index(es) separated by '-', e.g., 0-0-1 to use 2 processes in GPU0 and 1 process in GPU1",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Step 1: Fill in the experimental configuration. Experimental data is stored in the 'logs' folder, with each experiment having a separate folder. Manually enter the experiment name path, which contains the experimental configuration, logs, and trained model files.",
     "step1:正在处理数据": "Step 1: Processing data",
+    "step2:正在提取音高&正在提取特征": "step2:Pitch extraction & feature extraction",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.",
     "step2a:无需提取音高": "Step 2a: Skipping pitch extraction",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Step 2b: Use CPU to extract pitch (if the model has pitch), use GPU to extract features (select GPU index):",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "paso 1: Complete la configuración del experimento. Los datos del experimento se almacenan en el directorio 'logs', con cada experimento en una carpeta separada. La ruta del nombre del experimento debe ingresarse manualmente y debe contener la configuración del experimento, los registros y los archivos del modelo entrenado.",
     "step1:正在处理数据": "Paso 1: Procesando datos",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "paso 2a: recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. Actualmente, solo se admite la capacitación de una sola persona.",
     "step2a:无需提取音高": "Paso 2a: No es necesario extraer el tono",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "paso 2b: use la CPU para extraer el tono (si el modelo tiene guía de tono) y la GPU para extraer características (seleccione el número de tarjeta).",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. ",
     "step1:正在处理数据": "Passaggio 1: elaborazione dei dati",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. ",
     "step2a:无需提取音高": "Step 2a: Saltare l'estrazione del tono",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpeカード番号設定:異なるプロセスに使用するカード番号を入力する。例えば、0-0-1でカード0に2つのプロセス、カード1に1つのプロセスを実行する。",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "ステップ1:実験設定を入力します。実験データはlogsに保存され、各実験にはフォルダーがあります。実験名のパスを手動で入力する必要があり、実験設定、ログ、トレーニングされたモデルファイルが含まれます。",
     "step1:正在处理数据": "step1:処理中のデータ",
+    "step2:正在提取音高&正在提取特征": "step2:ピッチ抽出と特徴抽出",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。",
     "step2a:无需提取音高": "step2a:ピッチの抽出は不要",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "ステップ2b: CPUを使用して音高を抽出する(モデルに音高がある場合)、GPUを使用して特徴を抽出する(GPUの番号を選択する)",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Введите номера графических процессоров, разделенные символом «-», например, 0-0-1, чтобы запустить два процесса на GPU 0 и один процесс на GPU 1:",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1. Конфигурирование модели. Данные обучения модели сохраняются в папку 'logs', и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.",
     "step1:正在处理数据": "Шаг 1. Переработка данных",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2А. Автоматическая обработка исходных аудиозаписей для обучения и выполнение нормализации среза. Создаст 2 папки wav в папке модели. В данный момент поддерживается обучение только на одноголосных записях.",
     "step2a:无需提取音高": "Шаг 2А. Пропуск извлечения тональности",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2Б. Оценка и извлечение тональности в аудиофайлах с помощью процессора (если включена поддержка изменения высоты звука), извлечение черт с помощью GPU (выберите номер GPU):",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.",
     "step1:正在处理数据": "Adım 1: Veri işleme",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.",
     "step2a:无需提取音高": "Adım 2a: Pitch çıkartma adımını atlama",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ",
     "step1:正在处理数据": "step1:正在处理数据",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ",
     "step2a:无需提取音高": "step2a:无需提取音高",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。",
     "step1:正在处理数据": "step1:正在处理数据",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
     "step2a:无需提取音高": "step2a:无需提取音高",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。",
     "step1:正在处理数据": "step1:正在处理数据",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
     "step2a:无需提取音高": "step2a:无需提取音高",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
@@ -15,6 +15,7 @@
     "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程",
     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。",
     "step1:正在处理数据": "step1:正在处理数据",
+    "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
     "step2a:无需提取音高": "step2a:无需提取音高",
     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
@@ -3,12 +3,14 @@ import os
 from collections import OrderedDict
 
 # Define the standard file name
-standard_file = "zh_CN.json"
+standard_file = "locale/zh_CN.json"
 
 # Find all JSON files in the directory
-dir_path = "./"
+dir_path = "locale/"
 languages = [
-    f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file
+    os.path.join(dir_path, f)
+    for f in os.listdir(dir_path)
+    if f.endswith(".json") and f != standard_file
 ]
 
 # Load the standard file
@@ -1,7 +1,6 @@
 import ast
 import glob
 import json
-
 from collections import OrderedDict
 
 
@@ -50,8 +49,8 @@ print()
 print("Total unique:", len(code_keys))
 
 
-standard_file = "zh_CN.json"
-with open(f"lib/i18n/{standard_file}", "r", encoding="utf-8") as f:
+standard_file = "i18n/locale/zh_CN.json"
+with open(standard_file, "r", encoding="utf-8") as f:
     standard_data = json.load(f, object_pairs_hook=OrderedDict)
 standard_keys = set(standard_data.keys())
 
@@ -71,6 +70,6 @@ for s in strings:
     code_keys_dict[s] = s
 
 # write back
-with open(f"lib/i18n/{standard_file}", "w", encoding="utf-8") as f:
+with open(standard_file, "w", encoding="utf-8") as f:
     json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
     f.write("\n")
infer-web.py (885 lines changed)
(File diff suppressed because it is too large.)
@@ -1,4 +1,5 @@
 import ffmpeg
+import librosa
 import numpy as np
 
 
@@ -15,7 +16,13 @@ def load_audio(file, sr):
             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
         )
+        return np.frombuffer(out, np.float32).flatten()
+
+    except AttributeError:
+        audio = file[1] / 32768.0
+        if len(audio.shape) == 2:
+            audio = np.mean(audio, -1)
+        return librosa.resample(audio, orig_sr=file[0], target_sr=16000)
+
     except Exception as e:
         raise RuntimeError(f"Failed to load audio: {e}")
-
-    return np.frombuffer(out, np.float32).flatten()
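The new except AttributeError branch lets load_audio accept either a filesystem path (decoded by ffmpeg) or a Gradio-style (sample_rate, int16 array) tuple, which lacks string methods and so trips an AttributeError inside the try block. A usage sketch under those assumptions (the file name is illustrative):

```python
import numpy as np

# Path input: decoded and resampled by ffmpeg at the requested rate.
wav = load_audio("vocal.wav", 16000)

# Tuple input, as gr.Audio delivers it: handled by the fallback branch,
# normalized from int16 and resampled to 16 kHz by librosa.
gradio_audio = (44100, np.zeros(44100, dtype=np.int16))
wav2 = load_audio(gradio_audio, 16000)
```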
@@ -1,13 +1,13 @@
 import copy
 import math
+
 import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
 
-from lib.infer_pack import commons
-from lib.infer_pack import modules
-from lib.infer_pack.modules import LayerNorm
+from infer.lib.infer_pack import commons, modules
+from infer.lib.infer_pack.modules import LayerNorm
 
 
 class Encoder(nn.Module):
@@ -1,4 +1,5 @@
 import math
+
 import numpy as np
 import torch
 from torch import nn
@@ -1,17 +1,17 @@
-import math, pdb, os
+import math
+import os
+import pdb
 from time import time as ttime
+
+import numpy as np
 import torch
 from torch import nn
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
 from torch.nn import functional as F
-from lib.infer_pack import modules
-from lib.infer_pack import attentions
-from lib.infer_pack import commons
-from lib.infer_pack.commons import init_weights, get_padding
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from lib.infer_pack.commons import init_weights
-import numpy as np
-from lib.infer_pack import commons
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from infer.lib.infer_pack import attentions, commons, modules
+from infer.lib.infer_pack.commons import get_padding, init_weights
 
 
 class TextEncoder256(nn.Module):
@@ -1,17 +1,17 @@
-import math, pdb, os
+import math
+import os
+import pdb
 from time import time as ttime
+
+import numpy as np
 import torch
 from torch import nn
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
 from torch.nn import functional as F
-from lib.infer_pack import modules
-from lib.infer_pack import attentions
-from lib.infer_pack import commons
-from lib.infer_pack.commons import init_weights, get_padding
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from lib.infer_pack.commons import init_weights
-import numpy as np
-from lib.infer_pack import commons
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from infer.lib.infer_pack import attentions, commons, modules
+from infer.lib.infer_pack.commons import get_padding, init_weights
 
 
 class TextEncoder256(nn.Module):
@@ -1,18 +1,17 @@
 import copy
 import math
+
 import numpy as np
 import scipy
 import torch
 from torch import nn
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
 from torch.nn import functional as F
+from torch.nn.utils import remove_weight_norm, weight_norm
 
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm
-
-from lib.infer_pack import commons
-from lib.infer_pack.commons import init_weights, get_padding
-from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
+from infer.lib.infer_pack import commons
+from infer.lib.infer_pack.commons import get_padding, init_weights
+from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform
 
 LRELU_SLOPE = 0.1
 
@@ -1,6 +1,7 @@
-from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
-import pyworld
 import numpy as np
+import pyworld
+
+from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 
 
 class DioF0Predictor(F0Predictor):
@@ -1,6 +1,7 @@
-from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
-import pyworld
 import numpy as np
+import pyworld
+
+from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 
 
 class HarvestF0Predictor(F0Predictor):
@@ -1,6 +1,7 @@
-from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
-import parselmouth
 import numpy as np
+import parselmouth
+
+from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 
 
 class PMF0Predictor(F0Predictor):
@@ -1,6 +1,6 @@
-import onnxruntime
 import librosa
 import numpy as np
+import onnxruntime
 import soundfile
 
 
@@ -1,9 +1,7 @@
+import numpy as np
 import torch
 from torch.nn import functional as F
 
-import numpy as np
-
-
 DEFAULT_MIN_BIN_WIDTH = 1e-3
 DEFAULT_MIN_BIN_HEIGHT = 1e-3
 DEFAULT_MIN_DERIVATIVE = 1e-3
@@ -1,11 +1,11 @@
-import torch, numpy as np, pdb
+import pdb
+
+import numpy as np
+import torch
 import torch.nn as nn
+import torch.nn.functional as F
-import torch, pdb
-import numpy as np
-import torch.nn.functional as F
+from librosa.util import normalize, pad_center, tiny
 from scipy.signal import get_window
-from librosa.util import pad_center, tiny, normalize
 
 
 ###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py
@@ -670,7 +670,8 @@ class RMVPE:
 
 
 if __name__ == "__main__":
-    import soundfile as sf, librosa
+    import librosa
+    import soundfile as sf
 
     audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
     if len(audio.shape) > 1:
@@ -1,10 +1,12 @@
-import os, traceback
+import os
+import traceback
+
 import numpy as np
 import torch
 import torch.utils.data
 
-from lib.train.mel_processing import spectrogram_torch
-from lib.train.utils import load_wav_to_torch, load_filepaths_and_text
+from infer.lib.train.mel_processing import spectrogram_torch
+from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch
 
 
 class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
@@ -2,7 +2,6 @@ import torch
 import torch.utils.data
 from librosa.filters import mel as librosa_mel_fn
 
-
 MAX_WAV_VALUE = 32768.0
 
 
@@ -1,9 +1,11 @@
-import torch, traceback, os, sys
-
-now_dir = os.getcwd()
-sys.path.append(now_dir)
+import os
+import sys
+import traceback
 from collections import OrderedDict
-from i18n import I18nAuto
+
+import torch
+
+from i18n.i18n import I18nAuto
 
 i18n = I18nAuto()
 
@@ -40,7 +42,7 @@ def savee(ckpt, sr, if_f0, name, epoch, version, hps):
         opt["sr"] = sr
         opt["f0"] = if_f0
         opt["version"] = version
-        torch.save(opt, "weights/%s.pth" % name)
+        torch.save(opt, "assets/weights/%s.pth" % name)
         return "Success."
     except:
         return traceback.format_exc()
@@ -183,7 +185,7 @@ def extract_small_model(path, name, sr, if_f0, info, version):
         opt["version"] = version
         opt["sr"] = sr
         opt["f0"] = int(if_f0)
-        torch.save(opt, "weights/%s.pth" % name)
+        torch.save(opt, "assets/weights/%s.pth" % name)
         return "Success."
     except:
         return traceback.format_exc()
@@ -253,7 +255,7 @@ def merge(path1, path2, alpha1, sr, f0, info, name, version):
         opt["f0"] = 1 if f0 == i18n("是") else 0
         opt["version"] = version
         opt["info"] = info
-        torch.save(opt, "weights/%s.pth" % name)
+        torch.save(opt, "assets/weights/%s.pth" % name)
         return "Success."
     except:
         return traceback.format_exc()
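Taken together with the new .env, this closes the loop: savee(), extract_small_model() and merge() now all write finished models to assets/weights, which is the weight_root the UI scans. A sketch of that discovery step, mirroring the listdir loop in the deleted app.py above:

```python
import os

weight_root = "assets/weights"  # the value the new .env advertises

# The model dropdown is built from every .pth saved by savee()/merge().
names = [f for f in os.listdir(weight_root) if f.endswith(".pth")]
print(sorted(names))
```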
@@ -1,13 +1,15 @@
-import os, traceback
-import glob
-import sys
 import argparse
-import logging
+import glob
 import json
+import logging
+import os
 import subprocess
+import sys
+import traceback
+
 import numpy as np
-from scipy.io.wavfile import read
 import torch
+from scipy.io.wavfile import read
 
 MATPLOTLIB_FLAG = False
 
@@ -362,9 +364,9 @@ def get_hparams(init=True):
         os.makedirs(experiment_dir)
 
     if args.version == "v1" or args.sample_rate == "40k":
-        config_path = "configs/%s.json" % args.sample_rate
+        config_path = "configs/v1/%s.json" % args.sample_rate
     else:
-        config_path = "configs/%s_v2.json" % args.sample_rate
+        config_path = "configs/v2/%s.json" % args.sample_rate
 
     config_save_path = os.path.join(experiment_dir, "config.json")
     if init:
         with open(config_path, "r") as f:
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
@@ -1,6 +1,6 @@
 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 
 from . import spec_utils
 
Some files were not shown because too many files have changed in this diff.