Reformat and rewrite _get_name_params (#57)

* Reformat
* rewrite _get_name_params
* Add workflow for automatic formatting
* Revert "Add workflow for automatic formatting"
  This reverts commit 9111c5dbc1830248305fb075587a88be07ad3115.
* revert Retrieval_based_Voice_Conversion_WebUI.ipynb

---------

Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
2025-04-04 03:38:58 +08:00 · 2023-04-15 20:44:24 +09:00 · 2023-04-15 20:44:24 +09:00 · c8261b2ccc
commit c8261b2ccc
parent aaa893c4b1
45 changed files with 4878 additions and 2456 deletions
--- a/config.py
+++ b/config.py
@ -16,12 +16,17 @@ n_cpu   =   0

 ########################命令行参数########################
 import argparse
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--port", type=int, default=7865, help="Listen port")
 parser.add_argument("--pycmd", type=str, default="python", help="Python command")
-parser.add_argument("--colab", action='store_true', help="Launch in colab")
-parser.add_argument("--noparallel", action='store_true', help="Disable parallel processing")
-parser.add_argument("--noautoopen", action='store_true', help="Do not open in browser automatically")
+parser.add_argument("--colab", action="store_true", help="Launch in colab")
+parser.add_argument(
+    "--noparallel", action="store_true", help="Disable parallel processing"
+)
+parser.add_argument(
+    "--noautoopen", action="store_true", help="Do not open in browser automatically"
+)
 cmd_opts = parser.parse_args()

 python_cmd = cmd_opts.pycmd
@ -34,13 +39,15 @@ noautoopen=cmd_opts.noautoopen
 import sys
 import torch

+
 # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+.
 # check `getattr` and try it for compatibility
 def has_mps() -> bool:
    if sys.platform != "darwin":
        return False
    else:
-        if not getattr(torch, 'has_mps', False): return False
+        if not getattr(torch, "has_mps", False):
+            return False
        try:
            torch.zeros(1).to(torch.device("mps"))
            return True
@ -48,7 +55,7 @@ def has_mps() -> bool:
            return False


-if(not torch.cuda.is_available()):
+if not torch.cuda.is_available():
    if has_mps():
        print("没有发现支持的N卡, 使用MPS进行推理")
        device = "mps"
@ -57,15 +64,17 @@ if(not torch.cuda.is_available()):
        device = "cpu"
        is_half = False

-if(device not in ["cpu", "mps"]):
+if device not in ["cpu", "mps"]:
    gpu_name = torch.cuda.get_device_name(int(device.split(":")[-1]))
-    if("16" in gpu_name or "MX" in gpu_name):
+    if "16" in gpu_name or "MX" in gpu_name:
        print("16系显卡/MX系显卡强制单精度")
        is_half = False

 from multiprocessing import cpu_count
-if(n_cpu==0): n_cpu=cpu_count()
-if(is_half):
+
+if n_cpu == 0:
+    n_cpu = cpu_count()
+if is_half:
    # 6G显存配置
    x_pad = 3
    x_query = 10
--- a/export_onnx.py
+++ b/export_onnx.py
@ -5,7 +5,6 @@ person = "Shiroha/shiroha.pth"
 exported_path = "model.onnx"


-
 cpt = torch.load(person, map_location="cpu")
 cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
 print(*cpt["config"])
@ -19,16 +18,19 @@ test_pitchf = torch.rand(1, 200)
 test_ds = torch.LongTensor([0])
 test_rnd = torch.rand(1, 192, 200)
 input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
-output_names = ["audio", ]
+output_names = [
+    "audio",
+]
 device = "cpu"
-torch.onnx.export(net_g,
+torch.onnx.export(
+    net_g,
    (
        test_phone.to(device),
        test_phone_lengths.to(device),
        test_pitch.to(device),
        test_pitchf.to(device),
        test_ds.to(device),
-                test_rnd.to(device)
+        test_rnd.to(device),
    ),
    exported_path,
    dynamic_axes={
@ -41,4 +43,5 @@ torch.onnx.export(net_g,
    opset_version=16,
    verbose=False,
    input_names=input_names,
-            output_names=output_names)
+    output_names=output_names,
+)
--- a/extract_f0_print.py
+++ b/extract_f0_print.py
@ -3,19 +3,24 @@ import librosa
 import pyworld
 from scipy.io import wavfile
 import numpy as np, logging
-logging.getLogger('numba').setLevel(logging.WARNING)
+
+logging.getLogger("numba").setLevel(logging.WARNING)
 from multiprocessing import Process

 exp_dir = sys.argv[1]
 f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
+
+
 def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()

+
 n_p = int(sys.argv[2])
 f0method = sys.argv[3]

+
 class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
@ -31,17 +36,26 @@ class FeatureInput(object):
        x, sr = librosa.load(path, self.fs)
        p_len = x.shape[0] // self.hop
        assert sr == self.fs
-        if(f0_method=="pm"):
+        if f0_method == "pm":
            time_step = 160 / 16000 * 1000
            f0_min = 50
            f0_max = 1100
-            f0 = parselmouth.Sound(x, sr).to_pitch_ac(
-                time_step=time_step / 1000, voicing_threshold=0.6,
-                pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+            f0 = (
+                parselmouth.Sound(x, sr)
+                .to_pitch_ac(
+                    time_step=time_step / 1000,
+                    voicing_threshold=0.6,
+                    pitch_floor=f0_min,
+                    pitch_ceiling=f0_max,
+                )
+                .selected_array["frequency"]
+            )
            pad_size = (p_len - len(f0) + 1) // 2
-            if(pad_size>0 or p_len - len(f0) - pad_size>0):
-                f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-        elif(f0_method=="harvest"):
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
+        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=sr,
@ -50,7 +64,7 @@ class FeatureInput(object):
                frame_period=1000 * self.hop / sr,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
-        elif(f0_method=="dio"):
+        elif f0_method == "dio":
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=sr,
@ -78,22 +92,37 @@ class FeatureInput(object):
        return f0_coarse

    def go(self, paths, f0_method):
-        if (len(paths) == 0): printt("no-f0-todo")
+        if len(paths) == 0:
+            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # 每个进程最多打印5条
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
-                    if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path))
-                    if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue
+                    if idx % n == 0:
+                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
+                    if (
+                        os.path.exists(opt_path1 + ".npy") == True
+                        and os.path.exists(opt_path2 + ".npy") == True
+                    ):
+                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
-                    np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf
+                    np.save(
+                        opt_path2,
+                        featur_pit,
+                        allow_pickle=False,
+                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
-                    np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori
+                    np.save(
+                        opt_path1,
+                        coarse_pit,
+                        allow_pickle=False,
+                    )  # ori
                except:
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))

-if __name__=='__main__':
+
+if __name__ == "__main__":
    # exp_dir=r"E:\codes\py39\dataset\mi-test"
    # n_p=16
    # f = open("%s/log_extract_f0.log"%exp_dir, "w")
@ -108,14 +137,21 @@ if __name__=='__main__':
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
-        if ("spec" in inp_path): continue
+        if "spec" in inp_path:
+            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])

    ps = []
    for i in range(n_p):
-        p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,))
+        p = Process(
+            target=featureInput.go,
+            args=(
+                paths[i::n_p],
+                f0method,
+            ),
+        )
        p.start()
        ps.append(p)
    for p in ps:
--- a/extract_feature_print.py
+++ b/extract_feature_print.py
@ -1,4 +1,5 @@
 import os, sys, traceback
+
 # device=sys.argv[1]
 n_part = int(sys.argv[2])
 i_part = int(sys.argv[3])
@ -14,13 +15,18 @@ import torch.nn.functional as F
 import soundfile as sf
 import numpy as np
 from fairseq import checkpoint_utils
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
+
+
 def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
+
+
 printt(sys.argv)
 model_path = "hubert_base.pt"

@ -28,6 +34,8 @@ printt(exp_dir)
 wavPath = "%s/1_16k_wavs" % exp_dir
 outPath = "%s/3_feature256" % exp_dir
 os.makedirs(outPath, exist_ok=True)
+
+
 # wave must be 16k, hop_size=320
 def readwave(wav_path, normalize=False):
    wav, sr = sf.read(wav_path)
@ -41,6 +49,8 @@ def readwave(wav_path, normalize=False):
            feats = F.layer_norm(feats, feats.shape)
    feats = feats.view(1, -1)
    return feats
+
+
 # HuBERT model
 printt("load model(s) from {}".format(model_path))
 models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@ -50,12 +60,14 @@ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
 model = models[0]
 model = model.to(device)
 printt("move model to %s" % device)
-if device != "cpu": model = model.half()
+if device != "cpu":
+    model = model.half()
 model.eval()

 todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
 n = max(1, len(todo) // 10)  # 最多打印十条
-if(len(todo)==0):printt("no-feature-todo")
+if len(todo) == 0:
+    printt("no-feature-todo")
 else:
    printt("all-feature-%s" % len(todo))
    for idx, file in enumerate(todo):
@ -64,12 +76,15 @@ else:
                wav_path = "%s/%s" % (wavPath, file)
                out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))

-                if(os.path.exists(out_path)):continue
+                if os.path.exists(out_path):
+                    continue

                feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
                padding_mask = torch.BoolTensor(feats.shape).fill_(False)
                inputs = {
-                    "source": feats.half().to(device) if device != "cpu" else feats.to(device),
+                    "source": feats.half().to(device)
+                    if device != "cpu"
+                    else feats.to(device),
                    "padding_mask": padding_mask.to(device),
                    "output_layer": 9,  # layer 9
                }
@ -78,11 +93,12 @@ else:
                    feats = model.final_proj(logits[0])

                feats = feats.squeeze(0).float().cpu().numpy()
-                if(np.isnan(feats).sum()==0):
+                if np.isnan(feats).sum() == 0:
                    np.save(out_path, feats, allow_pickle=False)
                else:
                    printt("%s-contains nan" % file)
-                if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(len(todo),idx,file,feats.shape))
+                if idx % n == 0:
+                    printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape))
        except:
            printt(traceback.format_exc())
    printt("all-feature-done")
--- a/extract_locale.py
+++ b/extract_locale.py
@ -7,9 +7,10 @@ pattern = r"""i18n\((["'][^"']+["'])\)"""
 # Initialize the dictionary to store key-value pairs
 data = {}

+
 def process(fn: str):
    global data
-    with open(fn, 'r', encoding='utf-8') as f:
+    with open(fn, "r", encoding="utf-8") as f:
        contents = f.read()
        matches = re.findall(pattern, contents)
        for key in matches:
@ -17,12 +18,13 @@ def process(fn: str):
            print("extract:", key)
            data[key] = key

+
 print("processing infer-web.py")
-process('infer-web.py')
+process("infer-web.py")

 print("processing gui.py")
-process('gui.py')
+process("gui.py")

 # Save as a JSON file
-with open('./locale/zh_CN.json', 'w', encoding='utf-8') as f:
+with open("./locale/zh_CN.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
--- a/gui.py
+++ b/gui.py
@ -10,15 +10,19 @@ import torchaudio.transforms as tat
 # import matplotlib.pyplot as plt
 from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
 from webui_locale import I18nAuto
+
 i18n = I18nAuto()

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+
 class RVC:
-    def __init__(self,key,hubert_path,pth_path,index_path,npy_path,index_rate) -> None:
-        '''
+    def __init__(
+        self, key, hubert_path, pth_path, index_path, npy_path, index_rate
+    ) -> None:
+        """
        初始化
-        '''
+        """
        self.f0_up_key = key
        self.time_step = 160 / 16000 * 1000
        self.f0_min = 50
@ -27,7 +31,7 @@ class RVC:
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.index = faiss.read_index(index_path)
        self.index_rate = index_rate
-        '''NOT YET USED'''
+        """NOT YET USED"""
        self.big_npy = np.load(npy_path)
        model_path = hubert_path
        print("load model(s) from {}".format(model_path))
@ -43,7 +47,7 @@ class RVC:
        tgt_sr = cpt["config"][-1]
        cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
        if_f0 = cpt.get("f0", 1)
-        if(if_f0==1):
+        if if_f0 == 1:
            self.net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True)
        else:
            self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
@ -52,10 +56,11 @@ class RVC:
        self.net_g.eval().to(device)
        self.net_g.half()

-
    def get_f0_coarse(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
+            self.f0_mel_max - self.f0_mel_min
+        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        # f0_mel[f0_mel > 188] = 188
@ -63,24 +68,30 @@ class RVC:
        return f0_coarse

    def get_f0(self, x, p_len, f0_up_key=0):
-        f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
-            time_step=self.time_step / 1000, voicing_threshold=0.6,
-            pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
+        f0 = (
+            parselmouth.Sound(x, 16000)
+            .to_pitch_ac(
+                time_step=self.time_step / 1000,
+                voicing_threshold=0.6,
+                pitch_floor=self.f0_min,
+                pitch_ceiling=self.f0_max,
+            )
+            .selected_array["frequency"]
+        )

        pad_size = (p_len - len(f0) + 1) // 2
-        if(pad_size>0 or p_len - len(f0) - pad_size>0):
-            f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0 *= pow(2, f0_up_key / 12)
        # f0=suofang(f0)
        f0bak = f0.copy()
        f0_coarse = self.get_f0_coarse(f0)
        return f0_coarse, f0bak

-
    def infer(self, feats: torch.Tensor) -> np.ndarray:
-        '''
+        """
        推理函数
-        '''
+        """
        audio = feats.clone().cpu().numpy()
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
@ -96,11 +107,18 @@ class RVC:
            feats = self.model.final_proj(logits[0])

        ####索引优化
-        if(isinstance(self.index,type(None))==False and isinstance(self.big_npy,type(None))==False and self.index_rate!=0):
+        if (
+            isinstance(self.index, type(None)) == False
+            and isinstance(self.big_npy, type(None)) == False
+            and self.index_rate != 0
+        ):
            npy = feats[0].cpu().numpy().astype("float32")
            _, I = self.index.search(npy, 1)
            npy = self.big_npy[I.squeeze()].astype("float16")
-            feats = torch.from_numpy(npy).unsqueeze(0).to(device)*self.index_rate + (1-self.index_rate)*feats
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate
+                + (1 - self.index_rate) * feats
+            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        torch.cuda.synchronize()
@ -120,17 +138,21 @@ class RVC:
        ii = 0  # sid
        sid = torch.LongTensor([ii]).to(device)
        with torch.no_grad():
-            infered_audio = self.net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float()#nsf
+            infered_audio = (
+                self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+                .data.cpu()
+                .float()
+            )  # nsf
        torch.cuda.synchronize()
        return infered_audio


 class Config:
    def __init__(self) -> None:
-        self.hubert_path:str=''
-        self.pth_path:str=''
-        self.index_path:str=''
-        self.npy_path:str=''
+        self.hubert_path: str = ""
+        self.pth_path: str = ""
+        self.index_path: str = ""
+        self.npy_path: str = ""
        self.pitch: int = 12
        self.samplerate: int = 44100
        self.block_time: float = 1.0  # s
@ -142,6 +164,7 @@ class Config:
        self.O_noise_reduce = False
        self.index_rate = 0.3

+
 class GUI:
    def __init__(self) -> None:
        self.config = Config()
@ -150,37 +173,145 @@ class GUI:
        self.launcher()

    def launcher(self):
-        sg.theme('LightBlue3')
+        sg.theme("LightBlue3")
        input_devices, output_devices, _, _ = self.get_devices()
        layout = [
            [
-                sg.Frame(title=i18n('加载模型'),layout=[
-                    [sg.Input(default_text='TEMP\\hubert_base.pt',key='hubert_path'),sg.FileBrowse(i18n('Hubert模型'))],
-                    [sg.Input(default_text='TEMP\\atri.pth',key='pth_path'),sg.FileBrowse(i18n('选择.pth文件'))],
-                    [sg.Input(default_text='TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index',key='index_path'),sg.FileBrowse(i18n('选择.index文件'))],
-                    [sg.Input(default_text='TEMP\\big_src_feature_atri.npy',key='npy_path'),sg.FileBrowse(i18n('选择.npy文件'))]
-                ])
+                sg.Frame(
+                    title=i18n("加载模型"),
+                    layout=[
+                        [
+                            sg.Input(
+                                default_text="TEMP\\hubert_base.pt", key="hubert_path"
+                            ),
+                            sg.FileBrowse(i18n("Hubert模型")),
                        ],
                        [
-                sg.Frame(layout=[
-                    [sg.Text(i18n("输入设备")),sg.Combo(input_devices,key='sg_input_device',default_value=input_devices[sd.default.device[0]])],
-                    [sg.Text(i18n("输出设备")),sg.Combo(output_devices,key='sg_output_device',default_value=output_devices[sd.default.device[1]])]
-                ],title=i18n("音频设备(请使用同种类驱动)"))
+                            sg.Input(default_text="TEMP\\atri.pth", key="pth_path"),
+                            sg.FileBrowse(i18n("选择.pth文件")),
                        ],
                        [
-                sg.Frame(layout=[
-                    [sg.Text(i18n("响应阈值")),sg.Slider(range=(-60,0),key='threhold',resolution=1,orientation='h',default_value=-30)],
-                    [sg.Text(i18n("音调设置")),sg.Slider(range=(-24,24),key='pitch',resolution=1,orientation='h',default_value=12)],
-                    [sg.Text(i18n('Index Rate')),sg.Slider(range=(0.0,1.0),key='index_rate',resolution=0.01,orientation='h',default_value=0.5)]
-                ],title=i18n("常规设置")),
-                sg.Frame(layout=[
-                    [sg.Text(i18n("采样长度")),sg.Slider(range=(0.1,3.0),key='block_time',resolution=0.1,orientation='h',default_value=1.0)],
-                    [sg.Text(i18n("淡入淡出长度")),sg.Slider(range=(0.01,0.15),key='crossfade_length',resolution=0.01,orientation='h',default_value=0.08)],
-                    [sg.Text(i18n("额外推理时长")),sg.Slider(range=(0.05,3.00),key='extra_time',resolution=0.01,orientation='h',default_value=0.05)],
-                    [sg.Checkbox(i18n('输入降噪'),key='I_noise_reduce'),sg.Checkbox(i18n('输出降噪'),key='O_noise_reduce')]
-                ],title=i18n("性能设置"))
+                            sg.Input(
+                                default_text="TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index",
+                                key="index_path",
+                            ),
+                            sg.FileBrowse(i18n("选择.index文件")),
+                        ],
+                        [
+                            sg.Input(
+                                default_text="TEMP\\big_src_feature_atri.npy",
+                                key="npy_path",
+                            ),
+                            sg.FileBrowse(i18n("选择.npy文件")),
+                        ],
+                    ],
+                )
+            ],
+            [
+                sg.Frame(
+                    layout=[
+                        [
+                            sg.Text(i18n("输入设备")),
+                            sg.Combo(
+                                input_devices,
+                                key="sg_input_device",
+                                default_value=input_devices[sd.default.device[0]],
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("输出设备")),
+                            sg.Combo(
+                                output_devices,
+                                key="sg_output_device",
+                                default_value=output_devices[sd.default.device[1]],
+                            ),
+                        ],
+                    ],
+                    title=i18n("音频设备(请使用同种类驱动)"),
+                )
+            ],
+            [
+                sg.Frame(
+                    layout=[
+                        [
+                            sg.Text(i18n("响应阈值")),
+                            sg.Slider(
+                                range=(-60, 0),
+                                key="threhold",
+                                resolution=1,
+                                orientation="h",
+                                default_value=-30,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("音调设置")),
+                            sg.Slider(
+                                range=(-24, 24),
+                                key="pitch",
+                                resolution=1,
+                                orientation="h",
+                                default_value=12,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("Index Rate")),
+                            sg.Slider(
+                                range=(0.0, 1.0),
+                                key="index_rate",
+                                resolution=0.01,
+                                orientation="h",
+                                default_value=0.5,
+                            ),
+                        ],
+                    ],
+                    title=i18n("常规设置"),
+                ),
+                sg.Frame(
+                    layout=[
+                        [
+                            sg.Text(i18n("采样长度")),
+                            sg.Slider(
+                                range=(0.1, 3.0),
+                                key="block_time",
+                                resolution=0.1,
+                                orientation="h",
+                                default_value=1.0,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("淡入淡出长度")),
+                            sg.Slider(
+                                range=(0.01, 0.15),
+                                key="crossfade_length",
+                                resolution=0.01,
+                                orientation="h",
+                                default_value=0.08,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("额外推理时长")),
+                            sg.Slider(
+                                range=(0.05, 3.00),
+                                key="extra_time",
+                                resolution=0.01,
+                                orientation="h",
+                                default_value=0.05,
+                            ),
+                        ],
+                        [
+                            sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"),
+                            sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"),
+                        ],
+                    ],
+                    title=i18n("性能设置"),
+                ),
+            ],
+            [
+                sg.Button(i18n("开始音频转换"), key="start_vc"),
+                sg.Button(i18n("停止音频转换"), key="stop_vc"),
+                sg.Text(i18n("推理时间(ms):")),
+                sg.Text("0", key="infer_time"),
            ],
-            [sg.Button(i18n("开始音频转换"),key='start_vc'),sg.Button(i18n("停止音频转换"),key='stop_vc'),sg.Text(i18n("推理时间(ms):")),sg.Text("0",key='infer_time')]
        ]

        self.window = sg.Window("RVC - GUI", layout=layout)
@ -192,29 +323,28 @@ class GUI:
            if event == sg.WINDOW_CLOSED:
                self.flag_vc = False
                exit()
-            if event == 'start_vc' and self.flag_vc==False:
+            if event == "start_vc" and self.flag_vc == False:
                self.set_values(values)
                print(str(self.config.__dict__))
-                print('using_cuda:'+str(torch.cuda.is_available()))
+                print("using_cuda:" + str(torch.cuda.is_available()))
                self.start_vc()
-            if event=='stop_vc'and self.flag_vc==True:
+            if event == "stop_vc" and self.flag_vc == True:
                self.flag_vc = False

-
    def set_values(self, values):
-        self.set_devices(values["sg_input_device"],values['sg_output_device'])
-        self.config.hubert_path=values['hubert_path']
-        self.config.pth_path=values['pth_path']
-        self.config.index_path=values['index_path']
-        self.config.npy_path=values['npy_path']
-        self.config.threhold=values['threhold']
-        self.config.pitch=values['pitch']
-        self.config.block_time=values['block_time']
-        self.config.crossfade_time=values['crossfade_length']
-        self.config.extra_time=values['extra_time']
-        self.config.I_noise_reduce=values['I_noise_reduce']
-        self.config.O_noise_reduce=values['O_noise_reduce']
-        self.config.index_rate=values['index_rate']
+        self.set_devices(values["sg_input_device"], values["sg_output_device"])
+        self.config.hubert_path = values["hubert_path"]
+        self.config.pth_path = values["pth_path"]
+        self.config.index_path = values["index_path"]
+        self.config.npy_path = values["npy_path"]
+        self.config.threhold = values["threhold"]
+        self.config.pitch = values["pitch"]
+        self.config.block_time = values["block_time"]
+        self.config.crossfade_time = values["crossfade_length"]
+        self.config.extra_time = values["extra_time"]
+        self.config.I_noise_reduce = values["I_noise_reduce"]
+        self.config.O_noise_reduce = values["O_noise_reduce"]
+        self.config.index_rate = values["index_rate"]

    def start_vc(self):
        torch.cuda.empty_cache()
@ -223,44 +353,76 @@ class GUI:
        self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate)
        self.sola_search_frame = int(0.012 * self.config.samplerate)
        self.delay_frame = int(0.02 * self.config.samplerate)  # 往前预留0.02s
-        self.extra_frame=int(self.config.extra_time*self.config.samplerate)#往后预留0.04s
+        self.extra_frame = int(
+            self.config.extra_time * self.config.samplerate
+        )  # 往后预留0.04s
        self.rvc = None
-        self.rvc=RVC(self.config.pitch,self.config.hubert_path,self.config.pth_path,self.config.index_path,self.config.npy_path,self.config.index_rate)
-        self.input_wav:np.ndarray=np.zeros(self.extra_frame+self.crossfade_frame+self.sola_search_frame+self.block_frame,dtype='float32')
-        self.output_wav:torch.Tensor=torch.zeros(self.block_frame,device=device,dtype=torch.float32)
-        self.sola_buffer:torch.Tensor=torch.zeros(self.crossfade_frame,device=device,dtype=torch.float32)
-        self.fade_in_window:torch.Tensor=torch.linspace(0.0,1.0,steps=self.crossfade_frame,device=device,dtype=torch.float32)
+        self.rvc = RVC(
+            self.config.pitch,
+            self.config.hubert_path,
+            self.config.pth_path,
+            self.config.index_path,
+            self.config.npy_path,
+            self.config.index_rate,
+        )
+        self.input_wav: np.ndarray = np.zeros(
+            self.extra_frame
+            + self.crossfade_frame
+            + self.sola_search_frame
+            + self.block_frame,
+            dtype="float32",
+        )
+        self.output_wav: torch.Tensor = torch.zeros(
+            self.block_frame, device=device, dtype=torch.float32
+        )
+        self.sola_buffer: torch.Tensor = torch.zeros(
+            self.crossfade_frame, device=device, dtype=torch.float32
+        )
+        self.fade_in_window: torch.Tensor = torch.linspace(
+            0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32
+        )
        self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
-        self.resampler1=tat.Resample(orig_freq=self.config.samplerate,new_freq=16000,dtype=torch.float32)
-        self.resampler2=tat.Resample(orig_freq=40000,new_freq=self.config.samplerate,dtype=torch.float32)
+        self.resampler1 = tat.Resample(
+            orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32
+        )
+        self.resampler2 = tat.Resample(
+            orig_freq=40000, new_freq=self.config.samplerate, dtype=torch.float32
+        )
        thread_vc = threading.Thread(target=self.soundinput)
        thread_vc.start()

-
    def soundinput(self):
-        '''
+        """
        接受音频输入
-        '''
-        with sd.Stream(callback=self.audio_callback, blocksize=self.block_frame,samplerate=self.config.samplerate,dtype='float32'):
+        """
+        with sd.Stream(
+            callback=self.audio_callback,
+            blocksize=self.block_frame,
+            samplerate=self.config.samplerate,
+            dtype="float32",
+        ):
            while self.flag_vc:
                time.sleep(self.config.block_time)
-                print('Audio block passed.')
-        print('ENDing VC')
+                print("Audio block passed.")
+        print("ENDing VC")

-
-    def audio_callback(self,indata:np.ndarray,outdata:np.ndarray, frames, times, status):
-        '''
+    def audio_callback(
+        self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
+    ):
+        """
        音频处理
-        '''
+        """
        start_time = time.perf_counter()
        indata = librosa.to_mono(indata.T)
        if self.config.I_noise_reduce:
            indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate)

-        '''noise gate'''
+        """noise gate"""
        frame_length = 2048
        hop_length = 1024
-        rms=librosa.feature.rms(y=indata,frame_length=frame_length,hop_length=hop_length)
+        rms = librosa.feature.rms(
+            y=indata, frame_length=frame_length, hop_length=hop_length
+        )
        db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
        # print(rms.shape,db.shape,db)
        for i in range(db_threhold.shape[0]):
@ -269,36 +431,65 @@ class GUI:
        self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata)

        # infer
-        print('input_wav:'+str(self.input_wav.shape))
+        print("input_wav:" + str(self.input_wav.shape))
        # print('infered_wav:'+str(infer_wav.shape))
-        infer_wav:torch.Tensor=self.resampler2(self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav))))[-self.crossfade_frame-self.sola_search_frame-self.block_frame:].to(device)
-        print('infer_wav:'+str(infer_wav.shape))
+        infer_wav: torch.Tensor = self.resampler2(
+            self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav)))
+        )[-self.crossfade_frame - self.sola_search_frame - self.block_frame :].to(
+            device
+        )
+        print("infer_wav:" + str(infer_wav.shape))

        # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
-        cor_nom=F.conv1d(infer_wav[None,None,:self.crossfade_frame + self.sola_search_frame],self.sola_buffer[None,None,:])
-        cor_den=torch.sqrt(F.conv1d(infer_wav[None,None,:self.crossfade_frame + self.sola_search_frame]**2,torch.ones(1, 1,self.crossfade_frame,device=device))+1e-8)
+        cor_nom = F.conv1d(
+            infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame],
+            self.sola_buffer[None, None, :],
+        )
+        cor_den = torch.sqrt(
+            F.conv1d(
+                infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
+                ** 2,
+                torch.ones(1, 1, self.crossfade_frame, device=device),
+            )
+            + 1e-8
+        )
        sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
-        print('sola offset: ' + str(int(sola_offset)))
+        print("sola offset: " + str(int(sola_offset)))

        # crossfade
        self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame]
        self.output_wav[: self.crossfade_frame] *= self.fade_in_window
        self.output_wav[: self.crossfade_frame] += self.sola_buffer[:]
        if sola_offset < self.sola_search_frame:
-            self.sola_buffer[:] = infer_wav[-self.sola_search_frame - self.crossfade_frame + sola_offset: -self.sola_search_frame + sola_offset]* self.fade_out_window
+            self.sola_buffer[:] = (
+                infer_wav[
+                    -self.sola_search_frame
+                    - self.crossfade_frame
+                    + sola_offset : -self.sola_search_frame
+                    + sola_offset
+                ]
+                * self.fade_out_window
+            )
        else:
-            self.sola_buffer[:] = infer_wav[- self.crossfade_frame :]* self.fade_out_window
+            self.sola_buffer[:] = (
+                infer_wav[-self.crossfade_frame :] * self.fade_out_window
+            )

        if self.config.O_noise_reduce:
-            outdata[:]=np.tile(nr.reduce_noise(y=self.output_wav[:].cpu().numpy(),sr=self.config.samplerate),(2,1)).T
+            outdata[:] = np.tile(
+                nr.reduce_noise(
+                    y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate
+                ),
+                (2, 1),
+            ).T
        else:
            outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy()
        total_time = time.perf_counter() - start_time
-        print('infer time:'+str(total_time))
-        self.window['infer_time'].update(int(total_time*1000))
+        print("infer time:" + str(total_time))
+        self.window["infer_time"].update(int(total_time * 1000))

    def get_devices(self, update: bool = True):
-        '''获取设备列表'''
+        """获取设备列表"""
        if update:
            sd._terminate()
            sd._initialize()
@ -317,18 +508,33 @@ class GUI:
            for d in devices
            if d["max_output_channels"] > 0
        ]
-        input_devices_indices = [d["index"] for d in devices if d["max_input_channels"] > 0]
+        input_devices_indices = [
+            d["index"] for d in devices if d["max_input_channels"] > 0
+        ]
        output_devices_indices = [
            d["index"] for d in devices if d["max_output_channels"] > 0
        ]
-        return input_devices, output_devices, input_devices_indices, output_devices_indices
+        return (
+            input_devices,
+            output_devices,
+            input_devices_indices,
+            output_devices_indices,
+        )

    def set_devices(self, input_device, output_device):
-        '''设置输出设备'''
-        input_devices,output_devices,input_device_indices, output_device_indices=self.get_devices()
+        """设置输出设备"""
+        (
+            input_devices,
+            output_devices,
+            input_device_indices,
+            output_device_indices,
+        ) = self.get_devices()
        sd.default.device[0] = input_device_indices[input_devices.index(input_device)]
-        sd.default.device[1]=output_device_indices[output_devices.index(output_device)]
+        sd.default.device[1] = output_device_indices[
+            output_devices.index(output_device)
+        ]
        print("input device:" + str(sd.default.device[0]) + ":" + str(input_device))
        print("output device:" + str(sd.default.device[1]) + ":" + str(output_device))

+
 gui = GUI()
--- a/infer-web.py
+++ b/infer-web.py
--- a/infer/infer-pm-index256.py
+++ b/infer/infer-pm-index256.py
@ -1,14 +1,19 @@
-'''
+"""

 对源特征进行检索
-'''
+"""
 import torch, pdb, os, parselmouth
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 import numpy as np
 import soundfile as sf
+
 # from models import SynthesizerTrn256#hifigan_nonsf
 # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
-from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hifigan_nsf
+from infer_pack.models import (
+    SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
+)  # hifigan_nsf
+
 # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
 # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
 # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
@ -16,10 +21,12 @@ from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hif

 from scipy.io import wavfile
 from fairseq import checkpoint_utils
+
 # import pyworld
 import librosa
 import torch.nn.functional as F
 import scipy.signal as signal
+
 # import torchcrepe
 from time import time as ttime

@ -37,7 +44,26 @@ model.eval()

 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
-net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256#no_dropout
+net_g = SynthesizerTrn256(
+    1025,
+    32,
+    192,
+    192,
+    768,
+    2,
+    6,
+    3,
+    0,
+    "1",
+    [3, 7, 11],
+    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+    [10, 10, 2, 2],
+    512,
+    [16, 16, 4, 4],
+    183,
+    256,
+    is_half=True,
+)  # hifigan#512#256#no_dropout
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
 #
@ -53,37 +79,53 @@ print(net_g.load_state_dict(weights,strict=True))

 net_g.eval().to(device)
 net_g.half()
-def get_f0(x, p_len,f0_up_key=0):

+
+def get_f0(x, p_len, f0_up_key=0):
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

-    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
-        time_step=time_step / 1000, voicing_threshold=0.6,
-        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+    f0 = (
+        parselmouth.Sound(x, 16000)
+        .to_pitch_ac(
+            time_step=time_step / 1000,
+            voicing_threshold=0.6,
+            pitch_floor=f0_min,
+            pitch_ceiling=f0_max,
+        )
+        .selected_array["frequency"]
+    )

    pad_size = (p_len - len(f0) + 1) // 2
-    if(pad_size>0 or p_len - len(f0) - pad_size>0):
-        f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
    f0 *= pow(2, f0_up_key / 12)
    f0bak = f0.copy()

    f0_mel = 1127 * np.log(1 + f0 / 700)
-    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+        f0_mel_max - f0_mel_min
+    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # f0_mel[f0_mel > 188] = 188
    f0_coarse = np.rint(f0_mel).astype(np.int)
    return f0_coarse, f0bak

+
 import faiss
+
 index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
 big_npy = np.load("infer/big_src_feature_mi.npy")
 ta0 = ta1 = ta2 = 0
-for idx,name in enumerate(["冬之花clip1.wav",]):##
+for idx, name in enumerate(
+    [
+        "冬之花clip1.wav",
+    ]
+):  ##
    wav_path = "todo-songs/%s" % name  #
    f0_up_key = -2  #
    audio, sampling_rate = sf.read(wav_path)
@ -92,7 +134,6 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

-
    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
@ -104,7 +145,8 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,  # layer 9
    }
-    if torch.cuda.is_available(): torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
    t0 = ttime()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
@ -113,16 +155,20 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
    ####索引优化
    npy = feats[0].cpu().numpy().astype("float32")
    D, I = index.search(npy, 1)
-    feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+    feats = (
+        torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+    )

    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-    if torch.cuda.is_available(): torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
    t1 = ttime()
    # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
    p_len = min(feats.shape[1], 10000)  #
    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # 太大了爆显存
-    if torch.cuda.is_available(): torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
    t2 = ttime()
    feats = feats[:, :p_len, :]
    pitch = pitch[:p_len]
@ -132,12 +178,18 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
    sid = torch.LongTensor([0]).to(device)
    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    with torch.no_grad():
-        audio = net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float().numpy()#nsf
-    if torch.cuda.is_available(): torch.cuda.synchronize()
+        audio = (
+            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )  # nsf
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
    t3 = ttime()
-    ta0+=(t1-t0)
-    ta1+=(t2-t1)
-    ta2+=(t3-t2)
+    ta0 += t1 - t0
+    ta1 += t2 - t1
+    ta2 += t3 - t2
    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
--- a/infer/train-index.py
+++ b/infer/train-index.py
@ -1,6 +1,6 @@
-'''
+"""
 格式：直接cid为自带的index位；aid放不下了，通过字典来查，反正就5w个
-'''
+"""
 import faiss, numpy as np, os

 # ###########如果是原始特征要先写save
@ -21,11 +21,11 @@ print("training")
 index_ivf = faiss.extract_index_ivf(index)  #
 index_ivf.nprobe = 9
 index.train(big_npy)
-faiss.write_index(index, 'infer/trained_IVF512_Flat_mi_baseline_src_feat.index')
+faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
 print("adding")
 index.add(big_npy)
 faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
-'''
+"""
 大小（都是FP32）
 big_src_feature 2.95G
    (3098036, 256)
@ -33,4 +33,4 @@ big_emb         4.43G
    (6196072, 192)
 big_emb双倍是因为求特征要repeat后再加pitch

-'''
+"""
--- a/infer/trans_weights.py
+++ b/infer/trans_weights.py
@ -4,8 +4,13 @@ import torch,pdb
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
-a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth")["model"]#sim_nsf#
-for key in a.keys():a[key]=a[key].half()
+a = torch.load(
+    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
+)[
+    "model"
+]  # sim_nsf#
+for key in a.keys():
+    a[key] = a[key].half()
 # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
 # torch.save(a,"ft-mi-sim1k.pt")#
 torch.save(a, "ft-mi-no_opt-no_dropout.pt")  #
--- a/infer_pack/commons.py
+++ b/infer_pack/commons.py
@ -48,6 +48,8 @@ def slice_segments(x, ids_str, segment_size=4):
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret
+
+
 def slice_segments2(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :segment_size])
    for i in range(x.size(0)):
--- a/infer_pack/models.py
+++ b/infer_pack/models.py
@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from infer_pack.commons import init_weights
 import numpy as np
 from infer_pack import commons
+
+
 class TextEncoder256(nn.Module):
    def __init__(
-        self,        out_channels,        hidden_channels,        filter_channels,        n_heads,        n_layers,        kernel_size,        p_dropout,        f0=True    ):
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@ -25,7 +36,7 @@ class TextEncoder256(nn.Module):
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if(f0==True):
+        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -33,7 +44,7 @@ class TextEncoder256(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
-        if(pitch==None):
+        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
@ -48,8 +59,20 @@ class TextEncoder256(nn.Module):

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
+
+
 class TextEncoder256Sim(nn.Module):
-    def __init__(        self,        out_channels,        hidden_channels,        filter_channels,        n_heads,        n_layers,        kernel_size,        p_dropout,        f0=True):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@ -60,7 +83,7 @@ class TextEncoder256Sim(nn.Module):
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if(f0==True):
+        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, phone, pitch, lengths):
-        if(pitch==None):
+        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
        x = self.encoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x, x_mask
+
+
 class ResidualCouplingBlock(nn.Module):
    def __init__(
        self,
@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module):
    def remove_weight_norm(self):
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
+
+
 class PosteriorEncoder(nn.Module):
    def __init__(
        self,
@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
+
+
 class Generator(torch.nn.Module):
    def __init__(
        self,
@ -243,6 +274,8 @@ class Generator(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
+
+
 class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
        segment is always sin(np.pi) or cos(0)
    """

-    def __init__(self, samp_rate, harmonic_num=0,
-                 sine_amp=0.1, noise_std=0.003,
+    def __init__(
+        self,
+        samp_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        noise_std=0.003,
        voiced_threshold=0,
-                 flag_for_pulse=False):
+        flag_for_pulse=False,
+    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
@ -289,27 +327,47 @@ class SineGen(torch.nn.Module):
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
-            for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            for idx in np.arange(self.harmonic_num):
+                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                    idx + 2
+                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            rad_values = (f0_buf / self.sampling_rate) % 1  ###%1意味着n_har的乘积无法后处理优化
-            rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
+            rand_ini = torch.rand(
+                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  #####%1意味着后面的cumsum无法再优化
            tmp_over_one *= upp
-            tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
-            rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)#######
+            tmp_over_one = F.interpolate(
+                tmp_over_one.transpose(2, 1),
+                scale_factor=upp,
+                mode="linear",
+                align_corners=True,
+            ).transpose(2, 1)
+            rad_values = F.interpolate(
+                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(
+                2, 1
+            )  #######
            tmp_over_one %= 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-            sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+            sine_waves = torch.sin(
+                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
-            uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
+            uv = F.interpolate(
+                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(2, 1)
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
+
+
 class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
@ -328,16 +386,24 @@ class SourceModuleHnNSF(torch.nn.Module):
    uv (batchsize, length, 1)
    """

-    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0,is_half=True):
+    def __init__(
+        self,
+        sampling_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        add_noise_std=0.003,
+        voiced_threshod=0,
+        is_half=True,
+    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
-        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
-                                 sine_amp, add_noise_std, voiced_threshod)
+        self.l_sin_gen = SineGen(
+            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
+        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
@ -345,9 +411,12 @@ class SourceModuleHnNSF(torch.nn.Module):

    def forward(self, x, upp=None):
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
-        if(self.is_half):sine_wavs=sine_wavs.half()
+        if self.is_half:
+            sine_wavs = sine_wavs.half()
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
+
+
 class GeneratorNSF(torch.nn.Module):
    def __init__(
        self,
@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module):
        upsample_kernel_sizes,
        gin_channels,
        sr,
-        is_half=False
+        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
-            sampling_rate=sr,
-            harmonic_num=0,
-            is_half=is_half
+            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
@ -394,8 +461,15 @@ class GeneratorNSF(torch.nn.Module):
            )
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
-                self.noise_convs.append(Conv1d(
-                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
+
+
 sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
 }
+
+
 class SynthesizerTrnMs256NSFsid(nn.Module):
    def __init__(
        self,
@ -472,9 +550,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        sr,
        **kwargs
    ):
-
        super().__init__()
-        if(type(sr)==type("strr")):
+        if type(sr) == type("strr"):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
-            gin_channels=gin_channels,   sr=sr,         is_half=kwargs["is_half"]
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
@ -527,12 +606,15 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

-    def forward(self, phone, phone_lengths, pitch,pitchf, y, y_lengths,ds):#这里ds是id，[bs,1]
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # 这里ds是id，[bs,1]
        # print(1,pitch.shape)#[bs,t]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
@ -542,9 +624,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            z, y_lengths, self.segment_size
        )
        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
-        pitchf = commons.slice_segments2(
-            pitchf, ids_slice, self.segment_size
-        )
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        # print(-2,pitchf.shape,z_slice.shape)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
@ -556,6 +636,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
 class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    def __init__(
        self,
@ -579,7 +661,6 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
        sr=None,
        **kwargs
    ):
-
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@ -606,7 +687,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
            n_heads,
            n_layers,
            kernel_size,
-            p_dropout,f0=False
+            p_dropout,
+            f0=False,
        )
        self.dec = Generator(
            inter_channels,
@ -616,7 +698,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
-            gin_channels=gin_channels
+            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
@ -656,6 +738,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
 class SynthesizerTrnMs256NSFsid_sim(nn.Module):
    """
    Synthesizer for Training
@ -684,7 +768,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        use_sdp=True,
        **kwargs
    ):
-
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@ -721,7 +804,8 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
-            gin_channels=gin_channels,is_half=kwargs["is_half"]
+            gin_channels=gin_channels,
+            is_half=kwargs["is_half"],
        )

        self.flow = ResidualCouplingBlock(
@ -729,12 +813,15 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

-    def forward(self, phone, phone_lengths, pitch, pitchf, y_lengths,ds):  # y是spec不需要了现在
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y_lengths, ds
+    ):  # y是spec不需要了现在
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
        x = self.flow(x, x_mask, g=g, reverse=True)
@ -742,18 +829,20 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
            x, y_lengths, self.segment_size
        )

-        pitchf = commons.slice_segments2(
-            pitchf, ids_slice, self.segment_size
-        )
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice
-    def infer(self, phone, phone_lengths, pitch, pitchf, ds,max_len=None):  # y是spec不需要了现在
+
+    def infer(
+        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
+    ):  # y是spec不需要了现在
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
        x = self.flow(x, x_mask, g=g, reverse=True)
        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
        return o, o

+
 class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
@ -783,6 +872,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs

+
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
@ -812,6 +902,7 @@ class DiscriminatorS(torch.nn.Module):

        return x, fmap

+
 class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
@ -889,4 +980,3 @@ class DiscriminatorP(torch.nn.Module):
        x = torch.flatten(x, 1, -1)

        return x, fmap
-
--- a/infer_pack/models_onnx.py
+++ b/infer_pack/models_onnx.py
@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from infer_pack.commons import init_weights
 import numpy as np
 from infer_pack import commons
+
+
 class TextEncoder256(nn.Module):
    def __init__(
-        self,        out_channels,        hidden_channels,        filter_channels,        n_heads,        n_layers,        kernel_size,        p_dropout,        f0=True    ):
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@ -25,7 +36,7 @@ class TextEncoder256(nn.Module):
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if(f0==True):
+        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -33,7 +44,7 @@ class TextEncoder256(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
-        if(pitch==None):
+        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
@ -48,8 +59,20 @@ class TextEncoder256(nn.Module):

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
+
+
 class TextEncoder256Sim(nn.Module):
-    def __init__(        self,        out_channels,        hidden_channels,        filter_channels,        n_heads,        n_layers,        kernel_size,        p_dropout,        f0=True):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@ -60,7 +83,7 @@ class TextEncoder256Sim(nn.Module):
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if(f0==True):
+        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, phone, pitch, lengths):
-        if(pitch==None):
+        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
        x = self.encoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x, x_mask
+
+
 class ResidualCouplingBlock(nn.Module):
    def __init__(
        self,
@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module):
    def remove_weight_norm(self):
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
+
+
 class PosteriorEncoder(nn.Module):
    def __init__(
        self,
@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
+
+
 class Generator(torch.nn.Module):
    def __init__(
        self,
@ -243,6 +274,8 @@ class Generator(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
+
+
 class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
        segment is always sin(np.pi) or cos(0)
    """

-    def __init__(self, samp_rate, harmonic_num=0,
-                 sine_amp=0.1, noise_std=0.003,
+    def __init__(
+        self,
+        samp_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        noise_std=0.003,
        voiced_threshold=0,
-                 flag_for_pulse=False):
+        flag_for_pulse=False,
+    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
@ -289,27 +327,47 @@ class SineGen(torch.nn.Module):
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
-            for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            for idx in np.arange(self.harmonic_num):
+                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                    idx + 2
+                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            rad_values = (f0_buf / self.sampling_rate) % 1  ###%1意味着n_har的乘积无法后处理优化
-            rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
+            rand_ini = torch.rand(
+                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  #####%1意味着后面的cumsum无法再优化
            tmp_over_one *= upp
-            tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
-            rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)#######
+            tmp_over_one = F.interpolate(
+                tmp_over_one.transpose(2, 1),
+                scale_factor=upp,
+                mode="linear",
+                align_corners=True,
+            ).transpose(2, 1)
+            rad_values = F.interpolate(
+                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(
+                2, 1
+            )  #######
            tmp_over_one %= 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-            sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+            sine_waves = torch.sin(
+                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
-            uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
+            uv = F.interpolate(
+                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(2, 1)
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
+
+
 class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
@ -328,16 +386,24 @@ class SourceModuleHnNSF(torch.nn.Module):
    uv (batchsize, length, 1)
    """

-    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0,is_half=True):
+    def __init__(
+        self,
+        sampling_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        add_noise_std=0.003,
+        voiced_threshod=0,
+        is_half=True,
+    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
-        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
-                                 sine_amp, add_noise_std, voiced_threshod)
+        self.l_sin_gen = SineGen(
+            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
+        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
@ -345,9 +411,12 @@ class SourceModuleHnNSF(torch.nn.Module):

    def forward(self, x, upp=None):
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
-        if(self.is_half):sine_wavs=sine_wavs.half()
+        if self.is_half:
+            sine_wavs = sine_wavs.half()
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
+
+
 class GeneratorNSF(torch.nn.Module):
    def __init__(
        self,
@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module):
        upsample_kernel_sizes,
        gin_channels,
        sr,
-        is_half=False
+        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
-            sampling_rate=sr,
-            harmonic_num=0,
-            is_half=is_half
+            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
@ -394,8 +461,15 @@ class GeneratorNSF(torch.nn.Module):
            )
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
-                self.noise_convs.append(Conv1d(
-                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
+
+
 sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
 }
+
+
 class SynthesizerTrnMs256NSFsid(nn.Module):
    def __init__(
        self,
@ -472,9 +550,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        sr,
        **kwargs
    ):
-
        super().__init__()
-        if(type(sr)==type("strr")):
+        if type(sr) == type("strr"):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
-            gin_channels=gin_channels,   sr=sr,         is_half=kwargs["is_half"]
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
@ -527,13 +606,13 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
-    
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
@ -541,6 +620,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o

+
 class SynthesizerTrnMs256NSFsid_sim(nn.Module):
    """
    Synthesizer for Training
@ -569,7 +649,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        use_sdp=True,
        **kwargs
    ):
-
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@ -606,7 +685,8 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
-            gin_channels=gin_channels,is_half=kwargs["is_half"]
+            gin_channels=gin_channels,
+            is_half=kwargs["is_half"],
        )

        self.flow = ResidualCouplingBlock(
@ -614,18 +694,22 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

-    def forward(self, phone, phone_lengths, pitch, pitchf, ds,max_len=None):  # y是spec不需要了现在
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
+    ):  # y是spec不需要了现在
        g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
        x = self.flow(x, x_mask, g=g, reverse=True)
        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
        return o

+
 class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
@ -655,6 +739,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs

+
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
@ -684,6 +769,7 @@ class DiscriminatorS(torch.nn.Module):

        return x, fmap

+
 class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
@ -761,4 +847,3 @@ class DiscriminatorP(torch.nn.Module):
        x = torch.flatten(x, 1, -1)

        return x, fmap
-
--- a/infer_pack/transforms.py
+++ b/infer_pack/transforms.py
@ -9,26 +9,24 @@ DEFAULT_MIN_BIN_HEIGHT = 1e-3
 DEFAULT_MIN_DERIVATIVE = 1e-3


-def piecewise_rational_quadratic_transform(inputs, 
+def piecewise_rational_quadratic_transform(
+    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
-                                           tail_bound=1.,
+    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
-                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
-
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
    if tails is None:
        spline_fn = rational_quadratic_spline
        spline_kwargs = {}
    else:
        spline_fn = unconstrained_rational_quadratic_spline
-        spline_kwargs = {
-            'tails': tails,
-            'tail_bound': tail_bound
-        }
+        spline_kwargs = {"tails": tails, "tail_bound": tail_bound}

    outputs, logabsdet = spline_fn(
        inputs=inputs,
@ -46,29 +44,28 @@ def piecewise_rational_quadratic_transform(inputs,

 def searchsorted(bin_locations, inputs, eps=1e-6):
    bin_locations[..., -1] += eps
-    return torch.sum(
-        inputs[..., None] >= bin_locations,
-        dim=-1
-    ) - 1
+    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1


-def unconstrained_rational_quadratic_spline(inputs,
+def unconstrained_rational_quadratic_spline(
+    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
-                                            tails='linear',
-                                            tail_bound=1.,
+    tails="linear",
+    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
-                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

-    if tails == 'linear':
+    if tails == "linear":
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
@ -77,45 +74,57 @@ def unconstrained_rational_quadratic_spline(inputs,
        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
-        raise RuntimeError('{} tails are not implemented.'.format(tails))
+        raise RuntimeError("{} tails are not implemented.".format(tails))

-    outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
+    (
+        outputs[inside_interval_mask],
+        logabsdet[inside_interval_mask],
+    ) = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
-        left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
+        left=-tail_bound,
+        right=tail_bound,
+        bottom=-tail_bound,
+        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
-        min_derivative=min_derivative
+        min_derivative=min_derivative,
    )

    return outputs, logabsdet

-def rational_quadratic_spline(inputs,
+
+def rational_quadratic_spline(
+    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
-                              left=0., right=1., bottom=0., top=1.,
+    left=0.0,
+    right=1.0,
+    bottom=0.0,
+    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
-                              min_derivative=DEFAULT_MIN_DERIVATIVE):
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
    if torch.min(inputs) < left or torch.max(inputs) > right:
-        raise ValueError('Input to a transform is not within its domain')
+        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
-        raise ValueError('Minimal bin width too large for the number of bins')
+        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
-        raise ValueError('Minimal bin height too large for the number of bins')
+        raise ValueError("Minimal bin height too large for the number of bins")

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
-    cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
+    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
@ -126,7 +135,7 @@ def rational_quadratic_spline(inputs,
    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
-    cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
+    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
@ -150,14 +159,12 @@ def rational_quadratic_spline(inputs,
    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
-        a = (((inputs - input_cumheights) * (input_derivatives
-                                             + input_derivatives_plus_one
-                                             - 2 * input_delta)
-              + input_heights * (input_delta - input_derivatives)))
-        b = (input_heights * input_derivatives
-             - (inputs - input_cumheights) * (input_derivatives
-                                              + input_derivatives_plus_one
-                                              - 2 * input_delta))
+        a = (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        ) + input_heights * (input_delta - input_derivatives)
+        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
@ -167,11 +174,15 @@ def rational_quadratic_spline(inputs,
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
-        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
-                                     * theta_one_minus_theta)
-        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
-                                                     + input_derivatives * (1 - root).pow(2))
+            + input_derivatives * (1 - root).pow(2)
+        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
@ -179,15 +190,20 @@ def rational_quadratic_spline(inputs,
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

-        numerator = input_heights * (input_delta * theta.pow(2)
-                                     + input_derivatives * theta_one_minus_theta)
-        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
-                                     * theta_one_minus_theta)
+        numerator = input_heights * (
+            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+        )
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
        outputs = input_cumheights + numerator / denominator

-        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * theta.pow(2)
            + 2 * input_delta * theta_one_minus_theta
-                                                     + input_derivatives * (1 - theta).pow(2))
+            + input_derivatives * (1 - theta).pow(2)
+        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet
--- a/infer_uvr5.py
+++ b/infer_uvr5.py
@ -1,4 +1,5 @@
 import os, sys, torch, warnings, pdb
+
 warnings.filterwarnings("ignore")
 import librosa
 import importlib
@ -10,99 +11,161 @@ from uvr5_pack.utils import _get_name_params,inference
 from uvr5_pack.lib_v5.model_param_init import ModelParameters
 from scipy.io import wavfile

-class  _audio_pre_():
+
+class _audio_pre_:
    def __init__(self, model_path, device, is_half):
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
-            'postprocess': False,
-            'tta': False,
+            "postprocess": False,
+            "tta": False,
            # Constants
-            'window_size': 512,
-            'agg': 10,
-            'high_end_process': 'mirroring',
+            "window_size": 512,
+            "agg": 10,
+            "high_end_process": "mirroring",
        }
        nn_arch_sizes = [
            31191,  # default
-            33966,61968, 123821, 123812, 537238 # custom
+            33966,
+            61968,
+            123821,
+            123812,
+            537238,  # custom
        ]
-        self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
+        self.nn_architecture = list("{}KB".format(s) for s in nn_arch_sizes)
        model_size = math.ceil(os.stat(model_path).st_size / 1024)
-        nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))
-        nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
-        model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest()
+        nn_architecture = "{}KB".format(
+            min(nn_arch_sizes, key=lambda x: abs(x - model_size))
+        )
+        nets = importlib.import_module(
+            "uvr5_pack.lib_v5.nets"
+            + f"_{nn_architecture}".replace("_{}KB".format(nn_arch_sizes[0]), ""),
+            package=None,
+        )
+        model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
        param_name, model_params_d = _get_name_params(model_path, model_hash)

        mp = ModelParameters(model_params_d)
-        model = nets.CascadedASPPNet(mp.param['bins'] * 2)
-        cpk = torch.load( model_path , map_location='cpu')  
+        model = nets.CascadedASPPNet(mp.param["bins"] * 2)
+        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()
-        if(is_half):model = model.half().to(device)
-        else:model = model.to(device)
+        if is_half:
+            model = model.half().to(device)
+        else:
+            model = model.to(device)

        self.mp = mp
        self.model = model

    def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
-        if(ins_root is None and vocal_root is None):return "No save root."
+        if ins_root is None and vocal_root is None:
+            return "No save root."
        name = os.path.basename(music_file)
-        if(ins_root is not None):os.makedirs(ins_root, exist_ok=True)
-        if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True)
+        if ins_root is not None:
+            os.makedirs(ins_root, exist_ok=True)
+        if vocal_root is not None:
+            os.makedirs(vocal_root, exist_ok=True)
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
-        bands_n = len(self.mp.param['band'])
+        bands_n = len(self.mp.param["band"])
        # print(bands_n)
        for d in range(bands_n, 0, -1):
-            bp = self.mp.param['band'][d]
+            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
-                X_wave[d], _ = librosa.core.load(#理论上librosa读取可能对某些音频有bug，应该上ffmpeg读取，但是太麻烦了弃坑
-                    music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # 理论上librosa读取可能对某些音频有bug，应该上ffmpeg读取，但是太麻烦了弃坑
+                    music_file,
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
                if X_wave[d].ndim == 1:
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
-                X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    self.mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )
            # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
+            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                X_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                self.mp.param["mid_side"],
+                self.mp.param["mid_side_b2"],
+                self.mp.param["reverse"],
+            )
            # pdb.set_trace()
-            if d == bands_n and self.data['high_end_process'] != 'none':
-                input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
-                input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
+            if d == bands_n and self.data["high_end_process"] != "none":
+                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                )
+                input_high_end = X_spec_s[d][
+                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                ]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
-        aggresive_set = float(self.data['agg']/100)
-        aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
+        aggresive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggresive_set,
+            "split_bin": self.mp.param["band"][1]["crop_stop"],
+        }
        with torch.no_grad():
-            pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data)
+            pred, X_mag, X_phase = inference(
+                X_spec_m, self.device, self.model, aggressiveness, self.data
+            )
        # Postprocess
-        if self.data['postprocess']:
+        if self.data["postprocess"]:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

-        if (ins_root is not None):
-            if self.data['high_end_process'].startswith('mirroring'):
-                input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_)
+        if ins_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                )
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                    y_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
            else:
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
-            print ('%s instruments done'%name)
-            wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16"))  #
-        if (vocal_root is not None):
-            if self.data['high_end_process'].startswith('mirroring'):
-                input_high_end_ = spec_utils.mirroring(self.data['high_end_process'],  v_spec_m, input_high_end, self.mp)
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
+            print("%s instruments done" % name)
+            wavfile.write(
+                os.path.join(ins_root, "instrument_{}.wav".format(name)),
+                self.mp.param["sr"],
+                (np.array(wav_instrument) * 32768).astype("int16"),
+            )  #
+        if vocal_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                )
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                    v_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
-            print ('%s vocals done'%name)
-            wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16"))
+            print("%s vocals done" % name)
+            wavfile.write(
+                os.path.join(vocal_root, "vocal_{}.wav".format(name)),
+                self.mp.param["sr"],
+                (np.array(wav_vocals) * 32768).astype("int16"),
+            )

-if __name__ == '__main__':
-    device = 'cuda'
+
+if __name__ == "__main__":
+    device = "cuda"
    is_half = True
-    model_path='uvr5_weights/2_HP-UVR.pth'
+    model_path = "uvr5_weights/2_HP-UVR.pth"
    pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)
-    audio_path = '神女劈观.aac'
-    save_path = 'opt'
+    audio_path = "神女劈观.aac"
+    save_path = "opt"
    pre_fun._path_audio_(audio_path, save_path, save_path)
--- a/locale/locale_diff.py
+++ b/locale/locale_diff.py
@ -31,7 +31,9 @@ for lang_file in languages:
        del lang_data[key]

    # Sort the keys of the language file to match the order of the standard file
-    lang_data = OrderedDict(sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])))
+    lang_data = OrderedDict(
+        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
+    )

    # Save the updated language file
    with open(lang_file, "w", encoding="utf-8") as f:
--- a/my_utils.py
+++ b/my_utils.py
@ -1,11 +1,15 @@
 import ffmpeg
 import numpy as np
+
+
 def load_audio(file, sr):
    try:
        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        file=file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")#防止小白拷路径头尾带了空格和"和回车
+        file = (
+            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        )  # 防止小白拷路径头尾带了空格和"和回车
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
--- a/slicer2.py
+++ b/slicer2.py
@ -18,9 +18,7 @@ def get_rms(
    x_shape_trimmed = list(y.shape)
    x_shape_trimmed[axis] -= frame_length - 1
    out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
-    xw = np.lib.stride_tricks.as_strided(
-        y, shape=out_shape, strides=out_strides
-    )
+    xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
    if axis < 0:
        target_axis = axis - 1
    else:
@ -38,19 +36,25 @@ def get_rms(


 class Slicer:
-    def __init__(self,
+    def __init__(
+        self,
        sr: int,
-                 threshold: float = -40.,
+        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
-                 max_sil_kept: int = 5000):
+        max_sil_kept: int = 5000,
+    ):
        if not min_length >= min_interval >= hop_size:
-            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
+            raise ValueError(
+                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
+            )
        if not max_sil_kept >= hop_size:
-            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
+            raise ValueError(
+                "The following condition must be satisfied: max_sil_kept >= hop_size"
+            )
        min_interval = sr * min_interval / 1000
-        self.threshold = 10 ** (threshold / 20.)
+        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
@ -59,9 +63,13 @@ class Slicer:

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
-            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
+            return waveform[
+                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
+            ]
        else:
-            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
+            return waveform[
+                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
+            ]

    # @timeit
    def slice(self, waveform):
@ -71,7 +79,9 @@ class Slicer:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return [waveform]
-        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
+        rms_list = get_rms(
+            y=samples, frame_length=self.win_size, hop_length=self.hop_size
+        ).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
@ -87,7 +97,10 @@ class Slicer:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
-            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
+            need_slice_middle = (
+                i - silence_start >= self.min_interval
+                and i - clip_start >= self.min_length
+            )
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
@ -100,10 +113,21 @@ class Slicer:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
-                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
+                pos = rms_list[
+                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
+                ].argmin()
                pos += i - self.max_sil_kept
-                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
-                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+                pos_l = (
+                    rms_list[
+                        silence_start : silence_start + self.max_sil_kept + 1
+                    ].argmin()
+                    + silence_start
+                )
+                pos_r = (
+                    rms_list[i - self.max_sil_kept : i + 1].argmin()
+                    + i
+                    - self.max_sil_kept
+                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
@ -111,8 +135,17 @@ class Slicer:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
-                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
-                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+                pos_l = (
+                    rms_list[
+                        silence_start : silence_start + self.max_sil_kept + 1
+                    ].argmin()
+                    + silence_start
+                )
+                pos_r = (
+                    rms_list[i - self.max_sil_kept : i + 1].argmin()
+                    + i
+                    - self.max_sil_kept
+                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
@ -121,7 +154,10 @@ class Slicer:
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
-        if silence_start is not None and total_frames - silence_start >= self.min_interval:
+        if (
+            silence_start is not None
+            and total_frames - silence_start >= self.min_interval
+        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
@ -133,9 +169,13 @@ class Slicer:
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
-                chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
+                chunks.append(
+                    self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
+                )
            if sil_tags[-1][1] < total_frames:
-                chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
+                chunks.append(
+                    self._apply_slice(waveform, sil_tags[-1][1], total_frames)
+                )
            return chunks


@ -147,18 +187,45 @@ def main():
    import soundfile

    parser = ArgumentParser()
-    parser.add_argument('audio', type=str, help='The audio to be sliced')
-    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
-    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
-                        help='The dB threshold for silence detection')
-    parser.add_argument('--min_length', type=int, required=False, default=5000,
-                        help='The minimum milliseconds required for each sliced audio clip')
-    parser.add_argument('--min_interval', type=int, required=False, default=300,
-                        help='The minimum milliseconds for a silence part to be sliced')
-    parser.add_argument('--hop_size', type=int, required=False, default=10,
-                        help='Frame length in milliseconds')
-    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
-                        help='The maximum silence length kept around the sliced clip, presented in milliseconds')
+    parser.add_argument("audio", type=str, help="The audio to be sliced")
+    parser.add_argument(
+        "--out", type=str, help="Output directory of the sliced audio clips"
+    )
+    parser.add_argument(
+        "--db_thresh",
+        type=float,
+        required=False,
+        default=-40,
+        help="The dB threshold for silence detection",
+    )
+    parser.add_argument(
+        "--min_length",
+        type=int,
+        required=False,
+        default=5000,
+        help="The minimum milliseconds required for each sliced audio clip",
+    )
+    parser.add_argument(
+        "--min_interval",
+        type=int,
+        required=False,
+        default=300,
+        help="The minimum milliseconds for a silence part to be sliced",
+    )
+    parser.add_argument(
+        "--hop_size",
+        type=int,
+        required=False,
+        default=10,
+        help="Frame length in milliseconds",
+    )
+    parser.add_argument(
+        "--max_sil_kept",
+        type=int,
+        required=False,
+        default=500,
+        help="The maximum silence length kept around the sliced clip, presented in milliseconds",
+    )
    args = parser.parse_args()
    out = args.out
    if out is None:
@ -170,7 +237,7 @@ def main():
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
-        max_sil_kept=args.max_sil_kept
+        max_sil_kept=args.max_sil_kept,
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):
@ -178,8 +245,16 @@ def main():
    for i, chunk in enumerate(chunks):
        if len(chunk.shape) > 1:
            chunk = chunk.T
-        soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)
+        soundfile.write(
+            os.path.join(
+                out,
+                f"%s_%d.wav"
+                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+            ),
+            chunk,
+            sr,
+        )


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/train/data_utils.py
+++ b/train/data_utils.py
@ -6,6 +6,7 @@ import torch.utils.data
 from mel_processing import spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text

+
 class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
@ -40,6 +41,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths
+
    def get_sid(self, sid):
        sid = torch.LongTensor([int(sid)])
        return sid
@ -104,9 +106,14 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
                spec = torch.load(spec_filename)
            except:
                print(spec_filename, traceback.format_exc())
-                spec = spectrogram_torch(audio_norm, self.filter_length,
-                                         self.sampling_rate, self.hop_length, self.win_length,
-                                         center=False)
+                spec = spectrogram_torch(
+                    audio_norm,
+                    self.filter_length,
+                    self.sampling_rate,
+                    self.hop_length,
+                    self.win_length,
+                    center=False,
+                )
                spec = torch.squeeze(spec, 0)
                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        else:
@ -127,6 +134,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):

    def __len__(self):
        return len(self.audiopaths_and_text)
+
+
 class TextAudioCollateMultiNSFsid:
    """Zero-pads model inputs and targets"""

@ -155,7 +164,9 @@ class TextAudioCollateMultiNSFsid:

        max_phone_len = max([x[2].size(0) for x in batch])
        phone_lengths = torch.LongTensor(len(batch))
-        phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])#(spec, wav, phone, pitch)
+        phone_padded = torch.FloatTensor(
+            len(batch), max_phone_len, batch[0][2].shape[1]
+        )  # (spec, wav, phone, pitch)
        pitch_padded = torch.LongTensor(len(batch), max_phone_len)
        pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
        phone_padded.zero_()
@ -187,7 +198,6 @@ class TextAudioCollateMultiNSFsid:
            # dv[i] = row[5]
            sid[i] = row[5]

-
        return (
            phone_padded,
            phone_lengths,
@ -198,9 +208,10 @@ class TextAudioCollateMultiNSFsid:
            wave_padded,
            wave_lengths,
            # dv
-            sid
+            sid,
        )

+
 class TextAudioLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
@ -235,6 +246,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths
+
    def get_sid(self, sid):
        sid = torch.LongTensor([int(sid)])
        return sid
@ -283,9 +295,14 @@ class TextAudioLoader(torch.utils.data.Dataset):
                spec = torch.load(spec_filename)
            except:
                print(spec_filename, traceback.format_exc())
-                spec = spectrogram_torch(audio_norm, self.filter_length,
-                                         self.sampling_rate, self.hop_length, self.win_length,
-                                         center=False)
+                spec = spectrogram_torch(
+                    audio_norm,
+                    self.filter_length,
+                    self.sampling_rate,
+                    self.hop_length,
+                    self.win_length,
+                    center=False,
+                )
                spec = torch.squeeze(spec, 0)
                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        else:
@ -306,6 +323,8 @@ class TextAudioLoader(torch.utils.data.Dataset):

    def __len__(self):
        return len(self.audiopaths_and_text)
+
+
 class TextAudioCollate:
    """Zero-pads model inputs and targets"""

@ -334,7 +353,9 @@ class TextAudioCollate:

        max_phone_len = max([x[2].size(0) for x in batch])
        phone_lengths = torch.LongTensor(len(batch))
-        phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])
+        phone_padded = torch.FloatTensor(
+            len(batch), max_phone_len, batch[0][2].shape[1]
+        )
        phone_padded.zero_()
        sid = torch.LongTensor(len(batch))

@ -355,7 +376,6 @@ class TextAudioCollate:

            sid[i] = row[3]

-
        return (
            phone_padded,
            phone_lengths,
@ -363,9 +383,10 @@ class TextAudioCollate:
            spec_lengths,
            wave_padded,
            wave_lengths,
-            sid
+            sid,
        )

+
 class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
--- a/train/losses.py
+++ b/train/losses.py
@ -1,6 +1,7 @@
 import torch
 from torch.nn import functional as F

+
 def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
--- a/train/mel_processing.py
+++ b/train/mel_processing.py
@ -78,7 +78,8 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
        center=center,
        pad_mode="reflect",
        normalized=False,
-        onesided=True,return_complex=False
+        onesided=True,
+        return_complex=False,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
@ -139,8 +140,18 @@ def mel_spectrogram_torch(
    #     normalized=False,
    #     onesided=True,
    # )
-    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[wnsize_dtype_device],
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=False,
+    )
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
--- a/train/process_ckpt.py
+++ b/train/process_ckpt.py
@ -1,16 +1,78 @@
 import torch, traceback, os, pdb
 from collections import OrderedDict

+
 def savee(ckpt, sr, if_f0, name, epoch):
    try:
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
-            if ("enc_q" in key): continue
+            if "enc_q" in key:
+                continue
            opt["weight"][key] = ckpt[key].half()
-        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
-        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000]
-        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
+        if sr == "40k":
+            opt["config"] = [
+                1025,
+                32,
+                192,
+                192,
+                768,
+                2,
+                6,
+                3,
+                0,
+                "1",
+                [3, 7, 11],
+                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 10, 2, 2],
+                512,
+                [16, 16, 4, 4],
+                109,
+                256,
+                40000,
+            ]
+        elif sr == "48k":
+            opt["config"] = [
+                1025,
+                32,
+                192,
+                192,
+                768,
+                2,
+                6,
+                3,
+                0,
+                "1",
+                [3, 7, 11],
+                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 6, 2, 2, 2],
+                512,
+                [16, 16, 4, 4, 4],
+                109,
+                256,
+                48000,
+            ]
+        elif sr == "32k":
+            opt["config"] = [
+                513,
+                32,
+                192,
+                192,
+                768,
+                2,
+                6,
+                3,
+                0,
+                "1",
+                [3, 7, 11],
+                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 4, 2, 2, 2],
+                512,
+                [16, 16, 4, 4, 4],
+                109,
+                256,
+                32000,
+            ]
        opt["info"] = "%sepoch" % epoch
        opt["sr"] = sr
        opt["f0"] = if_f0
@ -19,26 +81,95 @@ def savee(ckpt,sr,if_f0,name,epoch):
    except:
        return traceback.format_exc()

+
 def show_info(path):
    try:
        a = torch.load(path, map_location="cpu")
-        return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s"%(a.get("info","None"),a.get("sr","None"),a.get("f0","None"),)
+        return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s" % (
+            a.get("info", "None"),
+            a.get("sr", "None"),
+            a.get("f0", "None"),
+        )
    except:
        return traceback.format_exc()

+
 def extract_small_model(path, name, sr, if_f0, info):
    try:
        ckpt = torch.load(path, map_location="cpu")
-        if("model"in ckpt):ckpt=ckpt["model"]
+        if "model" in ckpt:
+            ckpt = ckpt["model"]
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
-            if ("enc_q" in key): continue
+            if "enc_q" in key:
+                continue
            opt["weight"][key] = ckpt[key].half()
-        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
-        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000]
-        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
-        if(info==""):info="Extracted model."
+        if sr == "40k":
+            opt["config"] = [
+                1025,
+                32,
+                192,
+                192,
+                768,
+                2,
+                6,
+                3,
+                0,
+                "1",
+                [3, 7, 11],
+                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 10, 2, 2],
+                512,
+                [16, 16, 4, 4],
+                109,
+                256,
+                40000,
+            ]
+        elif sr == "48k":
+            opt["config"] = [
+                1025,
+                32,
+                192,
+                192,
+                768,
+                2,
+                6,
+                3,
+                0,
+                "1",
+                [3, 7, 11],
+                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 6, 2, 2, 2],
+                512,
+                [16, 16, 4, 4, 4],
+                109,
+                256,
+                48000,
+            ]
+        elif sr == "32k":
+            opt["config"] = [
+                513,
+                32,
+                192,
+                192,
+                768,
+                2,
+                6,
+                3,
+                0,
+                "1",
+                [3, 7, 11],
+                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 4, 2, 2, 2],
+                512,
+                [16, 16, 4, 4, 4],
+                109,
+                256,
+                32000,
+            ]
+        if info == "":
+            info = "Extracted model."
        opt["info"] = info
        opt["sr"] = sr
        opt["f0"] = int(if_f0)
@ -47,51 +178,67 @@ def extract_small_model(path,name,sr,if_f0,info):
    except:
        return traceback.format_exc()

+
 def change_info(path, info, name):
    try:
        ckpt = torch.load(path, map_location="cpu")
        ckpt["info"] = info
-        if(name==""):name=os.path.basename(path)
+        if name == "":
+            name = os.path.basename(path)
        torch.save(ckpt, "weights/%s" % name)
        return "Success."
    except:
        return traceback.format_exc()

+
 def merge(path1, path2, alpha1, sr, f0, info, name):
    try:
+
        def extract(ckpt):
            a = ckpt["model"]
            opt = OrderedDict()
            opt["weight"] = {}
            for key in a.keys():
-                if ("enc_q" in key): continue
+                if "enc_q" in key:
+                    continue
                opt["weight"][key] = a[key]
            return opt
+
        ckpt1 = torch.load(path1, map_location="cpu")
        ckpt2 = torch.load(path2, map_location="cpu")
        cfg = ckpt1["config"]
-        if("model"in ckpt1): ckpt1=extract(ckpt1)
-        else: ckpt1=ckpt1["weight"]
-        if("model"in ckpt2): ckpt2=extract(ckpt2)
-        else: ckpt2=ckpt2["weight"]
-        if(sorted(list(ckpt1.keys()))!=sorted(list(ckpt2.keys()))):return "Fail to merge the models. The model architectures are not the same."
+        if "model" in ckpt1:
+            ckpt1 = extract(ckpt1)
+        else:
+            ckpt1 = ckpt1["weight"]
+        if "model" in ckpt2:
+            ckpt2 = extract(ckpt2)
+        else:
+            ckpt2 = ckpt2["weight"]
+        if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
+            return "Fail to merge the models. The model architectures are not the same."
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt1.keys():
            # try:
-                if(key=="emb_g.weight"and ckpt1[key].shape!=ckpt2[key].shape):
+            if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
                min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
-                    opt["weight"][key] = (alpha1 * (ckpt1[key][:min_shape0].float()) + (1 - alpha1) * (ckpt2[key][:min_shape0].float())).half()
+                opt["weight"][key] = (
+                    alpha1 * (ckpt1[key][:min_shape0].float())
+                    + (1 - alpha1) * (ckpt2[key][:min_shape0].float())
+                ).half()
            else:
-                    opt["weight"][key] = (alpha1*(ckpt1[key].float())+(1-alpha1)*(ckpt2[key].float())).half()
+                opt["weight"][key] = (
+                    alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
+                ).half()
        # except:
        #     pdb.set_trace()
        opt["config"] = cfg
-        '''
+        """
        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
-        '''
+        """
        opt["sr"] = sr
        opt["f0"] = 1 if f0 == "是" else 0
        opt["info"] = info
--- a/train/utils.py
+++ b/train/utils.py
@ -14,40 +14,49 @@ MATPLOTLIB_FLAG = False
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 logger = logging

+
 def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
    assert os.path.isfile(checkpoint_path)
-  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")

    ##################
    def go(model, bkey):
        saved_state_dict = checkpoint_dict[bkey]
-    if hasattr(model, 'module'):state_dict = model.module.state_dict()
-    else:state_dict = model.state_dict()
+        if hasattr(model, "module"):
+            state_dict = model.module.state_dict()
+        else:
+            state_dict = model.state_dict()
        new_state_dict = {}
        for k, v in state_dict.items():  # 模型需要的shape
            try:
                new_state_dict[k] = saved_state_dict[k]
-        if(saved_state_dict[k].shape!=state_dict[k].shape):
-          print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
+                if saved_state_dict[k].shape != state_dict[k].shape:
+                    print(
+                        "shape-%s-mismatch|need-%s|get-%s"
+                        % (k, state_dict[k].shape, saved_state_dict[k].shape)
+                    )  #
                    raise KeyError
            except:
                # logger.info(traceback.format_exc())
                logger.info("%s is not in the checkpoint" % k)  # pretrain缺失的
                new_state_dict[k] = v  # 模型自带的随机值
-    if hasattr(model, 'module'):
+        if hasattr(model, "module"):
            model.module.load_state_dict(new_state_dict, strict=False)
        else:
            model.load_state_dict(new_state_dict, strict=False)
+
    go(combd, "combd")
    go(sbd, "sbd")
    #############
    logger.info("Loaded model weights")

-  iteration = checkpoint_dict['iteration']
-  learning_rate = checkpoint_dict['learning_rate']
-  if optimizer is not None and load_opt==1:###加载不了，如果是空的的话，重新初始化，可能还会影响lr时间表的更新，因此在train文件最外围catch
+    iteration = checkpoint_dict["iteration"]
+    learning_rate = checkpoint_dict["learning_rate"]
+    if (
+        optimizer is not None and load_opt == 1
+    ):  ###加载不了，如果是空的的话，重新初始化，可能还会影响lr时间表的更新，因此在train文件最外围catch
        #   try:
-      optimizer.load_state_dict(checkpoint_dict['optimizer'])
+        optimizer.load_state_dict(checkpoint_dict["optimizer"])
    #   except:
    #     traceback.print_exc()
    logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
@ -85,10 +94,10 @@ def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1):
 #   return model, optimizer, learning_rate, iteration
 def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
    assert os.path.isfile(checkpoint_path)
-  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")

-  saved_state_dict = checkpoint_dict['model']
-  if hasattr(model, 'module'):
+    saved_state_dict = checkpoint_dict["model"]
+    if hasattr(model, "module"):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
@ -96,24 +105,29 @@ def load_checkpoint(checkpoint_path, model, optimizer=None,load_opt=1):
    for k, v in state_dict.items():  # 模型需要的shape
        try:
            new_state_dict[k] = saved_state_dict[k]
-      if(saved_state_dict[k].shape!=state_dict[k].shape):
-        print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
+            if saved_state_dict[k].shape != state_dict[k].shape:
+                print(
+                    "shape-%s-mismatch|need-%s|get-%s"
+                    % (k, state_dict[k].shape, saved_state_dict[k].shape)
+                )  #
                raise KeyError
        except:
            # logger.info(traceback.format_exc())
            logger.info("%s is not in the checkpoint" % k)  # pretrain缺失的
            new_state_dict[k] = v  # 模型自带的随机值
-  if hasattr(model, 'module'):
+    if hasattr(model, "module"):
        model.module.load_state_dict(new_state_dict, strict=False)
    else:
        model.load_state_dict(new_state_dict, strict=False)
    logger.info("Loaded model weights")

-  iteration = checkpoint_dict['iteration']
-  learning_rate = checkpoint_dict['learning_rate']
-  if optimizer is not None and load_opt==1:###加载不了，如果是空的的话，重新初始化，可能还会影响lr时间表的更新，因此在train文件最外围catch
+    iteration = checkpoint_dict["iteration"]
+    learning_rate = checkpoint_dict["learning_rate"]
+    if (
+        optimizer is not None and load_opt == 1
+    ):  ###加载不了，如果是空的的话，重新初始化，可能还会影响lr时间表的更新，因此在train文件最外围catch
        #   try:
-      optimizer.load_state_dict(checkpoint_dict['optimizer'])
+        optimizer.load_state_dict(checkpoint_dict["optimizer"])
    #   except:
    #     traceback.print_exc()
    logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
@ -121,38 +135,67 @@ def load_checkpoint(checkpoint_path, model, optimizer=None,load_opt=1):


 def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
-  logger.info("Saving model and optimizer state at epoch {} to {}".format(
-    iteration, checkpoint_path))
-  if hasattr(model, 'module'):
+    logger.info(
+        "Saving model and optimizer state at epoch {} to {}".format(
+            iteration, checkpoint_path
+        )
+    )
+    if hasattr(model, "module"):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
-  torch.save({'model': state_dict,
-              'iteration': iteration,
-              'optimizer': optimizer.state_dict(),
-              'learning_rate': learning_rate}, checkpoint_path)
+    torch.save(
+        {
+            "model": state_dict,
+            "iteration": iteration,
+            "optimizer": optimizer.state_dict(),
+            "learning_rate": learning_rate,
+        },
+        checkpoint_path,
+    )
+
+
 def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
-  logger.info("Saving model and optimizer state at epoch {} to {}".format(
-    iteration, checkpoint_path))
-  if hasattr(combd, 'module'): state_dict_combd = combd.module.state_dict()
-  else:state_dict_combd = combd.state_dict()
-  if hasattr(sbd, 'module'): state_dict_sbd = sbd.module.state_dict()
-  else:state_dict_sbd = sbd.state_dict()
-  torch.save({
-              'combd': state_dict_combd,
-              'sbd': state_dict_sbd,
-              'iteration': iteration,
-              'optimizer': optimizer.state_dict(),
-              'learning_rate': learning_rate}, checkpoint_path)
+    logger.info(
+        "Saving model and optimizer state at epoch {} to {}".format(
+            iteration, checkpoint_path
+        )
+    )
+    if hasattr(combd, "module"):
+        state_dict_combd = combd.module.state_dict()
+    else:
+        state_dict_combd = combd.state_dict()
+    if hasattr(sbd, "module"):
+        state_dict_sbd = sbd.module.state_dict()
+    else:
+        state_dict_sbd = sbd.state_dict()
+    torch.save(
+        {
+            "combd": state_dict_combd,
+            "sbd": state_dict_sbd,
+            "iteration": iteration,
+            "optimizer": optimizer.state_dict(),
+            "learning_rate": learning_rate,
+        },
+        checkpoint_path,
+    )


-def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
+def summarize(
+    writer,
+    global_step,
+    scalars={},
+    histograms={},
+    images={},
+    audios={},
+    audio_sampling_rate=22050,
+):
    for k, v in scalars.items():
        writer.add_scalar(k, v, global_step)
    for k, v in histograms.items():
        writer.add_histogram(k, v, global_step)
    for k, v in images.items():
-    writer.add_image(k, v, global_step, dataformats='HWC')
+        writer.add_image(k, v, global_step, dataformats="HWC")
    for k, v in audios.items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)

@ -169,23 +212,23 @@ def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
+
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
-    mpl_logger = logging.getLogger('matplotlib')
+        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
-  im = ax.imshow(spectrogram, aspect="auto", origin="lower",
-                  interpolation='none')
+    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
-  data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
+    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data
@ -195,26 +238,28 @@ def plot_alignment_to_numpy(alignment, info=None):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
+
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
-    mpl_logger = logging.getLogger('matplotlib')
+        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
-  im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
-                  interpolation='none')
+    im = ax.imshow(
+        alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
+    )
    fig.colorbar(im, ax=ax)
-  xlabel = 'Decoder timestep'
+    xlabel = "Decoder timestep"
    if info is not None:
-      xlabel += '\n\n' + info
+        xlabel += "\n\n" + info
    plt.xlabel(xlabel)
-  plt.ylabel('Encoder timestep')
+    plt.ylabel("Encoder timestep")
    plt.tight_layout()

    fig.canvas.draw()
-  data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
+    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data
@ -226,13 +271,13 @@ def load_wav_to_torch(full_path):


 def load_filepaths_and_text(filename, split="|"):
-  with open(filename, encoding='utf-8') as f:
+    with open(filename, encoding="utf-8") as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


 def get_hparams(init=True):
-  '''
+    """
    todo:
      结尾七人组：
        保存频率、总epoch                     done
@ -247,20 +292,56 @@ todo:
      -m:
        自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files    done
      -c不要了
-  '''
+    """
    parser = argparse.ArgumentParser()
    # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
-  parser.add_argument('-se', '--save_every_epoch', type=int, required=True,help='checkpoint save frequency (epoch)')
-  parser.add_argument('-te', '--total_epoch', type=int, required=True,help='total_epoch')
-  parser.add_argument('-pg', '--pretrainG', type=str, default="",help='Pretrained Discriminator path')
-  parser.add_argument('-pd', '--pretrainD', type=str, default="",help='Pretrained Generator path')
-  parser.add_argument('-g', '--gpus', type=str, default="0",help='split by -')
-  parser.add_argument('-bs', '--batch_size', type=int, required=True,help='batch size')
-  parser.add_argument('-e', '--experiment_dir', type=str, required=True,help='experiment dir')#-m
-  parser.add_argument('-sr', '--sample_rate', type=str, required=True,help='sample rate, 32k/40k/48k')
-  parser.add_argument('-f0', '--if_f0', type=int, required=True,help='use f0 as one of the inputs of the model, 1 or 0')
-  parser.add_argument('-l', '--if_latest', type=int, required=True,help='if only save the latest G/D pth file, 1 or 0')
-  parser.add_argument('-c', '--if_cache_data_in_gpu', type=int, required=True,help='if caching the dataset in GPU memory, 1 or 0')
+    parser.add_argument(
+        "-se",
+        "--save_every_epoch",
+        type=int,
+        required=True,
+        help="checkpoint save frequency (epoch)",
+    )
+    parser.add_argument(
+        "-te", "--total_epoch", type=int, required=True, help="total_epoch"
+    )
+    parser.add_argument(
+        "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path"
+    )
+    parser.add_argument(
+        "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path"
+    )
+    parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
+    parser.add_argument(
+        "-bs", "--batch_size", type=int, required=True, help="batch size"
+    )
+    parser.add_argument(
+        "-e", "--experiment_dir", type=str, required=True, help="experiment dir"
+    )  # -m
+    parser.add_argument(
+        "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
+    )
+    parser.add_argument(
+        "-f0",
+        "--if_f0",
+        type=int,
+        required=True,
+        help="use f0 as one of the inputs of the model, 1 or 0",
+    )
+    parser.add_argument(
+        "-l",
+        "--if_latest",
+        type=int,
+        required=True,
+        help="if only save the latest G/D pth file, 1 or 0",
+    )
+    parser.add_argument(
+        "-c",
+        "--if_cache_data_in_gpu",
+        type=int,
+        required=True,
+        help="if caching the dataset in GPU memory, 1 or 0",
+    )

    args = parser.parse_args()
    name = args.experiment_dir
@ -321,9 +402,11 @@ def get_hparams_from_file(config_path):
 def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
-    logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
+        logger.warn(
+            "{} is not a git repository, therefore hash value comparison will be ignored.".format(
                source_dir
-    ))
+            )
+        )
        return

    cur_hash = subprocess.getoutput("git rev-parse HEAD")
@ -332,8 +415,11 @@ def check_git_hash(model_dir):
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
-      logger.warn("git hash values are different. {}(saved) != {}(current)".format(
-        saved_hash[:8], cur_hash[:8]))
+            logger.warn(
+                "git hash values are different. {}(saved) != {}(current)".format(
+                    saved_hash[:8], cur_hash[:8]
+                )
+            )
    else:
        open(path, "w").write(cur_hash)

@ -353,7 +439,7 @@ def get_logger(model_dir, filename="train.log"):
    return logger


-class HParams():
+class HParams:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
--- a/train_nsf_sim_cache_sid_load_pretrain.py
+++ b/train_nsf_sim_cache_sid_load_pretrain.py
@ -1,12 +1,15 @@
 import sys, os
+
 now_dir = os.getcwd()
 sys.path.append(os.path.join(now_dir, "train"))
 import utils
+
 hps = utils.get_hparams()
 os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
 n_gpus = len(hps.gpus.split("-"))
 from random import shuffle
 import traceback, json, argparse, itertools, math, torch, pdb
+
 torch.backends.cudnn.deterministic = False
 torch.backends.cudnn.benchmark = False
 from torch import nn, optim
@ -20,9 +23,16 @@ from torch.cuda.amp import autocast, GradScaler
 from infer_pack import commons

 from time import time as ttime
-from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler
+from data_utils import (
+    TextAudioLoaderMultiNSFsid,
+    TextAudioLoader,
+    TextAudioCollateMultiNSFsid,
+    TextAudioCollate,
+    DistributedBucketSampler,
+)
 from infer_pack.models import (
-    SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono,
+    SynthesizerTrnMs256NSFsid,
+    SynthesizerTrnMs256NSFsid_nono,
    MultiPeriodDiscriminator,
 )
 from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
@ -32,13 +42,11 @@ from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
 global_step = 0


-
 def main():
    # n_gpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "5555"

-
    mp.spawn(
        run,
        nprocs=n_gpus,
@ -62,10 +70,13 @@ def run(rank, n_gpus, hps):
        backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
    )
    torch.manual_seed(hps.train.seed)
-    if torch.cuda.is_available(): torch.cuda.set_device(rank)
+    if torch.cuda.is_available():
+        torch.cuda.set_device(rank)

-    if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
-    else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
+    if hps.if_f0 == 1:
+        train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
+    else:
+        train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size * n_gpus,
@ -77,8 +88,10 @@ def run(rank, n_gpus, hps):
    )
    # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
    # num_workers=8 -> num_workers=4
-    if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid()
-    else:collate_fn = TextAudioCollate()
+    if hps.if_f0 == 1:
+        collate_fn = TextAudioCollateMultiNSFsid()
+    else:
+        collate_fn = TextAudioCollate()
    train_loader = DataLoader(
        train_dataset,
        num_workers=4,
@ -89,13 +102,26 @@ def run(rank, n_gpus, hps):
        persistent_workers=True,
        prefetch_factor=8,
    )
-    if(hps.if_f0==1):
-        net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate)
+    if hps.if_f0 == 1:
+        net_g = SynthesizerTrnMs256NSFsid(
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            **hps.model,
+            is_half=hps.train.fp16_run,
+            sr=hps.sample_rate,
+        )
    else:
-        net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run)
-    if torch.cuda.is_available(): net_g = net_g.cuda(rank)
+        net_g = SynthesizerTrnMs256NSFsid_nono(
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            **hps.model,
+            is_half=hps.train.fp16_run,
+        )
+    if torch.cuda.is_available():
+        net_g = net_g.cuda(rank)
    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
-    if torch.cuda.is_available(): net_d = net_d.cuda(rank)
+    if torch.cuda.is_available():
+        net_d = net_d.cuda(rank)
    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
@ -118,11 +144,15 @@ def run(rank, n_gpus, hps):
        net_d = DDP(net_d)

    try:  # 如果能加载自动resume
-        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)  # D多半加载没事
+        _, _, _, epoch_str = utils.load_checkpoint(
+            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
+        )  # D多半加载没事
        if rank == 0:
            logger.info("loaded D")
        # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
-        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
+        _, _, _, epoch_str = utils.load_checkpoint(
+            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
+        )
        global_step = (epoch_str - 1) * len(train_loader)
        # epoch_str = 1
        # global_step = 0
@ -132,8 +162,16 @@ def run(rank, n_gpus, hps):
        global_step = 0
        if rank == 0:
            logger.info("loaded pretrained %s %s" % (hps.pretrainG, hps.pretrainD))
-        print(net_g.module.load_state_dict(torch.load(hps.pretrainG,map_location="cpu")["model"]))##测试不加载优化器
-        print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"]))
+        print(
+            net_g.module.load_state_dict(
+                torch.load(hps.pretrainG, map_location="cpu")["model"]
+            )
+        )  ##测试不加载优化器
+        print(
+            net_d.module.load_state_dict(
+                torch.load(hps.pretrainD, map_location="cpu")["model"]
+            )
+        )

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
        optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
@ -157,7 +195,8 @@ def run(rank, n_gpus, hps):
                scaler,
                [train_loader, None],
                logger,
-                [writer, writer_eval],cache
+                [writer, writer_eval],
+                cache,
            )
        else:
            train_and_evaluate(
@ -170,7 +209,8 @@ def run(rank, n_gpus, hps):
                scaler,
                [train_loader, None],
                None,
-                None,cache
+                None,
+                cache,
            )
        scheduler_g.step()
        scheduler_d.step()
@ -190,25 +230,101 @@ def train_and_evaluate(

    net_g.train()
    net_d.train()
-    if(cache==[]or hps.if_cache_data_in_gpu==False):#第一个epoch把cache全部填满训练集
+    if cache == [] or hps.if_cache_data_in_gpu == False:  # 第一个epoch把cache全部填满训练集
        # print("caching")
        for batch_idx, info in enumerate(train_loader):
-            if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info
-            else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info
+            if hps.if_f0 == 1:
+                (
+                    phone,
+                    phone_lengths,
+                    pitch,
+                    pitchf,
+                    spec,
+                    spec_lengths,
+                    wave,
+                    wave_lengths,
+                    sid,
+                ) = info
+            else:
+                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
            if torch.cuda.is_available():
-                phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True )
-                if (hps.if_f0 == 1):pitch,pitchf = pitch.cuda(rank, non_blocking=True),pitchf.cuda(rank, non_blocking=True)
+                phone, phone_lengths = phone.cuda(
+                    rank, non_blocking=True
+                ), phone_lengths.cuda(rank, non_blocking=True)
+                if hps.if_f0 == 1:
+                    pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda(
+                        rank, non_blocking=True
+                    )
                sid = sid.cuda(rank, non_blocking=True)
-                spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
-                wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True)
-            if(hps.if_cache_data_in_gpu==True):
-                if (hps.if_f0 == 1):cache.append((batch_idx, (phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths ,sid)))
-                else:cache.append((batch_idx, (phone,phone_lengths,spec,spec_lengths,wave,wave_lengths ,sid)))
+                spec, spec_lengths = spec.cuda(
+                    rank, non_blocking=True
+                ), spec_lengths.cuda(rank, non_blocking=True)
+                wave, wave_lengths = wave.cuda(
+                    rank, non_blocking=True
+                ), wave_lengths.cuda(rank, non_blocking=True)
+            if hps.if_cache_data_in_gpu == True:
+                if hps.if_f0 == 1:
+                    cache.append(
+                        (
+                            batch_idx,
+                            (
+                                phone,
+                                phone_lengths,
+                                pitch,
+                                pitchf,
+                                spec,
+                                spec_lengths,
+                                wave,
+                                wave_lengths,
+                                sid,
+                            ),
+                        )
+                    )
+                else:
+                    cache.append(
+                        (
+                            batch_idx,
+                            (
+                                phone,
+                                phone_lengths,
+                                spec,
+                                spec_lengths,
+                                wave,
+                                wave_lengths,
+                                sid,
+                            ),
+                        )
+                    )
            with autocast(enabled=hps.train.fp16_run):
-                if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid)
-                else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid)
-                mel = spec_to_mel_torch(spec,hps.data.filter_length,hps.data.n_mel_channels,hps.data.sampling_rate,hps.data.mel_fmin,hps.data.mel_fmax,)
-                y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
+                if hps.if_f0 == 1:
+                    (
+                        y_hat,
+                        ids_slice,
+                        x_mask,
+                        z_mask,
+                        (z, z_p, m_p, logs_p, m_q, logs_q),
+                    ) = net_g(
+                        phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+                    )
+                else:
+                    (
+                        y_hat,
+                        ids_slice,
+                        x_mask,
+                        z_mask,
+                        (z, z_p, m_p, logs_p, m_q, logs_q),
+                    ) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
+                mel = spec_to_mel_torch(
+                    spec,
+                    hps.data.filter_length,
+                    hps.data.n_mel_channels,
+                    hps.data.sampling_rate,
+                    hps.data.mel_fmin,
+                    hps.data.mel_fmax,
+                )
+                y_mel = commons.slice_segments(
+                    mel, ids_slice, hps.train.segment_size // hps.data.hop_length
+                )
                with autocast(enabled=False):
                    y_hat_mel = mel_spectrogram_torch(
                        y_hat.float().squeeze(1),
@ -220,7 +336,7 @@ def train_and_evaluate(
                        hps.data.mel_fmin,
                        hps.data.mel_fmax,
                    )
-                if(hps.train.fp16_run==True):
+                if hps.train.fp16_run == True:
                    y_hat_mel = y_hat_mel.half()
                wave = commons.slice_segments(
                    wave, ids_slice * hps.data.hop_length, hps.train.segment_size
@ -280,17 +396,27 @@ def train_and_evaluate(
                        "grad_norm_g": grad_norm_g,
                    }
                    scalar_dict.update(
-                        {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
+                        {
+                            "loss/g/fm": loss_fm,
+                            "loss/g/mel": loss_mel,
+                            "loss/g/kl": loss_kl,
+                        }
                    )

                    scalar_dict.update(
                        {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
                    )
                    scalar_dict.update(
-                        {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
+                        {
+                            "loss/d_r/{}".format(i): v
+                            for i, v in enumerate(losses_disc_r)
+                        }
                    )
                    scalar_dict.update(
-                        {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
+                        {
+                            "loss/d_g/{}".format(i): v
+                            for i, v in enumerate(losses_disc_g)
+                        }
                    )
                    image_dict = {
                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
@ -312,7 +438,7 @@ def train_and_evaluate(
            global_step += 1
        # if global_step % hps.train.eval_interval == 0:
        if epoch % hps.save_every_epoch == 0 and rank == 0:
-            if(hps.if_latest==0):
+            if hps.if_latest == 0:
                utils.save_checkpoint(
                    net_g,
                    optim_g,
@ -347,11 +473,39 @@ def train_and_evaluate(
        shuffle(cache)
        # print("using cache")
        for batch_idx, info in cache:
-            if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info
-            else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info
+            if hps.if_f0 == 1:
+                (
+                    phone,
+                    phone_lengths,
+                    pitch,
+                    pitchf,
+                    spec,
+                    spec_lengths,
+                    wave,
+                    wave_lengths,
+                    sid,
+                ) = info
+            else:
+                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
            with autocast(enabled=hps.train.fp16_run):
-                if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid)
-                else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid)
+                if hps.if_f0 == 1:
+                    (
+                        y_hat,
+                        ids_slice,
+                        x_mask,
+                        z_mask,
+                        (z, z_p, m_p, logs_p, m_q, logs_q),
+                    ) = net_g(
+                        phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+                    )
+                else:
+                    (
+                        y_hat,
+                        ids_slice,
+                        x_mask,
+                        z_mask,
+                        (z, z_p, m_p, logs_p, m_q, logs_q),
+                    ) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
                mel = spec_to_mel_torch(
                    spec,
                    hps.data.filter_length,
@ -374,7 +528,7 @@ def train_and_evaluate(
                        hps.data.mel_fmin,
                        hps.data.mel_fmax,
                    )
-                if(hps.train.fp16_run==True):
+                if hps.train.fp16_run == True:
                    y_hat_mel = y_hat_mel.half()
                wave = commons.slice_segments(
                    wave, ids_slice * hps.data.hop_length, hps.train.segment_size
@ -435,17 +589,27 @@ def train_and_evaluate(
                        "grad_norm_g": grad_norm_g,
                    }
                    scalar_dict.update(
-                        {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
+                        {
+                            "loss/g/fm": loss_fm,
+                            "loss/g/mel": loss_mel,
+                            "loss/g/kl": loss_kl,
+                        }
                    )

                    scalar_dict.update(
                        {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
                    )
                    scalar_dict.update(
-                        {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
+                        {
+                            "loss/d_r/{}".format(i): v
+                            for i, v in enumerate(losses_disc_r)
+                        }
                    )
                    scalar_dict.update(
-                        {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
+                        {
+                            "loss/d_g/{}".format(i): v
+                            for i, v in enumerate(losses_disc_g)
+                        }
                    )
                    image_dict = {
                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
@ -467,7 +631,7 @@ def train_and_evaluate(
            global_step += 1
        # if global_step % hps.train.eval_interval == 0:
        if epoch % hps.save_every_epoch == 0 and rank == 0:
-            if(hps.if_latest==0):
+            if hps.if_latest == 0:
                utils.save_checkpoint(
                    net_g,
                    optim_g,
@ -498,15 +662,20 @@ def train_and_evaluate(
                    os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
                )

-
    if rank == 0:
        logger.info("====> Epoch: {}".format(epoch))
-    if(epoch>=hps.total_epoch and rank == 0):
+    if epoch >= hps.total_epoch and rank == 0:
        logger.info("Training is done. The program is closed.")
        from process_ckpt import savee  # def savee(ckpt,sr,if_f0,name,epoch):
-        if hasattr(net_g, 'module'):ckpt = net_g.module.state_dict()
-        else:ckpt = net_g.state_dict()
-        logger.info("saving final ckpt:%s"%(savee(ckpt,hps.sample_rate,hps.if_f0,hps.name,epoch)))
+
+        if hasattr(net_g, "module"):
+            ckpt = net_g.module.state_dict()
+        else:
+            ckpt = net_g.state_dict()
+        logger.info(
+            "saving final ckpt:%s"
+            % (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch))
+        )
        os._exit(2333333)


--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@ -1,4 +1,5 @@
 import sys, os, multiprocessing
+
 now_dir = os.getcwd()
 sys.path.append(now_dir)

@ -16,6 +17,8 @@ from my_utils import load_audio

 mutex = multiprocessing.Lock()
 f = open("%s/preprocess.log" % exp_dir, "a+")
+
+
 def println(strr):
    mutex.acquire()
    print(strr)
@ -23,7 +26,8 @@ def println(strr):
    f.flush()
    mutex.release()

-class PreProcess():
+
+class PreProcess:
    def __init__(self, sr, exp_dir):
        self.slicer = Slicer(
            sr=sr,
@ -31,7 +35,7 @@ class PreProcess():
            min_length=800,
            min_interval=400,
            hop_size=15,
-            max_sil_kept=150
+            max_sil_kept=150,
        )
        self.sr = sr
        self.per = 3.7
@ -47,10 +51,20 @@ class PreProcess():
        os.makedirs(self.wavs16k_dir, exist_ok=True)

    def norm_write(self, tmp_audio, idx0, idx1):
-        tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (1 - self.alpha) * tmp_audio
-        wavfile.write("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, (tmp_audio*32768).astype(np.int16))
+        tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
+            1 - self.alpha
+        ) * tmp_audio
+        wavfile.write(
+            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
+            self.sr,
+            (tmp_audio * 32768).astype(np.int16),
+        )
        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
-        wavfile.write("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, (tmp_audio*32768).astype(np.int16))
+        wavfile.write(
+            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
+            16000,
+            (tmp_audio * 32768).astype(np.int16),
+        )

    def pipeline(self, path, idx0):
        try:
@ -58,10 +72,10 @@ class PreProcess():
            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0
-                while (1):
+                while 1:
                    start = int(self.sr * (self.per - self.overlap) * i)
                    i += 1
-                    if (len(audio[start:]) > self.tail * self.sr):
+                    if len(audio[start:]) > self.tail * self.sr:
                        tmp_audio = audio[start : start + int(self.per * self.sr)]
                        self.norm_write(tmp_audio, idx0, idx1)
                        idx1 += 1
@ -79,19 +93,27 @@ class PreProcess():

    def pipeline_mp_inp_dir(self, inp_root, n_p):
        try:
-            infos = [("%s/%s" % (inp_root, name), idx) for idx, name in enumerate(sorted(list(os.listdir(inp_root))))]
+            infos = [
+                ("%s/%s" % (inp_root, name), idx)
+                for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
+            ]
            if noparallel:
-                for i in range(n_p): self.pipeline_mp(infos[i::n_p])
+                for i in range(n_p):
+                    self.pipeline_mp(infos[i::n_p])
            else:
                ps = []
                for i in range(n_p):
-                    p=multiprocessing.Process(target=self.pipeline_mp,args=(infos[i::n_p],))
+                    p = multiprocessing.Process(
+                        target=self.pipeline_mp, args=(infos[i::n_p],)
+                    )
                    p.start()
                    ps.append(p)
-                    for p in ps:p.join()
+                    for p in ps:
+                        p.join()
        except:
            println("Fail. %s" % traceback.format_exc())

+
 def preprocess_trainset(inp_root, sr, n_p, exp_dir):
    pp = PreProcess(sr, exp_dir)
    println("start preprocess")
@ -99,5 +121,6 @@ def preprocess_trainset(inp_root, sr, n_p, exp_dir):
    pp.pipeline_mp_inp_dir(inp_root, n_p)
    println("end preprocess")

-if __name__=='__main__':
+
+if __name__ == "__main__":
    preprocess_trainset(inp_root, sr, n_p, exp_dir)
--- a/uvr5_pack/lib_v5/dataset.py
+++ b/uvr5_pack/lib_v5/dataset.py
@ -10,7 +10,6 @@ from uvr5_pack.lib_v5 import spec_utils


 class VocalRemoverValidationSet(torch.utils.data.Dataset):
-
    def __init__(self, patch_list):
        self.patch_list = patch_list

@ -21,7 +20,7 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
        path = self.patch_list[idx]
        data = np.load(path)

-        X, y = data['X'], data['y']
+        X, y = data["X"], data["y"]

        X_mag = np.abs(X)
        y_mag = np.abs(y)
@ -30,16 +29,22 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):


 def make_pair(mix_dir, inst_dir):
-    input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
+    input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]

-    X_list = sorted([
+    X_list = sorted(
+        [
            os.path.join(mix_dir, fname)
            for fname in os.listdir(mix_dir)
-        if os.path.splitext(fname)[1] in input_exts])
-    y_list = sorted([
+            if os.path.splitext(fname)[1] in input_exts
+        ]
+    )
+    y_list = sorted(
+        [
            os.path.join(inst_dir, fname)
            for fname in os.listdir(inst_dir)
-        if os.path.splitext(fname)[1] in input_exts])
+            if os.path.splitext(fname)[1] in input_exts
+        ]
+    )

    filelist = list(zip(X_list, y_list))

@ -47,10 +52,11 @@ def make_pair(mix_dir, inst_dir):


 def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
-    if split_mode == 'random':
+    if split_mode == "random":
        filelist = make_pair(
-            os.path.join(dataset_dir, 'mixtures'),
-            os.path.join(dataset_dir, 'instruments'))
+            os.path.join(dataset_dir, "mixtures"),
+            os.path.join(dataset_dir, "instruments"),
+        )

        random.shuffle(filelist)

@ -60,19 +66,23 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
            val_filelist = filelist[-val_size:]
        else:
            train_filelist = [
-                pair for pair in filelist
-                if list(pair) not in val_filelist]
-    elif split_mode == 'subdirs':
+                pair for pair in filelist if list(pair) not in val_filelist
+            ]
+    elif split_mode == "subdirs":
        if len(val_filelist) != 0:
-            raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
+            raise ValueError(
+                "The `val_filelist` option is not available in `subdirs` mode"
+            )

        train_filelist = make_pair(
-            os.path.join(dataset_dir, 'training/mixtures'),
-            os.path.join(dataset_dir, 'training/instruments'))
+            os.path.join(dataset_dir, "training/mixtures"),
+            os.path.join(dataset_dir, "training/instruments"),
+        )

        val_filelist = make_pair(
-            os.path.join(dataset_dir, 'validation/mixtures'),
-            os.path.join(dataset_dir, 'validation/instruments'))
+            os.path.join(dataset_dir, "validation/mixtures"),
+            os.path.join(dataset_dir, "validation/instruments"),
+        )

    return train_filelist, val_filelist

@ -81,7 +91,9 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
    perm = np.random.permutation(len(X))
    for i, idx in enumerate(tqdm(perm)):
        if np.random.uniform() < reduction_rate:
-            y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
+            y[idx] = spec_utils.reduce_vocal_aggressively(
+                X[idx], y[idx], reduction_mask
+            )

        if np.random.uniform() < 0.5:
            # swap channel
@ -116,10 +128,8 @@ def make_padding(width, cropsize, offset):
 def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
    len_dataset = patches * len(filelist)

-    X_dataset = np.zeros(
-        (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
-    y_dataset = np.zeros(
-        (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
+    X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
+    y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)

    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
@ -127,8 +137,8 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
        X, y = X / coef, y / coef

        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
-        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
-        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
+        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
+        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")

        starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
        ends = starts + cropsize
@ -142,7 +152,9 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset

 def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
    patch_list = []
-    patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
+    patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
+        cropsize, sr, hop_length, n_fft, offset
+    )
    os.makedirs(patch_dir, exist_ok=True)

    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
@ -153,18 +165,19 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
        X, y = X / coef, y / coef

        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
-        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
-        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
+        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
+        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")

        len_dataset = int(np.ceil(X.shape[2] / roi_size))
        for j in range(len_dataset):
-            outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
+            outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
            start = j * roi_size
            if not os.path.exists(outpath):
                np.savez(
                    outpath,
                    X=X_pad[:, :, start : start + cropsize],
-                    y=y_pad[:, :, start:start + cropsize])
+                    y=y_pad[:, :, start : start + cropsize],
+                )
            patch_list.append(outpath)

    return VocalRemoverValidationSet(patch_list)
--- a/uvr5_pack/lib_v5/layers.py
+++ b/uvr5_pack/lib_v5/layers.py
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils


 class Conv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nout,
+                nin,
+                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
-                bias=False),
+                bias=False,
+            ),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):


 class SeperableConv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nin,
+                nin,
+                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
+                bias=False,
+            ),
+            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):


 class Encoder(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):


 class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+    def __init__(
+        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):


 class ASPPModule(nn.Module):
-
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+        )
        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+        )
        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
+            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+        feat1 = F.interpolate(
+            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
--- a/uvr5_pack/lib_v5/layers_123812KB
+++ b/uvr5_pack/lib_v5/layers_123812KB
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils


 class Conv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nout,
+                nin,
+                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
-                bias=False),
+                bias=False,
+            ),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):


 class SeperableConv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nin,
+                nin,
+                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
+                bias=False,
+            ),
+            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):


 class Encoder(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):


 class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+    def __init__(
+        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):


 class ASPPModule(nn.Module):
-
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+        )
        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+        )
        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
+            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+        feat1 = F.interpolate(
+            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
--- a/uvr5_pack/lib_v5/layers_123821KB.py
+++ b/uvr5_pack/lib_v5/layers_123821KB.py
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils


 class Conv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nout,
+                nin,
+                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
-                bias=False),
+                bias=False,
+            ),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):


 class SeperableConv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nin,
+                nin,
+                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
+                bias=False,
+            ),
+            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):


 class Encoder(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):


 class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+    def __init__(
+        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):


 class ASPPModule(nn.Module):
-
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+        )
        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+        )
        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
+            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+        feat1 = F.interpolate(
+            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
--- a/uvr5_pack/lib_v5/layers_33966KB.py
+++ b/uvr5_pack/lib_v5/layers_33966KB.py
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils


 class Conv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nout,
+                nin,
+                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
-                bias=False),
+                bias=False,
+            ),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):


 class SeperableConv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nin,
+                nin,
+                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
+                bias=False,
+            ),
+            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):


 class Encoder(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):


 class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+    def __init__(
+        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
@ -85,32 +84,37 @@ class Decoder(nn.Module):


 class ASPPModule(nn.Module):
-
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+        )
        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+        )
        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.conv6 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.conv7 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
+            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+        feat1 = F.interpolate(
+            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
--- a/uvr5_pack/lib_v5/layers_537227KB.py
+++ b/uvr5_pack/lib_v5/layers_537227KB.py
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils


 class Conv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nout,
+                nin,
+                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
-                bias=False),
+                bias=False,
+            ),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):


 class SeperableConv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nin,
+                nin,
+                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
+                bias=False,
+            ),
+            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):


 class Encoder(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):


 class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+    def __init__(
+        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
@ -85,32 +84,37 @@ class Decoder(nn.Module):


 class ASPPModule(nn.Module):
-
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+        )
        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+        )
        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.conv6 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.conv7 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
+            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+        feat1 = F.interpolate(
+            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
--- a/uvr5_pack/lib_v5/layers_537238KB.py
+++ b/uvr5_pack/lib_v5/layers_537238KB.py
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils


 class Conv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nout,
+                nin,
+                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
-                bias=False),
+                bias=False,
+            ),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):


 class SeperableConv2DBNActiv(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
-                nin, nin,
+                nin,
+                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
+                bias=False,
+            ),
+            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
-            activ()
+            activ(),
        )

    def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):


 class Encoder(nn.Module):
-
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):


 class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+    def __init__(
+        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
@ -85,32 +84,37 @@ class Decoder(nn.Module):


 class ASPPModule(nn.Module):
-
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+        )
        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+        )
        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.conv6 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.conv7 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+        )
        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
+            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+        feat1 = F.interpolate(
+            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
--- a/uvr5_pack/lib_v5/model_param_init.py
+++ b/uvr5_pack/lib_v5/model_param_init.py
@ -3,33 +3,33 @@ import os
 import pathlib

 default_param = {}
-default_param['bins'] = 768
-default_param['unstable_bins'] = 9 # training only
-default_param['reduction_bins'] = 762 # training only
-default_param['sr'] = 44100
-default_param['pre_filter_start'] = 757
-default_param['pre_filter_stop'] = 768
-default_param['band'] = {}
+default_param["bins"] = 768
+default_param["unstable_bins"] = 9  # training only
+default_param["reduction_bins"] = 762  # training only
+default_param["sr"] = 44100
+default_param["pre_filter_start"] = 757
+default_param["pre_filter_stop"] = 768
+default_param["band"] = {}


-default_param['band'][1] = {
-    'sr': 11025,
-    'hl': 128,
-    'n_fft': 960,
-    'crop_start': 0,
-    'crop_stop': 245,
-    'lpf_start': 61, # inference only
-    'res_type': 'polyphase'
+default_param["band"][1] = {
+    "sr": 11025,
+    "hl": 128,
+    "n_fft": 960,
+    "crop_start": 0,
+    "crop_stop": 245,
+    "lpf_start": 61,  # inference only
+    "res_type": "polyphase",
 }

-default_param['band'][2] = {
-    'sr': 44100,
-    'hl': 512,
-    'n_fft': 1536,
-    'crop_start': 24,
-    'crop_stop': 547,
-    'hpf_start': 81, # inference only
-    'res_type': 'sinc_best'
+default_param["band"][2] = {
+    "sr": 44100,
+    "hl": 512,
+    "n_fft": 1536,
+    "crop_start": 24,
+    "crop_stop": 547,
+    "hpf_start": 81,  # inference only
+    "res_type": "sinc_best",
 }


@ -43,18 +43,27 @@ def int_keys(d):


 class ModelParameters(object):
-    def __init__(self, config_path=''):
-        if '.pth' == pathlib.Path(config_path).suffix:
+    def __init__(self, config_path=""):
+        if ".pth" == pathlib.Path(config_path).suffix:
            import zipfile

-            with zipfile.ZipFile(config_path, 'r') as zip:
-                self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys)
-        elif '.json' == pathlib.Path(config_path).suffix:
-            with open(config_path, 'r') as f:
+            with zipfile.ZipFile(config_path, "r") as zip:
+                self.param = json.loads(
+                    zip.read("param.json"), object_pairs_hook=int_keys
+                )
+        elif ".json" == pathlib.Path(config_path).suffix:
+            with open(config_path, "r") as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            self.param = default_param

-        for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
+        for k in [
+            "mid_side",
+            "mid_side_b",
+            "mid_side_b2",
+            "stereo_w",
+            "stereo_n",
+            "reverse",
+        ]:
            if not k in self.param:
                self.param[k] = False
--- a/uvr5_pack/lib_v5/nets.py
+++ b/uvr5_pack/lib_v5/nets.py
@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import spec_utils


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
@ -67,10 +65,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/nets_123812KB.py
+++ b/uvr5_pack/lib_v5/nets_123812KB.py
@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
@ -66,10 +64,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/nets_123821KB.py
+++ b/uvr5_pack/lib_v5/nets_123821KB.py
@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
@ -66,10 +64,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/nets_33966KB.py
+++ b/uvr5_pack/lib_v5/nets_33966KB.py
@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_33966KB as layers


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
@ -66,10 +64,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/nets_537227KB.py
+++ b/uvr5_pack/lib_v5/nets_537227KB.py
@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
@ -67,10 +65,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/nets_537238KB.py
+++ b/uvr5_pack/lib_v5/nets_537238KB.py
@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
@ -67,10 +65,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/nets_61968KB.py
+++ b/uvr5_pack/lib_v5/nets_61968KB.py
@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers


 class BaseASPPNet(nn.Module):
-
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):


 class CascadedASPPNet(nn.Module):
-
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
@ -66,10 +64,13 @@ class CascadedASPPNet(nn.Module):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
+        aux1 = torch.cat(
+            [
                self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
+                self.stg1_high_band_net(x[:, :, bandw:]),
+            ],
+            dim=2,
+        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
+            mode="replicate",
+        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
+                mode="replicate",
+            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
-                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
-                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+                    mask[:, :, : aggressiveness["split_bin"]],
+                    1 + aggressiveness["value"] / 3,
+                )
+                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+                    mask[:, :, aggressiveness["split_bin"] :],
+                    1 + aggressiveness["value"],
+                )

            return mask * mix

--- a/uvr5_pack/lib_v5/spec_utils.py
+++ b/uvr5_pack/lib_v5/spec_utils.py
@ -4,6 +4,7 @@ import soundfile  as  sf
 from tqdm import tqdm
 import json, math, hashlib

+
 def crop_center(h1, h2):
    h1_shape = h1.size()
    h2_shape = h2.size()
@ -11,7 +12,7 @@ def crop_center(h1, h2):
    if h1_shape[3] == h2_shape[3]:
        return h1
    elif h1_shape[3] < h2_shape[3]:
-        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
+        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    # s_freq = (h2_shape[2] - h1_shape[2]) // 2
    # e_freq = s_freq + h1_shape[2]
@ -22,7 +23,9 @@ def crop_center(h1, h2):
    return h1


-def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
+def wave_to_spectrogram(
+    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
+):
    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
@ -30,8 +33,8 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
-        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
-        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
+        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
+        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])
@ -44,7 +47,9 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal
    return spec


-def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
+def wave_to_spectrogram_mt(
+    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
+):
    import threading

    if reverse:
@ -54,8 +59,8 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
-        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
-        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
+        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
+        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])
@ -64,7 +69,10 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
        global spec_left
        spec_left = librosa.stft(**kwargs)

-    thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
+    thread = threading.Thread(
+        target=run_thread,
+        kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
+    )
    thread.start()
    spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
    thread.join()
@ -76,40 +84,50 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=

 def combine_spectrograms(specs, mp):
    l = min([specs[i].shape[2] for i in specs])
-    spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
+    spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
    offset = 0
-    bands_n = len(mp.param['band'])
+    bands_n = len(mp.param["band"])

    for d in range(1, bands_n + 1):
-        h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
-        spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
+        h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
+        spec_c[:, offset : offset + h, :l] = specs[d][
+            :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
+        ]
        offset += h

-    if offset > mp.param['bins']:
-        raise ValueError('Too much bins')
+    if offset > mp.param["bins"]:
+        raise ValueError("Too much bins")

    # lowpass fiter
-    if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:   
+    if (
+        mp.param["pre_filter_start"] > 0
+    ):  # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
        if bands_n == 1:
-            spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
+            spec_c = fft_lp_filter(
+                spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
+            )
        else:
            gp = 1
-            for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
-                g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
+            for b in range(
+                mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
+            ):
+                g = math.pow(
+                    10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
+                )
                gp = g
                spec_c[:, b, :] *= g

    return np.asfortranarray(spec_c)


-def spectrogram_to_image(spec, mode='magnitude'):
-    if mode == 'magnitude':
+def spectrogram_to_image(spec, mode="magnitude"):
+    if mode == "magnitude":
        if np.iscomplexobj(spec):
            y = np.abs(spec)
        else:
            y = spec
        y = np.log10(y**2 + 1e-8)
-    elif mode == 'phase':
+    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
@ -121,9 +139,7 @@ def spectrogram_to_image(spec, mode='magnitude'):

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
-        img = np.concatenate([
-            np.max(img, axis=2, keepdims=True), img
-        ], axis=2)
+        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img

@ -136,12 +152,12 @@ def reduce_vocal_aggressively(X, y, softmask):
    v_mask = v_mag_tmp > y_mag_tmp
    y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)

-    return y_mag * np.exp(1.j * np.angle(y))
+    return y_mag * np.exp(1.0j * np.angle(y))


 def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    if min_range < fade_size * 2:
-        raise ValueError('min_range must be >= fade_area * 2')
+        raise ValueError("min_range must be >= fade_area * 2")

    mag = mag.copy()

@ -169,7 +185,9 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
            else:
                e += fade_size

-            mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
+            mag[:, :, s + fade_size : e - fade_size] += ref[
+                :, :, s + fade_size : e - fade_size
+            ]
            old_e = e

    return mag
@ -185,15 +203,17 @@ def cache_or_load(mix_path, inst_path, mp):
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

-    cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
-    mix_cache_dir = os.path.join('cache', cache_dir)
-    inst_cache_dir = os.path.join('cache', cache_dir)
+    cache_dir = "mph{}".format(
+        hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
+    )
+    mix_cache_dir = os.path.join("cache", cache_dir)
+    inst_cache_dir = os.path.join("cache", cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

-    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
-    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
+    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
+    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        X_spec_m = np.load(mix_cache_path)
@ -201,22 +221,52 @@ def cache_or_load(mix_path, inst_path, mp):
    else:
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

-        for d in range(len(mp.param['band']), 0, -1):            
-            bp = mp.param['band'][d]
+        for d in range(len(mp.param["band"]), 0, -1):
+            bp = mp.param["band"][d]

-            if d == len(mp.param['band']): # high-end band
+            if d == len(mp.param["band"]):  # high-end band
                X_wave[d], _ = librosa.load(
-                    mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+                    mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
+                )
                y_wave[d], _ = librosa.load(
-                    inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+                    inst_path,
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
            else:  # lower bands
-                X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
-                y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
+                X_wave[d] = librosa.resample(
+                    X_wave[d + 1],
+                    mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )
+                y_wave[d] = librosa.resample(
+                    y_wave[d + 1],
+                    mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )

            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

-            X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
-            y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+            X_spec_s[d] = wave_to_spectrogram(
+                X_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                mp.param["mid_side"],
+                mp.param["mid_side_b2"],
+                mp.param["reverse"],
+            )
+            y_spec_s[d] = wave_to_spectrogram(
+                y_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                mp.param["mid_side"],
+                mp.param["mid_side_b2"],
+                mp.param["reverse"],
+            )

        del X_wave, y_wave

@ -224,7 +274,7 @@ def cache_or_load(mix_path, inst_path, mp):
        y_spec_m = combine_spectrograms(y_spec_s, mp)

        if X_spec_m.shape != y_spec_m.shape:
-            raise ValueError('The combined spectrograms are different: ' + mix_path)
+            raise ValueError("The combined spectrograms are different: " + mix_path)

        _, ext = os.path.splitext(mix_path)

@ -244,9 +294,16 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
-        return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
+        return np.asfortranarray(
+            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
+        )
    elif mid_side_b2:
-        return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
+        return np.asfortranarray(
+            [
+                np.add(wave_right / 1.25, 0.4 * wave_left),
+                np.subtract(wave_left / 1.25, 0.4 * wave_right),
+            ]
+        )
    else:
        return np.asfortranarray([wave_left, wave_right])

@ -261,7 +318,9 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
        global wave_left
        wave_left = librosa.istft(**kwargs)

-    thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
+    thread = threading.Thread(
+        target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
+    )
    thread.start()
    wave_right = librosa.istft(spec_right, hop_length=hop_length)
    thread.join()
@ -269,46 +328,94 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
-        return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
+        return np.asfortranarray(
+            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
+        )
    elif mid_side_b2:
-        return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
+        return np.asfortranarray(
+            [
+                np.add(wave_right / 1.25, 0.4 * wave_left),
+                np.subtract(wave_left / 1.25, 0.4 * wave_right),
+            ]
+        )
    else:
        return np.asfortranarray([wave_left, wave_right])


 def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    wave_band = {}
-    bands_n = len(mp.param['band'])    
+    bands_n = len(mp.param["band"])
    offset = 0

    for d in range(1, bands_n + 1):
-        bp = mp.param['band'][d]
-        spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
-        h = bp['crop_stop'] - bp['crop_start']
-        spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
+        bp = mp.param["band"][d]
+        spec_s = np.ndarray(
+            shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
+        )
+        h = bp["crop_stop"] - bp["crop_start"]
+        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
+            :, offset : offset + h, :
+        ]

        offset += h
        if d == bands_n:  # higher
            if extra_bins_h:  # if --high_end_process bypass
-                max_bin = bp['n_fft'] // 2
-                spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
-            if bp['hpf_start'] > 0:
-                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
+                max_bin = bp["n_fft"] // 2
+                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
+                    :, :extra_bins_h, :
+                ]
+            if bp["hpf_start"] > 0:
+                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            if bands_n == 1:
-                wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+                wave = spectrogram_to_wave(
+                    spec_s,
+                    bp["hl"],
+                    mp.param["mid_side"],
+                    mp.param["mid_side_b2"],
+                    mp.param["reverse"],
+                )
            else:
-                wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
+                wave = np.add(
+                    wave,
+                    spectrogram_to_wave(
+                        spec_s,
+                        bp["hl"],
+                        mp.param["mid_side"],
+                        mp.param["mid_side_b2"],
+                        mp.param["reverse"],
+                    ),
+                )
        else:
-            sr = mp.param['band'][d+1]['sr']
+            sr = mp.param["band"][d + 1]["sr"]
            if d == 1:  # lower
-                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
-                wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest")
+                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
+                wave = librosa.resample(
+                    spectrogram_to_wave(
+                        spec_s,
+                        bp["hl"],
+                        mp.param["mid_side"],
+                        mp.param["mid_side_b2"],
+                        mp.param["reverse"],
+                    ),
+                    bp["sr"],
+                    sr,
+                    res_type="sinc_fastest",
+                )
            else:  # mid
-                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
-                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
-                wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
+                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
+                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
+                wave2 = np.add(
+                    wave,
+                    spectrogram_to_wave(
+                        spec_s,
+                        bp["hl"],
+                        mp.param["mid_side"],
+                        mp.param["mid_side_b2"],
+                        mp.param["reverse"],
+                    ),
+                )
                # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
-                wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy')
+                wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")

    return wave.T

@ -336,14 +443,40 @@ def fft_hp_filter(spec, bin_start, bin_stop):


 def mirroring(a, spec_m, input_high_end, mp):
-    if 'mirroring' == a:
-        mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
-        mirror = mirror * np.exp(1.j * np.angle(input_high_end))
+    if "mirroring" == a:
+        mirror = np.flip(
+            np.abs(
+                spec_m[
+                    :,
+                    mp.param["pre_filter_start"]
+                    - 10
+                    - input_high_end.shape[1] : mp.param["pre_filter_start"]
+                    - 10,
+                    :,
+                ]
+            ),
+            1,
+        )
+        mirror = mirror * np.exp(1.0j * np.angle(input_high_end))

-        return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
+        return np.where(
+            np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
+        )

-    if 'mirroring2' == a:
-        mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
+    if "mirroring2" == a:
+        mirror = np.flip(
+            np.abs(
+                spec_m[
+                    :,
+                    mp.param["pre_filter_start"]
+                    - 10
+                    - input_high_end.shape[1] : mp.param["pre_filter_start"]
+                    - 10,
+                    :,
+                ]
+            ),
+            1,
+        )
        mi = np.multiply(mirror, input_high_end * 1.7)

        return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
@ -358,13 +491,14 @@ def ensembling(a, specs):
        spec = spec[:, :, :ln]
        specs[i] = specs[i][:, :, :ln]

-        if 'min_mag' == a:
+        if "min_mag" == a:
            spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
-        if 'max_mag' == a:
+        if "max_mag" == a:
            spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)

    return spec

+
 def stft(wave, nfft, hl):
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
@ -374,6 +508,7 @@ def stft(wave, nfft, hl):

    return spec

+
 def istft(spec, hl):
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])
@ -391,20 +526,31 @@ if __name__ == "__main__":
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
-    p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
-    p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
-    p.add_argument('--output_name', '-o', type=str, default='output')
-    p.add_argument('--vocals_only', '-v', action='store_true')
-    p.add_argument('input', nargs='+')
+    p.add_argument(
+        "--algorithm",
+        "-a",
+        type=str,
+        choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
+        default="min_mag",
+    )
+    p.add_argument(
+        "--model_params",
+        "-m",
+        type=str,
+        default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
+    )
+    p.add_argument("--output_name", "-o", type=str, default="output")
+    p.add_argument("--vocals_only", "-v", action="store_true")
+    p.add_argument("input", nargs="+")
    args = p.parse_args()

    start_time = time.time()

-    if args.algorithm.startswith('invert') and len(args.input) != 2:
-        raise ValueError('There should be two input files.')    
+    if args.algorithm.startswith("invert") and len(args.input) != 2:
+        raise ValueError("There should be two input files.")

-    if not args.algorithm.startswith('invert') and len(args.input) < 2:
-        raise ValueError('There must be at least two input files.')
+    if not args.algorithm.startswith("invert") and len(args.input) < 2:
+        raise ValueError("There must be at least two input files.")

    wave, specs = {}, {}
    mp = ModelParameters(args.model_params)
@ -412,39 +558,60 @@ if __name__ == "__main__":
    for i in range(len(args.input)):
        spec = {}

-        for d in range(len(mp.param['band']), 0, -1):          
-            bp = mp.param['band'][d]            
+        for d in range(len(mp.param["band"]), 0, -1):
+            bp = mp.param["band"][d]

-            if d == len(mp.param['band']): # high-end band                
+            if d == len(mp.param["band"]):  # high-end band
                wave[d], _ = librosa.load(
-                    args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+                    args.input[i],
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )

                if len(wave[d].shape) == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands
-                wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
+                wave[d] = librosa.resample(
+                    wave[d + 1],
+                    mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )

-            spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+            spec[d] = wave_to_spectrogram(
+                wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                mp.param["mid_side"],
+                mp.param["mid_side_b2"],
+                mp.param["reverse"],
+            )

        specs[i] = combine_spectrograms(spec, mp)

    del wave

-    if args.algorithm == 'deep':
+    if args.algorithm == "deep":
        d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
        v_spec = d_spec - specs[1]
-        sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])   
+        sf.write(
+            os.path.join("{}.wav".format(args.output_name)),
+            cmb_spectrogram_to_wave(v_spec, mp),
+            mp.param["sr"],
+        )

-    if args.algorithm.startswith('invert'):
+    if args.algorithm.startswith("invert"):
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

-        if 'invert_p' == args.algorithm:
+        if "invert_p" == args.algorithm:
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
-            v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
+            v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]
@ -458,24 +625,39 @@ if __name__ == "__main__":
                y_image = spectrogram_to_image(y_mag)
                v_image = spectrogram_to_image(v_mag)

-                cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
-                cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
-                cv2.imwrite('{}_v.png'.format(args.output_name), v_image)    
+                cv2.imwrite("{}_X.png".format(args.output_name), X_image)
+                cv2.imwrite("{}_y.png".format(args.output_name), y_image)
+                cv2.imwrite("{}_v.png".format(args.output_name), v_image)

-                sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
-                sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
+                sf.write(
+                    "{}_X.wav".format(args.output_name),
+                    cmb_spectrogram_to_wave(specs[0], mp),
+                    mp.param["sr"],
+                )
+                sf.write(
+                    "{}_y.wav".format(args.output_name),
+                    cmb_spectrogram_to_wave(specs[1], mp),
+                    mp.param["sr"],
+                )

-        sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])    
+        sf.write(
+            "{}_v.wav".format(args.output_name),
+            cmb_spectrogram_to_wave(v_spec, mp),
+            mp.param["sr"],
+        )
    else:
-        if not args.algorithm == 'deep':
-            sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
-
-    if args.algorithm == 'align':
+        if not args.algorithm == "deep":
+            sf.write(
+                os.path.join("ensembled", "{}.wav".format(args.output_name)),
+                cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
+                mp.param["sr"],
+            )

+    if args.algorithm == "align":
        trackalignment = [
            {
-                'file1':'"{}"'.format(args.input[0]),
-                'file2':'"{}"'.format(args.input[1])
+                "file1": '"{}"'.format(args.input[0]),
+                "file2": '"{}"'.format(args.input[1]),
            }
        ]

--- a/uvr5_pack/name_params.json
+++ b/uvr5_pack/name_params.json
@ -0,0 +1,263 @@
+{
+    "equivalent" : [
+        {
+            "model_hash_name" : [
+                {
+                    "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100"
+                },
+                {
+                    "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
+                    "param_name": "4band_v2"
+                },
+                {
+                    "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
+                    "param_name": "4band_v2"
+                },
+                {
+                    "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100"
+                },
+                {
+                    "hash_name": "a82f14e75892e55e994376edbf0c8435",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100"
+                },
+                {
+                    "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
+                    "param_name": "4band_v2_sn"
+                },
+                {
+                    "hash_name": "08611fb99bd59eaa79ad27c58d137727",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
+                    "param_name": "4band_v2_sn"
+                },
+                {
+                    "hash_name": "5c7bbca45a187e81abbbd351606164e5",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
+                    "param_name": "3band_44100_msb2"
+                },
+                {
+                    "hash_name": "d6b2cb685a058a091e5e7098192d3233",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
+                    "param_name": "3band_44100_msb2"
+                },
+                {
+                    "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100"
+                },
+                {
+                    "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100"
+                },
+                {
+                    "hash_name": "68aa2c8093d0080704b200d140f59e54",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
+                    "param_name": "3band_44100"
+                },
+                {
+                    "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
+                    "param_name": "3band_44100_mid.json"
+                },
+                {
+                    "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
+                    "param_name": "3band_44100_mid.json"
+                },
+                {
+                    "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100.json"
+                },
+                {
+                    "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100.json"
+                },
+                {
+                    "hash_name": "89e83b511ad474592689e562d5b1f80e",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
+                    "param_name": "2band_32000.json"
+                },
+                {
+                    "hash_name": "0b954da81d453b716b114d6d7c95177f",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
+                    "param_name": "2band_32000.json"
+                }
+
+            ],
+            "v4 Models": [
+                {
+                    "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
+                    "param_name": "1band_sr16000_hl512"
+                },
+                {
+                    "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
+                    "param_name": "1band_sr32000_hl512"
+                },
+                {
+                    "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
+                    "param_name": "1band_sr32000_hl512"
+                },
+                {
+                    "hash_name": "80ab74d65e515caa3622728d2de07d23",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
+                    "param_name": "1band_sr32000_hl512"
+                },
+                {
+                    "hash_name": "edc115e7fc523245062200c00caa847f",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
+                    "param_name": "1band_sr33075_hl384"
+                },
+                {
+                    "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
+                    "param_name": "1band_sr33075_hl384"
+                },
+                {
+                    "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
+                    "param_name": "1band_sr44100_hl512"
+                },
+                {
+                    "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
+                    "param_name": "1band_sr44100_hl512"
+                },
+                {
+                    "hash_name": "ae702fed0238afb5346db8356fe25f13",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
+                    "param_name": "1band_sr44100_hl1024"
+                }
+            ]
+        }
+    ],
+    "User Models" : [
+        {
+            "1 Band": [
+                {
+                    "hash_name": "1band_sr16000_hl512",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
+                    "param_name": "1band_sr16000_hl512"
+                },
+                {
+                    "hash_name": "1band_sr32000_hl512",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
+                    "param_name": "1band_sr32000_hl512"
+                },
+                {
+                    "hash_name": "1band_sr33075_hl384",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
+                    "param_name": "1band_sr33075_hl384"
+                },
+                {
+                    "hash_name": "1band_sr44100_hl256",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
+                    "param_name": "1band_sr44100_hl256"
+                },
+                {
+                    "hash_name": "1band_sr44100_hl512",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
+                    "param_name": "1band_sr44100_hl512"
+                },
+                {
+                    "hash_name": "1band_sr44100_hl1024",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
+                    "param_name": "1band_sr44100_hl1024"
+                }
+            ],
+            "2 Band": [
+                {
+                    "hash_name": "2band_44100_lofi",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
+                    "param_name": "2band_44100_lofi"
+                },
+                {
+                    "hash_name": "2band_32000",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
+                    "param_name": "2band_32000"
+                },
+                {
+                    "hash_name": "2band_48000",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json",
+                    "param_name": "2band_48000"
+                }
+            ],
+            "3 Band": [
+                {
+                    "hash_name": "3band_44100",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
+                    "param_name": "3band_44100"
+                },
+                {
+                    "hash_name": "3band_44100_mid",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
+                    "param_name": "3band_44100_mid"
+                },
+                {
+                    "hash_name": "3band_44100_msb2",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
+                    "param_name": "3band_44100_msb2"
+                }
+            ],
+            "4 Band": [
+                {
+                    "hash_name": "4band_44100",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
+                    "param_name": "4band_44100"
+                },
+                {
+                    "hash_name": "4band_44100_mid",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
+                    "param_name": "4band_44100_mid"
+                },
+                {
+                    "hash_name": "4band_44100_msb",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
+                    "param_name": "4band_44100_msb"
+                },
+                {
+                    "hash_name": "4band_44100_msb2",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
+                    "param_name": "4band_44100_msb2"
+                },
+                {
+                    "hash_name": "4band_44100_reverse",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
+                    "param_name": "4band_44100_reverse"
+                },
+                {
+                    "hash_name": "4band_44100_sw",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
+                    "param_name": "4band_44100_sw"
+                },
+                {
+                    "hash_name": "4band_v2",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
+                    "param_name": "4band_v2"
+                },
+                {
+                    "hash_name": "4band_v2_sn",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
+                    "param_name": "4band_v2_sn"
+                },
+                {
+                    "hash_name": "tmodelparam",
+                    "model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json",
+                    "param_name": "User Model Param Set"
+                }
+            ]
+        }
+    ]
+}
--- a/uvr5_pack/utils.py
+++ b/uvr5_pack/utils.py
@ -1,6 +1,15 @@
 import torch
 import numpy as np
 from tqdm import tqdm
+import json
+
+
+def load_data(file_name: str = "./uvr5_pack/data.json") -> dict:
+    with open(file_name, "r") as f:
+        data = json.load(f)
+
+    return data
+

 def make_padding(width, cropsize, offset):
    left = offset
@ -10,12 +19,16 @@ def make_padding(width, cropsize, offset):
    right = roi_size - (width % roi_size) + left

    return left, right, roi_size
-def inference(X_spec, device, model, aggressiveness,data):
-    '''
-    data ： dic configs
-    '''

-    def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness,is_half=True):
+
+def inference(X_spec, device, model, aggressiveness, data):
+    """
+    data ： dic configs
+    """
+
+    def _execute(
+        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
+    ):
        model.eval()
        with torch.no_grad():
            preds = []
@ -25,9 +38,12 @@ def inference(X_spec, device, model, aggressiveness,data):
            total_iterations = sum(iterations)
            for i in tqdm(range(n_window)):
                start = i * roi_size
-                X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
+                X_mag_window = X_mag_pad[
+                    None, :, :, start : start + data["window_size"]
+                ]
                X_mag_window = torch.from_numpy(X_mag_window)
-                if(is_half):X_mag_window=X_mag_window.half()
+                if is_half:
+                    X_mag_window = X_mag_window.half()
                X_mag_window = X_mag_window.to(device)

                pred = model.predict(X_mag_window, aggressiveness)
@ -50,193 +66,55 @@ def inference(X_spec, device, model, aggressiveness,data):
    X_mag_pre = X_mag / coef

    n_frame = X_mag_pre.shape[2]
-    pad_l, pad_r, roi_size = make_padding(n_frame,
-                                                data['window_size'], model.offset)
+    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

-    X_mag_pad = np.pad(
-        X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
+    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

-    if(list(model.state_dict().values())[0].dtype==torch.float16):is_half=True
-    else:is_half=False
-    pred = _execute(X_mag_pad, roi_size, n_window,
-                        device, model, aggressiveness,is_half)
+    if list(model.state_dict().values())[0].dtype == torch.float16:
+        is_half = True
+    else:
+        is_half = False
+    pred = _execute(
+        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
+    )
    pred = pred[:, :, :n_frame]

-    if data['tta']:
+    if data["tta"]:
        pad_l += roi_size // 2
        pad_r += roi_size // 2
        n_window += 1

-        X_mag_pad = np.pad(
-            X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
+        X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

-        pred_tta = _execute(X_mag_pad, roi_size, n_window,
-                                device, model, aggressiveness,is_half)
+        pred_tta = _execute(
+            X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
+        )
        pred_tta = pred_tta[:, :, roi_size // 2 :]
        pred_tta = pred_tta[:, :, :n_frame]

-        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase)
+        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
    else:
-        return pred * coef, X_mag, np.exp(1.j * X_phase)
-            
+        return pred * coef, X_mag, np.exp(1.0j * X_phase)


 def _get_name_params(model_path, model_hash):
+    data = load_data()
+    flag = False
    ModelName = model_path
-    if model_hash == '47939caf0cfe52a0e81442b85b971dfd':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100')
-    if model_hash == '4e4ecb9764c50a8c414fee6e10395bbe':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
-        param_name_auto=str('4band_v2')
-    if model_hash == 'ca106edd563e034bde0bdec4bb7a4b36':
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
-        param_name_auto=str('4band_v2')
-    if model_hash == 'e60a1e84803ce4efc0a6551206cc4b71':
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100')
-    if model_hash == 'a82f14e75892e55e994376edbf0c8435':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100')
-    if model_hash == '6dd9eaa6f0420af9f1d403aaafa4cc06':   
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
-        param_name_auto=str('4band_v2_sn')
-    if model_hash == '08611fb99bd59eaa79ad27c58d137727':
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
-        param_name_auto=str('4band_v2_sn')
-    if model_hash == '5c7bbca45a187e81abbbd351606164e5':
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
-        param_name_auto=str('3band_44100_msb2')
-    if model_hash == 'd6b2cb685a058a091e5e7098192d3233':    
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
-        param_name_auto=str('3band_44100_msb2')
-    if model_hash == 'c1b9f38170a7c90e96f027992eb7c62b': 
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100')
-    if model_hash == 'c3448ec923fa0edf3d03a19e633faa53':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100')
-    if model_hash == '68aa2c8093d0080704b200d140f59e54':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
-        param_name_auto=str('3band_44100.json')
-    if model_hash == 'fdc83be5b798e4bd29fe00fe6600e147':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
-        param_name_auto=str('3band_44100_mid.json')
-    if model_hash == '2ce34bc92fd57f55db16b7a4def3d745':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
-        param_name_auto=str('3band_44100_mid.json')
-    if model_hash == '52fdca89576f06cf4340b74a4730ee5f':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100.json')
-    if model_hash == '41191165b05d38fc77f072fa9e8e8a30':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100.json')
-    if model_hash == '89e83b511ad474592689e562d5b1f80e':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
-        param_name_auto=str('2band_32000.json')
-    if model_hash == '0b954da81d453b716b114d6d7c95177f':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
-        param_name_auto=str('2band_32000.json')
+    for type in list(data):
+        for model in list(data[type][0]):
+            for i in range(len(data[type][0][model])):
+                if str(data[type][0][model][i]["hash_name"]) == model_hash:
+                    flag = True
+                elif str(data[type][0][model][i]["hash_name"]) in ModelName:
+                    flag = True

-    #v4 Models    
-    if model_hash == '6a00461c51c2920fd68937d4609ed6c8':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
-        param_name_auto=str('1band_sr16000_hl512')
-    if model_hash == '0ab504864d20f1bd378fe9c81ef37140':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
-        param_name_auto=str('1band_sr32000_hl512')
-    if model_hash == '7dd21065bf91c10f7fccb57d7d83b07f':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
-        param_name_auto=str('1band_sr32000_hl512')
-    if model_hash == '80ab74d65e515caa3622728d2de07d23':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
-        param_name_auto=str('1band_sr32000_hl512')
-    if model_hash == 'edc115e7fc523245062200c00caa847f':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
-        param_name_auto=str('1band_sr33075_hl384')
-    if model_hash == '28063e9f6ab5b341c5f6d3c67f2045b7':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
-        param_name_auto=str('1band_sr33075_hl384')
-    if model_hash == 'b58090534c52cbc3e9b5104bad666ef2':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
-        param_name_auto=str('1band_sr44100_hl512')
-    if model_hash == '0cdab9947f1b0928705f518f3c78ea8f':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
-        param_name_auto=str('1band_sr44100_hl512')
-    if model_hash == 'ae702fed0238afb5346db8356fe25f13':  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
-        param_name_auto=str('1band_sr44100_hl1024')                        
-    #User Models
-
-    #1 Band
-    if '1band_sr16000_hl512' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
-        param_name_auto=str('1band_sr16000_hl512')
-    if '1band_sr32000_hl512' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
-        param_name_auto=str('1band_sr32000_hl512')
-    if '1band_sr33075_hl384' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
-        param_name_auto=str('1band_sr33075_hl384')
-    if '1band_sr44100_hl256' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json')
-        param_name_auto=str('1band_sr44100_hl256')
-    if '1band_sr44100_hl512' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
-        param_name_auto=str('1band_sr44100_hl512')
-    if '1band_sr44100_hl1024' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
-        param_name_auto=str('1band_sr44100_hl1024')
-        
-    #2 Band
-    if '2band_44100_lofi' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json')
-        param_name_auto=str('2band_44100_lofi')
-    if '2band_32000' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
-        param_name_auto=str('2band_32000')
-    if '2band_48000' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_48000.json')
-        param_name_auto=str('2band_48000')
-        
-    #3 Band   
-    if '3band_44100' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
-        param_name_auto=str('3band_44100')
-    if '3band_44100_mid' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
-        param_name_auto=str('3band_44100_mid')
-    if '3band_44100_msb2' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
-        param_name_auto=str('3band_44100_msb2')
-        
-    #4 Band    
-    if '4band_44100' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
-        param_name_auto=str('4band_44100')
-    if '4band_44100_mid' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_mid.json')
-        param_name_auto=str('4band_44100_mid')
-    if '4band_44100_msb' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb.json')
-        param_name_auto=str('4band_44100_msb')
-    if '4band_44100_msb2' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json')
-        param_name_auto=str('4band_44100_msb2')
-    if '4band_44100_reverse' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json')
-        param_name_auto=str('4band_44100_reverse')
-    if '4band_44100_sw' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_sw.json') 
-        param_name_auto=str('4band_44100_sw')
-    if '4band_v2' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
-        param_name_auto=str('4band_v2')
-    if '4band_v2_sn' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
-        param_name_auto=str('4band_v2_sn')
-    if 'tmodelparam' in ModelName:  
-        model_params_auto=str('uvr5_pack/lib_v5/modelparams/tmodelparam.json')
-        param_name_auto=str('User Model Param Set')
+                if flag:
+                    model_params_auto = data[type][0][model][i]["model_params"]
+                    param_name_auto = data[type][0][model][i]["param_name"]
+                    if type == "equivalent":
+                        return param_name_auto, model_params_auto
+                    else:
+                        flag = False
    return param_name_auto, model_params_auto
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@ -4,6 +4,8 @@ import torch.nn.functional as F
 from config import x_pad, x_query, x_center, x_max
 import scipy.signal as signal
 import pyworld, os, traceback, faiss
+
+
 class VC(object):
    def __init__(self, tgt_sr, device, is_half):
        self.sr = 16000  # hubert输入采样率
@ -23,14 +25,23 @@ class VC(object):
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
-        if(f0_method=="pm"):
-            f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
-                time_step=time_step / 1000, voicing_threshold=0.6,
-                pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+        if f0_method == "pm":
+            f0 = (
+                parselmouth.Sound(x, self.sr)
+                .to_pitch_ac(
+                    time_step=time_step / 1000,
+                    voicing_threshold=0.6,
+                    pitch_floor=f0_min,
+                    pitch_ceiling=f0_max,
+                )
+                .selected_array["frequency"]
+            )
            pad_size = (p_len - len(f0) + 1) // 2
-            if(pad_size>0 or p_len - len(f0) - pad_size>0):
-                f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-        elif(f0_method=="harvest"):
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
+        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.sr,
@ -43,24 +54,44 @@ class VC(object):
        f0 *= pow(2, f0_up_key / 12)
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # 每秒f0点数
-        if (inp_f0 is not None):
-            delta_t=np.round((inp_f0[:,0].max()-inp_f0[:,0].min())*tf0+1).astype("int16")
-            replace_f0=np.interp(list(range(delta_t)), inp_f0[:, 0]*100, inp_f0[:, 1])
+        if inp_f0 is not None:
+            delta_t = np.round(
+                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+            ).astype("int16")
+            replace_f0 = np.interp(
+                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+            )
            shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
            f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(np.int)
        return f0_coarse, f0bak  # 1-0

-    def vc(self,model,net_g,sid,audio0,pitch,pitchf,times,index,big_npy,index_rate):#,file_index,file_big_npy
+    def vc(
+        self,
+        model,
+        net_g,
+        sid,
+        audio0,
+        pitch,
+        pitchf,
+        times,
+        index,
+        big_npy,
+        index_rate,
+    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
-        if(self.is_half):feats=feats.half()
-        else:feats=feats.float()
+        if self.is_half:
+            feats = feats.half()
+        else:
+            feats = feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
@ -77,37 +108,79 @@ class VC(object):
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0])

-        if(isinstance(index,type(None))==False and isinstance(big_npy,type(None))==False and index_rate!=0):
+        if (
+            isinstance(index, type(None)) == False
+            and isinstance(big_npy, type(None)) == False
+            and index_rate != 0
+        ):
            npy = feats[0].cpu().numpy()
-            if(self.is_half):npy=npy.astype("float32")
+            if self.is_half:
+                npy = npy.astype("float32")
            _, I = index.search(npy, 1)
            npy = big_npy[I.squeeze()]
-            if(self.is_half):npy=npy.astype("float16")
-            feats = torch.from_numpy(npy).unsqueeze(0).to(self.device)*index_rate + (1-index_rate)*feats
+            if self.is_half:
+                npy = npy.astype("float16")
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                + (1 - index_rate) * feats
+            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
-        if(feats.shape[1]<p_len):
+        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
-            if(pitch!=None and pitchf!=None):
+            if pitch != None and pitchf != None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
-            if(pitch!=None and pitchf!=None):
-                audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+            if pitch != None and pitchf != None:
+                audio1 = (
+                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                    .astype(np.int16)
+                )
            else:
-                audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+                audio1 = (
+                    (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                    .astype(np.int16)
+                )
        del feats, p_len, padding_mask
-        if torch.cuda.is_available(): torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
        t2 = ttime()
-        times[0] += (t1 - t0)
-        times[2] += (t2 - t1)
+        times[0] += t1 - t0
+        times[2] += t2 - t1
        return audio1

-    def pipeline(self,model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=None):
-        if(file_big_npy!=""and file_index!=""and os.path.exists(file_big_npy)==True and os.path.exists(file_index)==True and index_rate!=0):
+    def pipeline(
+        self,
+        model,
+        net_g,
+        sid,
+        audio,
+        times,
+        f0_up_key,
+        f0_method,
+        file_index,
+        file_big_npy,
+        index_rate,
+        if_f0,
+        f0_file=None,
+    ):
+        if (
+            file_big_npy != ""
+            and file_index != ""
+            and os.path.exists(file_big_npy) == True
+            and os.path.exists(file_index) == True
+            and index_rate != 0
+        ):
            try:
                index = faiss.read_index(file_index)
                big_npy = np.load(file_big_npy)
@ -116,50 +189,113 @@ class VC(object):
                index = big_npy = None
        else:
            index = big_npy = None
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
-        if(audio_pad.shape[0]>self.t_max):
+        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
-            for i in range(self.window): audio_sum += audio_pad[i:i - self.window]
-            for t in range(self.t_center, audio.shape[0],self.t_center):opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query:t + self.t_query]) == np.abs(audio_sum[t - self.t_query:t + self.t_query]).min())[0][0])
+            for i in range(self.window):
+                audio_sum += audio_pad[i : i - self.window]
+            for t in range(self.t_center, audio.shape[0], self.t_center):
+                opt_ts.append(
+                    t
+                    - self.t_query
+                    + np.where(
+                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                    )[0][0]
+                )
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect')
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
-        if(hasattr(f0_file,'name') ==True):
+        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
-                for line in lines:inp_f0.append([float(i)for i in line.split(",")])
+                for line in lines:
+                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
-        if(if_f0==1):
+        if if_f0 == 1:
            pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
-        times[1] += (t2 - t1)
+        times[1] += t2 - t1
        for t in opt_ts:
            t = t // self.window * self.window
-            if (if_f0 == 1):
-                audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],pitch[:,s//self.window:(t+self.t_pad2)//self.window],pitchf[:,s//self.window:(t+self.t_pad2)//self.window],times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+            if if_f0 == 1:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
            else:
-                audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
            s = t
-        if (if_f0 == 1):
-            audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],pitch[:,t//self.window:]if t is not None else pitch,pitchf[:,t//self.window:]if t is not None else pitchf,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+        if if_f0 == 1:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
        else:
-            audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    None,
+                    None,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
        audio_opt = np.concatenate(audio_opt)
        del pitch, pitchf, sid
-        if torch.cuda.is_available(): torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
        return audio_opt
--- a/webui_locale.py
+++ b/webui_locale.py
@ -1,16 +1,18 @@
 import locale
 import json

+
 def load_language_list(language):
    with open(f"./locale/{language}.json", "r", encoding="utf-8") as f:
        language_list = json.load(f)
    return language_list

+
 class I18nAuto:
    def __init__(self, language=None):
        if language is None:
-            language = 'auto'
-        if language == 'auto':
+            language = "auto"
+        if language == "auto":
            language = locale.getdefaultlocale()[0]
        self.language = language
        print("Use Language:", language)