rewrite infer_cli

2025-04-25 05:58:59 +08:00 · 2023-08-28 01:14:36 +09:00 · 2023-08-28 01:14:36 +09:00 · 34de3492f3
commit 34de3492f3
parent aa37b92bd3
3 changed files with 73 additions and 278 deletions
--- a/infer_cli.py
+++ b/infer_cli.py
@ -1,272 +0,0 @@
-from scipy.io import wavfile
-from fairseq import checkpoint_utils
-from lib.audio import load_audio
-from lib.infer_pack.models import (
-    SynthesizerTrnMs256NSFsid,
-    SynthesizerTrnMs256NSFsid_nono,
-    SynthesizerTrnMs768NSFsid,
-    SynthesizerTrnMs768NSFsid_nono,
-)
-from lib.train.vc_infer_pipeline import VC
-from multiprocessing import cpu_count
-import numpy as np
-import torch
-import sys
-import glob
-import argparse
-import os
-import sys
-import pdb
-import torch
-
-now_dir = os.getcwd()
-sys.path.append(now_dir)
-
-####
-# USAGE
-#
-# In your Terminal or CMD or whatever
-# python infer_cli.py [TRANSPOSE_VALUE] "[INPUT_PATH]" "[OUTPUT_PATH]" "[MODEL_PATH]" "[INDEX_FILE_PATH]" "[INFERENCE_DEVICE]" "[METHOD]"
-
-using_cli = False
-device = "cuda:0"
-is_half = False
-
-if len(sys.argv) > 0:
-    f0_up_key = int(sys.argv[1])  # transpose value
-    input_path = sys.argv[2]
-    output_path = sys.argv[3]
-    model_path = sys.argv[4]
-    file_index = sys.argv[5]  # .index file
-    device = sys.argv[6]
-    f0_method = sys.argv[7]  # pm or harvest or crepe
-
-    using_cli = True
-
-    # file_index2=sys.argv[8]
-    # index_rate=float(sys.argv[10]) #search feature ratio
-    # filter_radius=float(sys.argv[11]) #median filter
-    # resample_sr=float(sys.argv[12]) #resample audio in post processing
-    # rms_mix_rate=float(sys.argv[13]) #search feature
-    print(sys.argv)
-
-
-class Config:
-    def __init__(self, device, is_half):
-        self.device = device
-        self.is_half = is_half
-        self.n_cpu = 0
-        self.gpu_name = None
-        self.gpu_mem = None
-        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
-
-    def device_config(self) -> tuple:
-        if torch.cuda.is_available() and device != "cpu":
-            i_device = int(self.device.split(":")[-1])
-            self.gpu_name = torch.cuda.get_device_name(i_device)
-            if (
-                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
-                or "P40" in self.gpu_name.upper()
-                or "1060" in self.gpu_name
-                or "1070" in self.gpu_name
-                or "1080" in self.gpu_name
-            ):
-                print("16系/10系显卡和P40强制单精度")
-                self.is_half = False
-                for config_file in ["32k.json", "40k.json", "48k.json"]:
-                    with open(f"configs/{config_file}", "r") as f:
-                        strr = f.read().replace("true", "false")
-                    with open(f"configs/{config_file}", "w") as f:
-                        f.write(strr)
-                with open("trainset_preprocess_pipeline_print.py", "r") as f:
-                    strr = f.read().replace("3.7", "3.0")
-                with open("trainset_preprocess_pipeline_print.py", "w") as f:
-                    f.write(strr)
-            else:
-                self.gpu_name = None
-            self.gpu_mem = int(
-                torch.cuda.get_device_properties(i_device).total_memory
-                / 1024
-                / 1024
-                / 1024
-                + 0.4
-            )
-            if self.gpu_mem <= 4:
-                with open("trainset_preprocess_pipeline_print.py", "r") as f:
-                    strr = f.read().replace("3.7", "3.0")
-                with open("trainset_preprocess_pipeline_print.py", "w") as f:
-                    f.write(strr)
-        elif torch.backends.mps.is_available():
-            print("没有发现支持的N卡, 使用MPS进行推理")
-            self.device = "mps"
-        else:
-            print("没有发现支持的N卡, 使用CPU进行推理")
-            self.device = "cpu"
-            self.is_half = False
-
-        if self.n_cpu == 0:
-            self.n_cpu = cpu_count()
-
-        if self.is_half:
-            # 6G显存配置
-            x_pad = 3
-            x_query = 10
-            x_center = 60
-            x_max = 65
-        else:
-            # 5G显存配置
-            x_pad = 1
-            x_query = 6
-            x_center = 38
-            x_max = 41
-
-        if self.gpu_mem != None and self.gpu_mem <= 4:
-            x_pad = 1
-            x_query = 5
-            x_center = 30
-            x_max = 32
-
-        return x_pad, x_query, x_center, x_max
-
-
-config = Config(device, is_half)
-now_dir = os.getcwd()
-sys.path.append(now_dir)
-
-hubert_model = None
-
-
-def load_hubert():
-    global hubert_model
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-        ["hubert_base.pt"],
-        suffix="",
-    )
-    hubert_model = models[0]
-    hubert_model = hubert_model.to(config.device)
-    if config.is_half:
-        hubert_model = hubert_model.half()
-    else:
-        hubert_model = hubert_model.float()
-    hubert_model.eval()
-
-
-def vc_single(
-    sid=0,
-    input_audio_path=None,
-    f0_up_key=0,
-    f0_file=None,
-    f0_method="pm",
-    file_index="",  # .index file
-    file_index2="",
-    # file_big_npy,
-    index_rate=1.0,
-    filter_radius=3,
-    resample_sr=0,
-    rms_mix_rate=1.0,
-    model_path="",
-    output_path="",
-    protect=0.33,
-):
-    global tgt_sr, net_g, vc, hubert_model, version
-    get_vc(model_path)
-    if input_audio_path is None:
-        return "You need to upload an audio file", None
-
-    f0_up_key = int(f0_up_key)
-    audio = load_audio(input_audio_path, 16000)
-    audio_max = np.abs(audio).max() / 0.95
-
-    if audio_max > 1:
-        audio /= audio_max
-    times = [0, 0, 0]
-
-    if hubert_model == None:
-        load_hubert()
-
-    if_f0 = cpt.get("f0", 1)
-
-    file_index = (
-        (
-            file_index.strip(" ")
-            .strip('"')
-            .strip("\n")
-            .strip('"')
-            .strip(" ")
-            .replace("trained", "added")
-        )
-        if file_index != ""
-        else file_index2
-    )
-
-    audio_opt = vc.pipeline(
-        hubert_model,
-        net_g,
-        sid,
-        audio,
-        input_audio_path,
-        times,
-        f0_up_key,
-        f0_method,
-        file_index,
-        # file_big_npy,
-        index_rate,
-        if_f0,
-        filter_radius,
-        tgt_sr,
-        resample_sr,
-        rms_mix_rate,
-        version,
-        f0_file=f0_file,
-        protect=protect,
-    )
-    wavfile.write(output_path, tgt_sr, audio_opt)
-    return "processed"
-
-
-def get_vc(model_path):
-    global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version
-    print("loading pth %s" % model_path)
-    cpt = torch.load(model_path, map_location="cpu")
-    tgt_sr = cpt["config"][-1]
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-    if_f0 = cpt.get("f0", 1)
-    version = cpt.get("version", "v1")
-    if version == "v1":
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
-        else:
-            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-    elif version == "v2":
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
-        else:
-            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-    del net_g.enc_q
-    print(net_g.load_state_dict(cpt["weight"], strict=False))
-    net_g.eval().to(device)
-    if is_half:
-        net_g = net_g.half()
-    else:
-        net_g = net_g.float()
-    vc = VC(tgt_sr, config)
-    n_spk = cpt["config"][-3]
-    # return {"visible": True,"maximum": n_spk, "__type__": "update"}
-
-
-if using_cli:
-    vc_single(
-        sid=0,
-        input_audio_path=input_path,
-        f0_up_key=f0_up_key,
-        f0_file=None,
-        f0_method=f0_method,
-        file_index=file_index,
-        file_index2="",
-        index_rate=1,
-        filter_radius=3,
-        resample_sr=0,
-        rms_mix_rate=0,
-        model_path=model_path,
-        output_path=output_path,
-    )
--- a/tools/infer_batch_rvc.py
+++ b/tools/infer_batch_rvc.py
@ -16,7 +16,7 @@ from configs.config import Config
 from infer.modules.vc.modules import VC


-def args() -> tuple:
+def arg_parse() -> tuple:
    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
@ -39,6 +39,12 @@ def args() -> tuple:


 def main():
+    load_dotenv()
+    args = arg_parse()
+    config = Config()
+    config.device = args.device if args.device else config.device
+    config.is_half = args.is_half if args.is_half else config.is_half
+    vc = VC(config)
    vc.get_vc(args.model_name)
    audios = os.listdir(args.input_path)
    for file in tq.tqdm(audios):
@ -59,12 +65,8 @@ def main():
                args.protect,
            )
            out_path = os.path.join(args.opt_path, file)
-            wavfile.write(out_path, vc.tgt_sr, wav_opt[1])
+            wavfile.write(out_path, wav_opt[0], wav_opt[1])


 if __name__ == "__main__":
-    load_dotenv()
-    args = args()
-    config = Config()
-    vc = VC(config)
    main()
--- a/tools/infer_cli.py
+++ b/tools/infer_cli.py
@ -0,0 +1,65 @@
+import argparse
+import os
+import sys
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from scipy.io import wavfile
+
+from configs.config import Config
+from infer.modules.vc.modules import VC
+from dotenv import load_dotenv
+####
+# USAGE
+#
+# In your Terminal or CMD or whatever
+
+
+def arg_parse() -> tuple:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--f0up_key", type=int, default=0)
+    parser.add_argument("--input_path", type=str, help="input path")
+    parser.add_argument("--index_path", type=str, help="index path")
+    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+    parser.add_argument("--opt_path", type=str, help="opt path")
+    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+    parser.add_argument("--device", type=str, help="device")
+    parser.add_argument("--is_half", type=bool, help="use half -> True")
+    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+    parser.add_argument("--protect", type=float, default=0.33, help="protect")
+
+    args = parser.parse_args()
+    sys.argv = sys.argv[:1]
+
+    return args
+
+
+def main():
+    load_dotenv()
+    args = arg_parse()
+    config = Config()
+    config.device = args.device if args.device else config.device
+    config.is_half = args.is_half if args.is_half else config.is_half
+    vc = VC(config)
+    vc.get_vc(args.model_name)
+    _, wav_opt = vc.vc_single(
+        0,
+        args.input_path,
+        args.f0up_key,
+        None,
+        args.f0method,
+        args.index_path,
+        None,
+        args.index_rate,
+        args.filter_radius,
+        args.resample_sr,
+        args.rms_mix_rate,
+        args.protect,
+            )
+    wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])
+
+
+if __name__ == "__main__":
+    main()