From ed7b11eb49cde35f18a44262f311f64bd8116bc9 Mon Sep 17 00:00:00 2001 From: Ftps Date: Mon, 21 Aug 2023 20:53:11 +0900 Subject: [PATCH] train 1-2b --- assets/hubert/.gitignore | 2 + assets/rmvpe/.gitignore | 2 + i18n.py | 28 ---- infer-web.py | 87 ++++++----- infer/lib/train/process_ckpt.py | 3 +- infer/lib/train/utils.py | 4 +- .../modules/train/extract/extract_f0_print.py | 4 +- .../modules/train/extract/extract_f0_rmvpe.py | 4 +- .../train/extract/extract_f0_rmvpe_dml.py | 4 +- infer/modules/train/extract_feature_print.py | 135 ++++++++++++++++++ infer/modules/train/train.py | 21 +-- 11 files changed, 214 insertions(+), 80 deletions(-) create mode 100644 assets/hubert/.gitignore create mode 100644 assets/rmvpe/.gitignore delete mode 100644 i18n.py rename extract_f0_print.py => infer/modules/train/extract/extract_f0_print.py (94%) rename extract_f0_rmvpe.py => infer/modules/train/extract/extract_f0_rmvpe.py (93%) rename extract_f0_rmvpe_dml.py => infer/modules/train/extract/extract_f0_rmvpe_dml.py (93%) create mode 100644 infer/modules/train/extract_feature_print.py diff --git a/assets/hubert/.gitignore b/assets/hubert/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/hubert/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/assets/rmvpe/.gitignore b/assets/rmvpe/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/rmvpe/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/i18n.py b/i18n.py deleted file mode 100644 index 28b17c7..0000000 --- a/i18n.py +++ /dev/null @@ -1,28 +0,0 @@ -import locale -import json -import os - - -def load_language_list(language): - with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: - language_list = json.load(f) - return language_list - - -class I18nAuto: - def __init__(self, language=None): - if language in ["Auto", None]: - language = locale.getdefaultlocale()[ - 0 - ] # getlocale can't identify the system's language ((None, None)) - if not os.path.exists(f"./lib/i18n/{language}.json"): - language = "en_US" - self.language = language - # print("Use Language:", language) - self.language_map = load_language_list(language) - - def __call__(self, key): - return self.language_map.get(key, key) - - def print(self): - print("Use Language:", self.language) diff --git a/infer-web.py b/infer-web.py index cb0b592..5b12cea 100644 --- a/infer-web.py +++ b/infer-web.py @@ -20,8 +20,13 @@ import faiss import gradio as gr from configs.config import Config import fairseq -from i18n import I18nAuto -from lib.train.process_ckpt import change_info, extract_small_model, merge, show_info +from i18n.i18n import I18nAuto +from infer.lib.train.process_ckpt import ( + change_info, + extract_small_model, + merge, + show_info, +) from sklearn.cluster import MiniBatchKMeans from dotenv import load_dotenv @@ -197,7 +202,7 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): f.close() cmd = ( config.python_cmd - + ' trainset_preprocess_pipeline_print.py "%s" %s %s "%s/logs/%s" ' + + ' infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" ' % (trainset_dir, sr, n_p, now_dir, exp_dir) + str(config.noparallel) ) @@ -232,11 +237,15 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp f.close() if if_f0: if f0method != "rmvpe_gpu": - cmd = config.python_cmd + ' extract_f0_print.py "%s/logs/%s" %s %s' % ( - now_dir, - exp_dir, - n_p, - f0method, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' + % ( + 
now_dir, + exp_dir, + n_p, + f0method, + ) ) print(cmd) p = Popen( @@ -259,7 +268,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp for idx, n_g in enumerate(gpus_rmvpe): cmd = ( config.python_cmd - + ' extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' + + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' % (leng, idx, n_g, now_dir, exp_dir, config.is_half) ) print(cmd) @@ -277,9 +286,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp ), ).start() else: - cmd = config.python_cmd + ' extract_f0_rmvpe_dml.py "%s/logs/%s" ' % ( - now_dir, - exp_dir, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" ' + % ( + now_dir, + exp_dir, + ) ) print(cmd) p = Popen( @@ -312,7 +325,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp for idx, n_g in enumerate(gpus): cmd = ( config.python_cmd - + ' extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' + + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' % ( config.device, leng, @@ -353,26 +366,26 @@ def change_sr2(sr2, if_f0_3, version19): path_str = "" if version19 == "v1" else "_v2" f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if if_pretrained_discriminator_exist else "", ) @@ -389,26 +402,26 @@ def change_version19(sr2, if_f0_3, version19): ) f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if 
if_pretrained_discriminator_exist else "", to_return_sr2, @@ -418,37 +431,37 @@ def change_version19(sr2, if_f0_3, version19): def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 path_str = "" if version19 == "v1" else "_v2" if_pretrained_generator_exist = os.access( - "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/f0G%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/f0D%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if if_f0_3: return ( {"visible": True, "__type__": "update"}, - "pretrained%s/f0G%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/f0D%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2) if if_pretrained_discriminator_exist else "", ) return ( {"visible": False, "__type__": "update"}, - ("pretrained%s/G%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/G%s.pth" % (path_str, sr2)) if if_pretrained_generator_exist else "", - ("pretrained%s/D%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/D%s.pth" % (path_str, sr2)) if if_pretrained_discriminator_exist else "", ) @@ -548,7 +561,7 @@ def click_train( if gpus16: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -568,7 +581,7 @@ def click_train( else: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -1482,12 +1495,12 @@ with gr.Blocks(title="RVC WebUI") as app: with gr.Row(): pretrained_G14 = gr.Textbox( label=i18n("加载预训练底模G路径"), - value="pretrained_v2/f0G40k.pth", + value="assets/pretrained_v2/f0G40k.pth", interactive=True, ) pretrained_D15 = gr.Textbox( label=i18n("加载预训练底模D路径"), - value="pretrained_v2/f0D40k.pth", + value="assets/pretrained_v2/f0D40k.pth", interactive=True, ) sr2.change( diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index a48ca61..ad32b44 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -1,7 +1,6 @@ import torch, traceback, os, sys -now_dir = os.getcwd() -sys.path.append(now_dir) + from collections import OrderedDict from i18n.i18n import I18nAuto diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index 9c0fb5c..337422b 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -362,9 +362,9 @@ def get_hparams(init=True): os.makedirs(experiment_dir) if args.version == "v1" or args.sample_rate == "40k": - config_path = "configs/%s.json" % args.sample_rate + config_path = "configs/v1/%s.json" % args.sample_rate else: - config_path = "configs/%s_v2.json" % args.sample_rate + config_path = "configs/v2/%s.json" % 
args.sample_rate config_save_path = os.path.join(experiment_dir, "config.json") if init: with open(config_path, "r") as f: diff --git a/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py similarity index 94% rename from extract_f0_print.py rename to infer/modules/train/extract/extract_f0_print.py index 4f6c806..d95548e 100644 --- a/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -79,7 +79,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cpu") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py similarity index 93% rename from extract_f0_rmvpe.py rename to infer/modules/train/extract/extract_f0_rmvpe.py index 00ca16c..33517e0 100644 --- a/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -42,7 +42,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=is_half, device="cuda") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py similarity index 93% rename from extract_f0_rmvpe_dml.py rename to infer/modules/train/extract/extract_f0_rmvpe_dml.py index 0de50c5..744c69f 100644 --- a/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -40,7 +40,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device=device) + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device=device + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py new file mode 100644 index 0000000..32e0492 --- /dev/null +++ b/infer/modules/train/extract_feature_print.py @@ -0,0 +1,135 @@ +import os, sys, traceback + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + +device = sys.argv[1] +n_part = int(sys.argv[2]) +i_part = int(sys.argv[3]) +if len(sys.argv) == 6: + exp_dir = sys.argv[4] + version = sys.argv[5] +else: + i_gpu = sys.argv[4] + exp_dir = sys.argv[5] + os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) + version = sys.argv[6] +import torch +import torch.nn.functional as F +import soundfile as sf +import numpy as np +import fairseq + +if "privateuseone" not in device: + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" +else: + import torch_directml + + device = torch_directml.device(torch_directml.default_device()) + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +printt(sys.argv) +model_path = "assets/hubert/hubert_base.pt" + +printt(exp_dir) +wavPath = "%s/1_16k_wavs" % exp_dir +outPath = ( + "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % 
exp_dir +) +os.makedirs(outPath, exist_ok=True) + + +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats + + +# HuBERT model +printt("load model(s) from {}".format(model_path)) +# if hubert model is exist +if os.access(model_path, os.F_OK) == False: + printt( + "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" + % model_path + ) + exit(0) +models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", +) +model = models[0] +model = model.to(device) +printt("move model to %s" % device) +if device not in ["mps", "cpu"]: + model = model.half() +model.eval() + +todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1, len(todo) // 10) # 最多打印十条 +if len(todo) == 0: + printt("no-feature-todo") +else: + printt("all-feature-%s" % len(todo)) + for idx, file in enumerate(todo): + try: + if file.endswith(".wav"): + wav_path = "%s/%s" % (wavPath, file) + out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) + + if os.path.exists(out_path): + continue + + feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.half().to(device) + if device not in ["mps", "cpu"] + else feats.to(device), + "padding_mask": padding_mask.to(device), + "output_layer": 9 if version == "v1" else 12, # layer 9 + } + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = ( + model.final_proj(logits[0]) if version == "v1" else logits[0] + ) + + feats = feats.squeeze(0).float().cpu().numpy() + if np.isnan(feats).sum() == 0: + np.save(out_path, feats, allow_pickle=False) + else: + printt("%s-contains nan" % file) + if idx % n == 0: + printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) + except: + printt(traceback.format_exc()) + printt("all-feature-done") diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index c1bdf11..3dca6c7 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -3,7 +3,7 @@ import os, sys now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) -from lib.train import utils +from infer.lib.train import utils import datetime hps = utils.get_hparams() @@ -22,10 +22,10 @@ import torch.multiprocessing as mp import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler -from lib.infer_pack import commons +from infer.lib.infer_pack import commons from time import sleep from time import time as ttime -from lib.train.data_utils import ( +from infer.lib.train.data_utils import ( TextAudioLoaderMultiNSFsid, TextAudioLoader, TextAudioCollateMultiNSFsid, @@ -34,20 +34,25 @@ from lib.train.data_utils import ( ) if hps.version == "v1": - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as RVC_Model_f0, SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminator, ) else: - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs768NSFsid as RVC_Model_f0, 
SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, ) -from lib.train.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from lib.train.process_ckpt import savee +from infer.lib.train.losses import ( + generator_loss, + discriminator_loss, + feature_loss, + kl_loss, +) +from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from infer.lib.train.process_ckpt import savee global_step = 0
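
Note (not part of the diff): the relocated infer/modules/train/extract_feature_print.py takes its configuration from positional arguments — device, number of worker parts, part index, optional GPU index, experiment directory, and model version — exactly as parsed at the top of the new file, and infer-web.py assembles the matching command string. Below is a minimal standalone-invocation sketch following that argv order; the experiment directory name and the bare "python" interpreter are assumptions, and the script additionally expects assets/hubert/hubert_base.pt and an exp_dir containing 1_16k_wavs/.

import subprocess

# Sketch only: argument order mirrors the sys.argv parsing in
# infer/modules/train/extract_feature_print.py
# (device, n_part, i_part, i_gpu, exp_dir, version).
device = "cuda"                  # "cpu", "mps", or a "privateuseone" DirectML device also work
n_part, i_part = 1, 0            # split the wav list across n_part processes; this is part 0
i_gpu = 0                        # exported as CUDA_VISIBLE_DEVICES inside the script
exp_dir = "logs/my-experiment"   # hypothetical path; must contain 1_16k_wavs/ with 16 kHz audio
version = "v2"                   # "v1" writes 256-dim features, "v2" writes 768-dim

subprocess.run(
    [
        "python", "infer/modules/train/extract_feature_print.py",
        device, str(n_part), str(i_part), str(i_gpu), exp_dir, version,
    ],
    check=True,
)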