From ed7b11eb49cde35f18a44262f311f64bd8116bc9 Mon Sep 17 00:00:00 2001 From: Ftps Date: Mon, 21 Aug 2023 20:53:11 +0900 Subject: [PATCH] train 1-2b --- assets/hubert/.gitignore | 2 + assets/rmvpe/.gitignore | 2 + i18n.py | 28 ---- infer-web.py | 87 ++++++----- infer/lib/train/process_ckpt.py | 3 +- infer/lib/train/utils.py | 4 +- .../modules/train/extract/extract_f0_print.py | 4 +- .../modules/train/extract/extract_f0_rmvpe.py | 4 +- .../train/extract/extract_f0_rmvpe_dml.py | 4 +- infer/modules/train/extract_feature_print.py | 135 ++++++++++++++++++ infer/modules/train/train.py | 21 +-- 11 files changed, 214 insertions(+), 80 deletions(-) create mode 100644 assets/hubert/.gitignore create mode 100644 assets/rmvpe/.gitignore delete mode 100644 i18n.py rename extract_f0_print.py => infer/modules/train/extract/extract_f0_print.py (94%) rename extract_f0_rmvpe.py => infer/modules/train/extract/extract_f0_rmvpe.py (93%) rename extract_f0_rmvpe_dml.py => infer/modules/train/extract/extract_f0_rmvpe_dml.py (93%) create mode 100644 infer/modules/train/extract_feature_print.py diff --git a/assets/hubert/.gitignore b/assets/hubert/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/hubert/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/assets/rmvpe/.gitignore b/assets/rmvpe/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/rmvpe/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/i18n.py b/i18n.py deleted file mode 100644 index 28b17c7..0000000 --- a/i18n.py +++ /dev/null @@ -1,28 +0,0 @@ -import locale -import json -import os - - -def load_language_list(language): - with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: - language_list = json.load(f) - return language_list - - -class I18nAuto: - def __init__(self, language=None): - if language in ["Auto", None]: - language = locale.getdefaultlocale()[ - 0 - ] # getlocale can't identify the system's language ((None, None)) - if not os.path.exists(f"./lib/i18n/{language}.json"): - language = "en_US" - self.language = language - # print("Use Language:", language) - self.language_map = load_language_list(language) - - def __call__(self, key): - return self.language_map.get(key, key) - - def print(self): - print("Use Language:", self.language) diff --git a/infer-web.py b/infer-web.py index cb0b592..5b12cea 100644 --- a/infer-web.py +++ b/infer-web.py @@ -20,8 +20,13 @@ import faiss import gradio as gr from configs.config import Config import fairseq -from i18n import I18nAuto -from lib.train.process_ckpt import change_info, extract_small_model, merge, show_info +from i18n.i18n import I18nAuto +from infer.lib.train.process_ckpt import ( + change_info, + extract_small_model, + merge, + show_info, +) from sklearn.cluster import MiniBatchKMeans from dotenv import load_dotenv @@ -197,7 +202,7 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): f.close() cmd = ( config.python_cmd - + ' trainset_preprocess_pipeline_print.py "%s" %s %s "%s/logs/%s" ' + + ' infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" ' % (trainset_dir, sr, n_p, now_dir, exp_dir) + str(config.noparallel) ) @@ -232,11 +237,15 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp f.close() if if_f0: if f0method != "rmvpe_gpu": - cmd = config.python_cmd + ' extract_f0_print.py "%s/logs/%s" %s %s' % ( - now_dir, - exp_dir, - n_p, - f0method, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' + % ( + 
now_dir, + exp_dir, + n_p, + f0method, + ) ) print(cmd) p = Popen( @@ -259,7 +268,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp for idx, n_g in enumerate(gpus_rmvpe): cmd = ( config.python_cmd - + ' extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' + + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' % (leng, idx, n_g, now_dir, exp_dir, config.is_half) ) print(cmd) @@ -277,9 +286,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp ), ).start() else: - cmd = config.python_cmd + ' extract_f0_rmvpe_dml.py "%s/logs/%s" ' % ( - now_dir, - exp_dir, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" ' + % ( + now_dir, + exp_dir, + ) ) print(cmd) p = Popen( @@ -312,7 +325,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp for idx, n_g in enumerate(gpus): cmd = ( config.python_cmd - + ' extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' + + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' % ( config.device, leng, @@ -353,26 +366,26 @@ def change_sr2(sr2, if_f0_3, version19): path_str = "" if version19 == "v1" else "_v2" f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if if_pretrained_discriminator_exist else "", ) @@ -389,26 +402,26 @@ def change_version19(sr2, if_f0_3, version19): ) f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if 
if_pretrained_discriminator_exist else "", to_return_sr2, @@ -418,37 +431,37 @@ def change_version19(sr2, if_f0_3, version19): def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 path_str = "" if version19 == "v1" else "_v2" if_pretrained_generator_exist = os.access( - "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/f0G%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/f0D%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if if_f0_3: return ( {"visible": True, "__type__": "update"}, - "pretrained%s/f0G%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/f0D%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2) if if_pretrained_discriminator_exist else "", ) return ( {"visible": False, "__type__": "update"}, - ("pretrained%s/G%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/G%s.pth" % (path_str, sr2)) if if_pretrained_generator_exist else "", - ("pretrained%s/D%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/D%s.pth" % (path_str, sr2)) if if_pretrained_discriminator_exist else "", ) @@ -548,7 +561,7 @@ def click_train( if gpus16: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -568,7 +581,7 @@ def click_train( else: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -1482,12 +1495,12 @@ with gr.Blocks(title="RVC WebUI") as app: with gr.Row(): pretrained_G14 = gr.Textbox( label=i18n("加载预训练底模G路径"), - value="pretrained_v2/f0G40k.pth", + value="assets/pretrained_v2/f0G40k.pth", interactive=True, ) pretrained_D15 = gr.Textbox( label=i18n("加载预训练底模D路径"), - value="pretrained_v2/f0D40k.pth", + value="assets/pretrained_v2/f0D40k.pth", interactive=True, ) sr2.change( diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index a48ca61..ad32b44 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -1,7 +1,6 @@ import torch, traceback, os, sys -now_dir = os.getcwd() -sys.path.append(now_dir) + from collections import OrderedDict from i18n.i18n import I18nAuto diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index 9c0fb5c..337422b 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -362,9 +362,9 @@ def get_hparams(init=True): os.makedirs(experiment_dir) if args.version == "v1" or args.sample_rate == "40k": - config_path = "configs/%s.json" % args.sample_rate + config_path = "configs/v1/%s.json" % args.sample_rate else: - config_path = "configs/%s_v2.json" % args.sample_rate + config_path = "configs/v2/%s.json" % 
args.sample_rate config_save_path = os.path.join(experiment_dir, "config.json") if init: with open(config_path, "r") as f: diff --git a/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py similarity index 94% rename from extract_f0_print.py rename to infer/modules/train/extract/extract_f0_print.py index 4f6c806..d95548e 100644 --- a/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -79,7 +79,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cpu") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py similarity index 93% rename from extract_f0_rmvpe.py rename to infer/modules/train/extract/extract_f0_rmvpe.py index 00ca16c..33517e0 100644 --- a/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -42,7 +42,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=is_half, device="cuda") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py similarity index 93% rename from extract_f0_rmvpe_dml.py rename to infer/modules/train/extract/extract_f0_rmvpe_dml.py index 0de50c5..744c69f 100644 --- a/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -40,7 +40,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device=device) + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device=device + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py new file mode 100644 index 0000000..32e0492 --- /dev/null +++ b/infer/modules/train/extract_feature_print.py @@ -0,0 +1,135 @@ +import os, sys, traceback + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + +device = sys.argv[1] +n_part = int(sys.argv[2]) +i_part = int(sys.argv[3]) +if len(sys.argv) == 6: + exp_dir = sys.argv[4] + version = sys.argv[5] +else: + i_gpu = sys.argv[4] + exp_dir = sys.argv[5] + os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) + version = sys.argv[6] +import torch +import torch.nn.functional as F +import soundfile as sf +import numpy as np +import fairseq + +if "privateuseone" not in device: + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" +else: + import torch_directml + + device = torch_directml.device(torch_directml.default_device()) + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +printt(sys.argv) +model_path = "assets/hubert/hubert_base.pt" + +printt(exp_dir) +wavPath = "%s/1_16k_wavs" % exp_dir +outPath = ( + "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % 
exp_dir +) +os.makedirs(outPath, exist_ok=True) + + +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats + + +# HuBERT model +printt("load model(s) from {}".format(model_path)) +# if hubert model is exist +if os.access(model_path, os.F_OK) == False: + printt( + "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" + % model_path + ) + exit(0) +models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", +) +model = models[0] +model = model.to(device) +printt("move model to %s" % device) +if device not in ["mps", "cpu"]: + model = model.half() +model.eval() + +todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1, len(todo) // 10) # 最多打印十条 +if len(todo) == 0: + printt("no-feature-todo") +else: + printt("all-feature-%s" % len(todo)) + for idx, file in enumerate(todo): + try: + if file.endswith(".wav"): + wav_path = "%s/%s" % (wavPath, file) + out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) + + if os.path.exists(out_path): + continue + + feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.half().to(device) + if device not in ["mps", "cpu"] + else feats.to(device), + "padding_mask": padding_mask.to(device), + "output_layer": 9 if version == "v1" else 12, # layer 9 + } + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = ( + model.final_proj(logits[0]) if version == "v1" else logits[0] + ) + + feats = feats.squeeze(0).float().cpu().numpy() + if np.isnan(feats).sum() == 0: + np.save(out_path, feats, allow_pickle=False) + else: + printt("%s-contains nan" % file) + if idx % n == 0: + printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) + except: + printt(traceback.format_exc()) + printt("all-feature-done") diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index c1bdf11..3dca6c7 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -3,7 +3,7 @@ import os, sys now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) -from lib.train import utils +from infer.lib.train import utils import datetime hps = utils.get_hparams() @@ -22,10 +22,10 @@ import torch.multiprocessing as mp import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler -from lib.infer_pack import commons +from infer.lib.infer_pack import commons from time import sleep from time import time as ttime -from lib.train.data_utils import ( +from infer.lib.train.data_utils import ( TextAudioLoaderMultiNSFsid, TextAudioLoader, TextAudioCollateMultiNSFsid, @@ -34,20 +34,25 @@ from lib.train.data_utils import ( ) if hps.version == "v1": - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as RVC_Model_f0, SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminator, ) else: - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs768NSFsid as RVC_Model_f0, 
SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, ) -from lib.train.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from lib.train.process_ckpt import savee +from infer.lib.train.losses import ( + generator_loss, + discriminator_loss, + feature_loss, + kl_loss, +) +from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from infer.lib.train.process_ckpt import savee global_step = 0
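
Note (not part of the diff): the relocated infer/modules/train/extract_feature_print.py takes its configuration from positional arguments — device, number of worker parts, part index, optional GPU index, experiment directory, and model version — exactly as parsed at the top of the new file, and infer-web.py assembles the matching command string. Below is a minimal standalone-invocation sketch following that argv order; the experiment directory name and the bare "python" interpreter are assumptions, and the script additionally expects assets/hubert/hubert_base.pt and an exp_dir containing 1_16k_wavs/.

import subprocess

# Sketch only: argument order mirrors the sys.argv parsing in
# infer/modules/train/extract_feature_print.py
# (device, n_part, i_part, i_gpu, exp_dir, version).
device = "cuda"                  # "cpu", "mps", or a "privateuseone" DirectML device also work
n_part, i_part = 1, 0            # split the wav list across n_part processes; this is part 0
i_gpu = 0                        # exported as CUDA_VISIBLE_DEVICES inside the script
exp_dir = "logs/my-experiment"   # hypothetical path; must contain 1_16k_wavs/ with 16 kHz audio
version = "v2"                   # "v1" writes 256-dim features, "v2" writes 768-dim

subprocess.run(
    [
        "python", "infer/modules/train/extract_feature_print.py",
        device, str(n_part), str(i_part), str(i_gpu), exp_dir, version,
    ],
    check=True,
)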