From 424932c46980969a14c539be14f9410fbc90f9e2 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 01:27:08 +0900 Subject: [PATCH] Delete duplicate files --- extract_feature_print.py | 135 ------- infer_batch_rvc.py | 216 ---------- infer_uvr5.py | 363 ----------------- lib/audio.py | 21 - lib/rmvpe.py | 692 --------------------------------- lib/slicer2.py | 260 ------------- lib/train/cmd.txt | 1 - lib/train/data_utils.py | 512 ------------------------ lib/train/losses.py | 58 --- lib/train/mel_processing.py | 130 ------- lib/train/process_ckpt.py | 259 ------------ lib/train/utils.py | 487 ----------------------- lib/train/vc_infer_pipeline.py | 449 --------------------- 13 files changed, 3583 deletions(-) delete mode 100644 extract_feature_print.py delete mode 100644 infer_batch_rvc.py delete mode 100644 infer_uvr5.py delete mode 100644 lib/audio.py delete mode 100644 lib/rmvpe.py delete mode 100644 lib/slicer2.py delete mode 100644 lib/train/cmd.txt delete mode 100644 lib/train/data_utils.py delete mode 100644 lib/train/losses.py delete mode 100644 lib/train/mel_processing.py delete mode 100644 lib/train/process_ckpt.py delete mode 100644 lib/train/utils.py delete mode 100644 lib/train/vc_infer_pipeline.py diff --git a/extract_feature_print.py b/extract_feature_print.py deleted file mode 100644 index e613de4..0000000 --- a/extract_feature_print.py +++ /dev/null @@ -1,135 +0,0 @@ -import os, sys, traceback - -os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" -os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" - -device = sys.argv[1] -n_part = int(sys.argv[2]) -i_part = int(sys.argv[3]) -if len(sys.argv) == 6: - exp_dir = sys.argv[4] - version = sys.argv[5] -else: - i_gpu = sys.argv[4] - exp_dir = sys.argv[5] - os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) - version = sys.argv[6] -import torch -import torch.nn.functional as F -import soundfile as sf -import numpy as np -import fairseq - -if "privateuseone" not in device: - device = "cpu" - if torch.cuda.is_available(): - device = "cuda" - elif torch.backends.mps.is_available(): - device = "mps" -else: - import torch_directml - - device = torch_directml.device(torch_directml.default_device()) - - def forward_dml(ctx, x, scale): - ctx.scale = scale - res = x.clone().detach() - return res - - fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml - -f = open("%s/extract_f0_feature.log" % exp_dir, "a+") - - -def printt(strr): - print(strr) - f.write("%s\n" % strr) - f.flush() - - -printt(sys.argv) -model_path = "hubert_base.pt" - -printt(exp_dir) -wavPath = "%s/1_16k_wavs" % exp_dir -outPath = ( - "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir -) -os.makedirs(outPath, exist_ok=True) - - -# wave must be 16k, hop_size=320 -def readwave(wav_path, normalize=False): - wav, sr = sf.read(wav_path) - assert sr == 16000 - feats = torch.from_numpy(wav).float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - if normalize: - with torch.no_grad(): - feats = F.layer_norm(feats, feats.shape) - feats = feats.view(1, -1) - return feats - - -# HuBERT model -printt("load model(s) from {}".format(model_path)) -# if hubert model is exist -if os.access(model_path, os.F_OK) == False: - printt( - "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" - % model_path - ) - exit(0) -models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( - [model_path], - suffix="", -) -model = models[0] -model = model.to(device) -printt("move model to %s" % device) -if device not in ["mps", "cpu"]: - model = model.half() -model.eval() - -todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] -n = max(1, len(todo) // 10) # 最多打印十条 -if len(todo) == 0: - printt("no-feature-todo") -else: - printt("all-feature-%s" % len(todo)) - for idx, file in enumerate(todo): - try: - if file.endswith(".wav"): - wav_path = "%s/%s" % (wavPath, file) - out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) - - if os.path.exists(out_path): - continue - - feats = readwave(wav_path, normalize=saved_cfg.task.normalize) - padding_mask = torch.BoolTensor(feats.shape).fill_(False) - inputs = { - "source": feats.half().to(device) - if device not in ["mps", "cpu"] - else feats.to(device), - "padding_mask": padding_mask.to(device), - "output_layer": 9 if version == "v1" else 12, # layer 9 - } - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = ( - model.final_proj(logits[0]) if version == "v1" else logits[0] - ) - - feats = feats.squeeze(0).float().cpu().numpy() - if np.isnan(feats).sum() == 0: - np.save(out_path, feats, allow_pickle=False) - else: - printt("%s-contains nan" % file) - if idx % n == 0: - printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) - except: - printt(traceback.format_exc()) - printt("all-feature-done") diff --git a/infer_batch_rvc.py b/infer_batch_rvc.py deleted file mode 100644 index 3fc9a05..0000000 --- a/infer_batch_rvc.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -v1 -runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\logs\mi-test\added_IVF677_Flat_nprobe_7.index" harvest "E:\codes\py39\RVC-beta\output" "E:\codes\py39\test-20230416b\weights\mi-test.pth" 0.66 cuda:0 True 3 0 1 0.33 -v2 -runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\test-20230416b\logs\mi-test-v2\aadded_IVF677_Flat_nprobe_1_v2.index" harvest "E:\codes\py39\RVC-beta\output_v2" "E:\codes\py39\test-20230416b\weights\mi-test-v2.pth" 0.66 cuda:0 True 3 0 1 0.33 -""" -import os, sys, pdb, torch - -now_dir = os.getcwd() -sys.path.append(now_dir) -import sys -import torch -import tqdm as tq -from multiprocessing import cpu_count - - -class Config: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 0 - self.gpu_name = None - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if torch.cuda.is_available(): - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - if ( - ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) - or "P40" in self.gpu_name.upper() - or "1060" in self.gpu_name - or "1070" in self.gpu_name - or "1080" in self.gpu_name - ): - print("16系/10系显卡和P40强制单精度") - self.is_half = False - for config_file in ["32k.json", "40k.json", "48k.json"]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - else: - self.gpu_name = None - self.gpu_mem = int( - torch.cuda.get_device_properties(i_device).total_memory - / 1024 - / 1024 - / 1024 - + 0.4 - ) - if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - elif torch.backends.mps.is_available(): - print("没有发现支持的N卡, 使用MPS进行推理") - self.device = "mps" - else: - print("没有发现支持的N卡, 使用CPU进行推理") - self.device = "cpu" - self.is_half = True - - if self.n_cpu == 0: - self.n_cpu = cpu_count() - - if self.is_half: - # 6G显存配置 - x_pad = 3 - x_query = 10 - x_center = 60 - x_max = 65 - else: - # 5G显存配置 - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 - - if self.gpu_mem != None and self.gpu_mem <= 4: - x_pad = 1 - x_query = 5 - x_center = 30 - x_max = 32 - - return x_pad, x_query, x_center, x_max - - -f0up_key = sys.argv[1] -input_path = sys.argv[2] -index_path = sys.argv[3] -f0method = sys.argv[4] # harvest or pm -opt_path = sys.argv[5] -model_path = sys.argv[6] -index_rate = float(sys.argv[7]) -device = sys.argv[8] -is_half = sys.argv[9].lower() != "false" -filter_radius = int(sys.argv[10]) -resample_sr = int(sys.argv[11]) -rms_mix_rate = float(sys.argv[12]) -protect = float(sys.argv[13]) -print(sys.argv) -config = Config(device, is_half) -now_dir = os.getcwd() -sys.path.append(now_dir) -from lib.train.vc_infer_pipeline import VC -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.audio import load_audio -from fairseq import checkpoint_utils -from scipy.io import wavfile - -hubert_model = None - - -def load_hubert(hubert_model_path="hubert_base.pt"): - global hubert_model - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( - [hubert_model_path], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(device) - if is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single(sid, input_audio, f0_up_key, f0_file, f0_method, file_index, index_rate): - global tgt_sr, net_g, vc, hubert_model, version - if input_audio is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - audio = load_audio(input_audio, 16000) - times = [0, 0, 0] - if hubert_model == None: - load_hubert() - if_f0 = cpt.get("f0", 1) - # audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio, - times, - f0_up_key, - f0_method, - file_index, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - print(times) - return audio_opt - - -def get_vc(model_path): - global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version - print("loading pth %s" % model_path) - cpt = torch.load(model_path, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: # - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) # 不加这一行清不干净,真奇葩 - net_g.eval().to(device) - if is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - # return {"visible": True,"maximum": n_spk, "__type__": "update"} - - -if __name__ == "__main__": - get_vc(model_path) - audios = os.listdir(input_path) - for file in tq.tqdm(audios): - if file.endswith(".wav"): - file_path = os.path.join(input_path, file) - wav_opt = vc_single( - 0, file_path, f0up_key, None, f0method, index_path, index_rate - ) - out_path = os.path.join(opt_path, file) - wavfile.write(out_path, tgt_sr, wav_opt) diff --git a/infer_uvr5.py b/infer_uvr5.py deleted file mode 100644 index 0ffdb5d..0000000 --- a/infer_uvr5.py +++ /dev/null @@ -1,363 +0,0 @@ -import os, sys, torch, warnings, pdb - -now_dir = os.getcwd() -sys.path.append(now_dir) -from json import load as ll - -warnings.filterwarnings("ignore") -import librosa -import importlib -import numpy as np -import hashlib, math -from tqdm import tqdm -from lib.uvr5_pack.lib_v5 import spec_utils -from lib.uvr5_pack.utils import _get_name_params, inference -from lib.uvr5_pack.lib_v5.model_param_init import ModelParameters -import soundfile as sf -from lib.uvr5_pack.lib_v5.nets_new import CascadedNet -from lib.uvr5_pack.lib_v5 import nets_61968KB as nets - - -class _audio_pre_: - def __init__(self, agg, model_path, device, is_half): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": False, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") - model = nets.CascadedASPPNet(mp.param["bins"] * 2) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"): - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - print("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - print("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - - -class _audio_pre_new: - def __init__(self, agg, model_path, device, is_half): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": False, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") - nout = 64 if "DeReverb" in model_path else 48 - model = CascadedNet(mp.param["bins"] * 2, nout) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac" - ): # 3个VR模型vocal和ins是反的 - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - print("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - print("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - - -if __name__ == "__main__": - device = "cuda" - is_half = True - # model_path = "uvr5_weights/2_HP-UVR.pth" - # model_path = "uvr5_weights/VR-DeEchoDeReverb.pth" - # model_path = "uvr5_weights/VR-DeEchoNormal.pth" - model_path = "uvr5_weights/DeEchoNormal.pth" - # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True,agg=10) - pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=True, agg=10) - audio_path = "雪雪伴奏对消HP5.wav" - save_path = "opt" - pre_fun._path_audio_(audio_path, save_path, save_path) diff --git a/lib/audio.py b/lib/audio.py deleted file mode 100644 index 776939d..0000000 --- a/lib/audio.py +++ /dev/null @@ -1,21 +0,0 @@ -import ffmpeg -import numpy as np - - -def load_audio(file, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. - file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() diff --git a/lib/rmvpe.py b/lib/rmvpe.py deleted file mode 100644 index e5fa613..0000000 --- a/lib/rmvpe.py +++ /dev/null @@ -1,692 +0,0 @@ -import torch, numpy as np, pdb -import torch.nn as nn -import torch.nn.functional as F -import torch, pdb -import numpy as np -import torch.nn.functional as F -from scipy.signal import get_window -from librosa.util import pad_center, tiny, normalize - - -###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py -def window_sumsquare( - window, - n_frames, - hop_length=200, - win_length=800, - n_fft=800, - dtype=np.float32, - norm=None, -): - """ - # from librosa 0.6 - Compute the sum-square envelope of a window function at a given hop length. - This is used to estimate modulation effects induced by windowing - observations in short-time fourier transforms. - Parameters - ---------- - window : string, tuple, number, callable, or list-like - Window specification, as in `get_window` - n_frames : int > 0 - The number of analysis frames - hop_length : int > 0 - The number of samples to advance between frames - win_length : [optional] - The length of the window function. By default, this matches `n_fft`. - n_fft : int > 0 - The length of each analysis frame. - dtype : np.dtype - The data type of the output - Returns - ------- - wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` - The sum-squared envelope of the window function - """ - if win_length is None: - win_length = n_fft - - n = n_fft + hop_length * (n_frames - 1) - x = np.zeros(n, dtype=dtype) - - # Compute the squared window at the desired length - win_sq = get_window(window, win_length, fftbins=True) - win_sq = normalize(win_sq, norm=norm) ** 2 - win_sq = pad_center(win_sq, n_fft) - - # Fill the envelope - for i in range(n_frames): - sample = i * hop_length - x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] - return x - - -class STFT(torch.nn.Module): - def __init__( - self, filter_length=1024, hop_length=512, win_length=None, window="hann" - ): - """ - This module implements an STFT using 1D convolution and 1D transpose convolutions. - This is a bit tricky so there are some cases that probably won't work as working - out the same sizes before and after in all overlap add setups is tough. Right now, - this code should work with hop lengths that are half the filter length (50% overlap - between frames). - - Keyword Arguments: - filter_length {int} -- Length of filters used (default: {1024}) - hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) - win_length {[type]} -- Length of the window function applied to each frame (if not specified, it - equals the filter length). (default: {None}) - window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) - (default: {'hann'}) - """ - super(STFT, self).__init__() - self.filter_length = filter_length - self.hop_length = hop_length - self.win_length = win_length if win_length else filter_length - self.window = window - self.forward_transform = None - self.pad_amount = int(self.filter_length / 2) - scale = self.filter_length / self.hop_length - fourier_basis = np.fft.fft(np.eye(self.filter_length)) - - cutoff = int((self.filter_length / 2 + 1)) - fourier_basis = np.vstack( - [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] - ) - forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) - inverse_basis = torch.FloatTensor( - np.linalg.pinv(scale * fourier_basis).T[:, None, :] - ) - - assert filter_length >= self.win_length - # get window and zero center pad it to filter_length - fft_window = get_window(window, self.win_length, fftbins=True) - fft_window = pad_center(fft_window, size=filter_length) - fft_window = torch.from_numpy(fft_window).float() - - # window the bases - forward_basis *= fft_window - inverse_basis *= fft_window - - self.register_buffer("forward_basis", forward_basis.float()) - self.register_buffer("inverse_basis", inverse_basis.float()) - - def transform(self, input_data): - """Take input data (audio) to STFT domain. - - Arguments: - input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) - - Returns: - magnitude {tensor} -- Magnitude of STFT with shape (num_batch, - num_frequencies, num_frames) - phase {tensor} -- Phase of STFT with shape (num_batch, - num_frequencies, num_frames) - """ - num_batches = input_data.shape[0] - num_samples = input_data.shape[-1] - - self.num_samples = num_samples - - # similar to librosa, reflect-pad the input - input_data = input_data.view(num_batches, 1, num_samples) - # print(1234,input_data.shape) - input_data = F.pad( - input_data.unsqueeze(1), - (self.pad_amount, self.pad_amount, 0, 0, 0, 0), - mode="reflect", - ).squeeze(1) - # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length) - # pdb.set_trace() - forward_transform = F.conv1d( - input_data, self.forward_basis, stride=self.hop_length, padding=0 - ) - - cutoff = int((self.filter_length / 2) + 1) - real_part = forward_transform[:, :cutoff, :] - imag_part = forward_transform[:, cutoff:, :] - - magnitude = torch.sqrt(real_part**2 + imag_part**2) - # phase = torch.atan2(imag_part.data, real_part.data) - - return magnitude # , phase - - def inverse(self, magnitude, phase): - """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced - by the ```transform``` function. - - Arguments: - magnitude {tensor} -- Magnitude of STFT with shape (num_batch, - num_frequencies, num_frames) - phase {tensor} -- Phase of STFT with shape (num_batch, - num_frequencies, num_frames) - - Returns: - inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of - shape (num_batch, num_samples) - """ - recombine_magnitude_phase = torch.cat( - [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 - ) - - inverse_transform = F.conv_transpose1d( - recombine_magnitude_phase, - self.inverse_basis, - stride=self.hop_length, - padding=0, - ) - - if self.window is not None: - window_sum = window_sumsquare( - self.window, - magnitude.size(-1), - hop_length=self.hop_length, - win_length=self.win_length, - n_fft=self.filter_length, - dtype=np.float32, - ) - # remove modulation effects - approx_nonzero_indices = torch.from_numpy( - np.where(window_sum > tiny(window_sum))[0] - ) - window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) - inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ - approx_nonzero_indices - ] - - # scale by hop ratio - inverse_transform *= float(self.filter_length) / self.hop_length - - inverse_transform = inverse_transform[..., self.pad_amount :] - inverse_transform = inverse_transform[..., : self.num_samples] - inverse_transform = inverse_transform.squeeze(1) - - return inverse_transform - - def forward(self, input_data): - """Take input data (audio) to STFT domain and then back to audio. - - Arguments: - input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) - - Returns: - reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of - shape (num_batch, num_samples) - """ - self.magnitude, self.phase = self.transform(input_data) - reconstruction = self.inverse(self.magnitude, self.phase) - return reconstruction - - -from time import time as ttime - - -class BiGRU(nn.Module): - def __init__(self, input_features, hidden_features, num_layers): - super(BiGRU, self).__init__() - self.gru = nn.GRU( - input_features, - hidden_features, - num_layers=num_layers, - batch_first=True, - bidirectional=True, - ) - - def forward(self, x): - return self.gru(x)[0] - - -class ConvBlockRes(nn.Module): - def __init__(self, in_channels, out_channels, momentum=0.01): - super(ConvBlockRes, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - nn.Conv2d( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - ) - if in_channels != out_channels: - self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) - self.is_shortcut = True - else: - self.is_shortcut = False - - def forward(self, x): - if self.is_shortcut: - return self.conv(x) + self.shortcut(x) - else: - return self.conv(x) + x - - -class Encoder(nn.Module): - def __init__( - self, - in_channels, - in_size, - n_encoders, - kernel_size, - n_blocks, - out_channels=16, - momentum=0.01, - ): - super(Encoder, self).__init__() - self.n_encoders = n_encoders - self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) - self.layers = nn.ModuleList() - self.latent_channels = [] - for i in range(self.n_encoders): - self.layers.append( - ResEncoderBlock( - in_channels, out_channels, kernel_size, n_blocks, momentum=momentum - ) - ) - self.latent_channels.append([out_channels, in_size]) - in_channels = out_channels - out_channels *= 2 - in_size //= 2 - self.out_size = in_size - self.out_channel = out_channels - - def forward(self, x): - concat_tensors = [] - x = self.bn(x) - for i in range(self.n_encoders): - _, x = self.layers[i](x) - concat_tensors.append(_) - return x, concat_tensors - - -class ResEncoderBlock(nn.Module): - def __init__( - self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 - ): - super(ResEncoderBlock, self).__init__() - self.n_blocks = n_blocks - self.conv = nn.ModuleList() - self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) - for i in range(n_blocks - 1): - self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) - self.kernel_size = kernel_size - if self.kernel_size is not None: - self.pool = nn.AvgPool2d(kernel_size=kernel_size) - - def forward(self, x): - for i in range(self.n_blocks): - x = self.conv[i](x) - if self.kernel_size is not None: - return x, self.pool(x) - else: - return x - - -class Intermediate(nn.Module): # - def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): - super(Intermediate, self).__init__() - self.n_inters = n_inters - self.layers = nn.ModuleList() - self.layers.append( - ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) - ) - for i in range(self.n_inters - 1): - self.layers.append( - ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) - ) - - def forward(self, x): - for i in range(self.n_inters): - x = self.layers[i](x) - return x - - -class ResDecoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): - super(ResDecoderBlock, self).__init__() - out_padding = (0, 1) if stride == (1, 2) else (1, 1) - self.n_blocks = n_blocks - self.conv1 = nn.Sequential( - nn.ConvTranspose2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=stride, - padding=(1, 1), - output_padding=out_padding, - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - ) - self.conv2 = nn.ModuleList() - self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) - for i in range(n_blocks - 1): - self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) - - def forward(self, x, concat_tensor): - x = self.conv1(x) - x = torch.cat((x, concat_tensor), dim=1) - for i in range(self.n_blocks): - x = self.conv2[i](x) - return x - - -class Decoder(nn.Module): - def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): - super(Decoder, self).__init__() - self.layers = nn.ModuleList() - self.n_decoders = n_decoders - for i in range(self.n_decoders): - out_channels = in_channels // 2 - self.layers.append( - ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) - ) - in_channels = out_channels - - def forward(self, x, concat_tensors): - for i in range(self.n_decoders): - x = self.layers[i](x, concat_tensors[-1 - i]) - return x - - -class DeepUnet(nn.Module): - def __init__( - self, - kernel_size, - n_blocks, - en_de_layers=5, - inter_layers=4, - in_channels=1, - en_out_channels=16, - ): - super(DeepUnet, self).__init__() - self.encoder = Encoder( - in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels - ) - self.intermediate = Intermediate( - self.encoder.out_channel // 2, - self.encoder.out_channel, - inter_layers, - n_blocks, - ) - self.decoder = Decoder( - self.encoder.out_channel, en_de_layers, kernel_size, n_blocks - ) - - def forward(self, x): - x, concat_tensors = self.encoder(x) - x = self.intermediate(x) - x = self.decoder(x, concat_tensors) - return x - - -class E2E(nn.Module): - def __init__( - self, - n_blocks, - n_gru, - kernel_size, - en_de_layers=5, - inter_layers=4, - in_channels=1, - en_out_channels=16, - ): - super(E2E, self).__init__() - self.unet = DeepUnet( - kernel_size, - n_blocks, - en_de_layers, - inter_layers, - in_channels, - en_out_channels, - ) - self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) - if n_gru: - self.fc = nn.Sequential( - BiGRU(3 * 128, 256, n_gru), - nn.Linear(512, 360), - nn.Dropout(0.25), - nn.Sigmoid(), - ) - else: - self.fc = nn.Sequential( - nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() - ) - - def forward(self, mel): - # print(mel.shape) - mel = mel.transpose(-1, -2).unsqueeze(1) - x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) - x = self.fc(x) - # print(x.shape) - return x - - -from librosa.filters import mel - - -class MelSpectrogram(torch.nn.Module): - def __init__( - self, - is_half, - n_mel_channels, - sampling_rate, - win_length, - hop_length, - n_fft=None, - mel_fmin=0, - mel_fmax=None, - clamp=1e-5, - ): - super().__init__() - n_fft = win_length if n_fft is None else n_fft - self.hann_window = {} - mel_basis = mel( - sr=sampling_rate, - n_fft=n_fft, - n_mels=n_mel_channels, - fmin=mel_fmin, - fmax=mel_fmax, - htk=True, - ) - mel_basis = torch.from_numpy(mel_basis).float() - self.register_buffer("mel_basis", mel_basis) - self.n_fft = win_length if n_fft is None else n_fft - self.hop_length = hop_length - self.win_length = win_length - self.sampling_rate = sampling_rate - self.n_mel_channels = n_mel_channels - self.clamp = clamp - self.is_half = is_half - - def forward(self, audio, keyshift=0, speed=1, center=True): - factor = 2 ** (keyshift / 12) - n_fft_new = int(np.round(self.n_fft * factor)) - win_length_new = int(np.round(self.win_length * factor)) - hop_length_new = int(np.round(self.hop_length * speed)) - keyshift_key = str(keyshift) + "_" + str(audio.device) - if keyshift_key not in self.hann_window: - self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( - # "cpu"if(audio.device.type=="privateuseone") else audio.device - audio.device - ) - # fft = torch.stft(#doesn't support pytorch_dml - # # audio.cpu() if(audio.device.type=="privateuseone")else audio, - # audio, - # n_fft=n_fft_new, - # hop_length=hop_length_new, - # win_length=win_length_new, - # window=self.hann_window[keyshift_key], - # center=center, - # return_complex=True, - # ) - # magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) - # print(1111111111) - # print(222222222222222,audio.device,self.is_half) - if hasattr(self, "stft") == False: - # print(n_fft_new,hop_length_new,win_length_new,audio.shape) - self.stft = STFT( - filter_length=n_fft_new, - hop_length=hop_length_new, - win_length=win_length_new, - window="hann", - ).to(audio.device) - magnitude = self.stft.transform(audio) # phase - # if (audio.device.type == "privateuseone"): - # magnitude=magnitude.to(audio.device) - if keyshift != 0: - size = self.n_fft // 2 + 1 - resize = magnitude.size(1) - if resize < size: - magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) - magnitude = magnitude[:, :size, :] * self.win_length / win_length_new - mel_output = torch.matmul(self.mel_basis, magnitude) - if self.is_half == True: - mel_output = mel_output.half() - log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) - # print(log_mel_spec.device.type) - return log_mel_spec - - -class RMVPE: - def __init__(self, model_path, is_half, device=None): - self.resample_kernel = {} - self.resample_kernel = {} - self.is_half = is_half - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.mel_extractor = MelSpectrogram( - is_half, 128, 16000, 1024, 160, None, 30, 8000 - ).to(device) - if "privateuseone" in str(device): - import onnxruntime as ort - - ort_session = ort.InferenceSession( - "rmvpe.onnx", providers=["DmlExecutionProvider"] - ) - self.model = ort_session - else: - model = E2E(4, 1, (2, 2)) - ckpt = torch.load(model_path, map_location="cpu") - model.load_state_dict(ckpt) - model.eval() - if is_half == True: - model = model.half() - self.model = model - self.model = self.model.to(device) - cents_mapping = 20 * np.arange(360) + 1997.3794084376191 - self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 - - def mel2hidden(self, mel): - with torch.no_grad(): - n_frames = mel.shape[-1] - mel = F.pad( - mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" - ) - if "privateuseone" in str(self.device): - onnx_input_name = self.model.get_inputs()[0].name - onnx_outputs_names = self.model.get_outputs()[0].name - hidden = self.model.run( - [onnx_outputs_names], - input_feed={onnx_input_name: mel.cpu().numpy()}, - )[0] - else: - hidden = self.model(mel) - return hidden[:, :n_frames] - - def decode(self, hidden, thred=0.03): - cents_pred = self.to_local_average_cents(hidden, thred=thred) - f0 = 10 * (2 ** (cents_pred / 1200)) - f0[f0 == 10] = 0 - # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) - return f0 - - def infer_from_audio(self, audio, thred=0.03): - # torch.cuda.synchronize() - t0 = ttime() - mel = self.mel_extractor( - torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True - ) - # print(123123123,mel.device.type) - # torch.cuda.synchronize() - t1 = ttime() - hidden = self.mel2hidden(mel) - # torch.cuda.synchronize() - t2 = ttime() - # print(234234,hidden.device.type) - if "privateuseone" not in str(self.device): - hidden = hidden.squeeze(0).cpu().numpy() - else: - hidden = hidden[0] - if self.is_half == True: - hidden = hidden.astype("float32") - - f0 = self.decode(hidden, thred=thred) - # torch.cuda.synchronize() - t3 = ttime() - # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) - return f0 - - def to_local_average_cents(self, salience, thred=0.05): - # t0 = ttime() - center = np.argmax(salience, axis=1) # 帧长#index - salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 - # t1 = ttime() - center += 4 - todo_salience = [] - todo_cents_mapping = [] - starts = center - 4 - ends = center + 5 - for idx in range(salience.shape[0]): - todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) - todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) - # t2 = ttime() - todo_salience = np.array(todo_salience) # 帧长,9 - todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 - product_sum = np.sum(todo_salience * todo_cents_mapping, 1) - weight_sum = np.sum(todo_salience, 1) # 帧长 - devided = product_sum / weight_sum # 帧长 - # t3 = ttime() - maxx = np.max(salience, axis=1) # 帧长 - devided[maxx <= thred] = 0 - # t4 = ttime() - # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) - return devided - - -if __name__ == "__main__": - import soundfile as sf, librosa - - audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") - if len(audio.shape) > 1: - audio = librosa.to_mono(audio.transpose(1, 0)) - audio_bak = audio.copy() - if sampling_rate != 16000: - audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) - model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" - thred = 0.03 # 0.01 - device = "cuda" if torch.cuda.is_available() else "cpu" - rmvpe = RMVPE(model_path, is_half=False, device=device) - t0 = ttime() - f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - t1 = ttime() - print(f0.shape, t1 - t0) diff --git a/lib/slicer2.py b/lib/slicer2.py deleted file mode 100644 index 7d9d16d..0000000 --- a/lib/slicer2.py +++ /dev/null @@ -1,260 +0,0 @@ -import numpy as np - - -# This function is obtained from librosa. -def get_rms( - y, - frame_length=2048, - hop_length=512, - pad_mode="constant", -): - padding = (int(frame_length // 2), int(frame_length // 2)) - y = np.pad(y, padding, mode=pad_mode) - - axis = -1 - # put our new within-frame axis at the end for now - out_strides = y.strides + tuple([y.strides[axis]]) - # Reduce the shape on the framing axis - x_shape_trimmed = list(y.shape) - x_shape_trimmed[axis] -= frame_length - 1 - out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) - xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) - if axis < 0: - target_axis = axis - 1 - else: - target_axis = axis + 1 - xw = np.moveaxis(xw, -1, target_axis) - # Downsample along the target axis - slices = [slice(None)] * xw.ndim - slices[axis] = slice(0, None, hop_length) - x = xw[tuple(slices)] - - # Calculate power - power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) - - return np.sqrt(power) - - -class Slicer: - def __init__( - self, - sr: int, - threshold: float = -40.0, - min_length: int = 5000, - min_interval: int = 300, - hop_size: int = 20, - max_sil_kept: int = 5000, - ): - if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) - if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) - min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) - self.hop_size = round(sr * hop_size / 1000) - self.win_size = min(round(min_interval), 4 * self.hop_size) - self.min_length = round(sr * min_length / 1000 / self.hop_size) - self.min_interval = round(min_interval / self.hop_size) - self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) - - def _apply_slice(self, waveform, begin, end): - if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] - else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] - - # @timeit - def slice(self, waveform): - if len(waveform.shape) > 1: - samples = waveform.mean(axis=0) - else: - samples = waveform - if samples.shape[0] <= self.min_length: - return [waveform] - rms_list = get_rms( - y=samples, frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) - sil_tags = [] - silence_start = None - clip_start = 0 - for i, rms in enumerate(rms_list): - # Keep looping while frame is silent. - if rms < self.threshold: - # Record start of silent frames. - if silence_start is None: - silence_start = i - continue - # Keep looping while frame is not silent and silence start has not been recorded. - if silence_start is None: - continue - # Clear recorded silence start if interval is not enough or clip is too short - is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) - if not is_leading_silence and not need_slice_middle: - silence_start = None - continue - # Need slicing. Record the range of silent frames to be removed. - if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start - if silence_start == 0: - sil_tags.append((0, pos)) - else: - sil_tags.append((pos, pos)) - clip_start = pos - elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() - pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - clip_start = pos_r - else: - sil_tags.append((min(pos_l, pos), max(pos_r, pos))) - clip_start = max(pos_r, pos) - else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - else: - sil_tags.append((pos_l, pos_r)) - clip_start = pos_r - silence_start = None - # Deal with trailing silence. - total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): - silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start - sil_tags.append((pos, total_frames + 1)) - # Apply and return slices. - if len(sil_tags) == 0: - return [waveform] - else: - chunks = [] - if sil_tags[0][0] > 0: - chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) - for i in range(len(sil_tags) - 1): - chunks.append( - self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) - ) - if sil_tags[-1][1] < total_frames: - chunks.append( - self._apply_slice(waveform, sil_tags[-1][1], total_frames) - ) - return chunks - - -def main(): - import os.path - from argparse import ArgumentParser - - import librosa - import soundfile - - parser = ArgumentParser() - parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument( - "--out", type=str, help="Output directory of the sliced audio clips" - ) - parser.add_argument( - "--db_thresh", - type=float, - required=False, - default=-40, - help="The dB threshold for silence detection", - ) - parser.add_argument( - "--min_length", - type=int, - required=False, - default=5000, - help="The minimum milliseconds required for each sliced audio clip", - ) - parser.add_argument( - "--min_interval", - type=int, - required=False, - default=300, - help="The minimum milliseconds for a silence part to be sliced", - ) - parser.add_argument( - "--hop_size", - type=int, - required=False, - default=10, - help="Frame length in milliseconds", - ) - parser.add_argument( - "--max_sil_kept", - type=int, - required=False, - default=500, - help="The maximum silence length kept around the sliced clip, presented in milliseconds", - ) - args = parser.parse_args() - out = args.out - if out is None: - out = os.path.dirname(os.path.abspath(args.audio)) - audio, sr = librosa.load(args.audio, sr=None, mono=False) - slicer = Slicer( - sr=sr, - threshold=args.db_thresh, - min_length=args.min_length, - min_interval=args.min_interval, - hop_size=args.hop_size, - max_sil_kept=args.max_sil_kept, - ) - chunks = slicer.slice(audio) - if not os.path.exists(out): - os.makedirs(out) - for i, chunk in enumerate(chunks): - if len(chunk.shape) > 1: - chunk = chunk.T - soundfile.write( - os.path.join( - out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), - ), - chunk, - sr, - ) - - -if __name__ == "__main__": - main() diff --git a/lib/train/cmd.txt b/lib/train/cmd.txt deleted file mode 100644 index e4b895e..0000000 --- a/lib/train/cmd.txt +++ /dev/null @@ -1 +0,0 @@ -python train_nsf_sim_cache_sid.py -c configs/mi_mix40k_nsf_co256_cs1sid_ms2048.json -m ft-mi \ No newline at end of file diff --git a/lib/train/data_utils.py b/lib/train/data_utils.py deleted file mode 100644 index 3437e24..0000000 --- a/lib/train/data_utils.py +++ /dev/null @@ -1,512 +0,0 @@ -import os, traceback -import numpy as np -import torch -import torch.utils.data - -from lib.train.mel_processing import spectrogram_torch -from lib.train.utils import load_wav_to_torch, load_filepaths_and_text - - -class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. - """ - - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - file = audiopath_and_text[0] - phone = audiopath_and_text[1] - pitch = audiopath_and_text[2] - pitchf = audiopath_and_text[3] - dv = audiopath_and_text[4] - - phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) - spec, wav = self.get_audio(file) - dv = self.get_sid(dv) - - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - # print(123,phone.shape,pitch.shape,spec.shape) - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - # amor - len_wav = len_min * self.hop_length - - spec = spec[:, :len_min] - wav = wav[:, :len_wav] - - phone = phone[:len_min, :] - pitch = pitch[:len_min] - pitchf = pitchf[:len_min] - - return (spec, wav, phone, pitch, pitchf, dv) - - def get_labels(self, phone, pitch, pitchf): - phone = np.load(phone) - phone = np.repeat(phone, 2, axis=0) - pitch = np.load(pitch) - pitchf = np.load(pitchf) - n_num = min(phone.shape[0], 900) # DistributedBucketSampler - # print(234,phone.shape,pitch.shape) - phone = phone[:n_num, :] - pitch = pitch[:n_num] - pitchf = pitchf[:n_num] - phone = torch.FloatTensor(phone) - pitch = torch.LongTensor(pitch) - pitchf = torch.FloatTensor(pitchf) - return phone, pitch, pitchf - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError( - "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate - ) - ) - audio_norm = audio - # audio_norm = audio / self.max_wav_value - # audio_norm = audio / np.abs(audio).max() - - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except: - print(spec_filename, traceback.format_exc()) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollateMultiNSFsid: - """Zero-pads model inputs and targets""" - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True - ) - - max_spec_len = max([x[0].size(1) for x in batch]) - max_wave_len = max([x[1].size(1) for x in batch]) - spec_lengths = torch.LongTensor(len(batch)) - wave_lengths = torch.LongTensor(len(batch)) - spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) - wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) - spec_padded.zero_() - wave_padded.zero_() - - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor( - len(batch), max_phone_len, batch[0][2].shape[1] - ) # (spec, wav, phone, pitch) - pitch_padded = torch.LongTensor(len(batch), max_phone_len) - pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) - phone_padded.zero_() - pitch_padded.zero_() - pitchf_padded.zero_() - # dv = torch.FloatTensor(len(batch), 256)#gin=256 - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - pitch = row[3] - pitch_padded[i, : pitch.size(0)] = pitch - pitchf = row[4] - pitchf_padded[i, : pitchf.size(0)] = pitchf - - # dv[i] = row[5] - sid[i] = row[5] - - return ( - phone_padded, - phone_lengths, - pitch_padded, - pitchf_padded, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - # dv - sid, - ) - - -class TextAudioLoader(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. - """ - - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - file = audiopath_and_text[0] - phone = audiopath_and_text[1] - dv = audiopath_and_text[2] - - phone = self.get_labels(phone) - spec, wav = self.get_audio(file) - dv = self.get_sid(dv) - - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - len_wav = len_min * self.hop_length - spec = spec[:, :len_min] - wav = wav[:, :len_wav] - phone = phone[:len_min, :] - return (spec, wav, phone, dv) - - def get_labels(self, phone): - phone = np.load(phone) - phone = np.repeat(phone, 2, axis=0) - n_num = min(phone.shape[0], 900) # DistributedBucketSampler - phone = phone[:n_num, :] - phone = torch.FloatTensor(phone) - return phone - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError( - "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate - ) - ) - audio_norm = audio - # audio_norm = audio / self.max_wav_value - # audio_norm = audio / np.abs(audio).max() - - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except: - print(spec_filename, traceback.format_exc()) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollate: - """Zero-pads model inputs and targets""" - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True - ) - - max_spec_len = max([x[0].size(1) for x in batch]) - max_wave_len = max([x[1].size(1) for x in batch]) - spec_lengths = torch.LongTensor(len(batch)) - wave_lengths = torch.LongTensor(len(batch)) - spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) - wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) - spec_padded.zero_() - wave_padded.zero_() - - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor( - len(batch), max_phone_len, batch[0][2].shape[1] - ) - phone_padded.zero_() - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - sid[i] = row[3] - - return ( - phone_padded, - phone_lengths, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - sid, - ) - - -class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): - """ - Maintain similar input lengths in a batch. - Length groups are specified by boundaries. - Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. - - It removes samples which are not included in the boundaries. - Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. - """ - - def __init__( - self, - dataset, - batch_size, - boundaries, - num_replicas=None, - rank=None, - shuffle=True, - ): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - self.lengths = dataset.lengths - self.batch_size = batch_size - self.boundaries = boundaries - - self.buckets, self.num_samples_per_bucket = self._create_buckets() - self.total_size = sum(self.num_samples_per_bucket) - self.num_samples = self.total_size // self.num_replicas - - def _create_buckets(self): - buckets = [[] for _ in range(len(self.boundaries) - 1)] - for i in range(len(self.lengths)): - length = self.lengths[i] - idx_bucket = self._bisect(length) - if idx_bucket != -1: - buckets[idx_bucket].append(i) - - for i in range(len(buckets) - 1, -1, -1): # - if len(buckets[i]) == 0: - buckets.pop(i) - self.boundaries.pop(i + 1) - - num_samples_per_bucket = [] - for i in range(len(buckets)): - len_bucket = len(buckets[i]) - total_batch_size = self.num_replicas * self.batch_size - rem = ( - total_batch_size - (len_bucket % total_batch_size) - ) % total_batch_size - num_samples_per_bucket.append(len_bucket + rem) - return buckets, num_samples_per_bucket - - def __iter__(self): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(self.epoch) - - indices = [] - if self.shuffle: - for bucket in self.buckets: - indices.append(torch.randperm(len(bucket), generator=g).tolist()) - else: - for bucket in self.buckets: - indices.append(list(range(len(bucket)))) - - batches = [] - for i in range(len(self.buckets)): - bucket = self.buckets[i] - len_bucket = len(bucket) - ids_bucket = indices[i] - num_samples_bucket = self.num_samples_per_bucket[i] - - # add extra samples to make it evenly divisible - rem = num_samples_bucket - len_bucket - ids_bucket = ( - ids_bucket - + ids_bucket * (rem // len_bucket) - + ids_bucket[: (rem % len_bucket)] - ) - - # subsample - ids_bucket = ids_bucket[self.rank :: self.num_replicas] - - # batching - for j in range(len(ids_bucket) // self.batch_size): - batch = [ - bucket[idx] - for idx in ids_bucket[ - j * self.batch_size : (j + 1) * self.batch_size - ] - ] - batches.append(batch) - - if self.shuffle: - batch_ids = torch.randperm(len(batches), generator=g).tolist() - batches = [batches[i] for i in batch_ids] - self.batches = batches - - assert len(self.batches) * self.batch_size == self.num_samples - return iter(self.batches) - - def _bisect(self, x, lo=0, hi=None): - if hi is None: - hi = len(self.boundaries) - 1 - - if hi > lo: - mid = (hi + lo) // 2 - if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: - return mid - elif x <= self.boundaries[mid]: - return self._bisect(x, lo, mid) - else: - return self._bisect(x, mid + 1, hi) - else: - return -1 - - def __len__(self): - return self.num_samples // self.batch_size diff --git a/lib/train/losses.py b/lib/train/losses.py deleted file mode 100644 index aa7bd81..0000000 --- a/lib/train/losses.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch - - -def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - - return loss * 2 - - -def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses = [] - g_losses = [] - for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - dr = dr.float() - dg = dg.float() - r_loss = torch.mean((1 - dr) ** 2) - g_loss = torch.mean(dg**2) - loss += r_loss + g_loss - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) - - return loss, r_losses, g_losses - - -def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - for dg in disc_outputs: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - - -def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): - """ - z_p, logs_q: [b, h, t_t] - m_p, logs_p: [b, h, t_t] - """ - z_p = z_p.float() - logs_q = logs_q.float() - m_p = m_p.float() - logs_p = logs_p.float() - z_mask = z_mask.float() - - kl = logs_p - logs_q - 0.5 - kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) - kl = torch.sum(kl * z_mask) - l = kl / torch.sum(z_mask) - return l diff --git a/lib/train/mel_processing.py b/lib/train/mel_processing.py deleted file mode 100644 index 3cc3687..0000000 --- a/lib/train/mel_processing.py +++ /dev/null @@ -1,130 +0,0 @@ -import torch -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn - - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - return dynamic_range_compression_torch(magnitudes) - - -def spectral_de_normalize_torch(magnitudes): - return dynamic_range_decompression_torch(magnitudes) - - -# Reusable banks -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - """Convert waveform into Linear-frequency Linear-amplitude spectrogram. - - Args: - y :: (B, T) - Audio waveforms - n_fft - sampling_rate - hop_size - win_size - center - Returns: - :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram - """ - # Validation - if torch.min(y) < -1.07: - print("min value is ", torch.min(y)) - if torch.max(y) > 1.07: - print("max value is ", torch.max(y)) - - # Window - Cache if needed - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) - - # Padding - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, - ) - - # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - # MelBasis - Cache if needed - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=spec.dtype, device=spec.device - ) - - # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) - melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) - melspec = spectral_normalize_torch(melspec) - return melspec - - -def mel_spectrogram_torch( - y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False -): - """Convert waveform into Mel-frequency Log-amplitude spectrogram. - - Args: - y :: (B, T) - Waveforms - Returns: - melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram - """ - # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) - spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) - - # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) - melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) - - return melspec diff --git a/lib/train/process_ckpt.py b/lib/train/process_ckpt.py deleted file mode 100644 index 324d5a5..0000000 --- a/lib/train/process_ckpt.py +++ /dev/null @@ -1,259 +0,0 @@ -import torch, traceback, os, sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -from collections import OrderedDict -from i18n import I18nAuto - -i18n = I18nAuto() - - -def savee(ckpt, sr, if_f0, name, epoch, version, hps): - try: - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = ckpt[key].half() - opt["config"] = [ - hps.data.filter_length // 2 + 1, - 32, - hps.model.inter_channels, - hps.model.hidden_channels, - hps.model.filter_channels, - hps.model.n_heads, - hps.model.n_layers, - hps.model.kernel_size, - hps.model.p_dropout, - hps.model.resblock, - hps.model.resblock_kernel_sizes, - hps.model.resblock_dilation_sizes, - hps.model.upsample_rates, - hps.model.upsample_initial_channel, - hps.model.upsample_kernel_sizes, - hps.model.spk_embed_dim, - hps.model.gin_channels, - hps.data.sampling_rate, - ] - opt["info"] = "%sepoch" % epoch - opt["sr"] = sr - opt["f0"] = if_f0 - opt["version"] = version - torch.save(opt, "weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() - - -def show_info(path): - try: - a = torch.load(path, map_location="cpu") - return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( - a.get("info", "None"), - a.get("sr", "None"), - a.get("f0", "None"), - a.get("version", "None"), - ) - except: - return traceback.format_exc() - - -def extract_small_model(path, name, sr, if_f0, info, version): - try: - ckpt = torch.load(path, map_location="cpu") - if "model" in ckpt: - ckpt = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = ckpt[key].half() - if sr == "40k": - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 10, 2, 2], - 512, - [16, 16, 4, 4], - 109, - 256, - 40000, - ] - elif sr == "48k": - if version == "v1": - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 6, 2, 2, 2], - 512, - [16, 16, 4, 4, 4], - 109, - 256, - 48000, - ] - else: - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [12, 10, 2, 2], - 512, - [24, 20, 4, 4], - 109, - 256, - 48000, - ] - elif sr == "32k": - if version == "v1": - opt["config"] = [ - 513, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 4, 2, 2, 2], - 512, - [16, 16, 4, 4, 4], - 109, - 256, - 32000, - ] - else: - opt["config"] = [ - 513, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 8, 2, 2], - 512, - [20, 16, 4, 4], - 109, - 256, - 32000, - ] - if info == "": - info = "Extracted model." - opt["info"] = info - opt["version"] = version - opt["sr"] = sr - opt["f0"] = int(if_f0) - torch.save(opt, "weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() - - -def change_info(path, info, name): - try: - ckpt = torch.load(path, map_location="cpu") - ckpt["info"] = info - if name == "": - name = os.path.basename(path) - torch.save(ckpt, "weights/%s" % name) - return "Success." - except: - return traceback.format_exc() - - -def merge(path1, path2, alpha1, sr, f0, info, name, version): - try: - - def extract(ckpt): - a = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - for key in a.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = a[key] - return opt - - ckpt1 = torch.load(path1, map_location="cpu") - ckpt2 = torch.load(path2, map_location="cpu") - cfg = ckpt1["config"] - if "model" in ckpt1: - ckpt1 = extract(ckpt1) - else: - ckpt1 = ckpt1["weight"] - if "model" in ckpt2: - ckpt2 = extract(ckpt2) - else: - ckpt2 = ckpt2["weight"] - if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): - return "Fail to merge the models. The model architectures are not the same." - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt1.keys(): - # try: - if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: - min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) - opt["weight"][key] = ( - alpha1 * (ckpt1[key][:min_shape0].float()) - + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) - ).half() - else: - opt["weight"][key] = ( - alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) - ).half() - # except: - # pdb.set_trace() - opt["config"] = cfg - """ - if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] - elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] - elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] - """ - opt["sr"] = sr - opt["f0"] = 1 if f0 == i18n("是") else 0 - opt["version"] = version - opt["info"] = info - torch.save(opt, "weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() diff --git a/lib/train/utils.py b/lib/train/utils.py deleted file mode 100644 index 9c0fb5c..0000000 --- a/lib/train/utils.py +++ /dev/null @@ -1,487 +0,0 @@ -import os, traceback -import glob -import sys -import argparse -import logging -import json -import subprocess -import numpy as np -from scipy.io.wavfile import read -import torch - -MATPLOTLIB_FLAG = False - -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) -logger = logging - - -def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - ################## - def go(model, bkey): - saved_state_dict = checkpoint_dict[bkey] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - print( - "shape-%s-mismatch|need-%s|get-%s" - % (k, state_dict[k].shape, saved_state_dict[k].shape) - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - return model - - go(combd, "combd") - model = go(sbd, "sbd") - ############# - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -# def load_checkpoint(checkpoint_path, model, optimizer=None): -# assert os.path.isfile(checkpoint_path) -# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') -# iteration = checkpoint_dict['iteration'] -# learning_rate = checkpoint_dict['learning_rate'] -# if optimizer is not None: -# optimizer.load_state_dict(checkpoint_dict['optimizer']) -# # print(1111) -# saved_state_dict = checkpoint_dict['model'] -# # print(1111) -# -# if hasattr(model, 'module'): -# state_dict = model.module.state_dict() -# else: -# state_dict = model.state_dict() -# new_state_dict= {} -# for k, v in state_dict.items(): -# try: -# new_state_dict[k] = saved_state_dict[k] -# except: -# logger.info("%s is not in the checkpoint" % k) -# new_state_dict[k] = v -# if hasattr(model, 'module'): -# model.module.load_state_dict(new_state_dict) -# else: -# model.load_state_dict(new_state_dict) -# logger.info("Loaded checkpoint '{}' (epoch {})" .format( -# checkpoint_path, iteration)) -# return model, optimizer, learning_rate, iteration -def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - saved_state_dict = checkpoint_dict["model"] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - print( - "shape-%s-mismatch|need-%s|get-%s" - % (k, state_dict[k].shape, saved_state_dict[k].shape) - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save( - { - "model": state_dict, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(combd, "module"): - state_dict_combd = combd.module.state_dict() - else: - state_dict_combd = combd.state_dict() - if hasattr(sbd, "module"): - state_dict_sbd = sbd.module.state_dict() - else: - state_dict_sbd = sbd.state_dict() - torch.save( - { - "combd": state_dict_combd, - "sbd": state_dict_sbd, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def summarize( - writer, - global_step, - scalars={}, - histograms={}, - images={}, - audios={}, - audio_sampling_rate=22050, -): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - for k, v in histograms.items(): - writer.add_histogram(k, v, global_step) - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats="HWC") - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) - - -def latest_checkpoint_path(dir_path, regex="G_*.pth"): - f_list = glob.glob(os.path.join(dir_path, regex)) - f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) - x = f_list[-1] - print(x) - return x - - -def plot_spectrogram_to_numpy(spectrogram): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger("matplotlib") - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(10, 2)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") - plt.colorbar(im, ax=ax) - plt.xlabel("Frames") - plt.ylabel("Channels") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def plot_alignment_to_numpy(alignment, info=None): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger("matplotlib") - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow( - alignment.transpose(), aspect="auto", origin="lower", interpolation="none" - ) - fig.colorbar(im, ax=ax) - xlabel = "Decoder timestep" - if info is not None: - xlabel += "\n\n" + info - plt.xlabel(xlabel) - plt.ylabel("Encoder timestep") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate - - -def load_filepaths_and_text(filename, split="|"): - with open(filename, encoding="utf-8") as f: - filepaths_and_text = [line.strip().split(split) for line in f] - return filepaths_and_text - - -def get_hparams(init=True): - """ - todo: - 结尾七人组: - 保存频率、总epoch done - bs done - pretrainG、pretrainD done - 卡号:os.en["CUDA_VISIBLE_DEVICES"] done - if_latest done - 模型:if_f0 done - 采样率:自动选择config done - 是否缓存数据集进GPU:if_cache_data_in_gpu done - - -m: - 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done - -c不要了 - """ - parser = argparse.ArgumentParser() - # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration') - parser.add_argument( - "-se", - "--save_every_epoch", - type=int, - required=True, - help="checkpoint save frequency (epoch)", - ) - parser.add_argument( - "-te", "--total_epoch", type=int, required=True, help="total_epoch" - ) - parser.add_argument( - "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path" - ) - parser.add_argument( - "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path" - ) - parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") - parser.add_argument( - "-bs", "--batch_size", type=int, required=True, help="batch size" - ) - parser.add_argument( - "-e", "--experiment_dir", type=str, required=True, help="experiment dir" - ) # -m - parser.add_argument( - "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" - ) - parser.add_argument( - "-sw", - "--save_every_weights", - type=str, - default="0", - help="save the extracted model in weights directory when saving checkpoints", - ) - parser.add_argument( - "-v", "--version", type=str, required=True, help="model version" - ) - parser.add_argument( - "-f0", - "--if_f0", - type=int, - required=True, - help="use f0 as one of the inputs of the model, 1 or 0", - ) - parser.add_argument( - "-l", - "--if_latest", - type=int, - required=True, - help="if only save the latest G/D pth file, 1 or 0", - ) - parser.add_argument( - "-c", - "--if_cache_data_in_gpu", - type=int, - required=True, - help="if caching the dataset in GPU memory, 1 or 0", - ) - - args = parser.parse_args() - name = args.experiment_dir - experiment_dir = os.path.join("./logs", args.experiment_dir) - - if not os.path.exists(experiment_dir): - os.makedirs(experiment_dir) - - if args.version == "v1" or args.sample_rate == "40k": - config_path = "configs/%s.json" % args.sample_rate - else: - config_path = "configs/%s_v2.json" % args.sample_rate - config_save_path = os.path.join(experiment_dir, "config.json") - if init: - with open(config_path, "r") as f: - data = f.read() - with open(config_save_path, "w") as f: - f.write(data) - else: - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = hparams.experiment_dir = experiment_dir - hparams.save_every_epoch = args.save_every_epoch - hparams.name = name - hparams.total_epoch = args.total_epoch - hparams.pretrainG = args.pretrainG - hparams.pretrainD = args.pretrainD - hparams.version = args.version - hparams.gpus = args.gpus - hparams.train.batch_size = args.batch_size - hparams.sample_rate = args.sample_rate - hparams.if_f0 = args.if_f0 - hparams.if_latest = args.if_latest - hparams.save_every_weights = args.save_every_weights - hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu - hparams.data.training_files = "%s/filelist.txt" % experiment_dir - return hparams - - -def get_hparams_from_dir(model_dir): - config_save_path = os.path.join(model_dir, "config.json") - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = model_dir - return hparams - - -def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - return hparams - - -def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warn( - "{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - ) - ) - return - - cur_hash = subprocess.getoutput("git rev-parse HEAD") - - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warn( - "git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8] - ) - ) - else: - open(path, "w").write(cur_hash) - - -def get_logger(model_dir, filename="train.log"): - global logger - logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") - if not os.path.exists(model_dir): - os.makedirs(model_dir) - h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) - h.setFormatter(formatter) - logger.addHandler(h) - return logger - - -class HParams: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() diff --git a/lib/train/vc_infer_pipeline.py b/lib/train/vc_infer_pipeline.py deleted file mode 100644 index 980fc21..0000000 --- a/lib/train/vc_infer_pipeline.py +++ /dev/null @@ -1,449 +0,0 @@ -import numpy as np, parselmouth, torch, pdb, sys, os -from time import time as ttime -import torch.nn.functional as F -import scipy.signal as signal -import pyworld, os, traceback, faiss, librosa, torchcrepe -from scipy import signal -from functools import lru_cache - -now_dir = os.getcwd() -sys.path.append(now_dir) - -bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) - -input_audio_path2wav = {} - - -@lru_cache -def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): - audio = input_audio_path2wav[input_audio_path] - f0, t = pyworld.harvest( - audio, - fs=fs, - f0_ceil=f0max, - f0_floor=f0min, - frame_period=frame_period, - ) - f0 = pyworld.stonemask(audio, f0, t, fs) - return f0 - - -def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 - # print(data1.max(),data2.max()) - rms1 = librosa.feature.rms( - y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 - ) # 每半秒一个点 - rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) - rms1 = torch.from_numpy(rms1) - rms1 = F.interpolate( - rms1.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.from_numpy(rms2) - rms2 = F.interpolate( - rms2.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) - data2 *= ( - torch.pow(rms1, torch.tensor(1 - rate)) - * torch.pow(rms2, torch.tensor(rate - 1)) - ).numpy() - return data2 - - -class VC(object): - def __init__(self, tgt_sr, config): - self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( - config.x_pad, - config.x_query, - config.x_center, - config.x_max, - config.is_half, - ) - self.sr = 16000 # hubert输入采样率 - self.window = 160 # 每帧点数 - self.t_pad = self.sr * self.x_pad # 每条前后pad时间 - self.t_pad_tgt = tgt_sr * self.x_pad - self.t_pad2 = self.t_pad * 2 - self.t_query = self.sr * self.x_query # 查询切点前后查询时间 - self.t_center = self.sr * self.x_center # 查询切点位置 - self.t_max = self.sr * self.x_max # 免查询时长阈值 - self.device = config.device - - def get_f0( - self, - input_audio_path, - x, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0=None, - ): - global input_audio_path2wav - time_step = self.window / self.sr * 1000 - f0_min = 50 - f0_max = 1100 - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - if f0_method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] - ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) - elif f0_method == "harvest": - input_audio_path2wav[input_audio_path] = x.astype(np.double) - f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) - if filter_radius > 2: - f0 = signal.medfilt(f0, 3) - elif f0_method == "crepe": - model = "full" - # Pick a batch size that doesn't cause memory errors on your gpu - batch_size = 512 - # Compute pitch using first gpu - audio = torch.tensor(np.copy(x))[None].float() - f0, pd = torchcrepe.predict( - audio, - self.sr, - self.window, - f0_min, - f0_max, - model, - batch_size=batch_size, - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - elif f0_method == "rmvpe": - if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE - - print("loading rmvpe model") - self.model_rmvpe = RMVPE( - "rmvpe.pt", is_half=self.is_half, device=self.device - ) - - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - if "privateuseone" in str(self.device): # clean ortruntime memory - del self.model_rmvpe.model - del self.model_rmvpe - print("cleaning ortruntime memory") - - f0 *= pow(2, f0_up_key / 12) - # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - tf0 = self.sr // self.window # 每秒f0点数 - if inp_f0 is not None: - delta_t = np.round( - (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 - ).astype("int16") - replace_f0 = np.interp( - list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] - ) - shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] - f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ - :shape - ] - # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int32) - return f0_coarse, f0bak # 1-0 - - def vc( - self, - model, - net_g, - sid, - audio0, - pitch, - pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - ): # ,file_index,file_big_npy - feats = torch.from_numpy(audio0) - if self.is_half: - feats = feats.half() - else: - feats = feats.float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - feats = feats.view(1, -1) - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - - inputs = { - "source": feats.to(self.device), - "padding_mask": padding_mask, - "output_layer": 9 if version == "v1" else 12, - } - t0 = ttime() - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = model.final_proj(logits[0]) if version == "v1" else logits[0] - if protect < 0.5 and pitch != None and pitchf != None: - feats0 = feats.clone() - if ( - isinstance(index, type(None)) == False - and isinstance(big_npy, type(None)) == False - and index_rate != 0 - ): - npy = feats[0].cpu().numpy() - if self.is_half: - npy = npy.astype("float32") - - # _, I = index.search(npy, 1) - # npy = big_npy[I.squeeze()] - - score, ix = index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - - if self.is_half: - npy = npy.astype("float16") - feats = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate - + (1 - index_rate) * feats - ) - - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - if protect < 0.5 and pitch != None and pitchf != None: - feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( - 0, 2, 1 - ) - t1 = ttime() - p_len = audio0.shape[0] // self.window - if feats.shape[1] < p_len: - p_len = feats.shape[1] - if pitch != None and pitchf != None: - pitch = pitch[:, :p_len] - pitchf = pitchf[:, :p_len] - - if protect < 0.5 and pitch != None and pitchf != None: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - p_len = torch.tensor([p_len], device=self.device).long() - with torch.no_grad(): - if pitch != None and pitchf != None: - audio1 = ( - (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) - .data.cpu() - .float() - .numpy() - ) - else: - audio1 = ( - (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() - ) - del feats, p_len, padding_mask - if torch.cuda.is_available(): - torch.cuda.empty_cache() - t2 = ttime() - times[0] += t1 - t0 - times[2] += t2 - t1 - return audio1 - - def pipeline( - self, - model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=None, - ): - if ( - file_index != "" - # and file_big_npy != "" - # and os.path.exists(file_big_npy) == True - and os.path.exists(file_index) == True - and index_rate != 0 - ): - try: - index = faiss.read_index(file_index) - # big_npy = np.load(file_big_npy) - big_npy = index.reconstruct_n(0, index.ntotal) - except: - traceback.print_exc() - index = big_npy = None - else: - index = big_npy = None - audio = signal.filtfilt(bh, ah, audio) - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") - opt_ts = [] - if audio_pad.shape[0] > self.t_max: - audio_sum = np.zeros_like(audio) - for i in range(self.window): - audio_sum += audio_pad[i : i - self.window] - for t in range(self.t_center, audio.shape[0], self.t_center): - opt_ts.append( - t - - self.t_query - + np.where( - np.abs(audio_sum[t - self.t_query : t + self.t_query]) - == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() - )[0][0] - ) - s = 0 - audio_opt = [] - t = None - t1 = ttime() - audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") - p_len = audio_pad.shape[0] // self.window - inp_f0 = None - if hasattr(f0_file, "name") == True: - try: - with open(f0_file.name, "r") as f: - lines = f.read().strip("\n").split("\n") - inp_f0 = [] - for line in lines: - inp_f0.append([float(i) for i in line.split(",")]) - inp_f0 = np.array(inp_f0, dtype="float32") - except: - traceback.print_exc() - sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - pitch, pitchf = None, None - if if_f0 == 1: - pitch, pitchf = self.get_f0( - input_audio_path, - audio_pad, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0, - ) - pitch = pitch[:p_len] - pitchf = pitchf[:p_len] - if self.device == "mps": - pitchf = pitchf.astype(np.float32) - pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() - pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() - t2 = ttime() - times[1] += t2 - t1 - for t in opt_ts: - t = t // self.window * self.window - if if_f0 == 1: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[s : t + self.t_pad2 + self.window], - pitch[:, s // self.window : (t + self.t_pad2) // self.window], - pitchf[:, s // self.window : (t + self.t_pad2) // self.window], - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - else: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[s : t + self.t_pad2 + self.window], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - s = t - if if_f0 == 1: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[t:], - pitch[:, t // self.window :] if t is not None else pitch, - pitchf[:, t // self.window :] if t is not None else pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - else: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[t:], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - audio_opt = np.concatenate(audio_opt) - if rms_mix_rate != 1: - audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) - if resample_sr >= 16000 and tgt_sr != resample_sr: - audio_opt = librosa.resample( - audio_opt, orig_sr=tgt_sr, target_sr=resample_sr - ) - audio_max = np.abs(audio_opt).max() / 0.99 - max_int16 = 32768 - if audio_max > 1: - max_int16 /= audio_max - audio_opt = (audio_opt * max_int16).astype(np.int16) - del pitch, pitchf, sid - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return audio_opt