From d6de82afef91a2e01f9c31879b8a71d7e712e35d Mon Sep 17 00:00:00 2001 From: VSlobolinskyi Date: Tue, 18 Mar 2025 10:10:54 +0200 Subject: [PATCH] Remove part of the non-inference-related modules --- infer-web.py | 833 +----------------------------- infer/lib/train/data_utils.py | 517 ------------------ infer/lib/train/losses.py | 58 --- infer/lib/train/mel_processing.py | 127 ----- infer/lib/train/process_ckpt.py | 261 ---------- infer/lib/train/utils.py | 483 ----------------- infer/modules/uvr5/modules.py | 108 ---- 7 files changed, 3 insertions(+), 2384 deletions(-) delete mode 100644 infer/lib/train/data_utils.py delete mode 100644 infer/lib/train/losses.py delete mode 100644 infer/lib/train/mel_processing.py delete mode 100644 infer/lib/train/process_ckpt.py delete mode 100644 infer/lib/train/utils.py delete mode 100644 infer/modules/uvr5/modules.py diff --git a/infer-web.py b/infer-web.py index aaf0de6..63af2f9 100644 --- a/infer-web.py +++ b/infer-web.py @@ -6,13 +6,6 @@ now_dir = os.getcwd() sys.path.append(now_dir) load_dotenv() from infer.modules.vc.modules import VC -from infer.modules.uvr5.modules import uvr -from infer.lib.train.process_ckpt import ( - change_info, - extract_small_model, - merge, - show_info, -) from i18n.i18n import I18nAuto from configs.config import Config from sklearn.cluster import MiniBatchKMeans @@ -451,8 +444,8 @@ def change_version19(sr2, if_f0_3, version19): to_return_sr2, ) - -def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 +# f0method8,pretrained_G14,pretrained_D15 +def change_f0(if_f0_3, sr2, version19): path_str = "" if version19 == "v1" else "_v2" return ( {"visible": if_f0_3, "__type__": "update"}, @@ -461,324 +454,7 @@ def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D ) -# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16]) -def click_train( - exp_dir1, - sr2, - if_f0_3, - spk_id5, - save_epoch10, - total_epoch11, - batch_size12, - if_save_latest13, - pretrained_G14, - pretrained_D15, - gpus16, - if_cache_gpu17, - if_save_every_weights18, - version19, -): - # 生成filelist - exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) - os.makedirs(exp_dir, exist_ok=True) - gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir) - feature_dir = ( - "%s/3_feature256" % (exp_dir) - if version19 == "v1" - else "%s/3_feature768" % (exp_dir) - ) - if if_f0_3: - f0_dir = "%s/2a_f0" % (exp_dir) - f0nsf_dir = "%s/2b-f0nsf" % (exp_dir) - names = ( - set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) - & set([name.split(".")[0] for name in os.listdir(feature_dir)]) - & set([name.split(".")[0] for name in os.listdir(f0_dir)]) - & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) - ) - else: - names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( - [name.split(".")[0] for name in os.listdir(feature_dir)] - ) - opt = [] - for name in names: - if if_f0_3: - opt.append( - "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" - % ( - gt_wavs_dir.replace("\\", "\\\\"), - name, - feature_dir.replace("\\", "\\\\"), - name, - f0_dir.replace("\\", "\\\\"), - name, - f0nsf_dir.replace("\\", "\\\\"), - name, - spk_id5, - ) - ) - else: - opt.append( - "%s/%s.wav|%s/%s.npy|%s" - % ( - gt_wavs_dir.replace("\\", "\\\\"), - name, - feature_dir.replace("\\", "\\\\"), - name, - spk_id5, - ) - ) - fea_dim = 256 if version19 == "v1" else 768 - if if_f0_3: - for _ in range(2): - opt.append( - "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" - % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5) - ) - else: - for _ in range(2): - opt.append( - "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s" - % (now_dir, sr2, now_dir, fea_dim, spk_id5) - ) - shuffle(opt) - with open("%s/filelist.txt" % exp_dir, "w") as f: - f.write("\n".join(opt)) - logger.debug("Write filelist done") - # 生成config#无需生成config - # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0" - logger.info("Use gpus: %s", str(gpus16)) - if pretrained_G14 == "": - logger.info("No pretrained Generator") - if pretrained_D15 == "": - logger.info("No pretrained Discriminator") - if version19 == "v1" or sr2 == "40k": - config_path = "v1/%s.json" % sr2 - else: - config_path = "v2/%s.json" % sr2 - config_save_path = os.path.join(exp_dir, "config.json") - if not pathlib.Path(config_save_path).exists(): - with open(config_save_path, "w", encoding="utf-8") as f: - json.dump( - config.json_config[config_path], - f, - ensure_ascii=False, - indent=4, - sort_keys=True, - ) - f.write("\n") - if gpus16: - cmd = ( - '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' - % ( - config.python_cmd, - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - gpus16, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) - ) - else: - cmd = ( - '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' - % ( - config.python_cmd, - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) - ) - logger.info("Execute: " + cmd) - p = Popen(cmd, shell=True, cwd=now_dir) - p.wait() - return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log" - - -# but4.click(train_index, [exp_dir1], info3) -def train_index(exp_dir1, version19): - # exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) - exp_dir = "logs/%s" % (exp_dir1) - os.makedirs(exp_dir, exist_ok=True) - feature_dir = ( - "%s/3_feature256" % (exp_dir) - if version19 == "v1" - else "%s/3_feature768" % (exp_dir) - ) - if not os.path.exists(feature_dir): - return "请先进行特征提取!" - listdir_res = list(os.listdir(feature_dir)) - if len(listdir_res) == 0: - return "请先进行特征提取!" - infos = [] - npys = [] - for name in sorted(listdir_res): - phone = np.load("%s/%s" % (feature_dir, name)) - npys.append(phone) - big_npy = np.concatenate(npys, 0) - big_npy_idx = np.arange(big_npy.shape[0]) - np.random.shuffle(big_npy_idx) - big_npy = big_npy[big_npy_idx] - if big_npy.shape[0] > 2e5: - infos.append("Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]) - yield "\n".join(infos) - try: - big_npy = ( - MiniBatchKMeans( - n_clusters=10000, - verbose=True, - batch_size=256 * config.n_cpu, - compute_labels=False, - init="random", - ) - .fit(big_npy) - .cluster_centers_ - ) - except: - info = traceback.format_exc() - logger.info(info) - infos.append(info) - yield "\n".join(infos) - - np.save("%s/total_fea.npy" % exp_dir, big_npy) - n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) - infos.append("%s,%s" % (big_npy.shape, n_ivf)) - yield "\n".join(infos) - index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf) - # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf) - infos.append("training") - yield "\n".join(infos) - index_ivf = faiss.extract_index_ivf(index) # - index_ivf.nprobe = 1 - index.train(big_npy) - faiss.write_index( - index, - "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index" - % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), - ) - infos.append("adding") - yield "\n".join(infos) - batch_size_add = 8192 - for i in range(0, big_npy.shape[0], batch_size_add): - index.add(big_npy[i : i + batch_size_add]) - faiss.write_index( - index, - "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), - ) - infos.append( - "成功构建索引 added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % (n_ivf, index_ivf.nprobe, exp_dir1, version19) - ) - try: - link = os.link if platform.system() == "Windows" else os.symlink - link( - "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), - "%s/%s_IVF%s_Flat_nprobe_%s_%s_%s.index" - % ( - outside_index_root, - exp_dir1, - n_ivf, - index_ivf.nprobe, - exp_dir1, - version19, - ), - ) - infos.append("链接索引到外部-%s" % (outside_index_root)) - except: - infos.append("链接索引到外部-%s失败" % (outside_index_root)) - - # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19)) - # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19)) - yield "\n".join(infos) - - -# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3) -def train1key( - exp_dir1, - sr2, - if_f0_3, - trainset_dir4, - spk_id5, - np7, - f0method8, - save_epoch10, - total_epoch11, - batch_size12, - if_save_latest13, - pretrained_G14, - pretrained_D15, - gpus16, - if_cache_gpu17, - if_save_every_weights18, - version19, - gpus_rmvpe, -): - infos = [] - - def get_info_str(strr): - infos.append(strr) - return "\n".join(infos) - - # step1:处理数据 - yield get_info_str(i18n("step1:正在处理数据")) - [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)] - - # step2a:提取音高 - yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) - [ - get_info_str(_) - for _ in extract_f0_feature( - gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe - ) - ] - - # step3a:训练模型 - yield get_info_str(i18n("step3a:正在训练模型")) - click_train( - exp_dir1, - sr2, - if_f0_3, - spk_id5, - save_epoch10, - total_epoch11, - batch_size12, - if_save_latest13, - pretrained_G14, - pretrained_D15, - gpus16, - if_cache_gpu17, - if_save_every_weights18, - version19, - ) - yield get_info_str( - i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log") - ) - - # step3b:训练索引 - [get_info_str(_) for _ in train_index(exp_dir1, version19)] - yield get_info_str(i18n("全流程结束!")) - - -# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) +# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) def change_info_(ckpt_path): if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")): return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} @@ -1015,11 +691,6 @@ with gr.Blocks(title="RVC WebUI") as app: outputs=file_index4, api_name="infer_refresh_batch", ) - # file_big_npy2 = gr.Textbox( - # label=i18n("特征文件路径"), - # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", - # interactive=True, - # ) with gr.Column(): resample_sr1 = gr.Slider( @@ -1110,504 +781,6 @@ with gr.Blocks(title="RVC WebUI") as app: outputs=[spk_item, protect0, protect1, file_index2, file_index4], api_name="infer_change_voice", ) - with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): - with gr.Group(): - gr.Markdown( - value=i18n( - "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。" - ) - ) - with gr.Row(): - with gr.Column(): - dir_wav_input = gr.Textbox( - label=i18n("输入待处理音频文件夹路径"), - placeholder="C:\\Users\\Desktop\\todo-songs", - ) - wav_inputs = gr.File( - file_count="multiple", - label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"), - ) - with gr.Column(): - model_choose = gr.Dropdown( - label=i18n("模型"), choices=uvr5_names - ) - agg = gr.Slider( - minimum=0, - maximum=20, - step=1, - label="人声提取激进程度", - value=10, - interactive=True, - visible=False, # 先不开放调整 - ) - opt_vocal_root = gr.Textbox( - label=i18n("指定输出主人声文件夹"), value="opt" - ) - opt_ins_root = gr.Textbox( - label=i18n("指定输出非主人声文件夹"), value="opt" - ) - format0 = gr.Radio( - label=i18n("导出文件格式"), - choices=["wav", "flac", "mp3", "m4a"], - value="flac", - interactive=True, - ) - but2 = gr.Button(i18n("转换"), variant="primary") - vc_output4 = gr.Textbox(label=i18n("输出信息")) - but2.click( - uvr, - [ - model_choose, - dir_wav_input, - opt_vocal_root, - wav_inputs, - opt_ins_root, - agg, - format0, - ], - [vc_output4], - api_name="uvr_convert", - ) - with gr.TabItem(i18n("训练")): - gr.Markdown( - value=i18n( - "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. " - ) - ) - with gr.Row(): - exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test") - sr2 = gr.Radio( - label=i18n("目标采样率"), - choices=["40k", "48k"], - value="40k", - interactive=True, - ) - if_f0_3 = gr.Radio( - label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"), - choices=[True, False], - value=True, - interactive=True, - ) - version19 = gr.Radio( - label=i18n("版本"), - choices=["v1", "v2"], - value="v2", - interactive=True, - visible=True, - ) - np7 = gr.Slider( - minimum=0, - maximum=config.n_cpu, - step=1, - label=i18n("提取音高和处理数据使用的CPU进程数"), - value=int(np.ceil(config.n_cpu / 1.5)), - interactive=True, - ) - with gr.Group(): # 暂时单人的, 后面支持最多4人的#数据处理 - gr.Markdown( - value=i18n( - "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. " - ) - ) - with gr.Row(): - trainset_dir4 = gr.Textbox( - label=i18n("输入训练文件夹路径"), - value=i18n("E:\\语音音频+标注\\米津玄师\\src"), - ) - spk_id5 = gr.Slider( - minimum=0, - maximum=4, - step=1, - label=i18n("请指定说话人id"), - value=0, - interactive=True, - ) - but1 = gr.Button(i18n("处理数据"), variant="primary") - info1 = gr.Textbox(label=i18n("输出信息"), value="") - but1.click( - preprocess_dataset, - [trainset_dir4, exp_dir1, sr2, np7], - [info1], - api_name="train_preprocess", - ) - with gr.Group(): - gr.Markdown( - value=i18n( - "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)" - ) - ) - with gr.Row(): - with gr.Column(): - gpus6 = gr.Textbox( - label=i18n( - "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2" - ), - value=gpus, - interactive=True, - visible=F0GPUVisible, - ) - gpu_info9 = gr.Textbox( - label=i18n("显卡信息"), value=gpu_info, visible=F0GPUVisible - ) - with gr.Column(): - f0method8 = gr.Radio( - label=i18n( - "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU" - ), - choices=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"], - value="rmvpe_gpu", - interactive=True, - ) - gpus_rmvpe = gr.Textbox( - label=i18n( - "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程" - ), - value="%s-%s" % (gpus, gpus), - interactive=True, - visible=F0GPUVisible, - ) - but2 = gr.Button(i18n("特征提取"), variant="primary") - info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - f0method8.change( - fn=change_f0_method, - inputs=[f0method8], - outputs=[gpus_rmvpe], - ) - but2.click( - extract_f0_feature, - [ - gpus6, - np7, - f0method8, - if_f0_3, - exp_dir1, - version19, - gpus_rmvpe, - ], - [info2], - api_name="train_extract_f0_feature", - ) - with gr.Group(): - gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引")) - with gr.Row(): - save_epoch10 = gr.Slider( - minimum=1, - maximum=50, - step=1, - label=i18n("保存频率save_every_epoch"), - value=5, - interactive=True, - ) - total_epoch11 = gr.Slider( - minimum=2, - maximum=1000, - step=1, - label=i18n("总训练轮数total_epoch"), - value=20, - interactive=True, - ) - batch_size12 = gr.Slider( - minimum=1, - maximum=40, - step=1, - label=i18n("每张显卡的batch_size"), - value=default_batch_size, - interactive=True, - ) - if_save_latest13 = gr.Radio( - label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), - choices=[i18n("是"), i18n("否")], - value=i18n("否"), - interactive=True, - ) - if_cache_gpu17 = gr.Radio( - label=i18n( - "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速" - ), - choices=[i18n("是"), i18n("否")], - value=i18n("否"), - interactive=True, - ) - if_save_every_weights18 = gr.Radio( - label=i18n( - "是否在每次保存时间点将最终小模型保存至weights文件夹" - ), - choices=[i18n("是"), i18n("否")], - value=i18n("否"), - interactive=True, - ) - with gr.Row(): - pretrained_G14 = gr.Textbox( - label=i18n("加载预训练底模G路径"), - value="assets/pretrained_v2/f0G40k.pth", - interactive=True, - ) - pretrained_D15 = gr.Textbox( - label=i18n("加载预训练底模D路径"), - value="assets/pretrained_v2/f0D40k.pth", - interactive=True, - ) - sr2.change( - change_sr2, - [sr2, if_f0_3, version19], - [pretrained_G14, pretrained_D15], - ) - version19.change( - change_version19, - [sr2, if_f0_3, version19], - [pretrained_G14, pretrained_D15, sr2], - ) - if_f0_3.change( - change_f0, - [if_f0_3, sr2, version19], - [f0method8, gpus_rmvpe, pretrained_G14, pretrained_D15], - ) - gpus16 = gr.Textbox( - label=i18n( - "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2" - ), - value=gpus, - interactive=True, - ) - but3 = gr.Button(i18n("训练模型"), variant="primary") - but4 = gr.Button(i18n("训练特征索引"), variant="primary") - but5 = gr.Button(i18n("一键训练"), variant="primary") - info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10) - but3.click( - click_train, - [ - exp_dir1, - sr2, - if_f0_3, - spk_id5, - save_epoch10, - total_epoch11, - batch_size12, - if_save_latest13, - pretrained_G14, - pretrained_D15, - gpus16, - if_cache_gpu17, - if_save_every_weights18, - version19, - ], - info3, - api_name="train_start", - ) - but4.click(train_index, [exp_dir1, version19], info3) - but5.click( - train1key, - [ - exp_dir1, - sr2, - if_f0_3, - trainset_dir4, - spk_id5, - np7, - f0method8, - save_epoch10, - total_epoch11, - batch_size12, - if_save_latest13, - pretrained_G14, - pretrained_D15, - gpus16, - if_cache_gpu17, - if_save_every_weights18, - version19, - gpus_rmvpe, - ], - info3, - api_name="train_start_all", - ) - - with gr.TabItem(i18n("ckpt处理")): - with gr.Group(): - gr.Markdown(value=i18n("模型融合, 可用于测试音色融合")) - with gr.Row(): - ckpt_a = gr.Textbox( - label=i18n("A模型路径"), value="", interactive=True - ) - ckpt_b = gr.Textbox( - label=i18n("B模型路径"), value="", interactive=True - ) - alpha_a = gr.Slider( - minimum=0, - maximum=1, - label=i18n("A模型权重"), - value=0.5, - interactive=True, - ) - with gr.Row(): - sr_ = gr.Radio( - label=i18n("目标采样率"), - choices=["40k", "48k"], - value="40k", - interactive=True, - ) - if_f0_ = gr.Radio( - label=i18n("模型是否带音高指导"), - choices=[i18n("是"), i18n("否")], - value=i18n("是"), - interactive=True, - ) - info__ = gr.Textbox( - label=i18n("要置入的模型信息"), - value="", - max_lines=8, - interactive=True, - ) - name_to_save0 = gr.Textbox( - label=i18n("保存的模型名不带后缀"), - value="", - max_lines=1, - interactive=True, - ) - version_2 = gr.Radio( - label=i18n("模型版本型号"), - choices=["v1", "v2"], - value="v1", - interactive=True, - ) - with gr.Row(): - but6 = gr.Button(i18n("融合"), variant="primary") - info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - but6.click( - merge, - [ - ckpt_a, - ckpt_b, - alpha_a, - sr_, - if_f0_, - info__, - name_to_save0, - version_2, - ], - info4, - api_name="ckpt_merge", - ) # def merge(path1,path2,alpha1,sr,f0,info): - with gr.Group(): - gr.Markdown( - value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)") - ) - with gr.Row(): - ckpt_path0 = gr.Textbox( - label=i18n("模型路径"), value="", interactive=True - ) - info_ = gr.Textbox( - label=i18n("要改的模型信息"), - value="", - max_lines=8, - interactive=True, - ) - name_to_save1 = gr.Textbox( - label=i18n("保存的文件名, 默认空为和源文件同名"), - value="", - max_lines=8, - interactive=True, - ) - with gr.Row(): - but7 = gr.Button(i18n("修改"), variant="primary") - info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - but7.click( - change_info, - [ckpt_path0, info_, name_to_save1], - info5, - api_name="ckpt_modify", - ) - with gr.Group(): - gr.Markdown( - value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)") - ) - with gr.Row(): - ckpt_path1 = gr.Textbox( - label=i18n("模型路径"), value="", interactive=True - ) - but8 = gr.Button(i18n("查看"), variant="primary") - info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - but8.click(show_info, [ckpt_path1], info6, api_name="ckpt_show") - with gr.Group(): - gr.Markdown( - value=i18n( - "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况" - ) - ) - with gr.Row(): - ckpt_path2 = gr.Textbox( - label=i18n("模型路径"), - value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth", - interactive=True, - ) - save_name = gr.Textbox( - label=i18n("保存名"), value="", interactive=True - ) - sr__ = gr.Radio( - label=i18n("目标采样率"), - choices=["32k", "40k", "48k"], - value="40k", - interactive=True, - ) - if_f0__ = gr.Radio( - label=i18n("模型是否带音高指导,1是0否"), - choices=["1", "0"], - value="1", - interactive=True, - ) - version_1 = gr.Radio( - label=i18n("模型版本型号"), - choices=["v1", "v2"], - value="v2", - interactive=True, - ) - info___ = gr.Textbox( - label=i18n("要置入的模型信息"), - value="", - max_lines=8, - interactive=True, - ) - but9 = gr.Button(i18n("提取"), variant="primary") - info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - ckpt_path2.change( - change_info_, [ckpt_path2], [sr__, if_f0__, version_1] - ) - but9.click( - extract_small_model, - [ckpt_path2, save_name, sr__, if_f0__, info___, version_1], - info7, - api_name="ckpt_extract", - ) - - with gr.TabItem(i18n("Onnx导出")): - with gr.Row(): - ckpt_dir = gr.Textbox( - label=i18n("RVC模型路径"), value="", interactive=True - ) - with gr.Row(): - onnx_dir = gr.Textbox( - label=i18n("Onnx输出路径"), value="", interactive=True - ) - with gr.Row(): - infoOnnx = gr.Label(label="info") - with gr.Row(): - butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary") - butOnnx.click( - export_onnx, [ckpt_dir, onnx_dir], infoOnnx, api_name="export_onnx" - ) - - tab_faq = i18n("常见问题解答") - with gr.TabItem(tab_faq): - try: - if tab_faq == "常见问题解答": - with open("docs/cn/faq.md", "r", encoding="utf8") as f: - info = f.read() - else: - with open("docs/en/faq_en.md", "r", encoding="utf8") as f: - info = f.read() - gr.Markdown(value=info) - except: - gr.Markdown(traceback.format_exc()) - if config.iscolab: app.queue(concurrency_count=511, max_size=1022).launch(share=True) else: diff --git a/infer/lib/train/data_utils.py b/infer/lib/train/data_utils.py deleted file mode 100644 index 1e1d1db..0000000 --- a/infer/lib/train/data_utils.py +++ /dev/null @@ -1,517 +0,0 @@ -import os -import traceback -import logging - -logger = logging.getLogger(__name__) - -import numpy as np -import torch -import torch.utils.data - -from infer.lib.train.mel_processing import spectrogram_torch -from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch - - -class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. - """ - - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - file = audiopath_and_text[0] - phone = audiopath_and_text[1] - pitch = audiopath_and_text[2] - pitchf = audiopath_and_text[3] - dv = audiopath_and_text[4] - - phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) - spec, wav = self.get_audio(file) - dv = self.get_sid(dv) - - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - # print(123,phone.shape,pitch.shape,spec.shape) - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - # amor - len_wav = len_min * self.hop_length - - spec = spec[:, :len_min] - wav = wav[:, :len_wav] - - phone = phone[:len_min, :] - pitch = pitch[:len_min] - pitchf = pitchf[:len_min] - - return (spec, wav, phone, pitch, pitchf, dv) - - def get_labels(self, phone, pitch, pitchf): - phone = np.load(phone) - phone = np.repeat(phone, 2, axis=0) - pitch = np.load(pitch) - pitchf = np.load(pitchf) - n_num = min(phone.shape[0], 900) # DistributedBucketSampler - # print(234,phone.shape,pitch.shape) - phone = phone[:n_num, :] - pitch = pitch[:n_num] - pitchf = pitchf[:n_num] - phone = torch.FloatTensor(phone) - pitch = torch.LongTensor(pitch) - pitchf = torch.FloatTensor(pitchf) - return phone, pitch, pitchf - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError( - "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate - ) - ) - audio_norm = audio - # audio_norm = audio / self.max_wav_value - # audio_norm = audio / np.abs(audio).max() - - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except: - logger.warning("%s %s", spec_filename, traceback.format_exc()) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollateMultiNSFsid: - """Zero-pads model inputs and targets""" - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True - ) - - max_spec_len = max([x[0].size(1) for x in batch]) - max_wave_len = max([x[1].size(1) for x in batch]) - spec_lengths = torch.LongTensor(len(batch)) - wave_lengths = torch.LongTensor(len(batch)) - spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) - wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) - spec_padded.zero_() - wave_padded.zero_() - - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor( - len(batch), max_phone_len, batch[0][2].shape[1] - ) # (spec, wav, phone, pitch) - pitch_padded = torch.LongTensor(len(batch), max_phone_len) - pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) - phone_padded.zero_() - pitch_padded.zero_() - pitchf_padded.zero_() - # dv = torch.FloatTensor(len(batch), 256)#gin=256 - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - pitch = row[3] - pitch_padded[i, : pitch.size(0)] = pitch - pitchf = row[4] - pitchf_padded[i, : pitchf.size(0)] = pitchf - - # dv[i] = row[5] - sid[i] = row[5] - - return ( - phone_padded, - phone_lengths, - pitch_padded, - pitchf_padded, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - # dv - sid, - ) - - -class TextAudioLoader(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. - """ - - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - file = audiopath_and_text[0] - phone = audiopath_and_text[1] - dv = audiopath_and_text[2] - - phone = self.get_labels(phone) - spec, wav = self.get_audio(file) - dv = self.get_sid(dv) - - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - len_wav = len_min * self.hop_length - spec = spec[:, :len_min] - wav = wav[:, :len_wav] - phone = phone[:len_min, :] - return (spec, wav, phone, dv) - - def get_labels(self, phone): - phone = np.load(phone) - phone = np.repeat(phone, 2, axis=0) - n_num = min(phone.shape[0], 900) # DistributedBucketSampler - phone = phone[:n_num, :] - phone = torch.FloatTensor(phone) - return phone - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError( - "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate - ) - ) - audio_norm = audio - # audio_norm = audio / self.max_wav_value - # audio_norm = audio / np.abs(audio).max() - - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except: - logger.warning("%s %s", spec_filename, traceback.format_exc()) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollate: - """Zero-pads model inputs and targets""" - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True - ) - - max_spec_len = max([x[0].size(1) for x in batch]) - max_wave_len = max([x[1].size(1) for x in batch]) - spec_lengths = torch.LongTensor(len(batch)) - wave_lengths = torch.LongTensor(len(batch)) - spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) - wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) - spec_padded.zero_() - wave_padded.zero_() - - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor( - len(batch), max_phone_len, batch[0][2].shape[1] - ) - phone_padded.zero_() - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - sid[i] = row[3] - - return ( - phone_padded, - phone_lengths, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - sid, - ) - - -class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): - """ - Maintain similar input lengths in a batch. - Length groups are specified by boundaries. - Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. - - It removes samples which are not included in the boundaries. - Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. - """ - - def __init__( - self, - dataset, - batch_size, - boundaries, - num_replicas=None, - rank=None, - shuffle=True, - ): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - self.lengths = dataset.lengths - self.batch_size = batch_size - self.boundaries = boundaries - - self.buckets, self.num_samples_per_bucket = self._create_buckets() - self.total_size = sum(self.num_samples_per_bucket) - self.num_samples = self.total_size // self.num_replicas - - def _create_buckets(self): - buckets = [[] for _ in range(len(self.boundaries) - 1)] - for i in range(len(self.lengths)): - length = self.lengths[i] - idx_bucket = self._bisect(length) - if idx_bucket != -1: - buckets[idx_bucket].append(i) - - for i in range(len(buckets) - 1, -1, -1): # - if len(buckets[i]) == 0: - buckets.pop(i) - self.boundaries.pop(i + 1) - - num_samples_per_bucket = [] - for i in range(len(buckets)): - len_bucket = len(buckets[i]) - total_batch_size = self.num_replicas * self.batch_size - rem = ( - total_batch_size - (len_bucket % total_batch_size) - ) % total_batch_size - num_samples_per_bucket.append(len_bucket + rem) - return buckets, num_samples_per_bucket - - def __iter__(self): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(self.epoch) - - indices = [] - if self.shuffle: - for bucket in self.buckets: - indices.append(torch.randperm(len(bucket), generator=g).tolist()) - else: - for bucket in self.buckets: - indices.append(list(range(len(bucket)))) - - batches = [] - for i in range(len(self.buckets)): - bucket = self.buckets[i] - len_bucket = len(bucket) - ids_bucket = indices[i] - num_samples_bucket = self.num_samples_per_bucket[i] - - # add extra samples to make it evenly divisible - rem = num_samples_bucket - len_bucket - ids_bucket = ( - ids_bucket - + ids_bucket * (rem // len_bucket) - + ids_bucket[: (rem % len_bucket)] - ) - - # subsample - ids_bucket = ids_bucket[self.rank :: self.num_replicas] - - # batching - for j in range(len(ids_bucket) // self.batch_size): - batch = [ - bucket[idx] - for idx in ids_bucket[ - j * self.batch_size : (j + 1) * self.batch_size - ] - ] - batches.append(batch) - - if self.shuffle: - batch_ids = torch.randperm(len(batches), generator=g).tolist() - batches = [batches[i] for i in batch_ids] - self.batches = batches - - assert len(self.batches) * self.batch_size == self.num_samples - return iter(self.batches) - - def _bisect(self, x, lo=0, hi=None): - if hi is None: - hi = len(self.boundaries) - 1 - - if hi > lo: - mid = (hi + lo) // 2 - if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: - return mid - elif x <= self.boundaries[mid]: - return self._bisect(x, lo, mid) - else: - return self._bisect(x, mid + 1, hi) - else: - return -1 - - def __len__(self): - return self.num_samples // self.batch_size diff --git a/infer/lib/train/losses.py b/infer/lib/train/losses.py deleted file mode 100644 index aa7bd81..0000000 --- a/infer/lib/train/losses.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch - - -def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - - return loss * 2 - - -def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses = [] - g_losses = [] - for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - dr = dr.float() - dg = dg.float() - r_loss = torch.mean((1 - dr) ** 2) - g_loss = torch.mean(dg**2) - loss += r_loss + g_loss - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) - - return loss, r_losses, g_losses - - -def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - for dg in disc_outputs: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - - -def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): - """ - z_p, logs_q: [b, h, t_t] - m_p, logs_p: [b, h, t_t] - """ - z_p = z_p.float() - logs_q = logs_q.float() - m_p = m_p.float() - logs_p = logs_p.float() - z_mask = z_mask.float() - - kl = logs_p - logs_q - 0.5 - kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) - kl = torch.sum(kl * z_mask) - l = kl / torch.sum(z_mask) - return l diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py deleted file mode 100644 index 3751f1e..0000000 --- a/infer/lib/train/mel_processing.py +++ /dev/null @@ -1,127 +0,0 @@ -import torch -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn -import logging - -logger = logging.getLogger(__name__) - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - return dynamic_range_compression_torch(magnitudes) - - -def spectral_de_normalize_torch(magnitudes): - return dynamic_range_decompression_torch(magnitudes) - - -# Reusable banks -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - """Convert waveform into Linear-frequency Linear-amplitude spectrogram. - - Args: - y :: (B, T) - Audio waveforms - n_fft - sampling_rate - hop_size - win_size - center - Returns: - :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram - """ - - # Window - Cache if needed - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) - - # Padding - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - - # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) - spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - # MelBasis - Cache if needed - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=spec.dtype, device=spec.device - ) - - # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) - melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) - melspec = spectral_normalize_torch(melspec) - return melspec - - -def mel_spectrogram_torch( - y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False -): - """Convert waveform into Mel-frequency Log-amplitude spectrogram. - - Args: - y :: (B, T) - Waveforms - Returns: - melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram - """ - # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) - spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) - - # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) - melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) - - return melspec diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py deleted file mode 100644 index 2529ccf..0000000 --- a/infer/lib/train/process_ckpt.py +++ /dev/null @@ -1,261 +0,0 @@ -import os -import sys -import traceback -from collections import OrderedDict - -import torch - -from i18n.i18n import I18nAuto - -i18n = I18nAuto() - - -def savee(ckpt, sr, if_f0, name, epoch, version, hps): - try: - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = ckpt[key].half() - opt["config"] = [ - hps.data.filter_length // 2 + 1, - 32, - hps.model.inter_channels, - hps.model.hidden_channels, - hps.model.filter_channels, - hps.model.n_heads, - hps.model.n_layers, - hps.model.kernel_size, - hps.model.p_dropout, - hps.model.resblock, - hps.model.resblock_kernel_sizes, - hps.model.resblock_dilation_sizes, - hps.model.upsample_rates, - hps.model.upsample_initial_channel, - hps.model.upsample_kernel_sizes, - hps.model.spk_embed_dim, - hps.model.gin_channels, - hps.data.sampling_rate, - ] - opt["info"] = "%sepoch" % epoch - opt["sr"] = sr - opt["f0"] = if_f0 - opt["version"] = version - torch.save(opt, "assets/weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() - - -def show_info(path): - try: - a = torch.load(path, map_location="cpu") - return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( - a.get("info", "None"), - a.get("sr", "None"), - a.get("f0", "None"), - a.get("version", "None"), - ) - except: - return traceback.format_exc() - - -def extract_small_model(path, name, sr, if_f0, info, version): - try: - ckpt = torch.load(path, map_location="cpu") - if "model" in ckpt: - ckpt = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = ckpt[key].half() - if sr == "40k": - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 10, 2, 2], - 512, - [16, 16, 4, 4], - 109, - 256, - 40000, - ] - elif sr == "48k": - if version == "v1": - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 6, 2, 2, 2], - 512, - [16, 16, 4, 4, 4], - 109, - 256, - 48000, - ] - else: - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [12, 10, 2, 2], - 512, - [24, 20, 4, 4], - 109, - 256, - 48000, - ] - elif sr == "32k": - if version == "v1": - opt["config"] = [ - 513, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 4, 2, 2, 2], - 512, - [16, 16, 4, 4, 4], - 109, - 256, - 32000, - ] - else: - opt["config"] = [ - 513, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 8, 2, 2], - 512, - [20, 16, 4, 4], - 109, - 256, - 32000, - ] - if info == "": - info = "Extracted model." - opt["info"] = info - opt["version"] = version - opt["sr"] = sr - opt["f0"] = int(if_f0) - torch.save(opt, "assets/weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() - - -def change_info(path, info, name): - try: - ckpt = torch.load(path, map_location="cpu") - ckpt["info"] = info - if name == "": - name = os.path.basename(path) - torch.save(ckpt, "assets/weights/%s" % name) - return "Success." - except: - return traceback.format_exc() - - -def merge(path1, path2, alpha1, sr, f0, info, name, version): - try: - - def extract(ckpt): - a = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - for key in a.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = a[key] - return opt - - ckpt1 = torch.load(path1, map_location="cpu") - ckpt2 = torch.load(path2, map_location="cpu") - cfg = ckpt1["config"] - if "model" in ckpt1: - ckpt1 = extract(ckpt1) - else: - ckpt1 = ckpt1["weight"] - if "model" in ckpt2: - ckpt2 = extract(ckpt2) - else: - ckpt2 = ckpt2["weight"] - if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): - return "Fail to merge the models. The model architectures are not the same." - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt1.keys(): - # try: - if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: - min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) - opt["weight"][key] = ( - alpha1 * (ckpt1[key][:min_shape0].float()) - + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) - ).half() - else: - opt["weight"][key] = ( - alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) - ).half() - # except: - # pdb.set_trace() - opt["config"] = cfg - """ - if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] - elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] - elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] - """ - opt["sr"] = sr - opt["f0"] = 1 if f0 == i18n("是") else 0 - opt["version"] = version - opt["info"] = info - torch.save(opt, "assets/weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py deleted file mode 100644 index 765c54c..0000000 --- a/infer/lib/train/utils.py +++ /dev/null @@ -1,483 +0,0 @@ -import argparse -import glob -import json -import logging -import os -import subprocess -import sys -import shutil - -import numpy as np -import torch -from scipy.io.wavfile import read - -MATPLOTLIB_FLAG = False - -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) -logger = logging - - -def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - ################## - def go(model, bkey): - saved_state_dict = checkpoint_dict[bkey] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - logger.warning( - "shape-%s-mismatch. need: %s, get: %s", - k, - state_dict[k].shape, - saved_state_dict[k].shape, - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint", k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - return model - - go(combd, "combd") - model = go(sbd, "sbd") - ############# - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -# def load_checkpoint(checkpoint_path, model, optimizer=None): -# assert os.path.isfile(checkpoint_path) -# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') -# iteration = checkpoint_dict['iteration'] -# learning_rate = checkpoint_dict['learning_rate'] -# if optimizer is not None: -# optimizer.load_state_dict(checkpoint_dict['optimizer']) -# # print(1111) -# saved_state_dict = checkpoint_dict['model'] -# # print(1111) -# -# if hasattr(model, 'module'): -# state_dict = model.module.state_dict() -# else: -# state_dict = model.state_dict() -# new_state_dict= {} -# for k, v in state_dict.items(): -# try: -# new_state_dict[k] = saved_state_dict[k] -# except: -# logger.info("%s is not in the checkpoint" % k) -# new_state_dict[k] = v -# if hasattr(model, 'module'): -# model.module.load_state_dict(new_state_dict) -# else: -# model.load_state_dict(new_state_dict) -# logger.info("Loaded checkpoint '{}' (epoch {})" .format( -# checkpoint_path, iteration)) -# return model, optimizer, learning_rate, iteration -def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - saved_state_dict = checkpoint_dict["model"] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - logger.warning( - "shape-%s-mismatch|need-%s|get-%s", - k, - state_dict[k].shape, - saved_state_dict[k].shape, - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint", k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save( - { - "model": state_dict, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(combd, "module"): - state_dict_combd = combd.module.state_dict() - else: - state_dict_combd = combd.state_dict() - if hasattr(sbd, "module"): - state_dict_sbd = sbd.module.state_dict() - else: - state_dict_sbd = sbd.state_dict() - torch.save( - { - "combd": state_dict_combd, - "sbd": state_dict_sbd, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def summarize( - writer, - global_step, - scalars={}, - histograms={}, - images={}, - audios={}, - audio_sampling_rate=22050, -): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - for k, v in histograms.items(): - writer.add_histogram(k, v, global_step) - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats="HWC") - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) - - -def latest_checkpoint_path(dir_path, regex="G_*.pth"): - f_list = glob.glob(os.path.join(dir_path, regex)) - f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) - x = f_list[-1] - logger.debug(x) - return x - - -def plot_spectrogram_to_numpy(spectrogram): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger("matplotlib") - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(10, 2)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") - plt.colorbar(im, ax=ax) - plt.xlabel("Frames") - plt.ylabel("Channels") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def plot_alignment_to_numpy(alignment, info=None): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger("matplotlib") - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow( - alignment.transpose(), aspect="auto", origin="lower", interpolation="none" - ) - fig.colorbar(im, ax=ax) - xlabel = "Decoder timestep" - if info is not None: - xlabel += "\n\n" + info - plt.xlabel(xlabel) - plt.ylabel("Encoder timestep") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate - - -def load_filepaths_and_text(filename, split="|"): - try: - with open(filename, encoding="utf-8") as f: - filepaths_and_text = [line.strip().split(split) for line in f] - except UnicodeDecodeError: - with open(filename) as f: - filepaths_and_text = [line.strip().split(split) for line in f] - - return filepaths_and_text - - -def get_hparams(init=True): - """ - todo: - 结尾七人组: - 保存频率、总epoch done - bs done - pretrainG、pretrainD done - 卡号:os.en["CUDA_VISIBLE_DEVICES"] done - if_latest done - 模型:if_f0 done - 采样率:自动选择config done - 是否缓存数据集进GPU:if_cache_data_in_gpu done - - -m: - 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done - -c不要了 - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-se", - "--save_every_epoch", - type=int, - required=True, - help="checkpoint save frequency (epoch)", - ) - parser.add_argument( - "-te", "--total_epoch", type=int, required=True, help="total_epoch" - ) - parser.add_argument( - "-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path" - ) - parser.add_argument( - "-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path" - ) - parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") - parser.add_argument( - "-bs", "--batch_size", type=int, required=True, help="batch size" - ) - parser.add_argument( - "-e", "--experiment_dir", type=str, required=True, help="experiment dir" - ) # -m - parser.add_argument( - "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" - ) - parser.add_argument( - "-sw", - "--save_every_weights", - type=str, - default="0", - help="save the extracted model in weights directory when saving checkpoints", - ) - parser.add_argument( - "-v", "--version", type=str, required=True, help="model version" - ) - parser.add_argument( - "-f0", - "--if_f0", - type=int, - required=True, - help="use f0 as one of the inputs of the model, 1 or 0", - ) - parser.add_argument( - "-l", - "--if_latest", - type=int, - required=True, - help="if only save the latest G/D pth file, 1 or 0", - ) - parser.add_argument( - "-c", - "--if_cache_data_in_gpu", - type=int, - required=True, - help="if caching the dataset in GPU memory, 1 or 0", - ) - - args = parser.parse_args() - name = args.experiment_dir - experiment_dir = os.path.join("./logs", args.experiment_dir) - - config_save_path = os.path.join(experiment_dir, "config.json") - with open(config_save_path, "r") as f: - config = json.load(f) - - hparams = HParams(**config) - hparams.model_dir = hparams.experiment_dir = experiment_dir - hparams.save_every_epoch = args.save_every_epoch - hparams.name = name - hparams.total_epoch = args.total_epoch - hparams.pretrainG = args.pretrainG - hparams.pretrainD = args.pretrainD - hparams.version = args.version - hparams.gpus = args.gpus - hparams.train.batch_size = args.batch_size - hparams.sample_rate = args.sample_rate - hparams.if_f0 = args.if_f0 - hparams.if_latest = args.if_latest - hparams.save_every_weights = args.save_every_weights - hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu - hparams.data.training_files = "%s/filelist.txt" % experiment_dir - return hparams - - -def get_hparams_from_dir(model_dir): - config_save_path = os.path.join(model_dir, "config.json") - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = model_dir - return hparams - - -def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - return hparams - - -def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warning( - "{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - ) - ) - return - - cur_hash = subprocess.getoutput("git rev-parse HEAD") - - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warning( - "git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8] - ) - ) - else: - open(path, "w").write(cur_hash) - - -def get_logger(model_dir, filename="train.log"): - global logger - logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") - if not os.path.exists(model_dir): - os.makedirs(model_dir) - h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) - h.setFormatter(formatter) - logger.addHandler(h) - return logger - - -class HParams: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py deleted file mode 100644 index 2084eb8..0000000 --- a/infer/modules/uvr5/modules.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import traceback -import logging - -logger = logging.getLogger(__name__) - -import ffmpeg -import torch - -from configs.config import Config -from infer.modules.uvr5.mdxnet import MDXNetDereverb -from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho - -config = Config() - - -def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): - infos = [] - try: - inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_vocal = ( - save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - save_root_ins = ( - save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - if model_name == "onnx_dereverb_By_FoxJoy": - pre_fun = MDXNetDereverb(15, config.device) - else: - func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho - pre_fun = func( - agg=int(agg), - model_path=os.path.join( - os.getenv("weight_uvr5_root"), model_name + ".pth" - ), - device=config.device, - is_half=config.is_half, - ) - is_hp3 = "HP3" in model_name - if inp_root != "": - paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] - else: - paths = [path.name for path in paths] - for path in paths: - inp_path = os.path.join(inp_root, path) - need_reformat = 1 - done = 0 - try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 - ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % ( - os.path.join(os.environ["TEMP"]), - os.path.basename(inp_path), - ) - os.system( - 'ffmpeg -i "%s" -vn -acodec pcm_s16le -ac 2 -ar 44100 "%s" -y' - % (inp_path, tmp_path) - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) - except: - infos.append(traceback.format_exc()) - yield "\n".join(infos) - finally: - try: - if model_name == "onnx_dereverb_By_FoxJoy": - del pre_fun.pred.model - del pre_fun.pred.model_ - else: - del pre_fun.model - del pre_fun - except: - traceback.print_exc() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - logger.info("Executed torch.cuda.empty_cache()") - yield "\n".join(infos)