diff --git a/infer-web.py b/infer-web.py
index 0a47e07..4905990 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -10,13 +10,12 @@ load_dotenv("sha256.env")
 if sys.platform == "darwin":
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 
-from infer.modules.vc import VC
+from infer.modules.vc import VC, show_info
 from infer.modules.uvr5.modules import uvr
 from infer.lib.train.process_ckpt import (
     change_info,
     extract_small_model,
     merge,
-    show_info,
 )
 from i18n.i18n import I18nAuto
 from configs.config import Config
@@ -838,6 +837,7 @@ with gr.Blocks(title="RVC WebUI") as app:
                 clean_button.click(
                     fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
                 )
+                modelinfo = gr.Textbox(label=i18n("模型信息"))
             with gr.TabItem(i18n("单次推理")):
                 with gr.Group():
                     with gr.Row():
@@ -846,24 +846,23 @@ with gr.Blocks(title="RVC WebUI") as app:
                                 label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
                                 value=0,
                             )
-                            input_audio0 = gr.Textbox(
+                            input_audio0 = gr.File(
                                 label=i18n(
-                                    "输入待处理音频文件路径(默认是正确格式示例)"
+                                    "待处理音频文件"
                                 ),
-                                placeholder="C:\\Users\\Desktop\\audio_example.wav",
-                            )
-                            file_index1 = gr.Textbox(
-                                label=i18n(
-                                    "特征检索库文件路径,为空则使用下拉的选择结果"
-                                ),
-                                placeholder="C:\\Users\\Desktop\\model_example.index",
-                                interactive=True,
+                                file_types=["audio"]
                             )
                             file_index2 = gr.Dropdown(
                                 label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                                 choices=sorted(index_paths),
                                 interactive=True,
                             )
+                            file_index1 = gr.File(
+                                label=i18n(
+                                    "特征检索库文件路径,为空则使用下拉的选择结果"
+                                ),
+                            )
+                        with gr.Column():
                             f0method0 = gr.Radio(
                                 label=i18n(
                                     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
                                 ),
                                 choices=(
                                     ["pm", "harvest", "crepe", "rmvpe"]
                                     if config.dml == False
                                     else ["pm", "harvest", "rmvpe"]
                                 ),
@@ -876,8 +875,6 @@ with gr.Blocks(title="RVC WebUI") as app:
                                 value="rmvpe",
                                 interactive=True,
                             )
-
-                        with gr.Column():
                             resample_sr0 = gr.Slider(
                                 minimum=0,
                                 maximum=48000,
@@ -928,6 +925,10 @@ with gr.Blocks(title="RVC WebUI") as app:
                                 ),
                                 visible=False,
                             )
+                            but0 = gr.Button(i18n("转换"), variant="primary")
+                            vc_output2 = gr.Audio(
+                                label=i18n("输出音频(右下角三个点,点了可以下载)")
+                            )
 
                     refresh_button.click(
                         fn=change_choices,
                         inputs=[],
@@ -935,19 +936,8 @@ with gr.Blocks(title="RVC WebUI") as app:
                         outputs=[sid0, file_index2],
                         api_name="infer_refresh",
                     )
-                    # file_big_npy1 = gr.Textbox(
-                    #     label=i18n("特征文件路径"),
-                    #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
-                    #     interactive=True,
-                    # )
                 with gr.Group():
-                    with gr.Column():
-                        but0 = gr.Button(i18n("转换"), variant="primary")
-                        with gr.Row():
-                            vc_output1 = gr.Textbox(label=i18n("输出信息"))
-                            vc_output2 = gr.Audio(
-                                label=i18n("输出音频(右下角三个点,点了可以下载)")
-                            )
+                    vc_output1 = gr.Textbox(label=i18n("输出信息"))
 
                     but0.click(
                         vc.vc_single,
@@ -981,36 +971,28 @@ with gr.Blocks(title="RVC WebUI") as app:
                             label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
                             value=0,
                         )
+                        dir_input = gr.Textbox(
+                            label=i18n(
+                                "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"
+                            ),
+                            placeholder="C:\\Users\\Desktop\\input_vocal_dir",
+                        )
+                        inputs = gr.File(
+                            file_count="multiple",
+                            label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
+                        )
                         opt_input = gr.Textbox(
                             label=i18n("指定输出文件夹"), value="opt"
                         )
-                        file_index3 = gr.Textbox(
-                            label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
-                            value="",
-                            interactive=True,
-                        )
                         file_index4 = gr.Dropdown(
                             label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                             choices=sorted(index_paths),
                             interactive=True,
                         )
-                        f0method1 = gr.Radio(
+                        file_index3 = gr.File(
                             label=i18n(
-                                "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+                                "特征检索库文件路径,为空则使用下拉的选择结果"
                             ),
-                            choices=(
-                                ["pm", "harvest", "crepe", "rmvpe"]
-                                if config.dml == False
-                                else ["pm", "harvest", "rmvpe"]
-                            ),
-                            value="rmvpe",
-                            interactive=True,
-                        )
-                        format1 = gr.Radio(
-                            label=i18n("导出文件格式"),
-                            choices=["wav", "flac", "mp3", "m4a"],
-                            value="wav",
-                            interactive=True,
                         )
 
                         refresh_button.click(
@@ -1026,6 +1008,18 @@ with gr.Blocks(title="RVC WebUI") as app:
                         #     )
 
                     with gr.Column():
+                        f0method1 = gr.Radio(
+                            label=i18n(
+                                "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+                            ),
+                            choices=(
+                                ["pm", "harvest", "crepe", "rmvpe"]
+                                if config.dml == False
+                                else ["pm", "harvest", "rmvpe"]
+                            ),
+                            value="rmvpe",
+                            interactive=True,
+                        )
                         resample_sr1 = gr.Slider(
                             minimum=0,
                             maximum=48000,
@@ -1070,48 +1064,42 @@ with gr.Blocks(title="RVC WebUI") as app:
                             value=1,
                             interactive=True,
                         )
-                with gr.Row():
-                    dir_input = gr.Textbox(
-                        label=i18n(
-                            "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"
-                        ),
-                        placeholder="C:\\Users\\Desktop\\input_vocal_dir",
-                    )
-                    inputs = gr.File(
-                        file_count="multiple",
-                        label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
-                    )
+                        format1 = gr.Radio(
+                            label=i18n("导出文件格式"),
+                            choices=["wav", "flac", "mp3", "m4a"],
+                            value="wav",
+                            interactive=True,
+                        )
+                        but1 = gr.Button(i18n("转换"), variant="primary")
 
-                with gr.Row():
-                    but1 = gr.Button(i18n("转换"), variant="primary")
-                    vc_output3 = gr.Textbox(label=i18n("输出信息"))
+                        vc_output3 = gr.Textbox(label=i18n("输出信息"))
 
-                    but1.click(
-                        vc.vc_multi,
-                        [
-                            spk_item,
-                            dir_input,
-                            opt_input,
-                            inputs,
-                            vc_transform1,
-                            f0method1,
-                            file_index3,
-                            file_index4,
-                            # file_big_npy2,
-                            index_rate2,
-                            filter_radius1,
-                            resample_sr1,
-                            rms_mix_rate1,
-                            protect1,
-                            format1,
-                        ],
-                        [vc_output3],
-                        api_name="infer_convert_batch",
-                    )
+                but1.click(
+                    vc.vc_multi,
+                    [
+                        spk_item,
+                        dir_input,
+                        opt_input,
+                        inputs,
+                        vc_transform1,
+                        f0method1,
+                        file_index3,
+                        file_index4,
+                        # file_big_npy2,
+                        index_rate2,
+                        filter_radius1,
+                        resample_sr1,
+                        rms_mix_rate1,
+                        protect1,
+                        format1,
+                    ],
+                    [vc_output3],
+                    api_name="infer_convert_batch",
+                )
         sid0.change(
             fn=vc.get_vc,
             inputs=[sid0, protect0, protect1],
-            outputs=[spk_item, protect0, protect1, file_index2, file_index4],
+            outputs=[spk_item, protect0, protect1, file_index2, file_index4, modelinfo],
            api_name="infer_change_voice",
         )
     with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
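Note on the UI change above: swapping `gr.Textbox` for `gr.File` means Gradio now hands the backend a temp-file wrapper instead of a plain path string, which is why the `infer/modules/vc/modules.py` hunks further down unwrap values via `.name`. A minimal sketch of that unwrap pattern (the helper name is illustrative, not part of this diff):

```python
def as_path(value):
    """Accept either a plain path string or a Gradio file wrapper (hypothetical helper)."""
    if value is None:
        return None
    # gr.File passes an object whose .name attribute holds the temp-file path
    if hasattr(value, "name"):
        return str(value.name)
    return str(value)
```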
"m4a"], - value="wav", - interactive=True, ) refresh_button.click( @@ -1026,6 +1008,18 @@ with gr.Blocks(title="RVC WebUI") as app: # ) with gr.Column(): + f0method1 = gr.Radio( + label=i18n( + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU" + ), + choices=( + ["pm", "harvest", "crepe", "rmvpe"] + if config.dml == False + else ["pm", "harvest", "rmvpe"] + ), + value="rmvpe", + interactive=True, + ) resample_sr1 = gr.Slider( minimum=0, maximum=48000, @@ -1070,48 +1064,42 @@ with gr.Blocks(title="RVC WebUI") as app: value=1, interactive=True, ) - with gr.Row(): - dir_input = gr.Textbox( - label=i18n( - "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)" - ), - placeholder="C:\\Users\\Desktop\\input_vocal_dir", - ) - inputs = gr.File( - file_count="multiple", - label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"), - ) + format1 = gr.Radio( + label=i18n("导出文件格式"), + choices=["wav", "flac", "mp3", "m4a"], + value="wav", + interactive=True, + ) + but1 = gr.Button(i18n("转换"), variant="primary") - with gr.Row(): - but1 = gr.Button(i18n("转换"), variant="primary") - vc_output3 = gr.Textbox(label=i18n("输出信息")) + vc_output3 = gr.Textbox(label=i18n("输出信息")) - but1.click( - vc.vc_multi, - [ - spk_item, - dir_input, - opt_input, - inputs, - vc_transform1, - f0method1, - file_index3, - file_index4, - # file_big_npy2, - index_rate2, - filter_radius1, - resample_sr1, - rms_mix_rate1, - protect1, - format1, - ], - [vc_output3], - api_name="infer_convert_batch", - ) + but1.click( + vc.vc_multi, + [ + spk_item, + dir_input, + opt_input, + inputs, + vc_transform1, + f0method1, + file_index3, + file_index4, + # file_big_npy2, + index_rate2, + filter_radius1, + resample_sr1, + rms_mix_rate1, + protect1, + format1, + ], + [vc_output3], + api_name="infer_convert_batch", + ) sid0.change( fn=vc.get_vc, inputs=[sid0, protect0, protect1], - outputs=[spk_item, protect0, protect1, file_index2, file_index4], + outputs=[spk_item, protect0, protect1, file_index2, file_index4, modelinfo], api_name="infer_change_voice", ) with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index 2529ccf..23377f6 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -1,16 +1,17 @@ import os -import sys import traceback from collections import OrderedDict +from time import time import torch from i18n.i18n import I18nAuto +from infer.modules.vc import model_hash_ckpt, hash_id i18n = I18nAuto() - -def savee(ckpt, sr, if_f0, name, epoch, version, hps): +# add author sign +def save_small_model(ckpt, sr, if_f0, name, epoch, version, hps): try: opt = OrderedDict() opt["weight"] = {} @@ -39,28 +40,20 @@ def savee(ckpt, sr, if_f0, name, epoch, version, hps): hps.data.sampling_rate, ] opt["info"] = "%sepoch" % epoch + opt["name"] = name + opt["timestamp"] = int(time()) opt["sr"] = sr opt["f0"] = if_f0 opt["version"] = version + h = model_hash_ckpt(opt) + opt["hash"] = h + opt["id"] = hash_id(h) torch.save(opt, "assets/weights/%s.pth" % name) return "Success." 
diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py
index 38a5678..48c1f57 100644
--- a/infer/modules/train/train.py
+++ b/infer/modules/train/train.py
@@ -74,7 +74,7 @@ from infer.lib.train.losses import (
     kl_loss,
 )
 from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
-from infer.lib.train.process_ckpt import savee
+from infer.lib.train.process_ckpt import save_small_model
 
 global_step = 0
 
@@ -602,7 +602,7 @@ def train_and_evaluate(
                     % (
                         hps.name,
                         epoch,
-                        savee(
+                        save_small_model(
                             ckpt,
                             hps.sample_rate,
                             hps.if_f0,
@@ -626,7 +626,7 @@ def train_and_evaluate(
             logger.info(
                 "saving final ckpt:%s"
                 % (
-                    savee(
+                    save_small_model(
                         ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
                     )
                 )
diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py
index 98a4fb7..01141ef 100644
--- a/infer/modules/vc/__init__.py
+++ b/infer/modules/vc/__init__.py
@@ -1,3 +1,5 @@
 from .pipeline import Pipeline
 from .modules import VC
 from .utils import get_index_path_from_model, load_hubert
+from .info import show_info
+from .hash import model_hash_ckpt, hash_id
diff --git a/infer/modules/vc/hash.py b/infer/modules/vc/hash.py
new file mode 100644
index 0000000..8e8334d
--- /dev/null
+++ b/infer/modules/vc/hash.py
@@ -0,0 +1,170 @@
+import numpy as np
+import torch
+import hashlib
+import pathlib
+from scipy.fft import fft
+from pybase16384 import encode_to_string, decode_from_string
+
+if __name__ == "__main__":
+    import os, sys
+
+    now_dir = os.getcwd()
+    sys.path.append(now_dir)
+
+from configs.config import Config, singleton_variable
+
+from .pipeline import Pipeline
+from .utils import load_hubert
+
+from infer.lib.audio import load_audio
+
+
+class TorchSeedContext:
+    def __init__(self, seed):
+        self.seed = seed
+        self.state = None
+
+    def __enter__(self):
+        self.state = torch.random.get_rng_state()
+        torch.manual_seed(self.seed)
+
+    def __exit__(self, type, value, traceback):
+        torch.random.set_rng_state(self.state)
+
+
+half_hash_len = 512
+expand_factor = 65536 * 8
+
+
+@singleton_variable
+def original_audio_time_minus():
+    __original_audio = load_audio(
+        str(pathlib.Path(__file__).parent / "lgdsng.mp3"), 16000
+    )
+    np.divide(__original_audio, np.abs(__original_audio).max(), __original_audio)
+    return -__original_audio
+
+
+@singleton_variable
+def original_audio_freq_minus():
+    __original_audio = load_audio(
+        str(pathlib.Path(__file__).parent / "lgdsng.mp3"), 16000
+    )
+    np.divide(__original_audio, np.abs(__original_audio).max(), __original_audio)
+    __original_audio = fft(__original_audio)
+    return -__original_audio
+
+
+def _cut_u16(n):
+    if n > 16384:
+        n = 16384 + 16384 * (1 - np.exp((16384 - n) / expand_factor))
+    elif n < -16384:
+        n = -16384 - 16384 * (1 - np.exp((n + 16384) / expand_factor))
+    return n
+
+
+# wave_hash mutates time_field in place; pass a copy if the caller still needs it
+def wave_hash(time_field):
+    np.divide(time_field, np.abs(time_field).max(), time_field)
+    if len(time_field) != 48000:
+        raise Exception("time not hashable")
+    freq_field = fft(time_field)
+    if len(freq_field) != 48000:
+        raise Exception("freq not hashable")
+    np.add(time_field, original_audio_time_minus(), out=time_field)
+    np.add(freq_field, original_audio_freq_minus(), out=freq_field)
+    hash = np.zeros(half_hash_len // 2 * 2, dtype=">i2")
+    d = 375 * 512 // half_hash_len
+    for i in range(half_hash_len // 4):
+        a = i * 2
+        b = a + 1
+        x = a + half_hash_len // 2
+        y = x + 1
+        s = np.average(freq_field[i * d : (i + 1) * d])
+        hash[a] = np.int16(_cut_u16(round(32768 * np.real(s))))
+        hash[b] = np.int16(_cut_u16(round(32768 * np.imag(s))))
+        hash[x] = np.int16(
+            _cut_u16(round(32768 * np.sum(time_field[i * d : i * d + d // 2])))
+        )
+        hash[y] = np.int16(
+            _cut_u16(round(32768 * np.sum(time_field[i * d + d // 2 : (i + 1) * d])))
+        )
+    return encode_to_string(hash.tobytes())
+
+
+def audio_hash(file):
+    return wave_hash(load_audio(file, 16000))
+
+
+def model_hash(config, tgt_sr, net_g, if_f0, version):
+    pipeline = Pipeline(tgt_sr, config)
+    audio = load_audio(str(pathlib.Path(__file__).parent / "lgdsng.mp3"), 16000)
+    audio_max = np.abs(audio).max() / 0.95
+    if audio_max > 1:
+        np.divide(audio, audio_max, audio)
+    audio_opt = pipeline.pipeline(
+        load_hubert(config.device, config.is_half),
+        net_g,
+        0,
+        audio,
+        [0, 0, 0],
+        6,
+        "rmvpe",
+        "",
+        0,
+        if_f0,
+        3,
+        tgt_sr,
+        16000,
+        0.25,
+        version,
+        0.33,
+    )
+    opt_len = len(audio_opt)
+    diff = 48000 - opt_len
+    n = diff // 2
+    if n > 0:
+        audio_opt = np.pad(audio_opt, (n, n))
+    elif n < 0:
+        n = -n
+        audio_opt = audio_opt[n:-n]
+    h = wave_hash(audio_opt)
+    del pipeline, audio, audio_opt
+    return h
+
+
+def model_hash_ckpt(cpt):
+    from infer.lib.infer_pack.models import (
+        SynthesizerTrnMs256NSFsid,
+        SynthesizerTrnMs256NSFsid_nono,
+        SynthesizerTrnMs768NSFsid,
+        SynthesizerTrnMs768NSFsid_nono,
+    )
+
+    config = Config()
+
+    with TorchSeedContext(114514):
+        tgt_sr = cpt["config"][-1]
+        if_f0 = cpt.get("f0", 1)
+        version = cpt.get("version", "v1")
+        synthesizer_class = {
+            ("v1", 1): SynthesizerTrnMs256NSFsid,
+            ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
+            ("v2", 1): SynthesizerTrnMs768NSFsid,
+            ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
+        }
+        net_g = synthesizer_class.get((version, if_f0), SynthesizerTrnMs256NSFsid)(
+            *cpt["config"], is_half=config.is_half
+        )
+
+        del net_g.enc_q
+
+        net_g.load_state_dict(cpt["weight"], strict=False)
+        net_g.eval().to(config.device)
+        if config.is_half:
+            net_g = net_g.half()
+        else:
+            net_g = net_g.float()
+
+        h = model_hash(config, tgt_sr, net_g, if_f0, version)
+
+        del net_g
+
+    return h
+
+
+def model_hash_from(path):
+    cpt = torch.load(path, map_location="cpu")
+    h = model_hash_ckpt(cpt)
+    del cpt
+    return h
+
+
+def _extend_difference(n, a, b):
+    if n < a:
+        n = a
+    elif n > b:
+        n = b
+    n -= a
+    n /= b - a
+    return n
+
+
+# similarity of two fingerprints in [0, 1]: mean of |cosine| and a spectral-distance term
+def hash_similarity(h1: str, h2: str) -> float:
+    h1b, h2b = decode_from_string(h1), decode_from_string(h2)
+    if len(h1b) != half_hash_len * 2 or len(h2b) != half_hash_len * 2:
+        raise Exception("invalid hash length")
+    h1n, h2n = np.frombuffer(h1b, dtype=">i2"), np.frombuffer(h2b, dtype=">i2")
+    d = 0
+    for i in range(half_hash_len // 4):
+        a = i * 2
+        b = a + 1
+        ax = complex(h1n[a], h1n[b])
+        bx = complex(h2n[a], h2n[b])
+        if abs(ax) == 0 or abs(bx) == 0:
+            continue
+        d += np.abs(ax - bx)
+    frac = np.linalg.norm(h1n) * np.linalg.norm(h2n)
+    cosine = (
+        np.dot(h1n.astype(np.float32), h2n.astype(np.float32)) / frac
+        if frac != 0
+        else 1.0
+    )
+    distance = _extend_difference(np.exp(-d / expand_factor), 0.5, 1.0)
+    return round((abs(cosine) + distance) / 2, 6)
+
+
+def hash_id(h: str) -> str:
+    return encode_to_string(hashlib.md5(decode_from_string(h)).digest())[:-1]
diff --git a/infer/modules/vc/info.py b/infer/modules/vc/info.py
new file mode 100644
index 0000000..f2bf320
--- /dev/null
+++ b/infer/modules/vc/info.py
@@ -0,0 +1,50 @@
+import traceback
+from i18n.i18n import I18nAuto
+from datetime import datetime
+import torch
+
+from .hash import model_hash_ckpt, hash_id
+
+i18n = I18nAuto()
+
+
+def show_model_info(cpt, show_long_id=False):
+    try:
+        h = model_hash_ckpt(cpt)
+        id = hash_id(h)
+        idread = cpt.get("id", "None")
+        hread = cpt.get("hash", "None")
+        if id != idread:
+            id += "(" + i18n("实际计算") + "), " + idread + "(" + i18n("从模型中读取") + ")"
+        if not show_long_id:
+            h = i18n("不显示")
+        elif h != hread:
+            h += "(" + i18n("实际计算") + "), " + hread + "(" + i18n("从模型中读取") + ")"
+        txt = f"""{i18n("模型名")}: %s
+{i18n("封装时间")}: %s
+{i18n("信息")}: %s
+{i18n("采样率")}: %s
+{i18n("音高引导(f0)")}: %s
+{i18n("版本")}: %s
+{i18n("ID(短)")}: %s
+{i18n("ID(长)")}: %s""" % (
+            cpt.get("name", "None"),
+            datetime.fromtimestamp(float(cpt.get("timestamp", 0))),
+            cpt.get("info", "None"),
+            cpt.get("sr", "None"),
+            i18n("有") if cpt.get("f0", 0) == 1 else i18n("无"),
+            cpt.get("version", "None"),
+            id,
+            h,
+        )
+    except:
+        txt = traceback.format_exc()
+
+    return txt
+
+
+def show_info(path):
+    try:
+        a = torch.load(path, map_location="cpu")
+        txt = show_model_info(a, show_long_id=True)
+        del a
+    except:
+        txt = traceback.format_exc()
+
+    return txt
diff --git a/infer/modules/vc/lgdsng.mp3 b/infer/modules/vc/lgdsng.mp3
new file mode 100644
index 0000000..35a37c5
Binary files /dev/null and b/infer/modules/vc/lgdsng.mp3 differ
diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py
index 06769fc..0bc35be 100644
--- a/infer/modules/vc/modules.py
+++ b/infer/modules/vc/modules.py
@@ -16,7 +16,7 @@ from infer.lib.infer_pack.models import (
     SynthesizerTrnMs768NSFsid,
     SynthesizerTrnMs768NSFsid_nono,
 )
-
+from .info import show_model_info
 from .pipeline import Pipeline
 from .utils import get_index_path_from_model, load_hubert
 
@@ -136,6 +136,7 @@ class VC:
                     to_return_protect1,
                     index,
                     index,
+                    show_model_info(self.cpt),
                 )
                 if to_return_protect
                 else {"visible": True, "maximum": n_spk, "__type__": "update"}
@@ -158,6 +159,8 @@ class VC:
     ):
         if input_audio_path is None:
             return "You need to upload an audio", None
+        elif hasattr(input_audio_path, "name"):
+            input_audio_path = str(input_audio_path.name)
         f0_up_key = int(f0_up_key)
         try:
             audio = load_audio(input_audio_path, 16000)
@@ -170,6 +173,7 @@ class VC:
                 self.hubert_model = load_hubert(self.config.device, self.config.is_half)
 
             if file_index:
+                if hasattr(file_index, "name"): file_index = str(file_index.name)
                 file_index = (
                     file_index.strip(" ")
                     .strip('"')
@@ -207,12 +211,12 @@ class VC:
             else:
                 tgt_sr = self.tgt_sr
             index_info = (
-                "Index:\n%s." % file_index
+                "Index: %s." % file_index
                 if os.path.exists(file_index)
                 else "Index not used."
             )
             return (
-                "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
+                "Success.\n%s\nTime: npy: %.2fs, f0: %.2fs, infer: %.2fs."
                 % (index_info, *times),
                 (tgt_sr, audio_opt),
             )
diff --git a/requirements-amd.txt b/requirements-amd.txt
index cbda2da..5eb42c6 100644
--- a/requirements-amd.txt
+++ b/requirements-amd.txt
@@ -47,3 +47,4 @@ ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
 torchfcpe
+pybase16384
diff --git a/requirements-dml.txt b/requirements-dml.txt
index 2dc1b67..7a6106e 100644
--- a/requirements-dml.txt
+++ b/requirements-dml.txt
@@ -45,3 +45,4 @@ ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
 torchfcpe
+pybase16384
diff --git a/requirements-ipex.txt b/requirements-ipex.txt
index 48b0712..cac27bf 100644
--- a/requirements-ipex.txt
+++ b/requirements-ipex.txt
@@ -53,3 +53,4 @@ av
 FreeSimpleGUI
 sounddevice
 torchfcpe
+pybase16384
diff --git a/requirements-py311.txt b/requirements-py311.txt
index e002fbd..bb3ff29 100644
--- a/requirements-py311.txt
+++ b/requirements-py311.txt
@@ -46,3 +46,4 @@ torchfcpe
 ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
+pybase16384
diff --git a/requirements.txt b/requirements.txt
index 7f2268d..de4dac1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -46,3 +46,4 @@ torchfcpe
 ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
+pybase16384
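All five requirements files gain `pybase16384`, the encoding `hash.py` uses to render fingerprint bytes as compact strings. A small round-trip sanity check after installing it (uses only the two functions this diff imports):

```python
from pybase16384 import decode_from_string, encode_to_string

payload = bytes(range(14))  # any small byte string works
encoded = encode_to_string(payload)
assert decode_from_string(encoded) == payload
print("pybase16384 round-trip OK:", repr(encoded))
```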