From 3dec36568c48af357ec6b7331a7c06348bd51abb Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 00:23:36 +0800 Subject: [PATCH 1/5] optimize real-time vc --- configs/config.json | 2 +- gui_v1.py | 159 +++++++++++++++++++++++-------- infer/lib/infer_pack/models.py | 41 ++++---- infer/lib/jit/get_synthesizer.py | 1 + infer/lib/rmvpe.py | 12 ++- tools/rvc_for_realtime.py | 131 ++++++++++++------------- 6 files changed, 211 insertions(+), 135 deletions(-) diff --git a/configs/config.json b/configs/config.json index 0861200..f874bd5 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1 +1 @@ -{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"} \ No newline at end of file +{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, "crossfade_length": 0.08, "extra_time": 2.00, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} \ No newline at end of file diff --git a/gui_v1.py b/gui_v1.py index 7f4c640..0f614e6 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -22,6 +22,26 @@ def printt(strr, *args): print(strr % args) +def phase_vocoder(a, b, fade_out, fade_in): + window = torch.sqrt(fade_out * fade_in) + fa = torch.fft.rfft(a * window) + fb = torch.fft.rfft(b * window) + absab = torch.abs(fa) + torch.abs(fb) + n = a.shape[0] + if n % 2 == 0: + absab[1:-1] *= 2 + else: + absab[1:] *= 2 + phia = torch.angle(fa) + phib = torch.angle(fb) + deltaphase = phib - phia + deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5) + w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase + t = torch.arange(n).unsqueeze(-1).to(a) / n + result = a * (fade_out ** 2) + b * (fade_in ** 2) + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + return result + + class Harvest(multiprocessing.Process): def __init__(self, inp_q, opt_q): multiprocessing.Process.__init__(self) @@ -118,6 +138,8 @@ if __name__ == "__main__": try: with open("configs/config.json", "r") as j: data = json.load(j) + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" @@ -134,6 +156,7 @@ if __name__ == "__main__": "index_path": " ", "sg_input_device": input_devices[sd.default.device[0]], "sg_output_device": output_devices[sd.default.device[1]], + "sr_type": "sr_model", "threhold": "-60", "pitch": "0", "index_rate": "0", @@ -143,7 +166,10 @@ if __name__ == "__main__": "extra_time": "2.5", "f0method": "rmvpe", "use_jit": False, + "use_pv": False, } + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" @@ -207,7 +233,25 @@ if __name__ == "__main__": default_value=data.get("sg_output_device", ""), ), ], - [sg.Button(i18n("重载设备列表"), 
key="reload_devices")], + [ + sg.Button(i18n("重载设备列表"), key="reload_devices"), + sg.Radio( + i18n("使用模型采样率"), + "sr_type", + key="sr_model", + default=data.get("sr_model", True), + enable_events=True, + ), + sg.Radio( + i18n("使用设备采样率"), + "sr_type", + key="sr_device", + default=data.get("sr_device", False), + enable_events=True, + ), + sg.Text(i18n("采样率:")), + sg.Text("", key="sr_stream"), + ], ], title=i18n("音频设备(请使用同种类驱动)"), ) @@ -222,7 +266,7 @@ if __name__ == "__main__": key="threhold", resolution=1, orientation="h", - default_value=data.get("threhold", "-60"), + default_value=data.get("threhold", -60), enable_events=True, ), ], @@ -233,7 +277,7 @@ if __name__ == "__main__": key="pitch", resolution=1, orientation="h", - default_value=data.get("pitch", "0"), + default_value=data.get("pitch", 0), enable_events=True, ), ], @@ -244,7 +288,7 @@ if __name__ == "__main__": key="index_rate", resolution=0.01, orientation="h", - default_value=data.get("index_rate", "0"), + default_value=data.get("index_rate", 0), enable_events=True, ), ], @@ -255,7 +299,7 @@ if __name__ == "__main__": key="rms_mix_rate", resolution=0.01, orientation="h", - default_value=data.get("rms_mix_rate", "0"), + default_value=data.get("rms_mix_rate", 0), enable_events=True, ), ], @@ -265,35 +309,35 @@ if __name__ == "__main__": "pm", "f0method", key="pm", - default=data.get("pm", "") == True, + default=data.get("pm", False), enable_events=True, ), sg.Radio( "harvest", "f0method", key="harvest", - default=data.get("harvest", "") == True, + default=data.get("harvest", False), enable_events=True, ), sg.Radio( "crepe", "f0method", key="crepe", - default=data.get("crepe", "") == True, + default=data.get("crepe", False), enable_events=True, ), sg.Radio( "rmvpe", "f0method", key="rmvpe", - default=data.get("rmvpe", "") == True, + default=data.get("rmvpe", False), enable_events=True, ), sg.Radio( "fcpe", "f0method", key="fcpe", - default=data.get("fcpe", "") == True, + default=data.get("fcpe", True), enable_events=True, ), ], @@ -305,11 +349,11 @@ if __name__ == "__main__": [ sg.Text(i18n("采样长度")), sg.Slider( - range=(0.05, 2.4), + range=(0.02, 2.4), key="block_time", resolution=0.01, orientation="h", - default_value=data.get("block_time", "0.25"), + default_value=data.get("block_time", 0.25), enable_events=True, ), ], @@ -320,7 +364,7 @@ if __name__ == "__main__": # key="device_latency", # resolution=0.001, # orientation="h", - # default_value=data.get("device_latency", "0.1"), + # default_value=data.get("device_latency", 0.1), # enable_events=True, # ), # ], @@ -344,7 +388,7 @@ if __name__ == "__main__": key="crossfade_length", resolution=0.01, orientation="h", - default_value=data.get("crossfade_length", "0.05"), + default_value=data.get("crossfade_length", 0.05), enable_events=True, ), ], @@ -355,7 +399,7 @@ if __name__ == "__main__": key="extra_time", resolution=0.01, orientation="h", - default_value=data.get("extra_time", "2.5"), + default_value=data.get("extra_time", 2.5), enable_events=True, ), ], @@ -370,6 +414,12 @@ if __name__ == "__main__": key="O_noise_reduce", enable_events=True, ), + sg.Checkbox( + i18n("启用相位声码器"), + key="use_pv", + default=data.get("use_pv", False), + enable_events=True, + ), # sg.Checkbox( # "JIT加速", # default=self.config.use_jit, @@ -443,6 +493,12 @@ if __name__ == "__main__": "index_path": values["index_path"], "sg_input_device": values["sg_input_device"], "sg_output_device": values["sg_output_device"], + "sr_type": ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + 
].index(True) + ], "threhold": values["threhold"], "pitch": values["pitch"], "rms_mix_rate": values["rms_mix_rate"], @@ -454,6 +510,7 @@ if __name__ == "__main__": "n_cpu": values["n_cpu"], # "use_jit": values["use_jit"], "use_jit": False, + "use_pv": values["use_pv"], "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ [ values["pm"], @@ -477,6 +534,7 @@ if __name__ == "__main__": ) if values["I_noise_reduce"]: self.delay_time += values["crossfade_length"] + self.window["sr_stream"].update(self.gui_config.samplerate) self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: self.flag_vc = False @@ -505,6 +563,8 @@ if __name__ == "__main__": self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": self.gui_config.O_noise_reduce = values["O_noise_reduce"] + elif event == "use_pv": + self.gui_config.use_pv = values["use_pv"] elif event in ["vc", "im"]: self.function = event elif event != "start_vc" and self.flag_vc == True: @@ -531,6 +591,12 @@ if __name__ == "__main__": # self.device_latency = values["device_latency"] self.gui_config.pth_path = values["pth_path"] self.gui_config.index_path = values["index_path"] + self.gui_config.sr_type = ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ] self.gui_config.threhold = values["threhold"] self.gui_config.pitch = values["pitch"] self.gui_config.block_time = values["block_time"] @@ -538,6 +604,7 @@ if __name__ == "__main__": self.gui_config.extra_time = values["extra_time"] self.gui_config.I_noise_reduce = values["I_noise_reduce"] self.gui_config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.use_pv = values["use_pv"] self.gui_config.rms_mix_rate = values["rms_mix_rate"] self.gui_config.index_rate = values["index_rate"] self.gui_config.n_cpu = values["n_cpu"] @@ -566,8 +633,8 @@ if __name__ == "__main__": self.config, self.rvc if hasattr(self, "rvc") else None, ) - self.gui_config.samplerate = self.rvc.tgt_sr - self.zc = self.rvc.tgt_sr // 100 + self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate() + self.zc = self.gui_config.samplerate // 100 self.block_frame = ( int( np.round( @@ -589,6 +656,7 @@ if __name__ == "__main__": ) * self.zc ) + self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc) self.sola_search_frame = self.zc self.extra_frame = ( int( @@ -622,14 +690,14 @@ if __name__ == "__main__": dtype="float64", ) self.sola_buffer: torch.Tensor = torch.zeros( - self.crossfade_frame, device=self.config.device, dtype=torch.float32 + self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 ) self.nr_buffer: torch.Tensor = self.sola_buffer.clone() self.output_buffer: torch.Tensor = self.input_wav.clone() self.res_buffer: torch.Tensor = torch.zeros( 2 * self.zc, device=self.config.device, dtype=torch.float32 ) - self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] + self.skip_head = self.extra_frame // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -637,7 +705,7 @@ if __name__ == "__main__": * torch.linspace( 0.0, 1.0, - steps=self.crossfade_frame, + steps=self.sola_buffer_frame, device=self.config.device, dtype=torch.float32, ) @@ -650,6 +718,14 @@ if __name__ == "__main__": new_freq=16000, dtype=torch.float32, ).to(self.config.device) + if self.rvc.tgt_sr != self.gui_config.samplerate: + self.resampler2 = tat.Resample( + orig_freq=self.rvc.tgt_sr, + 
new_freq=self.gui_config.samplerate, + dtype=torch.float32, + ).to(self.config.device) + else: + self.resampler2 = None self.tg = TorchGate( sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 ).to(self.config.device) @@ -710,11 +786,11 @@ if __name__ == "__main__": input_wav = self.tg( input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) )[0, 2 * self.zc :] - input_wav[: self.crossfade_frame] *= self.fade_in_window - input_wav[: self.crossfade_frame] += ( + input_wav[: self.sola_buffer_frame] *= self.fade_in_window + input_wav[: self.sola_buffer_frame] += ( self.nr_buffer * self.fade_out_window ) - self.nr_buffer[:] = input_wav[-self.crossfade_frame :] + self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] input_wav = torch.cat( (self.res_buffer[:], input_wav[: self.block_frame]) ) @@ -728,23 +804,16 @@ if __name__ == "__main__": )[160:] # infer if self.function == "vc": - f0_extractor_frame = self.block_frame_16k + 800 - if self.gui_config.f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) infer_wav = self.rvc.infer( self.input_wav_res, - self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), self.block_frame_16k, - self.valid_rate, + self.skip_head, self.pitch, self.pitchf, self.gui_config.f0method, ) - infer_wav = infer_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ] + if self.resampler2 is not None: + infer_wav = self.resampler2(infer_wav) else: infer_wav = self.input_wav[ -self.crossfade_frame - self.sola_search_frame - self.block_frame : @@ -794,13 +863,13 @@ if __name__ == "__main__": ) # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC conv_input = infer_wav[ - None, None, : self.crossfade_frame + self.sola_search_frame + None, None, : self.sola_buffer_frame + self.sola_search_frame ] cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) cor_den = torch.sqrt( F.conv1d( conv_input**2, - torch.ones(1, 1, self.crossfade_frame, device=self.config.device), + torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device), ) + 1e-8 ) @@ -813,9 +882,16 @@ if __name__ == "__main__": infer_wav = infer_wav[ sola_offset : sola_offset + self.block_frame + self.crossfade_frame ] - infer_wav[: self.crossfade_frame] *= self.fade_in_window - infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window - self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] + if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: + infer_wav[: self.sola_buffer_frame] *= self.fade_in_window + infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window + else: + infer_wav[: self.sola_buffer_frame] = phase_vocoder( + self.sola_buffer, + infer_wav[: self.sola_buffer_frame], + self.fade_out_window, + self.fade_in_window) + self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] if sys.platform == "darwin": outdata[:] = ( infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] @@ -864,7 +940,7 @@ if __name__ == "__main__": input_devices_indices, output_devices_indices, ) - + def set_devices(self, input_device, output_device): """设置输出设备""" ( @@ -881,5 +957,8 @@ if __name__ == "__main__": ] printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - + + def get_device_samplerate(self): + return int(sd.query_devices(device=sd.default.device[0])['default_samplerate']) + gui = 
GUI() diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index f25e724..c2750ee 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -722,7 +722,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -783,14 +784,14 @@ class SynthesizerTrnMs256NSFsid(nn.Module): pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - assert isinstance(rate, torch.Tensor) - head = int(z_p.shape[2] * (1 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] nsff0 = nsff0[:, head:] @@ -887,7 +888,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -941,13 +943,14 @@ class SynthesizerTrnMs768NSFsid(nn.Module): pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] nsff0 = nsff0[:, head:] @@ -1041,7 +1044,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -1087,13 +1091,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) @@ -1186,7 +1191,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -1232,13 +1238,14 @@ 
class SynthesizerTrnMs768NSFsid_nono(nn.Module): phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) diff --git a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py index ef5fe58..b8db4fa 100644 --- a/infer/lib/jit/get_synthesizer.py +++ b/infer/lib/jit/get_synthesizer.py @@ -34,4 +34,5 @@ def get_synthesizer(pth_path, device=torch.device("cpu")): net_g.load_state_dict(cpt["weight"], strict=False) net_g = net_g.float() net_g.eval().to(device) + net_g.remove_weight_norm() return net_g, cpt diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index 9010d28..86c6899 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -593,16 +593,18 @@ class RMVPE: def infer_from_audio(self, audio, thred=0.03): # torch.cuda.synchronize() - t0 = ttime() + # t0 = ttime() + if not torch.is_tensor(audio): + audio = torch.from_numpy(audio) mel = self.mel_extractor( - torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True + audio.float().to(self.device).unsqueeze(0), center=True ) # print(123123123,mel.device.type) # torch.cuda.synchronize() - t1 = ttime() + # t1 = ttime() hidden = self.mel2hidden(mel) # torch.cuda.synchronize() - t2 = ttime() + # t2 = ttime() # print(234234,hidden.device.type) if "privateuseone" not in str(self.device): hidden = hidden.squeeze(0).cpu().numpy() @@ -613,7 +615,7 @@ class RMVPE: f0 = self.decode(hidden, thred=thred) # torch.cuda.synchronize() - t3 = ttime() + # t3 = ttime() # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) return f0 diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index f36ffb3..2d54732 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -46,23 +46,22 @@ def printt(strr, *args): # config.is_half=False########强制cpu测试 class RVC: def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, ) -> None: """ 初始化 """ try: if config.dml == True: - def forward_dml(ctx, x, scale): ctx.scale = scale res = x.clone().detach() @@ -76,13 +75,10 @@ class RVC: # device="cpu"########强制cpu测试 self.device = config.device self.f0_up_key = key - self.time_step = 160 / 16000 * 1000 self.f0_min = 50 self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.sr = 16000 - self.window = 160 self.n_cpu = n_cpu self.use_jit = self.config.use_jit self.is_half = config.is_half @@ -184,6 +180,7 @@ class RVC: if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): self.model_rmvpe = last_rvc.model_rmvpe if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): + self.device_fcpe = last_rvc.device_fcpe self.model_fcpe = last_rvc.model_fcpe except: printt(traceback.format_exc()) @@ -199,14 +196,10 @@ class RVC: self.index_rate = new_index_rate def get_f0_post(self, f0): - f0_min = self.f0_min - f0_max = 
self.f0_max - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 @@ -221,6 +214,7 @@ class RVC: return self.get_f0_rmvpe(x, f0_up_key) if method == "fcpe": return self.get_f0_fcpe(x, f0_up_key) + x = x.cpu().numpy() if method == "pm": p_len = x.shape[0] // 160 + 1 f0_min = 65 @@ -262,7 +256,7 @@ class RVC: self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) else: self.inp_q.put( - (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) + (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) ) while 1: res_ts = self.opt_q.get() @@ -277,20 +271,19 @@ class RVC: else: f0 = f0[2:] f0bak[ - part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] + part_length * idx // 160: part_length * idx // 160 + f0.shape[0] ] = f0 f0bak = signal.medfilt(f0bak, 3) f0bak *= pow(2, f0_up_key / 12) return self.get_f0_post(f0bak) def get_f0_crepe(self, x, f0_up_key): - if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 - return self.get_f0(x, f0_up_key, 1, "pm") - audio = torch.tensor(np.copy(x))[None].float() + if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿fcpe顶替 + return self.get_f0(x, f0_up_key, 1, "fcpe") # printt("using crepe,device:%s"%self.device) f0, pd = torchcrepe.predict( - audio, - self.sr, + x.unsqueeze(0).float(), + 16000, 160, self.f0_min, self.f0_max, @@ -313,15 +306,11 @@ class RVC: printt("Loading rmvpe model") self.model_rmvpe = RMVPE( - # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 - # "rmvpe.pt", is_half=False, device=self.device####dml配置 - # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 "assets/rmvpe/rmvpe.pt", is_half=self.is_half, - device=self.device, ####正常逻辑 + device=self.device, use_jit=self.config.use_jit, ) - # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) @@ -329,41 +318,36 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: from torchfcpe import spawn_bundled_infer_model - printt("Loading fcpe model") - self.model_fcpe = spawn_bundled_infer_model(self.device) - f0 = ( - self.model_fcpe.infer( - torch.from_numpy(x).to(self.device).unsqueeze(0).float(), - sr=16000, - decoder_mode="local_argmax", - threshold=0.006, - ) - .squeeze() - .cpu() - .numpy() - ) + if "privateuseone" in str(self.device): + self.device_fcpe = "cpu" + else: + self.device_fcpe = self.device + self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe) + f0 = self.model_fcpe.infer( + x.to(self.device_fcpe).unsqueeze(0).float(), + sr=16000, + decoder_mode='local_argmax', + threshold=0.006, + ).squeeze().cpu().numpy() f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) def infer( - self, - feats: torch.Tensor, - indata: np.ndarray, - block_frame_16k, - rate, - cache_pitch, - cache_pitchf, - f0method, + self, + input_wav: torch.Tensor, + block_frame_16k, + skip_head, + cache_pitch, + cache_pitchf, + f0method, ) -> np.ndarray: - feats = feats.view(1, -1) - if self.config.is_half: - feats = 
feats.half() - else: - feats = feats.float() - feats = feats.to(self.device) t1 = ttime() with torch.no_grad(): + if self.config.is_half: + feats = input_wav.half().view(1, -1) + else: + feats = input_wav.float().view(1, -1) padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) inputs = { "source": feats, @@ -387,8 +371,8 @@ class RVC: if self.config.is_half: npy = npy.astype("float16") feats[0][-leng_replace_head:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] ) else: printt("Index search FAILED or disabled") @@ -398,7 +382,13 @@ class RVC: feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t3 = ttime() if self.if_f0 == 1: - pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) + f0_extractor_frame = block_frame_16k + 800 + if f0method == "rmvpe": + f0_extractor_frame = ( + 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + ) + input_wav = input_wav[-f0_extractor_frame:] + pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method) start_frame = block_frame_16k // 160 end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) @@ -412,31 +402,28 @@ class RVC: t4 = ttime() feats = feats[:, :p_len, :] if self.if_f0 == 1: - cache_pitch = cache_pitch[:p_len] - cache_pitchf = cache_pitchf[:p_len] - cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) - cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) + cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0) + cache_pitchf = torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0) p_len = torch.LongTensor([p_len]).to(self.device) - ii = 0 # sid - sid = torch.LongTensor([ii]).to(self.device) + sid = torch.LongTensor([0]).to(self.device) + skip_head = torch.LongTensor([skip_head]) with torch.no_grad(): if self.if_f0 == 1: - # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) infered_audio = self.net_g.infer( feats, p_len, cache_pitch, cache_pitchf, sid, - torch.FloatTensor([rate]), + skip_head, )[0][0, 0].data.float() else: infered_audio = self.net_g.infer( - feats, p_len, sid, torch.FloatTensor([rate]) + feats, p_len, sid, skip_head )[0][0, 0].data.float() t5 = ttime() printt( - "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", + "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", t2 - t1, t3 - t2, t4 - t3, From d62e80fb8391e5b95fecdc24bac80436e3d54978 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 00:28:49 +0800 Subject: [PATCH 2/5] optimize real-time vc --- requirements-amd.txt | 1 + requirements-dml.txt | 1 + requirements-ipex.txt | 3 ++- requirements-win-for-realtime_vc_gui-dml.txt | 3 ++- requirements-win-for-realtime_vc_gui.txt | 1 + 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements-amd.txt b/requirements-amd.txt index aa81a88..d0976a7 100644 --- a/requirements-amd.txt +++ b/requirements-amd.txt @@ -46,3 +46,4 @@ fastapi==0.88 ffmpy==0.3.1 python-dotenv>=1.0.0 av +torchfcpe diff --git a/requirements-dml.txt b/requirements-dml.txt index a49ed2d..b4690ae 100644 --- a/requirements-dml.txt +++ b/requirements-dml.txt @@ -44,3 +44,4 @@ fastapi==0.88 
ffmpy==0.3.1 python-dotenv>=1.0.0 av +torchfcpe \ No newline at end of file diff --git a/requirements-ipex.txt b/requirements-ipex.txt index 610a0ce..19ff424 100644 --- a/requirements-ipex.txt +++ b/requirements-ipex.txt @@ -51,4 +51,5 @@ ffmpy==0.3.1 python-dotenv>=1.0.0 av PySimpleGUI -sounddevice \ No newline at end of file +sounddevice +torchfcpe \ No newline at end of file diff --git a/requirements-win-for-realtime_vc_gui-dml.txt b/requirements-win-for-realtime_vc_gui-dml.txt index 6514989..9aaf56d 100644 --- a/requirements-win-for-realtime_vc_gui-dml.txt +++ b/requirements-win-for-realtime_vc_gui-dml.txt @@ -26,4 +26,5 @@ PySimpleGUI sounddevice gradio noisereduce -onnxruntime-directml \ No newline at end of file +onnxruntime-directml +torchfcpe \ No newline at end of file diff --git a/requirements-win-for-realtime_vc_gui.txt b/requirements-win-for-realtime_vc_gui.txt index 37ca238..e187f85 100644 --- a/requirements-win-for-realtime_vc_gui.txt +++ b/requirements-win-for-realtime_vc_gui.txt @@ -26,3 +26,4 @@ PySimpleGUI sounddevice gradio noisereduce +torchfcpe From d7fb651f7c3ed90c72a084030341c620ef4a1a4c Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 16:26:01 +0800 Subject: [PATCH 3/5] optimize real-time vc --- gui_v1.py | 20 ++++--------- infer/lib/infer_pack/models.py | 40 ++++++++++++++++--------- tools/rvc_for_realtime.py | 55 +++++++++++++++++----------------- 3 files changed, 58 insertions(+), 57 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 0f614e6..728cf7e 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -681,14 +681,6 @@ if __name__ == "__main__": device=self.config.device, dtype=torch.float32, ) - self.pitch: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="int32", - ) - self.pitchf: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="float64", - ) self.sola_buffer: torch.Tensor = torch.zeros( self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 ) @@ -698,6 +690,7 @@ if __name__ == "__main__": 2 * self.zc, device=self.config.device, dtype=torch.float32 ) self.skip_head = self.extra_frame // self.zc + self.return_length = (self.block_frame + self.sola_buffer_frame + self.sola_search_frame) // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -808,8 +801,7 @@ if __name__ == "__main__": self.input_wav_res, self.block_frame_16k, self.skip_head, - self.pitch, - self.pitchf, + self.return_length, self.gui_config.f0method, ) if self.resampler2 is not None: @@ -879,9 +871,7 @@ if __name__ == "__main__": else: sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[ - sola_offset : sola_offset + self.block_frame + self.crossfade_frame - ] + infer_wav = infer_wav[sola_offset :] if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: infer_wav[: self.sola_buffer_frame] *= self.fade_in_window infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window @@ -894,11 +884,11 @@ if __name__ == "__main__": self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] if sys.platform == "darwin": outdata[:] = ( - infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] + infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] ) else: outdata[:] = ( - infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() + infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy() ) total_time = time.perf_counter() - start_time 
self.window["infer_time"].update(int(total_time * 1000)) diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index c2750ee..a81c1de 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -785,16 +785,19 @@ class SynthesizerTrnMs256NSFsid(nn.Module): nsff0: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - nsff0 = nsff0[:, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] + nsff0 = nsff0[:, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -944,16 +947,19 @@ class SynthesizerTrnMs768NSFsid(nn.Module): nsff0: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - nsff0 = nsff0[:, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] + nsff0 = nsff0[:, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1092,15 +1098,18 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): phone_lengths: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1239,15 +1248,18 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): phone_lengths: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert 
isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 2d54732..257c44d 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -90,7 +90,9 @@ class RVC: self.pth_path: str = pth_path self.index_path = index_path self.index_rate = index_rate - + self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32") + self.cache_pitchf = np.zeros(1024, dtype="float32") + if last_rvc is None: models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( ["assets/hubert/hubert_base.pt"], @@ -329,8 +331,9 @@ class RVC: sr=16000, decoder_mode='local_argmax', threshold=0.006, - ).squeeze().cpu().numpy() + ) f0 *= pow(2, f0_up_key / 12) + f0 = f0.squeeze().cpu().numpy() return self.get_f0_post(f0) def infer( @@ -338,8 +341,7 @@ class RVC: input_wav: torch.Tensor, block_frame_16k, skip_head, - cache_pitch, - cache_pitchf, + return_length, f0method, ) -> np.ndarray: t1 = ttime() @@ -362,24 +364,22 @@ class RVC: t2 = ttime() try: if hasattr(self, "index") and self.index_rate != 0: - leng_replace_head = int(rate * feats[0].shape[0]) - npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") + npy = feats[0][skip_head // 2:].cpu().numpy().astype("float32") score, ix = self.index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) if self.config.is_half: npy = npy.astype("float16") - feats[0][-leng_replace_head:] = ( + feats[0][skip_head // 2:] = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + + (1 - self.index_rate) * feats[0][skip_head // 2:] ) else: printt("Index search FAILED or disabled") except: traceback.print_exc() printt("Index search FAILED") - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t3 = ttime() if self.if_f0 == 1: f0_extractor_frame = block_frame_16k + 800 @@ -387,40 +387,39 @@ class RVC: f0_extractor_frame = ( 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 ) - input_wav = input_wav[-f0_extractor_frame:] - pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method) + pitch, pitchf = self.get_f0(input_wav[-f0_extractor_frame: ], self.f0_up_key, self.n_cpu, f0method) start_frame = block_frame_16k // 160 - end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame - cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) - cache_pitchf[:] = np.append( - cache_pitchf[start_frame:end_frame], pitchf[3:-1] + end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame + self.cache_pitch[:] = np.append(self.cache_pitch[start_frame: end_frame], pitch[3:-1]) + self.cache_pitchf[:] = np.append( + self.cache_pitchf[start_frame: end_frame], pitchf[3:-1] ) - p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) - else: - cache_pitch, cache_pitchf = None, None - p_len = min(feats.shape[1], 13000) t4 = ttime() - feats = feats[:, :p_len, :] + p_len = input_wav.shape[0] // 160 if self.if_f0 == 1: - cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0) - cache_pitchf = 
torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0) + cache_pitch = torch.LongTensor(self.cache_pitch[-p_len: ]).to(self.device).unsqueeze(0) + cache_pitchf = torch.FloatTensor(self.cache_pitchf[-p_len: ]).to(self.device).unsqueeze(0) + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + feats = feats[:, :p_len, :] p_len = torch.LongTensor([p_len]).to(self.device) sid = torch.LongTensor([0]).to(self.device) skip_head = torch.LongTensor([skip_head]) + return_length = torch.LongTensor([return_length]) with torch.no_grad(): if self.if_f0 == 1: - infered_audio = self.net_g.infer( + infered_audio, _, _ = self.net_g.infer( feats, p_len, cache_pitch, cache_pitchf, sid, skip_head, - )[0][0, 0].data.float() + return_length, + ) else: - infered_audio = self.net_g.infer( - feats, p_len, sid, skip_head - )[0][0, 0].data.float() + infered_audio, _, _ = self.net_g.infer( + feats, p_len, sid, skip_head, return_length + ) t5 = ttime() printt( "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", @@ -429,4 +428,4 @@ class RVC: t4 - t3, t5 - t4, ) - return infered_audio + return infered_audio.squeeze().float() From 21775b187a2610be5faaf58a500eaf068620cde1 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 17:05:42 +0800 Subject: [PATCH 4/5] optimize real-time vc --- gui_v1.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 728cf7e..dc2bdc8 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -559,7 +559,7 @@ if __name__ == "__main__": if stream_latency > 0: self.delay_time += ( 1 if values["I_noise_reduce"] else -1 - ) * values["crossfade_length"] + ) * min(values["crossfade_length"], 0.04) self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": self.gui_config.O_noise_reduce = values["O_noise_reduce"] @@ -774,7 +774,7 @@ if __name__ == "__main__": # input noise reduction and resampling if self.gui_config.I_noise_reduce and self.function == "vc": input_wav = self.input_wav[ - -self.crossfade_frame - self.block_frame - 2 * self.zc : + -self.sola_buffer_frame - self.block_frame - 2 * self.zc : ] input_wav = self.tg( input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) @@ -783,7 +783,7 @@ if __name__ == "__main__": input_wav[: self.sola_buffer_frame] += ( self.nr_buffer * self.fade_out_window ) - self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] + self.nr_buffer[:] = input_wav[self.block_frame :] input_wav = torch.cat( (self.res_buffer[:], input_wav[: self.block_frame]) ) @@ -824,7 +824,7 @@ if __name__ == "__main__": # volume envelop mixing if self.gui_config.rms_mix_rate < 1 and self.function == "vc": rms1 = librosa.feature.rms( - y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] + y=self.input_wav_res[160 * self.skip_head : 160 * (self.skip_head + self.return_length)] .cpu() .numpy(), frame_length=640, From aed19c3c6b3f43c4d2e13dbb4631098a2a66c55e Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 17:41:25 +0800 Subject: [PATCH 5/5] optimize real-time vc --- gui_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gui_v1.py b/gui_v1.py index dc2bdc8..e5c6757 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -533,7 +533,7 @@ if __name__ == "__main__": + 0.01 ) if values["I_noise_reduce"]: - self.delay_time += values["crossfade_length"] + self.delay_time += min(values["crossfade_length"], 0.04) self.window["sr_stream"].update(self.gui_config.samplerate) 
self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: