diff --git a/README.md b/README.md index 743d5e7..f6ad770 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,10 @@ ![image](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/assets/129054828/092e5c12-0d49-4168-a590-0b0ef6a4f630) -实时变声界面:go-realtime-gui.bat(我们已经实现端到端210ms延迟。如使用ASIO输入输出设备,已能实现端到端90ms延迟,但非常依赖硬件驱动支持。) +实时变声界面:go-realtime-gui.bat(我们已经实现端到端170ms延迟。如使用ASIO输入输出设备,已能实现端到端90ms延迟,但非常依赖硬件驱动支持。) + +![image](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/assets/129054828/730b4114-8805-44a1-ab1a-04668f3c30a6) -![rvc新延迟](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/assets/129054828/703b711b-6a71-439e-94f2-f4705487b1dc) > 底模使用接近50小时的开源高质量VCTK训练集训练,无版权方面的顾虑,请大家放心使用 diff --git a/gui_v1.py b/gui_v1.py index 86b52d3..ca5219e 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -12,7 +12,7 @@ now_dir = os.getcwd() sys.path.append(now_dir) import multiprocessing -stream_latency = -1 +flag_vc = False def printt(strr, *args): @@ -113,32 +113,36 @@ if __name__ == "__main__": self.pth_path: str = "" self.index_path: str = "" self.pitch: int = 0 - self.samplerate: int = 40000 - self.block_time: float = 1.0 # s - self.buffer_num: int = 1 + self.sr_type: str = "sr_model" + self.block_time: float = 0.25 # s self.threhold: int = -60 self.crossfade_time: float = 0.05 self.extra_time: float = 2.5 - self.I_noise_reduce = False - self.O_noise_reduce = False - self.rms_mix_rate = 0.0 - self.index_rate = 0.3 - self.n_cpu = min(n_cpu, 6) - self.f0method = "harvest" - self.sg_input_device = "" - self.sg_output_device = "" + self.I_noise_reduce: bool = False + self.O_noise_reduce: bool = False + self.use_pv: bool = False + self.rms_mix_rate: float = 0.0 + self.index_rate: float = 0.0 + self.n_cpu: int = min(n_cpu, 4) + self.f0method: str = "fcpe" + self.sg_input_device: str = "" + self.sg_output_device: str = "" class GUI: def __init__(self) -> None: self.gui_config = GUIConfig() self.config = Config() - self.flag_vc = False self.function = "vc" self.delay_time = 0 + self.input_devices = None + self.output_devices = None + self.input_devices_indices = None + self.output_devices_indices = None + self.stream = None + self.update_devices() self.launcher() def load(self): - input_devices, output_devices, _, _ = self.get_devices() try: with open("configs/config.json", "r") as j: data = json.load(j) @@ -149,25 +153,34 @@ if __name__ == "__main__": data["crepe"] = data["f0method"] == "crepe" data["rmvpe"] = data["f0method"] == "rmvpe" data["fcpe"] = data["f0method"] == "fcpe" - if data["sg_input_device"] not in input_devices: - data["sg_input_device"] = input_devices[sd.default.device[0]] - if data["sg_output_device"] not in output_devices: - data["sg_output_device"] = output_devices[sd.default.device[1]] + if data["sg_input_device"] not in self.input_devices: + data["sg_input_device"] = self.input_devices[ + self.input_devices_indices.index(sd.default.device[0]) + ] + if data["sg_output_device"] not in self.output_devices: + data["sg_output_device"] = self.output_devices[ + self.output_devices_indices.index(sd.default.device[1]) + ] except: with open("configs/config.json", "w") as j: data = { - "pth_path": " ", - "index_path": " ", - "sg_input_device": input_devices[sd.default.device[0]], - "sg_output_device": output_devices[sd.default.device[1]], + "pth_path": "", + "index_path": "", + "sg_input_device": self.input_devices[ + self.input_devices_indices.index(sd.default.device[0]) + ], + "sg_output_device": self.output_devices[ + self.output_devices_indices.index(sd.default.device[1]) + ], "sr_type": "sr_model", - "threhold": "-60", - "pitch": "0", - "index_rate": "0", - "rms_mix_rate": "0", - "block_time": "0.25", - "crossfade_length": "0.05", - "extra_time": "2.5", + "threhold": -60, + "pitch": 0, + "index_rate": 0, + "rms_mix_rate": 0, + "block_time": 0.25, + "crossfade_length": 0.05, + "extra_time": 2.5, + "n_cpu": 4, "f0method": "rmvpe", "use_jit": False, "use_pv": False, @@ -185,7 +198,6 @@ if __name__ == "__main__": data = self.load() self.config.use_jit = False # data.get("use_jit", self.config.use_jit) sg.theme("LightBlue3") - input_devices, output_devices, _, _ = self.get_devices() layout = [ [ sg.Frame( @@ -224,7 +236,7 @@ if __name__ == "__main__": [ sg.Text(i18n("输入设备")), sg.Combo( - input_devices, + self.input_devices, key="sg_input_device", default_value=data.get("sg_input_device", ""), ), @@ -232,7 +244,7 @@ if __name__ == "__main__": [ sg.Text(i18n("输出设备")), sg.Combo( - output_devices, + self.output_devices, key="sg_output_device", default_value=data.get("sg_output_device", ""), ), @@ -463,32 +475,27 @@ if __name__ == "__main__": self.event_handler() def event_handler(self): + global flag_vc while True: event, values = self.window.read() if event == sg.WINDOW_CLOSED: - self.flag_vc = False + self.stop_stream() exit() if event == "reload_devices": - prev_input = self.window["sg_input_device"].get() - prev_output = self.window["sg_output_device"].get() - input_devices, output_devices, _, _ = self.get_devices(update=True) - if prev_input not in input_devices: - self.gui_config.sg_input_device = input_devices[0] - else: - self.gui_config.sg_input_device = prev_input - self.window["sg_input_device"].Update(values=input_devices) + self.update_devices() + if self.gui_config.sg_input_device not in self.input_devices: + self.gui_config.sg_input_device = self.input_devices[0] + self.window["sg_input_device"].Update(values=self.input_devices) self.window["sg_input_device"].Update( value=self.gui_config.sg_input_device ) - if prev_output not in output_devices: - self.gui_config.sg_output_device = output_devices[0] - else: - self.gui_config.sg_output_device = prev_output - self.window["sg_output_device"].Update(values=output_devices) + if self.gui_config.sg_output_device not in self.output_devices: + self.gui_config.sg_output_device = self.output_devices[0] + self.window["sg_output_device"].Update(values=self.output_devices) self.window["sg_output_device"].Update( value=self.gui_config.sg_output_device ) - if event == "start_vc" and self.flag_vc == False: + if event == "start_vc" and not flag_vc: if self.set_values(values) == True: printt("cuda_is_available: %s", torch.cuda.is_available()) self.start_vc() @@ -527,22 +534,17 @@ if __name__ == "__main__": } with open("configs/config.json", "w") as j: json.dump(settings, j) - global stream_latency - while stream_latency < 0: - time.sleep(0.01) - self.delay_time = ( - stream_latency - + values["block_time"] - + values["crossfade_length"] - + 0.01 - ) + if self.stream is not None: + self.delay_time = ( + self.stream.latency[-1] + + values["block_time"] + + values["crossfade_length"] + + 0.01 + ) if values["I_noise_reduce"]: self.delay_time += min(values["crossfade_length"], 0.04) self.window["sr_stream"].update(self.gui_config.samplerate) self.window["delay_time"].update(int(self.delay_time * 1000)) - if event == "stop_vc" and self.flag_vc == True: - self.flag_vc = False - stream_latency = -1 # Parameter hot update if event == "threhold": self.gui_config.threhold = values["threhold"] @@ -560,7 +562,7 @@ if __name__ == "__main__": self.gui_config.f0method = event elif event == "I_noise_reduce": self.gui_config.I_noise_reduce = values["I_noise_reduce"] - if stream_latency > 0: + if self.stream is not None: self.delay_time += ( 1 if values["I_noise_reduce"] else -1 ) * min(values["crossfade_length"], 0.04) @@ -571,10 +573,9 @@ if __name__ == "__main__": self.gui_config.use_pv = values["use_pv"] elif event in ["vc", "im"]: self.function = event - elif event != "start_vc" and self.flag_vc == True: + elif event == "stop_vc" or event != "start_vc": # Other parameters do not support hot update - self.flag_vc = False - stream_latency = -1 + self.stop_stream() def set_values(self, values): if len(values["pth_path"].strip()) == 0: @@ -593,6 +594,8 @@ if __name__ == "__main__": self.set_devices(values["sg_input_device"], values["sg_output_device"]) self.config.use_jit = False # values["use_jit"] # self.device_latency = values["device_latency"] + self.gui_config.sg_input_device = values["sg_input_device"] + self.gui_config.sg_output_device = values["sg_output_device"] self.gui_config.pth_path = values["pth_path"] self.gui_config.index_path = values["index_path"] self.gui_config.sr_type = ["sr_model", "sr_device"][ @@ -625,7 +628,6 @@ if __name__ == "__main__": def start_vc(self): torch.cuda.empty_cache() - self.flag_vc = True self.rvc = rvc_for_realtime.RVC( self.gui_config.pitch, self.gui_config.pth_path, @@ -732,27 +734,30 @@ if __name__ == "__main__": self.tg = TorchGate( sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 ).to(self.config.device) - thread_vc = threading.Thread(target=self.soundinput) - thread_vc.start() + self.start_stream() - def soundinput(self): - """ - 接受音频输入 - """ - channels = 1 if sys.platform == "darwin" else 2 - with sd.Stream( - channels=channels, - callback=self.audio_callback, - blocksize=self.block_frame, - samplerate=self.gui_config.samplerate, - dtype="float32", - ) as stream: - global stream_latency - stream_latency = stream.latency[-1] - while self.flag_vc: - time.sleep(self.gui_config.block_time) - printt("Audio block passed.") - printt("ENDing VC") + def start_stream(self): + global flag_vc + if not flag_vc: + flag_vc = True + channels = 1 if sys.platform == "darwin" else 2 + self.stream = sd.Stream( + channels=channels, + callback=self.audio_callback, + blocksize=self.block_frame, + samplerate=self.gui_config.samplerate, + dtype="float32", + ) + self.stream.start() + + def stop_stream(self): + global flag_vc + if flag_vc: + flag_vc = False + if self.stream is not None: + self.stream.stop() + self.stream.close() + self.stream = None def audio_callback( self, indata: np.ndarray, outdata: np.ndarray, frames, times, status @@ -760,6 +765,7 @@ if __name__ == "__main__": """ 音频处理 """ + global flag_vc start_time = time.perf_counter() indata = librosa.to_mono(indata.T) if self.gui_config.threhold > -60: @@ -908,59 +914,47 @@ if __name__ == "__main__": infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy() ) total_time = time.perf_counter() - start_time - self.window["infer_time"].update(int(total_time * 1000)) + if flag_vc: + self.window["infer_time"].update(int(total_time * 1000)) printt("Infer time: %.2f", total_time) - def get_devices(self, update: bool = True): + def update_devices(self): """获取设备列表""" - if update: - sd._terminate() - sd._initialize() + sd._terminate() + sd._initialize() devices = sd.query_devices() hostapis = sd.query_hostapis() for hostapi in hostapis: for device_idx in hostapi["devices"]: devices[device_idx]["hostapi_name"] = hostapi["name"] - input_devices = [ + self.input_devices = [ f"{d['name']} ({d['hostapi_name']})" for d in devices if d["max_input_channels"] > 0 ] - output_devices = [ + self.output_devices = [ f"{d['name']} ({d['hostapi_name']})" for d in devices if d["max_output_channels"] > 0 ] - input_devices_indices = [ + self.input_devices_indices = [ d["index"] if "index" in d else d["name"] for d in devices if d["max_input_channels"] > 0 ] - output_devices_indices = [ + self.output_devices_indices = [ d["index"] if "index" in d else d["name"] for d in devices if d["max_output_channels"] > 0 ] - return ( - input_devices, - output_devices, - input_devices_indices, - output_devices_indices, - ) def set_devices(self, input_device, output_device): """设置输出设备""" - ( - input_devices, - output_devices, - input_device_indices, - output_device_indices, - ) = self.get_devices() - sd.default.device[0] = input_device_indices[ - input_devices.index(input_device) + sd.default.device[0] = self.input_devices_indices[ + self.input_devices.index(input_device) ] - sd.default.device[1] = output_device_indices[ - output_devices.index(output_device) + sd.default.device[1] = self.output_devices_indices[ + self.output_devices.index(output_device) ] printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device)