add input wav and delay time monitor (#1295)

2025-04-25 22:18:58 +08:00 · 2023-09-21 22:17:06 +08:00 · 2023-09-21 22:17:06 +08:00 · ac1397f3f9
commit ac1397f3f9
parent a47aad5a3c
2 changed files with 63 additions and 29 deletions
--- a/gui_v1.py
+++ b/gui_v1.py
@ -14,7 +14,7 @@ sys.path.append(now_dir)
 import multiprocessing

 logger = logging.getLogger(__name__)
-
+stream_latency = -1

 class Harvest(multiprocessing.Process):
    def __init__(self, inp_q, opt_q):
@ -100,7 +100,8 @@ if __name__ == "__main__":
        def __init__(self) -> None:
            self.config = GUIConfig()
            self.flag_vc = False
-
+            self.function = 'vc'
+            self.delay_time = 0
            self.launcher()

        def load(self):
@ -112,6 +113,10 @@ if __name__ == "__main__":
                    data["harvest"] = data["f0method"] == "harvest"
                    data["crepe"] = data["f0method"] == "crepe"
                    data["rmvpe"] = data["f0method"] == "rmvpe"
+                    if data["sg_input_device"] not in input_devices:
+                        data["sg_input_device"] = input_devices[sd.default.device[0]]
+                    if data["sg_output_device"] not in output_devices:
+                        data["sg_output_device"] = output_devices[sd.default.device[1]] 
            except:
                with open("configs/config.json", "w") as j:
                    data = {
@ -342,6 +347,22 @@ if __name__ == "__main__":
                [
                    sg.Button(i18n("开始音频转换"), key="start_vc"),
                    sg.Button(i18n("停止音频转换"), key="stop_vc"),
+                    sg.Radio(
+                        i18n("输入监听"),
+                        "function",
+                        key="im",
+                        default=False,
+                        enable_events=True,
+                    ), 
+                    sg.Radio(
+                        i18n("输出变声"),
+                        "function",
+                        key="vc",
+                        default=True,
+                        enable_events=True,
+                    ),
+                    sg.Text(i18n("算法延迟(ms):")),
+                    sg.Text("0", key="delay_time"),
                    sg.Text(i18n("推理时间(ms):")),
                    sg.Text("0", key="infer_time"),
                ],
@ -403,9 +424,16 @@ if __name__ == "__main__":
                        }
                        with open("configs/config.json", "w") as j:
                            json.dump(settings, j)
+                        global stream_latency
+                        while stream_latency < 0:
+                            time.sleep(0.01)
+                        self.delay_time = stream_latency + values["block_time"] + values["crossfade_length"] + 0.01
+                        if values["I_noise_reduce"]:
+                            self.delay_time += values["crossfade_length"]
+                        self.window["delay_time"].update(int(self.delay_time * 1000))
                if event == "stop_vc" and self.flag_vc == True:
                    self.flag_vc = False
-
+                    stream_latency = -1
                # Parameter hot update
                if event == "threhold":
                    self.config.threhold = values["threhold"]
@ -423,11 +451,17 @@ if __name__ == "__main__":
                    self.config.f0method = event
                elif event == "I_noise_reduce":
                    self.config.I_noise_reduce = values["I_noise_reduce"]
+                    if stream_latency > 0:
+                        self.delay_time += (1 if values["I_noise_reduce"] else -1) * values["crossfade_length"]
+                        self.window["delay_time"].update(int(self.delay_time * 1000))
                elif event == "O_noise_reduce":
                    self.config.O_noise_reduce = values["O_noise_reduce"]
+                elif event in ["vc", "im"]:
+                    self.function = event
                elif event != "start_vc" and self.flag_vc == True:
                    # Other parameters do not support hot update
                    self.flag_vc = False
+                    stream_latency = -1

        def set_values(self, values):
            if len(values["pth_path"].strip()) == 0:
@ -565,7 +599,9 @@ if __name__ == "__main__":
                blocksize=self.block_frame,
                samplerate=self.config.samplerate,
                dtype="float32",
-            ):
+            ) as stream:
+                global stream_latency
+                stream_latency = stream.latency[-1]
                while self.flag_vc:
                    time.sleep(self.config.block_time)
                    logger.debug("Audio block passed.")
@ -597,7 +633,7 @@ if __name__ == "__main__":
                self.block_frame_16k :
            ].clone()
            # input noise reduction and resampling
-            if self.config.I_noise_reduce:
+            if self.config.I_noise_reduce and self.function == 'vc':
                input_wav = self.input_wav[
                    -self.crossfade_frame - self.block_frame - 2 * self.zc :
                ]
@ -621,6 +657,7 @@ if __name__ == "__main__":
                    self.input_wav[-self.block_frame - 2 * self.zc :]
                )[160:]
            # infer
+            if self.function == 'vc':
                f0_extractor_frame = self.block_frame_16k + 800
                if self.config.f0method == "rmvpe":
                    f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
@ -636,8 +673,12 @@ if __name__ == "__main__":
                infer_wav = infer_wav[
                    -self.crossfade_frame - self.sola_search_frame - self.block_frame :
                ]
+            else:
+                infer_wav = self.input_wav[
+                    -self.crossfade_frame - self.sola_search_frame - self.block_frame :
+                ].clone()
            # output noise reduction
-            if self.config.O_noise_reduce:
+            if (self.config.O_noise_reduce and self.function == 'vc') or (self.config.I_noise_reduce and self.function == 'im'):
                self.output_buffer[: -self.block_frame] = self.output_buffer[
                    self.block_frame :
                ].clone()
@ -646,7 +687,7 @@ if __name__ == "__main__":
                    infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
                ).squeeze(0)
            # volume envelop mixing
-            if self.config.rms_mix_rate < 1:
+            if self.config.rms_mix_rate < 1 and self.function == 'vc':
                rms1 = librosa.feature.rms(
                    y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :]
                    .cpu()
--- a/tools/torchgate/torchgate.py
+++ b/tools/torchgate/torchgate.py
@ -211,13 +211,6 @@ class TorchGate(torch.nn.Module):
        Returns:
            torch.Tensor: The denoised audio signal, with the same shape as the input signal.
        """
-        assert x.ndim == 2
-        if x.shape[-1] < self.win_length * 2:
-            raise Exception(f"x must be bigger than {self.win_length * 2}")
-
-        assert xn is None or xn.ndim == 1 or xn.ndim == 2
-        if xn is not None and xn.shape[-1] < self.win_length * 2:
-            raise Exception(f"xn must be bigger than {self.win_length * 2}")

        # Compute short-time Fourier transform (STFT)
        X = torch.stft(