From 3dec36568c48af357ec6b7331a7c06348bd51abb Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 00:23:36 +0800 Subject: [PATCH 1/5] optimize real-time vc --- configs/config.json | 2 +- gui_v1.py | 159 +++++++++++++++++++++++-------- infer/lib/infer_pack/models.py | 41 ++++---- infer/lib/jit/get_synthesizer.py | 1 + infer/lib/rmvpe.py | 12 ++- tools/rvc_for_realtime.py | 131 ++++++++++++------------- 6 files changed, 211 insertions(+), 135 deletions(-) diff --git a/configs/config.json b/configs/config.json index 0861200..f874bd5 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1 +1 @@ -{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"} \ No newline at end of file +{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, "crossfade_length": 0.08, "extra_time": 2.00, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} \ No newline at end of file diff --git a/gui_v1.py b/gui_v1.py index 7f4c640..0f614e6 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -22,6 +22,26 @@ def printt(strr, *args): print(strr % args) +def phase_vocoder(a, b, fade_out, fade_in): + window = torch.sqrt(fade_out * fade_in) + fa = torch.fft.rfft(a * window) + fb = torch.fft.rfft(b * window) + absab = torch.abs(fa) + torch.abs(fb) + n = a.shape[0] + if n % 2 == 0: + absab[1:-1] *= 2 + else: + absab[1:] *= 2 + phia = torch.angle(fa) + phib = torch.angle(fb) + deltaphase = phib - phia + deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5) + w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase + t = torch.arange(n).unsqueeze(-1).to(a) / n + result = a * (fade_out ** 2) + b * (fade_in ** 2) + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + return result + + class Harvest(multiprocessing.Process): def __init__(self, inp_q, opt_q): multiprocessing.Process.__init__(self) @@ -118,6 +138,8 @@ if __name__ == "__main__": try: with open("configs/config.json", "r") as j: data = json.load(j) + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" @@ -134,6 +156,7 @@ if __name__ == "__main__": "index_path": " ", "sg_input_device": input_devices[sd.default.device[0]], "sg_output_device": output_devices[sd.default.device[1]], + "sr_type": "sr_model", "threhold": "-60", "pitch": "0", "index_rate": "0", @@ -143,7 +166,10 @@ if __name__ == "__main__": "extra_time": "2.5", "f0method": "rmvpe", "use_jit": False, + "use_pv": False, } + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" @@ -207,7 +233,25 @@ if __name__ == "__main__": default_value=data.get("sg_output_device", ""), ), ], - [sg.Button(i18n("重载设备列表"), 
key="reload_devices")], + [ + sg.Button(i18n("重载设备列表"), key="reload_devices"), + sg.Radio( + i18n("使用模型采样率"), + "sr_type", + key="sr_model", + default=data.get("sr_model", True), + enable_events=True, + ), + sg.Radio( + i18n("使用设备采样率"), + "sr_type", + key="sr_device", + default=data.get("sr_device", False), + enable_events=True, + ), + sg.Text(i18n("采样率:")), + sg.Text("", key="sr_stream"), + ], ], title=i18n("音频设备(请使用同种类驱动)"), ) @@ -222,7 +266,7 @@ if __name__ == "__main__": key="threhold", resolution=1, orientation="h", - default_value=data.get("threhold", "-60"), + default_value=data.get("threhold", -60), enable_events=True, ), ], @@ -233,7 +277,7 @@ if __name__ == "__main__": key="pitch", resolution=1, orientation="h", - default_value=data.get("pitch", "0"), + default_value=data.get("pitch", 0), enable_events=True, ), ], @@ -244,7 +288,7 @@ if __name__ == "__main__": key="index_rate", resolution=0.01, orientation="h", - default_value=data.get("index_rate", "0"), + default_value=data.get("index_rate", 0), enable_events=True, ), ], @@ -255,7 +299,7 @@ if __name__ == "__main__": key="rms_mix_rate", resolution=0.01, orientation="h", - default_value=data.get("rms_mix_rate", "0"), + default_value=data.get("rms_mix_rate", 0), enable_events=True, ), ], @@ -265,35 +309,35 @@ if __name__ == "__main__": "pm", "f0method", key="pm", - default=data.get("pm", "") == True, + default=data.get("pm", False), enable_events=True, ), sg.Radio( "harvest", "f0method", key="harvest", - default=data.get("harvest", "") == True, + default=data.get("harvest", False), enable_events=True, ), sg.Radio( "crepe", "f0method", key="crepe", - default=data.get("crepe", "") == True, + default=data.get("crepe", False), enable_events=True, ), sg.Radio( "rmvpe", "f0method", key="rmvpe", - default=data.get("rmvpe", "") == True, + default=data.get("rmvpe", False), enable_events=True, ), sg.Radio( "fcpe", "f0method", key="fcpe", - default=data.get("fcpe", "") == True, + default=data.get("fcpe", True), enable_events=True, ), ], @@ -305,11 +349,11 @@ if __name__ == "__main__": [ sg.Text(i18n("采样长度")), sg.Slider( - range=(0.05, 2.4), + range=(0.02, 2.4), key="block_time", resolution=0.01, orientation="h", - default_value=data.get("block_time", "0.25"), + default_value=data.get("block_time", 0.25), enable_events=True, ), ], @@ -320,7 +364,7 @@ if __name__ == "__main__": # key="device_latency", # resolution=0.001, # orientation="h", - # default_value=data.get("device_latency", "0.1"), + # default_value=data.get("device_latency", 0.1), # enable_events=True, # ), # ], @@ -344,7 +388,7 @@ if __name__ == "__main__": key="crossfade_length", resolution=0.01, orientation="h", - default_value=data.get("crossfade_length", "0.05"), + default_value=data.get("crossfade_length", 0.05), enable_events=True, ), ], @@ -355,7 +399,7 @@ if __name__ == "__main__": key="extra_time", resolution=0.01, orientation="h", - default_value=data.get("extra_time", "2.5"), + default_value=data.get("extra_time", 2.5), enable_events=True, ), ], @@ -370,6 +414,12 @@ if __name__ == "__main__": key="O_noise_reduce", enable_events=True, ), + sg.Checkbox( + i18n("启用相位声码器"), + key="use_pv", + default=data.get("use_pv", False), + enable_events=True, + ), # sg.Checkbox( # "JIT加速", # default=self.config.use_jit, @@ -443,6 +493,12 @@ if __name__ == "__main__": "index_path": values["index_path"], "sg_input_device": values["sg_input_device"], "sg_output_device": values["sg_output_device"], + "sr_type": ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + 
].index(True) + ], "threhold": values["threhold"], "pitch": values["pitch"], "rms_mix_rate": values["rms_mix_rate"], @@ -454,6 +510,7 @@ if __name__ == "__main__": "n_cpu": values["n_cpu"], # "use_jit": values["use_jit"], "use_jit": False, + "use_pv": values["use_pv"], "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ [ values["pm"], @@ -477,6 +534,7 @@ if __name__ == "__main__": ) if values["I_noise_reduce"]: self.delay_time += values["crossfade_length"] + self.window["sr_stream"].update(self.gui_config.samplerate) self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: self.flag_vc = False @@ -505,6 +563,8 @@ if __name__ == "__main__": self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": self.gui_config.O_noise_reduce = values["O_noise_reduce"] + elif event == "use_pv": + self.gui_config.use_pv = values["use_pv"] elif event in ["vc", "im"]: self.function = event elif event != "start_vc" and self.flag_vc == True: @@ -531,6 +591,12 @@ if __name__ == "__main__": # self.device_latency = values["device_latency"] self.gui_config.pth_path = values["pth_path"] self.gui_config.index_path = values["index_path"] + self.gui_config.sr_type = ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ] self.gui_config.threhold = values["threhold"] self.gui_config.pitch = values["pitch"] self.gui_config.block_time = values["block_time"] @@ -538,6 +604,7 @@ if __name__ == "__main__": self.gui_config.extra_time = values["extra_time"] self.gui_config.I_noise_reduce = values["I_noise_reduce"] self.gui_config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.use_pv = values["use_pv"] self.gui_config.rms_mix_rate = values["rms_mix_rate"] self.gui_config.index_rate = values["index_rate"] self.gui_config.n_cpu = values["n_cpu"] @@ -566,8 +633,8 @@ if __name__ == "__main__": self.config, self.rvc if hasattr(self, "rvc") else None, ) - self.gui_config.samplerate = self.rvc.tgt_sr - self.zc = self.rvc.tgt_sr // 100 + self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate() + self.zc = self.gui_config.samplerate // 100 self.block_frame = ( int( np.round( @@ -589,6 +656,7 @@ if __name__ == "__main__": ) * self.zc ) + self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc) self.sola_search_frame = self.zc self.extra_frame = ( int( @@ -622,14 +690,14 @@ if __name__ == "__main__": dtype="float64", ) self.sola_buffer: torch.Tensor = torch.zeros( - self.crossfade_frame, device=self.config.device, dtype=torch.float32 + self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 ) self.nr_buffer: torch.Tensor = self.sola_buffer.clone() self.output_buffer: torch.Tensor = self.input_wav.clone() self.res_buffer: torch.Tensor = torch.zeros( 2 * self.zc, device=self.config.device, dtype=torch.float32 ) - self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] + self.skip_head = self.extra_frame // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -637,7 +705,7 @@ if __name__ == "__main__": * torch.linspace( 0.0, 1.0, - steps=self.crossfade_frame, + steps=self.sola_buffer_frame, device=self.config.device, dtype=torch.float32, ) @@ -650,6 +718,14 @@ if __name__ == "__main__": new_freq=16000, dtype=torch.float32, ).to(self.config.device) + if self.rvc.tgt_sr != self.gui_config.samplerate: + self.resampler2 = tat.Resample( + orig_freq=self.rvc.tgt_sr, + 
new_freq=self.gui_config.samplerate, + dtype=torch.float32, + ).to(self.config.device) + else: + self.resampler2 = None self.tg = TorchGate( sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 ).to(self.config.device) @@ -710,11 +786,11 @@ if __name__ == "__main__": input_wav = self.tg( input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) )[0, 2 * self.zc :] - input_wav[: self.crossfade_frame] *= self.fade_in_window - input_wav[: self.crossfade_frame] += ( + input_wav[: self.sola_buffer_frame] *= self.fade_in_window + input_wav[: self.sola_buffer_frame] += ( self.nr_buffer * self.fade_out_window ) - self.nr_buffer[:] = input_wav[-self.crossfade_frame :] + self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] input_wav = torch.cat( (self.res_buffer[:], input_wav[: self.block_frame]) ) @@ -728,23 +804,16 @@ if __name__ == "__main__": )[160:] # infer if self.function == "vc": - f0_extractor_frame = self.block_frame_16k + 800 - if self.gui_config.f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) infer_wav = self.rvc.infer( self.input_wav_res, - self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), self.block_frame_16k, - self.valid_rate, + self.skip_head, self.pitch, self.pitchf, self.gui_config.f0method, ) - infer_wav = infer_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ] + if self.resampler2 is not None: + infer_wav = self.resampler2(infer_wav) else: infer_wav = self.input_wav[ -self.crossfade_frame - self.sola_search_frame - self.block_frame : @@ -794,13 +863,13 @@ if __name__ == "__main__": ) # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC conv_input = infer_wav[ - None, None, : self.crossfade_frame + self.sola_search_frame + None, None, : self.sola_buffer_frame + self.sola_search_frame ] cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) cor_den = torch.sqrt( F.conv1d( conv_input**2, - torch.ones(1, 1, self.crossfade_frame, device=self.config.device), + torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device), ) + 1e-8 ) @@ -813,9 +882,16 @@ if __name__ == "__main__": infer_wav = infer_wav[ sola_offset : sola_offset + self.block_frame + self.crossfade_frame ] - infer_wav[: self.crossfade_frame] *= self.fade_in_window - infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window - self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] + if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: + infer_wav[: self.sola_buffer_frame] *= self.fade_in_window + infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window + else: + infer_wav[: self.sola_buffer_frame] = phase_vocoder( + self.sola_buffer, + infer_wav[: self.sola_buffer_frame], + self.fade_out_window, + self.fade_in_window) + self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] if sys.platform == "darwin": outdata[:] = ( infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] @@ -864,7 +940,7 @@ if __name__ == "__main__": input_devices_indices, output_devices_indices, ) - + def set_devices(self, input_device, output_device): """设置输出设备""" ( @@ -881,5 +957,8 @@ if __name__ == "__main__": ] printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - + + def get_device_samplerate(self): + return int(sd.query_devices(device=sd.default.device[0])['default_samplerate']) + gui = 
GUI() diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index f25e724..c2750ee 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -722,7 +722,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -783,14 +784,14 @@ class SynthesizerTrnMs256NSFsid(nn.Module): pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - assert isinstance(rate, torch.Tensor) - head = int(z_p.shape[2] * (1 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] nsff0 = nsff0[:, head:] @@ -887,7 +888,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -941,13 +943,14 @@ class SynthesizerTrnMs768NSFsid(nn.Module): pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] nsff0 = nsff0[:, head:] @@ -1041,7 +1044,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -1087,13 +1091,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) @@ -1186,7 +1191,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -1232,13 +1238,14 @@ 
class SynthesizerTrnMs768NSFsid_nono(nn.Module): phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) diff --git a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py index ef5fe58..b8db4fa 100644 --- a/infer/lib/jit/get_synthesizer.py +++ b/infer/lib/jit/get_synthesizer.py @@ -34,4 +34,5 @@ def get_synthesizer(pth_path, device=torch.device("cpu")): net_g.load_state_dict(cpt["weight"], strict=False) net_g = net_g.float() net_g.eval().to(device) + net_g.remove_weight_norm() return net_g, cpt diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index 9010d28..86c6899 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -593,16 +593,18 @@ class RMVPE: def infer_from_audio(self, audio, thred=0.03): # torch.cuda.synchronize() - t0 = ttime() + # t0 = ttime() + if not torch.is_tensor(audio): + audio = torch.from_numpy(audio) mel = self.mel_extractor( - torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True + audio.float().to(self.device).unsqueeze(0), center=True ) # print(123123123,mel.device.type) # torch.cuda.synchronize() - t1 = ttime() + # t1 = ttime() hidden = self.mel2hidden(mel) # torch.cuda.synchronize() - t2 = ttime() + # t2 = ttime() # print(234234,hidden.device.type) if "privateuseone" not in str(self.device): hidden = hidden.squeeze(0).cpu().numpy() @@ -613,7 +615,7 @@ class RMVPE: f0 = self.decode(hidden, thred=thred) # torch.cuda.synchronize() - t3 = ttime() + # t3 = ttime() # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) return f0 diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index f36ffb3..2d54732 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -46,23 +46,22 @@ def printt(strr, *args): # config.is_half=False########强制cpu测试 class RVC: def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, ) -> None: """ 初始化 """ try: if config.dml == True: - def forward_dml(ctx, x, scale): ctx.scale = scale res = x.clone().detach() @@ -76,13 +75,10 @@ class RVC: # device="cpu"########强制cpu测试 self.device = config.device self.f0_up_key = key - self.time_step = 160 / 16000 * 1000 self.f0_min = 50 self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.sr = 16000 - self.window = 160 self.n_cpu = n_cpu self.use_jit = self.config.use_jit self.is_half = config.is_half @@ -184,6 +180,7 @@ class RVC: if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): self.model_rmvpe = last_rvc.model_rmvpe if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): + self.device_fcpe = last_rvc.device_fcpe self.model_fcpe = last_rvc.model_fcpe except: printt(traceback.format_exc()) @@ -199,14 +196,10 @@ class RVC: self.index_rate = new_index_rate def get_f0_post(self, f0): - f0_min = self.f0_min - f0_max = 
self.f0_max - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 @@ -221,6 +214,7 @@ class RVC: return self.get_f0_rmvpe(x, f0_up_key) if method == "fcpe": return self.get_f0_fcpe(x, f0_up_key) + x = x.cpu().numpy() if method == "pm": p_len = x.shape[0] // 160 + 1 f0_min = 65 @@ -262,7 +256,7 @@ class RVC: self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) else: self.inp_q.put( - (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) + (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) ) while 1: res_ts = self.opt_q.get() @@ -277,20 +271,19 @@ class RVC: else: f0 = f0[2:] f0bak[ - part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] + part_length * idx // 160: part_length * idx // 160 + f0.shape[0] ] = f0 f0bak = signal.medfilt(f0bak, 3) f0bak *= pow(2, f0_up_key / 12) return self.get_f0_post(f0bak) def get_f0_crepe(self, x, f0_up_key): - if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 - return self.get_f0(x, f0_up_key, 1, "pm") - audio = torch.tensor(np.copy(x))[None].float() + if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿fcpe顶替 + return self.get_f0(x, f0_up_key, 1, "fcpe") # printt("using crepe,device:%s"%self.device) f0, pd = torchcrepe.predict( - audio, - self.sr, + x.unsqueeze(0).float(), + 16000, 160, self.f0_min, self.f0_max, @@ -313,15 +306,11 @@ class RVC: printt("Loading rmvpe model") self.model_rmvpe = RMVPE( - # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 - # "rmvpe.pt", is_half=False, device=self.device####dml配置 - # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 "assets/rmvpe/rmvpe.pt", is_half=self.is_half, - device=self.device, ####正常逻辑 + device=self.device, use_jit=self.config.use_jit, ) - # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) @@ -329,41 +318,36 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: from torchfcpe import spawn_bundled_infer_model - printt("Loading fcpe model") - self.model_fcpe = spawn_bundled_infer_model(self.device) - f0 = ( - self.model_fcpe.infer( - torch.from_numpy(x).to(self.device).unsqueeze(0).float(), - sr=16000, - decoder_mode="local_argmax", - threshold=0.006, - ) - .squeeze() - .cpu() - .numpy() - ) + if "privateuseone" in str(self.device): + self.device_fcpe = "cpu" + else: + self.device_fcpe = self.device + self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe) + f0 = self.model_fcpe.infer( + x.to(self.device_fcpe).unsqueeze(0).float(), + sr=16000, + decoder_mode='local_argmax', + threshold=0.006, + ).squeeze().cpu().numpy() f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) def infer( - self, - feats: torch.Tensor, - indata: np.ndarray, - block_frame_16k, - rate, - cache_pitch, - cache_pitchf, - f0method, + self, + input_wav: torch.Tensor, + block_frame_16k, + skip_head, + cache_pitch, + cache_pitchf, + f0method, ) -> np.ndarray: - feats = feats.view(1, -1) - if self.config.is_half: - feats = 
feats.half() - else: - feats = feats.float() - feats = feats.to(self.device) t1 = ttime() with torch.no_grad(): + if self.config.is_half: + feats = input_wav.half().view(1, -1) + else: + feats = input_wav.float().view(1, -1) padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) inputs = { "source": feats, @@ -387,8 +371,8 @@ class RVC: if self.config.is_half: npy = npy.astype("float16") feats[0][-leng_replace_head:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] ) else: printt("Index search FAILED or disabled") @@ -398,7 +382,13 @@ class RVC: feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t3 = ttime() if self.if_f0 == 1: - pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) + f0_extractor_frame = block_frame_16k + 800 + if f0method == "rmvpe": + f0_extractor_frame = ( + 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + ) + input_wav = input_wav[-f0_extractor_frame:] + pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method) start_frame = block_frame_16k // 160 end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) @@ -412,31 +402,28 @@ class RVC: t4 = ttime() feats = feats[:, :p_len, :] if self.if_f0 == 1: - cache_pitch = cache_pitch[:p_len] - cache_pitchf = cache_pitchf[:p_len] - cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) - cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) + cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0) + cache_pitchf = torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0) p_len = torch.LongTensor([p_len]).to(self.device) - ii = 0 # sid - sid = torch.LongTensor([ii]).to(self.device) + sid = torch.LongTensor([0]).to(self.device) + skip_head = torch.LongTensor([skip_head]) with torch.no_grad(): if self.if_f0 == 1: - # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) infered_audio = self.net_g.infer( feats, p_len, cache_pitch, cache_pitchf, sid, - torch.FloatTensor([rate]), + skip_head, )[0][0, 0].data.float() else: infered_audio = self.net_g.infer( - feats, p_len, sid, torch.FloatTensor([rate]) + feats, p_len, sid, skip_head )[0][0, 0].data.float() t5 = ttime() printt( - "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", + "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", t2 - t1, t3 - t2, t4 - t3, From d62e80fb8391e5b95fecdc24bac80436e3d54978 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 00:28:49 +0800 Subject: [PATCH 2/5] optimize real-time vc --- requirements-amd.txt | 1 + requirements-dml.txt | 1 + requirements-ipex.txt | 3 ++- requirements-win-for-realtime_vc_gui-dml.txt | 3 ++- requirements-win-for-realtime_vc_gui.txt | 1 + 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements-amd.txt b/requirements-amd.txt index aa81a88..d0976a7 100644 --- a/requirements-amd.txt +++ b/requirements-amd.txt @@ -46,3 +46,4 @@ fastapi==0.88 ffmpy==0.3.1 python-dotenv>=1.0.0 av +torchfcpe diff --git a/requirements-dml.txt b/requirements-dml.txt index a49ed2d..b4690ae 100644 --- a/requirements-dml.txt +++ b/requirements-dml.txt @@ -44,3 +44,4 @@ fastapi==0.88 
ffmpy==0.3.1 python-dotenv>=1.0.0 av +torchfcpe \ No newline at end of file diff --git a/requirements-ipex.txt b/requirements-ipex.txt index 610a0ce..19ff424 100644 --- a/requirements-ipex.txt +++ b/requirements-ipex.txt @@ -51,4 +51,5 @@ ffmpy==0.3.1 python-dotenv>=1.0.0 av PySimpleGUI -sounddevice \ No newline at end of file +sounddevice +torchfcpe \ No newline at end of file diff --git a/requirements-win-for-realtime_vc_gui-dml.txt b/requirements-win-for-realtime_vc_gui-dml.txt index 6514989..9aaf56d 100644 --- a/requirements-win-for-realtime_vc_gui-dml.txt +++ b/requirements-win-for-realtime_vc_gui-dml.txt @@ -26,4 +26,5 @@ PySimpleGUI sounddevice gradio noisereduce -onnxruntime-directml \ No newline at end of file +onnxruntime-directml +torchfcpe \ No newline at end of file diff --git a/requirements-win-for-realtime_vc_gui.txt b/requirements-win-for-realtime_vc_gui.txt index 37ca238..e187f85 100644 --- a/requirements-win-for-realtime_vc_gui.txt +++ b/requirements-win-for-realtime_vc_gui.txt @@ -26,3 +26,4 @@ PySimpleGUI sounddevice gradio noisereduce +torchfcpe From d7fb651f7c3ed90c72a084030341c620ef4a1a4c Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 16:26:01 +0800 Subject: [PATCH 3/5] optimize real-time vc --- gui_v1.py | 20 ++++--------- infer/lib/infer_pack/models.py | 40 ++++++++++++++++--------- tools/rvc_for_realtime.py | 55 +++++++++++++++++----------------- 3 files changed, 58 insertions(+), 57 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 0f614e6..728cf7e 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -681,14 +681,6 @@ if __name__ == "__main__": device=self.config.device, dtype=torch.float32, ) - self.pitch: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="int32", - ) - self.pitchf: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="float64", - ) self.sola_buffer: torch.Tensor = torch.zeros( self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 ) @@ -698,6 +690,7 @@ if __name__ == "__main__": 2 * self.zc, device=self.config.device, dtype=torch.float32 ) self.skip_head = self.extra_frame // self.zc + self.return_length = (self.block_frame + self.sola_buffer_frame + self.sola_search_frame) // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -808,8 +801,7 @@ if __name__ == "__main__": self.input_wav_res, self.block_frame_16k, self.skip_head, - self.pitch, - self.pitchf, + self.return_length, self.gui_config.f0method, ) if self.resampler2 is not None: @@ -879,9 +871,7 @@ if __name__ == "__main__": else: sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[ - sola_offset : sola_offset + self.block_frame + self.crossfade_frame - ] + infer_wav = infer_wav[sola_offset :] if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: infer_wav[: self.sola_buffer_frame] *= self.fade_in_window infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window @@ -894,11 +884,11 @@ if __name__ == "__main__": self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] if sys.platform == "darwin": outdata[:] = ( - infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] + infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] ) else: outdata[:] = ( - infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() + infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy() ) total_time = time.perf_counter() - start_time 
self.window["infer_time"].update(int(total_time * 1000)) diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index c2750ee..a81c1de 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -785,16 +785,19 @@ class SynthesizerTrnMs256NSFsid(nn.Module): nsff0: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - nsff0 = nsff0[:, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] + nsff0 = nsff0[:, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -944,16 +947,19 @@ class SynthesizerTrnMs768NSFsid(nn.Module): nsff0: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - nsff0 = nsff0[:, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] + nsff0 = nsff0[:, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1092,15 +1098,18 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): phone_lengths: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1239,15 +1248,18 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): phone_lengths: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert 
isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 2d54732..257c44d 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -90,7 +90,9 @@ class RVC: self.pth_path: str = pth_path self.index_path = index_path self.index_rate = index_rate - + self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32") + self.cache_pitchf = np.zeros(1024, dtype="float32") + if last_rvc is None: models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( ["assets/hubert/hubert_base.pt"], @@ -329,8 +331,9 @@ class RVC: sr=16000, decoder_mode='local_argmax', threshold=0.006, - ).squeeze().cpu().numpy() + ) f0 *= pow(2, f0_up_key / 12) + f0 = f0.squeeze().cpu().numpy() return self.get_f0_post(f0) def infer( @@ -338,8 +341,7 @@ class RVC: input_wav: torch.Tensor, block_frame_16k, skip_head, - cache_pitch, - cache_pitchf, + return_length, f0method, ) -> np.ndarray: t1 = ttime() @@ -362,24 +364,22 @@ class RVC: t2 = ttime() try: if hasattr(self, "index") and self.index_rate != 0: - leng_replace_head = int(rate * feats[0].shape[0]) - npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") + npy = feats[0][skip_head // 2:].cpu().numpy().astype("float32") score, ix = self.index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) if self.config.is_half: npy = npy.astype("float16") - feats[0][-leng_replace_head:] = ( + feats[0][skip_head // 2:] = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + + (1 - self.index_rate) * feats[0][skip_head // 2:] ) else: printt("Index search FAILED or disabled") except: traceback.print_exc() printt("Index search FAILED") - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t3 = ttime() if self.if_f0 == 1: f0_extractor_frame = block_frame_16k + 800 @@ -387,40 +387,39 @@ class RVC: f0_extractor_frame = ( 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 ) - input_wav = input_wav[-f0_extractor_frame:] - pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method) + pitch, pitchf = self.get_f0(input_wav[-f0_extractor_frame: ], self.f0_up_key, self.n_cpu, f0method) start_frame = block_frame_16k // 160 - end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame - cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) - cache_pitchf[:] = np.append( - cache_pitchf[start_frame:end_frame], pitchf[3:-1] + end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame + self.cache_pitch[:] = np.append(self.cache_pitch[start_frame: end_frame], pitch[3:-1]) + self.cache_pitchf[:] = np.append( + self.cache_pitchf[start_frame: end_frame], pitchf[3:-1] ) - p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) - else: - cache_pitch, cache_pitchf = None, None - p_len = min(feats.shape[1], 13000) t4 = ttime() - feats = feats[:, :p_len, :] + p_len = input_wav.shape[0] // 160 if self.if_f0 == 1: - cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0) - cache_pitchf = 
torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0) + cache_pitch = torch.LongTensor(self.cache_pitch[-p_len: ]).to(self.device).unsqueeze(0) + cache_pitchf = torch.FloatTensor(self.cache_pitchf[-p_len: ]).to(self.device).unsqueeze(0) + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + feats = feats[:, :p_len, :] p_len = torch.LongTensor([p_len]).to(self.device) sid = torch.LongTensor([0]).to(self.device) skip_head = torch.LongTensor([skip_head]) + return_length = torch.LongTensor([return_length]) with torch.no_grad(): if self.if_f0 == 1: - infered_audio = self.net_g.infer( + infered_audio, _, _ = self.net_g.infer( feats, p_len, cache_pitch, cache_pitchf, sid, skip_head, - )[0][0, 0].data.float() + return_length, + ) else: - infered_audio = self.net_g.infer( - feats, p_len, sid, skip_head - )[0][0, 0].data.float() + infered_audio, _, _ = self.net_g.infer( + feats, p_len, sid, skip_head, return_length + ) t5 = ttime() printt( "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", @@ -429,4 +428,4 @@ class RVC: t4 - t3, t5 - t4, ) - return infered_audio + return infered_audio.squeeze().float() From 21775b187a2610be5faaf58a500eaf068620cde1 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 17:05:42 +0800 Subject: [PATCH 4/5] optimize real-time vc --- gui_v1.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 728cf7e..dc2bdc8 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -559,7 +559,7 @@ if __name__ == "__main__": if stream_latency > 0: self.delay_time += ( 1 if values["I_noise_reduce"] else -1 - ) * values["crossfade_length"] + ) * min(values["crossfade_length"], 0.04) self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": self.gui_config.O_noise_reduce = values["O_noise_reduce"] @@ -774,7 +774,7 @@ if __name__ == "__main__": # input noise reduction and resampling if self.gui_config.I_noise_reduce and self.function == "vc": input_wav = self.input_wav[ - -self.crossfade_frame - self.block_frame - 2 * self.zc : + -self.sola_buffer_frame - self.block_frame - 2 * self.zc : ] input_wav = self.tg( input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) @@ -783,7 +783,7 @@ if __name__ == "__main__": input_wav[: self.sola_buffer_frame] += ( self.nr_buffer * self.fade_out_window ) - self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] + self.nr_buffer[:] = input_wav[self.block_frame :] input_wav = torch.cat( (self.res_buffer[:], input_wav[: self.block_frame]) ) @@ -824,7 +824,7 @@ if __name__ == "__main__": # volume envelop mixing if self.gui_config.rms_mix_rate < 1 and self.function == "vc": rms1 = librosa.feature.rms( - y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] + y=self.input_wav_res[160 * self.skip_head : 160 * (self.skip_head + self.return_length)] .cpu() .numpy(), frame_length=640, From aed19c3c6b3f43c4d2e13dbb4631098a2a66c55e Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 17:41:25 +0800 Subject: [PATCH 5/5] optimize real-time vc --- gui_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gui_v1.py b/gui_v1.py index dc2bdc8..e5c6757 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -533,7 +533,7 @@ if __name__ == "__main__": + 0.01 ) if values["I_noise_reduce"]: - self.delay_time += values["crossfade_length"] + self.delay_time += min(values["crossfade_length"], 0.04) self.window["sr_stream"].update(self.gui_config.samplerate) 
self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: