optimize: realtime inference (#1693)

* update real-time gui

* update real-time gui

* update real-time gui
yxlllc 2024-01-16 19:22:55 +08:00 committed by GitHub
parent 26e2805f0e
commit c3e65cdf96
14 changed files with 143 additions and 96 deletions

@@ -1 +1 @@
-{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, "crossfade_length": 0.08, "extra_time": 2.00, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}
+{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}

gui_v1.py

@@ -125,6 +125,8 @@ if __name__ == "__main__":
             self.index_rate: float = 0.0
             self.n_cpu: int = min(n_cpu, 4)
             self.f0method: str = "fcpe"
+            self.sg_hostapi: str = ""
+            self.wasapi_exclusive: bool = False
             self.sg_input_device: str = ""
             self.sg_output_device: str = ""
@@ -134,6 +136,7 @@ if __name__ == "__main__":
             self.config = Config()
             self.function = "vc"
             self.delay_time = 0
+            self.hostapis = None
             self.input_devices = None
             self.output_devices = None
             self.input_devices_indices = None
@@ -153,11 +156,22 @@ if __name__ == "__main__":
                 data["crepe"] = data["f0method"] == "crepe"
                 data["rmvpe"] = data["f0method"] == "rmvpe"
                 data["fcpe"] = data["f0method"] == "fcpe"
-                if data["sg_input_device"] not in self.input_devices:
+                if data["sg_hostapi"] in self.hostapis:
+                    self.update_devices(hostapi_name=data["sg_hostapi"])
+                    if data["sg_input_device"] not in self.input_devices or data["sg_output_device"] not in self.output_devices:
+                        self.update_devices()
+                        data["sg_hostapi"] = self.hostapis[0]
+                        data["sg_input_device"] = self.input_devices[
+                            self.input_devices_indices.index(sd.default.device[0])
+                        ]
+                        data["sg_output_device"] = self.output_devices[
+                            self.output_devices_indices.index(sd.default.device[1])
+                        ]
+                else:
+                    data["sg_hostapi"] = self.hostapis[0]
                     data["sg_input_device"] = self.input_devices[
                         self.input_devices_indices.index(sd.default.device[0])
                     ]
-                if data["sg_output_device"] not in self.output_devices:
                     data["sg_output_device"] = self.output_devices[
                         self.output_devices_indices.index(sd.default.device[1])
                     ]
@@ -166,6 +180,8 @@ if __name__ == "__main__":
                 data = {
                     "pth_path": "",
                     "index_path": "",
+                    "sg_hostapi": self.hostapis[0],
+                    "sg_wasapi_exclusive": False,
                     "sg_input_device": self.input_devices[
                         self.input_devices_indices.index(sd.default.device[0])
                     ],
@@ -233,12 +249,30 @@ if __name__ == "__main__":
             [
                 sg.Frame(
                     layout=[
+                        [
+                            sg.Text(i18n("设备类型")),
+                            sg.Combo(
+                                self.hostapis,
+                                key="sg_hostapi",
+                                default_value=data.get("sg_hostapi", ""),
+                                enable_events=True,
+                                size=(20, 1)
+                            ),
+                            sg.Checkbox(
+                                i18n("独占 WASAPI 设备"),
+                                key="sg_wasapi_exclusive",
+                                default=data.get("sg_wasapi_exclusive", False),
+                                enable_events=True,
+                            ),
+                        ],
                         [
                             sg.Text(i18n("输入设备")),
                             sg.Combo(
                                 self.input_devices,
                                 key="sg_input_device",
                                 default_value=data.get("sg_input_device", ""),
+                                enable_events=True,
+                                size=(45, 1),
                             ),
                         ],
                         [
@@ -247,6 +281,8 @@ if __name__ == "__main__":
                                 self.output_devices,
                                 key="sg_output_device",
                                 default_value=data.get("sg_output_device", ""),
+                                enable_events=True,
+                                size=(45, 1),
                             ),
                         ],
                         [
@@ -269,7 +305,7 @@ if __name__ == "__main__":
                             sg.Text("", key="sr_stream"),
                         ],
                     ],
-                    title=i18n("音频设备(请使用同种类驱动)"),
+                    title=i18n("音频设备"),
                 )
             ],
             [
@@ -365,7 +401,7 @@ if __name__ == "__main__":
                         [
                             sg.Text(i18n("采样长度")),
                             sg.Slider(
-                                range=(0.02, 2.4),
+                                range=(0.02, 1.5),
                                 key="block_time",
                                 resolution=0.01,
                                 orientation="h",
@@ -481,8 +517,15 @@ if __name__ == "__main__":
             if event == sg.WINDOW_CLOSED:
                 self.stop_stream()
                 exit()
-            if event == "reload_devices":
-                self.update_devices()
+            if event == "reload_devices" or event == "sg_hostapi":
+                self.gui_config.sg_hostapi = values["sg_hostapi"]
+                self.update_devices(hostapi_name=values["sg_hostapi"])
+                if self.gui_config.sg_hostapi not in self.hostapis:
+                    self.gui_config.sg_hostapi = self.hostapis[0]
+                self.window["sg_hostapi"].Update(values=self.hostapis)
+                self.window["sg_hostapi"].Update(
+                    value=self.gui_config.sg_hostapi
+                )
                 if self.gui_config.sg_input_device not in self.input_devices:
                     self.gui_config.sg_input_device = self.input_devices[0]
                 self.window["sg_input_device"].Update(values=self.input_devices)
@@ -502,6 +545,8 @@ if __name__ == "__main__":
                 settings = {
                     "pth_path": values["pth_path"],
                     "index_path": values["index_path"],
+                    "sg_hostapi": values["sg_hostapi"],
+                    "sg_wasapi_exclusive": values["sg_wasapi_exclusive"],
                     "sg_input_device": values["sg_input_device"],
                     "sg_output_device": values["sg_output_device"],
                     "sr_type": ["sr_model", "sr_device"][
@@ -544,7 +589,7 @@ if __name__ == "__main__":
                 if values["I_noise_reduce"]:
                     self.delay_time += min(values["crossfade_length"], 0.04)
                 self.window["sr_stream"].update(self.gui_config.samplerate)
-                self.window["delay_time"].update(int(self.delay_time * 1000))
+                self.window["delay_time"].update(int(np.round(self.delay_time * 1000)))
             # Parameter hot update
             if event == "threhold":
                 self.gui_config.threhold = values["threhold"]
@@ -566,7 +611,7 @@ if __name__ == "__main__":
                     self.delay_time += (
                         1 if values["I_noise_reduce"] else -1
                     ) * min(values["crossfade_length"], 0.04)
-                    self.window["delay_time"].update(int(self.delay_time * 1000))
+                    self.window["delay_time"].update(int(np.round(self.delay_time * 1000)))
             elif event == "O_noise_reduce":
                 self.gui_config.O_noise_reduce = values["O_noise_reduce"]
             elif event == "use_pv":
@@ -594,6 +639,8 @@ if __name__ == "__main__":
             self.set_devices(values["sg_input_device"], values["sg_output_device"])
             self.config.use_jit = False  # values["use_jit"]
             # self.device_latency = values["device_latency"]
+            self.gui_config.sg_hostapi = values["sg_hostapi"]
+            self.gui_config.sg_wasapi_exclusive = values["sg_wasapi_exclusive"]
             self.gui_config.sg_input_device = values["sg_input_device"]
             self.gui_config.sg_output_device = values["sg_output_device"]
             self.gui_config.pth_path = values["pth_path"]
@@ -644,6 +691,7 @@ if __name__ == "__main__":
                 if self.gui_config.sr_type == "sr_model"
                 else self.get_device_samplerate()
             )
+            self.gui_config.channels = self.get_device_channels()
             self.zc = self.gui_config.samplerate // 100
             self.block_frame = (
                 int(
@@ -686,19 +734,18 @@ if __name__ == "__main__":
                 device=self.config.device,
                 dtype=torch.float32,
             )
+            self.input_wav_denoise: torch.Tensor = self.input_wav.clone()
             self.input_wav_res: torch.Tensor = torch.zeros(
                 160 * self.input_wav.shape[0] // self.zc,
                 device=self.config.device,
                 dtype=torch.float32,
             )
+            self.rms_buffer: np.ndarray = np.zeros(4 * self.zc, dtype="float32")
             self.sola_buffer: torch.Tensor = torch.zeros(
                 self.sola_buffer_frame, device=self.config.device, dtype=torch.float32
             )
             self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
             self.output_buffer: torch.Tensor = self.input_wav.clone()
-            self.res_buffer: torch.Tensor = torch.zeros(
-                2 * self.zc, device=self.config.device, dtype=torch.float32
-            )
             self.skip_head = self.extra_frame // self.zc
             self.return_length = (
                 self.block_frame + self.sola_buffer_frame + self.sola_search_frame
@@ -740,13 +787,17 @@ if __name__ == "__main__":
             global flag_vc
             if not flag_vc:
                 flag_vc = True
-                channels = 1 if sys.platform == "darwin" else 2
+                if 'WASAPI' in self.gui_config.sg_hostapi and self.gui_config.sg_wasapi_exclusive:
+                    extra_settings = sd.WasapiSettings(exclusive=True)
+                else:
+                    extra_settings = None
                 self.stream = sd.Stream(
-                    channels=channels,
                     callback=self.audio_callback,
                     blocksize=self.block_frame,
                     samplerate=self.gui_config.samplerate,
+                    channels=self.gui_config.channels,
                     dtype="float32",
+                    extra_settings=extra_settings,
                 )
                 self.stream.start()
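
start_stream now opens the duplex stream with a channel count queried from the devices and, when the selected host API is WASAPI and the exclusive checkbox is set, passes sd.WasapiSettings(exclusive=True). A self-contained sketch of that pattern (the passthrough callback and the fixed parameters are illustrative assumptions, not code from this commit):

    import sounddevice as sd

    def passthrough(indata, outdata, frames, time_info, status):
        outdata[:] = indata  # placeholder: route input straight to output

    # Exclusive mode bypasses the Windows shared-mode mixer for lower latency
    # (WASAPI/Windows only).
    extra = sd.WasapiSettings(exclusive=True)
    with sd.Stream(channels=2, samplerate=48000, blocksize=4800,
                   dtype="float32", callback=passthrough,
                   extra_settings=extra):
        sd.sleep(5000)  # run for 5 seconds
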
@@ -755,7 +806,7 @@ if __name__ == "__main__":
             if flag_vc:
                 flag_vc = False
                 if self.stream is not None:
-                    self.stream.stop()
+                    self.stream.abort()
                     self.stream.close()
                     self.stream = None
@@ -769,47 +820,51 @@ if __name__ == "__main__":
             start_time = time.perf_counter()
             indata = librosa.to_mono(indata.T)
             if self.gui_config.threhold > -60:
+                indata = np.append(self.rms_buffer, indata)
                 rms = librosa.feature.rms(
                     y=indata, frame_length=4 * self.zc, hop_length=self.zc
-                )
+                )[:, 2:]
+                self.rms_buffer[:] = indata[-4 * self.zc :]
+                indata = indata[2 * self.zc - self.zc // 2 :]
                 db_threhold = (
                     librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
                 )
                 for i in range(db_threhold.shape[0]):
                     if db_threhold[i]:
                         indata[i * self.zc : (i + 1) * self.zc] = 0
+                indata = indata[self.zc // 2 :]
             self.input_wav[: -self.block_frame] = self.input_wav[
                 self.block_frame :
             ].clone()
-            self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(
+            self.input_wav[-indata.shape[0] :] = torch.from_numpy(indata).to(
                 self.config.device
             )
             self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
                 self.block_frame_16k :
             ].clone()
             # input noise reduction and resampling
-            if self.gui_config.I_noise_reduce and self.function == "vc":
+            if self.gui_config.I_noise_reduce:
+                self.input_wav_denoise[: -self.block_frame] = self.input_wav_denoise[
+                    self.block_frame :
+                ].clone()
                 input_wav = self.input_wav[
-                    -self.sola_buffer_frame - self.block_frame - 2 * self.zc :
+                    -self.sola_buffer_frame - self.block_frame:
                 ]
                 input_wav = self.tg(
                     input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)
-                )[0, 2 * self.zc :]
+                ).squeeze(0)
                 input_wav[: self.sola_buffer_frame] *= self.fade_in_window
                 input_wav[: self.sola_buffer_frame] += (
                     self.nr_buffer * self.fade_out_window
                 )
+                self.input_wav_denoise[-self.block_frame :] = input_wav[: self.block_frame]
                 self.nr_buffer[:] = input_wav[self.block_frame :]
-                input_wav = torch.cat(
-                    (self.res_buffer[:], input_wav[: self.block_frame])
-                )
-                self.res_buffer[:] = input_wav[-2 * self.zc :]
                 self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
-                    input_wav
+                    self.input_wav_denoise[-self.block_frame - 2 * self.zc :]
                 )[160:]
             else:
-                self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
-                    self.input_wav[-self.block_frame - 2 * self.zc :]
+                self.input_wav_res[-160 * (indata.shape[0] // self.zc + 1) :] = self.resampler(
+                    self.input_wav[-indata.shape[0] - 2 * self.zc :]
                 )[160:]
             # infer
             if self.function == "vc":
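
The reworked noise gate keeps a 4·zc sample history (rms_buffer, where zc is one 10 ms hop at the stream samplerate) so the RMS frames at a block boundary are computed over full windows, then trims the warm-up samples back off. Restated on a standalone numpy buffer (function name and signature are illustrative):

    import numpy as np
    import librosa

    def gate_block(block, rms_buffer, zc, threshold_db):
        x = np.append(rms_buffer, block)          # prepend history for valid framing
        rms = librosa.feature.rms(
            y=x, frame_length=4 * zc, hop_length=zc
        )[:, 2:]                                  # drop frames that are mostly history
        rms_buffer[:] = x[-4 * zc:]               # carry the last 4 hops forward
        x = x[2 * zc - zc // 2:]
        db = librosa.amplitude_to_db(rms, ref=1.0)[0]
        for i in range(db.shape[0]):
            if db[i] < threshold_db:
                x[i * zc : (i + 1) * zc] = 0      # mute sub-threshold 10 ms frames
        return x[zc // 2:]                        # trim the remaining warm-up samples
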
@@ -822,14 +877,12 @@ if __name__ == "__main__":
                 )
                 if self.resampler2 is not None:
                     infer_wav = self.resampler2(infer_wav)
+            elif self.gui_config.I_noise_reduce:
+                infer_wav = self.input_wav_denoise[self.extra_frame :].clone()
             else:
-                infer_wav = self.input_wav[
-                    -self.crossfade_frame - self.sola_search_frame - self.block_frame :
-                ].clone()
+                infer_wav = self.input_wav[self.extra_frame :].clone()
             # output noise reduction
-            if (self.gui_config.O_noise_reduce and self.function == "vc") or (
-                self.gui_config.I_noise_reduce and self.function == "im"
-            ):
+            if self.gui_config.O_noise_reduce and self.function == "vc":
                 self.output_buffer[: -self.block_frame] = self.output_buffer[
                     self.block_frame :
                 ].clone()
@@ -839,16 +892,14 @@ if __name__ == "__main__":
                 ).squeeze(0)
             # volume envelop mixing
             if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
+                if self.gui_config.I_noise_reduce:
+                    input_wav = self.input_wav_denoise[self.extra_frame :]
+                else:
+                    input_wav = self.input_wav[self.extra_frame :]
                 rms1 = librosa.feature.rms(
-                    y=self.input_wav_res[
-                        160
-                        * self.skip_head : 160
-                        * (self.skip_head + self.return_length)
-                    ]
-                    .cpu()
-                    .numpy(),
-                    frame_length=640,
-                    hop_length=160,
+                    y=input_wav[: infer_wav.shape[0]].cpu().numpy(),
+                    frame_length=4 * self.zc,
+                    hop_length=self.zc,
                 )
                 rms1 = torch.from_numpy(rms1).to(self.config.device)
                 rms1 = F.interpolate(
@@ -907,19 +958,18 @@ if __name__ == "__main__":
             self.sola_buffer[:] = infer_wav[
                 self.block_frame : self.block_frame + self.sola_buffer_frame
             ]
-            if sys.platform == "darwin":
-                outdata[:] = infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis]
-            else:
-                outdata[:] = (
-                    infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy()
-                )
+            outdata[:] = (
+                infer_wav[: self.block_frame].repeat(self.gui_config.channels, 1).t().cpu().numpy()
+            )
             total_time = time.perf_counter() - start_time
             if flag_vc:
                 self.window["infer_time"].update(int(total_time * 1000))
             printt("Infer time: %.2f", total_time)

-        def update_devices(self):
+        def update_devices(self, hostapi_name=None):
             """获取设备列表"""
+            global flag_vc
+            flag_vc = False
             sd._terminate()
             sd._initialize()
             devices = sd.query_devices()
@@ -927,25 +977,26 @@ if __name__ == "__main__":
             for hostapi in hostapis:
                 for device_idx in hostapi["devices"]:
                     devices[device_idx]["hostapi_name"] = hostapi["name"]
+            self.hostapis = [hostapi["name"] for hostapi in hostapis]
+            if hostapi_name not in self.hostapis:
+                hostapi_name = self.hostapis[0]
             self.input_devices = [
-                f"{d['name']} ({d['hostapi_name']})"
-                for d in devices
-                if d["max_input_channels"] > 0
+                d['name'] for d in devices
+                if d["max_input_channels"] > 0 and d['hostapi_name'] == hostapi_name
             ]
             self.output_devices = [
-                f"{d['name']} ({d['hostapi_name']})"
-                for d in devices
-                if d["max_output_channels"] > 0
+                d['name'] for d in devices
+                if d["max_output_channels"] > 0 and d['hostapi_name'] == hostapi_name
             ]
             self.input_devices_indices = [
                 d["index"] if "index" in d else d["name"]
                 for d in devices
-                if d["max_input_channels"] > 0
+                if d["max_input_channels"] > 0 and d['hostapi_name'] == hostapi_name
             ]
             self.output_devices_indices = [
                 d["index"] if "index" in d else d["name"]
                 for d in devices
-                if d["max_output_channels"] > 0
+                if d["max_output_channels"] > 0 and d['hostapi_name'] == hostapi_name
             ]
@@ -963,5 +1014,10 @@ def set_devices(self, input_device, output_device):
             return int(
                 sd.query_devices(device=sd.default.device[0])["default_samplerate"]
             )

+        def get_device_channels(self):
+            max_input_channels = sd.query_devices(device=sd.default.device[0])["max_input_channels"]
+            max_output_channels = sd.query_devices(device=sd.default.device[1])["max_output_channels"]
+            return min(max_input_channels, max_output_channels, 2)
+
     gui = GUI()
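
The hard-coded "1 channel on macOS, else 2" rule is gone; get_device_channels() asks PortAudio for each default device's capabilities and caps the duplex stream at stereo. An equivalent standalone query (illustrative only):

    import sounddevice as sd

    in_dev, out_dev = sd.default.device  # (default input index, default output index)
    max_in = sd.query_devices(device=in_dev)["max_input_channels"]
    max_out = sd.query_devices(device=out_dev)["max_output_channels"]
    channels = min(max_in, max_out, 2)   # mono device -> 1, otherwise stereo
    print(f"duplex stream will open with {channels} channel(s)")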

@@ -129,7 +129,7 @@
     "采样长度": "Sample length",
     "重载设备列表": "Reload device list",
     "音调设置": "Pitch settings",
-    "音频设备(请使用同种类驱动)": "Audio device (please use the same type of driver)",
+    "音频设备": "Audio device",
     "音高算法": "pitch detection algorithm",
     "额外推理时长": "Extra inference time"
 }

@@ -129,7 +129,7 @@
     "采样长度": "Longitud de muestreo",
     "重载设备列表": "Actualizar lista de dispositivos",
     "音调设置": "Ajuste de tono",
-    "音频设备(请使用同种类驱动)": "Dispositivo de audio (utilice el mismo tipo de controlador)",
+    "音频设备": "Dispositivo de audio",
     "音高算法": "Algoritmo de tono",
     "额外推理时长": "Tiempo de inferencia adicional"
 }

@@ -129,7 +129,7 @@
     "采样长度": "Longueur de l'échantillon",
     "重载设备列表": "Recharger la liste des dispositifs",
     "音调设置": "Réglages de la hauteur",
-    "音频设备(请使用同种类驱动)": "Périphérique audio (veuillez utiliser le même type de pilote)",
+    "音频设备": "Périphérique audio",
     "音高算法": "algorithme de détection de la hauteur",
     "额外推理时长": "Temps d'inférence supplémentaire"
 }

@@ -129,7 +129,7 @@
     "采样长度": "Lunghezza del campione",
     "重载设备列表": "Ricaricare l'elenco dei dispositivi",
     "音调设置": "Impostazioni del tono",
-    "音频设备(请使用同种类驱动)": "Dispositivo audio (utilizzare lo stesso tipo di driver)",
+    "音频设备": "Dispositivo audio",
     "音高算法": "音高算法",
     "额外推理时长": "Tempo di inferenza extra"
 }

@@ -129,7 +129,7 @@
     "采样长度": "サンプル長",
     "重载设备列表": "デバイスリストをリロードする",
     "音调设置": "音程設定",
-    "音频设备(请使用同种类驱动)": "オーディオデバイス(同じ種類のドライバーを使用してください)",
+    "音频设备": "オーディオデバイス",
     "音高算法": "ピッチアルゴリズム",
     "额外推理时长": "追加推論時間"
 }

@@ -129,7 +129,7 @@
     "采样长度": "Длина сэмпла",
     "重载设备列表": "Обновить список устройств",
     "音调设置": "Настройка высоты звука",
-    "音频设备(请使用同种类驱动)": "Аудиоустройство (пожалуйста, используйте такой же тип драйвера)",
+    "音频设备": "Аудиоустройство",
     "音高算法": "Алгоритм оценки высоты звука",
     "额外推理时长": "Доп. время переработки"
 }

@@ -129,7 +129,7 @@
     "采样长度": "Örnekleme uzunluğu",
     "重载设备列表": "Cihaz listesini yeniden yükle",
     "音调设置": "Pitch ayarları",
-    "音频设备(请使用同种类驱动)": "Ses cihazı (aynı tür sürücüyü kullanın)",
+    "音频设备": "Ses cihazı",
     "音高算法": "音高算法",
     "额外推理时长": "Ekstra çıkartma süresi"
 }

@@ -106,6 +106,8 @@
     "请选择pth文件": "请选择pth文件",
     "请选择说话人id": "请选择说话人id",
     "转换": "转换",
+    "设备类型": "设备类型",
+    "独占 WASAPI 设备": "独占 WASAPI 设备",
     "输入实验名": "输入实验名",
     "输入待处理音频文件夹路径": "输入待处理音频文件夹路径",
     "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",
@@ -129,7 +131,7 @@
     "采样长度": "采样长度",
     "重载设备列表": "重载设备列表",
     "音调设置": "音调设置",
-    "音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)",
+    "音频设备": "音频设备",
     "音高算法": "音高算法",
     "额外推理时长": "额外推理时长"
 }

@@ -129,7 +129,7 @@
     "采样长度": "取樣長度",
     "重载设备列表": "重載設備列表",
     "音调设置": "音調設定",
-    "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
+    "音频设备": "音訊設備",
     "音高算法": "音高演算法",
     "额外推理时长": "額外推理時長"
 }

@@ -129,7 +129,7 @@
     "采样长度": "取樣長度",
     "重载设备列表": "重載設備列表",
     "音调设置": "音調設定",
-    "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
+    "音频设备": "音訊設備",
     "音高算法": "音高演算法",
     "额外推理时长": "額外推理時長"
 }

@@ -129,7 +129,7 @@
     "采样长度": "取樣長度",
     "重载设备列表": "重載設備列表",
     "音调设置": "音調設定",
-    "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
+    "音频设备": "音訊設備",
     "音高算法": "音高演算法",
     "额外推理时长": "額外推理時長"
 }

@@ -91,8 +91,8 @@ class RVC:
         self.pth_path: str = pth_path
         self.index_path = index_path
         self.index_rate = index_rate
-        self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32")
-        self.cache_pitchf = np.zeros(1024, dtype="float32")
+        self.cache_pitch: torch.Tensor = torch.zeros(1024, device=self.device, dtype=torch.long)
+        self.cache_pitchf = torch.zeros(1024, device=self.device, dtype=torch.float32)

         if last_rvc is None:
             models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
@@ -199,15 +199,17 @@ class RVC:
         self.index_rate = new_index_rate

     def get_f0_post(self, f0):
-        f0bak = f0.copy()
-        f0_mel = 1127 * np.log(1 + f0 / 700)
+        if not torch.is_tensor(f0):
+            f0 = torch.from_numpy(f0)
+        f0 = f0.float().to(self.device).squeeze()
+        f0_mel = 1127 * torch.log(1 + f0 / 700)
         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
             self.f0_mel_max - self.f0_mel_min
         ) + 1
         f0_mel[f0_mel <= 1] = 1
         f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int32)
-        return f0_coarse, f0bak
+        f0_coarse = torch.round(f0_mel).long()
+        return f0_coarse, f0

     def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
         n_cpu = int(n_cpu)
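
get_f0_post now quantizes f0 entirely on-device: Hz is mapped to mel via 1127·ln(1 + f0/700), then affinely into coarse bins 1..255. A worked torch sketch of the same quantization; the 50-1100 Hz range is an assumption for illustration, not taken from this diff:

    import math
    import torch

    f0_mel_min = 1127 * math.log(1 + 50.0 / 700)    # assumed lower bound
    f0_mel_max = 1127 * math.log(1 + 1100.0 / 700)  # assumed upper bound

    f0 = torch.tensor([0.0, 110.0, 220.0, 440.0])   # Hz; 0 = unvoiced
    f0_mel = 1127 * torch.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (
        (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    )
    f0_coarse = torch.round(f0_mel.clamp(1, 255)).long()
    print(f0_coarse)  # unvoiced frames land in bin 1
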
@@ -299,14 +301,12 @@ class RVC:
         pd = torchcrepe.filter.median(pd, 3)
         f0 = torchcrepe.filter.mean(f0, 3)
         f0[pd < 0.1] = 0
-        f0 = f0[0].cpu().numpy()
         f0 *= pow(2, f0_up_key / 12)
         return self.get_f0_post(f0)

     def get_f0_rmvpe(self, x, f0_up_key):
         if hasattr(self, "model_rmvpe") == False:
             from infer.lib.rmvpe import RMVPE
-
             printt("Loading rmvpe model")
             self.model_rmvpe = RMVPE(
                 "assets/rmvpe/rmvpe.pt",
@@ -335,7 +335,6 @@ class RVC:
             threshold=0.006,
         )
         f0 *= pow(2, f0_up_key / 12)
-        f0 = f0.squeeze().cpu().numpy()
         return self.get_f0_post(f0)

     def infer(
@@ -383,6 +382,7 @@ class RVC:
             traceback.print_exc()
             printt("Index search FAILED")
         t3 = ttime()
+        p_len = input_wav.shape[0] // 160
         if self.if_f0 == 1:
             f0_extractor_frame = block_frame_16k + 800
             if f0method == "rmvpe":
@@ -390,25 +390,14 @@ class RVC:
             pitch, pitchf = self.get_f0(
                 input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method
             )
-            start_frame = block_frame_16k // 160
-            end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame
-            self.cache_pitch[:] = np.append(
-                self.cache_pitch[start_frame:end_frame], pitch[3:-1]
-            )
-            self.cache_pitchf[:] = np.append(
-                self.cache_pitchf[start_frame:end_frame], pitchf[3:-1]
-            )
+            shift = block_frame_16k // 160
+            self.cache_pitch[: -shift] = self.cache_pitch[shift :].clone()
+            self.cache_pitchf[: -shift] = self.cache_pitchf[shift :].clone()
+            self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1]
+            self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1]
+            cache_pitch = self.cache_pitch[None, -p_len:]
+            cache_pitchf = self.cache_pitchf[None, -p_len:]
         t4 = ttime()
-        p_len = input_wav.shape[0] // 160
-        if self.if_f0 == 1:
-            cache_pitch = (
-                torch.LongTensor(self.cache_pitch[-p_len:]).to(self.device).unsqueeze(0)
-            )
-            cache_pitchf = (
-                torch.FloatTensor(self.cache_pitchf[-p_len:])
-                .to(self.device)
-                .unsqueeze(0)
-            )
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         feats = feats[:, :p_len, :]
         p_len = torch.LongTensor([p_len]).to(self.device)
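
The pitch cache is now a torch tensor updated in place: shift the window left by one block's worth of frames, overwrite the tail with the freshly extracted frames, and slice the last p_len entries for the model, avoiding the per-block numpy reallocation and host-device copies of the old np.append code. The same rolling-buffer pattern in isolation (names and sizes are illustrative):

    import torch

    cache = torch.zeros(16, dtype=torch.long)

    def push(cache, new, shift):
        cache[:-shift] = cache[shift:].clone()  # drop the oldest `shift` frames
        cache[-new.shape[0]:] = new             # write the freshest frames at the end
        return cache

    push(cache, torch.arange(1, 5), shift=4)    # cache now ends ..., 1, 2, 3, 4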