Merge pull request #1647 from yxlllc/dev

Optimize real-time functions
This commit is contained in:
RVC-Boss 2023-12-26 20:57:01 +08:00 committed by GitHub
commit 2fb732da62
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 266 additions and 184 deletions

View File

@ -1 +1 @@
{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"} {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, "crossfade_length": 0.08, "extra_time": 2.00, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}

183
gui_v1.py
View File

@ -22,6 +22,26 @@ def printt(strr, *args):
print(strr % args) print(strr % args)
def phase_vocoder(a, b, fade_out, fade_in):
    """Blend segment ``a`` into segment ``b`` with a phase-interpolated cross-fade.

    The result is the sum of two parts:

    * a plain cross-fade using the *squared* fade curves, and
    * a resynthesised component built from the summed rFFT magnitudes of the
      windowed inputs, where each bin's cosine starts at ``a``'s phase and
      reaches ``b``'s phase (modulo 2*pi) at the end of the segment.

    Args:
        a: Outgoing segment; its first dimension defines the length ``n``.
            Assumed to be a 1-D tensor - TODO confirm against callers.
        b: Incoming segment, same shape as ``a``.
        fade_out: Per-sample fade-out weights, same shape as ``a``.
        fade_in: Per-sample fade-in weights, same shape as ``a``.

    Returns:
        Tensor with the same shape as ``a`` holding the blended segment.
    """
    n = a.shape[0]

    # Geometric mean of the two fades; it vanishes wherever either fade is 0.
    window = torch.sqrt(fade_out * fade_in)
    spec_a = torch.fft.rfft(a * window)
    spec_b = torch.fft.rfft(b * window)

    # Combined magnitude. Bins that stand for a +/- frequency pair are doubled;
    # DC (and the last bin when n is even) are left at weight 1.
    magnitude = torch.abs(spec_a) + torch.abs(spec_b)
    last = -1 if n % 2 == 0 else None
    magnitude[1:last] *= 2

    phase_a = torch.angle(spec_a)
    phase_gap = torch.angle(spec_b) - phase_a
    # Wrap the phase difference into [-pi, pi) so each bin takes the short way.
    phase_gap = phase_gap - 2 * np.pi * torch.floor(phase_gap / 2 / np.pi + 0.5)

    # Per-bin angular advance over the whole segment, and normalised time
    # as a column so that ``omega * t`` broadcasts to (n, n // 2 + 1).
    omega = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + phase_gap
    t = torch.arange(n).unsqueeze(-1).to(a) / n

    plain_mix = a * (fade_out ** 2) + b * (fade_in ** 2)
    resynth = torch.sum(magnitude * torch.cos(omega * t + phase_a), -1) * window / n
    return plain_mix + resynth
class Harvest(multiprocessing.Process): class Harvest(multiprocessing.Process):
def __init__(self, inp_q, opt_q): def __init__(self, inp_q, opt_q):
multiprocessing.Process.__init__(self) multiprocessing.Process.__init__(self)
@ -118,6 +138,8 @@ if __name__ == "__main__":
try: try:
with open("configs/config.json", "r") as j: with open("configs/config.json", "r") as j:
data = json.load(j) data = json.load(j)
data["sr_model"] = data["sr_type"] == "sr_model"
data["sr_device"] = data["sr_type"] == "sr_device"
data["pm"] = data["f0method"] == "pm" data["pm"] = data["f0method"] == "pm"
data["harvest"] = data["f0method"] == "harvest" data["harvest"] = data["f0method"] == "harvest"
data["crepe"] = data["f0method"] == "crepe" data["crepe"] = data["f0method"] == "crepe"
@ -134,6 +156,7 @@ if __name__ == "__main__":
"index_path": " ", "index_path": " ",
"sg_input_device": input_devices[sd.default.device[0]], "sg_input_device": input_devices[sd.default.device[0]],
"sg_output_device": output_devices[sd.default.device[1]], "sg_output_device": output_devices[sd.default.device[1]],
"sr_type": "sr_model",
"threhold": "-60", "threhold": "-60",
"pitch": "0", "pitch": "0",
"index_rate": "0", "index_rate": "0",
@ -143,7 +166,10 @@ if __name__ == "__main__":
"extra_time": "2.5", "extra_time": "2.5",
"f0method": "rmvpe", "f0method": "rmvpe",
"use_jit": False, "use_jit": False,
"use_pv": False,
} }
data["sr_model"] = data["sr_type"] == "sr_model"
data["sr_device"] = data["sr_type"] == "sr_device"
data["pm"] = data["f0method"] == "pm" data["pm"] = data["f0method"] == "pm"
data["harvest"] = data["f0method"] == "harvest" data["harvest"] = data["f0method"] == "harvest"
data["crepe"] = data["f0method"] == "crepe" data["crepe"] = data["f0method"] == "crepe"
@ -207,7 +233,25 @@ if __name__ == "__main__":
default_value=data.get("sg_output_device", ""), default_value=data.get("sg_output_device", ""),
), ),
], ],
[sg.Button(i18n("重载设备列表"), key="reload_devices")], [
sg.Button(i18n("重载设备列表"), key="reload_devices"),
sg.Radio(
i18n("使用模型采样率"),
"sr_type",
key="sr_model",
default=data.get("sr_model", True),
enable_events=True,
),
sg.Radio(
i18n("使用设备采样率"),
"sr_type",
key="sr_device",
default=data.get("sr_device", False),
enable_events=True,
),
sg.Text(i18n("采样率:")),
sg.Text("", key="sr_stream"),
],
], ],
title=i18n("音频设备(请使用同种类驱动)"), title=i18n("音频设备(请使用同种类驱动)"),
) )
@ -222,7 +266,7 @@ if __name__ == "__main__":
key="threhold", key="threhold",
resolution=1, resolution=1,
orientation="h", orientation="h",
default_value=data.get("threhold", "-60"), default_value=data.get("threhold", -60),
enable_events=True, enable_events=True,
), ),
], ],
@ -233,7 +277,7 @@ if __name__ == "__main__":
key="pitch", key="pitch",
resolution=1, resolution=1,
orientation="h", orientation="h",
default_value=data.get("pitch", "0"), default_value=data.get("pitch", 0),
enable_events=True, enable_events=True,
), ),
], ],
@ -244,7 +288,7 @@ if __name__ == "__main__":
key="index_rate", key="index_rate",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("index_rate", "0"), default_value=data.get("index_rate", 0),
enable_events=True, enable_events=True,
), ),
], ],
@ -255,7 +299,7 @@ if __name__ == "__main__":
key="rms_mix_rate", key="rms_mix_rate",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("rms_mix_rate", "0"), default_value=data.get("rms_mix_rate", 0),
enable_events=True, enable_events=True,
), ),
], ],
@ -265,35 +309,35 @@ if __name__ == "__main__":
"pm", "pm",
"f0method", "f0method",
key="pm", key="pm",
default=data.get("pm", "") == True, default=data.get("pm", False),
enable_events=True, enable_events=True,
), ),
sg.Radio( sg.Radio(
"harvest", "harvest",
"f0method", "f0method",
key="harvest", key="harvest",
default=data.get("harvest", "") == True, default=data.get("harvest", False),
enable_events=True, enable_events=True,
), ),
sg.Radio( sg.Radio(
"crepe", "crepe",
"f0method", "f0method",
key="crepe", key="crepe",
default=data.get("crepe", "") == True, default=data.get("crepe", False),
enable_events=True, enable_events=True,
), ),
sg.Radio( sg.Radio(
"rmvpe", "rmvpe",
"f0method", "f0method",
key="rmvpe", key="rmvpe",
default=data.get("rmvpe", "") == True, default=data.get("rmvpe", False),
enable_events=True, enable_events=True,
), ),
sg.Radio( sg.Radio(
"fcpe", "fcpe",
"f0method", "f0method",
key="fcpe", key="fcpe",
default=data.get("fcpe", "") == True, default=data.get("fcpe", True),
enable_events=True, enable_events=True,
), ),
], ],
@ -305,11 +349,11 @@ if __name__ == "__main__":
[ [
sg.Text(i18n("采样长度")), sg.Text(i18n("采样长度")),
sg.Slider( sg.Slider(
range=(0.05, 2.4), range=(0.02, 2.4),
key="block_time", key="block_time",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("block_time", "0.25"), default_value=data.get("block_time", 0.25),
enable_events=True, enable_events=True,
), ),
], ],
@ -320,7 +364,7 @@ if __name__ == "__main__":
# key="device_latency", # key="device_latency",
# resolution=0.001, # resolution=0.001,
# orientation="h", # orientation="h",
# default_value=data.get("device_latency", "0.1"), # default_value=data.get("device_latency", 0.1),
# enable_events=True, # enable_events=True,
# ), # ),
# ], # ],
@ -344,7 +388,7 @@ if __name__ == "__main__":
key="crossfade_length", key="crossfade_length",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("crossfade_length", "0.05"), default_value=data.get("crossfade_length", 0.05),
enable_events=True, enable_events=True,
), ),
], ],
@ -355,7 +399,7 @@ if __name__ == "__main__":
key="extra_time", key="extra_time",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("extra_time", "2.5"), default_value=data.get("extra_time", 2.5),
enable_events=True, enable_events=True,
), ),
], ],
@ -370,6 +414,12 @@ if __name__ == "__main__":
key="O_noise_reduce", key="O_noise_reduce",
enable_events=True, enable_events=True,
), ),
sg.Checkbox(
i18n("启用相位声码器"),
key="use_pv",
default=data.get("use_pv", False),
enable_events=True,
),
# sg.Checkbox( # sg.Checkbox(
# "JIT加速", # "JIT加速",
# default=self.config.use_jit, # default=self.config.use_jit,
@ -443,6 +493,12 @@ if __name__ == "__main__":
"index_path": values["index_path"], "index_path": values["index_path"],
"sg_input_device": values["sg_input_device"], "sg_input_device": values["sg_input_device"],
"sg_output_device": values["sg_output_device"], "sg_output_device": values["sg_output_device"],
"sr_type": ["sr_model", "sr_device"][
[
values["sr_model"],
values["sr_device"],
].index(True)
],
"threhold": values["threhold"], "threhold": values["threhold"],
"pitch": values["pitch"], "pitch": values["pitch"],
"rms_mix_rate": values["rms_mix_rate"], "rms_mix_rate": values["rms_mix_rate"],
@ -454,6 +510,7 @@ if __name__ == "__main__":
"n_cpu": values["n_cpu"], "n_cpu": values["n_cpu"],
# "use_jit": values["use_jit"], # "use_jit": values["use_jit"],
"use_jit": False, "use_jit": False,
"use_pv": values["use_pv"],
"f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][
[ [
values["pm"], values["pm"],
@ -476,7 +533,8 @@ if __name__ == "__main__":
+ 0.01 + 0.01
) )
if values["I_noise_reduce"]: if values["I_noise_reduce"]:
self.delay_time += values["crossfade_length"] self.delay_time += min(values["crossfade_length"], 0.04)
self.window["sr_stream"].update(self.gui_config.samplerate)
self.window["delay_time"].update(int(self.delay_time * 1000)) self.window["delay_time"].update(int(self.delay_time * 1000))
if event == "stop_vc" and self.flag_vc == True: if event == "stop_vc" and self.flag_vc == True:
self.flag_vc = False self.flag_vc = False
@ -501,10 +559,12 @@ if __name__ == "__main__":
if stream_latency > 0: if stream_latency > 0:
self.delay_time += ( self.delay_time += (
1 if values["I_noise_reduce"] else -1 1 if values["I_noise_reduce"] else -1
) * values["crossfade_length"] ) * min(values["crossfade_length"], 0.04)
self.window["delay_time"].update(int(self.delay_time * 1000)) self.window["delay_time"].update(int(self.delay_time * 1000))
elif event == "O_noise_reduce": elif event == "O_noise_reduce":
self.gui_config.O_noise_reduce = values["O_noise_reduce"] self.gui_config.O_noise_reduce = values["O_noise_reduce"]
elif event == "use_pv":
self.gui_config.use_pv = values["use_pv"]
elif event in ["vc", "im"]: elif event in ["vc", "im"]:
self.function = event self.function = event
elif event != "start_vc" and self.flag_vc == True: elif event != "start_vc" and self.flag_vc == True:
@ -531,6 +591,12 @@ if __name__ == "__main__":
# self.device_latency = values["device_latency"] # self.device_latency = values["device_latency"]
self.gui_config.pth_path = values["pth_path"] self.gui_config.pth_path = values["pth_path"]
self.gui_config.index_path = values["index_path"] self.gui_config.index_path = values["index_path"]
self.gui_config.sr_type = ["sr_model", "sr_device"][
[
values["sr_model"],
values["sr_device"],
].index(True)
]
self.gui_config.threhold = values["threhold"] self.gui_config.threhold = values["threhold"]
self.gui_config.pitch = values["pitch"] self.gui_config.pitch = values["pitch"]
self.gui_config.block_time = values["block_time"] self.gui_config.block_time = values["block_time"]
@ -538,6 +604,7 @@ if __name__ == "__main__":
self.gui_config.extra_time = values["extra_time"] self.gui_config.extra_time = values["extra_time"]
self.gui_config.I_noise_reduce = values["I_noise_reduce"] self.gui_config.I_noise_reduce = values["I_noise_reduce"]
self.gui_config.O_noise_reduce = values["O_noise_reduce"] self.gui_config.O_noise_reduce = values["O_noise_reduce"]
self.gui_config.use_pv = values["use_pv"]
self.gui_config.rms_mix_rate = values["rms_mix_rate"] self.gui_config.rms_mix_rate = values["rms_mix_rate"]
self.gui_config.index_rate = values["index_rate"] self.gui_config.index_rate = values["index_rate"]
self.gui_config.n_cpu = values["n_cpu"] self.gui_config.n_cpu = values["n_cpu"]
@ -566,8 +633,8 @@ if __name__ == "__main__":
self.config, self.config,
self.rvc if hasattr(self, "rvc") else None, self.rvc if hasattr(self, "rvc") else None,
) )
self.gui_config.samplerate = self.rvc.tgt_sr self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate()
self.zc = self.rvc.tgt_sr // 100 self.zc = self.gui_config.samplerate // 100
self.block_frame = ( self.block_frame = (
int( int(
np.round( np.round(
@ -589,6 +656,7 @@ if __name__ == "__main__":
) )
* self.zc * self.zc
) )
self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc)
self.sola_search_frame = self.zc self.sola_search_frame = self.zc
self.extra_frame = ( self.extra_frame = (
int( int(
@ -613,23 +681,16 @@ if __name__ == "__main__":
device=self.config.device, device=self.config.device,
dtype=torch.float32, dtype=torch.float32,
) )
self.pitch: np.ndarray = np.zeros(
self.input_wav.shape[0] // self.zc,
dtype="int32",
)
self.pitchf: np.ndarray = np.zeros(
self.input_wav.shape[0] // self.zc,
dtype="float64",
)
self.sola_buffer: torch.Tensor = torch.zeros( self.sola_buffer: torch.Tensor = torch.zeros(
self.crossfade_frame, device=self.config.device, dtype=torch.float32 self.sola_buffer_frame, device=self.config.device, dtype=torch.float32
) )
self.nr_buffer: torch.Tensor = self.sola_buffer.clone() self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
self.output_buffer: torch.Tensor = self.input_wav.clone() self.output_buffer: torch.Tensor = self.input_wav.clone()
self.res_buffer: torch.Tensor = torch.zeros( self.res_buffer: torch.Tensor = torch.zeros(
2 * self.zc, device=self.config.device, dtype=torch.float32 2 * self.zc, device=self.config.device, dtype=torch.float32
) )
self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] self.skip_head = self.extra_frame // self.zc
self.return_length = (self.block_frame + self.sola_buffer_frame + self.sola_search_frame) // self.zc
self.fade_in_window: torch.Tensor = ( self.fade_in_window: torch.Tensor = (
torch.sin( torch.sin(
0.5 0.5
@ -637,7 +698,7 @@ if __name__ == "__main__":
* torch.linspace( * torch.linspace(
0.0, 0.0,
1.0, 1.0,
steps=self.crossfade_frame, steps=self.sola_buffer_frame,
device=self.config.device, device=self.config.device,
dtype=torch.float32, dtype=torch.float32,
) )
@ -650,6 +711,14 @@ if __name__ == "__main__":
new_freq=16000, new_freq=16000,
dtype=torch.float32, dtype=torch.float32,
).to(self.config.device) ).to(self.config.device)
if self.rvc.tgt_sr != self.gui_config.samplerate:
self.resampler2 = tat.Resample(
orig_freq=self.rvc.tgt_sr,
new_freq=self.gui_config.samplerate,
dtype=torch.float32,
).to(self.config.device)
else:
self.resampler2 = None
self.tg = TorchGate( self.tg = TorchGate(
sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
).to(self.config.device) ).to(self.config.device)
@ -705,16 +774,16 @@ if __name__ == "__main__":
# input noise reduction and resampling # input noise reduction and resampling
if self.gui_config.I_noise_reduce and self.function == "vc": if self.gui_config.I_noise_reduce and self.function == "vc":
input_wav = self.input_wav[ input_wav = self.input_wav[
-self.crossfade_frame - self.block_frame - 2 * self.zc : -self.sola_buffer_frame - self.block_frame - 2 * self.zc :
] ]
input_wav = self.tg( input_wav = self.tg(
input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)
)[0, 2 * self.zc :] )[0, 2 * self.zc :]
input_wav[: self.crossfade_frame] *= self.fade_in_window input_wav[: self.sola_buffer_frame] *= self.fade_in_window
input_wav[: self.crossfade_frame] += ( input_wav[: self.sola_buffer_frame] += (
self.nr_buffer * self.fade_out_window self.nr_buffer * self.fade_out_window
) )
self.nr_buffer[:] = input_wav[-self.crossfade_frame :] self.nr_buffer[:] = input_wav[self.block_frame :]
input_wav = torch.cat( input_wav = torch.cat(
(self.res_buffer[:], input_wav[: self.block_frame]) (self.res_buffer[:], input_wav[: self.block_frame])
) )
@ -728,23 +797,15 @@ if __name__ == "__main__":
)[160:] )[160:]
# infer # infer
if self.function == "vc": if self.function == "vc":
f0_extractor_frame = self.block_frame_16k + 800
if self.gui_config.f0method == "rmvpe":
f0_extractor_frame = (
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
)
infer_wav = self.rvc.infer( infer_wav = self.rvc.infer(
self.input_wav_res, self.input_wav_res,
self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
self.block_frame_16k, self.block_frame_16k,
self.valid_rate, self.skip_head,
self.pitch, self.return_length,
self.pitchf,
self.gui_config.f0method, self.gui_config.f0method,
) )
infer_wav = infer_wav[ if self.resampler2 is not None:
-self.crossfade_frame - self.sola_search_frame - self.block_frame : infer_wav = self.resampler2(infer_wav)
]
else: else:
infer_wav = self.input_wav[ infer_wav = self.input_wav[
-self.crossfade_frame - self.sola_search_frame - self.block_frame : -self.crossfade_frame - self.sola_search_frame - self.block_frame :
@ -763,7 +824,7 @@ if __name__ == "__main__":
# volume envelop mixing # volume envelop mixing
if self.gui_config.rms_mix_rate < 1 and self.function == "vc": if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
rms1 = librosa.feature.rms( rms1 = librosa.feature.rms(
y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] y=self.input_wav_res[160 * self.skip_head : 160 * (self.skip_head + self.return_length)]
.cpu() .cpu()
.numpy(), .numpy(),
frame_length=640, frame_length=640,
@ -794,13 +855,13 @@ if __name__ == "__main__":
) )
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
conv_input = infer_wav[ conv_input = infer_wav[
None, None, : self.crossfade_frame + self.sola_search_frame None, None, : self.sola_buffer_frame + self.sola_search_frame
] ]
cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
cor_den = torch.sqrt( cor_den = torch.sqrt(
F.conv1d( F.conv1d(
conv_input**2, conv_input**2,
torch.ones(1, 1, self.crossfade_frame, device=self.config.device), torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device),
) )
+ 1e-8 + 1e-8
) )
@ -810,19 +871,24 @@ if __name__ == "__main__":
else: else:
sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
printt("sola_offset = %d", int(sola_offset)) printt("sola_offset = %d", int(sola_offset))
infer_wav = infer_wav[ infer_wav = infer_wav[sola_offset :]
sola_offset : sola_offset + self.block_frame + self.crossfade_frame if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv:
] infer_wav[: self.sola_buffer_frame] *= self.fade_in_window
infer_wav[: self.crossfade_frame] *= self.fade_in_window infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window
infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window else:
self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] infer_wav[: self.sola_buffer_frame] = phase_vocoder(
self.sola_buffer,
infer_wav[: self.sola_buffer_frame],
self.fade_out_window,
self.fade_in_window)
self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame]
if sys.platform == "darwin": if sys.platform == "darwin":
outdata[:] = ( outdata[:] = (
infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis]
) )
else: else:
outdata[:] = ( outdata[:] = (
infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy()
) )
total_time = time.perf_counter() - start_time total_time = time.perf_counter() - start_time
self.window["infer_time"].update(int(total_time * 1000)) self.window["infer_time"].update(int(total_time * 1000))
@ -882,4 +948,7 @@ if __name__ == "__main__":
printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Input device: %s:%s", str(sd.default.device[0]), input_device)
printt("Output device: %s:%s", str(sd.default.device[1]), output_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device)
def get_device_samplerate(self):
    """Return the default sample rate of the current default input device.

    Looks up ``sd.default.device[0]`` through ``sd.query_devices`` and
    converts the reported ``default_samplerate`` to ``int``.
    """
    input_device = sd.default.device[0]
    device_info = sd.query_devices(device=input_device)
    return int(device_info['default_samplerate'])
gui = GUI() gui = GUI()

View File

@ -722,7 +722,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
def remove_weight_norm(self): def remove_weight_norm(self):
self.dec.remove_weight_norm() self.dec.remove_weight_norm()
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() if hasattr(self, "enc_q"):
self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self): def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values(): for hook in self.dec._forward_pre_hooks.values():
@ -783,17 +784,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
pitch: torch.Tensor, pitch: torch.Tensor,
nsff0: torch.Tensor, nsff0: torch.Tensor,
sid: torch.Tensor, sid: torch.Tensor,
rate: Optional[torch.Tensor] = None, skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
): ):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate is not None: if skip_head is not None and return_length is not None:
assert isinstance(rate, torch.Tensor) assert isinstance(skip_head, torch.Tensor)
head = int(z_p.shape[2] * (1 - rate.item())) assert isinstance(return_length, torch.Tensor)
z_p = z_p[:, :, head:] head = int(skip_head.item())
x_mask = x_mask[:, :, head:] length = int(return_length.item())
nsff0 = nsff0[:, head:] z_p = z_p[:, :, head: head + length]
x_mask = x_mask[:, :, head: head + length]
nsff0 = nsff0[:, head: head + length]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g) o = self.dec(z * x_mask, nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -887,7 +891,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
def remove_weight_norm(self): def remove_weight_norm(self):
self.dec.remove_weight_norm() self.dec.remove_weight_norm()
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() if hasattr(self, "enc_q"):
self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self): def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values(): for hook in self.dec._forward_pre_hooks.values():
@ -941,16 +946,20 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
pitch: torch.Tensor, pitch: torch.Tensor,
nsff0: torch.Tensor, nsff0: torch.Tensor,
sid: torch.Tensor, sid: torch.Tensor,
rate: Optional[torch.Tensor] = None, skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
): ):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate is not None: if skip_head is not None and return_length is not None:
head = int(z_p.shape[2] * (1.0 - rate.item())) assert isinstance(skip_head, torch.Tensor)
z_p = z_p[:, :, head:] assert isinstance(return_length, torch.Tensor)
x_mask = x_mask[:, :, head:] head = int(skip_head.item())
nsff0 = nsff0[:, head:] length = int(return_length.item())
z_p = z_p[:, :, head: head + length]
x_mask = x_mask[:, :, head: head + length]
nsff0 = nsff0[:, head: head + length]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g) o = self.dec(z * x_mask, nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -1041,7 +1050,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
def remove_weight_norm(self): def remove_weight_norm(self):
self.dec.remove_weight_norm() self.dec.remove_weight_norm()
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() if hasattr(self, "enc_q"):
self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self): def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values(): for hook in self.dec._forward_pre_hooks.values():
@ -1087,15 +1097,19 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
phone: torch.Tensor, phone: torch.Tensor,
phone_lengths: torch.Tensor, phone_lengths: torch.Tensor,
sid: torch.Tensor, sid: torch.Tensor,
rate: Optional[torch.Tensor] = None, skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
): ):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate is not None: if skip_head is not None and return_length is not None:
head = int(z_p.shape[2] * (1.0 - rate.item())) assert isinstance(skip_head, torch.Tensor)
z_p = z_p[:, :, head:] assert isinstance(return_length, torch.Tensor)
x_mask = x_mask[:, :, head:] head = int(skip_head.item())
length = int(return_length.item())
z_p = z_p[:, :, head: head + length]
x_mask = x_mask[:, :, head: head + length]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g) o = self.dec(z * x_mask, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -1186,7 +1200,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
def remove_weight_norm(self): def remove_weight_norm(self):
self.dec.remove_weight_norm() self.dec.remove_weight_norm()
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() if hasattr(self, "enc_q"):
self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self): def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values(): for hook in self.dec._forward_pre_hooks.values():
@ -1232,15 +1247,19 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
phone: torch.Tensor, phone: torch.Tensor,
phone_lengths: torch.Tensor, phone_lengths: torch.Tensor,
sid: torch.Tensor, sid: torch.Tensor,
rate: Optional[torch.Tensor] = None, skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
): ):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate is not None: if skip_head is not None and return_length is not None:
head = int(z_p.shape[2] * (1.0 - rate.item())) assert isinstance(skip_head, torch.Tensor)
z_p = z_p[:, :, head:] assert isinstance(return_length, torch.Tensor)
x_mask = x_mask[:, :, head:] head = int(skip_head.item())
length = int(return_length.item())
z_p = z_p[:, :, head: head + length]
x_mask = x_mask[:, :, head: head + length]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g) o = self.dec(z * x_mask, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -34,4 +34,5 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
net_g.load_state_dict(cpt["weight"], strict=False) net_g.load_state_dict(cpt["weight"], strict=False)
net_g = net_g.float() net_g = net_g.float()
net_g.eval().to(device) net_g.eval().to(device)
net_g.remove_weight_norm()
return net_g, cpt return net_g, cpt

View File

@ -593,16 +593,18 @@ class RMVPE:
def infer_from_audio(self, audio, thred=0.03): def infer_from_audio(self, audio, thred=0.03):
# torch.cuda.synchronize() # torch.cuda.synchronize()
t0 = ttime() # t0 = ttime()
if not torch.is_tensor(audio):
audio = torch.from_numpy(audio)
mel = self.mel_extractor( mel = self.mel_extractor(
torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True audio.float().to(self.device).unsqueeze(0), center=True
) )
# print(123123123,mel.device.type) # print(123123123,mel.device.type)
# torch.cuda.synchronize() # torch.cuda.synchronize()
t1 = ttime() # t1 = ttime()
hidden = self.mel2hidden(mel) hidden = self.mel2hidden(mel)
# torch.cuda.synchronize() # torch.cuda.synchronize()
t2 = ttime() # t2 = ttime()
# print(234234,hidden.device.type) # print(234234,hidden.device.type)
if "privateuseone" not in str(self.device): if "privateuseone" not in str(self.device):
hidden = hidden.squeeze(0).cpu().numpy() hidden = hidden.squeeze(0).cpu().numpy()
@ -613,7 +615,7 @@ class RMVPE:
f0 = self.decode(hidden, thred=thred) f0 = self.decode(hidden, thred=thred)
# torch.cuda.synchronize() # torch.cuda.synchronize()
t3 = ttime() # t3 = ttime()
# print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
return f0 return f0

View File

@ -46,3 +46,4 @@ fastapi==0.88
ffmpy==0.3.1 ffmpy==0.3.1
python-dotenv>=1.0.0 python-dotenv>=1.0.0
av av
torchfcpe

View File

@ -44,3 +44,4 @@ fastapi==0.88
ffmpy==0.3.1 ffmpy==0.3.1
python-dotenv>=1.0.0 python-dotenv>=1.0.0
av av
torchfcpe

View File

@ -52,3 +52,4 @@ python-dotenv>=1.0.0
av av
PySimpleGUI PySimpleGUI
sounddevice sounddevice
torchfcpe

View File

@ -27,3 +27,4 @@ sounddevice
gradio gradio
noisereduce noisereduce
onnxruntime-directml onnxruntime-directml
torchfcpe

View File

@ -26,3 +26,4 @@ PySimpleGUI
sounddevice sounddevice
gradio gradio
noisereduce noisereduce
torchfcpe

View File

@ -46,23 +46,22 @@ def printt(strr, *args):
# config.is_half=False########强制cpu测试 # config.is_half=False########强制cpu测试
class RVC: class RVC:
def __init__( def __init__(
self, self,
key, key,
pth_path, pth_path,
index_path, index_path,
index_rate, index_rate,
n_cpu, n_cpu,
inp_q, inp_q,
opt_q, opt_q,
config: Config, config: Config,
last_rvc=None, last_rvc=None,
) -> None: ) -> None:
""" """
初始化 初始化
""" """
try: try:
if config.dml == True: if config.dml == True:
def forward_dml(ctx, x, scale): def forward_dml(ctx, x, scale):
ctx.scale = scale ctx.scale = scale
res = x.clone().detach() res = x.clone().detach()
@ -76,13 +75,10 @@ class RVC:
# device="cpu"########强制cpu测试 # device="cpu"########强制cpu测试
self.device = config.device self.device = config.device
self.f0_up_key = key self.f0_up_key = key
self.time_step = 160 / 16000 * 1000
self.f0_min = 50 self.f0_min = 50
self.f0_max = 1100 self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
self.sr = 16000
self.window = 160
self.n_cpu = n_cpu self.n_cpu = n_cpu
self.use_jit = self.config.use_jit self.use_jit = self.config.use_jit
self.is_half = config.is_half self.is_half = config.is_half
@ -94,6 +90,8 @@ class RVC:
self.pth_path: str = pth_path self.pth_path: str = pth_path
self.index_path = index_path self.index_path = index_path
self.index_rate = index_rate self.index_rate = index_rate
self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32")
self.cache_pitchf = np.zeros(1024, dtype="float32")
if last_rvc is None: if last_rvc is None:
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
@ -184,6 +182,7 @@ class RVC:
if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
self.model_rmvpe = last_rvc.model_rmvpe self.model_rmvpe = last_rvc.model_rmvpe
if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): if last_rvc is not None and hasattr(last_rvc, "model_fcpe"):
self.device_fcpe = last_rvc.device_fcpe
self.model_fcpe = last_rvc.model_fcpe self.model_fcpe = last_rvc.model_fcpe
except: except:
printt(traceback.format_exc()) printt(traceback.format_exc())
@ -199,14 +198,10 @@ class RVC:
self.index_rate = new_index_rate self.index_rate = new_index_rate
def get_f0_post(self, f0): def get_f0_post(self, f0):
f0_min = self.f0_min
f0_max = self.f0_max
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0bak = f0.copy() f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min self.f0_mel_max - self.f0_mel_min
) + 1 ) + 1
f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255 f0_mel[f0_mel > 255] = 255
@ -221,6 +216,7 @@ class RVC:
return self.get_f0_rmvpe(x, f0_up_key) return self.get_f0_rmvpe(x, f0_up_key)
if method == "fcpe": if method == "fcpe":
return self.get_f0_fcpe(x, f0_up_key) return self.get_f0_fcpe(x, f0_up_key)
x = x.cpu().numpy()
if method == "pm": if method == "pm":
p_len = x.shape[0] // 160 + 1 p_len = x.shape[0] // 160 + 1
f0_min = 65 f0_min = 65
@ -262,7 +258,7 @@ class RVC:
self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
else: else:
self.inp_q.put( self.inp_q.put(
(idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts)
) )
while 1: while 1:
res_ts = self.opt_q.get() res_ts = self.opt_q.get()
@ -277,20 +273,19 @@ class RVC:
else: else:
f0 = f0[2:] f0 = f0[2:]
f0bak[ f0bak[
part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] part_length * idx // 160: part_length * idx // 160 + f0.shape[0]
] = f0 ] = f0
f0bak = signal.medfilt(f0bak, 3) f0bak = signal.medfilt(f0bak, 3)
f0bak *= pow(2, f0_up_key / 12) f0bak *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0bak) return self.get_f0_post(f0bak)
def get_f0_crepe(self, x, f0_up_key): def get_f0_crepe(self, x, f0_up_key):
if "privateuseone" in str(self.device): ###不支持dmlcpu又太慢用不成拿pm顶替 if "privateuseone" in str(self.device): ###不支持dmlcpu又太慢用不成拿fcpe顶替
return self.get_f0(x, f0_up_key, 1, "pm") return self.get_f0(x, f0_up_key, 1, "fcpe")
audio = torch.tensor(np.copy(x))[None].float()
# printt("using crepe,device:%s"%self.device) # printt("using crepe,device:%s"%self.device)
f0, pd = torchcrepe.predict( f0, pd = torchcrepe.predict(
audio, x.unsqueeze(0).float(),
self.sr, 16000,
160, 160,
self.f0_min, self.f0_min,
self.f0_max, self.f0_max,
@ -313,15 +308,11 @@ class RVC:
printt("Loading rmvpe model") printt("Loading rmvpe model")
self.model_rmvpe = RMVPE( self.model_rmvpe = RMVPE(
# "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑
# "rmvpe.pt", is_half=False, device=self.device####dml配置
# "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置
"assets/rmvpe/rmvpe.pt", "assets/rmvpe/rmvpe.pt",
is_half=self.is_half, is_half=self.is_half,
device=self.device, ####正常逻辑 device=self.device,
use_jit=self.config.use_jit, use_jit=self.config.use_jit,
) )
# self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0) return self.get_f0_post(f0)
@ -329,41 +320,36 @@ class RVC:
def get_f0_fcpe(self, x, f0_up_key): def get_f0_fcpe(self, x, f0_up_key):
if hasattr(self, "model_fcpe") == False: if hasattr(self, "model_fcpe") == False:
from torchfcpe import spawn_bundled_infer_model from torchfcpe import spawn_bundled_infer_model
printt("Loading fcpe model") printt("Loading fcpe model")
self.model_fcpe = spawn_bundled_infer_model(self.device) if "privateuseone" in str(self.device):
f0 = ( self.device_fcpe = "cpu"
self.model_fcpe.infer( else:
torch.from_numpy(x).to(self.device).unsqueeze(0).float(), self.device_fcpe = self.device
sr=16000, self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe)
decoder_mode="local_argmax", f0 = self.model_fcpe.infer(
threshold=0.006, x.to(self.device_fcpe).unsqueeze(0).float(),
) sr=16000,
.squeeze() decoder_mode='local_argmax',
.cpu() threshold=0.006,
.numpy()
) )
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
f0 = f0.squeeze().cpu().numpy()
return self.get_f0_post(f0) return self.get_f0_post(f0)
def infer( def infer(
self, self,
feats: torch.Tensor, input_wav: torch.Tensor,
indata: np.ndarray, block_frame_16k,
block_frame_16k, skip_head,
rate, return_length,
cache_pitch, f0method,
cache_pitchf,
f0method,
) -> np.ndarray: ) -> np.ndarray:
feats = feats.view(1, -1)
if self.config.is_half:
feats = feats.half()
else:
feats = feats.float()
feats = feats.to(self.device)
t1 = ttime() t1 = ttime()
with torch.no_grad(): with torch.no_grad():
if self.config.is_half:
feats = input_wav.half().view(1, -1)
else:
feats = input_wav.float().view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
inputs = { inputs = {
"source": feats, "source": feats,
@ -378,68 +364,68 @@ class RVC:
t2 = ttime() t2 = ttime()
try: try:
if hasattr(self, "index") and self.index_rate != 0: if hasattr(self, "index") and self.index_rate != 0:
leng_replace_head = int(rate * feats[0].shape[0]) npy = feats[0][skip_head // 2:].cpu().numpy().astype("float32")
npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32")
score, ix = self.index.search(npy, k=8) score, ix = self.index.search(npy, k=8)
weight = np.square(1 / score) weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True) weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if self.config.is_half: if self.config.is_half:
npy = npy.astype("float16") npy = npy.astype("float16")
feats[0][-leng_replace_head:] = ( feats[0][skip_head // 2:] = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
+ (1 - self.index_rate) * feats[0][-leng_replace_head:] + (1 - self.index_rate) * feats[0][skip_head // 2:]
) )
else: else:
printt("Index search FAILED or disabled") printt("Index search FAILED or disabled")
except: except:
traceback.print_exc() traceback.print_exc()
printt("Index search FAILED") printt("Index search FAILED")
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t3 = ttime() t3 = ttime()
if self.if_f0 == 1: if self.if_f0 == 1:
pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) f0_extractor_frame = block_frame_16k + 800
if f0method == "rmvpe":
f0_extractor_frame = (
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
)
pitch, pitchf = self.get_f0(input_wav[-f0_extractor_frame: ], self.f0_up_key, self.n_cpu, f0method)
start_frame = block_frame_16k // 160 start_frame = block_frame_16k // 160
end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame
cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) self.cache_pitch[:] = np.append(self.cache_pitch[start_frame: end_frame], pitch[3:-1])
cache_pitchf[:] = np.append( self.cache_pitchf[:] = np.append(
cache_pitchf[start_frame:end_frame], pitchf[3:-1] self.cache_pitchf[start_frame: end_frame], pitchf[3:-1]
) )
p_len = min(feats.shape[1], 13000, cache_pitch.shape[0])
else:
cache_pitch, cache_pitchf = None, None
p_len = min(feats.shape[1], 13000)
t4 = ttime() t4 = ttime()
feats = feats[:, :p_len, :] p_len = input_wav.shape[0] // 160
if self.if_f0 == 1: if self.if_f0 == 1:
cache_pitch = cache_pitch[:p_len] cache_pitch = torch.LongTensor(self.cache_pitch[-p_len: ]).to(self.device).unsqueeze(0)
cache_pitchf = cache_pitchf[:p_len] cache_pitchf = torch.FloatTensor(self.cache_pitchf[-p_len: ]).to(self.device).unsqueeze(0)
cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) feats = feats[:, :p_len, :]
p_len = torch.LongTensor([p_len]).to(self.device) p_len = torch.LongTensor([p_len]).to(self.device)
ii = 0 # sid sid = torch.LongTensor([0]).to(self.device)
sid = torch.LongTensor([ii]).to(self.device) skip_head = torch.LongTensor([skip_head])
return_length = torch.LongTensor([return_length])
with torch.no_grad(): with torch.no_grad():
if self.if_f0 == 1: if self.if_f0 == 1:
# printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) infered_audio, _, _ = self.net_g.infer(
infered_audio = self.net_g.infer(
feats, feats,
p_len, p_len,
cache_pitch, cache_pitch,
cache_pitchf, cache_pitchf,
sid, sid,
torch.FloatTensor([rate]), skip_head,
)[0][0, 0].data.float() return_length,
)
else: else:
infered_audio = self.net_g.infer( infered_audio, _, _ = self.net_g.infer(
feats, p_len, sid, torch.FloatTensor([rate]) feats, p_len, sid, skip_head, return_length
)[0][0, 0].data.float() )
t5 = ttime() t5 = ttime()
printt( printt(
"Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
t2 - t1, t2 - t1,
t3 - t2, t3 - t2,
t4 - t3, t4 - t3,
t5 - t4, t5 - t4,
) )
return infered_audio return infered_audio.squeeze().float()