Add formant shift to the realtime GUI

This commit is contained in:
yxlllc 2024-04-21 23:48:32 +08:00
parent f5bb555b84
commit 8b908ccf6f
4 changed files with 68 additions and 13 deletions

View File

@ -1 +1 @@
{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "formant": 0.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}

View File

@ -114,6 +114,7 @@ if __name__ == "__main__":
self.pth_path: str = "" self.pth_path: str = ""
self.index_path: str = "" self.index_path: str = ""
self.pitch: int = 0 self.pitch: int = 0
self.formant: float = 0.0
self.sr_type: str = "sr_model" self.sr_type: str = "sr_model"
self.block_time: float = 0.25 # s self.block_time: float = 0.25 # s
self.threhold: int = -60 self.threhold: int = -60
@ -212,6 +213,7 @@ if __name__ == "__main__":
"sr_type": "sr_model", "sr_type": "sr_model",
"threhold": -60, "threhold": -60,
"pitch": 0, "pitch": 0,
"formant": 0.0,
"index_rate": 0, "index_rate": 0,
"rms_mix_rate": 0, "rms_mix_rate": 0,
"block_time": 0.25, "block_time": 0.25,
@ -353,6 +355,17 @@ if __name__ == "__main__":
default_value=data.get("pitch", 0), default_value=data.get("pitch", 0),
enable_events=True, enable_events=True,
), ),
],
[
sg.Text(i18n("共振偏移")),
sg.Slider(
range=(-5, 5),
key="formant",
resolution=0.01,
orientation="h",
default_value=data.get("formant", 0.0),
enable_events=True,
),
], ],
[ [
sg.Text(i18n("Index Rate")), sg.Text(i18n("Index Rate")),
@ -579,6 +592,7 @@ if __name__ == "__main__":
], ],
"threhold": values["threhold"], "threhold": values["threhold"],
"pitch": values["pitch"], "pitch": values["pitch"],
"formant": values["formant"],
"rms_mix_rate": values["rms_mix_rate"], "rms_mix_rate": values["rms_mix_rate"],
"index_rate": values["index_rate"], "index_rate": values["index_rate"],
# "device_latency": values["device_latency"], # "device_latency": values["device_latency"],
@ -621,6 +635,10 @@ if __name__ == "__main__":
self.gui_config.pitch = values["pitch"] self.gui_config.pitch = values["pitch"]
if hasattr(self, "rvc"): if hasattr(self, "rvc"):
self.rvc.change_key(values["pitch"]) self.rvc.change_key(values["pitch"])
elif event == "formant":
self.gui_config.formant = values["formant"]
if hasattr(self, "rvc"):
self.rvc.change_formant(values["formant"])
elif event == "index_rate": elif event == "index_rate":
self.gui_config.index_rate = values["index_rate"] self.gui_config.index_rate = values["index_rate"]
if hasattr(self, "rvc"): if hasattr(self, "rvc"):
@ -679,6 +697,7 @@ if __name__ == "__main__":
] ]
self.gui_config.threhold = values["threhold"] self.gui_config.threhold = values["threhold"]
self.gui_config.pitch = values["pitch"] self.gui_config.pitch = values["pitch"]
self.gui_config.formant = values["formant"]
self.gui_config.block_time = values["block_time"] self.gui_config.block_time = values["block_time"]
self.gui_config.crossfade_time = values["crossfade_length"] self.gui_config.crossfade_time = values["crossfade_length"]
self.gui_config.extra_time = values["extra_time"] self.gui_config.extra_time = values["extra_time"]
@ -703,6 +722,7 @@ if __name__ == "__main__":
torch.cuda.empty_cache() torch.cuda.empty_cache()
self.rvc = rtrvc.RVC( self.rvc = rtrvc.RVC(
self.gui_config.pitch, self.gui_config.pitch,
self.gui_config.formant,
self.gui_config.pth_path, self.gui_config.pth_path,
self.gui_config.index_path, self.gui_config.index_path,
self.gui_config.index_rate, self.gui_config.index_rate,

View File

@ -10,7 +10,6 @@ from torch import nn
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from infer.lib.infer_pack import attentions, commons, modules from infer.lib.infer_pack import attentions, commons, modules
from infer.lib.infer_pack.commons import get_padding, init_weights from infer.lib.infer_pack.commons import get_padding, init_weights
@ -250,7 +249,12 @@ class Generator(torch.nn.Module):
if gin_channels != 0: if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None, n_res: Optional[torch.Tensor] = None):
if n_res is not None:
assert isinstance(n_res, torch.Tensor)
n = int(n_res.item())
if n != x.shape[-1]:
x = F.interpolate(x, size=n, mode='linear')
x = self.conv_pre(x) x = self.conv_pre(x)
if g is not None: if g is not None:
x = x + self.cond(g) x = x + self.cond(g)
@ -528,10 +532,17 @@ class GeneratorNSF(torch.nn.Module):
self.upp = math.prod(upsample_rates) self.upp = math.prod(upsample_rates)
self.lrelu_slope = modules.LRELU_SLOPE self.lrelu_slope = modules.LRELU_SLOPE
def forward(self, x, f0, g: Optional[torch.Tensor] = None): def forward(self, x, f0, g: Optional[torch.Tensor] = None, n_res: Optional[torch.Tensor] = None):
har_source, noi_source, uv = self.m_source(f0, self.upp) har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2) har_source = har_source.transpose(1, 2)
if n_res is not None:
assert isinstance(n_res, torch.Tensor)
n = int(n_res.item())
if n * self.upp != har_source.shape[-1]:
har_source = F.interpolate(har_source, size=n*self.upp, mode='linear')
if n != x.shape[-1]:
x = F.interpolate(x, size=n, mode='linear')
x = self.conv_pre(x) x = self.conv_pre(x)
if g is not None: if g is not None:
x = x + self.cond(g) x = x + self.cond(g)
@ -558,6 +569,7 @@ class GeneratorNSF(torch.nn.Module):
x = F.leaky_relu(x) x = F.leaky_relu(x)
x = self.conv_post(x) x = self.conv_post(x)
x = torch.tanh(x) x = torch.tanh(x)
return x return x
def remove_weight_norm(self): def remove_weight_norm(self):
@ -748,6 +760,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sid: torch.Tensor, sid: torch.Tensor,
skip_head: Optional[torch.Tensor] = None, skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None, return_length: Optional[torch.Tensor] = None,
return_length2: Optional[torch.Tensor] = None,
): ):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
if skip_head is not None and return_length is not None: if skip_head is not None and return_length is not None:
@ -767,7 +780,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g) o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -963,6 +976,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
sid: torch.Tensor, sid: torch.Tensor,
skip_head: Optional[torch.Tensor] = None, skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None, return_length: Optional[torch.Tensor] = None,
return_length2: Optional[torch.Tensor] = None,
): ):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
if skip_head is not None and return_length is not None: if skip_head is not None and return_length is not None:
@ -981,7 +995,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g) o = self.dec(z * x_mask, g=g, n_res=return_length2)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -15,6 +15,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import torchcrepe import torchcrepe
from torchaudio.transforms import Resample
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
@ -40,6 +41,7 @@ class RVC:
def __init__( def __init__(
self, self,
key, key,
formant,
pth_path, pth_path,
index_path, index_path,
index_rate, index_rate,
@ -68,6 +70,7 @@ class RVC:
# device="cpu"########强制cpu测试 # device="cpu"########强制cpu测试
self.device = config.device self.device = config.device
self.f0_up_key = key self.f0_up_key = key
self.formant_shift = formant
self.f0_min = 50 self.f0_min = 50
self.f0_max = 1100 self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
@ -75,7 +78,7 @@ class RVC:
self.n_cpu = n_cpu self.n_cpu = n_cpu
self.use_jit = self.config.use_jit self.use_jit = self.config.use_jit
self.is_half = config.is_half self.is_half = config.is_half
if index_rate != 0: if index_rate != 0:
self.index = faiss.read_index(index_path) self.index = faiss.read_index(index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
@ -89,7 +92,9 @@ class RVC:
self.cache_pitchf = torch.zeros( self.cache_pitchf = torch.zeros(
1024, device=self.device, dtype=torch.float32 1024, device=self.device, dtype=torch.float32
) )
self.resample_kernel = {}
if last_rvc is None: if last_rvc is None:
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"], ["assets/hubert/hubert_base.pt"],
@ -186,7 +191,10 @@ class RVC:
def change_key(self, new_key): def change_key(self, new_key):
self.f0_up_key = new_key self.f0_up_key = new_key
def change_formant(self, new_formant):
self.formant_shift = new_formant
def change_index_rate(self, new_index_rate): def change_index_rate(self, new_index_rate):
if new_index_rate != 0 and self.index_rate == 0: if new_index_rate != 0 and self.index_rate == 0:
self.index = faiss.read_index(self.index_path) self.index = faiss.read_index(self.index_path)
@ -198,7 +206,7 @@ class RVC:
if not torch.is_tensor(f0): if not torch.is_tensor(f0):
f0 = torch.from_numpy(f0) f0 = torch.from_numpy(f0)
f0 = f0.float().to(self.device).squeeze() f0 = f0.float().to(self.device).squeeze()
f0_mel = 1127 * torch.log(1 + f0 / 700) f0_mel = 1127 * torch.log(1 + f0 * pow(2, -self.formant_shift / 12) / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
self.f0_mel_max - self.f0_mel_min self.f0_mel_max - self.f0_mel_min
) + 1 ) + 1
@ -410,6 +418,8 @@ class RVC:
p_len = torch.LongTensor([p_len]).to(self.device) p_len = torch.LongTensor([p_len]).to(self.device)
sid = torch.LongTensor([0]).to(self.device) sid = torch.LongTensor([0]).to(self.device)
skip_head = torch.LongTensor([skip_head]) skip_head = torch.LongTensor([skip_head])
factor = pow(2, self.formant_shift / 12)
return_length2 = torch.LongTensor([int(np.ceil(return_length * factor))])
return_length = torch.LongTensor([return_length]) return_length = torch.LongTensor([return_length])
with torch.no_grad(): with torch.no_grad():
if self.if_f0 == 1: if self.if_f0 == 1:
@ -421,11 +431,22 @@ class RVC:
sid, sid,
skip_head, skip_head,
return_length, return_length,
return_length2,
) )
else: else:
infered_audio, _, _ = self.net_g.infer( infered_audio, _, _ = self.net_g.infer(
feats, p_len, sid, skip_head, return_length feats, p_len, sid, skip_head, return_length, return_length2
) )
infered_audio = infered_audio.squeeze(1).float()
upp_res = int(np.floor(factor * self.tgt_sr // 100))
if upp_res != self.tgt_sr // 100:
if upp_res not in self.resample_kernel:
self.resample_kernel[upp_res] = Resample(
orig_freq=upp_res,
new_freq=self.tgt_sr // 100,
dtype=torch.float32,
).to(self.device)
infered_audio = self.resample_kernel[upp_res](infered_audio[: ,: return_length * upp_res])
t5 = ttime() t5 = ttime()
printt( printt(
"Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
@ -434,4 +455,4 @@ class RVC:
t4 - t3, t4 - t3,
t5 - t4, t5 - t4,
) )
return infered_audio.squeeze().float() return infered_audio.squeeze()