Format code (#727)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Authored by github-actions[bot] on 2023-07-13 14:35:24 +08:00; committed by GitHub
parent 6c13f1fe52
commit 9739f3085d
5 changed files with 418 additions and 184 deletions
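
Every hunk in this commit is a pure re-formatting change; the new code matches what black produces with default settings (double-quoted strings, spaces around operators, magic trailing commas, 88-column line wrapping). As a minimal sketch of how one of the one-line changes below is produced (assuming the bot runs black; the exact version and invocation are not shown on this page):

import black  # pip install black

old = 'self.f0method="harvest"\n'
new = black.format_str(old, mode=black.Mode())  # black's default mode
print(new, end="")  # prints: self.f0method = "harvest"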

gui_v1.py (214 lines changed)

@@ -1,29 +1,34 @@
-import os,sys
+import os, sys
 
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 import multiprocessing
 
 
 class Harvest(multiprocessing.Process):
-    def __init__(self,inp_q,opt_q):
+    def __init__(self, inp_q, opt_q):
         multiprocessing.Process.__init__(self)
-        self.inp_q=inp_q
-        self.opt_q=opt_q
+        self.inp_q = inp_q
+        self.opt_q = opt_q
 
     def run(self):
         import numpy as np, pyworld
 
-        while(1):
-            idx, x, res_f0,n_cpu,ts=self.inp_q.get()
-            f0,t=pyworld.harvest(
+        while 1:
+            idx, x, res_f0, n_cpu, ts = self.inp_q.get()
+            f0, t = pyworld.harvest(
                 x.astype(np.double),
                 fs=16000,
                 f0_ceil=1100,
                 f0_floor=50,
                 frame_period=10,
             )
-            res_f0[idx]=f0
-            if(len(res_f0.keys())>=n_cpu):
+            res_f0[idx] = f0
+            if len(res_f0.keys()) >= n_cpu:
                 self.opt_q.put(ts)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     from multiprocessing import Queue
     from queue import Empty
     import numpy as np

@@ -43,11 +48,12 @@ if __name__ == '__main__':
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     current_dir = os.getcwd()
     inp_q = Queue()
-    opt_q=Queue()
-    n_cpu=min(cpu_count(),8)
+    opt_q = Queue()
+    n_cpu = min(cpu_count(), 8)
     for _ in range(n_cpu):
-        Harvest(inp_q,opt_q).start()
+        Harvest(inp_q, opt_q).start()
     from rvc_for_realtime import RVC
 
     class GUIConfig:
         def __init__(self) -> None:
             self.pth_path: str = ""

@@ -62,9 +68,8 @@ if __name__ == '__main__':
             self.I_noise_reduce = False
             self.O_noise_reduce = False
             self.index_rate = 0.3
-            self.n_cpu=min(n_cpu,8)
-            self.f0method="harvest"
+            self.n_cpu = min(n_cpu, 8)
+            self.f0method = "harvest"
 
     class GUI:
         def __init__(self) -> None:

@@ -78,10 +83,10 @@ if __name__ == '__main__':
             try:
                 with open("values1.json", "r") as j:
                     data = json.load(j)
-                    data["pm"]=data["f0method"]=="pm"
-                    data["harvest"]=data["f0method"]=="harvest"
-                    data["crepe"]=data["f0method"]=="crepe"
-                    data["rmvpe"]=data["f0method"]=="rmvpe"
+                    data["pm"] = data["f0method"] == "pm"
+                    data["harvest"] = data["f0method"] == "harvest"
+                    data["crepe"] = data["f0method"] == "crepe"
+                    data["rmvpe"] = data["f0method"] == "rmvpe"
             except:
                 with open("values1.json", "w") as j:
                     data = {

@@ -191,10 +196,30 @@ if __name__ == '__main__':
                     ],
                     [
                         sg.Text(i18n("音高算法")),
-                        sg.Radio("pm","f0method",key="pm",default=data.get("pm","")==True),
-                        sg.Radio("harvest","f0method",key="harvest",default=data.get("harvest","")==True),
-                        sg.Radio("crepe","f0method",key="crepe",default=data.get("crepe","")==True),
-                        sg.Radio("rmvpe","f0method",key="rmvpe",default=data.get("rmvpe","")==True),
+                        sg.Radio(
+                            "pm",
+                            "f0method",
+                            key="pm",
+                            default=data.get("pm", "") == True,
+                        ),
+                        sg.Radio(
+                            "harvest",
+                            "f0method",
+                            key="harvest",
+                            default=data.get("harvest", "") == True,
+                        ),
+                        sg.Radio(
+                            "crepe",
+                            "f0method",
+                            key="crepe",
+                            default=data.get("crepe", "") == True,
+                        ),
+                        sg.Radio(
+                            "rmvpe",
+                            "f0method",
+                            key="rmvpe",
+                            default=data.get("rmvpe", "") == True,
+                        ),
                     ],
                 ],
                 title=i18n("常规设置"),

@@ -218,7 +243,9 @@ if __name__ == '__main__':
                             key="n_cpu",
                             resolution=1,
                             orientation="h",
-                            default_value=data.get("n_cpu", min(self.config.n_cpu,n_cpu)),
+                            default_value=data.get(
+                                "n_cpu", min(self.config.n_cpu, n_cpu)
+                            ),
                         ),
                     ],
                     [

@@ -281,7 +308,14 @@ if __name__ == '__main__':
                 "crossfade_length": values["crossfade_length"],
                 "extra_time": values["extra_time"],
                 "n_cpu": values["n_cpu"],
-                "f0method": ["pm","harvest","crepe","rmvpe"][[values["pm"],values["harvest"],values["crepe"],values["rmvpe"]].index(True)],
+                "f0method": ["pm", "harvest", "crepe", "rmvpe"][
+                    [
+                        values["pm"],
+                        values["harvest"],
+                        values["crepe"],
+                        values["rmvpe"],
+                    ].index(True)
+                ],
             }
             with open("values1.json", "w") as j:
                 json.dump(settings, j)

@@ -314,7 +348,14 @@ if __name__ == '__main__':
            self.config.O_noise_reduce = values["O_noise_reduce"]
            self.config.index_rate = values["index_rate"]
            self.config.n_cpu = values["n_cpu"]
-           self.config.f0method = ["pm","harvest","crepe","rmvpe"][[values["pm"],values["harvest"],values["crepe"],values["rmvpe"]].index(True)]
+           self.config.f0method = ["pm", "harvest", "crepe", "rmvpe"][
+               [
+                   values["pm"],
+                   values["harvest"],
+                   values["crepe"],
+                   values["rmvpe"],
+               ].index(True)
+           ]
            return True
 
        def start_vc(self):

@@ -325,20 +366,64 @@ if __name__ == '__main__':
                 self.config.pth_path,
                 self.config.index_path,
                 self.config.index_rate,
-                self.config.n_cpu,inp_q,opt_q,device
+                self.config.n_cpu,
+                inp_q,
+                opt_q,
+                device,
             )
-            self.config.samplerate=self.rvc.tgt_sr
-            self.config.crossfade_time=min(self.config.crossfade_time,self.config.block_time)
+            self.config.samplerate = self.rvc.tgt_sr
+            self.config.crossfade_time = min(
+                self.config.crossfade_time, self.config.block_time
+            )
             self.block_frame = int(self.config.block_time * self.config.samplerate)
-            self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate)
+            self.crossfade_frame = int(
+                self.config.crossfade_time * self.config.samplerate
+            )
             self.sola_search_frame = int(0.01 * self.config.samplerate)
             self.extra_frame = int(self.config.extra_time * self.config.samplerate)
-            self.zc=self.rvc.tgt_sr//100
-            self.input_wav: np.ndarray = np.zeros(int(np.ceil((self.extra_frame+ self.crossfade_frame+ self.sola_search_frame+ self.block_frame)/self.zc)*self.zc),dtype="float32",)
-            self.output_wav_cache: torch.Tensor = torch.zeros(int(np.ceil((self.extra_frame+ self.crossfade_frame+ self.sola_search_frame+ self.block_frame)/self.zc)*self.zc), device=device,dtype=torch.float32)
-            self.pitch: np.ndarray = np.zeros(self.input_wav.shape[0]//self.zc,dtype="int32",)
-            self.pitchf: np.ndarray = np.zeros(self.input_wav.shape[0]//self.zc,dtype="float64",)
-            self.output_wav: torch.Tensor = torch.zeros(self.block_frame, device=device, dtype=torch.float32)
+            self.zc = self.rvc.tgt_sr // 100
+            self.input_wav: np.ndarray = np.zeros(
+                int(
+                    np.ceil(
+                        (
+                            self.extra_frame
+                            + self.crossfade_frame
+                            + self.sola_search_frame
+                            + self.block_frame
+                        )
+                        / self.zc
+                    )
+                    * self.zc
+                ),
+                dtype="float32",
+            )
+            self.output_wav_cache: torch.Tensor = torch.zeros(
+                int(
+                    np.ceil(
+                        (
+                            self.extra_frame
+                            + self.crossfade_frame
+                            + self.sola_search_frame
+                            + self.block_frame
+                        )
+                        / self.zc
+                    )
+                    * self.zc
+                ),
+                device=device,
+                dtype=torch.float32,
+            )
+            self.pitch: np.ndarray = np.zeros(
+                self.input_wav.shape[0] // self.zc,
+                dtype="int32",
+            )
+            self.pitchf: np.ndarray = np.zeros(
+                self.input_wav.shape[0] // self.zc,
+                dtype="float64",
+            )
+            self.output_wav: torch.Tensor = torch.zeros(
+                self.block_frame, device=device, dtype=torch.float32
+            )
             self.sola_buffer: torch.Tensor = torch.zeros(
                 self.crossfade_frame, device=device, dtype=torch.float32
             )

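The buffer sizing above rounds the total context length up to a whole number of 10 ms frames: self.zc = tgt_sr // 100 is the samples-per-frame unit, and np.ceil(total / zc) * zc pads up to the next multiple. A quick numeric check with made-up numbers:

import numpy as np

tgt_sr = 40000
zc = tgt_sr // 100                      # 400 samples per 10 ms frame
total = 141234                          # extra + crossfade + search + block (illustrative)
padded = int(np.ceil(total / zc) * zc)  # round up to a whole frame
print(zc, padded)  # 400 141600
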
@@ -384,22 +469,46 @@ if __name__ == '__main__':
             rms = librosa.feature.rms(
                 y=indata, frame_length=frame_length, hop_length=hop_length
             )
-            if(self.config.threhold>-60):
-                db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
+            if self.config.threhold > -60:
+                db_threhold = (
+                    librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
+                )
                 for i in range(db_threhold.shape[0]):
                     if db_threhold[i]:
                         indata[i * hop_length : (i + 1) * hop_length] = 0
             self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata)
             # infer
-            inp=torch.from_numpy(self.input_wav).to(device)
+            inp = torch.from_numpy(self.input_wav).to(device)
             ##0
-            res1=self.resampler(inp)
+            res1 = self.resampler(inp)
             ###55%
-            rate1=self.block_frame/(self.extra_frame+ self.crossfade_frame+ self.sola_search_frame+ self.block_frame)
-            rate2=(self.crossfade_frame + self.sola_search_frame + self.block_frame)/(self.extra_frame+ self.crossfade_frame+ self.sola_search_frame+ self.block_frame)
-            res2=self.rvc.infer(res1,res1[-self.block_frame:].cpu().numpy(),rate1,rate2,self.pitch,self.pitchf,self.config.f0method)
-            self.output_wav_cache[-res2.shape[0]:]=res2
-            infer_wav = self.output_wav_cache[-self.crossfade_frame - self.sola_search_frame - self.block_frame :]
+            rate1 = self.block_frame / (
+                self.extra_frame
+                + self.crossfade_frame
+                + self.sola_search_frame
+                + self.block_frame
+            )
+            rate2 = (
+                self.crossfade_frame + self.sola_search_frame + self.block_frame
+            ) / (
+                self.extra_frame
+                + self.crossfade_frame
+                + self.sola_search_frame
+                + self.block_frame
+            )
+            res2 = self.rvc.infer(
+                res1,
+                res1[-self.block_frame :].cpu().numpy(),
+                rate1,
+                rate2,
+                self.pitch,
+                self.pitchf,
+                self.config.f0method,
+            )
+            self.output_wav_cache[-res2.shape[0] :] = res2
+            infer_wav = self.output_wav_cache[
+                -self.crossfade_frame - self.sola_search_frame - self.block_frame :
+            ]
             # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
             cor_nom = F.conv1d(
                 infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame],

@@ -407,7 +516,9 @@ if __name__ == '__main__':
             )
             cor_den = torch.sqrt(
                 F.conv1d(
-                    infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
+                    infer_wav[
+                        None, None, : self.crossfade_frame + self.sola_search_frame
+                    ]
                     ** 2,
                     torch.ones(1, 1, self.crossfade_frame, device=device),
                 )

@@ -491,12 +602,15 @@ if __name__ == '__main__':
                 input_device_indices,
                 output_device_indices,
             ) = self.get_devices()
-            sd.default.device[0] = input_device_indices[input_devices.index(input_device)]
+            sd.default.device[0] = input_device_indices[
+                input_devices.index(input_device)
+            ]
             sd.default.device[1] = output_device_indices[
                 output_devices.index(output_device)
             ]
             print("input device:" + str(sd.default.device[0]) + ":" + str(input_device))
-            print("output device:" + str(sd.default.device[1]) + ":" + str(output_device))
+            print(
+                "output device:" + str(sd.default.device[1]) + ":" + str(output_device)
+            )
 
     gui = GUI()
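The SOLA block above (borrowed from DDSP-SVC) searches the new inference output for the offset that best aligns with the stored tail of the previous block, then crossfades at that offset to avoid clicks. A self-contained sketch of the same idea; the names mirror the diff, but the toy sizes and the explicit crossfade at the end are illustrative assumptions, not the project's exact code:

import torch
import torch.nn.functional as F

crossfade_frame, sola_search_frame = 320, 160
infer_wav = torch.randn(crossfade_frame + sola_search_frame + 1024)  # new block
sola_buffer = torch.randn(crossfade_frame)  # tail of the previous block

# Normalized cross-correlation over the search window.
cor_nom = F.conv1d(
    infer_wav[None, None, : crossfade_frame + sola_search_frame],
    sola_buffer[None, None, :],
)
cor_den = torch.sqrt(
    F.conv1d(
        infer_wav[None, None, : crossfade_frame + sola_search_frame] ** 2,
        torch.ones(1, 1, crossfade_frame),
    )
    + 1e-8
)
sola_offset = int(torch.argmax(cor_nom[0, 0] / cor_den[0, 0]))

# Crossfade the aligned head of the new block against the stored tail.
fade_in = torch.linspace(0.0, 1.0, crossfade_frame)
head = infer_wav[sola_offset : sola_offset + crossfade_frame]
smoothed = head * fade_in + sola_buffer * (1.0 - fade_in)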


@@ -635,11 +635,11 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if(rate):
-            head=int(z_p.shape[2]*rate)
-            z_p=z_p[:,:,-head:]
-            x_mask=x_mask[:,:,-head:]
-            nsff0=nsff0[:,-head:]
+        if rate:
+            head = int(z_p.shape[2] * rate)
+            z_p = z_p[:, :, -head:]
+            x_mask = x_mask[:, :, -head:]
+            nsff0 = nsff0[:, -head:]
         z = self.flow(z_p, x_mask, g=g, reverse=True)
         o = self.dec(z * x_mask, nsff0, g=g)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -751,11 +751,11 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if(rate):
-            head=int(z_p.shape[2]*rate)
-            z_p=z_p[:,:,-head:]
-            x_mask=x_mask[:,:,-head:]
-            nsff0=nsff0[:,-head:]
+        if rate:
+            head = int(z_p.shape[2] * rate)
+            z_p = z_p[:, :, -head:]
+            x_mask = x_mask[:, :, -head:]
+            nsff0 = nsff0[:, -head:]
         z = self.flow(z_p, x_mask, g=g, reverse=True)
         o = self.dec(z * x_mask, nsff0, g=g)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -858,10 +858,10 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if(rate):
-            head=int(z_p.shape[2]*rate)
-            z_p=z_p[:,:,-head:]
-            x_mask=x_mask[:,:,-head:]
+        if rate:
+            head = int(z_p.shape[2] * rate)
+            z_p = z_p[:, :, -head:]
+            x_mask = x_mask[:, :, -head:]
         z = self.flow(z_p, x_mask, g=g, reverse=True)
         o = self.dec(z * x_mask, g=g)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -964,10 +964,10 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if(rate):
-            head=int(z_p.shape[2]*rate)
-            z_p=z_p[:,:,-head:]
-            x_mask=x_mask[:,:,-head:]
+        if rate:
+            head = int(z_p.shape[2] * rate)
+            z_p = z_p[:, :, -head:]
+            x_mask = x_mask[:, :, -head:]
         z = self.flow(z_p, x_mask, g=g, reverse=True)
         o = self.dec(z * x_mask, g=g)
         return o, x_mask, (z, z_p, m_p, logs_p)
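
All four synthesizers gain the same realtime guard: when rate is nonzero, only the trailing fraction of the latent frames is decoded, since earlier frames serve only as context. The negative slice keeps the newest frames; a toy illustration with made-up sizes:

import torch

z_p = torch.randn(1, 192, 100)  # (batch, channels, frames); sizes are illustrative
rate = 0.5
head = int(z_p.shape[2] * rate)
z_p = z_p[:, :, -head:]  # keep only the newest 50 frames
print(z_p.shape)  # torch.Size([1, 192, 50])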

rmvpe.py (244 lines changed)

@@ -1,34 +1,46 @@
-import sys,torch,numpy as np,traceback,pdb
+import sys, torch, numpy as np, traceback, pdb
 import torch.nn as nn
 from time import time as ttime
 import torch.nn.functional as F
 
 
 class BiGRU(nn.Module):
     def __init__(self, input_features, hidden_features, num_layers):
         super(BiGRU, self).__init__()
-        self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)
+        self.gru = nn.GRU(
+            input_features,
+            hidden_features,
+            num_layers=num_layers,
+            batch_first=True,
+            bidirectional=True,
+        )
 
     def forward(self, x):
         return self.gru(x)[0]
 
 
 class ConvBlockRes(nn.Module):
     def __init__(self, in_channels, out_channels, momentum=0.01):
         super(ConvBlockRes, self).__init__()
         self.conv = nn.Sequential(
-            nn.Conv2d(in_channels=in_channels,
-                      out_channels=out_channels,
-                      kernel_size=(3, 3),
-                      stride=(1, 1),
-                      padding=(1, 1),
-                      bias=False),
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=(1, 1),
+                padding=(1, 1),
+                bias=False,
+            ),
             nn.BatchNorm2d(out_channels, momentum=momentum),
             nn.ReLU(),
-            nn.Conv2d(in_channels=out_channels,
-                      out_channels=out_channels,
-                      kernel_size=(3, 3),
-                      stride=(1, 1),
-                      padding=(1, 1),
-                      bias=False),
+            nn.Conv2d(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=(1, 1),
+                padding=(1, 1),
+                bias=False,
+            ),
             nn.BatchNorm2d(out_channels, momentum=momentum),
             nn.ReLU(),
         )

@@ -44,15 +56,29 @@ class ConvBlockRes(nn.Module):
         else:
             return self.conv(x) + x
 
 
 class Encoder(nn.Module):
-    def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
+    def __init__(
+        self,
+        in_channels,
+        in_size,
+        n_encoders,
+        kernel_size,
+        n_blocks,
+        out_channels=16,
+        momentum=0.01,
+    ):
         super(Encoder, self).__init__()
         self.n_encoders = n_encoders
         self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
         self.layers = nn.ModuleList()
         self.latent_channels = []
         for i in range(self.n_encoders):
-            self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
+            self.layers.append(
+                ResEncoderBlock(
+                    in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
+                )
+            )
             self.latent_channels.append([out_channels, in_size])
             in_channels = out_channels
             out_channels *= 2

@@ -67,8 +93,12 @@ class Encoder(nn.Module):
             _, x = self.layers[i](x)
             concat_tensors.append(_)
         return x, concat_tensors
 
 
 class ResEncoderBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
+    def __init__(
+        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
+    ):
         super(ResEncoderBlock, self).__init__()
         self.n_blocks = n_blocks
         self.conv = nn.ModuleList()

@@ -86,38 +116,48 @@ class ResEncoderBlock(nn.Module):
             return x, self.pool(x)
         else:
             return x
 
 
-class Intermediate(nn.Module):#
+class Intermediate(nn.Module):  #
     def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
         super(Intermediate, self).__init__()
         self.n_inters = n_inters
         self.layers = nn.ModuleList()
-        self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
-        for i in range(self.n_inters-1):
-            self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))
+        self.layers.append(
+            ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
+        )
+        for i in range(self.n_inters - 1):
+            self.layers.append(
+                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
+            )
 
     def forward(self, x):
         for i in range(self.n_inters):
             x = self.layers[i](x)
         return x
 
 
 class ResDecoderBlock(nn.Module):
     def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
         super(ResDecoderBlock, self).__init__()
         out_padding = (0, 1) if stride == (1, 2) else (1, 1)
         self.n_blocks = n_blocks
         self.conv1 = nn.Sequential(
-            nn.ConvTranspose2d(in_channels=in_channels,
-                               out_channels=out_channels,
-                               kernel_size=(3, 3),
-                               stride=stride,
-                               padding=(1, 1),
-                               output_padding=out_padding,
-                               bias=False),
+            nn.ConvTranspose2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=stride,
+                padding=(1, 1),
+                output_padding=out_padding,
+                bias=False,
+            ),
             nn.BatchNorm2d(out_channels, momentum=momentum),
             nn.ReLU(),
         )
         self.conv2 = nn.ModuleList()
         self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
-        for i in range(n_blocks-1):
+        for i in range(n_blocks - 1):
             self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
 
     def forward(self, x, concat_tensor):

@@ -126,6 +166,8 @@ class ResDecoderBlock(nn.Module):
         for i in range(self.n_blocks):
             x = self.conv2[i](x)
         return x
 
 
 class Decoder(nn.Module):
     def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
         super(Decoder, self).__init__()

@@ -133,20 +175,40 @@ class Decoder(nn.Module):
         self.n_decoders = n_decoders
         for i in range(self.n_decoders):
             out_channels = in_channels // 2
-            self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
+            self.layers.append(
+                ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
+            )
             in_channels = out_channels
 
     def forward(self, x, concat_tensors):
         for i in range(self.n_decoders):
-            x = self.layers[i](x, concat_tensors[-1-i])
+            x = self.layers[i](x, concat_tensors[-1 - i])
         return x
 
 
 class DeepUnet(nn.Module):
-    def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
+    def __init__(
+        self,
+        kernel_size,
+        n_blocks,
+        en_de_layers=5,
+        inter_layers=4,
+        in_channels=1,
+        en_out_channels=16,
+    ):
         super(DeepUnet, self).__init__()
-        self.encoder = Encoder(in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels)
-        self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks)
-        self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)
+        self.encoder = Encoder(
+            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
+        )
+        self.intermediate = Intermediate(
+            self.encoder.out_channel // 2,
+            self.encoder.out_channel,
+            inter_layers,
+            n_blocks,
+        )
+        self.decoder = Decoder(
+            self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
+        )
 
     def forward(self, x):
         x, concat_tensors = self.encoder(x)

@@ -154,24 +216,38 @@ class DeepUnet(nn.Module):
         x = self.decoder(x, concat_tensors)
         return x
 
 
 class E2E(nn.Module):
-    def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1,
-                 en_out_channels=16):
+    def __init__(
+        self,
+        n_blocks,
+        n_gru,
+        kernel_size,
+        en_de_layers=5,
+        inter_layers=4,
+        in_channels=1,
+        en_out_channels=16,
+    ):
         super(E2E, self).__init__()
-        self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
+        self.unet = DeepUnet(
+            kernel_size,
+            n_blocks,
+            en_de_layers,
+            inter_layers,
+            in_channels,
+            en_out_channels,
+        )
         self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
         if n_gru:
             self.fc = nn.Sequential(
                 BiGRU(3 * 128, 256, n_gru),
                 nn.Linear(512, 360),
                 nn.Dropout(0.25),
-                nn.Sigmoid()
+                nn.Sigmoid(),
             )
         else:
             self.fc = nn.Sequential(
-                nn.Linear(3 * N_MELS, N_CLASS),
-                nn.Dropout(0.25),
-                nn.Sigmoid()
+                nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
             )
 
     def forward(self, mel):

@@ -179,19 +255,23 @@ class E2E(nn.Module):
         x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
         x = self.fc(x)
         return x
 
 
 from librosa.filters import mel
 
 
 class MelSpectrogram(torch.nn.Module):
     def __init__(
         self,
         is_half,
         n_mel_channels,
         sampling_rate,
         win_length,
         hop_length,
         n_fft=None,
         mel_fmin=0,
         mel_fmax=None,
-        clamp=1e-5
+        clamp=1e-5,
     ):
         super().__init__()
         n_fft = win_length if n_fft is None else n_fft

@@ -202,7 +282,8 @@ class MelSpectrogram(torch.nn.Module):
             n_mels=n_mel_channels,
             fmin=mel_fmin,
             fmax=mel_fmax,
-            htk=True)
+            htk=True,
+        )
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer("mel_basis", mel_basis)
         self.n_fft = win_length if n_fft is None else n_fft

@@ -211,16 +292,18 @@ class MelSpectrogram(torch.nn.Module):
         self.sampling_rate = sampling_rate
         self.n_mel_channels = n_mel_channels
         self.clamp = clamp
-        self.is_half=is_half
+        self.is_half = is_half
 
     def forward(self, audio, keyshift=0, speed=1, center=True):
         factor = 2 ** (keyshift / 12)
         n_fft_new = int(np.round(self.n_fft * factor))
         win_length_new = int(np.round(self.win_length * factor))
         hop_length_new = int(np.round(self.hop_length * speed))
-        keyshift_key = str(keyshift) + '_' + str(audio.device)
+        keyshift_key = str(keyshift) + "_" + str(audio.device)
         if keyshift_key not in self.hann_window:
-            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device)
+            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
+                audio.device
+            )
         fft = torch.stft(
             audio,
             n_fft=n_fft_new,

@@ -228,51 +311,57 @@ class MelSpectrogram(torch.nn.Module):
             win_length=win_length_new,
             window=self.hann_window[keyshift_key],
             center=center,
-            return_complex=True)
+            return_complex=True,
+        )
         magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
         if keyshift != 0:
             size = self.n_fft // 2 + 1
             resize = magnitude.size(1)
             if resize < size:
                 magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
-            magnitude = magnitude[:, :size, :]* self.win_length / win_length_new
+            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
         mel_output = torch.matmul(self.mel_basis, magnitude)
-        if(self.is_half==True):mel_output=mel_output.half()
+        if self.is_half == True:
+            mel_output = mel_output.half()
         log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
         return log_mel_spec
 
 
 class RMVPE:
-    def __init__(self, model_path,is_half, device=None):
+    def __init__(self, model_path, is_half, device=None):
         self.resample_kernel = {}
         model = E2E(4, 1, (2, 2))
-        ckpt = torch.load(model_path,map_location="cpu")
+        ckpt = torch.load(model_path, map_location="cpu")
         model.load_state_dict(ckpt)
         model.eval()
-        if(is_half==True):model=model.half()
+        if is_half == True:
+            model = model.half()
         self.model = model
         self.resample_kernel = {}
-        self.is_half=is_half
+        self.is_half = is_half
         if device is None:
-            device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        self.device=device
-        self.mel_extractor = MelSpectrogram(is_half,128, 16000, 1024, 160, None, 30, 8000).to(device)
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = device
+        self.mel_extractor = MelSpectrogram(
+            is_half, 128, 16000, 1024, 160, None, 30, 8000
+        ).to(device)
         self.model = self.model.to(device)
-        cents_mapping = (20 * np.arange(360) + 1997.3794084376191)
+        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
         self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368
 
     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
+            mel = F.pad(
+                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
+            )
             hidden = self.model(mel)
             return hidden[:, :n_frames]
 
     def decode(self, hidden, thred=0.03):
         cents_pred = self.to_local_average_cents(hidden, thred=thred)
         f0 = 10 * (2 ** (cents_pred / 1200))
-        f0[f0==10]=0
+        f0[f0 == 10] = 0
         # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
         return f0

@@ -286,15 +375,16 @@ class RMVPE:
         hidden = self.mel2hidden(mel)
         # torch.cuda.synchronize()
         # t2=ttime()
-        hidden=hidden.squeeze(0).cpu().numpy()
-        if(self.is_half==True):hidden=hidden.astype("float32")
+        hidden = hidden.squeeze(0).cpu().numpy()
+        if self.is_half == True:
+            hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         # torch.cuda.synchronize()
         # t3=ttime()
         # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
         return f0
 
-    def to_local_average_cents(self,salience, thred=0.05):
+    def to_local_average_cents(self, salience, thred=0.05):
         # t0 = ttime()
         center = np.argmax(salience, axis=1)  # peak index per frame
         salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368)

@@ -305,8 +395,8 @@ class RMVPE:
         starts = center - 4
         ends = center + 5
         for idx in range(salience.shape[0]):
-            todo_salience.append(salience[:, starts[idx]:ends[idx]][idx])
-            todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
+            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
+            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
         # t2 = ttime()
         todo_salience = np.array(todo_salience)  # (n_frames, 9)
         todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)

@@ -321,8 +411,6 @@ class RMVPE:
         return devided
 
 
 # if __name__ == '__main__':
 #     audio, sampling_rate = sf.read("卢本伟语录~1.wav")
 #     if len(audio.shape) > 1:
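
RMVPE's decode step maps predicted cents back to Hz via f0 = 10 * 2 ** (cents / 1200), so cents == 0 lands exactly on 10 Hz and is treated as unvoiced and zeroed. A quick numeric check of that mapping:

import numpy as np

cents = np.array([0.0, 1997.3794084376191, 5700.0])  # unvoiced, first bin, arbitrary bin
f0 = 10 * (2 ** (cents / 1200))
f0[f0 == 10] = 0
print(f0)  # approximately [0.0, 31.70, 269.09] Hz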


@@ -1,4 +1,4 @@
-import faiss,torch,traceback,parselmouth,numpy as np,torchcrepe,torch.nn as nn,pyworld
+import faiss, torch, traceback, parselmouth, numpy as np, torchcrepe, torch.nn as nn, pyworld
 from fairseq import checkpoint_utils
 from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,

@@ -6,29 +6,32 @@ from lib.infer_pack.models import (
     SynthesizerTrnMs768NSFsid,
     SynthesizerTrnMs768NSFsid_nono,
 )
-import os,sys
+import os, sys
 from time import time as ttime
 import torch.nn.functional as F
 import scipy.signal as signal
 
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 from config import Config
 from multiprocessing import Manager as M
 
 mm = M()
 config = Config()
 
 
 class RVC:
     def __init__(
-        self, key, pth_path, index_path, index_rate, n_cpu,inp_q,opt_q,device
+        self, key, pth_path, index_path, index_rate, n_cpu, inp_q, opt_q, device
     ) -> None:
         """
         Initialize.
         """
         try:
             global config
-            self.inp_q=inp_q
-            self.opt_q=opt_q
-            self.device=device
+            self.inp_q = inp_q
+            self.opt_q = opt_q
+            self.device = device
             self.f0_up_key = key
             self.time_step = 160 / 16000 * 1000
             self.f0_min = 50

@@ -81,7 +84,7 @@ class RVC:
                 self.net_g = self.net_g.half()
             else:
                 self.net_g = self.net_g.float()
-            self.is_half=config.is_half
+            self.is_half = config.is_half
         except:
             print(traceback.format_exc())

@@ -102,29 +105,33 @@ class RVC:
     def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
         n_cpu = int(n_cpu)
-        if (method == "crepe"): return self.get_f0_crepe(x, f0_up_key)
-        if (method == "rmvpe"): return self.get_f0_rmvpe(x, f0_up_key)
-        if (method == "pm"):
+        if method == "crepe":
+            return self.get_f0_crepe(x, f0_up_key)
+        if method == "rmvpe":
+            return self.get_f0_rmvpe(x, f0_up_key)
+        if method == "pm":
             p_len = x.shape[0] // 160
             f0 = (
                 parselmouth.Sound(x, 16000)
                 .to_pitch_ac(
                     time_step=0.01,
                     voicing_threshold=0.6,
                     pitch_floor=50,
                     pitch_ceiling=1100,
                 )
                 .selected_array["frequency"]
             )
             pad_size = (p_len - len(f0) + 1) // 2
             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                 print(pad_size, p_len - len(f0) - pad_size)
-                f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
             f0 *= pow(2, f0_up_key / 12)
             return self.get_f0_post(f0)
-        if (n_cpu == 1):
+        if n_cpu == 1:
             f0, t = pyworld.harvest(
                 x.astype(np.double),
                 fs=16000,

@@ -142,23 +149,27 @@ class RVC:
         res_f0 = mm.dict()
         for idx in range(n_cpu):
             tail = part_length * (idx + 1) + 320
-            if (idx == 0):
+            if idx == 0:
                 self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
             else:
-                self.inp_q.put((idx, x[part_length * idx - 320:tail], res_f0, n_cpu, ts))
-        while (1):
+                self.inp_q.put(
+                    (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
+                )
+        while 1:
             res_ts = self.opt_q.get()
-            if (res_ts == ts):
+            if res_ts == ts:
                 break
         f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])]
         for idx, f0 in enumerate(f0s):
-            if (idx == 0):
+            if idx == 0:
                 f0 = f0[:-3]
-            elif (idx != n_cpu - 1):
+            elif idx != n_cpu - 1:
                 f0 = f0[2:-3]
             else:
                 f0 = f0[2:-1]
-            f0bak[part_length * idx // 160:part_length * idx // 160 + f0.shape[0]] = f0
+            f0bak[
+                part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]
+            ] = f0
         f0bak = signal.medfilt(f0bak, 3)
         f0bak *= pow(2, f0_up_key / 12)
         return self.get_f0_post(f0bak)

@@ -184,16 +195,28 @@ class RVC:
         return self.get_f0_post(f0)
 
     def get_f0_rmvpe(self, x, f0_up_key):
-        if (hasattr(self, "model_rmvpe") == False):
+        if hasattr(self, "model_rmvpe") == False:
             from rmvpe import RMVPE
 
             print("loading rmvpe model")
-            self.model_rmvpe = RMVPE("rmvpe.pt", is_half=self.is_half, device=self.device)
+            self.model_rmvpe = RMVPE(
+                "rmvpe.pt", is_half=self.is_half, device=self.device
+            )
         # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device)
         f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         f0 *= pow(2, f0_up_key / 12)
         return self.get_f0_post(f0)
 
-    def infer(self, feats: torch.Tensor, indata: np.ndarray, rate1, rate2, cache_pitch, cache_pitchf, f0method) -> np.ndarray:
+    def infer(
+        self,
+        feats: torch.Tensor,
+        indata: np.ndarray,
+        rate1,
+        rate2,
+        cache_pitch,
+        cache_pitchf,
+        f0method,
+    ) -> np.ndarray:
         feats = feats.view(1, -1)
         if config.is_half:
             feats = feats.half()

@@ -209,13 +232,12 @@ class RVC:
             "output_layer": 9 if self.version == "v1" else 12,
         }
         logits = self.model.extract_features(**inputs)
-        feats = self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
+        feats = (
+            self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
+        )
         t2 = ttime()
         try:
-            if (
-                hasattr(self, "index")
-                and self.index_rate != 0
-            ):
+            if hasattr(self, "index") and self.index_rate != 0:
                 leng_replace_head = int(rate1 * feats[0].shape[0])
                 npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32")
                 score, ix = self.index.search(npy, k=8)

@@ -237,8 +259,10 @@ class RVC:
         t3 = ttime()
         if self.if_f0 == 1:
             pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method)
-            cache_pitch[:] = np.append(cache_pitch[pitch[:-1].shape[0]:], pitch[:-1])
-            cache_pitchf[:] = np.append(cache_pitchf[pitchf[:-1].shape[0]:], pitchf[:-1])
+            cache_pitch[:] = np.append(cache_pitch[pitch[:-1].shape[0] :], pitch[:-1])
+            cache_pitchf[:] = np.append(
+                cache_pitchf[pitchf[:-1].shape[0] :], pitchf[:-1]
+            )
             p_len = min(feats.shape[1], 13000, cache_pitch.shape[0])
         else:
             cache_pitch, cache_pitchf = None, None

@@ -256,13 +280,17 @@ class RVC:
         with torch.no_grad():
             if self.if_f0 == 1:
                 infered_audio = (
-                    self.net_g.infer(feats, p_len, cache_pitch, cache_pitchf, sid, rate2)[0][0, 0]
+                    self.net_g.infer(
+                        feats, p_len, cache_pitch, cache_pitchf, sid, rate2
+                    )[0][0, 0]
                     .data.cpu()
                     .float()
                 )
             else:
                 infered_audio = (
-                    self.net_g.infer(feats, p_len, sid, rate2)[0][0, 0].data.cpu().float()
+                    self.net_g.infer(feats, p_len, sid, rate2)[0][0, 0]
+                    .data.cpu()
+                    .float()
                 )
         t5 = ttime()
         print("time->fea-index-f0-model:", t2 - t1, t3 - t2, t4 - t3, t5 - t4)


@@ -1,10 +1,11 @@
-import numpy as np, parselmouth, torch, pdb,sys,os
+import numpy as np, parselmouth, torch, pdb, sys, os
 from time import time as ttime
 import torch.nn.functional as F
 import scipy.signal as signal
 import pyworld, os, traceback, faiss, librosa, torchcrepe
 from scipy import signal
 from functools import lru_cache
 
 now_dir = os.getcwd()
 sys.path.append(now_dir)

@@ -127,10 +128,13 @@ class VC(object):
             f0[pd < 0.1] = 0
             f0 = f0[0].cpu().numpy()
         elif f0_method == "rmvpe":
-            if(hasattr(self,"model_rmvpe")==False):
+            if hasattr(self, "model_rmvpe") == False:
                 from rmvpe import RMVPE
 
                 print("loading rmvpe model")
-                self.model_rmvpe = RMVPE("rmvpe.pt",is_half=self.is_half, device=self.device)
+                self.model_rmvpe = RMVPE(
+                    "rmvpe.pt", is_half=self.is_half, device=self.device
+                )
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))