Update vc_infer_pipeline.py

This commit is contained in:
RVC-Boss 2023-04-27 16:11:45 +08:00 committed by GitHub
parent 7b8a0bb6fc
commit 80b54499eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 314 additions and 315 deletions

View File

@ -1,315 +1,314 @@
import numpy as np, parselmouth, torch, pdb import numpy as np, parselmouth, torch, pdb
from time import time as ttime from time import time as ttime
import torch.nn.functional as F import torch.nn.functional as F
from config import x_pad, x_query, x_center, x_max from config import x_pad, x_query, x_center, x_max
import scipy.signal as signal import scipy.signal as signal
import pyworld, os, traceback, faiss import pyworld, os, traceback, faiss
from scipy import signal from scipy import signal
# Fifth-order Butterworth high-pass filter coefficients (48 Hz cutoff for a
# 16 kHz signal). VC.pipeline applies them with signal.filtfilt to the input
# audio before any further processing.
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
class VC(object):
    """Voice-conversion inference helper.

    The object extracts content features with ``model``, optionally blends
    them with vectors retrieved from a faiss index, and feeds them (plus an
    optional pitch track) to the synthesiser ``net_g``.
    """

    # Retrieval scores whose magnitude is below this value are clamped before
    # being inverted, so an exact index match cannot produce inf/NaN weights.
    _MIN_INDEX_SCORE = 1e-9

    def __init__(self, tgt_sr, device, is_half):
        """Derive sample-count constants from the config and store settings.

        Args:
            tgt_sr: Sample rate used to size the padding that is trimmed from
                the synthesiser output (presumably the output sample rate).
            device: Torch device on which tensors are created.
            is_half: If true, features are handled as float16, else float32.
        """
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * x_pad  # padding before and after each segment
        self.t_pad_tgt = tgt_sr * x_pad  # same padding, counted at tgt_sr
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * x_query  # search radius around a cut point
        self.t_center = self.sr * x_center  # spacing of candidate cut points
        self.t_max = self.sr * x_max  # inputs up to this length are not split
        self.device = device
        self.is_half = is_half

    def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
        """Estimate the pitch track of ``x`` and quantise it to coarse bins.

        Args:
            x: 1-D audio samples at ``self.sr``.
            p_len: Expected number of frames; the "pm" track is zero-padded
                on both sides up to this length.
            f0_up_key: Transposition in semitones applied to the estimate.
            f0_method: ``"pm"`` (parselmouth) or ``"harvest"`` (pyworld).
            inp_f0: Optional 2-column array; column 0 is a time axis
                (presumably seconds - confirm with the caller) and column 1
                the f0 values that overwrite the estimate after the leading
                padding.

        Returns:
            ``(f0_coarse, f0bak)``: integer bins in ``[1, 255]`` on a mel
            scale, and a float copy of the (transposed) f0 track.

        Raises:
            ValueError: If ``f0_method`` is not one of the supported names.
        """
        time_step = self.window / self.sr * 1000  # frame hop in milliseconds
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            # Centre the estimate inside p_len frames by zero-padding.
            pad_size = (p_len - len(f0) + 1) // 2
            pad_after = p_len - len(f0) - pad_size
            if pad_size > 0 or pad_after > 0:
                f0 = np.pad(f0, [[pad_size, pad_after]], mode="constant")
        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
            f0 = signal.medfilt(f0, 3)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``f0``.
            raise ValueError("unsupported f0_method: %r" % (f0_method,))
        f0 *= pow(2, f0_up_key / 12)
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            # A plain int avoids the int16 overflow that long curves caused.
            delta_t = int(
                np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1)
            )
            replace_f0 = np.interp(
                np.arange(delta_t), inp_f0[:, 0] * tf0, inp_f0[:, 1]
            )
            start = x_pad * tf0  # skip the frames that belong to the padding
            stop = start + len(replace_f0)
            room = f0[start:stop].shape[0]
            f0[start:stop] = replace_f0[:room]
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        # ``np.int`` was removed in NumPy 1.24; the builtin is what it aliased.
        f0_coarse = np.rint(f0_mel).astype(int)
        return f0_coarse, f0bak

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
    ):
        """Convert one audio segment and return it as int16 samples.

        Args:
            model: Feature extractor exposing ``extract_features`` and
                ``final_proj``.
            net_g: Synthesiser exposing ``infer``.
            sid: Speaker-id tensor forwarded to ``net_g.infer``.
            audio0: NumPy audio segment; a 2-D input is averaged over its
                last axis.
            pitch: Coarse pitch tensor, or ``None`` for pitch-less models.
            pitchf: Float pitch tensor, or ``None`` for pitch-less models.
            times: Mutable sequence; ``times[0]`` accumulates the feature
                extraction time and ``times[2]`` the synthesis time.
            index: Object exposing ``search`` (a faiss index), or ``None``.
            big_npy: Array of the vectors stored in ``index``, or ``None``.
            index_rate: Blend weight of the retrieved features; ``0``
                disables retrieval.

        Returns:
            1-D ``np.int16`` array produced by the synthesiser.
        """
        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.zeros(
            feats.shape, dtype=torch.bool, device=self.device
        )
        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9,  # layer 9
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0])

        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")
            # Weighted average of the 8 nearest index vectors,
            # by github @nadare881
            score, ix = index.search(npy, k=8)
            # Clamp near-zero scores: an exact match would otherwise give an
            # infinite weight and turn the whole frame into NaN.
            floor = score.dtype.type(self._MIN_INDEX_SCORE)
            score = np.where(np.abs(score) < floor, floor, score)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        # Double the number of feature frames along the time axis.
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
            0, 2, 1
        )
        t1 = ttime()
        has_pitch = pitch is not None and pitchf is not None
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if has_pitch:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if has_pitch:
                output = net_g.infer(feats, p_len, pitch, pitchf, sid)
            else:
                output = net_g.infer(feats, p_len, sid)
            audio1 = (
                (output[0][0, 0] * 32768)
                .data.cpu()
                .float()
                .numpy()
                .astype(np.int16)
            )
        # Drop the large tensors before asking CUDA to release cached memory.
        del feats, p_len, padding_mask, output
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def _convert_chunk(
        self,
        model,
        net_g,
        sid,
        chunk,
        pitch,
        pitchf,
        first_frame,
        last_frame,
        times,
        index,
        big_npy,
        index_rate,
    ):
        """Run ``vc`` on one padded chunk and trim the padding from the result."""
        seg_pitch = seg_pitchf = None
        if pitch is not None and pitchf is not None:
            seg_pitch = pitch[:, first_frame:last_frame]
            seg_pitchf = pitchf[:, first_frame:last_frame]
        converted = self.vc(
            model,
            net_g,
            sid,
            chunk,
            seg_pitch,
            seg_pitchf,
            times,
            index,
            big_npy,
            index_rate,
        )
        return converted[self.t_pad_tgt : -self.t_pad_tgt]

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        times,
        f0_up_key,
        f0_method,
        file_index,
        index_rate,
        if_f0,
        f0_file=None,
    ):
        """Convert a whole recording, splitting long inputs at quiet points.

        Args:
            model: Feature extractor forwarded to ``vc``.
            net_g: Synthesiser forwarded to ``vc``.
            sid: Speaker id; wrapped into a tensor here.
            audio: 1-D NumPy audio at ``self.sr``.
            times: Mutable sequence of three timers; ``times[1]`` accumulates
                the padding and pitch-extraction time, the others are updated
                by ``vc``.
            f0_up_key: Transposition in semitones, forwarded to ``get_f0``.
            f0_method: Pitch estimator name, forwarded to ``get_f0``.
            file_index: Path of a faiss index file; ``""`` disables retrieval.
            index_rate: Blend weight of retrieved features; ``0`` disables
                retrieval.
            if_f0: ``1`` when the model is pitch-conditioned.
            f0_file: Optional object with a ``name`` attribute pointing at a
                text file of comma-separated rows used as a custom f0 curve.

        Returns:
            The concatenated converted audio as a 1-D ``np.int16`` array.
        """
        # Retrieval is best effort: any failure falls back to plain inference.
        index = big_npy = None
        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
                traceback.print_exc()
                index = big_npy = None

        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(
            audio, (self.window // 2, self.window // 2), mode="reflect"
        )
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Sum of each sample's surrounding window; its smallest magnitude
            # near a candidate position marks where to cut.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                lo = t - self.t_query
                level = np.abs(audio_sum[lo : t + self.t_query])
                opt_ts.append(lo + level.argmin())

        audio_opt = []
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window

        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                rows = [[float(v) for v in line.split(",")] for line in lines]
                inp_f0 = np.array(rows, dtype="float32")
            except Exception:
                # Keep None so get_f0 never sees a half-parsed list.
                traceback.print_exc()
                inp_f0 = None

        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                audio_pad, p_len, f0_up_key, f0_method, inp_f0
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = (
                torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
            )
        t2 = ttime()
        times[1] += t2 - t1

        s = 0  # start of the current chunk inside audio_pad
        for t in opt_ts:
            t = t // self.window * self.window  # align the cut to a frame
            audio_opt.append(
                self._convert_chunk(
                    model,
                    net_g,
                    sid,
                    audio_pad[s : t + self.t_pad2 + self.window],
                    pitch,
                    pitchf,
                    s // self.window,
                    (t + self.t_pad2) // self.window,
                    times,
                    index,
                    big_npy,
                    index_rate,
                )
            )
            s = t
        # Remainder after the last cut (the whole input when nothing was cut).
        audio_opt.append(
            self._convert_chunk(
                model,
                net_g,
                sid,
                audio_pad[s:],
                pitch,
                pitchf,
                s // self.window,
                None,
                times,
                index,
                big_npy,
                index_rate,
            )
        )
        audio_opt = np.concatenate(audio_opt)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt