Add files via upload

This commit is contained in:
RVC-Boss 2023-04-27 23:34:03 +08:00 committed by GitHub
parent a149107c5a
commit af208d5210
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 352 additions and 326 deletions

View File

@ -1,2 +1,2 @@
runtime\python.exe infer-web.py --pycmd runtime\python.exe runtime\python.exe infer-web.py --pycmd runtime\python.exe --port 7897
pause pause

View File

@ -1,5 +1,5 @@
from multiprocessing import cpu_count from multiprocessing import cpu_count
import threading import threading,pdb,librosa
from time import sleep from time import sleep
from subprocess import Popen from subprocess import Popen
from time import sleep from time import sleep
@ -17,6 +17,7 @@ os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
torch.manual_seed(114514) torch.manual_seed(114514)
from i18n import I18nAuto from i18n import I18nAuto
import ffmpeg
i18n = I18nAuto() i18n = I18nAuto()
# 判断是否有能用来训练和加速推理的N卡 # 判断是否有能用来训练和加速推理的N卡
@ -235,7 +236,7 @@ def vc_multi(
yield traceback.format_exc() yield traceback.format_exc()
def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins): def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins,agg):
infos = [] infos = []
try: try:
inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
@ -246,6 +247,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins):
save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
) )
pre_fun = _audio_pre_( pre_fun = _audio_pre_(
agg=int(agg),
model_path=os.path.join(weight_uvr5_root, model_name + ".pth"), model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
device=device, device=device,
is_half=is_half, is_half=is_half,
@ -254,10 +256,25 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins):
paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
else: else:
paths = [path.name for path in paths] paths = [path.name for path in paths]
for name in paths: for path in paths:
inp_path = os.path.join(inp_root, name) inp_path = os.path.join(inp_root, path)
need_reformat=1
done=0
try: try:
pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal) info = ffmpeg.probe(inp_path, cmd="ffprobe")
if(info["streams"][0]["channels"]==2 and info["streams"][0]["sample_rate"]=="44100"):
need_reformat=0
pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal)
done=1
except:
need_reformat = 1
traceback.print_exc()
if(need_reformat==1):
tmp_path="%s/%s.reformatted.wav"%(tmp,os.path.basename(inp_path))
os.system("ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"%(inp_path,tmp_path))
inp_path=tmp_path
try:
if(done==0):pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal)
infos.append("%s->Success" % (os.path.basename(inp_path))) infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos) yield "\n".join(infos)
except: except:
@ -1147,6 +1164,15 @@ with gr.Blocks() as app:
) )
with gr.Column(): with gr.Column():
model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names)
agg = gr.Slider(
minimum=0,
maximum=20,
step=1,
label="人声提取激进程度",
value=10,
interactive=True,
visible=False#先不开放调整
)
opt_vocal_root = gr.Textbox( opt_vocal_root = gr.Textbox(
label=i18n("指定输出人声文件夹"), value="opt" label=i18n("指定输出人声文件夹"), value="opt"
) )
@ -1161,6 +1187,7 @@ with gr.Blocks() as app:
opt_vocal_root, opt_vocal_root,
wav_inputs, wav_inputs,
opt_ins_root, opt_ins_root,
agg
], ],
[vc_output4], [vc_output4],
) )
@ -1246,7 +1273,7 @@ with gr.Blocks() as app:
with gr.Row(): with gr.Row():
save_epoch10 = gr.Slider( save_epoch10 = gr.Slider(
minimum=0, minimum=0,
maximum=200, maximum=50,
step=1, step=1,
label=i18n("保存频率save_every_epoch"), label=i18n("保存频率save_every_epoch"),
value=5, value=5,

View File

@ -13,7 +13,7 @@ from scipy.io import wavfile
class _audio_pre_: class _audio_pre_:
def __init__(self, model_path, device, is_half): def __init__(self, agg,model_path, device, is_half):
self.model_path = model_path self.model_path = model_path
self.device = device self.device = device
self.data = { self.data = {
@ -22,7 +22,7 @@ class _audio_pre_:
"tta": False, "tta": False,
# Constants # Constants
"window_size": 512, "window_size": 512,
"agg": 10, "agg": agg,
"high_end_process": "mirroring", "high_end_process": "mirroring",
} }
nn_arch_sizes = [ nn_arch_sizes = [
@ -139,7 +139,7 @@ class _audio_pre_:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
print("%s instruments done" % name) print("%s instruments done" % name)
wavfile.write( wavfile.write(
os.path.join(ins_root, "instrument_{}.wav".format(name)), os.path.join(ins_root, "instrument_{}_{}.wav".format(name,self.data["agg"])),
self.mp.param["sr"], self.mp.param["sr"],
(np.array(wav_instrument) * 32768).astype("int16"), (np.array(wav_instrument) * 32768).astype("int16"),
) # ) #
@ -155,7 +155,7 @@ class _audio_pre_:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
print("%s vocals done" % name) print("%s vocals done" % name)
wavfile.write( wavfile.write(
os.path.join(vocal_root, "vocal_{}.wav".format(name)), os.path.join(vocal_root, "vocal_{}_{}.wav".format(name,self.data["agg"])),
self.mp.param["sr"], self.mp.param["sr"],
(np.array(wav_vocals) * 32768).astype("int16"), (np.array(wav_vocals) * 32768).astype("int16"),
) )

View File

@ -45,7 +45,7 @@ global_step = 0
def main(): def main():
# n_gpus = torch.cuda.device_count() # n_gpus = torch.cuda.device_count()
os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "51515" os.environ["MASTER_PORT"] = "51545"
mp.spawn( mp.spawn(
run, run,

View File

@ -1,314 +1,313 @@
import numpy as np, parselmouth, torch, pdb import numpy as np, parselmouth, torch, pdb
from time import time as ttime from time import time as ttime
import torch.nn.functional as F import torch.nn.functional as F
from config import x_pad, x_query, x_center, x_max from config import x_pad, x_query, x_center, x_max
import scipy.signal as signal import scipy.signal as signal
import pyworld, os, traceback, faiss import pyworld, os, traceback, faiss
from scipy import signal from scipy import signal
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
class VC(object): class VC(object):
def __init__(self, tgt_sr, device, is_half): def __init__(self, tgt_sr, device, is_half):
self.sr = 16000 # hubert输入采样率 self.sr = 16000 # hubert输入采样率
self.window = 160 # 每帧点数 self.window = 160 # 每帧点数
self.t_pad = self.sr * x_pad # 每条前后pad时间 self.t_pad = self.sr * x_pad # 每条前后pad时间
self.t_pad_tgt = tgt_sr * x_pad self.t_pad_tgt = tgt_sr * x_pad
self.t_pad2 = self.t_pad * 2 self.t_pad2 = self.t_pad * 2
self.t_query = self.sr * x_query # 查询切点前后查询时间 self.t_query = self.sr * x_query # 查询切点前后查询时间
self.t_center = self.sr * x_center # 查询切点位置 self.t_center = self.sr * x_center # 查询切点位置
self.t_max = self.sr * x_max # 免查询时长阈值 self.t_max = self.sr * x_max # 免查询时长阈值
self.device = device self.device = device
self.is_half = is_half self.is_half = is_half
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None): def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
time_step = self.window / self.sr * 1000 time_step = self.window / self.sr * 1000
f0_min = 50 f0_min = 50
f0_max = 1100 f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm": if f0_method == "pm":
f0 = ( f0 = (
parselmouth.Sound(x, self.sr) parselmouth.Sound(x, self.sr)
.to_pitch_ac( .to_pitch_ac(
time_step=time_step / 1000, time_step=time_step / 1000,
voicing_threshold=0.6, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_floor=f0_min,
pitch_ceiling=f0_max, pitch_ceiling=f0_max,
) )
.selected_array["frequency"] .selected_array["frequency"]
) )
pad_size = (p_len - len(f0) + 1) // 2 pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0: if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad( f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
) )
elif f0_method == "harvest": elif f0_method == "harvest":
f0, t = pyworld.harvest( f0, t = pyworld.harvest(
x.astype(np.double), x.astype(np.double),
fs=self.sr, fs=self.sr,
f0_ceil=f0_max, f0_ceil=f0_max,
f0_floor=f0_min, f0_floor=f0_min,
frame_period=10, frame_period=10,
) )
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3) f0 = signal.medfilt(f0, 3)
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = self.sr // self.window # 每秒f0点数 tf0 = self.sr // self.window # 每秒f0点数
if inp_f0 is not None: if inp_f0 is not None:
delta_t = np.round( delta_t = np.round(
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
).astype("int16") ).astype("int16")
replace_f0 = np.interp( replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
) )
shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0] shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape] f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy() f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min f0_mel_max - f0_mel_min
) + 1 ) + 1
f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255 f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int) f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak # 1-0 return f0_coarse, f0bak # 1-0
def vc( def vc(
self, self,
model, model,
net_g, net_g,
sid, sid,
audio0, audio0,
pitch, pitch,
pitchf, pitchf,
times, times,
index, index,
big_npy, big_npy,
index_rate, index_rate,
): # ,file_index,file_big_npy ): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0) feats = torch.from_numpy(audio0)
if self.is_half: if self.is_half:
feats = feats.half() feats = feats.half()
else: else:
feats = feats.float() feats = feats.float()
if feats.dim() == 2: # double channels if feats.dim() == 2: # double channels
feats = feats.mean(-1) feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim() assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1) feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
inputs = { inputs = {
"source": feats.to(self.device), "source": feats.to(self.device),
"padding_mask": padding_mask, "padding_mask": padding_mask,
"output_layer": 9, # layer 9 "output_layer": 9, # layer 9
} }
t0 = ttime() t0 = ttime()
with torch.no_grad(): with torch.no_grad():
logits = model.extract_features(**inputs) logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0]) feats = model.final_proj(logits[0])
if ( if (
isinstance(index, type(None)) == False isinstance(index, type(None)) == False
and isinstance(big_npy, type(None)) == False and isinstance(big_npy, type(None)) == False
and index_rate != 0 and index_rate != 0
): ):
npy = feats[0].cpu().numpy() npy = feats[0].cpu().numpy()
if self.is_half: if self.is_half:
npy = npy.astype("float32") npy = npy.astype("float32")
# _, I = index.search(npy, 1) # _, I = index.search(npy, 1)
# npy = big_npy[I.squeeze()] # npy = big_npy[I.squeeze()]
#by github @nadare881 score, ix = index.search(npy, k=8)
score, ix = index.search(npy, k=8) weight = np.square(1 / score)
weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True)
weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if self.is_half:
if self.is_half: npy = npy.astype("float16")
npy = npy.astype("float16") feats = (
feats = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
+ (1 - index_rate) * feats )
)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t1 = ttime()
t1 = ttime() p_len = audio0.shape[0] // self.window
p_len = audio0.shape[0] // self.window if feats.shape[1] < p_len:
if feats.shape[1] < p_len: p_len = feats.shape[1]
p_len = feats.shape[1] if pitch != None and pitchf != None:
if pitch != None and pitchf != None: pitch = pitch[:, :p_len]
pitch = pitch[:, :p_len] pitchf = pitchf[:, :p_len]
pitchf = pitchf[:, :p_len] p_len = torch.tensor([p_len], device=self.device).long()
p_len = torch.tensor([p_len], device=self.device).long() with torch.no_grad():
with torch.no_grad(): if pitch != None and pitchf != None:
if pitch != None and pitchf != None: audio1 = (
audio1 = ( (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768) .data.cpu()
.data.cpu() .float()
.float() .numpy()
.numpy() .astype(np.int16)
.astype(np.int16) )
) else:
else: audio1 = (
audio1 = ( (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
(net_g.infer(feats, p_len, sid)[0][0, 0] * 32768) .data.cpu()
.data.cpu() .float()
.float() .numpy()
.numpy() .astype(np.int16)
.astype(np.int16) )
) del feats, p_len, padding_mask
del feats, p_len, padding_mask if torch.cuda.is_available():
if torch.cuda.is_available(): torch.cuda.empty_cache()
torch.cuda.empty_cache() t2 = ttime()
t2 = ttime() times[0] += t1 - t0
times[0] += t1 - t0 times[2] += t2 - t1
times[2] += t2 - t1 return audio1
return audio1
def pipeline(
def pipeline( self,
self, model,
model, net_g,
net_g, sid,
sid, audio,
audio, times,
times, f0_up_key,
f0_up_key, f0_method,
f0_method, file_index,
file_index, # file_big_npy,
# file_big_npy, index_rate,
index_rate, if_f0,
if_f0, f0_file=None,
f0_file=None, ):
): if (
if ( file_index != ""
file_index != "" # and file_big_npy != ""
# and file_big_npy != "" # and os.path.exists(file_big_npy) == True
# and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True
and os.path.exists(file_index) == True and index_rate != 0
and index_rate != 0 ):
): try:
try: index = faiss.read_index(file_index)
index = faiss.read_index(file_index) # big_npy = np.load(file_big_npy)
# big_npy = np.load(file_big_npy) big_npy = index.reconstruct_n(0, index.ntotal)
big_npy = index.reconstruct_n(0, index.ntotal) except:
except: traceback.print_exc()
traceback.print_exc() index = big_npy = None
index = big_npy = None else:
else: index = big_npy = None
index = big_npy = None audio = signal.filtfilt(bh, ah, audio)
audio = signal.filtfilt(bh, ah, audio) audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") opt_ts = []
opt_ts = [] if audio_pad.shape[0] > self.t_max:
if audio_pad.shape[0] > self.t_max: audio_sum = np.zeros_like(audio)
audio_sum = np.zeros_like(audio) for i in range(self.window):
for i in range(self.window): audio_sum += audio_pad[i : i - self.window]
audio_sum += audio_pad[i : i - self.window] for t in range(self.t_center, audio.shape[0], self.t_center):
for t in range(self.t_center, audio.shape[0], self.t_center): opt_ts.append(
opt_ts.append( t
t - self.t_query
- self.t_query + np.where(
+ np.where( np.abs(audio_sum[t - self.t_query : t + self.t_query])
np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() )[0][0]
)[0][0] )
) s = 0
s = 0 audio_opt = []
audio_opt = [] t = None
t = None t1 = ttime()
t1 = ttime() audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") p_len = audio_pad.shape[0] // self.window
p_len = audio_pad.shape[0] // self.window inp_f0 = None
inp_f0 = None if hasattr(f0_file, "name") == True:
if hasattr(f0_file, "name") == True: try:
try: with open(f0_file.name, "r") as f:
with open(f0_file.name, "r") as f: lines = f.read().strip("\n").split("\n")
lines = f.read().strip("\n").split("\n") inp_f0 = []
inp_f0 = [] for line in lines:
for line in lines: inp_f0.append([float(i) for i in line.split(",")])
inp_f0.append([float(i) for i in line.split(",")]) inp_f0 = np.array(inp_f0, dtype="float32")
inp_f0 = np.array(inp_f0, dtype="float32") except:
except: traceback.print_exc()
traceback.print_exc() sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() pitch, pitchf = None, None
pitch, pitchf = None, None if if_f0 == 1:
if if_f0 == 1: pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0) pitch = pitch[:p_len]
pitch = pitch[:p_len] pitchf = pitchf[:p_len]
pitchf = pitchf[:p_len] pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() t2 = ttime()
t2 = ttime() times[1] += t2 - t1
times[1] += t2 - t1 for t in opt_ts:
for t in opt_ts: t = t // self.window * self.window
t = t // self.window * self.window if if_f0 == 1:
if if_f0 == 1: audio_opt.append(
audio_opt.append( self.vc(
self.vc( model,
model, net_g,
net_g, sid,
sid, audio_pad[s : t + self.t_pad2 + self.window],
audio_pad[s : t + self.t_pad2 + self.window], pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window], pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window], times,
times, index,
index, big_npy,
big_npy, index_rate,
index_rate, )[self.t_pad_tgt : -self.t_pad_tgt]
)[self.t_pad_tgt : -self.t_pad_tgt] )
) else:
else: audio_opt.append(
audio_opt.append( self.vc(
self.vc( model,
model, net_g,
net_g, sid,
sid, audio_pad[s : t + self.t_pad2 + self.window],
audio_pad[s : t + self.t_pad2 + self.window], None,
None, None,
None, times,
times, index,
index, big_npy,
big_npy, index_rate,
index_rate, )[self.t_pad_tgt : -self.t_pad_tgt]
)[self.t_pad_tgt : -self.t_pad_tgt] )
) s = t
s = t if if_f0 == 1:
if if_f0 == 1: audio_opt.append(
audio_opt.append( self.vc(
self.vc( model,
model, net_g,
net_g, sid,
sid, audio_pad[t:],
audio_pad[t:], pitch[:, t // self.window :] if t is not None else pitch,
pitch[:, t // self.window :] if t is not None else pitch, pitchf[:, t // self.window :] if t is not None else pitchf,
pitchf[:, t // self.window :] if t is not None else pitchf, times,
times, index,
index, big_npy,
big_npy, index_rate,
index_rate, )[self.t_pad_tgt : -self.t_pad_tgt]
)[self.t_pad_tgt : -self.t_pad_tgt] )
) else:
else: audio_opt.append(
audio_opt.append( self.vc(
self.vc( model,
model, net_g,
net_g, sid,
sid, audio_pad[t:],
audio_pad[t:], None,
None, None,
None, times,
times, index,
index, big_npy,
big_npy, index_rate,
index_rate, )[self.t_pad_tgt : -self.t_pad_tgt]
)[self.t_pad_tgt : -self.t_pad_tgt] )
) audio_opt = np.concatenate(audio_opt)
audio_opt = np.concatenate(audio_opt) del pitch, pitchf, sid
del pitch, pitchf, sid if torch.cuda.is_available():
if torch.cuda.is_available(): torch.cuda.empty_cache()
torch.cuda.empty_cache() return audio_opt
return audio_opt