Format code (#366)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
github-actions[bot] authored on 2023-05-28 16:06:11 +00:00, committed by GitHub
parent e569477457
commit e435b3bb8a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 262 additions and 170 deletions

MDXNet.py

@@ -1,5 +1,5 @@
import soundfile as sf
import torch, pdb, time, argparse, os, warnings, sys, librosa
import numpy as np
import onnxruntime as ort
from scipy.io.wavfile import write

@@ -8,96 +8,133 @@ import torch
import torch.nn as nn

dim_c = 4


class Conv_TDF_net_trim:
    def __init__(
        self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024
    ):
        super(Conv_TDF_net_trim, self).__init__()

        self.dim_f = dim_f
        self.dim_t = 2**dim_t
        self.n_fft = n_fft
        self.hop = hop
        self.n_bins = self.n_fft // 2 + 1
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(
            device
        )
        self.target_name = target_name
        self.blender = "blender" in model_name

        out_c = dim_c * 4 if target_name == "*" else dim_c
        self.freq_pad = torch.zeros(
            [1, out_c, self.n_bins - self.dim_f, self.dim_t]
        ).to(device)

        self.n = L // 2

    def stft(self, x):
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(
            x,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        x = torch.view_as_real(x)
        x = x.permute([0, 3, 1, 2])
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
            [-1, dim_c, self.n_bins, self.dim_t]
        )
        return x[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        freq_pad = (
            self.freq_pad.repeat([x.shape[0], 1, 1, 1])
            if freq_pad is None
            else freq_pad
        )
        x = torch.cat([x, freq_pad], -2)
        c = 4 * 2 if self.target_name == "*" else 2
        x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
            [-1, 2, self.n_bins, self.dim_t]
        )
        x = x.permute([0, 2, 3, 1])
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(
            x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
        )
        return x.reshape([-1, c, self.chunk_size])
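
A note on the packing above: stft folds a complex stereo spectrogram into dim_c = 4 real channels (stereo x real/imag), the layout the ONNX model consumes, and istft reverses it. A standalone shape walk-through using the dereverb defaults set in MDXNetDereverb below (dim_t=9, dim_f=3072, n_fft=6144); the variable names here are illustrative only:

import torch

n_fft, hop = 6144, 1024
dim_t, dim_f = 2**9, 3072
n_bins = n_fft // 2 + 1                # 3073
chunk_size = hop * (dim_t - 1)         # 523264 samples

x = torch.randn(1, 2, chunk_size)      # (batch, stereo, samples)
x = x.reshape([-1, chunk_size])        # fold channels into batch
spec = torch.stft(
    x,
    n_fft=n_fft,
    hop_length=hop,
    window=torch.hann_window(n_fft),
    center=True,
    return_complex=True,
)                                      # (2, 3073, 512) complex
spec = torch.view_as_real(spec).permute([0, 3, 1, 2])
spec = spec.reshape([-1, 2, 2, n_bins, dim_t]).reshape([-1, 4, n_bins, dim_t])
print(spec[:, :, :dim_f].shape)        # torch.Size([1, 4, 3072, 512])
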

def get_models(device, dim_f, dim_t, n_fft):
    return Conv_TDF_net_trim(
        device=device,
        model_name="Conv-TDF",
        target_name="vocals",
        L=11,
        dim_f=dim_f,
        dim_t=dim_t,
        n_fft=n_fft,
    )


warnings.filterwarnings("ignore")
cpu = torch.device("cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class Predictor:
    def __init__(self, args):
        self.args = args
        self.model_ = get_models(
            device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft
        )
        self.model = ort.InferenceSession(
            os.path.join(args.onnx, self.model_.target_name + ".onnx"),
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        print("onnx load done")
    def demix(self, mix):
        samples = mix.shape[-1]
        margin = self.args.margin
        chunk_size = self.args.chunks * 44100
        assert not margin == 0, "margin cannot be zero!"
        if margin > chunk_size:
            margin = chunk_size

        segmented_mix = {}

        if self.args.chunks == 0 or samples < chunk_size:
            chunk_size = samples

        counter = -1
        for skip in range(0, samples, chunk_size):
            counter += 1
            s_margin = 0 if counter == 0 else margin
            end = min(skip + chunk_size + margin, samples)
            start = skip - s_margin
            segmented_mix[skip] = mix[:, start:end].copy()
            if end == samples:
                break

        sources = self.demix_base(segmented_mix, margin_size=margin)
        """
        mix:(2,big_sample)
        segmented_mix:offset->(2,small_sample)
        sources:(1,2,big_sample)
        """
        return sources
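
demix splits the waveform into chunks that overlap by `margin` samples on each side; demix_base later trims those margins again, so chunk boundaries stitch together without seams. A small self-contained illustration of the boundary arithmetic (the durations are made up):

samples, chunk_size, margin = 10 * 44100, 3 * 44100, 44100
segments, counter = [], -1
for skip in range(0, samples, chunk_size):
    counter += 1
    s_margin = 0 if counter == 0 else margin        # first chunk has no left margin
    end = min(skip + chunk_size + margin, samples)
    segments.append((skip - s_margin, end))
    if end == samples:
        break
print(segments)  # [(0, 176400), (88200, 308700), (220500, 441000)]
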
    def demix_base(self, mixes, margin_size):
        chunked_sources = []
        progress_bar = tqdm(total=len(mixes))

@@ -106,15 +143,17 @@ class Predictor:
            cmix = mixes[mix]
            sources = []
            n_sample = cmix.shape[1]
            model = self.model_
            trim = model.n_fft // 2
            gen_size = model.chunk_size - 2 * trim
            pad = gen_size - n_sample % gen_size
            mix_p = np.concatenate(
                (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
            )
            mix_waves = []
            i = 0
            while i < n_sample + pad:
                waves = np.array(mix_p[:, i : i + model.chunk_size])
                mix_waves.append(waves)
                i += gen_size
            mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)

@@ -122,68 +161,84 @@ class Predictor:
                _ort = self.model
                spek = model.stft(mix_waves)
                if self.args.denoise:
                    spec_pred = (
                        -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
                        + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
                    )
                    tar_waves = model.istft(torch.tensor(spec_pred))
                else:
                    tar_waves = model.istft(
                        torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
                    )
                tar_signal = (
                    tar_waves[:, :, trim:-trim]
                    .transpose(0, 1)
                    .reshape(2, -1)
                    .numpy()[:, :-pad]
                )

                start = 0 if mix == 0 else margin_size
                end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
                if margin_size == 0:
                    end = None
                sources.append(tar_signal[:, start:end])

                progress_bar.update(1)

            chunked_sources.append(sources)
        _sources = np.concatenate(chunked_sources, axis=-1)
        # del self.model
        progress_bar.close()
        return _sources
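
The denoise branch above runs the ONNX model twice, once on the spectrogram and once on its negation, then averages -f(-x) with f(x). Because the wanted output flips sign with the input while any input-independent bias does not, the averaging cancels that bias. A toy demonstration (f is a stand-in for the model, not part of this repo):

import numpy as np

def f(x):
    return 0.8 * x + 0.05            # toy "model": linear part plus a constant artifact

x = np.array([1.0, -2.0, 3.0])
print(f(x))                          # artifact leaks through: [ 0.85 -1.55  2.45]
print(-f(-x) * 0.5 + f(x) * 0.5)     # artifact cancels:       [ 0.8  -1.6   2.4 ]
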
    def prediction(self, m, vocal_root, others_root, format):
        os.makedirs(vocal_root, exist_ok=True)
        os.makedirs(others_root, exist_ok=True)
        basename = os.path.basename(m)
        mix, rate = librosa.load(m, mono=False, sr=44100)
        if mix.ndim == 1:
            mix = np.asfortranarray([mix, mix])
        mix = mix.T
        sources = self.demix(mix.T)
        opt = sources[0].T
        sf.write(
            "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
        )
        sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)

class MDXNetDereverb:
    def __init__(self, chunks):
        self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy"
        self.shifts = 10  # 'Predict with randomised equivariant stabilisation'
        self.mixing = "min_mag"  # ['default','min_mag','max_mag']
        self.chunks = chunks
        self.margin = 44100
        self.dim_t = 9
        self.dim_f = 3072
        self.n_fft = 6144
        self.denoise = True
        self.pred = Predictor(self)

    def _path_audio_(self, input, vocal_root, others_root, format):
        self.pred.prediction(input, vocal_root, others_root, format)


if __name__ == "__main__":
    dereverb = MDXNetDereverb(15)
    from time import time as ttime

    t0 = ttime()
    dereverb._path_audio_(
        "雪雪伴奏对消HP5.wav",
        "vocal",
        "others",
    )
    t1 = ttime()
    print(t1 - t0)
"""
runtime\python.exe MDXNet.py

@@ -195,4 +250,4 @@ runtime\python.exe MDXNet.py

half15:0.7G->6.6G,22.69s
fp32-15:0.7G->6.6G,20.85s
"""

infer-web.py

@@ -83,7 +83,7 @@ import gradio as gr
import logging
from vc_infer_pipeline import VC
from config import Config
from infer_uvr5 import _audio_pre_, _audio_pre_new
from my_utils import load_audio
from train.process_ckpt import show_info, change_info, merge, extract_small_model

@@ -134,7 +134,7 @@ for root, dirs, files in os.walk(index_root, topdown=False):
        index_paths.append("%s/%s" % (root, name))

uvr5_names = []
for name in os.listdir(weight_uvr5_root):
    if name.endswith(".pth") or "onnx" in name:
        uvr5_names.append(name.replace(".pth", ""))

@@ -151,7 +151,7 @@ def vc_single(
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
    global tgt_sr, net_g, vc, hubert_model, version
    if input_audio_path is None:

@@ -236,7 +236,7 @@ def vc_multi(
    resample_sr,
    rms_mix_rate,
    protect,
    format1,
):
    try:
        dir_path = (

@@ -267,13 +267,15 @@ def vc_multi(
                filter_radius,
                resample_sr,
                rms_mix_rate,
                protect,
            )
            if "Success" in info:
                try:
                    tgt_sr, audio_opt = opt
                    sf.write(
                        "%s/%s.%s" % (opt_root, os.path.basename(path), format1),
                        audio_opt,
                        tgt_sr,
                    )
                except:
                    info += traceback.format_exc()

@@ -284,7 +286,7 @@ def vc_multi(
        yield traceback.format_exc()


def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
    infos = []
    try:
        inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")

@@ -294,10 +296,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0
        save_root_ins = (
            save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        )
        if model_name == "onnx_dereverb_By_FoxJoy":
            pre_fun = MDXNetDereverb(15)
        else:
            func = _audio_pre_ if "DeEcho" not in model_name else _audio_pre_new
            pre_fun = func(
                agg=int(agg),
                model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),

@@ -319,7 +321,9 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0
                    and info["streams"][0]["sample_rate"] == "44100"
                ):
                    need_reformat = 0
                    pre_fun._path_audio_(
                        inp_path, save_root_ins, save_root_vocal, format0
                    )
                    done = 1
            except:
                need_reformat = 1

@@ -333,7 +337,9 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0
                inp_path = tmp_path
            try:
                if done == 0:
                    pre_fun._path_audio_(
                        inp_path, save_root_ins, save_root_vocal, format0
                    )
                infos.append("%s->Success" % (os.path.basename(inp_path)))
                yield "\n".join(infos)
            except:

@@ -346,7 +352,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0
        yield "\n".join(infos)
    finally:
        try:
            if model_name == "onnx_dereverb_By_FoxJoy":
                del pre_fun.pred.model
                del pre_fun.pred.model_
            else:

@@ -804,7 +810,7 @@ def train_index(exp_dir1, version19):
    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
    infos.append("adding")

@@ -815,11 +821,11 @@ def train_index(exp_dir1, version19):
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    infos.append(
        "成功构建索引added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
    # infos.append("成功构建索引added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
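
For context on these write_index hunks: train_index trains an IVF (inverted-file) faiss index over the extracted features and serializes it with nprobe baked into the filename. A condensed, hypothetical sketch of that flow; the feature matrix and the n_ivf formula here are assumptions for illustration, not lines from this diff:

import faiss
import numpy as np

big_npy = np.random.rand(10000, 256).astype("float32")  # stand-in feature matrix
n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
index = faiss.index_factory(big_npy.shape[1], "IVF%s,Flat" % n_ivf)
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 1
index.train(big_npy)
index.add(big_npy)
faiss.write_index(
    index, "added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe)
)
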
@@ -1044,7 +1050,7 @@ def train1key(

    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    yield get_info_str("adding index")
    batch_size_add = 8192

@@ -1053,11 +1059,11 @@ def train1key(
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    yield get_info_str(
        "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    yield get_info_str(i18n("全流程结束!"))

@@ -1175,8 +1181,10 @@ with gr.Blocks() as app:

                        value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav",
                    )
                    f0method0 = gr.Radio(
                        label=i18n(
                            "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
                        ),
                        choices=["pm", "harvest", "crepe"],
                        value="pm",
                        interactive=True,
                    )

@@ -1233,7 +1241,9 @@ with gr.Blocks() as app:

                    protect0 = gr.Slider(
                        minimum=0,
                        maximum=0.5,
                        label=i18n(
                            "保护清辅音和呼吸声防止电音撕裂等artifact拉满0.5不开启,调低加大保护力度但可能降低索引效果"
                        ),
                        value=0.33,
                        step=0.01,
                        interactive=True,

@@ -1258,7 +1268,7 @@ with gr.Blocks() as app:

                            filter_radius0,
                            resample_sr0,
                            rms_mix_rate0,
                            protect0,
                        ],
                        [vc_output1, vc_output2],
                    )

@@ -1273,8 +1283,10 @@ with gr.Blocks() as app:

                    )
                    opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
                    f0method1 = gr.Radio(
                        label=i18n(
                            "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
                        ),
                        choices=["pm", "harvest", "crepe"],
                        value="pm",
                        interactive=True,
                    )

@@ -1328,7 +1340,9 @@ with gr.Blocks() as app:

                    protect1 = gr.Slider(
                        minimum=0,
                        maximum=0.5,
                        label=i18n(
                            "保护清辅音和呼吸声防止电音撕裂等artifact拉满0.5不开启,调低加大保护力度但可能降低索引效果"
                        ),
                        value=0.33,
                        step=0.01,
                        interactive=True,

@@ -1342,9 +1356,9 @@ with gr.Blocks() as app:

                        file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
                    )
                with gr.Row():
                    format1 = gr.Radio(
                        label=i18n("导出文件格式"),
                        choices=["wav", "flac", "mp3", "m4a"],
                        value="flac",
                        interactive=True,
                    )

@@ -1367,7 +1381,7 @@ with gr.Blocks() as app:

                            resample_sr1,
                            rms_mix_rate1,
                            protect1,
                            format1,
                        ],
                        [vc_output3],
                    )

@@ -1412,10 +1426,12 @@ with gr.Blocks() as app:

                        opt_vocal_root = gr.Textbox(
                            label=i18n("指定输出主人声文件夹"), value="opt"
                        )
                        opt_ins_root = gr.Textbox(
                            label=i18n("指定输出非主人声文件夹"), value="opt"
                        )
                        format0 = gr.Radio(
                            label=i18n("导出文件格式"),
                            choices=["wav", "flac", "mp3", "m4a"],
                            value="flac",
                            interactive=True,
                        )

@@ -1430,7 +1446,7 @@ with gr.Blocks() as app:

                            wav_inputs,
                            opt_ins_root,
                            agg,
                            format0,
                        ],
                        [vc_output4],
                    )

infer_uvr5.py

@@ -1,7 +1,9 @@
import os, sys, torch, warnings, pdb

now_dir = os.getcwd()
sys.path.append(now_dir)
from json import load as ll

warnings.filterwarnings("ignore")
import librosa
import importlib

@@ -15,6 +17,7 @@ import soundfile as sf
from uvr5_pack.lib_v5.nets_new import CascadedNet
from uvr5_pack.lib_v5 import nets_61968KB as nets


class _audio_pre_:
    def __init__(self, agg, model_path, device, is_half):
        self.model_path = model_path

@@ -41,7 +44,7 @@ class _audio_pre_:
        self.mp = mp
        self.model = model

    def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"):
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)

@@ -122,9 +125,11 @@ class _audio_pre_:
            print("%s instruments done" % name)
            sf.write(
                os.path.join(
                    ins_root,
                    "instrument_{}_{}.{}".format(name, self.data["agg"], format),
                ),
                (np.array(wav_instrument) * 32768).astype("int16"),
                self.mp.param["sr"],
            )  #
        if vocal_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):

@@ -139,11 +144,13 @@ class _audio_pre_:
            print("%s vocals done" % name)
            sf.write(
                os.path.join(
                    vocal_root, "vocal_{}_{}.{}".format(name, self.data["agg"], format)
                ),
                (np.array(wav_vocals) * 32768).astype("int16"),
                self.mp.param["sr"],
            )


class _audio_pre_new:
    def __init__(self, agg, model_path, device, is_half):
        self.model_path = model_path

@@ -157,9 +164,9 @@ class _audio_pre_new:

            "agg": agg,
            "high_end_process": "mirroring",
        }
        mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v3.json")
        nout = 64 if "DeReverb" in model_path else 48
        model = CascadedNet(mp.param["bins"] * 2, nout)
        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()

@@ -171,7 +178,9 @@ class _audio_pre_new:
        self.mp = mp
        self.model = model

    def _path_audio_(
        self, music_file, vocal_root=None, ins_root=None, format="flac"
    ):  # the 3 VR models have vocal and ins outputs swapped
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)

@@ -252,9 +261,11 @@ class _audio_pre_new:
            print("%s instruments done" % name)
            sf.write(
                os.path.join(
                    ins_root,
                    "main_vocal_{}_{}.{}".format(name, self.data["agg"], format),
                ),
                (np.array(wav_instrument) * 32768).astype("int16"),
                self.mp.param["sr"],
            )  #
        if vocal_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):

@@ -269,9 +280,10 @@ class _audio_pre_new:
@ -269,9 +280,10 @@ class _audio_pre_new:
print("%s vocals done" % name) print("%s vocals done" % name)
sf.write( sf.write(
os.path.join( os.path.join(
vocal_root, "others_{}_{}.{}".format(name, self.data["agg"],format) vocal_root, "others_{}_{}.{}".format(name, self.data["agg"], format)
), ),
(np.array(wav_vocals) * 32768).astype("int16"),self.mp.param["sr"], (np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
) )
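
A note on the (np.array(wav) * 32768).astype("int16") pattern in these sf.write calls: the network outputs float audio in [-1.0, 1.0], and scaling by 32768 converts it to 16-bit PCM. A minimal check (a sample at exactly +1.0 would overflow int16, so this assumes the signal stays below full scale):

import numpy as np

wav = np.array([-1.0, -0.5, 0.0, 0.5])
print((wav * 32768).astype("int16"))  # [-32768 -16384      0  16384]
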

@@ -283,7 +295,7 @@ if __name__ == "__main__":

    # model_path = "uvr5_weights/VR-DeEchoNormal.pth"
    model_path = "uvr5_weights/DeEchoNormal.pth"
    # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True,agg=10)
    pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=True, agg=10)
    audio_path = "雪雪伴奏对消HP5.wav"
    save_path = "opt"
    pre_fun._path_audio_(audio_path, save_path, save_path)

uvr5_pack/lib_v5/layers_new.py

@@ -4,27 +4,29 @@ import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils


class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)
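
With the default ksize=3, stride=1, pad=1, Conv2DBNActiv preserves the spatial size, which the encoder/decoder skip connections in this file rely on. A quick shape check of the equivalent Sequential (the concrete sizes are arbitrary):

import torch
import torch.nn as nn

block = nn.Sequential(
    nn.Conv2d(2, 16, kernel_size=3, stride=1, padding=1, dilation=1, bias=False),
    nn.BatchNorm2d(16),
    nn.ReLU(),
)
x = torch.randn(1, 2, 64, 128)  # (batch, channels, freq, time)
print(block(x).shape)           # torch.Size([1, 16, 64, 128])
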

class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)

@@ -38,15 +40,16 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)

        if skip is not None:
            skip = spec_utils.crop_center(skip, x)

@@ -62,12 +65,11 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        self.conv3 = Conv2DBNActiv(

@@ -84,7 +86,9 @@ class ASPPModule(nn.Module):
    def forward(self, x):
        _, _, h, w = x.size()
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)

@@ -99,19 +103,14 @@ class ASPPModule(nn.Module):
class LSTMModule(nn.Module):
    def __init__(self, nin_conv, nin_lstm, nout_lstm):
        super(LSTMModule, self).__init__()
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
        self.lstm = nn.LSTM(
            input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
        )
        self.dense = nn.Sequential(
            nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
        )

    def forward(self, x):

uvr5_pack/lib_v5/nets_new.py

@@ -3,9 +3,11 @@ from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_new as layers


class BaseNet(nn.Module):
    def __init__(
        self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
    ):
        super(BaseNet, self).__init__()
        self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1)
        self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1)

@@ -38,8 +40,8 @@ class BaseNet(nn.Module):
        return h


class CascadedNet(nn.Module):
    def __init__(self, n_fft, nout=32, nout_lstm=128):
        super(CascadedNet, self).__init__()

@@ -50,24 +52,30 @@ class CascadedNet(nn.Module):
        self.stg1_low_band_net = nn.Sequential(
            BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
            layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
        )
        self.stg1_high_band_net = BaseNet(
            2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
        )

        self.stg2_low_band_net = nn.Sequential(
            BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
            layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
        )
        self.stg2_high_band_net = BaseNet(
            nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
        )

        self.stg3_full_band_net = BaseNet(
            3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
        )

        self.out = nn.Conv2d(nout, 2, 1, bias=False)
        self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

    def forward(self, x):
        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
        l1_in = x[:, :, :bandw]

@@ -89,7 +97,7 @@ class CascadedNet(nn.Module):

        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:

@@ -98,7 +106,7 @@ class CascadedNet(nn.Module):
            aux = F.pad(
                input=aux,
                pad=(0, 0, 0, self.output_bin - aux.size()[2]),
                mode="replicate",
            )
            return mask, aux
        else:

@@ -108,17 +116,17 @@ class CascadedNet(nn.Module):
        mask = self.forward(x)

        if self.offset > 0:
            mask = mask[:, :, :, self.offset : -self.offset]
            assert mask.size()[3] > 0

        return mask

    def predict(self, x, aggressiveness=None):
        mask = self.forward(x)
        pred_mag = x * mask

        if self.offset > 0:
            pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
            assert pred_mag.size()[3] > 0

        return pred_mag

vc_infer_pipeline.py

@@ -2,7 +2,7 @@ import numpy as np, parselmouth, torch, pdb
from time import time as ttime
import torch.nn.functional as F
import scipy.signal as signal
import pyworld, os, traceback, faiss, librosa, torchcrepe
from scipy import signal
from functools import lru_cache

@@ -162,7 +162,7 @@ class VC(object):
        big_npy,
        index_rate,
        version,
        protect,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:

@@ -184,8 +184,8 @@ class VC(object):
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
        if protect < 0.5:
            feats0 = feats.clone()
        if (
            isinstance(index, type(None)) == False
            and isinstance(big_npy, type(None)) == False

@@ -211,8 +211,10 @@ class VC(object):
        )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:

@@ -221,13 +223,13 @@ class VC(object):
            pitch = pitch[:, :p_len]
            pitchf = pitchf[:, :p_len]

        if protect < 0.5:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch != None and pitchf != None:
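
The protect block above turns the f0 track into a per-frame blend weight: voiced frames (pitchf > 0) keep the index-mixed features, while unvoiced frames are pulled back toward the untouched HuBERT features feats0, protecting consonants and breaths from conversion artifacts. A small numeric sketch of the mask construction:

import torch

protect = 0.33
pitchf = torch.tensor([[0.0, 220.0, 0.0, 233.0]])  # per-frame f0, 0 = unvoiced
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1        # voiced: weight 1, keep converted feats
pitchff[pitchf < 1] = protect  # unvoiced: weight 0.33, lean on feats0
print(pitchff)                 # tensor([[0.3300, 1.0000, 0.3300, 1.0000]])
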

@@ -356,7 +358,7 @@ class VC(object):

                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:

@@ -373,7 +375,7 @@ class VC(object):
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t

@@ -391,7 +393,7 @@ class VC(object):
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:

@@ -408,7 +410,7 @@ class VC(object):
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
        audio_opt = np.concatenate(audio_opt)