Reformat and rewrite _get_name_params (#57)

* Reformat

* rewrite _get_name_params

* Add workflow for automatic formatting

* Revert "Add workflow for automatic formatting"

This reverts commit 9111c5dbc1.

* revert Retrieval_based_Voice_Conversion_WebUI.ipynb

---------

Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
Authored by Ftps, 2023-04-15 20:44:24 +09:00; committed by GitHub.
parent aaa893c4b1
commit c8261b2ccc
45 changed files with 4878 additions and 2456 deletions


@@ -1,13 +1,13 @@
 ########################硬件参数########################
-#填写cuda:x, cpu 或 mps, x指代第几张卡只支持 N卡 / Apple Silicon 加速
+# 填写cuda:x, cpu 或 mps, x指代第几张卡只支持 N卡 / Apple Silicon 加速
 device = "cuda:0"
-#9-10-20-30-40系显卡无脑True不影响质量>=20显卡开启有加速
+# 9-10-20-30-40系显卡无脑True不影响质量>=20显卡开启有加速
 is_half = True
-#默认0用上所有线程写数字限制CPU资源使用
+# 默认0用上所有线程写数字限制CPU资源使用
 n_cpu = 0
 ########################硬件参数########################
@@ -16,31 +16,38 @@ n_cpu = 0
 ########################命令行参数########################
 import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument("--port", type=int, default=7865, help="Listen port")
 parser.add_argument("--pycmd", type=str, default="python", help="Python command")
-parser.add_argument("--colab", action='store_true', help="Launch in colab")
-parser.add_argument("--noparallel", action='store_true', help="Disable parallel processing")
-parser.add_argument("--noautoopen", action='store_true', help="Do not open in browser automatically")
+parser.add_argument("--colab", action="store_true", help="Launch in colab")
+parser.add_argument(
+    "--noparallel", action="store_true", help="Disable parallel processing"
+)
+parser.add_argument(
+    "--noautoopen", action="store_true", help="Do not open in browser automatically"
+)
 cmd_opts = parser.parse_args()
-python_cmd=cmd_opts.pycmd
-listen_port=cmd_opts.port
-iscolab=cmd_opts.colab
-noparallel=cmd_opts.noparallel
-noautoopen=cmd_opts.noautoopen
+python_cmd = cmd_opts.pycmd
+listen_port = cmd_opts.port
+iscolab = cmd_opts.colab
+noparallel = cmd_opts.noparallel
+noautoopen = cmd_opts.noautoopen
 ########################命令行参数########################
 import sys
 import torch
 # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+.
 # check `getattr` and try it for compatibility
 def has_mps() -> bool:
     if sys.platform != "darwin":
         return False
     else:
-        if not getattr(torch, 'has_mps', False): return False
+        if not getattr(torch, "has_mps", False):
+            return False
         try:
             torch.zeros(1).to(torch.device("mps"))
             return True
@@ -48,32 +55,34 @@ def has_mps() -> bool:
             return False
-if(not torch.cuda.is_available()):
+if not torch.cuda.is_available():
     if has_mps():
         print("没有发现支持的N卡, 使用MPS进行推理")
         device = "mps"
     else:
        print("没有发现支持的N卡, 使用CPU进行推理")
         device = "cpu"
     is_half = False
-if(device not in ["cpu", "mps"]):
+if device not in ["cpu", "mps"]:
     gpu_name = torch.cuda.get_device_name(int(device.split(":")[-1]))
-    if("16" in gpu_name or "MX" in gpu_name):
+    if "16" in gpu_name or "MX" in gpu_name:
         print("16系显卡/MX系显卡强制单精度")
         is_half = False
 from multiprocessing import cpu_count
-if(n_cpu==0): n_cpu=cpu_count()
-if(is_half):
-    #6G显存配置
+if n_cpu == 0:
+    n_cpu = cpu_count()
+if is_half:
+    # 6G显存配置
     x_pad = 3
     x_query = 10
     x_center = 60
     x_max = 65
 else:
-    #5G显存配置
+    # 5G显存配置
     x_pad = 1
     x_query = 6
     x_center = 38
     x_max = 41


@@ -5,40 +5,43 @@ person = "Shiroha/shiroha.pth"
 exported_path = "model.onnx"
 cpt = torch.load(person, map_location="cpu")
-cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
+cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
 print(*cpt["config"])
 net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
 net_g.load_state_dict(cpt["weight"], strict=False)
 test_phone = torch.rand(1, 200, 256)
 test_phone_lengths = torch.tensor([200]).long()
-test_pitch = torch.randint(size=(1 ,200),low=5,high=255)
+test_pitch = torch.randint(size=(1, 200), low=5, high=255)
 test_pitchf = torch.rand(1, 200)
 test_ds = torch.LongTensor([0])
 test_rnd = torch.rand(1, 192, 200)
 input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
-output_names = ["audio", ]
-device="cpu"
-torch.onnx.export(net_g,
-    (
-        test_phone.to(device),
-        test_phone_lengths.to(device),
-        test_pitch.to(device),
-        test_pitchf.to(device),
-        test_ds.to(device),
-        test_rnd.to(device)
-    ),
-    exported_path,
-    dynamic_axes={
-        "phone": [1],
-        "pitch": [1],
-        "pitchf": [1],
-        "rnd": [2],
-    },
-    do_constant_folding=False,
-    opset_version=16,
-    verbose=False,
-    input_names=input_names,
-    output_names=output_names)
+output_names = [
+    "audio",
+]
+device = "cpu"
+torch.onnx.export(
+    net_g,
+    (
+        test_phone.to(device),
+        test_phone_lengths.to(device),
+        test_pitch.to(device),
+        test_pitchf.to(device),
+        test_ds.to(device),
+        test_rnd.to(device),
+    ),
+    exported_path,
+    dynamic_axes={
+        "phone": [1],
+        "pitch": [1],
+        "pitchf": [1],
+        "rnd": [2],
+    },
+    do_constant_folding=False,
+    opset_version=16,
+    verbose=False,
+    input_names=input_names,
+    output_names=output_names,
+)
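In the export call above, dynamic_axes marks axis 1 of phone/pitch/pitchf and axis 2 of rnd as variable-length, so the graph accepts any frame count, not just the 200 frames used for tracing. A hedged smoke test, assuming onnxruntime is installed; the 300-frame shapes and dtypes are our guesses from the dummy tensors above, not part of the commit:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx")
audio = sess.run(
    ["audio"],
    {
        "phone": np.random.rand(1, 300, 256).astype(np.float32),
        "phone_lengths": np.array([300], dtype=np.int64),
        "pitch": np.random.randint(5, 255, size=(1, 300)).astype(np.int64),
        "pitchf": np.random.rand(1, 300).astype(np.float32),
        "ds": np.array([0], dtype=np.int64),
        "rnd": np.random.rand(1, 192, 300).astype(np.float32),
    },
)[0]
print(audio.shape)  # 300 frames work because the time axes were exported as dynamic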


@@ -1,21 +1,26 @@
-import os,traceback,sys,parselmouth
+import os, traceback, sys, parselmouth
 import librosa
 import pyworld
 from scipy.io import wavfile
-import numpy as np,logging
-logging.getLogger('numba').setLevel(logging.WARNING)
+import numpy as np, logging
+
+logging.getLogger("numba").setLevel(logging.WARNING)
 from multiprocessing import Process
 exp_dir = sys.argv[1]
-f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
+f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
 def printt(strr):
     print(strr)
     f.write("%s\n" % strr)
     f.flush()
 n_p = int(sys.argv[2])
 f0method = sys.argv[3]
 class FeatureInput(object):
     def __init__(self, samplerate=16000, hop_size=160):
         self.fs = samplerate
@@ -27,21 +32,30 @@ class FeatureInput(object):
         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
-    def compute_f0(self, path,f0_method):
+    def compute_f0(self, path, f0_method):
         x, sr = librosa.load(path, self.fs)
-        p_len=x.shape[0]//self.hop
+        p_len = x.shape[0] // self.hop
         assert sr == self.fs
-        if(f0_method=="pm"):
+        if f0_method == "pm":
             time_step = 160 / 16000 * 1000
             f0_min = 50
             f0_max = 1100
-            f0 = parselmouth.Sound(x, sr).to_pitch_ac(
-                time_step=time_step / 1000, voicing_threshold=0.6,
-                pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
-            pad_size=(p_len - len(f0) + 1) // 2
-            if(pad_size>0 or p_len - len(f0) - pad_size>0):
-                f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-        elif(f0_method=="harvest"):
+            f0 = (
+                parselmouth.Sound(x, sr)
+                .to_pitch_ac(
+                    time_step=time_step / 1000,
+                    voicing_threshold=0.6,
+                    pitch_floor=f0_min,
+                    pitch_ceiling=f0_max,
+                )
+                .selected_array["frequency"]
+            )
+            pad_size = (p_len - len(f0) + 1) // 2
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
+        elif f0_method == "harvest":
             f0, t = pyworld.harvest(
                 x.astype(np.double),
                 fs=sr,
@@ -50,7 +64,7 @@ class FeatureInput(object):
                 frame_period=1000 * self.hop / sr,
             )
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
-        elif(f0_method=="dio"):
+        elif f0_method == "dio":
             f0, t = pyworld.dio(
                 x.astype(np.double),
                 fs=sr,
@@ -77,45 +91,67 @@ class FeatureInput(object):
         )
         return f0_coarse
-    def go(self,paths,f0_method):
-        if (len(paths) == 0): printt("no-f0-todo")
+    def go(self, paths, f0_method):
+        if len(paths) == 0:
+            printt("no-f0-todo")
         else:
-            printt("todo-f0-%s"%len(paths))
-            n=max(len(paths)//5,1)#每个进程最多打印5条
-            for idx,(inp_path,opt_path1,opt_path2) in enumerate(paths):
+            printt("todo-f0-%s" % len(paths))
+            n = max(len(paths) // 5, 1)  # 每个进程最多打印5条
+            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                 try:
-                    if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path))
-                    if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue
-                    featur_pit = self.compute_f0(inp_path,f0_method)
-                    np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf
+                    if idx % n == 0:
+                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
+                    if (
+                        os.path.exists(opt_path1 + ".npy") == True
+                        and os.path.exists(opt_path2 + ".npy") == True
+                    ):
+                        continue
+                    featur_pit = self.compute_f0(inp_path, f0_method)
+                    np.save(
+                        opt_path2,
+                        featur_pit,
+                        allow_pickle=False,
+                    )  # nsf
                     coarse_pit = self.coarse_f0(featur_pit)
-                    np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori
+                    np.save(
+                        opt_path1,
+                        coarse_pit,
+                        allow_pickle=False,
+                    )  # ori
                 except:
-                    printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc()))
-if __name__=='__main__':
+                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
+
+if __name__ == "__main__":
     # exp_dir=r"E:\codes\py39\dataset\mi-test"
     # n_p=16
     # f = open("%s/log_extract_f0.log"%exp_dir, "w")
     printt(sys.argv)
     featureInput = FeatureInput()
-    paths=[]
-    inp_root= "%s/1_16k_wavs"%(exp_dir)
-    opt_root1="%s/2a_f0"%(exp_dir)
-    opt_root2="%s/2b-f0nsf"%(exp_dir)
-    os.makedirs(opt_root1,exist_ok=True)
-    os.makedirs(opt_root2,exist_ok=True)
+    paths = []
+    inp_root = "%s/1_16k_wavs" % (exp_dir)
+    opt_root1 = "%s/2a_f0" % (exp_dir)
+    opt_root2 = "%s/2b-f0nsf" % (exp_dir)
+    os.makedirs(opt_root1, exist_ok=True)
+    os.makedirs(opt_root2, exist_ok=True)
     for name in sorted(list(os.listdir(inp_root))):
-        inp_path="%s/%s"%(inp_root,name)
-        if ("spec" in inp_path): continue
-        opt_path1="%s/%s"%(opt_root1,name)
-        opt_path2="%s/%s"%(opt_root2,name)
-        paths.append([inp_path,opt_path1,opt_path2])
+        inp_path = "%s/%s" % (inp_root, name)
+        if "spec" in inp_path:
+            continue
+        opt_path1 = "%s/%s" % (opt_root1, name)
+        opt_path2 = "%s/%s" % (opt_root2, name)
+        paths.append([inp_path, opt_path1, opt_path2])
-    ps=[]
+    ps = []
     for i in range(n_p):
-        p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,))
+        p = Process(
+            target=featureInput.go,
+            args=(
+                paths[i::n_p],
+                f0method,
+            ),
+        )
         p.start()
         ps.append(p)
     for p in ps:
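The paths[i::n_p] slicing above is what shards the work: process i takes every n_p-th item starting at index i, so no work queue is needed. A toy illustration (the file names are invented):

paths = ["a.wav", "b.wav", "c.wav", "d.wav", "e.wav"]
n_p = 2
for i in range(n_p):
    print(i, paths[i::n_p])
# 0 ['a.wav', 'c.wav', 'e.wav']
# 1 ['b.wav', 'd.wav']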


@@ -1,33 +1,41 @@
-import os,sys,traceback
+import os, sys, traceback
 # device=sys.argv[1]
-n_part=int(sys.argv[2])
-i_part=int(sys.argv[3])
+n_part = int(sys.argv[2])
+i_part = int(sys.argv[3])
 if len(sys.argv) == 5:
-    exp_dir=sys.argv[4]
+    exp_dir = sys.argv[4]
 else:
-    i_gpu=sys.argv[4]
-    exp_dir=sys.argv[5]
-    os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+    i_gpu = sys.argv[4]
+    exp_dir = sys.argv[5]
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
 import torch
 import torch.nn.functional as F
 import soundfile as sf
 import numpy as np
 from fairseq import checkpoint_utils
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
+f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
 def printt(strr):
     print(strr)
     f.write("%s\n" % strr)
     f.flush()
 printt(sys.argv)
 model_path = "hubert_base.pt"
 printt(exp_dir)
-wavPath = "%s/1_16k_wavs"%exp_dir
-outPath = "%s/3_feature256"%exp_dir
-os.makedirs(outPath,exist_ok=True)
+wavPath = "%s/1_16k_wavs" % exp_dir
+outPath = "%s/3_feature256" % exp_dir
+os.makedirs(outPath, exist_ok=True)
 # wave must be 16k, hop_size=320
 def readwave(wav_path, normalize=False):
     wav, sr = sf.read(wav_path)
@@ -41,6 +49,8 @@ def readwave(wav_path, normalize=False):
     feats = F.layer_norm(feats, feats.shape)
     feats = feats.view(1, -1)
     return feats
+
+
 # HuBERT model
 printt("load model(s) from {}".format(model_path))
 models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -49,27 +59,32 @@ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
 )
 model = models[0]
 model = model.to(device)
-printt("move model to %s"%device)
-if device != "cpu": model = model.half()
+printt("move model to %s" % device)
+if device != "cpu":
+    model = model.half()
 model.eval()
-todo=sorted(list(os.listdir(wavPath)))[i_part::n_part]
-n = max(1,len(todo) // 10) # 最多打印十条
-if(len(todo)==0):printt("no-feature-todo")
+todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
+n = max(1, len(todo) // 10)  # 最多打印十条
+if len(todo) == 0:
+    printt("no-feature-todo")
 else:
-    printt("all-feature-%s"%len(todo))
-    for idx,file in enumerate(todo):
+    printt("all-feature-%s" % len(todo))
+    for idx, file in enumerate(todo):
         try:
             if file.endswith(".wav"):
-                wav_path = "%s/%s"%(wavPath,file)
-                out_path = "%s/%s"%(outPath,file.replace("wav","npy"))
-                if(os.path.exists(out_path)):continue
+                wav_path = "%s/%s" % (wavPath, file)
+                out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))
+                if os.path.exists(out_path):
+                    continue
                 feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
                 padding_mask = torch.BoolTensor(feats.shape).fill_(False)
                 inputs = {
-                    "source": feats.half().to(device) if device != "cpu" else feats.to(device),
+                    "source": feats.half().to(device)
+                    if device != "cpu"
+                    else feats.to(device),
                     "padding_mask": padding_mask.to(device),
                     "output_layer": 9,  # layer 9
                 }
@@ -78,11 +93,12 @@ else:
             feats = model.final_proj(logits[0])
             feats = feats.squeeze(0).float().cpu().numpy()
-            if(np.isnan(feats).sum()==0):
+            if np.isnan(feats).sum() == 0:
                 np.save(out_path, feats, allow_pickle=False)
             else:
-                printt("%s-contains nan"%file)
-            if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(len(todo),idx,file,feats.shape))
+                printt("%s-contains nan" % file)
+            if idx % n == 0:
+                printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape))
         except:
             printt(traceback.format_exc())
 printt("all-feature-done")
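One caveat worth noting in the loop above: file.replace("wav", "npy") substitutes every occurrence of "wav", not just the extension, so a file named "wavetable.wav" would map to "npyetable.npy". A safer variant (our suggestion, not part of the commit) swaps only the extension:

import os

def npy_name(wav_name):
    # Replace just the extension, leaving the stem untouched.
    return os.path.splitext(wav_name)[0] + ".npy"

print(npy_name("wavetable.wav"))  # wavetable.npy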


@@ -7,9 +7,10 @@ pattern = r"""i18n\((["'][^"']+["'])\)"""
 # Initialize the dictionary to store key-value pairs
 data = {}
+
 def process(fn: str):
     global data
-    with open(fn, 'r', encoding='utf-8') as f:
+    with open(fn, "r", encoding="utf-8") as f:
         contents = f.read()
         matches = re.findall(pattern, contents)
         for key in matches:
@@ -17,12 +18,13 @@ def process(fn: str):
             print("extract:", key)
             data[key] = key
+
 print("processing infer-web.py")
-process('infer-web.py')
+process("infer-web.py")
 print("processing gui.py")
-process('gui.py')
+process("gui.py")
 # Save as a JSON file
-with open('./locale/zh_CN.json', 'w', encoding='utf-8') as f:
+with open("./locale/zh_CN.json", "w", encoding="utf-8") as f:
     json.dump(data, f, ensure_ascii=False, indent=4)
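For reference, the extraction pattern at the top of this file captures the quoted literal inside each i18n(...) call, quotes included. A quick check (the sample string is invented):

import re

pattern = r"""i18n\((["'][^"']+["'])\)"""
print(re.findall(pattern, 'sg.Button(i18n("开始音频转换"))'))
# ['"开始音频转换"']  -> the surrounding quotes are part of the captured key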

gui.py (572 changed lines)

@@ -3,32 +3,36 @@ import sounddevice as sd
 import noisereduce as nr
 import numpy as np
 from fairseq import checkpoint_utils
-import librosa,torch,parselmouth,faiss,time,threading
+import librosa, torch, parselmouth, faiss, time, threading
 import torch.nn.functional as F
 import torchaudio.transforms as tat
-#import matplotlib.pyplot as plt
+# import matplotlib.pyplot as plt
 from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
 from webui_locale import I18nAuto
 i18n = I18nAuto()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 class RVC:
-    def __init__(self,key,hubert_path,pth_path,index_path,npy_path,index_rate) -> None:
-        '''
+    def __init__(
+        self, key, hubert_path, pth_path, index_path, npy_path, index_rate
+    ) -> None:
+        """
         初始化
-        '''
-        self.f0_up_key=key
+        """
+        self.f0_up_key = key
         self.time_step = 160 / 16000 * 1000
         self.f0_min = 50
         self.f0_max = 1100
         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
-        self.index=faiss.read_index(index_path)
-        self.index_rate=index_rate
-        '''NOT YET USED'''
-        self.big_npy=np.load(npy_path)
+        self.index = faiss.read_index(index_path)
+        self.index_rate = index_rate
+        """NOT YET USED"""
+        self.big_npy = np.load(npy_path)
         model_path = hubert_path
         print("load model(s) from {}".format(model_path))
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -41,9 +45,9 @@ class RVC:
         self.model.eval()
         cpt = torch.load(pth_path, map_location="cpu")
         tgt_sr = cpt["config"][-1]
-        cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
-        if_f0=cpt.get("f0",1)
-        if(if_f0==1):
+        cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+        if_f0 = cpt.get("f0", 1)
+        if if_f0 == 1:
             self.net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True)
         else:
             self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
@@ -52,36 +56,43 @@ class RVC:
         self.net_g.eval().to(device)
         self.net_g.half()
-    def get_f0_coarse(self,f0):
+
+    def get_f0_coarse(self, f0):
         f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
+            self.f0_mel_max - self.f0_mel_min
+        ) + 1
         f0_mel[f0_mel <= 1] = 1
         f0_mel[f0_mel > 255] = 255
         # f0_mel[f0_mel > 188] = 188
         f0_coarse = np.rint(f0_mel).astype(np.int)
         return f0_coarse
-    def get_f0(self,x, p_len,f0_up_key=0):
-        f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
-            time_step=self.time_step / 1000, voicing_threshold=0.6,
-            pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
-        pad_size=(p_len - len(f0) + 1) // 2
-        if(pad_size>0 or p_len - len(f0) - pad_size>0):
-            f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+
+    def get_f0(self, x, p_len, f0_up_key=0):
+        f0 = (
+            parselmouth.Sound(x, 16000)
+            .to_pitch_ac(
+                time_step=self.time_step / 1000,
+                voicing_threshold=0.6,
+                pitch_floor=self.f0_min,
+                pitch_ceiling=self.f0_max,
+            )
+            .selected_array["frequency"]
+        )
+        pad_size = (p_len - len(f0) + 1) // 2
+        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
         f0 *= pow(2, f0_up_key / 12)
         # f0=suofang(f0)
         f0bak = f0.copy()
-        f0_coarse=self.get_f0_coarse(f0)
+        f0_coarse = self.get_f0_coarse(f0)
         return f0_coarse, f0bak
-    def infer(self,feats:torch.Tensor) -> np.ndarray:
-        '''
+
+    def infer(self, feats: torch.Tensor) -> np.ndarray:
+        """
         推理函数
-        '''
-        audio=feats.clone().cpu().numpy()
+        """
+        audio = feats.clone().cpu().numpy()
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -96,209 +107,389 @@ class RVC:
             feats = self.model.final_proj(logits[0])
         ####索引优化
-        if(isinstance(self.index,type(None))==False and isinstance(self.big_npy,type(None))==False and self.index_rate!=0):
+        if (
+            isinstance(self.index, type(None)) == False
+            and isinstance(self.big_npy, type(None)) == False
+            and self.index_rate != 0
+        ):
             npy = feats[0].cpu().numpy().astype("float32")
             _, I = self.index.search(npy, 1)
-            npy=self.big_npy[I.squeeze()].astype("float16")
-            feats = torch.from_numpy(npy).unsqueeze(0).to(device)*self.index_rate + (1-self.index_rate)*feats
-        feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1)
+            npy = self.big_npy[I.squeeze()].astype("float16")
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate
+                + (1 - self.index_rate) * feats
+            )
+        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         torch.cuda.synchronize()
         # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
-        p_len = min(feats.shape[1],12000)#
+        p_len = min(feats.shape[1], 12000)  #
         print(feats.shape)
-        pitch, pitchf = self.get_f0(audio, p_len,self.f0_up_key)
-        p_len = min(feats.shape[1],12000,pitch.shape[0])#太大了爆显存
+        pitch, pitchf = self.get_f0(audio, p_len, self.f0_up_key)
+        p_len = min(feats.shape[1], 12000, pitch.shape[0])  # 太大了爆显存
         torch.cuda.synchronize()
         # print(feats.shape,pitch.shape)
-        feats = feats[:,:p_len, :]
+        feats = feats[:, :p_len, :]
         pitch = pitch[:p_len]
         pitchf = pitchf[:p_len]
         p_len = torch.LongTensor([p_len]).to(device)
         pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
         pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
-        ii=0#sid
-        sid=torch.LongTensor([ii]).to(device)
+        ii = 0  # sid
+        sid = torch.LongTensor([ii]).to(device)
         with torch.no_grad():
-            infered_audio = self.net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float()#nsf
+            infered_audio = (
+                self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+                .data.cpu()
+                .float()
+            )  # nsf
         torch.cuda.synchronize()
         return infered_audio
+
 class Config:
     def __init__(self) -> None:
-        self.hubert_path:str=''
-        self.pth_path:str=''
-        self.index_path:str=''
-        self.npy_path:str=''
-        self.pitch:int=12
-        self.samplerate:int=44100
-        self.block_time:float=1.0#s
-        self.buffer_num:int=1
-        self.threhold:int=-30
-        self.crossfade_time:float=0.08
-        self.extra_time:float=0.04
-        self.I_noise_reduce=False
-        self.O_noise_reduce=False
-        self.index_rate=0.3
+        self.hubert_path: str = ""
+        self.pth_path: str = ""
+        self.index_path: str = ""
+        self.npy_path: str = ""
+        self.pitch: int = 12
+        self.samplerate: int = 44100
+        self.block_time: float = 1.0  # s
+        self.buffer_num: int = 1
+        self.threhold: int = -30
+        self.crossfade_time: float = 0.08
+        self.extra_time: float = 0.04
+        self.I_noise_reduce = False
+        self.O_noise_reduce = False
+        self.index_rate = 0.3
+
 class GUI:
     def __init__(self) -> None:
-        self.config=Config()
-        self.flag_vc=False
+        self.config = Config()
+        self.flag_vc = False
         self.launcher()
+
     def launcher(self):
-        sg.theme('LightBlue3')
-        input_devices,output_devices,_, _=self.get_devices()
-        layout=[
+        sg.theme("LightBlue3")
+        input_devices, output_devices, _, _ = self.get_devices()
+        layout = [
             [
-                sg.Frame(title=i18n('加载模型'),layout=[
-                    [sg.Input(default_text='TEMP\\hubert_base.pt',key='hubert_path'),sg.FileBrowse(i18n('Hubert模型'))],
-                    [sg.Input(default_text='TEMP\\atri.pth',key='pth_path'),sg.FileBrowse(i18n('选择.pth文件'))],
-                    [sg.Input(default_text='TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index',key='index_path'),sg.FileBrowse(i18n('选择.index文件'))],
-                    [sg.Input(default_text='TEMP\\big_src_feature_atri.npy',key='npy_path'),sg.FileBrowse(i18n('选择.npy文件'))]
-                ])
+                sg.Frame(
+                    title=i18n("加载模型"),
+                    layout=[
+                        [
+                            sg.Input(
+                                default_text="TEMP\\hubert_base.pt", key="hubert_path"
+                            ),
+                            sg.FileBrowse(i18n("Hubert模型")),
+                        ],
+                        [
+                            sg.Input(default_text="TEMP\\atri.pth", key="pth_path"),
+                            sg.FileBrowse(i18n("选择.pth文件")),
+                        ],
+                        [
+                            sg.Input(
+                                default_text="TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index",
+                                key="index_path",
+                            ),
+                            sg.FileBrowse(i18n("选择.index文件")),
+                        ],
+                        [
+                            sg.Input(
+                                default_text="TEMP\\big_src_feature_atri.npy",
+                                key="npy_path",
+                            ),
+                            sg.FileBrowse(i18n("选择.npy文件")),
+                        ],
+                    ],
+                )
             ],
             [
-                sg.Frame(layout=[
-                    [sg.Text(i18n("输入设备")),sg.Combo(input_devices,key='sg_input_device',default_value=input_devices[sd.default.device[0]])],
-                    [sg.Text(i18n("输出设备")),sg.Combo(output_devices,key='sg_output_device',default_value=output_devices[sd.default.device[1]])]
-                ],title=i18n("音频设备(请使用同种类驱动)"))
+                sg.Frame(
+                    layout=[
+                        [
+                            sg.Text(i18n("输入设备")),
+                            sg.Combo(
+                                input_devices,
+                                key="sg_input_device",
+                                default_value=input_devices[sd.default.device[0]],
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("输出设备")),
+                            sg.Combo(
+                                output_devices,
+                                key="sg_output_device",
+                                default_value=output_devices[sd.default.device[1]],
+                            ),
+                        ],
+                    ],
+                    title=i18n("音频设备(请使用同种类驱动)"),
+                )
             ],
             [
-                sg.Frame(layout=[
-                    [sg.Text(i18n("响应阈值")),sg.Slider(range=(-60,0),key='threhold',resolution=1,orientation='h',default_value=-30)],
-                    [sg.Text(i18n("音调设置")),sg.Slider(range=(-24,24),key='pitch',resolution=1,orientation='h',default_value=12)],
-                    [sg.Text(i18n('Index Rate')),sg.Slider(range=(0.0,1.0),key='index_rate',resolution=0.01,orientation='h',default_value=0.5)]
-                ],title=i18n("常规设置")),
-                sg.Frame(layout=[
-                    [sg.Text(i18n("采样长度")),sg.Slider(range=(0.1,3.0),key='block_time',resolution=0.1,orientation='h',default_value=1.0)],
-                    [sg.Text(i18n("淡入淡出长度")),sg.Slider(range=(0.01,0.15),key='crossfade_length',resolution=0.01,orientation='h',default_value=0.08)],
-                    [sg.Text(i18n("额外推理时长")),sg.Slider(range=(0.05,3.00),key='extra_time',resolution=0.01,orientation='h',default_value=0.05)],
-                    [sg.Checkbox(i18n('输入降噪'),key='I_noise_reduce'),sg.Checkbox(i18n('输出降噪'),key='O_noise_reduce')]
-                ],title=i18n("性能设置"))
+                sg.Frame(
+                    layout=[
+                        [
+                            sg.Text(i18n("响应阈值")),
+                            sg.Slider(
+                                range=(-60, 0),
+                                key="threhold",
+                                resolution=1,
+                                orientation="h",
+                                default_value=-30,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("音调设置")),
+                            sg.Slider(
+                                range=(-24, 24),
+                                key="pitch",
+                                resolution=1,
+                                orientation="h",
+                                default_value=12,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("Index Rate")),
+                            sg.Slider(
+                                range=(0.0, 1.0),
+                                key="index_rate",
+                                resolution=0.01,
+                                orientation="h",
+                                default_value=0.5,
+                            ),
+                        ],
+                    ],
+                    title=i18n("常规设置"),
+                ),
+                sg.Frame(
+                    layout=[
+                        [
+                            sg.Text(i18n("采样长度")),
+                            sg.Slider(
+                                range=(0.1, 3.0),
+                                key="block_time",
+                                resolution=0.1,
+                                orientation="h",
+                                default_value=1.0,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("淡入淡出长度")),
+                            sg.Slider(
+                                range=(0.01, 0.15),
+                                key="crossfade_length",
+                                resolution=0.01,
+                                orientation="h",
+                                default_value=0.08,
+                            ),
+                        ],
+                        [
+                            sg.Text(i18n("额外推理时长")),
+                            sg.Slider(
+                                range=(0.05, 3.00),
+                                key="extra_time",
+                                resolution=0.01,
+                                orientation="h",
+                                default_value=0.05,
+                            ),
+                        ],
+                        [
+                            sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"),
+                            sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"),
+                        ],
+                    ],
+                    title=i18n("性能设置"),
+                ),
             ],
-            [sg.Button(i18n("开始音频转换"),key='start_vc'),sg.Button(i18n("停止音频转换"),key='stop_vc'),sg.Text(i18n("推理时间(ms):")),sg.Text("0",key='infer_time')]
+            [
+                sg.Button(i18n("开始音频转换"), key="start_vc"),
+                sg.Button(i18n("停止音频转换"), key="stop_vc"),
+                sg.Text(i18n("推理时间(ms):")),
+                sg.Text("0", key="infer_time"),
+            ],
         ]
-        self.window=sg.Window("RVC - GUI",layout=layout)
+        self.window = sg.Window("RVC - GUI", layout=layout)
         self.event_handler()
+
     def event_handler(self):
         while True:
             event, values = self.window.read()
-            if event ==sg.WINDOW_CLOSED:
-                self.flag_vc=False
+            if event == sg.WINDOW_CLOSED:
+                self.flag_vc = False
                 exit()
-            if event == 'start_vc' and self.flag_vc==False:
+            if event == "start_vc" and self.flag_vc == False:
                 self.set_values(values)
                 print(str(self.config.__dict__))
-                print('using_cuda:'+str(torch.cuda.is_available()))
+                print("using_cuda:" + str(torch.cuda.is_available()))
                 self.start_vc()
-            if event=='stop_vc'and self.flag_vc==True:
+            if event == "stop_vc" and self.flag_vc == True:
                 self.flag_vc = False
-    def set_values(self,values):
-        self.set_devices(values["sg_input_device"],values['sg_output_device'])
-        self.config.hubert_path=values['hubert_path']
-        self.config.pth_path=values['pth_path']
-        self.config.index_path=values['index_path']
-        self.config.npy_path=values['npy_path']
-        self.config.threhold=values['threhold']
-        self.config.pitch=values['pitch']
-        self.config.block_time=values['block_time']
-        self.config.crossfade_time=values['crossfade_length']
-        self.config.extra_time=values['extra_time']
-        self.config.I_noise_reduce=values['I_noise_reduce']
-        self.config.O_noise_reduce=values['O_noise_reduce']
-        self.config.index_rate=values['index_rate']
+
+    def set_values(self, values):
+        self.set_devices(values["sg_input_device"], values["sg_output_device"])
+        self.config.hubert_path = values["hubert_path"]
+        self.config.pth_path = values["pth_path"]
+        self.config.index_path = values["index_path"]
+        self.config.npy_path = values["npy_path"]
+        self.config.threhold = values["threhold"]
+        self.config.pitch = values["pitch"]
+        self.config.block_time = values["block_time"]
+        self.config.crossfade_time = values["crossfade_length"]
+        self.config.extra_time = values["extra_time"]
+        self.config.I_noise_reduce = values["I_noise_reduce"]
+        self.config.O_noise_reduce = values["O_noise_reduce"]
+        self.config.index_rate = values["index_rate"]
+
     def start_vc(self):
         torch.cuda.empty_cache()
-        self.flag_vc=True
-        self.block_frame=int(self.config.block_time*self.config.samplerate)
-        self.crossfade_frame=int(self.config.crossfade_time*self.config.samplerate)
-        self.sola_search_frame=int(0.012*self.config.samplerate)
-        self.delay_frame=int(0.02*self.config.samplerate)#往前预留0.02s
-        self.extra_frame=int(self.config.extra_time*self.config.samplerate)#往后预留0.04s
-        self.rvc=None
-        self.rvc=RVC(self.config.pitch,self.config.hubert_path,self.config.pth_path,self.config.index_path,self.config.npy_path,self.config.index_rate)
-        self.input_wav:np.ndarray=np.zeros(self.extra_frame+self.crossfade_frame+self.sola_search_frame+self.block_frame,dtype='float32')
-        self.output_wav:torch.Tensor=torch.zeros(self.block_frame,device=device,dtype=torch.float32)
-        self.sola_buffer:torch.Tensor=torch.zeros(self.crossfade_frame,device=device,dtype=torch.float32)
-        self.fade_in_window:torch.Tensor=torch.linspace(0.0,1.0,steps=self.crossfade_frame,device=device,dtype=torch.float32)
-        self.fade_out_window:torch.Tensor = 1 - self.fade_in_window
-        self.resampler1=tat.Resample(orig_freq=self.config.samplerate,new_freq=16000,dtype=torch.float32)
-        self.resampler2=tat.Resample(orig_freq=40000,new_freq=self.config.samplerate,dtype=torch.float32)
-        thread_vc=threading.Thread(target=self.soundinput)
+        self.flag_vc = True
+        self.block_frame = int(self.config.block_time * self.config.samplerate)
+        self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate)
+        self.sola_search_frame = int(0.012 * self.config.samplerate)
+        self.delay_frame = int(0.02 * self.config.samplerate)  # 往前预留0.02s
+        self.extra_frame = int(
+            self.config.extra_time * self.config.samplerate
+        )  # 往后预留0.04s
+        self.rvc = None
+        self.rvc = RVC(
+            self.config.pitch,
+            self.config.hubert_path,
+            self.config.pth_path,
+            self.config.index_path,
+            self.config.npy_path,
+            self.config.index_rate,
+        )
+        self.input_wav: np.ndarray = np.zeros(
+            self.extra_frame
+            + self.crossfade_frame
+            + self.sola_search_frame
+            + self.block_frame,
+            dtype="float32",
+        )
+        self.output_wav: torch.Tensor = torch.zeros(
+            self.block_frame, device=device, dtype=torch.float32
+        )
+        self.sola_buffer: torch.Tensor = torch.zeros(
+            self.crossfade_frame, device=device, dtype=torch.float32
+        )
+        self.fade_in_window: torch.Tensor = torch.linspace(
+            0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32
+        )
+        self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
+        self.resampler1 = tat.Resample(
+            orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32
+        )
+        self.resampler2 = tat.Resample(
+            orig_freq=40000, new_freq=self.config.samplerate, dtype=torch.float32
+        )
+        thread_vc = threading.Thread(target=self.soundinput)
         thread_vc.start()
+
     def soundinput(self):
-        '''
+        """
         接受音频输入
-        '''
-        with sd.Stream(callback=self.audio_callback, blocksize=self.block_frame,samplerate=self.config.samplerate,dtype='float32'):
+        """
+        with sd.Stream(
+            callback=self.audio_callback,
+            blocksize=self.block_frame,
+            samplerate=self.config.samplerate,
+            dtype="float32",
+        ):
             while self.flag_vc:
                 time.sleep(self.config.block_time)
-                print('Audio block passed.')
-        print('ENDing VC')
-    def audio_callback(self,indata:np.ndarray,outdata:np.ndarray, frames, times, status):
-        '''
+                print("Audio block passed.")
+        print("ENDing VC")
+
+    def audio_callback(
+        self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
+    ):
+        """
         音频处理
-        '''
-        start_time=time.perf_counter()
-        indata=librosa.to_mono(indata.T)
+        """
+        start_time = time.perf_counter()
+        indata = librosa.to_mono(indata.T)
         if self.config.I_noise_reduce:
-            indata[:]=nr.reduce_noise(y=indata,sr=self.config.samplerate)
-        '''noise gate'''
-        frame_length=2048
-        hop_length=1024
-        rms=librosa.feature.rms(y=indata,frame_length=frame_length,hop_length=hop_length)
-        db_threhold=librosa.amplitude_to_db(rms,ref=1.0)[0]<self.config.threhold
-        #print(rms.shape,db.shape,db)
+            indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate)
+        """noise gate"""
+        frame_length = 2048
+        hop_length = 1024
+        rms = librosa.feature.rms(
+            y=indata, frame_length=frame_length, hop_length=hop_length
+        )
+        db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
+        # print(rms.shape,db.shape,db)
         for i in range(db_threhold.shape[0]):
             if db_threhold[i]:
-                indata[i*hop_length:(i+1)*hop_length]=0
-        self.input_wav[:]=np.append(self.input_wav[self.block_frame:],indata)
-        #infer
-        print('input_wav:'+str(self.input_wav.shape))
-        #print('infered_wav:'+str(infer_wav.shape))
-        infer_wav:torch.Tensor=self.resampler2(self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav))))[-self.crossfade_frame-self.sola_search_frame-self.block_frame:].to(device)
-        print('infer_wav:'+str(infer_wav.shape))
+                indata[i * hop_length : (i + 1) * hop_length] = 0
+        self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata)
+        # infer
+        print("input_wav:" + str(self.input_wav.shape))
+        # print('infered_wav:'+str(infer_wav.shape))
+        infer_wav: torch.Tensor = self.resampler2(
+            self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav)))
+        )[-self.crossfade_frame - self.sola_search_frame - self.block_frame :].to(
+            device
+        )
+        print("infer_wav:" + str(infer_wav.shape))
         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
-        cor_nom=F.conv1d(infer_wav[None,None,:self.crossfade_frame + self.sola_search_frame],self.sola_buffer[None,None,:])
-        cor_den=torch.sqrt(F.conv1d(infer_wav[None,None,:self.crossfade_frame + self.sola_search_frame]**2,torch.ones(1, 1,self.crossfade_frame,device=device))+1e-8)
-        sola_offset = torch.argmax( cor_nom[0, 0] / cor_den[0, 0])
-        print('sola offset: ' + str(int(sola_offset)))
+        cor_nom = F.conv1d(
+            infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame],
+            self.sola_buffer[None, None, :],
+        )
+        cor_den = torch.sqrt(
+            F.conv1d(
+                infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
+                ** 2,
+                torch.ones(1, 1, self.crossfade_frame, device=device),
+            )
+            + 1e-8
+        )
+        sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
+        print("sola offset: " + str(int(sola_offset)))
         # crossfade
-        self.output_wav[:]=infer_wav[sola_offset : sola_offset + self.block_frame]
-        self.output_wav[:self.crossfade_frame] *= self.fade_in_window
-        self.output_wav[:self.crossfade_frame] += self.sola_buffer[:]
+        self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame]
+        self.output_wav[: self.crossfade_frame] *= self.fade_in_window
+        self.output_wav[: self.crossfade_frame] += self.sola_buffer[:]
         if sola_offset < self.sola_search_frame:
-            self.sola_buffer[:] = infer_wav[-self.sola_search_frame - self.crossfade_frame + sola_offset: -self.sola_search_frame + sola_offset]* self.fade_out_window
+            self.sola_buffer[:] = (
+                infer_wav[
+                    -self.sola_search_frame
+                    - self.crossfade_frame
+                    + sola_offset : -self.sola_search_frame
+                    + sola_offset
+                ]
+                * self.fade_out_window
+            )
         else:
-            self.sola_buffer[:] = infer_wav[- self.crossfade_frame :]* self.fade_out_window
+            self.sola_buffer[:] = (
+                infer_wav[-self.crossfade_frame :] * self.fade_out_window
+            )
         if self.config.O_noise_reduce:
-            outdata[:]=np.tile(nr.reduce_noise(y=self.output_wav[:].cpu().numpy(),sr=self.config.samplerate),(2,1)).T
+            outdata[:] = np.tile(
+                nr.reduce_noise(
+                    y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate
+                ),
+                (2, 1),
+            ).T
         else:
-            outdata[:]=self.output_wav[:].repeat(2, 1).t().cpu().numpy()
-        total_time=time.perf_counter()-start_time
-        print('infer time:'+str(total_time))
-        self.window['infer_time'].update(int(total_time*1000))
-    def get_devices(self,update: bool = True):
-        '''获取设备列表'''
+            outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy()
+        total_time = time.perf_counter() - start_time
+        print("infer time:" + str(total_time))
+        self.window["infer_time"].update(int(total_time * 1000))
+
+    def get_devices(self, update: bool = True):
+        """获取设备列表"""
         if update:
             sd._terminate()
             sd._initialize()
@@ -317,18 +508,33 @@ class GUI:
             for d in devices
             if d["max_output_channels"] > 0
         ]
-        input_devices_indices = [d["index"] for d in devices if d["max_input_channels"] > 0]
+        input_devices_indices = [
+            d["index"] for d in devices if d["max_input_channels"] > 0
+        ]
         output_devices_indices = [
             d["index"] for d in devices if d["max_output_channels"] > 0
         ]
-        return input_devices, output_devices, input_devices_indices, output_devices_indices
-    def set_devices(self,input_device,output_device):
-        '''设置输出设备'''
-        input_devices,output_devices,input_device_indices, output_device_indices=self.get_devices()
-        sd.default.device[0]=input_device_indices[input_devices.index(input_device)]
-        sd.default.device[1]=output_device_indices[output_devices.index(output_device)]
-        print("input device:"+str(sd.default.device[0])+":"+str(input_device))
-        print("output device:"+str(sd.default.device[1])+":"+str(output_device))
-gui=GUI()
+        return (
+            input_devices,
+            output_devices,
+            input_devices_indices,
+            output_devices_indices,
+        )
+
+    def set_devices(self, input_device, output_device):
+        """设置输出设备"""
+        (
+            input_devices,
+            output_devices,
+            input_device_indices,
+            output_device_indices,
+        ) = self.get_devices()
+        sd.default.device[0] = input_device_indices[input_devices.index(input_device)]
+        sd.default.device[1] = output_device_indices[
+            output_devices.index(output_device)
+        ]
+        print("input device:" + str(sd.default.device[0]) + ":" + str(input_device))
+        print("output device:" + str(sd.default.device[1]) + ":" + str(output_device))
+
+
+gui = GUI()
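The SOLA block in audio_callback above slides the freshly inferred chunk within sola_search_frame samples to the offset that best matches the tail of the previous chunk (normalized cross-correlation), then crossfades there; that is what keeps block-streamed output click-free. A minimal numpy sketch of the same idea, with toy inputs of our choosing:

import numpy as np

def sola_offset(new_chunk, prev_tail, search):
    # Score each candidate offset by normalized correlation with prev_tail.
    # Assumes len(new_chunk) >= search + len(prev_tail).
    n = len(prev_tail)
    scores = [
        np.dot(new_chunk[k : k + n], prev_tail)
        / (np.sqrt(np.sum(new_chunk[k : k + n] ** 2)) + 1e-8)
        for k in range(search)
    ]
    return int(np.argmax(scores))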

File diff suppressed because it is too large.


@@ -1,14 +1,19 @@
-'''
+"""
 对源特征进行检索
-'''
-import torch, pdb, os,parselmouth
-os.environ["CUDA_VISIBLE_DEVICES"]="0"
+"""
+import torch, pdb, os, parselmouth
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 import numpy as np
 import soundfile as sf
 # from models import SynthesizerTrn256#hifigan_nonsf
 # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
-from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hifigan_nsf
+from infer_pack.models import (
+    SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
+)  # hifigan_nsf
 # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
 # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
 # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
@@ -16,15 +21,17 @@ from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hif
 from scipy.io import wavfile
 from fairseq import checkpoint_utils
 # import pyworld
 import librosa
 import torch.nn.functional as F
 import scipy.signal as signal
 # import torchcrepe
 from time import time as ttime
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"#
+model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"  #
 print("load model(s) from {}".format(model_path))
 models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
     [model_path],
@@ -37,7 +44,26 @@ model.eval()
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
-net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256#no_dropout
+net_g = SynthesizerTrn256(
+    1025,
+    32,
+    192,
+    192,
+    768,
+    2,
+    6,
+    3,
+    0,
+    "1",
+    [3, 7, 11],
+    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+    [10, 10, 2, 2],
+    512,
+    [16, 16, 4, 4],
+    183,
+    256,
+    is_half=True,
+)  # hifigan#512#256#no_dropout
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
 #
@@ -48,51 +74,66 @@ net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1
 # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
 # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
 # weights=torch.load("infer/ft-mi-sim1k.pt")
-weights=torch.load("infer/ft-mi-no_opt-no_dropout.pt")
-print(net_g.load_state_dict(weights,strict=True))
+weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
+print(net_g.load_state_dict(weights, strict=True))
 net_g.eval().to(device)
 net_g.half()
-def get_f0(x, p_len,f0_up_key=0):
+
+def get_f0(x, p_len, f0_up_key=0):
     time_step = 160 / 16000 * 1000
     f0_min = 50
     f0_max = 1100
     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
-    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
-        time_step=time_step / 1000, voicing_threshold=0.6,
-        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
-    pad_size=(p_len - len(f0) + 1) // 2
-    if(pad_size>0 or p_len - len(f0) - pad_size>0):
-        f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+    f0 = (
+        parselmouth.Sound(x, 16000)
+        .to_pitch_ac(
+            time_step=time_step / 1000,
+            voicing_threshold=0.6,
+            pitch_floor=f0_min,
+            pitch_ceiling=f0_max,
+        )
+        .selected_array["frequency"]
+    )
+    pad_size = (p_len - len(f0) + 1) // 2
+    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
     f0 *= pow(2, f0_up_key / 12)
     f0bak = f0.copy()
     f0_mel = 1127 * np.log(1 + f0 / 700)
-    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+        f0_mel_max - f0_mel_min
+    ) + 1
     f0_mel[f0_mel <= 1] = 1
     f0_mel[f0_mel > 255] = 255
     # f0_mel[f0_mel > 188] = 188
     f0_coarse = np.rint(f0_mel).astype(np.int)
     return f0_coarse, f0bak
 import faiss
-index=faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
-big_npy=np.load("infer/big_src_feature_mi.npy")
-ta0=ta1=ta2=0
-for idx,name in enumerate(["冬之花clip1.wav",]):##
-    wav_path = "todo-songs/%s" % name#
-    f0_up_key=-2#
+
+index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+big_npy = np.load("infer/big_src_feature_mi.npy")
+ta0 = ta1 = ta2 = 0
+for idx, name in enumerate(
+    [
+        "冬之花clip1.wav",
+    ]
+):  ##
+    wav_path = "todo-songs/%s" % name  #
+    f0_up_key = -2  #
     audio, sampling_rate = sf.read(wav_path)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
     if sampling_rate != 16000:
         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
     feats = torch.from_numpy(audio).float()
     if feats.dim() == 2:  # double channels
         feats = feats.mean(-1)
@@ -104,8 +145,9 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
         "padding_mask": padding_mask.to(device),
         "output_layer": 9,  # layer 9
     }
-    if torch.cuda.is_available(): torch.cuda.synchronize()
-    t0=ttime()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t0 = ttime()
     with torch.no_grad():
         logits = model.extract_features(**inputs)
         feats = model.final_proj(logits[0])
@@ -113,35 +155,45 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
     ####索引优化
     npy = feats[0].cpu().numpy().astype("float32")
     D, I = index.search(npy, 1)
-    feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
-    feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1)
-    if torch.cuda.is_available(): torch.cuda.synchronize()
-    t1=ttime()
+    feats = (
+        torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+    )
+    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t1 = ttime()
     # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
-    p_len = min(feats.shape[1],10000)#
-    pitch, pitchf = get_f0(audio, p_len,f0_up_key)
-    p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
-    if torch.cuda.is_available(): torch.cuda.synchronize()
-    t2=ttime()
-    feats = feats[:,:p_len, :]
+    p_len = min(feats.shape[1], 10000)  #
+    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
+    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # 太大了爆显存
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t2 = ttime()
+    feats = feats[:, :p_len, :]
     pitch = pitch[:p_len]
     pitchf = pitchf[:p_len]
     p_len = torch.LongTensor([p_len]).to(device)
     pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
-    sid=torch.LongTensor([0]).to(device)
+    sid = torch.LongTensor([0]).to(device)
     pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
     with torch.no_grad():
-        audio = net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float().numpy()#nsf
-    if torch.cuda.is_available(): torch.cuda.synchronize()
-    t3=ttime()
-    ta0+=(t1-t0)
-    ta1+=(t2-t1)
-    ta2+=(t3-t2)
+        audio = (
+            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )  # nsf
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t3 = ttime()
+    ta0 += t1 - t0
+    ta1 += t2 - t1
+    ta2 += t3 - t2
     # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
     # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
     # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
-    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav"%name, 40000, audio)##
-print(ta0,ta1,ta2)#
+    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)  ##
+print(ta0, ta1, ta2)  #
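In the retrieval step above, index.search(npy, 1) returns the distance D and id I of the single nearest stored vector for each HuBERT frame, and big_npy[I.squeeze()] then swaps every frame for that neighbor. A self-contained toy run, with random data standing in for the real features:

import faiss
import numpy as np

xb = np.random.rand(1000, 256).astype("float32")  # stored training features
index = faiss.IndexFlatL2(256)
index.add(xb)
xq = np.random.rand(4, 256).astype("float32")  # 4 query frames
D, I = index.search(xq, 1)
print(I.shape)  # (4, 1): one neighbor id per query frame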


@@ -1,31 +1,31 @@
-'''
+"""
 格式直接cid为自带的index位aid放不下了通过字典来查反正就5w个
-'''
-import faiss,numpy as np,os
+"""
+import faiss, numpy as np, os
 # ###########如果是原始特征要先写save
-inp_root=r"E:\codes\py39\dataset\mi\2-co256"
-npys=[]
+inp_root = r"E:\codes\py39\dataset\mi\2-co256"
+npys = []
 for name in sorted(list(os.listdir(inp_root))):
-    phone=np.load("%s/%s"%(inp_root,name))
+    phone = np.load("%s/%s" % (inp_root, name))
     npys.append(phone)
-big_npy=np.concatenate(npys,0)
-print(big_npy.shape)#(6196072, 192)#fp32#4.43G
-np.save("infer/big_src_feature_mi.npy",big_npy)
+big_npy = np.concatenate(npys, 0)
+print(big_npy.shape)  # (6196072, 192)#fp32#4.43G
+np.save("infer/big_src_feature_mi.npy", big_npy)
 ##################train+add
 # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
 print(big_npy.shape)
-index = faiss.index_factory(256, "IVF512,Flat")#mi
+index = faiss.index_factory(256, "IVF512,Flat")  # mi
 print("training")
-index_ivf = faiss.extract_index_ivf(index)#
+index_ivf = faiss.extract_index_ivf(index)  #
 index_ivf.nprobe = 9
 index.train(big_npy)
-faiss.write_index(index, 'infer/trained_IVF512_Flat_mi_baseline_src_feat.index')
+faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
 print("adding")
 index.add(big_npy)
-faiss.write_index(index,"infer/added_IVF512_Flat_mi_baseline_src_feat.index")
-'''
+faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+"""
 大小都是FP32
 big_src_feature 2.95G
 (3098036, 256)
@@ -33,4 +33,4 @@ big_emb 4.43G
 (6196072, 192)
 big_emb双倍是因为求特征要repeat后再加pitch
-'''
+"""
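With the "IVF512,Flat" factory string above, the vectors are clustered into 512 inverted lists, and nprobe = 9 means each query scans only 9 of them, trading a little recall for speed. nprobe can also be raised on an already-built index without retraining; a short sketch using the file written above (the value 16 is our illustrative choice):

import faiss

index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
faiss.extract_index_ivf(index).nprobe = 16  # tune the recall/latency trade-off here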


@@ -1,11 +1,16 @@
-import torch,pdb
+import torch, pdb
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
 # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
-a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth")["model"]#sim_nsf#
-for key in a.keys():a[key]=a[key].half()
+a = torch.load(
+    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
+)[
+    "model"
+]  # sim_nsf#
+for key in a.keys():
+    a[key] = a[key].half()
 # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
 # torch.save(a,"ft-mi-sim1k.pt")#
-torch.save(a,"ft-mi-no_opt-no_dropout.pt")#
+torch.save(a, "ft-mi-no_opt-no_dropout.pt")  #
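The loop above casts every tensor in the checkpoint to fp16, roughly halving the file on disk. A quick way to confirm what was written (a sketch, run from the same directory as the script):

import torch

weights = torch.load("ft-mi-no_opt-no_dropout.pt", map_location="cpu")
print(next(iter(weights.values())).dtype)  # torch.float16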


@@ -48,8 +48,10 @@ def slice_segments(x, ids_str, segment_size=4):
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def slice_segments2(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
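The hunk cuts off before the function's tail, but the intent is the 2-D analogue of slice_segments: per batch element, copy a fixed-length window starting at ids_str[i]. A toy run (my sketch, not from the diff):

import torch

# slice_segments2 behaviour on a (batch=2, time=6) tensor, segment_size=3:
x = torch.arange(12.0).reshape(2, 6)
ids_str = torch.tensor([1, 3])
out = torch.stack([x[i, int(s) : int(s) + 3] for i, s in enumerate(ids_str)])
print(out)  # rows [1,2,3] and [9,10,11] — windows starting at each ids_str[i]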


@@ -1,4 +1,4 @@
import math, pdb, os
from time import time as ttime
import torch
from torch import nn
@@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from infer_pack.commons import init_weights
import numpy as np
from infer_pack import commons


class TextEncoder256(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@@ -24,8 +35,8 @@ class TextEncoder256(nn.Module):
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@@ -33,12 +44,12 @@ class TextEncoder256(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
@@ -48,8 +59,20 @@ class TextEncoder256(nn.Module):
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask

class TextEncoder256Sim(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@@ -59,8 +82,8 @@ class TextEncoder256Sim(nn.Module):
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, phone, pitch, lengths):
        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x, x_mask


class ResidualCouplingBlock(nn.Module):
    def __init__(
        self,
@@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module):
    def remove_weight_norm(self):
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()

class PosteriorEncoder(nn.Module):
    def __init__(
        self,
@@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):
    def remove_weight_norm(self):
        self.enc.remove_weight_norm()


class Generator(torch.nn.Module):
    def __init__(
        self,
@@ -243,8 +274,10 @@ class Generator(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()

class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
@@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
@@ -277,8 +315,8 @@ class SineGen(torch.nn.Module):
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
@@ -286,32 +324,52 @@ class SineGen(torch.nn.Module):
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            rad_values = (f0_buf / self.sampling_rate) % 1  ### the % 1 means the n_har product cannot be optimized away afterwards
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  ##### a % 1 here would mean the cumsum below could no longer be optimized
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )  #######
            tmp_over_one %= 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
            return sine_waves, uv, noise
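For orientation (my sketch, not part of the diff): frame-rate F0 goes in and sample-rate sine harmonics come out, with upp being the per-frame upsampling factor. The values below are illustrative only, and assume SineGen as defined above with its module imports (np, F) in scope:

import torch

gen = SineGen(samp_rate=40000, harmonic_num=0)
f0 = torch.full((1, 100), 220.0)     # 100 frames of a steady 220 Hz contour
sine, uv, noise = gen(f0, upp=400)   # 100 frames * upp 400 -> 40000 samples
print(sine.shape, uv.shape)          # torch.Size([1, 40000, 1]) for both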

class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
@@ -328,26 +386,37 @@ class SourceModuleHnNSF(torch.nn.Module):
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
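The merge step is just a learned weighted sum of the harmonic channels squashed by tanh. A sketch of the equivalent computation (toy shapes, names assumed from the class above):

import torch

# Equivalent of the l_linear + l_tanh merge for harmonic_num = 2:
sine_wavs = torch.randn(1, 16000, 3)        # 3 harmonic channels
linear = torch.nn.Linear(3, 1)              # harmonic_num + 1 -> 1
excitation = torch.tanh(linear(sine_wavs))  # (1, 16000, 1) single excitation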

class GeneratorNSF(torch.nn.Module):
    def __init__(
        self,
@@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module):
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
@@ -393,9 +460,16 @@ class GeneratorNSF(torch.nn.Module):
                )
            )
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
@@ -413,10 +487,10 @@ class GeneratorNSF(torch.nn.Module):
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
@@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}

class SynthesizerTrnMs256NSFsid(nn.Module):
    def __init__(
        self,
@@ -472,10 +550,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        sr,
        **kwargs
    ):
        super().__init__()
        if type(sr) == type("strr"):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
@@ -493,7 +570,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
@@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
@@ -526,13 +605,16 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds here is the speaker id, [bs, 1]
        # print(1,pitch.shape)#[bs,t]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  # the trailing 1 broadcasts over t
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
@@ -542,20 +624,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            z, y_lengths, self.segment_size
        )
        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        # print(-2,pitchf.shape,z_slice.shape)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
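A minimal sketch of calling infer; the shapes below are assumptions for illustration, and the real entry point builds net_g from a checkpoint's config rather than the commented-out constructor:

import torch

# net_g = SynthesizerTrnMs256NSFsid(*config, is_half=False).eval()  # config assumed
phone = torch.randn(1, 50, 256)          # 50 frames of 256-dim content features
phone_lengths = torch.tensor([50])
pitch = torch.randint(1, 255, (1, 50))   # coarse pitch indices for emb_pitch
nsff0 = torch.full((1, 50), 150.0)       # raw F0 in Hz for the NSF source
sid = torch.tensor([0])                  # speaker id
# audio, _, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)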

class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    def __init__(
        self,
@@ -579,7 +661,6 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@@ -598,7 +679,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
@@ -606,7 +687,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
@@ -616,7 +698,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
@@ -631,14 +713,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds here is the speaker id, [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  # the trailing 1 broadcasts over t
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
@@ -649,13 +731,15 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)

class SynthesizerTrnMs256NSFsid_sim(nn.Module):
    """
    Synthesizer for Training
@@ -684,7 +768,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        use_sdp=True,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@@ -703,7 +786,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256Sim(
            inter_channels,
            hidden_channels,
@@ -721,20 +804,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            is_half=kwargs["is_half"],
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y_lengths, ds
    ):  # y (the spec) is no longer needed here
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  # the trailing 1 broadcasts over t
        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
        x = self.flow(x, x_mask, g=g, reverse=True)
@@ -742,22 +829,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
            x, y_lengths, self.segment_size
        )
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice

    def infer(
        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
    ):  # y (the spec) is no longer needed here
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  # the trailing 1 broadcasts over t
        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
        x = self.flow(x, x_mask, g=g, reverse=True)
        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
        return o, o

class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        # periods = [3, 5, 7, 11, 17, 23, 37]
        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
@@ -767,7 +856,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []  #
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
@@ -783,6 +872,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
@@ -812,6 +902,7 @@ class DiscriminatorS(torch.nn.Module):
        return x, fmap


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
@@ -889,4 +980,3 @@ class DiscriminatorP(torch.nn.Module):
        x = torch.flatten(x, 1, -1)
        return x, fmap
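For context, a sketch under my own assumptions (this step is not shown in the hunks): each DiscriminatorP folds the waveform into a 2-D grid whose width is its period, so the prime periods [2, 3, 5, 7, 11, 17] each expose a different periodic structure to the 2-D convolutions.

import torch
import torch.nn.functional as F

# How a period discriminator views audio: pad to a multiple of the period,
# then reshape (batch, 1, time) -> (batch, 1, time // period, period).
y = torch.randn(1, 1, 16000)
period = 5
pad = (period - y.shape[-1] % period) % period
y = F.pad(y, (0, pad), mode="reflect")
grid = y.view(1, 1, -1, period)
print(grid.shape)  # torch.Size([1, 1, 3200, 5])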


@@ -1,4 +1,4 @@
import math, pdb, os
from time import time as ttime
import torch
from torch import nn
@@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from infer_pack.commons import init_weights
import numpy as np
from infer_pack import commons


class TextEncoder256(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@@ -24,8 +35,8 @@ class TextEncoder256(nn.Module):
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@@ -33,12 +44,12 @@ class TextEncoder256(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
@@ -48,8 +59,20 @@ class TextEncoder256(nn.Module):
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask


class TextEncoder256Sim(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
@@ -59,8 +82,8 @@ class TextEncoder256Sim(nn.Module):
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0 == True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module):
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, phone, pitch, lengths):
        if pitch == None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x, x_mask

class ResidualCouplingBlock(nn.Module):
    def __init__(
        self,
@@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module):
    def remove_weight_norm(self):
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()


class PosteriorEncoder(nn.Module):
    def __init__(
        self,
@@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):
    def remove_weight_norm(self):
        self.enc.remove_weight_norm()


class Generator(torch.nn.Module):
    def __init__(
        self,
@@ -243,8 +274,10 @@ class Generator(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
@@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
@@ -277,8 +315,8 @@ class SineGen(torch.nn.Module):
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
@@ -286,32 +324,52 @@ class SineGen(torch.nn.Module):
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            rad_values = (f0_buf / self.sampling_rate) % 1  ### the % 1 means the n_har product cannot be optimized away afterwards
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  ##### a % 1 here would mean the cumsum below could no longer be optimized
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )  #######
            tmp_over_one %= 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
            return sine_waves, uv, noise

class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
@@ -328,26 +386,37 @@ class SourceModuleHnNSF(torch.nn.Module):
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv

class GeneratorNSF(torch.nn.Module):
    def __init__(
        self,
@@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module):
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
@@ -393,9 +460,16 @@ class GeneratorNSF(torch.nn.Module):
                )
            )
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
@@ -413,10 +487,10 @@ class GeneratorNSF(torch.nn.Module):
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
@@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}

class SynthesizerTrnMs256NSFsid(nn.Module):
    def __init__(
        self,
@@ -472,10 +550,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        sr,
        **kwargs
    ):
        super().__init__()
        if type(sr) == type("strr"):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
@@ -493,7 +570,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
@@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
@@ -526,21 +605,22 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
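Worth noting: unlike the earlier file's infer, this forward takes the latent noise rnd as an explicit input instead of calling torch.randn_like internally, the usual trick for making the graph exportable and deterministic. A sketch of the caller's side (shapes are assumptions; inter_channels taken as 192 for illustration):

import torch

# The caller supplies the latent noise explicitly, e.g. for export or
# reproducible inference:
phone_lengths = torch.tensor([50])
rnd = torch.randn(1, 192, 50)  # (batch, inter_channels, frames)
# audio = net_g(phone, phone_lengths, pitch, nsff0, sid, rnd)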

class SynthesizerTrnMs256NSFsid_sim(nn.Module):
    """
    Synthesizer for Training
@@ -569,7 +649,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        use_sdp=True,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@@ -588,7 +667,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256Sim(
            inter_channels,
            hidden_channels,
@@ -606,30 +685,35 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            is_half=kwargs["is_half"],
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
    ):  # y (the spec) is no longer needed here
        g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]  # the trailing 1 broadcasts over t
        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
        x = self.flow(x, x_mask, g=g, reverse=True)
        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
        return o

class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        # periods = [3, 5, 7, 11, 17, 23, 37]
        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
@@ -639,7 +723,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []  #
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
@@ -655,6 +739,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
@@ -684,6 +769,7 @@ class DiscriminatorS(torch.nn.Module):
        return x, fmap


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
@@ -761,4 +847,3 @@ class DiscriminatorP(torch.nn.Module):
        x = torch.flatten(x, 1, -1)
        return x, fmap


@@ -9,66 +9,63 @@ DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3


def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    if tails is None:
        spline_fn = rational_quadratic_spline
        spline_kwargs = {}
    else:
        spline_fn = unconstrained_rational_quadratic_spline
        spline_kwargs = {"tails": tails, "tail_bound": tail_bound}

    outputs, logabsdet = spline_fn(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **spline_kwargs
    )
    return outputs, logabsdet


def searchsorted(bin_locations, inputs, eps=1e-6):
    bin_locations[..., -1] += eps
    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
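A toy check of the bucketing helper (my sketch, eps omitted): each input maps to the index of the bin whose left edge it has passed.

import torch

bin_locations = torch.tensor([[0.0, 0.25, 0.5, 0.75, 1.0]])
inputs = torch.tensor([0.1, 0.6])
idx = torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
print(idx)  # tensor([0, 2]) — 0.1 falls in bin 0, 0.6 in bin 2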

def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == "linear":
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
@@ -77,45 +74,57 @@ def unconstrained_rational_quadratic_spline(inputs,
        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError("{} tails are not implemented.".format(tails))

    (
        outputs[inside_interval_mask],
        logabsdet[inside_interval_mask],
    ) = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )

    return outputs, logabsdet

def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
@@ -126,7 +135,7 @@ def rational_quadratic_spline(inputs,
    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
@@ -150,15 +159,13 @@ def rational_quadratic_spline(inputs,
    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()
@@ -167,11 +174,15 @@ def rational_quadratic_spline(inputs,
        outputs = root * input_bin_widths + input_cumwidths
        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
@@ -179,15 +190,20 @@ def rational_quadratic_spline(inputs,
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (
            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) )
* theta_one_minus_theta) denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
outputs = input_cumheights + numerator / denominator outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) derivative_numerator = input_delta.pow(2) * (
+ 2 * input_delta * theta_one_minus_theta input_derivatives_plus_one * theta.pow(2)
+ input_derivatives * (1 - theta).pow(2)) + 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet return outputs, logabsdet
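A quick round-trip check for the spline above. This is a minimal sketch, not part of the commit: it assumes the full transforms.py is importable, including its searchsorted helper and the DEFAULT_MIN_BIN_WIDTH/HEIGHT/DERIVATIVE constants defined earlier in the file.

import torch

num_bins = 10
x = torch.rand(4, 16) * 0.8 + 0.1  # stay strictly inside (left, right) = (0, 1)
uw = torch.randn(4, 16, num_bins)  # unnormalized bin widths
uh = torch.randn(4, 16, num_bins)  # unnormalized bin heights
ud = torch.randn(4, 16, num_bins + 1)  # unnormalized derivatives at the knots

y, logdet = rational_quadratic_spline(x, uw, uh, ud, inverse=False)
x_rec, inv_logdet = rational_quadratic_spline(y, uw, uh, ud, inverse=True)

assert torch.allclose(x, x_rec, atol=1e-4)  # forward and inverse agree
assert torch.allclose(logdet, -inv_logdet, atol=1e-4)  # log-determinants cancel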
View File
@@ -1,108 +1,171 @@
import os, sys, torch, warnings, pdb

warnings.filterwarnings("ignore")
import librosa
import importlib
import numpy as np
import hashlib, math
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params, inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
from scipy.io import wavfile


class _audio_pre_:
    def __init__(self, model_path, device, is_half):
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            "postprocess": False,
            "tta": False,
            # Constants
            "window_size": 512,
            "agg": 10,
            "high_end_process": "mirroring",
        }
        nn_arch_sizes = [
            31191,  # default
            33966,
            61968,
            123821,
            123812,
            537238,  # custom
        ]
        self.nn_architecture = list("{}KB".format(s) for s in nn_arch_sizes)
        model_size = math.ceil(os.stat(model_path).st_size / 1024)
        nn_architecture = "{}KB".format(
            min(nn_arch_sizes, key=lambda x: abs(x - model_size))
        )
        nets = importlib.import_module(
            "uvr5_pack.lib_v5.nets"
            + f"_{nn_architecture}".replace("_{}KB".format(nn_arch_sizes[0]), ""),
            package=None,
        )
        model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
        param_name, model_params_d = _get_name_params(model_path, model_hash)

        mp = ModelParameters(model_params_d)
        model = nets.CascadedASPPNet(mp.param["bins"] * 2)
        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
        bands_n = len(self.mp.param["band"])
        # print(bands_n)
        for d in range(bands_n, 0, -1):
            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
                (
                    X_wave[d],
                    _,
                ) = librosa.core.load(  # in theory librosa may misread some audio files; decoding with ffmpeg would be safer, but that is too much hassle, so it was dropped
                    music_file,
                    bp["sr"],
                    False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
                if X_wave[d].ndim == 1:
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.core.resample(
                    X_wave[d + 1],
                    self.mp.param["band"][d + 1]["sr"],
                    bp["sr"],
                    res_type=bp["res_type"],
                )
            # Stft of wave source
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                self.mp.param["mid_side"],
                self.mp.param["mid_side_b2"],
                self.mp.param["reverse"],
            )
            # pdb.set_trace()
            if d == bands_n and self.data["high_end_process"] != "none":
                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
                )
                input_high_end = X_spec_s[d][
                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
                ]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggresive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggresive_set,
            "split_bin": self.mp.param["band"][1]["crop_stop"],
        }
        with torch.no_grad():
            pred, X_mag, X_phase = inference(
                X_spec_m, self.device, self.model, aggressiveness, self.data
            )
        # Postprocess
        if self.data["postprocess"]:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        if ins_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
                )
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
                    y_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
            print("%s instruments done" % name)
            wavfile.write(
                os.path.join(ins_root, "instrument_{}.wav".format(name)),
                self.mp.param["sr"],
                (np.array(wav_instrument) * 32768).astype("int16"),
            )  #
        if vocal_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
                )
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
                    v_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
            print("%s vocals done" % name)
            wavfile.write(
                os.path.join(vocal_root, "vocal_{}.wav".format(name)),
                self.mp.param["sr"],
                (np.array(wav_vocals) * 32768).astype("int16"),
            )


if __name__ == "__main__":
    device = "cuda"
    is_half = True
    model_path = "uvr5_weights/2_HP-UVR.pth"
    pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)
    audio_path = "神女劈观.aac"
    save_path = "opt"
    pre_fun._path_audio_(audio_path, save_path, save_path)
View File
@@ -31,7 +31,9 @@ for lang_file in languages:
        del lang_data[key]

    # Sort the keys of the language file to match the order of the standard file
    lang_data = OrderedDict(
        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
    )

    # Save the updated language file
    with open(lang_file, "w", encoding="utf-8") as f:
View File
@@ -1,11 +1,15 @@
import ffmpeg
import numpy as np


def load_audio(file, sr):
    try:
        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        file = (
            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        )  # guard against novice users copying a path with stray spaces, quotes, or newlines
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
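A usage sketch, not part of the commit: the visible hunk decodes to raw 16-bit PCM, and in the full function (its tail is outside this hunk) the buffer is assumed to be flattened and scaled to float32 in [-1, 1], as in the Whisper code it links. The ffmpeg CLI must be on PATH; the file name is illustrative.

audio = load_audio("path/to/input.wav", 16000)  # mono float32 at 16 kHz
print(audio.shape)  # (n_samples,)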
View File
@@ -18,9 +18,7 @@ def get_rms(
    x_shape_trimmed = list(y.shape)
    x_shape_trimmed[axis] -= frame_length - 1
    out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
    xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)

    if axis < 0:
        target_axis = axis - 1
    else:

@@ -38,19 +36,25 @@ def get_rms(

class Slicer:
    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 5000,
    ):
        if not min_length >= min_interval >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
            )
        if not max_sil_kept >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: max_sil_kept >= hop_size"
            )
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)

@@ -59,9 +63,13 @@ class Slicer:
    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[
                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
            ]
        else:
            return waveform[
                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
            ]

    # @timeit
    def slice(self, waveform):

@@ -71,7 +79,9 @@ class Slicer:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return [waveform]
        rms_list = get_rms(
            y=samples, frame_length=self.win_size, hop_length=self.hop_size
        ).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0

@@ -87,23 +97,37 @@ class Slicer:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (
                i - silence_start >= self.min_interval
                and i - clip_start >= self.min_length
            )
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[
                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                ].argmin()
                pos += i - self.max_sil_kept
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r

@@ -111,8 +135,17 @@ class Slicer:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:

@@ -121,9 +154,12 @@ class Slicer:
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if (
            silence_start is not None
            and total_frames - silence_start >= self.min_interval
        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:

@@ -133,9 +169,13 @@ class Slicer:
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(
                    self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
                )
            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    self._apply_slice(waveform, sil_tags[-1][1], total_frames)
                )
            return chunks

@@ -147,18 +187,45 @@ def main():
    import soundfile

    parser = ArgumentParser()
    parser.add_argument("audio", type=str, help="The audio to be sliced")
    parser.add_argument(
        "--out", type=str, help="Output directory of the sliced audio clips"
    )
    parser.add_argument(
        "--db_thresh",
        type=float,
        required=False,
        default=-40,
        help="The dB threshold for silence detection",
    )
    parser.add_argument(
        "--min_length",
        type=int,
        required=False,
        default=5000,
        help="The minimum milliseconds required for each sliced audio clip",
    )
    parser.add_argument(
        "--min_interval",
        type=int,
        required=False,
        default=300,
        help="The minimum milliseconds for a silence part to be sliced",
    )
    parser.add_argument(
        "--hop_size",
        type=int,
        required=False,
        default=10,
        help="Frame length in milliseconds",
    )
    parser.add_argument(
        "--max_sil_kept",
        type=int,
        required=False,
        default=500,
        help="The maximum silence length kept around the sliced clip, presented in milliseconds",
    )
    args = parser.parse_args()
    out = args.out
    if out is None:

@@ -170,7 +237,7 @@ def main():
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept,
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):

@@ -178,8 +245,16 @@ def main():
    for i, chunk in enumerate(chunks):
        if len(chunk.shape) > 1:
            chunk = chunk.T
        soundfile.write(
            os.path.join(
                out,
                f"%s_%d.wav"
                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
            ),
            chunk,
            sr,
        )


if __name__ == "__main__":
    main()
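For reference, the same slicing can be driven programmatically, mirroring what main() wires up for the CLI. A sketch, with librosa and the file name purely illustrative:

import librosa

audio, sr = librosa.load("input.wav", sr=None, mono=False)
slicer = Slicer(
    sr=sr,
    threshold=-40.0,  # dB gate for silence detection
    min_length=5000,  # ms: minimum length of each output clip
    min_interval=300,  # ms: minimum silence run that triggers a cut
    hop_size=10,  # ms: analysis hop
    max_sil_kept=500,  # ms: silence retained around each cut
)
chunks = slicer.slice(audio)  # list of arrays, one per non-silent clip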
View File
@@ -1,4 +1,4 @@
import os, traceback
import numpy as np
import torch
import torch.utils.data

@@ -6,6 +6,7 @@ import torch.utils.data
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text


class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs

@@ -15,14 +16,14 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.sampling_rate = hparams.sampling_rate
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 5000)
        self._filter()

    def _filter(self):

@@ -34,12 +35,13 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
        # spec_length = wav_length // hop_length
        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_sid(self, sid):
        sid = torch.LongTensor([int(sid)])
        return sid

@@ -54,7 +56,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
        phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
        spec, wav = self.get_audio(file)
        dv = self.get_sid(dv)

        len_phone = phone.size()[0]
        len_spec = spec.size()[-1]

@@ -71,9 +73,9 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
            pitch = pitch[:len_min]
            pitchf = pitchf[:len_min]
        return (spec, wav, phone, pitch, pitchf, dv)

    def get_labels(self, phone, pitch, pitchf):
        phone = np.load(phone)
        phone = np.repeat(phone, 2, axis=0)
        pitch = np.load(pitch)

@@ -86,7 +88,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
        phone = torch.FloatTensor(phone)
        pitch = torch.LongTensor(pitch)
        pitchf = torch.FloatTensor(pitchf)
        return phone, pitch, pitchf

    def get_audio(self, filename):
        audio, sampling_rate = load_wav_to_torch(filename)

@@ -103,10 +105,15 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
            try:
                spec = torch.load(spec_filename)
            except:
                print(spec_filename, traceback.format_exc())
                spec = spectrogram_torch(
                    audio_norm,
                    self.filter_length,
                    self.sampling_rate,
                    self.hop_length,
                    self.win_length,
                    center=False,
                )
                spec = torch.squeeze(spec, 0)
                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        else:

@@ -127,6 +134,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
    def __len__(self):
        return len(self.audiopaths_and_text)


class TextAudioCollateMultiNSFsid:
    """Zero-pads model inputs and targets"""

@@ -155,7 +164,9 @@ class TextAudioCollateMultiNSFsid:
        max_phone_len = max([x[2].size(0) for x in batch])
        phone_lengths = torch.LongTensor(len(batch))
        phone_padded = torch.FloatTensor(
            len(batch), max_phone_len, batch[0][2].shape[1]
        )  # (spec, wav, phone, pitch)
        pitch_padded = torch.LongTensor(len(batch), max_phone_len)
        pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
        phone_padded.zero_()

@@ -187,7 +198,6 @@ class TextAudioCollateMultiNSFsid:
            # dv[i] = row[5]
            sid[i] = row[5]

        return (
            phone_padded,
            phone_lengths,

@@ -198,9 +208,10 @@ class TextAudioCollateMultiNSFsid:
            wave_padded,
            wave_lengths,
            # dv
            sid,
        )


class TextAudioLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs

@@ -210,14 +221,14 @@ class TextAudioLoader(torch.utils.data.Dataset):
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.sampling_rate = hparams.sampling_rate
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 5000)
        self._filter()

    def _filter(self):

@@ -229,12 +240,13 @@ class TextAudioLoader(torch.utils.data.Dataset):
        # spec_length = wav_length // hop_length
        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text, dv in self.audiopaths_and_text:
            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text, dv])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_sid(self, sid):
        sid = torch.LongTensor([int(sid)])
        return sid

@@ -247,7 +259,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
        phone = self.get_labels(phone)
        spec, wav = self.get_audio(file)
        dv = self.get_sid(dv)

        len_phone = phone.size()[0]
        len_spec = spec.size()[-1]

@@ -257,7 +269,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
        spec = spec[:, :len_min]
        wav = wav[:, :len_wav]
        phone = phone[:len_min, :]
        return (spec, wav, phone, dv)

    def get_labels(self, phone):
        phone = np.load(phone)

@@ -282,10 +294,15 @@ class TextAudioLoader(torch.utils.data.Dataset):
            try:
                spec = torch.load(spec_filename)
            except:
                print(spec_filename, traceback.format_exc())
                spec = spectrogram_torch(
                    audio_norm,
                    self.filter_length,
                    self.sampling_rate,
                    self.hop_length,
                    self.win_length,
                    center=False,
                )
                spec = torch.squeeze(spec, 0)
                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        else:

@@ -306,6 +323,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
    def __len__(self):
        return len(self.audiopaths_and_text)


class TextAudioCollate:
    """Zero-pads model inputs and targets"""

@@ -334,7 +353,9 @@ class TextAudioCollate:
        max_phone_len = max([x[2].size(0) for x in batch])
        phone_lengths = torch.LongTensor(len(batch))
        phone_padded = torch.FloatTensor(
            len(batch), max_phone_len, batch[0][2].shape[1]
        )
        phone_padded.zero_()
        sid = torch.LongTensor(len(batch))

@@ -355,7 +376,6 @@ class TextAudioCollate:
            sid[i] = row[3]

        return (
            phone_padded,
            phone_lengths,

@@ -363,9 +383,10 @@ class TextAudioCollate:
            spec_lengths,
            wave_padded,
            wave_lengths,
            sid,
        )


class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.

@@ -402,7 +423,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        for i in range(len(buckets) - 1, -1, -1):  #
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)
View File
@@ -1,6 +1,7 @@
import torch
from torch.nn import functional as F


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
View File
@@ -78,7 +78,8 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=False,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

@@ -139,8 +140,18 @@ def mel_spectrogram_torch(
    #     normalized=False,
    #     onesided=True,
    # )
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=False,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
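One detail worth keeping in mind when reading the hunk above: with center=False, torch.stft yields 1 + (T_padded - n_fft) // hop frames, which is why these functions reflect-pad the waveform by (n_fft - hop) / 2 on each side just before the call (those padding lines sit outside this hunk; this sketch assumes the stock VITS-style mel_processing.py and a recent torch):

import torch

T, n_fft, hop = 16000, 2048, 400
y = torch.randn(1, T)
pad = (n_fft - hop) // 2
y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect").squeeze(1)
spec = torch.stft(
    y, n_fft, hop_length=hop, win_length=n_fft,
    window=torch.hann_window(n_fft), center=False, return_complex=True,
)
mag = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)  # same epsilon as above
print(mag.shape)  # torch.Size([1, 1025, 40]) == (batch, n_fft // 2 + 1, T // hop)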
View File
@@ -1,101 +1,248 @@
import torch, traceback, os, pdb
from collections import OrderedDict


def savee(ckpt, sr, if_f0, name, epoch):
    try:
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
            if "enc_q" in key:
                continue
            opt["weight"][key] = ckpt[key].half()
        if sr == "40k":
            opt["config"] = [
                1025,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 10, 2, 2],
                512,
                [16, 16, 4, 4],
                109,
                256,
                40000,
            ]
        elif sr == "48k":
            opt["config"] = [
                1025,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 6, 2, 2, 2],
                512,
                [16, 16, 4, 4, 4],
                109,
                256,
                48000,
            ]
        elif sr == "32k":
            opt["config"] = [
                513,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 4, 2, 2, 2],
                512,
                [16, 16, 4, 4, 4],
                109,
                256,
                32000,
            ]
        opt["info"] = "%sepoch" % epoch
        opt["sr"] = sr
        opt["f0"] = if_f0
        torch.save(opt, "weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()


def show_info(path):
    try:
        a = torch.load(path, map_location="cpu")
        return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s" % (
            a.get("info", "None"),
            a.get("sr", "None"),
            a.get("f0", "None"),
        )
    except:
        return traceback.format_exc()


def extract_small_model(path, name, sr, if_f0, info):
    try:
        ckpt = torch.load(path, map_location="cpu")
        if "model" in ckpt:
            ckpt = ckpt["model"]
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
            if "enc_q" in key:
                continue
            opt["weight"][key] = ckpt[key].half()
        if sr == "40k":
            opt["config"] = [
                1025,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 10, 2, 2],
                512,
                [16, 16, 4, 4],
                109,
                256,
                40000,
            ]
        elif sr == "48k":
            opt["config"] = [
                1025,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 6, 2, 2, 2],
                512,
                [16, 16, 4, 4, 4],
                109,
                256,
                48000,
            ]
        elif sr == "32k":
            opt["config"] = [
                513,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 4, 2, 2, 2],
                512,
                [16, 16, 4, 4, 4],
                109,
                256,
                32000,
            ]
        if info == "":
            info = "Extracted model."
        opt["info"] = info
        opt["sr"] = sr
        opt["f0"] = int(if_f0)
        torch.save(opt, "weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()


def change_info(path, info, name):
    try:
        ckpt = torch.load(path, map_location="cpu")
        ckpt["info"] = info
        if name == "":
            name = os.path.basename(path)
        torch.save(ckpt, "weights/%s" % name)
        return "Success."
    except:
        return traceback.format_exc()


def merge(path1, path2, alpha1, sr, f0, info, name):
    try:

        def extract(ckpt):
            a = ckpt["model"]
            opt = OrderedDict()
            opt["weight"] = {}
            for key in a.keys():
                if "enc_q" in key:
                    continue
                opt["weight"][key] = a[key]
            return opt

        ckpt1 = torch.load(path1, map_location="cpu")
        ckpt2 = torch.load(path2, map_location="cpu")
        cfg = ckpt1["config"]
        if "model" in ckpt1:
            ckpt1 = extract(ckpt1)
        else:
            ckpt1 = ckpt1["weight"]
        if "model" in ckpt2:
            ckpt2 = extract(ckpt2)
        else:
            ckpt2 = ckpt2["weight"]
        if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
            return "Fail to merge the models. The model architectures are not the same."
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt1.keys():
            # try:
            if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
                min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
                opt["weight"][key] = (
                    alpha1 * (ckpt1[key][:min_shape0].float())
                    + (1 - alpha1) * (ckpt2[key][:min_shape0].float())
                ).half()
            else:
                opt["weight"][key] = (
                    alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
                ).half()
            # except:
            #     pdb.set_trace()
        opt["config"] = cfg
        """
        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
        """
        opt["sr"] = sr
        opt["f0"] = 1 if f0 == "" else 0
        opt["info"] = info
        torch.save(opt, "weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()
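All of the helpers above return either "Success." or a formatted traceback, so callers just surface the returned string. A usage sketch (paths and names are illustrative, not from the repo):

print(show_info("weights/my_model.pth"))
print(extract_small_model("logs/G_latest.pth", "my_model", "40k", 1, "notes"))
# 50/50 blend of two architecturally compatible models:
print(merge("weights/a.pth", "weights/b.pth", 0.5, "40k", "", "a+b blend", "a_b_mix"))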
View File
@ -1,4 +1,4 @@
import os,traceback import os, traceback
import glob import glob
import sys import sys
import argparse import argparse
@ -14,44 +14,53 @@ MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging logger = logging
def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
################## def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
def go(model,bkey): assert os.path.isfile(checkpoint_path)
saved_state_dict = checkpoint_dict[bkey] checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
if hasattr(model, 'module'):state_dict = model.module.state_dict()
else:state_dict = model.state_dict()
new_state_dict= {}
for k, v in state_dict.items():#模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if(saved_state_dict[k].shape!=state_dict[k].shape):
print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint" % k)#pretrain缺失的
new_state_dict[k] = v#模型自带的随机值
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict,strict=False)
else:
model.load_state_dict(new_state_dict,strict=False)
go(combd,"combd")
go(sbd,"sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict['iteration'] ##################
learning_rate = checkpoint_dict['learning_rate'] def go(model, bkey):
if optimizer is not None and load_opt==1:###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch saved_state_dict = checkpoint_dict[bkey]
# try: if hasattr(model, "module"):
optimizer.load_state_dict(checkpoint_dict['optimizer']) state_dict = model.module.state_dict()
# except: else:
# traceback.print_exc() state_dict = model.state_dict()
logger.info("Loaded checkpoint '{}' (epoch {})" .format(checkpoint_path, iteration)) new_state_dict = {}
return model, optimizer, learning_rate, iteration for k, v in state_dict.items(): # 模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
print(
"shape-%s-mismatch|need-%s|get-%s"
% (k, state_dict[k].shape, saved_state_dict[k].shape)
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint" % k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
go(combd, "combd")
go(sbd, "sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
): ###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
# def load_checkpoint(checkpoint_path, model, optimizer=None): # def load_checkpoint(checkpoint_path, model, optimizer=None):
@ -83,303 +92,380 @@ def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1):
# logger.info("Loaded checkpoint '{}' (epoch {})" .format( # logger.info("Loaded checkpoint '{}' (epoch {})" .format(
# checkpoint_path, iteration)) # checkpoint_path, iteration))
# return model, optimizer, learning_rate, iteration # return model, optimizer, learning_rate, iteration
def load_checkpoint(checkpoint_path, model, optimizer=None,load_opt=1): def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path) assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
saved_state_dict = checkpoint_dict['model'] saved_state_dict = checkpoint_dict["model"]
if hasattr(model, 'module'): if hasattr(model, "module"):
state_dict = model.module.state_dict() state_dict = model.module.state_dict()
else: else:
state_dict = model.state_dict() state_dict = model.state_dict()
new_state_dict= {} new_state_dict = {}
for k, v in state_dict.items():#模型需要的shape for k, v in state_dict.items(): # 模型需要的shape
try: try:
new_state_dict[k] = saved_state_dict[k] new_state_dict[k] = saved_state_dict[k]
if(saved_state_dict[k].shape!=state_dict[k].shape): if saved_state_dict[k].shape != state_dict[k].shape:
print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))# print(
raise KeyError "shape-%s-mismatch|need-%s|get-%s"
except: % (k, state_dict[k].shape, saved_state_dict[k].shape)
# logger.info(traceback.format_exc()) ) #
logger.info("%s is not in the checkpoint" % k)#pretrain缺失的 raise KeyError
new_state_dict[k] = v#模型自带的随机值 except:
if hasattr(model, 'module'): # logger.info(traceback.format_exc())
model.module.load_state_dict(new_state_dict,strict=False) logger.info("%s is not in the checkpoint" % k) # pretrain缺失的
else: new_state_dict[k] = v # 模型自带的随机值
model.load_state_dict(new_state_dict,strict=False) if hasattr(model, "module"):
logger.info("Loaded model weights") model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
logger.info("Loaded model weights")
iteration = checkpoint_dict['iteration'] iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict['learning_rate'] learning_rate = checkpoint_dict["learning_rate"]
if optimizer is not None and load_opt==1:###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch if (
# try: optimizer is not None and load_opt == 1
optimizer.load_state_dict(checkpoint_dict['optimizer']) ): ###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# except: # try:
# traceback.print_exc() optimizer.load_state_dict(checkpoint_dict["optimizer"])
logger.info("Loaded checkpoint '{}' (epoch {})" .format(checkpoint_path, iteration)) # except:
return model, optimizer, learning_rate, iteration # traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at epoch {} to {}".format( logger.info(
iteration, checkpoint_path)) "Saving model and optimizer state at epoch {} to {}".format(
if hasattr(model, 'module'): iteration, checkpoint_path
state_dict = model.module.state_dict() )
else: )
state_dict = model.state_dict() if hasattr(model, "module"):
torch.save({'model': state_dict, state_dict = model.module.state_dict()
'iteration': iteration, else:
'optimizer': optimizer.state_dict(), state_dict = model.state_dict()
'learning_rate': learning_rate}, checkpoint_path) torch.save(
{
"model": state_dict,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at epoch {} to {}".format( logger.info(
iteration, checkpoint_path)) "Saving model and optimizer state at epoch {} to {}".format(
if hasattr(combd, 'module'): state_dict_combd = combd.module.state_dict() iteration, checkpoint_path
else:state_dict_combd = combd.state_dict() )
if hasattr(sbd, 'module'): state_dict_sbd = sbd.module.state_dict() )
else:state_dict_sbd = sbd.state_dict() if hasattr(combd, "module"):
torch.save({ state_dict_combd = combd.module.state_dict()
'combd': state_dict_combd, else:
'sbd': state_dict_sbd, state_dict_combd = combd.state_dict()
'iteration': iteration, if hasattr(sbd, "module"):
'optimizer': optimizer.state_dict(), state_dict_sbd = sbd.module.state_dict()
'learning_rate': learning_rate}, checkpoint_path) else:
state_dict_sbd = sbd.state_dict()
torch.save(
{
"combd": state_dict_combd,
"sbd": state_dict_sbd,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): def summarize(
for k, v in scalars.items(): writer,
writer.add_scalar(k, v, global_step) global_step,
for k, v in histograms.items(): scalars={},
writer.add_histogram(k, v, global_step) histograms={},
for k, v in images.items(): images={},
writer.add_image(k, v, global_step, dataformats='HWC') audios={},
for k, v in audios.items(): audio_sampling_rate=22050,
writer.add_audio(k, v, global_step, audio_sampling_rate) ):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats="HWC")
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"): def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex)) f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1] x = f_list[-1]
print(x) print(x)
return x return x
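
Note that the sort key in latest_checkpoint_path concatenates every digit in the whole path, not just the step counter; ordering stays correct only because all candidates share the same directory prefix. A quick illustration with a hypothetical path:

# All digits in the path feed the sort key, not just the step counter:
key = lambda f: int("".join(filter(str.isdigit, f)))
assert key("logs/exp40k/G_2333333.pth") == 402333333
# Monotonic only while every candidate shares the same "40k" prefix.
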
def plot_spectrogram_to_numpy(spectrogram): def plot_spectrogram_to_numpy(spectrogram):
global MATPLOTLIB_FLAG global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG: if not MATPLOTLIB_FLAG:
import matplotlib import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(10,2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw() matplotlib.use("Agg")
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') MATPLOTLIB_FLAG = True
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) mpl_logger = logging.getLogger("matplotlib")
plt.close() mpl_logger.setLevel(logging.WARNING)
return data import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
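
A caveat carried over unchanged by this commit: np.fromstring(..., sep="") is deprecated in NumPy, and tostring_rgb is deprecated in newer Matplotlib. A non-deprecated sketch of the same canvas-to-array step, assuming the Agg backend:

import numpy as np

def canvas_to_rgb_array(fig):
    # buffer_rgba() is the stable Agg API; drop alpha to get (height, width, 3).
    fig.canvas.draw()
    rgba = np.asarray(fig.canvas.buffer_rgba(), dtype=np.uint8)
    return rgba[:, :, :3].copy()
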
def plot_alignment_to_numpy(alignment, info=None): def plot_alignment_to_numpy(alignment, info=None):
global MATPLOTLIB_FLAG global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG: if not MATPLOTLIB_FLAG:
import matplotlib import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(6, 4)) matplotlib.use("Agg")
im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', MATPLOTLIB_FLAG = True
interpolation='none') mpl_logger = logging.getLogger("matplotlib")
fig.colorbar(im, ax=ax) mpl_logger.setLevel(logging.WARNING)
xlabel = 'Decoder timestep' import matplotlib.pylab as plt
if info is not None: import numpy as np
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
fig.canvas.draw() fig, ax = plt.subplots(figsize=(6, 4))
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') im = ax.imshow(
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
plt.close() )
return data fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def load_wav_to_torch(full_path): def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path) sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"): def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding='utf-8') as f: with open(filename, encoding="utf-8") as f:
filepaths_and_text = [line.strip().split(split) for line in f] filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text return filepaths_and_text
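
Each filelist line is split on the pipe character, so a record such as path|feature|speaker-id (fields here are hypothetical) comes back as a list:

line = "wavs/0_0.wav|feats/0_0.npy|0\n"   # hypothetical record
assert line.strip().split("|") == ["wavs/0_0.wav", "feats/0_0.npy", "0"]
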
def get_hparams(init=True): def get_hparams(init=True):
''' """
todo: todo:
结尾七人组 结尾七人组
保存频率总epoch done 保存频率总epoch done
bs done bs done
pretrainGpretrainD done pretrainGpretrainD done
卡号os.en["CUDA_VISIBLE_DEVICES"] done 卡号os.en["CUDA_VISIBLE_DEVICES"] done
if_latest todo if_latest todo
模型if_f0 todo 模型if_f0 todo
采样率自动选择config done 采样率自动选择config done
是否缓存数据集进GPU:if_cache_data_in_gpu done 是否缓存数据集进GPU:if_cache_data_in_gpu done
-m: -m:
自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done
-c不要了 -c不要了
''' """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
# parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration') # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
parser.add_argument('-se', '--save_every_epoch', type=int, required=True,help='checkpoint save frequency (epoch)') parser.add_argument(
parser.add_argument('-te', '--total_epoch', type=int, required=True,help='total_epoch') "-se",
    parser.add_argument('-pg', '--pretrainG', type=str, default="",help='Pretrained Generator path') "--save_every_epoch",
    parser.add_argument('-pd', '--pretrainD', type=str, default="",help='Pretrained Discriminator path') type=int,
parser.add_argument('-g', '--gpus', type=str, default="0",help='split by -') required=True,
parser.add_argument('-bs', '--batch_size', type=int, required=True,help='batch size') help="checkpoint save frequency (epoch)",
parser.add_argument('-e', '--experiment_dir', type=str, required=True,help='experiment dir')#-m )
parser.add_argument('-sr', '--sample_rate', type=str, required=True,help='sample rate, 32k/40k/48k') parser.add_argument(
parser.add_argument('-f0', '--if_f0', type=int, required=True,help='use f0 as one of the inputs of the model, 1 or 0') "-te", "--total_epoch", type=int, required=True, help="total_epoch"
parser.add_argument('-l', '--if_latest', type=int, required=True,help='if only save the latest G/D pth file, 1 or 0') )
parser.add_argument('-c', '--if_cache_data_in_gpu', type=int, required=True,help='if caching the dataset in GPU memory, 1 or 0') parser.add_argument(
"-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path"
)
parser.add_argument(
"-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path"
)
parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
parser.add_argument(
"-bs", "--batch_size", type=int, required=True, help="batch size"
)
parser.add_argument(
"-e", "--experiment_dir", type=str, required=True, help="experiment dir"
) # -m
parser.add_argument(
"-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
)
parser.add_argument(
"-f0",
"--if_f0",
type=int,
required=True,
help="use f0 as one of the inputs of the model, 1 or 0",
)
parser.add_argument(
"-l",
"--if_latest",
type=int,
required=True,
help="if only save the latest G/D pth file, 1 or 0",
)
parser.add_argument(
"-c",
"--if_cache_data_in_gpu",
type=int,
required=True,
help="if caching the dataset in GPU memory, 1 or 0",
)
args = parser.parse_args() args = parser.parse_args()
name = args.experiment_dir name = args.experiment_dir
experiment_dir = os.path.join("./logs", args.experiment_dir) experiment_dir = os.path.join("./logs", args.experiment_dir)
if not os.path.exists(experiment_dir): if not os.path.exists(experiment_dir):
os.makedirs(experiment_dir) os.makedirs(experiment_dir)
config_path = "configs/%s.json"%args.sample_rate config_path = "configs/%s.json" % args.sample_rate
config_save_path = os.path.join(experiment_dir, "config.json") config_save_path = os.path.join(experiment_dir, "config.json")
if init: if init:
with open(config_path, "r") as f: with open(config_path, "r") as f:
data = f.read() data = f.read()
with open(config_save_path, "w") as f: with open(config_save_path, "w") as f:
f.write(data) f.write(data)
else: else:
with open(config_save_path, "r") as f: with open(config_save_path, "r") as f:
data = f.read() data = f.read()
config = json.loads(data) config = json.loads(data)
hparams = HParams(**config) hparams = HParams(**config)
hparams.model_dir = hparams.experiment_dir = experiment_dir hparams.model_dir = hparams.experiment_dir = experiment_dir
hparams.save_every_epoch = args.save_every_epoch hparams.save_every_epoch = args.save_every_epoch
hparams.name = name hparams.name = name
hparams.total_epoch = args.total_epoch hparams.total_epoch = args.total_epoch
hparams.pretrainG = args.pretrainG hparams.pretrainG = args.pretrainG
hparams.pretrainD = args.pretrainD hparams.pretrainD = args.pretrainD
hparams.gpus = args.gpus hparams.gpus = args.gpus
hparams.train.batch_size = args.batch_size hparams.train.batch_size = args.batch_size
hparams.sample_rate = args.sample_rate hparams.sample_rate = args.sample_rate
hparams.if_f0 = args.if_f0 hparams.if_f0 = args.if_f0
hparams.if_latest = args.if_latest hparams.if_latest = args.if_latest
hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
hparams.data.training_files = "%s/filelist.txt"%experiment_dir hparams.data.training_files = "%s/filelist.txt" % experiment_dir
return hparams return hparams
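
Putting the flags above together, a typical invocation would look like the following sketch. It is illustrative only: the flag values are made up and the entry-point name follows the train_nsf_load_pretrain.py script mentioned in the docstring.

flags = {
    "-e": "mi-test",      # experiment dir under ./logs
    "-sr": "40k",         # picks configs/40k.json
    "-f0": "1", "-bs": "4", "-g": "0",
    "-te": "20", "-se": "5", "-l": "0", "-c": "0",
    "-pg": "pretrained/f0G40k.pth", "-pd": "pretrained/f0D40k.pth",
}
cmd = "python train_nsf_load_pretrain.py " + " ".join(f"{k} {v}" for k, v in flags.items())
print(cmd)
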
def get_hparams_from_dir(model_dir): def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json") config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f: with open(config_save_path, "r") as f:
data = f.read() data = f.read()
config = json.loads(data) config = json.loads(data)
hparams =HParams(**config) hparams = HParams(**config)
hparams.model_dir = model_dir hparams.model_dir = model_dir
return hparams return hparams
def get_hparams_from_file(config_path): def get_hparams_from_file(config_path):
with open(config_path, "r") as f: with open(config_path, "r") as f:
data = f.read() data = f.read()
config = json.loads(data) config = json.loads(data)
hparams =HParams(**config) hparams = HParams(**config)
return hparams return hparams
def check_git_hash(model_dir): def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__)) source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")): if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( logger.warn(
source_dir "{} is not a git repository, therefore hash value comparison will be ignored.".format(
)) source_dir
return )
)
return
cur_hash = subprocess.getoutput("git rev-parse HEAD") cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash") path = os.path.join(model_dir, "githash")
if os.path.exists(path): if os.path.exists(path):
saved_hash = open(path).read() saved_hash = open(path).read()
if saved_hash != cur_hash: if saved_hash != cur_hash:
logger.warn("git hash values are different. {}(saved) != {}(current)".format( logger.warn(
saved_hash[:8], cur_hash[:8])) "git hash values are different. {}(saved) != {}(current)".format(
else: saved_hash[:8], cur_hash[:8]
open(path, "w").write(cur_hash) )
)
else:
open(path, "w").write(cur_hash)
def get_logger(model_dir, filename="train.log"): def get_logger(model_dir, filename="train.log"):
global logger global logger
logger = logging.getLogger(os.path.basename(model_dir)) logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir): if not os.path.exists(model_dir):
os.makedirs(model_dir) os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename)) h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG) h.setLevel(logging.DEBUG)
h.setFormatter(formatter) h.setFormatter(formatter)
logger.addHandler(h) logger.addHandler(h)
return logger return logger
class HParams(): class HParams:
def __init__(self, **kwargs): def __init__(self, **kwargs):
for k, v in kwargs.items(): for k, v in kwargs.items():
if type(v) == dict: if type(v) == dict:
v = HParams(**v) v = HParams(**v)
self[k] = v self[k] = v
def keys(self):
return self.__dict__.keys()
def items(self): def keys(self):
return self.__dict__.items() return self.__dict__.keys()
def values(self): def items(self):
return self.__dict__.values() return self.__dict__.items()
def __len__(self): def values(self):
return len(self.__dict__) return self.__dict__.values()
def __getitem__(self, key): def __len__(self):
return getattr(self, key) return len(self.__dict__)
def __setitem__(self, key, value): def __getitem__(self, key):
return setattr(self, key, value) return getattr(self, key)
def __contains__(self, key): def __setitem__(self, key, value):
return key in self.__dict__ return setattr(self, key, value)
def __repr__(self): def __contains__(self, key):
return self.__dict__.__repr__() return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
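
HParams is just a recursive attribute wrapper over the config dict; a minimal sketch of what that buys:

hp = HParams(**{"train": {"batch_size": 4, "lr_decay": 0.999}, "sample_rate": "40k"})
assert hp.train.batch_size == 4      # nested dicts become nested HParams
assert hp["sample_rate"] == "40k"    # item access maps onto attributes
hp.train.batch_size = 8              # assignment works both ways
assert "train" in hp and len(hp) == 2
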
View File
@ -1,12 +1,15 @@
import sys,os import sys, os
now_dir=os.getcwd()
sys.path.append(os.path.join(now_dir,"train")) now_dir = os.getcwd()
sys.path.append(os.path.join(now_dir, "train"))
import utils import utils
hps = utils.get_hparams() hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"]=hps.gpus.replace("-",",") os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
n_gpus=len(hps.gpus.split("-")) n_gpus = len(hps.gpus.split("-"))
from random import shuffle from random import shuffle
import traceback,json,argparse,itertools,math,torch,pdb import traceback, json, argparse, itertools, math, torch, pdb
torch.backends.cudnn.deterministic = False torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False torch.backends.cudnn.benchmark = False
from torch import nn, optim from torch import nn, optim
@ -20,9 +23,16 @@ from torch.cuda.amp import autocast, GradScaler
from infer_pack import commons from infer_pack import commons
from time import time as ttime from time import time as ttime
from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler from data_utils import (
TextAudioLoaderMultiNSFsid,
TextAudioLoader,
TextAudioCollateMultiNSFsid,
TextAudioCollate,
DistributedBucketSampler,
)
from infer_pack.models import ( from infer_pack.models import (
SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
MultiPeriodDiscriminator, MultiPeriodDiscriminator,
) )
from losses import generator_loss, discriminator_loss, feature_loss, kl_loss from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
@ -32,13 +42,11 @@ from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
global_step = 0 global_step = 0
def main(): def main():
# n_gpus = torch.cuda.device_count() # n_gpus = torch.cuda.device_count()
os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "5555" os.environ["MASTER_PORT"] = "5555"
mp.spawn( mp.spawn(
run, run,
nprocs=n_gpus, nprocs=n_gpus,
@ -62,13 +70,16 @@ def run(rank, n_gpus, hps):
backend="gloo", init_method="env://", world_size=n_gpus, rank=rank backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
) )
torch.manual_seed(hps.train.seed) torch.manual_seed(hps.train.seed)
if torch.cuda.is_available(): torch.cuda.set_device(rank) if torch.cuda.is_available():
torch.cuda.set_device(rank)
if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) if hps.if_f0 == 1:
else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data) train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
else:
train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
train_sampler = DistributedBucketSampler( train_sampler = DistributedBucketSampler(
train_dataset, train_dataset,
hps.train.batch_size*n_gpus, hps.train.batch_size * n_gpus,
# [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
[100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
num_replicas=n_gpus, num_replicas=n_gpus,
@ -77,8 +88,10 @@ def run(rank, n_gpus, hps):
) )
# It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit. # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
# num_workers=8 -> num_workers=4 # num_workers=8 -> num_workers=4
if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid() if hps.if_f0 == 1:
else:collate_fn = TextAudioCollate() collate_fn = TextAudioCollateMultiNSFsid()
else:
collate_fn = TextAudioCollate()
train_loader = DataLoader( train_loader = DataLoader(
train_dataset, train_dataset,
num_workers=4, num_workers=4,
@ -89,13 +102,26 @@ def run(rank, n_gpus, hps):
persistent_workers=True, persistent_workers=True,
prefetch_factor=8, prefetch_factor=8,
) )
if(hps.if_f0==1): if hps.if_f0 == 1:
net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate) net_g = SynthesizerTrnMs256NSFsid(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
sr=hps.sample_rate,
)
else: else:
net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run) net_g = SynthesizerTrnMs256NSFsid_nono(
if torch.cuda.is_available(): net_g = net_g.cuda(rank) hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
)
if torch.cuda.is_available():
net_g = net_g.cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
if torch.cuda.is_available(): net_d = net_d.cuda(rank) if torch.cuda.is_available():
net_d = net_d.cuda(rank)
optim_g = torch.optim.AdamW( optim_g = torch.optim.AdamW(
net_g.parameters(), net_g.parameters(),
hps.train.learning_rate, hps.train.learning_rate,
@ -110,30 +136,42 @@ def run(rank, n_gpus, hps):
) )
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
if torch.cuda.is_available(): if torch.cuda.is_available():
net_g = DDP(net_g, device_ids=[rank]) net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank]) net_d = DDP(net_d, device_ids=[rank])
else: else:
net_g = DDP(net_g) net_g = DDP(net_g)
net_d = DDP(net_d) net_d = DDP(net_d)
    try:# resume automatically if a checkpoint can be loaded try:  # resume automatically if a checkpoint can be loaded
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) # loading D almost always succeeds _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
        ) # loading D almost always succeeds
if rank == 0: if rank == 0:
logger.info("loaded D") logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) _, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
)
global_step = (epoch_str - 1) * len(train_loader) global_step = (epoch_str - 1) * len(train_loader)
# epoch_str = 1 # epoch_str = 1
# global_step = 0 # global_step = 0
    except:# if nothing can be loaded (first run), load the pretrained weights except:  # if nothing can be loaded (first run), load the pretrained weights
traceback.print_exc() traceback.print_exc()
epoch_str = 1 epoch_str = 1
global_step = 0 global_step = 0
if rank == 0: if rank == 0:
logger.info("loaded pretrained %s %s"%(hps.pretrainG,hps.pretrainD)) logger.info("loaded pretrained %s %s" % (hps.pretrainG, hps.pretrainD))
        print(net_g.module.load_state_dict(torch.load(hps.pretrainG,map_location="cpu")["model"]))## testing: do not load the optimizer print(
        print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"])) torch.load(hps.pretrainG, map_location="cpu")["model"]
            )
        )
        ) ## testing: do not load the optimizer
print(
net_d.module.load_state_dict(
torch.load(hps.pretrainD, map_location="cpu")["model"]
)
)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR( scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
@ -144,7 +182,7 @@ def run(rank, n_gpus, hps):
scaler = GradScaler(enabled=hps.train.fp16_run) scaler = GradScaler(enabled=hps.train.fp16_run)
cache=[] cache = []
for epoch in range(epoch_str, hps.train.epochs + 1): for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0: if rank == 0:
train_and_evaluate( train_and_evaluate(
@ -157,7 +195,8 @@ def run(rank, n_gpus, hps):
scaler, scaler,
[train_loader, None], [train_loader, None],
logger, logger,
[writer, writer_eval],cache [writer, writer_eval],
cache,
) )
else: else:
train_and_evaluate( train_and_evaluate(
@ -170,14 +209,15 @@ def run(rank, n_gpus, hps):
scaler, scaler,
[train_loader, None], [train_loader, None],
None, None,
None,cache None,
cache,
) )
scheduler_g.step() scheduler_g.step()
scheduler_d.step() scheduler_d.step()
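
The try/except above implements a resume-else-pretrain policy. Stripped of logging and DDP details, the control flow is roughly the following sketch, reusing the names from the code (not a drop-in replacement):

def resume_or_pretrain(hps, net_g, net_d, optim_g, optim_d, train_loader):
    try:
        # Resume: the latest D_*.pth / G_*.pth pair restores weights,
        # optimizer state and the epoch counter.
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
        )
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
        )
        return epoch_str, (epoch_str - 1) * len(train_loader)
    except Exception:
        # First run: fall back to pretrained weights; optimizers start fresh.
        net_g.module.load_state_dict(torch.load(hps.pretrainG, map_location="cpu")["model"])
        net_d.module.load_state_dict(torch.load(hps.pretrainD, map_location="cpu")["model"])
        return 1, 0
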
def train_and_evaluate( def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers,cache rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
): ):
net_g, net_d = nets net_g, net_d = nets
optim_g, optim_d = optims optim_g, optim_d = optims
@ -190,168 +230,90 @@ def train_and_evaluate(
net_g.train() net_g.train()
net_d.train() net_d.train()
    if(cache==[]or hps.if_cache_data_in_gpu==False):# on the first epoch, fill the cache with the whole training set if cache == [] or hps.if_cache_data_in_gpu == False:  # on the first epoch, fill the cache with the whole training set
# print("caching") # print("caching")
for batch_idx, info in enumerate(train_loader): for batch_idx, info in enumerate(train_loader):
if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info if hps.if_f0 == 1:
else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info (
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
if torch.cuda.is_available(): if torch.cuda.is_available():
phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True ) phone, phone_lengths = phone.cuda(
if (hps.if_f0 == 1):pitch,pitchf = pitch.cuda(rank, non_blocking=True),pitchf.cuda(rank, non_blocking=True) rank, non_blocking=True
), phone_lengths.cuda(rank, non_blocking=True)
if hps.if_f0 == 1:
pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda(
rank, non_blocking=True
)
sid = sid.cuda(rank, non_blocking=True) sid = sid.cuda(rank, non_blocking=True)
spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) spec, spec_lengths = spec.cuda(
wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True) rank, non_blocking=True
if(hps.if_cache_data_in_gpu==True): ), spec_lengths.cuda(rank, non_blocking=True)
if (hps.if_f0 == 1):cache.append((batch_idx, (phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths ,sid))) wave, wave_lengths = wave.cuda(
else:cache.append((batch_idx, (phone,phone_lengths,spec,spec_lengths,wave,wave_lengths ,sid))) rank, non_blocking=True
with autocast(enabled=hps.train.fp16_run): ), wave_lengths.cuda(rank, non_blocking=True)
if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid) if hps.if_cache_data_in_gpu == True:
else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid) if hps.if_f0 == 1:
mel = spec_to_mel_torch(spec,hps.data.filter_length,hps.data.n_mel_channels,hps.data.sampling_rate,hps.data.mel_fmin,hps.data.mel_fmax,) cache.append(
y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) (
with autocast(enabled=False): batch_idx,
y_hat_mel = mel_spectrogram_torch( (
y_hat.float().squeeze(1), phone,
hps.data.filter_length, phone_lengths,
hps.data.n_mel_channels, pitch,
hps.data.sampling_rate, pitchf,
hps.data.hop_length, spec,
hps.data.win_length, spec_lengths,
hps.data.mel_fmin, wave,
hps.data.mel_fmax, wave_lengths,
) sid,
if(hps.train.fp16_run==True): ),
y_hat_mel=y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
) )
) )
# Amor For Tensorboard display else:
if loss_mel > 50: cache.append(
loss_mel = 50 (
if loss_kl > 5: batch_idx,
loss_kl = 5 (
phone,
logger.info([global_step, lr]) phone_lengths,
logger.info( spec,
f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" spec_lengths,
wave,
wave_lengths,
sid,
),
)
) )
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0 and rank == 0:
if(hps.if_latest==0):
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
    else:# later epochs reuse the shuffled cache directly
shuffle(cache)
# print("using cache")
for batch_idx, info in cache:
if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info
else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info
with autocast(enabled=hps.train.fp16_run): with autocast(enabled=hps.train.fp16_run):
if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid) if hps.if_f0 == 1:
else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid) (
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(
phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch( mel = spec_to_mel_torch(
spec, spec,
hps.data.filter_length, hps.data.filter_length,
@ -374,8 +336,200 @@ def train_and_evaluate(
hps.data.mel_fmin, hps.data.mel_fmin,
hps.data.mel_fmax, hps.data.mel_fmax,
) )
if(hps.train.fp16_run==True): if hps.train.fp16_run == True:
y_hat_mel=y_hat_mel.half() y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
)
)
# Amor For Tensorboard display
if loss_mel > 50:
loss_mel = 50
if loss_kl > 5:
loss_kl = 5
logger.info([global_step, lr])
logger.info(
f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
)
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{
"loss/d_r/{}".format(i): v
for i, v in enumerate(losses_disc_r)
}
)
scalar_dict.update(
{
"loss/d_g/{}".format(i): v
for i, v in enumerate(losses_disc_g)
}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0 and rank == 0:
if hps.if_latest == 0:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
    else:  # later epochs reuse the shuffled cache directly
shuffle(cache)
# print("using cache")
for batch_idx, info in cache:
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
with autocast(enabled=hps.train.fp16_run):
if hps.if_f0 == 1:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(
phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
y_mel = commons.slice_segments(
mel, ids_slice, hps.train.segment_size // hps.data.hop_length
)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
if hps.train.fp16_run == True:
y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments( wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice ) # slice
@ -435,17 +589,27 @@ def train_and_evaluate(
"grad_norm_g": grad_norm_g, "grad_norm_g": grad_norm_g,
} }
scalar_dict.update( scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl} {
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
) )
scalar_dict.update( scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
) )
scalar_dict.update( scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} {
"loss/d_r/{}".format(i): v
for i, v in enumerate(losses_disc_r)
}
) )
scalar_dict.update( scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} {
"loss/d_g/{}".format(i): v
for i, v in enumerate(losses_disc_g)
}
) )
image_dict = { image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy( "slice/mel_org": utils.plot_spectrogram_to_numpy(
@ -467,7 +631,7 @@ def train_and_evaluate(
global_step += 1 global_step += 1
# if global_step % hps.train.eval_interval == 0: # if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0 and rank == 0: if epoch % hps.save_every_epoch == 0 and rank == 0:
if(hps.if_latest==0): if hps.if_latest == 0:
utils.save_checkpoint( utils.save_checkpoint(
net_g, net_g,
optim_g, optim_g,
@ -498,15 +662,20 @@ def train_and_evaluate(
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
) )
if rank == 0: if rank == 0:
logger.info("====> Epoch: {}".format(epoch)) logger.info("====> Epoch: {}".format(epoch))
if(epoch>=hps.total_epoch and rank == 0): if epoch >= hps.total_epoch and rank == 0:
logger.info("Training is done. The program is closed.") logger.info("Training is done. The program is closed.")
from process_ckpt import savee#def savee(ckpt,sr,if_f0,name,epoch): from process_ckpt import savee # def savee(ckpt,sr,if_f0,name,epoch):
if hasattr(net_g, 'module'):ckpt = net_g.module.state_dict()
else:ckpt = net_g.state_dict() if hasattr(net_g, "module"):
logger.info("saving final ckpt:%s"%(savee(ckpt,hps.sample_rate,hps.if_f0,hps.name,epoch))) ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving final ckpt:%s"
% (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch))
)
os._exit(2333333) os._exit(2333333)
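
The cache handling in train_and_evaluate reduces to a small pattern: the first epoch streams batches from the DataLoader and, when if_cache_data_in_gpu is set, parks each batch on the GPU; every later epoch replays a shuffled copy of that list instead of touching the loader. A condensed sketch (not a drop-in replacement):

from random import shuffle

def iterate_batches(train_loader, cache, use_gpu_cache):
    """First call fills `cache` from the loader; later calls replay it shuffled."""
    if cache == [] or not use_gpu_cache:
        for batch_idx, info in enumerate(train_loader):
            # ...move the tensors in `info` to the GPU here...
            if use_gpu_cache:
                cache.append((batch_idx, info))
            yield batch_idx, info
    else:
        shuffle(cache)
        for batch_idx, info in cache:
            yield batch_idx, info
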
View File
@ -1,5 +1,6 @@
import sys,os,multiprocessing import sys, os, multiprocessing
now_dir=os.getcwd()
now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
inp_root = sys.argv[1] inp_root = sys.argv[1]
@ -7,15 +8,17 @@ sr = int(sys.argv[2])
n_p = int(sys.argv[3]) n_p = int(sys.argv[3])
exp_dir = sys.argv[4] exp_dir = sys.argv[4]
noparallel = sys.argv[5] == "True" noparallel = sys.argv[5] == "True"
import numpy as np,os,traceback import numpy as np, os, traceback
from slicer2 import Slicer from slicer2 import Slicer
import librosa,traceback import librosa, traceback
from scipy.io import wavfile from scipy.io import wavfile
import multiprocessing import multiprocessing
from my_utils import load_audio from my_utils import load_audio
mutex = multiprocessing.Lock() mutex = multiprocessing.Lock()
f = open("%s/preprocess.log"%exp_dir, "a+") f = open("%s/preprocess.log" % exp_dir, "a+")
def println(strr): def println(strr):
mutex.acquire() mutex.acquire()
print(strr) print(strr)
@ -23,81 +26,101 @@ def println(strr):
f.flush() f.flush()
mutex.release() mutex.release()
class PreProcess():
def __init__(self,sr,exp_dir): class PreProcess:
def __init__(self, sr, exp_dir):
self.slicer = Slicer( self.slicer = Slicer(
sr=sr, sr=sr,
threshold=-32, threshold=-32,
min_length=800, min_length=800,
min_interval=400, min_interval=400,
hop_size=15, hop_size=15,
max_sil_kept=150 max_sil_kept=150,
) )
self.sr=sr self.sr = sr
self.per=3.7 self.per = 3.7
self.overlap=0.3 self.overlap = 0.3
self.tail=self.per+self.overlap self.tail = self.per + self.overlap
self.max=0.95 self.max = 0.95
self.alpha=0.8 self.alpha = 0.8
self.exp_dir=exp_dir self.exp_dir = exp_dir
self.gt_wavs_dir="%s/0_gt_wavs"%exp_dir self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
self.wavs16k_dir="%s/1_16k_wavs"%exp_dir self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
os.makedirs(self.exp_dir,exist_ok=True) os.makedirs(self.exp_dir, exist_ok=True)
os.makedirs(self.gt_wavs_dir,exist_ok=True) os.makedirs(self.gt_wavs_dir, exist_ok=True)
os.makedirs(self.wavs16k_dir,exist_ok=True) os.makedirs(self.wavs16k_dir, exist_ok=True)
def norm_write(self,tmp_audio,idx0,idx1): def norm_write(self, tmp_audio, idx0, idx1):
tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (1 - self.alpha) * tmp_audio tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
wavfile.write("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, (tmp_audio*32768).astype(np.int16)) 1 - self.alpha
) * tmp_audio
wavfile.write(
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
self.sr,
(tmp_audio * 32768).astype(np.int16),
)
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000) tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
wavfile.write("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, (tmp_audio*32768).astype(np.int16)) wavfile.write(
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000,
(tmp_audio * 32768).astype(np.int16),
)
def pipeline(self,path, idx0): def pipeline(self, path, idx0):
try: try:
audio = load_audio(path,self.sr) audio = load_audio(path, self.sr)
idx1=0 idx1 = 0
for audio in self.slicer.slice(audio): for audio in self.slicer.slice(audio):
i = 0 i = 0
while (1): while 1:
start = int(self.sr * (self.per - self.overlap) * i) start = int(self.sr * (self.per - self.overlap) * i)
i += 1 i += 1
if (len(audio[start:]) > self.tail * self.sr): if len(audio[start:]) > self.tail * self.sr:
tmp_audio = audio[start:start + int(self.per * self.sr)] tmp_audio = audio[start : start + int(self.per * self.sr)]
self.norm_write(tmp_audio,idx0,idx1) self.norm_write(tmp_audio, idx0, idx1)
idx1 += 1 idx1 += 1
else: else:
tmp_audio = audio[start:] tmp_audio = audio[start:]
break break
self.norm_write(tmp_audio, idx0, idx1) self.norm_write(tmp_audio, idx0, idx1)
println("%s->Suc."%path) println("%s->Suc." % path)
except: except:
println("%s->%s"%(path,traceback.format_exc())) println("%s->%s" % (path, traceback.format_exc()))
def pipeline_mp(self,infos): def pipeline_mp(self, infos):
for path, idx0 in infos: for path, idx0 in infos:
self.pipeline(path,idx0) self.pipeline(path, idx0)
def pipeline_mp_inp_dir(self,inp_root,n_p): def pipeline_mp_inp_dir(self, inp_root, n_p):
try: try:
infos = [("%s/%s" % (inp_root, name), idx) for idx, name in enumerate(sorted(list(os.listdir(inp_root))))] infos = [
("%s/%s" % (inp_root, name), idx)
for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
]
if noparallel: if noparallel:
for i in range(n_p): self.pipeline_mp(infos[i::n_p])
else:
ps=[]
for i in range(n_p): for i in range(n_p):
p=multiprocessing.Process(target=self.pipeline_mp,args=(infos[i::n_p],)) self.pipeline_mp(infos[i::n_p])
else:
ps = []
for i in range(n_p):
p = multiprocessing.Process(
target=self.pipeline_mp, args=(infos[i::n_p],)
)
p.start() p.start()
ps.append(p) ps.append(p)
for p in ps:p.join() for p in ps:
p.join()
except: except:
println("Fail. %s"%traceback.format_exc()) println("Fail. %s" % traceback.format_exc())
def preprocess_trainset(inp_root, sr, n_p, exp_dir): def preprocess_trainset(inp_root, sr, n_p, exp_dir):
pp=PreProcess(sr,exp_dir) pp = PreProcess(sr, exp_dir)
println("start preprocess") println("start preprocess")
println(sys.argv) println(sys.argv)
pp.pipeline_mp_inp_dir(inp_root,n_p) pp.pipeline_mp_inp_dir(inp_root, n_p)
println("end preprocess") println("end preprocess")
if __name__=='__main__':
if __name__ == "__main__":
preprocess_trainset(inp_root, sr, n_p, exp_dir) preprocess_trainset(inp_root, sr, n_p, exp_dir)
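
norm_write blends a peak-normalized copy of each slice with the raw signal: out = alpha * max * x / |x|max + (1 - alpha) * x, with self.max = 0.95 and self.alpha = 0.8, which caps peaks while retaining some of the original dynamics. A numeric sketch of that blend:

import numpy as np

x = np.array([0.1, -0.5, 0.25], dtype=np.float32)   # toy audio slice
peak, alpha = 0.95, 0.8                             # self.max, self.alpha above
out = (x / np.abs(x).max() * (peak * alpha)) + (1 - alpha) * x
# The first term maps the loudest sample to +/-0.76 (= 0.95 * 0.8);
# the 0.2 * x residual keeps a fraction of the original level.
print(out)   # approx [0.172, -0.86, 0.43]
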
View File
@ -10,7 +10,6 @@ from uvr5_pack.lib_v5 import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset): class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list): def __init__(self, patch_list):
self.patch_list = patch_list self.patch_list = patch_list
@ -21,7 +20,7 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
path = self.patch_list[idx] path = self.patch_list[idx]
data = np.load(path) data = np.load(path)
X, y = data['X'], data['y'] X, y = data["X"], data["y"]
X_mag = np.abs(X) X_mag = np.abs(X)
y_mag = np.abs(y) y_mag = np.abs(y)
@ -30,16 +29,22 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
def make_pair(mix_dir, inst_dir): def make_pair(mix_dir, inst_dir):
input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac'] input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted([ X_list = sorted(
os.path.join(mix_dir, fname) [
for fname in os.listdir(mix_dir) os.path.join(mix_dir, fname)
if os.path.splitext(fname)[1] in input_exts]) for fname in os.listdir(mix_dir)
y_list = sorted([ if os.path.splitext(fname)[1] in input_exts
os.path.join(inst_dir, fname) ]
for fname in os.listdir(inst_dir) )
if os.path.splitext(fname)[1] in input_exts]) y_list = sorted(
[
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
filelist = list(zip(X_list, y_list)) filelist = list(zip(X_list, y_list))
@ -47,10 +52,11 @@ def make_pair(mix_dir, inst_dir):
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == 'random': if split_mode == "random":
filelist = make_pair( filelist = make_pair(
os.path.join(dataset_dir, 'mixtures'), os.path.join(dataset_dir, "mixtures"),
os.path.join(dataset_dir, 'instruments')) os.path.join(dataset_dir, "instruments"),
)
random.shuffle(filelist) random.shuffle(filelist)
@ -60,19 +66,23 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
val_filelist = filelist[-val_size:] val_filelist = filelist[-val_size:]
else: else:
train_filelist = [ train_filelist = [
pair for pair in filelist pair for pair in filelist if list(pair) not in val_filelist
if list(pair) not in val_filelist] ]
elif split_mode == 'subdirs': elif split_mode == "subdirs":
if len(val_filelist) != 0: if len(val_filelist) != 0:
raise ValueError('The `val_filelist` option is not available in `subdirs` mode') raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
train_filelist = make_pair( train_filelist = make_pair(
os.path.join(dataset_dir, 'training/mixtures'), os.path.join(dataset_dir, "training/mixtures"),
os.path.join(dataset_dir, 'training/instruments')) os.path.join(dataset_dir, "training/instruments"),
)
val_filelist = make_pair( val_filelist = make_pair(
os.path.join(dataset_dir, 'validation/mixtures'), os.path.join(dataset_dir, "validation/mixtures"),
os.path.join(dataset_dir, 'validation/instruments')) os.path.join(dataset_dir, "validation/instruments"),
)
return train_filelist, val_filelist return train_filelist, val_filelist
@ -81,7 +91,9 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X)) perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)): for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate: if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask) y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
if np.random.uniform() < 0.5: if np.random.uniform() < 0.5:
# swap channel # swap channel
@ -116,10 +128,8 @@ def make_padding(width, cropsize, offset):
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist) len_dataset = patches * len(filelist)
X_dataset = np.zeros( X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)): for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
@ -127,22 +137,24 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
X, y = X / coef, y / coef X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset) l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant') X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant') y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches) starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
ends = starts + cropsize ends = starts + cropsize
for j in range(patches): for j in range(patches):
idx = i * patches + j idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]] X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]] y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
return X_dataset, y_dataset return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = [] patch_list = []
patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset) patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
os.makedirs(patch_dir, exist_ok=True) os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)): for i, (X_path, y_path) in enumerate(tqdm(filelist)):
@ -153,18 +165,19 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
X, y = X / coef, y / coef X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset) l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant') X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant') y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
len_dataset = int(np.ceil(X.shape[2] / roi_size)) len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset): for j in range(len_dataset):
outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j)) outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
start = j * roi_size start = j * roi_size
if not os.path.exists(outpath): if not os.path.exists(outpath):
np.savez( np.savez(
outpath, outpath,
X=X_pad[:, :, start:start + cropsize], X=X_pad[:, :, start : start + cropsize],
y=y_pad[:, :, start:start + cropsize]) y=y_pad[:, :, start : start + cropsize],
)
patch_list.append(outpath) patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list) return VocalRemoverValidationSet(patch_list)
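
make_training_set builds its patches by padding each spectrogram and taking random fixed-width crops along the time axis; the core indexing, with illustrative shapes:

import numpy as np

X = np.zeros((2, 1025, 500), dtype=np.complex64)   # (channels, bins, frames)
cropsize, patches = 256, 4
starts = np.random.randint(0, X.shape[2] - cropsize, patches)
crops = np.stack([X[:, :, s : s + cropsize] for s in starts])
assert crops.shape == (patches, 2, 1025, cropsize)
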
View File
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module): class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__() super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential( self.conv = nn.Sequential(
nn.Conv2d( nn.Conv2d(
nin, nout, nin,
nout,
kernel_size=ksize, kernel_size=ksize,
stride=stride, stride=stride,
padding=pad, padding=pad,
dilation=dilation, dilation=dilation,
bias=False), bias=False,
),
nn.BatchNorm2d(nout), nn.BatchNorm2d(nout),
activ() activ(),
) )
def __call__(self, x): def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module): class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__() super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential( self.conv = nn.Sequential(
nn.Conv2d( nn.Conv2d(
nin, nin, nin,
nin,
kernel_size=ksize, kernel_size=ksize,
stride=stride, stride=stride,
padding=pad, padding=pad,
dilation=dilation, dilation=dilation,
groups=nin, groups=nin,
bias=False), bias=False,
nn.Conv2d( ),
nin, nout, nn.Conv2d(nin, nout, kernel_size=1, bias=False),
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout), nn.BatchNorm2d(nout),
activ() activ(),
) )
def __call__(self, x): def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module): class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__() super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module): class Decoder(nn.Module):
def __init__(
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None): def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None: if skip is not None:
skip = spec_utils.crop_center(skip, x) skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1) x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module): class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__() super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential( self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)), nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
) )
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv( self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv( self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv( self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential( self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
nn.Dropout2d(0.1)
) )
def forward(self, x): def forward(self, x):
_, _, h, w = x.size() _, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x) feat2 = self.conv2(x)
feat3 = self.conv3(x) feat3 = self.conv3(x)
feat4 = self.conv4(x) feat4 = self.conv4(x)
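
SeperableConv2DBNActiv above is the classic depthwise-plus-pointwise factorization; a quick parameter-count sketch of why it is used (sizes are arbitrary):

import torch.nn as nn

nin, nout, k = 64, 128, 3
depthwise = nn.Conv2d(nin, nin, k, padding=1, groups=nin, bias=False)
pointwise = nn.Conv2d(nin, nout, 1, bias=False)
dense = nn.Conv2d(nin, nout, k, padding=1, bias=False)
params = lambda m: sum(p.numel() for p in m.parameters())
# 64*9 + 64*128 = 8768 weights vs 64*128*9 = 73728 for the dense conv.
print(params(depthwise) + params(pointwise), params(dense))
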
View File
@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils

class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):

@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):

class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):

@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):

class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)

@ -65,14 +63,15 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)

@ -85,28 +84,31 @@ class Decoder(nn.Module):

class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
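For orientation, a minimal usage sketch of the module above; the tensor shape and channel widths are illustrative assumptions, and it assumes the truncated forward() goes on to concatenate all five branches and apply the bottleneck, as in the upstream vocal-remover code:

# Hypothetical sketch only; shapes and widths are made up for illustration.
import torch

aspp = ASPPModule(nin=32, nout=64, dilations=(4, 8, 16))
x = torch.randn(1, 32, 64, 128)  # (batch, channels, freq bins, time frames)
y = aspp(x)  # the pooled conv1 branch is re-interpolated to (h, w) before concatenation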


View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils

class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):

@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):

class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):

@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):

class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)

@ -65,14 +63,15 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)

@ -85,32 +84,37 @@ class Decoder(nn.Module):

class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv6 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv7 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
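This variant adds conv6 and conv7, so the bottleneck must accept nin * 7 channels: the pooled conv1 branch, the 1x1 conv2 branch, and five dilated separable branches. A quick check of that bookkeeping, with an assumed example width:

# Illustrative check: concatenating seven nin-channel branches yields nin * 7 channels.
import torch

nin = 64  # assumed example width, not a value from the commit
feats = [torch.randn(1, nin, 8, 8) for _ in range(7)]  # stand-ins for conv1..conv7 outputs
assert torch.cat(feats, dim=1).shape[1] == nin * 7  # the input width of the bottleneck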


View File

@ -3,33 +3,33 @@ import os
import pathlib

default_param = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9  # training only
default_param["reduction_bins"] = 762  # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = 757
default_param["pre_filter_stop"] = 768
default_param["band"] = {}

default_param["band"][1] = {
    "sr": 11025,
    "hl": 128,
    "n_fft": 960,
    "crop_start": 0,
    "crop_stop": 245,
    "lpf_start": 61,  # inference only
    "res_type": "polyphase",
}

default_param["band"][2] = {
    "sr": 44100,
    "hl": 512,
    "n_fft": 1536,
    "crop_start": 24,
    "crop_stop": 547,
    "hpf_start": 81,  # inference only
    "res_type": "sinc_best",
}

@ -40,21 +40,30 @@ def int_keys(d):
        k = int(k)
        r[k] = v
    return r


class ModelParameters(object):
    def __init__(self, config_path=""):
        if ".pth" == pathlib.Path(config_path).suffix:
            import zipfile

            with zipfile.ZipFile(config_path, "r") as zip:
                self.param = json.loads(
                    zip.read("param.json"), object_pairs_hook=int_keys
                )
        elif ".json" == pathlib.Path(config_path).suffix:
            with open(config_path, "r") as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            self.param = default_param

        for k in [
            "mid_side",
            "mid_side_b",
            "mid_side_b2",
            "stereo_w",
            "stereo_n",
            "reverse",
        ]:
            if not k in self.param:
                self.param[k] = False
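A short usage sketch of the rewritten loader; the preset path is a placeholder, not a file named in this commit:

# Hypothetical example; the JSON path is an assumption.
mp = ModelParameters("some_band_config.json")
print(mp.param["sr"], mp.param["band"][1]["n_fft"])  # int_keys restores integer band keys
# Any of the mid_side/stereo/reverse flags missing from the file default to False.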

View File

@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import spec_utils

class BaseASPPNet(nn.Module):
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)

@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):

class CascadedASPPNet(nn.Module):
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)

@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
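The aggressiveness branch sharpens the sigmoid mask by raising it to different powers below and above split_bin; a hedged sketch of the call, where the key names come from the code above and the values are made up:

# Illustrative only; values are arbitrary, keys match the reformatted code above.
aggressiveness = {"split_bin": 85, "value": 0.3}
# Bins below 85 are raised to the power 1 + 0.3 / 3, bins from 85 up to 1 + 0.3,
# so small mask values in the upper band are suppressed more strongly:
# h = model.predict(x_mag, aggressiveness)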

View File

@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers

class BaseASPPNet(nn.Module):
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)

@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):

class CascadedASPPNet(nn.Module):
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)

@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h


View File

@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_33966KB as layers

class BaseASPPNet(nn.Module):
    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)

@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):

class CascadedASPPNet(nn.Module):
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)

@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h

View File

@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers

class BaseASPPNet(nn.Module):
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)

@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):

class CascadedASPPNet(nn.Module):
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)

@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h


View File

@ -1,8 +1,9 @@
import os,librosa import os, librosa
import numpy as np import numpy as np
import soundfile as sf import soundfile as sf
from tqdm import tqdm from tqdm import tqdm
import json,math ,hashlib import json, math, hashlib
def crop_center(h1, h2): def crop_center(h1, h2):
h1_shape = h1.size() h1_shape = h1.size()
@ -11,7 +12,7 @@ def crop_center(h1, h2):
if h1_shape[3] == h2_shape[3]: if h1_shape[3] == h2_shape[3]:
return h1 return h1
elif h1_shape[3] < h2_shape[3]: elif h1_shape[3] < h2_shape[3]:
raise ValueError('h1_shape[3] must be greater than h2_shape[3]') raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
# s_freq = (h2_shape[2] - h1_shape[2]) // 2 # s_freq = (h2_shape[2] - h1_shape[2]) // 2
# e_freq = s_freq + h1_shape[2] # e_freq = s_freq + h1_shape[2]
@ -22,7 +23,9 @@ def crop_center(h1, h2):
return h1 return h1
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): def wave_to_spectrogram(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
if reverse: if reverse:
wave_left = np.flip(np.asfortranarray(wave[0])) wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1])) wave_right = np.flip(np.asfortranarray(wave[1]))
@ -30,21 +33,23 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2: elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5)) wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5)) wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else: else:
wave_left = np.asfortranarray(wave[0]) wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1]) wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
spec = np.asfortranarray([spec_left, spec_right]) spec = np.asfortranarray([spec_left, spec_right])
return spec return spec
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): def wave_to_spectrogram_mt(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
import threading import threading
if reverse: if reverse:
@ -54,62 +59,75 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2: elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5)) wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5)) wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else: else:
wave_left = np.asfortranarray(wave[0]) wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1]) wave_right = np.asfortranarray(wave[1])
def run_thread(**kwargs): def run_thread(**kwargs):
global spec_left global spec_left
spec_left = librosa.stft(**kwargs) spec_left = librosa.stft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length}) thread = threading.Thread(
target=run_thread,
kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
)
thread.start() thread.start()
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
thread.join() thread.join()
spec = np.asfortranarray([spec_left, spec_right]) spec = np.asfortranarray([spec_left, spec_right])
return spec return spec
def combine_spectrograms(specs, mp): def combine_spectrograms(specs, mp):
l = min([specs[i].shape[2] for i in specs]) l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64) spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
offset = 0 offset = 0
bands_n = len(mp.param['band']) bands_n = len(mp.param["band"])
for d in range(1, bands_n + 1): for d in range(1, bands_n + 1):
h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start'] h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l] spec_c[:, offset : offset + h, :l] = specs[d][
:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
]
offset += h offset += h
if offset > mp.param['bins']: if offset > mp.param["bins"]:
raise ValueError('Too much bins') raise ValueError("Too much bins")
# lowpass fiter # lowpass fiter
if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: if (
mp.param["pre_filter_start"] > 0
): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1: if bands_n == 1:
spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop']) spec_c = fft_lp_filter(
spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
)
else: else:
gp = 1 gp = 1
for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']): for b in range(
g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0) mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
):
g = math.pow(
10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
)
gp = g gp = g
spec_c[:, b, :] *= g spec_c[:, b, :] *= g
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode='magnitude'): return np.asfortranarray(spec_c)
if mode == 'magnitude':
def spectrogram_to_image(spec, mode="magnitude"):
if mode == "magnitude":
if np.iscomplexobj(spec): if np.iscomplexobj(spec):
y = np.abs(spec) y = np.abs(spec)
else: else:
y = spec y = spec
y = np.log10(y ** 2 + 1e-8) y = np.log10(y**2 + 1e-8)
elif mode == 'phase': elif mode == "phase":
if np.iscomplexobj(spec): if np.iscomplexobj(spec):
y = np.angle(spec) y = np.angle(spec)
else: else:
@ -121,9 +139,7 @@ def spectrogram_to_image(spec, mode='magnitude'):
if y.ndim == 3: if y.ndim == 3:
img = img.transpose(1, 2, 0) img = img.transpose(1, 2, 0)
img = np.concatenate([ img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)
np.max(img, axis=2, keepdims=True), img
], axis=2)
return img return img
@ -136,12 +152,12 @@ def reduce_vocal_aggressively(X, y, softmask):
v_mask = v_mag_tmp > y_mag_tmp v_mask = v_mag_tmp > y_mag_tmp
y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
return y_mag * np.exp(1.j * np.angle(y)) return y_mag * np.exp(1.0j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if min_range < fade_size * 2: if min_range < fade_size * 2:
raise ValueError('min_range must be >= fade_area * 2') raise ValueError("min_range must be >= fade_area * 2")
mag = mag.copy() mag = mag.copy()
@ -159,72 +175,106 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if s != 0: if s != 0:
weight = np.linspace(0, 1, fade_size) weight = np.linspace(0, 1, fade_size)
mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size] mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
else: else:
s -= fade_size s -= fade_size
if e != mag.shape[2]: if e != mag.shape[2]:
weight = np.linspace(1, 0, fade_size) weight = np.linspace(1, 0, fade_size)
mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e] mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
else: else:
e += fade_size e += fade_size
mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size] mag[:, :, s + fade_size : e - fade_size] += ref[
:, :, s + fade_size : e - fade_size
]
old_e = e old_e = e
return mag return mag
def align_wave_head_and_tail(a, b): def align_wave_head_and_tail(a, b):
l = min([a[0].size, b[0].size]) l = min([a[0].size, b[0].size])
return a[:l,:l], b[:l,:l] return a[:l, :l], b[:l, :l]
def cache_or_load(mix_path, inst_path, mp): def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0] mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0] inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest()) cache_dir = "mph{}".format(
mix_cache_dir = os.path.join('cache', cache_dir) hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
inst_cache_dir = os.path.join('cache', cache_dir) )
mix_cache_dir = os.path.join("cache", cache_dir)
inst_cache_dir = os.path.join("cache", cache_dir)
os.makedirs(mix_cache_dir, exist_ok=True) os.makedirs(mix_cache_dir, exist_ok=True)
os.makedirs(inst_cache_dir, exist_ok=True) os.makedirs(inst_cache_dir, exist_ok=True)
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy') mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy') inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")
if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
X_spec_m = np.load(mix_cache_path) X_spec_m = np.load(mix_cache_path)
y_spec_m = np.load(inst_cache_path) y_spec_m = np.load(inst_cache_path)
else: else:
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
for d in range(len(mp.param['band']), 0, -1): for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param['band'][d] bp = mp.param["band"][d]
if d == len(mp.param['band']): # high-end band if d == len(mp.param["band"]): # high-end band
X_wave[d], _ = librosa.load( X_wave[d], _ = librosa.load(
mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
)
y_wave[d], _ = librosa.load( y_wave[d], _ = librosa.load(
inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) inst_path,
else: # lower bands bp["sr"],
X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) False,
y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) dtype=np.float32,
res_type=bp["res_type"],
)
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
y_wave[d] = librosa.resample(
y_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) X_spec_s[d] = wave_to_spectrogram(
y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) X_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
y_spec_s[d] = wave_to_spectrogram(
y_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
del X_wave, y_wave del X_wave, y_wave
X_spec_m = combine_spectrograms(X_spec_s, mp) X_spec_m = combine_spectrograms(X_spec_s, mp)
y_spec_m = combine_spectrograms(y_spec_s, mp) y_spec_m = combine_spectrograms(y_spec_s, mp)
if X_spec_m.shape != y_spec_m.shape: if X_spec_m.shape != y_spec_m.shape:
raise ValueError('The combined spectrograms are different: ' + mix_path) raise ValueError("The combined spectrograms are different: " + mix_path)
_, ext = os.path.splitext(mix_path) _, ext = os.path.splitext(mix_path)
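cache_or_load keys its spectrogram cache on a SHA-1 digest of the JSON-serialized model parameters, so spectrograms computed under one band layout are never reused under another; sort_keys=True keeps the digest stable regardless of dict ordering. The key derivation in isolation (parameter values are illustrative):

    import hashlib
    import json
    import os

    params = {"sr": 44100, "bins": 768}  # stand-in for mp.param
    digest = hashlib.sha1(
        json.dumps(params, sort_keys=True).encode("utf-8")
    ).hexdigest()
    cache_dir = os.path.join("cache", "mph{}".format(digest))
    # identical params always resolve to the same cache directory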
@ -244,72 +294,129 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
if reverse: if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side: elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2: elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)]) return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else: else:
return np.asfortranarray([wave_left, wave_right]) return np.asfortranarray([wave_left, wave_right])
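The mid_side branch decodes a mid/side pair back to stereo. Assuming the matching encoder stores mid = (left + right) / 2 and side = left - right, the arithmetic round-trips exactly:

    import numpy as np

    left = np.array([0.5, -0.25])
    right = np.array([0.1, 0.30])
    mid, side = (left + right) / 2, left - right  # assumed encoder
    assert np.allclose(np.add(mid, side / 2), left)
    assert np.allclose(np.subtract(mid, side / 2), right)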
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
import threading import threading
spec_left = np.asfortranarray(spec[0]) spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1]) spec_right = np.asfortranarray(spec[1])
def run_thread(**kwargs): def run_thread(**kwargs):
global wave_left global wave_left
wave_left = librosa.istft(**kwargs) wave_left = librosa.istft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length}) thread = threading.Thread(
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
)
thread.start() thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length) wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join() thread.join()
if reverse: if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side: elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2: elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)]) return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else: else:
return np.asfortranarray([wave_left, wave_right]) return np.asfortranarray([wave_left, wave_right])
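spectrogram_to_wave_mt overlaps the two per-channel iSTFTs by running the left channel on a worker thread and passing the result back through a module-level global. A sketch of the same overlap without the global, using a dict the closure writes into (function and variable names here are illustrative):

    import threading

    import librosa

    def istft_stereo(spec_left, spec_right, hop_length):
        out = {}

        def worker():
            out["left"] = librosa.istft(spec_left, hop_length=hop_length)

        t = threading.Thread(target=worker)
        t.start()
        out["right"] = librosa.istft(spec_right, hop_length=hop_length)
        t.join()
        return out["left"], out["right"]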
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
wave_band = {} wave_band = {}
bands_n = len(mp.param['band']) bands_n = len(mp.param["band"])
offset = 0 offset = 0
for d in range(1, bands_n + 1): for d in range(1, bands_n + 1):
bp = mp.param['band'][d] bp = mp.param["band"][d]
spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex) spec_s = np.ndarray(
h = bp['crop_stop'] - bp['crop_start'] shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :] )
h = bp["crop_stop"] - bp["crop_start"]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
:, offset : offset + h, :
]
offset += h offset += h
if d == bands_n: # higher if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass if extra_bins_h: # if --high_end_process bypass
max_bin = bp['n_fft'] // 2 max_bin = bp["n_fft"] // 2
spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :] spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
if bp['hpf_start'] > 0: :, :extra_bins_h, :
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1) ]
if bp["hpf_start"] > 0:
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
if bands_n == 1: if bands_n == 1:
wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) wave = spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
else: else:
wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])) wave = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
else: else:
sr = mp.param['band'][d+1]['sr'] sr = mp.param["band"][d + 1]["sr"]
if d == 1: # lower if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop']) spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest") wave = librosa.resample(
else: # mid spectrogram_to_wave(
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1) spec_s,
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop']) bp["hl"],
wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])) mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
bp["sr"],
sr,
res_type="sinc_fastest",
)
else: # mid
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave2 = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
# wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy') wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")
return wave.T return wave.T
@ -318,7 +425,7 @@ def fft_lp_filter(spec, bin_start, bin_stop):
for b in range(bin_start, bin_stop): for b in range(bin_start, bin_stop):
g -= 1 / (bin_stop - bin_start) g -= 1 / (bin_stop - bin_start)
spec[:, b, :] = g * spec[:, b, :] spec[:, b, :] = g * spec[:, b, :]
spec[:, bin_stop:, :] *= 0 spec[:, bin_stop:, :] *= 0
return spec return spec
@ -329,42 +436,69 @@ def fft_hp_filter(spec, bin_start, bin_stop):
for b in range(bin_start, bin_stop, -1): for b in range(bin_start, bin_stop, -1):
g -= 1 / (bin_start - bin_stop) g -= 1 / (bin_start - bin_stop)
spec[:, b, :] = g * spec[:, b, :] spec[:, b, :] = g * spec[:, b, :]
spec[:, 0:bin_stop+1, :] *= 0 spec[:, 0 : bin_stop + 1, :] *= 0
return spec return spec
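Both FFT filters are linear gain ramps over frequency bins: fft_lp_filter fades from full gain to zero across [bin_start, bin_stop) and zeroes every bin above, while fft_hp_filter walks downward and zeroes every bin at or below bin_stop. A standalone check of the low-pass ramp:

    import numpy as np

    spec = np.ones((2, 10, 4), dtype=complex)
    g, bin_start, bin_stop = 1.0, 4, 8
    for b in range(bin_start, bin_stop):
        g -= 1 / (bin_stop - bin_start)
        spec[:, b, :] *= g
    spec[:, bin_stop:, :] *= 0
    # gains along the bin axis: 1, 1, 1, 1, 0.75, 0.5, 0.25, 0, 0, 0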
def mirroring(a, spec_m, input_high_end, mp): def mirroring(a, spec_m, input_high_end, mp):
if 'mirroring' == a: if "mirroring" == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1) mirror = np.flip(
mirror = mirror * np.exp(1.j * np.angle(input_high_end)) np.abs(
spec_m[
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) :,
mp.param["pre_filter_start"]
if 'mirroring2' == a: - 10
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1) - input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
return np.where(
np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
)
if "mirroring2" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mi = np.multiply(mirror, input_high_end * 1.7) mi = np.multiply(mirror, input_high_end * 1.7)
return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
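Both mirroring modes synthesize the missing high band by flipping the slice of spectrum just below pre_filter_start - 10 along the frequency axis; 'mirroring' re-uses the input's phase and keeps whichever of input and mirror is smaller per bin, while 'mirroring2' scales the mirror against the input. A shape-level sketch of the flip (sizes are illustrative):

    import numpy as np

    pre_filter_start, n_extra = 500, 16  # hypothetical
    spec_m = np.random.randn(2, 512, 8) + 1j * np.random.randn(2, 512, 8)
    lo = pre_filter_start - 10 - n_extra
    hi = pre_filter_start - 10
    mirror = np.flip(np.abs(spec_m[:, lo:hi, :]), 1)  # frequency-reversed band
    # mirror.shape == (2, 16, 8): ready to be phase-stamped onto the high end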
def ensembling(a, specs): def ensembling(a, specs):
for i in range(1, len(specs)): for i in range(1, len(specs)):
if i == 1: if i == 1:
spec = specs[0] spec = specs[0]
ln = min([spec.shape[2], specs[i].shape[2]]) ln = min([spec.shape[2], specs[i].shape[2]])
spec = spec[:,:,:ln] spec = spec[:, :, :ln]
specs[i] = specs[i][:,:,:ln] specs[i] = specs[i][:, :, :ln]
if 'min_mag' == a: if "min_mag" == a:
spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
if 'max_mag' == a: if "max_mag" == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
return spec return spec
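ensembling first trims all spectrograms to the shortest frame count, then selects per time-frequency cell: min_mag keeps the smallest-magnitude value across sources (an intersection-like blend), max_mag the largest. A two-array sketch:

    import numpy as np

    a = np.array([[1 + 0j, 5 + 0j]])
    b = np.array([[3 + 0j, 2 + 0j]])
    min_mag = np.where(np.abs(b) <= np.abs(a), b, a)  # [[1+0j, 2+0j]]
    max_mag = np.where(np.abs(b) >= np.abs(a), b, a)  # [[3+0j, 5+0j]]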
def stft(wave, nfft, hl): def stft(wave, nfft, hl):
wave_left = np.asfortranarray(wave[0]) wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1]) wave_right = np.asfortranarray(wave[1])
@ -374,6 +508,7 @@ def stft(wave, nfft, hl):
return spec return spec
def istft(spec, hl): def istft(spec, hl):
spec_left = np.asfortranarray(spec[0]) spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1]) spec_right = np.asfortranarray(spec[1])
@ -389,62 +524,94 @@ if __name__ == "__main__":
import time import time
import argparse import argparse
from model_param_init import ModelParameters from model_param_init import ModelParameters
p = argparse.ArgumentParser() p = argparse.ArgumentParser()
p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag') p.add_argument(
p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json')) "--algorithm",
p.add_argument('--output_name', '-o', type=str, default='output') "-a",
p.add_argument('--vocals_only', '-v', action='store_true') type=str,
p.add_argument('input', nargs='+') choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
default="min_mag",
)
p.add_argument(
"--model_params",
"-m",
type=str,
default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
)
p.add_argument("--output_name", "-o", type=str, default="output")
p.add_argument("--vocals_only", "-v", action="store_true")
p.add_argument("input", nargs="+")
args = p.parse_args() args = p.parse_args()
start_time = time.time() start_time = time.time()
if args.algorithm.startswith('invert') and len(args.input) != 2: if args.algorithm.startswith("invert") and len(args.input) != 2:
raise ValueError('There should be two input files.') raise ValueError("There should be two input files.")
if not args.algorithm.startswith('invert') and len(args.input) < 2: if not args.algorithm.startswith("invert") and len(args.input) < 2:
raise ValueError('There must be at least two input files.') raise ValueError("There must be at least two input files.")
wave, specs = {}, {} wave, specs = {}, {}
mp = ModelParameters(args.model_params) mp = ModelParameters(args.model_params)
for i in range(len(args.input)): for i in range(len(args.input)):
spec = {} spec = {}
for d in range(len(mp.param['band']), 0, -1): for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param['band'][d] bp = mp.param["band"][d]
if d == len(mp.param['band']): # high-end band if d == len(mp.param["band"]): # high-end band
wave[d], _ = librosa.load( wave[d], _ = librosa.load(
args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) args.input[i],
bp["sr"],
if len(wave[d].shape) == 1: # mono to stereo False,
dtype=np.float32,
res_type=bp["res_type"],
)
if len(wave[d].shape) == 1: # mono to stereo
wave[d] = np.array([wave[d], wave[d]]) wave[d] = np.array([wave[d], wave[d]])
else: # lower bands else: # lower bands
wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) wave[d] = librosa.resample(
wave[d + 1],
spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
spec[d] = wave_to_spectrogram(
wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
specs[i] = combine_spectrograms(spec, mp) specs[i] = combine_spectrograms(spec, mp)
del wave del wave
if args.algorithm == 'deep': if args.algorithm == "deep":
d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1]) d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1])
v_spec = d_spec - specs[1] v_spec = d_spec - specs[1]
sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr']) sf.write(
os.path.join("{}.wav".format(args.output_name)),
if args.algorithm.startswith('invert'): cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
if args.algorithm.startswith("invert"):
ln = min([specs[0].shape[2], specs[1].shape[2]]) ln = min([specs[0].shape[2], specs[1].shape[2]])
specs[0] = specs[0][:,:,:ln] specs[0] = specs[0][:, :, :ln]
specs[1] = specs[1][:,:,:ln] specs[1] = specs[1][:, :, :ln]
if 'invert_p' == args.algorithm: if "invert_p" == args.algorithm:
X_mag = np.abs(specs[0]) X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1]) y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0])) v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
else: else:
specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
v_spec = specs[0] - specs[1] v_spec = specs[0] - specs[1]
@ -458,28 +625,43 @@ if __name__ == "__main__":
y_image = spectrogram_to_image(y_mag) y_image = spectrogram_to_image(y_mag)
v_image = spectrogram_to_image(v_mag) v_image = spectrogram_to_image(v_mag)
cv2.imwrite('{}_X.png'.format(args.output_name), X_image) cv2.imwrite("{}_X.png".format(args.output_name), X_image)
cv2.imwrite('{}_y.png'.format(args.output_name), y_image) cv2.imwrite("{}_y.png".format(args.output_name), y_image)
cv2.imwrite('{}_v.png'.format(args.output_name), v_image) cv2.imwrite("{}_v.png".format(args.output_name), v_image)
sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
else:
if not args.algorithm == 'deep':
sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
if args.algorithm == 'align': sf.write(
"{}_X.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[0], mp),
mp.param["sr"],
)
sf.write(
"{}_y.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[1], mp),
mp.param["sr"],
)
sf.write(
"{}_v.wav".format(args.output_name),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
else:
if not args.algorithm == "deep":
sf.write(
os.path.join("ensembled", "{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
mp.param["sr"],
)
if args.algorithm == "align":
trackalignment = [ trackalignment = [
{ {
'file1':'"{}"'.format(args.input[0]), "file1": '"{}"'.format(args.input[0]),
'file2':'"{}"'.format(args.input[1]) "file2": '"{}"'.format(args.input[1]),
} }
] ]
for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
#print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
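A hypothetical invocation of this entry point (the script name and audio paths are illustrative, not taken from the diff):

    python spec_utils.py -a min_mag -m modelparams/1band_sr44100_hl512.json -o blended mix_a.wav mix_b.wav

min_mag and max_mag accept two or more inputs; the invert variants require exactly two.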
uvr5_pack/name_params.json Normal file
@ -0,0 +1,263 @@
{
"equivalent" : [
{
"model_hash_name" : [
{
"hash_name": "47939caf0cfe52a0e81442b85b971dfd",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "a82f14e75892e55e994376edbf0c8435",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "08611fb99bd59eaa79ad27c58d137727",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "5c7bbca45a187e81abbbd351606164e5",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "d6b2cb685a058a091e5e7098192d3233",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "c3448ec923fa0edf3d03a19e633faa53",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "68aa2c8093d0080704b200d140f59e54",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "52fdca89576f06cf4340b74a4730ee5f",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "41191165b05d38fc77f072fa9e8e8a30",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "89e83b511ad474592689e562d5b1f80e",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
},
{
"hash_name": "0b954da81d453b716b114d6d7c95177f",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
}
],
"v4 Models": [
{
"hash_name": "6a00461c51c2920fd68937d4609ed6c8",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "0ab504864d20f1bd378fe9c81ef37140",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "80ab74d65e515caa3622728d2de07d23",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "edc115e7fc523245062200c00caa847f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "b58090534c52cbc3e9b5104bad666ef2",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "ae702fed0238afb5346db8356fe25f13",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
]
}
],
"User Models" : [
{
"1 Band": [
{
"hash_name": "1band_sr16000_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr32000_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr33075_hl384",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "1band_sr44100_hl256",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
"param_name": "1band_sr44100_hl256"
},
{
"hash_name": "1band_sr44100_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "1band_sr44100_hl1024",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
],
"2 Band": [
{
"hash_name": "2band_44100_lofi",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
"param_name": "2band_44100_lofi"
},
{
"hash_name": "2band_32000",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000"
},
{
"hash_name": "2band_48000",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json",
"param_name": "2band_48000"
}
],
"3 Band": [
{
"hash_name": "3band_44100",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "3band_44100_mid",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid"
},
{
"hash_name": "3band_44100_msb2",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
}
],
"4 Band": [
{
"hash_name": "4band_44100",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4band_44100_mid",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
"param_name": "4band_44100_mid"
},
{
"hash_name": "4band_44100_msb",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
"param_name": "4band_44100_msb"
},
{
"hash_name": "4band_44100_msb2",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
"param_name": "4band_44100_msb2"
},
{
"hash_name": "4band_44100_reverse",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
"param_name": "4band_44100_reverse"
},
{
"hash_name": "4band_44100_sw",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
"param_name": "4band_44100_sw"
},
{
"hash_name": "4band_v2",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "4band_v2_sn",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "tmodelparam",
"model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json",
"param_name": "User Model Param Set"
}
]
}
]
}
@ -1,6 +1,15 @@
import torch import torch
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
import json
def load_data(file_name: str = "./uvr5_pack/data.json") -> dict:
with open(file_name, "r") as f:
data = json.load(f)
return data
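load_data reads the hash-to-parameter table that drives the rewritten _get_name_params below; ./uvr5_pack/data.json is presumably shaped like the name_params.json file added in this commit. A sketch of the expected structure, using an entry from that file:

    data = load_data()
    entry = data["equivalent"][0]["model_hash_name"][0]
    print(entry["hash_name"], "->", entry["param_name"])
    # 47939caf0cfe52a0e81442b85b971dfd -> 4band_44100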
def make_padding(width, cropsize, offset): def make_padding(width, cropsize, offset):
left = offset left = offset
@ -10,233 +19,102 @@ def make_padding(width, cropsize, offset):
right = roi_size - (width % roi_size) + left right = roi_size - (width % roi_size) + left
return left, right, roi_size return left, right, roi_size
def inference(X_spec, device, model, aggressiveness,data):
'''
def inference(X_spec, device, model, aggressiveness, data):
"""
data dic configs data dic configs
''' """
def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness,is_half=True): def _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
):
model.eval() model.eval()
with torch.no_grad(): with torch.no_grad():
preds = [] preds = []
iterations = [n_window] iterations = [n_window]
total_iterations = sum(iterations) total_iterations = sum(iterations)
for i in tqdm(range(n_window)): for i in tqdm(range(n_window)):
start = i * roi_size start = i * roi_size
X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']] X_mag_window = X_mag_pad[
None, :, :, start : start + data["window_size"]
]
X_mag_window = torch.from_numpy(X_mag_window) X_mag_window = torch.from_numpy(X_mag_window)
if(is_half):X_mag_window=X_mag_window.half() if is_half:
X_mag_window=X_mag_window.to(device) X_mag_window = X_mag_window.half()
X_mag_window = X_mag_window.to(device)
pred = model.predict(X_mag_window, aggressiveness) pred = model.predict(X_mag_window, aggressiveness)
pred = pred.detach().cpu().numpy() pred = pred.detach().cpu().numpy()
preds.append(pred[0]) preds.append(pred[0])
pred = np.concatenate(preds, axis=2) pred = np.concatenate(preds, axis=2)
return pred return pred
def preprocess(X_spec): def preprocess(X_spec):
X_mag = np.abs(X_spec) X_mag = np.abs(X_spec)
X_phase = np.angle(X_spec) X_phase = np.angle(X_spec)
return X_mag, X_phase return X_mag, X_phase
X_mag, X_phase = preprocess(X_spec) X_mag, X_phase = preprocess(X_spec)
coef = X_mag.max() coef = X_mag.max()
X_mag_pre = X_mag / coef X_mag_pre = X_mag / coef
n_frame = X_mag_pre.shape[2] n_frame = X_mag_pre.shape[2]
pad_l, pad_r, roi_size = make_padding(n_frame, pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
data['window_size'], model.offset)
n_window = int(np.ceil(n_frame / roi_size)) n_window = int(np.ceil(n_frame / roi_size))
X_mag_pad = np.pad( X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
if(list(model.state_dict().values())[0].dtype==torch.float16):is_half=True if list(model.state_dict().values())[0].dtype == torch.float16:
else:is_half=False is_half = True
pred = _execute(X_mag_pad, roi_size, n_window, else:
device, model, aggressiveness,is_half) is_half = False
pred = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred = pred[:, :, :n_frame] pred = pred[:, :, :n_frame]
if data['tta']: if data["tta"]:
pad_l += roi_size // 2 pad_l += roi_size // 2
pad_r += roi_size // 2 pad_r += roi_size // 2
n_window += 1 n_window += 1
X_mag_pad = np.pad( X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
pred_tta = _execute(X_mag_pad, roi_size, n_window, pred_tta = _execute(
device, model, aggressiveness,is_half) X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
pred_tta = pred_tta[:, :, roi_size // 2:] )
pred_tta = pred_tta[:, :, roi_size // 2 :]
pred_tta = pred_tta[:, :, :n_frame] pred_tta = pred_tta[:, :, :n_frame]
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase) return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
else: else:
return pred * coef, X_mag, np.exp(1.j * X_phase) return pred * coef, X_mag, np.exp(1.0j * X_phase)
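When data['tta'] is set, inference runs a second pass with pad_l and pad_r grown by roi_size // 2, which shifts the analysis grid by half a window: every window seam from the first pass falls in the interior of a window in the second, and averaging the two predictions suppresses blocking artifacts at the seams. Schematically (window starts only; values are illustrative):

    roi_size, n_window = 4, 3
    starts_pass1 = [i * roi_size for i in range(n_window)]  # [0, 4, 8]
    # the extra half-window padding effectively shifts pass 2 to [2, 6, 10]
    starts_pass2 = [i * roi_size + roi_size // 2 for i in range(n_window)]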
def _get_name_params(model_path , model_hash): def _get_name_params(model_path, model_hash):
data = load_data()
flag = False
ModelName = model_path ModelName = model_path
if model_hash == '47939caf0cfe52a0e81442b85b971dfd': for type in list(data):
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') for model in list(data[type][0]):
param_name_auto=str('4band_44100') for i in range(len(data[type][0][model])):
if model_hash == '4e4ecb9764c50a8c414fee6e10395bbe': if str(data[type][0][model][i]["hash_name"]) == model_hash:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json') flag = True
param_name_auto=str('4band_v2') elif str(data[type][0][model][i]["hash_name"]) in ModelName:
if model_hash == 'ca106edd563e034bde0bdec4bb7a4b36': flag = True
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'e60a1e84803ce4efc0a6551206cc4b71':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'a82f14e75892e55e994376edbf0c8435':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '6dd9eaa6f0420af9f1d403aaafa4cc06':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '08611fb99bd59eaa79ad27c58d137727':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '5c7bbca45a187e81abbbd351606164e5':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'd6b2cb685a058a091e5e7098192d3233':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'c1b9f38170a7c90e96f027992eb7c62b':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'c3448ec923fa0edf3d03a19e633faa53':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '68aa2c8093d0080704b200d140f59e54':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100.json')
if model_hash == 'fdc83be5b798e4bd29fe00fe6600e147':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '2ce34bc92fd57f55db16b7a4def3d745':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '52fdca89576f06cf4340b74a4730ee5f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '41191165b05d38fc77f072fa9e8e8a30':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '89e83b511ad474592689e562d5b1f80e':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
if model_hash == '0b954da81d453b716b114d6d7c95177f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
#v4 Models if flag:
if model_hash == '6a00461c51c2920fd68937d4609ed6c8': model_params_auto = data[type][0][model][i]["model_params"]
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json') param_name_auto = data[type][0][model][i]["param_name"]
param_name_auto=str('1band_sr16000_hl512') if type == "equivalent":
if model_hash == '0ab504864d20f1bd378fe9c81ef37140': return param_name_auto, model_params_auto
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json') else:
param_name_auto=str('1band_sr32000_hl512') flag = False
if model_hash == '7dd21065bf91c10f7fccb57d7d83b07f': return param_name_auto, model_params_auto
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '80ab74d65e515caa3622728d2de07d23':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == 'edc115e7fc523245062200c00caa847f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == '28063e9f6ab5b341c5f6d3c67f2045b7':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == 'b58090534c52cbc3e9b5104bad666ef2':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == '0cdab9947f1b0928705f518f3c78ea8f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == 'ae702fed0238afb5346db8356fe25f13':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#User Models
#1 Band
if '1band_sr16000_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if '1band_sr32000_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if '1band_sr33075_hl384' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if '1band_sr44100_hl256' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json')
param_name_auto=str('1band_sr44100_hl256')
if '1band_sr44100_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if '1band_sr44100_hl1024' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#2 Band
if '2band_44100_lofi' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json')
param_name_auto=str('2band_44100_lofi')
if '2band_32000' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000')
if '2band_48000' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_48000.json')
param_name_auto=str('2band_48000')
#3 Band
if '3band_44100' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100')
if '3band_44100_mid' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid')
if '3band_44100_msb2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
#4 Band
if '4band_44100' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if '4band_44100_mid' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_mid.json')
param_name_auto=str('4band_44100_mid')
if '4band_44100_msb' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb.json')
param_name_auto=str('4band_44100_msb')
if '4band_44100_msb2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json')
param_name_auto=str('4band_44100_msb2')
if '4band_44100_reverse' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json')
param_name_auto=str('4band_44100_reverse')
if '4band_44100_sw' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_sw.json')
param_name_auto=str('4band_44100_sw')
if '4band_v2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if '4band_v2_sn' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if 'tmodelparam' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/tmodelparam.json')
param_name_auto=str('User Model Param Set')
return param_name_auto , model_params_auto
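With the table-driven rewrite, supporting a new model only needs a JSON entry instead of another if branch. A minimal sketch of a call (the model path is hypothetical; the hash and expected result come from the name_params.json table above):

    param_name, model_params = _get_name_params(
        "weights/some_model.pth", "47939caf0cfe52a0e81442b85b971dfd"
    )
    # -> ("4band_44100", "uvr5_pack/lib_v5/modelparams/4band_44100.json")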
@ -1,36 +1,47 @@
import numpy as np,parselmouth,torch,pdb import numpy as np, parselmouth, torch, pdb
from time import time as ttime from time import time as ttime
import torch.nn.functional as F import torch.nn.functional as F
from config import x_pad,x_query,x_center,x_max from config import x_pad, x_query, x_center, x_max
import scipy.signal as signal import scipy.signal as signal
import pyworld,os,traceback,faiss import pyworld, os, traceback, faiss
class VC(object):
def __init__(self,tgt_sr,device,is_half):
self.sr=16000#hubert input sample rate
self.window=160#samples per frame
self.t_pad=self.sr*x_pad#padding time before and after each segment
self.t_pad_tgt=tgt_sr*x_pad
self.t_pad2=self.t_pad*2
self.t_query=self.sr*x_query#search span around each candidate cut point
self.t_center=self.sr*x_center#spacing of candidate cut points
self.t_max=self.sr*x_max#duration threshold below which no cut-point search is done
self.device=device
self.is_half=is_half
def get_f0(self,x, p_len,f0_up_key,f0_method,inp_f0=None):
class VC(object):
def __init__(self, tgt_sr, device, is_half):
self.sr = 16000  # hubert input sample rate
self.window = 160  # samples per frame
self.t_pad = self.sr * x_pad  # padding time before and after each segment
self.t_pad_tgt = tgt_sr * x_pad
self.t_pad2 = self.t_pad * 2
self.t_query = self.sr * x_query  # search span around each candidate cut point
self.t_center = self.sr * x_center  # spacing of candidate cut points
self.t_max = self.sr * x_max  # duration threshold below which no cut-point search is done
self.device = device
self.is_half = is_half
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
time_step = self.window / self.sr * 1000 time_step = self.window / self.sr * 1000
f0_min = 50 f0_min = 50
f0_max = 1100 f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if(f0_method=="pm"): if f0_method == "pm":
f0 = parselmouth.Sound(x, self.sr).to_pitch_ac( f0 = (
time_step=time_step / 1000, voicing_threshold=0.6, parselmouth.Sound(x, self.sr)
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] .to_pitch_ac(
pad_size=(p_len - len(f0) + 1) // 2 time_step=time_step / 1000,
if(pad_size>0 or p_len - len(f0) - pad_size>0): voicing_threshold=0.6,
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') pitch_floor=f0_min,
elif(f0_method=="harvest"): pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
f0, t = pyworld.harvest( f0, t = pyworld.harvest(
x.astype(np.double), x.astype(np.double),
fs=self.sr, fs=self.sr,
@ -42,25 +53,45 @@ class VC(object):
f0 = signal.medfilt(f0, 3) f0 = signal.medfilt(f0, 3)
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0=self.sr//self.window#f0 points per second tf0 = self.sr // self.window  # f0 points per second
if (inp_f0 is not None): if inp_f0 is not None:
delta_t=np.round((inp_f0[:,0].max()-inp_f0[:,0].min())*tf0+1).astype("int16") delta_t = np.round(
replace_f0=np.interp(list(range(delta_t)), inp_f0[:, 0]*100, inp_f0[:, 1]) (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
shape=f0[x_pad*tf0:x_pad*tf0+len(replace_f0)].shape[0] ).astype("int16")
f0[x_pad*tf0:x_pad*tf0+len(replace_f0)]=replace_f0[:shape] replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
)
shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy() f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255 f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int) f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak#1-0 return f0_coarse, f0bak # 1-0
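The coarse f0 warps frequency onto a mel-like scale, 1127 * ln(1 + f0 / 700), then maps it linearly into the integer range 1..255, with unvoiced frames (f0 = 0) clamped to 1. A worked check at 440 Hz with the same constants:

    import numpy as np

    mel = lambda f: 1127 * np.log(1 + f / 700)
    f0, f0_min, f0_max = 440.0, 50, 1100
    coarse = (mel(f0) - mel(f0_min)) * 254 / (mel(f0_max) - mel(f0_min)) + 1
    print(int(np.rint(coarse)))  # ~122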
def vc(self,model,net_g,sid,audio0,pitch,pitchf,times,index,big_npy,index_rate):#,file_index,file_big_npy def vc(
self,
model,
net_g,
sid,
audio0,
pitch,
pitchf,
times,
index,
big_npy,
index_rate,
): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0) feats = torch.from_numpy(audio0)
if(self.is_half):feats=feats.half() if self.is_half:
else:feats=feats.float() feats = feats.half()
else:
feats = feats.float()
if feats.dim() == 2: # double channels if feats.dim() == 2: # double channels
feats = feats.mean(-1) feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim() assert feats.dim() == 1, feats.dim()
@ -75,91 +106,196 @@ class VC(object):
t0 = ttime() t0 = ttime()
with torch.no_grad(): with torch.no_grad():
logits = model.extract_features(**inputs) logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0]) feats = model.final_proj(logits[0])
if(isinstance(index,type(None))==False and isinstance(big_npy,type(None))==False and index_rate!=0): if (
isinstance(index, type(None)) == False
and isinstance(big_npy, type(None)) == False
and index_rate != 0
):
npy = feats[0].cpu().numpy() npy = feats[0].cpu().numpy()
if(self.is_half):npy=npy.astype("float32") if self.is_half:
npy = npy.astype("float32")
_, I = index.search(npy, 1) _, I = index.search(npy, 1)
npy=big_npy[I.squeeze()] npy = big_npy[I.squeeze()]
if(self.is_half):npy=npy.astype("float16") if self.is_half:
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device)*index_rate + (1-index_rate)*feats npy = npy.astype("float16")
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t1 = ttime() t1 = ttime()
p_len = audio0.shape[0]//self.window p_len = audio0.shape[0] // self.window
if(feats.shape[1]<p_len): if feats.shape[1] < p_len:
p_len=feats.shape[1] p_len = feats.shape[1]
if(pitch!=None and pitchf!=None): if pitch != None and pitchf != None:
pitch=pitch[:,:p_len] pitch = pitch[:, :p_len]
pitchf=pitchf[:,:p_len] pitchf = pitchf[:, :p_len]
p_len=torch.tensor([p_len],device=self.device).long() p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad(): with torch.no_grad():
if(pitch!=None and pitchf!=None): if pitch != None and pitchf != None:
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) audio1 = (
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
.data.cpu()
.float()
.numpy()
.astype(np.int16)
)
else: else:
audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) audio1 = (
del feats,p_len,padding_mask (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
if torch.cuda.is_available(): torch.cuda.empty_cache() .data.cpu()
.float()
.numpy()
.astype(np.int16)
)
del feats, p_len, padding_mask
if torch.cuda.is_available():
torch.cuda.empty_cache()
t2 = ttime() t2 = ttime()
times[0] += (t1 - t0) times[0] += t1 - t0
times[2] += (t2 - t1) times[2] += t2 - t1
return audio1 return audio1
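The faiss block replaces each HuBERT frame with its nearest neighbour in the training feature bank (big_npy), then linearly blends retrieved and original features: index_rate = 1 uses pure retrieval, 0 bypasses it. The blend in isolation (values are illustrative):

    import numpy as np

    feats = np.array([[0.0, 1.0]])  # frame from the source audio
    retrieved = np.array([[1.0, 0.0]])  # its 1-NN from the training bank
    index_rate = 0.75
    mixed = retrieved * index_rate + (1 - index_rate) * feats  # [[0.75, 0.25]]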
def pipeline(self,model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=None): def pipeline(
if(file_big_npy!=""and file_index!=""and os.path.exists(file_big_npy)==True and os.path.exists(file_index)==True and index_rate!=0): self,
model,
net_g,
sid,
audio,
times,
f0_up_key,
f0_method,
file_index,
file_big_npy,
index_rate,
if_f0,
f0_file=None,
):
if (
file_big_npy != ""
and file_index != ""
and os.path.exists(file_big_npy) == True
and os.path.exists(file_index) == True
and index_rate != 0
):
try: try:
index = faiss.read_index(file_index) index = faiss.read_index(file_index)
big_npy = np.load(file_big_npy) big_npy = np.load(file_big_npy)
except: except:
traceback.print_exc() traceback.print_exc()
index=big_npy=None index = big_npy = None
else: else:
index=big_npy=None index = big_npy = None
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect') audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = [] opt_ts = []
if(audio_pad.shape[0]>self.t_max): if audio_pad.shape[0] > self.t_max:
audio_sum = np.zeros_like(audio) audio_sum = np.zeros_like(audio)
for i in range(self.window): audio_sum += audio_pad[i:i - self.window] for i in range(self.window):
for t in range(self.t_center, audio.shape[0],self.t_center):opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query:t + self.t_query]) == np.abs(audio_sum[t - self.t_query:t + self.t_query]).min())[0][0]) audio_sum += audio_pad[i : i - self.window]
for t in range(self.t_center, audio.shape[0], self.t_center):
opt_ts.append(
t
- self.t_query
+ np.where(
np.abs(audio_sum[t - self.t_query : t + self.t_query])
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
)[0][0]
)
s = 0 s = 0
audio_opt=[] audio_opt = []
t=None t = None
t1=ttime() t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect') audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len=audio_pad.shape[0]//self.window p_len = audio_pad.shape[0] // self.window
inp_f0=None inp_f0 = None
if(hasattr(f0_file,'name') ==True): if hasattr(f0_file, "name") == True:
try: try:
with open(f0_file.name,"r")as f: with open(f0_file.name, "r") as f:
lines=f.read().strip("\n").split("\n") lines = f.read().strip("\n").split("\n")
inp_f0=[] inp_f0 = []
for line in lines:inp_f0.append([float(i)for i in line.split(",")]) for line in lines:
inp_f0=np.array(inp_f0,dtype="float32") inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
except: except:
traceback.print_exc() traceback.print_exc()
sid=torch.tensor(sid,device=self.device).unsqueeze(0).long() sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf=None,None pitch, pitchf = None, None
if(if_f0==1): if if_f0 == 1:
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key,f0_method,inp_f0) pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
pitch = pitch[:p_len] pitch = pitch[:p_len]
pitchf = pitchf[:p_len] pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch,device=self.device).unsqueeze(0).long() pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf,device=self.device).unsqueeze(0).float() pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
t2=ttime() t2 = ttime()
times[1] += (t2 - t1) times[1] += t2 - t1
for t in opt_ts: for t in opt_ts:
t=t//self.window*self.window t = t // self.window * self.window
if (if_f0 == 1): if if_f0 == 1:
audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],pitch[:,s//self.window:(t+self.t_pad2)//self.window],pitchf[:,s//self.window:(t+self.t_pad2)//self.window],times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else: else:
audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
None,
None,
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
s = t s = t
if (if_f0 == 1): if if_f0 == 1:
audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],pitch[:,t//self.window:]if t is not None else pitch,pitchf[:,t//self.window:]if t is not None else pitchf,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window :] if t is not None else pitch,
pitchf[:, t // self.window :] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else: else:
audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) audio_opt.append(
audio_opt=np.concatenate(audio_opt) self.vc(
del pitch,pitchf,sid model,
if torch.cuda.is_available(): torch.cuda.empty_cache() net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
audio_opt = np.concatenate(audio_opt)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt return audio_opt
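For inputs longer than t_max, pipeline cuts the audio near every t_center at the quietest point within +/- t_query, so segment seams land in low-energy regions. The np.where(x == x.min())[0][0] idiom above is just the first argmin; a standalone equivalent:

    import numpy as np

    audio_sum = np.abs(np.random.randn(48000))  # stand-in energy envelope
    t, t_query = 24000, 8000
    seg = np.abs(audio_sum[t - t_query : t + t_query])
    cut = t - t_query + int(np.argmin(seg))  # == t - t_query + np.where(seg == seg.min())[0][0]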
@ -1,16 +1,18 @@
import locale import locale
import json import json
def load_language_list(language): def load_language_list(language):
with open(f"./locale/{language}.json", "r", encoding="utf-8") as f: with open(f"./locale/{language}.json", "r", encoding="utf-8") as f:
language_list = json.load(f) language_list = json.load(f)
return language_list return language_list
class I18nAuto: class I18nAuto:
def __init__(self, language=None): def __init__(self, language=None):
if language is None: if language is None:
language = 'auto' language = "auto"
if language == 'auto': if language == "auto":
language = locale.getdefaultlocale()[0] language = locale.getdefaultlocale()[0]
self.language = language self.language = language
print("Use Language:", language) print("Use Language:", language)