From b4c653142da945f0b93538c37b4b13c6874386dd Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 24 Apr 2023 20:35:56 +0800
Subject: [PATCH] Format code (#142)

Co-authored-by: github-actions[bot]
---
 export_onnx.py                        | 38 +++++++++++++++------------
 extract_f0_print.py                   |  2 +-
 gui.py                                | 17 +++++-------
 i18n.py                               |  6 +++--
 infer-web.py                          | 36 +++++++++++++++----------
 my_utils.py                           |  2 +-
 train/data_utils.py                   |  8 +++---
 trainset_preprocess_pipeline_print.py |  6 +++--
 8 files changed, 64 insertions(+), 51 deletions(-)

diff --git a/export_onnx.py b/export_onnx.py
index 8b62b47..719aa7b 100644
--- a/export_onnx.py
+++ b/export_onnx.py
@@ -2,27 +2,29 @@ from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
 from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
 import torch
 
-if __name__ == '__main__':
-    MoeVS = True #whether the model is for MoeVoiceStudio (formerly MoeSS)
+if __name__ == "__main__":
+    MoeVS = True  # whether the model is for MoeVoiceStudio (formerly MoeSS)
 
-    ModelPath = "Shiroha/shiroha.pth" #model path
-    ExportedPath = "model.onnx" #output path
-    hidden_channels = 256 # hidden_channels, in preparation for the 768-dim vec
-    cpt = torch.load(ModelPath, map_location="cpu")
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
+    ModelPath = "Shiroha/shiroha.pth"  # model path
+    ExportedPath = "model.onnx"  # output path
+    hidden_channels = 256  # hidden_channels, in preparation for the 768-dim vec
+    cpt = torch.load(ModelPath, map_location="cpu")
+    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
     print(*cpt["config"])
-    test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
-    test_phone_lengths = torch.tensor([200]).long() # hidden unit length (doesn't seem to matter)
-    test_pitch = torch.randint(size=(1, 200), low=5, high=255) # f0 (in Hz)
-    test_pitchf = torch.rand(1, 200) # NSF f0
-    test_ds = torch.LongTensor([0]) # speaker ID
-    test_rnd = torch.rand(1, 192, 200) # noise (adds a random factor)
+    test_phone = torch.rand(1, 200, hidden_channels)  # hidden unit
+    test_phone_lengths = torch.tensor([200]).long()  # hidden unit length (doesn't seem to matter)
+    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # f0 (in Hz)
+    test_pitchf = torch.rand(1, 200)  # NSF f0
+    test_ds = torch.LongTensor([0])  # speaker ID
+    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
 
-    device = "cpu" #device used for export (does not affect using the model)
+    device = "cpu"  # device used for export (does not affect using the model)
 
     if MoeVS:
-        net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False)  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
+        net_g = SynthesizerTrnMs256NSFsidM(
+            *cpt["config"], is_half=False
+        )  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
         net_g.load_state_dict(cpt["weight"], strict=False)
         input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
         output_names = [
@@ -52,7 +54,9 @@ if __name__ == '__main__':
             output_names=output_names,
         )
     else:
-        net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False)  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
+        net_g = SynthesizerTrnMs256NSFsidO(
+            *cpt["config"], is_half=False
+        )  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
         net_g.load_state_dict(cpt["weight"], strict=False)
         input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
         output_names = [
@@ -78,4 +82,4 @@ if __name__ == '__main__':
             verbose=False,
             input_names=input_names,
             output_names=output_names,
-        )
\ No newline at end of file
+        )
diff --git a/extract_f0_print.py b/extract_f0_print.py
index f848a0a..d2fef0f 100644
--- a/extract_f0_print.py
+++ b/extract_f0_print.py
@@ -35,7 +35,7 @@ class FeatureInput(object):
     def compute_f0(self, path, f0_method):
         # default resample type of librosa.resample is "soxr_hq".
         # Quality: soxr_vhq > soxr_hq
-        x, sr = librosa.load(path, self.fs)#, res_type='soxr_vhq'
+        x, sr = librosa.load(path, self.fs)  # , res_type='soxr_vhq'
         p_len = x.shape[0] // self.hop
         assert sr == self.fs
         if f0_method == "pm":
diff --git a/gui.py b/gui.py
index 6215435..4146c63 100644
--- a/gui.py
+++ b/gui.py
@@ -67,7 +67,7 @@ class RVC:
             print(e)
 
     def get_f0(self, x, f0_up_key, inp_f0=None):
-        x_pad=1
+        x_pad = 1
        f0_min = 50
         f0_max = 1100
         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
@@ -137,7 +137,7 @@ class RVC:
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         torch.cuda.synchronize()
         print(feats.shape)
-        if(self.if_f0==1):
+        if self.if_f0 == 1:
             pitch, pitchf = self.get_f0(audio, self.f0_up_key)
             p_len = min(feats.shape[1], 13000, pitch.shape[0])  # too large will run out of GPU memory
         else:
@@ -146,7 +146,7 @@ class RVC:
         torch.cuda.synchronize()
         # print(feats.shape,pitch.shape)
         feats = feats[:, :p_len, :]
-        if(self.if_f0==1):
+        if self.if_f0 == 1:
             pitch = pitch[:p_len]
             pitchf = pitchf[:p_len]
             pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
@@ -155,17 +155,15 @@ class RVC:
         ii = 0  # sid
         sid = torch.LongTensor([ii]).to(device)
         with torch.no_grad():
-            if(self.if_f0==1):
+            if self.if_f0 == 1:
                 infered_audio = (
                     self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
                     .data.cpu()
                     .float()
                 )
             else:
-                infered_audio = (
-                    self.net_g.infer(feats, p_len, sid)[0][0, 0]
-                    .data.cpu()
-                    .float()
+                infered_audio = (
+                    self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float()
                 )
         torch.cuda.synchronize()
         return infered_audio
@@ -387,7 +385,7 @@ class GUI:
             self.config.pth_path,
             self.config.index_path,
             self.config.npy_path,
-            self.config.index_rate
+            self.config.index_rate,
         )
         self.input_wav: np.ndarray = np.zeros(
             self.extra_frame
@@ -511,7 +509,6 @@ class GUI:
             total_time = time.perf_counter() - start_time
             self.window["infer_time"].update(int(total_time * 1000))
             print("infer time:" + str(total_time))
-
 
     def get_devices(self, update: bool = True):
         """Get the device list"""
diff --git a/i18n.py b/i18n.py
index ec7a866..4cbbe5e 100644
--- a/i18n.py
+++ b/i18n.py
@@ -11,8 +11,10 @@ def load_language_list(language):
 
 class I18nAuto:
     def __init__(self, language=None):
-        if language in ['auto', None]:
-            language = locale.getdefaultlocale()[0]  # getlocale can't identify the system's language ((None, None))
+        if language in ["auto", None]:
+            language = locale.getdefaultlocale()[
+                0
+            ]  # getlocale can't identify the system's language ((None, None))
         if not os.path.exists(f"./i18n/{language}.json"):
             language = "en_US"
         self.language = language
diff --git a/infer-web.py b/infer-web.py
index 771a65c..a1cf3c6 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -119,7 +119,6 @@ for name in os.listdir(weight_uvr5_root):
         uvr5_names.append(name.replace(".pth", ""))
 
 
-
 def vc_single(
     sid,
     input_audio,
@@ -888,23 +887,27 @@ def change_info_(ckpt_path):
 
 from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
 from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
+
+
 def export_onnx(ModelPath, ExportedPath, MoeVS=True):
-    hidden_channels = 256 # hidden_channels, in preparation for the 768-dim vec
-    cpt = torch.load(ModelPath, map_location="cpu")
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
+    hidden_channels = 256  # hidden_channels, in preparation for the 768-dim vec
+    cpt = torch.load(ModelPath, map_location="cpu")
+    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
     print(*cpt["config"])
-    test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
-    test_phone_lengths = torch.tensor([200]).long() # hidden unit length (doesn't seem to matter)
-    test_pitch = torch.randint(size=(1, 200), low=5, high=255) # f0 (in Hz)
-    test_pitchf = torch.rand(1, 200) # NSF f0
-    test_ds = torch.LongTensor([0]) # speaker ID
-    test_rnd = torch.rand(1, 192, 200) # noise (adds a random factor)
+    test_phone = torch.rand(1, 200, hidden_channels)  # hidden unit
+    test_phone_lengths = torch.tensor([200]).long()  # hidden unit length (doesn't seem to matter)
+    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # f0 (in Hz)
+    test_pitchf = torch.rand(1, 200)  # NSF f0
+    test_ds = torch.LongTensor([0])  # speaker ID
+    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
 
-    device = "cpu" #device used for export (does not affect using the model)
+    device = "cpu"  # device used for export (does not affect using the model)
 
     if MoeVS:
-        net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False)  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
+        net_g = SynthesizerTrnMs256NSFsidM(
+            *cpt["config"], is_half=False
+        )  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
         net_g.load_state_dict(cpt["weight"], strict=False)
         input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
         output_names = [
@@ -934,7 +937,9 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
             output_names=output_names,
         )
     else:
-        net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False)  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
+        net_g = SynthesizerTrnMs256NSFsidO(
+            *cpt["config"], is_half=False
+        )  # fp32 export (fp16 support in C++ requires manually rearranging memory, so fp16 is skipped for now)
         net_g.load_state_dict(cpt["weight"], strict=False)
         input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
         output_names = [
@@ -963,6 +968,7 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
         )
     return "Finished"
 
+
 with gr.Blocks() as app:
     gr.Markdown(
         value=i18n(
@@ -1443,7 +1449,9 @@ with gr.Blocks() as app:
             with gr.Row():
                 ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
             with gr.Row():
-                onnx_dir = gr.Textbox(label=i18n("Onnx输出路径"), value="", interactive=True)
+                onnx_dir = gr.Textbox(
+                    label=i18n("Onnx输出路径"), value="", interactive=True
+                )
             with gr.Row():
                 moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True)
                 infoOnnx = gr.Label(label="Null")
diff --git a/my_utils.py b/my_utils.py
index 8b7e427..776939d 100644
--- a/my_utils.py
+++ b/my_utils.py
@@ -18,4 +18,4 @@ def load_audio(file, sr):
     except Exception as e:
         raise RuntimeError(f"Failed to load audio: {e}")
 
-    return np.frombuffer(out, np.float32).flatten()
\ No newline at end of file
+    return np.frombuffer(out, np.float32).flatten()
diff --git a/train/data_utils.py b/train/data_utils.py
index 87a435f..6e00a7a 100644
--- a/train/data_utils.py
+++ b/train/data_utils.py
@@ -99,8 +99,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
                 )
             )
         audio_norm = audio
-#        audio_norm = audio / self.max_wav_value
-#        audio_norm = audio / np.abs(audio).max()
+        # audio_norm = audio / self.max_wav_value
+        # audio_norm = audio / np.abs(audio).max()
         audio_norm = audio_norm.unsqueeze(0)
         spec_filename = filename.replace(".wav", ".spec.pt")
@@ -291,8 +291,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
                 )
             )
         audio_norm = audio
-#        audio_norm = audio / self.max_wav_value
-#        audio_norm = audio / np.abs(audio).max()
+        # audio_norm = audio / self.max_wav_value
+        # audio_norm = audio / np.abs(audio).max()
         audio_norm = audio_norm.unsqueeze(0)
         spec_filename = filename.replace(".wav", ".spec.pt")
diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py
index f40309a..5da8781 100644
--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@@ -61,7 +61,9 @@ class PreProcess:
             self.sr,
             tmp_audio.astype(np.float32),
         )
-        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)#, res_type="soxr_vhq"
+        tmp_audio = librosa.resample(
+            tmp_audio, orig_sr=self.sr, target_sr=16000
+        )  # , res_type="soxr_vhq"
         wavfile.write(
             "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
             16000,
@@ -72,7 +74,7 @@ class PreProcess:
 
         try:
             audio = load_audio(path, self.sr)
             # zero-phase digital filter causes pre-ringing noise...
-            # audio = signal.filtfilt(self.bh, self.ah, audio)
+            # audio = signal.filtfilt(self.bh, self.ah, audio)
             audio = signal.lfilter(self.bh, self.ah, audio)
             idx1 = 0
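
Usage note: the export_onnx.py script reformatted above writes an ONNX graph whose inputs are the names listed in input_names. Below is a minimal sketch, not part of this patch, of driving that graph from Python with onnxruntime; it assumes the MoeVS export (which takes the extra "rnd" input) and reuses the dummy shapes from the export script, and it reads the output name from the session rather than assuming one.

    # Minimal sketch (an assumption, not part of this patch): exercise the
    # exported graph with onnxruntime, mirroring the dummy export tensors.
    import numpy as np
    import onnxruntime

    sess = onnxruntime.InferenceSession(
        "model.onnx", providers=["CPUExecutionProvider"]  # ExportedPath above
    )
    n_frames, hidden_channels = 200, 256

    feeds = {
        "phone": np.random.rand(1, n_frames, hidden_channels).astype(np.float32),  # hidden units
        "phone_lengths": np.array([n_frames], dtype=np.int64),
        "pitch": np.random.randint(5, 255, size=(1, n_frames)).astype(np.int64),  # f0 (Hz)
        "pitchf": np.random.rand(1, n_frames).astype(np.float32),  # NSF f0
        "ds": np.array([0], dtype=np.int64),  # speaker ID
        "rnd": np.random.rand(1, 192, n_frames).astype(np.float32),  # noise
    }
    out_name = sess.get_outputs()[0].name  # avoid hard-coding the output name
    audio = sess.run([out_name], feeds)[0]
    print(audio.shape)

The non-MoeVS variant (SynthesizerTrnMs256NSFsidO) omits the "rnd" input; everything else follows the same shapes and dtypes as test_phone, test_pitch, test_pitchf, test_ds above.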