From fe0f32c6a6270dc8a477801fab19359c39ba6975 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 26 Nov 2023 21:03:10 +0900 Subject: [PATCH 01/20] swap description (#1554) Co-authored-by: donghyeop son <42092560+sondonghup@users.noreply.github.com> --- infer/lib/train/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index c411c9b..e7bb783 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -312,10 +312,10 @@ def get_hparams(init=True): "-te", "--total_epoch", type=int, required=True, help="total_epoch" ) parser.add_argument( - "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path" + "-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path" ) parser.add_argument( - "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path" + "-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path" ) parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") parser.add_argument( From 45da3644f4f2aecda893b7bff48f0cd8fc90973a Mon Sep 17 00:00:00 2001 From: David Edwards Date: Thu, 30 Nov 2023 04:32:24 -0700 Subject: [PATCH 02/20] feat: add en_US translations and missing i18n in web client (#1576) * chore(format): run black on main (#1448) Co-authored-by: github-actions[bot] * translate untranslated english * Fix trainset_dir4 default value in infer-web.py --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> --- i18n/locale/en_US.json | 9 +++++---- infer-web.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json index dba5ec3..c95223c 100644 --- a/i18n/locale/en_US.json +++ b/i18n/locale/en_US.json @@ -38,7 +38,7 @@ "加载模型": 
"Load model", "加载预训练底模D路径": "Load pre-trained base model D path:", "加载预训练底模G路径": "Load pre-trained base model G path:", - "单次推理": "单次推理", + "单次推理": "Single Inference", "卸载音色省显存": "Unload voice to save GPU memory:", "变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:", @@ -54,7 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.", "性能设置": "Performance settings", "总训练轮数total_epoch": "Total training epochs (total_epoch):", - "批量推理": "批量推理", + "批量推理": "Batch Inference", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').", "指定输出主人声文件夹": "Specify the output folder for vocals:", "指定输出文件夹": "Specify output folder:", @@ -120,11 +120,12 @@ "选择.pth文件": "Select the .pth file", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement", - "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 
'rmvpe' has the best results and consumes less CPU/GPU", "采样长度": "Sample length", "重载设备列表": "Reload device list", "音调设置": "Pitch settings", "音频设备(请使用同种类驱动)": "Audio device (please use the same type of driver)", "音高算法": "pitch detection algorithm", - "额外推理时长": "Extra inference time" + "额外推理时长": "Extra inference time", + "E:\\语音音频+标注\\米津玄师\\src": "C:\\Users\\Desktop\\src" } diff --git a/infer-web.py b/infer-web.py index 9c356c1..8c5f021 100644 --- a/infer-web.py +++ b/infer-web.py @@ -1142,7 +1142,7 @@ with gr.Blocks(title="RVC WebUI") as app: ) with gr.Row(): trainset_dir4 = gr.Textbox( - label=i18n("输入训练文件夹路径"), value="E:\\语音音频+标注\\米津玄师\\src" + label=i18n("输入训练文件夹路径"), value=i18n("E:\\语音音频+标注\\米津玄师\\src") ) spk_id5 = gr.Slider( minimum=0, From 7c753e6ac1dce55a5eb38e4ede4dbeb86e1e1891 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 30 Nov 2023 20:34:39 +0900 Subject: [PATCH 03/20] chore(i18n): sync locale on dev (#1585) Co-authored-by: github-actions[bot] --- i18n/locale/en_US.json | 4 ++-- i18n/locale/es_ES.json | 1 + i18n/locale/fr_FR.json | 1 + i18n/locale/it_IT.json | 1 + i18n/locale/ja_JP.json | 1 + i18n/locale/ru_RU.json | 1 + i18n/locale/tr_TR.json | 1 + i18n/locale/zh_CN.json | 1 + i18n/locale/zh_HK.json | 1 + i18n/locale/zh_SG.json | 1 + i18n/locale/zh_TW.json | 1 + 11 files changed, 12 insertions(+), 2 deletions(-) diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json index c95223c..e4f5100 100644 --- a/i18n/locale/en_US.json +++ b/i18n/locale/en_US.json @@ -3,6 +3,7 @@ "A模型权重": "Weight (w) for Model A:", "A模型路径": "Path to Model A:", "B模型路径": "Path to Model B:", + "E:\\语音音频+标注\\米津玄师\\src": "C:\\Users\\Desktop\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 curve file (optional). One pitch per line. 
Replaces the default F0 and pitch modulation:", "Index Rate": "Index Rate", "Onnx导出": "Export Onnx", @@ -126,6 +127,5 @@ "音调设置": "Pitch settings", "音频设备(请使用同种类驱动)": "Audio device (please use the same type of driver)", "音高算法": "pitch detection algorithm", - "额外推理时长": "Extra inference time", - "E:\\语音音频+标注\\米津玄师\\src": "C:\\Users\\Desktop\\src" + "额外推理时长": "Extra inference time" } diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json index fdd17f0..09ea011 100644 --- a/i18n/locale/es_ES.json +++ b/i18n/locale/es_ES.json @@ -3,6 +3,7 @@ "A模型权重": "Un peso modelo para el modelo A.", "A模型路径": "Modelo A ruta.", "B模型路径": "Modelo B ruta.", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Archivo de curva F0, opcional, un tono por línea, en lugar de F0 predeterminado y cambio de tono", "Index Rate": "Tasa de índice", "Onnx导出": "Exportar Onnx", diff --git a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json index 1c76ccd..cc6321c 100644 --- a/i18n/locale/fr_FR.json +++ b/i18n/locale/fr_FR.json @@ -3,6 +3,7 @@ "A模型权重": "Poids (w) pour le modèle A :", "A模型路径": "Chemin d'accès au modèle A :", "B模型路径": "Chemin d'accès au modèle B :", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Fichier de courbe F0 (facultatif). Une hauteur par ligne. Remplace la fréquence fondamentale par défaut et la modulation de la hauteur :", "Index Rate": "Taux d'indexation", "Onnx导出": "Exporter en ONNX", diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json index 02eac59..fc31aa6 100644 --- a/i18n/locale/it_IT.json +++ b/i18n/locale/it_IT.json @@ -3,6 +3,7 @@ "A模型权重": "Peso (w) per il modello A:", "A模型路径": "Percorso per il modello A:", "B模型路径": "Percorso per il modello B:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "File curva F0 (opzionale). 
", "Index Rate": "Tasso di indice", "Onnx导出": "Esporta Onnx", diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json index d02f331..a96e4ba 100644 --- a/i18n/locale/ja_JP.json +++ b/i18n/locale/ja_JP.json @@ -3,6 +3,7 @@ "A模型权重": "Aモデルの重み", "A模型路径": "Aモデルのパス", "B模型路径": "Bモデルのパス", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0(最低共振周波数)カーブファイル(オプション、1行に1ピッチ、デフォルトのF0(最低共振周波数)とエレベーションを置き換えます。)", "Index Rate": "Index Rate", "Onnx导出": "Onnxエクスポート", diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json index 9d7ef8e..6bdff9c 100644 --- a/i18n/locale/ru_RU.json +++ b/i18n/locale/ru_RU.json @@ -3,6 +3,7 @@ "A模型权重": "Весы (w) модели А:", "A模型路径": "Путь к модели А:", "B模型路径": "Путь к модели Б:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Файл дуги F0 (не обязательно). Одна тональность на каждую строчку. Заменяет обычный F0 и модуляцию тональности:", "Index Rate": "Темп индекса", "Onnx导出": "Экспорт ONNX", diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json index 04c6102..8fe7aa9 100644 --- a/i18n/locale/tr_TR.json +++ b/i18n/locale/tr_TR.json @@ -3,6 +3,7 @@ "A模型权重": "A Modeli Ağırlığı:", "A模型路径": "A Modeli Yolu:", "B模型路径": "B Modeli Yolu:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 eğrisi dosyası (isteğe bağlı). Her satırda bir pitch değeri bulunur. 
Varsayılan F0 ve pitch modülasyonunu değiştirir:", "Index Rate": "Index Oranı", "Onnx导出": "Onnx Dışa Aktar", diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json index 2c77001..dd56851 100644 --- a/i18n/locale/zh_CN.json +++ b/i18n/locale/zh_CN.json @@ -3,6 +3,7 @@ "A模型权重": "A模型权重", "A模型路径": "A模型路径", "B模型路径": "B模型路径", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", "Index Rate": "Index Rate", "Onnx导出": "Onnx导出", diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json index b7f6171..51ebf75 100644 --- a/i18n/locale/zh_HK.json +++ b/i18n/locale/zh_HK.json @@ -3,6 +3,7 @@ "A模型权重": "A模型權重", "A模型路径": "A模型路徑", "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", "Index Rate": "Index Rate", "Onnx导出": "Onnx导出", diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json index b7f6171..51ebf75 100644 --- a/i18n/locale/zh_SG.json +++ b/i18n/locale/zh_SG.json @@ -3,6 +3,7 @@ "A模型权重": "A模型權重", "A模型路径": "A模型路徑", "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", "Index Rate": "Index Rate", "Onnx导出": "Onnx导出", diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json index b7f6171..51ebf75 100644 --- a/i18n/locale/zh_TW.json +++ b/i18n/locale/zh_TW.json @@ -3,6 +3,7 @@ "A模型权重": "A模型權重", "A模型路径": "A模型路徑", "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", "Index Rate": "Index Rate", "Onnx导出": "Onnx导出", From 786005f0de5abe31f246abda7375c4bef325677c Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 10 Dec 2023 16:32:44 +0900 Subject: [PATCH 04/20] fix flow (#1605) --- infer/lib/infer_pack/models_onnx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py index ff60414..97308ef 100644 --- a/infer/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -149,7 +149,7 @@ class ResidualCouplingBlock(nn.Module): x, _ = flow(x, x_mask, g=g, reverse=reverse) else: for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) + x, _ = flow(x, x_mask, g=g, reverse=reverse) return x def remove_weight_norm(self): From 45133bc752afd9a82144f05f762ccddbde17bf73 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 10 Dec 2023 19:33:12 +0900 Subject: [PATCH 05/20] FIx F0 predictor for Harvet --- infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py index 27f3356..2b13917 100644 --- a/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -65,7 +65,7 @@ class HarvestF0Predictor(F0Predictor): p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), - fs=self.hop_length, + fs=self.sampling_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop_length / self.sampling_rate, From e7e9d5934d847f3c43ff767fe482b689d114d6fc Mon Sep 17 00:00:00 2001 From: CN_ChiTu <36254426+CNChTu@users.noreply.github.com> Date: Thu, 14 Dec 2023 21:01:46 +0800 Subject: [PATCH 06/20] add fcpe for realtime --- tools/rvc_for_realtime.py | 859 +++++++++++++++++++------------------- 1 file changed, 438 insertions(+), 421 deletions(-) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 68358bb..8e16e87 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -1,421 +1,438 @@ -from io import BytesIO -import os -import pickle -import sys -import traceback -from infer.lib import jit -from 
infer.lib.jit.get_synthesizer import get_synthesizer -from time import time as ttime -import fairseq -import faiss -import numpy as np -import parselmouth -import pyworld -import scipy.signal as signal -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchcrepe - -from infer.lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) - -now_dir = os.getcwd() -sys.path.append(now_dir) -from multiprocessing import Manager as M - -from configs.config import Config - -# config = Config() - -mm = M() - - -def printt(strr, *args): - if len(args) == 0: - print(strr) - else: - print(strr % args) - - -# config.device=torch.device("cpu")########强制cpu测试 -# config.is_half=False########强制cpu测试 -class RVC: - def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, - ) -> None: - """ - 初始化 - """ - try: - if config.dml == True: - - def forward_dml(ctx, x, scale): - ctx.scale = scale - res = x.clone().detach() - return res - - fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml - # global config - self.config = config - self.inp_q = inp_q - self.opt_q = opt_q - # device="cpu"########强制cpu测试 - self.device = config.device - self.f0_up_key = key - self.time_step = 160 / 16000 * 1000 - self.f0_min = 50 - self.f0_max = 1100 - self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) - self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.sr = 16000 - self.window = 160 - self.n_cpu = n_cpu - self.use_jit = self.config.use_jit - self.is_half = config.is_half - - if index_rate != 0: - self.index = faiss.read_index(index_path) - self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) - printt("Index search enabled") - self.pth_path: str = pth_path - self.index_path = index_path - self.index_rate = index_rate - - if last_rvc is None: - models, _, _ = 
fairseq.checkpoint_utils.load_model_ensemble_and_task( - ["assets/hubert/hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(self.device) - if self.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - self.model = hubert_model - else: - self.model = last_rvc.model - - self.net_g: nn.Module = None - - def set_default_model(): - self.net_g, cpt = get_synthesizer(self.pth_path, self.device) - self.tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] - self.if_f0 = cpt.get("f0", 1) - self.version = cpt.get("version", "v1") - if self.is_half: - self.net_g = self.net_g.half() - else: - self.net_g = self.net_g.float() - - def set_jit_model(): - jit_pth_path = self.pth_path.rstrip(".pth") - jit_pth_path += ".half.jit" if self.is_half else ".jit" - reload = False - if str(self.device) == "cuda": - self.device = torch.device("cuda:0") - if os.path.exists(jit_pth_path): - cpt = jit.load(jit_pth_path) - model_device = cpt["device"] - if model_device != str(self.device): - reload = True - else: - reload = True - - if reload: - cpt = jit.synthesizer_jit_export( - self.pth_path, - "script", - None, - device=self.device, - is_half=self.is_half, - ) - - self.tgt_sr = cpt["config"][-1] - self.if_f0 = cpt.get("f0", 1) - self.version = cpt.get("version", "v1") - self.net_g = torch.jit.load( - BytesIO(cpt["model"]), map_location=self.device - ) - self.net_g.infer = self.net_g.forward - self.net_g.eval().to(self.device) - - def set_synthesizer(): - if self.use_jit and not config.dml: - if self.is_half and "cpu" in str(self.device): - printt( - "Use default Synthesizer model. 
\ - Jit is not supported on the CPU for half floating point" - ) - set_default_model() - else: - set_jit_model() - else: - set_default_model() - - if last_rvc is None or last_rvc.pth_path != self.pth_path: - set_synthesizer() - else: - self.tgt_sr = last_rvc.tgt_sr - self.if_f0 = last_rvc.if_f0 - self.version = last_rvc.version - self.is_half = last_rvc.is_half - if last_rvc.use_jit != self.use_jit: - set_synthesizer() - else: - self.net_g = last_rvc.net_g - - if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): - self.model_rmvpe = last_rvc.model_rmvpe - except: - printt(traceback.format_exc()) - - def change_key(self, new_key): - self.f0_up_key = new_key - - def change_index_rate(self, new_index_rate): - if new_index_rate != 0 and self.index_rate == 0: - self.index = faiss.read_index(self.index_path) - self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) - printt("Index search enabled") - self.index_rate = new_index_rate - - def get_f0_post(self, f0): - f0_min = self.f0_min - f0_max = self.f0_max - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int32) - return f0_coarse, f0bak - - def get_f0(self, x, f0_up_key, n_cpu, method="harvest"): - n_cpu = int(n_cpu) - if method == "crepe": - return self.get_f0_crepe(x, f0_up_key) - if method == "rmvpe": - return self.get_f0_rmvpe(x, f0_up_key) - if method == "pm": - p_len = x.shape[0] // 160 + 1 - f0_min = 65 - l_pad = int(np.ceil(1.5 / f0_min * 16000)) - r_pad = l_pad + 1 - s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac( - time_step=0.01, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=1100, - ) - assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 - f0 = 
s.selected_array["frequency"] - if len(f0) < p_len: - f0 = np.pad(f0, (0, p_len - len(f0))) - f0 = f0[:p_len] - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - if n_cpu == 1: - f0, t = pyworld.harvest( - x.astype(np.double), - fs=16000, - f0_ceil=1100, - f0_floor=50, - frame_period=10, - ) - f0 = signal.medfilt(f0, 3) - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64) - length = len(x) - part_length = 160 * ((length // 160 - 1) // n_cpu + 1) - n_cpu = (length // 160 - 1) // (part_length // 160) + 1 - ts = ttime() - res_f0 = mm.dict() - for idx in range(n_cpu): - tail = part_length * (idx + 1) + 320 - if idx == 0: - self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) - else: - self.inp_q.put( - (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) - ) - while 1: - res_ts = self.opt_q.get() - if res_ts == ts: - break - f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])] - for idx, f0 in enumerate(f0s): - if idx == 0: - f0 = f0[:-3] - elif idx != n_cpu - 1: - f0 = f0[2:-3] - else: - f0 = f0[2:] - f0bak[ - part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] - ] = f0 - f0bak = signal.medfilt(f0bak, 3) - f0bak *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0bak) - - def get_f0_crepe(self, x, f0_up_key): - if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 - return self.get_f0(x, f0_up_key, 1, "pm") - audio = torch.tensor(np.copy(x))[None].float() - # printt("using crepe,device:%s"%self.device) - f0, pd = torchcrepe.predict( - audio, - self.sr, - 160, - self.f0_min, - self.f0_max, - "full", - batch_size=512, - # device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用 - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - f0 *= pow(2, f0_up_key / 12) - return 
self.get_f0_post(f0) - - def get_f0_rmvpe(self, x, f0_up_key): - if hasattr(self, "model_rmvpe") == False: - from infer.lib.rmvpe import RMVPE - - printt("Loading rmvpe model") - self.model_rmvpe = RMVPE( - # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 - # "rmvpe.pt", is_half=False, device=self.device####dml配置 - # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 - "assets/rmvpe/rmvpe.pt", - is_half=self.is_half, - device=self.device, ####正常逻辑 - use_jit=self.config.use_jit, - ) - # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - - def infer( - self, - feats: torch.Tensor, - indata: np.ndarray, - block_frame_16k, - rate, - cache_pitch, - cache_pitchf, - f0method, - ) -> np.ndarray: - feats = feats.view(1, -1) - if self.config.is_half: - feats = feats.half() - else: - feats = feats.float() - feats = feats.to(self.device) - t1 = ttime() - with torch.no_grad(): - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - inputs = { - "source": feats, - "padding_mask": padding_mask, - "output_layer": 9 if self.version == "v1" else 12, - } - logits = self.model.extract_features(**inputs) - feats = ( - self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] - ) - feats = torch.cat((feats, feats[:, -1:, :]), 1) - t2 = ttime() - try: - if hasattr(self, "index") and self.index_rate != 0: - leng_replace_head = int(rate * feats[0].shape[0]) - npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") - score, ix = self.index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - if self.config.is_half: - npy = npy.astype("float16") - 
feats[0][-leng_replace_head:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] - ) - else: - printt("Index search FAILED or disabled") - except: - traceback.print_exc() - printt("Index search FAILED") - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - t3 = ttime() - if self.if_f0 == 1: - pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) - start_frame = block_frame_16k // 160 - end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame - cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) - cache_pitchf[:] = np.append( - cache_pitchf[start_frame:end_frame], pitchf[3:-1] - ) - p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) - else: - cache_pitch, cache_pitchf = None, None - p_len = min(feats.shape[1], 13000) - t4 = ttime() - feats = feats[:, :p_len, :] - if self.if_f0 == 1: - cache_pitch = cache_pitch[:p_len] - cache_pitchf = cache_pitchf[:p_len] - cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) - cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) - p_len = torch.LongTensor([p_len]).to(self.device) - ii = 0 # sid - sid = torch.LongTensor([ii]).to(self.device) - with torch.no_grad(): - if self.if_f0 == 1: - # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) - infered_audio = self.net_g.infer( - feats, - p_len, - cache_pitch, - cache_pitchf, - sid, - torch.FloatTensor([rate]), - )[0][0, 0].data.float() - else: - infered_audio = self.net_g.infer( - feats, p_len, sid, torch.FloatTensor([rate]) - )[0][0, 0].data.float() - t5 = ttime() - printt( - "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", - t2 - t1, - t3 - t2, - t4 - t3, - t5 - t4, - ) - return infered_audio +from io import BytesIO +import os +import pickle +import sys +import traceback +from infer.lib import jit +from 
infer.lib.jit.get_synthesizer import get_synthesizer +from time import time as ttime +import fairseq +import faiss +import numpy as np +import parselmouth +import pyworld +import scipy.signal as signal +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchcrepe + +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) + +now_dir = os.getcwd() +sys.path.append(now_dir) +from multiprocessing import Manager as M + +from configs.config import Config + +# config = Config() + +mm = M() + + +def printt(strr, *args): + if len(args) == 0: + print(strr) + else: + print(strr % args) + + +# config.device=torch.device("cpu")########强制cpu测试 +# config.is_half=False########强制cpu测试 +class RVC: + def __init__( + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, + ) -> None: + """ + 初始化 + """ + try: + if config.dml == True: + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + # global config + self.config = config + self.inp_q = inp_q + self.opt_q = opt_q + # device="cpu"########强制cpu测试 + self.device = config.device + self.f0_up_key = key + self.time_step = 160 / 16000 * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.sr = 16000 + self.window = 160 + self.n_cpu = n_cpu + self.use_jit = self.config.use_jit + self.is_half = config.is_half + + if index_rate != 0: + self.index = faiss.read_index(index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + printt("Index search enabled") + self.pth_path: str = pth_path + self.index_path = index_path + self.index_rate = index_rate + + if last_rvc is None: + models, _, _ = 
fairseq.checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(self.device) + if self.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + hubert_model.eval() + self.model = hubert_model + else: + self.model = last_rvc.model + + self.net_g: nn.Module = None + + def set_default_model(): + self.net_g, cpt = get_synthesizer(self.pth_path, self.device) + self.tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + if self.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + def set_jit_model(): + jit_pth_path = self.pth_path.rstrip(".pth") + jit_pth_path += ".half.jit" if self.is_half else ".jit" + reload = False + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + if os.path.exists(jit_pth_path): + cpt = jit.load(jit_pth_path) + model_device = cpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + cpt = jit.synthesizer_jit_export( + self.pth_path, + "script", + None, + device=self.device, + is_half=self.is_half, + ) + + self.tgt_sr = cpt["config"][-1] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + self.net_g = torch.jit.load( + BytesIO(cpt["model"]), map_location=self.device + ) + self.net_g.infer = self.net_g.forward + self.net_g.eval().to(self.device) + + def set_synthesizer(): + if self.use_jit and not config.dml: + if self.is_half and "cpu" in str(self.device): + printt( + "Use default Synthesizer model. 
\ + Jit is not supported on the CPU for half floating point" + ) + set_default_model() + else: + set_jit_model() + else: + set_default_model() + + if last_rvc is None or last_rvc.pth_path != self.pth_path: + set_synthesizer() + else: + self.tgt_sr = last_rvc.tgt_sr + self.if_f0 = last_rvc.if_f0 + self.version = last_rvc.version + self.is_half = last_rvc.is_half + if last_rvc.use_jit != self.use_jit: + set_synthesizer() + else: + self.net_g = last_rvc.net_g + + if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): + self.model_rmvpe = last_rvc.model_rmvpe + if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): + self.model_fcpe = last_rvc.model_fcpe + except: + printt(traceback.format_exc()) + + def change_key(self, new_key): + self.f0_up_key = new_key + + def change_index_rate(self, new_index_rate): + if new_index_rate != 0 and self.index_rate == 0: + self.index = faiss.read_index(self.index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + printt("Index search enabled") + self.index_rate = new_index_rate + + def get_f0_post(self, f0): + f0_min = self.f0_min + f0_max = self.f0_max + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak + + def get_f0(self, x, f0_up_key, n_cpu, method="harvest"): + n_cpu = int(n_cpu) + if method == "crepe": + return self.get_f0_crepe(x, f0_up_key) + if method == "rmvpe": + return self.get_f0_rmvpe(x, f0_up_key) + if method == "fcpe": + return self.get_f0_fcpe(x, f0_up_key) + if method == "pm": + p_len = x.shape[0] // 160 + 1 + f0_min = 65 + l_pad = int(np.ceil(1.5 / f0_min * 16000)) + r_pad = l_pad + 1 + s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac( + 
time_step=0.01, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=1100, + ) + assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 + f0 = s.selected_array["frequency"] + if len(f0) < p_len: + f0 = np.pad(f0, (0, p_len - len(f0))) + f0 = f0[:p_len] + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + if n_cpu == 1: + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + f0 = signal.medfilt(f0, 3) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64) + length = len(x) + part_length = 160 * ((length // 160 - 1) // n_cpu + 1) + n_cpu = (length // 160 - 1) // (part_length // 160) + 1 + ts = ttime() + res_f0 = mm.dict() + for idx in range(n_cpu): + tail = part_length * (idx + 1) + 320 + if idx == 0: + self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) + else: + self.inp_q.put( + (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) + ) + while 1: + res_ts = self.opt_q.get() + if res_ts == ts: + break + f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])] + for idx, f0 in enumerate(f0s): + if idx == 0: + f0 = f0[:-3] + elif idx != n_cpu - 1: + f0 = f0[2:-3] + else: + f0 = f0[2:] + f0bak[ + part_length * idx // 160: part_length * idx // 160 + f0.shape[0] + ] = f0 + f0bak = signal.medfilt(f0bak, 3) + f0bak *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0bak) + + def get_f0_crepe(self, x, f0_up_key): + if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 + return self.get_f0(x, f0_up_key, 1, "pm") + audio = torch.tensor(np.copy(x))[None].float() + # printt("using crepe,device:%s"%self.device) + f0, pd = torchcrepe.predict( + audio, + self.sr, + 160, + self.f0_min, + self.f0_max, + "full", + batch_size=512, + # device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用 + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 
3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_rmvpe(self, x, f0_up_key): + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + printt("Loading rmvpe model") + self.model_rmvpe = RMVPE( + # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 + # "rmvpe.pt", is_half=False, device=self.device####dml配置 + # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 + "assets/rmvpe/rmvpe.pt", + is_half=self.is_half, + device=self.device, ####正常逻辑 + use_jit=self.config.use_jit, + ) + # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_fcpe(self, x, f0_up_key): + if hasattr(self, "model_fcpe") == False: + from torchfcpe import spawn_bundled_infer_model + printt("Loading fcpe model") + self.model_fcpe = spawn_bundled_infer_model(self.device) + f0 = self.model_fcpe.infer( + torch.from_numpy(x).to(self.device).unsqueeze(0).float(), + sr=16000, + decoder_mode='local_argmax', + threshold=0.006, + ).squeeze().cpu().numpy() + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def infer( + self, + feats: torch.Tensor, + indata: np.ndarray, + block_frame_16k, + rate, + cache_pitch, + cache_pitchf, + f0method, + ) -> np.ndarray: + feats = feats.view(1, -1) + if self.config.is_half: + feats = feats.half() + else: + feats = feats.float() + feats = feats.to(self.device) + t1 = ttime() + with torch.no_grad(): + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + inputs = { + "source": feats, + "padding_mask": padding_mask, + "output_layer": 9 if self.version == "v1" else 12, + } + logits = self.model.extract_features(**inputs) + 
feats = ( + self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] + ) + feats = torch.cat((feats, feats[:, -1:, :]), 1) + t2 = ttime() + try: + if hasattr(self, "index") and self.index_rate != 0: + leng_replace_head = int(rate * feats[0].shape[0]) + npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") + score, ix = self.index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + if self.config.is_half: + npy = npy.astype("float16") + feats[0][-leng_replace_head:] = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] + ) + else: + printt("Index search FAILED or disabled") + except: + traceback.print_exc() + printt("Index search FAILED") + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + t3 = ttime() + if self.if_f0 == 1: + pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) + start_frame = block_frame_16k // 160 + end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame + cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) + cache_pitchf[:] = np.append( + cache_pitchf[start_frame:end_frame], pitchf[3:-1] + ) + p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) + else: + cache_pitch, cache_pitchf = None, None + p_len = min(feats.shape[1], 13000) + t4 = ttime() + feats = feats[:, :p_len, :] + if self.if_f0 == 1: + cache_pitch = cache_pitch[:p_len] + cache_pitchf = cache_pitchf[:p_len] + cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) + cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) + p_len = torch.LongTensor([p_len]).to(self.device) + ii = 0 # sid + sid = torch.LongTensor([ii]).to(self.device) + with torch.no_grad(): + if self.if_f0 == 1: + # 
printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) + infered_audio = self.net_g.infer( + feats, + p_len, + cache_pitch, + cache_pitchf, + sid, + torch.FloatTensor([rate]), + )[0][0, 0].data.float() + else: + infered_audio = self.net_g.infer( + feats, p_len, sid, torch.FloatTensor([rate]) + )[0][0, 0].data.float() + t5 = ttime() + printt( + "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + ) + return infered_audio From 89746605601403334b95f487580cdbcda2efdd22 Mon Sep 17 00:00:00 2001 From: CN_ChiTu <36254426+CNChTu@users.noreply.github.com> Date: Thu, 14 Dec 2023 21:08:36 +0800 Subject: [PATCH 07/20] add fcpe for realtime --- gui_v1.py | 1759 +++++++++++++++++++++++++++-------------------------- 1 file changed, 885 insertions(+), 874 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 3254892..7f4c640 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -1,874 +1,885 @@ -import os -import sys -from dotenv import load_dotenv - -load_dotenv() - -os.environ["OMP_NUM_THREADS"] = "4" -if sys.platform == "darwin": - os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" - -now_dir = os.getcwd() -sys.path.append(now_dir) -import multiprocessing - -stream_latency = -1 - - -def printt(strr, *args): - if len(args) == 0: - print(strr) - else: - print(strr % args) - - -class Harvest(multiprocessing.Process): - def __init__(self, inp_q, opt_q): - multiprocessing.Process.__init__(self) - self.inp_q = inp_q - self.opt_q = opt_q - - def run(self): - import numpy as np - import pyworld - - while 1: - idx, x, res_f0, n_cpu, ts = self.inp_q.get() - f0, t = pyworld.harvest( - x.astype(np.double), - fs=16000, - f0_ceil=1100, - f0_floor=50, - frame_period=10, - ) - res_f0[idx] = f0 - if len(res_f0.keys()) >= n_cpu: - self.opt_q.put(ts) - - -if __name__ == "__main__": - import json - import multiprocessing - import re - import threading - import time - import traceback - from 
multiprocessing import Queue, cpu_count - from queue import Empty - - import librosa - from tools.torchgate import TorchGate - import numpy as np - import PySimpleGUI as sg - import sounddevice as sd - import torch - import torch.nn.functional as F - import torchaudio.transforms as tat - - import tools.rvc_for_realtime as rvc_for_realtime - from i18n.i18n import I18nAuto - from configs.config import Config - - i18n = I18nAuto() - - # device = rvc_for_realtime.config.device - # device = torch.device( - # "cuda" - # if torch.cuda.is_available() - # else ("mps" if torch.backends.mps.is_available() else "cpu") - # ) - current_dir = os.getcwd() - inp_q = Queue() - opt_q = Queue() - n_cpu = min(cpu_count(), 8) - for _ in range(n_cpu): - Harvest(inp_q, opt_q).start() - - class GUIConfig: - def __init__(self) -> None: - self.pth_path: str = "" - self.index_path: str = "" - self.pitch: int = 0 - self.samplerate: int = 40000 - self.block_time: float = 1.0 # s - self.buffer_num: int = 1 - self.threhold: int = -60 - self.crossfade_time: float = 0.05 - self.extra_time: float = 2.5 - self.I_noise_reduce = False - self.O_noise_reduce = False - self.rms_mix_rate = 0.0 - self.index_rate = 0.3 - self.n_cpu = min(n_cpu, 6) - self.f0method = "harvest" - self.sg_input_device = "" - self.sg_output_device = "" - - class GUI: - def __init__(self) -> None: - self.gui_config = GUIConfig() - self.config = Config() - self.flag_vc = False - self.function = "vc" - self.delay_time = 0 - self.launcher() - - def load(self): - input_devices, output_devices, _, _ = self.get_devices() - try: - with open("configs/config.json", "r") as j: - data = json.load(j) - data["pm"] = data["f0method"] == "pm" - data["harvest"] = data["f0method"] == "harvest" - data["crepe"] = data["f0method"] == "crepe" - data["rmvpe"] = data["f0method"] == "rmvpe" - if data["sg_input_device"] not in input_devices: - data["sg_input_device"] = input_devices[sd.default.device[0]] - if data["sg_output_device"] not in 
output_devices: - data["sg_output_device"] = output_devices[sd.default.device[1]] - except: - with open("configs/config.json", "w") as j: - data = { - "pth_path": " ", - "index_path": " ", - "sg_input_device": input_devices[sd.default.device[0]], - "sg_output_device": output_devices[sd.default.device[1]], - "threhold": "-60", - "pitch": "0", - "index_rate": "0", - "rms_mix_rate": "0", - "block_time": "0.25", - "crossfade_length": "0.05", - "extra_time": "2.5", - "f0method": "rmvpe", - "use_jit": False, - } - data["pm"] = data["f0method"] == "pm" - data["harvest"] = data["f0method"] == "harvest" - data["crepe"] = data["f0method"] == "crepe" - data["rmvpe"] = data["f0method"] == "rmvpe" - return data - - def launcher(self): - data = self.load() - self.config.use_jit = False # data.get("use_jit", self.config.use_jit) - sg.theme("LightBlue3") - input_devices, output_devices, _, _ = self.get_devices() - layout = [ - [ - sg.Frame( - title=i18n("加载模型"), - layout=[ - [ - sg.Input( - default_text=data.get("pth_path", ""), - key="pth_path", - ), - sg.FileBrowse( - i18n("选择.pth文件"), - initial_folder=os.path.join( - os.getcwd(), "assets/weights" - ), - file_types=((". pth"),), - ), - ], - [ - sg.Input( - default_text=data.get("index_path", ""), - key="index_path", - ), - sg.FileBrowse( - i18n("选择.index文件"), - initial_folder=os.path.join(os.getcwd(), "logs"), - file_types=((". 
index"),), - ), - ], - ], - ) - ], - [ - sg.Frame( - layout=[ - [ - sg.Text(i18n("输入设备")), - sg.Combo( - input_devices, - key="sg_input_device", - default_value=data.get("sg_input_device", ""), - ), - ], - [ - sg.Text(i18n("输出设备")), - sg.Combo( - output_devices, - key="sg_output_device", - default_value=data.get("sg_output_device", ""), - ), - ], - [sg.Button(i18n("重载设备列表"), key="reload_devices")], - ], - title=i18n("音频设备(请使用同种类驱动)"), - ) - ], - [ - sg.Frame( - layout=[ - [ - sg.Text(i18n("响应阈值")), - sg.Slider( - range=(-60, 0), - key="threhold", - resolution=1, - orientation="h", - default_value=data.get("threhold", "-60"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("音调设置")), - sg.Slider( - range=(-24, 24), - key="pitch", - resolution=1, - orientation="h", - default_value=data.get("pitch", "0"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("Index Rate")), - sg.Slider( - range=(0.0, 1.0), - key="index_rate", - resolution=0.01, - orientation="h", - default_value=data.get("index_rate", "0"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("响度因子")), - sg.Slider( - range=(0.0, 1.0), - key="rms_mix_rate", - resolution=0.01, - orientation="h", - default_value=data.get("rms_mix_rate", "0"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("音高算法")), - sg.Radio( - "pm", - "f0method", - key="pm", - default=data.get("pm", "") == True, - enable_events=True, - ), - sg.Radio( - "harvest", - "f0method", - key="harvest", - default=data.get("harvest", "") == True, - enable_events=True, - ), - sg.Radio( - "crepe", - "f0method", - key="crepe", - default=data.get("crepe", "") == True, - enable_events=True, - ), - sg.Radio( - "rmvpe", - "f0method", - key="rmvpe", - default=data.get("rmvpe", "") == True, - enable_events=True, - ), - ], - ], - title=i18n("常规设置"), - ), - sg.Frame( - layout=[ - [ - sg.Text(i18n("采样长度")), - sg.Slider( - range=(0.05, 2.4), - key="block_time", - resolution=0.01, - orientation="h", - default_value=data.get("block_time", "0.25"), - 
enable_events=True, - ), - ], - # [ - # sg.Text("设备延迟"), - # sg.Slider( - # range=(0, 1), - # key="device_latency", - # resolution=0.001, - # orientation="h", - # default_value=data.get("device_latency", "0.1"), - # enable_events=True, - # ), - # ], - [ - sg.Text(i18n("harvest进程数")), - sg.Slider( - range=(1, n_cpu), - key="n_cpu", - resolution=1, - orientation="h", - default_value=data.get( - "n_cpu", min(self.gui_config.n_cpu, n_cpu) - ), - enable_events=True, - ), - ], - [ - sg.Text(i18n("淡入淡出长度")), - sg.Slider( - range=(0.01, 0.15), - key="crossfade_length", - resolution=0.01, - orientation="h", - default_value=data.get("crossfade_length", "0.05"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("额外推理时长")), - sg.Slider( - range=(0.05, 5.00), - key="extra_time", - resolution=0.01, - orientation="h", - default_value=data.get("extra_time", "2.5"), - enable_events=True, - ), - ], - [ - sg.Checkbox( - i18n("输入降噪"), - key="I_noise_reduce", - enable_events=True, - ), - sg.Checkbox( - i18n("输出降噪"), - key="O_noise_reduce", - enable_events=True, - ), - # sg.Checkbox( - # "JIT加速", - # default=self.config.use_jit, - # key="use_jit", - # enable_events=False, - # ), - ], - # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")], - ], - title=i18n("性能设置"), - ), - ], - [ - sg.Button(i18n("开始音频转换"), key="start_vc"), - sg.Button(i18n("停止音频转换"), key="stop_vc"), - sg.Radio( - i18n("输入监听"), - "function", - key="im", - default=False, - enable_events=True, - ), - sg.Radio( - i18n("输出变声"), - "function", - key="vc", - default=True, - enable_events=True, - ), - sg.Text(i18n("算法延迟(ms):")), - sg.Text("0", key="delay_time"), - sg.Text(i18n("推理时间(ms):")), - sg.Text("0", key="infer_time"), - ], - ] - self.window = sg.Window("RVC - GUI", layout=layout, finalize=True) - self.event_handler() - - def event_handler(self): - while True: - event, values = self.window.read() - if event == sg.WINDOW_CLOSED: - self.flag_vc = False - exit() - if event == "reload_devices": - prev_input = 
self.window["sg_input_device"].get() - prev_output = self.window["sg_output_device"].get() - input_devices, output_devices, _, _ = self.get_devices(update=True) - if prev_input not in input_devices: - self.gui_config.sg_input_device = input_devices[0] - else: - self.gui_config.sg_input_device = prev_input - self.window["sg_input_device"].Update(values=input_devices) - self.window["sg_input_device"].Update( - value=self.gui_config.sg_input_device - ) - if prev_output not in output_devices: - self.gui_config.sg_output_device = output_devices[0] - else: - self.gui_config.sg_output_device = prev_output - self.window["sg_output_device"].Update(values=output_devices) - self.window["sg_output_device"].Update( - value=self.gui_config.sg_output_device - ) - if event == "start_vc" and self.flag_vc == False: - if self.set_values(values) == True: - printt("cuda_is_available: %s", torch.cuda.is_available()) - self.start_vc() - settings = { - "pth_path": values["pth_path"], - "index_path": values["index_path"], - "sg_input_device": values["sg_input_device"], - "sg_output_device": values["sg_output_device"], - "threhold": values["threhold"], - "pitch": values["pitch"], - "rms_mix_rate": values["rms_mix_rate"], - "index_rate": values["index_rate"], - # "device_latency": values["device_latency"], - "block_time": values["block_time"], - "crossfade_length": values["crossfade_length"], - "extra_time": values["extra_time"], - "n_cpu": values["n_cpu"], - # "use_jit": values["use_jit"], - "use_jit": False, - "f0method": ["pm", "harvest", "crepe", "rmvpe"][ - [ - values["pm"], - values["harvest"], - values["crepe"], - values["rmvpe"], - ].index(True) - ], - } - with open("configs/config.json", "w") as j: - json.dump(settings, j) - global stream_latency - while stream_latency < 0: - time.sleep(0.01) - self.delay_time = ( - stream_latency - + values["block_time"] - + values["crossfade_length"] - + 0.01 - ) - if values["I_noise_reduce"]: - self.delay_time += values["crossfade_length"] - 
self.window["delay_time"].update(int(self.delay_time * 1000)) - if event == "stop_vc" and self.flag_vc == True: - self.flag_vc = False - stream_latency = -1 - # Parameter hot update - if event == "threhold": - self.gui_config.threhold = values["threhold"] - elif event == "pitch": - self.gui_config.pitch = values["pitch"] - if hasattr(self, "rvc"): - self.rvc.change_key(values["pitch"]) - elif event == "index_rate": - self.gui_config.index_rate = values["index_rate"] - if hasattr(self, "rvc"): - self.rvc.change_index_rate(values["index_rate"]) - elif event == "rms_mix_rate": - self.gui_config.rms_mix_rate = values["rms_mix_rate"] - elif event in ["pm", "harvest", "crepe", "rmvpe"]: - self.gui_config.f0method = event - elif event == "I_noise_reduce": - self.gui_config.I_noise_reduce = values["I_noise_reduce"] - if stream_latency > 0: - self.delay_time += ( - 1 if values["I_noise_reduce"] else -1 - ) * values["crossfade_length"] - self.window["delay_time"].update(int(self.delay_time * 1000)) - elif event == "O_noise_reduce": - self.gui_config.O_noise_reduce = values["O_noise_reduce"] - elif event in ["vc", "im"]: - self.function = event - elif event != "start_vc" and self.flag_vc == True: - # Other parameters do not support hot update - self.flag_vc = False - stream_latency = -1 - - def set_values(self, values): - if len(values["pth_path"].strip()) == 0: - sg.popup(i18n("请选择pth文件")) - return False - if len(values["index_path"].strip()) == 0: - sg.popup(i18n("请选择index文件")) - return False - pattern = re.compile("[^\x00-\x7F]+") - if pattern.findall(values["pth_path"]): - sg.popup(i18n("pth文件路径不可包含中文")) - return False - if pattern.findall(values["index_path"]): - sg.popup(i18n("index文件路径不可包含中文")) - return False - self.set_devices(values["sg_input_device"], values["sg_output_device"]) - self.config.use_jit = False # values["use_jit"] - # self.device_latency = values["device_latency"] - self.gui_config.pth_path = values["pth_path"] - self.gui_config.index_path = 
values["index_path"] - self.gui_config.threhold = values["threhold"] - self.gui_config.pitch = values["pitch"] - self.gui_config.block_time = values["block_time"] - self.gui_config.crossfade_time = values["crossfade_length"] - self.gui_config.extra_time = values["extra_time"] - self.gui_config.I_noise_reduce = values["I_noise_reduce"] - self.gui_config.O_noise_reduce = values["O_noise_reduce"] - self.gui_config.rms_mix_rate = values["rms_mix_rate"] - self.gui_config.index_rate = values["index_rate"] - self.gui_config.n_cpu = values["n_cpu"] - self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe"][ - [ - values["pm"], - values["harvest"], - values["crepe"], - values["rmvpe"], - ].index(True) - ] - return True - - def start_vc(self): - torch.cuda.empty_cache() - self.flag_vc = True - self.rvc = rvc_for_realtime.RVC( - self.gui_config.pitch, - self.gui_config.pth_path, - self.gui_config.index_path, - self.gui_config.index_rate, - self.gui_config.n_cpu, - inp_q, - opt_q, - self.config, - self.rvc if hasattr(self, "rvc") else None, - ) - self.gui_config.samplerate = self.rvc.tgt_sr - self.zc = self.rvc.tgt_sr // 100 - self.block_frame = ( - int( - np.round( - self.gui_config.block_time - * self.gui_config.samplerate - / self.zc - ) - ) - * self.zc - ) - self.block_frame_16k = 160 * self.block_frame // self.zc - self.crossfade_frame = ( - int( - np.round( - self.gui_config.crossfade_time - * self.gui_config.samplerate - / self.zc - ) - ) - * self.zc - ) - self.sola_search_frame = self.zc - self.extra_frame = ( - int( - np.round( - self.gui_config.extra_time - * self.gui_config.samplerate - / self.zc - ) - ) - * self.zc - ) - self.input_wav: torch.Tensor = torch.zeros( - self.extra_frame - + self.crossfade_frame - + self.sola_search_frame - + self.block_frame, - device=self.config.device, - dtype=torch.float32, - ) - self.input_wav_res: torch.Tensor = torch.zeros( - 160 * self.input_wav.shape[0] // self.zc, - device=self.config.device, - dtype=torch.float32, - ) - 
self.pitch: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="int32", - ) - self.pitchf: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="float64", - ) - self.sola_buffer: torch.Tensor = torch.zeros( - self.crossfade_frame, device=self.config.device, dtype=torch.float32 - ) - self.nr_buffer: torch.Tensor = self.sola_buffer.clone() - self.output_buffer: torch.Tensor = self.input_wav.clone() - self.res_buffer: torch.Tensor = torch.zeros( - 2 * self.zc, device=self.config.device, dtype=torch.float32 - ) - self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] - self.fade_in_window: torch.Tensor = ( - torch.sin( - 0.5 - * np.pi - * torch.linspace( - 0.0, - 1.0, - steps=self.crossfade_frame, - device=self.config.device, - dtype=torch.float32, - ) - ) - ** 2 - ) - self.fade_out_window: torch.Tensor = 1 - self.fade_in_window - self.resampler = tat.Resample( - orig_freq=self.gui_config.samplerate, - new_freq=16000, - dtype=torch.float32, - ).to(self.config.device) - self.tg = TorchGate( - sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 - ).to(self.config.device) - thread_vc = threading.Thread(target=self.soundinput) - thread_vc.start() - - def soundinput(self): - """ - 接受音频输入 - """ - channels = 1 if sys.platform == "darwin" else 2 - with sd.Stream( - channels=channels, - callback=self.audio_callback, - blocksize=self.block_frame, - samplerate=self.gui_config.samplerate, - dtype="float32", - ) as stream: - global stream_latency - stream_latency = stream.latency[-1] - while self.flag_vc: - time.sleep(self.gui_config.block_time) - printt("Audio block passed.") - printt("ENDing VC") - - def audio_callback( - self, indata: np.ndarray, outdata: np.ndarray, frames, times, status - ): - """ - 音频处理 - """ - start_time = time.perf_counter() - indata = librosa.to_mono(indata.T) - if self.gui_config.threhold > -60: - rms = librosa.feature.rms( - y=indata, frame_length=4 * self.zc, hop_length=self.zc - ) - 
db_threhold = ( - librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold - ) - for i in range(db_threhold.shape[0]): - if db_threhold[i]: - indata[i * self.zc : (i + 1) * self.zc] = 0 - self.input_wav[: -self.block_frame] = self.input_wav[ - self.block_frame : - ].clone() - self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to( - self.config.device - ) - self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[ - self.block_frame_16k : - ].clone() - # input noise reduction and resampling - if self.gui_config.I_noise_reduce and self.function == "vc": - input_wav = self.input_wav[ - -self.crossfade_frame - self.block_frame - 2 * self.zc : - ] - input_wav = self.tg( - input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) - )[0, 2 * self.zc :] - input_wav[: self.crossfade_frame] *= self.fade_in_window - input_wav[: self.crossfade_frame] += ( - self.nr_buffer * self.fade_out_window - ) - self.nr_buffer[:] = input_wav[-self.crossfade_frame :] - input_wav = torch.cat( - (self.res_buffer[:], input_wav[: self.block_frame]) - ) - self.res_buffer[:] = input_wav[-2 * self.zc :] - self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( - input_wav - )[160:] - else: - self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( - self.input_wav[-self.block_frame - 2 * self.zc :] - )[160:] - # infer - if self.function == "vc": - f0_extractor_frame = self.block_frame_16k + 800 - if self.gui_config.f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) - infer_wav = self.rvc.infer( - self.input_wav_res, - self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), - self.block_frame_16k, - self.valid_rate, - self.pitch, - self.pitchf, - self.gui_config.f0method, - ) - infer_wav = infer_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ] - else: - infer_wav = self.input_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ].clone() - # 
output noise reduction - if (self.gui_config.O_noise_reduce and self.function == "vc") or ( - self.gui_config.I_noise_reduce and self.function == "im" - ): - self.output_buffer[: -self.block_frame] = self.output_buffer[ - self.block_frame : - ].clone() - self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :] - infer_wav = self.tg( - infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0) - ).squeeze(0) - # volume envelop mixing - if self.gui_config.rms_mix_rate < 1 and self.function == "vc": - rms1 = librosa.feature.rms( - y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] - .cpu() - .numpy(), - frame_length=640, - hop_length=160, - ) - rms1 = torch.from_numpy(rms1).to(self.config.device) - rms1 = F.interpolate( - rms1.unsqueeze(0), - size=infer_wav.shape[0] + 1, - mode="linear", - align_corners=True, - )[0, 0, :-1] - rms2 = librosa.feature.rms( - y=infer_wav[:].cpu().numpy(), - frame_length=4 * self.zc, - hop_length=self.zc, - ) - rms2 = torch.from_numpy(rms2).to(self.config.device) - rms2 = F.interpolate( - rms2.unsqueeze(0), - size=infer_wav.shape[0] + 1, - mode="linear", - align_corners=True, - )[0, 0, :-1] - rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) - infer_wav *= torch.pow( - rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate) - ) - # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC - conv_input = infer_wav[ - None, None, : self.crossfade_frame + self.sola_search_frame - ] - cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) - cor_den = torch.sqrt( - F.conv1d( - conv_input**2, - torch.ones(1, 1, self.crossfade_frame, device=self.config.device), - ) - + 1e-8 - ) - if sys.platform == "darwin": - _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0]) - sola_offset = sola_offset.item() - else: - sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) - printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[ - sola_offset : sola_offset + self.block_frame + self.crossfade_frame - 
] - infer_wav[: self.crossfade_frame] *= self.fade_in_window - infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window - self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] - if sys.platform == "darwin": - outdata[:] = ( - infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] - ) - else: - outdata[:] = ( - infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() - ) - total_time = time.perf_counter() - start_time - self.window["infer_time"].update(int(total_time * 1000)) - printt("Infer time: %.2f", total_time) - - def get_devices(self, update: bool = True): - """获取设备列表""" - if update: - sd._terminate() - sd._initialize() - devices = sd.query_devices() - hostapis = sd.query_hostapis() - for hostapi in hostapis: - for device_idx in hostapi["devices"]: - devices[device_idx]["hostapi_name"] = hostapi["name"] - input_devices = [ - f"{d['name']} ({d['hostapi_name']})" - for d in devices - if d["max_input_channels"] > 0 - ] - output_devices = [ - f"{d['name']} ({d['hostapi_name']})" - for d in devices - if d["max_output_channels"] > 0 - ] - input_devices_indices = [ - d["index"] if "index" in d else d["name"] - for d in devices - if d["max_input_channels"] > 0 - ] - output_devices_indices = [ - d["index"] if "index" in d else d["name"] - for d in devices - if d["max_output_channels"] > 0 - ] - return ( - input_devices, - output_devices, - input_devices_indices, - output_devices_indices, - ) - - def set_devices(self, input_device, output_device): - """设置输出设备""" - ( - input_devices, - output_devices, - input_device_indices, - output_device_indices, - ) = self.get_devices() - sd.default.device[0] = input_device_indices[ - input_devices.index(input_device) - ] - sd.default.device[1] = output_device_indices[ - output_devices.index(output_device) - ] - printt("Input device: %s:%s", str(sd.default.device[0]), input_device) - printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - - gui = GUI() +import os +import 
sys +from dotenv import load_dotenv + +load_dotenv() + +os.environ["OMP_NUM_THREADS"] = "4" +if sys.platform == "darwin": + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +now_dir = os.getcwd() +sys.path.append(now_dir) +import multiprocessing + +stream_latency = -1 + + +def printt(strr, *args): + if len(args) == 0: + print(strr) + else: + print(strr % args) + + +class Harvest(multiprocessing.Process): + def __init__(self, inp_q, opt_q): + multiprocessing.Process.__init__(self) + self.inp_q = inp_q + self.opt_q = opt_q + + def run(self): + import numpy as np + import pyworld + + while 1: + idx, x, res_f0, n_cpu, ts = self.inp_q.get() + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + res_f0[idx] = f0 + if len(res_f0.keys()) >= n_cpu: + self.opt_q.put(ts) + + +if __name__ == "__main__": + import json + import multiprocessing + import re + import threading + import time + import traceback + from multiprocessing import Queue, cpu_count + from queue import Empty + + import librosa + from tools.torchgate import TorchGate + import numpy as np + import PySimpleGUI as sg + import sounddevice as sd + import torch + import torch.nn.functional as F + import torchaudio.transforms as tat + + import tools.rvc_for_realtime as rvc_for_realtime + from i18n.i18n import I18nAuto + from configs.config import Config + + i18n = I18nAuto() + + # device = rvc_for_realtime.config.device + # device = torch.device( + # "cuda" + # if torch.cuda.is_available() + # else ("mps" if torch.backends.mps.is_available() else "cpu") + # ) + current_dir = os.getcwd() + inp_q = Queue() + opt_q = Queue() + n_cpu = min(cpu_count(), 8) + for _ in range(n_cpu): + Harvest(inp_q, opt_q).start() + + class GUIConfig: + def __init__(self) -> None: + self.pth_path: str = "" + self.index_path: str = "" + self.pitch: int = 0 + self.samplerate: int = 40000 + self.block_time: float = 1.0 # s + self.buffer_num: int = 1 + self.threhold: int = -60 + 
self.crossfade_time: float = 0.05 + self.extra_time: float = 2.5 + self.I_noise_reduce = False + self.O_noise_reduce = False + self.rms_mix_rate = 0.0 + self.index_rate = 0.3 + self.n_cpu = min(n_cpu, 6) + self.f0method = "harvest" + self.sg_input_device = "" + self.sg_output_device = "" + + class GUI: + def __init__(self) -> None: + self.gui_config = GUIConfig() + self.config = Config() + self.flag_vc = False + self.function = "vc" + self.delay_time = 0 + self.launcher() + + def load(self): + input_devices, output_devices, _, _ = self.get_devices() + try: + with open("configs/config.json", "r") as j: + data = json.load(j) + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + data["fcpe"] = data["f0method"] == "fcpe" + if data["sg_input_device"] not in input_devices: + data["sg_input_device"] = input_devices[sd.default.device[0]] + if data["sg_output_device"] not in output_devices: + data["sg_output_device"] = output_devices[sd.default.device[1]] + except: + with open("configs/config.json", "w") as j: + data = { + "pth_path": " ", + "index_path": " ", + "sg_input_device": input_devices[sd.default.device[0]], + "sg_output_device": output_devices[sd.default.device[1]], + "threhold": "-60", + "pitch": "0", + "index_rate": "0", + "rms_mix_rate": "0", + "block_time": "0.25", + "crossfade_length": "0.05", + "extra_time": "2.5", + "f0method": "rmvpe", + "use_jit": False, + } + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + data["fcpe"] = data["f0method"] == "fcpe" + return data + + def launcher(self): + data = self.load() + self.config.use_jit = False # data.get("use_jit", self.config.use_jit) + sg.theme("LightBlue3") + input_devices, output_devices, _, _ = self.get_devices() + layout = [ + [ + sg.Frame( + 
title=i18n("加载模型"), + layout=[ + [ + sg.Input( + default_text=data.get("pth_path", ""), + key="pth_path", + ), + sg.FileBrowse( + i18n("选择.pth文件"), + initial_folder=os.path.join( + os.getcwd(), "assets/weights" + ), + file_types=((". pth"),), + ), + ], + [ + sg.Input( + default_text=data.get("index_path", ""), + key="index_path", + ), + sg.FileBrowse( + i18n("选择.index文件"), + initial_folder=os.path.join(os.getcwd(), "logs"), + file_types=((". index"),), + ), + ], + ], + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("输入设备")), + sg.Combo( + input_devices, + key="sg_input_device", + default_value=data.get("sg_input_device", ""), + ), + ], + [ + sg.Text(i18n("输出设备")), + sg.Combo( + output_devices, + key="sg_output_device", + default_value=data.get("sg_output_device", ""), + ), + ], + [sg.Button(i18n("重载设备列表"), key="reload_devices")], + ], + title=i18n("音频设备(请使用同种类驱动)"), + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("响应阈值")), + sg.Slider( + range=(-60, 0), + key="threhold", + resolution=1, + orientation="h", + default_value=data.get("threhold", "-60"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("音调设置")), + sg.Slider( + range=(-24, 24), + key="pitch", + resolution=1, + orientation="h", + default_value=data.get("pitch", "0"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("Index Rate")), + sg.Slider( + range=(0.0, 1.0), + key="index_rate", + resolution=0.01, + orientation="h", + default_value=data.get("index_rate", "0"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("响度因子")), + sg.Slider( + range=(0.0, 1.0), + key="rms_mix_rate", + resolution=0.01, + orientation="h", + default_value=data.get("rms_mix_rate", "0"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("音高算法")), + sg.Radio( + "pm", + "f0method", + key="pm", + default=data.get("pm", "") == True, + enable_events=True, + ), + sg.Radio( + "harvest", + "f0method", + key="harvest", + default=data.get("harvest", "") == True, + enable_events=True, + ), + sg.Radio( + "crepe", + 
"f0method", + key="crepe", + default=data.get("crepe", "") == True, + enable_events=True, + ), + sg.Radio( + "rmvpe", + "f0method", + key="rmvpe", + default=data.get("rmvpe", "") == True, + enable_events=True, + ), + sg.Radio( + "fcpe", + "f0method", + key="fcpe", + default=data.get("fcpe", "") == True, + enable_events=True, + ), + ], + ], + title=i18n("常规设置"), + ), + sg.Frame( + layout=[ + [ + sg.Text(i18n("采样长度")), + sg.Slider( + range=(0.05, 2.4), + key="block_time", + resolution=0.01, + orientation="h", + default_value=data.get("block_time", "0.25"), + enable_events=True, + ), + ], + # [ + # sg.Text("设备延迟"), + # sg.Slider( + # range=(0, 1), + # key="device_latency", + # resolution=0.001, + # orientation="h", + # default_value=data.get("device_latency", "0.1"), + # enable_events=True, + # ), + # ], + [ + sg.Text(i18n("harvest进程数")), + sg.Slider( + range=(1, n_cpu), + key="n_cpu", + resolution=1, + orientation="h", + default_value=data.get( + "n_cpu", min(self.gui_config.n_cpu, n_cpu) + ), + enable_events=True, + ), + ], + [ + sg.Text(i18n("淡入淡出长度")), + sg.Slider( + range=(0.01, 0.15), + key="crossfade_length", + resolution=0.01, + orientation="h", + default_value=data.get("crossfade_length", "0.05"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("额外推理时长")), + sg.Slider( + range=(0.05, 5.00), + key="extra_time", + resolution=0.01, + orientation="h", + default_value=data.get("extra_time", "2.5"), + enable_events=True, + ), + ], + [ + sg.Checkbox( + i18n("输入降噪"), + key="I_noise_reduce", + enable_events=True, + ), + sg.Checkbox( + i18n("输出降噪"), + key="O_noise_reduce", + enable_events=True, + ), + # sg.Checkbox( + # "JIT加速", + # default=self.config.use_jit, + # key="use_jit", + # enable_events=False, + # ), + ], + # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")], + ], + title=i18n("性能设置"), + ), + ], + [ + sg.Button(i18n("开始音频转换"), key="start_vc"), + sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Radio( + i18n("输入监听"), + "function", + key="im", + 
default=False, + enable_events=True, + ), + sg.Radio( + i18n("输出变声"), + "function", + key="vc", + default=True, + enable_events=True, + ), + sg.Text(i18n("算法延迟(ms):")), + sg.Text("0", key="delay_time"), + sg.Text(i18n("推理时间(ms):")), + sg.Text("0", key="infer_time"), + ], + ] + self.window = sg.Window("RVC - GUI", layout=layout, finalize=True) + self.event_handler() + + def event_handler(self): + while True: + event, values = self.window.read() + if event == sg.WINDOW_CLOSED: + self.flag_vc = False + exit() + if event == "reload_devices": + prev_input = self.window["sg_input_device"].get() + prev_output = self.window["sg_output_device"].get() + input_devices, output_devices, _, _ = self.get_devices(update=True) + if prev_input not in input_devices: + self.gui_config.sg_input_device = input_devices[0] + else: + self.gui_config.sg_input_device = prev_input + self.window["sg_input_device"].Update(values=input_devices) + self.window["sg_input_device"].Update( + value=self.gui_config.sg_input_device + ) + if prev_output not in output_devices: + self.gui_config.sg_output_device = output_devices[0] + else: + self.gui_config.sg_output_device = prev_output + self.window["sg_output_device"].Update(values=output_devices) + self.window["sg_output_device"].Update( + value=self.gui_config.sg_output_device + ) + if event == "start_vc" and self.flag_vc == False: + if self.set_values(values) == True: + printt("cuda_is_available: %s", torch.cuda.is_available()) + self.start_vc() + settings = { + "pth_path": values["pth_path"], + "index_path": values["index_path"], + "sg_input_device": values["sg_input_device"], + "sg_output_device": values["sg_output_device"], + "threhold": values["threhold"], + "pitch": values["pitch"], + "rms_mix_rate": values["rms_mix_rate"], + "index_rate": values["index_rate"], + # "device_latency": values["device_latency"], + "block_time": values["block_time"], + "crossfade_length": values["crossfade_length"], + "extra_time": values["extra_time"], + "n_cpu": 
values["n_cpu"], + # "use_jit": values["use_jit"], + "use_jit": False, + "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + values["fcpe"], + ].index(True) + ], + } + with open("configs/config.json", "w") as j: + json.dump(settings, j) + global stream_latency + while stream_latency < 0: + time.sleep(0.01) + self.delay_time = ( + stream_latency + + values["block_time"] + + values["crossfade_length"] + + 0.01 + ) + if values["I_noise_reduce"]: + self.delay_time += values["crossfade_length"] + self.window["delay_time"].update(int(self.delay_time * 1000)) + if event == "stop_vc" and self.flag_vc == True: + self.flag_vc = False + stream_latency = -1 + # Parameter hot update + if event == "threhold": + self.gui_config.threhold = values["threhold"] + elif event == "pitch": + self.gui_config.pitch = values["pitch"] + if hasattr(self, "rvc"): + self.rvc.change_key(values["pitch"]) + elif event == "index_rate": + self.gui_config.index_rate = values["index_rate"] + if hasattr(self, "rvc"): + self.rvc.change_index_rate(values["index_rate"]) + elif event == "rms_mix_rate": + self.gui_config.rms_mix_rate = values["rms_mix_rate"] + elif event in ["pm", "harvest", "crepe", "rmvpe", "fcpe"]: + self.gui_config.f0method = event + elif event == "I_noise_reduce": + self.gui_config.I_noise_reduce = values["I_noise_reduce"] + if stream_latency > 0: + self.delay_time += ( + 1 if values["I_noise_reduce"] else -1 + ) * values["crossfade_length"] + self.window["delay_time"].update(int(self.delay_time * 1000)) + elif event == "O_noise_reduce": + self.gui_config.O_noise_reduce = values["O_noise_reduce"] + elif event in ["vc", "im"]: + self.function = event + elif event != "start_vc" and self.flag_vc == True: + # Other parameters do not support hot update + self.flag_vc = False + stream_latency = -1 + + def set_values(self, values): + if len(values["pth_path"].strip()) == 0: + sg.popup(i18n("请选择pth文件")) + 
return False + if len(values["index_path"].strip()) == 0: + sg.popup(i18n("请选择index文件")) + return False + pattern = re.compile("[^\x00-\x7F]+") + if pattern.findall(values["pth_path"]): + sg.popup(i18n("pth文件路径不可包含中文")) + return False + if pattern.findall(values["index_path"]): + sg.popup(i18n("index文件路径不可包含中文")) + return False + self.set_devices(values["sg_input_device"], values["sg_output_device"]) + self.config.use_jit = False # values["use_jit"] + # self.device_latency = values["device_latency"] + self.gui_config.pth_path = values["pth_path"] + self.gui_config.index_path = values["index_path"] + self.gui_config.threhold = values["threhold"] + self.gui_config.pitch = values["pitch"] + self.gui_config.block_time = values["block_time"] + self.gui_config.crossfade_time = values["crossfade_length"] + self.gui_config.extra_time = values["extra_time"] + self.gui_config.I_noise_reduce = values["I_noise_reduce"] + self.gui_config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.rms_mix_rate = values["rms_mix_rate"] + self.gui_config.index_rate = values["index_rate"] + self.gui_config.n_cpu = values["n_cpu"] + self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + values["fcpe"], + ].index(True) + ] + return True + + def start_vc(self): + torch.cuda.empty_cache() + self.flag_vc = True + self.rvc = rvc_for_realtime.RVC( + self.gui_config.pitch, + self.gui_config.pth_path, + self.gui_config.index_path, + self.gui_config.index_rate, + self.gui_config.n_cpu, + inp_q, + opt_q, + self.config, + self.rvc if hasattr(self, "rvc") else None, + ) + self.gui_config.samplerate = self.rvc.tgt_sr + self.zc = self.rvc.tgt_sr // 100 + self.block_frame = ( + int( + np.round( + self.gui_config.block_time + * self.gui_config.samplerate + / self.zc + ) + ) + * self.zc + ) + self.block_frame_16k = 160 * self.block_frame // self.zc + self.crossfade_frame = ( + int( + np.round( + 
self.gui_config.crossfade_time + * self.gui_config.samplerate + / self.zc + ) + ) + * self.zc + ) + self.sola_search_frame = self.zc + self.extra_frame = ( + int( + np.round( + self.gui_config.extra_time + * self.gui_config.samplerate + / self.zc + ) + ) + * self.zc + ) + self.input_wav: torch.Tensor = torch.zeros( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame, + device=self.config.device, + dtype=torch.float32, + ) + self.input_wav_res: torch.Tensor = torch.zeros( + 160 * self.input_wav.shape[0] // self.zc, + device=self.config.device, + dtype=torch.float32, + ) + self.pitch: np.ndarray = np.zeros( + self.input_wav.shape[0] // self.zc, + dtype="int32", + ) + self.pitchf: np.ndarray = np.zeros( + self.input_wav.shape[0] // self.zc, + dtype="float64", + ) + self.sola_buffer: torch.Tensor = torch.zeros( + self.crossfade_frame, device=self.config.device, dtype=torch.float32 + ) + self.nr_buffer: torch.Tensor = self.sola_buffer.clone() + self.output_buffer: torch.Tensor = self.input_wav.clone() + self.res_buffer: torch.Tensor = torch.zeros( + 2 * self.zc, device=self.config.device, dtype=torch.float32 + ) + self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] + self.fade_in_window: torch.Tensor = ( + torch.sin( + 0.5 + * np.pi + * torch.linspace( + 0.0, + 1.0, + steps=self.crossfade_frame, + device=self.config.device, + dtype=torch.float32, + ) + ) + ** 2 + ) + self.fade_out_window: torch.Tensor = 1 - self.fade_in_window + self.resampler = tat.Resample( + orig_freq=self.gui_config.samplerate, + new_freq=16000, + dtype=torch.float32, + ).to(self.config.device) + self.tg = TorchGate( + sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 + ).to(self.config.device) + thread_vc = threading.Thread(target=self.soundinput) + thread_vc.start() + + def soundinput(self): + """ + 接受音频输入 + """ + channels = 1 if sys.platform == "darwin" else 2 + with sd.Stream( + channels=channels, + 
callback=self.audio_callback, + blocksize=self.block_frame, + samplerate=self.gui_config.samplerate, + dtype="float32", + ) as stream: + global stream_latency + stream_latency = stream.latency[-1] + while self.flag_vc: + time.sleep(self.gui_config.block_time) + printt("Audio block passed.") + printt("ENDing VC") + + def audio_callback( + self, indata: np.ndarray, outdata: np.ndarray, frames, times, status + ): + """ + 音频处理 + """ + start_time = time.perf_counter() + indata = librosa.to_mono(indata.T) + if self.gui_config.threhold > -60: + rms = librosa.feature.rms( + y=indata, frame_length=4 * self.zc, hop_length=self.zc + ) + db_threhold = ( + librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold + ) + for i in range(db_threhold.shape[0]): + if db_threhold[i]: + indata[i * self.zc : (i + 1) * self.zc] = 0 + self.input_wav[: -self.block_frame] = self.input_wav[ + self.block_frame : + ].clone() + self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to( + self.config.device + ) + self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[ + self.block_frame_16k : + ].clone() + # input noise reduction and resampling + if self.gui_config.I_noise_reduce and self.function == "vc": + input_wav = self.input_wav[ + -self.crossfade_frame - self.block_frame - 2 * self.zc : + ] + input_wav = self.tg( + input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) + )[0, 2 * self.zc :] + input_wav[: self.crossfade_frame] *= self.fade_in_window + input_wav[: self.crossfade_frame] += ( + self.nr_buffer * self.fade_out_window + ) + self.nr_buffer[:] = input_wav[-self.crossfade_frame :] + input_wav = torch.cat( + (self.res_buffer[:], input_wav[: self.block_frame]) + ) + self.res_buffer[:] = input_wav[-2 * self.zc :] + self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( + input_wav + )[160:] + else: + self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( + self.input_wav[-self.block_frame - 2 * self.zc :] + )[160:] + # infer + if 
self.function == "vc": + f0_extractor_frame = self.block_frame_16k + 800 + if self.gui_config.f0method == "rmvpe": + f0_extractor_frame = ( + 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + ) + infer_wav = self.rvc.infer( + self.input_wav_res, + self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), + self.block_frame_16k, + self.valid_rate, + self.pitch, + self.pitchf, + self.gui_config.f0method, + ) + infer_wav = infer_wav[ + -self.crossfade_frame - self.sola_search_frame - self.block_frame : + ] + else: + infer_wav = self.input_wav[ + -self.crossfade_frame - self.sola_search_frame - self.block_frame : + ].clone() + # output noise reduction + if (self.gui_config.O_noise_reduce and self.function == "vc") or ( + self.gui_config.I_noise_reduce and self.function == "im" + ): + self.output_buffer[: -self.block_frame] = self.output_buffer[ + self.block_frame : + ].clone() + self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :] + infer_wav = self.tg( + infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0) + ).squeeze(0) + # volume envelop mixing + if self.gui_config.rms_mix_rate < 1 and self.function == "vc": + rms1 = librosa.feature.rms( + y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] + .cpu() + .numpy(), + frame_length=640, + hop_length=160, + ) + rms1 = torch.from_numpy(rms1).to(self.config.device) + rms1 = F.interpolate( + rms1.unsqueeze(0), + size=infer_wav.shape[0] + 1, + mode="linear", + align_corners=True, + )[0, 0, :-1] + rms2 = librosa.feature.rms( + y=infer_wav[:].cpu().numpy(), + frame_length=4 * self.zc, + hop_length=self.zc, + ) + rms2 = torch.from_numpy(rms2).to(self.config.device) + rms2 = F.interpolate( + rms2.unsqueeze(0), + size=infer_wav.shape[0] + 1, + mode="linear", + align_corners=True, + )[0, 0, :-1] + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) + infer_wav *= torch.pow( + rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate) + ) + # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC 
+ conv_input = infer_wav[ + None, None, : self.crossfade_frame + self.sola_search_frame + ] + cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) + cor_den = torch.sqrt( + F.conv1d( + conv_input**2, + torch.ones(1, 1, self.crossfade_frame, device=self.config.device), + ) + + 1e-8 + ) + if sys.platform == "darwin": + _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0]) + sola_offset = sola_offset.item() + else: + sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) + printt("sola_offset = %d", int(sola_offset)) + infer_wav = infer_wav[ + sola_offset : sola_offset + self.block_frame + self.crossfade_frame + ] + infer_wav[: self.crossfade_frame] *= self.fade_in_window + infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window + self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] + if sys.platform == "darwin": + outdata[:] = ( + infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] + ) + else: + outdata[:] = ( + infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() + ) + total_time = time.perf_counter() - start_time + self.window["infer_time"].update(int(total_time * 1000)) + printt("Infer time: %.2f", total_time) + + def get_devices(self, update: bool = True): + """获取设备列表""" + if update: + sd._terminate() + sd._initialize() + devices = sd.query_devices() + hostapis = sd.query_hostapis() + for hostapi in hostapis: + for device_idx in hostapi["devices"]: + devices[device_idx]["hostapi_name"] = hostapi["name"] + input_devices = [ + f"{d['name']} ({d['hostapi_name']})" + for d in devices + if d["max_input_channels"] > 0 + ] + output_devices = [ + f"{d['name']} ({d['hostapi_name']})" + for d in devices + if d["max_output_channels"] > 0 + ] + input_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_input_channels"] > 0 + ] + output_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_output_channels"] > 0 + ] + return ( + 
input_devices, + output_devices, + input_devices_indices, + output_devices_indices, + ) + + def set_devices(self, input_device, output_device): + """设置输出设备""" + ( + input_devices, + output_devices, + input_device_indices, + output_device_indices, + ) = self.get_devices() + sd.default.device[0] = input_device_indices[ + input_devices.index(input_device) + ] + sd.default.device[1] = output_device_indices[ + output_devices.index(output_device) + ] + printt("Input device: %s:%s", str(sd.default.device[0]), input_device) + printt("Output device: %s:%s", str(sd.default.device[1]), output_device) + + gui = GUI() From e6cda00fcfe326d8cd85e4daf27cb582507b36b8 Mon Sep 17 00:00:00 2001 From: CN_ChiTu <36254426+CNChTu@users.noreply.github.com> Date: Thu, 14 Dec 2023 21:09:41 +0800 Subject: [PATCH 08/20] add fcpe for realtime --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f0e7181..cb6e93e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,6 +42,7 @@ onnxruntime; sys_platform == 'darwin' onnxruntime-gpu; sys_platform != 'darwin' torchcrepe==0.0.20 fastapi==0.88 +torchfcpe ffmpy==0.3.1 python-dotenv>=1.0.0 av From 0f8a5facd9ea2e8b99d77bfa481c7825a11fa99e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Dec 2023 14:47:37 +0900 Subject: [PATCH 09/20] chore(format): run black on dev (#1619) Co-authored-by: github-actions[bot] --- tools/rvc_for_realtime.py | 65 ++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 8e16e87..f36ffb3 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -46,22 +46,23 @@ def printt(strr, *args): # config.is_half=False########强制cpu测试 class RVC: def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, + self, + 
key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, ) -> None: """ 初始化 """ try: if config.dml == True: + def forward_dml(ctx, x, scale): ctx.scale = scale res = x.clone().detach() @@ -205,7 +206,7 @@ class RVC: f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min + f0_mel_max - f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 @@ -261,7 +262,7 @@ class RVC: self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) else: self.inp_q.put( - (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) + (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) ) while 1: res_ts = self.opt_q.get() @@ -276,7 +277,7 @@ class RVC: else: f0 = f0[2:] f0bak[ - part_length * idx // 160: part_length * idx // 160 + f0.shape[0] + part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] ] = f0 f0bak = signal.medfilt(f0bak, 3) f0bak *= pow(2, f0_up_key / 12) @@ -328,26 +329,32 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: from torchfcpe import spawn_bundled_infer_model + printt("Loading fcpe model") self.model_fcpe = spawn_bundled_infer_model(self.device) - f0 = self.model_fcpe.infer( - torch.from_numpy(x).to(self.device).unsqueeze(0).float(), - sr=16000, - decoder_mode='local_argmax', - threshold=0.006, - ).squeeze().cpu().numpy() + f0 = ( + self.model_fcpe.infer( + torch.from_numpy(x).to(self.device).unsqueeze(0).float(), + sr=16000, + decoder_mode="local_argmax", + threshold=0.006, + ) + .squeeze() + .cpu() + .numpy() + ) f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) def infer( - self, - feats: torch.Tensor, - indata: np.ndarray, - block_frame_16k, - rate, - cache_pitch, - cache_pitchf, - f0method, + self, + feats: torch.Tensor, + indata: np.ndarray, + block_frame_16k, + rate, + cache_pitch, + cache_pitchf, + f0method, ) -> np.ndarray: feats = feats.view(1, 
-1) if self.config.is_half: @@ -380,8 +387,8 @@ class RVC: if self.config.is_half: npy = npy.astype("float16") feats[0][-leng_replace_head:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] ) else: printt("Index search FAILED or disabled") From 78f03e7dc0563e438307ff62c76a062b46083ec4 Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Fri, 22 Dec 2023 02:35:51 +0100 Subject: [PATCH 10/20] Fix return_complex warning on training (#1627) * Fix return_complex warning on training * remove unused prints --- infer/lib/train/mel_processing.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py index 04a11f1..14a960f 100644 --- a/infer/lib/train/mel_processing.py +++ b/infer/lib/train/mel_processing.py @@ -38,7 +38,6 @@ def spectral_de_normalize_torch(magnitudes): mel_basis = {} hann_window = {} - def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 
@@ -52,12 +51,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) Returns: :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram """ - # Validation - if torch.min(y) < -1.07: - logger.debug("min value is %s", str(torch.min(y))) - if torch.max(y) > 1.07: - logger.debug("max value is %s", str(torch.max(y))) - + # Window - Cache if needed global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -66,7 +60,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( dtype=y.dtype, device=y.device ) - + # Padding y = torch.nn.functional.pad( y.unsqueeze(1), @@ -74,7 +68,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) mode="reflect", ) y = y.squeeze(1) - + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) spec = torch.stft( y, @@ -86,14 +80,13 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) pad_mode="reflect", normalized=False, onesided=True, - return_complex=False, + return_complex=True, ) - + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) return spec - def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): # MelBasis - Cache if needed global mel_basis From 3dec36568c48af357ec6b7331a7c06348bd51abb Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 00:23:36 +0800 Subject: [PATCH 11/20] optimize real-time vc --- configs/config.json | 2 +- gui_v1.py | 159 +++++++++++++++++++++++-------- infer/lib/infer_pack/models.py | 41 ++++---- infer/lib/jit/get_synthesizer.py | 1 + infer/lib/rmvpe.py | 12 ++- tools/rvc_for_realtime.py | 131 ++++++++++++------------- 6 files changed, 211 insertions(+), 135 deletions(-) diff --git a/configs/config.json 
b/configs/config.json index 0861200..f874bd5 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1 +1 @@ -{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"} \ No newline at end of file +{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, "crossfade_length": 0.08, "extra_time": 2.00, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} \ No newline at end of file diff --git a/gui_v1.py b/gui_v1.py index 7f4c640..0f614e6 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -22,6 +22,26 @@ def printt(strr, *args): print(strr % args) +def phase_vocoder(a, b, fade_out, fade_in): + window = torch.sqrt(fade_out * fade_in) + fa = torch.fft.rfft(a * window) + fb = torch.fft.rfft(b * window) + absab = torch.abs(fa) + torch.abs(fb) + n = a.shape[0] + if n % 2 == 0: + absab[1:-1] *= 2 + else: + absab[1:] *= 2 + phia = torch.angle(fa) + phib = torch.angle(fb) + deltaphase = phib - phia + deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5) + w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase + t = torch.arange(n).unsqueeze(-1).to(a) / n + result = a * (fade_out ** 2) + b * (fade_in ** 2) + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + return result + + class Harvest(multiprocessing.Process): def __init__(self, inp_q, opt_q): multiprocessing.Process.__init__(self) @@ -118,6 +138,8 @@ if __name__ == "__main__": try: with 
open("configs/config.json", "r") as j: data = json.load(j) + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" @@ -134,6 +156,7 @@ if __name__ == "__main__": "index_path": " ", "sg_input_device": input_devices[sd.default.device[0]], "sg_output_device": output_devices[sd.default.device[1]], + "sr_type": "sr_model", "threhold": "-60", "pitch": "0", "index_rate": "0", @@ -143,7 +166,10 @@ if __name__ == "__main__": "extra_time": "2.5", "f0method": "rmvpe", "use_jit": False, + "use_pv": False, } + data["sr_model"] = data["sr_type"] == "sr_model" + data["sr_device"] = data["sr_type"] == "sr_device" data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" @@ -207,7 +233,25 @@ if __name__ == "__main__": default_value=data.get("sg_output_device", ""), ), ], - [sg.Button(i18n("重载设备列表"), key="reload_devices")], + [ + sg.Button(i18n("重载设备列表"), key="reload_devices"), + sg.Radio( + i18n("使用模型采样率"), + "sr_type", + key="sr_model", + default=data.get("sr_model", True), + enable_events=True, + ), + sg.Radio( + i18n("使用设备采样率"), + "sr_type", + key="sr_device", + default=data.get("sr_device", False), + enable_events=True, + ), + sg.Text(i18n("采样率:")), + sg.Text("", key="sr_stream"), + ], ], title=i18n("音频设备(请使用同种类驱动)"), ) @@ -222,7 +266,7 @@ if __name__ == "__main__": key="threhold", resolution=1, orientation="h", - default_value=data.get("threhold", "-60"), + default_value=data.get("threhold", -60), enable_events=True, ), ], @@ -233,7 +277,7 @@ if __name__ == "__main__": key="pitch", resolution=1, orientation="h", - default_value=data.get("pitch", "0"), + default_value=data.get("pitch", 0), enable_events=True, ), ], @@ -244,7 +288,7 @@ if __name__ == "__main__": key="index_rate", resolution=0.01, orientation="h", - 
default_value=data.get("index_rate", "0"), + default_value=data.get("index_rate", 0), enable_events=True, ), ], @@ -255,7 +299,7 @@ if __name__ == "__main__": key="rms_mix_rate", resolution=0.01, orientation="h", - default_value=data.get("rms_mix_rate", "0"), + default_value=data.get("rms_mix_rate", 0), enable_events=True, ), ], @@ -265,35 +309,35 @@ if __name__ == "__main__": "pm", "f0method", key="pm", - default=data.get("pm", "") == True, + default=data.get("pm", False), enable_events=True, ), sg.Radio( "harvest", "f0method", key="harvest", - default=data.get("harvest", "") == True, + default=data.get("harvest", False), enable_events=True, ), sg.Radio( "crepe", "f0method", key="crepe", - default=data.get("crepe", "") == True, + default=data.get("crepe", False), enable_events=True, ), sg.Radio( "rmvpe", "f0method", key="rmvpe", - default=data.get("rmvpe", "") == True, + default=data.get("rmvpe", False), enable_events=True, ), sg.Radio( "fcpe", "f0method", key="fcpe", - default=data.get("fcpe", "") == True, + default=data.get("fcpe", True), enable_events=True, ), ], @@ -305,11 +349,11 @@ if __name__ == "__main__": [ sg.Text(i18n("采样长度")), sg.Slider( - range=(0.05, 2.4), + range=(0.02, 2.4), key="block_time", resolution=0.01, orientation="h", - default_value=data.get("block_time", "0.25"), + default_value=data.get("block_time", 0.25), enable_events=True, ), ], @@ -320,7 +364,7 @@ if __name__ == "__main__": # key="device_latency", # resolution=0.001, # orientation="h", - # default_value=data.get("device_latency", "0.1"), + # default_value=data.get("device_latency", 0.1), # enable_events=True, # ), # ], @@ -344,7 +388,7 @@ if __name__ == "__main__": key="crossfade_length", resolution=0.01, orientation="h", - default_value=data.get("crossfade_length", "0.05"), + default_value=data.get("crossfade_length", 0.05), enable_events=True, ), ], @@ -355,7 +399,7 @@ if __name__ == "__main__": key="extra_time", resolution=0.01, orientation="h", - 
default_value=data.get("extra_time", "2.5"), + default_value=data.get("extra_time", 2.5), enable_events=True, ), ], @@ -370,6 +414,12 @@ if __name__ == "__main__": key="O_noise_reduce", enable_events=True, ), + sg.Checkbox( + i18n("启用相位声码器"), + key="use_pv", + default=data.get("use_pv", False), + enable_events=True, + ), # sg.Checkbox( # "JIT加速", # default=self.config.use_jit, @@ -443,6 +493,12 @@ if __name__ == "__main__": "index_path": values["index_path"], "sg_input_device": values["sg_input_device"], "sg_output_device": values["sg_output_device"], + "sr_type": ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ], "threhold": values["threhold"], "pitch": values["pitch"], "rms_mix_rate": values["rms_mix_rate"], @@ -454,6 +510,7 @@ if __name__ == "__main__": "n_cpu": values["n_cpu"], # "use_jit": values["use_jit"], "use_jit": False, + "use_pv": values["use_pv"], "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ [ values["pm"], @@ -477,6 +534,7 @@ if __name__ == "__main__": ) if values["I_noise_reduce"]: self.delay_time += values["crossfade_length"] + self.window["sr_stream"].update(self.gui_config.samplerate) self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: self.flag_vc = False @@ -505,6 +563,8 @@ if __name__ == "__main__": self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": self.gui_config.O_noise_reduce = values["O_noise_reduce"] + elif event == "use_pv": + self.gui_config.use_pv = values["use_pv"] elif event in ["vc", "im"]: self.function = event elif event != "start_vc" and self.flag_vc == True: @@ -531,6 +591,12 @@ if __name__ == "__main__": # self.device_latency = values["device_latency"] self.gui_config.pth_path = values["pth_path"] self.gui_config.index_path = values["index_path"] + self.gui_config.sr_type = ["sr_model", "sr_device"][ + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ] 
self.gui_config.threhold = values["threhold"] self.gui_config.pitch = values["pitch"] self.gui_config.block_time = values["block_time"] @@ -538,6 +604,7 @@ if __name__ == "__main__": self.gui_config.extra_time = values["extra_time"] self.gui_config.I_noise_reduce = values["I_noise_reduce"] self.gui_config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.use_pv = values["use_pv"] self.gui_config.rms_mix_rate = values["rms_mix_rate"] self.gui_config.index_rate = values["index_rate"] self.gui_config.n_cpu = values["n_cpu"] @@ -566,8 +633,8 @@ if __name__ == "__main__": self.config, self.rvc if hasattr(self, "rvc") else None, ) - self.gui_config.samplerate = self.rvc.tgt_sr - self.zc = self.rvc.tgt_sr // 100 + self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate() + self.zc = self.gui_config.samplerate // 100 self.block_frame = ( int( np.round( @@ -589,6 +656,7 @@ if __name__ == "__main__": ) * self.zc ) + self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc) self.sola_search_frame = self.zc self.extra_frame = ( int( @@ -622,14 +690,14 @@ if __name__ == "__main__": dtype="float64", ) self.sola_buffer: torch.Tensor = torch.zeros( - self.crossfade_frame, device=self.config.device, dtype=torch.float32 + self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 ) self.nr_buffer: torch.Tensor = self.sola_buffer.clone() self.output_buffer: torch.Tensor = self.input_wav.clone() self.res_buffer: torch.Tensor = torch.zeros( 2 * self.zc, device=self.config.device, dtype=torch.float32 ) - self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] + self.skip_head = self.extra_frame // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -637,7 +705,7 @@ if __name__ == "__main__": * torch.linspace( 0.0, 1.0, - steps=self.crossfade_frame, + steps=self.sola_buffer_frame, device=self.config.device, dtype=torch.float32, ) @@ -650,6 +718,14 @@ if __name__ == 
"__main__": new_freq=16000, dtype=torch.float32, ).to(self.config.device) + if self.rvc.tgt_sr != self.gui_config.samplerate: + self.resampler2 = tat.Resample( + orig_freq=self.rvc.tgt_sr, + new_freq=self.gui_config.samplerate, + dtype=torch.float32, + ).to(self.config.device) + else: + self.resampler2 = None self.tg = TorchGate( sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 ).to(self.config.device) @@ -710,11 +786,11 @@ if __name__ == "__main__": input_wav = self.tg( input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) )[0, 2 * self.zc :] - input_wav[: self.crossfade_frame] *= self.fade_in_window - input_wav[: self.crossfade_frame] += ( + input_wav[: self.sola_buffer_frame] *= self.fade_in_window + input_wav[: self.sola_buffer_frame] += ( self.nr_buffer * self.fade_out_window ) - self.nr_buffer[:] = input_wav[-self.crossfade_frame :] + self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] input_wav = torch.cat( (self.res_buffer[:], input_wav[: self.block_frame]) ) @@ -728,23 +804,16 @@ if __name__ == "__main__": )[160:] # infer if self.function == "vc": - f0_extractor_frame = self.block_frame_16k + 800 - if self.gui_config.f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) infer_wav = self.rvc.infer( self.input_wav_res, - self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), self.block_frame_16k, - self.valid_rate, + self.skip_head, self.pitch, self.pitchf, self.gui_config.f0method, ) - infer_wav = infer_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ] + if self.resampler2 is not None: + infer_wav = self.resampler2(infer_wav) else: infer_wav = self.input_wav[ -self.crossfade_frame - self.sola_search_frame - self.block_frame : @@ -794,13 +863,13 @@ if __name__ == "__main__": ) # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC conv_input = infer_wav[ - None, None, : self.crossfade_frame + self.sola_search_frame + 
None, None, : self.sola_buffer_frame + self.sola_search_frame ] cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) cor_den = torch.sqrt( F.conv1d( conv_input**2, - torch.ones(1, 1, self.crossfade_frame, device=self.config.device), + torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device), ) + 1e-8 ) @@ -813,9 +882,16 @@ if __name__ == "__main__": infer_wav = infer_wav[ sola_offset : sola_offset + self.block_frame + self.crossfade_frame ] - infer_wav[: self.crossfade_frame] *= self.fade_in_window - infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window - self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] + if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: + infer_wav[: self.sola_buffer_frame] *= self.fade_in_window + infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window + else: + infer_wav[: self.sola_buffer_frame] = phase_vocoder( + self.sola_buffer, + infer_wav[: self.sola_buffer_frame], + self.fade_out_window, + self.fade_in_window) + self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] if sys.platform == "darwin": outdata[:] = ( infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] @@ -864,7 +940,7 @@ if __name__ == "__main__": input_devices_indices, output_devices_indices, ) - + def set_devices(self, input_device, output_device): """设置输出设备""" ( @@ -881,5 +957,8 @@ if __name__ == "__main__": ] printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - + + def get_device_samplerate(self): + return int(sd.query_devices(device=sd.default.device[0])['default_samplerate']) + gui = GUI() diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index f25e724..c2750ee 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -722,7 +722,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module): 
def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -783,14 +784,14 @@ class SynthesizerTrnMs256NSFsid(nn.Module): pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - assert isinstance(rate, torch.Tensor) - head = int(z_p.shape[2] * (1 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] nsff0 = nsff0[:, head:] @@ -887,7 +888,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -941,13 +943,14 @@ class SynthesizerTrnMs768NSFsid(nn.Module): pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] nsff0 = nsff0[:, head:] @@ -1041,7 +1044,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): def 
remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -1087,13 +1091,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) @@ -1186,7 +1191,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() def __prepare_scriptable__(self): for hook in self.dec._forward_pre_hooks.values(): @@ -1232,13 +1238,14 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, - rate: Optional[torch.Tensor] = None, + skip_head: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate is not None: - head = int(z_p.shape[2] * (1.0 - rate.item())) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) z_p = z_p[:, :, head:] x_mask = x_mask[:, :, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) diff --git 
a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py index ef5fe58..b8db4fa 100644 --- a/infer/lib/jit/get_synthesizer.py +++ b/infer/lib/jit/get_synthesizer.py @@ -34,4 +34,5 @@ def get_synthesizer(pth_path, device=torch.device("cpu")): net_g.load_state_dict(cpt["weight"], strict=False) net_g = net_g.float() net_g.eval().to(device) + net_g.remove_weight_norm() return net_g, cpt diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index 9010d28..86c6899 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -593,16 +593,18 @@ class RMVPE: def infer_from_audio(self, audio, thred=0.03): # torch.cuda.synchronize() - t0 = ttime() + # t0 = ttime() + if not torch.is_tensor(audio): + audio = torch.from_numpy(audio) mel = self.mel_extractor( - torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True + audio.float().to(self.device).unsqueeze(0), center=True ) # print(123123123,mel.device.type) # torch.cuda.synchronize() - t1 = ttime() + # t1 = ttime() hidden = self.mel2hidden(mel) # torch.cuda.synchronize() - t2 = ttime() + # t2 = ttime() # print(234234,hidden.device.type) if "privateuseone" not in str(self.device): hidden = hidden.squeeze(0).cpu().numpy() @@ -613,7 +615,7 @@ class RMVPE: f0 = self.decode(hidden, thred=thred) # torch.cuda.synchronize() - t3 = ttime() + # t3 = ttime() # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) return f0 diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index f36ffb3..2d54732 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -46,23 +46,22 @@ def printt(strr, *args): # config.is_half=False########强制cpu测试 class RVC: def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, ) -> None: """ 初始化 """ try: if config.dml == True: - def forward_dml(ctx, x, scale): 
ctx.scale = scale res = x.clone().detach() @@ -76,13 +75,10 @@ class RVC: # device="cpu"########强制cpu测试 self.device = config.device self.f0_up_key = key - self.time_step = 160 / 16000 * 1000 self.f0_min = 50 self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.sr = 16000 - self.window = 160 self.n_cpu = n_cpu self.use_jit = self.config.use_jit self.is_half = config.is_half @@ -184,6 +180,7 @@ class RVC: if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): self.model_rmvpe = last_rvc.model_rmvpe if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): + self.device_fcpe = last_rvc.device_fcpe self.model_fcpe = last_rvc.model_fcpe except: printt(traceback.format_exc()) @@ -199,14 +196,10 @@ class RVC: self.index_rate = new_index_rate def get_f0_post(self, f0): - f0_min = self.f0_min - f0_max = self.f0_max - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 @@ -221,6 +214,7 @@ class RVC: return self.get_f0_rmvpe(x, f0_up_key) if method == "fcpe": return self.get_f0_fcpe(x, f0_up_key) + x = x.cpu().numpy() if method == "pm": p_len = x.shape[0] // 160 + 1 f0_min = 65 @@ -262,7 +256,7 @@ class RVC: self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) else: self.inp_q.put( - (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) + (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) ) while 1: res_ts = self.opt_q.get() @@ -277,20 +271,19 @@ class RVC: else: f0 = f0[2:] f0bak[ - part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] + part_length * idx // 160: part_length * idx // 160 + f0.shape[0] ] = f0 f0bak 
= signal.medfilt(f0bak, 3) f0bak *= pow(2, f0_up_key / 12) return self.get_f0_post(f0bak) def get_f0_crepe(self, x, f0_up_key): - if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 - return self.get_f0(x, f0_up_key, 1, "pm") - audio = torch.tensor(np.copy(x))[None].float() + if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿fcpe顶替 + return self.get_f0(x, f0_up_key, 1, "fcpe") # printt("using crepe,device:%s"%self.device) f0, pd = torchcrepe.predict( - audio, - self.sr, + x.unsqueeze(0).float(), + 16000, 160, self.f0_min, self.f0_max, @@ -313,15 +306,11 @@ class RVC: printt("Loading rmvpe model") self.model_rmvpe = RMVPE( - # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 - # "rmvpe.pt", is_half=False, device=self.device####dml配置 - # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 "assets/rmvpe/rmvpe.pt", is_half=self.is_half, - device=self.device, ####正常逻辑 + device=self.device, use_jit=self.config.use_jit, ) - # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) @@ -329,41 +318,36 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: from torchfcpe import spawn_bundled_infer_model - printt("Loading fcpe model") - self.model_fcpe = spawn_bundled_infer_model(self.device) - f0 = ( - self.model_fcpe.infer( - torch.from_numpy(x).to(self.device).unsqueeze(0).float(), - sr=16000, - decoder_mode="local_argmax", - threshold=0.006, - ) - .squeeze() - .cpu() - .numpy() - ) + if "privateuseone" in str(self.device): + self.device_fcpe = "cpu" + else: + self.device_fcpe = self.device + self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe) + f0 = self.model_fcpe.infer( + x.to(self.device_fcpe).unsqueeze(0).float(), + sr=16000, + 
decoder_mode='local_argmax', + threshold=0.006, + ).squeeze().cpu().numpy() f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) def infer( - self, - feats: torch.Tensor, - indata: np.ndarray, - block_frame_16k, - rate, - cache_pitch, - cache_pitchf, - f0method, + self, + input_wav: torch.Tensor, + block_frame_16k, + skip_head, + cache_pitch, + cache_pitchf, + f0method, ) -> np.ndarray: - feats = feats.view(1, -1) - if self.config.is_half: - feats = feats.half() - else: - feats = feats.float() - feats = feats.to(self.device) t1 = ttime() with torch.no_grad(): + if self.config.is_half: + feats = input_wav.half().view(1, -1) + else: + feats = input_wav.float().view(1, -1) padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) inputs = { "source": feats, @@ -387,8 +371,8 @@ class RVC: if self.config.is_half: npy = npy.astype("float16") feats[0][-leng_replace_head:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] ) else: printt("Index search FAILED or disabled") @@ -398,7 +382,13 @@ class RVC: feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t3 = ttime() if self.if_f0 == 1: - pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) + f0_extractor_frame = block_frame_16k + 800 + if f0method == "rmvpe": + f0_extractor_frame = ( + 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + ) + input_wav = input_wav[-f0_extractor_frame:] + pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method) start_frame = block_frame_16k // 160 end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) @@ -412,31 +402,28 @@ class RVC: t4 = ttime() feats = feats[:, :p_len, :] if self.if_f0 == 1: - cache_pitch = 
cache_pitch[:p_len] - cache_pitchf = cache_pitchf[:p_len] - cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) - cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) + cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0) + cache_pitchf = torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0) p_len = torch.LongTensor([p_len]).to(self.device) - ii = 0 # sid - sid = torch.LongTensor([ii]).to(self.device) + sid = torch.LongTensor([0]).to(self.device) + skip_head = torch.LongTensor([skip_head]) with torch.no_grad(): if self.if_f0 == 1: - # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) infered_audio = self.net_g.infer( feats, p_len, cache_pitch, cache_pitchf, sid, - torch.FloatTensor([rate]), + skip_head, )[0][0, 0].data.float() else: infered_audio = self.net_g.infer( - feats, p_len, sid, torch.FloatTensor([rate]) + feats, p_len, sid, skip_head )[0][0, 0].data.float() t5 = ttime() printt( - "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", + "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", t2 - t1, t3 - t2, t4 - t3, From d62e80fb8391e5b95fecdc24bac80436e3d54978 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 00:28:49 +0800 Subject: [PATCH 12/20] optimize real-time vc --- requirements-amd.txt | 1 + requirements-dml.txt | 1 + requirements-ipex.txt | 3 ++- requirements-win-for-realtime_vc_gui-dml.txt | 3 ++- requirements-win-for-realtime_vc_gui.txt | 1 + 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements-amd.txt b/requirements-amd.txt index aa81a88..d0976a7 100644 --- a/requirements-amd.txt +++ b/requirements-amd.txt @@ -46,3 +46,4 @@ fastapi==0.88 ffmpy==0.3.1 python-dotenv>=1.0.0 av +torchfcpe diff --git a/requirements-dml.txt b/requirements-dml.txt index a49ed2d..b4690ae 100644 --- a/requirements-dml.txt +++ b/requirements-dml.txt @@ -44,3 +44,4 @@ 
fastapi==0.88 ffmpy==0.3.1 python-dotenv>=1.0.0 av +torchfcpe \ No newline at end of file diff --git a/requirements-ipex.txt b/requirements-ipex.txt index 610a0ce..19ff424 100644 --- a/requirements-ipex.txt +++ b/requirements-ipex.txt @@ -51,4 +51,5 @@ ffmpy==0.3.1 python-dotenv>=1.0.0 av PySimpleGUI -sounddevice \ No newline at end of file +sounddevice +torchfcpe \ No newline at end of file diff --git a/requirements-win-for-realtime_vc_gui-dml.txt b/requirements-win-for-realtime_vc_gui-dml.txt index 6514989..9aaf56d 100644 --- a/requirements-win-for-realtime_vc_gui-dml.txt +++ b/requirements-win-for-realtime_vc_gui-dml.txt @@ -26,4 +26,5 @@ PySimpleGUI sounddevice gradio noisereduce -onnxruntime-directml \ No newline at end of file +onnxruntime-directml +torchfcpe \ No newline at end of file diff --git a/requirements-win-for-realtime_vc_gui.txt b/requirements-win-for-realtime_vc_gui.txt index 37ca238..e187f85 100644 --- a/requirements-win-for-realtime_vc_gui.txt +++ b/requirements-win-for-realtime_vc_gui.txt @@ -26,3 +26,4 @@ PySimpleGUI sounddevice gradio noisereduce +torchfcpe From d7fb651f7c3ed90c72a084030341c620ef4a1a4c Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 16:26:01 +0800 Subject: [PATCH 13/20] optimize real-time vc --- gui_v1.py | 20 ++++--------- infer/lib/infer_pack/models.py | 40 ++++++++++++++++--------- tools/rvc_for_realtime.py | 55 +++++++++++++++++----------------- 3 files changed, 58 insertions(+), 57 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 0f614e6..728cf7e 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -681,14 +681,6 @@ if __name__ == "__main__": device=self.config.device, dtype=torch.float32, ) - self.pitch: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="int32", - ) - self.pitchf: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="float64", - ) self.sola_buffer: torch.Tensor = torch.zeros( self.sola_buffer_frame, device=self.config.device, dtype=torch.float32 ) @@ -698,6 
+690,7 @@ if __name__ == "__main__": 2 * self.zc, device=self.config.device, dtype=torch.float32 ) self.skip_head = self.extra_frame // self.zc + self.return_length = (self.block_frame + self.sola_buffer_frame + self.sola_search_frame) // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -808,8 +801,7 @@ if __name__ == "__main__": self.input_wav_res, self.block_frame_16k, self.skip_head, - self.pitch, - self.pitchf, + self.return_length, self.gui_config.f0method, ) if self.resampler2 is not None: @@ -879,9 +871,7 @@ if __name__ == "__main__": else: sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[ - sola_offset : sola_offset + self.block_frame + self.crossfade_frame - ] + infer_wav = infer_wav[sola_offset :] if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: infer_wav[: self.sola_buffer_frame] *= self.fade_in_window infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window @@ -894,11 +884,11 @@ if __name__ == "__main__": self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] if sys.platform == "darwin": outdata[:] = ( - infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] + infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] ) else: outdata[:] = ( - infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() + infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy() ) total_time = time.perf_counter() - start_time self.window["infer_time"].update(int(total_time * 1000)) diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index c2750ee..a81c1de 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -785,16 +785,19 @@ class SynthesizerTrnMs256NSFsid(nn.Module): nsff0: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = 
self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - nsff0 = nsff0[:, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] + nsff0 = nsff0[:, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -944,16 +947,19 @@ class SynthesizerTrnMs768NSFsid(nn.Module): nsff0: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - nsff0 = nsff0[:, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] + nsff0 = nsff0[:, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1092,15 +1098,18 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): phone_lengths: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = 
(m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1239,15 +1248,18 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): phone_lengths: torch.Tensor, sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if skip_head is not None: + if skip_head is not None and return_length is not None: assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] + length = int(return_length.item()) + z_p = z_p[:, :, head: head + length] + x_mask = x_mask[:, :, head: head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 2d54732..257c44d 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -90,7 +90,9 @@ class RVC: self.pth_path: str = pth_path self.index_path = index_path self.index_rate = index_rate - + self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32") + self.cache_pitchf = np.zeros(1024, dtype="float32") + if last_rvc is None: models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( ["assets/hubert/hubert_base.pt"], @@ -329,8 +331,9 @@ 
class RVC: sr=16000, decoder_mode='local_argmax', threshold=0.006, - ).squeeze().cpu().numpy() + ) f0 *= pow(2, f0_up_key / 12) + f0 = f0.squeeze().cpu().numpy() return self.get_f0_post(f0) def infer( @@ -338,8 +341,7 @@ class RVC: input_wav: torch.Tensor, block_frame_16k, skip_head, - cache_pitch, - cache_pitchf, + return_length, f0method, ) -> np.ndarray: t1 = ttime() @@ -362,24 +364,22 @@ class RVC: t2 = ttime() try: if hasattr(self, "index") and self.index_rate != 0: - leng_replace_head = int(rate * feats[0].shape[0]) - npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") + npy = feats[0][skip_head // 2:].cpu().numpy().astype("float32") score, ix = self.index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) if self.config.is_half: npy = npy.astype("float16") - feats[0][-leng_replace_head:] = ( + feats[0][skip_head // 2:] = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] + + (1 - self.index_rate) * feats[0][skip_head // 2:] ) else: printt("Index search FAILED or disabled") except: traceback.print_exc() printt("Index search FAILED") - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) t3 = ttime() if self.if_f0 == 1: f0_extractor_frame = block_frame_16k + 800 @@ -387,40 +387,39 @@ class RVC: f0_extractor_frame = ( 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 ) - input_wav = input_wav[-f0_extractor_frame:] - pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method) + pitch, pitchf = self.get_f0(input_wav[-f0_extractor_frame: ], self.f0_up_key, self.n_cpu, f0method) start_frame = block_frame_16k // 160 - end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame - cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) - cache_pitchf[:] = np.append( - 
cache_pitchf[start_frame:end_frame], pitchf[3:-1] + end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame + self.cache_pitch[:] = np.append(self.cache_pitch[start_frame: end_frame], pitch[3:-1]) + self.cache_pitchf[:] = np.append( + self.cache_pitchf[start_frame: end_frame], pitchf[3:-1] ) - p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) - else: - cache_pitch, cache_pitchf = None, None - p_len = min(feats.shape[1], 13000) t4 = ttime() - feats = feats[:, :p_len, :] + p_len = input_wav.shape[0] // 160 if self.if_f0 == 1: - cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0) - cache_pitchf = torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0) + cache_pitch = torch.LongTensor(self.cache_pitch[-p_len: ]).to(self.device).unsqueeze(0) + cache_pitchf = torch.FloatTensor(self.cache_pitchf[-p_len: ]).to(self.device).unsqueeze(0) + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + feats = feats[:, :p_len, :] p_len = torch.LongTensor([p_len]).to(self.device) sid = torch.LongTensor([0]).to(self.device) skip_head = torch.LongTensor([skip_head]) + return_length = torch.LongTensor([return_length]) with torch.no_grad(): if self.if_f0 == 1: - infered_audio = self.net_g.infer( + infered_audio, _, _ = self.net_g.infer( feats, p_len, cache_pitch, cache_pitchf, sid, skip_head, - )[0][0, 0].data.float() + return_length, + ) else: - infered_audio = self.net_g.infer( - feats, p_len, sid, skip_head - )[0][0, 0].data.float() + infered_audio, _, _ = self.net_g.infer( + feats, p_len, sid, skip_head, return_length + ) t5 = ttime() printt( "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", @@ -429,4 +428,4 @@ class RVC: t4 - t3, t5 - t4, ) - return infered_audio + return infered_audio.squeeze().float() From 21775b187a2610be5faaf58a500eaf068620cde1 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 17:05:42 +0800 Subject: [PATCH 14/20] optimize real-time vc --- 
gui_v1.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 728cf7e..dc2bdc8 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -559,7 +559,7 @@ if __name__ == "__main__": if stream_latency > 0: self.delay_time += ( 1 if values["I_noise_reduce"] else -1 - ) * values["crossfade_length"] + ) * min(values["crossfade_length"], 0.04) self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": self.gui_config.O_noise_reduce = values["O_noise_reduce"] @@ -774,7 +774,7 @@ if __name__ == "__main__": # input noise reduction and resampling if self.gui_config.I_noise_reduce and self.function == "vc": input_wav = self.input_wav[ - -self.crossfade_frame - self.block_frame - 2 * self.zc : + -self.sola_buffer_frame - self.block_frame - 2 * self.zc : ] input_wav = self.tg( input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) @@ -783,7 +783,7 @@ if __name__ == "__main__": input_wav[: self.sola_buffer_frame] += ( self.nr_buffer * self.fade_out_window ) - self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] + self.nr_buffer[:] = input_wav[self.block_frame :] input_wav = torch.cat( (self.res_buffer[:], input_wav[: self.block_frame]) ) @@ -824,7 +824,7 @@ if __name__ == "__main__": # volume envelop mixing if self.gui_config.rms_mix_rate < 1 and self.function == "vc": rms1 = librosa.feature.rms( - y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] + y=self.input_wav_res[160 * self.skip_head : 160 * (self.skip_head + self.return_length)] .cpu() .numpy(), frame_length=640, From aed19c3c6b3f43c4d2e13dbb4631098a2a66c55e Mon Sep 17 00:00:00 2001 From: yxlllc Date: Tue, 26 Dec 2023 17:41:25 +0800 Subject: [PATCH 15/20] optimize real-time vc --- gui_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gui_v1.py b/gui_v1.py index dc2bdc8..e5c6757 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -533,7 +533,7 @@ if __name__ == "__main__": + 0.01 ) if 
values["I_noise_reduce"]: - self.delay_time += values["crossfade_length"] + self.delay_time += min(values["crossfade_length"], 0.04) self.window["sr_stream"].update(self.gui_config.samplerate) self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: From 997a956f4f0b0f9c7d1a9954145752825a1d1bd6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 22:01:20 +0900 Subject: [PATCH 16/20] chore(i18n): sync locale on dev (#1650) Co-authored-by: github-actions[bot] --- i18n/locale/en_US.json | 4 ++++ i18n/locale/es_ES.json | 4 ++++ i18n/locale/fr_FR.json | 4 ++++ i18n/locale/it_IT.json | 4 ++++ i18n/locale/ja_JP.json | 4 ++++ i18n/locale/ru_RU.json | 4 ++++ i18n/locale/tr_TR.json | 4 ++++ i18n/locale/zh_CN.json | 4 ++++ i18n/locale/zh_HK.json | 4 ++++ i18n/locale/zh_SG.json | 4 ++++ i18n/locale/zh_TW.json | 4 ++++ 11 files changed, 44 insertions(+) diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json index e4f5100..d585505 100644 --- a/i18n/locale/en_US.json +++ b/i18n/locale/en_US.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch processing for vocal accompaniment separation using the UVR5 model.
Example of a valid folder path format: D:\\path\\to\\input\\folder (copy it from the file manager address bar).
The model is divided into three categories:
1. Preserve vocals: Choose this option for audio without harmonies. It preserves vocals better than HP5. It includes two built-in models: HP2 and HP3. HP3 may slightly leak accompaniment but preserves vocals slightly better than HP2.
2. Preserve main vocals only: Choose this option for audio with harmonies. It may weaken the main vocals. It includes one built-in model: HP5.
3. De-reverb and de-delay models (by FoxJoy):
  (1) MDX-Net: The best choice for stereo reverb removal but cannot remove mono reverb;
 (234) DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverb and can remove mono reverb, but not very effectively for heavily reverberated high-frequency content.
De-reverb/de-delay notes:
1. The processing time for the DeEcho-DeReverb model is approximately twice as long as the other two DeEcho models.
2. The MDX-Net-Dereverb model is quite slow.
3. The recommended cleanest configuration is to apply MDX-Net first and then DeEcho-Aggressive.", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Enter the GPU index(es) separated by '-', e.g., 0-1-2 to use GPU 0, 1, and 2:", "伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "Save name:", "保存的文件名, 默认空为和源文件同名": "Save file name (default: same as the source file):", "保存的模型名不带后缀": "Saved model name (without extension):", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:", "否": "No", + "启用相位声码器": "启用相位声码器", "响应阈值": "Response threshold", "响度因子": "loudness factor", "处理数据": "Process data", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 'rmvpe' has the best results and consumes less CPU/GPU", + "采样率:": "采样率:", "采样长度": "Sample length", "重载设备列表": "Reload device list", "音调设置": "Pitch settings", diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json index 09ea011..08b8176 100644 --- a/i18n/locale/es_ES.json +++ b/i18n/locale/es_ES.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Procesamiento por lotes para la separación de acompañamiento vocal utilizando el modelo UVR5.
Ejemplo de formato de ruta de carpeta válido: D:\\ruta\\a\\la\\carpeta\\de\\entrada (copiar desde la barra de direcciones del administrador de archivos).
El modelo se divide en tres categorías:
1. Preservar voces: Elija esta opción para audio sin armonías. Preserva las voces mejor que HP5. Incluye dos modelos incorporados: HP2 y HP3. HP3 puede filtrar ligeramente el acompañamiento pero conserva las voces un poco mejor que HP2.
2. Preservar solo voces principales: Elija esta opción para audio con armonías. Puede debilitar las voces principales. Incluye un modelo incorporado: HP5.
3. Modelos de des-reverberación y des-retardo (por FoxJoy):
  (1) MDX-Net: La mejor opción para la eliminación de reverberación estéreo pero no puede eliminar la reverberación mono;
 (234) DeEcho: Elimina efectos de retardo. El modo Agresivo elimina más a fondo que el modo Normal. DeReverb adicionalmente elimina la reverberación y puede eliminar la reverberación mono, pero no muy efectivamente para contenido de alta frecuencia fuertemente reverberado.
Notas de des-reverberación/des-retardo:
1. El tiempo de procesamiento para el modelo DeEcho-DeReverb es aproximadamente el doble que los otros dos modelos DeEcho.
2. El modelo MDX-Net-Dereverb es bastante lento.
3. La configuración más limpia recomendada es aplicar primero MDX-Net y luego DeEcho-Agresivo.", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Separe los números de identificación de la GPU con '-' al ingresarlos. Por ejemplo, '0-1-2' significa usar GPU 0, GPU 1 y GPU 2.", "伴奏人声分离&去混响&去回声": "Separación de voz acompañante & eliminación de reverberación & eco", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "Guardar nombre", "保存的文件名, 默认空为和源文件同名": "Nombre del archivo que se guardará, el valor predeterminado es el mismo que el nombre del archivo de origen", "保存的模型名不带后缀": "Nombre del modelo guardado sin extensión.", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)", "后处理重采样至最终采样率,0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear", "否": "No", + "启用相位声码器": "启用相位声码器", "响应阈值": "Umbral de respuesta", "响度因子": "factor de sonoridad", "处理数据": "Procesar datos", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Seleccione el algoritmo de extracción de tono, las voces de entrada se pueden acelerar con pm, harvest tiene buenos graves pero es muy lento, crepe es bueno pero se come las GPUs", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Seleccione el algoritmo de extracción de tono, use 'pm' para acelerar la entrada de canto, 'harvest' es bueno para los graves pero extremadamente lento, 'crepe' tiene buenos resultados pero consume GPU", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Seleccione el algoritmo de extracción de tono: la canción de entrada se puede acelerar con pm, la voz de alta calidad pero CPU pobre se puede acelerar con dio, harvest es mejor pero más lento, rmvpe es el mejor y se come ligeramente la CPU/GPU", + "采样率:": "采样率:", "采样长度": "Longitud de muestreo", "重载设备列表": "Actualizar lista de dispositivos", "音调设置": "Ajuste de tono", diff --git 
a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json index cc6321c..db93e9a 100644 --- a/i18n/locale/fr_FR.json +++ b/i18n/locale/fr_FR.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Traitement en lot pour la séparation de la voix et de l'accompagnement vocal à l'aide du modèle UVR5.
Exemple d'un format de chemin de dossier valide : D:\\chemin\\vers\\dossier\\d'entrée (copiez-le depuis la barre d'adresse du gestionnaire de fichiers).
Le modèle est divisé en trois catégories :
1. Préserver la voix : Choisissez cette option pour l'audio sans harmonies. Elle préserve la voix mieux que HP5. Il comprend deux modèles intégrés : HP2 et HP3. HP3 peut légèrement laisser passer l'accompagnement mais préserve légèrement mieux la voix que HP2.
2. Préserver uniquement la voix principale : Choisissez cette option pour l'audio avec harmonies. Cela peut affaiblir la voix principale. Il comprend un modèle intégré : HP5.
3. Modèles de suppression de la réverbération et du délai (par FoxJoy) :
  (1) MDX-Net : Le meilleur choix pour la suppression de la réverbération stéréo, mais ne peut pas supprimer la réverbération mono.
  (234) DeEcho : Supprime les effets de délai. Le mode Aggressive supprime plus efficacement que le mode Normal. DeReverb supprime également la réverbération et peut supprimer la réverbération mono, mais pas très efficacement pour les contenus à haute fréquence fortement réverbérés.
Notes sur la suppression de la réverbération et du délai :
1. Le temps de traitement pour le modèle DeEcho-DeReverb est environ deux fois plus long que pour les autres deux modèles DeEcho.
2. Le modèle MDX-Net-Dereverb est assez lent.
3. La configuration la plus propre recommandée est d'appliquer d'abord MDX-Net, puis DeEcho-Aggressive.", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Entrez le(s) index GPU séparé(s) par '-', par exemple, 0-1-2 pour utiliser les GPU 0, 1 et 2 :", "伴奏人声分离&去混响&去回声": "Séparation des voix/accompagnement et suppression de la réverbération", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "Nom de sauvegarde :", "保存的文件名, 默认空为和源文件同名": "Nom du fichier de sauvegarde (par défaut : identique au nom du fichier source) :", "保存的模型名不带后缀": "Nom du modèle enregistré (sans extension) :", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :", "后处理重采样至最终采样率,0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. Réglez sur 0 pour ne pas effectuer de rééchantillonnage :", "否": "Non", + "启用相位声码器": "启用相位声码器", "响应阈值": "Seuil de réponse", "响度因子": "Facteur de volume sonore", "处理数据": "Traitement des données", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Sélection de l'algorithme d'extraction de la hauteur, les voix d'entrée peuvent être accélérées avec pm, harvest a de bonnes basses mais est très lent, crepe est bon mais consomme beaucoup de ressources GPU.", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Sélectionnez l'algorithme d'extraction de la hauteur de ton (\"pm\" : extraction plus rapide mais parole de moindre qualité ; \"harvest\" : meilleure basse mais extrêmement lente ; \"crepe\" : meilleure qualité mais utilisation intensive du GPU), \"rmvpe\" : meilleure qualité et peu d'utilisation du GPU.", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Sélection de l'algorithme d'extraction de la hauteur : la chanson d'entrée peut être traitée plus rapidement par pm, avec une voix de haute qualité mais un CPU médiocre, par dio, harvest est meilleur mais plus lent, rmvpe 
est le meilleur, mais consomme légèrement le CPU/GPU.", + "采样率:": "采样率:", "采样长度": "Longueur de l'échantillon", "重载设备列表": "Recharger la liste des dispositifs", "音调设置": "Réglages de la hauteur", diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json index fc31aa6..dc089be 100644 --- a/i18n/locale/it_IT.json +++ b/i18n/locale/it_IT.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Elaborazione batch per la separazione dell'accompagnamento vocale utilizzando il modello UVR5.
Esempio di un formato di percorso di cartella valido: D:\\path\\to\\input\\folder (copialo dalla barra degli indirizzi del file manager).
Il modello è suddiviso in tre categorie:
1. Conserva la voce: scegli questa opzione per l'audio senza armonie.
2. Mantieni solo la voce principale: scegli questa opzione per l'audio con armonie.
3. Modelli di de-riverbero e de-delay (di FoxJoy):
  (1) MDX-Net: la scelta migliore per la rimozione del riverbero stereo ma non può rimuovere il riverbero mono;

Note di de-riverbero/de-delay:
1. Il tempo di elaborazione per il modello DeEcho-DeReverb è circa il doppio rispetto agli altri due modelli DeEcho.
2. Il modello MDX-Net-Dereverb è piuttosto lento.
3. La configurazione più pulita consigliata consiste nell'applicare prima MDX-Net e poi DeEcho-Aggressive.", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Inserisci gli indici GPU separati da '-', ad esempio 0-1-2 per utilizzare GPU 0, 1 e 2:", "伴奏人声分离&去混响&去回声": "Separazione voce/accompagnamento", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "Salva nome:", "保存的文件名, 默认空为和源文件同名": "Salva il nome del file (predefinito: uguale al file di origine):", "保存的模型名不带后缀": "Nome del modello salvato (senza estensione):", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. ", "否": "NO", + "启用相位声码器": "启用相位声码器", "响应阈值": "Soglia di risposta", "响度因子": "fattore di sonorità", "处理数据": "Processa dati", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più veloce ma risultato di qualità inferiore; \"harvest\": bassi migliori ma estremamente lenti; \"crepe\": qualità migliore ma utilizzo intensivo della GPU):", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "Lunghezza del campione", "重载设备列表": "Ricaricare l'elenco dei dispositivi", "音调设置": "Impostazioni del tono", diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json index a96e4ba..c5b33ff 100644 --- a/i18n/locale/ja_JP.json +++ b/i18n/locale/ja_JP.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "UVR5モデルを使用したボーカル伴奏の分離バッチ処理。
有効なフォルダーパスフォーマットの例: D:\\path\\to\\input\\folder (エクスプローラーのアドレスバーからコピーします)。
モデルは三つのカテゴリに分かれています:
1. ボーカルを保持: ハーモニーのないオーディオに対してこれを選択します。HP5よりもボーカルをより良く保持します。HP2とHP3の二つの内蔵モデルが含まれています。HP3は伴奏をわずかに漏らす可能性がありますが、HP2よりもわずかにボーカルをより良く保持します。
2. 主なボーカルのみを保持: ハーモニーのあるオーディオに対してこれを選択します。主なボーカルを弱める可能性があります。HP5の一つの内蔵モデルが含まれています。
3. ディリバーブとディレイモデル (by FoxJoy):
  (1) MDX-Net: ステレオリバーブの除去に最適な選択肢ですが、モノリバーブは除去できません;
 (234) DeEcho: ディレイ効果を除去します。AggressiveモードはNormalモードよりも徹底的に除去します。DeReverbはさらにリバーブを除去し、モノリバーブを除去することができますが、高周波のリバーブが強い内容に対しては非常に効果的ではありません。
ディリバーブ/ディレイに関する注意点:
1. DeEcho-DeReverbモデルの処理時間は、他の二つのDeEchoモデルの約二倍です。
2. MDX-Net-Dereverbモデルは非常に遅いです。
3. 推奨される最もクリーンな設定は、最初にMDX-Netを適用し、その後にDeEcho-Aggressiveを適用することです。", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "ハイフンで区切って使用するGPUの番号を入力します。例えば0-1-2はGPU0、GPU1、GPU2を使用します", "伴奏人声分离&去混响&去回声": "伴奏ボーカル分離&残響除去&エコー除去", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "保存ファイル名", "保存的文件名, 默认空为和源文件同名": "保存するファイル名、デフォルトでは空欄で元のファイル名と同じ名前になります", "保存的模型名不带后缀": "拡張子のない保存するモデル名", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)", "后处理重采样至最终采样率,0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0", "否": "いいえ", + "启用相位声码器": "启用相位声码器", "响应阈值": "反応閾値", "响度因子": "ラウドネス係数", "处理数据": "データ処理", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "ピッチ抽出アルゴリズムの選択、歌声はpmで高速化でき、harvestは低音が良いが信じられないほど遅く、crepeは良く動くがGPUを食います。", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "ピッチ抽出アルゴリズムの選択、歌声はpmで高速化でき、harvestは低音が良いが信じられないほど遅く、crepeは良く動くがGPUを喰います", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "ピッチ抽出アルゴリズムの選択:歌声はpmで高速化でき、入力した音声が高音質でCPUが貧弱な場合はdioで高速化でき、harvestの方が良いが遅く、rmvpeがベストだがCPU/GPUを若干食います。", + "采样率:": "采样率:", "采样长度": "サンプル長", "重载设备列表": "デバイスリストをリロードする", "音调设置": "音程設定", diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json index 6bdff9c..f01bc8f 100644 --- a/i18n/locale/ru_RU.json +++ b/i18n/locale/ru_RU.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Пакетная обработка для разделения вокального сопровождения с использованием модели UVR5.
Пример допустимого формата пути к папке: D:\\path\\to\\input\\folder
Модель разделена на три категории:
1. Сохранить вокал: выберите этот вариант для звука без гармоний. Он сохраняет вокал лучше, чем HP5. Он включает в себя две встроенные модели: HP2 и HP3. HP3 может немного пропускать инструментал, но сохраняет вокал немного лучше, чем HP2.
2. Сохранить только основной вокал: выберите этот вариант для звука с гармониями. Это может ослабить основной вокал. Он включает одну встроенную модель: HP5.
3. Модели удаления реверберации и задержки (от FoxJoy):
  (1) MDX-Net: лучший выбор для удаления стереореверберации, но он не может удалить монореверберацию;
 (234) DeEcho: удаляет эффекты задержки. Агрессивный режим удаляет более тщательно, чем Нормальный режим. DeReverb дополнительно удаляет реверберацию и может удалять монореверберацию, но не очень эффективно для сильно реверберированного высокочастотного контента.
Примечания по удалению реверберации/задержки:
1. Время обработки для модели DeEcho-DeReverb примерно в два раза больше, чем для двух других моделей DeEcho.
2. Модель MDX-Net-Dereverb довольно медленная.
3. Рекомендуемая самая чистая конфигурация — сначала применить MDX-Net, а затем DeEcho-Aggressive.", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Введите, какие(-ую) GPU(-у) хотите использовать через '-', например 0-1-2, чтобы использовать GPU с номерами 0, 1 и 2:", "伴奏人声分离&去混响&去回声": "Разделение вокала/аккомпанемента и удаление эхо", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "Имя файла для сохранения:", "保存的文件名, 默认空为和源文件同名": "Название сохранённого файла (по умолчанию: такое же, как и у входного):", "保存的模型名不带后缀": "Имя файла модели для сохранения (без расширения):", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):", "后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:", "否": "Нет", + "启用相位声码器": "启用相位声码器", "响应阈值": "Порог ответа", "响度因子": "коэффициент громкости", "处理数据": "Обработать данные", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Выберите алгоритм оценки высоты голоса ('pm': работает быстро, но даёт низкое качество речи; 'harvest': басы лучше, но работает очень медленно; 'crepe': лучшее качество, но сильно нагружает GPU; 'rmvpe': лучшее качество и минимальная нагрузка на GPU):", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "Длина сэмпла", "重载设备列表": "Обновить список устройств", "音调设置": "Настройка высоты звука", diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json index 8fe7aa9..bd1c17b 100644 --- a/i18n/locale/tr_TR.json +++ b/i18n/locale/tr_TR.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch işleme kullanarak vokal eşlik ayrımı için UVR5 modeli kullanılır.
Geçerli bir klasör yol formatı örneği: D:\\path\\to\\input\\folder (dosya yöneticisi adres çubuğundan kopyalanır).
Model üç kategoriye ayrılır:
1. Vokalleri koru: Bu seçeneği, harmoni içermeyen sesler için kullanın. HP5'ten daha iyi bir şekilde vokalleri korur. İki dahili model içerir: HP2 ve HP3. HP3, eşlik sesini hafifçe sızdırabilir, ancak vokalleri HP2'den biraz daha iyi korur.
2. Sadece ana vokalleri koru: Bu seçeneği, harmoni içeren sesler için kullanın. Ana vokalleri zayıflatabilir. Bir dahili model içerir: HP5.
3. Reverb ve gecikme modelleri (FoxJoy tarafından):
  (1) MDX-Net: Stereo reverb'i kaldırmak için en iyi seçenek, ancak mono reverb'i kaldıramaz;
 (234) DeEcho: Gecikme efektlerini kaldırır. Agresif mod, Normal moda göre daha kapsamlı bir şekilde kaldırma yapar. DeReverb ayrıca reverb'i kaldırır ve mono reverb'i kaldırabilir, ancak yoğun yankılı yüksek frekanslı içerikler için çok etkili değildir.
Reverb/gecikme notları:
1. DeEcho-DeReverb modelinin işleme süresi diğer iki DeEcho modeline göre yaklaşık olarak iki kat daha uzundur.
2. MDX-Net-Dereverb modeli oldukça yavaştır.
3. Tavsiye edilen en temiz yapılandırma önce MDX-Net'i uygulamak ve ardından DeEcho-Aggressive uygulamaktır.", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "GPU indekslerini '-' ile ayırarak girin, örneğin 0-1-2, GPU 0, 1 ve 2'yi kullanmak için:", "伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "Kaydetme Adı:", "保存的文件名, 默认空为和源文件同名": "Kaydedilecek dosya adı (varsayılan: kaynak dosya ile aynı):", "保存的模型名不带后缀": "Kaydedilecek model adı (uzantı olmadan):", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:", "否": "Hayır", + "启用相位声码器": "启用相位声码器", "响应阈值": "Tepki eşiği", "响度因子": "ses yüksekliği faktörü", "处理数据": "Verileri işle", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Pitch algoritmasını seçin ('pm': daha hızlı çıkarır ancak daha düşük kaliteli konuşma; 'harvest': daha iyi konuşma sesi ancak son derece yavaş; 'crepe': daha da iyi kalite ancak GPU yoğunluğu gerektirir):", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "Örnekleme uzunluğu", "重载设备列表": "Cihaz listesini yeniden yükle", "音调设置": "Pitch ayarları", diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json index dd56851..32ca5ef 100644 --- a/i18n/locale/zh_CN.json +++ b/i18n/locale/zh_CN.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "保存名", "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", "保存的模型名不带后缀": "保存的模型名不带后缀", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", "否": "否", + "启用相位声码器": "启用相位声码器", "响应阈值": "响应阈值", "响度因子": "响度因子", "处理数据": "处理数据", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "采样长度", "重载设备列表": "重载设备列表", "音调设置": "音调设置", diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json index 51ebf75..93aaff3 100644 --- a/i18n/locale/zh_HK.json +++ b/i18n/locale/zh_HK.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "儲存名", "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", "保存的模型名不带后缀": "儲存的模型名不帶副檔名", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", "否": "否", + "启用相位声码器": "启用相位声码器", "响应阈值": "響應閾值", "响度因子": "響度因子", "处理数据": "處理資料", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "取樣長度", "重载设备列表": "重載設備列表", "音调设置": "音調設定", diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json index 51ebf75..93aaff3 100644 --- a/i18n/locale/zh_SG.json +++ b/i18n/locale/zh_SG.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "儲存名", "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", "保存的模型名不带后缀": "儲存的模型名不帶副檔名", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", "否": "否", + "启用相位声码器": "启用相位声码器", "响应阈值": "響應閾值", "响度因子": "響度因子", "处理数据": "處理資料", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "取樣長度", "重载设备列表": "重載設備列表", "音调设置": "音調設定", diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json index 51ebf75..93aaff3 100644 --- a/i18n/locale/zh_TW.json +++ b/i18n/locale/zh_TW.json @@ -26,6 +26,8 @@ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", "保存名": "儲存名", "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", "保存的模型名不带后缀": "儲存的模型名不帶副檔名", @@ -44,6 +46,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", "否": "否", + "启用相位声码器": "启用相位声码器", "响应阈值": "響應閾值", "响度因子": "響度因子", "处理数据": "處理資料", @@ -122,6 +125,7 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", "采样长度": "取樣長度", "重载设备列表": "重載設備列表", "音调设置": "音調設定", From 5449f84f06d48094ede804e09c3bda325a014d4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 22:03:02 +0900 Subject: [PATCH 17/20] chore(format): run black on dev (#1638) Co-authored-by: github-actions[bot] --- gui_v1.py | 67 +++++++++++++++++---------- infer/lib/infer_pack/models.py | 20 ++++---- infer/lib/train/mel_processing.py | 10 ++-- tools/rvc_for_realtime.py | 76 +++++++++++++++++-------------- 4 files changed, 102 insertions(+), 71 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index e5c6757..86b52d3 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -38,10 +38,14 @@ def phase_vocoder(a, b, fade_out, fade_in): deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5) w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase t = torch.arange(n).unsqueeze(-1).to(a) / n - result = a * (fade_out ** 2) + b * (fade_in ** 2) + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + result = ( 
+ a * (fade_out**2) + + b * (fade_in**2) + + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + ) return result - + class Harvest(multiprocessing.Process): def __init__(self, inp_q, opt_q): multiprocessing.Process.__init__(self) @@ -592,11 +596,11 @@ if __name__ == "__main__": self.gui_config.pth_path = values["pth_path"] self.gui_config.index_path = values["index_path"] self.gui_config.sr_type = ["sr_model", "sr_device"][ - [ - values["sr_model"], - values["sr_device"], - ].index(True) - ] + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ] self.gui_config.threhold = values["threhold"] self.gui_config.pitch = values["pitch"] self.gui_config.block_time = values["block_time"] @@ -633,7 +637,11 @@ if __name__ == "__main__": self.config, self.rvc if hasattr(self, "rvc") else None, ) - self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate() + self.gui_config.samplerate = ( + self.rvc.tgt_sr + if self.gui_config.sr_type == "sr_model" + else self.get_device_samplerate() + ) self.zc = self.gui_config.samplerate // 100 self.block_frame = ( int( @@ -690,7 +698,9 @@ if __name__ == "__main__": 2 * self.zc, device=self.config.device, dtype=torch.float32 ) self.skip_head = self.extra_frame // self.zc - self.return_length = (self.block_frame + self.sola_buffer_frame + self.sola_search_frame) // self.zc + self.return_length = ( + self.block_frame + self.sola_buffer_frame + self.sola_search_frame + ) // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -824,7 +834,11 @@ if __name__ == "__main__": # volume envelop mixing if self.gui_config.rms_mix_rate < 1 and self.function == "vc": rms1 = librosa.feature.rms( - y=self.input_wav_res[160 * self.skip_head : 160 * (self.skip_head + self.return_length)] + y=self.input_wav_res[ + 160 + * self.skip_head : 160 + * (self.skip_head + self.return_length) + ] .cpu() .numpy(), frame_length=640, @@ -871,21 +885,24 @@ if __name__ == 
"__main__": else: sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[sola_offset :] + infer_wav = infer_wav[sola_offset:] if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: infer_wav[: self.sola_buffer_frame] *= self.fade_in_window - infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window + infer_wav[: self.sola_buffer_frame] += ( + self.sola_buffer * self.fade_out_window + ) else: infer_wav[: self.sola_buffer_frame] = phase_vocoder( - self.sola_buffer, - infer_wav[: self.sola_buffer_frame], - self.fade_out_window, - self.fade_in_window) - self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] - if sys.platform == "darwin": - outdata[:] = ( - infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] + self.sola_buffer, + infer_wav[: self.sola_buffer_frame], + self.fade_out_window, + self.fade_in_window, ) + self.sola_buffer[:] = infer_wav[ + self.block_frame : self.block_frame + self.sola_buffer_frame + ] + if sys.platform == "darwin": + outdata[:] = infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] else: outdata[:] = ( infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy() @@ -930,7 +947,7 @@ if __name__ == "__main__": input_devices_indices, output_devices_indices, ) - + def set_devices(self, input_device, output_device): """设置输出设备""" ( @@ -947,8 +964,10 @@ if __name__ == "__main__": ] printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - + def get_device_samplerate(self): - return int(sd.query_devices(device=sd.default.device[0])['default_samplerate']) - + return int( + sd.query_devices(device=sd.default.device[0])["default_samplerate"] + ) + gui = GUI() diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index a81c1de..e489634 100644 --- a/infer/lib/infer_pack/models.py 
+++ b/infer/lib/infer_pack/models.py @@ -795,9 +795,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] - nsff0 = nsff0[:, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + nsff0 = nsff0[:, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -957,9 +957,9 @@ class SynthesizerTrnMs768NSFsid(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] - nsff0 = nsff0[:, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + nsff0 = nsff0[:, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1108,8 +1108,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1258,8 +1258,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] z = self.flow(z_p, x_mask, g=g, 
reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py index 14a960f..3751f1e 100644 --- a/infer/lib/train/mel_processing.py +++ b/infer/lib/train/mel_processing.py @@ -38,6 +38,7 @@ def spectral_de_normalize_torch(magnitudes): mel_basis = {} hann_window = {} + def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): """Convert waveform into Linear-frequency Linear-amplitude spectrogram. @@ -51,7 +52,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) Returns: :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram """ - + # Window - Cache if needed global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -60,7 +61,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( dtype=y.dtype, device=y.device ) - + # Padding y = torch.nn.functional.pad( y.unsqueeze(1), @@ -68,7 +69,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) mode="reflect", ) y = y.squeeze(1) - + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) spec = torch.stft( y, @@ -82,11 +83,12 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) onesided=True, return_complex=True, ) - + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) return spec + def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): # MelBasis - Cache if needed global mel_basis diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 257c44d..ff1ea88 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -46,22 +46,23 @@ def printt(strr, *args): # config.is_half=False########强制cpu测试 class RVC: def 
__init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, ) -> None: """ 初始化 """ try: if config.dml == True: + def forward_dml(ctx, x, scale): ctx.scale = scale res = x.clone().detach() @@ -92,7 +93,7 @@ class RVC: self.index_rate = index_rate self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32") self.cache_pitchf = np.zeros(1024, dtype="float32") - + if last_rvc is None: models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( ["assets/hubert/hubert_base.pt"], @@ -201,7 +202,7 @@ class RVC: f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( - self.f0_mel_max - self.f0_mel_min + self.f0_mel_max - self.f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 @@ -258,7 +259,7 @@ class RVC: self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) else: self.inp_q.put( - (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) + (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) ) while 1: res_ts = self.opt_q.get() @@ -273,7 +274,7 @@ class RVC: else: f0 = f0[2:] f0bak[ - part_length * idx // 160: part_length * idx // 160 + f0.shape[0] + part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] ] = f0 f0bak = signal.medfilt(f0bak, 3) f0bak *= pow(2, f0_up_key / 12) @@ -320,6 +321,7 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: from torchfcpe import spawn_bundled_infer_model + printt("Loading fcpe model") if "privateuseone" in str(self.device): self.device_fcpe = "cpu" @@ -329,7 +331,7 @@ class RVC: f0 = self.model_fcpe.infer( x.to(self.device_fcpe).unsqueeze(0).float(), sr=16000, - decoder_mode='local_argmax', + decoder_mode="local_argmax", threshold=0.006, ) f0 *= pow(2, f0_up_key / 12) @@ -337,12 +339,12 @@ class RVC: 
return self.get_f0_post(f0) def infer( - self, - input_wav: torch.Tensor, - block_frame_16k, - skip_head, - return_length, - f0method, + self, + input_wav: torch.Tensor, + block_frame_16k, + skip_head, + return_length, + f0method, ) -> np.ndarray: t1 = ttime() with torch.no_grad(): @@ -364,16 +366,16 @@ class RVC: t2 = ttime() try: if hasattr(self, "index") and self.index_rate != 0: - npy = feats[0][skip_head // 2:].cpu().numpy().astype("float32") + npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32") score, ix = self.index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) if self.config.is_half: npy = npy.astype("float16") - feats[0][skip_head // 2:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][skip_head // 2:] + feats[0][skip_head // 2 :] = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][skip_head // 2 :] ) else: printt("Index search FAILED or disabled") @@ -384,21 +386,29 @@ class RVC: if self.if_f0 == 1: f0_extractor_frame = block_frame_16k + 800 if f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) - pitch, pitchf = self.get_f0(input_wav[-f0_extractor_frame: ], self.f0_up_key, self.n_cpu, f0method) + f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + pitch, pitchf = self.get_f0( + input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method + ) start_frame = block_frame_16k // 160 end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame - self.cache_pitch[:] = np.append(self.cache_pitch[start_frame: end_frame], pitch[3:-1]) + self.cache_pitch[:] = np.append( + self.cache_pitch[start_frame:end_frame], pitch[3:-1] + ) self.cache_pitchf[:] = np.append( - self.cache_pitchf[start_frame: end_frame], pitchf[3:-1] + 
self.cache_pitchf[start_frame:end_frame], pitchf[3:-1] ) t4 = ttime() p_len = input_wav.shape[0] // 160 if self.if_f0 == 1: - cache_pitch = torch.LongTensor(self.cache_pitch[-p_len: ]).to(self.device).unsqueeze(0) - cache_pitchf = torch.FloatTensor(self.cache_pitchf[-p_len: ]).to(self.device).unsqueeze(0) + cache_pitch = ( + torch.LongTensor(self.cache_pitch[-p_len:]).to(self.device).unsqueeze(0) + ) + cache_pitchf = ( + torch.FloatTensor(self.cache_pitchf[-p_len:]) + .to(self.device) + .unsqueeze(0) + ) feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) feats = feats[:, :p_len, :] p_len = torch.LongTensor([p_len]).to(self.device) From 8bd34d1881b79570191db957429f03d74f3cd602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C6=A1n=20Phan=20Trung?= <94152483+sonphantrung@users.noreply.github.com> Date: Tue, 26 Dec 2023 20:11:16 +0700 Subject: [PATCH 18/20] chore: get run.sh to be POSIX Compliant (#1649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create CONTRIBUTING.md * Update CONTRIBUTING.md * Update CONTRIBUTING.md * fix: get run.sh posix compliant * Update venv.sh * Delete CONTRIBUTING.md --------- Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com> --- run.sh | 20 ++++++++++---------- venv.sh | 2 ++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/run.sh b/run.sh index 704c9ff..d3f720e 100755 --- a/run.sh +++ b/run.sh @@ -1,27 +1,27 @@ -#!/bin/bash +#!/bin/sh -if [[ "$(uname)" == "Darwin" ]]; then +if [ "$(uname)" = "Darwin" ]; then # macOS specific env: export PYTORCH_ENABLE_MPS_FALLBACK=1 export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 -elif [[ "$(uname)" != "Linux" ]]; then +elif [ "$(uname)" != "Linux" ]; then echo "Unsupported operating system." exit 1 fi if [ -d ".venv" ]; then echo "Activate venv..." - source .venv/bin/activate + . .venv/bin/activate else echo "Create venv..." requirements_file="requirements.txt" # Check if Python 3.8 is installed - if ! 
command -v python3 &> /dev/null; then + if ! command -v python3 >/dev/null 2>&1; then echo "Python 3 not found. Attempting to install 3.8..." - if [[ "$(uname)" == "Darwin" ]] && command -v brew &> /dev/null; then + if [ "$(uname)" = "Darwin" ] && command -v brew >/dev/null 2>&1; then brew install python@3.8 - elif [[ "$(uname)" == "Linux" ]] && command -v apt-get &> /dev/null; then + elif [ "$(uname)" = "Linux" ] && command -v apt-get >/dev/null 2>&1; then sudo apt-get update sudo apt-get install python3.8 else @@ -31,13 +31,13 @@ else fi python3 -m venv .venv - source .venv/bin/activate + . .venv/bin/activate # Check if required packages are installed and install them if not if [ -f "${requirements_file}" ]; then installed_packages=$(python3 -m pip freeze) while IFS= read -r package; do - [[ "${package}" =~ ^#.* ]] && continue + expr "${package}" : "^#.*" > /dev/null && continue package_name=$(echo "${package}" | sed 's/[<>=!].*//') if ! echo "${installed_packages}" | grep -q "${package_name}"; then echo "${package_name} not found. Attempting to install..." @@ -53,7 +53,7 @@ fi # Download models ./tools/dlmodels.sh -if [[ $? -ne 0 ]]; then +if [ $? -ne 0 ]; then exit 1 fi diff --git a/venv.sh b/venv.sh index aa23099..577283b 100755 --- a/venv.sh +++ b/venv.sh @@ -1 +1,3 @@ +#!/bin/sh + python3.8 -m venv .venv From 1b680a9690c1a9d174294b222a6b3b9a5dacea43 Mon Sep 17 00:00:00 2001 From: Derry Tutt <82726593+everypizza1@users.noreply.github.com> Date: Tue, 26 Dec 2023 07:52:02 -0600 Subject: [PATCH 19/20] Update README.en.md Made it seem more human. 
--- docs/en/README.en.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/en/README.en.md b/docs/en/README.en.md index 7e1889d..9c17df3 100644 --- a/docs/en/README.en.md +++ b/docs/en/README.en.md @@ -32,26 +32,25 @@ Realtime Voice Conversion GUI:go-realtime-gui.bat ![image](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/assets/129054828/143246a9-8b42-4dd1-a197-430ede4d15d7) -> The dataset for the pre-training model uses nearly 50 hours of high quality VCTK open source dataset. +> The dataset for the pre-training model uses nearly 50 hours of high quality audio from the VCTK open source dataset. -> High quality licensed song datasets will be added to training-set one after another for your use, without worrying about copyright infringement. +> High quality licensed song datasets will be added to the training-set often for your use, without having to worry about copyright infringement. > Please look forward to the pretrained base model of RVCv3, which has larger parameters, more training data, better results, unchanged inference speed, and requires less training data for training. -## Summary -This repository has the following features: +## Features: + Reduce tone leakage by replacing the source feature to training-set feature using top1 retrieval; -+ Easy and fast training, even on relatively poor graphics cards; -+ Training with a small amount of data also obtains relatively good results (>=10min low noise speech recommended); -+ Supporting model fusion to change timbres (using ckpt processing tab->ckpt merge); -+ Easy-to-use Webui interface; -+ Use the UVR5 model to quickly separate vocals and instruments. -+ Use the most powerful High-pitch Voice Extraction Algorithm [InterSpeech2023-RMVPE](#Credits) to prevent the muted sound problem. Provides the best results (significantly) and is faster, with even lower resource consumption than Crepe_full. 
-+ AMD/Intel graphics cards acceleration supported. ++ Easy + fast training, even on poor graphics cards; ++ Training with a small amount of data (>=10min low noise speech recommended); ++ Model fusion to change timbres (using ckpt processing tab->ckpt merge); ++ Easy-to-use WebUI; ++ UVR5 model to quickly separate vocals and instruments; ++ High-pitch Voice Extraction Algorithm [InterSpeech2023-RMVPE](#Credits) to prevent a muted sound problem. Provides the best results (significantly) and is faster with lower resource consumption than Crepe_full; ++ AMD/Intel graphics cards acceleration supported; + Intel ARC graphics cards acceleration with IPEX supported. ## Preparing the environment -The following commands need to be executed in the environment of Python version 3.8 or higher. +The following commands need to be executed with Python 3.8 or higher. (Windows/Linux) First install the main dependencies through pip: @@ -166,7 +165,7 @@ You might also need to set these environment variables (e.g. 
on a RX6700XT): export ROCM_PATH=/opt/rocm export HSA_OVERRIDE_GFX_VERSION=10.3.0 ```` -Also make sure your user is part of the `render` and `video` group: +Make sure your user is part of the `render` and `video` group: ```` sudo usermod -aG render $USERNAME sudo usermod -aG video $USERNAME From e56584c6e2d864b17fba17452e8cc46696a3dcc3 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Tue, 26 Dec 2023 22:24:04 +0800 Subject: [PATCH 20/20] Add files via upload --- infer/modules/uvr5/vr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py index d3fbac4..ed57784 100644 --- a/infer/modules/uvr5/vr.py +++ b/infer/modules/uvr5/vr.py @@ -307,14 +307,14 @@ class AudioPreDeEcho: sf.write( os.path.join( ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), + "vocal_{}_{}.{}".format(name, self.data["agg"], format), ), (np.array(wav_instrument) * 32768).astype("int16"), self.mp.param["sr"], ) # else: path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) + ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) ) sf.write( path, @@ -344,14 +344,14 @@ class AudioPreDeEcho: sf.write( os.path.join( vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), + "instrument_{}_{}.{}".format(name, self.data["agg"], format), ), (np.array(wav_vocals) * 32768).astype("int16"), self.mp.param["sr"], ) else: path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) + vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) ) sf.write( path,