From 13da7b3f960b79fce5e83bc2a57de77d79c0b5e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Sun, 16 Apr 2023 17:30:32 +0800 Subject: [PATCH 01/40] =?UTF-8?q?fix=20#73:=20=E4=BC=B4=E5=A5=8F=E4=BA=BA?= =?UTF-8?q?=E5=A3=B0=E5=88=86=E7=A6=BB=E6=97=B6=E6=8A=A5=E9=94=99=EF=BC=9A?= =?UTF-8?q?FileNotFoundError=20(#74)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix #73: 伴奏人声分离时报错:FileNotFoundError * Apply Code Formatter Change --------- Co-authored-by: fumiama --- i18n/locale_diff.py | 8 +++++--- uvr5_pack/utils.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/i18n/locale_diff.py b/i18n/locale_diff.py index e9f8861..6419266 100644 --- a/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -7,7 +7,9 @@ standard_file = "zh_CN.json" # Find all JSON files in the directory dir_path = "./" -languages = [f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file] +languages = [ + f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file +] # Load the standard file with open(standard_file, "r", encoding="utf-8") as f: @@ -35,8 +37,8 @@ for lang_file in languages: # Sort the keys of the language file to match the order of the standard file lang_data = OrderedDict( sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) - ) + ) # Save the updated language file with open(lang_file, "w", encoding="utf-8") as f: - json.dump(lang_data, f, ensure_ascii=False, indent=4) \ No newline at end of file + json.dump(lang_data, f, ensure_ascii=False, indent=4) diff --git a/uvr5_pack/utils.py b/uvr5_pack/utils.py index 30bc59e..1d91f96 100644 --- a/uvr5_pack/utils.py +++ b/uvr5_pack/utils.py @@ -4,7 +4,7 @@ from tqdm import tqdm import json -def load_data(file_name: str = "./uvr5_pack/data.json") -> dict: +def load_data(file_name: str = "./uvr5_pack/name_params.json") -> dict: with open(file_name, "r") as f: data = json.load(f) From f47627c65007d49872389829a5015d5c83714199 Mon Sep 17 00:00:00 2001 From: Nasser Aldosari Date: Sun, 16 Apr 2023 12:30:42 +0300 Subject: [PATCH 02/40] English translations (#75) Modified some existing translations and translated non-existing ones using GPT-4, manually proofread, may require double proofreading from experts to ensure accuracy. --- i18n/en_US.json | 110 ++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/i18n/en_US.json b/i18n/en_US.json index aae0ec0..2666262 100644 --- a/i18n/en_US.json +++ b/i18n/en_US.json @@ -1,99 +1,99 @@ { "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.", "模型推理": "Model inference", - "推理音色": "Inferencing timbre", - "刷新音色列表": "Refresh timbre list", - "卸载音色省显存": "Unload timbre to save GPU memory", - "请选择说话人id": "Please select a speaker id", - "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", - "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", - "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", + "推理音色": "Inferencing voice", + "刷新音色列表": "Refresh voice list", + "卸载音色省显存": "Unload voice to save GPU memory", + "请选择说话人id": "select a speaker ID", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male-to-female voice conversion, -12 key for female-to-male voice conversion. If the pitch range is too wide and causes distortion, adjust it to a suitable range by yourself.", + "变调(整数, 半音数量, 升八度12降八度-12)": "Pitch shifting (integer, number of semitones, raise by an octave +12 or lower by an octave -12)", + "输入待处理音频文件路径(默认是正确格式示例)": "Enter the file path of the audio to be processed (default is the correct format example)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Select the algorithm for pitch extraction. Use 'pm' to speed up for singing voices, or use 'harvest' for better low-pitched voices, but it is extremely slow.", "特征检索库文件路径": "Feature search database file path", "特征文件路径": "Feature file path", - "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 curve file, optional, one pitch per line, instead of default F0 and pitch shifting", "转换": "Conversion", "输出信息": "Output information", - "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", - "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", + "输出音频(右下角三个点,点了可以下载)": "Output audio (click the three dots in the lower right corner to download)", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion, input the folder containing audio files to be converted, or upload multiple audio files. The converted audio will be output in the specified folder (default opt).", "指定输出文件夹": "Specify output folder", "检索特征占比": "Search feature ratio", - "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", - "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", - "伴奏人声分离": "Accompaniment and vocal separation", - "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)": "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)", - "输入待处理音频文件夹路径": "Input audio folder path", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path to the audio folder to be processed (just copy it from the file manager address bar)", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Multiple audio files can also be inputted, either of the two options, with priority given to the folder", + "伴奏人声分离": "Instrumental and vocal separation", + "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)": "Batch processing of instrumental and vocal separation using UVR5 model.
Use HP2 for audio without harmony vocals; use HP5 for audio with harmony vocals where the extracted lead vocal should not keep those harmonies.<br>
Example of a qualified folder path: E:\\codes\\py39\\vits_vc_gpu\\test_sample (just copy it from the file manager address bar)", + "输入待处理音频文件夹路径": "Input the path to the audio folder to be processed", "模型": "Model", - "指定输出人声文件夹": "Specify output vocal folder", - "指定输出乐器文件夹": "Specify output instrumental folder", + "指定输出人声文件夹": "Specify vocals output folder", + "指定输出乐器文件夹": "Specify instrumentals output folder", "训练": "Train", - "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: Fill in the experiment configuration. Experiment data is stored in the 'logs' directory, with each experiment in a separate folder. The experiment name path needs to be entered manually and should contain the experiment configuration, logs, and trained model files.", "输入实验名": "Input experiment name", "目标采样率": "Target sample rate", - "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", - "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Whether the model has pitch guidance (necessary for singing, but not required for speech)", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: Automatically traverse the training folder and slice and normalize all audio files that can be decoded into audio. Two 'wav' folders will be generated in the experiment directory. Currently, only single-person training is supported.", "输入训练文件夹路径": "Input training folder path", "请指定说话人id": "Please specify speaker ID", "处理数据": "Process data", - "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", - "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: Use CPU to extract pitch (if the model has pitch guidance) and GPU to extract features (select card number).", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Separate the GPU id numbers with '-' when inputting. For example, '0-1-2' means using GPU 0, GPU 1, and GPU 2.", "显卡信息": "GPU information", - "提取音高使用的CPU进程数": "Number of CPU processes used for pitch extraction", + "提取音高使用的CPU进程数": "Number of CPU threads to use for pitch extraction", "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Select pitch extraction algorithm: Use 'pm' for faster processing of singing voice, 'dio' for high-quality speech but slower processing, and 'harvest' for the best quality but slowest processing.", "特征提取": "Feature extraction", - "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", - "保存频率save_every_epoch": "Save frequency (save_every_epoch)", + "step3: 填写训练设置, 开始训练模型和索引": "step3: Fill in the training settings and start training the model and index.", + "保存频率save_every_epoch": "Saving frequency (save_every_epoch)", "总训练轮数total_epoch": "Total training epochs (total_epoch)", "是否仅保存最新的ckpt文件以节省硬盘空间": "Whether to save only the latest ckpt file to save disk space", - "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Whether to cache all training sets in GPU memory. 
Small datasets (under 10 minutes) can be cached to speed up training, but caching large datasets can cause GPU memory errors and does not increase speed significantly.", "加载预训练底模G路径": "Load pre-trained base model G path.", "加载预训练底模D路径": "Load pre-trained base model D path.", "训练模型": "Train model.", "训练特征索引": "Train feature index.", "一键训练": "One-click training.", - "ckpt处理": "ckpt processing.", - "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", + "ckpt处理": "Ckpt processing.", + "模型融合, 可用于测试音色融合": "Model fusion, can be used for merging diffrent voices", "A模型路径": "A model path.", "B模型路径": "B model path.", - "A模型权重": "A model weight.", + "A模型权重": "A model weight for model A.", "模型是否带音高指导": "Whether the model has pitch guidance.", "要置入的模型信息": "Model information to be placed.", "保存的模型名不带后缀": "Saved model name without extension.", "融合": "Fusion.", - "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modify model information (only supports small model files extracted under the weights folder).", "模型路径": "Model path", "要改的模型信息": "Model information to be modified", - "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", + "保存的文件名, 默认空为和源文件同名": "Name of the file to be saved, default is the same as the source file name", "修改": "Modify", - "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "View model information (only applicable to small model files extracted from the 'weights' folder)", "查看": "View", - "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model extraction (input the path of a large model file in the 'logs' folder), applicable when you want to extract a small model file after training halfway and it was not saved automatically, or when you want to test an intermediate model", "保存名": "Save Name", - "模型是否带音高指导,1是0否": "Whether the model comes with pitch guidance, 1 for yes, 0 for no", + "模型是否带音高指导,1是0否": "Whether the model has pitch guidance, 1 for yes, 0 for no", "提取": "Extract", "招募音高曲线前端编辑器": "Recruit front-end editors for pitch curves", - "加开发群联系我xxxxx": "加开发群联系我xxxxx", + "加开发群联系我xxxxx": "Join the development group to contact me at xxxxx", "点击查看交流、问题反馈群号": "Click to view the communication and problem feedback group number", "xxxxx": "xxxxx", - "加载模型": "加载模型", - "Hubert模型": "Hubert File", - "选择.pth文件": "选择.pth文件", - "选择.index文件": "选择.index文件", - "选择.npy文件": "选择.npy文件", - "输入设备": "输入设备", - "输出设备": "输出设备", - "音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)", - "响应阈值": "响应阈值", - "音调设置": "音调设置", + "加载模型": "Load Model", + "Hubert模型": "Hubert Model", + "选择.pth文件": "Select .pth file", + "选择.index文件": "Select .index file", + "选择.npy文件": "Select .npy file", + "输入设备": "Input device", + "输出设备": "Output device", + "音频设备(请使用同种类驱动)": "Audio device (please use the same type of driver)", + "响应阈值": "Response threshold", + "音调设置": "Pitch setting", "Index Rate": "Index Rate", - "常规设置": "常规设置", - "采样长度": "采样长度", - "淡入淡出长度": "淡入淡出长度", - "额外推理时长": "额外推理时长", - "输入降噪": "Input Noisereduce", - "输出降噪": "Output Noisereduce", - "性能设置": "性能设置", - "开始音频转换": "开始音频转换", - "停止音频转换": "停止音频转换", + "常规设置": "General Settings", + "采样长度": "Sampling length", + "淡入淡出长度": "Fade in/out length", + "额外推理时长": "Additional inference time", + "输入降噪": "Input Noise Reduction", + "输出降噪": "Output Noise Reduction", + "性能设置": "Performance settings", + "开始音频转换": "Start Audio Conversion", + "停止音频转换": "Stop Audio Conversion", 
"推理时间(ms):": "Infer Time(ms):" -} \ No newline at end of file +} From 4ce152827c484ba450ebcba884ada4ab5b651ab7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Sun, 16 Apr 2023 17:30:50 +0800 Subject: [PATCH 03/40] =?UTF-8?q?fix:=20i18n=20rename=20=E4=B8=8D=E5=85=A8?= =?UTF-8?q?=20(#77)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- i18n.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/i18n.py b/i18n.py index bb9e585..d535482 100644 --- a/i18n.py +++ b/i18n.py @@ -4,7 +4,7 @@ import os def load_language_list(language): - with open(f"./locale/{language}.json", "r", encoding="utf-8") as f: + with open(f"./i18n/{language}.json", "r", encoding="utf-8") as f: language_list = json.load(f) return language_list @@ -15,7 +15,7 @@ class I18nAuto: language = "auto" if language == "auto": language = locale.getdefaultlocale()[0] - if not os.path.exists(f"./locale/{language}.json"): + if not os.path.exists(f"./i18n/{language}.json"): language = "en_US" self.language = language print("Use Language:", language) From 343aa6fc66604f0283ab89b29608573d1d86ff17 Mon Sep 17 00:00:00 2001 From: liujing04 <129054828+liujing04@users.noreply.github.com> Date: Sun, 16 Apr 2023 09:32:32 +0000 Subject: [PATCH 04/40] Update infer-web.py --- infer-web.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/infer-web.py b/infer-web.py index 54a1c4d..488dd7f 100644 --- a/infer-web.py +++ b/infer-web.py @@ -36,6 +36,10 @@ else: or "20" in gpu_name or "30" in gpu_name or "40" in gpu_name + or "A2" in gpu_name.upper() + or "A3" in gpu_name.upper() + or "A4" in gpu_name.upper() + or "P4" in gpu_name.upper() or "A50" in gpu_name.upper() or "70" in gpu_name or "80" in gpu_name From c935e75d5292b6ad49426ecc32f3fe1a3387698a Mon Sep 17 00:00:00 2001 From: liujing04 <129054828+liujing04@users.noreply.github.com> Date: Sun, 16 Apr 2023 09:56:31 +0000 Subject: [PATCH 05/40] Update Changelog_CN.md --- Changelog_CN.md | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/Changelog_CN.md b/Changelog_CN.md index 72ad004..a206001 100644 --- a/Changelog_CN.md +++ b/Changelog_CN.md @@ -1,15 +1,39 @@ 20230409 -修正训练参数,提升显卡平均利用率,A100最高从25%提升至90%左右,V100:50%->90%左右,2060S:60%->85%左右,P40:25%->95%左右,训练速度显著提升 + 1-修正训练参数,提升显卡平均利用率,A100最高从25%提升至90%左右,V100:50%->90%左右,2060S:60%->85%左右,P40:25%->95%左右,训练速度显著提升 -修正参数:总batch_size改为每张卡的batch_size + 2-修正参数:总batch_size改为每张卡的batch_size -修正total_epoch:最大限制100解锁至1000;默认10提升至默认20 + 3-修正total_epoch:最大限制100解锁至1000;默认10提升至默认20 -修复ckpt提取识别是否带音高错误导致推理异常的问题 + 4-修复ckpt提取识别是否带音高错误导致推理异常的问题 -修复分布式训练每个rank都保存一次ckpt的问题 + 5-修复分布式训练每个rank都保存一次ckpt的问题 -特征提取进行nan特征过滤 + 6-特征提取进行nan特征过滤 -修复静音输入输出随机辅音or噪声的问题(老版模型需要重做训练集重训) + 7-修复静音输入输出随机辅音or噪声的问题(老版模型需要重做训练集重训) + +20230416更新 + + 1-新增本地实时变声迷你GUI,双击go-realtime-gui.bat启动 + + 2-训练推理均对<50Hz的频段进行滤波过滤 + + 3-训练推理音高提取pyworld最低音高从默认80下降至50,50-80hz间的男声低音不会哑 + + 4-WebUI支持根据系统区域变更语言(现支持en_US,ja_JP,zh_CN,zh_HK,zh_SG,zh_TW,不支持的默认en_US) + + 5-修正部分显卡识别(例如V100-16G识别失败,P4识别失败) + +后续计划: + + 1-收集呼吸wav加入训练集修正呼吸变声电音的问题 + + 2-研究更优的默认faiss索引配置,计划将索引打包进weights/xxx.pth中,取消推理界面的 特征/检索库 选择 + + 3-根据显存情况和显卡架构自动给到最优配置(batch size,训练集切块,推理音频长度相关的config,训练是否fp16),未来所有>=4G显存的>=pascal架构的显卡都可以训练或推理,而<4G显存的显卡不会进行支持 + + 4-我们正在训练增加了歌声训练集的底模,未来会公开 + + 5-推理音高识别选项加入"是否开启中值滤波" From ae8aaf7624933489ddfcedd3952a65d5ad3404d0 Mon Sep 17 00:00:00 2001 From: liujing04 <129054828+liujing04@users.noreply.github.com> Date: Sun, 16 Apr 2023 
10:06:15 +0000 Subject: [PATCH 06/40] Update trainset_preprocess_pipeline_print.py --- trainset_preprocess_pipeline_print.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 40617a1..68b89d2 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -31,7 +31,7 @@ class PreProcess: def __init__(self, sr, exp_dir): self.slicer = Slicer( sr=sr, - threshold=-32, + threshold=-40, min_length=800, min_interval=400, hop_size=15, From 5bcaa171eaa8ce99764f5b6d60fa78f13c9dee9d Mon Sep 17 00:00:00 2001 From: liujing04 <129054828+liujing04@users.noreply.github.com> Date: Sun, 16 Apr 2023 10:43:53 +0000 Subject: [PATCH 07/40] Update Changelog_CN.md --- Changelog_CN.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Changelog_CN.md b/Changelog_CN.md index a206001..f8b58d6 100644 --- a/Changelog_CN.md +++ b/Changelog_CN.md @@ -37,3 +37,6 @@  4-我们正在训练增加了歌声训练集的底模,未来会公开  5-推理音高识别选项加入"是否开启中值滤波" + + 6-增加选项:每次epoch保存的小模型均进行提取; 增加选项:设置默认测试集音频,每次保存的小模型均在保存后对其进行推理导出,用户可试听(来选择哪个中间epoch最好) + From 0719b4aa5eae1f18bfb3e87648bd1acd4abdc0a9 Mon Sep 17 00:00:00 2001 From: liujing04 <129054828+liujing04@users.noreply.github.com> Date: Sun, 16 Apr 2023 18:56:20 +0800 Subject: [PATCH 08/40] Add files via upload --- go-realtime-gui.bat | 2 ++ gui.py | 5 ++++- infer-web.py | 4 +++- train_nsf_sim_cache_sid_load_pretrain.py | 5 +++-- trainset_preprocess_pipeline_print.py | 3 +++ vc_infer_pipeline.py | 4 +++- 6 files changed, 18 insertions(+), 5 deletions(-) create mode 100644 go-realtime-gui.bat diff --git a/go-realtime-gui.bat b/go-realtime-gui.bat new file mode 100644 index 0000000..ed07321 --- /dev/null +++ b/go-realtime-gui.bat @@ -0,0 +1,2 @@ +runtime\python.exe gui.py +pause diff --git a/gui.py b/gui.py index 19b8fed..24fcf1e 100644 --- a/gui.py +++ b/gui.py @@ -1,3 +1,6 @@ +import os,sys +now_dir = os.getcwd() +sys.path.append(now_dir) import PySimpleGUI as sg import sounddevice as sd import noisereduce as nr @@ -12,7 +15,7 @@ from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFs from i18n import I18nAuto i18n = I18nAuto() - +print(i18n.language_map) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/infer-web.py b/infer-web.py index 488dd7f..00d55b2 100644 --- a/infer-web.py +++ b/infer-web.py @@ -139,6 +139,8 @@ def vc_single( if hubert_model == None: load_hubert() if_f0 = cpt.get("f0", 1) + file_index = file_index.strip(" ").strip('"').strip("\n").strip('"').strip(" ").replace("trained","added")#防止小白写错,自动帮他替换掉 + file_big_npy = file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") audio_opt = vc.pipeline( hubert_model, net_g, @@ -936,7 +938,7 @@ with gr.Blocks() as app: minimum=0, maximum=1, label="检索特征占比", - value=1, + value=0.6, interactive=True, ) f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py index 1735201..fd38dd5 100644 --- a/train_nsf_sim_cache_sid_load_pretrain.py +++ b/train_nsf_sim_cache_sid_load_pretrain.py @@ -21,7 +21,7 @@ import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler from infer_pack import commons - +from time import sleep from time import time as ttime from data_utils import ( TextAudioLoaderMultiNSFsid, @@ -45,7 +45,7 @@ global_step = 0 def main(): # n_gpus = 
torch.cuda.device_count() os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "5555" + os.environ["MASTER_PORT"] = "51515" mp.spawn( run, @@ -676,6 +676,7 @@ def train_and_evaluate( "saving final ckpt:%s" % (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch)) ) + sleep(1) os._exit(2333333) diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 68b89d2..5167c82 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -1,4 +1,5 @@ import sys, os, multiprocessing +from scipy import signal now_dir = os.getcwd() sys.path.append(now_dir) @@ -38,6 +39,7 @@ class PreProcess: max_sil_kept=150, ) self.sr = sr + self.bh, self.ah = signal.butter(N=5, Wn=48, btype='high', fs=self.sr) self.per = 3.7 self.overlap = 0.3 self.tail = self.per + self.overlap @@ -69,6 +71,7 @@ class PreProcess: def pipeline(self, path, idx0): try: audio = load_audio(path, self.sr) + audio = signal.filtfilt(self.bh, self.ah, audio) idx1 = 0 for audio in self.slicer.slice(audio): i = 0 diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py index 0668e41..1fa41d1 100644 --- a/vc_infer_pipeline.py +++ b/vc_infer_pipeline.py @@ -4,7 +4,8 @@ import torch.nn.functional as F from config import x_pad, x_query, x_center, x_max import scipy.signal as signal import pyworld, os, traceback, faiss - +from scipy import signal +bh, ah = signal.butter(N=5, Wn=48, btype='high', fs=16000) class VC(object): def __init__(self, tgt_sr, device, is_half): @@ -189,6 +190,7 @@ class VC(object): index = big_npy = None else: index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") opt_ts = [] if audio_pad.shape[0] > self.t_max: From bfe974ea9f8477532f37782b79f06809bbb27b57 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 17 Apr 2023 11:49:57 +0900 Subject: [PATCH 09/40] Fix action when PR send (#83) --- .github/workflows/pull_format.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pull_format.yml b/.github/workflows/pull_format.yml index 1a3f9ca..60f6b4d 100644 --- a/.github/workflows/pull_format.yml +++ b/.github/workflows/pull_format.yml @@ -2,19 +2,21 @@ name: pull format on: [pull_request] +permissions: + contents: write jobs: pull_format: - permissions: - actions: write - checks: write - contents: write runs-on: ubuntu-latest continue-on-error: true steps: - - uses: actions/checkout@v3 + - name: checkout + continue-on-error: true + uses: actions/checkout@v3 with: ref: ${{ github.head_ref }} - + fetch-depth: 0 + + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: From a4c64b0253f3a519e646bd749a9043705f36daec Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 17 Apr 2023 12:09:05 +0900 Subject: [PATCH 10/40] Autoformat when pushed directly (#79) * Create push_format.yml * remove unused --- .github/workflows/push_format.yml | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 .github/workflows/push_format.yml diff --git a/.github/workflows/push_format.yml b/.github/workflows/push_format.yml new file mode 100644 index 0000000..f43741b --- /dev/null +++ b/.github/workflows/push_format.yml @@ -0,0 +1,50 @@ +name: push format + +on: + push: + branches: + - main + +jobs: + push_format: + permissions: + actions: write + checks: write + contents: write + issues: 
write + repository-projects: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + ref: ${{github.ref_name}} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Black + run: pip install black + + - name: Run Black + # run: black $(git ls-files '*.py') + run: black . + + - name: Commit Back + continue-on-error: true + id: commitback + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add --all + git commit -m "Format code" + + - name: Create Pull Request + if: steps.commitback.outcome == 'success' + continue-on-error: true + uses: peter-evans/create-pull-request@v4 + with: + body: Apply Code Formatter Change + commit-message: Automatic code format From 5ab6713bb3f711a05c180c3c908db8d4fa053eb0 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 17 Apr 2023 17:15:59 +0900 Subject: [PATCH 11/40] fix permission (#87) --- .github/workflows/push_format.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/push_format.yml b/.github/workflows/push_format.yml index f43741b..c4ce77f 100644 --- a/.github/workflows/push_format.yml +++ b/.github/workflows/push_format.yml @@ -5,15 +5,11 @@ on: branches: - main +permissions: + contents: write + pull-requests: write jobs: push_format: - permissions: - actions: write - checks: write - contents: write - issues: write - repository-projects: write - pull-requests: write runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 From b0f8a4c7d17ee9d84c9415d40c03a60c01ea5fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Mon, 17 Apr 2023 20:49:29 +0800 Subject: [PATCH 12/40] fix: json format (#84) * Update extract_locale.py * Apply Code Formatter Change * Update locale_diff.py * Apply Code Formatter Change --------- Co-authored-by: fumiama --- extract_locale.py | 1 + gui.py | 3 ++- i18n/locale_diff.py | 1 + infer-web.py | 13 +++++++++++-- trainset_preprocess_pipeline_print.py | 2 +- vc_infer_pipeline.py | 4 +++- 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/extract_locale.py b/extract_locale.py index 546335b..c42bda5 100644 --- a/extract_locale.py +++ b/extract_locale.py @@ -28,3 +28,4 @@ process("gui.py") # Save as a JSON file with open("./i18n/zh_CN.json", "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) + f.write("\n") diff --git a/gui.py b/gui.py index 24fcf1e..db9a8d6 100644 --- a/gui.py +++ b/gui.py @@ -1,4 +1,5 @@ -import os,sys +import os, sys + now_dir = os.getcwd() sys.path.append(now_dir) import PySimpleGUI as sg diff --git a/i18n/locale_diff.py b/i18n/locale_diff.py index 6419266..2572779 100644 --- a/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -42,3 +42,4 @@ for lang_file in languages: # Save the updated language file with open(lang_file, "w", encoding="utf-8") as f: json.dump(lang_data, f, ensure_ascii=False, indent=4) + f.write("\n") diff --git a/infer-web.py b/infer-web.py index 00d55b2..b027f0e 100644 --- a/infer-web.py +++ b/infer-web.py @@ -139,8 +139,17 @@ def vc_single( if hubert_model == None: load_hubert() if_f0 = cpt.get("f0", 1) - file_index = file_index.strip(" ").strip('"').strip("\n").strip('"').strip(" ").replace("trained","added")#防止小白写错,自动帮他替换掉 - file_big_npy = 
file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + file_index = ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) # 防止小白写错,自动帮他替换掉 + file_big_npy = ( + file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) audio_opt = vc.pipeline( hubert_model, net_g, diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 5167c82..caaf533 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -39,7 +39,7 @@ class PreProcess: max_sil_kept=150, ) self.sr = sr - self.bh, self.ah = signal.butter(N=5, Wn=48, btype='high', fs=self.sr) + self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) self.per = 3.7 self.overlap = 0.3 self.tail = self.per + self.overlap diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py index 1fa41d1..f85f355 100644 --- a/vc_infer_pipeline.py +++ b/vc_infer_pipeline.py @@ -5,7 +5,9 @@ from config import x_pad, x_query, x_center, x_max import scipy.signal as signal import pyworld, os, traceback, faiss from scipy import signal -bh, ah = signal.butter(N=5, Wn=48, btype='high', fs=16000) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + class VC(object): def __init__(self, tgt_sr, device, is_half): From 88a43e14d164222d13451af1140dbbbf22da5324 Mon Sep 17 00:00:00 2001 From: EntropyRiser <82279347+EntropyRiser@users.noreply.github.com> Date: Mon, 17 Apr 2023 20:49:42 +0800 Subject: [PATCH 13/40] Add non-search inference support. (#82) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com> --- gui.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gui.py b/gui.py index db9a8d6..ecd1d81 100644 --- a/gui.py +++ b/gui.py @@ -16,7 +16,6 @@ from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFs from i18n import I18nAuto i18n = I18nAuto() -print(i18n.language_map) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -33,10 +32,11 @@ class RVC: self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.index = faiss.read_index(index_path) + if index_rate !=0: + self.index = faiss.read_index(index_path) + self.big_npy = np.load(npy_path) + print('index search enabled') self.index_rate = index_rate - """NOT YET USED""" - self.big_npy = np.load(npy_path) model_path = hubert_path print("load model(s) from {}".format(model_path)) models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( @@ -112,8 +112,8 @@ class RVC: ####索引优化 if ( - isinstance(self.index, type(None)) == False - and isinstance(self.big_npy, type(None)) == False + hasattr(self,'index') + and hasattr(self,'big_npy') and self.index_rate != 0 ): npy = feats[0].cpu().numpy().astype("float32") @@ -123,6 +123,8 @@ class RVC: torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate + (1 - self.index_rate) * feats ) + else: + print('index search FAIL or disabled') feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) torch.cuda.synchronize() From 35379217e810a9e8f7f353823f9b402e785d35c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Mon, 17 Apr 2023 20:49:54 +0800 Subject: [PATCH 14/40] =?UTF-8?q?=E4=BC=98=E5=8C=96=20change=20log=20?= 
=?UTF-8?q?=E6=A0=BC=E5=BC=8F=20(#86)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 优化 change log 格式 * Apply Code Formatter Change --------- Co-authored-by: fumiama --- Changelog_CN.md | 61 +++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 40 deletions(-) diff --git a/Changelog_CN.md b/Changelog_CN.md index f8b58d6..2e23717 100644 --- a/Changelog_CN.md +++ b/Changelog_CN.md @@ -1,42 +1,23 @@ -20230409 +### 20230409 +- 修正训练参数,提升显卡平均利用率,A100最高从25%提升至90%左右,V100:50%->90%左右,2060S:60%->85%左右,P40:25%->95%左右,训练速度显著提升 +- 修正参数:总batch_size改为每张卡的batch_size +- 修正total_epoch:最大限制100解锁至1000;默认10提升至默认20 +- 修复ckpt提取识别是否带音高错误导致推理异常的问题 +- 修复分布式训练每个rank都保存一次ckpt的问题 +- 特征提取进行nan特征过滤 +- 修复静音输入输出随机辅音or噪声的问题(老版模型需要重做训练集重训) - 1-修正训练参数,提升显卡平均利用率,A100最高从25%提升至90%左右,V100:50%->90%左右,2060S:60%->85%左右,P40:25%->95%左右,训练速度显著提升 - - 2-修正参数:总batch_size改为每张卡的batch_size - - 3-修正total_epoch:最大限制100解锁至1000;默认10提升至默认20 - - 4-修复ckpt提取识别是否带音高错误导致推理异常的问题 - - 5-修复分布式训练每个rank都保存一次ckpt的问题 - - 6-特征提取进行nan特征过滤 - - 7-修复静音输入输出随机辅音or噪声的问题(老版模型需要重做训练集重训) - -20230416更新 - - 1-新增本地实时变声迷你GUI,双击go-realtime-gui.bat启动 - - 2-训练推理均对<50Hz的频段进行滤波过滤 - - 3-训练推理音高提取pyworld最低音高从默认80下降至50,50-80hz间的男声低音不会哑 - - 4-WebUI支持根据系统区域变更语言(现支持en_US,ja_JP,zh_CN,zh_HK,zh_SG,zh_TW,不支持的默认en_US) - - 5-修正部分显卡识别(例如V100-16G识别失败,P4识别失败) - -后续计划: - - 1-收集呼吸wav加入训练集修正呼吸变声电音的问题 - - 2-研究更优的默认faiss索引配置,计划将索引打包进weights/xxx.pth中,取消推理界面的 特征/检索库 选择 - - 3-根据显存情况和显卡架构自动给到最优配置(batch size,训练集切块,推理音频长度相关的config,训练是否fp16),未来所有>=4G显存的>=pascal架构的显卡都可以训练或推理,而<4G显存的显卡不会进行支持 - - 4-我们正在训练增加了歌声训练集的底模,未来会公开 - - 5-推理音高识别选项加入"是否开启中值滤波" - - 6-增加选项:每次epoch保存的小模型均进行提取; 增加选项:设置默认测试集音频,每次保存的小模型均在保存后对其进行推理导出,用户可试听(来选择哪个中间epoch最好) +### 20230416更新 +- 新增本地实时变声迷你GUI,双击go-realtime-gui.bat启动 +- 训练推理均对<50Hz的频段进行滤波过滤 +- 训练推理音高提取pyworld最低音高从默认80下降至50,50-80hz间的男声低音不会哑 +- WebUI支持根据系统区域变更语言(现支持en_US,ja_JP,zh_CN,zh_HK,zh_SG,zh_TW,不支持的默认en_US) +- 修正部分显卡识别(例如V100-16G识别失败,P4识别失败) +### 后续计划: +- 收集呼吸wav加入训练集修正呼吸变声电音的问题 +- 研究更优的默认faiss索引配置,计划将索引打包进weights/xxx.pth中,取消推理界面的 特征/检索库 选择 +- 根据显存情况和显卡架构自动给到最优配置(batch size,训练集切块,推理音频长度相关的config,训练是否fp16),未来所有>=4G显存的>=pascal架构的显卡都可以训练或推理,而<4G显存的显卡不会进行支持 +- 我们正在训练增加了歌声训练集的底模,未来会公开 +- 推理音高识别选项加入"是否开启中值滤波" +- 增加选项:每次epoch保存的小模型均进行提取; 增加选项:设置默认测试集音频,每次保存的小模型均在保存后对其进行推理导出,用户可试听(来选择哪个中间epoch最好) From 1e71efb2656e2da7394503b621ea608021b647dd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 17 Apr 2023 14:09:03 +0000 Subject: [PATCH 15/40] Format code (#89) Co-authored-by: github-actions[bot] --- gui.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/gui.py b/gui.py index ecd1d81..9aee2e9 100644 --- a/gui.py +++ b/gui.py @@ -32,10 +32,10 @@ class RVC: self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - if index_rate !=0: + if index_rate != 0: self.index = faiss.read_index(index_path) self.big_npy = np.load(npy_path) - print('index search enabled') + print("index search enabled") self.index_rate = index_rate model_path = hubert_path print("load model(s) from {}".format(model_path)) @@ -111,11 +111,7 @@ class RVC: feats = self.model.final_proj(logits[0]) ####索引优化 - if ( - hasattr(self,'index') - and hasattr(self,'big_npy') - and self.index_rate != 0 - ): + if hasattr(self, "index") and hasattr(self, "big_npy") and self.index_rate != 0: npy = feats[0].cpu().numpy().astype("float32") _, I = self.index.search(npy, 1) npy = 
self.big_npy[I.squeeze()].astype("float16") @@ -124,7 +120,7 @@ class RVC: + (1 - self.index_rate) * feats ) else: - print('index search FAIL or disabled') + print("index search FAIL or disabled") feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) torch.cuda.synchronize() From 294b751e34273fc60cdc7bb30ceb3000cfb65e01 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 17 Apr 2023 23:37:00 +0900 Subject: [PATCH 16/40] some change translation (#91) --- i18n/ja_JP.json | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/i18n/ja_JP.json b/i18n/ja_JP.json index 270f464..ddc362f 100644 --- a/i18n/ja_JP.json +++ b/i18n/ja_JP.json @@ -11,17 +11,17 @@ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "ピッチ抽出アルゴリズムを選択してください。歌声の場合は、pmを使用して速度を上げることができます。低音が重要な場合は、harvestを使用できますが、非常に遅くなります。", "特征检索库文件路径": "特徴量検索データベースのファイルパス", "特征文件路径": "特徴量ファイルのパス", - "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0(最低共振周波数)カーブファイル(オプション、1行に1ピッチ、デフォルトのF0(最低共振周波数)とエレベーションを置き換えます。)", "转换": "変換", "输出信息": "出力情報", "输出音频(右下角三个点,点了可以下载)": "出力音声(右下の三点をクリックしてダウンロードできます)", - "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。", "指定输出文件夹": "出力フォルダを指定してください", "检索特征占比": "検索特徴率", "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "処理対象音声フォルダーのパスを入力してください(ファイルマネージャのアドレスバーからコピーしてください)", "也可批量输入音频文件, 二选一, 优先读文件夹": "複数の音声ファイルを一括で入力することもできますが、フォルダーを優先して読み込みます", "伴奏人声分离": "伴奏とボーカルの分離", - "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)": "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)", + "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)": "UVR5モデルを使用した、声帯分離バッチ処理です。
HP2はハーモニー、ハーモニーのあるボーカルとハーモニーのないボーカルを抽出したものはHP5を使ってください
フォルダーパスの形式例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(エクスプローラーのアドレスバーからコピーするだけです)", "输入待处理音频文件夹路径": "処理するオーディオファイルのフォルダパスを入力してください", "模型": "モデル", "指定输出人声文件夹": "人の声を出力するフォルダを指定してください", @@ -60,7 +60,7 @@ "要置入的模型信息": "挿入するモデル情報", "保存的模型名不带后缀": "拡張子のない保存するモデル名", "融合": "フュージョン", - "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報の修正(weightsフォルダから抽出された小さなモデルファイルのみ対応)", "模型路径": "モデルパス", "要改的模型信息": "変更するモデル情報", "保存的文件名, 默认空为和源文件同名": "保存するファイル名、デフォルトでは空欄で元のファイル名と同じ名前になります", @@ -68,18 +68,18 @@ "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報を表示する(小さいモデルファイルはweightsフォルダーからのみサポートされています)", "查看": "表示", "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "モデル抽出(ログフォルダー内の大きなファイルのモデルパスを入力)、モデルを半分までトレーニングし、自動的に小さいファイルモデルを保存しなかったり、中間モデルをテストしたい場合に適用されます。", - "保存名": "保存するファイル名", + "保存名": "保存ファイル名", "模型是否带音高指导,1是0否": "モデルに音高ガイドを付けるかどうか、1は付ける、0は付けない", "提取": "抽出", "招募音高曲线前端编辑器": "音高曲線フロントエンドエディターを募集", "加开发群联系我xxxxx": "開発グループに参加して私に連絡してくださいxxxxx", "点击查看交流、问题反馈群号": "クリックして交流、問題フィードバックグループ番号を表示", "xxxxx": "xxxxx", - "加载模型": "モデルをロードする", + "加载模型": "モデルをロード", "Hubert模型": "Hubert模型", - "选择.pth文件": ".pthファイルを選択する", - "选择.index文件": ".indexファイルを選択する", - "选择.npy文件": ".npyファイルを選択する", + "选择.pth文件": ".pthファイルを選択", + "选择.index文件": ".indexファイルを選択", + "选择.npy文件": ".npyファイルを選択", "输入设备": "入力デバイス", "输出设备": "出力デバイス", "音频设备(请使用同种类驱动)": "オーディオデバイス(同じ種類のドライバーを使用してください)", @@ -93,7 +93,7 @@ "输入降噪": "入力ノイズの低減", "输出降噪": "出力ノイズの低減", "性能设置": "パフォーマンス設定", - "开始音频转换": "音声変換を開始する", - "停止音频转换": "音声変換を停止する", + "开始音频转换": "音声変換を開始", + "停止音频转换": "音声変換を停止", "推理时间(ms):": "推論時間(ms):" -} \ No newline at end of file +} From 0ca936c226556187ebeb135521e2d17b790526d0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 17 Apr 2023 15:26:59 +0000 Subject: [PATCH 17/40] =?UTF-8?q?=F0=9F=8E=A8=20=E5=90=8C=E6=AD=A5=20local?= =?UTF-8?q?e=20(#90)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- i18n/zh_CN.json | 2 +- i18n/zh_HK.json | 2 +- i18n/zh_SG.json | 2 +- i18n/zh_TW.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/i18n/zh_CN.json b/i18n/zh_CN.json index d5d744d..5e4f918 100644 --- a/i18n/zh_CN.json +++ b/i18n/zh_CN.json @@ -96,4 +96,4 @@ "开始音频转换": "开始音频转换", "停止音频转换": "停止音频转换", "推理时间(ms):": "推理时间(ms):" -} \ No newline at end of file +} diff --git a/i18n/zh_HK.json b/i18n/zh_HK.json index 338f05f..4b47d95 100644 --- a/i18n/zh_HK.json +++ b/i18n/zh_HK.json @@ -96,4 +96,4 @@ "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", "推理时间(ms):": "推理時間(ms):" -} \ No newline at end of file +} diff --git a/i18n/zh_SG.json b/i18n/zh_SG.json index 338f05f..4b47d95 100644 --- a/i18n/zh_SG.json +++ b/i18n/zh_SG.json @@ -96,4 +96,4 @@ "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", "推理时间(ms):": "推理時間(ms):" -} \ No newline at end of file +} diff --git a/i18n/zh_TW.json b/i18n/zh_TW.json index 338f05f..4b47d95 100644 --- a/i18n/zh_TW.json +++ b/i18n/zh_TW.json @@ -96,4 +96,4 @@ "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", "推理时间(ms):": "推理時間(ms):" -} \ No newline at end of file +} From 58397a92dc6c54ed7ae391ab479b40eef8c2da81 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 18 Apr 2023 15:03:30 +0900 Subject: [PATCH 18/40] Automatically change faiss version (#92) --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt 
b/requirements.txt index e00ebda..9b03196 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,8 @@ scipy==1.9.3 librosa==0.9.2 llvmlite==0.39.0 fairseq==0.12.2 -faiss-cpu==1.7.2 +faiss-cpu==1.7.0; sys_platform == "darwin" +faiss-cpu==1.7.2; sys_platform != "darwin" gradio Cython future>=0.18.3 From aca68fad09620308cc95afe4e8ac62bad473d8c8 Mon Sep 17 00:00:00 2001 From: Kazuki <47811498+KazukiUruma@users.noreply.github.com> Date: Wed, 19 Apr 2023 12:02:02 +0900 Subject: [PATCH 19/40] improved Japanese translation. (#101) --- docs/README.ja.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/README.ja.md b/docs/README.ja.md index cd953ff..a2e1e0a 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -21,45 +21,45 @@ VITSに基づく使いやすい音声変換(voice changer)framework

[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) -> デモ動画は[こちら](https://www.bilibili.com/video/BV1pm4y1z7Gm/)でご覧ください +> デモ動画は[こちら](https://www.bilibili.com/video/BV1pm4y1z7Gm/)でご覧ください。 > RVCによるリアルタイム音声変換: [w-okada/voice-changer](https://github.com/w-okada/voice-changer) -> 基底modelを訓練(training)したのは、約50時間の高品質なオープンソースのデータセット。著作権侵害を心配することなく使用できるように。 +> 著作権侵害を心配することなく使用できるように、基底モデルは約50時間の高品質なオープンソースデータセットで訓練されています。 -> 今後は次々と使用許可のある高品質歌声資料集を追加し、基底modelを訓練する。 +> 今後も、次々と使用許可のある高品質な歌声の資料集を追加し、基底モデルを訓練する予定です。 ## はじめに -本repoは下記の特徴があります +本リポジトリには下記の特徴があります。 -+ 調子(tone)の漏洩が下がれるためtop1検索で源特徴量を訓練集特徴量に置換 -+ 古い又は安いGPUでも高速に訓練できる -+ 小さい訓練集でもかなりいいmodelを得られる(10分以上の低noise音声を推奨) -+ modelを融合し音色をmergeできる(ckpt processing->ckpt mergeで使用) -+ 使いやすいWebUI -+ UVR5 Modelも含めるため人声とBGMを素早く分離できる ++ Top1検索を用いることで、生の特徴量を訓練用データセット特徴量に変換し、トーンリーケージを削減します。 ++ 比較的貧弱なGPUでも、高速かつ簡単に訓練できます。 ++ 少量のデータセットからでも、比較的良い結果を得ることができます。(10分以上のノイズの少ない音声を推奨します。) ++ モデルを融合することで、音声を混ぜることができます。(ckpt processingタブの、ckpt mergeを使用します。) ++ 使いやすいWebUI。 ++ UVR5 Modelも含んでいるため、人の声とBGMを素早く分離できます。 ## 環境構築 -poetryで依存関係をinstallすることをお勧めします。 +Poetryで依存関係をインストールすることをお勧めします。 -下記のcommandsは、Python3.8以上の環境で実行する必要があります: +下記のコマンドは、Python3.8以上の環境で実行する必要があります: ```bash -# PyTorch関連の依存関係をinstall。install済の場合はskip +# PyTorch関連の依存関係をインストール。インストール済の場合は省略。 # 参照先: https://pytorch.org/get-started/locally/ pip install torch torchvision torchaudio #Windows+ Nvidia Ampere Architecture(RTX30xx)の場合、 #21 に従い、pytorchに対応するcuda versionを指定する必要があります。 #pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 -# PyTorch関連の依存関係をinstall。install済の場合はskip +# PyTorch関連の依存関係をインストール。インストール済の場合は省略。 # 参照先: https://python-poetry.org/docs/#installation curl -sSL https://install.python-poetry.org | python3 - -# Poetry経由で依存関係をinstall +# Poetry経由で依存関係をインストール poetry install ``` -pipでも依存関係のinstallが可能です: +pipでも依存関係のインストールが可能です: **注意**:`faiss 1.7.2`は`macOS`で`Segmentation Fault: 11`を起こすので、`requirements.txt`の該当行を `faiss-cpu==1.7.0`に変更してください。 @@ -68,11 +68,11 @@ pip install -r requirements.txt ``` ## 基底modelsを準備 -RVCは推論/訓練のために、様々な事前訓練を行った基底modelsが必要です。 +RVCは推論/訓練のために、様々な事前訓練を行った基底モデルを必要とします。 modelsは[Hugging Face space](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/)からダウンロードできます。 -以下は、RVCに必要な基底modelsやその他のfilesの一覧です。 +以下は、RVCに必要な基底モデルやその他のファイルの一覧です。 ```bash hubert_base.pt @@ -80,16 +80,16 @@ hubert_base.pt ./uvr5_weights -# ffmpegがすでにinstallされている場合はskip +# ffmpegがすでにinstallされている場合は省略 ./ffmpeg ``` -その後、下記のcommandでWebUIを起動 +その後、下記のコマンドでWebUIを起動します。 ```bash python infer-web.py ``` -Windowsをお使いの方は、直接に`RVC-beta.7z`をダウンロード後に展開し、`go-web.bat`をclickでWebUIを起動。(7zipが必要です) +Windowsをお使いの方は、直接`RVC-beta.7z`をダウンロード後に展開し、`go-web.bat`をクリックすることで、WebUIを起動することができます。(7zipが必要です。) -また、repoに[小白简易教程.doc](./小白简易教程.doc)がありますので、参考にしてください(中国語版のみ)。 +また、リポジトリに[小白简易教程.doc](./小白简易教程.doc)がありますので、参考にしてください(中国語版のみ)。 ## 参考プロジェクト + [ContentVec](https://github.com/auspicious3000/contentvec/) From 8bf1e0e0264e7b01755a15323c3cb8e277f5443e Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Wed, 19 Apr 2023 14:45:04 +0900 Subject: [PATCH 20/40] Update faiss description (#95) --- README.md | 2 +- docs/README.en.md | 2 +- docs/README.ja.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c904a52..572a31e 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ poetry install 你也可以通过pip来安装依赖: -**注意**: `MacOS`下`faiss 1.7.2`版本会导致抛出段错误,请将`requirements.txt`的对应条目改为`faiss-cpu==1.7.0` +**注意**: `MacOS`下`faiss 
1.7.2`版本会导致抛出段错误,在手动安装时请使用命令`pip install faiss-cpu==1.7.0`指定使用`1.7.0`版本 ```bash pip install -r requirements.txt diff --git a/docs/README.en.md b/docs/README.en.md index a97e1df..fc3ab63 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -55,7 +55,7 @@ poetry install ``` You can also use pip to install the dependencies -**Notice**: `faiss 1.7.2` will raise Segmentation Fault: 11 under `MacOS`, please change corresponding line in `requirements.txt` to `faiss-cpu==1.7.0` +**Notice**: `faiss 1.7.2` will raise Segmentation Fault: 11 under `MacOS`, please use `pip install faiss-cpu==1.7.0` if you use pip to install it manually. ```bash pip install -r requirements.txt diff --git a/docs/README.ja.md b/docs/README.ja.md index a2e1e0a..ca822b5 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -61,7 +61,7 @@ poetry install pipでも依存関係のインストールが可能です: -**注意**:`faiss 1.7.2`は`macOS`で`Segmentation Fault: 11`を起こすので、`requirements.txt`の該当行を `faiss-cpu==1.7.0`に変更してください。 +**注意**:`faiss 1.7.2`は`macOS`で`Segmentation Fault: 11`を起こすので、マニュアルインストールする場合は、 `pip install faiss-cpu==1.7.0`を実行してください。 ```bash pip install -r requirements.txt From a2dadfc9317fc9a6ba3357aed4379261b71d1a01 Mon Sep 17 00:00:00 2001 From: Rice Cake Date: Fri, 21 Apr 2023 16:30:08 +0800 Subject: [PATCH 21/40] Update README.en.md (#113) --- docs/README.en.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/README.en.md b/docs/README.en.md index fc3ab63..d8bd6f4 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -24,6 +24,9 @@ An easy-to-use SVC framework based on VITS.

> Realtime Voice Conversion Software using RVC : [w-okada/voice-changer](https://github.com/w-okada/voice-changer) +> The dataset for the pre-training model uses nearly 50 hours of high quality VCTK open source dataset. + +> High quality licensed song datasets will be added to training-set one after another for your use, without worrying about copyright infringement. ## Summary This repository has the following features: + Reduce tone leakage by replacing source feature to training-set feature using top1 retrieval; @@ -32,7 +35,6 @@ This repository has the following features: + Supporting model fusion to change timbres (using ckpt processing tab->ckpt merge); + Easy-to-use Webui interface; + Use the UVR5 model to quickly separate vocals and instruments. -+ The dataset for the pre-training model uses nearly 50 hours of high quality VCTK open source dataset, and high quality licensed song datasets will be added to training-set one after another for your use, without worrying about copyright infringement. ## Preparing the environment We recommend you install the dependencies through poetry. @@ -43,8 +45,7 @@ The following commands need to be executed in the environment of Python version pip install torch torchvision torchaudio #For Windows + Nvidia Ampere Architecture(RTX30xx), you need to specify the cuda version corresponding to pytorch according to the experience of https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/issues/21 - -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 +#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 # Install the Poetry dependency management tool, skip if installed # Reference: https://python-poetry.org/docs/#installation @@ -83,8 +84,6 @@ python infer-web.py ``` If you are using Windows, you can download and extract `RVC-beta.7z` to use RVC directly and use `go-web.bat` to start Webui. -We will develop an English version of the WebUI in 2 weeks. - There's also a tutorial on RVC in Chinese and you can check it out if needed. 
## Credits From c94151242714391ac1677733a7000db4bc7fd59f Mon Sep 17 00:00:00 2001 From: Yugo Ogura <15419227+Spice-Z@users.noreply.github.com> Date: Sat, 22 Apr 2023 01:33:11 +0900 Subject: [PATCH 22/40] chore: Just fix typo in README.ja.md (#114) --- docs/README.ja.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.ja.md b/docs/README.ja.md index ca822b5..2dd6201 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -100,7 +100,7 @@ Windowsをお使いの方は、直接`RVC-beta.7z`をダウンロード後に展 + [Ultimate Vocal Remover](https://github.com/Anjok07/ultimatevocalremovergui) + [audio-slicer](https://github.com/openvpi/audio-slicer) -## 貢献者(contributer)の皆様の尽力に感謝します +## 貢献者(contributor)の皆様の尽力に感謝します From ebc0b227c159c0941ac4c6d1a303b762b05c3ee4 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sat, 22 Apr 2023 01:35:37 +0900 Subject: [PATCH 23/40] Update i18n.py (#117) --- i18n.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/i18n.py b/i18n.py index d535482..5767d88 100644 --- a/i18n.py +++ b/i18n.py @@ -11,10 +11,8 @@ def load_language_list(language): class I18nAuto: def __init__(self, language=None): - if language is None: - language = "auto" - if language == "auto": - language = locale.getdefaultlocale()[0] + if language in ['auto', None]: + language = locale.getlocale()[0] if not os.path.exists(f"./i18n/{language}.json"): language = "en_US" self.language = language From 8acc0f2b7109c5e3746682e91ea874e829106b5f Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sat, 22 Apr 2023 01:36:10 +0900 Subject: [PATCH 24/40] fix port (#118) --- config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.py b/config.py index e5c0810..e0aa86c 100644 --- a/config.py +++ b/config.py @@ -30,7 +30,7 @@ parser.add_argument( cmd_opts = parser.parse_args() python_cmd = cmd_opts.pycmd -listen_port = cmd_opts.port +listen_port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 iscolab = cmd_opts.colab noparallel = cmd_opts.noparallel noautoopen = cmd_opts.noautoopen From 9b513a237572ed37e362aaeb2f5904504f2d48ed Mon Sep 17 00:00:00 2001 From: nadare Date: Sat, 22 Apr 2023 15:04:56 +0900 Subject: [PATCH 25/40] Training tutorial (#109) * add training tips in ja * add english edition(using google translate) --- docs/training_tips_en.md | 52 +++++++++++++++++++++++++++++++++++++++ docs/training_tips_ja.md | 53 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 docs/training_tips_en.md create mode 100644 docs/training_tips_ja.md diff --git a/docs/training_tips_en.md b/docs/training_tips_en.md new file mode 100644 index 0000000..ad95ecf --- /dev/null +++ b/docs/training_tips_en.md @@ -0,0 +1,52 @@ +Instructions and tips for RVC training +====================================== +This TIPS explains how data training is done. + +# Training flow +I will explain along the steps in the training tab of the GUI. + +## step1 +Set the experiment name here. You can also set here whether the model should take pitch into account. +Data for each experiment is placed in `/logs/experiment name/`. + +## step2a +Loads and preprocesses audio. + +### load audio +If you specify a folder with audio, the audio files in that folder will be read automatically. +For example, if you specify `C:Users\hoge\voices`, `C:Users\hoge\voices\voice.mp3` will be loaded, but `C:Users\hoge\voices\dir\voice.mp3` will Not loaded. 
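To picture the non-recursive scan described above, a minimal sketch (not the repository's actual preprocessing code; the helper name is illustrative) might look like this:

```python
import os

def list_top_level_files(folder):
    # Return only files that sit directly inside `folder`; anything in a
    # subdirectory (e.g. folder/dir/voice.mp3) is deliberately skipped.
    # Whether a file is actually usable is decided later, when ffmpeg
    # tries to decode it.
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if os.path.isfile(os.path.join(folder, name))
    ]
```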
+ +Since ffmpeg is used internally for reading audio, if the extension is supported by ffmpeg, it will be read automatically. +After converting to int16 with ffmpeg, convert to float32 and normalize between -1 to 1. + +### denoising +The audio is smoothed by scipy's filtfilt. + +### Audio Split +First, the input audio is divided by detecting parts of silence that last longer than a certain period (max_sil_kept=5 seconds?). After splitting the audio on silence, split the audio every 4 seconds with an overlap of 0.3 seconds. For audio separated within 4 seconds, after normalizing the volume, convert the wav file to `/logs/experiment name/0_gt_wavs` and then convert it to 16k sampling rate to `/logs/experiment name/1_16k_wavs ` as a wav file. + +## step2b +### Extract pitch +Extract pitch information from wav files. Extract the pitch information (=f0) using the method built into parselmouth or pyworld and save it in `/logs/experiment name/2a_f0`. Then logarithmically convert the pitch information to an integer between 1 and 255 and save it in `/logs/experiment name/2b-f0nsf`. + +### Extract feature_print +Convert the wav file to embedding in advance using HuBERT. Read the wav file saved in `/logs/experiment name/1_16k_wavs`, convert the wav file to 256-dimensional features with HuBERT, and save in npy format in `/logs/experiment name/3_feature256`. + +## step3 +train the model. +### Glossary for Beginners +In deep learning, the data set is divided and the learning proceeds little by little. In one model update (step), batch_size data are retrieved and predictions and error corrections are performed. Doing this once for a dataset counts as one epoch. + +Therefore, the learning time is the learning time per step x (the number of data in the dataset / batch size) x the number of epochs. In general, the larger the batch size, the more stable the learning becomes (learning time per step ÷ batch size) becomes smaller, but it uses more GPU memory. GPU RAM can be checked with the nvidia-smi command. Learning can be done in a short time by increasing the batch size as much as possible according to the machine of the execution environment. + +### Specify pretrained model +RVC starts training the model from pretrained weights instead of from 0, so it can be trained with a small dataset. By default it loads `rvc-location/pretrained/f0G40k.pth` and `rvc-location/pretrained/f0D40k.pth`. When learning, model parameters are saved in `logs/experiment name/G_{}.pth` and `logs/experiment name/D_{}.pth` for each save_every_epoch, but by specifying this path, you can start learning. You can restart or start training from model weights learned in a different experiment. + +### learning index +RVC saves the HuBERT feature values used during training, and during inference, searches for feature values that are similar to the feature values used during learning to perform inference. In order to perform this search at high speed, the index is learned in advance. +For index learning, we use the approximate neighborhood search library faiss. Read the feature value of `/logs/experiment name/3_feature256`, save the combined feature value as `/logs/experiment name/total_fea.npy`, and use it to learn the index `/logs/experiment name Save it as /add_XXX.index`. + +### Button description +- Train model: After executing step2b, press this button to train the model. +- Train feature index: After training the model, perform index learning. +- One-click training: step2b, model training and feature index training all at once. 
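As a concrete illustration of the pitch post-processing mentioned in step2b (converting f0 to an integer between 1 and 255 on a log scale), a simplified sketch is shown below. The 50–1100 Hz constants mirror the range used elsewhere in this repository, but treat the snippet as an explanatory approximation rather than the exact extraction code:

```python
import numpy as np

F0_MIN, F0_MAX = 50.0, 1100.0  # pitch range used by the extractors in this repo
F0_MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)
F0_MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)

def coarse_f0(f0):
    # Map raw f0 values in Hz onto integers 1..255 on a mel-like log scale.
    # Unvoiced frames (f0 == 0) end up at 1.
    f0_mel = 1127 * np.log(1 + np.asarray(f0, dtype=np.float64) / 700)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - F0_MEL_MIN) * 254 / (F0_MEL_MAX - F0_MEL_MIN) + 1
    f0_mel = np.clip(f0_mel, 1, 255)
    return np.rint(f0_mel).astype(np.int64)
```

The resulting integer curve corresponds to what this document describes being saved to `2b-f0nsf`, while the raw f0 values go to `2a_f0`.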
\ No newline at end of file diff --git a/docs/training_tips_ja.md b/docs/training_tips_ja.md new file mode 100644 index 0000000..40b835c --- /dev/null +++ b/docs/training_tips_ja.md @@ -0,0 +1,53 @@ +RVCの訓練における説明、およびTIPS +=============================== +本TIPSではどのようにデータの訓練が行われているかを説明します。 + +# 訓練の流れ +GUIの訓練タブのstepに沿って説明します。 + +## step1 +実験名の設定を行います。また、モデルにピッチを考慮させるかもここで設定できます。 +各実験のデータは`/logs/実験名/`に配置されます。 + +## step2a +音声の読み込みと前処理を行います。 + +### load audio +音声のあるフォルダを指定すると、そのフォルダ内にある音声ファイルを自動で読み込みます。 +例えば`C:Users\hoge\voices`を指定した場合、`C:Users\hoge\voices\voice.mp3`は読み込まれますが、`C:Users\hoge\voices\dir\voice.mp3`は読み込まれません。 + +音声の読み込みには内部でffmpegを利用しているので、ffmpegで対応している拡張子であれば自動的に読み込まれます。 +ffmpegでint16に変換した後、float32に変換し、-1 ~ 1の間に正規化されます。 + +### denoising +音声についてscipyのfiltfiltによる平滑化を行います。 + +### 音声の分割 +入力した音声はまず、一定期間(max_sil_kept=5秒?)より長く無音が続く部分を検知して音声を分割します。無音で音声を分割した後は、0.3秒のoverlapを含む4秒ごとに音声を分割します。4秒以内に区切られた音声は、音量の正規化を行った後wavファイルを`/logs/実験名/0_gt_wavs`に、そこから16kのサンプリングレートに変換して`/logs/実験名/1_16k_wavs`にwavファイルで保存します。 + +## step2b +### ピッチの抽出 +wavファイルからピッチ(音の高低)の情報を抽出します。parselmouthやpyworldに内蔵されている手法でピッチ情報(=f0)を抽出し、`/logs/実験名/2a_f0`に保存します。その後、ピッチ情報を対数で変換して1~255の整数に変換し、`/logs/実験名/2b-f0nsf`に保存します。 + +### feature_printの抽出 +HuBERTを用いてwavファイルを事前にembeddingに変換します。`/logs/実験名/1_16k_wavs`に保存したwavファイルを読み込み、HuBERTでwavファイルを256次元の特徴量に変換し、npy形式で`/logs/実験名/3_feature256`に保存します。 + +## step3 +モデルのトレーニングを行います。 +### 初心者向け用語解説 +深層学習ではデータセットを分割し、少しずつ学習を進めていきます。一回のモデルの更新(step)では、batch_size個のデータを取り出し予測と誤差の修正を行います。これをデータセットに対して一通り行うと一epochと数えます。 + +そのため、学習時間は 1step当たりの学習時間 x (データセット内のデータ数 ÷ バッチサイズ) x epoch数 かかります。一般にバッチサイズを大きくするほど学習は安定し、(1step当たりの学習時間÷バッチサイズ)は小さくなりますが、その分GPUのメモリを多く使用します。GPUのRAMはnvidia-smiコマンド等で確認できます。実行環境のマシンに合わせてバッチサイズをできるだけ大きくするとより短時間で学習が可能です。 + +### pretrained modelの指定 +RVCではモデルの訓練を0からではなく、事前学習済みの重みから開始するため、少ないデータセットで学習を行えます。デフォルトでは`RVCのある場所/pretrained/f0G40k.pth`と`RVCのある場所/pretrained/f0D40k.pth`を読み込みます。学習時はsave_every_epochごとにモデルのパラメータが`logs/実験名/G_{}.pth`と`logs/実験名/D_{}.pth`に保存されますが、このパスを指定することで学習を再開したり、もしくは違う実験で学習したモデルの重みから学習を開始できます。 + +### indexの学習 +RVCでは学習時に使われたHuBERTの特徴量を保存し、推論時は学習時の特徴量から近い特徴量を探してきて推論を行います。この検索を高速に行うために事前にindexの学習を行います。 +indexの学習には近似近傍探索ライブラリのfaissを用います。`/logs/実験名/3_feature256`の特徴量を読み込み、全て結合させた特徴量を`/logs/実験名/total_fea.npy`として保存、それを用いて学習したindexを`/logs/実験名/add_XXX.index`として保存します。 + +### ボタンの説明 +- モデルのトレーニング: step2bまでを実行した後、このボタンを押すとモデルの学習を行います。 +- 特徴インデックスのトレーニング: モデルのトレーニング後、indexの学習を行います。 +- ワンクリックトレーニング: step2bまでとモデルのトレーニング、特徴インデックスのトレーニングを一括で行います。 + From 334da847d2b6b37afb276cbec990e699385a306a Mon Sep 17 00:00:00 2001 From: Rice Cake Date: Sat, 22 Apr 2023 14:06:18 +0800 Subject: [PATCH 26/40] Update README.en.md (#121) * Update README.en.md * Update README.en.md --- docs/README.en.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/README.en.md b/docs/README.en.md index d8bd6f4..849e142 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -87,7 +87,13 @@ If you are using Windows, you can download and extract `RVC-beta.7z` to use RVC There's also a tutorial on RVC in Chinese and you can check it out if needed. 
## Credits - ++ [ContentVec](https://github.com/auspicious3000/contentvec/) ++ [VITS](https://github.com/jaywalnut310/vits) ++ [HIFIGAN](https://github.com/jik876/hifi-gan) ++ [Gradio](https://github.com/gradio-app/gradio) ++ [FFmpeg](https://github.com/FFmpeg/FFmpeg) ++ [Ultimate Vocal Remover](https://github.com/Anjok07/ultimatevocalremovergui) ++ [audio-slicer](https://github.com/openvpi/audio-slicer) ## Thanks to all contributors for their efforts From 2f51e932bfca4c00e0cdb176c5926f9627784a2f Mon Sep 17 00:00:00 2001 From: EntropyRiser <82279347+EntropyRiser@users.noreply.github.com> Date: Sat, 22 Apr 2023 19:32:49 +0800 Subject: [PATCH 27/40] Change f0 predictor to harvest. (#123) Co-authored-by: EntropyRiser <1832783120@qq.com> --- gui.py | 139 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/gui.py b/gui.py index 9aee2e9..36cd910 100644 --- a/gui.py +++ b/gui.py @@ -7,9 +7,10 @@ import sounddevice as sd import noisereduce as nr import numpy as np from fairseq import checkpoint_utils -import librosa, torch, parselmouth, faiss, time, threading +import librosa, torch, pyworld, faiss, time, threading import torch.nn.functional as F import torchaudio.transforms as tat +import scipy.signal as signal # import matplotlib.pyplot as plt from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono @@ -26,71 +27,82 @@ class RVC: """ 初始化 """ - self.f0_up_key = key - self.time_step = 160 / 16000 * 1000 - self.f0_min = 50 - self.f0_max = 1100 - self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) - self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - if index_rate != 0: - self.index = faiss.read_index(index_path) - self.big_npy = np.load(npy_path) - print("index search enabled") - self.index_rate = index_rate - model_path = hubert_path - print("load model(s) from {}".format(model_path)) - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( - [model_path], - suffix="", - ) - self.model = models[0] - self.model = self.model.to(device) - self.model = self.model.half() - self.model.eval() - cpt = torch.load(pth_path, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - if if_f0 == 1: - self.net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True) - else: - self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - del self.net_g.enc_q - print(self.net_g.load_state_dict(cpt["weight"], strict=False)) - self.net_g.eval().to(device) - self.net_g.half() + try: + self.f0_up_key = key + self.time_step = 160 / 16000 * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.sr = 16000 + self.window = 160 + if index_rate != 0: + self.index = faiss.read_index(index_path) + self.big_npy = np.load(npy_path) + print("index search enabled") + self.index_rate = index_rate + model_path = hubert_path + print("load model(s) from {}".format(model_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", + ) + self.model = models[0] + self.model = self.model.to(device) + self.model = self.model.half() + self.model.eval() + cpt = torch.load(pth_path, map_location="cpu") + tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + if_f0 = cpt.get("f0", 1) + if if_f0 == 1: + self.net_g = 
SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + del self.net_g.enc_q + print(self.net_g.load_state_dict(cpt["weight"], strict=False)) + self.net_g.eval().to(device) + self.net_g.half() + except Exception as e: + print(e) - def get_f0_coarse(self, f0): + def get_f0(self, x, f0_up_key, inp_f0=None): + x_pad=1 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0, t = pyworld.harvest( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0] + f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( - self.f0_mel_max - self.f0_mel_min + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 - # f0_mel[f0_mel > 188] = 188 f0_coarse = np.rint(f0_mel).astype(np.int) - return f0_coarse - - def get_f0(self, x, p_len, f0_up_key=0): - f0 = ( - parselmouth.Sound(x, 16000) - .to_pitch_ac( - time_step=self.time_step / 1000, - voicing_threshold=0.6, - pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max, - ) - .selected_array["frequency"] - ) - - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0 *= pow(2, f0_up_key / 12) - # f0=suofang(f0) - f0bak = f0.copy() - f0_coarse = self.get_f0_coarse(f0) - return f0_coarse, f0bak + return f0_coarse, f0bak # 1-0 def infer(self, feats: torch.Tensor) -> np.ndarray: """ @@ -127,7 +139,7 @@ class RVC: # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 p_len = min(feats.shape[1], 12000) # print(feats.shape) - pitch, pitchf = self.get_f0(audio, p_len, self.f0_up_key) + pitch, pitchf = self.get_f0(audio, self.f0_up_key) p_len = min(feats.shape[1], 12000, pitch.shape[0]) # 太大了爆显存 torch.cuda.synchronize() # print(feats.shape,pitch.shape) @@ -365,7 +377,7 @@ class GUI: self.config.pth_path, self.config.index_path, self.config.npy_path, - self.config.index_rate, + self.config.index_rate ) self.input_wav: np.ndarray = np.zeros( self.extra_frame @@ -487,8 +499,9 @@ class GUI: else: outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy() total_time = time.perf_counter() - start_time - print("infer time:" + str(total_time)) self.window["infer_time"].update(int(total_time * 1000)) + print("infer time:" + str(total_time)) + def get_devices(self, update: bool = True): """获取设备列表""" From c423f77a16d857b544d5c3b16419a167ebf8cf97 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 22 Apr 2023 11:38:00 +0000 Subject: [PATCH 28/40] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=97=A0f0=E6=A8=A1?= 
=?UTF-8?q?=E5=9E=8B=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加无f0模型的支持 --- gui.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/gui.py b/gui.py index 36cd910..6215435 100644 --- a/gui.py +++ b/gui.py @@ -54,8 +54,8 @@ class RVC: cpt = torch.load(pth_path, map_location="cpu") tgt_sr = cpt["config"][-1] cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - if if_f0 == 1: + self.if_f0 = cpt.get("f0", 1) + if self.if_f0 == 1: self.net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True) else: self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) @@ -136,27 +136,37 @@ class RVC: feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) torch.cuda.synchronize() - # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 - p_len = min(feats.shape[1], 12000) # print(feats.shape) - pitch, pitchf = self.get_f0(audio, self.f0_up_key) - p_len = min(feats.shape[1], 12000, pitch.shape[0]) # 太大了爆显存 + if(self.if_f0==1): + pitch, pitchf = self.get_f0(audio, self.f0_up_key) + p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存 + else: + pitch, pitchf = None, None + p_len = min(feats.shape[1], 13000) # 太大了爆显存 torch.cuda.synchronize() # print(feats.shape,pitch.shape) feats = feats[:, :p_len, :] - pitch = pitch[:p_len] - pitchf = pitchf[:p_len] + if(self.if_f0==1): + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) + pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) p_len = torch.LongTensor([p_len]).to(device) - pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) - pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) ii = 0 # sid sid = torch.LongTensor([ii]).to(device) with torch.no_grad(): - infered_audio = ( - self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] - .data.cpu() - .float() - ) # nsf + if(self.if_f0==1): + infered_audio = ( + self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + .data.cpu() + .float() + ) + else: + infered_audio = ( + self.net_g.infer(feats, p_len, sid)[0][0, 0] + .data.cpu() + .float() + ) torch.cuda.synchronize() return infered_audio From 297d92bf5dd0d23000fdd6f9df339e25dd0925b6 Mon Sep 17 00:00:00 2001 From: autumnmotor <59357372+autumnmotor@users.noreply.github.com> Date: Sat, 22 Apr 2023 20:39:47 +0900 Subject: [PATCH 29/40] some change precision audio processing (#94) * some change precision audio processing * fix clipping problem in resample resample sometimes causes signal clipping, not just librosa.resample * fix error --- extract_f0_print.py | 4 +++- my_utils.py | 4 ++-- train/data_utils.py | 10 ++++++++-- trainset_preprocess_pipeline_print.py | 23 +++++++++++++++++++---- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/extract_f0_print.py b/extract_f0_print.py index d330c90..d2bc805 100644 --- a/extract_f0_print.py +++ b/extract_f0_print.py @@ -33,7 +33,9 @@ class FeatureInput(object): self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) def compute_f0(self, path, f0_method): - x, sr = librosa.load(path, self.fs) + # default resample type of librosa.resample is "soxr_hq". 
+ # Quality: soxr_vhq > soxr_hq + x, sr = librosa.load(path, self.fs, res_type='soxr_vhq') p_len = x.shape[0] // self.hop assert sr == self.fs if f0_method == "pm": diff --git a/my_utils.py b/my_utils.py index 89a1527..8b7e427 100644 --- a/my_utils.py +++ b/my_utils.py @@ -12,10 +12,10 @@ def load_audio(file, sr): ) # 防止小白拷路径头尾带了空格和"和回车 out, _ = ( ffmpeg.input(file, threads=0) - .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) except Exception as e: raise RuntimeError(f"Failed to load audio: {e}") - return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + return np.frombuffer(out, np.float32).flatten() \ No newline at end of file diff --git a/train/data_utils.py b/train/data_utils.py index ee7d4d1..87a435f 100644 --- a/train/data_utils.py +++ b/train/data_utils.py @@ -98,7 +98,10 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): sampling_rate, self.sampling_rate ) ) - audio_norm = audio / self.max_wav_value + audio_norm = audio +# audio_norm = audio / self.max_wav_value +# audio_norm = audio / np.abs(audio).max() + audio_norm = audio_norm.unsqueeze(0) spec_filename = filename.replace(".wav", ".spec.pt") if os.path.exists(spec_filename): @@ -287,7 +290,10 @@ class TextAudioLoader(torch.utils.data.Dataset): sampling_rate, self.sampling_rate ) ) - audio_norm = audio / self.max_wav_value + audio_norm = audio +# audio_norm = audio / self.max_wav_value +# audio_norm = audio / np.abs(audio).max() + audio_norm = audio_norm.unsqueeze(0) spec_filename = filename.replace(".wav", ".spec.pt") if os.path.exists(spec_filename): diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index caaf533..7b5833a 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -59,19 +59,34 @@ class PreProcess: wavfile.write( "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, - (tmp_audio * 32768).astype(np.int16), + (tmp_audio * 1).astype(np.float32), ) - tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000) + + # default resample type of librosa.resample is "soxr_hq". + # Quality: soxr_vhq > soxr_hq + tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq") + tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + ( + 1 - self.alpha + ) * tmp_audio + wavfile.write( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + self.sr, + (tmp_audio * 1).astype(np.float32), + ) + wavfile.write( "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, - (tmp_audio * 32768).astype(np.int16), + (tmp_audio * 1).astype(np.float32), ) def pipeline(self, path, idx0): try: audio = load_audio(path, self.sr) - audio = signal.filtfilt(self.bh, self.ah, audio) + # zero phased digital filter cause pre-ringing noise... 
+ # audio = signal.filtfilt(self.bh, self.ah, audio) + audio = signal.lfilter(self.bh, self.ah, audio) + idx1 = 0 for audio in self.slicer.slice(audio): i = 0 From 5d5ab5465f132a4ecb021df808ce0dface3c9544 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 22 Apr 2023 21:05:00 +0900 Subject: [PATCH 30/40] Refactor GPU cache during training (#108) --- train_nsf_sim_cache_sid_load_pretrain.py | 606 +++++++++-------------- 1 file changed, 228 insertions(+), 378 deletions(-) diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py index fd38dd5..320af5f 100644 --- a/train_nsf_sim_cache_sid_load_pretrain.py +++ b/train_nsf_sim_cache_sid_load_pretrain.py @@ -230,39 +230,50 @@ def train_and_evaluate( net_g.train() net_d.train() - if cache == [] or hps.if_cache_data_in_gpu == False: # 第一个epoch把cache全部填满训练集 - # print("caching") - for batch_idx, info in enumerate(train_loader): - if hps.if_f0 == 1: - ( - phone, - phone_lengths, - pitch, - pitchf, - spec, - spec_lengths, - wave, - wave_lengths, - sid, - ) = info - else: - phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info - if torch.cuda.is_available(): - phone, phone_lengths = phone.cuda( - rank, non_blocking=True - ), phone_lengths.cuda(rank, non_blocking=True) + + # Prepare data iterator + if hps.if_cache_data_in_gpu == True: + # Use Cache + data_iterator = cache + if cache == []: + # Make new cache + for batch_idx, info in enumerate(train_loader): + # Unpack if hps.if_f0 == 1: - pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda( - rank, non_blocking=True - ) - sid = sid.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda( - rank, non_blocking=True - ), spec_lengths.cuda(rank, non_blocking=True) - wave, wave_lengths = wave.cuda( - rank, non_blocking=True - ), wave_lengths.cuda(rank, non_blocking=True) - if hps.if_cache_data_in_gpu == True: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + # Load on CUDA + if torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + # Cache on list if hps.if_f0 == 1: cache.append( ( @@ -295,372 +306,211 @@ def train_and_evaluate( ), ) ) - with autocast(enabled=hps.train.fp16_run): - if hps.if_f0 == 1: - ( - y_hat, - ids_slice, - x_mask, - z_mask, - (z, z_p, m_p, logs_p, m_q, logs_q), - ) = net_g( - phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid - ) - else: - ( - y_hat, - ids_slice, - x_mask, - z_mask, - (z, z_p, m_p, logs_p, m_q, logs_q), - ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - y_mel = commons.slice_segments( - mel, ids_slice, hps.train.segment_size // hps.data.hop_length - ) - with autocast(enabled=False): - y_hat_mel = mel_spectrogram_torch( - y_hat.float().squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - 
hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - if hps.train.fp16_run == True: - y_hat_mel = y_hat_mel.half() - wave = commons.slice_segments( - wave, ids_slice * hps.data.hop_length, hps.train.segment_size - ) # slice + else: + # Load shuffled cache + shuffle(cache) + else: + # Loader + data_iterator = enumerate(train_loader) - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) - optim_d.zero_grad() - scaler.scale(loss_disc).backward() - scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) + # Run steps + for batch_idx, info in data_iterator: + # Data + ## Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info + ## Load on CUDA + if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + wave_lengths = wave_lengths.cuda(rank, non_blocking=True) - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) - with autocast(enabled=False): - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]["lr"] - logger.info( - "Train Epoch: {} [{:.0f}%]".format( - epoch, 100.0 * batch_idx / len(train_loader) - ) - ) - # Amor For Tensorboard display - if loss_mel > 50: - loss_mel = 50 - if loss_kl > 5: - loss_kl = 5 - - logger.info([global_step, lr]) - logger.info( - f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" - ) - scalar_dict = { - "loss/g/total": loss_gen_all, - "loss/d/total": loss_disc, - "learning_rate": lr, - "grad_norm_d": grad_norm_d, - "grad_norm_g": grad_norm_g, - } - scalar_dict.update( - { - "loss/g/fm": loss_fm, - "loss/g/mel": loss_mel, - "loss/g/kl": loss_kl, - } - ) - - scalar_dict.update( - {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} - ) - scalar_dict.update( - { - "loss/d_r/{}".format(i): v - for i, v in enumerate(losses_disc_r) - } - ) - scalar_dict.update( - { - "loss/d_g/{}".format(i): v - for i, v in enumerate(losses_disc_g) - } - ) - image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy( - y_mel[0].data.cpu().numpy() - ), - "slice/mel_gen": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].data.cpu().numpy() - ), - "all/mel": 
utils.plot_spectrogram_to_numpy( - mel[0].data.cpu().numpy() - ), - } - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict, - ) - global_step += 1 - # if global_step % hps.train.eval_interval == 0: - if epoch % hps.save_every_epoch == 0 and rank == 0: - if hps.if_latest == 0: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), - ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), - ) - else: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), - ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), - ) - - else: # 后续的epoch直接使用打乱的cache - shuffle(cache) - # print("using cache") - for batch_idx, info in cache: + # Calculate + with autocast(enabled=hps.train.fp16_run): if hps.if_f0 == 1: ( - phone, - phone_lengths, - pitch, - pitchf, - spec, - spec_lengths, - wave, - wave_lengths, - sid, - ) = info + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid) else: - phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info - with autocast(enabled=hps.train.fp16_run): - if hps.if_f0 == 1: - ( - y_hat, - ids_slice, - x_mask, - z_mask, - (z, z_p, m_p, logs_p, m_q, logs_q), - ) = net_g( - phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid - ) - else: - ( - y_hat, - ids_slice, - x_mask, - z_mask, - (z, z_p, m_p, logs_p, m_q, logs_q), - ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) - mel = spec_to_mel_torch( - spec, + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) + with autocast(enabled=False): + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax, ) - y_mel = commons.slice_segments( - mel, ids_slice, hps.train.segment_size // hps.data.hop_length + if hps.train.fp16_run == True: + y_hat_mel = y_hat_mel.half() + wave = commons.slice_segments( + wave, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g ) - with autocast(enabled=False): - y_hat_mel = mel_spectrogram_torch( - y_hat.float().squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax, + optim_d.zero_grad() + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = 
net_d(wave, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) ) - if hps.train.fp16_run == True: - y_hat_mel = y_hat_mel.half() - wave = commons.slice_segments( - wave, ids_slice * hps.data.hop_length, hps.train.segment_size - ) # slice + ) + # Amor For Tensorboard display + if loss_mel > 50: + loss_mel = 50 + if loss_kl > 5: + loss_kl = 5 - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) - optim_d.zero_grad() - scaler.scale(loss_disc).backward() - scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) - with autocast(enabled=False): - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]["lr"] - logger.info( - "Train Epoch: {} [{:.0f}%]".format( - epoch, 100.0 * batch_idx / len(train_loader) - ) - ) - # Amor For Tensorboard display - if loss_mel > 50: - loss_mel = 50 - if loss_kl > 5: - loss_kl = 5 - - logger.info([global_step, lr]) - logger.info( - f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" - ) - scalar_dict = { - "loss/g/total": loss_gen_all, - "loss/d/total": loss_disc, - "learning_rate": lr, - "grad_norm_d": grad_norm_d, - "grad_norm_g": grad_norm_g, + logger.info([global_step, lr]) + logger.info( + f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" + ) + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, } - scalar_dict.update( - { - "loss/g/fm": loss_fm, - "loss/g/mel": loss_mel, - "loss/g/kl": loss_kl, - } - ) + ) - scalar_dict.update( - {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} - ) - scalar_dict.update( - { - "loss/d_r/{}".format(i): v - for i, v in enumerate(losses_disc_r) - } - ) - scalar_dict.update( - { - "loss/d_g/{}".format(i): v - for i, v in enumerate(losses_disc_g) - } - ) - image_dict = { - 
"slice/mel_org": utils.plot_spectrogram_to_numpy( - y_mel[0].data.cpu().numpy() - ), - "slice/mel_gen": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].data.cpu().numpy() - ), - "all/mel": utils.plot_spectrogram_to_numpy( - mel[0].data.cpu().numpy() - ), - } - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict, - ) - global_step += 1 - # if global_step % hps.train.eval_interval == 0: - if epoch % hps.save_every_epoch == 0 and rank == 0: - if hps.if_latest == 0: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), + scalar_dict.update( + {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} ) - else: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), + scalar_dict.update( + {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, ) + global_step += 1 + # /Run steps + + if epoch % hps.save_every_epoch == 0 and rank == 0: + if hps.if_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), + ) if rank == 0: logger.info("====> Epoch: {}".format(epoch)) From 978539ad0e18322b71a93059064967134b99d933 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:17:32 +0000 Subject: [PATCH 31/40] Update extract_f0_print.py --- extract_f0_print.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_f0_print.py b/extract_f0_print.py index d2bc805..f848a0a 100644 --- a/extract_f0_print.py +++ b/extract_f0_print.py @@ -35,7 +35,7 @@ class FeatureInput(object): def compute_f0(self, path, f0_method): # default resample type of librosa.resample is "soxr_hq". 
# Quality: soxr_vhq > soxr_hq - x, sr = librosa.load(path, self.fs, res_type='soxr_vhq') + x, sr = librosa.load(path, self.fs)#, res_type='soxr_vhq' p_len = x.shape[0] // self.hop assert sr == self.fs if f0_method == "pm": From 44de5de8402925f3e6c761bca5f9fa40ed8c31ae Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:22:16 +0000 Subject: [PATCH 32/40] Update i18n.py --- i18n.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/i18n.py b/i18n.py index 5767d88..ec7a866 100644 --- a/i18n.py +++ b/i18n.py @@ -12,7 +12,7 @@ def load_language_list(language): class I18nAuto: def __init__(self, language=None): if language in ['auto', None]: - language = locale.getlocale()[0] + language = locale.getdefaultlocale()[0]#getlocale can't identify the system's language ((None, None)) if not os.path.exists(f"./i18n/{language}.json"): language = "en_US" self.language = language From bb535a4f71058cf188fc20925e80804b2a8dca2e Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:24:12 +0000 Subject: [PATCH 33/40] Update en_US.json --- i18n/en_US.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/i18n/en_US.json b/i18n/en_US.json index 2666262..61cd54d 100644 --- a/i18n/en_US.json +++ b/i18n/en_US.json @@ -51,7 +51,7 @@ "训练模型": "Train model.", "训练特征索引": "Train feature index.", "一键训练": "One-click training.", - "ckpt处理": "Ckpt processing.", + "ckpt处理": "Ckpt processing", "模型融合, 可用于测试音色融合": "Model fusion, can be used for merging diffrent voices", "A模型路径": "A model path.", "B模型路径": "B model path.", From 4fdb858a025489b6077fce708044383420511e9f Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 22 Apr 2023 21:41:50 +0800 Subject: [PATCH 34/40] Add files via upload --- trainset_preprocess_pipeline_print.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 7b5833a..7ca0b26 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -59,25 +59,13 @@ class PreProcess: wavfile.write( "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, - (tmp_audio * 1).astype(np.float32), + tmp_audio.astype(np.float32), ) - - # default resample type of librosa.resample is "soxr_hq". 
- # Quality: soxr_vhq > soxr_hq - tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq") - tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + ( - 1 - self.alpha - ) * tmp_audio - wavfile.write( - "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), - self.sr, - (tmp_audio * 1).astype(np.float32), - ) - + tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)#, res_type="soxr_vhq" wavfile.write( "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, - (tmp_audio * 1).astype(np.float32), + tmp_audio.astype(np.float32), ) def pipeline(self, path, idx0): From a02ef401adb11bcb9e34488536c2871fd1169a79 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 22 Apr 2023 14:39:17 +0000 Subject: [PATCH 35/40] Update trainset_preprocess_pipeline_print.py --- trainset_preprocess_pipeline_print.py | 1 + 1 file changed, 1 insertion(+) diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 7ca0b26..f40309a 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -87,6 +87,7 @@ class PreProcess: idx1 += 1 else: tmp_audio = audio[start:] + idx1 += 1 break self.norm_write(tmp_audio, idx0, idx1) println("%s->Suc." % path) From 329d739e70dea7f0708f8538b985c4a39a999480 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 24 Apr 2023 12:45:20 +0900 Subject: [PATCH 36/40] Refactor mel module (#132) * Refactor wave-to-mel * Add docstring on mel * Refactor mel module import and variable names --- train/mel_processing.py | 108 ++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 70 deletions(-) diff --git a/train/mel_processing.py b/train/mel_processing.py index 315b3d1..0c1867b 100644 --- a/train/mel_processing.py +++ b/train/mel_processing.py @@ -1,18 +1,8 @@ -import math -import os -import random import torch -from torch import nn -import torch.nn.functional as F import torch.utils.data -import numpy as np -import librosa -import librosa.util as librosa_util -from librosa.util import normalize, pad_center, tiny -from scipy.signal import get_window -from scipy.io.wavfile import read from librosa.filters import mel as librosa_mel_fn + MAX_WAV_VALUE = 32768.0 @@ -35,25 +25,38 @@ def dynamic_range_decompression_torch(x, C=1): def spectral_normalize_torch(magnitudes): - output = dynamic_range_compression_torch(magnitudes) - return output + return dynamic_range_compression_torch(magnitudes) def spectral_de_normalize_torch(magnitudes): - output = dynamic_range_decompression_torch(magnitudes) - return output + return dynamic_range_decompression_torch(magnitudes) +# Reusable banks mel_basis = {} hann_window = {} def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 
+ + Args: + y :: (B, T) - Audio waveforms + n_fft + sampling_rate + hop_size + win_size + center + Returns: + :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram + """ + # Validation if torch.min(y) < -1.0: print("min value is ", torch.min(y)) if torch.max(y) > 1.0: print("max value is ", torch.max(y)) + # Window - Cache if needed global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) wnsize_dtype_device = str(win_size) + "_" + dtype_device @@ -62,6 +65,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) dtype=y.dtype, device=y.device ) + # Padding y = torch.nn.functional.pad( y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), @@ -69,6 +73,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) ) y = y.squeeze(1) + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) spec = torch.stft( y, n_fft, @@ -82,11 +87,13 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) return_complex=False, ) + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) return spec def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + # MelBasis - Cache if needed global mel_basis dtype_device = str(spec.dtype) + "_" + str(spec.device) fmax_dtype_device = str(fmax) + "_" + dtype_device @@ -95,66 +102,27 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( dtype=spec.dtype, device=spec.device ) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - return spec + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) + melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) + melspec = spectral_normalize_torch(melspec) + return melspec def mel_spectrogram_torch( y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False ): - if torch.min(y) < -1.0: - print("min value is ", torch.min(y)) - if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + """Convert waveform into Mel-frequency Log-amplitude spectrogram. 
- global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=y.dtype, device=y.device - ) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + Args: + y :: (B, T) - Waveforms + Returns: + melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram + """ + # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) + spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", - ) - y = y.squeeze(1) + # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) - # spec = torch.stft( - # y, - # n_fft, - # hop_length=hop_size, - # win_length=win_size, - # window=hann_window[wnsize_dtype_device], - # center=center, - # pad_mode="reflect", - # normalized=False, - # onesided=True, - # ) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, - ) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - - return spec + return melspec From fb1d4b188278ff26e8b8adf44a552a9c135d55b6 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 24 Apr 2023 19:35:09 +0900 Subject: [PATCH 37/40] Fix deprecated positional arguments in mel (#133) --- train/mel_processing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train/mel_processing.py b/train/mel_processing.py index 0c1867b..668ea86 100644 --- a/train/mel_processing.py +++ b/train/mel_processing.py @@ -98,7 +98,9 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): dtype_device = str(spec.dtype) + "_" + str(spec.device) fmax_dtype_device = str(fmax) + "_" + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( dtype=spec.dtype, device=spec.device ) From 9bac0ffaa7e3f687c28de06959d506a3c921068c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=9D=CE=B1=CF=81=CE=BF=CF=85=CF=83=CE=AD=C2=B7=CE=BC?= =?UTF-8?q?=C2=B7=CE=B3=CE=B9=CE=BF=CF=85=CE=BC=CE=B5=CE=BC=CE=AF=C2=B7?= =?UTF-8?q?=CE=A7=CE=B9=CE=BD=CE=B1=CE=BA=CE=AC=CE=BD=CE=BD=CE=B1?= <40709280+NaruseMioShirakana@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:55:05 +0800 Subject: [PATCH 38/40] =?UTF-8?q?Onnx=E5=AF=BC=E5=87=BA=E6=8B=93=E5=B1=95?= =?UTF-8?q?=E4=BB=A5=E5=8F=8AWebUI=E6=94=AF=E6=8C=81=20(#140)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add files via upload * Add files via upload * Add files via upload * Add files via upload --- export_onnx.py | 118 +++-- export_onnx_old.py | 47 ++ i18n/en_US.json | 7 +- i18n/ja_JP.json | 7 +- i18n/zh_CN.json | 7 +- 
i18n/zh_HK.json | 7 +- i18n/zh_SG.json | 7 +- i18n/zh_TW.json | 7 +- infer-web.py | 90 ++++ infer_pack/models_onnx.py | 95 +--- infer_pack/models_onnx_moess.py | 849 ++++++++++++++++++++++++++++++++ 11 files changed, 1101 insertions(+), 140 deletions(-) create mode 100644 export_onnx_old.py create mode 100644 infer_pack/models_onnx_moess.py diff --git a/export_onnx.py b/export_onnx.py index d4a8c62..8b62b47 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -1,47 +1,81 @@ -from infer_pack.models_onnx import SynthesizerTrnMs256NSFsid +from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM +from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO import torch -person = "Shiroha/shiroha.pth" -exported_path = "model.onnx" +if __name__ == '__main__': + MoeVS = True #模型是否为MoeVoiceStudio(原MoeSS)使用 + ModelPath = "Shiroha/shiroha.pth" #模型路径 + ExportedPath = "model.onnx" #输出路径 + hidden_channels = 256 # hidden_channels,为768Vec做准备 + cpt = torch.load(ModelPath, map_location="cpu") + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + print(*cpt["config"]) -cpt = torch.load(person, map_location="cpu") -cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk -print(*cpt["config"]) -net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) -net_g.load_state_dict(cpt["weight"], strict=False) + test_phone = torch.rand(1, 200, hidden_channels) # hidden unit + test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) + test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) + test_pitchf = torch.rand(1, 200) # nsf基频 + test_ds = torch.LongTensor([0]) # 说话人ID + test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) -test_phone = torch.rand(1, 200, 256) -test_phone_lengths = torch.tensor([200]).long() -test_pitch = torch.randint(size=(1, 200), low=5, high=255) -test_pitchf = torch.rand(1, 200) -test_ds = torch.LongTensor([0]) -test_rnd = torch.rand(1, 192, 200) -input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] -output_names = [ - "audio", -] -device = "cpu" -torch.onnx.export( - net_g, - ( - test_phone.to(device), - test_phone_lengths.to(device), - test_pitch.to(device), - test_pitchf.to(device), - test_ds.to(device), - test_rnd.to(device), - ), - exported_path, - dynamic_axes={ - "phone": [1], - "pitch": [1], - "pitchf": [1], - "rnd": [2], - }, - do_constant_folding=False, - opset_version=16, - verbose=False, - input_names=input_names, - output_names=output_names, -) + device = "cpu" #导出时设备(不影响使用模型) + + if MoeVS: + net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] + output_names = [ + "audio", + ] + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=16, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + else: + net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"] + output_names = [ + "audio", + ] + torch.onnx.export( + net_g, + ( + 
test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + }, + do_constant_folding=False, + opset_version=16, + verbose=False, + input_names=input_names, + output_names=output_names, + ) \ No newline at end of file diff --git a/export_onnx_old.py b/export_onnx_old.py new file mode 100644 index 0000000..bff6d06 --- /dev/null +++ b/export_onnx_old.py @@ -0,0 +1,47 @@ +from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM +import torch + +person = "Shiroha/shiroha.pth" +exported_path = "model.onnx" + + +cpt = torch.load(person, map_location="cpu") +cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk +print(*cpt["config"]) +net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) +net_g.load_state_dict(cpt["weight"], strict=False) + +test_phone = torch.rand(1, 200, 256) +test_phone_lengths = torch.tensor([200]).long() +test_pitch = torch.randint(size=(1, 200), low=5, high=255) +test_pitchf = torch.rand(1, 200) +test_ds = torch.LongTensor([0]) +test_rnd = torch.rand(1, 192, 200) +input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] +output_names = [ + "audio", +] +device = "cpu" +torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + exported_path, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=16, + verbose=False, + input_names=input_names, + output_names=output_names, +) diff --git a/i18n/en_US.json b/i18n/en_US.json index 61cd54d..7ce4ac2 100644 --- a/i18n/en_US.json +++ b/i18n/en_US.json @@ -95,5 +95,10 @@ "性能设置": "Performance settings", "开始音频转换": "Start Audio Conversion", "停止音频转换": "Stop Audio Conversion", - "推理时间(ms):": "Infer Time(ms):" + "推理时间(ms):": "Infer Time(ms):", + "Onnx导出": "Onnx", + "RVC模型路径": "RVC Path", + "Onnx输出路径": "Onnx Export Path", + "导出Onnx模型": "Export Onnx Model", + "MoeVS模型": "MoeSS?" } diff --git a/i18n/ja_JP.json b/i18n/ja_JP.json index ddc362f..4d28f56 100644 --- a/i18n/ja_JP.json +++ b/i18n/ja_JP.json @@ -95,5 +95,10 @@ "性能设置": "パフォーマンス設定", "开始音频转换": "音声変換を開始", "停止音频转换": "音声変換を停止", - "推理时间(ms):": "推論時間(ms):" + "推理时间(ms):": "推論時間(ms):", + "Onnx导出": "Onnx", + "RVC模型路径": "RVCルパス", + "Onnx输出路径": "Onnx出力パス", + "导出Onnx模型": "Onnxに変換", + "MoeVS模型": "MoeSS?" 
} diff --git a/i18n/zh_CN.json b/i18n/zh_CN.json index 5e4f918..f5c4dc7 100644 --- a/i18n/zh_CN.json +++ b/i18n/zh_CN.json @@ -95,5 +95,10 @@ "性能设置": "性能设置", "开始音频转换": "开始音频转换", "停止音频转换": "停止音频转换", - "推理时间(ms):": "推理时间(ms):" + "推理时间(ms):": "推理时间(ms):", + "Onnx导出": "Onnx导出", + "RVC模型路径": "RVC模型路径", + "Onnx输出路径": "Onnx输出路径", + "导出Onnx模型": "导出Onnx模型", + "MoeVS模型": "MoeSS模型" } diff --git a/i18n/zh_HK.json b/i18n/zh_HK.json index 4b47d95..6b7fb59 100644 --- a/i18n/zh_HK.json +++ b/i18n/zh_HK.json @@ -95,5 +95,10 @@ "性能设置": "效能設定", "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", - "推理时间(ms):": "推理時間(ms):" + "推理时间(ms):": "推理時間(ms):", + "Onnx导出": "Onnx导出", + "RVC模型路径": "RVC模型路径", + "Onnx输出路径": "Onnx输出路径", + "导出Onnx模型": "导出Onnx模型", + "MoeVS模型": "MoeSS模型" } diff --git a/i18n/zh_SG.json b/i18n/zh_SG.json index 4b47d95..6b7fb59 100644 --- a/i18n/zh_SG.json +++ b/i18n/zh_SG.json @@ -95,5 +95,10 @@ "性能设置": "效能設定", "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", - "推理时间(ms):": "推理時間(ms):" + "推理时间(ms):": "推理時間(ms):", + "Onnx导出": "Onnx导出", + "RVC模型路径": "RVC模型路径", + "Onnx输出路径": "Onnx输出路径", + "导出Onnx模型": "导出Onnx模型", + "MoeVS模型": "MoeSS模型" } diff --git a/i18n/zh_TW.json b/i18n/zh_TW.json index 4b47d95..6b7fb59 100644 --- a/i18n/zh_TW.json +++ b/i18n/zh_TW.json @@ -95,5 +95,10 @@ "性能设置": "效能設定", "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", - "推理时间(ms):": "推理時間(ms):" + "推理时间(ms):": "推理時間(ms):", + "Onnx导出": "Onnx导出", + "RVC模型路径": "RVC模型路径", + "Onnx输出路径": "Onnx输出路径", + "导出Onnx模型": "导出Onnx模型", + "MoeVS模型": "MoeSS模型" } diff --git a/infer-web.py b/infer-web.py index b027f0e..771a65c 100644 --- a/infer-web.py +++ b/infer-web.py @@ -119,6 +119,7 @@ for name in os.listdir(weight_uvr5_root): uvr5_names.append(name.replace(".pth", "")) + def vc_single( sid, input_audio, @@ -885,6 +886,83 @@ def change_info_(ckpt_path): return {"__type__": "update"}, {"__type__": "update"} +from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM +from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO +def export_onnx(ModelPath, ExportedPath, MoeVS=True): + hidden_channels = 256 # hidden_channels,为768Vec做准备 + cpt = torch.load(ModelPath, map_location="cpu") + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + print(*cpt["config"]) + + test_phone = torch.rand(1, 200, hidden_channels) # hidden unit + test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) + test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) + test_pitchf = torch.rand(1, 200) # nsf基频 + test_ds = torch.LongTensor([0]) # 说话人ID + test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) + + device = "cpu" #导出时设备(不影响使用模型) + + if MoeVS: + net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] + output_names = [ + "audio", + ] + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=16, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + else: + net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", 
"pitchf", "ds"] + output_names = [ + "audio", + ] + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + }, + do_constant_folding=False, + opset_version=16, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + return "Finished" + with gr.Blocks() as app: gr.Markdown( value=i18n( @@ -1361,6 +1439,18 @@ with gr.Blocks() as app: info7, ) + with gr.TabItem(i18n("Onnx导出")): + with gr.Row(): + ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True) + with gr.Row(): + onnx_dir = gr.Textbox(label=i18n("Onnx输出路径"), value="", interactive=True) + with gr.Row(): + moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True) + infoOnnx = gr.Label(label="Null") + with gr.Row(): + butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary") + butOnnx.click(export_onnx, [ckpt_dir, onnx_dir, moevs], infoOnnx) + # with gr.TabItem(i18n("招募音高曲线前端编辑器")): # gr.Markdown(value=i18n("加开发群联系我xxxxx")) # with gr.TabItem(i18n("点击查看交流、问题反馈群号")): diff --git a/infer_pack/models_onnx.py b/infer_pack/models_onnx.py index a5f405c..18991b5 100644 --- a/infer_pack/models_onnx.py +++ b/infer_pack/models_onnx.py @@ -527,7 +527,7 @@ sr2sr = { } -class SynthesizerTrnMs256NSFsid(nn.Module): +class SynthesizerTrnMs256NSFsidO(nn.Module): def __init__( self, spec_channels, @@ -612,104 +612,15 @@ class SynthesizerTrnMs256NSFsid(nn.Module): self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() - def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) return o -class SynthesizerTrnMs256NSFsid_sim(nn.Module): - """ - Synthesizer for Training - """ - - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - # hop_length, - gin_channels=0, - use_sdp=True, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256Sim( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - 
upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - is_half=kwargs["is_half"], - ) - - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward( - self, phone, phone_lengths, pitch, pitchf, ds, max_len=None - ): # y是spec不需要了现在 - g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - x, x_mask = self.enc_p(phone, pitch, phone_lengths) - x = self.flow(x, x_mask, g=g, reverse=True) - o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g) - return o - - class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(MultiPeriodDiscriminator, self).__init__() diff --git a/infer_pack/models_onnx_moess.py b/infer_pack/models_onnx_moess.py new file mode 100644 index 0000000..ea33489 --- /dev/null +++ b/infer_pack/models_onnx_moess.py @@ -0,0 +1,849 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from infer_pack import modules +from infer_pack import attentions +from infer_pack import commons +from infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from infer_pack.commons import init_weights +import numpy as np +from infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder256Sim(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = 
nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + x = self.proj(x) * x_mask + return x, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, 
upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + 
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + 
) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsidM(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, pitch, nsff0, sid, 
rnd, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o + + +class SynthesizerTrnMs256NSFsid_sim(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + # hop_length, + gin_channels=0, + use_sdp=True, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256Sim( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + is_half=kwargs["is_half"], + ) + + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, ds, max_len=None + ): # y是spec不需要了现在 + g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + x, x_mask = self.enc_p(phone, pitch, phone_lengths) + x = self.flow(x, x_mask, g=g, reverse=True) + o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g) + return o + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + 
super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap From f6ef9bca0c0c195d24978fce3ac6ce3cd1bc338b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:17:49 +0800 Subject: [PATCH 39/40] =?UTF-8?q?fix=20#115:=20=E9=9A=90=E8=97=8F=E5=85=81?= =?UTF-8?q?=E8=AE=B8=E7=9A=84=20exception?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- train_nsf_sim_cache_sid_load_pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py index 320af5f..f7840f6 100644 --- a/train_nsf_sim_cache_sid_load_pretrain.py +++ b/train_nsf_sim_cache_sid_load_pretrain.py @@ -157,7 +157,7 @@ def run(rank, n_gpus, hps): # epoch_str = 1 # global_step = 0 except: # 如果首次不能加载,加载pretrain - traceback.print_exc() + # traceback.print_exc() epoch_str = 1 global_step = 0 if rank == 0: From fdf12a4add8733ce36245b44ab5f2280f2cab65f Mon Sep 17 00:00:00 2001 From: nadare Date: Mon, 24 Apr 2023 21:18:34 +0900 Subject: [PATCH 40/40] Faiss Tutorial for Developers (#97) * add faiss tutorial (WIP) * add embedding tips --- docs/faiss_tips_en.md | 146 ++++++++++++++++++++++++++++++++++++++++++ docs/faiss_tips_ja.md | 146 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 292 
insertions(+) create mode 100644 docs/faiss_tips_en.md create mode 100644 docs/faiss_tips_ja.md diff --git a/docs/faiss_tips_en.md b/docs/faiss_tips_en.md new file mode 100644 index 0000000..e91de9e --- /dev/null +++ b/docs/faiss_tips_en.md @@ -0,0 +1,146 @@ +faiss tuning TIPS +================== +# about faiss +faiss is a library of neighborhood searches for dense vectors, developed by facebook research, which efficiently implements many approximate neighborhood search methods. +Approximate Neighbor Search finds similar vectors quickly while sacrificing some accuracy. + +## faiss in RVC +In RVC, for the embedding of features converted by HuBERT, we search for embeddings similar to the embedding generated from the training data and mix them to achieve a conversion that is closer to the original speech. However, since this search takes time if performed naively, high-speed conversion is realized by using approximate neighborhood search. + +# implementation overview +In '/logs/your-experiment/3_feature256' where the model is located, features extracted by HuBERT from each voice data are located. +From here we read the npy files in order sorted by filename and concatenate the vectors to create big_npy. (This vector has shape [N, 256].) +After saving big_npy as /logs/your-experiment/total_fea.npy, train it with faiss. + +As of 2023/04/18, IVF based on L2 distance is used using the index factory function of faiss. +The number of IVF divisions (n_ivf) is N//39, and n_probe uses int(np.power(n_ivf, 0.3)). (Look around train_index in infer-web.py.) + +In this article, I will first explain the meaning of these parameters, and then write advice for developers to create a better index. + +# Explanation of the method +## index factory +An index factory is a unique faiss notation that expresses a pipeline that connects multiple approximate neighborhood search methods as a string. +This allows you to try various approximate neighborhood search methods simply by changing the index factory string. +In RVC it is used like this: + +```python +index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf) +``` +Among the arguments of index_factory, the first is the number of dimensions of the vector, the second is the index factory string, and the third is the distance to use. + +For more detailed notation +https://github.com/facebookresearch/faiss/wiki/The-index-factory + +## index for distance +There are two typical indexes used as similarity of embedding as follows. + +- Euclidean distance (METRIC_L2) +- inner product (METRIC_INNER_PRODUCT) + +Euclidean distance takes the squared difference in each dimension, sums the differences in all dimensions, and then takes the square root. This is the same as the distance in 2D and 3D that we use on a daily basis. +The inner product is not used as an index of similarity as it is, and the cosine similarity that takes the inner product after being normalized by the L2 norm is generally used. + +Which is better depends on the case, but cosine similarity is often used in embedding obtained by word2vec and similar image retrieval models learned by ArcFace. If you want to do l2 normalization on vector X with numpy, you can do it with the following code with eps small enough to avoid 0 division. + +```python +X_normed = X / np.maximum(eps, np.linalg.norm(X, ord=2, axis=-1, keepdims=True)) +``` + +Also, for the index factory, you can change the distance index used for calculation by choosing the value to pass as the third argument. 
+
+```python
+index = faiss.index_factory(dimension, text, faiss.METRIC_INNER_PRODUCT)
+```
+
+## IVF
+IVF (Inverted file indexes) is an algorithm similar to the inverted index used in full-text search.
+During training, the search targets are clustered with kmeans, and Voronoi partitioning is performed using the cluster centers. Each data point is assigned to exactly one cluster, so we create a dictionary that looks up the data points belonging to each cluster.
+
+For example, if clusters are assigned as follows
+|index|Cluster|
+|-----|-------|
+|1|A|
+|2|B|
+|3|A|
+|4|C|
+|5|B|
+
+The resulting inverted index looks like this:
+
+|cluster|index|
+|-------|-----|
+|A|1, 3|
+|B|2, 5|
+|C|4|
+
+When searching, we first select the n_probe closest clusters, and then calculate the distances only for the data points belonging to those clusters.
+
+# Recommended parameters
+There are official guidelines on how to choose an index, so the explanation below follows them.
+https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
+
+For datasets below 1M vectors, 4bit-PQ is the most efficient method available in faiss as of April 2023.
+To combine this with IVF, narrowing the candidates down with 4bit-PQ and finally recalculating the distances with an exact index, use the following index factory string.
+
+```python
+index = faiss.index_factory(256, "IVF1024,PQ128x4fs,RFlat")
+```
+
+## Recommended parameters for IVF
+Consider the case where the number of IVF cells is too large. If, for example, the coarse quantization uses as many cells as there are data points, it degenerates into a naive exhaustive search and becomes inefficient.
+For 1M vectors or fewer, an IVF value between 4*sqrt(N) and 16*sqrt(N) is recommended, where N is the number of data points.
+
+Since the calculation time increases in proportion to n_probe, choose it by balancing speed against accuracy. Personally, I don't think RVC needs that much accuracy, so n_probe = 1 is fine.
+
+## FastScan
+FastScan is a method that approximates distances with product quantization at high speed by performing the computation in registers.
+Product quantization clusters each group of d dimensions independently (usually d = 2) during training, computes the distances between clusters in advance, and stores them in a lookup table. At prediction time, the distance for each group of dimensions can be obtained in O(1) from the lookup table.
+Therefore, the number specified after PQ is usually half the dimension of the vector.
+
+For a more detailed description of FastScan, please refer to the official documentation.
+https://github.com/facebookresearch/faiss/wiki/Fast-accumulation-of-PQ-and-AQ-codes-(FastScan)
+
+## RFlat
+RFlat is an instruction to recalculate the rough distances computed by FastScan using the exact distance specified by the third argument of the index factory.
+When retrieving the k nearest neighbors, k*k_factor points are recalculated.
+
+# Techniques for embedding
+## alpha query expansion
+Query expansion is a technique used in search systems: in full-text search, for example, a few words are added to the entered query to improve accuracy. Several methods have also been proposed for vector search, among which α-query expansion is known as a highly effective method that requires no additional training. It is introduced in papers such as [Attention-Based Query Expansion Learning](https://arxiv.org/abs/2007.08019) and was used in the [2nd place solution of the kaggle shopee competition](https://www.kaggle.com/code/lyakaap/2nd-place-solution/notebook).
+ +α-query expansion can be done by summing a vector with neighboring vectors with weights raised to the power of similarity. How to paste the code example. Replace big_npy with α query expansion. + +```python +alpha = 3. +index = faiss.index_factory(256, "IVF512,PQ128x4fs,RFlat") +original_norm = np.maximum(np.linalg.norm(big_npy, ord=2, axis=1, keepdims=True), 1e-9) +big_npy /= original_norm +index.train(big_npy) +index.add(big_npy) +dist, neighbor = index.search(big_npy, num_expand) + +expand_arrays = [] +ixs = np.arange(big_npy.shape[0]) +for i in range(-(-big_npy.shape[0]//batch_size)): + ix = ixs[i*batch_size:(i+1)*batch_size] + weight = np.power(np.einsum("nd,nmd->nm", big_npy[ix], big_npy[neighbor[ix]]), alpha) + expand_arrays.append(np.sum(big_npy[neighbor[ix]] * np.expand_dims(weight, axis=2),axis=1)) +big_npy = np.concatenate(expand_arrays, axis=0) + +# normalize index version +big_npy = big_npy / np.maximum(np.linalg.norm(big_npy, ord=2, axis=1, keepdims=True), 1e-9) +``` + +This is a technique that can be applied both to the query that does the search and to the DB being searched. + +## Compress embedding with MiniBatch KMeans +If total_fea.npy is too large, it is possible to shrink the vector using KMeans. +Compression of embedding is possible with the following code. Specify the size you want to compress for n_clusters, and specify 256 * number of CPU cores for batch_size to fully benefit from CPU parallelization. + +```python +import multiprocessing +from sklearn.cluster import MiniBatchKMeans +kmeans = MiniBatchKMeans(n_clusters=10000, batch_size=256 * multiprocessing.cpu_count(), init="random") +kmeans.fit(big_npy) +sample_npy = kmeans.cluster_centers_ +``` \ No newline at end of file diff --git a/docs/faiss_tips_ja.md b/docs/faiss_tips_ja.md new file mode 100644 index 0000000..e838494 --- /dev/null +++ b/docs/faiss_tips_ja.md @@ -0,0 +1,146 @@ +faiss tuning TIPS +================== +# about faiss +faissはfacebook researchの開発する、密なベクトルに対する近傍探索をまとめたライブラリで、多くの近似近傍探索の手法を効率的に実装しています。 +近似近傍探索はある程度精度を犠牲にしながら高速に類似するベクトルを探します。 + +## faiss in RVC +RVCではHuBERTで変換した特徴量のEmbeddingに対し、学習データから生成されたEmbeddingと類似するものを検索し、混ぜることでより元の音声に近い変換を実現しています。ただ、この検索は愚直に行うと時間がかかるため、近似近傍探索を用いることで高速な変換を実現しています。 + +# 実装のoverview +モデルが配置されている '/logs/your-experiment/3_feature256'には各音声データからHuBERTで抽出された特徴量が配置されています。 +ここからnpyファイルをファイル名でソートした順番で読み込み、ベクトルを連結してbig_npyを作成します。(このベクトルのshapeは[N, 256]です。) +big_npyを/logs/your-experiment/total_fea.npyとして保存した後、faissを学習させます。 + +2023/04/18時点ではfaissのindex factoryの機能を用いて、L2距離に基づくIVFを用いています。 +IVFの分割数(n_ivf)はN//39で、n_probeはint(np.power(n_ivf, 0.3))が採用されています。(infer-web.pyのtrain_index周りを探してください。) + +本Tipsではまずこれらのパラメータの意味を解説し、その後よりよいindexを作成するための開発者向けアドバイスを書きます。 + +# 手法の解説 +## index factory +index factoryは複数の近似近傍探索の手法を繋げるパイプラインをstringで表記するfaiss独自の記法です。 +これにより、index factoryの文字列を変更するだけで様々な近似近傍探索の手法を試せます。 +RVCでは以下のように使われています。 + +```python +index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf) +``` +index_factoryの引数のうち、1つ目はベクトルの次元数、2つ目はindex factoryの文字列で、3つ目には用いる距離を指定することができます。 + +より詳細な記法については +https://github.com/facebookresearch/faiss/wiki/The-index-factory + +## 距離指標 +embeddingの類似度として用いられる代表的な指標として以下の二つがあります。 + +- ユークリッド距離(METRIC_L2) +- 内積(METRIC_INNER_PRODUCT) + +ユークリッド距離では各次元において二乗の差をとり、全次元の差を足してから平方根をとります。これは日常的に用いる2次元、3次元での距離と同じです。 +内積はこのままでは類似度の指標として用いず、一般的にはL2ノルムで正規化してから内積をとるコサイン類似度を用います。 + +どちらがよいかは場合によりますが、word2vec等で得られるembeddingやArcFace等で学習した類似画像検索のモデルではコサイン類似度が用いられることが多いです。ベクトルXに対してl2正規化をnumpyで行う場合は、0 divisionを避けるために十分に小さな値をepsとして以下のコードで可能です。 + +```python +X_normed = X / 
np.maximum(eps, np.linalg.norm(X, ord=2, axis=-1, keepdims=True)) +``` + +また、index factoryには第3引数に渡す値を選ぶことで計算に用いる距離指標を変更できます。 + +```python +index = faiss.index_factory(dimention, text, faiss.METRIC_INNER_PRODUCT) +``` + +## IVF +IVF(Inverted file indexes)は全文検索における転置インデックスと似たようなアルゴリズムです。 +学習時には検索対象に対してkmeansでクラスタリングを行い、クラスタ中心を用いてボロノイ分割を行います。各データ点には一つずつクラスタが割り当てられるので、クラスタからデータ点を逆引きする辞書を作成します。 + +例えば以下のようにクラスタが割り当てられた場合 +|index|クラスタ| +|-----|-------| +|1|A| +|2|B| +|3|A| +|4|C| +|5|B| + +作成される転置インデックスは以下のようになります。 + +|クラスタ|index| +|-------|-----| +|A|1, 3| +|B|2, 5| +|C|4| + +検索時にはまずクラスタからn_probe個のクラスタを検索し、次にそれぞれのクラスタに属するデータ点について距離を計算します。 + +# 推奨されるパラメータ +indexの選び方については公式にガイドラインがあるので、それに準じて説明します。 +https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + +1M以下のデータセットにおいては4bit-PQが2023年4月時点ではfaissで利用できる最も効率的な手法です。 +これをIVFと組み合わせ、4bit-PQで候補を絞り、最後に正確な指標で距離を再計算するには以下のindex factoryを用いることで記載できます。 + +```python +index = faiss.index_factory(256, "IVF1024,PQ128x4fs,RFlat") +``` + +## IVFの推奨パラメータ +IVFの数が多すぎる場合、たとえばデータ数の数だけIVFによる粗量子化を行うと、これは愚直な全探索と同じになり効率が悪いです。 +1M以下の場合ではIVFの値はデータ点の数Nに対して4*sqrt(N) ~ 16*sqrt(N)に推奨しています。 + +n_probeはn_probeの数に比例して計算時間が増えるので、精度と相談して適切に選んでください。個人的にはRVCにおいてそこまで精度は必要ないと思うのでn_probe = 1で良いと思います。 + +## FastScan +FastScanは直積量子化で大まかに距離を近似するのを、レジスタ内で行うことにより高速に行うようにした手法です。 +直積量子化は学習時にd次元ごと(通常はd=2)に独立してクラスタリングを行い、クラスタ同士の距離を事前計算してlookup tableを作成します。予測時はlookup tableを見ることで各次元の距離をO(1)で計算できます。 +そのため、PQの次に指定する数字は通常ベクトルの半分の次元を指定します。 + +FastScanに関するより詳細な説明は公式のドキュメントを参照してください。 +https://github.com/facebookresearch/faiss/wiki/Fast-accumulation-of-PQ-and-AQ-codes-(FastScan) + +## RFlat +RFlatはFastScanで計算した大まかな距離を、index factoryの第三引数で指定した正確な距離で再計算する指示です。 +k個の近傍を取得する際は、k*k_factor個の点について再計算が行われます。 + +# Embeddingに関するテクニック +## alpha query expansion +クエリ拡張は検索で使われるテクニックで、例えば全文検索では入力された検索文に単語を幾つか追加することで検索精度を上げることがあります。ベクトル検索にもいくつか提唱されていて、その内追加の学習がいらず効果が高い手法としてα-query expansionが知られています。論文では[Attention-Based Query Expansion Learning](https://arxiv.org/abs/2007.08019)などで紹介されていて、[kaggleのshopeeコンペの2位の解法](https://www.kaggle.com/code/lyakaap/2nd-place-solution/notebook)にも用いられていました。 + +α-query expansionはあるベクトルに対し、近傍のベクトルを類似度のα乗した重みで足し合わせることでできます。いかにコードの例を張ります。big_npyをα query expansionしたものに置き換えます。 + +```python +alpha = 3. +index = faiss.index_factory(256, "IVF512,PQ128x4fs,RFlat") +original_norm = np.maximum(np.linalg.norm(big_npy, ord=2, axis=1, keepdims=True), 1e-9) +big_npy /= original_norm +index.train(big_npy) +index.add(big_npy) +dist, neighbor = index.search(big_npy, num_expand) + +expand_arrays = [] +ixs = np.arange(big_npy.shape[0]) +for i in range(-(-big_npy.shape[0]//batch_size)): + ix = ixs[i*batch_size:(i+1)*batch_size] + weight = np.power(np.einsum("nd,nmd->nm", big_npy[ix], big_npy[neighbor[ix]]), alpha) + expand_arrays.append(np.sum(big_npy[neighbor[ix]] * np.expand_dims(weight, axis=2),axis=1)) +big_npy = np.concatenate(expand_arrays, axis=0) + +# normalize index version +big_npy = big_npy / np.maximum(np.linalg.norm(big_npy, ord=2, axis=1, keepdims=True), 1e-9) +``` + +これは、検索を行うクエリにも、検索対象のDBにも適応可能なテクニックです。 + +## MiniBatch KMeansによるembeddingの圧縮 +total_fea.npyが大きすぎる場合、KMeansを用いてベクトルを小さくすることが可能です。 +以下のコードで、embeddingの圧縮が可能です。n_clustersは圧縮したい大きさを指定し、batch_sizeは256 * CPUのコア数を指定することでCPUの並列化の恩恵を十分に得ることができます。 + +```python +import multiprocessing +from sklearn.cluster import MiniBatchKMeans +kmeans = MiniBatchKMeans(n_clusters=10000, batch_size=256 * multiprocessing.cpu_count(), init="random") +kmeans.fit(big_npy) +sample_npy = kmeans.cluster_centers_ +``` \ No newline at end of file
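
To tie the compression tip back to how the index is actually built, the following is a minimal sketch, in the spirit of the tips above rather than the exact RVC training code, of training and saving a faiss index on the feature matrix. The "IVF%s,Flat" factory string, the n_ivf = N // 39 rule, and n_probe = int(np.power(n_ivf, 0.3)) are the values quoted earlier in these tips; total_fea.npy is the file name mentioned there, while trained.index is only an illustrative output name. If the MiniBatch KMeans compression is applied first, big_npy can simply be replaced by kmeans.cluster_centers_.

```python
import faiss
import numpy as np

# Load the concatenated HuBERT features described above; shape [N, 256].
big_npy = np.load("total_fea.npy").astype(np.float32)  # faiss expects float32

# IVF cell count and probe count following the formulas quoted in the tips above.
n_ivf = max(big_npy.shape[0] // 39, 1)
index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf)
faiss.extract_index_ivf(index).nprobe = int(np.power(n_ivf, 0.3))

index.train(big_npy)   # learn the coarse quantizer (kmeans over the data)
index.add(big_npy)     # fill the inverted lists with the actual vectors
faiss.write_index(index, "trained.index")  # illustrative output path

# Example search: the 8 training embeddings closest to the first frame.
distances, ids = index.search(big_npy[:1], 8)
```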