chore(sync): merge dev into main (#1379)

* Optimize latency (#1259)

* add attribute:   configs/config.py
	Optimize latency:   tools/rvc_for_realtime.py

* new file:   assets/Synthesizer_inputs.pth

* fix:   configs/config.py
	fix:   tools/rvc_for_realtime.py

* fix bug:   infer/lib/infer_pack/models.py

* new file:   assets/hubert_inputs.pth
	new file:   assets/rmvpe_inputs.pth
	modified:   configs/config.py
	new features:   infer/lib/rmvpe.py
	new features:   tools/jit_export/__init__.py
	new features:   tools/jit_export/get_hubert.py
	new features:   tools/jit_export/get_rmvpe.py
	new features:   tools/jit_export/get_synthesizer.py
	optimize:   tools/rvc_for_realtime.py

* optimize:   tools/jit_export/get_synthesizer.py
	fix bug:   tools/jit_export/__init__.py

* Fixed a bug caused by using half on the CPU:   infer/lib/rmvpe.py
	Fixed a bug caused by using half on the CPU:   tools/jit_export/__init__.py
	Fixed CIRCULAR IMPORT:   tools/jit_export/get_rmvpe.py
	Fixed CIRCULAR IMPORT:   tools/jit_export/get_synthesizer.py
	Fixed a bug caused by using half on the CPU:   tools/rvc_for_realtime.py

* Remove useless code:   infer/lib/rmvpe.py

* Delete gui_v1 copy.py

* Delete .vscode/launch.json

* Delete jit_export_test.py

* Delete tools/rvc_for_realtime copy.py

* Delete configs/config.json

* Delete .gitignore

* Fix exceptions caused by switching inference devices:   infer/lib/rmvpe.py
	Fix exceptions caused by switching inference devices:   tools/jit_export/__init__.py
	Fix exceptions caused by switching inference devices:   tools/rvc_for_realtime.py

* restore

* replace(you can undo this commit)

* remove debug_print

---------

Co-authored-by: Ftps <ftpsflandre@gmail.com>

* Fixed some bugs when exporting ONNX model (#1254)

* fix import (#1280)

* fix import

* lint

* 🎨 同步 locale (#1242)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* Fix jit load and import issue (#1282)

* fix jit model loading :   infer/lib/rmvpe.py

* modified:   assets/hubert/.gitignore
	move file:    assets/hubert_inputs.pth -> assets/hubert/hubert_inputs.pth
	modified:   assets/rmvpe/.gitignore
	move file:    assets/rmvpe_inputs.pth -> assets/rmvpe/rmvpe_inputs.pth
	fix import:   gui_v1.py

* feat(workflow): trigger on dev

* feat(workflow): add close-pr on non-dev branch

* Add input wav and delay time monitor for real-time gui (#1293)

* feat(workflow): trigger on dev

* feat(workflow): add close-pr on non-dev branch

* 🎨 同步 locale (#1289)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* feat: edit PR template

* add input wav and delay time monitor

---------

Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>

* Optimize latency using scripted jit (#1291)

* feat(workflow): trigger on dev

* feat(workflow): add close-pr on non-dev branch

* 🎨 同步 locale (#1289)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* feat: edit PR template

* Optimize-latency-using-scripted:   configs/config.py
	Optimize-latency-using-scripted:   infer/lib/infer_pack/attentions.py
	Optimize-latency-using-scripted:   infer/lib/infer_pack/commons.py
	Optimize-latency-using-scripted:   infer/lib/infer_pack/models.py
	Optimize-latency-using-scripted:   infer/lib/infer_pack/modules.py
	Optimize-latency-using-scripted:   infer/lib/jit/__init__.py
	Optimize-latency-using-scripted:   infer/lib/jit/get_hubert.py
	Optimize-latency-using-scripted:   infer/lib/jit/get_rmvpe.py
	Optimize-latency-using-scripted:   infer/lib/jit/get_synthesizer.py
	Optimize-latency-using-scripted:   infer/lib/rmvpe.py
	Optimize-latency-using-scripted:   tools/rvc_for_realtime.py

* modified:   infer/lib/infer_pack/models.py

* fix some bug:   configs/config.py
	fix some bug:   infer/lib/infer_pack/models.py
	fix some bug:   infer/lib/rmvpe.py

* Fixed abnormal reference of logger in multiprocessing:   infer/modules/train/train.py

---------

Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* Format code (#1298)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* 🎨 同步 locale (#1299)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* feat: optimize actions

* feat(workflow): add sync dev

* feat: optimize actions

* feat: optimize actions

* feat: optimize actions

* feat: optimize actions

* feat: add jit options (#1303)

Delete useless code:   infer/lib/jit/get_synthesizer.py
	Optimized code:   tools/rvc_for_realtime.py

* Code refactor + re-design inference ui (#1304)

* Code refactor + re-design inference ui

* Fix tabname

* i18n jp

---------

Co-authored-by: Ftps <ftpsflandre@gmail.com>

* feat: optimize actions

* feat: optimize actions

* Update README & en_US locale file (#1309)

* critical: some bug fixes (#1322)

* JIT acceleration switch does not support hot update

* fix padding bug of rmvpe in torch-directml

* fix padding bug of rmvpe in torch-directml

* Fix STFT under torch_directml (#1330)

* chore(format): run black on dev (#1318)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* chore(i18n): sync locale on dev (#1317)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* feat: allow for tta to be passed to uvr (#1361)

* chore(format): run black on dev (#1373)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* Added script for automatically download all needed models at install (#1366)

* Delete modules.py

* Add files via upload

* Add files via upload

* Add files via upload

* Add files via upload

* chore(i18n): sync locale on dev (#1377)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* chore(format): run black on dev (#1376)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* Update IPEX library (#1362)

* Update IPEX library

* Update ipex index

* chore(format): run black on dev (#1378)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

---------

Co-authored-by: Chengjia Jiang <46401978+ChasonJiang@users.noreply.github.com>
Co-authored-by: Ftps <ftpsflandre@gmail.com>
Co-authored-by: shizuku_nia <102004222+ShizukuNia@users.noreply.github.com>
Co-authored-by: Ftps <63702646+Tps-F@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
Co-authored-by: yxlllc <33565655+yxlllc@users.noreply.github.com>
Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Co-authored-by: Blaise <133521603+blaise-tk@users.noreply.github.com>
Co-authored-by: Rice Cake <gak141808@gmail.com>
Co-authored-by: AWAS666 <33494149+AWAS666@users.noreply.github.com>
Co-authored-by: Dmitry <nda2911@yandex.ru>
Co-authored-by: Disty0 <47277141+Disty0@users.noreply.github.com>
This commit is contained in:
github-actions[bot] 2023-10-06 17:14:33 +08:00 committed by GitHub
parent fe166e7f3d
commit e9dd11bddb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 2014 additions and 1120 deletions

View File

@ -68,12 +68,16 @@ poetry install
你也可以通过 pip 来安装依赖: 你也可以通过 pip 来安装依赖:
```bash ```bash
N卡 N卡
pip install -r requirements.txt
pip install -r requirements.txt
A卡/I卡 A卡/I卡
pip install -r requirements-dml.txt pip install -r requirements-dml.txt
A卡RocmLinux
pip install -r requirements-amd.txt
I卡IPEXLinux
pip install -r requirements-ipex.txt
``` ```
------ ------
@ -122,11 +126,34 @@ https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt
```bash ```bash
python infer-web.py python infer-web.py
``` ```
如果你正在使用Windows 或 macOS你可以直接下载并解压`RVC-beta.7z`,前者可以运行`go-web.bat`以启动WebUI后者则运行命令`sh ./run.sh`以启动WebUI。 如果你正在使用Windows 或 macOS你可以直接下载并解压`RVC-beta.7z`,前者可以运行`go-web.bat`以启动WebUI后者则运行命令`sh ./run.sh`以启动WebUI。
对于需要使用IPEX技术的I卡用户请先在终端执行`source /opt/intel/oneapi/setvars.sh`仅Linux
仓库内还有一份`小白简易教程.doc`以供参考。 仓库内还有一份`小白简易教程.doc`以供参考。
## AMD显卡Rocm相关仅Linux
如果你想基于AMD的Rocm技术在Linux系统上运行RVC请先在[这里](https://rocm.docs.amd.com/en/latest/deploy/linux/os-native/install.html)安装所需的驱动。
若你使用的是Arch Linux可以使用pacman来安装所需驱动
````
pacman -S rocm-hip-sdk rocm-opencl-sdk
````
对于某些型号的显卡你可能需要额外配置如下的环境变量RX6700XT
````
export ROCM_PATH=/opt/rocm
export HSA_OVERRIDE_GFX_VERSION=10.3.0
````
同时确保你的当前用户处于`render`与`video`用户组内:
````
sudo usermod -aG render $USERNAME
sudo usermod -aG video $USERNAME
````
之后运行WebUI
```bash
python infer-web.py
```
## 参考项目 ## 参考项目
+ [ContentVec](https://github.com/auspicious3000/contentvec/) + [ContentVec](https://github.com/auspicious3000/contentvec/)
+ [VITS](https://github.com/jaywalnut310/vits) + [VITS](https://github.com/jaywalnut310/vits)

Binary file not shown.

View File

@ -1,2 +1,3 @@
* *
!.gitignore !.gitignore
!hubert_inputs.pth

Binary file not shown.

View File

@ -1,2 +1,3 @@
* *
!.gitignore !.gitignore
!rmvpe_inputs.pth

Binary file not shown.

View File

@ -1,15 +1 @@
{ {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"}
"pth_path": "assets/weights/kikiV1.pth",
"index_path": "logs/kikiV1.index",
"sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)",
"sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)",
"threhold": -45.0,
"pitch": 12.0,
"index_rate": 0.0,
"rms_mix_rate": 0.0,
"block_time": 0.25,
"crossfade_length": 0.04,
"extra_time": 2.0,
"n_cpu": 6.0,
"f0method": "rmvpe"
}

View File

@ -13,7 +13,7 @@ try:
from infer.modules.ipex import ipex_init from infer.modules.ipex import ipex_init
ipex_init() ipex_init()
except Exception: except Exception: # pylint: disable=broad-exception-caught
pass pass
import logging import logging
@ -44,6 +44,7 @@ class Config:
def __init__(self): def __init__(self):
self.device = "cuda:0" self.device = "cuda:0"
self.is_half = True self.is_half = True
self.use_jit = False
self.n_cpu = 0 self.n_cpu = 0
self.gpu_name = None self.gpu_name = None
self.json_config = self.load_config_json() self.json_config = self.load_config_json()
@ -122,6 +123,15 @@ class Config:
def use_fp32_config(self): def use_fp32_config(self):
for config_file in version_config_list: for config_file in version_config_list:
self.json_config[config_file]["train"]["fp16_run"] = False self.json_config[config_file]["train"]["fp16_run"] = False
with open(f"configs/{config_file}", "r") as f:
strr = f.read().replace("true", "false")
with open(f"configs/{config_file}", "w") as f:
f.write(strr)
with open("infer/modules/train/preprocess.py", "r") as f:
strr = f.read().replace("3.7", "3.0")
with open("infer/modules/train/preprocess.py", "w") as f:
f.write(strr)
print("overwrite preprocess and configs.json")
def device_config(self) -> tuple: def device_config(self) -> tuple:
if torch.cuda.is_available(): if torch.cuda.is_available():
@ -237,4 +247,5 @@ class Config:
) )
except: except:
pass pass
print("is_half:%s, device:%s" % (self.is_half, self.device))
return x_pad, x_query, x_center, x_max return x_pad, x_query, x_center, x_max

View File

@ -97,7 +97,12 @@ sh ./run.sh
## Preparation of other Pre-models ## Preparation of other Pre-models
RVC requires other pre-models to infer and train. RVC requires other pre-models to infer and train.
You need to download them from our [Huggingface space](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/). ```bash
#Download all needed models from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/
python tools/download_models.py
```
Or just download them by yourself from our [Huggingface space](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/).
Here's a list of Pre-models and other files that RVC needs: Here's a list of Pre-models and other files that RVC needs:
```bash ```bash

209
gui_v1.py
View File

@ -1,5 +1,4 @@
import os import os
import logging
import sys import sys
from dotenv import load_dotenv from dotenv import load_dotenv
@ -13,10 +12,16 @@ now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
import multiprocessing import multiprocessing
logger = logging.getLogger(__name__)
stream_latency = -1 stream_latency = -1
def printt(strr, *args):
if len(args) == 0:
print(strr)
else:
print(strr % args)
class Harvest(multiprocessing.Process): class Harvest(multiprocessing.Process):
def __init__(self, inp_q, opt_q): def __init__(self, inp_q, opt_q):
multiprocessing.Process.__init__(self) multiprocessing.Process.__init__(self)
@ -62,9 +67,11 @@ if __name__ == "__main__":
import tools.rvc_for_realtime as rvc_for_realtime import tools.rvc_for_realtime as rvc_for_realtime
from i18n.i18n import I18nAuto from i18n.i18n import I18nAuto
from configs.config import Config
i18n = I18nAuto() i18n = I18nAuto()
device = rvc_for_realtime.config.device
# device = rvc_for_realtime.config.device
# device = torch.device( # device = torch.device(
# "cuda" # "cuda"
# if torch.cuda.is_available() # if torch.cuda.is_available()
@ -86,8 +93,8 @@ if __name__ == "__main__":
self.block_time: float = 1.0 # s self.block_time: float = 1.0 # s
self.buffer_num: int = 1 self.buffer_num: int = 1
self.threhold: int = -60 self.threhold: int = -60
self.crossfade_time: float = 0.04 self.crossfade_time: float = 0.05
self.extra_time: float = 2.0 self.extra_time: float = 2.5
self.I_noise_reduce = False self.I_noise_reduce = False
self.O_noise_reduce = False self.O_noise_reduce = False
self.rms_mix_rate = 0.0 self.rms_mix_rate = 0.0
@ -99,7 +106,8 @@ if __name__ == "__main__":
class GUI: class GUI:
def __init__(self) -> None: def __init__(self) -> None:
self.config = GUIConfig() self.gui_config = GUIConfig()
self.config = Config()
self.flag_vc = False self.flag_vc = False
self.function = "vc" self.function = "vc"
self.delay_time = 0 self.delay_time = 0
@ -130,9 +138,10 @@ if __name__ == "__main__":
"index_rate": "0", "index_rate": "0",
"rms_mix_rate": "0", "rms_mix_rate": "0",
"block_time": "0.25", "block_time": "0.25",
"crossfade_length": "0.04", "crossfade_length": "0.05",
"extra_time": "2", "extra_time": "2.5",
"f0method": "rmvpe", "f0method": "rmvpe",
"use_jit": False,
} }
data["pm"] = data["f0method"] == "pm" data["pm"] = data["f0method"] == "pm"
data["harvest"] = data["f0method"] == "harvest" data["harvest"] = data["f0method"] == "harvest"
@ -142,6 +151,7 @@ if __name__ == "__main__":
def launcher(self): def launcher(self):
data = self.load() data = self.load()
self.config.use_jit = False # data.get("use_jit", self.config.use_jit)
sg.theme("LightBlue3") sg.theme("LightBlue3")
input_devices, output_devices, _, _ = self.get_devices() input_devices, output_devices, _, _ = self.get_devices()
layout = [ layout = [
@ -294,6 +304,17 @@ if __name__ == "__main__":
enable_events=True, enable_events=True,
), ),
], ],
# [
# sg.Text("设备延迟"),
# sg.Slider(
# range=(0, 1),
# key="device_latency",
# resolution=0.001,
# orientation="h",
# default_value=data.get("device_latency", "0.1"),
# enable_events=True,
# ),
# ],
[ [
sg.Text(i18n("harvest进程数")), sg.Text(i18n("harvest进程数")),
sg.Slider( sg.Slider(
@ -302,7 +323,7 @@ if __name__ == "__main__":
resolution=1, resolution=1,
orientation="h", orientation="h",
default_value=data.get( default_value=data.get(
"n_cpu", min(self.config.n_cpu, n_cpu) "n_cpu", min(self.gui_config.n_cpu, n_cpu)
), ),
enable_events=True, enable_events=True,
), ),
@ -314,7 +335,7 @@ if __name__ == "__main__":
key="crossfade_length", key="crossfade_length",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("crossfade_length", "0.04"), default_value=data.get("crossfade_length", "0.05"),
enable_events=True, enable_events=True,
), ),
], ],
@ -325,7 +346,7 @@ if __name__ == "__main__":
key="extra_time", key="extra_time",
resolution=0.01, resolution=0.01,
orientation="h", orientation="h",
default_value=data.get("extra_time", "2.0"), default_value=data.get("extra_time", "2.5"),
enable_events=True, enable_events=True,
), ),
], ],
@ -340,7 +361,14 @@ if __name__ == "__main__":
key="O_noise_reduce", key="O_noise_reduce",
enable_events=True, enable_events=True,
), ),
# sg.Checkbox(
# "JIT加速",
# default=self.config.use_jit,
# key="use_jit",
# enable_events=False,
# ),
], ],
# [sg.Text("注首次使用JIT加速时会出现卡顿\n 并伴随一些噪音,但这是正常现象!")],
], ],
title=i18n("性能设置"), title=i18n("性能设置"),
), ),
@ -382,24 +410,24 @@ if __name__ == "__main__":
prev_output = self.window["sg_output_device"].get() prev_output = self.window["sg_output_device"].get()
input_devices, output_devices, _, _ = self.get_devices(update=True) input_devices, output_devices, _, _ = self.get_devices(update=True)
if prev_input not in input_devices: if prev_input not in input_devices:
self.config.sg_input_device = input_devices[0] self.gui_config.sg_input_device = input_devices[0]
else: else:
self.config.sg_input_device = prev_input self.gui_config.sg_input_device = prev_input
self.window["sg_input_device"].Update(values=input_devices) self.window["sg_input_device"].Update(values=input_devices)
self.window["sg_input_device"].Update( self.window["sg_input_device"].Update(
value=self.config.sg_input_device value=self.gui_config.sg_input_device
) )
if prev_output not in output_devices: if prev_output not in output_devices:
self.config.sg_output_device = output_devices[0] self.gui_config.sg_output_device = output_devices[0]
else: else:
self.config.sg_output_device = prev_output self.gui_config.sg_output_device = prev_output
self.window["sg_output_device"].Update(values=output_devices) self.window["sg_output_device"].Update(values=output_devices)
self.window["sg_output_device"].Update( self.window["sg_output_device"].Update(
value=self.config.sg_output_device value=self.gui_config.sg_output_device
) )
if event == "start_vc" and self.flag_vc == False: if event == "start_vc" and self.flag_vc == False:
if self.set_values(values) == True: if self.set_values(values) == True:
logger.info("cuda_is_available: %s", torch.cuda.is_available()) printt("cuda_is_available: %s", torch.cuda.is_available())
self.start_vc() self.start_vc()
settings = { settings = {
"pth_path": values["pth_path"], "pth_path": values["pth_path"],
@ -410,10 +438,13 @@ if __name__ == "__main__":
"pitch": values["pitch"], "pitch": values["pitch"],
"rms_mix_rate": values["rms_mix_rate"], "rms_mix_rate": values["rms_mix_rate"],
"index_rate": values["index_rate"], "index_rate": values["index_rate"],
# "device_latency": values["device_latency"],
"block_time": values["block_time"], "block_time": values["block_time"],
"crossfade_length": values["crossfade_length"], "crossfade_length": values["crossfade_length"],
"extra_time": values["extra_time"], "extra_time": values["extra_time"],
"n_cpu": values["n_cpu"], "n_cpu": values["n_cpu"],
# "use_jit": values["use_jit"],
"use_jit": False,
"f0method": ["pm", "harvest", "crepe", "rmvpe"][ "f0method": ["pm", "harvest", "crepe", "rmvpe"][
[ [
values["pm"], values["pm"],
@ -442,28 +473,28 @@ if __name__ == "__main__":
stream_latency = -1 stream_latency = -1
# Parameter hot update # Parameter hot update
if event == "threhold": if event == "threhold":
self.config.threhold = values["threhold"] self.gui_config.threhold = values["threhold"]
elif event == "pitch": elif event == "pitch":
self.config.pitch = values["pitch"] self.gui_config.pitch = values["pitch"]
if hasattr(self, "rvc"): if hasattr(self, "rvc"):
self.rvc.change_key(values["pitch"]) self.rvc.change_key(values["pitch"])
elif event == "index_rate": elif event == "index_rate":
self.config.index_rate = values["index_rate"] self.gui_config.index_rate = values["index_rate"]
if hasattr(self, "rvc"): if hasattr(self, "rvc"):
self.rvc.change_index_rate(values["index_rate"]) self.rvc.change_index_rate(values["index_rate"])
elif event == "rms_mix_rate": elif event == "rms_mix_rate":
self.config.rms_mix_rate = values["rms_mix_rate"] self.gui_config.rms_mix_rate = values["rms_mix_rate"]
elif event in ["pm", "harvest", "crepe", "rmvpe"]: elif event in ["pm", "harvest", "crepe", "rmvpe"]:
self.config.f0method = event self.gui_config.f0method = event
elif event == "I_noise_reduce": elif event == "I_noise_reduce":
self.config.I_noise_reduce = values["I_noise_reduce"] self.gui_config.I_noise_reduce = values["I_noise_reduce"]
if stream_latency > 0: if stream_latency > 0:
self.delay_time += ( self.delay_time += (
1 if values["I_noise_reduce"] else -1 1 if values["I_noise_reduce"] else -1
) * values["crossfade_length"] ) * values["crossfade_length"]
self.window["delay_time"].update(int(self.delay_time * 1000)) self.window["delay_time"].update(int(self.delay_time * 1000))
elif event == "O_noise_reduce": elif event == "O_noise_reduce":
self.config.O_noise_reduce = values["O_noise_reduce"] self.gui_config.O_noise_reduce = values["O_noise_reduce"]
elif event in ["vc", "im"]: elif event in ["vc", "im"]:
self.function = event self.function = event
elif event != "start_vc" and self.flag_vc == True: elif event != "start_vc" and self.flag_vc == True:
@ -486,19 +517,21 @@ if __name__ == "__main__":
sg.popup(i18n("index文件路径不可包含中文")) sg.popup(i18n("index文件路径不可包含中文"))
return False return False
self.set_devices(values["sg_input_device"], values["sg_output_device"]) self.set_devices(values["sg_input_device"], values["sg_output_device"])
self.config.pth_path = values["pth_path"] self.config.use_jit = False # values["use_jit"]
self.config.index_path = values["index_path"] # self.device_latency = values["device_latency"]
self.config.threhold = values["threhold"] self.gui_config.pth_path = values["pth_path"]
self.config.pitch = values["pitch"] self.gui_config.index_path = values["index_path"]
self.config.block_time = values["block_time"] self.gui_config.threhold = values["threhold"]
self.config.crossfade_time = values["crossfade_length"] self.gui_config.pitch = values["pitch"]
self.config.extra_time = values["extra_time"] self.gui_config.block_time = values["block_time"]
self.config.I_noise_reduce = values["I_noise_reduce"] self.gui_config.crossfade_time = values["crossfade_length"]
self.config.O_noise_reduce = values["O_noise_reduce"] self.gui_config.extra_time = values["extra_time"]
self.config.rms_mix_rate = values["rms_mix_rate"] self.gui_config.I_noise_reduce = values["I_noise_reduce"]
self.config.index_rate = values["index_rate"] self.gui_config.O_noise_reduce = values["O_noise_reduce"]
self.config.n_cpu = values["n_cpu"] self.gui_config.rms_mix_rate = values["rms_mix_rate"]
self.config.f0method = ["pm", "harvest", "crepe", "rmvpe"][ self.gui_config.index_rate = values["index_rate"]
self.gui_config.n_cpu = values["n_cpu"]
self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe"][
[ [
values["pm"], values["pm"],
values["harvest"], values["harvest"],
@ -512,34 +545,48 @@ if __name__ == "__main__":
torch.cuda.empty_cache() torch.cuda.empty_cache()
self.flag_vc = True self.flag_vc = True
self.rvc = rvc_for_realtime.RVC( self.rvc = rvc_for_realtime.RVC(
self.config.pitch, self.gui_config.pitch,
self.config.pth_path, self.gui_config.pth_path,
self.config.index_path, self.gui_config.index_path,
self.config.index_rate, self.gui_config.index_rate,
self.config.n_cpu, self.gui_config.n_cpu,
inp_q, inp_q,
opt_q, opt_q,
device, self.config,
self.rvc if hasattr(self, "rvc") else None, self.rvc if hasattr(self, "rvc") else None,
) )
self.config.samplerate = self.rvc.tgt_sr self.gui_config.samplerate = self.rvc.tgt_sr
self.zc = self.rvc.tgt_sr // 100 self.zc = self.rvc.tgt_sr // 100
self.block_frame = ( self.block_frame = (
int(np.round(self.config.block_time * self.config.samplerate / self.zc)) int(
np.round(
self.gui_config.block_time
* self.gui_config.samplerate
/ self.zc
)
)
* self.zc * self.zc
) )
self.block_frame_16k = 160 * self.block_frame // self.zc self.block_frame_16k = 160 * self.block_frame // self.zc
self.crossfade_frame = ( self.crossfade_frame = (
int( int(
np.round( np.round(
self.config.crossfade_time * self.config.samplerate / self.zc self.gui_config.crossfade_time
* self.gui_config.samplerate
/ self.zc
) )
) )
* self.zc * self.zc
) )
self.sola_search_frame = self.zc self.sola_search_frame = self.zc
self.extra_frame = ( self.extra_frame = (
int(np.round(self.config.extra_time * self.config.samplerate / self.zc)) int(
np.round(
self.gui_config.extra_time
* self.gui_config.samplerate
/ self.zc
)
)
* self.zc * self.zc
) )
self.input_wav: torch.Tensor = torch.zeros( self.input_wav: torch.Tensor = torch.zeros(
@ -547,12 +594,12 @@ if __name__ == "__main__":
+ self.crossfade_frame + self.crossfade_frame
+ self.sola_search_frame + self.sola_search_frame
+ self.block_frame, + self.block_frame,
device=device, device=self.config.device,
dtype=torch.float32, dtype=torch.float32,
) )
self.input_wav_res: torch.Tensor = torch.zeros( self.input_wav_res: torch.Tensor = torch.zeros(
160 * self.input_wav.shape[0] // self.zc, 160 * self.input_wav.shape[0] // self.zc,
device=device, device=self.config.device,
dtype=torch.float32, dtype=torch.float32,
) )
self.pitch: np.ndarray = np.zeros( self.pitch: np.ndarray = np.zeros(
@ -564,12 +611,12 @@ if __name__ == "__main__":
dtype="float64", dtype="float64",
) )
self.sola_buffer: torch.Tensor = torch.zeros( self.sola_buffer: torch.Tensor = torch.zeros(
self.crossfade_frame, device=device, dtype=torch.float32 self.crossfade_frame, device=self.config.device, dtype=torch.float32
) )
self.nr_buffer: torch.Tensor = self.sola_buffer.clone() self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
self.output_buffer: torch.Tensor = self.input_wav.clone() self.output_buffer: torch.Tensor = self.input_wav.clone()
self.res_buffer: torch.Tensor = torch.zeros( self.res_buffer: torch.Tensor = torch.zeros(
2 * self.zc, device=device, dtype=torch.float32 2 * self.zc, device=self.config.device, dtype=torch.float32
) )
self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0]
self.fade_in_window: torch.Tensor = ( self.fade_in_window: torch.Tensor = (
@ -580,7 +627,7 @@ if __name__ == "__main__":
0.0, 0.0,
1.0, 1.0,
steps=self.crossfade_frame, steps=self.crossfade_frame,
device=device, device=self.config.device,
dtype=torch.float32, dtype=torch.float32,
) )
) )
@ -588,11 +635,13 @@ if __name__ == "__main__":
) )
self.fade_out_window: torch.Tensor = 1 - self.fade_in_window self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
self.resampler = tat.Resample( self.resampler = tat.Resample(
orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32 orig_freq=self.gui_config.samplerate,
).to(device) new_freq=16000,
dtype=torch.float32,
).to(self.config.device)
self.tg = TorchGate( self.tg = TorchGate(
sr=self.config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
).to(device) ).to(self.config.device)
thread_vc = threading.Thread(target=self.soundinput) thread_vc = threading.Thread(target=self.soundinput)
thread_vc.start() thread_vc.start()
@ -605,15 +654,15 @@ if __name__ == "__main__":
channels=channels, channels=channels,
callback=self.audio_callback, callback=self.audio_callback,
blocksize=self.block_frame, blocksize=self.block_frame,
samplerate=self.config.samplerate, samplerate=self.gui_config.samplerate,
dtype="float32", dtype="float32",
) as stream: ) as stream:
global stream_latency global stream_latency
stream_latency = stream.latency[-1] stream_latency = stream.latency[-1]
while self.flag_vc: while self.flag_vc:
time.sleep(self.config.block_time) time.sleep(self.gui_config.block_time)
logger.debug("Audio block passed.") printt("Audio block passed.")
logger.debug("ENDing VC") printt("ENDing VC")
def audio_callback( def audio_callback(
self, indata: np.ndarray, outdata: np.ndarray, frames, times, status self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
@ -623,12 +672,12 @@ if __name__ == "__main__":
""" """
start_time = time.perf_counter() start_time = time.perf_counter()
indata = librosa.to_mono(indata.T) indata = librosa.to_mono(indata.T)
if self.config.threhold > -60: if self.gui_config.threhold > -60:
rms = librosa.feature.rms( rms = librosa.feature.rms(
y=indata, frame_length=4 * self.zc, hop_length=self.zc y=indata, frame_length=4 * self.zc, hop_length=self.zc
) )
db_threhold = ( db_threhold = (
librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
) )
for i in range(db_threhold.shape[0]): for i in range(db_threhold.shape[0]):
if db_threhold[i]: if db_threhold[i]:
@ -636,12 +685,14 @@ if __name__ == "__main__":
self.input_wav[: -self.block_frame] = self.input_wav[ self.input_wav[: -self.block_frame] = self.input_wav[
self.block_frame : self.block_frame :
].clone() ].clone()
self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(device) self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(
self.config.device
)
self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[ self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
self.block_frame_16k : self.block_frame_16k :
].clone() ].clone()
# input noise reduction and resampling # input noise reduction and resampling
if self.config.I_noise_reduce and self.function == "vc": if self.gui_config.I_noise_reduce and self.function == "vc":
input_wav = self.input_wav[ input_wav = self.input_wav[
-self.crossfade_frame - self.block_frame - 2 * self.zc : -self.crossfade_frame - self.block_frame - 2 * self.zc :
] ]
@ -667,7 +718,7 @@ if __name__ == "__main__":
# infer # infer
if self.function == "vc": if self.function == "vc":
f0_extractor_frame = self.block_frame_16k + 800 f0_extractor_frame = self.block_frame_16k + 800
if self.config.f0method == "rmvpe": if self.gui_config.f0method == "rmvpe":
f0_extractor_frame = ( f0_extractor_frame = (
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
) )
@ -678,7 +729,7 @@ if __name__ == "__main__":
self.valid_rate, self.valid_rate,
self.pitch, self.pitch,
self.pitchf, self.pitchf,
self.config.f0method, self.gui_config.f0method,
) )
infer_wav = infer_wav[ infer_wav = infer_wav[
-self.crossfade_frame - self.sola_search_frame - self.block_frame : -self.crossfade_frame - self.sola_search_frame - self.block_frame :
@ -688,8 +739,8 @@ if __name__ == "__main__":
-self.crossfade_frame - self.sola_search_frame - self.block_frame : -self.crossfade_frame - self.sola_search_frame - self.block_frame :
].clone() ].clone()
# output noise reduction # output noise reduction
if (self.config.O_noise_reduce and self.function == "vc") or ( if (self.gui_config.O_noise_reduce and self.function == "vc") or (
self.config.I_noise_reduce and self.function == "im" self.gui_config.I_noise_reduce and self.function == "im"
): ):
self.output_buffer[: -self.block_frame] = self.output_buffer[ self.output_buffer[: -self.block_frame] = self.output_buffer[
self.block_frame : self.block_frame :
@ -699,7 +750,7 @@ if __name__ == "__main__":
infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0) infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
).squeeze(0) ).squeeze(0)
# volume envelop mixing # volume envelop mixing
if self.config.rms_mix_rate < 1 and self.function == "vc": if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
rms1 = librosa.feature.rms( rms1 = librosa.feature.rms(
y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :]
.cpu() .cpu()
@ -707,7 +758,7 @@ if __name__ == "__main__":
frame_length=640, frame_length=640,
hop_length=160, hop_length=160,
) )
rms1 = torch.from_numpy(rms1).to(device) rms1 = torch.from_numpy(rms1).to(self.config.device)
rms1 = F.interpolate( rms1 = F.interpolate(
rms1.unsqueeze(0), rms1.unsqueeze(0),
size=infer_wav.shape[0] + 1, size=infer_wav.shape[0] + 1,
@ -719,7 +770,7 @@ if __name__ == "__main__":
frame_length=4 * self.zc, frame_length=4 * self.zc,
hop_length=self.zc, hop_length=self.zc,
) )
rms2 = torch.from_numpy(rms2).to(device) rms2 = torch.from_numpy(rms2).to(self.config.device)
rms2 = F.interpolate( rms2 = F.interpolate(
rms2.unsqueeze(0), rms2.unsqueeze(0),
size=infer_wav.shape[0] + 1, size=infer_wav.shape[0] + 1,
@ -728,7 +779,7 @@ if __name__ == "__main__":
)[0, 0, :-1] )[0, 0, :-1]
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)
infer_wav *= torch.pow( infer_wav *= torch.pow(
rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate) rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate)
) )
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
conv_input = infer_wav[ conv_input = infer_wav[
@ -738,7 +789,7 @@ if __name__ == "__main__":
cor_den = torch.sqrt( cor_den = torch.sqrt(
F.conv1d( F.conv1d(
conv_input**2, conv_input**2,
torch.ones(1, 1, self.crossfade_frame, device=device), torch.ones(1, 1, self.crossfade_frame, device=self.config.device),
) )
+ 1e-8 + 1e-8
) )
@ -747,7 +798,7 @@ if __name__ == "__main__":
sola_offset = sola_offset.item() sola_offset = sola_offset.item()
else: else:
sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
logger.debug("sola_offset = %d", int(sola_offset)) printt("sola_offset = %d", int(sola_offset))
infer_wav = infer_wav[ infer_wav = infer_wav[
sola_offset : sola_offset + self.block_frame + self.crossfade_frame sola_offset : sola_offset + self.block_frame + self.crossfade_frame
] ]
@ -764,7 +815,7 @@ if __name__ == "__main__":
) )
total_time = time.perf_counter() - start_time total_time = time.perf_counter() - start_time
self.window["infer_time"].update(int(total_time * 1000)) self.window["infer_time"].update(int(total_time * 1000))
logger.info("Infer time: %.2f", total_time) printt("Infer time: %.2f", total_time)
def get_devices(self, update: bool = True): def get_devices(self, update: bool = True):
"""获取设备列表""" """获取设备列表"""
@ -817,9 +868,7 @@ if __name__ == "__main__":
sd.default.device[1] = output_device_indices[ sd.default.device[1] = output_device_indices[
output_devices.index(output_device) output_devices.index(output_device)
] ]
logger.info("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Input device: %s:%s", str(sd.default.device[0]), input_device)
logger.info( printt("Output device: %s:%s", str(sd.default.device[1]), output_device)
"Output device: %s:%s", str(sd.default.device[1]), output_device
)
gui = GUI() gui = GUI()

View File

@ -38,6 +38,7 @@
"加载模型": "Load model", "加载模型": "Load model",
"加载预训练底模D路径": "Load pre-trained base model D path:", "加载预训练底模D路径": "Load pre-trained base model D path:",
"加载预训练底模G路径": "Load pre-trained base model G path:", "加载预训练底模G路径": "Load pre-trained base model G path:",
"单次推理": "单次推理",
"卸载音色省显存": "Unload voice to save GPU memory:", "卸载音色省显存": "Unload voice to save GPU memory:",
"变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):", "变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):",
"后处理重采样至最终采样率0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:", "后处理重采样至最终采样率0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.", "很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.",
"性能设置": "Performance settings", "性能设置": "Performance settings",
"总训练轮数total_epoch": "Total training epochs (total_epoch):", "总训练轮数total_epoch": "Total training epochs (total_epoch):",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').",
"指定输出主人声文件夹": "Specify the output folder for vocals:", "指定输出主人声文件夹": "Specify the output folder for vocals:",
"指定输出文件夹": "Specify output folder:", "指定输出文件夹": "Specify output folder:",
@ -86,7 +88,7 @@
"特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:", "特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:",
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.",
"目标采样率": "Target sample rate:", "目标采样率": "Target sample rate:",
"算法延迟(ms):": "算法延迟(ms):", "算法延迟(ms):": "Algorithmic delays(ms):",
"自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:", "自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:",
"融合": "Fusion", "融合": "Fusion",
"要改的模型信息": "Model information to be modified:", "要改的模型信息": "Model information to be modified:",
@ -96,8 +98,8 @@
"训练特征索引": "Train feature index", "训练特征索引": "Train feature index",
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.",
"请指定说话人id": "Please specify the speaker/singer ID:", "请指定说话人id": "Please specify the speaker/singer ID:",
"请选择index文件": "请选择index文件", "请选择index文件": "Please choose the .index file",
"请选择pth文件": "请选择pth文件", "请选择pth文件": "Please choose the .pth file",
"请选择说话人id": "Select Speaker/Singer ID:", "请选择说话人id": "Select Speaker/Singer ID:",
"转换": "Convert", "转换": "Convert",
"输入实验名": "Enter the experiment name:", "输入实验名": "Enter the experiment name:",
@ -105,12 +107,12 @@
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):", "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):",
"输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):", "输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):",
"输入源音量包络替换输出音量包络融合比例越靠近1越使用输出包络": "Adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume:", "输入源音量包络替换输出音量包络融合比例越靠近1越使用输出包络": "Adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume:",
"输入监听": "输入监听", "输入监听": "Input voice monitor",
"输入训练文件夹路径": "Enter the path of the training folder:", "输入训练文件夹路径": "Enter the path of the training folder:",
"输入设备": "Input device", "输入设备": "Input device",
"输入降噪": "Input noise reduction", "输入降噪": "Input noise reduction",
"输出信息": "Output information", "输出信息": "Output information",
"输出变声": "输出变声", "输出变声": "Output converted voice",
"输出设备": "Output device", "输出设备": "Output device",
"输出降噪": "Output noise reduction", "输出降噪": "Output noise reduction",
"输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)", "输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)",

View File

@ -38,6 +38,7 @@
"加载模型": "Cargar modelo", "加载模型": "Cargar modelo",
"加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.", "加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.",
"加载预训练底模G路径": "Cargue la ruta del modelo G base pre-entrenada.", "加载预训练底模G路径": "Cargue la ruta del modelo G base pre-entrenada.",
"单次推理": "单次推理",
"卸载音色省显存": "Descargue la voz para ahorrar memoria GPU", "卸载音色省显存": "Descargue la voz para ahorrar memoria GPU",
"变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)", "变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)",
"后处理重采样至最终采样率0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear", "后处理重采样至最终采样率0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento", "很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento",
"性能设置": "Configuración de rendimiento", "性能设置": "Configuración de rendimiento",
"总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)", "总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (opción predeterminada).", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (opción predeterminada).",
"指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal", "指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal",
"指定输出文件夹": "Especificar carpeta de salida", "指定输出文件夹": "Especificar carpeta de salida",

View File

@ -38,6 +38,7 @@
"加载模型": "Charger le modèle.", "加载模型": "Charger le modèle.",
"加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :", "加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :",
"加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :", "加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :",
"单次推理": "单次推理",
"卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.", "卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.",
"变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :", "变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :",
"后处理重采样至最终采样率0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. Réglez sur 0 pour ne pas effectuer de rééchantillonnage :", "后处理重采样至最终采样率0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. Réglez sur 0 pour ne pas effectuer de rééchantillonnage :",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.", "很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.",
"性能设置": "Paramètres de performance", "性能设置": "Paramètres de performance",
"总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :", "总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').",
"指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :", "指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :",
"指定输出文件夹": "Spécifiez le dossier de sortie :", "指定输出文件夹": "Spécifiez le dossier de sortie :",

View File

@ -38,6 +38,7 @@
"加载模型": "Carica modello", "加载模型": "Carica modello",
"加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:", "加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:",
"加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:", "加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:",
"单次推理": "单次推理",
"卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:", "卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:",
"变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):", "变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):",
"后处理重采样至最终采样率0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. ", "后处理重采样至最终采样率0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. ",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.", "很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.",
"性能设置": "Impostazioni delle prestazioni", "性能设置": "Impostazioni delle prestazioni",
"总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):", "总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ",
"指定输出主人声文件夹": "Specifica la cartella di output per le voci:", "指定输出主人声文件夹": "Specifica la cartella di output per le voci:",
"指定输出文件夹": "Specifica la cartella di output:", "指定输出文件夹": "Specifica la cartella di output:",

View File

@ -38,6 +38,7 @@
"加载模型": "モデルをロード", "加载模型": "モデルをロード",
"加载预训练底模D路径": "事前学習済みのDモデルのパス", "加载预训练底模D路径": "事前学習済みのDモデルのパス",
"加载预训练底模G路径": "事前学習済みのGモデルのパス", "加载预训练底模G路径": "事前学習済みのGモデルのパス",
"单次推理": "单次推理",
"卸载音色省显存": "音源を削除してメモリを節約", "卸载音色省显存": "音源を削除してメモリを節約",
"变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)", "变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)",
"后处理重采样至最终采样率0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0", "后处理重采样至最终采样率0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。", "很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。",
"性能设置": "パフォーマンス設定", "性能设置": "パフォーマンス設定",
"总训练轮数total_epoch": "総エポック数", "总训练轮数total_epoch": "総エポック数",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。",
"指定输出主人声文件夹": "マスターの出力音声フォルダーを指定する", "指定输出主人声文件夹": "マスターの出力音声フォルダーを指定する",
"指定输出文件夹": "出力フォルダを指定してください", "指定输出文件夹": "出力フォルダを指定してください",

View File

@ -38,6 +38,7 @@
"加载模型": "Загрузить модель", "加载模型": "Загрузить модель",
"加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:", "加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:",
"加载预训练底模G路径": "Путь к предварительно обученной базовой модели G:", "加载预训练底模G路径": "Путь к предварительно обученной базовой модели G:",
"单次推理": "单次推理",
"卸载音色省显存": "Выгрузить модель из памяти GPU для освобождения ресурсов", "卸载音色省显存": "Выгрузить модель из памяти GPU для освобождения ресурсов",
"变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):", "变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):",
"后处理重采样至最终采样率0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:", "后处理重采样至最终采样率0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нету графического процессора, который поддерживает обучение моделей.", "很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нету графического процессора, который поддерживает обучение моделей.",
"性能设置": "Настройки быстроты", "性能设置": "Настройки быстроты",
"总训练轮数total_epoch": "Полное количество эпох (total_epoch):", "总训练轮数total_epoch": "Полное количество эпох (total_epoch):",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Массовое преобразование. Введите путь к папке, в которой находятся файлы для преобразования голоса или выгрузите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию: 'opt').", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Массовое преобразование. Введите путь к папке, в которой находятся файлы для преобразования голоса или выгрузите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию: 'opt').",
"指定输出主人声文件夹": "Путь к папке для сохранения вокала:", "指定输出主人声文件夹": "Путь к папке для сохранения вокала:",
"指定输出文件夹": "Папка для результатов:", "指定输出文件夹": "Папка для результатов:",

View File

@ -38,6 +38,7 @@
"加载模型": "Model yükle", "加载模型": "Model yükle",
"加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:", "加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:",
"加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:", "加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:",
"单次推理": "单次推理",
"卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır", "卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır",
"变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):", "变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):",
"后处理重采样至最终采样率0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:", "后处理重采样至最终采样率0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.", "很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.",
"性能设置": "Performans ayarları", "性能设置": "Performans ayarları",
"总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):", "总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir",
"指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:", "指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:",
"指定输出文件夹": ıkış klasörünü belirt:", "指定输出文件夹": ıkış klasörünü belirt:",

View File

@ -38,6 +38,7 @@
"加载模型": "加载模型", "加载模型": "加载模型",
"加载预训练底模D路径": "加载预训练底模D路径", "加载预训练底模D路径": "加载预训练底模D路径",
"加载预训练底模G路径": "加载预训练底模G路径", "加载预训练底模G路径": "加载预训练底模G路径",
"单次推理": "单次推理",
"卸载音色省显存": "卸载音色省显存", "卸载音色省显存": "卸载音色省显存",
"变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
"后处理重采样至最终采样率0为不进行重采样": "后处理重采样至最终采样率0为不进行重采样", "后处理重采样至最终采样率0为不进行重采样": "后处理重采样至最终采样率0为不进行重采样",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "性能设置", "性能设置": "性能设置",
"总训练轮数total_epoch": "总训练轮数total_epoch", "总训练轮数total_epoch": "总训练轮数total_epoch",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ",
"指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定输出文件夹", "指定输出文件夹": "指定输出文件夹",

View File

@ -38,6 +38,7 @@
"加载模型": "載入模型", "加载模型": "載入模型",
"加载预训练底模D路径": "加載預訓練底模D路徑", "加载预训练底模D路径": "加載預訓練底模D路徑",
"加载预训练底模G路径": "加載預訓練底模G路徑", "加载预训练底模G路径": "加載預訓練底模G路徑",
"单次推理": "单次推理",
"卸载音色省显存": "卸載音色節省 VRAM", "卸载音色省显存": "卸載音色節省 VRAM",
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"后处理重采样至最终采样率0为不进行重采样": "後處理重採樣至最終採樣率0為不進行重採樣", "后处理重采样至最终采样率0为不进行重采样": "後處理重採樣至最終採樣率0為不進行重採樣",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "效能設定", "性能设置": "效能設定",
"总训练轮数total_epoch": "總訓練輪數total_epoch", "总训练轮数total_epoch": "總訓練輪數total_epoch",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
"指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定輸出資料夾", "指定输出文件夹": "指定輸出資料夾",

View File

@ -38,6 +38,7 @@
"加载模型": "載入模型", "加载模型": "載入模型",
"加载预训练底模D路径": "加載預訓練底模D路徑", "加载预训练底模D路径": "加載預訓練底模D路徑",
"加载预训练底模G路径": "加載預訓練底模G路徑", "加载预训练底模G路径": "加載預訓練底模G路徑",
"单次推理": "单次推理",
"卸载音色省显存": "卸載音色節省 VRAM", "卸载音色省显存": "卸載音色節省 VRAM",
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"后处理重采样至最终采样率0为不进行重采样": "後處理重採樣至最終採樣率0為不進行重採樣", "后处理重采样至最终采样率0为不进行重采样": "後處理重採樣至最終採樣率0為不進行重採樣",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "效能設定", "性能设置": "效能設定",
"总训练轮数total_epoch": "總訓練輪數total_epoch", "总训练轮数total_epoch": "總訓練輪數total_epoch",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
"指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定輸出資料夾", "指定输出文件夹": "指定輸出資料夾",

View File

@ -38,6 +38,7 @@
"加载模型": "載入模型", "加载模型": "載入模型",
"加载预训练底模D路径": "加載預訓練底模D路徑", "加载预训练底模D路径": "加載預訓練底模D路徑",
"加载预训练底模G路径": "加載預訓練底模G路徑", "加载预训练底模G路径": "加載預訓練底模G路徑",
"单次推理": "单次推理",
"卸载音色省显存": "卸載音色節省 VRAM", "卸载音色省显存": "卸載音色節省 VRAM",
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"后处理重采样至最终采样率0为不进行重采样": "後處理重採樣至最終採樣率0為不進行重採樣", "后处理重采样至最终采样率0为不进行重采样": "後處理重採樣至最終採樣率0為不進行重採樣",
@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "效能設定", "性能设置": "效能設定",
"总训练轮数total_epoch": "總訓練輪數total_epoch", "总训练轮数total_epoch": "總訓練輪數total_epoch",
"批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
"指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定輸出資料夾", "指定输出文件夹": "指定輸出資料夾",

View File

@ -1,36 +1,46 @@
import os, sys import os
import sys
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
import logging from infer.modules.vc.modules import VC
import shutil from infer.modules.uvr5.modules import uvr
import threading
import traceback
import warnings
from random import shuffle
from subprocess import Popen
from time import sleep
import json
import pathlib
import fairseq
import faiss
import gradio as gr
import numpy as np
import torch
from dotenv import load_dotenv
from sklearn.cluster import MiniBatchKMeans
from configs.config import Config
from i18n.i18n import I18nAuto
from infer.lib.train.process_ckpt import ( from infer.lib.train.process_ckpt import (
change_info, change_info,
extract_small_model, extract_small_model,
merge, merge,
show_info, show_info,
) )
from infer.modules.uvr5.modules import uvr from i18n.i18n import I18nAuto
from infer.modules.vc.modules import VC from configs.config import Config
from sklearn.cluster import MiniBatchKMeans
from dotenv import load_dotenv
import torch
try:
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
if torch.xpu.is_available():
from infer.modules.ipex import ipex_init
ipex_init()
except Exception: # pylint: disable=broad-exception-caught
pass
import numpy as np
import gradio as gr
import faiss
import fairseq
import pathlib
import json
from time import sleep
from subprocess import Popen
from random import shuffle
import warnings
import traceback
import threading
import shutil
import logging
logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("numba").setLevel(logging.WARNING)
@ -165,10 +175,10 @@ def clean():
return {"value": "", "__type__": "update"} return {"value": "", "__type__": "update"}
def export_onnx(): def export_onnx(ModelPath, ExportedPath):
from infer.modules.onnx.export import export_onnx as eo from infer.modules.onnx.export import export_onnx as eo
eo() eo(ModelPath, ExportedPath)
sr_dict = { sr_dict = {
@ -219,8 +229,9 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
per, per,
) )
logger.info(cmd) logger.info(cmd)
p = Popen(cmd, shell=True) # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 p = Popen(cmd, shell=True)
# 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
done = [False] done = [False]
threading.Thread( threading.Thread(
target=if_done, target=if_done,
@ -263,7 +274,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
p = Popen( p = Popen(
cmd, shell=True, cwd=now_dir cmd, shell=True, cwd=now_dir
) # , stdin=PIPE, stdout=PIPE,stderr=PIPE ) # , stdin=PIPE, stdout=PIPE,stderr=PIPE
###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
done = [False] done = [False]
threading.Thread( threading.Thread(
target=if_done, target=if_done,
@ -295,7 +306,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
cmd, shell=True, cwd=now_dir cmd, shell=True, cwd=now_dir
) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
ps.append(p) ps.append(p)
###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
done = [False] done = [False]
threading.Thread( threading.Thread(
target=if_done_multi, # target=if_done_multi, #
@ -331,7 +342,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
log = f.read() log = f.read()
logger.info(log) logger.info(log)
yield log yield log
####对不同part分别开多进程 # 对不同part分别开多进程
""" """
n_part=int(sys.argv[1]) n_part=int(sys.argv[1])
i_part=int(sys.argv[2]) i_part=int(sys.argv[2])
@ -360,7 +371,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
cmd, shell=True, cwd=now_dir cmd, shell=True, cwd=now_dir
) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
ps.append(p) ps.append(p)
###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
done = [False] done = [False]
threading.Thread( threading.Thread(
target=if_done_multi, target=if_done_multi,
@ -701,11 +712,11 @@ def train1key(
infos.append(strr) infos.append(strr)
return "\n".join(infos) return "\n".join(infos)
####### step1:处理数据 # step1:处理数据
yield get_info_str(i18n("step1:正在处理数据")) yield get_info_str(i18n("step1:正在处理数据"))
[get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)] [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)]
####### step2a:提取音高 # step2a:提取音高
yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) yield get_info_str(i18n("step2:正在提取音高&正在提取特征"))
[ [
get_info_str(_) get_info_str(_)
@ -714,7 +725,7 @@ def train1key(
) )
] ]
####### step3a:训练模型 # step3a:训练模型
yield get_info_str(i18n("step3a:正在训练模型")) yield get_info_str(i18n("step3a:正在训练模型"))
click_train( click_train(
exp_dir1, exp_dir1,
@ -734,7 +745,7 @@ def train1key(
) )
yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")) yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))
####### step3b:训练索引 # step3b:训练索引
[get_info_str(_) for _ in train_index(exp_dir1, version19)] [get_info_str(_) for _ in train_index(exp_dir1, version19)]
yield get_info_str(i18n("全流程结束!")) yield get_info_str(i18n("全流程结束!"))
@ -768,6 +779,7 @@ def change_f0_method(f0method8):
with gr.Blocks(title="RVC WebUI") as app: with gr.Blocks(title="RVC WebUI") as app:
gr.Markdown("## RVC WebUI")
gr.Markdown( gr.Markdown(
value=i18n( value=i18n(
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>." "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>."
@ -777,6 +789,7 @@ with gr.Blocks(title="RVC WebUI") as app:
with gr.TabItem(i18n("模型推理")): with gr.TabItem(i18n("模型推理")):
with gr.Row(): with gr.Row():
sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
with gr.Column():
refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary") refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary")
clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary") clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
spk_item = gr.Slider( spk_item = gr.Slider(
@ -791,10 +804,8 @@ with gr.Blocks(title="RVC WebUI") as app:
clean_button.click( clean_button.click(
fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean" fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
) )
with gr.TabItem(i18n("单次推理")):
with gr.Group(): with gr.Group():
gr.Markdown(
value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
)
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
vc_transform0 = gr.Number( vc_transform0 = gr.Number(
@ -802,7 +813,17 @@ with gr.Blocks(title="RVC WebUI") as app:
) )
input_audio0 = gr.Textbox( input_audio0 = gr.Textbox(
label=i18n("输入待处理音频文件路径(默认是正确格式示例)"), label=i18n("输入待处理音频文件路径(默认是正确格式示例)"),
value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav", placeholder="C:\\Users\\Desktop\\audio_example.wav",
)
file_index1 = gr.Textbox(
label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
placeholder="C:\\Users\\Desktop\\model_example.index",
interactive=True,
)
file_index2 = gr.Dropdown(
label=i18n("自动检测index路径,下拉式选择(dropdown)"),
choices=sorted(index_paths),
interactive=True,
) )
f0method0 = gr.Radio( f0method0 = gr.Radio(
label=i18n( label=i18n(
@ -811,46 +832,10 @@ with gr.Blocks(title="RVC WebUI") as app:
choices=["pm", "harvest", "crepe", "rmvpe"] choices=["pm", "harvest", "crepe", "rmvpe"]
if config.dml == False if config.dml == False
else ["pm", "harvest", "rmvpe"], else ["pm", "harvest", "rmvpe"],
value="pm", value="rmvpe",
interactive=True,
)
filter_radius0 = gr.Slider(
minimum=0,
maximum=7,
label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波数值为滤波半径使用可以削弱哑音"),
value=3,
step=1,
interactive=True,
)
with gr.Column():
file_index1 = gr.Textbox(
label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
value="",
interactive=True,
)
file_index2 = gr.Dropdown(
label=i18n("自动检测index路径,下拉式选择(dropdown)"),
choices=sorted(index_paths),
interactive=True,
)
refresh_button.click(
fn=change_choices,
inputs=[],
outputs=[sid0, file_index2],
api_name="infer_refresh",
)
# file_big_npy1 = gr.Textbox(
# label=i18n("特征文件路径"),
# value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
# interactive=True,
# )
index_rate1 = gr.Slider(
minimum=0,
maximum=1,
label=i18n("检索特征占比"),
value=0.75,
interactive=True, interactive=True,
) )
with gr.Column(): with gr.Column():
resample_sr0 = gr.Slider( resample_sr0 = gr.Slider(
minimum=0, minimum=0,
@ -877,11 +862,46 @@ with gr.Blocks(title="RVC WebUI") as app:
step=0.01, step=0.01,
interactive=True, interactive=True,
) )
f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) filter_radius0 = gr.Slider(
minimum=0,
maximum=7,
label=i18n(
">=3则使用对harvest音高识别的结果使用中值滤波数值为滤波半径使用可以削弱哑音"
),
value=3,
step=1,
interactive=True,
)
index_rate1 = gr.Slider(
minimum=0,
maximum=1,
label=i18n("检索特征占比"),
value=0.75,
interactive=True,
)
f0_file = gr.File(
label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"),
visible=False,
)
refresh_button.click(
fn=change_choices,
inputs=[],
outputs=[sid0, file_index2],
api_name="infer_refresh",
)
# file_big_npy1 = gr.Textbox(
# label=i18n("特征文件路径"),
# value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
# interactive=True,
# )
with gr.Group():
with gr.Column():
but0 = gr.Button(i18n("转换"), variant="primary") but0 = gr.Button(i18n("转换"), variant="primary")
with gr.Row(): with gr.Row():
vc_output1 = gr.Textbox(label=i18n("输出信息")) vc_output1 = gr.Textbox(label=i18n("输出信息"))
vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
but0.click( but0.click(
vc.vc_single, vc.vc_single,
[ [
@ -902,7 +922,7 @@ with gr.Blocks(title="RVC WebUI") as app:
[vc_output1, vc_output2], [vc_output1, vc_output2],
api_name="infer_convert", api_name="infer_convert",
) )
with gr.Group(): with gr.TabItem(i18n("批量推理")):
gr.Markdown( gr.Markdown(
value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ") value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ")
) )
@ -912,25 +932,6 @@ with gr.Blocks(title="RVC WebUI") as app:
label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0 label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
) )
opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt") opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
f0method1 = gr.Radio(
label=i18n(
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
),
choices=["pm", "harvest", "crepe", "rmvpe"]
if config.dml == False
else ["pm", "harvest", "rmvpe"],
value="pm",
interactive=True,
)
filter_radius1 = gr.Slider(
minimum=0,
maximum=7,
label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波数值为滤波半径使用可以削弱哑音"),
value=3,
step=1,
interactive=True,
)
with gr.Column():
file_index3 = gr.Textbox( file_index3 = gr.Textbox(
label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
value="", value="",
@ -941,6 +942,23 @@ with gr.Blocks(title="RVC WebUI") as app:
choices=sorted(index_paths), choices=sorted(index_paths),
interactive=True, interactive=True,
) )
f0method1 = gr.Radio(
label=i18n(
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
),
choices=["pm", "harvest", "crepe", "rmvpe"]
if config.dml == False
else ["pm", "harvest", "rmvpe"],
value="rmvpe",
interactive=True,
)
format1 = gr.Radio(
label=i18n("导出文件格式"),
choices=["wav", "flac", "mp3", "m4a"],
value="wav",
interactive=True,
)
refresh_button.click( refresh_button.click(
fn=lambda: change_choices()[1], fn=lambda: change_choices()[1],
inputs=[], inputs=[],
@ -952,13 +970,7 @@ with gr.Blocks(title="RVC WebUI") as app:
# value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
# interactive=True, # interactive=True,
# ) # )
index_rate2 = gr.Slider(
minimum=0,
maximum=1,
label=i18n("检索特征占比"),
value=1,
interactive=True,
)
with gr.Column(): with gr.Column():
resample_sr1 = gr.Slider( resample_sr1 = gr.Slider(
minimum=0, minimum=0,
@ -985,23 +997,34 @@ with gr.Blocks(title="RVC WebUI") as app:
step=0.01, step=0.01,
interactive=True, interactive=True,
) )
with gr.Column(): filter_radius1 = gr.Slider(
minimum=0,
maximum=7,
label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波数值为滤波半径使用可以削弱哑音"),
value=3,
step=1,
interactive=True,
)
index_rate2 = gr.Slider(
minimum=0,
maximum=1,
label=i18n("检索特征占比"),
value=1,
interactive=True,
)
with gr.Row():
dir_input = gr.Textbox( dir_input = gr.Textbox(
label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"), label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
value="E:\codes\py39\\test-20230416b\\todo-songs", placeholder="C:\\Users\\Desktop\\input_vocal_dir",
) )
inputs = gr.File( inputs = gr.File(
file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
) )
with gr.Row(): with gr.Row():
format1 = gr.Radio(
label=i18n("导出文件格式"),
choices=["wav", "flac", "mp3", "m4a"],
value="flac",
interactive=True,
)
but1 = gr.Button(i18n("转换"), variant="primary") but1 = gr.Button(i18n("转换"), variant="primary")
vc_output3 = gr.Textbox(label=i18n("输出信息")) vc_output3 = gr.Textbox(label=i18n("输出信息"))
but1.click( but1.click(
vc.vc_multi, vc.vc_multi,
[ [
@ -1041,7 +1064,7 @@ with gr.Blocks(title="RVC WebUI") as app:
with gr.Column(): with gr.Column():
dir_wav_input = gr.Textbox( dir_wav_input = gr.Textbox(
label=i18n("输入待处理音频文件夹路径"), label=i18n("输入待处理音频文件夹路径"),
value="E:\\codes\\py39\\test-20230416b\\todo-songs\\todo-songs", placeholder="C:\\Users\\Desktop\\todo-songs",
) )
wav_inputs = gr.File( wav_inputs = gr.File(
file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")

View File

@ -1,5 +1,6 @@
import copy import copy
import math import math
from typing import Optional
import numpy as np import numpy as np
import torch import torch
@ -22,11 +23,11 @@ class Encoder(nn.Module):
window_size=10, window_size=10,
**kwargs **kwargs
): ):
super().__init__() super(Encoder, self).__init__()
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.filter_channels = filter_channels self.filter_channels = filter_channels
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = int(n_layers)
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = p_dropout
self.window_size = window_size self.window_size = window_size
@ -61,14 +62,17 @@ class Encoder(nn.Module):
def forward(self, x, x_mask): def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask x = x * x_mask
for i in range(self.n_layers): zippep = zip(
y = self.attn_layers[i](x, x, attn_mask) self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
)
for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep:
y = attn_layers(x, x, attn_mask)
y = self.drop(y) y = self.drop(y)
x = self.norm_layers_1[i](x + y) x = norm_layers_1(x + y)
y = self.ffn_layers[i](x, x_mask) y = ffn_layers(x, x_mask)
y = self.drop(y) y = self.drop(y)
x = self.norm_layers_2[i](x + y) x = norm_layers_2(x + y)
x = x * x_mask x = x * x_mask
return x return x
@ -86,7 +90,7 @@ class Decoder(nn.Module):
proximal_init=True, proximal_init=True,
**kwargs **kwargs
): ):
super().__init__() super(Decoder, self).__init__()
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.filter_channels = filter_channels self.filter_channels = filter_channels
self.n_heads = n_heads self.n_heads = n_heads
@ -172,7 +176,7 @@ class MultiHeadAttention(nn.Module):
proximal_bias=False, proximal_bias=False,
proximal_init=False, proximal_init=False,
): ):
super().__init__() super(MultiHeadAttention, self).__init__()
assert channels % n_heads == 0 assert channels % n_heads == 0
self.channels = channels self.channels = channels
@ -213,19 +217,28 @@ class MultiHeadAttention(nn.Module):
self.conv_k.weight.copy_(self.conv_q.weight) self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias) self.conv_k.bias.copy_(self.conv_q.bias)
def forward(self, x, c, attn_mask=None): def forward(
self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
):
q = self.conv_q(x) q = self.conv_q(x)
k = self.conv_k(c) k = self.conv_k(c)
v = self.conv_v(c) v = self.conv_v(c)
x, self.attn = self.attention(q, k, v, mask=attn_mask) x, _ = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x) x = self.conv_o(x)
return x return x
def attention(self, query, key, value, mask=None): def attention(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: Optional[torch.Tensor] = None,
):
# reshape [b, d, t] -> [b, n_h, t, d_k] # reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, t_t = (*key.size(), query.size(2)) b, d, t_s = key.size()
t_t = query.size(2)
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
@ -292,16 +305,17 @@ class MultiHeadAttention(nn.Module):
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret return ret
def _get_relative_embeddings(self, relative_embeddings, length): def _get_relative_embeddings(self, relative_embeddings, length: int):
max_relative_position = 2 * self.window_size + 1 max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops. # Pad first before slice to avoid using cond ops.
pad_length = max(length - (self.window_size + 1), 0) pad_length: int = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0) slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1 slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0: if pad_length > 0:
padded_relative_embeddings = F.pad( padded_relative_embeddings = F.pad(
relative_embeddings, relative_embeddings,
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
[0, 0, pad_length, pad_length, 0, 0],
) )
else: else:
padded_relative_embeddings = relative_embeddings padded_relative_embeddings = relative_embeddings
@ -317,12 +331,18 @@ class MultiHeadAttention(nn.Module):
""" """
batch, heads, length, _ = x.size() batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing. # Concat columns of pad to shift from relative to absolute indexing.
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) x = F.pad(
x,
# commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
[0, 1, 0, 0, 0, 0, 0, 0],
)
# Concat extra elements so to add up to shape (len+1, 2*len-1). # Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length]) x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad( x_flat = F.pad(
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) x_flat,
# commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]])
[0, int(length) - 1, 0, 0, 0, 0],
) )
# Reshape and slice out the padded elements. # Reshape and slice out the padded elements.
@ -339,15 +359,21 @@ class MultiHeadAttention(nn.Module):
batch, heads, length, _ = x.size() batch, heads, length, _ = x.size()
# padd along column # padd along column
x = F.pad( x = F.pad(
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) x,
# commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]])
[0, int(length) - 1, 0, 0, 0, 0, 0, 0],
) )
x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))])
# add 0's in the beginning that will skew the elements after reshape # add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) x_flat = F.pad(
x_flat,
# commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]])
[length, 0, 0, 0, 0, 0],
)
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
return x_final return x_final
def _attention_bias_proximal(self, length): def _attention_bias_proximal(self, length: int):
"""Bias for self-attention to encourage attention to close positions. """Bias for self-attention to encourage attention to close positions.
Args: Args:
length: an integer scalar. length: an integer scalar.
@ -367,10 +393,10 @@ class FFN(nn.Module):
filter_channels, filter_channels,
kernel_size, kernel_size,
p_dropout=0.0, p_dropout=0.0,
activation=None, activation: str = None,
causal=False, causal=False,
): ):
super().__init__() super(FFN, self).__init__()
self.in_channels = in_channels self.in_channels = in_channels
self.out_channels = out_channels self.out_channels = out_channels
self.filter_channels = filter_channels self.filter_channels = filter_channels
@ -378,40 +404,56 @@ class FFN(nn.Module):
self.p_dropout = p_dropout self.p_dropout = p_dropout
self.activation = activation self.activation = activation
self.causal = causal self.causal = causal
self.is_activation = True if activation == "gelu" else False
if causal: # if causal:
self.padding = self._causal_padding # self.padding = self._causal_padding
else: # else:
self.padding = self._same_padding # self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout) self.drop = nn.Dropout(p_dropout)
def forward(self, x, x_mask): def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
x = self.conv_1(self.padding(x * x_mask)) if self.causal:
if self.activation == "gelu": padding = self._causal_padding(x * x_mask)
else:
padding = self._same_padding(x * x_mask)
return padding
def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
x = self.conv_1(self.padding(x, x_mask))
if self.is_activation:
x = x * torch.sigmoid(1.702 * x) x = x * torch.sigmoid(1.702 * x)
else: else:
x = torch.relu(x) x = torch.relu(x)
x = self.drop(x) x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
x = self.conv_2(self.padding(x, x_mask))
return x * x_mask return x * x_mask
def _causal_padding(self, x): def _causal_padding(self, x):
if self.kernel_size == 1: if self.kernel_size == 1:
return x return x
pad_l = self.kernel_size - 1 pad_l: int = self.kernel_size - 1
pad_r = 0 pad_r: int = 0
padding = [[0, 0], [0, 0], [pad_l, pad_r]] # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding)) x = F.pad(
x,
# commons.convert_pad_shape(padding)
[pad_l, pad_r, 0, 0, 0, 0],
)
return x return x
def _same_padding(self, x): def _same_padding(self, x):
if self.kernel_size == 1: if self.kernel_size == 1:
return x return x
pad_l = (self.kernel_size - 1) // 2 pad_l: int = (self.kernel_size - 1) // 2
pad_r = self.kernel_size // 2 pad_r: int = self.kernel_size // 2
padding = [[0, 0], [0, 0], [pad_l, pad_r]] # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding)) x = F.pad(
x,
# commons.convert_pad_shape(padding)
[pad_l, pad_r, 0, 0, 0, 0],
)
return x return x

View File

@ -1,3 +1,4 @@
from typing import List, Optional
import math import math
import numpy as np import numpy as np
@ -16,10 +17,10 @@ def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2) return int((kernel_size * dilation - dilation) / 2)
def convert_pad_shape(pad_shape): # def convert_pad_shape(pad_shape):
l = pad_shape[::-1] # l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist] # pad_shape = [item for sublist in l for item in sublist]
return pad_shape # return pad_shape
def kl_divergence(m_p, logs_p, m_q, logs_q): def kl_divergence(m_p, logs_p, m_q, logs_q):
@ -113,10 +114,14 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
return acts return acts
def convert_pad_shape(pad_shape): # def convert_pad_shape(pad_shape):
l = pad_shape[::-1] # l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist] # pad_shape = [item for sublist in l for item in sublist]
return pad_shape # return pad_shape
def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
def shift_1d(x): def shift_1d(x):
@ -124,7 +129,7 @@ def shift_1d(x):
return x return x
def sequence_mask(length, max_length=None): def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
if max_length is None: if max_length is None:
max_length = length.max() max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device) x = torch.arange(max_length, dtype=length.dtype, device=length.device)

View File

@ -1,5 +1,6 @@
import math import math
import logging import logging
from typing import Optional
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -28,25 +29,32 @@ class TextEncoder256(nn.Module):
p_dropout, p_dropout,
f0=True, f0=True,
): ):
super().__init__() super(TextEncoder256, self).__init__()
self.out_channels = out_channels self.out_channels = out_channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.filter_channels = filter_channels self.filter_channels = filter_channels
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.emb_phone = nn.Linear(256, hidden_channels) self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True) self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True: if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder( self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
float(p_dropout),
) )
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths): def forward(
if pitch == None: self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
):
if pitch is None:
x = self.emb_phone(phone) x = self.emb_phone(phone)
else: else:
x = self.emb_phone(phone) + self.emb_pitch(pitch) x = self.emb_phone(phone) + self.emb_pitch(pitch)
@ -75,25 +83,30 @@ class TextEncoder768(nn.Module):
p_dropout, p_dropout,
f0=True, f0=True,
): ):
super().__init__() super(TextEncoder768, self).__init__()
self.out_channels = out_channels self.out_channels = out_channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.filter_channels = filter_channels self.filter_channels = filter_channels
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.emb_phone = nn.Linear(768, hidden_channels) self.emb_phone = nn.Linear(768, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True) self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True: if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder( self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
float(p_dropout),
) )
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths): def forward(self, phone: torch.Tensor, pitch: torch.Tensor, lengths: torch.Tensor):
if pitch == None: if pitch is None:
x = self.emb_phone(phone) x = self.emb_phone(phone)
else: else:
x = self.emb_phone(phone) + self.emb_pitch(pitch) x = self.emb_phone(phone) + self.emb_pitch(pitch)
@ -121,7 +134,7 @@ class ResidualCouplingBlock(nn.Module):
n_flows=4, n_flows=4,
gin_channels=0, gin_channels=0,
): ):
super().__init__() super(ResidualCouplingBlock, self).__init__()
self.channels = channels self.channels = channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.kernel_size = kernel_size self.kernel_size = kernel_size
@ -145,19 +158,36 @@ class ResidualCouplingBlock(nn.Module):
) )
self.flows.append(modules.Flip()) self.flows.append(modules.Flip())
def forward(self, x, x_mask, g=None, reverse=False): def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
):
if not reverse: if not reverse:
for flow in self.flows: for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse) x, _ = flow(x, x_mask, g=g, reverse=reverse)
else: else:
for flow in reversed(self.flows): for flow in self.flows[::-1]:
x = flow(x, x_mask, g=g, reverse=reverse) x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
return x return x
def remove_weight_norm(self): def remove_weight_norm(self):
for i in range(self.n_flows): for i in range(self.n_flows):
self.flows[i * 2].remove_weight_norm() self.flows[i * 2].remove_weight_norm()
def __prepare_scriptable__(self):
for i in range(self.n_flows):
for hook in self.flows[i * 2]._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.flows[i * 2])
return self
class PosteriorEncoder(nn.Module): class PosteriorEncoder(nn.Module):
def __init__( def __init__(
@ -170,7 +200,7 @@ class PosteriorEncoder(nn.Module):
n_layers, n_layers,
gin_channels=0, gin_channels=0,
): ):
super().__init__() super(PosteriorEncoder, self).__init__()
self.in_channels = in_channels self.in_channels = in_channels
self.out_channels = out_channels self.out_channels = out_channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
@ -189,7 +219,9 @@ class PosteriorEncoder(nn.Module):
) )
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None): def forward(
self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype x.dtype
) )
@ -203,6 +235,15 @@ class PosteriorEncoder(nn.Module):
def remove_weight_norm(self): def remove_weight_norm(self):
self.enc.remove_weight_norm() self.enc.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.enc._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc)
return self
class Generator(torch.nn.Module): class Generator(torch.nn.Module):
def __init__( def __init__(
@ -252,7 +293,7 @@ class Generator(torch.nn.Module):
if gin_channels != 0: if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x, g=None): def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
x = self.conv_pre(x) x = self.conv_pre(x)
if g is not None: if g is not None:
x = x + self.cond(g) x = x + self.cond(g)
@ -273,6 +314,28 @@ class Generator(torch.nn.Module):
return x return x
def __prepare_scriptable__(self):
for l in self.ups:
for hook in l._forward_pre_hooks.values():
# The hook we want to remove is an instance of WeightNorm class, so
# normally we would do `if isinstance(...)` but this class is not accessible
# because of shadowing, so we check the module name directly.
# https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
for l in self.resblocks:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
def remove_weight_norm(self): def remove_weight_norm(self):
for l in self.ups: for l in self.ups:
remove_weight_norm(l) remove_weight_norm(l)
@ -293,7 +356,7 @@ class SineGen(torch.nn.Module):
voiced_thoreshold: F0 threshold for U/V classification (default 0) voiced_thoreshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SinGen is used inside PulseGen (default False) flag_for_pulse: this SinGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0) segment is always sin(torch.pi) or cos(0)
""" """
def __init__( def __init__(
@ -321,7 +384,7 @@ class SineGen(torch.nn.Module):
uv = uv.float() uv = uv.float()
return uv return uv
def forward(self, f0, upp): def forward(self, f0: torch.Tensor, upp: int):
"""sine_tensor, uv = forward(f0) """sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1) input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0 f0 for unvoiced steps should be 0
@ -333,7 +396,7 @@ class SineGen(torch.nn.Module):
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
# fundamental component # fundamental component
f0_buf[:, :, 0] = f0[:, :, 0] f0_buf[:, :, 0] = f0[:, :, 0]
for idx in np.arange(self.harmonic_num): for idx in range(self.harmonic_num):
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
idx + 2 idx + 2
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
@ -347,12 +410,12 @@ class SineGen(torch.nn.Module):
tmp_over_one *= upp tmp_over_one *= upp
tmp_over_one = F.interpolate( tmp_over_one = F.interpolate(
tmp_over_one.transpose(2, 1), tmp_over_one.transpose(2, 1),
scale_factor=upp, scale_factor=float(upp),
mode="linear", mode="linear",
align_corners=True, align_corners=True,
).transpose(2, 1) ).transpose(2, 1)
rad_values = F.interpolate( rad_values = F.interpolate(
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
).transpose( ).transpose(
2, 1 2, 1
) ####### ) #######
@ -361,12 +424,12 @@ class SineGen(torch.nn.Module):
cumsum_shift = torch.zeros_like(rad_values) cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin( sine_waves = torch.sin(
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
) )
sine_waves = sine_waves * self.sine_amp sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0) uv = self._f02uv(f0)
uv = F.interpolate( uv = F.interpolate(
uv.transpose(2, 1), scale_factor=upp, mode="nearest" uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
).transpose(2, 1) ).transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves) noise = noise_amp * torch.randn_like(sine_waves)
@ -414,18 +477,19 @@ class SourceModuleHnNSF(torch.nn.Module):
# to merge source harmonics into a single excitation # to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh() self.l_tanh = torch.nn.Tanh()
# self.ddtype:int = -1
def forward(self, x, upp=None): def forward(self, x: torch.Tensor, upp: int = 1):
if hasattr(self, "ddtype") == False: # if self.ddtype ==-1:
self.ddtype = self.l_linear.weight.dtype # self.ddtype = self.l_linear.weight.dtype
sine_wavs, uv, _ = self.l_sin_gen(x, upp) sine_wavs, uv, _ = self.l_sin_gen(x, upp)
# print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype)
# if self.is_half: # if self.is_half:
# sine_wavs = sine_wavs.half() # sine_wavs = sine_wavs.half()
# sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x)))
# print(sine_wavs.dtype,self.ddtype) # print(sine_wavs.dtype,self.ddtype)
if sine_wavs.dtype != self.ddtype: # if sine_wavs.dtype != self.l_linear.weight.dtype:
sine_wavs = sine_wavs.to(self.ddtype) sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
sine_merge = self.l_tanh(self.l_linear(sine_wavs)) sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge, None, None # noise, uv return sine_merge, None, None # noise, uv
@ -448,7 +512,7 @@ class GeneratorNSF(torch.nn.Module):
self.num_kernels = len(resblock_kernel_sizes) self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates) self.num_upsamples = len(upsample_rates)
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
self.m_source = SourceModuleHnNSF( self.m_source = SourceModuleHnNSF(
sampling_rate=sr, harmonic_num=0, is_half=is_half sampling_rate=sr, harmonic_num=0, is_half=is_half
) )
@ -473,7 +537,7 @@ class GeneratorNSF(torch.nn.Module):
) )
) )
if i + 1 < len(upsample_rates): if i + 1 < len(upsample_rates):
stride_f0 = np.prod(upsample_rates[i + 1 :]) stride_f0 = math.prod(upsample_rates[i + 1 :])
self.noise_convs.append( self.noise_convs.append(
Conv1d( Conv1d(
1, 1,
@ -500,26 +564,35 @@ class GeneratorNSF(torch.nn.Module):
if gin_channels != 0: if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
self.upp = np.prod(upsample_rates) self.upp = math.prod(upsample_rates)
def forward(self, x, f0, g=None): self.lrelu_slope = modules.LRELU_SLOPE
def forward(self, x, f0, g: Optional[torch.Tensor] = None):
har_source, noi_source, uv = self.m_source(f0, self.upp) har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2) har_source = har_source.transpose(1, 2)
x = self.conv_pre(x) x = self.conv_pre(x)
if g is not None: if g is not None:
x = x + self.cond(g) x = x + self.cond(g)
# torch.jit.script() does not support direct indexing of torch modules
for i in range(self.num_upsamples): # That's why I wrote this
x = F.leaky_relu(x, modules.LRELU_SLOPE) for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
x = self.ups[i](x) if i < self.num_upsamples:
x_source = self.noise_convs[i](har_source) x = F.leaky_relu(x, self.lrelu_slope)
x = ups(x)
x_source = noise_convs(har_source)
x = x + x_source x = x + x_source
xs = None xs: Optional[torch.Tensor] = None
for j in range(self.num_kernels): l = [i * self.num_kernels + j for j in range(self.num_kernels)]
for j, resblock in enumerate(self.resblocks):
if j in l:
if xs is None: if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x) xs = resblock(x)
else: else:
xs += self.resblocks[i * self.num_kernels + j](x) xs += resblock(x)
# This assertion cannot be ignored! \
# If ignored, it will cause torch.jit.script() compilation errors
assert isinstance(xs, torch.Tensor)
x = xs / self.num_kernels x = xs / self.num_kernels
x = F.leaky_relu(x) x = F.leaky_relu(x)
x = self.conv_post(x) x = self.conv_post(x)
@ -532,6 +605,27 @@ class GeneratorNSF(torch.nn.Module):
for l in self.resblocks: for l in self.resblocks:
l.remove_weight_norm() l.remove_weight_norm()
def __prepare_scriptable__(self):
for l in self.ups:
for hook in l._forward_pre_hooks.values():
# The hook we want to remove is an instance of WeightNorm class, so
# normally we would do `if isinstance(...)` but this class is not accessible
# because of shadowing, so we check the module name directly.
# https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
for l in self.resblocks:
for hook in self.resblocks._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
sr2sr = { sr2sr = {
"32k": 32000, "32k": 32000,
@ -563,8 +657,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sr, sr,
**kwargs **kwargs
): ):
super().__init__() super(SynthesizerTrnMs256NSFsid, self).__init__()
if type(sr) == type("strr"): if isinstance(sr, str):
sr = sr2sr[sr] sr = sr2sr[sr]
self.spec_channels = spec_channels self.spec_channels = spec_channels
self.inter_channels = inter_channels self.inter_channels = inter_channels
@ -573,7 +667,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.resblock = resblock self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes self.resblock_dilation_sizes = resblock_dilation_sizes
@ -591,7 +685,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
n_heads, n_heads,
n_layers, n_layers,
kernel_size, kernel_size,
p_dropout, float(p_dropout),
) )
self.dec = GeneratorNSF( self.dec = GeneratorNSF(
inter_channels, inter_channels,
@ -630,8 +724,42 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values():
# The hook we want to remove is an instance of WeightNorm class, so
# normally we would do `if isinstance(...)` but this class is not accessible
# because of shadowing, so we check the module name directly.
# https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.dec)
for hook in self.flow._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.flow)
if hasattr(self, "enc_q"):
for hook in self.enc_q._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc_q)
return self
@torch.jit.ignore
def forward( def forward(
self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds self,
phone: torch.Tensor,
phone_lengths: torch.Tensor,
pitch: torch.Tensor,
pitchf: torch.Tensor,
y: torch.Tensor,
y_lengths: torch.Tensor,
ds: Optional[torch.Tensor] = None,
): # 这里ds是id[bs,1] ): # 这里ds是id[bs,1]
# print(1,pitch.shape)#[bs,t] # print(1,pitch.shape)#[bs,t]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的 g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
@ -647,15 +775,25 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g) o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): @torch.jit.export
def infer(
self,
phone: torch.Tensor,
phone_lengths: torch.Tensor,
pitch: torch.Tensor,
nsff0: torch.Tensor,
sid: torch.Tensor,
rate: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate: if rate is not None:
head = int(z_p.shape[2] * rate) assert isinstance(rate, torch.Tensor)
z_p = z_p[:, :, -head:] head = int(z_p.shape[2] * (1 - rate.item()))
x_mask = x_mask[:, :, -head:] z_p = z_p[:, :, head:]
nsff0 = nsff0[:, -head:] x_mask = x_mask[:, :, head:]
nsff0 = nsff0[:, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g) o = self.dec(z * x_mask, nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -684,8 +822,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
sr, sr,
**kwargs **kwargs
): ):
super().__init__() super(SynthesizerTrnMs768NSFsid, self).__init__()
if type(sr) == type("strr"): if isinstance(sr, str):
sr = sr2sr[sr] sr = sr2sr[sr]
self.spec_channels = spec_channels self.spec_channels = spec_channels
self.inter_channels = inter_channels self.inter_channels = inter_channels
@ -694,7 +832,7 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.resblock = resblock self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes self.resblock_dilation_sizes = resblock_dilation_sizes
@ -712,7 +850,7 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
n_heads, n_heads,
n_layers, n_layers,
kernel_size, kernel_size,
p_dropout, float(p_dropout),
) )
self.dec = GeneratorNSF( self.dec = GeneratorNSF(
inter_channels, inter_channels,
@ -751,6 +889,33 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values():
# The hook we want to remove is an instance of WeightNorm class, so
# normally we would do `if isinstance(...)` but this class is not accessible
# because of shadowing, so we check the module name directly.
# https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.dec)
for hook in self.flow._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.flow)
if hasattr(self, "enc_q"):
for hook in self.enc_q._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc_q)
return self
@torch.jit.ignore
def forward( def forward(
self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
): # 这里ds是id[bs,1] ): # 这里ds是id[bs,1]
@ -768,15 +933,24 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g) o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): @torch.jit.export
def infer(
self,
phone: torch.Tensor,
phone_lengths: torch.Tensor,
pitch: torch.Tensor,
nsff0: torch.Tensor,
sid: torch.Tensor,
rate: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate: if rate is not None:
head = int(z_p.shape[2] * rate) head = int(z_p.shape[2] * (1.0 - rate.item()))
z_p = z_p[:, :, -head:] z_p = z_p[:, :, head:]
x_mask = x_mask[:, :, -head:] x_mask = x_mask[:, :, head:]
nsff0 = nsff0[:, -head:] nsff0 = nsff0[:, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g) o = self.dec(z * x_mask, nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -805,7 +979,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
sr=None, sr=None,
**kwargs **kwargs
): ):
super().__init__() super(SynthesizerTrnMs256NSFsid_nono, self).__init__()
self.spec_channels = spec_channels self.spec_channels = spec_channels
self.inter_channels = inter_channels self.inter_channels = inter_channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
@ -813,7 +987,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.resblock = resblock self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes self.resblock_dilation_sizes = resblock_dilation_sizes
@ -831,7 +1005,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
n_heads, n_heads,
n_layers, n_layers,
kernel_size, kernel_size,
p_dropout, float(p_dropout),
f0=False, f0=False,
) )
self.dec = Generator( self.dec = Generator(
@ -869,6 +1043,33 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values():
# The hook we want to remove is an instance of WeightNorm class, so
# normally we would do `if isinstance(...)` but this class is not accessible
# because of shadowing, so we check the module name directly.
# https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.dec)
for hook in self.flow._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.flow)
if hasattr(self, "enc_q"):
for hook in self.enc_q._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc_q)
return self
@torch.jit.ignore
def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id[bs,1] def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id[bs,1]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的 g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
@ -880,14 +1081,22 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g) o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, rate=None): @torch.jit.export
def infer(
self,
phone: torch.Tensor,
phone_lengths: torch.Tensor,
sid: torch.Tensor,
rate: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate: if rate is not None:
head = int(z_p.shape[2] * rate) head = int(z_p.shape[2] * (1.0 - rate.item()))
z_p = z_p[:, :, -head:] z_p = z_p[:, :, head:]
x_mask = x_mask[:, :, -head:] x_mask = x_mask[:, :, head:]
nsff0 = nsff0[:, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g) o = self.dec(z * x_mask, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)
@ -916,7 +1125,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
sr=None, sr=None,
**kwargs **kwargs
): ):
super().__init__() super(self, SynthesizerTrnMs768NSFsid_nono).__init__()
self.spec_channels = spec_channels self.spec_channels = spec_channels
self.inter_channels = inter_channels self.inter_channels = inter_channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
@ -924,7 +1133,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.resblock = resblock self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes self.resblock_dilation_sizes = resblock_dilation_sizes
@ -942,7 +1151,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
n_heads, n_heads,
n_layers, n_layers,
kernel_size, kernel_size,
p_dropout, float(p_dropout),
f0=False, f0=False,
) )
self.dec = Generator( self.dec = Generator(
@ -980,6 +1189,33 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
self.flow.remove_weight_norm() self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm() self.enc_q.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.dec._forward_pre_hooks.values():
# The hook we want to remove is an instance of WeightNorm class, so
# normally we would do `if isinstance(...)` but this class is not accessible
# because of shadowing, so we check the module name directly.
# https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.dec)
for hook in self.flow._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.flow)
if hasattr(self, "enc_q"):
for hook in self.enc_q._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc_q)
return self
@torch.jit.ignore
def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id[bs,1] def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id[bs,1]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的 g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
@ -991,14 +1227,22 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g) o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, rate=None): @torch.jit.export
def infer(
self,
phone: torch.Tensor,
phone_lengths: torch.Tensor,
sid: torch.Tensor,
rate: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
if rate: if rate is not None:
head = int(z_p.shape[2] * rate) head = int(z_p.shape[2] * (1.0 - rate.item()))
z_p = z_p[:, :, -head:] z_p = z_p[:, :, head:]
x_mask = x_mask[:, :, -head:] x_mask = x_mask[:, :, head:]
nsff0 = nsff0[:, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True) z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g) o = self.dec(z * x_mask, g=g)
return o, x_mask, (z, z_p, m_p, logs_p) return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -551,7 +551,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
gin_channels, gin_channels,
sr, sr,
version, version,
**kwargs **kwargs,
): ):
super().__init__() super().__init__()
if type(sr) == type("strr"): if type(sr) == type("strr"):
@ -621,10 +621,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
self.speaker_map = None self.speaker_map = None
logger.debug( logger.debug(
"gin_channels: " f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}"
+ gin_channels
+ ", self.spk_embed_dim: "
+ self.spk_embed_dim
) )
def remove_weight_norm(self): def remove_weight_norm(self):

View File

@ -1,5 +1,6 @@
import copy import copy
import math import math
from typing import Optional, Tuple
import numpy as np import numpy as np
import scipy import scipy
@ -18,7 +19,7 @@ LRELU_SLOPE = 0.1
class LayerNorm(nn.Module): class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5): def __init__(self, channels, eps=1e-5):
super().__init__() super(LayerNorm, self).__init__()
self.channels = channels self.channels = channels
self.eps = eps self.eps = eps
@ -41,13 +42,13 @@ class ConvReluNorm(nn.Module):
n_layers, n_layers,
p_dropout, p_dropout,
): ):
super().__init__() super(ConvReluNorm, self).__init__()
self.in_channels = in_channels self.in_channels = in_channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.out_channels = out_channels self.out_channels = out_channels
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.n_layers = n_layers self.n_layers = n_layers
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
assert n_layers > 1, "Number of layers should be larger than 0." assert n_layers > 1, "Number of layers should be larger than 0."
self.conv_layers = nn.ModuleList() self.conv_layers = nn.ModuleList()
@ -58,7 +59,7 @@ class ConvReluNorm(nn.Module):
) )
) )
self.norm_layers.append(LayerNorm(hidden_channels)) self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout)))
for _ in range(n_layers - 1): for _ in range(n_layers - 1):
self.conv_layers.append( self.conv_layers.append(
nn.Conv1d( nn.Conv1d(
@ -89,13 +90,13 @@ class DDSConv(nn.Module):
""" """
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
super().__init__() super(DDSConv, self).__init__()
self.channels = channels self.channels = channels
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.n_layers = n_layers self.n_layers = n_layers
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.drop = nn.Dropout(p_dropout) self.drop = nn.Dropout(float(p_dropout))
self.convs_sep = nn.ModuleList() self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList() self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList() self.norms_1 = nn.ModuleList()
@ -117,7 +118,7 @@ class DDSConv(nn.Module):
self.norms_1.append(LayerNorm(channels)) self.norms_1.append(LayerNorm(channels))
self.norms_2.append(LayerNorm(channels)) self.norms_2.append(LayerNorm(channels))
def forward(self, x, x_mask, g=None): def forward(self, x, x_mask, g: Optional[torch.Tensor] = None):
if g is not None: if g is not None:
x = x + g x = x + g
for i in range(self.n_layers): for i in range(self.n_layers):
@ -149,11 +150,11 @@ class WN(torch.nn.Module):
self.dilation_rate = dilation_rate self.dilation_rate = dilation_rate
self.n_layers = n_layers self.n_layers = n_layers
self.gin_channels = gin_channels self.gin_channels = gin_channels
self.p_dropout = p_dropout self.p_dropout = float(p_dropout)
self.in_layers = torch.nn.ModuleList() self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList() self.res_skip_layers = torch.nn.ModuleList()
self.drop = nn.Dropout(p_dropout) self.drop = nn.Dropout(float(p_dropout))
if gin_channels != 0: if gin_channels != 0:
cond_layer = torch.nn.Conv1d( cond_layer = torch.nn.Conv1d(
@ -184,15 +185,19 @@ class WN(torch.nn.Module):
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer) self.res_skip_layers.append(res_skip_layer)
def forward(self, x, x_mask, g=None, **kwargs): def forward(
self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None
):
output = torch.zeros_like(x) output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels]) n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None: if g is not None:
g = self.cond_layer(g) g = self.cond_layer(g)
for i in range(self.n_layers): for i, (in_layer, res_skip_layer) in enumerate(
x_in = self.in_layers[i](x) zip(self.in_layers, self.res_skip_layers)
):
x_in = in_layer(x)
if g is not None: if g is not None:
cond_offset = i * 2 * self.hidden_channels cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
@ -202,7 +207,7 @@ class WN(torch.nn.Module):
acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
acts = self.drop(acts) acts = self.drop(acts)
res_skip_acts = self.res_skip_layers[i](acts) res_skip_acts = res_skip_layer(acts)
if i < self.n_layers - 1: if i < self.n_layers - 1:
res_acts = res_skip_acts[:, : self.hidden_channels, :] res_acts = res_skip_acts[:, : self.hidden_channels, :]
x = (x + res_acts) * x_mask x = (x + res_acts) * x_mask
@ -219,6 +224,30 @@ class WN(torch.nn.Module):
for l in self.res_skip_layers: for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l) torch.nn.utils.remove_weight_norm(l)
def __prepare_scriptable__(self):
if self.gin_channels != 0:
for hook in self.cond_layer._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
class ResBlock1(torch.nn.Module): class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
@ -294,14 +323,15 @@ class ResBlock1(torch.nn.Module):
] ]
) )
self.convs2.apply(init_weights) self.convs2.apply(init_weights)
self.lrelu_slope = LRELU_SLOPE
def forward(self, x, x_mask=None): def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None):
for c1, c2 in zip(self.convs1, self.convs2): for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE) xt = F.leaky_relu(x, self.lrelu_slope)
if x_mask is not None: if x_mask is not None:
xt = xt * x_mask xt = xt * x_mask
xt = c1(xt) xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE) xt = F.leaky_relu(xt, self.lrelu_slope)
if x_mask is not None: if x_mask is not None:
xt = xt * x_mask xt = xt * x_mask
xt = c2(xt) xt = c2(xt)
@ -316,6 +346,23 @@ class ResBlock1(torch.nn.Module):
for l in self.convs2: for l in self.convs2:
remove_weight_norm(l) remove_weight_norm(l)
def __prepare_scriptable__(self):
for l in self.convs1:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
for l in self.convs2:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
class ResBlock2(torch.nn.Module): class ResBlock2(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3)): def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
@ -345,10 +392,11 @@ class ResBlock2(torch.nn.Module):
] ]
) )
self.convs.apply(init_weights) self.convs.apply(init_weights)
self.lrelu_slope = LRELU_SLOPE
def forward(self, x, x_mask=None): def forward(self, x, x_mask: Optional[torch.Tensor] = None):
for c in self.convs: for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE) xt = F.leaky_relu(x, self.lrelu_slope)
if x_mask is not None: if x_mask is not None:
xt = xt * x_mask xt = xt * x_mask
xt = c(xt) xt = c(xt)
@ -361,9 +409,25 @@ class ResBlock2(torch.nn.Module):
for l in self.convs: for l in self.convs:
remove_weight_norm(l) remove_weight_norm(l)
def __prepare_scriptable__(self):
for l in self.convs:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
class Log(nn.Module): class Log(nn.Module):
def forward(self, x, x_mask, reverse=False, **kwargs): def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
if not reverse: if not reverse:
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2]) logdet = torch.sum(-y, [1, 2])
@ -374,18 +438,27 @@ class Log(nn.Module):
class Flip(nn.Module): class Flip(nn.Module):
def forward(self, x, *args, reverse=False, **kwargs): # torch.jit.script() Compiled functions \
# can't take variable number of arguments or \
# use keyword-only arguments with defaults
def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
x = torch.flip(x, [1]) x = torch.flip(x, [1])
if not reverse: if not reverse:
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
return x, logdet return x, logdet
else: else:
return x return x, torch.zeros([1], device=x.device)
class ElementwiseAffine(nn.Module): class ElementwiseAffine(nn.Module):
def __init__(self, channels): def __init__(self, channels):
super().__init__() super(ElementwiseAffine, self).__init__()
self.channels = channels self.channels = channels
self.m = nn.Parameter(torch.zeros(channels, 1)) self.m = nn.Parameter(torch.zeros(channels, 1))
self.logs = nn.Parameter(torch.zeros(channels, 1)) self.logs = nn.Parameter(torch.zeros(channels, 1))
@ -414,7 +487,7 @@ class ResidualCouplingLayer(nn.Module):
mean_only=False, mean_only=False,
): ):
assert channels % 2 == 0, "channels should be divisible by 2" assert channels % 2 == 0, "channels should be divisible by 2"
super().__init__() super(ResidualCouplingLayer, self).__init__()
self.channels = channels self.channels = channels
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
self.kernel_size = kernel_size self.kernel_size = kernel_size
@ -429,14 +502,20 @@ class ResidualCouplingLayer(nn.Module):
kernel_size, kernel_size,
dilation_rate, dilation_rate,
n_layers, n_layers,
p_dropout=p_dropout, p_dropout=float(p_dropout),
gin_channels=gin_channels, gin_channels=gin_channels,
) )
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_() self.post.weight.data.zero_()
self.post.bias.data.zero_() self.post.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False): def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1) x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) * x_mask h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g) h = self.enc(h, x_mask, g=g)
@ -455,11 +534,20 @@ class ResidualCouplingLayer(nn.Module):
else: else:
x1 = (x1 - m) * torch.exp(-logs) * x_mask x1 = (x1 - m) * torch.exp(-logs) * x_mask
x = torch.cat([x0, x1], 1) x = torch.cat([x0, x1], 1)
return x return x, torch.zeros([1])
def remove_weight_norm(self): def remove_weight_norm(self):
self.enc.remove_weight_norm() self.enc.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.enc._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc)
return self
class ConvFlow(nn.Module): class ConvFlow(nn.Module):
def __init__( def __init__(
@ -471,7 +559,7 @@ class ConvFlow(nn.Module):
num_bins=10, num_bins=10,
tail_bound=5.0, tail_bound=5.0,
): ):
super().__init__() super(ConvFlow, self).__init__()
self.in_channels = in_channels self.in_channels = in_channels
self.filter_channels = filter_channels self.filter_channels = filter_channels
self.kernel_size = kernel_size self.kernel_size = kernel_size
@ -488,7 +576,13 @@ class ConvFlow(nn.Module):
self.proj.weight.data.zero_() self.proj.weight.data.zero_()
self.proj.bias.data.zero_() self.proj.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False): def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse=False,
):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1) x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) h = self.pre(x0)
h = self.convs(h, x_mask, g=g) h = self.convs(h, x_mask, g=g)

163
infer/lib/jit/__init__.py Normal file
View File

@ -0,0 +1,163 @@
from io import BytesIO
import pickle
import time
import torch
from tqdm import tqdm
from collections import OrderedDict
def load_inputs(path, device, is_half=False):
    """Load a dict of example tensors from *path* and move them to *device*.

    Tensors are converted to fp16 when ``is_half`` is set; otherwise any
    fp16 tensors are promoted back to fp32 so the dtype matches the model.
    """
    tensors = torch.load(path, map_location=torch.device("cpu"))
    for name in tensors.keys():
        moved = tensors[name].to(device)
        if is_half and moved.dtype == torch.float32:
            moved = moved.half()
        elif not is_half and moved.dtype == torch.float16:
            moved = moved.float()
        tensors[name] = moved
    return tensors
def benchmark(
    model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False
):
    """Time *model* over *epoch* forward calls using saved example inputs.

    Prints the average per-call latency in milliseconds; results are
    discarded, only wall-clock time is accumulated.
    """
    example = load_inputs(inputs_path, device, is_half)
    total_ts = 0.0
    for _ in tqdm(range(epoch)):
        tic = time.perf_counter()
        model(**example)
        total_ts += time.perf_counter() - tic
    print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}")
def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False):
    """Run a handful of timed forward passes to warm up a freshly-loaded JIT model."""
    benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half)
def to_jit_model(
    model_path,
    model_type: str,
    mode: str = "trace",
    inputs_path: str = None,
    device=torch.device("cpu"),
    is_half=False,
):
    """Load a pretrained model and compile it with TorchScript.

    Arguments:
        model_path:  checkpoint file for the requested model.
        model_type:  "synthesizer", "rmvpe" or "hubert" (case-insensitive).
        mode:        "trace" (requires ``inputs_path``) or "script".
        inputs_path: path to saved example inputs, used only for tracing.
        device:      target device for the compiled module.
        is_half:     compile/convert in fp16 instead of fp32.

    Returns:
        ``(model, model_jit)`` — the eager model and its TorchScript build.

    Raises:
        ValueError: on an unknown ``model_type`` or ``mode``.
    """
    if model_type.lower() == "synthesizer":
        from .get_synthesizer import get_synthesizer

        model, _ = get_synthesizer(model_path, device)
        # infer() is the deployment entry point; trace/script that instead.
        model.forward = model.infer
    elif model_type.lower() == "rmvpe":
        from .get_rmvpe import get_rmvpe

        model = get_rmvpe(model_path, device)
    elif model_type.lower() == "hubert":
        from .get_hubert import get_hubert_model

        model = get_hubert_model(model_path, device)
        model.forward = model.infer
    else:
        raise ValueError(f"No model type named {model_type}")
    model = model.eval()
    model = model.half() if is_half else model.float()
    if mode == "trace":
        # BUG FIX: the assert was inverted (`assert not inputs_path`), which
        # rejected exactly the calls that did supply example inputs and let
        # load_inputs(None, ...) crash otherwise.
        assert inputs_path is not None
        inputs = load_inputs(inputs_path, device, is_half)
        model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
    elif mode == "script":
        model_jit = torch.jit.script(model)
    else:
        # Previously an unknown mode crashed later with NameError.
        raise ValueError(f"Unknown export mode: {mode!r}")
    model_jit.to(device)
    model_jit = model_jit.half() if is_half else model_jit.float()
    return (model, model_jit)
def export(
    model: torch.nn.Module,
    mode: str = "trace",
    inputs: dict = None,
    device=torch.device("cpu"),
    is_half: bool = False,
) -> dict:
    """Compile *model* with TorchScript and pack it into a checkpoint dict.

    Arguments:
        model:   eager-mode module to compile (converted in place to fp16/fp32).
        mode:    "trace" (requires example ``inputs``) or "script".
        inputs:  example keyword inputs for ``torch.jit.trace``.
        device:  device the scripted module is moved to before serialization.
        is_half: serialize in fp16 instead of fp32.

    Returns:
        OrderedDict with the serialized TorchScript bytes under ``"model"``
        and the precision flag under ``"is_half"``.

    Raises:
        ValueError: if *mode* is neither "trace" nor "script".
    """
    model = model.half() if is_half else model.float()
    model.eval()
    if mode == "trace":
        assert inputs is not None
        model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
    elif mode == "script":
        model_jit = torch.jit.script(model)
    else:
        # Previously an unknown mode fell through to a NameError below.
        raise ValueError(f"Unknown export mode: {mode!r}")
    model_jit.to(device)
    model_jit = model_jit.half() if is_half else model_jit.float()
    # Serialize to an in-memory buffer so the caller decides how to persist it.
    buffer = BytesIO()
    torch.jit.save(model_jit, buffer)
    del model_jit
    cpt = OrderedDict()
    cpt["model"] = buffer.getvalue()
    cpt["is_half"] = is_half
    return cpt
def load(path: str):
    """Deserialize a checkpoint dict previously written by :func:`save`.

    NOTE(review): pickle executes arbitrary code on load — only open
    checkpoint files you created yourself.
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)
def save(ckpt: dict, save_path: str):
    """Pickle the checkpoint dict *ckpt* to *save_path* (see :func:`load`)."""
    with open(save_path, "wb") as fp:
        pickle.dump(ckpt, fp)
def rmvpe_jit_export(
    model_path: str,
    mode: str = "script",
    inputs_path: str = None,
    save_path: str = None,
    device=torch.device("cpu"),
    is_half=False,
):
    """Export the RMVPE pitch model to TorchScript and persist the checkpoint.

    The saved dict also records the export ``device`` (as a string) so the
    loader can detect device mismatches and re-export.
    """
    if not save_path:
        # BUG FIX: str.rstrip(".pth") strips a *character set*, not the
        # suffix — a name like "model_t.pth" was mangled to "model_".
        base = model_path
        for ext in (".pth", ".pt"):
            if base.endswith(ext):
                base = base[: -len(ext)]
                break
        save_path = base + (".half.jit" if is_half else ".jit")
    # Pin a bare "cuda" device to cuda:0 for a stable identity string.
    if "cuda" in str(device) and ":" not in str(device):
        device = torch.device("cuda:0")
    from .get_rmvpe import get_rmvpe

    model = get_rmvpe(model_path, device)
    inputs = None
    if mode == "trace":
        inputs = load_inputs(inputs_path, device, is_half)
    ckpt = export(model, mode, inputs, device, is_half)
    ckpt["device"] = str(device)
    save(ckpt, save_path)
    return ckpt
def synthesizer_jit_export(
    model_path: str,
    mode: str = "script",
    inputs_path: str = None,
    save_path: str = None,
    device=torch.device("cpu"),
    is_half=False,
):
    """Export an RVC synthesizer to TorchScript and persist the checkpoint.

    The original checkpoint dict is reused: its raw weights are dropped and
    replaced by the serialized TorchScript bytes.
    """
    if not save_path:
        # BUG FIX: str.rstrip(".pth") strips a *character set*, not the
        # suffix — a name like "model_t.pth" was mangled to "model_".
        base = model_path
        for ext in (".pth", ".pt"):
            if base.endswith(ext):
                base = base[: -len(ext)]
                break
        save_path = base + (".half.jit" if is_half else ".jit")
    # Pin a bare "cuda" device to cuda:0 for a stable identity string.
    if "cuda" in str(device) and ":" not in str(device):
        device = torch.device("cuda:0")
    from .get_synthesizer import get_synthesizer

    model, cpt = get_synthesizer(model_path, device)
    assert isinstance(cpt, dict)
    # infer() is the deployment entry point; script that instead of forward.
    model.forward = model.infer
    inputs = None
    if mode == "trace":
        inputs = load_inputs(inputs_path, device, is_half)
    ckpt = export(model, mode, inputs, device, is_half)
    cpt.pop("weight")
    cpt["model"] = ckpt["model"]
    # NOTE(review): stored as a torch.device here but as str(device) in
    # rmvpe_jit_export — confirm which form the loader expects.
    cpt["device"] = device
    save(cpt, save_path)
    return cpt

342
infer/lib/jit/get_hubert.py Normal file
View File

@ -0,0 +1,342 @@
import math
import random
from typing import Optional, Tuple
from fairseq.checkpoint_utils import load_model_ensemble_and_task
import numpy as np
import torch
import torch.nn.functional as F
# from fairseq.data.data_utils import compute_mask_indices
from fairseq.utils import index_put
# @torch.jit.script
def pad_to_multiple(x, multiple, dim=-1, value=0):
    """Right-pad *x* along *dim* so its size becomes a multiple of *multiple*.

    Returns the (possibly padded) tensor and the number of padded elements.
    A ``None`` input passes through as ``(None, 0)``.
    """
    # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
    if x is None:
        return None, 0
    size = x.size(dim)
    if int(size % multiple) == 0:
        return x, 0
    remainder = math.ceil(size / multiple) * multiple - size
    # F.pad's pad tuple runs from the last dim backwards; the leading
    # zero pairs skip over the dims that follow `dim`.
    leading = (0,) * ((-1 - dim) * 2)
    return F.pad(x, (*leading, 0, remainder), value=value), remainder
def extract_features(
    self,
    x,
    padding_mask=None,
    tgt_layer=None,
    min_layer=0,
):
    """Run the transformer encoder layers of *self* over *x*.

    Free-function replacement for fairseq's encoder ``extract_features``;
    ``get_hubert_model`` binds it to the loaded model's encoder, passing the
    encoder module in as *self*.

    Args:
        self: encoder module providing ``pos_conv``, ``layers``,
            ``layer_norm``, ``dropout``, ``layerdrop`` etc.
        x: input features — assumes (batch, time, channels); TODO confirm.
        padding_mask: optional bool tensor marking padded positions.
        tgt_layer: if set, stop after this layer index and return its output.
        min_layer: first layer index whose output is recorded.

    Returns:
        ``(x, layer_results)`` where ``layer_results`` holds one
        ``(x, z, lr)`` entry per recorded layer.
    """
    # Zero out padded positions before adding positional information.
    if padding_mask is not None:
        x = index_put(x, padding_mask, 0)
    # Convolutional positional embedding operates on (B, C, T).
    x_conv = self.pos_conv(x.transpose(1, 2))
    x_conv = x_conv.transpose(1, 2)
    x = x + x_conv
    if not self.layer_norm_first:
        x = self.layer_norm(x)
    # pad to the sequence length dimension
    x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0)
    if pad_length > 0 and padding_mask is None:
        # Synthesize a mask that covers exactly the padded tail frames.
        padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
        padding_mask[:, -pad_length:] = True
    else:
        padding_mask, _ = pad_to_multiple(
            padding_mask, self.required_seq_len_multiple, dim=-1, value=True
        )
    x = F.dropout(x, p=self.dropout, training=self.training)
    # B x T x C -> T x B x C
    x = x.transpose(0, 1)
    layer_results = []
    r = None
    for i, layer in enumerate(self.layers):
        # LayerDrop: during training, whole layers are randomly skipped.
        dropout_probability = np.random.random() if self.layerdrop > 0 else 1
        if not self.training or (dropout_probability > self.layerdrop):
            x, (z, lr) = layer(
                x, self_attn_padding_mask=padding_mask, need_weights=False
            )
            if i >= min_layer:
                layer_results.append((x, z, lr))
        if i == tgt_layer:
            r = x
            break
    if r is not None:
        x = r
    # T x B x C -> B x T x C
    x = x.transpose(0, 1)
    # undo paddding
    if pad_length > 0:
        x = x[:, :-pad_length]

        def undo_pad(a, b, c):
            # Strip the padded frames from each recorded layer result.
            return (
                a[:-pad_length],
                b[:-pad_length] if b is not None else b,
                c[:-pad_length],
            )

        layer_results = [undo_pad(*u) for u in layer_results]
    return x, layer_results
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
    require_same_masks: bool = True,
    mask_dropout: float = 0.0,
) -> torch.Tensor:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from possion distribution with lambda = mask length
        min_masks: minimum number of masked spans
        no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
        mask_dropout: randomly dropout this percentage of masks in each example
    """
    bsz, all_sz = shape
    mask = torch.full((bsz, all_sz), False)

    all_num_mask = int(
        # add a random number for probabilistic rounding
        mask_prob * all_sz / float(mask_length)
        + torch.rand([1]).item()
    )

    all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if padding_mask is not None:
            # Only the unpadded prefix of this row is eligible for masking.
            sz = all_sz - padding_mask[i].long().sum().item()
            num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = all_num_mask

        if mask_type == "static":
            lengths = torch.full([num_mask], mask_length)
        elif mask_type == "uniform":
            # NOTE(review): torch.randint expects integer bounds; confirm
            # callers pass an int ``mask_other`` for this branch.
            lengths = torch.randint(mask_other, mask_length * 2 + 1, size=[num_mask])
        elif mask_type == "normal":
            lengths = torch.normal(mask_length, mask_other, size=[num_mask])
            lengths = [max(1, int(round(x))) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # Place one span inside [s, e) and return the remaining
                # unmasked segments large enough to host further spans.
                span_start = torch.randint(low=s, high=e - length, size=[1]).item()
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                t = [e - s if e - s >= length + min_space else 0 for s, e in parts]
                lens = torch.asarray(t, dtype=torch.int)
                l_sum = torch.sum(lens)
                if l_sum == 0:
                    break
                # Choose a segment with probability proportional to its size.
                probs = lens / torch.sum(lens)
                c = torch.multinomial(probs.float(), len(parts)).item()
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            mask_idc = torch.asarray(mask_idc)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                min_len = sz - num_mask - 1

            # Sample span starts, then expand each start into its full span.
            mask_idc = torch.asarray(
                random.sample([i for i in range(sz - min_len)], num_mask)
            )

            mask_idc = torch.asarray(
                [
                    mask_idc[j] + offset
                    for j in range(len(mask_idc))
                    for offset in range(lengths[j])
                ]
            )

        mask_idcs.append(torch.unique(mask_idc[mask_idc < sz]))

    min_len = min([len(m) for m in mask_idcs])
    for i, mask_idc in enumerate(mask_idcs):
        if isinstance(mask_idc, torch.Tensor):
            mask_idc = torch.asarray(mask_idc, dtype=torch.float)
        if len(mask_idc) > min_len and require_same_masks:
            # BUG FIX: was ``range(mask_idc)`` — range() over a 1-d tensor
            # raises TypeError. Sample from the index values themselves,
            # mirroring fairseq's np.random.choice(mask_idc, min_len).
            mask_idc = torch.asarray(random.sample(mask_idc.tolist(), min_len))
        if mask_dropout > 0:
            num_holes = int(round(len(mask_idc) * mask_dropout))
            # BUG FIX: same ``range(mask_idc)`` issue as above.
            mask_idc = torch.asarray(
                random.sample(mask_idc.tolist(), len(mask_idc) - num_holes)
            )

        mask[i, mask_idc.int()] = True

    return mask
def apply_mask(self, x, padding_mask, target_list):
    """Apply time and channel masking to *x* in place.

    Free-function replacement for HuBERT's ``apply_mask``; *self* is the
    model object carrying the masking hyper-parameters (``mask_prob``,
    ``mask_length``, ``mask_emb``, ...). *target_list* is accepted for
    signature compatibility but unused here.

    Returns:
        ``(x, mask_indices)`` where ``mask_indices`` is the bool mask of
        masked time steps, or ``None`` when ``mask_prob == 0``.
    """
    B, T, C = x.shape
    # NOTE(review): this allocation is unused — looks like dead code.
    torch.zeros_like(x)
    if self.mask_prob > 0:
        mask_indices = compute_mask_indices(
            (B, T),
            padding_mask,
            self.mask_prob,
            self.mask_length,
            self.mask_selection,
            self.mask_other,
            min_masks=2,
            no_overlap=self.no_mask_overlap,
            min_space=self.mask_min_space,
        )
        mask_indices = mask_indices.to(x.device)
        # Replace masked time steps with the learned mask embedding.
        x[mask_indices] = self.mask_emb
    else:
        mask_indices = None
    if self.mask_channel_prob > 0:
        mask_channel_indices = compute_mask_indices(
            (B, C),
            None,
            self.mask_channel_prob,
            self.mask_channel_length,
            self.mask_channel_selection,
            self.mask_channel_other,
            no_overlap=self.no_mask_channel_overlap,
            min_space=self.mask_channel_min_space,
        )
        # Broadcast the per-channel mask across every time step.
        mask_channel_indices = (
            mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1)
        )
        x[mask_channel_indices] = 0
    return x, mask_indices
def get_hubert_model(
    model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu")
):
    """Load a fairseq HuBERT checkpoint and patch it for TorchScript export.

    The stock fairseq methods are replaced by the module-level
    ``apply_mask`` / ``extract_features`` re-implementations defined in this
    file, and an ``infer`` entry point is attached for deployment.

    Returns:
        The patched hubert model on *device*.
    """
    models, _, _ = load_model_ensemble_and_task(
        [model_path],
        suffix="",
    )
    hubert_model = models[0]
    hubert_model = hubert_model.to(device)

    # Bind the module-level apply_mask with the model as its `self`.
    def _apply_mask(x, padding_mask, target_list):
        return apply_mask(hubert_model, x, padding_mask, target_list)

    hubert_model.apply_mask = _apply_mask

    # Bind the module-level extract_features with the encoder as its `self`.
    def _extract_features(
        x,
        padding_mask=None,
        tgt_layer=None,
        min_layer=0,
    ):
        return extract_features(
            hubert_model.encoder,
            x,
            padding_mask=padding_mask,
            tgt_layer=tgt_layer,
            min_layer=min_layer,
        )

    hubert_model.encoder.extract_features = _extract_features

    # Keep the original forward reachable under a private name.
    hubert_model._forward = hubert_model.forward

    def hubert_extract_features(
        self,
        source: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        mask: bool = False,
        ret_conv: bool = False,
        output_layer: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        res = self._forward(
            source,
            padding_mask=padding_mask,
            mask=mask,
            features_only=True,
            output_layer=output_layer,
        )
        feature = res["features"] if ret_conv else res["x"]
        return feature, res["padding_mask"]

    def _hubert_extract_features(
        source: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        mask: bool = False,
        ret_conv: bool = False,
        output_layer: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        return hubert_extract_features(
            hubert_model, source, padding_mask, mask, ret_conv, output_layer
        )

    hubert_model.extract_features = _hubert_extract_features

    # Deployment entry point; output_layer arrives as a 0-dim tensor so the
    # signature stays traceable.
    def infer(source, padding_mask, output_layer: torch.Tensor):
        output_layer = output_layer.item()
        logits = hubert_model.extract_features(
            source=source, padding_mask=padding_mask, output_layer=output_layer
        )
        # presumably layer 9 corresponds to the v1 feature head — TODO confirm.
        feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0]
        return feats

    hubert_model.infer = infer
    # hubert_model.forward=infer
    # hubert_model.forward
    return hubert_model

View File

@ -0,0 +1,12 @@
import torch
def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
    """Build the RMVPE ``E2E`` pitch model and load its weights onto *device*."""
    # Imported locally to avoid a circular import with infer.lib.rmvpe.
    from infer.lib.rmvpe import E2E

    net = E2E(4, 1, (2, 2))
    state = torch.load(model_path, map_location=device)
    net.load_state_dict(state)
    net.eval()
    return net.to(device)

View File

@ -0,0 +1,37 @@
import torch
def get_synthesizer(pth_path, device=torch.device("cpu")):
    """Load an RVC synthesizer checkpoint and build the matching generator.

    Arguments:
        pth_path: path to the ``.pth`` checkpoint.
        device:   device the generator is moved to.

    Returns:
        ``(net_g, cpt)`` — the eval-mode fp32 generator and the raw
        checkpoint dict (config, f0 flag, version tag, ...).

    Raises:
        ValueError: if the checkpoint's version tag is not "v1" or "v2".
    """
    from infer.lib.infer_pack.models import (
        SynthesizerTrnMs256NSFsid,
        SynthesizerTrnMs256NSFsid_nono,
        SynthesizerTrnMs768NSFsid,
        SynthesizerTrnMs768NSFsid_nono,
    )

    # NOTE(review): torch.load unpickles arbitrary code — only load trusted
    # checkpoint files.
    cpt = torch.load(pth_path, map_location=torch.device("cpu"))
    # Patch the speaker count in the config from the embedding weight shape.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    else:
        # Previously an unknown version fell through to a NameError below.
        raise ValueError(f"Unknown model version: {version!r}")
    # The posterior encoder is only needed during training.
    del net_g.enc_q
    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g = net_g.float()
    net_g.eval().to(device)
    return net_g, cpt

View File

@ -1,8 +1,11 @@
import pdb, os from io import BytesIO
import os
from typing import List, Optional, Tuple
import numpy as np import numpy as np
import torch import torch
from infer.lib import jit
try: try:
# Fix "Torch not compiled with CUDA enabled" # Fix "Torch not compiled with CUDA enabled"
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
@ -11,7 +14,7 @@ try:
from infer.modules.ipex import ipex_init from infer.modules.ipex import ipex_init
ipex_init() ipex_init()
except Exception: except Exception: # pylint: disable=broad-exception-caught
pass pass
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
@ -23,58 +26,6 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py
def window_sumsquare(
window,
n_frames,
hop_length=200,
win_length=800,
n_fft=800,
dtype=np.float32,
norm=None,
):
"""
# from librosa 0.6
Compute the sum-square envelope of a window function at a given hop length.
This is used to estimate modulation effects induced by windowing
observations in short-time fourier transforms.
Parameters
----------
window : string, tuple, number, callable, or list-like
Window specification, as in `get_window`
n_frames : int > 0
The number of analysis frames
hop_length : int > 0
The number of samples to advance between frames
win_length : [optional]
The length of the window function. By default, this matches `n_fft`.
n_fft : int > 0
The length of each analysis frame.
dtype : np.dtype
The data type of the output
Returns
-------
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
The sum-squared envelope of the window function
"""
if win_length is None:
win_length = n_fft
n = n_fft + hop_length * (n_frames - 1)
x = np.zeros(n, dtype=dtype)
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = normalize(win_sq, norm=norm) ** 2
win_sq = pad_center(win_sq, n_fft)
# Fill the envelope
for i in range(n_frames):
sample = i * hop_length
x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
return x
class STFT(torch.nn.Module): class STFT(torch.nn.Module):
def __init__( def __init__(
self, filter_length=1024, hop_length=512, win_length=None, window="hann" self, filter_length=1024, hop_length=512, win_length=None, window="hann"
@ -101,17 +52,14 @@ class STFT(torch.nn.Module):
self.window = window self.window = window
self.forward_transform = None self.forward_transform = None
self.pad_amount = int(self.filter_length / 2) self.pad_amount = int(self.filter_length / 2)
scale = self.filter_length / self.hop_length
fourier_basis = np.fft.fft(np.eye(self.filter_length)) fourier_basis = np.fft.fft(np.eye(self.filter_length))
cutoff = int((self.filter_length / 2 + 1)) cutoff = int((self.filter_length / 2 + 1))
fourier_basis = np.vstack( fourier_basis = np.vstack(
[np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
) )
forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) forward_basis = torch.FloatTensor(fourier_basis)
inverse_basis = torch.FloatTensor( inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis))
np.linalg.pinv(scale * fourier_basis).T[:, None, :]
)
assert filter_length >= self.win_length assert filter_length >= self.win_length
# get window and zero center pad it to filter_length # get window and zero center pad it to filter_length
@ -121,12 +69,13 @@ class STFT(torch.nn.Module):
# window the bases # window the bases
forward_basis *= fft_window forward_basis *= fft_window
inverse_basis *= fft_window inverse_basis = (inverse_basis.T * fft_window).T
self.register_buffer("forward_basis", forward_basis.float()) self.register_buffer("forward_basis", forward_basis.float())
self.register_buffer("inverse_basis", inverse_basis.float()) self.register_buffer("inverse_basis", inverse_basis.float())
self.register_buffer("fft_window", fft_window.float())
def transform(self, input_data): def transform(self, input_data, return_phase=False):
"""Take input data (audio) to STFT domain. """Take input data (audio) to STFT domain.
Arguments: Arguments:
@ -138,33 +87,24 @@ class STFT(torch.nn.Module):
phase {tensor} -- Phase of STFT with shape (num_batch, phase {tensor} -- Phase of STFT with shape (num_batch,
num_frequencies, num_frames) num_frequencies, num_frames)
""" """
num_batches = input_data.shape[0]
num_samples = input_data.shape[-1]
self.num_samples = num_samples
# similar to librosa, reflect-pad the input
input_data = input_data.view(num_batches, 1, num_samples)
# print(1234,input_data.shape)
input_data = F.pad( input_data = F.pad(
input_data.unsqueeze(1), input_data,
(self.pad_amount, self.pad_amount, 0, 0, 0, 0), (self.pad_amount, self.pad_amount),
mode="reflect", mode="reflect",
).squeeze(1)
# print(2333,input_data.shape,self.forward_basis.shape,self.hop_length)
# pdb.set_trace()
forward_transform = F.conv1d(
input_data, self.forward_basis, stride=self.hop_length, padding=0
) )
forward_transform = input_data.unfold(
1, self.filter_length, self.hop_length
).permute(0, 2, 1)
forward_transform = torch.matmul(self.forward_basis, forward_transform)
cutoff = int((self.filter_length / 2) + 1) cutoff = int((self.filter_length / 2) + 1)
real_part = forward_transform[:, :cutoff, :] real_part = forward_transform[:, :cutoff, :]
imag_part = forward_transform[:, cutoff:, :] imag_part = forward_transform[:, cutoff:, :]
magnitude = torch.sqrt(real_part**2 + imag_part**2) magnitude = torch.sqrt(real_part**2 + imag_part**2)
# phase = torch.atan2(imag_part.data, real_part.data) if return_phase:
phase = torch.atan2(imag_part.data, real_part.data)
return magnitude # , phase return magnitude, phase
else:
return magnitude
def inverse(self, magnitude, phase): def inverse(self, magnitude, phase):
"""Call the inverse STFT (iSTFT), given magnitude and phase tensors produced """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
@ -180,42 +120,25 @@ class STFT(torch.nn.Module):
inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
shape (num_batch, num_samples) shape (num_batch, num_samples)
""" """
recombine_magnitude_phase = torch.cat( cat = torch.cat(
[magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
) )
fold = torch.nn.Fold(
inverse_transform = F.conv_transpose1d( output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length),
recombine_magnitude_phase, kernel_size=(1, self.filter_length),
self.inverse_basis, stride=(1, self.hop_length),
stride=self.hop_length,
padding=0,
) )
inverse_transform = torch.matmul(self.inverse_basis, cat)
if self.window is not None: inverse_transform = fold(inverse_transform)[
window_sum = window_sumsquare( :, 0, 0, self.pad_amount : -self.pad_amount
self.window,
magnitude.size(-1),
hop_length=self.hop_length,
win_length=self.win_length,
n_fft=self.filter_length,
dtype=np.float32,
)
# remove modulation effects
approx_nonzero_indices = torch.from_numpy(
np.where(window_sum > tiny(window_sum))[0]
)
window_sum = torch.from_numpy(window_sum).to(inverse_transform.device)
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
approx_nonzero_indices
] ]
window_square_sum = (
# scale by hop ratio self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0)
inverse_transform *= float(self.filter_length) / self.hop_length )
window_square_sum = fold(window_square_sum)[
inverse_transform = inverse_transform[..., self.pad_amount :] :, 0, 0, self.pad_amount : -self.pad_amount
inverse_transform = inverse_transform[..., : self.num_samples] ]
inverse_transform = inverse_transform.squeeze(1) inverse_transform /= window_square_sum
return inverse_transform return inverse_transform
def forward(self, input_data): def forward(self, input_data):
@ -228,7 +151,7 @@ class STFT(torch.nn.Module):
reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
shape (num_batch, num_samples) shape (num_batch, num_samples)
""" """
self.magnitude, self.phase = self.transform(input_data) self.magnitude, self.phase = self.transform(input_data, return_phase=True)
reconstruction = self.inverse(self.magnitude, self.phase) reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction return reconstruction
@ -276,17 +199,15 @@ class ConvBlockRes(nn.Module):
nn.BatchNorm2d(out_channels, momentum=momentum), nn.BatchNorm2d(out_channels, momentum=momentum),
nn.ReLU(), nn.ReLU(),
) )
# self.shortcut:Optional[nn.Module] = None
if in_channels != out_channels: if in_channels != out_channels:
self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
self.is_shortcut = True
else:
self.is_shortcut = False
def forward(self, x): def forward(self, x: torch.Tensor):
if self.is_shortcut: if not hasattr(self, "shortcut"):
return self.conv(x) + self.shortcut(x)
else:
return self.conv(x) + x return self.conv(x) + x
else:
return self.conv(x) + self.shortcut(x)
class Encoder(nn.Module): class Encoder(nn.Module):
@ -318,12 +239,12 @@ class Encoder(nn.Module):
self.out_size = in_size self.out_size = in_size
self.out_channel = out_channels self.out_channel = out_channels
def forward(self, x): def forward(self, x: torch.Tensor):
concat_tensors = [] concat_tensors: List[torch.Tensor] = []
x = self.bn(x) x = self.bn(x)
for i in range(self.n_encoders): for i, layer in enumerate(self.layers):
_, x = self.layers[i](x) t, x = layer(x)
concat_tensors.append(_) concat_tensors.append(t)
return x, concat_tensors return x, concat_tensors
@ -342,8 +263,8 @@ class ResEncoderBlock(nn.Module):
self.pool = nn.AvgPool2d(kernel_size=kernel_size) self.pool = nn.AvgPool2d(kernel_size=kernel_size)
def forward(self, x): def forward(self, x):
for i in range(self.n_blocks): for i, conv in enumerate(self.conv):
x = self.conv[i](x) x = conv(x)
if self.kernel_size is not None: if self.kernel_size is not None:
return x, self.pool(x) return x, self.pool(x)
else: else:
@ -364,8 +285,8 @@ class Intermediate(nn.Module): #
) )
def forward(self, x): def forward(self, x):
for i in range(self.n_inters): for i, layer in enumerate(self.layers):
x = self.layers[i](x) x = layer(x)
return x return x
@ -395,8 +316,8 @@ class ResDecoderBlock(nn.Module):
def forward(self, x, concat_tensor): def forward(self, x, concat_tensor):
x = self.conv1(x) x = self.conv1(x)
x = torch.cat((x, concat_tensor), dim=1) x = torch.cat((x, concat_tensor), dim=1)
for i in range(self.n_blocks): for i, conv2 in enumerate(self.conv2):
x = self.conv2[i](x) x = conv2(x)
return x return x
@ -412,9 +333,9 @@ class Decoder(nn.Module):
) )
in_channels = out_channels in_channels = out_channels
def forward(self, x, concat_tensors): def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
for i in range(self.n_decoders): for i, layer in enumerate(self.layers):
x = self.layers[i](x, concat_tensors[-1 - i]) x = layer(x, concat_tensors[-1 - i])
return x return x
@ -442,7 +363,7 @@ class DeepUnet(nn.Module):
self.encoder.out_channel, en_de_layers, kernel_size, n_blocks self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
) )
def forward(self, x): def forward(self, x: torch.Tensor) -> torch.Tensor:
x, concat_tensors = self.encoder(x) x, concat_tensors = self.encoder(x)
x = self.intermediate(x) x = self.intermediate(x)
x = self.decoder(x, concat_tensors) x = self.decoder(x, concat_tensors)
@ -536,33 +457,28 @@ class MelSpectrogram(torch.nn.Module):
keyshift_key = str(keyshift) + "_" + str(audio.device) keyshift_key = str(keyshift) + "_" + str(audio.device)
if keyshift_key not in self.hann_window: if keyshift_key not in self.hann_window:
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
# "cpu"if(audio.device.type=="privateuseone") else audio.device
audio.device audio.device
) )
# fft = torch.stft(#doesn't support pytorch_dml if "privateuseone" in str(audio.device):
# # audio.cpu() if(audio.device.type=="privateuseone")else audio, if not hasattr(self, "stft"):
# audio,
# n_fft=n_fft_new,
# hop_length=hop_length_new,
# win_length=win_length_new,
# window=self.hann_window[keyshift_key],
# center=center,
# return_complex=True,
# )
# magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
# print(1111111111)
# print(222222222222222,audio.device,self.is_half)
if hasattr(self, "stft") == False:
# print(n_fft_new,hop_length_new,win_length_new,audio.shape)
self.stft = STFT( self.stft = STFT(
filter_length=n_fft_new, filter_length=n_fft_new,
hop_length=hop_length_new, hop_length=hop_length_new,
win_length=win_length_new, win_length=win_length_new,
window="hann", window="hann",
).to(audio.device) ).to(audio.device)
magnitude = self.stft.transform(audio) # phase magnitude = self.stft.transform(audio)
# if (audio.device.type == "privateuseone"): else:
# magnitude=magnitude.to(audio.device) fft = torch.stft(
audio,
n_fft=n_fft_new,
hop_length=hop_length_new,
win_length=win_length_new,
window=self.hann_window[keyshift_key],
center=center,
return_complex=True,
)
magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
if keyshift != 0: if keyshift != 0:
size = self.n_fft // 2 + 1 size = self.n_fft // 2 + 1
resize = magnitude.size(1) resize = magnitude.size(1)
@ -573,17 +489,16 @@ class MelSpectrogram(torch.nn.Module):
if self.is_half == True: if self.is_half == True:
mel_output = mel_output.half() mel_output = mel_output.half()
log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
# print(log_mel_spec.device.type)
return log_mel_spec return log_mel_spec
class RMVPE: class RMVPE:
def __init__(self, model_path, is_half, device=None): def __init__(self, model_path: str, is_half, device=None, use_jit=False):
self.resample_kernel = {} self.resample_kernel = {}
self.resample_kernel = {} self.resample_kernel = {}
self.is_half = is_half self.is_half = is_half
if device is None: if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu" device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.device = device self.device = device
self.mel_extractor = MelSpectrogram( self.mel_extractor = MelSpectrogram(
is_half, 128, 16000, 1024, 160, None, 30, 8000 is_half, 128, 16000, 1024, 160, None, 30, 8000
@ -597,13 +512,56 @@ class RMVPE:
) )
self.model = ort_session self.model = ort_session
else: else:
if str(self.device) == "cuda":
self.device = torch.device("cuda:0")
def get_jit_model():
jit_model_path = model_path.rstrip(".pth")
jit_model_path += ".half.jit" if is_half else ".jit"
reload = False
if os.path.exists(jit_model_path):
ckpt = jit.load(jit_model_path)
model_device = ckpt["device"]
if model_device != str(self.device):
reload = True
else:
reload = True
if reload:
ckpt = jit.rmvpe_jit_export(
model_path=model_path,
mode="script",
inputs_path=None,
save_path=jit_model_path,
device=device,
is_half=is_half,
)
model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
return model
def get_default_model():
model = E2E(4, 1, (2, 2)) model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location="cpu") ckpt = torch.load(model_path, map_location="cpu")
model.load_state_dict(ckpt) model.load_state_dict(ckpt)
model.eval() model.eval()
if is_half == True: if is_half:
model = model.half() model = model.half()
self.model = model else:
model = model.float()
return model
if use_jit:
if is_half and "cpu" in str(self.device):
logger.warning(
"Use default rmvpe model. \
Jit is not supported on the CPU for half floating point"
)
self.model = get_default_model()
else:
self.model = get_jit_model()
else:
self.model = get_default_model()
self.model = self.model.to(device) self.model = self.model.to(device)
cents_mapping = 20 * np.arange(360) + 1997.3794084376191 cents_mapping = 20 * np.arange(360) + 1997.3794084376191
self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
@ -611,9 +569,9 @@ class RMVPE:
def mel2hidden(self, mel): def mel2hidden(self, mel):
with torch.no_grad(): with torch.no_grad():
n_frames = mel.shape[-1] n_frames = mel.shape[-1]
mel = F.pad( n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant" if n_pad > 0:
) mel = F.pad(mel, (0, n_pad), mode="constant")
if "privateuseone" in str(self.device): if "privateuseone" in str(self.device):
onnx_input_name = self.model.get_inputs()[0].name onnx_input_name = self.model.get_inputs()[0].name
onnx_outputs_names = self.model.get_outputs()[0].name onnx_outputs_names = self.model.get_outputs()[0].name
@ -622,6 +580,7 @@ class RMVPE:
input_feed={onnx_input_name: mel.cpu().numpy()}, input_feed={onnx_input_name: mel.cpu().numpy()},
)[0] )[0]
else: else:
mel = mel.half() if self.is_half else mel.float()
hidden = self.model(mel) hidden = self.model(mel)
return hidden[:, :n_frames] return hidden[:, :n_frames]

View File

@ -17,7 +17,6 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.device = torch.xpu.device torch.cuda.device = torch.xpu.device
torch.cuda.device_count = torch.xpu.device_count torch.cuda.device_count = torch.xpu.device_count
torch.cuda.device_of = torch.xpu.device_of torch.cuda.device_of = torch.xpu.device_of
torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_name = torch.xpu.get_device_name torch.cuda.get_device_name = torch.xpu.get_device_name
torch.cuda.get_device_properties = torch.xpu.get_device_properties torch.cuda.get_device_properties = torch.xpu.get_device_properties
torch.cuda.init = torch.xpu.init torch.cuda.init = torch.xpu.init
@ -169,9 +168,23 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.get_device_properties.minor = 7 torch.cuda.get_device_properties.minor = 7
torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.ipc_collect = lambda *args, **kwargs: None
torch.cuda.utilization = lambda *args, **kwargs: 0 torch.cuda.utilization = lambda *args, **kwargs: 0
if hasattr(torch.xpu, "getDeviceIdListForCard"):
torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
else:
torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
torch.cuda.get_device_id_list_per_card = (
torch.xpu.get_device_id_list_per_card
)
ipex_hijacks() ipex_hijacks()
attention_init() attention_init()
try:
from .diffusers import ipex_diffusers
ipex_diffusers()
except Exception: # pylint: disable=broad-exception-caught
pass
except Exception as e: except Exception as e:
return False, e return False, e
return True, None return True, None

View File

@ -16,17 +16,15 @@ def torch_bmm(input, mat2, *, out=None):
input.shape[1], input.shape[1],
mat2.shape[2], mat2.shape[2],
) )
block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 block_multiply = input.element_size()
block_size = ( slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
(batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply block_size = batch_size_attention * slice_block_size
) # MB
split_slice_size = batch_size_attention split_slice_size = batch_size_attention
if block_size >= 4000: if block_size > 4:
do_split = True do_split = True
# Find something divisible with the input_tokens # Find something divisible with the input_tokens
while ( while (split_slice_size * slice_block_size) > 4:
(split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply
) > 4000:
split_slice_size = split_slice_size // 2 split_slice_size = split_slice_size // 2
if split_slice_size <= 1: if split_slice_size <= 1:
split_slice_size = 1 split_slice_size = 1
@ -34,16 +32,12 @@ def torch_bmm(input, mat2, *, out=None):
else: else:
do_split = False do_split = False
split_block_size = (
(split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply
) # MB
split_2_slice_size = input_tokens split_2_slice_size = input_tokens
if split_block_size >= 4000: if split_slice_size * slice_block_size > 4:
slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
do_split_2 = True do_split_2 = True
# Find something divisible with the input_tokens # Find something divisible with the input_tokens
while ( while (split_2_slice_size * slice_block_size2) > 4:
(split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply
) > 4000:
split_2_slice_size = split_2_slice_size // 2 split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1: if split_2_slice_size <= 1:
split_2_slice_size = 1 split_2_slice_size = 1
@ -91,22 +85,25 @@ def scaled_dot_product_attention(
query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
): ):
# ARC GPUs can't allocate more than 4GB to a single block, Slice it: # ARC GPUs can't allocate more than 4GB to a single block, Slice it:
if len(query.shape) == 3:
batch_size_attention, query_tokens, shape_four = query.shape
shape_one = 1
no_shape_one = True
else:
shape_one, batch_size_attention, query_tokens, shape_four = query.shape shape_one, batch_size_attention, query_tokens, shape_four = query.shape
block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 no_shape_one = False
block_size = (
(shape_one * batch_size_attention * query_tokens * shape_four) block_multiply = query.element_size()
/ 1024 slice_block_size = (
* block_multiply shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
) # MB )
block_size = batch_size_attention * slice_block_size
split_slice_size = batch_size_attention split_slice_size = batch_size_attention
if block_size >= 4000: if block_size > 4:
do_split = True do_split = True
# Find something divisible with the shape_one # Find something divisible with the shape_one
while ( while (split_slice_size * slice_block_size) > 4:
(shape_one * split_slice_size * query_tokens * shape_four)
/ 1024
* block_multiply
) > 4000:
split_slice_size = split_slice_size // 2 split_slice_size = split_slice_size // 2
if split_slice_size <= 1: if split_slice_size <= 1:
split_slice_size = 1 split_slice_size = 1
@ -114,20 +111,14 @@ def scaled_dot_product_attention(
else: else:
do_split = False do_split = False
split_block_size = (
(shape_one * split_slice_size * query_tokens * shape_four)
/ 1024
* block_multiply
) # MB
split_2_slice_size = query_tokens split_2_slice_size = query_tokens
if split_block_size >= 4000: if split_slice_size * slice_block_size > 4:
slice_block_size2 = (
shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
)
do_split_2 = True do_split_2 = True
# Find something divisible with the batch_size_attention # Find something divisible with the batch_size_attention
while ( while (split_2_slice_size * slice_block_size2) > 4:
(shape_one * split_slice_size * split_2_slice_size * shape_four)
/ 1024
* block_multiply
) > 4000:
split_2_slice_size = split_2_slice_size // 2 split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1: if split_2_slice_size <= 1:
split_2_slice_size = 1 split_2_slice_size = 1
@ -146,13 +137,45 @@ def scaled_dot_product_attention(
): # pylint: disable=invalid-name ): # pylint: disable=invalid-name
start_idx_2 = i2 * split_2_slice_size start_idx_2 = i2 * split_2_slice_size
end_idx_2 = (i2 + 1) * split_2_slice_size end_idx_2 = (i2 + 1) * split_2_slice_size
if no_shape_one:
hidden_states[
start_idx:end_idx, start_idx_2:end_idx_2
] = original_scaled_dot_product_attention(
query[start_idx:end_idx, start_idx_2:end_idx_2],
key[start_idx:end_idx, start_idx_2:end_idx_2],
value[start_idx:end_idx, start_idx_2:end_idx_2],
attn_mask=attn_mask[
start_idx:end_idx, start_idx_2:end_idx_2
]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
else:
hidden_states[ hidden_states[
:, start_idx:end_idx, start_idx_2:end_idx_2 :, start_idx:end_idx, start_idx_2:end_idx_2
] = original_scaled_dot_product_attention( ] = original_scaled_dot_product_attention(
query[:, start_idx:end_idx, start_idx_2:end_idx_2], query[:, start_idx:end_idx, start_idx_2:end_idx_2],
key[:, start_idx:end_idx, start_idx_2:end_idx_2], key[:, start_idx:end_idx, start_idx_2:end_idx_2],
value[:, start_idx:end_idx, start_idx_2:end_idx_2], value[:, start_idx:end_idx, start_idx_2:end_idx_2],
attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] attn_mask=attn_mask[
:, start_idx:end_idx, start_idx_2:end_idx_2
]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
else:
if no_shape_one:
hidden_states[
start_idx:end_idx
] = original_scaled_dot_product_attention(
query[start_idx:end_idx],
key[start_idx:end_idx],
value[start_idx:end_idx],
attn_mask=attn_mask[start_idx:end_idx]
if attn_mask is not None if attn_mask is not None
else attn_mask, else attn_mask,
dropout_p=dropout_p, dropout_p=dropout_p,

View File

@ -23,14 +23,16 @@ try:
if torch.xpu.is_available(): if torch.xpu.is_available():
from infer.modules.ipex import ipex_init from infer.modules.ipex import ipex_init
from infer.modules.ipex.gradscaler import gradscaler_init
ipex_init()
from torch.xpu.amp import autocast from torch.xpu.amp import autocast
from infer.modules.ipex.gradscaler import gradscaler_init
GradScaler = gradscaler_init() GradScaler = gradscaler_init()
ipex_init()
else: else:
from torch.cuda.amp import GradScaler, autocast from torch.cuda.amp import GradScaler, autocast
except Exception: except Exception: # pylint: disable=broad-exception-caught
from torch.cuda.amp import GradScaler, autocast from torch.cuda.amp import GradScaler, autocast
torch.backends.cudnn.deterministic = False torch.backends.cudnn.deterministic = False
@ -104,14 +106,11 @@ def main():
os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555)) os.environ["MASTER_PORT"] = str(randint(20000, 55555))
children = [] children = []
logger = utils.get_logger(hps.model_dir)
for i in range(n_gpus): for i in range(n_gpus):
subproc = mp.Process( subproc = mp.Process(
target=run, target=run,
args=( args=(i, n_gpus, hps, logger),
i,
n_gpus,
hps,
),
) )
children.append(subproc) children.append(subproc)
subproc.start() subproc.start()
@ -120,10 +119,10 @@ def main():
children[i].join() children[i].join()
def run(rank, n_gpus, hps): def run(rank, n_gpus, hps, logger: logging.Logger):
global global_step global global_step
if rank == 0: if rank == 0:
logger = utils.get_logger(hps.model_dir) # logger = utils.get_logger(hps.model_dir)
logger.info(hps) logger.info(hps)
# utils.check_git_hash(hps.model_dir) # utils.check_git_hash(hps.model_dir)
writer = SummaryWriter(log_dir=hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir)

View File

@ -16,13 +16,13 @@ from infer.lib.uvr5_pack.utils import inference
class AudioPre: class AudioPre:
def __init__(self, agg, model_path, device, is_half): def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path self.model_path = model_path
self.device = device self.device = device
self.data = { self.data = {
# Processing Options # Processing Options
"postprocess": False, "postprocess": False,
"tta": False, "tta": tta,
# Constants # Constants
"window_size": 512, "window_size": 512,
"agg": agg, "agg": agg,
@ -180,13 +180,13 @@ class AudioPre:
class AudioPreDeEcho: class AudioPreDeEcho:
def __init__(self, agg, model_path, device, is_half): def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path self.model_path = model_path
self.device = device self.device = device
self.data = { self.data = {
# Processing Options # Processing Options
"postprocess": False, "postprocess": False,
"tta": False, "tta": tta,
# Constants # Constants
"window_size": 512, "window_size": 512,
"agg": agg, "agg": agg,

View File

@ -54,16 +54,10 @@ class VC:
if sid == "" or sid == []: if sid == "" or sid == []:
if self.hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 if self.hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
logger.info("Clean model cache") logger.info("Clean model cache")
del ( del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt
self.net_g,
self.n_spk,
self.vc,
self.hubert_model,
self.tgt_sr,
) # ,cpt
self.hubert_model = ( self.hubert_model = (
self.net_g self.net_g
) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None ) = self.n_spk = self.hubert_model = self.tgt_sr = None
if torch.cuda.is_available(): if torch.cuda.is_available():
torch.cuda.empty_cache() torch.cuda.empty_cache()
###楼下不这么折腾清理不干净 ###楼下不这么折腾清理不干净

View File

@ -1,307 +0,0 @@
import traceback
import logging
logger = logging.getLogger(__name__)
import numpy as np
import soundfile as sf
import torch
from io import BytesIO
from infer.lib.audio import load_audio, wav2
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
from infer.modules.vc.pipeline import Pipeline
from infer.modules.vc.utils import *
class VC:
    """Holds one loaded RVC voice-conversion model and its inference pipeline.

    Used by the Gradio UI: ``get_vc`` (de)selects a speaker checkpoint,
    ``vc_single`` converts one audio file, ``vc_multi`` converts a batch.
    Dicts containing ``"__type__": "update"`` are Gradio component updates.
    """

    def __init__(self, config):
        """Create an empty holder; the model itself is loaded lazily by ``get_vc``.

        Args:
            config: project Config object providing ``is_half`` and ``device``.
        """
        self.n_spk = None  # speaker count of the loaded checkpoint
        self.tgt_sr = None  # target sample rate of the loaded checkpoint
        self.net_g = None  # synthesizer network
        self.pipeline = None  # inference Pipeline, built in get_vc
        self.cpt = None  # raw checkpoint dict from torch.load
        self.version = None  # model version tag: "v1" or "v2"
        self.if_f0 = None  # 1 if the model uses an F0 (pitch) input
        self.version = None  # NOTE(review): duplicate assignment — harmless but redundant
        self.hubert_model = None  # HuBERT feature extractor, loaded lazily
        self.config = config

    def get_vc(self, sid, *to_return_protect):
        """Select speaker checkpoint *sid*, or unload everything when *sid* is empty.

        Returns Gradio update dicts (speaker slider, two protect sliders,
        index-path textboxes). Passing an empty ``sid`` frees the cached models.
        """
        logger.info("Get sid: " + sid)

        to_return_protect0 = {
            "visible": self.if_f0 != 0,
            "value": to_return_protect[0]
            if self.if_f0 != 0 and to_return_protect
            else 0.5,
            "__type__": "update",
        }
        to_return_protect1 = {
            "visible": self.if_f0 != 0,
            "value": to_return_protect[1]
            if self.if_f0 != 0 and to_return_protect
            else 0.33,
            "__type__": "update",
        }

        if sid == "" or sid == []:
            # Polling may leave a model loaded; check whether sid switched
            # from "some model" to "no model" and clean up if so.
            if self.hubert_model is not None:
                logger.info("Clean model cache")
                # NOTE(review): self.vc is never assigned in __init__ — this
                # del looks like it can raise AttributeError; confirm.
                del (
                    self.net_g,
                    self.n_spk,
                    self.vc,
                    self.hubert_model,
                    self.tgt_sr,
                )  # ,cpt
                self.hubert_model = (
                    self.net_g
                ) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                # Without the re-instantiate-then-delete dance below, GPU
                # memory is not fully released.
                self.if_f0 = self.cpt.get("f0", 1)
                self.version = self.cpt.get("version", "v1")
                if self.version == "v1":
                    if self.if_f0 == 1:
                        self.net_g = SynthesizerTrnMs256NSFsid(
                            *self.cpt["config"], is_half=self.config.is_half
                        )
                    else:
                        self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
                elif self.version == "v2":
                    if self.if_f0 == 1:
                        self.net_g = SynthesizerTrnMs768NSFsid(
                            *self.cpt["config"], is_half=self.config.is_half
                        )
                    else:
                        self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
                del self.net_g, self.cpt
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            return (
                {"visible": False, "__type__": "update"},
                {
                    "visible": True,
                    "value": to_return_protect0,
                    "__type__": "update",
                },
                {
                    "visible": True,
                    "value": to_return_protect1,
                    "__type__": "update",
                },
                "",
                "",
            )
        person = f'{os.getenv("weight_root")}/{sid}'
        logger.info(f"Loading: {person}")

        self.cpt = torch.load(person, map_location="cpu")
        self.tgt_sr = self.cpt["config"][-1]
        # Overwrite the stored speaker count with the actual embedding size.
        self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
        self.if_f0 = self.cpt.get("f0", 1)
        self.version = self.cpt.get("version", "v1")

        # Pick the synthesizer class by (version, f0) pair; default to v1+f0.
        synthesizer_class = {
            ("v1", 1): SynthesizerTrnMs256NSFsid,
            ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
            ("v2", 1): SynthesizerTrnMs768NSFsid,
            ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
        }

        self.net_g = synthesizer_class.get(
            (self.version, self.if_f0), SynthesizerTrnMs256NSFsid
        )(*self.cpt["config"], is_half=self.config.is_half)

        # enc_q is only needed for training; drop it for inference.
        del self.net_g.enc_q

        self.net_g.load_state_dict(self.cpt["weight"], strict=False)
        self.net_g.eval().to(self.config.device)
        if self.config.is_half:
            self.net_g = self.net_g.half()
        else:
            self.net_g = self.net_g.float()

        self.pipeline = Pipeline(self.tgt_sr, self.config)
        n_spk = self.cpt["config"][-3]
        index = {"value": get_index_path_from_model(sid), "__type__": "update"}
        logger.info("Select index: " + index["value"])

        return (
            (
                {"visible": True, "maximum": n_spk, "__type__": "update"},
                to_return_protect0,
                to_return_protect1,
                index,
                index,
            )
            if to_return_protect
            else {"visible": True, "maximum": n_spk, "__type__": "update"}
        )

    def vc_single(
        self,
        sid,
        input_audio_path,
        f0_up_key,
        f0_file,
        f0_method,
        file_index,
        file_index2,
        index_rate,
        filter_radius,
        resample_sr,
        rms_mix_rate,
        protect,
    ):
        """Convert one audio file with the currently loaded model.

        Returns:
            ``(status_message, (sample_rate, audio))`` on success, or
            ``(traceback_text, (None, None))`` on failure.
        """
        if input_audio_path is None:
            return "You need to upload an audio", None
        f0_up_key = int(f0_up_key)
        try:
            audio = load_audio(input_audio_path, 16000)
            # Normalize only if the peak would clip after the 0.95 headroom.
            audio_max = np.abs(audio).max() / 0.95
            if audio_max > 1:
                audio /= audio_max
            times = [0, 0, 0]  # accumulators: [feature (npy), f0, inference] seconds

            if self.hubert_model is None:
                self.hubert_model = load_hubert(self.config)

            file_index = (
                (
                    file_index.strip(" ")
                    .strip('"')
                    .strip("\n")
                    .strip('"')
                    .strip(" ")
                    .replace("trained", "added")
                )
                if file_index != ""
                else file_index2
            )  # Auto-correct a common user mistake: "trained" index instead of "added"

            audio_opt = self.pipeline.pipeline(
                self.hubert_model,
                self.net_g,
                sid,
                audio,
                input_audio_path,
                times,
                f0_up_key,
                f0_method,
                file_index,
                index_rate,
                self.if_f0,
                filter_radius,
                self.tgt_sr,
                resample_sr,
                rms_mix_rate,
                self.version,
                protect,
                f0_file,
            )
            # Chained comparison: resample only when resample_sr >= 16000
            # AND it differs from the model's native rate.
            if self.tgt_sr != resample_sr >= 16000:
                tgt_sr = resample_sr
            else:
                tgt_sr = self.tgt_sr
            index_info = (
                "Index:\n%s." % file_index
                if os.path.exists(file_index)
                else "Index not used."
            )
            return (
                "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
                % (index_info, *times),
                (tgt_sr, audio_opt),
            )
        except:
            info = traceback.format_exc()
            logger.warning(info)
            return info, (None, None)

    def vc_multi(
        self,
        sid,
        dir_path,
        opt_root,
        paths,
        f0_up_key,
        f0_method,
        file_index,
        file_index2,
        index_rate,
        filter_radius,
        resample_sr,
        rms_mix_rate,
        protect,
        format1,
    ):
        """Batch-convert every file in *dir_path* (or the uploaded *paths*).

        Generator: yields a cumulative progress report string after each file,
        written to *opt_root* in format *format1* ("wav"/"flac" directly via
        soundfile, anything else transcoded through an in-memory WAV).
        """
        try:
            dir_path = (
                dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            )  # Strip stray spaces/quotes/newlines users may copy along with the path
            opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            os.makedirs(opt_root, exist_ok=True)
            try:
                # Prefer the directory listing; fall back to uploaded file objects.
                if dir_path != "":
                    paths = [
                        os.path.join(dir_path, name) for name in os.listdir(dir_path)
                    ]
                else:
                    paths = [path.name for path in paths]
            except:
                traceback.print_exc()
                paths = [path.name for path in paths]
            infos = []
            for path in paths:
                info, opt = self.vc_single(
                    sid,
                    path,
                    f0_up_key,
                    None,
                    f0_method,
                    file_index,
                    file_index2,
                    # file_big_npy,
                    index_rate,
                    filter_radius,
                    resample_sr,
                    rms_mix_rate,
                    protect,
                )
                if "Success" in info:
                    try:
                        tgt_sr, audio_opt = opt
                        if format1 in ["wav", "flac"]:
                            sf.write(
                                "%s/%s.%s"
                                % (opt_root, os.path.basename(path), format1),
                                audio_opt,
                                tgt_sr,
                            )
                        else:
                            # Other formats: write a WAV in memory, then let
                            # wav2 transcode it to the requested container.
                            path = "%s/%s.%s" % (
                                opt_root,
                                os.path.basename(path),
                                format1,
                            )
                            with BytesIO() as wavf:
                                sf.write(wavf, audio_opt, tgt_sr, format="wav")
                                wavf.seek(0, 0)
                                with open(path, "wb") as outf:
                                    wav2(wavf, outf, format1)
                    except:
                        info += traceback.format_exc()
                infos.append("%s->%s" % (os.path.basename(path), info))
                yield "\n".join(infos)
            yield "\n".join(infos)
        except:
            yield traceback.format_exc()

View File

@ -2,7 +2,7 @@ torch==2.0.1a0
intel_extension_for_pytorch==2.0.110+xpu intel_extension_for_pytorch==2.0.110+xpu
torchvision==0.15.2a0 torchvision==0.15.2a0
https://github.com/Disty0/Retrieval-based-Voice-Conversion-WebUI/releases/download/torchaudio_wheels_for_ipex/torchaudio-2.0.2+31de77d-cp310-cp310-linux_x86_64.whl https://github.com/Disty0/Retrieval-based-Voice-Conversion-WebUI/releases/download/torchaudio_wheels_for_ipex/torchaudio-2.0.2+31de77d-cp310-cp310-linux_x86_64.whl
-f https://developer.intel.com/ipex-whl-stable-xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
joblib>=1.1.0 joblib>=1.1.0
numba==0.56.4 numba==0.56.4
numpy==1.23.5 numpy==1.23.5

79
tools/download_models.py Normal file
View File

@ -0,0 +1,79 @@
import os
from pathlib import Path
import requests
# Base URL of the Hugging Face repository that hosts the RVC model assets.
RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"

# Repository root: this file lives in tools/, so go up two levels.
BASE_DIR = Path(__file__).resolve().parent.parent
def dl_model(link, model_name, dir_name):
    """Download ``model_name`` from ``link`` and save it under ``dir_name``.

    Args:
        link: base URL (joined by simple concatenation, so it should end
            with ``/``).
        model_name: file name appended to ``link``; may contain a relative
            path, whose directories are created under ``dir_name``.
        dir_name: destination directory as a ``pathlib.Path``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if connecting or reading stalls.
    """
    target = dir_name / model_name
    os.makedirs(os.path.dirname(target), exist_ok=True)
    # stream=True is essential: these models are hundreds of MB, and without
    # it requests buffers the entire response body in memory before
    # iter_content() ever sees a chunk. The timeout prevents a dead mirror
    # from hanging the script forever (10 s connect, 300 s per read).
    with requests.get(f"{link}{model_name}", stream=True, timeout=(10, 300)) as r:
        r.raise_for_status()
        with open(target, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
if __name__ == "__main__":
print("Downloading hubert_base.pt...")
dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert")
print("Downloading rmvpe.pt...")
dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe")
print("Downloading vocals.onnx...")
dl_model(
RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/",
"vocals.onnx",
BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy",
)
rvc_models_dir = BASE_DIR / "assets/pretrained"
print("Downloading pretrained models:")
model_names = [
"D32k.pth",
"D40k.pth",
"D48k.pth",
"G32k.pth",
"G40k.pth",
"G48k.pth",
"f0D32k.pth",
"f0D40k.pth",
"f0D48k.pth",
"f0G32k.pth",
"f0G40k.pth",
"f0G48k.pth",
]
for model in model_names:
print(f"Downloading {model}...")
dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir)
rvc_models_dir = BASE_DIR / "assets/pretrained_v2"
print("Downloading pretrained models v2:")
for model in model_names:
print(f"Downloading {model}...")
dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir)
print("Downloading uvr5_weights:")
rvc_models_dir = BASE_DIR / "assets/uvr5_weights"
model_names = [
"HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth",
"HP2_all_vocals.pth",
"HP3_all_vocals.pth",
"HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth",
"HP5_only_main_vocal.pth",
"VR-DeEchoAggressive.pth",
"VR-DeEchoDeReverb.pth",
"VR-DeEchoNormal.pth",
]
for model in model_names:
print(f"Downloading {model}...")
dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir)
print("All models downloaded!")

View File

@ -1,12 +1,11 @@
from io import BytesIO
import os import os
import pickle
import sys import sys
import traceback import traceback
import logging from infer.lib import jit
from infer.lib.jit.get_synthesizer import get_synthesizer
logger = logging.getLogger(__name__)
from time import time as ttime from time import time as ttime
import fairseq import fairseq
import faiss import faiss
import numpy as np import numpy as np
@ -31,17 +30,16 @@ from multiprocessing import Manager as M
from configs.config import Config from configs.config import Config
config = Config() # config = Config()
mm = M() mm = M()
if config.dml == True:
def forward_dml(ctx, x, scale):
ctx.scale = scale
res = x.clone().detach()
return res
fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml def printt(strr, *args):
if len(args) == 0:
print(strr)
else:
print(strr % args)
# config.device=torch.device("cpu")########强制cpu测试 # config.device=torch.device("cpu")########强制cpu测试
@ -56,18 +54,27 @@ class RVC:
n_cpu, n_cpu,
inp_q, inp_q,
opt_q, opt_q,
device, config: Config,
last_rvc=None, last_rvc=None,
) -> None: ) -> None:
""" """
初始化 初始化
""" """
try: try:
global config if config.dml == True:
def forward_dml(ctx, x, scale):
ctx.scale = scale
res = x.clone().detach()
return res
fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
# global config
self.config = config
self.inp_q = inp_q self.inp_q = inp_q
self.opt_q = opt_q self.opt_q = opt_q
# device="cpu"########强制cpu测试 # device="cpu"########强制cpu测试
self.device = device self.device = config.device
self.f0_up_key = key self.f0_up_key = key
self.time_step = 160 / 16000 * 1000 self.time_step = 160 / 16000 * 1000
self.f0_min = 50 self.f0_min = 50
@ -77,11 +84,14 @@ class RVC:
self.sr = 16000 self.sr = 16000
self.window = 160 self.window = 160
self.n_cpu = n_cpu self.n_cpu = n_cpu
self.use_jit = self.config.use_jit
self.is_half = config.is_half
if index_rate != 0: if index_rate != 0:
self.index = faiss.read_index(index_path) self.index = faiss.read_index(index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
logger.info("Index search enabled") printt("Index search enabled")
self.pth_path = pth_path self.pth_path: str = pth_path
self.index_path = index_path self.index_path = index_path
self.index_rate = index_rate self.index_rate = index_rate
@ -91,8 +101,8 @@ class RVC:
suffix="", suffix="",
) )
hubert_model = models[0] hubert_model = models[0]
hubert_model = hubert_model.to(device) hubert_model = hubert_model.to(self.device)
if config.is_half: if self.is_half:
hubert_model = hubert_model.half() hubert_model = hubert_model.half()
else: else:
hubert_model = hubert_model.float() hubert_model = hubert_model.float()
@ -101,46 +111,80 @@ class RVC:
else: else:
self.model = last_rvc.model self.model = last_rvc.model
if last_rvc is None or last_rvc.pth_path != self.pth_path: self.net_g: nn.Module = None
cpt = torch.load(self.pth_path, map_location="cpu")
def set_default_model():
self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
self.tgt_sr = cpt["config"][-1] self.tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
self.if_f0 = cpt.get("f0", 1) self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1") self.version = cpt.get("version", "v1")
if self.version == "v1": if self.is_half:
if self.if_f0 == 1:
self.net_g = SynthesizerTrnMs256NSFsid(
*cpt["config"], is_half=config.is_half
)
else:
self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
elif self.version == "v2":
if self.if_f0 == 1:
self.net_g = SynthesizerTrnMs768NSFsid(
*cpt["config"], is_half=config.is_half
)
else:
self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
del self.net_g.enc_q
logger.debug(self.net_g.load_state_dict(cpt["weight"], strict=False))
self.net_g.eval().to(device)
# print(2333333333,device,config.device,self.device)#net_g是devicehubert是config.device
if config.is_half:
self.net_g = self.net_g.half() self.net_g = self.net_g.half()
else: else:
self.net_g = self.net_g.float() self.net_g = self.net_g.float()
self.is_half = config.is_half
def set_jit_model():
jit_pth_path = self.pth_path.rstrip(".pth")
jit_pth_path += ".half.jit" if self.is_half else ".jit"
reload = False
if str(self.device) == "cuda":
self.device = torch.device("cuda:0")
if os.path.exists(jit_pth_path):
cpt = jit.load(jit_pth_path)
model_device = cpt["device"]
if model_device != str(self.device):
reload = True
else:
reload = True
if reload:
cpt = jit.synthesizer_jit_export(
self.pth_path,
"script",
None,
device=self.device,
is_half=self.is_half,
)
self.tgt_sr = cpt["config"][-1]
self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1")
self.net_g = torch.jit.load(
BytesIO(cpt["model"]), map_location=self.device
)
self.net_g.infer = self.net_g.forward
self.net_g.eval().to(self.device)
def set_synthesizer():
if self.use_jit and not config.dml:
if self.is_half and "cpu" in str(self.device):
printt(
"Use default Synthesizer model. \
Jit is not supported on the CPU for half floating point"
)
set_default_model()
else:
set_jit_model()
else:
set_default_model()
if last_rvc is None or last_rvc.pth_path != self.pth_path:
set_synthesizer()
else: else:
self.tgt_sr = last_rvc.tgt_sr self.tgt_sr = last_rvc.tgt_sr
self.if_f0 = last_rvc.if_f0 self.if_f0 = last_rvc.if_f0
self.version = last_rvc.version self.version = last_rvc.version
self.net_g = last_rvc.net_g
self.is_half = last_rvc.is_half self.is_half = last_rvc.is_half
if last_rvc.use_jit != self.use_jit:
set_synthesizer()
else:
self.net_g = last_rvc.net_g
if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
self.model_rmvpe = last_rvc.model_rmvpe self.model_rmvpe = last_rvc.model_rmvpe
except: except:
logger.warning(traceback.format_exc()) printt(traceback.format_exc())
def change_key(self, new_key): def change_key(self, new_key):
self.f0_up_key = new_key self.f0_up_key = new_key
@ -149,7 +193,7 @@ class RVC:
if new_index_rate != 0 and self.index_rate == 0: if new_index_rate != 0 and self.index_rate == 0:
self.index = faiss.read_index(self.index_path) self.index = faiss.read_index(self.index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
logger.info("Index search enabled") printt("Index search enabled")
self.index_rate = new_index_rate self.index_rate = new_index_rate
def get_f0_post(self, f0): def get_f0_post(self, f0):
@ -188,7 +232,7 @@ class RVC:
pad_size = (p_len - len(f0) + 1) // 2 pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0: if pad_size > 0 or p_len - len(f0) - pad_size > 0:
# print(pad_size, p_len - len(f0) - pad_size) # printt(pad_size, p_len - len(f0) - pad_size)
f0 = np.pad( f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
) )
@ -243,7 +287,7 @@ class RVC:
if "privateuseone" in str(self.device): ###不支持dmlcpu又太慢用不成拿pm顶替 if "privateuseone" in str(self.device): ###不支持dmlcpu又太慢用不成拿pm顶替
return self.get_f0(x, f0_up_key, 1, "pm") return self.get_f0(x, f0_up_key, 1, "pm")
audio = torch.tensor(np.copy(x))[None].float() audio = torch.tensor(np.copy(x))[None].float()
# print("using crepe,device:%s"%self.device) # printt("using crepe,device:%s"%self.device)
f0, pd = torchcrepe.predict( f0, pd = torchcrepe.predict(
audio, audio,
self.sr, self.sr,
@ -267,7 +311,7 @@ class RVC:
if hasattr(self, "model_rmvpe") == False: if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE from infer.lib.rmvpe import RMVPE
logger.info("Loading rmvpe model") printt("Loading rmvpe model")
self.model_rmvpe = RMVPE( self.model_rmvpe = RMVPE(
# "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑
# "rmvpe.pt", is_half=False, device=self.device####dml配置 # "rmvpe.pt", is_half=False, device=self.device####dml配置
@ -275,6 +319,7 @@ class RVC:
"assets/rmvpe/rmvpe.pt", "assets/rmvpe/rmvpe.pt",
is_half=self.is_half, is_half=self.is_half,
device=self.device, ####正常逻辑 device=self.device, ####正常逻辑
use_jit=self.config.use_jit,
) )
# self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
@ -292,7 +337,7 @@ class RVC:
f0method, f0method,
) -> np.ndarray: ) -> np.ndarray:
feats = feats.view(1, -1) feats = feats.view(1, -1)
if config.is_half: if self.config.is_half:
feats = feats.half() feats = feats.half()
else: else:
feats = feats.float() feats = feats.float()
@ -319,17 +364,17 @@ class RVC:
weight = np.square(1 / score) weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True) weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if config.is_half: if self.config.is_half:
npy = npy.astype("float16") npy = npy.astype("float16")
feats[0][-leng_replace_head:] = ( feats[0][-leng_replace_head:] = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
+ (1 - self.index_rate) * feats[0][-leng_replace_head:] + (1 - self.index_rate) * feats[0][-leng_replace_head:]
) )
else: else:
logger.warning("Index search FAILED or disabled") printt("Index search FAILED or disabled")
except: except:
traceback.print_exc() traceback.printt_exc()
logger.warning("Index search FAILED") printt("Index search FAILED")
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t3 = ttime() t3 = ttime()
if self.if_f0 == 1: if self.if_f0 == 1:
@ -356,16 +401,21 @@ class RVC:
sid = torch.LongTensor([ii]).to(self.device) sid = torch.LongTensor([ii]).to(self.device)
with torch.no_grad(): with torch.no_grad():
if self.if_f0 == 1: if self.if_f0 == 1:
# print(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2)
infered_audio = self.net_g.infer( infered_audio = self.net_g.infer(
feats, p_len, cache_pitch, cache_pitchf, sid, rate feats,
p_len,
cache_pitch,
cache_pitchf,
sid,
torch.FloatTensor([rate]),
)[0][0, 0].data.float() )[0][0, 0].data.float()
else: else:
infered_audio = self.net_g.infer(feats, p_len, sid, rate)[0][ infered_audio = self.net_g.infer(
0, 0 feats, p_len, sid, torch.FloatTensor([rate])
].data.float() )[0][0, 0].data.float()
t5 = ttime() t5 = ttime()
logger.info( printt(
"Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs",
t2 - t1, t2 - t1,
t3 - t2, t3 - t2,

View File

@ -1,4 +1,5 @@
import torch import torch
from infer.lib.rmvpe import STFT
from torch.nn.functional import conv1d, conv2d from torch.nn.functional import conv1d, conv2d
from typing import Union, Optional from typing import Union, Optional
from .utils import linspace, temperature_sigmoid, amp_to_db from .utils import linspace, temperature_sigmoid, amp_to_db
@ -139,6 +140,16 @@ class TorchGate(torch.nn.Module):
are set to 1, and the rest are set to 0. are set to 1, and the rest are set to 0.
""" """
if xn is not None: if xn is not None:
if "privateuseone" in str(xn.device):
if not hasattr(self, "stft"):
self.stft = STFT(
filter_length=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window="hann",
).to(xn.device)
XN = self.stft.transform(xn)
else:
XN = torch.stft( XN = torch.stft(
xn, xn,
n_fft=self.n_fft, n_fft=self.n_fft,
@ -149,7 +160,6 @@ class TorchGate(torch.nn.Module):
center=True, center=True,
window=torch.hann_window(self.win_length).to(xn.device), window=torch.hann_window(self.win_length).to(xn.device),
) )
XN_db = amp_to_db(XN).to(dtype=X_db.dtype) XN_db = amp_to_db(XN).to(dtype=X_db.dtype)
else: else:
XN_db = X_db XN_db = X_db
@ -213,6 +223,16 @@ class TorchGate(torch.nn.Module):
""" """
# Compute short-time Fourier transform (STFT) # Compute short-time Fourier transform (STFT)
if "privateuseone" in str(x.device):
if not hasattr(self, "stft"):
self.stft = STFT(
filter_length=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window="hann",
).to(x.device)
X, phase = self.stft.transform(x, return_phase=True)
else:
X = torch.stft( X = torch.stft(
x, x,
n_fft=self.n_fft, n_fft=self.n_fft,
@ -231,7 +251,7 @@ class TorchGate(torch.nn.Module):
sig_mask = self._stationary_mask(amp_to_db(X), xn) sig_mask = self._stationary_mask(amp_to_db(X), xn)
# Propagate decrease in signal power # Propagate decrease in signal power
sig_mask = self.prop_decrease * (sig_mask * 1.0 - 1.0) + 1.0 sig_mask = self.prop_decrease * (sig_mask.float() - 1.0) + 1.0
# Smooth signal mask with 2D convolution # Smooth signal mask with 2D convolution
if self.smoothing_filter is not None: if self.smoothing_filter is not None:
@ -245,6 +265,9 @@ class TorchGate(torch.nn.Module):
Y = X * sig_mask.squeeze(1) Y = X * sig_mask.squeeze(1)
# Inverse STFT to obtain time-domain signal # Inverse STFT to obtain time-domain signal
if "privateuseone" in str(Y.device):
y = self.stft.inverse(Y, phase)
else:
y = torch.istft( y = torch.istft(
Y, Y,
n_fft=self.n_fft, n_fft=self.n_fft,