diff --git a/README.md b/README.md
index b93d9dc..7e629d2 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,28 @@
-# Retrieval-based-Voice-Conversion-WebUI
+<div align="center">
+
+<h1>Retrieval-based-Voice-Conversion-WebUI</h1>
+An easy-to-use voice conversion (voice changer) framework based on VITS<br><br>
+
 [![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
+
 [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)
 [![Licence](https://img.shields.io/github/license/liujing04/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/%E4%BD%BF%E7%94%A8%E9%9C%80%E9%81%B5%E5%AE%88%E7%9A%84%E5%8D%8F%E8%AE%AE-LICENSE.txt)
 [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-blue.svg?style=for-the-badge)](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/)
-### Realtime voice conversion powered by RVC: [w-okada/voice-changer](https://github.com/w-okada/voice-changer)
------
+</div>
-An easy-to-use voice conversion (voice changer) framework based on VITS.
+------
 
 [**Changelog**](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Changelog_CN.md)
 
 [**English**](./README_en.md) | [**中文简体**](./README.md)
 
 > Check out our [demo video](https://www.bilibili.com/video/BV1pm4y1z7Gm/) here!
+
+> Realtime voice conversion powered by RVC: [w-okada/voice-changer](https://github.com/w-okada/voice-changer)
+
 ## Introduction
 This repository has the following features:
 + Uses top-1 feature retrieval to eliminate timbre leakage;
@@ -47,9 +54,13 @@ poetry install
 ```
 
 You can also install the dependencies with pip:
+
+**Note**: on `MacOS`, `faiss 1.7.2` throws a segmentation fault; please change the corresponding entry in `requirements.txt` to `faiss-cpu==1.7.0`.
+
 ```bash
 pip install -r requirements.txt
 ```
+
 ## Preparing other pre-models
 RVC needs some other pre-models for inference and training.
diff --git a/README_en.md b/README_en.md
index 6fe55e8..6148d57 100644
--- a/README_en.md
+++ b/README_en.md
@@ -22,6 +22,7 @@ This repository has the following features:
 + Supporting model fusion to change timbres;
 + Easy-to-use Webui interface;
 + Use the UVR5 model to quickly separate vocals and instruments.
++ The pre-trained models are trained on nearly 50 hours of the high-quality open-source VCTK dataset; high-quality licensed song datasets will be added successively, so you can use them without worrying about copyright infringement.
 
 ## Preparing the environment
 We recommend you install the dependencies through poetry.
@@ -31,6 +32,10 @@ The following commands need to be executed in the environment of Python version
 # Reference: https://pytorch.org/get-started/locally/
 pip install torch torchvision torchaudio
 
+# For Windows with 30-series Nvidia cards, you need to specify the pytorch build matching your CUDA version, per https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/issues/21
+
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
+
 # Install the Poetry dependency management tool, skip if installed
 # Reference: https://python-poetry.org/docs/#installation
 curl -sSL https://install.python-poetry.org | python3 -
@@ -40,9 +45,12 @@ poetry install
 ```
 
 You can also use pip to install the dependencies
+**Notice**: `faiss 1.7.2` will raise Segmentation Fault: 11 under `MacOS`; please change the corresponding line in `requirements.txt` to `faiss-cpu==1.7.0`
+
 ```bash
 pip install -r requirements.txt
 ```
+
 ## Preparation of other Pre-models
 RVC requires other pre-models to infer and train.
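The `faiss-cpu==1.7.0` pin recommended in both READMEs can be verified with a tiny top-1 search, which exercises exactly the code path that segfaults under `MacOS` with `faiss 1.7.2`. A minimal sketch, assuming 256-dimensional features as used by this repo's retrieval code; the data is random and purely illustrative:

```python
# Sanity-check the installed faiss build: print the version and run a tiny
# top-1 search. Reaching the last print without a crash means the MacOS
# segmentation fault described above is not present.
import faiss
import numpy as np

print("faiss version:", faiss.__version__)

dim = 256                                    # RVC retrieval features are 256-dim
index = faiss.IndexFlatL2(dim)               # exact L2 index
index.add(np.random.rand(1024, dim).astype("float32"))  # fake "training" features
_, neighbors = index.search(np.random.rand(4, dim).astype("float32"), 1)
print("top-1 neighbors:", neighbors.ravel())
```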
diff --git a/README_v0.md b/README_v0.md
deleted file mode 100644
index 237679d..0000000
--- a/README_v0.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Retrieval-based-Voice-Conversion-WebUI
-
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)
-
-The 2 missing folders and 2 missing files:
-
-hubert_base.pt
-
-ffmpeg (just make sure the ffmpeg command can be executed)
-
-the pretrained folder
-
-the uvr5_weights folder
-
-The files are too large for GitHub; download them from Hugging Face: https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main
-
-Of course, you can also just take a look at the RVC-beta.7z archive →_→
-
-Set up the environment with pip per requirements.txt, and `python infer-web.py` is ready to use.
-
-From experience, it is best to pin the versions of librosa, numpy and numba, otherwise you will likely run into pitfalls; the versions of the other packages matter less.
-
-Demo video: https://www.bilibili.com/video/BV1pm4y1z7Gm/
-
-For a tutorial, see 小白简易教程.doc
-
-We will develop an English-version Windows WebUI app in 2 weeks.
-
-
-### Realtime Voice Conversion Software using RVC
-
-https://github.com/w-okada/voice-changer
diff --git a/Retrieval_based_Voice_Conversion_WebUI.ipynb b/Retrieval_based_Voice_Conversion_WebUI.ipynb
index eb549d8..a47f3b4 100644
--- a/Retrieval_based_Voice_Conversion_WebUI.ipynb
+++ b/Retrieval_based_Voice_Conversion_WebUI.ipynb
@@ -129,10 +129,10 @@
       "#@title Load the packed dataset from Google Drive into /content/dataset\n",
       "\n",
       "#@markdown Dataset location\n",
-      "DATASET = \"/content/drive/MyDrive/dataset/lulucall_48k.zip\" #@param {type:\"string\"}\n",
+      "DATASET = \"/content/drive/MyDrive/dataset/lulu20230327_32k.zip\" #@param {type:\"string\"}\n",
       "\n",
       "!mkdir -p /content/dataset\n",
-      "!unzip -d /content/dataset {DATASET}"
+      "!unzip -d /content/dataset -B {DATASET}"
     ],
     "metadata": {
       "id": "Mwk7Q0Loqzjx"
@@ -140,13 +140,26 @@
     "execution_count": null,
     "outputs": []
   },
+  {
+    "cell_type": "code",
+    "source": [
+      "#@title Rename duplicated files in the dataset\n",
+      "!ls -a /content/dataset/\n",
+      "!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
+    ],
+    "metadata": {
+      "id": "PDlFxWHWEynD"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
   {
     "cell_type": "code",
     "source": [
       "#@title Launch the web UI\n",
       "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
-      "%load_ext tensorboard\n",
-      "%tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
+      "# %load_ext tensorboard\n",
+      "# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
       "!python3 infer-web.py --colab --pycmd python3"
     ],
     "metadata": {
@@ -164,7 +177,7 @@
       "#@markdown Model name\n",
       "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
       "#@markdown Model epoch\n",
-      "MODELEPOCH = 7500 #@param {type:\"integer\"}\n",
+      "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
       "\n",
       "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
       "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
@@ -188,7 +201,7 @@
       "#@markdown Model name\n",
       "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
       "#@markdown Model epoch\n",
-      "MODELEPOCH = 6000 #@param {type:\"integer\"}\n",
+      "MODELEPOCH = 7500 #@param {type:\"integer\"}\n",
       "\n",
       "!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
       "\n",
@@ -241,7 +254,7 @@
       "#@markdown Model name\n",
       "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
       "#@markdown Stopping epoch\n",
-      "MODELEPOCH = 2500 #@param {type:\"integer\"}\n",
+      "MODELEPOCH = 3200 #@param {type:\"integer\"}\n",
       "#@markdown Save-every-epoch interval\n",
       "EPOCHSAVE = 100 #@param {type:\"integer\"}\n",
       "#@markdown Sampling rate\n",
@@ -262,7 +275,7 @@
       "#@markdown Model name\n",
       "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
       "#@markdown Selected model epoch\n",
-      "MODELEPOCH = 7700 #@param {type:\"integer\"}\n",
+      "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
       "\n",
       "!echo \"Backing up the selected model...\"\n",
       "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
@@ -292,7 +305,7 @@
       "#@markdown Model name\n",
       "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
       "#@markdown Selected model epoch\n",
-      "MODELEPOCH = 7700 #@param {type:\"integer\"}\n",
+      "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
       "\n",
       "!echo \"Backing up the selected model...\"\n",
       "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
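The new notebook cell works together with the `-B` flag added to `unzip`: `-B` keeps files that would otherwise be overwritten as numbered backups (`foo.wav~`, `foo.wav~1`, ...), and the Perl `rename` one-liner rewrites those backups to `foo_1.wav` so each clip keeps a usable audio extension. A pure-Python equivalent of that rename expression, as a sketch (the `DATASET_DIR` path simply mirrors the notebook's unzip target):

```python
# Rewrite unzip's "~N" backup names (foo.wav~1) to plain file names (foo_1.wav),
# mirroring: rename 's/(\w+)\.(\w+)~(\d*)/$1_$3.$2/' /content/dataset/*.*~*
import re
from pathlib import Path

DATASET_DIR = Path("/content/dataset")  # where the notebook unzips the dataset

for path in DATASET_DIR.glob("*.*~*"):
    m = re.fullmatch(r"(\w+)\.(\w+)~(\d*)", path.name)
    if m:
        stem, ext, n = m.groups()
        path.rename(path.with_name(f"{stem}_{n}.{ext}"))
```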
diff --git a/infer-web.py b/infer-web.py
index c838a2a..d2cd506 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -9,7 +9,7 @@ import faiss
 ncpu=cpu_count()
 ngpu=torch.cuda.device_count()
 gpu_infos=[]
-if(torch.cuda.is_available()==False or ngpu==0):if_gpu_ok=False
+if((not torch.cuda.is_available()) or ngpu==0):if_gpu_ok=False
 else:
     if_gpu_ok = False
     for i in range(ngpu):
@@ -79,7 +79,7 @@ def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,file_big_np
         if(hubert_model==None):load_hubert()
         if_f0 = cpt.get("f0", 1)
         audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file)
-        print(times)
+        print("npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep='')
         return "Success", (tgt_sr, audio_opt)
     except:
         info=traceback.format_exc()
@@ -140,7 +140,7 @@ def uvr(model_name,inp_root,save_root_vocal,paths,save_root_ins):
         except:
             traceback.print_exc()
     print("clean_empty_cache")
-    torch.cuda.empty_cache()
+    if torch.cuda.is_available(): torch.cuda.empty_cache()
     yield "\n".join(infos)
 
 # Only one voice can be loaded globally per tab
@@ -152,7 +152,7 @@ def get_vc(sid):
             print("clean_empty_cache")
             del net_g, n_spk, vc, hubert_model,tgt_sr#,cpt
             hubert_model = net_g=n_spk=vc=hubert_model=tgt_sr=None
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available(): torch.cuda.empty_cache()
             ### Without this juggling the cleanup below is not thorough
             if_f0 = cpt.get("f0", 1)
             if (if_f0 == 1):
@@ -160,7 +160,7 @@ def get_vc(sid):
             else:
                 net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
             del net_g,cpt
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available(): torch.cuda.empty_cache()
             cpt=None
             return {"visible": False, "__type__": "update"}
     person = "%s/%s" % (weight_root, sid)
diff --git a/infer/infer-pm-index256.py b/infer/infer-pm-index256.py
index dd94834..add0245 100644
--- a/infer/infer-pm-index256.py
+++ b/infer/infer-pm-index256.py
@@ -104,7 +104,7 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
         "padding_mask": padding_mask.to(device),
         "output_layer": 9,  # layer 9
     }
-    torch.cuda.synchronize()
+    if torch.cuda.is_available(): torch.cuda.synchronize()
     t0=ttime()
     with torch.no_grad():
         logits = model.extract_features(**inputs)
@@ -116,13 +116,13 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
 
     feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
     feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1)
-    torch.cuda.synchronize()
+    if torch.cuda.is_available(): torch.cuda.synchronize()
    t1=ttime()
     # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too large -> OOM on GPU
     p_len = min(feats.shape[1],10000)#
     pitch, pitchf = get_f0(audio, p_len,f0_up_key)
     p_len = min(feats.shape[1],10000,pitch.shape[0])  # too large -> OOM on GPU
-    torch.cuda.synchronize()
+    if torch.cuda.is_available(): torch.cuda.synchronize()
     t2=ttime()
     feats = feats[:,:p_len, :]
     pitch = pitch[:p_len]
@@ -133,7 +133,7 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
     pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
     with torch.no_grad():
         audio = net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float().numpy()  # nsf
-    torch.cuda.synchronize()
+    if torch.cuda.is_available(): torch.cuda.synchronize()
     t3=ttime()
     ta0+=(t1-t0)
     ta1+=(t2-t1)
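Every `torch.cuda.synchronize()` and `torch.cuda.empty_cache()` call in the two files above now carries the same `torch.cuda.is_available()` guard, so the scripts no longer crash on CPU-only hosts. The synchronize guard also matters for the `t0`..`t3` timers: CUDA kernels run asynchronously, so without a synchronize the timestamps would measure kernel launch rather than completion. A hypothetical pair of helpers (not part of the repo) that would centralize the pattern:

```python
# Guarded wrappers around the two torch.cuda calls used throughout this diff.
# On CPU-only machines both become no-ops instead of raising.
import torch

def cuda_empty_cache() -> None:
    """Release cached GPU memory; do nothing when no CUDA device exists."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def cuda_synchronize() -> None:
    """Block until queued GPU kernels finish, so wall-clock timings are accurate."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
```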
diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py
index 30b03e3..e05ef4c 100644
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@@ -79,7 +79,7 @@ class VC(object):
         if(isinstance(index,type(None))==False and isinstance(big_npy,type(None))==False and index_rate!=0):
             npy = feats[0].cpu().numpy()
             if(self.is_half==True):npy=npy.astype("float32")
-            D, I = index.search(npy, 1)
+            _, I = index.search(npy, 1)
             npy=big_npy[I.squeeze()]
             if(self.is_half==True):npy=npy.astype("float16")
             feats = torch.from_numpy(npy).unsqueeze(0).to(self.device)*index_rate + (1-index_rate)*feats
@@ -99,7 +99,7 @@ class VC(object):
             else:
                 audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
         del feats,p_len,padding_mask
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available(): torch.cuda.empty_cache()
         t2 = ttime()
         times[0] += (t1 - t0)
         times[2] += (t2 - t1)
@@ -160,5 +160,5 @@ class VC(object):
             audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
         audio_opt=np.concatenate(audio_opt)
         del pitch,pitchf,sid
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available(): torch.cuda.empty_cache()
         return audio_opt
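The `_, I = index.search(npy, 1)` change documents that only the neighbor indices are used: for every frame, the feature vector is replaced by its top-1 neighbor from the training set and blended back in at `index_rate`, which is what suppresses timbre leakage. A standalone sketch of that retrieval blend; `big_npy`, `index`, and `feats` are random stand-ins for the real training features, faiss index, and input features:

```python
# Top-1 feature retrieval plus blending, mirroring the block changed above.
import faiss
import numpy as np

dim, n_train, n_frames = 256, 4096, 100
big_npy = np.random.rand(n_train, dim).astype("float32")  # training-set features
index = faiss.IndexFlatL2(dim)
index.add(big_npy)

feats = np.random.rand(n_frames, dim).astype("float32")   # features of the input audio
index_rate = 0.75                                         # 1.0 = pure retrieval, 0.0 = off

_, I = index.search(feats, 1)        # distances are unused, hence "_"
retrieved = big_npy[I.squeeze()]     # one training-set feature per input frame

# Blend retrieved features with the live ones; a higher index_rate pushes the
# timbre harder toward the training speaker.
feats = retrieved * index_rate + (1 - index_rate) * feats
```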