diff --git a/infer/lib/audio.py b/infer/lib/audio.py index 56acbdc..13c12f5 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -1,3 +1,6 @@ +import os +import traceback + import librosa import numpy as np import av @@ -47,10 +50,14 @@ def audio2(i, o, format, sr): def load_audio(file, sr): + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + if os.path.exists(file) == False: + raise RuntimeError( + "You input a wrong audio path that does not exists, please fix it!" + ) try: - file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 with open(file, "rb") as f: with BytesIO() as out: audio2(f, out, "f32le", sr) @@ -62,5 +69,5 @@ def load_audio(file, sr): audio = np.mean(audio, -1) return librosa.resample(audio, orig_sr=file[0], target_sr=16000) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") + except: + raise RuntimeError(traceback.format_exc()) diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py index 86a0668..2f246db 100644 --- a/infer/modules/uvr5/mdxnet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -216,16 +216,26 @@ class Predictor: path_other = "%s/%s_others.wav" % (others_root, basename) sf.write(path_vocal, mix - opt, rate) sf.write(path_other, opt, rate) + opt_path_vocal = path_vocal[:-4] + ".%s" % format + opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path_vocal, path_vocal[:-4] + ".%s" % format) + "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal) ) + if os.path.exists(opt_path_vocal): + try: + os.remove(path_vocal) + except: + pass if os.path.exists(path_other): os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path_other, path_other[:-4] + ".%s" % format) + "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other) ) + if os.path.exists(opt_path_other): + try: + os.remove(path_other) + except: + pass class MDXNetDereverb: @@ -242,5 +252,5 @@ class MDXNetDereverb: self.pred = Predictor(self) self.device = device - def path_audio(self, input, vocal_root, others_root, format): + def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False): self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py index f63ac6a..bce3cef 100644 --- a/infer/modules/uvr5/modules.py +++ b/infer/modules/uvr5/modules.py @@ -9,7 +9,7 @@ import torch from configs.config import Config from infer.modules.uvr5.mdxnet import MDXNetDereverb -from infer.modules.uvr5.preprocess import AudioPre, AudioPreDeEcho +from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho config = Config() @@ -36,6 +36,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format device=config.device, is_half=config.is_half, ) + is_hp3 = "HP3" in model_name if inp_root != "": paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] else: @@ -52,7 +53,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format ): need_reformat = 0 pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 + inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 ) done = 1 except: @@ -70,7 +71,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format inp_path = tmp_path try: if done == 0: - pre_fun.path_audio( + pre_fun._path_audio_( inp_path, save_root_ins, save_root_vocal, format0 ) infos.append("%s->Success" % (os.path.basename(inp_path))) diff --git a/infer/modules/uvr5/preprocess.py b/infer/modules/uvr5/vr.py similarity index 85% rename from infer/modules/uvr5/preprocess.py rename to infer/modules/uvr5/vr.py index c22b291..d3fbac4 100644 --- a/infer/modules/uvr5/preprocess.py +++ b/infer/modules/uvr5/vr.py @@ -41,7 +41,9 @@ class AudioPre: self.mp = mp self.model = model - def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"): + def _path_audio_( + self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False + ): if ins_root is None and vocal_root is None: return "No save root." name = os.path.basename(music_file) @@ -120,18 +122,22 @@ class AudioPre: else: wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) logger.info("%s instruments done" % name) + if is_hp3 == True: + head = "vocal_" + else: + head = "instrument_" if format in ["wav", "flac"]: sf.write( os.path.join( ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), + head + "{}_{}.{}".format(name, self.data["agg"], format), ), (np.array(wav_instrument) * 32768).astype("int16"), self.mp.param["sr"], ) # else: path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) + ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) ) sf.write( path, @@ -139,11 +145,18 @@ class AudioPre: self.mp.param["sr"], ) if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass if vocal_root is not None: + if is_hp3 == True: + head = "instrument_" + else: + head = "vocal_" if self.data["high_end_process"].startswith("mirroring"): input_high_end_ = spec_utils.mirroring( self.data["high_end_process"], v_spec_m, input_high_end, self.mp @@ -158,14 +171,14 @@ class AudioPre: sf.write( os.path.join( vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), + head + "{}_{}.{}".format(name, self.data["agg"], format), ), (np.array(wav_vocals) * 32768).astype("int16"), self.mp.param["sr"], ) else: path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) + vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) ) sf.write( path, @@ -173,10 +186,13 @@ class AudioPre: self.mp.param["sr"], ) if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass class AudioPreDeEcho: @@ -207,7 +223,7 @@ class AudioPreDeEcho: self.model = model def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac" + self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False ): # 3个VR模型vocal和ins是反的 if ins_root is None and vocal_root is None: return "No save root." @@ -306,10 +322,13 @@ class AudioPreDeEcho: self.mp.param["sr"], ) if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass if vocal_root is not None: if self.data["high_end_process"].startswith("mirroring"): input_high_end_ = spec_utils.mirroring( @@ -340,7 +359,10 @@ class AudioPreDeEcho: self.mp.param["sr"], ) if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass diff --git a/pyproject.toml b/pyproject.toml index fd67580..6a91a9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ uvicorn = "^0.21.1" colorama = "^0.4.6" torchcrepe = "0.0.20" python-dotenv = "^1.0.0" +av = "^10.0.0" [tool.poetry.dev-dependencies]