feat(infer): add model hash identification

and optimize infer-web ui
2025-05-06 20:01:37 +08:00 · 2024-06-02 22:47:52 +09:00 · 2024-06-02 22:47:52 +09:00 · b9ad0258ae
commit b9ad0258ae
parent 7e48279c6c
13 changed files with 327 additions and 105 deletions
--- a/infer-web.py
+++ b/infer-web.py
@ -10,13 +10,12 @@ load_dotenv("sha256.env")
 if sys.platform == "darwin":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-from infer.modules.vc import VC
+from infer.modules.vc import VC, show_info
 from infer.modules.uvr5.modules import uvr
 from infer.lib.train.process_ckpt import (
    change_info,
    extract_small_model,
    merge,
    show_info,
 )
 from i18n.i18n import I18nAuto
 from configs.config import Config
@ -838,6 +837,7 @@ with gr.Blocks(title="RVC WebUI") as app:
                clean_button.click(
                    fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
                )
            modelinfo = gr.Textbox(label=i18n("模型信息"))
            with gr.TabItem(i18n("单次推理")):
                with gr.Group():
                    with gr.Row():
@ -846,24 +846,23 @@ with gr.Blocks(title="RVC WebUI") as app:
                                label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
                                value=0,
                            )
-                            input_audio0 = gr.Textbox(
+                            input_audio0 = gr.File(
                                label=i18n(
-                                    "输入待处理音频文件路径(默认是正确格式示例)"
+                                    "待处理音频文件"
                                ),
-                                placeholder="C:\\Users\\Desktop\\audio_example.wav",
+                                file_types=["audio"]
                            )
                            file_index1 = gr.Textbox(
                                label=i18n(
                                    "特征检索库文件路径,为空则使用下拉的选择结果"
                                ),
                                placeholder="C:\\Users\\Desktop\\model_example.index",
                                interactive=True,
                            )
                            file_index2 = gr.Dropdown(
                                label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                                choices=sorted(index_paths),
                                interactive=True,
                            )
                            file_index1 = gr.File(
                                label=i18n(
                                    "特征检索库文件路径,为空则使用下拉的选择结果"
                                ),
                            )
                        with gr.Column():
                            f0method0 = gr.Radio(
                                label=i18n(
                                    "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
@ -876,8 +875,6 @@ with gr.Blocks(title="RVC WebUI") as app:
                                value="rmvpe",
                                interactive=True,
                            )
                        with gr.Column():
                            resample_sr0 = gr.Slider(
                                minimum=0,
                                maximum=48000,
@ -928,6 +925,10 @@ with gr.Blocks(title="RVC WebUI") as app:
                                ),
                                visible=False,
                            )
                            but0 = gr.Button(i18n("转换"), variant="primary")
                            vc_output2 = gr.Audio(
                                label=i18n("输出音频(右下角三个点,点了可以下载)")
                            )
                            refresh_button.click(
                                fn=change_choices,
@ -935,19 +936,8 @@ with gr.Blocks(title="RVC WebUI") as app:
                                outputs=[sid0, file_index2],
                                api_name="infer_refresh",
                            )
                            # file_big_npy1 = gr.Textbox(
                            #     label=i18n("特征文件路径"),
                            #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
                            #     interactive=True,
                            # )
                with gr.Group():
-                    with gr.Column():
+                        vc_output1 = gr.Textbox(label=i18n("输出信息"))
                        but0 = gr.Button(i18n("转换"), variant="primary")
                        with gr.Row():
                            vc_output1 = gr.Textbox(label=i18n("输出信息"))
                            vc_output2 = gr.Audio(
                                label=i18n("输出音频(右下角三个点,点了可以下载)")
                            )
                        but0.click(
                            vc.vc_single,
@ -981,36 +971,28 @@ with gr.Blocks(title="RVC WebUI") as app:
                            label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
                            value=0,
                        )
                        dir_input = gr.Textbox(
                            label=i18n(
                                "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"
                            ),
                            placeholder="C:\\Users\\Desktop\\input_vocal_dir",
                        )
                        inputs = gr.File(
                            file_count="multiple",
                            label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
                        )
                        opt_input = gr.Textbox(
                            label=i18n("指定输出文件夹"), value="opt"
                        )
                        file_index3 = gr.Textbox(
                            label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
                            value="",
                            interactive=True,
                        )
                        file_index4 = gr.Dropdown(
                            label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                            choices=sorted(index_paths),
                            interactive=True,
                        )
-                        f0method1 = gr.Radio(
+                        file_index3 = gr.File(
                            label=i18n(
-                                "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+                                "特征检索库文件路径,为空则使用下拉的选择结果"
                            ),
                            choices=(
                                ["pm", "harvest", "crepe", "rmvpe"]
                                if config.dml == False
                                else ["pm", "harvest", "rmvpe"]
                            ),
                            value="rmvpe",
                            interactive=True,
                        )
                        format1 = gr.Radio(
                            label=i18n("导出文件格式"),
                            choices=["wav", "flac", "mp3", "m4a"],
                            value="wav",
                            interactive=True,
                        )
                        refresh_button.click(
@ -1026,6 +1008,18 @@ with gr.Blocks(title="RVC WebUI") as app:
                        # )
                    with gr.Column():
                        f0method1 = gr.Radio(
                            label=i18n(
                                "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
                            ),
                            choices=(
                                ["pm", "harvest", "crepe", "rmvpe"]
                                if config.dml == False
                                else ["pm", "harvest", "rmvpe"]
                            ),
                            value="rmvpe",
                            interactive=True,
                        )
                        resample_sr1 = gr.Slider(
                            minimum=0,
                            maximum=48000,
@ -1070,48 +1064,42 @@ with gr.Blocks(title="RVC WebUI") as app:
                            value=1,
                            interactive=True,
                        )
-                with gr.Row():
+                        format1 = gr.Radio(
-                    dir_input = gr.Textbox(
+                            label=i18n("导出文件格式"),
-                        label=i18n(
+                            choices=["wav", "flac", "mp3", "m4a"],
-                            "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"
+                            value="wav",
-                        ),
+                            interactive=True,
-                        placeholder="C:\\Users\\Desktop\\input_vocal_dir",
+                        )
-                    )
+                        but1 = gr.Button(i18n("转换"), variant="primary")
                    inputs = gr.File(
                        file_count="multiple",
                        label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
                    )
-                with gr.Row():
+                vc_output3 = gr.Textbox(label=i18n("输出信息"))
                    but1 = gr.Button(i18n("转换"), variant="primary")
                    vc_output3 = gr.Textbox(label=i18n("输出信息"))
-                    but1.click(
+                but1.click(
-                        vc.vc_multi,
+                    vc.vc_multi,
-                        [
+                    [
-                            spk_item,
+                        spk_item,
-                            dir_input,
+                        dir_input,
-                            opt_input,
+                        opt_input,
-                            inputs,
+                        inputs,
-                            vc_transform1,
+                        vc_transform1,
-                            f0method1,
+                        f0method1,
-                            file_index3,
+                        file_index3,
-                            file_index4,
+                        file_index4,
-                            # file_big_npy2,
+                        # file_big_npy2,
-                            index_rate2,
+                        index_rate2,
-                            filter_radius1,
+                        filter_radius1,
-                            resample_sr1,
+                        resample_sr1,
-                            rms_mix_rate1,
+                        rms_mix_rate1,
-                            protect1,
+                        protect1,
-                            format1,
+                        format1,
-                        ],
+                    ],
-                        [vc_output3],
+                    [vc_output3],
-                        api_name="infer_convert_batch",
+                    api_name="infer_convert_batch",
-                    )
+                )
                sid0.change(
                    fn=vc.get_vc,
                    inputs=[sid0, protect0, protect1],
-                    outputs=[spk_item, protect0, protect1, file_index2, file_index4],
+                    outputs=[spk_item, protect0, protect1, file_index2, file_index4, modelinfo],
                    api_name="infer_change_voice",
                )
        with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
--- a/infer/lib/train/process_ckpt.py
+++ b/infer/lib/train/process_ckpt.py
@ -1,16 +1,17 @@
 import os
 import sys
 import traceback
 from collections import OrderedDict
 from time import time
 import torch
 from i18n.i18n import I18nAuto
 from infer.modules.vc import model_hash_ckpt, hash_id
 i18n = I18nAuto()
-
+# add author sign
-def savee(ckpt, sr, if_f0, name, epoch, version, hps):
+def save_small_model(ckpt, sr, if_f0, name, epoch, version, hps):
    try:
        opt = OrderedDict()
        opt["weight"] = {}
@ -39,28 +40,20 @@ def savee(ckpt, sr, if_f0, name, epoch, version, hps):
            hps.data.sampling_rate,
        ]
        opt["info"] = "%sepoch" % epoch
        opt["name"] = name
        opt["timestamp"] = int(time())
        opt["sr"] = sr
        opt["f0"] = if_f0
        opt["version"] = version
        h = model_hash_ckpt(opt)
        opt["hash"] = h
        opt["id"] = hash_id(h)
        torch.save(opt, "assets/weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()
 def show_info(path):
    try:
        a = torch.load(path, map_location="cpu")
        return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % (
            a.get("info", "None"),
            a.get("sr", "None"),
            a.get("f0", "None"),
            a.get("version", "None"),
        )
    except:
        return traceback.format_exc()
 def extract_small_model(path, name, sr, if_f0, info, version):
    try:
        ckpt = torch.load(path, map_location="cpu")
@ -182,9 +175,14 @@ def extract_small_model(path, name, sr, if_f0, info, version):
        if info == "":
            info = "Extracted model."
        opt["info"] = info
        opt["name"] = name
        opt["timestamp"] = int(time())
        opt["version"] = version
        opt["sr"] = sr
        opt["f0"] = int(if_f0)
        h = model_hash_ckpt(opt)
        opt["hash"] = h
        opt["id"] = hash_id(h)
        torch.save(opt, "assets/weights/%s.pth" % name)
        return "Success."
    except:
@ -251,10 +249,15 @@ def merge(path1, path2, alpha1, sr, f0, info, name, version):
        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
        """
        opt["name"] = name
        opt["timestamp"] = int(time())
        opt["sr"] = sr
        opt["f0"] = 1 if f0 == i18n("是") else 0
        opt["version"] = version
        opt["info"] = info
        h = model_hash_ckpt(opt)
        opt["hash"] = h
        opt["id"] = hash_id(h)
        torch.save(opt, "assets/weights/%s.pth" % name)
        return "Success."
    except:
--- a/infer/modules/train/train.py
+++ b/infer/modules/train/train.py
@ -74,7 +74,7 @@ from infer.lib.train.losses import (
    kl_loss,
 )
 from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
-from infer.lib.train.process_ckpt import savee
+from infer.lib.train.process_ckpt import save_small_model
 global_step = 0
@ -602,7 +602,7 @@ def train_and_evaluate(
                % (
                    hps.name,
                    epoch,
-                    savee(
+                    save_small_model(
                        ckpt,
                        hps.sample_rate,
                        hps.if_f0,
@ -626,7 +626,7 @@ def train_and_evaluate(
        logger.info(
            "saving final ckpt:%s"
            % (
-                savee(
+                save_small_model(
                    ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
                )
            )
--- a/infer/modules/vc/init.py
+++ b/infer/modules/vc/init.py
@ -1,3 +1,5 @@
 from .pipeline import Pipeline
 from .modules import VC
 from .utils import get_index_path_from_model, load_hubert
 from .info import show_info
 from .hash import model_hash_ckpt, hash_id
--- a/infer/modules/vc/hash.py
+++ b/infer/modules/vc/hash.py
@ -0,0 +1,170 @@
 import numpy as np
 import torch
 import hashlib
 import pathlib
 from scipy.fft import fft
 from pybase16384 import encode_to_string, decode_from_string
 if __name__ == "__main__":
    import os, sys
    now_dir = os.getcwd()
    sys.path.append(now_dir)
 from configs.config import Config, singleton_variable
 from .pipeline import Pipeline
 from .utils import load_hubert
 from infer.lib.audio import load_audio
 class TorchSeedContext:
    """Context manager that temporarily re-seeds torch's default RNG.

    Entering captures the current ``torch.random`` state and then calls
    ``torch.manual_seed(seed)``; leaving restores the captured state.
    """

    def __init__(self, seed):
        self.state = None  # populated by __enter__
        self.seed = seed

    def __enter__(self):
        # Snapshot before seeding so that __exit__ can undo the seeding.
        # NOTE(review): only the state returned by torch.random.get_rng_state()
        # is saved; any other generator touched by manual_seed is not restored.
        snapshot = torch.random.get_rng_state()
        self.state = snapshot
        torch.manual_seed(self.seed)

    def __exit__(self, type, value, traceback):
        # The exception arguments are ignored; the state is restored either way.
        torch.random.set_rng_state(self.state)
 # Number of int16 entries in a hash; the encoded payload is therefore
 # half_hash_len * 2 bytes, which is the length hash_similarity validates.
 half_hash_len = 512
 # Scale of the exponentials used for soft clipping in _cut_u16 and for turning
 # the accumulated distance into a score in hash_similarity.
 expand_factor = 65536*8
@singleton_variable
 def original_audio_time_minus():
    __original_audio = load_audio(str(pathlib.Path(__file__).parent / "lgdsng.mp3"), 16000)
    np.divide(__original_audio, np.abs(__original_audio).max(), __original_audio)
    return -__original_audio
@singleton_variable
 def original_audio_freq_minus():
    __original_audio = load_audio(str(pathlib.Path(__file__).parent / "lgdsng.mp3"), 16000)
    np.divide(__original_audio, np.abs(__original_audio).max(), __original_audio)
    __original_audio = fft(__original_audio)
    return -__original_audio
 def _cut_u16(n):
    """Soft-clip ``n``: values within +/-16384 pass through unchanged.

    Beyond that range the excess is compressed with an exponential whose scale
    is ``expand_factor``, approaching +/-32768 for large magnitudes.
    """
    knee = 16384
    if n > knee:
        return knee + knee*(1-np.exp((knee-n)/expand_factor))
    if n < -knee:
        return -knee - knee*(1-np.exp((n+knee)/expand_factor))
    return n
 # NOTE: wave_hash normalizes and overwrites time_field in place; pass a copy if the caller still needs the original samples.
 def wave_hash(time_field):
    """Hash a 48000-sample waveform relative to the bundled reference clip.

    ``time_field`` is peak-normalized IN PLACE and then has the negated
    reference added to it, so the caller's buffer is overwritten. The signal
    and its FFT are each split into ``half_hash_len//4`` segments of ``d``
    samples. Per segment the hash stores, as big-endian int16:

    * first half of the hash: real and imaginary part of the mean of the
      frequency-domain difference;
    * second half of the hash: the sums of the two halves of the time-domain
      difference.

    Every value is scaled by 32768, rounded and soft-clipped by ``_cut_u16``.

    Returns:
        The hash bytes passed through ``encode_to_string``.

    Raises:
        ValueError: if ``time_field`` does not hold exactly 48000 samples, or
            if it is all zeros (normalizing would divide by zero).
    """
    expected_len = 48000
    # Validate before normalizing so that a rejected input is left untouched.
    if len(time_field) != expected_len:
        raise ValueError("time not hashable")
    peak = np.abs(time_field).max()
    if peak == 0:
        # Dividing silence by its peak would fill the buffer with NaN and only
        # fail later, inside round(), with an unrelated message.
        raise ValueError("time not hashable")
    np.divide(time_field, peak, time_field)
    # The FFT output has the same length as its input, so no second length
    # check is needed for freq_field.
    freq_field = fft(time_field)
    np.add(time_field, original_audio_time_minus(), out=time_field)
    np.add(freq_field, original_audio_freq_minus(), out=freq_field)
    digest = np.zeros(half_hash_len//2*2, dtype='>i2')
    d = 375 * 512 // half_hash_len
    for i in range(half_hash_len//4):
        a = i*2
        b = a+1
        x = a + half_hash_len//2
        y = x+1
        s = np.average(freq_field[i*d:(i+1)*d])
        digest[a] = np.int16(_cut_u16(round(32768*np.real(s))))
        digest[b] = np.int16(_cut_u16(round(32768*np.imag(s))))
        digest[x] = np.int16(_cut_u16(round(32768*np.sum(time_field[i*d:i*d+d//2]))))
        digest[y] = np.int16(_cut_u16(round(32768*np.sum(time_field[i*d+d//2:(i+1)*d]))))
    return encode_to_string(digest.tobytes())
 def audio_hash(file):
    """Read ``file`` via ``load_audio(file, 16000)`` and return its ``wave_hash``."""
    samples = load_audio(file, 16000)
    return wave_hash(samples)
 def model_hash(config, tgt_sr, net_g, if_f0, version):
    """Run the reference clip through ``net_g`` and hash the converted audio.

    The bundled ``lgdsng.mp3`` is loaded, scaled down in place when its peak
    exceeds 0.95, and converted with a fresh ``Pipeline`` using fixed settings.
    The result is centered into exactly 48000 samples (zero-padded or cropped)
    because ``wave_hash`` only accepts that length.

    Args:
        config: project configuration; ``device`` and ``is_half`` are read here.
        tgt_sr: forwarded to ``Pipeline`` and ``pipeline.pipeline``.
        net_g: the synthesizer to fingerprint.
        if_f0: forwarded to ``pipeline.pipeline``.
        version: forwarded to ``pipeline.pipeline``.

    Returns:
        The string produced by ``wave_hash`` for the converted audio.
    """
    target_len = 48000
    pipeline = Pipeline(tgt_sr, config)
    audio = load_audio(str(pathlib.Path(__file__).parent / "lgdsng.mp3"), 16000)
    audio_max = np.abs(audio).max() / 0.95
    if audio_max > 1:
        np.divide(audio, audio_max, audio)
    audio_opt = pipeline.pipeline(load_hubert(config.device, config.is_half), net_g, 0, audio,
                      [0, 0, 0], 6, "rmvpe", "", 0, if_f0, 3, tgt_sr, 16000, 0.25,
                      version, 0.33)
    diff = target_len - len(audio_opt)
    if diff > 0:
        # Put any odd leftover sample on the right so the total is exact;
        # a symmetric diff//2 pad would leave odd differences one sample short.
        left = diff // 2
        audio_opt = np.pad(audio_opt, (left, diff - left))
    elif diff < 0:
        # Crop a centered window of exactly target_len samples.
        start = (-diff) // 2
        audio_opt = audio_opt[start:start + target_len]
    h = wave_hash(audio_opt)
    del pipeline, audio, audio_opt
    return h
 def model_hash_ckpt(cpt):
    """Instantiate the synthesizer described by checkpoint ``cpt`` and hash it.

    ``cpt`` is read as a mapping with ``"config"`` (constructor arguments whose
    last entry is used as the target sample rate), ``"weight"`` (state dict)
    and the optional ``"f0"`` / ``"version"`` keys. Model construction and
    hashing both run under a fixed torch seed.
    """
    from infer.lib.infer_pack.models import (
        SynthesizerTrnMs256NSFsid,
        SynthesizerTrnMs256NSFsid_nono,
        SynthesizerTrnMs768NSFsid,
        SynthesizerTrnMs768NSFsid_nono,
    )

    config = Config()
    with TorchSeedContext(114514):
        tgt_sr = cpt["config"][-1]
        if_f0 = cpt.get("f0", 1)
        version = cpt.get("version", "v1")
        # (version, f0 flag) -> model class; unknown pairs use the v1 f0 model.
        by_variant = {
            ("v1", 1): SynthesizerTrnMs256NSFsid,
            ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
            ("v2", 1): SynthesizerTrnMs768NSFsid,
            ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
        }
        net_cls = by_variant.get((version, if_f0), SynthesizerTrnMs256NSFsid)
        net_g = net_cls(*cpt["config"], is_half=config.is_half)
        # enc_q is removed before the (non-strict) weight load.
        del net_g.enc_q
        net_g.load_state_dict(cpt["weight"], strict=False)
        net_g.eval().to(config.device)
        net_g = net_g.half() if config.is_half else net_g.float()
        h = model_hash(config, tgt_sr, net_g, if_f0, version)
    del net_g
    return h
 def model_hash_from(path):
    """Load the checkpoint stored at ``path`` onto the CPU and return its hash."""
    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    return model_hash_ckpt(torch.load(path, map_location="cpu"))
 def _extend_difference(n, a, b):
    """Clamp ``n`` to the range [a, b] and rescale it so a -> 0 and b -> 1."""
    clamped = a if n < a else (b if n > b else n)
    return (clamped - a) / (b - a)
 def hash_similarity(h1: str, h2: str) -> float:
    """Score how alike two model hashes are.

    Both strings are decoded with ``decode_from_string`` and read as big-endian
    int16 arrays. The result is the mean of two terms:

    * the absolute cosine similarity of the full arrays (taken as 1.0 when
      either array has zero norm);
    * a distance term: the summed complex distance between the paired entries
      in the first half of the hash (pairs with a zero entry on either side
      are skipped), mapped through ``exp(-d / expand_factor)`` and rescaled
      from [0.5, 1.0] to [0, 1].

    Returns:
        The similarity rounded to 6 decimal places.

    Raises:
        ValueError: if either decoded hash is not ``half_hash_len * 2`` bytes.
    """
    h1b, h2b = decode_from_string(h1), decode_from_string(h2)
    if len(h1b) != half_hash_len*2 or len(h2b) != half_hash_len*2:
        raise ValueError("invalid hash length")
    h1n, h2n = np.frombuffer(h1b, dtype='>i2'), np.frombuffer(h2b, dtype='>i2')
    d = 0
    for i in range(half_hash_len//4):
        a = i*2
        b = a+1
        ax = complex(h1n[a], h1n[b])
        bx = complex(h2n[a], h2n[b])
        if abs(ax) == 0 or abs(bx) == 0:
            continue
        d += np.abs(ax - bx)
    frac = (np.linalg.norm(h1n) * np.linalg.norm(h2n))
    cosine = np.dot(h1n.astype(np.float32), h2n.astype(np.float32)) / frac if frac != 0 else 1.0
    distance = _extend_difference(np.exp(-d/expand_factor), 0.5, 1.0)
    return round((abs(cosine) + distance) / 2, 6)
 def hash_id(h: str) -> str:
    """Derive the short id for hash ``h``.

    The hash is decoded, its MD5 digest is re-encoded with
    ``encode_to_string``, and the last character of that string is dropped.
    """
    raw = decode_from_string(h)
    digest = hashlib.md5(raw).digest()
    encoded = encode_to_string(digest)
    return encoded[:-1]
--- a/infer/modules/vc/info.py
+++ b/infer/modules/vc/info.py
@ -0,0 +1,50 @@
 import traceback
 from i18n.i18n import I18nAuto
 from datetime import datetime
 import torch
 from .hash import model_hash_ckpt, hash_id
 i18n = I18nAuto()
 def show_model_info(cpt, show_long_id=False):
    """Build a localized, human-readable summary of a loaded checkpoint.

    The model hash and short id are recomputed from ``cpt`` and compared with
    the ``"hash"`` / ``"id"`` values stored in it; when they differ, both the
    computed and the stored value are shown.

    Args:
        cpt: checkpoint mapping; ``name``, ``timestamp``, ``info``, ``sr``,
            ``f0``, ``version``, ``id`` and ``hash`` are read with defaults.
        show_long_id: when false, the long id (full hash) is replaced by a
            "not shown" label.

    Returns:
        The summary text, one ``label: value`` pair per line. If anything
        fails, the formatted traceback is returned instead of raising.
    """
    try:
        computed_hash = model_hash_ckpt(cpt)
        computed_id = hash_id(computed_hash)
        stored_id = cpt.get("id", "None")
        stored_hash = cpt.get("hash", "None")

        id_text = computed_id
        if computed_id != stored_id:
            id_text = f'{computed_id}({i18n("实际计算")}), {stored_id}({i18n("从模型中读取")})'

        if not show_long_id:
            hash_text = i18n("不显示")
        elif computed_hash != stored_hash:
            hash_text = f'{computed_hash}({i18n("实际计算")}), {stored_hash}({i18n("从模型中读取")})'
        else:
            hash_text = computed_hash

        # Labels and values are joined directly instead of %-formatting a
        # template that already contains translated text, so a "%" inside a
        # translation cannot break the output.
        rows = (
            (i18n("模型名"), cpt.get("name", "None")),
            (i18n("封装时间"), datetime.fromtimestamp(float(cpt.get("timestamp", 0)))),
            (i18n("信息"), cpt.get("info", "None")),
            (i18n("采样率"), cpt.get("sr", "None")),
            (i18n("音高引导(f0)"), i18n("有") if cpt.get("f0", 0) == 1 else i18n("无")),
            (i18n("版本"), cpt.get("version", "None")),
            (i18n("ID(短)"), id_text),
            (i18n("ID(长)"), hash_text),
        )
        txt = "\n".join(f"{label}: {value!s}" for label, value in rows)
    except Exception:
        # Best effort for the UI: show the failure instead of raising, while
        # still letting KeyboardInterrupt / SystemExit propagate.
        txt = traceback.format_exc()
    return txt
 def show_info(path):
    try:
        a = torch.load(path, map_location="cpu")
        txt = show_model_info(a, show_long_id=True)
        del a
    except:
        txt = traceback.format_exc()
    return txt
--- a/infer/modules/vc/lgdsng.mp3
+++ b/infer/modules/vc/lgdsng.mp3
--- a/infer/modules/vc/modules.py
+++ b/infer/modules/vc/modules.py
@ -16,7 +16,7 @@ from infer.lib.infer_pack.models import (
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
 )
-
+from .info import show_model_info
 from .pipeline import Pipeline
 from .utils import get_index_path_from_model, load_hubert
@ -136,6 +136,7 @@ class VC:
                to_return_protect1,
                index,
                index,
                show_model_info(self.cpt)
            )
            if to_return_protect
            else {"visible": True, "maximum": n_spk, "__type__": "update"}
@ -158,6 +159,8 @@ class VC:
    ):
        if input_audio_path is None:
            return "You need to upload an audio", None
        elif hasattr(input_audio_path, "name"):
            input_audio_path = str(input_audio_path.name)
        f0_up_key = int(f0_up_key)
        try:
            audio = load_audio(input_audio_path, 16000)
@ -170,6 +173,7 @@ class VC:
                self.hubert_model = load_hubert(self.config.device, self.config.is_half)
            if file_index:
                if hasattr(file_index, "name"): file_index = str(file_index.name)
                file_index = (
                    file_index.strip(" ")
                    .strip('"')
@ -207,12 +211,12 @@ class VC:
            else:
                tgt_sr = self.tgt_sr
            index_info = (
-                "Index:\n%s." % file_index
+                "Index: %s." % file_index
                if os.path.exists(file_index)
                else "Index not used."
            )
            return (
-                "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
+                "Success.\n%s\nTime: npy: %.2fs, f0: %.2fs, infer: %.2fs."
                % (index_info, *times),
                (tgt_sr, audio_opt),
            )
--- a/requirements-amd.txt
+++ b/requirements-amd.txt
@ -47,3 +47,4 @@ ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
 torchfcpe
 pybase16384
--- a/requirements-dml.txt
+++ b/requirements-dml.txt
@ -45,3 +45,4 @@ ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
 torchfcpe
 pybase16384
--- a/requirements-ipex.txt
+++ b/requirements-ipex.txt
@ -53,3 +53,4 @@ av
 FreeSimpleGUI
 sounddevice
 torchfcpe
 pybase16384
--- a/requirements-py311.txt
+++ b/requirements-py311.txt
@ -46,3 +46,4 @@ torchfcpe
 ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
 pybase16384
--- a/requirements.txt
+++ b/requirements.txt
@ -46,3 +46,4 @@ torchfcpe
 ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
 pybase16384