Create mergd UI

This commit is contained in:
VSlobolinskyi 2025-03-21 15:58:55 +02:00
parent 3db3db1bae
commit 7317afc9c0
7 changed files with 315 additions and 36 deletions

View File

@ -1,33 +1,24 @@
#!/usr/bin/env python3
import gradio as gr
import traceback
from rvc_ui.initialization import now_dir, config, vc
from merged_ui.main import build_merged_ui
from rvc_ui.initialization import config
from rvc_ui.main import build_rvc_ui
from spark_ui.main import build_spark_ui
def build_unified_ui():
rvc_ui = build_rvc_ui() # Returns a gr.Blocks instance for RVC WebUI
def build_standalone_ui():
with gr.Blocks(title="Unified Inference UI") as app:
gr.Markdown("## Unified Inference UI: RVC WebUI and Spark TTS")
with gr.Tabs():
with gr.TabItem("RVC WebUI"):
rvc_ui.render()
with gr.TabItem("Spark TTS"):
# Instead of calling render() on the Spark UI object,
# we'll directly build it in this context
try:
# Create the Spark UI directly in this tab's context
build_spark_ui()
except Exception as e:
gr.Markdown(f"Error building Spark TTS: {str(e)}")
gr.Markdown(traceback.format_exc())
build_spark_ui()
with gr.TabItem("RVC WebUI"):
build_rvc_ui()
return app
if __name__ == "__main__":
app = build_unified_ui()
# Needed for RVC
app = build_merged_ui()
if config.iscolab:
app.queue(concurrency_count=511, max_size=1022).launch(share=True)
else:

View File

284
modules/merged_ui/main.py Normal file
View File

@ -0,0 +1,284 @@
import os
import gradio as gr
import torch
from time import sleep
# Import modules from your packages
from rvc_ui.initialization import now_dir, config, vc
from rvc_ui.main import build_rvc_ui, names, index_paths
from spark_ui.main import build_spark_ui, initialize_model, run_tts
from spark.sparktts.utils.token_parser import LEVELS_MAP_UI
def build_merged_ui():
# Initialize the Spark TTS model
model_dir = "spark/pretrained_models/Spark-TTS-0.5B"
device = 0
spark_model = initialize_model(model_dir, device=device)
# Create the UI
with gr.Blocks(title="Unified TTS-RVC Pipeline") as merged_ui:
gr.Markdown("## Voice Generation and Conversion Pipeline")
gr.Markdown("Generate speech with Spark TTS and process it through RVC for voice conversion")
with gr.Tabs():
with gr.TabItem("TTS-to-RVC Pipeline"):
gr.Markdown("### Step 1: Generate speech with Spark TTS")
# TTS Generation Section
with gr.Tabs():
# Voice Clone option
with gr.TabItem("Voice Clone"):
with gr.Row():
prompt_wav_upload = gr.Audio(
sources="upload",
type="filepath",
label="Reference voice (upload)",
)
prompt_wav_record = gr.Audio(
sources="microphone",
type="filepath",
label="Reference voice (record)",
)
with gr.Row():
tts_text_input = gr.Textbox(
label="Text to synthesize",
lines=3,
placeholder="Enter text for TTS"
)
prompt_text_input = gr.Textbox(
label="Text of prompt speech (Optional)",
lines=3,
placeholder="Enter text of the reference audio",
)
# Voice Creation option
with gr.TabItem("Voice Creation"):
with gr.Row():
with gr.Column():
gender = gr.Radio(
choices=["male", "female"], value="male", label="Gender"
)
pitch = gr.Slider(
minimum=1, maximum=5, step=1, value=3, label="Pitch"
)
speed = gr.Slider(
minimum=1, maximum=5, step=1, value=3, label="Speed"
)
with gr.Column():
tts_text_input_creation = gr.Textbox(
label="Text to synthesize",
lines=3,
placeholder="Enter text for TTS",
value="Generate speech with this text and then convert the voice with RVC.",
)
tts_audio_output = gr.Audio(label="Generated TTS Audio")
with gr.Row():
generate_clone_button = gr.Button("Generate with Voice Clone", variant="primary")
generate_create_button = gr.Button("Generate with Voice Creation", variant="primary")
# Hidden text field to store the TTS audio path
tts_audio_path = gr.Textbox(visible=False)
gr.Markdown("### Step 2: Convert with RVC")
# RVC Settings
with gr.Row():
with gr.Column():
sid0 = gr.Dropdown(
label="Target voice model:", choices=sorted(names)
)
vc_transform0 = gr.Number(
label="Transpose (semitones):",
value=0,
)
f0method0 = gr.Radio(
label="Pitch extraction algorithm:",
choices=(
["pm", "harvest", "crepe", "rmvpe"]
if config.dml == False
else ["pm", "harvest", "rmvpe"]
),
value="rmvpe",
interactive=True,
)
file_index1 = gr.Textbox(
label="Path to feature index file (leave blank for auto):",
placeholder="Leave blank to use dropdown selection",
interactive=True,
)
file_index2 = gr.Dropdown(
label="Select feature index:",
choices=sorted(index_paths),
interactive=True,
)
with gr.Column():
index_rate1 = gr.Slider(
minimum=0,
maximum=1,
label="Feature search ratio (accent strength):",
value=0.75,
interactive=True,
)
filter_radius0 = gr.Slider(
minimum=0,
maximum=7,
label="Median filter radius (3+ reduces breathiness):",
value=3,
step=1,
interactive=True,
)
rms_mix_rate0 = gr.Slider(
minimum=0,
maximum=1,
label="Volume envelope scaling (0=original, 1=constant):",
value=0.25,
interactive=True,
)
protect0 = gr.Slider(
minimum=0,
maximum=0.5,
label="Consonant protection (0=max, 0.5=disable):",
value=0.33,
step=0.01,
interactive=True,
)
resample_sr0 = gr.Slider(
minimum=0,
maximum=48000,
label="Output sample rate (0=no resampling):",
value=0,
step=1,
interactive=True,
)
# Speaker ID (hidden)
spk_item = gr.Slider(
minimum=0,
maximum=2333,
step=1,
label="Speaker ID:",
value=0,
visible=False,
interactive=True,
)
# Process button and outputs
process_button = gr.Button("Process with RVC", variant="primary")
with gr.Row():
vc_output1 = gr.Textbox(label="Output information")
vc_output2 = gr.Audio(label="Final converted audio")
# Function to handle voice clone TTS generation
def voice_clone_tts(text, prompt_text, prompt_wav_upload, prompt_wav_record):
prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text
audio_output_path = run_tts(
text,
spark_model,
prompt_text=prompt_text_clean,
prompt_speech=prompt_speech
)
return audio_output_path, audio_output_path
# Function to handle voice creation TTS generation
def voice_creation_tts(text, gender, pitch, speed):
pitch_val = LEVELS_MAP_UI[int(pitch)]
speed_val = LEVELS_MAP_UI[int(speed)]
audio_output_path = run_tts(
text,
spark_model,
gender=gender,
pitch=pitch_val,
speed=speed_val
)
return audio_output_path, audio_output_path
# Function to process audio with RVC
def process_with_rvc(
tts_path, spk_item, vc_transform, f0method,
file_index1, file_index2, index_rate, filter_radius,
resample_sr, rms_mix_rate, protect
):
# Make sure we have a TTS file to process
if not tts_path or not os.path.exists(tts_path):
return "No TTS audio generated yet", None
# Call RVC processing function
f0_file = None # We're not using an F0 curve file in this pipeline
output_info, output_audio = vc.vc_single(
spk_item, tts_path, vc_transform, f0_file, f0method,
file_index1, file_index2, index_rate, filter_radius,
resample_sr, rms_mix_rate, protect
)
return output_info, output_audio
# Connect functions to buttons
generate_clone_button.click(
voice_clone_tts,
inputs=[
tts_text_input,
prompt_text_input,
prompt_wav_upload,
prompt_wav_record,
],
outputs=[tts_audio_output, tts_audio_path],
)
generate_create_button.click(
voice_creation_tts,
inputs=[
tts_text_input_creation,
gender,
pitch,
speed,
],
outputs=[tts_audio_output, tts_audio_path],
)
process_button.click(
process_with_rvc,
inputs=[
tts_audio_path,
spk_item,
vc_transform0,
f0method0,
file_index1,
file_index2,
index_rate1,
filter_radius0,
resample_sr0,
rms_mix_rate0,
protect0,
],
outputs=[vc_output1, vc_output2],
)
def modified_get_vc(sid0_value, protect0_value):
protect1_value = protect0_value
outputs = vc.get_vc(sid0_value, protect0_value, protect1_value)
if isinstance(outputs, tuple) and len(outputs) >= 3:
return outputs[0], outputs[1], outputs[3]
return 0, protect0_value, file_index2.choices[0] if file_index2.choices else ""
sid0.change(
fn=modified_get_vc,
inputs=[sid0, protect0],
outputs=[spk_item, protect0, file_index2],
)
return merged_ui
if __name__ == "__main__":
build_merged_ui()

View File

@ -38,7 +38,7 @@ F0GPUVisible = config.dml == False
# Build Gradio UI
def build_rvc_ui():
with gr.Blocks(title="RVC WebUI") as rvc_ui:
with gr.Blocks(title="RVC WebUI") as app:
gr.Markdown("## RVC WebUI")
gr.Markdown(
value="This software is open source under the MIT license. The author does not have any control over the software. Users who use the software and distribute the sounds exported by the software are solely responsible. <br>If you do not agree with this clause, you cannot use or reference any codes and files within the software package. See the root directory <b>Agreement-LICENSE.txt</b> for details."
@ -326,7 +326,7 @@ def build_rvc_ui():
],
api_name="infer_change_voice",
)
return rvc_ui
return app
if __name__ == "__main__":

View File

@ -1,3 +1,4 @@
# spark_ui/main.py
import os
import torch
import soundfile as sf
@ -120,7 +121,7 @@ def build_spark_ui():
)
return audio_output_path
with gr.Blocks() as demo:
with gr.Blocks() as app:
# Use HTML for centered title
gr.HTML('<h1 style="text-align: center;">Spark-TTS by SparkAudio</h1>')
with gr.Tabs():
@ -204,7 +205,7 @@ def build_spark_ui():
outputs=[audio_output],
)
return demo
return app
if __name__ == "__main__":
build_spark_ui()

2
poetry.lock generated
View File

@ -4417,4 +4417,4 @@ watchdog = ["watchdog (>=2.3)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.11,<3.12"
content-hash = "58fd044b5e4a2d7ba9fbb0291188be2eb15788e9c5ef70024b2effa5b9915681"
content-hash = "e73427a82e6d9b8531424dc6552247cca063044c9bd2aa9410f2a37bee9e8d3b"

View File

@ -16,6 +16,9 @@ from = "modules"
include = "spark_ui"
from = "modules"
[[tool.poetry.packages]]
include = "merged_ui"
from = "modules"
[tool.poetry.dependencies]
python = ">=3.11,<3.12"
@ -36,38 +39,38 @@ torch-directml = "^0.2.5.dev240914"
# ---------------------------------------------------------------------------
fairseq = { git = "https://github.com/One-sixth/fairseq.git" }
joblib = ">=1.1.0"
numba = "*"
llvmlite = "*"
Cython = "*"
numba = ">=0.56.0"
llvmlite = ">=0.39.0"
Cython = ">=0.29.0"
numpy = ">=1.0,<2.0"
scipy = "*"
scipy = ">=1.9.0"
librosa = "==0.10.2"
faiss-cpu = "*"
gradio = "3.50.0"
faiss-cpu = ">=1.7.0"
gradio = "==3.50.0"
soundfile = "0.12.1"
ffmpeg-python = ">=0.2.0"
matplotlib = ">=3.7.0"
matplotlib-inline = ">=0.1.3"
praat-parselmouth = ">=0.4.2"
tensorboardX = "*"
tensorboard = "*"
tensorboardX = ">=2.5.0"
tensorboard = ">=2.10.0"
Pillow = ">=9.1.1"
scikit-learn = "*"
scikit-learn = ">=1.0.0"
tqdm = ">=4.63.1"
uvicorn = ">=0.21.1"
pyworld = "==0.3.2"
onnxruntime = { version = "*", markers = "sys_platform == 'darwin'" }
onnxruntime-gpu = { version = "*", markers = "sys_platform != 'darwin'" }
onnxruntime = { version = ">=1.13.0", markers = "sys_platform == 'darwin'" }
onnxruntime-gpu = { version = ">=1.13.0", markers = "sys_platform != 'darwin'" }
torchcrepe = "==0.0.23"
fastapi = "==0.88"
torchfcpe = "*"
torchfcpe = ">=0.0.1"
ffmpy = "==0.3.1"
python-dotenv = ">=1.0.0"
av = "*"
autoflake = "^2.3.1"
av = ">=9.0.0"
autoflake = "2.3.1"
einops = "0.8.1"
einx = "0.3.0"
transformers = "^4.49.0"
transformers = "4.49.0"
[tool.poetry.group.dev.dependencies]
black = "^25.1.0"