Mirror of https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git (synced 2025-04-05 04:08:58 +08:00)

Commit a18a2c6b94: Add text splitting
Parent commit: 9a0af2de15

Changes to the merged UI layout (build_merged_ui):

@@ -4,7 +4,6 @@ import gradio as gr
 from merged_ui.utils import generate_and_process_with_rvc, modified_get_vc
 from rvc_ui.initialization import config
 from rvc_ui.main import names, index_paths
-from spark.sparktts.utils.token_parser import LEVELS_MAP_UI
 
 def build_merged_ui():
     """
@@ -18,6 +17,7 @@ def build_merged_ui():
     with gr.Tabs():
         with gr.TabItem("TTS-to-RVC Pipeline"):
             gr.Markdown("### Generate speech with Spark TTS and convert with RVC")
+            gr.Markdown("*Note: For multi-sentence text, each sentence will be processed separately and then combined.*")
 
             # TTS Generation Section
             with gr.Row():
@@ -36,7 +36,7 @@ def build_merged_ui():
                 tts_text_input = gr.Textbox(
                     label="Text to synthesize",
                     lines=3,
-                    placeholder="Enter text for TTS"
+                    placeholder="Enter text for TTS. Multiple sentences will be processed individually."
                 )
                 prompt_text_input = gr.Textbox(
                     label="Text of prompt speech (Optional)",
@@ -130,8 +130,8 @@ def build_merged_ui():
             generate_with_rvc_button = gr.Button("Generate with RVC", variant="primary")
 
             with gr.Row():
-                vc_output1 = gr.Textbox(label="Output information")
-                vc_output2 = gr.Audio(label="Final converted audio")
+                vc_output1 = gr.Textbox(label="Output information", lines=10)
+                vc_output2 = gr.Audio(label="Final concatenated audio")
 
             # Connect generate function to button
             generate_with_rvc_button.click(
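
The click() call above is cut off at the end of this hunk, so the full inputs/outputs lists are not shown. The following is only a minimal, self-contained sketch of the wiring pattern, with a stub function and a single input standing in for generate_and_process_with_rvc and its full parameter list; the real call presumably maps the returned (info, audio) pair to vc_output1 and vc_output2:

import gradio as gr

def fake_pipeline(text):
    # Stand-in for generate_and_process_with_rvc: returns (info string, audio).
    info = f"Would process {len(text.split('.'))} chunk(s) of text."
    return info, None  # None means this stub produces no audio

with gr.Blocks() as demo:
    tts_text_input = gr.Textbox(label="Text to synthesize", lines=3)
    generate_with_rvc_button = gr.Button("Generate with RVC", variant="primary")
    vc_output1 = gr.Textbox(label="Output information", lines=10)
    vc_output2 = gr.Audio(label="Final concatenated audio")

    # The button maps its input components to the function arguments and the
    # function's return values to the output components, in order.
    generate_with_rvc_button.click(
        fn=fake_pipeline,
        inputs=[tts_text_input],
        outputs=[vc_output1, vc_output2],
    )

if __name__ == "__main__":
    demo.launch()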

Changes to the pipeline utilities (split_into_sentences, process_single_sentence, generate_and_process_with_rvc, modified_get_vc):

@@ -1,5 +1,10 @@
 import os
 import shutil
+import re
+import numpy as np
+from time import sleep
+import soundfile as sf
+from pydub import AudioSegment
 
 # Import modules from your packages
 from rvc_ui.initialization import vc
@@ -11,31 +16,45 @@ model_dir = "spark/pretrained_models/Spark-TTS-0.5B"
 device = 0
 spark_model = initialize_model(model_dir, device=device)
 
-def generate_and_process_with_rvc(
-    text, prompt_text, prompt_wav_upload, prompt_wav_record,
+def split_into_sentences(text):
+    """
+    Split text into sentences using regular expressions.
+
+    Args:
+        text (str): The input text to split
+
+    Returns:
+        list: A list of sentences
+    """
+    # Split on period, exclamation mark, or question mark followed by space or end of string
+    sentences = re.split(r'(?<=[.!?])\s+|(?<=[.!?])$', text)
+    # Remove any empty sentences
+    sentences = [s.strip() for s in sentences if s.strip()]
+    return sentences
+
+def process_single_sentence(
+    sentence_index, sentence, prompt_speech, prompt_text_clean,
     spk_item, vc_transform, f0method,
     file_index1, file_index2, index_rate, filter_radius,
-    resample_sr, rms_mix_rate, protect
+    resample_sr, rms_mix_rate, protect,
+    base_fragment_num
 ):
     """
-    Handle combined TTS and RVC processing and save outputs to TEMP directories
+    Process a single sentence through the TTS and RVC pipeline
+
+    Args:
+        sentence_index (int): Index of the sentence in the original text
+        sentence (str): The sentence text to process
+        ... (other parameters are the same as generate_and_process_with_rvc)
+
+    Returns:
+        tuple: (spark_output_path, rvc_output_path, success, info_message)
     """
-    # Ensure TEMP directories exist
-    os.makedirs("./TEMP/spark", exist_ok=True)
-    os.makedirs("./TEMP/rvc", exist_ok=True)
-
-    # Get next fragment number
-    fragment_num = 1
-    while (os.path.exists(f"./TEMP/spark/fragment_{fragment_num}.wav") or
-           os.path.exists(f"./TEMP/rvc/fragment_{fragment_num}.wav")):
-        fragment_num += 1
-
-    # First generate TTS audio
-    prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
-    prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text
+    fragment_num = base_fragment_num + sentence_index
 
+    # Generate TTS audio for this sentence
     tts_path = run_tts(
-        text,
+        sentence,
         spark_model,
         prompt_text=prompt_text_clean,
         prompt_speech=prompt_speech
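
As a quick standalone illustration of how the splitting regex above behaves (standard library only):

import re

def split_into_sentences(text):
    # Same pattern as in the diff: split after ., ! or ? when followed by
    # whitespace or the end of the string, then drop empty pieces.
    sentences = re.split(r'(?<=[.!?])\s+|(?<=[.!?])$', text)
    return [s.strip() for s in sentences if s.strip()]

print(split_into_sentences("Hello there! How are you? This is a test."))
# -> ['Hello there!', 'How are you?', 'This is a test.']

Note that the pattern splits on any ., ! or ?, so abbreviations such as "e.g." or "Dr." also end a chunk.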

@@ -43,7 +62,7 @@ def generate_and_process_with_rvc(
 
     # Make sure we have a TTS file to process
     if not tts_path or not os.path.exists(tts_path):
-        return "Failed to generate TTS audio", None
+        return None, None, False, f"Failed to generate TTS audio for sentence: {sentence}"
 
     # Save Spark output to TEMP/spark
     spark_output_path = f"./TEMP/spark/fragment_{fragment_num}.wav"
@@ -70,7 +89,6 @@
     elif isinstance(output_audio, tuple) and len(output_audio) >= 2:
         # Case 2: output_audio might be (sample_rate, audio_data)
         try:
-            import soundfile as sf
             sf.write(rvc_output_path, output_audio[1], output_audio[0])
             rvc_saved = True
         except Exception as inner_e:
@@ -82,14 +100,120 @@
     except Exception as e:
         output_info += f"\nError saving RVC output: {str(e)}"
 
-    # Add file paths to output info
-    output_info += f"\nSpark output saved to: {spark_output_path}"
+    # Prepare info message
+    info_message = f"Sentence {sentence_index+1}: {sentence[:30]}{'...' if len(sentence) > 30 else ''}\n"
+    info_message += f" - Spark output: {spark_output_path}\n"
     if rvc_saved:
-        output_info += f"\nRVC output saved to: {rvc_output_path}"
+        info_message += f" - RVC output: {rvc_output_path}"
     else:
-        output_info += f"\nCould not automatically save RVC output to {rvc_output_path}"
+        info_message += f" - Could not save RVC output to {rvc_output_path}"
 
-    return output_info, output_audio
+    return spark_output_path, rvc_output_path, rvc_saved, info_message
+
+def concatenate_audio_files(file_paths, output_path, sample_rate=44100):
+    """
+    Concatenate multiple audio files into a single file
+
+    Args:
+        file_paths (list): List of paths to audio files
+        output_path (str): Path to save the concatenated audio
+        sample_rate (int): Sample rate for the output file
+
+    Returns:
+        bool: True if concatenation was successful, False otherwise
+    """
+    try:
+        # Use pydub to concatenate audio files
+        combined = AudioSegment.empty()
+        for file_path in file_paths:
+            segment = AudioSegment.from_file(file_path)
+            combined += segment
+
+        # Export the combined audio
+        combined.export(output_path, format="wav")
+        return True
+    except Exception as e:
+        print(f"Error concatenating audio files: {str(e)}")
+
+        # Fallback method using soundfile
+        try:
+            all_audio = []
+            for file_path in file_paths:
+                data, sr = sf.read(file_path)
+                # Convert to mono if stereo
+                if len(data.shape) > 1 and data.shape[1] > 1:
+                    data = data.mean(axis=1)
+                all_audio.append(data)
+
+            # Concatenate all audio data
+            concatenated = np.concatenate(all_audio)
+            sf.write(output_path, concatenated, sample_rate)
+            return True
+        except Exception as e2:
+            print(f"Fallback concatenation failed: {str(e2)}")
+            return False
+
+def generate_and_process_with_rvc(
+    text, prompt_text, prompt_wav_upload, prompt_wav_record,
+    spk_item, vc_transform, f0method,
+    file_index1, file_index2, index_rate, filter_radius,
+    resample_sr, rms_mix_rate, protect
+):
+    """
+    Handle combined TTS and RVC processing for multiple sentences and save outputs to TEMP directories
+    """
+    # Ensure TEMP directories exist
+    os.makedirs("./TEMP/spark", exist_ok=True)
+    os.makedirs("./TEMP/rvc", exist_ok=True)
+
+    # Split text into sentences
+    sentences = split_into_sentences(text)
+    if not sentences:
+        return "No valid text to process.", None
+
+    # Get next base fragment number
+    base_fragment_num = 1
+    while any(os.path.exists(f"./TEMP/spark/fragment_{base_fragment_num + i}.wav") or
+              os.path.exists(f"./TEMP/rvc/fragment_{base_fragment_num + i}.wav")
+              for i in range(len(sentences))):
+        base_fragment_num += 1
+
+    # Process reference speech
+    prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
+    prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text
+
+    # Process each sentence
+    results = []
+    info_messages = [f"Processing {len(sentences)} sentences..."]
+
+    for i, sentence in enumerate(sentences):
+        spark_path, rvc_path, success, info = process_single_sentence(
+            i, sentence, prompt_speech, prompt_text_clean,
+            spk_item, vc_transform, f0method,
+            file_index1, file_index2, index_rate, filter_radius,
+            resample_sr, rms_mix_rate, protect,
+            base_fragment_num
+        )
+
+        info_messages.append(info)
+        if success and rvc_path:
+            results.append(rvc_path)
+
+    # If no sentences were successfully processed
+    if not results:
+        return "\n".join(info_messages) + "\n\nNo sentences were successfully processed.", None
+
+    # Concatenate all successful RVC fragments
+    final_output_path = f"./TEMP/final_output_{base_fragment_num}.wav"
+    concatenation_success = concatenate_audio_files(results, final_output_path)
+
+    if concatenation_success:
+        info_messages.append(f"\nAll fragments concatenated successfully to: {final_output_path}")
+        return "\n".join(info_messages), final_output_path
+    else:
+        # If concatenation failed but we have at least one successful fragment, return the first one
+        info_messages.append(f"\nFailed to concatenate fragments. Returning first successful fragment.")
+        return "\n".join(info_messages), results[0]
+
 def modified_get_vc(sid0_value, protect0_value, file_index2_component):
     """
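
For reference, a minimal standalone sketch of the concatenation step performed by concatenate_audio_files above, assuming numpy, soundfile and pydub are installed (for formats other than plain WAV pydub also needs ffmpeg; file names here are illustrative only):

import numpy as np
import soundfile as sf
from pydub import AudioSegment

# Write two short test tones to act as per-sentence fragments.
sr = 44100
t = np.linspace(0, 0.5, int(sr * 0.5), endpoint=False)
sf.write("fragment_a.wav", 0.2 * np.sin(2 * np.pi * 440 * t), sr)
sf.write("fragment_b.wav", 0.2 * np.sin(2 * np.pi * 660 * t), sr)

# Join them the same way the primary pydub path in the diff does.
combined = AudioSegment.empty()
for path in ["fragment_a.wav", "fragment_b.wav"]:
    combined += AudioSegment.from_file(path)
combined.export("combined.wav", format="wav")

data, out_sr = sf.read("combined.wav")
print(len(data) / out_sr)  # ~1.0 second: the two 0.5 s fragments back to back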

@@ -101,4 +225,4 @@ def modified_get_vc(sid0_value, protect0_value, file_index2_component):
     if isinstance(outputs, tuple) and len(outputs) >= 3:
         return outputs[0], outputs[1], outputs[3]
 
     return 0, protect0_value, file_index2_component.choices[0] if file_index2_component.choices else ""
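
The base_fragment_num search in generate_and_process_with_rvc picks the first numbering whose whole run of per-sentence indices is unused in both TEMP directories, so a new run never overwrites files left by a previous one. A standalone sketch of that logic (temporary directories stand in for ./TEMP/spark and ./TEMP/rvc):

import os
import tempfile

spark_dir = tempfile.mkdtemp()
rvc_dir = tempfile.mkdtemp()

# Pretend an earlier run already produced fragment_1.wav.
open(os.path.join(spark_dir, "fragment_1.wav"), "w").close()

num_sentences = 3
base_fragment_num = 1
while any(
    os.path.exists(os.path.join(spark_dir, f"fragment_{base_fragment_num + i}.wav"))
    or os.path.exists(os.path.join(rvc_dir, f"fragment_{base_fragment_num + i}.wav"))
    for i in range(num_sentences)
):
    base_fragment_num += 1

print(base_fragment_num)  # 2: fragments 2, 3 and 4 are all free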