From a18a2c6b94d5d1841c46916afe7e35e6224389a0 Mon Sep 17 00:00:00 2001 From: VSlobolinskyi Date: Fri, 21 Mar 2025 18:30:57 +0200 Subject: [PATCH] Add text splitting --- modules/merged_ui/main.py | 8 +- modules/merged_ui/utils.py | 176 +++++++++++++++++++++++++++++++------ 2 files changed, 154 insertions(+), 30 deletions(-) diff --git a/modules/merged_ui/main.py b/modules/merged_ui/main.py index 7c15002..ade00a7 100644 --- a/modules/merged_ui/main.py +++ b/modules/merged_ui/main.py @@ -4,7 +4,6 @@ import gradio as gr from merged_ui.utils import generate_and_process_with_rvc, modified_get_vc from rvc_ui.initialization import config from rvc_ui.main import names, index_paths -from spark.sparktts.utils.token_parser import LEVELS_MAP_UI def build_merged_ui(): """ @@ -18,6 +17,7 @@ def build_merged_ui(): with gr.Tabs(): with gr.TabItem("TTS-to-RVC Pipeline"): gr.Markdown("### Generate speech with Spark TTS and convert with RVC") + gr.Markdown("*Note: For multi-sentence text, each sentence will be processed separately and then combined.*") # TTS Generation Section with gr.Row(): @@ -36,7 +36,7 @@ def build_merged_ui(): tts_text_input = gr.Textbox( label="Text to synthesize", lines=3, - placeholder="Enter text for TTS" + placeholder="Enter text for TTS. Multiple sentences will be processed individually." ) prompt_text_input = gr.Textbox( label="Text of prompt speech (Optional)", @@ -130,8 +130,8 @@ def build_merged_ui(): generate_with_rvc_button = gr.Button("Generate with RVC", variant="primary") with gr.Row(): - vc_output1 = gr.Textbox(label="Output information") - vc_output2 = gr.Audio(label="Final converted audio") + vc_output1 = gr.Textbox(label="Output information", lines=10) + vc_output2 = gr.Audio(label="Final concatenated audio") # Connect generate function to button generate_with_rvc_button.click( diff --git a/modules/merged_ui/utils.py b/modules/merged_ui/utils.py index 7c87708..7e95ee4 100644 --- a/modules/merged_ui/utils.py +++ b/modules/merged_ui/utils.py @@ -1,5 +1,10 @@ import os import shutil +import re +import numpy as np +from time import sleep +import soundfile as sf +from pydub import AudioSegment # Import modules from your packages from rvc_ui.initialization import vc @@ -11,31 +16,45 @@ model_dir = "spark/pretrained_models/Spark-TTS-0.5B" device = 0 spark_model = initialize_model(model_dir, device=device) -def generate_and_process_with_rvc( - text, prompt_text, prompt_wav_upload, prompt_wav_record, +def split_into_sentences(text): + """ + Split text into sentences using regular expressions. + + Args: + text (str): The input text to split + + Returns: + list: A list of sentences + """ + # Split on period, exclamation mark, or question mark followed by space or end of string + sentences = re.split(r'(?<=[.!?])\s+|(?<=[.!?])$', text) + # Remove any empty sentences + sentences = [s.strip() for s in sentences if s.strip()] + return sentences + +def process_single_sentence( + sentence_index, sentence, prompt_speech, prompt_text_clean, spk_item, vc_transform, f0method, file_index1, file_index2, index_rate, filter_radius, - resample_sr, rms_mix_rate, protect + resample_sr, rms_mix_rate, protect, + base_fragment_num ): """ - Handle combined TTS and RVC processing and save outputs to TEMP directories + Process a single sentence through the TTS and RVC pipeline + + Args: + sentence_index (int): Index of the sentence in the original text + sentence (str): The sentence text to process + ... (other parameters are the same as generate_and_process_with_rvc) + + Returns: + tuple: (spark_output_path, rvc_output_path, success, info_message) """ - # Ensure TEMP directories exist - os.makedirs("./TEMP/spark", exist_ok=True) - os.makedirs("./TEMP/rvc", exist_ok=True) - - # Get next fragment number - fragment_num = 1 - while (os.path.exists(f"./TEMP/spark/fragment_{fragment_num}.wav") or - os.path.exists(f"./TEMP/rvc/fragment_{fragment_num}.wav")): - fragment_num += 1 - - # First generate TTS audio - prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record - prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text + fragment_num = base_fragment_num + sentence_index + # Generate TTS audio for this sentence tts_path = run_tts( - text, + sentence, spark_model, prompt_text=prompt_text_clean, prompt_speech=prompt_speech @@ -43,7 +62,7 @@ def generate_and_process_with_rvc( # Make sure we have a TTS file to process if not tts_path or not os.path.exists(tts_path): - return "Failed to generate TTS audio", None + return None, None, False, f"Failed to generate TTS audio for sentence: {sentence}" # Save Spark output to TEMP/spark spark_output_path = f"./TEMP/spark/fragment_{fragment_num}.wav" @@ -70,7 +89,6 @@ def generate_and_process_with_rvc( elif isinstance(output_audio, tuple) and len(output_audio) >= 2: # Case 2: output_audio might be (sample_rate, audio_data) try: - import soundfile as sf sf.write(rvc_output_path, output_audio[1], output_audio[0]) rvc_saved = True except Exception as inner_e: @@ -82,14 +100,120 @@ def generate_and_process_with_rvc( except Exception as e: output_info += f"\nError saving RVC output: {str(e)}" - # Add file paths to output info - output_info += f"\nSpark output saved to: {spark_output_path}" + # Prepare info message + info_message = f"Sentence {sentence_index+1}: {sentence[:30]}{'...' if len(sentence) > 30 else ''}\n" + info_message += f" - Spark output: {spark_output_path}\n" if rvc_saved: - output_info += f"\nRVC output saved to: {rvc_output_path}" + info_message += f" - RVC output: {rvc_output_path}" else: - output_info += f"\nCould not automatically save RVC output to {rvc_output_path}" + info_message += f" - Could not save RVC output to {rvc_output_path}" - return output_info, output_audio + return spark_output_path, rvc_output_path, rvc_saved, info_message + +def concatenate_audio_files(file_paths, output_path, sample_rate=44100): + """ + Concatenate multiple audio files into a single file + + Args: + file_paths (list): List of paths to audio files + output_path (str): Path to save the concatenated audio + sample_rate (int): Sample rate for the output file + + Returns: + bool: True if concatenation was successful, False otherwise + """ + try: + # Use pydub to concatenate audio files + combined = AudioSegment.empty() + for file_path in file_paths: + segment = AudioSegment.from_file(file_path) + combined += segment + + # Export the combined audio + combined.export(output_path, format="wav") + return True + except Exception as e: + print(f"Error concatenating audio files: {str(e)}") + + # Fallback method using soundfile + try: + all_audio = [] + for file_path in file_paths: + data, sr = sf.read(file_path) + # Convert to mono if stereo + if len(data.shape) > 1 and data.shape[1] > 1: + data = data.mean(axis=1) + all_audio.append(data) + + # Concatenate all audio data + concatenated = np.concatenate(all_audio) + sf.write(output_path, concatenated, sample_rate) + return True + except Exception as e2: + print(f"Fallback concatenation failed: {str(e2)}") + return False + +def generate_and_process_with_rvc( + text, prompt_text, prompt_wav_upload, prompt_wav_record, + spk_item, vc_transform, f0method, + file_index1, file_index2, index_rate, filter_radius, + resample_sr, rms_mix_rate, protect +): + """ + Handle combined TTS and RVC processing for multiple sentences and save outputs to TEMP directories + """ + # Ensure TEMP directories exist + os.makedirs("./TEMP/spark", exist_ok=True) + os.makedirs("./TEMP/rvc", exist_ok=True) + + # Split text into sentences + sentences = split_into_sentences(text) + if not sentences: + return "No valid text to process.", None + + # Get next base fragment number + base_fragment_num = 1 + while any(os.path.exists(f"./TEMP/spark/fragment_{base_fragment_num + i}.wav") or + os.path.exists(f"./TEMP/rvc/fragment_{base_fragment_num + i}.wav") + for i in range(len(sentences))): + base_fragment_num += 1 + + # Process reference speech + prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record + prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text + + # Process each sentence + results = [] + info_messages = [f"Processing {len(sentences)} sentences..."] + + for i, sentence in enumerate(sentences): + spark_path, rvc_path, success, info = process_single_sentence( + i, sentence, prompt_speech, prompt_text_clean, + spk_item, vc_transform, f0method, + file_index1, file_index2, index_rate, filter_radius, + resample_sr, rms_mix_rate, protect, + base_fragment_num + ) + + info_messages.append(info) + if success and rvc_path: + results.append(rvc_path) + + # If no sentences were successfully processed + if not results: + return "\n".join(info_messages) + "\n\nNo sentences were successfully processed.", None + + # Concatenate all successful RVC fragments + final_output_path = f"./TEMP/final_output_{base_fragment_num}.wav" + concatenation_success = concatenate_audio_files(results, final_output_path) + + if concatenation_success: + info_messages.append(f"\nAll fragments concatenated successfully to: {final_output_path}") + return "\n".join(info_messages), final_output_path + else: + # If concatenation failed but we have at least one successful fragment, return the first one + info_messages.append(f"\nFailed to concatenate fragments. Returning first successful fragment.") + return "\n".join(info_messages), results[0] def modified_get_vc(sid0_value, protect0_value, file_index2_component): """ @@ -101,4 +225,4 @@ def modified_get_vc(sid0_value, protect0_value, file_index2_component): if isinstance(outputs, tuple) and len(outputs) >= 3: return outputs[0], outputs[1], outputs[3] - return 0, protect0_value, file_index2_component.choices[0] if file_index2_component.choices else "" \ No newline at end of file + return 0, protect0_value, file_index2_component.choices[0] if file_index2_component.choices else ""