Stream audio output

2025-04-24 21:48:59 +08:00 · 2025-03-21 22:13:14 +02:00 · 2025-03-21 22:13:14 +02:00 · 249c5d1049
commit 249c5d1049
parent 87e06f30ec
3 changed files with 29 additions and 26 deletions
--- a/infer-web.py
+++ b/infer-web.py
@@ -20,9 +20,9 @@ def build_standalone_ui():
 if __name__ == "__main__":
    app = build_merged_ui()
    if config.iscolab:
-        app.launch(share=True)
+        app.queue().launch(share=True)
    else:
-        app.launch(
+        app.queue().launch(
            server_name="localhost",
            inbrowser=not config.noautoopen,
            server_port=config.listen_port,
--- a/modules/merged_ui/main.py
+++ b/modules/merged_ui/main.py
@@ -17,7 +17,7 @@ def build_merged_ui():
        with gr.Tabs():
            with gr.TabItem("TTS-to-RVC Pipeline"):
                gr.Markdown("### Generate speech with Spark TTS and convert with RVC")
-                gr.Markdown("*Note: For multi-sentence text, each sentence will be processed separately and then combined.*")
+                gr.Markdown("*Note: For multi-sentence text, each sentence will be processed separately and streamed as it’s ready.*")
                
                # TTS Generation Section
                with gr.Row():
@@ -131,9 +131,9 @@ def build_merged_ui():
                
                with gr.Row():
                    vc_output1 = gr.Textbox(label="Output information", lines=10)
-                    vc_output2 = gr.Audio(label="Final concatenated audio")
+                    vc_output2 = gr.Audio(label="Streaming concatenated audio", autoplay=True)
                
-                # Connect generate function to button
+                # Connect generate function to button with streaming enabled
                generate_with_rvc_button.click(
                    generate_and_process_with_rvc,
                    inputs=[
@@ -152,7 +152,7 @@ def build_merged_ui():
                        rms_mix_rate0,
                        protect0,
                    ],
-                    outputs=[vc_output1, vc_output2],
+                    outputs=[vc_output1, vc_output2]
                )
                
                # Connect modified_get_vc function for dropdown change
--- a/modules/merged_ui/utils.py
+++ b/modules/merged_ui/utils.py
@@ -218,7 +218,7 @@ def generate_and_process_with_rvc(
    resample_sr, rms_mix_rate, protect
 ):
    """
-    Handle combined TTS and RVC processing for multiple sentences and save outputs to TEMP directories
+    Handle combined TTS and RVC processing for multiple sentences and yield outputs as they are processed.
    """
    # Ensure TEMP directories exist
    os.makedirs("./TEMP/spark", exist_ok=True)
@@ -227,7 +227,8 @@ def generate_and_process_with_rvc(
    # Split text into sentences
    sentences = split_into_sentences(text)
    if not sentences:
-        return "No valid text to process.", None
+        yield "No valid text to process.", None
+        return
    
    # Get next base fragment number
    base_fragment_num = 1
@@ -240,10 +241,12 @@ def generate_and_process_with_rvc(
    prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
    prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text
    
-    # Process each sentence
-    results = []
    info_messages = [f"Processing {len(sentences)} sentences..."]
+    results = []
    
+    # Yield initial message with no audio yet
+    yield "\n".join(info_messages), None
+
    for i, sentence in enumerate(sentences):
        spark_path, rvc_path, success, info = process_single_sentence(
            i, sentence, prompt_speech, prompt_text_clean,
@@ -256,22 +259,22 @@ def generate_and_process_with_rvc(
        info_messages.append(info)
        if success and rvc_path:
            results.append(rvc_path)
-    
-    # If no sentences were successfully processed
-    if not results:
-        return "\n".join(info_messages) + "\n\nNo sentences were successfully processed.", None
-    
-    # Concatenate all successful RVC fragments
-    final_output_path = f"./TEMP/final_output_{base_fragment_num}.wav"
-    concatenation_success = concatenate_audio_files(results, final_output_path)
-    
-    if concatenation_success:
-        info_messages.append(f"\nAll fragments concatenated successfully to: {final_output_path}")
-        return "\n".join(info_messages), final_output_path
-    else:
-        # If concatenation failed but we have at least one successful fragment, return the first one
-        info_messages.append(f"\nFailed to concatenate fragments. Returning first successful fragment.")
-        return "\n".join(info_messages), results[0]
+        
+        # Build partial concatenation from results so far if any fragment exists
+        if results:
+            partial_output_path = f"./TEMP/partial_output_{base_fragment_num}.wav"
+            concatenation_success = concatenate_audio_files(results, partial_output_path)
+            if not concatenation_success:
+                # Fallback: use the latest processed fragment
+                partial_output_path = results[-1]
+        else:
+            partial_output_path = None
+        
+        # Yield the current info and the partial audio so far
+        yield "\n".join(info_messages), partial_output_path
+
+    # Optionally, yield one final update (could be identical to the last yield)
+    yield "\n".join(info_messages), partial_output_path

 def modified_get_vc(sid0_value, protect0_value, file_index2_component):
    """