Final update to the audio streaming

This commit is contained in:
VSlobolinskyi 2025-03-21 22:32:43 +02:00
parent 249c5d1049
commit 0de7a6283b

View File

@ -1,4 +1,5 @@
from datetime import datetime import datetime
import time
import logging import logging
import os import os
import platform import platform
@ -8,6 +9,7 @@ import numpy as np
import soundfile as sf import soundfile as sf
from pydub import AudioSegment from pydub import AudioSegment
import torch import torch
from pydub import AudioSegment
# Import modules from your packages # Import modules from your packages
from spark.cli.SparkTTS import SparkTTS from spark.cli.SparkTTS import SparkTTS
@ -219,6 +221,9 @@ def generate_and_process_with_rvc(
): ):
""" """
Handle combined TTS and RVC processing for multiple sentences and yield outputs as they are processed. Handle combined TTS and RVC processing for multiple sentences and yield outputs as they are processed.
Each yield carries only the most recently processed audio fragment; fragments are not concatenated.
Before yielding a new fragment, the function sleeps until the previously yielded fragment
should have finished playing, estimated from that fragment's duration.
""" """
# Ensure TEMP directories exist # Ensure TEMP directories exist
os.makedirs("./TEMP/spark", exist_ok=True) os.makedirs("./TEMP/spark", exist_ok=True)
@ -242,11 +247,13 @@ def generate_and_process_with_rvc(
prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text
info_messages = [f"Processing {len(sentences)} sentences..."] info_messages = [f"Processing {len(sentences)} sentences..."]
results = []
# Yield initial message with no audio yet # Yield initial message with no audio yet
yield "\n".join(info_messages), None yield "\n".join(info_messages), None
# Track the earliest time the next fragment may be yielded, to simulate playback of the previous one
next_available_time = time.time()
for i, sentence in enumerate(sentences): for i, sentence in enumerate(sentences):
spark_path, rvc_path, success, info = process_single_sentence( spark_path, rvc_path, success, info = process_single_sentence(
i, sentence, prompt_speech, prompt_text_clean, i, sentence, prompt_speech, prompt_text_clean,
@ -257,24 +264,23 @@ def generate_and_process_with_rvc(
) )
info_messages.append(info) info_messages.append(info)
# Only update output if processing was successful and we have an audio file
if success and rvc_path: if success and rvc_path:
results.append(rvc_path) try:
audio_seg = AudioSegment.from_file(rvc_path)
# Build partial concatenation from results so far if any fragment exists duration = audio_seg.duration_seconds
if results: except Exception as e:
partial_output_path = f"./TEMP/partial_output_{base_fragment_num}.wav" duration = 0
concatenation_success = concatenate_audio_files(results, partial_output_path)
if not concatenation_success:
# Fallback: use the latest processed fragment
partial_output_path = results[-1]
else:
partial_output_path = None
# Yield the current info and the partial audio so far
yield "\n".join(info_messages), partial_output_path
# Optionally, yield one final update (could be identical to the last yield) current_time = time.time()
yield "\n".join(info_messages), partial_output_path if current_time < next_available_time:
time.sleep(next_available_time - current_time)
yield "\n".join(info_messages), rvc_path
next_available_time = time.time() + duration
yield "\n".join(info_messages), rvc_path
def modified_get_vc(sid0_value, protect0_value, file_index2_component): def modified_get_vc(sid0_value, protect0_value, file_index2_component):
""" """