2025-03-21 19:17:49 +02:00

111 lines
4.4 KiB
Python

import os
import time
# Import and reuse existing functions
from merged_ui.utils import (
split_into_sentences,
process_single_sentence,
concatenate_audio_files,
split_into_sentences,
process_single_sentence,
initialize_model
)
# Initialize the Spark TTS model (moved outside function to avoid reinitializing)
model_dir = "spark/pretrained_models/Spark-TTS-0.5B"
device = 0
spark_model = initialize_model(model_dir, device=device)
def generate_and_process_with_rvc_streaming(
text, prompt_text, prompt_wav_upload, prompt_wav_record,
spk_item, vc_transform, f0method,
file_index1, file_index2, index_rate, filter_radius,
resample_sr, rms_mix_rate, protect
):
"""
Stream process TTS and RVC, yielding audio updates as sentences are processed
This is a generator function that yields (status_text, audio_path) tuples
"""
# Ensure TEMP directories exist
os.makedirs("./TEMP/spark", exist_ok=True)
os.makedirs("./TEMP/rvc", exist_ok=True)
os.makedirs("./TEMP/stream", exist_ok=True)
# Split text into sentences
sentences = split_into_sentences(text)
if not sentences:
yield "No valid text to process.", None
return
# Get timestamp to create unique session ID for this run
session_id = str(int(time.time()))
# Process reference speech
prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
prompt_text_clean = None if not prompt_text or len(prompt_text) < 2 else prompt_text
# Initialize status
status_messages = [f"Starting to process {len(sentences)} sentences..."]
# Create a list to track processed fragments
processed_fragments = []
# Create a temporary directory for streaming
stream_dir = f"./TEMP/stream/{session_id}"
os.makedirs(stream_dir, exist_ok=True)
# Yield initial status
yield "\n".join(status_messages), None
# Process each sentence and update the stream
for i, sentence in enumerate(sentences):
# Add current sentence to status
current_msg = f"Processing sentence {i+1}/{len(sentences)}: {sentence[:30]}..."
status_messages.append(current_msg)
yield "\n".join(status_messages), None
# Process this sentence
spark_path, rvc_path, success, info = process_single_sentence(
i, sentence, prompt_speech, prompt_text_clean,
spk_item, vc_transform, f0method,
file_index1, file_index2, index_rate, filter_radius,
resample_sr, rms_mix_rate, protect,
int(session_id) # Use session ID as base fragment number
)
# Update status with processing result
status_messages[-1] = info
if success and rvc_path and os.path.exists(rvc_path):
processed_fragments.append(rvc_path)
# Create a streaming update file by concatenating all fragments processed so far
stream_path = os.path.join(stream_dir, f"stream_update_{i+1}.wav")
# Concatenate all fragments processed so far
concatenate_success = concatenate_audio_files(processed_fragments, stream_path)
if concatenate_success:
# Yield the updated status and the current stream path
yield "\n".join(status_messages), stream_path
else:
# If concatenation failed, just yield the most recent fragment
yield "\n".join(status_messages), rvc_path
else:
# If processing failed, update status but don't update audio
yield "\n".join(status_messages), None if not processed_fragments else processed_fragments[-1]
# Final streaming update with completion message
if processed_fragments:
# Create final output file
final_output_path = f"./TEMP/stream/{session_id}/final_output.wav"
concatenate_success = concatenate_audio_files(processed_fragments, final_output_path)
if concatenate_success:
status_messages.append(f"\nAll {len(sentences)} sentences processed successfully!")
yield "\n".join(status_messages), final_output_path
else:
status_messages.append("\nWarning: Failed to create final concatenated file.")
yield "\n".join(status_messages), processed_fragments[-1]
else:
status_messages.append("\nNo sentences were successfully processed.")
yield "\n".join(status_messages), None