use crate::{
audio::AudioBuffer,
error::Result,
traits::{AcousticModel, G2p, Vocoder},
types::SynthesisConfig,
VoirsError,
};
use futures::StreamExt;
use std::sync::Arc;
use std::time::Instant;
use tracing::{debug, info, warn};
#[derive(Clone)]
pub struct SynthesisOrchestrator {
g2p: Arc<dyn G2p>,
acoustic: Arc<dyn AcousticModel>,
vocoder: Arc<dyn Vocoder>,
test_mode: bool,
}
impl SynthesisOrchestrator {
pub fn new(
g2p: Arc<dyn G2p>,
acoustic: Arc<dyn AcousticModel>,
vocoder: Arc<dyn Vocoder>,
) -> Self {
Self {
g2p,
acoustic,
vocoder,
test_mode: false,
}
}
pub fn with_test_mode(
g2p: Arc<dyn G2p>,
acoustic: Arc<dyn AcousticModel>,
vocoder: Arc<dyn Vocoder>,
test_mode: bool,
) -> Self {
Self {
g2p,
acoustic,
vocoder,
test_mode,
}
}
pub async fn synthesize(&self, text: &str, config: &SynthesisConfig) -> Result<AudioBuffer> {
let start_time = Instant::now();
info!("Starting synthesis for text: {}", text);
let memory_monitor = MemoryMonitor::new(self.test_mode);
let phonemes = self.text_to_phonemes(text, config).await?;
memory_monitor.check_memory_usage("after G2P")?;
let mel = self.phonemes_to_mel(&phonemes, config).await?;
memory_monitor.check_memory_usage("after acoustic model")?;
let mut audio = self.mel_to_audio(&mel, config).await?;
memory_monitor.check_memory_usage("after vocoder")?;
self.apply_post_processing(&mut audio, config).await?;
let duration = start_time.elapsed();
info!(
"Synthesis complete: {:.2}s audio generated in {:.2}s",
audio.duration(),
duration.as_secs_f64()
);
Ok(audio)
}
pub async fn synthesize_ssml(
&self,
ssml: &str,
config: &SynthesisConfig,
) -> Result<AudioBuffer> {
info!("Synthesizing SSML markup with enhanced processing");
let parsed = self.parse_ssml(ssml)?;
let enhanced_config = self.apply_ssml_instructions(&parsed.instructions, config)?;
let mut audio = self.synthesize(&parsed.text, &enhanced_config).await?;
self.apply_ssml_post_processing(&mut audio, &parsed.instructions, config)
.await?;
info!(
"SSML synthesis complete with {} instructions processed",
parsed.instructions.len()
);
Ok(audio)
}
pub async fn synthesize_stream(
&self,
text: &str,
config: &SynthesisConfig,
) -> Result<impl futures::Stream<Item = Result<AudioBuffer>>> {
info!("Starting streaming synthesis for long text");
let chunks = self.split_text_for_streaming(text, config)?;
let g2p = Arc::clone(&self.g2p);
let acoustic = Arc::clone(&self.acoustic);
let vocoder = Arc::clone(&self.vocoder);
let test_mode = self.test_mode;
let config = config.clone();
let stream = futures::stream::iter(chunks)
.map(move |chunk| {
let g2p = Arc::clone(&g2p);
let acoustic = Arc::clone(&acoustic);
let vocoder = Arc::clone(&vocoder);
let config = config.clone();
async move {
let orchestrator =
SynthesisOrchestrator::with_test_mode(g2p, acoustic, vocoder, test_mode);
orchestrator.synthesize(&chunk, &config).await
}
})
.buffer_unordered(4);
Ok(stream)
}
async fn text_to_phonemes(
&self,
text: &str,
config: &SynthesisConfig,
) -> Result<Vec<crate::types::Phoneme>> {
let start_time = Instant::now();
debug!("Converting text to phonemes");
let phonemes = self
.g2p
.to_phonemes(text, Some(config.language))
.await
.map_err(|e| VoirsError::synthesis_failed(text, e))?;
let duration = start_time.elapsed();
debug!(
"G2P conversion complete: {} phonemes in {:.2}ms",
phonemes.len(),
duration.as_millis()
);
Ok(phonemes)
}
async fn phonemes_to_mel(
&self,
phonemes: &[crate::types::Phoneme],
config: &SynthesisConfig,
) -> Result<crate::types::MelSpectrogram> {
let start_time = Instant::now();
debug!("Converting phonemes to mel spectrogram");
let mel = self
.acoustic
.synthesize(phonemes, Some(config))
.await
.map_err(|e| VoirsError::SynthesisFailed {
text: format!("{} phonemes", phonemes.len()),
text_length: phonemes.len(),
stage: crate::error::types::SynthesisStage::AcousticModeling,
cause: e.into(),
})?;
let duration = start_time.elapsed();
debug!(
"Acoustic model synthesis complete: {}x{} mel in {:.2}ms",
mel.n_mels,
mel.n_frames,
duration.as_millis()
);
Ok(mel)
}
async fn mel_to_audio(
&self,
mel: &crate::types::MelSpectrogram,
config: &SynthesisConfig,
) -> Result<AudioBuffer> {
let start_time = Instant::now();
debug!("Converting mel spectrogram to audio");
let audio = self.vocoder.vocode(mel, Some(config)).await.map_err(|e| {
VoirsError::SynthesisFailed {
text: format!("{}x{} mel", mel.n_mels, mel.n_frames),
text_length: mel.n_frames as usize,
stage: crate::error::types::SynthesisStage::Vocoding,
cause: e.into(),
}
})?;
let duration = start_time.elapsed();
debug!(
"Vocoder synthesis complete: {:.2}s audio in {:.2}ms",
audio.duration(),
duration.as_millis()
);
Ok(audio)
}
async fn apply_post_processing(
&self,
audio: &mut AudioBuffer,
config: &SynthesisConfig,
) -> Result<()> {
debug!("Applying post-processing");
if config.volume_gain != 0.0 {
audio.apply_gain(config.volume_gain)?;
}
if config.enable_enhancement {
self.apply_enhancement(audio, config).await?;
}
if !config.effects.is_empty() {
self.apply_effects(audio, config).await?;
}
Ok(())
}
async fn apply_enhancement(
&self,
audio: &mut AudioBuffer,
config: &SynthesisConfig,
) -> Result<()> {
debug!("Applying audio enhancement");
if !config.enable_enhancement {
return Ok(());
}
audio.normalize(0.95)?;
if config.volume_gain != 0.0 {
audio.apply_gain(config.volume_gain)?;
}
if config.speaking_rate != 1.0 {
self.apply_time_stretch(audio, config.speaking_rate)?;
}
if config.pitch_shift != 0.0 {
self.apply_pitch_shift(audio, config.pitch_shift)?;
}
debug!("Audio enhancement complete");
Ok(())
}
async fn apply_effects(&self, audio: &mut AudioBuffer, config: &SynthesisConfig) -> Result<()> {
debug!("Applying audio effects: {:?}", config.effects);
use crate::types::AudioEffect;
for effect in &config.effects {
match effect {
AudioEffect::Reverb {
room_size,
damping,
wet_level,
} => {
debug!(
"Applying reverb: room_size={}, damping={}, wet_level={}",
room_size, damping, wet_level
);
self.apply_reverb(audio, *room_size, *damping, *wet_level)?;
}
AudioEffect::Delay {
delay_time,
feedback,
wet_level,
} => {
debug!(
"Applying delay: delay_time={}, feedback={}, wet_level={}",
delay_time, feedback, wet_level
);
self.apply_delay(audio, *delay_time, *feedback, *wet_level)?;
}
AudioEffect::Equalizer {
low_gain,
mid_gain,
high_gain,
} => {
debug!(
"Applying equalizer: low_gain={}, mid_gain={}, high_gain={}",
low_gain, mid_gain, high_gain
);
self.apply_equalizer(audio, *low_gain, *mid_gain, *high_gain)?;
}
AudioEffect::Compressor {
threshold,
ratio,
attack,
release,
} => {
debug!(
"Applying compressor: threshold={}, ratio={}, attack={}, release={}",
threshold, ratio, attack, release
);
self.apply_compressor(audio, *threshold, *ratio, *attack, *release)?;
}
}
}
debug!("Audio effects processing complete");
Ok(())
}
fn apply_time_stretch(&self, audio: &mut AudioBuffer, speaking_rate: f32) -> Result<()> {
debug!("Applying time stretch with rate: {}", speaking_rate);
if speaking_rate != 1.0 {
let stretched = audio.time_stretch(1.0 / speaking_rate)?;
*audio = stretched;
}
Ok(())
}
fn apply_pitch_shift(&self, audio: &mut AudioBuffer, pitch_shift: f32) -> Result<()> {
debug!("Applying pitch shift: {} semitones", pitch_shift);
if pitch_shift != 0.0 {
let shifted = audio.pitch_shift(pitch_shift)?;
*audio = shifted;
}
Ok(())
}
fn apply_reverb(
&self,
audio: &mut AudioBuffer,
room_size: f32,
damping: f32,
wet_level: f32,
) -> Result<()> {
debug!(
"Applying reverb: room_size={}, damping={}, wet_level={}",
room_size, damping, wet_level
);
audio.reverb(room_size, damping, wet_level)?;
Ok(())
}
fn apply_delay(
&self,
audio: &mut AudioBuffer,
delay_time: f32,
feedback: f32,
wet_level: f32,
) -> Result<()> {
debug!(
"Applying delay: delay_time={}, feedback={}, wet_level={}",
delay_time, feedback, wet_level
);
let sample_rate = audio.sample_rate() as f32;
let delay_samples = (delay_time * sample_rate) as usize;
let samples = audio.samples_mut();
if delay_samples > 0 && delay_samples < samples.len() {
let mut delayed_samples = vec![0.0; samples.len()];
for i in delay_samples..samples.len() {
let delayed_value = samples[i - delay_samples] * feedback;
delayed_samples[i] = delayed_value;
if i + delay_samples < samples.len() {
delayed_samples[i + delay_samples] += delayed_value * feedback;
}
}
for i in 0..samples.len() {
samples[i] = samples[i] * (1.0 - wet_level) + delayed_samples[i] * wet_level;
}
}
Ok(())
}
fn apply_equalizer(
&self,
audio: &mut AudioBuffer,
low_gain: f32,
mid_gain: f32,
high_gain: f32,
) -> Result<()> {
debug!(
"Applying equalizer: low_gain={}, mid_gain={}, high_gain={}",
low_gain, mid_gain, high_gain
);
let samples = audio.samples_mut();
for sample in samples.iter_mut() {
let low_component = *sample * 0.3 * low_gain;
let mid_component = *sample * 0.4 * mid_gain;
let high_component = *sample * 0.3 * high_gain;
*sample = low_component + mid_component + high_component;
}
Ok(())
}
fn apply_compressor(
&self,
audio: &mut AudioBuffer,
threshold: f32,
ratio: f32,
attack: f32,
release: f32,
) -> Result<()> {
debug!(
"Applying compressor: threshold={}, ratio={}, attack={}, release={}",
threshold, ratio, attack, release
);
let sample_rate = audio.sample_rate() as f32;
let samples = audio.samples_mut();
let attack_coeff = (-1.0 / (attack * sample_rate)).exp();
let release_coeff = (-1.0 / (release * sample_rate)).exp();
let mut envelope = 0.0;
for sample in samples.iter_mut() {
let input_level = sample.abs();
if input_level > envelope {
envelope += (input_level - envelope) * (1.0 - attack_coeff);
} else {
envelope += (input_level - envelope) * (1.0 - release_coeff);
}
if envelope > threshold {
let excess = envelope - threshold;
let compressed_excess = excess / ratio;
let gain_reduction = (threshold + compressed_excess) / envelope;
*sample *= gain_reduction;
}
}
Ok(())
}
fn apply_ssml_instructions(
&self,
instructions: &[SsmlInstruction],
base_config: &SynthesisConfig,
) -> Result<SynthesisConfig> {
let mut config = base_config.clone();
for instruction in instructions {
match instruction.tag.as_str() {
"prosody" => {
self.apply_prosody_instruction(&mut config, &instruction.attributes)?;
}
"voice" => {
self.apply_voice_instruction(&mut config, &instruction.attributes)?;
}
"emphasis" => {
self.apply_emphasis_instruction(&mut config, &instruction.attributes)?;
}
"audio" => {
debug!("Audio tag found, will be processed in post-processing");
}
"break" => {
debug!("Break tag found, will be processed in post-processing");
}
_ => {
debug!("Unsupported SSML tag: {}", instruction.tag);
}
}
}
Ok(config)
}
async fn apply_ssml_post_processing(
&self,
audio: &mut AudioBuffer,
instructions: &[SsmlInstruction],
config: &SynthesisConfig,
) -> Result<()> {
for instruction in instructions {
match instruction.tag.as_str() {
"break" => {
self.apply_break_instruction(audio, &instruction.attributes, config)
.await?;
}
"audio" => {
warn!("Audio tag insertion not yet implemented");
}
_ => {
}
}
}
Ok(())
}
fn apply_prosody_instruction(
&self,
config: &mut SynthesisConfig,
attributes: &std::collections::HashMap<String, String>,
) -> Result<()> {
if let Some(rate) = attributes.get("rate") {
match rate.as_str() {
"x-slow" => config.speaking_rate = 0.5,
"slow" => config.speaking_rate = 0.75,
"medium" => config.speaking_rate = 1.0,
"fast" => config.speaking_rate = 1.25,
"x-fast" => config.speaking_rate = 1.5,
_ => {
if rate.ends_with('%') {
if let Ok(percent) = rate.trim_end_matches('%').parse::<f32>() {
config.speaking_rate = percent / 100.0;
}
} else if let Ok(value) = rate.parse::<f32>() {
config.speaking_rate = value;
}
}
}
}
if let Some(pitch) = attributes.get("pitch") {
match pitch.as_str() {
"x-low" => config.pitch_shift = -12.0,
"low" => config.pitch_shift = -6.0,
"medium" => config.pitch_shift = 0.0,
"high" => config.pitch_shift = 6.0,
"x-high" => config.pitch_shift = 12.0,
_ => {
if pitch.ends_with("Hz") {
warn!("Hz pitch specification not fully implemented");
} else if let Ok(value) = pitch.parse::<f32>() {
config.pitch_shift = value;
}
}
}
}
if let Some(volume) = attributes.get("volume") {
match volume.as_str() {
"silent" => config.volume_gain = -60.0,
"x-soft" => config.volume_gain = -20.0,
"soft" => config.volume_gain = -10.0,
"medium" => config.volume_gain = 0.0,
"loud" => config.volume_gain = 6.0,
"x-loud" => config.volume_gain = 12.0,
_ => {
if volume.ends_with("dB") {
if let Ok(db) = volume.trim_end_matches("dB").parse::<f32>() {
config.volume_gain = db;
}
} else if let Ok(value) = volume.parse::<f32>() {
config.volume_gain = 20.0 * value.log10();
}
}
}
}
debug!(
"Applied prosody: rate={}, pitch={}, volume={}",
config.speaking_rate, config.pitch_shift, config.volume_gain
);
Ok(())
}
fn apply_voice_instruction(
&self,
_config: &mut SynthesisConfig,
attributes: &std::collections::HashMap<String, String>,
) -> Result<()> {
if let Some(name) = attributes.get("name") {
debug!(
"Voice change requested to: {} (requires pipeline-level handling)",
name
);
}
if let Some(gender) = attributes.get("gender") {
debug!(
"Voice gender requested: {} (requires voice switching)",
gender
);
}
if let Some(age) = attributes.get("age") {
debug!("Voice age requested: {} (requires voice switching)", age);
}
Ok(())
}
fn apply_emphasis_instruction(
&self,
config: &mut SynthesisConfig,
attributes: &std::collections::HashMap<String, String>,
) -> Result<()> {
let level = attributes
.get("level")
.map(|s| s.as_str())
.unwrap_or("moderate");
match level {
"strong" => {
config.volume_gain += 3.0;
config.speaking_rate *= 0.9; }
"moderate" => {
config.volume_gain += 1.5;
config.speaking_rate *= 0.95;
}
"reduced" => {
config.volume_gain -= 1.5;
config.speaking_rate *= 1.05;
}
_ => {
debug!("Unknown emphasis level: {}", level);
}
}
debug!("Applied emphasis level: {}", level);
Ok(())
}
async fn apply_break_instruction(
&self,
audio: &mut AudioBuffer,
attributes: &std::collections::HashMap<String, String>,
_config: &SynthesisConfig,
) -> Result<()> {
let duration = if let Some(time) = attributes.get("time") {
self.parse_time_duration(time)?
} else if let Some(strength) = attributes.get("strength") {
match strength.as_str() {
"none" => 0.0,
"x-weak" => 0.1,
"weak" => 0.25,
"medium" => 0.5,
"strong" => 1.0,
"x-strong" => 2.0,
_ => 0.5,
}
} else {
0.5 };
if duration > 0.0 {
let silence = AudioBuffer::silence(duration, audio.sample_rate(), audio.channels());
audio.append(&silence)?;
debug!("Added break of {:.2}s", duration);
}
Ok(())
}
fn parse_time_duration(&self, time_str: &str) -> Result<f32> {
if time_str.ends_with("ms") {
time_str
.trim_end_matches("ms")
.parse::<f32>()
.map(|ms| ms / 1000.0)
.map_err(|_| VoirsError::config_error(format!("Invalid time duration: {time_str}")))
} else if time_str.ends_with('s') {
time_str
.trim_end_matches('s')
.parse::<f32>()
.map_err(|_| VoirsError::config_error(format!("Invalid time duration: {time_str}")))
} else {
time_str
.parse::<f32>()
.map_err(|_| VoirsError::config_error(format!("Invalid time duration: {time_str}")))
}
}
fn parse_ssml(&self, ssml: &str) -> Result<SsmlParseResult> {
debug!("Parsing SSML markup with {} characters", ssml.len());
let mut text = String::new();
let mut instructions = Vec::new();
let mut chars = ssml.chars().peekable();
let mut position = 0;
while let Some(ch) = chars.next() {
if ch == '<' {
if let Some(instruction) = self.parse_ssml_tag(&mut chars, position)? {
instructions.push(instruction);
}
} else if ch != '\n' && ch != '\r' && ch != '\t' {
text.push(ch);
position += 1;
} else if ch == ' ' {
text.push(ch);
position += 1;
}
}
debug!(
"SSML parsing complete: {} instructions, {} characters of text",
instructions.len(),
text.len()
);
Ok(SsmlParseResult { text, instructions })
}
fn parse_ssml_tag(
&self,
chars: &mut std::iter::Peekable<std::str::Chars>,
position: usize,
) -> Result<Option<SsmlInstruction>> {
let mut tag_content = String::new();
while let Some(&ch) = chars.peek() {
if ch == '>' {
chars.next(); break;
}
let Some(next_ch) = chars.next() else {
break;
};
tag_content.push(next_ch);
}
if tag_content.is_empty() {
return Ok(None);
}
if tag_content.starts_with('/') {
return Ok(None);
}
let parts: Vec<&str> = tag_content.split_whitespace().collect();
if parts.is_empty() {
return Ok(None);
}
let tag_name = parts[0].to_lowercase();
let mut attributes = std::collections::HashMap::new();
for part in parts.iter().skip(1) {
if let Some(eq_pos) = part.find('=') {
let key = part[..eq_pos].to_string();
let mut value = part[eq_pos + 1..].to_string();
if value.ends_with('/') {
value.pop();
}
let value = value.trim_matches('"').trim_matches('\'').to_string();
attributes.insert(key, value);
}
}
debug!(
"Parsed SSML tag: {} with {} attributes at position {}",
tag_name,
attributes.len(),
position
);
Ok(Some(SsmlInstruction {
tag: tag_name,
attributes,
position,
}))
}
#[allow(dead_code)]
fn strip_ssml_tags(&self, ssml: &str) -> String {
ssml.chars()
.fold(
(String::new(), false),
|(mut result, in_tag), ch| match ch {
'<' => (result, true),
'>' => (result, false),
_ if !in_tag => {
result.push(ch);
(result, in_tag)
}
_ => (result, in_tag),
},
)
.0
.trim()
.to_string()
}
fn split_text_for_streaming(
&self,
text: &str,
config: &SynthesisConfig,
) -> Result<Vec<String>> {
let chunk_size = config.streaming_chunk_size.unwrap_or(100);
let words: Vec<&str> = text.split_whitespace().collect();
let chunks: Vec<String> = words
.chunks(chunk_size)
.map(|chunk| chunk.join(" "))
.collect();
debug!("Split text into {} chunks", chunks.len());
Ok(chunks)
}
}
#[derive(Debug)]
struct SsmlParseResult {
text: String,
#[allow(dead_code)]
instructions: Vec<SsmlInstruction>,
}
#[derive(Debug)]
struct SsmlInstruction {
#[allow(dead_code)]
tag: String,
#[allow(dead_code)]
attributes: std::collections::HashMap<String, String>,
#[allow(dead_code)]
position: usize,
}
struct MemoryMonitor {
start_memory: usize,
test_mode: bool,
}
impl MemoryMonitor {
fn new(test_mode: bool) -> Self {
Self {
start_memory: Self::get_memory_usage(test_mode),
test_mode,
}
}
fn check_memory_usage(&self, stage: &str) -> Result<()> {
let current_memory = Self::get_memory_usage(self.test_mode);
let memory_increase = current_memory.saturating_sub(self.start_memory);
debug!(
"Memory usage {} - Current: {} bytes (+{})",
stage, current_memory, memory_increase
);
if memory_increase > 1_000_000_000 {
warn!("High memory usage detected: {} bytes", memory_increase);
}
Ok(())
}
fn get_memory_usage(test_mode: bool) -> usize {
if test_mode {
debug!("Skipping memory usage tracking in test mode");
return 0;
}
#[cfg(target_os = "linux")]
{
use std::fs;
if let Ok(status) = fs::read_to_string("/proc/self/status") {
for line in status.lines() {
if line.starts_with("VmRSS:") {
if let Some(kb_str) = line.split_whitespace().nth(1) {
if let Ok(kb) = kb_str.parse::<usize>() {
return kb * 1024; }
}
}
}
}
}
#[cfg(target_os = "macos")]
{
use std::process::Command;
if let Ok(output) = Command::new("ps")
.args(["-o", "rss=", "-p", &std::process::id().to_string()])
.output()
{
if let Ok(output_str) = String::from_utf8(output.stdout) {
if let Ok(kb) = output_str.trim().parse::<usize>() {
return kb * 1024; }
}
}
}
#[cfg(target_os = "windows")]
{
use std::mem;
use std::ptr;
#[allow(non_snake_case)]
#[repr(C)]
struct PROCESS_MEMORY_COUNTERS {
cb: u32,
PageFaultCount: u32,
PeakWorkingSetSize: usize,
WorkingSetSize: usize,
QuotaPeakPagedPoolUsage: usize,
QuotaPagedPoolUsage: usize,
QuotaPeakNonPagedPoolUsage: usize,
QuotaNonPagedPoolUsage: usize,
PagefileUsage: usize,
PeakPagefileUsage: usize,
}
extern "system" {
fn GetCurrentProcess() -> *mut std::ffi::c_void;
fn GetProcessMemoryInfo(
process: *mut std::ffi::c_void,
memory_counters: *mut PROCESS_MEMORY_COUNTERS,
cb: u32,
) -> i32;
}
unsafe {
let mut memory_counters: PROCESS_MEMORY_COUNTERS = mem::zeroed();
memory_counters.cb = mem::size_of::<PROCESS_MEMORY_COUNTERS>() as u32;
let process_handle = GetCurrentProcess();
let success = GetProcessMemoryInfo(
process_handle,
&mut memory_counters,
mem::size_of::<PROCESS_MEMORY_COUNTERS>() as u32,
);
if success != 0 {
return memory_counters.WorkingSetSize;
} else {
debug!("Failed to get Windows process memory info, falling back to 0");
}
}
}
0
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_synthesis_orchestrator() {
let g2p = Arc::new(crate::pipeline::DummyG2p::new());
let acoustic = Arc::new(crate::pipeline::DummyAcoustic::new());
let vocoder = Arc::new(crate::pipeline::DummyVocoder::new());
let orchestrator = SynthesisOrchestrator::with_test_mode(g2p, acoustic, vocoder, true);
let config = SynthesisConfig::default();
let result = orchestrator.synthesize("Hello, world!", &config).await;
assert!(result.is_ok());
let audio = result.unwrap();
assert!(audio.duration() > 0.0);
}
#[tokio::test]
async fn test_ssml_synthesis() {
let g2p = Arc::new(crate::pipeline::DummyG2p::new());
let acoustic = Arc::new(crate::pipeline::DummyAcoustic::new());
let vocoder = Arc::new(crate::pipeline::DummyVocoder::new());
let orchestrator = SynthesisOrchestrator::with_test_mode(g2p, acoustic, vocoder, true);
let config = SynthesisConfig::default();
let ssml = "<speak>Hello, <break time='1s'/> world!</speak>";
let result = orchestrator.synthesize_ssml(ssml, &config).await;
assert!(result.is_ok());
}
#[tokio::test]
async fn test_streaming_synthesis() {
let g2p = Arc::new(crate::pipeline::DummyG2p::new());
let acoustic = Arc::new(crate::pipeline::DummyAcoustic::new());
let vocoder = Arc::new(crate::pipeline::DummyVocoder::new());
let orchestrator = SynthesisOrchestrator::with_test_mode(g2p, acoustic, vocoder, true);
let config = SynthesisConfig::default();
let text = "This is a long text that will be split into chunks for streaming synthesis.";
let stream = orchestrator.synthesize_stream(text, &config).await;
assert!(stream.is_ok());
}
}