voirs-sdk 0.1.0-rc.1

Unified SDK and public API for VoiRS speech synthesis
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
//! Streaming synthesis module with modular architecture.
//!
//! This module provides comprehensive streaming synthesis capabilities organized into
//! modular components:
//!
//! - [`pipeline`] - Core streaming pipeline functionality and chunk processing
//! - [`realtime`] - Real-time synthesis processing and low-latency operations
//! - [`management`] - Stream management, ordering, and state tracking
//! - [`synthesis_optimizer`] - Optimization utilities for streaming synthesis (chunking, latency, memory)
//! - [`unified`] - Unified streaming pipeline spanning multiple synthesis features
//!
//! # Example
//!
//! ```no_run
//! use voirs_sdk::streaming::{StreamingConfig};
//! use voirs_sdk::pipeline::VoirsPipelineBuilder;
//! use std::sync::Arc;
//! use futures::StreamExt;
//!
//! # async fn example() -> voirs_sdk::Result<()> {
//! // Create a pipeline
//! let pipeline = Arc::new(VoirsPipelineBuilder::new()
//!     .with_voice("en-US-female-calm")
//!     .build()
//!     .await?);
//!
//! // Use built-in streaming synthesis
//! let text = "This is a long text that will be streamed in chunks.";
//! let mut stream = pipeline.synthesize_stream(text).await?;
//!
//! while let Some(chunk) = stream.next().await {
//!     let chunk = chunk?;
//!     println!("Generated chunk: {:.2}s audio", chunk.duration());
//! }
//! # Ok(())
//! # }
//! ```

// Submodule declarations. The first three are described in the module docs;
// `synthesis_optimizer` and `unified` provide optimization and cross-feature
// streaming support respectively.
pub mod management; // Stream management, chunk ordering, and state tracking
pub mod pipeline; // Core streaming pipeline and chunk processing
pub mod realtime; // Real-time, low-latency synthesis processing
pub mod synthesis_optimizer; // Chunking/latency/memory optimization for streaming
pub mod unified; // Unified streaming across synthesis features

// Re-export the main types for convenience
pub use management::{
    AudioChunk, ChunkMetadata, CombinationStrategy, LatencyStats, OrderedChunkStream,
    QualityMetrics, StreamCombiner, StreamStats, StreamingConfig, StreamingState,
    ThroughputMetrics,
};
pub use pipeline::StreamingPipeline;
pub use realtime::RealtimeProcessor;
pub use synthesis_optimizer::{
    ChunkConfig, LatencyBenchmarkReport, MemoryPool, OptimizationStats, QualityController,
    StreamingSynthesisOptimizer, SynthesisMetrics,
};
// NOTE(review): `MusicalNote` here suggests `unified` also covers singing
// synthesis — confirm against the `unified` module before documenting further.
pub use unified::{
    FeatureStreamingConfig, FeatureStreamingResult, MusicalNote, StreamingMetadata,
    StreamingParameters, UnifiedStreamingPipeline, UnifiedStreamingRequest, UnifiedStreamingResult,
    UnifiedStreamingSynthesis,
};

#[cfg(test)]
mod tests {
    use super::*;
    use crate::pipeline::{DummyAcoustic, DummyG2p, DummyVocoder};
    use futures::StreamExt;
    use std::{sync::Arc, time::Duration};
    use tokio::time::sleep;

    /// Builds a `StreamingPipeline` backed by dummy G2P/acoustic/vocoder
    /// components and the default config, so tests run without real models.
    fn create_test_pipeline() -> StreamingPipeline {
        StreamingPipeline::new(
            Arc::new(DummyG2p::new()),
            Arc::new(DummyAcoustic::new()),
            Arc::new(DummyVocoder::new()),
            StreamingConfig::default(),
        )
    }

    /// Full streaming pass over a multi-sentence text: verifies per-chunk
    /// properties, that the text is split into more than one chunk, and that
    /// chunk ids arrive in order.
    #[tokio::test]
    async fn test_end_to_end_streaming() {
        let pipeline = create_test_pipeline();

        let text = "This is a comprehensive test of the streaming synthesis system. \
                   It should split this text into multiple chunks and process them efficiently. \
                   Each chunk should be processed with high quality and low latency.";

        let mut stream = pipeline.synthesize_stream(text).await.unwrap();
        let mut chunks = Vec::new();
        let mut total_duration = 0.0;

        while let Some(result) = stream.next().await {
            let chunk = result.unwrap();

            // Verify chunk properties
            assert!(chunk.audio.duration() > 0.0);
            assert!(!chunk.text.trim().is_empty());
            assert!(chunk.metadata.phoneme_count > 0);
            assert!(chunk.metadata.mel_frames > 0);

            total_duration += chunk.audio.duration();
            chunks.push(chunk);
        }

        // Should have multiple chunks
        assert!(chunks.len() > 1);
        assert!(total_duration > 0.0);

        // Verify chunks are ordered correctly
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_id, i);
        }
    }

    /// Feeds text fragments through a delayed stream to simulate real-time
    /// arrival and checks that audio chunks are produced for them.
    #[tokio::test]
    async fn test_real_time_synthesis() {
        let pipeline = create_test_pipeline();

        // Create a text stream that arrives over time
        let text_parts = vec![
            "Hello there, ".to_string(),
            "this is a real-time ".to_string(),
            "synthesis test. ".to_string(),
            "Each part arrives separately.".to_string(),
        ];

        let text_stream = futures::stream::iter(text_parts)
            .then(|text| async move {
                sleep(Duration::from_millis(50)).await; // Simulate real-time arrival
                text
            })
            .boxed();

        let mut audio_stream = pipeline.synthesize_realtime(text_stream).await.unwrap();
        let mut chunk_count = 0;

        while let Some(result) = audio_stream.next().await {
            let chunk = result.unwrap();
            assert!(chunk.audio.duration() > 0.0);
            chunk_count += 1;
        }

        assert!(chunk_count > 0);
    }

    /// Compares text splitting under the low-latency vs high-quality presets.
    /// Purely synchronous (only `split_text_for_streaming` is exercised), so
    /// no async runtime is needed.
    #[test]
    fn test_streaming_configuration_impact() {
        let g2p = Arc::new(DummyG2p::new());
        let acoustic = Arc::new(DummyAcoustic::new());
        let vocoder = Arc::new(DummyVocoder::new());

        // Test low latency config
        let low_latency_config = StreamingConfig::low_latency();
        let low_latency_pipeline = StreamingPipeline::new(
            g2p.clone(),
            acoustic.clone(),
            vocoder.clone(),
            low_latency_config.clone(),
        );

        // Test high quality config
        let high_quality_config = StreamingConfig::high_quality();
        let high_quality_pipeline = StreamingPipeline::new(
            g2p.clone(),
            acoustic.clone(),
            vocoder.clone(),
            high_quality_config.clone(),
        );

        let test_text = "This is a test text for comparing different configurations.";

        // Compare chunk sizes
        let low_latency_chunks = low_latency_pipeline.split_text_for_streaming(test_text);
        let high_quality_chunks = high_quality_pipeline.split_text_for_streaming(test_text);

        // Low latency should create smaller chunks
        assert!(low_latency_chunks.len() >= high_quality_chunks.len());

        // Every chunk must respect its config's character limit.
        for chunk in &low_latency_chunks {
            assert!(chunk.len() <= low_latency_config.max_chunk_chars);
        }

        for chunk in &high_quality_chunks {
            assert!(chunk.len() <= high_quality_config.max_chunk_chars);
        }
    }

    /// Feeds chunks with ids 2, 0, 1 into `OrderedChunkStream` and verifies
    /// they are delivered re-ordered as 0, 1, 2, with healthy stats.
    #[tokio::test]
    async fn test_ordered_chunk_stream() {
        // Create chunks in random order
        let chunks = vec![
            Ok(AudioChunk::new(
                2,
                crate::audio::AudioBuffer::sine_wave(440.0, 0.1, 22050, 0.5),
                "Third chunk".to_string(),
                Duration::from_millis(10),
                5,
                20,
            )),
            Ok(AudioChunk::new(
                0,
                crate::audio::AudioBuffer::sine_wave(440.0, 0.1, 22050, 0.5),
                "First chunk".to_string(),
                Duration::from_millis(10),
                5,
                20,
            )),
            Ok(AudioChunk::new(
                1,
                crate::audio::AudioBuffer::sine_wave(440.0, 0.1, 22050, 0.5),
                "Second chunk".to_string(),
                Duration::from_millis(10),
                5,
                20,
            )),
        ];

        let stream = futures::stream::iter(chunks);
        let mut ordered_stream = OrderedChunkStream::new(Box::pin(stream), 10);

        let mut received_chunks = Vec::new();
        while let Some(result) = ordered_stream.next().await {
            received_chunks.push(result.unwrap());
        }

        // Should receive chunks in correct order
        assert_eq!(received_chunks.len(), 3);
        assert_eq!(received_chunks[0].chunk_id, 0);
        assert_eq!(received_chunks[1].chunk_id, 1);
        assert_eq!(received_chunks[2].chunk_id, 2);

        // Check stream statistics
        let stats = ordered_stream.stats();
        assert_eq!(stats.chunks_received, 3);
        assert_eq!(stats.chunks_delivered, 3);
        assert!(stats.is_healthy());
    }

    /// Checks per-chunk performance metrics (RTF, efficiency, confidence) and
    /// that the pipeline's aggregate state is updated after a run.
    #[tokio::test]
    async fn test_performance_monitoring() {
        let pipeline = create_test_pipeline();

        let text = "This is a performance monitoring test.";
        let mut stream = pipeline.synthesize_stream(text).await.unwrap();

        while let Some(result) = stream.next().await {
            let chunk = result.unwrap();

            // Check real-time factor
            let rtf = chunk.real_time_factor();
            assert!(rtf > 0.0);

            // Check efficiency score (normalized to (0, 1])
            let efficiency = chunk.efficiency_score();
            assert!(efficiency > 0.0);
            assert!(efficiency <= 1.0);

            // Metadata should be complete
            assert!(chunk.metadata.confidence_score >= 0.0);
            assert!(chunk.metadata.confidence_score <= 1.0);
        }

        // Check pipeline state
        let state = pipeline.get_state().await;
        assert!(state.chunks_processed > 0);
        assert!(state.total_duration > 0.0);
    }

    /// Verifies pipeline state transitions: zeroed initially, populated after
    /// a synthesis run, and zeroed again after `reset_state`.
    #[tokio::test]
    async fn test_streaming_state_management() {
        let pipeline = create_test_pipeline();

        // Initial state
        let initial_state = pipeline.get_state().await;
        assert_eq!(initial_state.chunks_processed, 0);

        // Process some text
        let text = "Test text for state management.";
        let mut stream = pipeline.synthesize_stream(text).await.unwrap();

        // Consume stream
        while (stream.next().await).is_some() {}

        // Check updated state
        let final_state = pipeline.get_state().await;
        assert!(final_state.chunks_processed > 0);
        assert!(final_state.total_duration > 0.0);
        assert!(final_state.total_chars_processed > 0);

        // Reset state
        pipeline.reset_state().await;
        let reset_state = pipeline.get_state().await;
        assert_eq!(reset_state.chunks_processed, 0);
    }

    /// Empty and whitespace-only input must yield an empty (but non-erroring)
    /// stream rather than failing.
    #[tokio::test]
    async fn test_error_handling() {
        let pipeline = create_test_pipeline();

        // Test with empty text
        let empty_stream = pipeline.synthesize_stream("").await.unwrap();
        let chunks: Vec<_> = empty_stream.collect().await;
        assert!(chunks.is_empty());

        // Test with whitespace only
        let whitespace_stream = pipeline.synthesize_stream("   \n\t  ").await.unwrap();
        let chunks: Vec<_> = whitespace_stream.collect().await;
        assert!(chunks.is_empty());
    }

    /// Asserts chunk metadata is fully populated, sentence boundaries are
    /// flagged, and metadata exports to JSON with the expected fields.
    #[tokio::test]
    async fn test_chunk_metadata_completeness() {
        let pipeline = create_test_pipeline();

        let text = "This is a sentence. This is another sentence!";
        let mut stream = pipeline.synthesize_stream(text).await.unwrap();

        while let Some(result) = stream.next().await {
            let chunk = result.unwrap();

            // Verify metadata completeness
            assert!(chunk.metadata.phoneme_count > 0);
            assert!(chunk.metadata.mel_frames > 0);
            assert!(chunk.metadata.confidence_score >= 0.0);
            assert!(chunk.metadata.confidence_score <= 1.0);

            // Check boundary detection
            if chunk.text.trim_end().ends_with('.') || chunk.text.trim_end().ends_with('!') {
                assert!(chunk.metadata.is_sentence_boundary);
            }

            // Test metadata export
            let json = chunk.export_metadata().unwrap();
            assert!(json.contains("phoneme_count"));
            assert!(json.contains("mel_frames"));
        }
    }

    /// Checks `adapt_for_performance`: poor RTF/latency shrinks chunks,
    /// good RTF/latency does not shrink them. Purely synchronous — no async
    /// runtime needed.
    #[test]
    fn test_adaptive_configuration() {
        let mut config = StreamingConfig {
            adaptive_chunking: true,
            ..Default::default()
        };

        let original_max_chars = config.max_chunk_chars;

        // Simulate poor performance (high RTF, high latency)
        config.adapt_for_performance(2.0, Duration::from_millis(800));
        assert!(config.max_chunk_chars < original_max_chars);

        // Reset and simulate good performance
        config.max_chunk_chars = original_max_chars;
        config.adapt_for_performance(0.1, Duration::from_millis(50));
        assert!(config.max_chunk_chars >= original_max_chars);
    }

    /// Table-driven check that texts of increasing length produce at least the
    /// expected minimum number of chunks, each with non-empty text and audio.
    #[tokio::test]
    async fn test_streaming_with_different_text_types() {
        let pipeline = create_test_pipeline();

        // (input text, minimum expected chunk count)
        let test_cases = vec![
            ("Short.", 1),
            ("This is a medium length sentence with multiple words.", 1),
            ("This is the first sentence. This is the second sentence. This is the third sentence.", 2),
            ("This is the first sentence of a very long text. This is the second sentence that should cause it to be split. This is the third sentence to ensure multiple chunks.", 2),
        ];

        for (text, expected_min_chunks) in test_cases {
            let stream = pipeline.synthesize_stream(text).await.unwrap();
            let chunks: Vec<_> = stream.collect().await;

            assert!(
                chunks.len() >= expected_min_chunks,
                "Text '{}' should produce at least {} chunks, got {}",
                text,
                expected_min_chunks,
                chunks.len()
            );

            for chunk in chunks {
                let chunk = chunk.unwrap();
                assert!(chunk.audio.duration() > 0.0);
                assert!(!chunk.text.trim().is_empty());
            }
        }
    }

    /// Sanity-checks the relative ordering of the built-in config presets and
    /// that all of them pass validation.
    #[test]
    fn test_configuration_presets() {
        let low_latency = StreamingConfig::low_latency();
        let high_quality = StreamingConfig::high_quality();
        let batch = StreamingConfig::batch_processing();

        // Low latency should have smaller chunks and lower latency limits
        assert!(low_latency.max_chunk_chars < high_quality.max_chunk_chars);
        assert!(low_latency.max_latency < high_quality.max_latency);
        assert!(low_latency.quality_vs_latency < high_quality.quality_vs_latency);

        // Batch processing should allow larger chunks and more concurrency
        assert!(batch.max_chunk_chars >= high_quality.max_chunk_chars);
        assert!(batch.max_concurrent_chunks >= high_quality.max_concurrent_chunks);

        // All configurations should be valid
        assert!(low_latency.validate().is_ok());
        assert!(high_quality.validate().is_ok());
        assert!(batch.validate().is_ok());
    }

    /// Feeds a fast chunk then a slow chunk into `QualityMetrics` and checks
    /// that degradation is detected only after the slow one.
    #[test]
    fn test_quality_metrics() {
        let mut metrics = QualityMetrics::default();

        // Create test chunk with good performance
        let audio = crate::audio::AudioBuffer::sine_wave(440.0, 1.0, 22050, 0.5);
        let good_chunk = AudioChunk::new(
            0,
            audio.clone(),
            "Good performance".to_string(),
            Duration::from_millis(100), // 0.1 RTF for 1s audio
            10,
            100,
        );

        metrics.update_with_chunk(&good_chunk);
        assert!(metrics.real_time_factor < 1.0);
        assert!(!metrics.is_quality_degrading());

        // Create chunk with poor performance
        let poor_chunk = AudioChunk::new(
            1,
            audio,
            "Poor performance".to_string(),
            Duration::from_millis(2000), // 2.0 RTF for 1s audio
            10,
            100,
        );

        metrics.update_with_chunk(&poor_chunk);
        assert!(metrics.peak_rtf > 1.0);
        assert!(metrics.is_quality_degrading());
    }
}