autoagents-speech 0.3.7

Speech (TTS/STT) provider abstractions for AutoAgents
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
//! Streaming TTS pipeline.
//!
//! Accepts a token stream (e.g., from an LLM), chunks tokens into sentences via
//! [`SentenceChunker`], synthesizes each sentence concurrently, and yields audio
//! chunks in the correct sequential order.
//!
//! # Architecture
//!
//! Two concurrent tasks run in parallel (inspired by LiveKit Agents):
//!
//! ```text
//! token_stream
//!   │  push_token() each token
//!   ▼
//! SentenceChunker ──► (seq_idx, sentence) ──► mpsc channel
//!   ▼
//! tokio::spawn per sentence → tts.generate_speech(sentence)
//!   ▼
//! results: (seq_idx, AudioChunk) → BTreeMap reorder buffer
//!   ▼  yield in sequential order
//! output Stream<AudioChunk>
//! ```
//!
//! The BTreeMap reorder buffer is critical: sentence 2 may finish TTS before
//! sentence 1, but audio must be emitted in the original text order.

use std::collections::BTreeMap;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};

use futures::Stream;
use tokio::sync::{Semaphore, mpsc};

use crate::error::TTSResult;
use crate::provider::TTSSpeechProvider;
use crate::types::{AudioChunk, SpeechRequest};

use super::chunker::{ChunkerConfig, SentenceChunker};

/// Sentence-level streaming text-to-speech pipeline.
///
/// Splits an incoming token stream into sentences, hands each sentence to the
/// TTS provider, and yields the resulting audio in the original sentence order.
pub struct StreamingTtsPipeline<T>
where
    T: TTSSpeechProvider + Send + Sync + 'static,
{
    /// Shared TTS provider; cloned (by `Arc`) into every synthesis task.
    tts: Arc<T>,
    /// Configuration passed to the sentence chunker on each run.
    config: ChunkerConfig,
}

impl<T: TTSSpeechProvider + Send + Sync + 'static> StreamingTtsPipeline<T> {
    /// Create a new pipeline with the given TTS provider and default chunker config.
    pub fn new(tts: Arc<T>) -> Self {
        Self {
            tts,
            config: ChunkerConfig::default(),
        }
    }

    /// Create a new pipeline with a custom chunker configuration.
    pub fn with_config(tts: Arc<T>, config: ChunkerConfig) -> Self {
        Self { tts, config }
    }

    /// Run the pipeline.
    ///
    /// Consumes a stream of text tokens (e.g., from an LLM) and produces an
    /// ordered stream of audio chunks. Each sentence is synthesized concurrently,
    /// but audio is yielded in the original text order.
    ///
    /// # Arguments
    /// * `token_stream` - Stream of text tokens (individual words/subwords from LLM)
    /// * `base_request` - Template `SpeechRequest`; the `text` field is replaced
    ///   per sentence chunk while voice/format/sample_rate are preserved.
    ///
    /// # Returns
    /// A stream of `Result<AudioChunk, TTSError>` in sequential sentence order.
    pub fn run<S>(&self, token_stream: S, base_request: SpeechRequest) -> OrderedAudioStream
    where
        S: Stream<Item = String> + Send + 'static,
    {
        let (result_tx, result_rx) = mpsc::channel::<(usize, TTSResult<Vec<AudioChunk>>)>(32);
        let tts = Arc::clone(&self.tts);
        let config = self.config.clone();
        let base = base_request;

        // Spawn the producer task: chunks tokens into sentences, dispatches TTS
        tokio::spawn(async move {
            Self::producer_task(token_stream, config, tts, base, result_tx).await;
        });

        OrderedAudioStream::new(result_rx)
    }

    /// Producer task: reads tokens, chunks into sentences, spawns TTS for each.
    async fn producer_task<S>(
        token_stream: S,
        config: ChunkerConfig,
        tts: Arc<T>,
        base_request: SpeechRequest,
        result_tx: mpsc::Sender<(usize, TTSResult<Vec<AudioChunk>>)>,
    ) where
        S: Stream<Item = String> + Send + 'static,
    {
        use futures::StreamExt;

        let mut chunker = SentenceChunker::with_config(config);
        let mut seq_idx: usize = 0;

        // Semaphore to serialize TTS generation - some TTS backends (like PocketTTS)
        // are not thread-safe for concurrent generation.
        let tts_semaphore = Arc::new(Semaphore::new(1));

        let mut token_stream = std::pin::pin!(token_stream);

        // Process incoming tokens
        while let Some(token) = token_stream.next().await {
            for sentence in chunker.push_token(&token) {
                let idx = seq_idx;
                seq_idx += 1;
                Self::spawn_tts_task(
                    idx,
                    sentence,
                    Arc::clone(&tts),
                    base_request.clone(),
                    result_tx.clone(),
                    Arc::clone(&tts_semaphore),
                );
            }
        }

        // LLM stream ended — flush any remaining text
        if let Some(sentence) = chunker.force_flush() {
            let idx = seq_idx;
            Self::spawn_tts_task(idx, sentence, tts, base_request, result_tx, tts_semaphore);
        }

        // result_tx is dropped here, signalling the consumer that no more
        // sentences will arrive. The spawned TTS tasks still hold clones.
    }

    /// Spawn a TTS synthesis task for a single sentence.
    fn spawn_tts_task(
        seq_idx: usize,
        sentence: String,
        tts: Arc<T>,
        base_request: SpeechRequest,
        result_tx: mpsc::Sender<(usize, TTSResult<Vec<AudioChunk>>)>,
        semaphore: Arc<Semaphore>,
    ) {
        tokio::spawn(async move {
            // Acquire semaphore to serialize TTS calls
            let _permit = match semaphore.acquire().await {
                Ok(permit) => permit,
                Err(_) => {
                    // Semaphore closed - pipeline is shutting down
                    return;
                }
            };

            let request = SpeechRequest {
                text: sentence,
                voice: base_request.voice,
                format: base_request.format,
                sample_rate: base_request.sample_rate,
            };

            let result = match tts.generate_speech(request).await {
                Ok(response) => {
                    // Convert SpeechResponse into a single AudioChunk
                    let chunk = AudioChunk {
                        samples: response.audio.samples,
                        sample_rate: response.audio.sample_rate,
                        is_final: false, // will be set by the consumer for the last chunk
                    };
                    Ok(vec![chunk])
                }
                Err(e) => Err(e),
            };

            // If the receiver is dropped, we just silently discard
            let _ = result_tx.send((seq_idx, result)).await;
        });
    }
}

/// Ordered audio stream that yields chunks in sequential sentence order.
///
/// Internally buffers out-of-order TTS results in a BTreeMap and yields
/// from the front only when the next expected sequence number is available.
/// Once the channel is closed, sequence numbers that never arrived are
/// skipped so the remaining buffered results can still be delivered.
pub struct OrderedAudioStream {
    /// Channel receiving (seq_idx, result) from TTS tasks
    result_rx: mpsc::Receiver<(usize, TTSResult<Vec<AudioChunk>>)>,
    /// Reorder buffer: seq_idx → result (audio chunks or the TTS error) that
    /// has arrived but is not yet due for output
    buffer: BTreeMap<usize, TTSResult<Vec<AudioChunk>>>,
    /// Next sequence index to yield
    next_seq: usize,
    /// Remaining chunks of the sequence currently being drained; stored in
    /// reverse order so the next chunk to emit is popped from the end
    pending_chunks: Vec<AudioChunk>,
    /// Whether the channel is closed (no more results coming)
    channel_closed: bool,
    /// Set once the stream has returned `None`; later polls return `None`
    /// immediately without touching the channel
    done: bool,
    /// Highest sequence index received so far. NOTE(review): this is updated
    /// in `poll_next` but never read in the visible code.
    max_seq_seen: Option<usize>,
}

impl OrderedAudioStream {
    /// Wrap the receiving end of the result channel in an empty reorder
    /// buffer that expects sequence index 0 first.
    fn new(result_rx: mpsc::Receiver<(usize, TTSResult<Vec<AudioChunk>>)>) -> Self {
        Self {
            result_rx,
            buffer: BTreeMap::new(),
            next_seq: 0,
            pending_chunks: Vec::new(),
            channel_closed: false,
            done: false,
            max_seq_seen: None,
        }
    }

    /// Check if `next_seq` is missing and will never arrive.
    ///
    /// Returns `true` only when the channel is closed, the buffer still holds
    /// later results, and `next_seq` is not among them. Since no more results
    /// can arrive, the missing sequence will never be delivered (e.g. the TTS
    /// task panicked or returned without sending its result).
    fn is_seq_missing(&self) -> bool {
        self.channel_closed
            && !self.buffer.is_empty()
            && !self.buffer.contains_key(&self.next_seq)
    }

    /// Skip to the next available sequence when a gap is detected.
    ///
    /// Moves `next_seq` to the smallest buffered key; does nothing when the
    /// buffer is empty.
    fn skip_to_next_available(&mut self) {
        if let Some(&min_key) = self.buffer.keys().next() {
            self.next_seq = min_key;
        }
    }

    /// Try to drain any buffered results that match the next expected sequence.
    ///
    /// Returns the next chunk (or error) that is due, or `None` when nothing
    /// deliverable is buffered yet. A sequence whose result contains no chunks
    /// is consumed silently and the following sequence is tried immediately, so
    /// an empty result never hides audio that is already buffered behind it.
    fn try_drain_buffered(&mut self) -> Option<TTSResult<AudioChunk>> {
        // First, drain any pending chunks from a previous sequence
        if let Some(chunk) = self.pending_chunks.pop() {
            return Some(Ok(chunk));
        }

        // Walk consecutive buffered sequences until one yields something.
        while let Some(result) = self.buffer.remove(&self.next_seq) {
            self.next_seq += 1;
            match result {
                Ok(mut chunks) => {
                    // Reverse so we can pop from the end efficiently
                    chunks.reverse();
                    self.pending_chunks = chunks;
                    if let Some(chunk) = self.pending_chunks.pop() {
                        return Some(Ok(chunk));
                    }
                    // Empty result: fall through and try the next sequence.
                }
                Err(e) => return Some(Err(e)),
            }
        }

        None
    }
}

impl Stream for OrderedAudioStream {
    type Item = TTSResult<AudioChunk>;

    /// Yield the next chunk in sentence order.
    ///
    /// Each loop iteration emits whatever is already deliverable, finishes
    /// when the channel is closed and nothing is left, jumps over sequence
    /// numbers that can no longer arrive, and otherwise pulls one more result
    /// from the channel.
    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let stream = self.get_mut();

        while !stream.done {
            // Anything that is already due goes out first.
            if let Some(ready) = stream.try_drain_buffered() {
                return Poll::Ready(Some(ready));
            }

            // Closed channel with nothing buffered or pending: end of stream.
            let nothing_left = stream.buffer.is_empty() && stream.pending_chunks.is_empty();
            if stream.channel_closed && nothing_left {
                stream.done = true;
                break;
            }

            // The expected sequence can never arrive any more (its task
            // panicked or failed to send): jump to the lowest buffered one.
            if stream.is_seq_missing() {
                stream.skip_to_next_available();
                continue;
            }

            // Otherwise wait for / pull the next result from the TTS tasks.
            match stream.result_rx.poll_recv(cx) {
                Poll::Pending => return Poll::Pending,
                Poll::Ready(None) => {
                    // All senders are gone; the next iteration drains the buffer.
                    stream.channel_closed = true;
                }
                Poll::Ready(Some((seq_idx, result))) => {
                    // Remember the highest sequence index received so far.
                    let highest = match stream.max_seq_seen {
                        Some(previous) => previous.max(seq_idx),
                        None => seq_idx,
                    };
                    stream.max_seq_seen = Some(highest);
                    stream.buffer.insert(seq_idx, result);
                }
            }
        }

        Poll::Ready(None)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::TTSError;
    use crate::types::{AudioData, AudioFormat, SpeechResponse, VoiceIdentifier};
    use async_trait::async_trait;
    use futures::StreamExt;
    use std::sync::Mutex;

    /// Template request shared by all pipeline tests.
    fn test_request() -> SpeechRequest {
        SpeechRequest {
            text: String::new(),
            voice: VoiceIdentifier::new("test"),
            format: AudioFormat::Wav,
            sample_rate: Some(24000),
        }
    }

    /// Chunker configuration used by every test that builds a pipeline with
    /// an explicit config.
    fn eager_chunker_config() -> ChunkerConfig {
        ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 250,
        }
    }

    /// Turn string literals into the owned tokens the pipeline consumes.
    fn owned_tokens(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|part| part.to_string()).collect()
    }

    /// Audio chunk fed directly into `OrderedAudioStream` by the gap tests.
    fn sample_chunk() -> AudioChunk {
        AudioChunk {
            samples: vec![0.5; 10],
            sample_rate: 24000,
            is_final: false,
        }
    }

    /// Mock provider whose response contains one sample per byte of input text.
    struct MockTtsProvider;

    #[async_trait]
    impl TTSSpeechProvider for MockTtsProvider {
        async fn generate_speech(&self, request: SpeechRequest) -> TTSResult<SpeechResponse> {
            // Small delay to simulate real TTS
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
            let len = request.text.len();
            let audio = AudioData {
                samples: vec![0.5_f32; len],
                channels: 1,
                sample_rate: 24000,
            };
            Ok(SpeechResponse {
                audio,
                text: request.text,
                duration_ms: len as u64,
            })
        }
    }

    /// Mock provider that records every text it is asked to synthesize.
    struct RecordingTts {
        calls: Arc<Mutex<Vec<String>>>,
    }

    #[async_trait]
    impl TTSSpeechProvider for RecordingTts {
        async fn generate_speech(&self, request: SpeechRequest) -> TTSResult<SpeechResponse> {
            self.calls.lock().unwrap().push(request.text.clone());
            Ok(SpeechResponse {
                audio: AudioData {
                    samples: vec![0.1; 10],
                    channels: 1,
                    sample_rate: 24000,
                },
                text: request.text,
                duration_ms: 10,
            })
        }
    }

    /// Build a pipeline around a `RecordingTts` and return it together with
    /// the shared call log.
    fn recording_pipeline() -> (StreamingTtsPipeline<RecordingTts>, Arc<Mutex<Vec<String>>>) {
        let calls = Arc::new(Mutex::new(Vec::new()));
        let tts = Arc::new(RecordingTts {
            calls: Arc::clone(&calls),
        });
        let pipeline = StreamingTtsPipeline::with_config(tts, eager_chunker_config());
        (pipeline, calls)
    }

    #[tokio::test]
    async fn test_pipeline_single_sentence() {
        let pipeline =
            StreamingTtsPipeline::with_config(Arc::new(MockTtsProvider), eager_chunker_config());

        let tokens = futures::stream::iter(owned_tokens(&["Hello world."]));

        let mut audio = pipeline.run(tokens, test_request());
        let mut received = Vec::new();
        while let Some(item) = audio.next().await {
            received.push(item.unwrap());
        }
        assert!(!received.is_empty());
    }

    #[tokio::test]
    async fn test_pipeline_multiple_sentences() {
        let pipeline =
            StreamingTtsPipeline::with_config(Arc::new(MockTtsProvider), eager_chunker_config());

        // Token-by-token, yielding two sentences
        let words: Vec<String> = "Hello world. How are you today?"
            .split_inclusive(' ')
            .map(String::from)
            .collect();
        let tokens = futures::stream::iter(words);

        let mut audio = pipeline.run(tokens, test_request());
        let mut received = Vec::new();
        while let Some(item) = audio.next().await {
            received.push(item.unwrap());
        }
        // Should have at least 2 chunks (one per sentence)
        assert!(
            received.len() >= 2,
            "Expected >= 2 chunks, got {}",
            received.len()
        );
    }

    #[tokio::test]
    async fn test_pipeline_ordered_output() {
        /// A mock that delays longer for shorter text to test reordering.
        struct SlowShortTts;

        #[async_trait]
        impl TTSSpeechProvider for SlowShortTts {
            async fn generate_speech(&self, request: SpeechRequest) -> TTSResult<SpeechResponse> {
                // Shorter sentences take LONGER than longer ones
                let text_len = request.text.len();
                let delay_ms = if text_len < 20 { 50 } else { 5 };
                tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
                // The sample value encodes the text length so the test can
                // tell the sentences apart.
                let marker = text_len as f32;
                Ok(SpeechResponse {
                    audio: AudioData {
                        samples: vec![marker; 10],
                        channels: 1,
                        sample_rate: 24000,
                    },
                    text: request.text,
                    duration_ms: 10,
                })
            }
        }

        let pipeline =
            StreamingTtsPipeline::with_config(Arc::new(SlowShortTts), eager_chunker_config());

        // Two sentences: first is short (slow), second is long (fast)
        let tokens = futures::stream::iter(owned_tokens(&[
            "Hi! ",
            "This is a much longer second sentence for testing. ",
        ]));

        let mut audio = pipeline.run(tokens, test_request());
        let mut sample_markers = Vec::new();
        while let Some(item) = audio.next().await {
            if let Some(&first_sample) = item.unwrap().samples.first() {
                sample_markers.push(first_sample);
            }
        }

        // Output should be in original order (short sentence first, then long)
        // despite the short sentence taking longer to synthesize
        assert!(sample_markers.len() >= 2);
        // First chunk marker should be for shorter text
        assert!(
            sample_markers[0] < sample_markers[1],
            "Chunks should be in original sentence order: {:?}",
            sample_markers
        );
    }

    #[tokio::test]
    async fn test_pipeline_empty_stream() {
        let pipeline = StreamingTtsPipeline::new(Arc::new(MockTtsProvider));

        let tokens = futures::stream::empty::<String>();

        let mut audio = pipeline.run(tokens, test_request());
        let mut yielded = 0;
        while audio.next().await.is_some() {
            yielded += 1;
        }
        assert_eq!(yielded, 0);
    }

    #[tokio::test]
    async fn test_pipeline_tts_error_propagation() {
        /// Provider that fails every request.
        struct FailingTts;

        #[async_trait]
        impl TTSSpeechProvider for FailingTts {
            async fn generate_speech(&self, _request: SpeechRequest) -> TTSResult<SpeechResponse> {
                Err(TTSError::Other(
                    "synthesis failed".to_string(),
                    "test".to_string(),
                ))
            }
        }

        let pipeline =
            StreamingTtsPipeline::with_config(Arc::new(FailingTts), eager_chunker_config());

        let tokens = futures::stream::iter(owned_tokens(&["Hello world."]));

        let mut audio = pipeline.run(tokens, test_request());
        let first = audio.next().await;
        assert!(first.is_some());
        assert!(first.unwrap().is_err());
    }

    /// An agent `run_stream` emits text deltas followed by the full
    /// accumulated response (the `done=true` duplicate). Without the
    /// `SkipLast` adapter on the caller side, the pipeline would synthesize
    /// the response twice. This test verifies that the pipeline itself works
    /// correctly with clean token input (no duplicate).
    #[tokio::test]
    async fn test_pipeline_no_duplicate_when_tokens_are_clean() {
        let (pipeline, calls) = recording_pipeline();

        // Simulate CLEAN token stream (no final duplicate):
        // "Hello there." and "How are you?" as token deltas only.
        let tokens = futures::stream::iter(owned_tokens(&[
            "Hello ", "there. ", "How ", "are ", "you?",
        ]));

        let mut audio = pipeline.run(tokens, test_request());
        while let Some(item) = audio.next().await {
            item.unwrap();
        }

        let synthesized = calls.lock().unwrap().clone();
        assert_eq!(
            synthesized.len(),
            2,
            "Expected exactly 2 TTS calls (one per sentence), got {}: {:?}",
            synthesized.len(),
            synthesized
        );
        assert_eq!(synthesized[0].trim(), "Hello there.");
        assert_eq!(synthesized[1].trim(), "How are you?");
    }

    /// Demonstrate what happens if the final duplicate is NOT removed:
    /// the pipeline synthesizes the entire response again.
    #[tokio::test]
    async fn test_pipeline_duplicate_when_final_included() {
        let (pipeline, calls) = recording_pipeline();

        // Simulate BROKEN token stream (with final duplicate):
        // deltas + full accumulated response at the end (what run_stream does)
        let tokens = futures::stream::iter(owned_tokens(&[
            "Hello ",
            "there. ",
            "How ",
            "are ",
            "you?",
            // ↓ This is the done=true duplicate the agent adds
            "Hello there. How are you?",
        ]));

        let mut audio = pipeline.run(tokens, test_request());
        while let Some(item) = audio.next().await {
            item.unwrap();
        }

        let synthesized = calls.lock().unwrap().clone();
        // With the duplicate, the sentences from the deltas are followed by
        // further TTS calls for the repeated text, so more than 2 calls happen.
        assert!(
            synthesized.len() > 2,
            "With duplicate included, expected >2 TTS calls, got {}: {:?}",
            synthesized.len(),
            synthesized
        );
    }

    #[tokio::test]
    async fn test_audio_chunk_carries_sample_rate() {
        let pipeline =
            StreamingTtsPipeline::with_config(Arc::new(MockTtsProvider), eager_chunker_config());

        let tokens = futures::stream::iter(owned_tokens(&["Hello world."]));

        let mut audio = pipeline.run(tokens, test_request());
        let Some(Ok(chunk)) = audio.next().await else {
            panic!("Expected at least one audio chunk");
        };
        assert_eq!(
            chunk.sample_rate, 24000,
            "AudioChunk should carry sample_rate from TTS response"
        );
    }

    /// Test that the stream terminates correctly when a sequence is missing
    /// (e.g., a TTS task panicked before sending its result).
    /// This guards against poll_next polling a closed channel forever when
    /// next_seq is missing.
    #[tokio::test]
    async fn test_missing_sequence_does_not_hang() {
        use tokio::time::{Duration, timeout};

        // Directly test OrderedAudioStream with a controlled channel
        let (tx, rx) = mpsc::channel::<(usize, TTSResult<Vec<AudioChunk>>)>(32);
        let mut audio = OrderedAudioStream::new(rx);

        // Send sequence 1, but NOT sequence 0
        // This simulates a TTS task for sequence 0 panicking
        tx.send((1, Ok(vec![sample_chunk()]))).await.unwrap();

        // Close the channel (no more results coming)
        drop(tx);

        // The stream should NOT hang - it should skip missing seq 0 and yield seq 1
        let first = timeout(Duration::from_secs(1), audio.next()).await;
        assert!(
            first.is_ok(),
            "Stream should not hang when sequence is missing"
        );

        // Should have received the chunk from sequence 1
        let item = first.unwrap();
        assert!(item.is_some(), "Should yield buffered sequence 1");
        assert!(item.unwrap().is_ok());

        // Stream should terminate
        let second = timeout(Duration::from_secs(1), audio.next()).await;
        assert!(second.is_ok(), "Stream should terminate cleanly");
        assert!(second.unwrap().is_none(), "Stream should be done");
    }

    /// Test that consecutive missing sequences are handled correctly.
    #[tokio::test]
    async fn test_multiple_missing_sequences() {
        use tokio::time::{Duration, timeout};

        let (tx, rx) = mpsc::channel::<(usize, TTSResult<Vec<AudioChunk>>)>(32);
        let mut audio = OrderedAudioStream::new(rx);

        // Only send sequence 3, missing 0, 1, and 2
        tx.send((3, Ok(vec![sample_chunk()]))).await.unwrap();
        drop(tx);

        // Should skip to seq 3 and yield it
        let first = timeout(Duration::from_secs(1), audio.next()).await;
        assert!(first.is_ok(), "Should not hang with multiple missing seqs");
        assert!(first.unwrap().is_some());

        // Should terminate
        let second = timeout(Duration::from_secs(1), audio.next()).await;
        assert!(second.is_ok());
        assert!(second.unwrap().is_none());
    }
}