car-voice 0.14.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
//! Software audio mixer that feeds VPIO's output bus.
//!
//! For VPIO's hardware AEC to actually cancel Tokhn's bed and TTS from
//! the user's mic, those audio sources have to flow *through* VPIO.
//! Bevy's default audio system uses cpal/rodio on a separate output
//! device, so VPIO has no idea what's playing and can't reference it
//! for cancellation. This module is the bridge: it owns rodio decoders
//! for the bed stems and the TTS clip queue, mixes them in software,
//! and exposes a `pull` method that VPIO's realtime output render
//! callback calls each tick.
//!
//! Threading model:
//!
//! - **Bevy main thread** writes per-stem target volumes through atomic
//!   floats (no locks) and pushes new TTS clips into a crossbeam channel
//!   (MPMC, used here as SPSC).
//! - **VPIO realtime audio thread** holds the only `&mut VoiceAudioMixer`
//!   and calls `pull(out)` once per render cycle. The atomic reads are
//!   wait-free; the TTS channel pop is `try_recv`, which never blocks.
//!
//! Sample format: f32 samples; the target sample rate is whatever VPIO's
//! output bus chose (we query it at init time). All sources are
//! resampled / channel-adapted to that target by the hand-rolled
//! converter in `decode_source_to_interleaved` (see its doc comment for
//! why rodio's `UniformSourceIterator` is avoided).
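//!
//! A wiring sketch (hypothetical names: `asset_root`, `output_bus`, and
//! the `"calm"` theme stand in for the caller's own wiring — none of
//! them are part of this module's API):
//!
//! ```ignore
//! use std::sync::{atomic::AtomicBool, Arc};
//!
//! let speaking = Arc::new(AtomicBool::new(false));
//! let (mut mixer, handle) =
//!     VoiceAudioMixer::new(&asset_root, "calm", 48_000, 2, Arc::clone(&speaking))?;
//!
//! // Audio thread: VPIO's output render callback owns the mixer and
//! // pulls interleaved f32 samples from it each cycle.
//! output_bus.set_render_callback(move |out: &mut [f32]| mixer.pull(out));
//!
//! // Bevy side: the handle is a cheap-to-clone resource.
//! handle.set_stem_gain(StemRole::Pad, 0.8);
//! handle.set_master_gain(1.0);
//! ```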

use std::fs::File;
use std::io::{BufReader, Cursor};
use std::path::Path;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;

use crossbeam_channel::{Receiver, Sender};
use rodio::{Decoder, Source};

use crate::{Result, VoiceError};

/// Roles match `app-bevy/src/audio_bed.rs` exactly so the same target
/// gain math drives both layers during the migration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StemRole {
    Pad,
    Texture,
    Harmonic,
    Percussive,
}

impl StemRole {
    pub const ALL: [StemRole; 4] = [
        StemRole::Pad,
        StemRole::Texture,
        StemRole::Harmonic,
        StemRole::Percussive,
    ];

    pub const fn file_name(self) -> &'static str {
        match self {
            Self::Pad => "pad.mp3",
            Self::Texture => "texture.mp3",
            Self::Harmonic => "harmonic.mp3",
            Self::Percussive => "percussive.mp3",
        }
    }

    fn index(self) -> usize {
        match self {
            Self::Pad => 0,
            Self::Texture => 1,
            Self::Harmonic => 2,
            Self::Percussive => 3,
        }
    }
}

/// Lock-free atomic f32 backed by `AtomicU32`.
#[derive(Default)]
pub struct AtomicF32(AtomicU32);

impl AtomicF32 {
    pub fn new(v: f32) -> Self {
        Self(AtomicU32::new(v.to_bits()))
    }
    pub fn load(&self) -> f32 {
        f32::from_bits(self.0.load(Ordering::Relaxed))
    }
    pub fn store(&self, v: f32) {
        self.0.store(v.to_bits(), Ordering::Relaxed);
    }
}
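
#[cfg(test)]
mod atomic_f32_tests {
    use super::AtomicF32;

    // Sanity check for the bit-cast storage: values survive a
    // store/load round trip, including negatives and the default zero.
    #[test]
    fn store_load_round_trips() {
        let a = AtomicF32::new(0.25);
        assert_eq!(a.load(), 0.25);
        a.store(-1.5e-3);
        assert_eq!(a.load(), -1.5e-3);
        a.store(0.0);
        assert_eq!(a.load(), 0.0);
    }
}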

/// Shared handle the Bevy world uses to *talk to* the mixer running on
/// the audio thread. Cheap to clone (Arc inside).
#[derive(Clone)]
pub struct VoiceMixerHandle {
    /// Per-stem target volumes (0..1+). Read by the realtime mixer
    /// every callback; written by Bevy systems whenever the mood/arc
    /// state changes.
    stem_gains: Arc<[AtomicF32; 4]>,
    /// Master scale that the mixer multiplies the entire output by.
    /// Used for global mute / fade.
    master_gain: Arc<AtomicF32>,
    /// Channel for decoded TTS samples. The handle decodes incoming
    /// MP3 bytes on the *caller's* thread (typically a tokio task
    /// after the ElevenLabs HTTP call returns) and sends interleaved
    /// f32 samples. The audio thread just walks the buffer with a
    /// cursor — no decoder calls in the realtime path.
    tts_tx: Sender<Vec<f32>>,
    /// Channel for queuing a full-set stem swap. The handle sends four
    /// pre-decoded interleaved f32 buffers (one per role) and the
    /// realtime callback installs them with a gain-envelope crossfade
    /// from the currently-playing stems. Used for theme changes.
    stem_swap_tx: Sender<[Vec<f32>; 4]>,
    /// True while the audio thread has at least one TTS clip queued
    /// or playing — Bevy reads this for the mic indicator.
    speaking_now: Arc<std::sync::atomic::AtomicBool>,
    /// Set true to ask the realtime callback to drop the active TTS
    /// clip + drain the queue on its next pull. Used for barge-in.
    stop_tts_flag: Arc<std::sync::atomic::AtomicBool>,
    /// Captured at construction so `queue_tts` knows what to resample
    /// to. Single-write/multi-read so plain copies are fine.
    target_rate: u32,
    target_channels: u16,
}

impl VoiceMixerHandle {
    /// Update one stem's target gain. Wait-free.
    pub fn set_stem_gain(&self, role: StemRole, gain: f32) {
        self.stem_gains[role.index()].store(gain);
    }

    /// Update the master scale.
    pub fn set_master_gain(&self, gain: f32) {
        self.master_gain.store(gain);
    }

    /// Queue a TTS clip for playback over the bed. Decodes the MP3
    /// bytes on the *current* thread (NOT the audio thread) so the
    /// realtime callback never has to allocate or run a codec. The
    /// caller is the tokio task that just downloaded the audio from
    /// ElevenLabs — adding ~10ms of decode there is fine.
    pub fn queue_tts(&self, bytes: Vec<u8>) {
        let cursor = Cursor::new(bytes);
        let decoder = match Decoder::new(cursor) {
            Ok(d) => d,
            Err(e) => {
                tracing::warn!("[mixer] queue_tts decode failed: {e}");
                return;
            }
        };
        let samples = decode_source_to_interleaved(decoder, self.target_rate, self.target_channels);
        if samples.is_empty() {
            tracing::warn!("[mixer] queue_tts produced zero samples");
            return;
        }
        if self.tts_tx.send(samples).is_err() {
            tracing::warn!("[mixer] queue_tts send failed (mixer gone)");
        }
    }

    /// Is the mixer currently playing a TTS clip?
    pub fn is_speaking(&self) -> bool {
        self.speaking_now.load(Ordering::Relaxed)
    }

    /// Clone the underlying `Arc<AtomicBool>` so other components
    /// (notably the VPIO capture worker thread) can poll the flag
    /// directly without holding a reference to the whole handle.
    /// Used to detect TTS playback for barge-in handling.
    pub fn speaking_flag(&self) -> Arc<std::sync::atomic::AtomicBool> {
        Arc::clone(&self.speaking_now)
    }

    /// Hand four new stem buffers to the realtime mixer. The callback
    /// crossfades from the currently-playing stems to the new ones over
    /// roughly three-quarters of a second. Buffers are in the same
    /// interleaved f32 format and sample rate as the ones passed to
    /// [`VoiceAudioMixer::from_stem_samples`], in role order
    /// `[Pad, Texture, Harmonic, Percussive]`.
    ///
    /// Non-blocking on the audio thread: the pop there is `try_recv`.
    pub fn swap_stems(&self, stems: [Vec<f32>; 4]) {
        if self.stem_swap_tx.send(stems).is_err() {
            tracing::warn!("[mixer] swap_stems send failed (mixer gone)");
        }
    }

    /// Target sample rate the mixer expects for queued audio.
    pub fn target_rate(&self) -> u32 {
        self.target_rate
    }

    /// Target channel count the mixer expects for queued audio.
    pub fn target_channels(&self) -> u16 {
        self.target_channels
    }

    /// Halt the active TTS clip immediately. Used by the barge-in
    /// path: when the user starts speaking while Tokhn is talking,
    /// Tokhn shuts up so the user can be heard. Sets a flag the
    /// realtime callback checks each pull; the actual clip drop +
    /// queue drain happens on the audio thread (so the channel
    /// receiver isn't touched from outside its owner). Wait-free.
    pub fn stop_tts(&self) {
        self.stop_tts_flag
            .store(true, std::sync::atomic::Ordering::Relaxed);
        // Best-effort eager update so any external "is_speaking"
        // reader sees the new state right away. The realtime
        // callback re-confirms it.
        self.speaking_now
            .store(false, std::sync::atomic::Ordering::Relaxed);
    }
}
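
// Barge-in, as seen from the capture side. A sketch with a hypothetical
// VAD hook — the actual capture integration lives elsewhere:
//
//     if vad_detected_user_speech && handle.is_speaking() {
//         // The realtime callback drops the active clip and drains the
//         // queue on its next pull; this call itself never blocks.
//         handle.stop_tts();
//     }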

/// The mixer itself, owned by the audio thread. Bevy never touches
/// this directly — it goes through `VoiceMixerHandle`.
///
/// **Realtime safety**: every field is allocated *before* the audio
/// thread starts pulling samples. The hot path (`pull`) does no
/// I/O, no allocation, no locking. This is critical for VPIO's AEC:
/// any stall in the output render callback corrupts the reference
/// signal and the echo path estimator never converges, so Tokhn's
/// own voice leaks back into the mic.
pub struct VoiceAudioMixer {
    /// Four bed stems, fully decoded + resampled into memory at init.
    /// The audio thread plays them with a wrapping cursor — no file
    /// I/O ever happens during playback.
    stems: [PreloadedStem; 4],
    /// Previous stem set retained during a theme-change crossfade. The
    /// callback mixes old and new by a linear ramp driven by
    /// [`crossfade_samples_remaining`]; when the ramp reaches zero
    /// this is cleared and the old buffers drop on the audio thread.
    /// (Dropping a `Vec` does call the deallocator, but it happens at
    /// most once per theme change, on the frame the fade ends — an
    /// accepted, amortized exception to the no-allocator rule.)
    fading_out: Option<[PreloadedStem; 4]>,
    /// Frames (output samples) remaining in the active crossfade. When
    /// `fading_out` is `Some`, this counts down from
    /// [`crossfade_total_samples`] to zero.
    crossfade_samples_remaining: usize,
    /// Total frames in the crossfade ramp. Used to compute the mix
    /// ratio: `t = 1 - remaining / total`.
    crossfade_total_samples: usize,
    /// Per-stem target gains shared with the handle (loaded from the
    /// atomics each callback).
    stem_gains: Arc<[AtomicF32; 4]>,
    master_gain: Arc<AtomicF32>,
    /// TTS playback queue: at most a few clips at a time. Each item
    /// is a fully-decoded interleaved `Vec<f32>` so the audio thread
    /// never has to invoke a codec.
    tts_rx: Receiver<Vec<f32>>,
    /// Stem swap queue. The handle sends four new stem buffers; the
    /// realtime callback drains on its next pull and starts a
    /// crossfade.
    stem_swap_rx: Receiver<[Vec<f32>; 4]>,
    /// Captured at construction so the crossfade length can be
    /// computed from [`CROSSFADE_SECONDS`]. Single-write/multi-read.
    target_rate: u32,
    target_channels: u16,
    /// Currently playing TTS clip, if any. TTS clips are decoded once
    /// into a `Vec<f32>` when they arrive — no per-sample decoder calls.
    active_tts: Option<TtsClip>,
    speaking_now: Arc<std::sync::atomic::AtomicBool>,
    /// Set true by the handle's `stop_tts()` for barge-in. The pull()
    /// realtime callback notices it, drops the active clip, and drains
    /// any queued clips so they don't resume immediately.
    stop_tts_flag: Arc<std::sync::atomic::AtomicBool>,
}

/// Duration of the crossfade between stem sets when a theme swap
/// arrives. Long enough to feel musical, short enough that the new
/// bed doesn't feel delayed.
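///
/// Worked numbers (illustrative): at a 48 kHz stereo output bus this is
/// 48_000 × 0.75 = 36_000 frames, i.e. 72_000 interleaved samples for
/// `pull` to count down.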
const CROSSFADE_SECONDS: f32 = 0.75;

/// A bed stem fully decoded and resampled to the mixer's target
/// sample rate / channel count. Played with a wrapping cursor so the
/// realtime callback never has to touch the filesystem.
struct PreloadedStem {
    samples: Vec<f32>,
    cursor: usize,
}

impl PreloadedStem {
    fn next_sample(&mut self) -> f32 {
        if self.samples.is_empty() {
            return 0.0;
        }
        let s = self.samples[self.cursor];
        self.cursor += 1;
        if self.cursor >= self.samples.len() {
            self.cursor = 0;
        }
        s
    }
}

/// A decoded TTS utterance. We store the full sample buffer up front
/// so playback is just a cursor advance.
struct TtsClip {
    samples: Vec<f32>,
    cursor: usize,
}

impl TtsClip {
    fn next_sample(&mut self) -> Option<f32> {
        if self.cursor >= self.samples.len() {
            return None;
        }
        let s = self.samples[self.cursor];
        self.cursor += 1;
        Some(s)
    }
}

impl VoiceAudioMixer {
    /// Construct a mixer + handle pair. **Decodes all four bed stems
    /// fully into memory** at this point — that's the only way to make
    /// the realtime callback truly allocation- and I/O-free, which is
    /// the precondition for VPIO's AEC reference signal to be clean
    /// enough to actually cancel the bed from the mic.
    ///
    /// `speaking_now` is the shared "Tokhn is currently playing TTS"
    /// flag. Pass an `Arc::new(AtomicBool::new(false))` you've also
    /// shared with the capture worker (via
    /// `VoiceProcessingListener::with_speaking_flag`) so that the
    /// listener can suppress mic capture during Tokhn's own playback.
    pub fn new(
        asset_root: &Path,
        theme_hint: &str,
        target_rate: u32,
        target_channels: u16,
        speaking_now: Arc<std::sync::atomic::AtomicBool>,
    ) -> Result<(Self, VoiceMixerHandle)> {
        let stems = preload_stems(asset_root, theme_hint, target_rate, target_channels)?;

        let stem_gains = Arc::new([
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
        ]);
        let master_gain = Arc::new(AtomicF32::new(1.0));
        let stop_tts_flag = Arc::new(std::sync::atomic::AtomicBool::new(false));
        let (tts_tx, tts_rx) = crossbeam_channel::unbounded::<Vec<f32>>();
        let (stem_swap_tx, stem_swap_rx) = crossbeam_channel::unbounded::<[Vec<f32>; 4]>();

        let handle = VoiceMixerHandle {
            stem_gains: Arc::clone(&stem_gains),
            master_gain: Arc::clone(&master_gain),
            tts_tx,
            stem_swap_tx,
            speaking_now: Arc::clone(&speaking_now),
            stop_tts_flag: Arc::clone(&stop_tts_flag),
            target_rate,
            target_channels,
        };

        let mixer = Self {
            stems,
            fading_out: None,
            crossfade_samples_remaining: 0,
            crossfade_total_samples: 0,
            stem_gains,
            master_gain,
            tts_rx,
            stem_swap_rx,
            target_rate,
            target_channels,
            active_tts: None,
            speaking_now,
            stop_tts_flag,
        };

        Ok((mixer, handle))
    }

    /// Construct a mixer from pre-decoded, already-interleaved stem
    /// buffers. Use this when stems come from procedural synthesis (or
    /// any other in-memory source) instead of on-disk MP3s.
    ///
    /// Each element of `stems` is an interleaved f32 buffer at
    /// `target_rate` with `target_channels` channels that will be played
    /// with a wrapping cursor. For seamless looping the caller should
    /// supply buffers whose start and end samples match (zero-crossings
    /// or an integer number of cycles of the lowest frequency).
    ///
    /// Order matches [`StemRole::ALL`]: `[Pad, Texture, Harmonic,
    /// Percussive]`.
    pub fn from_stem_samples(
        stems: [Vec<f32>; 4],
        target_rate: u32,
        target_channels: u16,
        speaking_now: Arc<std::sync::atomic::AtomicBool>,
    ) -> (Self, VoiceMixerHandle) {
        let [pad, texture, harmonic, percussive] = stems;
        let stems = [
            PreloadedStem {
                samples: pad,
                cursor: 0,
            },
            PreloadedStem {
                samples: texture,
                cursor: 0,
            },
            PreloadedStem {
                samples: harmonic,
                cursor: 0,
            },
            PreloadedStem {
                samples: percussive,
                cursor: 0,
            },
        ];

        let stem_gains = Arc::new([
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
        ]);
        let master_gain = Arc::new(AtomicF32::new(1.0));
        let stop_tts_flag = Arc::new(std::sync::atomic::AtomicBool::new(false));
        let (tts_tx, tts_rx) = crossbeam_channel::unbounded::<Vec<f32>>();
        let (stem_swap_tx, stem_swap_rx) = crossbeam_channel::unbounded::<[Vec<f32>; 4]>();

        let handle = VoiceMixerHandle {
            stem_gains: Arc::clone(&stem_gains),
            master_gain: Arc::clone(&master_gain),
            tts_tx,
            stem_swap_tx,
            speaking_now: Arc::clone(&speaking_now),
            stop_tts_flag: Arc::clone(&stop_tts_flag),
            target_rate,
            target_channels,
        };

        let mixer = Self {
            stems,
            fading_out: None,
            crossfade_samples_remaining: 0,
            crossfade_total_samples: 0,
            stem_gains,
            master_gain,
            tts_rx,
            stem_swap_rx,
            target_rate,
            target_channels,
            active_tts: None,
            speaking_now,
            stop_tts_flag,
        };

        (mixer, handle)
    }

    /// Pull interleaved samples into `out` from the realtime audio
    /// callback. **Allocation-free, I/O-free, lock-free.** Every frame
    /// of `out` gets a value — silence is `0.0`, never a missing slot.
    /// VPIO's AEC depends on this contract: gaps in the reference
    /// signal poison the echo path estimator and Tokhn's voice leaks
    /// back into the mic.
    pub fn pull(&mut self, out: &mut [f32]) {
        // Barge-in: if the handle asked us to stop, drop the active
        // clip and drain any pending ones so they don't resume the
        // moment the user stops talking.
        if self.stop_tts_flag.swap(false, Ordering::Relaxed) {
            self.active_tts = None;
            self.speaking_now.store(false, Ordering::Relaxed);
            while self.tts_rx.try_recv().is_ok() {}
        }

        // Drain the stem-swap channel. If multiple swaps are queued
        // (rare), the latest wins — earlier arrivals are silently
        // discarded, which avoids cascading crossfades. The previous
        // stem set is stashed in `fading_out` so the crossfade can mix
        // it down.
        let mut latest_swap: Option<[Vec<f32>; 4]> = None;
        while let Ok(new_stems) = self.stem_swap_rx.try_recv() {
            latest_swap = Some(new_stems);
        }
        if let Some([pad, tex, harm, perc]) = latest_swap {
            let new_stems = [
                PreloadedStem {
                    samples: pad,
                    cursor: 0,
                },
                PreloadedStem {
                    samples: tex,
                    cursor: 0,
                },
                PreloadedStem {
                    samples: harm,
                    cursor: 0,
                },
                PreloadedStem {
                    samples: perc,
                    cursor: 0,
                },
            ];
            let prior = std::mem::replace(&mut self.stems, new_stems);
            self.fading_out = Some(prior);
            let frames = (self.target_rate as f32 * CROSSFADE_SECONDS) as usize;
            self.crossfade_total_samples = frames.saturating_mul(self.target_channels as usize);
            self.crossfade_samples_remaining = self.crossfade_total_samples;
        }

        // Drain the TTS queue and start the next clip if we're idle.
        // The pop is wait-free (`try_recv`) and the popped value is
        // already decoded — no codec calls in this function.
        if self.active_tts.is_none() {
            if let Ok(samples) = self.tts_rx.try_recv() {
                self.active_tts = Some(TtsClip { samples, cursor: 0 });
                self.speaking_now.store(true, Ordering::Relaxed);
            }
        }

        let master = self.master_gain.load();
        let gains = [
            self.stem_gains[0].load(),
            self.stem_gains[1].load(),
            self.stem_gains[2].load(),
            self.stem_gains[3].load(),
        ];

        for slot in out.iter_mut() {
            // Always advance every stem cursor so the layers stay
            // phase-locked even when one is muted.
            let s0 = self.stems[0].next_sample();
            let s1 = self.stems[1].next_sample();
            let s2 = self.stems[2].next_sample();
            let s3 = self.stems[3].next_sample();
            let new_mix = s0 * gains[0] + s1 * gains[1] + s2 * gains[2] + s3 * gains[3];

            // Crossfade with the outgoing stems if a swap is in
            // progress. `t` ramps 0 → 1 over the full crossfade: old
            // at full for the first sample, new at full by the last.
            let bed_mix = if let Some(out_stems) = self.fading_out.as_mut() {
                let o0 = out_stems[0].next_sample();
                let o1 = out_stems[1].next_sample();
                let o2 = out_stems[2].next_sample();
                let o3 = out_stems[3].next_sample();
                let old_mix = o0 * gains[0] + o1 * gains[1] + o2 * gains[2] + o3 * gains[3];

                let t = if self.crossfade_total_samples > 0 {
                    1.0 - (self.crossfade_samples_remaining as f32
                        / self.crossfade_total_samples as f32)
                } else {
                    1.0
                };
                let t = t.clamp(0.0, 1.0);

                if self.crossfade_samples_remaining > 0 {
                    self.crossfade_samples_remaining -= 1;
                }
                if self.crossfade_samples_remaining == 0 {
                    self.fading_out = None;
                    self.crossfade_total_samples = 0;
                }
                (1.0 - t) * old_mix + t * new_mix
            } else {
                new_mix
            };

            let mut sample = bed_mix;

            // Layer TTS on top.
            if let Some(tts) = self.active_tts.as_mut() {
                if let Some(t) = tts.next_sample() {
                    sample += t;
                } else {
                    self.active_tts = None;
                    self.speaking_now.store(false, Ordering::Relaxed);
                }
            }

            *slot = (sample * master).clamp(-1.0, 1.0);
        }
    }
}
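
#[cfg(test)]
mod mixer_tests {
    use super::*;
    use std::sync::atomic::{AtomicBool, Ordering};

    // End-to-end check of the realtime contract on tiny buffers: every
    // slot of `out` is written, stem gains scale the bed, a queued TTS
    // clip layers on top, and the speaking flag clears once the clip is
    // exhausted. The clip is pushed straight down the module-private
    // sample channel to avoid needing real MP3 bytes in a unit test.
    #[test]
    fn pull_mixes_bed_and_tts() {
        let speaking = Arc::new(AtomicBool::new(false));
        let stems = [vec![0.5f32; 8], vec![0.0; 8], vec![0.0; 8], vec![0.0; 8]];
        let (mut mixer, handle) =
            VoiceAudioMixer::from_stem_samples(stems, 8, 1, Arc::clone(&speaking));

        handle.set_stem_gain(StemRole::Pad, 1.0);
        let mut out = [0.0f32; 4];
        mixer.pull(&mut out);
        assert!(out.iter().all(|s| (s - 0.5).abs() < 1e-6));

        // Two samples of "speech" at +0.25 layered over the 0.5 bed.
        handle.tts_tx.send(vec![0.25f32; 2]).unwrap();
        mixer.pull(&mut out);
        assert!((out[0] - 0.75).abs() < 1e-6);
        assert!((out[1] - 0.75).abs() < 1e-6);
        assert!((out[2] - 0.5).abs() < 1e-6);
        assert!((out[3] - 0.5).abs() < 1e-6);
        assert!(!speaking.load(Ordering::Relaxed));
    }
}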

// ─────────────────────────────────────────────────────────────────────
// Stem preloading
// ─────────────────────────────────────────────────────────────────────

/// Decode + resample all four bed stems into memory at startup. Each
/// returned [`PreloadedStem`] owns a `Vec<f32>` of interleaved samples
/// at the mixer's target rate / channel count, so the realtime callback
/// can play them with nothing more than a cursor advance.
///
/// The decoded buffers are then **seam-stitched** so the wrap from the
/// last sample to the first is silent. The MP3 stem files weren't
/// authored as seamless loops — without this stitch step the cursor
/// produces an audible click every loop.
fn preload_stems(
    asset_root: &Path,
    theme_hint: &str,
    target_rate: u32,
    target_channels: u16,
) -> Result<[PreloadedStem; 4]> {
    let stems_dir = asset_root
        .join("audio")
        .join("themes")
        .join(theme_hint)
        .join("stems");
    let mut stems: Vec<PreloadedStem> = Vec::with_capacity(4);
    for role in StemRole::ALL {
        let path = stems_dir.join(role.file_name());
        let mut samples = decode_to_memory(&path, target_rate, target_channels)?;
        stitch_loop_seam(&mut samples, target_rate, target_channels);
        tracing::info!(
            "[mixer] preloaded stem {} ({} samples, {:.1}s @ {} Hz {} ch)",
            role.file_name(),
            samples.len(),
            samples.len() as f32 / (target_rate as f32 * target_channels as f32),
            target_rate,
            target_channels,
        );
        stems.push(PreloadedStem { samples, cursor: 0 });
    }
    let [pad, texture, harmonic, percussive]: [PreloadedStem; 4] = stems
        .try_into()
        .map_err(|_| VoiceError::Device("expected exactly 4 stems".into()))?;
    Ok([pad, texture, harmonic, percussive])
}

/// Make a buffer loop seamlessly by crossfading the last ~30 ms with
/// the first ~30 ms in place. The pass is a single tail → head
/// crossfade split across the wrap point:
///
/// - The last `fade_len` frames ramp from pure tail material to an even
///   tail/head mix (so the trailing edge slopes toward the head).
/// - The first `fade_len` frames continue that ramp from the even mix
///   to pure head material (so the leading edge slopes away from the
///   tail).
///
/// The wrap-around from `samples[len-1]` back to `samples[0]` lands in
/// the middle of the crossfade instead of on a discontinuity, so the
/// cursor's wrap no longer clicks.
///
/// One-shot at preload time — never called from the realtime thread.
fn stitch_loop_seam(samples: &mut [f32], target_rate: u32, target_channels: u16) {
    if samples.is_empty() {
        return;
    }
    let channels = target_channels.max(1) as usize;
    let frames = samples.len() / channels;
    // 30 ms of crossfade — long enough to hide the seam, short enough
    // to go unnoticed against the rest of the loop.
    let fade_frames = ((target_rate as usize * 30) / 1000).min(frames / 4);
    if fade_frames < 8 {
        // Buffer too short for a meaningful crossfade.
        return;
    }

    // Snapshot the original head + tail before we mutate either.
    let head: Vec<f32> = samples[..fade_frames * channels].to_vec();
    let tail_start = (frames - fade_frames) * channels;
    let tail: Vec<f32> = samples[tail_start..].to_vec();

    // Tail half of the blend: ramp from pure tail to an even tail/head
    // mix. Raised-cosine weights (cos² + sin² = 1) keep the summed gain
    // constant across the fade.
    for f in 0..fade_frames {
        // t goes 0 → 1 across the fade window; theta covers the first
        // quarter turn (0 → π/4) of the full crossfade.
        let t = f as f32 / (fade_frames - 1).max(1) as f32;
        let theta = t * std::f32::consts::FRAC_PI_4;
        let w_tail = theta.cos().powi(2); // 1.0 → 0.5
        let w_head = theta.sin().powi(2); // 0.0 → 0.5
        for c in 0..channels {
            let src = tail[f * channels + c] * w_tail + head[f * channels + c] * w_head;
            samples[(frames - fade_frames + f) * channels + c] = src;
        }
    }

    // Head half: pick up at the even mix the tail ended on (θ = π/4)
    // and finish the ramp to pure head, so the buffer's first samples
    // continue what the just-computed tail ends on.
    for f in 0..fade_frames {
        let t = f as f32 / (fade_frames - 1).max(1) as f32;
        let theta = (1.0 + t) * std::f32::consts::FRAC_PI_4;
        let w_tail = theta.cos().powi(2); // 0.5 → 0.0
        let w_head = theta.sin().powi(2); // 0.5 → 1.0
        for c in 0..channels {
            let src = head[f * channels + c] * w_head + tail[f * channels + c] * w_tail;
            samples[f * channels + c] = src;
        }
    }
}

/// Decode an MP3 file fully into a `Vec<f32>` of interleaved samples
/// at the requested rate / channel count. Used at startup only — never
/// called from the audio thread.
///
/// Avoids `rodio::source::UniformSourceIterator` because rodio 0.20.1's
/// `ChannelCountConverter` panics with `attempt to subtract with overflow`
/// for some MP3 channel layouts. We do channel + sample-rate conversion
/// ourselves with simple, panic-free code: linear interpolation for
/// resampling, average-and-duplicate for channel adaptation.
fn decode_to_memory(path: &Path, target_rate: u32, target_channels: u16) -> Result<Vec<f32>> {
    let file = File::open(path)
        .map_err(|e| VoiceError::Device(format!("open stem {}: {e}", path.display())))?;
    let decoder = Decoder::new(BufReader::new(file))
        .map_err(|e| VoiceError::Device(format!("decode stem {}: {e}", path.display())))?;
    let samples = decode_source_to_interleaved(decoder, target_rate, target_channels);
    if samples.is_empty() {
        return Err(VoiceError::Device(format!(
            "stem {} decoded to zero samples",
            path.display()
        )));
    }
    Ok(samples)
}

/// Convert any rodio `Source` (with `i16` samples — the rodio default)
/// into a `Vec<f32>` of interleaved samples at the target rate and
/// channel count, using a hand-rolled converter that doesn't go through
/// `UniformSourceIterator`.
///
/// Steps:
/// 1. Read source rate + channels.
/// 2. Drain the decoder into per-channel `Vec<f32>` deinterleaved.
/// 3. Resample each channel to `target_rate` via linear interpolation.
/// 4. Map source channels to target channels:
///    - `1 → N`: duplicate the mono channel into all N targets.
///    - `N → 1`: average all source channels.
///    - `N → M (N == M)`: passthrough.
///    - other: take the first `min(N, M)` and zero-pad the rest.
/// 5. Re-interleave.
fn decode_source_to_interleaved<S>(source: S, target_rate: u32, target_channels: u16) -> Vec<f32>
where
    S: Source<Item = i16>,
{
    let src_channels = source.channels().max(1) as usize;
    let src_rate = source.sample_rate().max(1);
    let target_channels = target_channels.max(1) as usize;

    // 1. Drain into deinterleaved per-channel buffers.
    let mut channel_data: Vec<Vec<f32>> = vec![Vec::new(); src_channels];
    let mut idx = 0usize;
    for sample in source {
        let f = (sample as f32) / (i16::MAX as f32);
        channel_data[idx % src_channels].push(f);
        idx += 1;
    }

    if channel_data.iter().all(|c| c.is_empty()) {
        return Vec::new();
    }

    // 2. Resample each channel to the target rate.
    let resampled: Vec<Vec<f32>> = channel_data
        .into_iter()
        .map(|c| linear_resample(&c, src_rate, target_rate))
        .collect();

    // 3. Adapt channel count.
    let target_per_channel = resampled[0].len();
    let target_channels_data: Vec<Vec<f32>> = (0..target_channels)
        .map(|i| {
            if src_channels == 1 {
                // Mono → N: duplicate the only channel into every output.
                resampled[0].clone()
            } else if target_channels == 1 {
                // N → mono: average across source channels.
                let mut mono = vec![0.0f32; target_per_channel];
                for c in resampled.iter() {
                    for (j, s) in c.iter().enumerate() {
                        mono[j] += *s;
                    }
                }
                let inv = 1.0 / src_channels as f32;
                for s in mono.iter_mut() {
                    *s *= inv;
                }
                mono
            } else if i < src_channels {
                // Passthrough for matching index.
                resampled[i].clone()
            } else {
                // Extra target channels: zero-pad.
                vec![0.0; target_per_channel]
            }
        })
        .collect();

    // 4. Re-interleave. Use the shortest channel so a decoder that ends
    //    on a partial frame can't push indexing past a shorter buffer.
    let frames = target_channels_data
        .iter()
        .map(Vec::len)
        .min()
        .unwrap_or(0);
    let total = frames * target_channels;
    let mut out = Vec::with_capacity(total);
    for f in 0..frames {
        for ch in 0..target_channels {
            out.push(target_channels_data[ch][f]);
        }
    }
    out
}
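
#[cfg(test)]
mod interleave_tests {
    use super::decode_source_to_interleaved;
    use rodio::buffer::SamplesBuffer;

    // Channel-mapping rule 1 → N from the doc comment above, driven
    // through rodio's in-memory `SamplesBuffer` so no file or codec is
    // involved. Rates match, so resampling is a passthrough.
    #[test]
    fn mono_to_stereo_duplicates_the_channel() {
        let src = SamplesBuffer::new(1, 48_000, vec![i16::MAX, 0, i16::MAX / 2]);
        let out = decode_source_to_interleaved(src, 48_000, 2);
        assert_eq!(out.len(), 6);
        for frame in out.chunks(2) {
            assert_eq!(frame[0], frame[1]); // both channels carry the mono signal
        }
        assert!((out[0] - 1.0).abs() < 1e-6);
    }
}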

#[cfg(test)]
mod stitch_tests {
    use super::stitch_loop_seam;

    #[test]
    fn empty_buffer_is_a_noop() {
        let mut buf: Vec<f32> = vec![];
        stitch_loop_seam(&mut buf, 44_100, 2);
        assert!(buf.is_empty());
    }

    #[test]
    fn short_buffer_skips_crossfade_safely() {
        // Far too short for the minimum 8-frame fade window — the
        // function should bail out without panicking and leave the
        // buffer alone.
        let mut buf: Vec<f32> = vec![0.5; 16];
        let original = buf.clone();
        stitch_loop_seam(&mut buf, 44_100, 2);
        assert_eq!(buf, original);
    }

    #[test]
    fn seam_value_matches_after_stitch() {
        // Mono sawtooth ramp: the tail value differs sharply from the
        // head value before stitching. After stitching, the LAST frame
        // should land close to the FIRST frame so the wrap is quiet.
        let frames = 44_100; // 1s at 44.1k mono
        let channels = 1usize;
        let mut buf: Vec<f32> = (0..frames)
            .map(|i| (i as f32 / frames as f32) - 0.5) // -0.5 → +0.5 ramp
            .collect();
        // Pre-stitch: last sample is ~+0.5, first is -0.5. Big jump.
        let pre_jump = (buf[frames - 1] - buf[0]).abs();
        assert!(pre_jump > 0.9);
        stitch_loop_seam(&mut buf, 44_100, channels as u16);
        // Post-stitch: last sample should be very close to first.
        let post_jump = (buf[frames - 1] - buf[0]).abs();
        assert!(
            post_jump < pre_jump,
            "stitch should reduce wrap discontinuity (was {pre_jump}, now {post_jump})"
        );
    }
}

#[cfg(test)]
mod resample_tests {
    use super::linear_resample;

    #[test]
    fn passthrough_when_rates_match() {
        let s = vec![0.1, 0.2, 0.3, 0.4];
        let out = linear_resample(&s, 44100, 44100);
        assert_eq!(out, s);
    }

    #[test]
    fn upsample_doubles_length_approx() {
        let s = vec![0.0, 1.0, 0.0, 1.0];
        let out = linear_resample(&s, 22050, 44100);
        assert_eq!(out.len(), 8);
    }

    #[test]
    fn downsample_halves_length_approx() {
        let s = vec![0.0, 0.25, 0.5, 0.75, 1.0, 0.75, 0.5, 0.25];
        let out = linear_resample(&s, 44100, 22050);
        assert_eq!(out.len(), 4);
    }

    #[test]
    fn empty_input_yields_empty_output() {
        assert!(linear_resample(&[], 44100, 22050).is_empty());
    }
}

/// Linear-interpolate `samples` (one channel, at `src_rate` Hz) to
/// `target_rate` Hz. Returns a new `Vec<f32>`. Pass-through for the
/// equal-rate case so resampling has zero cost when the bed already
/// matches VPIO's negotiated output rate.
fn linear_resample(samples: &[f32], src_rate: u32, target_rate: u32) -> Vec<f32> {
    if samples.is_empty() {
        return Vec::new();
    }
    if src_rate == target_rate {
        return samples.to_vec();
    }
    let ratio = src_rate as f64 / target_rate as f64;
    let out_len = ((samples.len() as f64) / ratio).floor() as usize;
    let mut out = Vec::with_capacity(out_len);
    for i in 0..out_len {
        let src_pos = (i as f64) * ratio;
        let lo = src_pos.floor() as usize;
        let hi = (lo + 1).min(samples.len() - 1);
        let t = (src_pos - lo as f64) as f32;
        let s = samples[lo] * (1.0 - t) + samples[hi] * t;
        out.push(s);
    }
    out
}