car-voice 0.14.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
//! Software audio mixer that feeds VPIO's output bus.
//!
//! For VPIO's hardware AEC to actually cancel Tokhn's bed and TTS from
//! the user's mic, those audio sources have to flow *through* VPIO.
//! Bevy's default audio system uses cpal/rodio on a separate output
//! device, so VPIO has no idea what's playing and can't reference it
//! for cancellation. This module is the bridge: it owns rodio decoders
//! for the bed stems and the TTS clip queue, mixes them in software,
//! and exposes a `pull` method that VPIO's realtime output render
//! callback calls each tick.
//!
//! Threading model:
//!
//! - **Bevy main thread** writes per-stem target volumes through atomic
//!   floats (no locks) and pushes new TTS clips into a crossbeam channel
//!   (MPMC, used here as SPSC).
//! - **VPIO realtime audio thread** holds the only `&mut VoiceAudioMixer`
//!   and calls `pull(out)` once per render cycle. The atomic reads are
//!   wait-free; the TTS channel pop is `try_recv`, which never blocks.
//!
//! Sample format: f32 samples; the target sample rate is whatever VPIO's
//! output bus chose (we query it at init time). All sources are
//! resampled / channel-adapted to that target by the hand-rolled
//! converter in `decode_source_to_interleaved` (see its doc comment for
//! why rodio's `UniformSourceIterator` is avoided).
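//!
//! A wiring sketch (hypothetical names: `asset_root`, `output_bus`, and
//! the `"calm"` theme stand in for the caller's own wiring — none of
//! them are part of this module's API):
//!
//! ```ignore
//! use std::sync::{atomic::AtomicBool, Arc};
//!
//! let speaking = Arc::new(AtomicBool::new(false));
//! let (mut mixer, handle) =
//!     VoiceAudioMixer::new(&asset_root, "calm", 48_000, 2, Arc::clone(&speaking))?;
//!
//! // Audio thread: VPIO's output render callback owns the mixer and
//! // pulls interleaved f32 samples from it each cycle.
//! output_bus.set_render_callback(move |out: &mut [f32]| mixer.pull(out));
//!
//! // Bevy side: the handle is a cheap-to-clone resource.
//! handle.set_stem_gain(StemRole::Pad, 0.8);
//! handle.set_master_gain(1.0);
//! ```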

use std::fs::File;
use std::io::{BufReader, Cursor};
use std::path::Path;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;

use crossbeam_channel::{Receiver, Sender};
use rodio::{Decoder, Source};

use crate::{Result, VoiceError};

/// Roles match `app-bevy/src/audio_bed.rs` exactly so the same target
/// gain math drives both layers during the migration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StemRole {
    Pad,
    Texture,
    Harmonic,
    Percussive,
}

impl StemRole {
    pub const ALL: [StemRole; 4] = [
        StemRole::Pad,
        StemRole::Texture,
        StemRole::Harmonic,
        StemRole::Percussive,
    ];

    pub const fn file_name(self) -> &'static str {
        match self {
            Self::Pad => "pad.mp3",
            Self::Texture => "texture.mp3",
            Self::Harmonic => "harmonic.mp3",
            Self::Percussive => "percussive.mp3",
        }
    }

    fn index(self) -> usize {
        match self {
            Self::Pad => 0,
            Self::Texture => 1,
            Self::Harmonic => 2,
            Self::Percussive => 3,
        }
    }
}

/// Lock-free atomic f32 backed by `AtomicU32`.
#[derive(Default)]
pub struct AtomicF32(AtomicU32);

impl AtomicF32 {
    pub fn new(v: f32) -> Self {
        Self(AtomicU32::new(v.to_bits()))
    }
    pub fn load(&self) -> f32 {
        f32::from_bits(self.0.load(Ordering::Relaxed))
    }
    pub fn store(&self, v: f32) {
        self.0.store(v.to_bits(), Ordering::Relaxed);
    }
}
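
#[cfg(test)]
mod atomic_f32_tests {
    use super::AtomicF32;

    // Sanity check for the bit-cast storage: values survive a
    // store/load round trip, including negatives and the default zero.
    #[test]
    fn store_load_round_trips() {
        let a = AtomicF32::new(0.25);
        assert_eq!(a.load(), 0.25);
        a.store(-1.5e-3);
        assert_eq!(a.load(), -1.5e-3);
        a.store(0.0);
        assert_eq!(a.load(), 0.0);
    }
}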

/// Shared handle the Bevy world uses to *talk to* the mixer running on
/// the audio thread. Cheap to clone (Arc inside).
#[derive(Clone)]
pub struct VoiceMixerHandle {
    /// Per-stem target volumes (0..1+). Read by the realtime mixer
    /// every callback; written by Bevy systems whenever the mood/arc
    /// state changes.
    stem_gains: Arc<[AtomicF32; 4]>,
    /// Master scale that the mixer multiplies the entire output by.
    /// Used for global mute / fade.
    master_gain: Arc<AtomicF32>,
    /// Channel for decoded TTS samples. The handle decodes incoming
    /// MP3 bytes on the *caller's* thread (typically a tokio task
    /// after the ElevenLabs HTTP call returns) and sends interleaved
    /// f32 samples. The audio thread just walks the buffer with a
    /// cursor — no decoder calls in the realtime path.
    tts_tx: Sender<Vec<f32>>,
    /// Channel for queuing a full-set stem swap. The handle sends four
    /// pre-decoded interleaved f32 buffers (one per role) and the
    /// realtime callback installs them with a gain-envelope crossfade
    /// from the currently-playing stems. Used for theme changes.
    stem_swap_tx: Sender<[Vec<f32>; 4]>,
    /// True while the audio thread has at least one TTS clip queued
    /// or playing — Bevy reads this for the mic indicator.
    speaking_now: Arc<std::sync::atomic::AtomicBool>,
    /// Set true to ask the realtime callback to drop the active TTS
    /// clip + drain the queue on its next pull. Used for barge-in.
    stop_tts_flag: Arc<std::sync::atomic::AtomicBool>,
    /// Captured at construction so `queue_tts` knows what to resample
    /// to. Single-write/multi-read so plain copies are fine.
    target_rate: u32,
    target_channels: u16,
}

impl VoiceMixerHandle {
    /// Update one stem's target gain. Wait-free.
    pub fn set_stem_gain(&self, role: StemRole, gain: f32) {
        self.stem_gains[role.index()].store(gain);
    }

    /// Update the master scale.
    pub fn set_master_gain(&self, gain: f32) {
        self.master_gain.store(gain);
    }

    /// Queue a TTS clip for playback over the bed. Decodes the MP3
    /// bytes on the *current* thread (NOT the audio thread) so the
    /// realtime callback never has to allocate or run a codec. The
    /// caller is the tokio task that just downloaded the audio from
    /// ElevenLabs — adding ~10ms of decode there is fine.
    pub fn queue_tts(&self, bytes: Vec<u8>) {
        let cursor = Cursor::new(bytes);
        let decoder = match Decoder::new(cursor) {
            Ok(d) => d,
            Err(e) => {
                tracing::warn!("[mixer] queue_tts decode failed: {e}");
                return;
            }
        };
        let samples = decode_source_to_interleaved(decoder, self.target_rate, self.target_channels);
        if samples.is_empty() {
            tracing::warn!("[mixer] queue_tts produced zero samples");
            return;
        }
        if self.tts_tx.send(samples).is_err() {
            tracing::warn!("[mixer] queue_tts send failed (mixer gone)");
        }
    }

    /// Is the mixer currently playing a TTS clip?
    pub fn is_speaking(&self) -> bool {
        self.speaking_now.load(Ordering::Relaxed)
    }

    /// Clone the underlying `Arc<AtomicBool>` so other components
    /// (notably the VPIO capture worker thread) can poll the flag
    /// directly without holding a reference to the whole handle.
    /// Used to detect TTS playback for barge-in handling.
    pub fn speaking_flag(&self) -> Arc<std::sync::atomic::AtomicBool> {
        Arc::clone(&self.speaking_now)
    }

    /// Hand four new stem buffers to the realtime mixer. The callback
    /// crossfades from the currently-playing stems to the new ones over
    /// roughly three-quarters of a second. Buffers are in the same
    /// interleaved f32 format and sample rate as the ones passed to
    /// [`VoiceAudioMixer::from_stem_samples`], in role order
    /// `[Pad, Texture, Harmonic, Percussive]`.
    ///
    /// Non-blocking on the audio thread: the pop there is `try_recv`.
    pub fn swap_stems(&self, stems: [Vec<f32>; 4]) {
        if self.stem_swap_tx.send(stems).is_err() {
            tracing::warn!("[mixer] swap_stems send failed (mixer gone)");
        }
    }

    /// Target sample rate the mixer expects for queued audio.
    pub fn target_rate(&self) -> u32 {
        self.target_rate
    }

    /// Target channel count the mixer expects for queued audio.
    pub fn target_channels(&self) -> u16 {
        self.target_channels
    }

    /// Halt the active TTS clip immediately. Used by the barge-in
    /// path: when the user starts speaking while Tokhn is talking,
    /// Tokhn shuts up so the user can be heard. Sets a flag the
    /// realtime callback checks each pull; the actual clip drop +
    /// queue drain happens on the audio thread (so the channel
    /// receiver isn't touched from outside its owner). Wait-free.
    pub fn stop_tts(&self) {
        self.stop_tts_flag
            .store(true, std::sync::atomic::Ordering::Relaxed);
        // Best-effort eager update so any external "is_speaking"
        // reader sees the new state right away. The realtime
        // callback re-confirms it.
        self.speaking_now
            .store(false, std::sync::atomic::Ordering::Relaxed);
    }
}
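
// Barge-in, as seen from the capture side. A sketch with a hypothetical
// VAD hook — the actual capture integration lives elsewhere:
//
//     if vad_detected_user_speech && handle.is_speaking() {
//         // The realtime callback drops the active clip and drains the
//         // queue on its next pull; this call itself never blocks.
//         handle.stop_tts();
//     }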

/// The mixer itself, owned by the audio thread. Bevy never touches
/// this directly — it goes through `VoiceMixerHandle`.
///
/// **Realtime safety**: every field is allocated *before* the audio
/// thread starts pulling samples. The hot path (`pull`) does no
/// I/O, no allocation, no locking. This is critical for VPIO's AEC:
/// any stall in the output render callback corrupts the reference
/// signal and the echo path estimator never converges, so Tokhn's
/// own voice leaks back into the mic.
pub struct VoiceAudioMixer {
    /// Four bed stems, fully decoded + resampled into memory at init.
    /// The audio thread plays them with a wrapping cursor — no file
    /// I/O ever happens during playback.
    stems: [PreloadedStem; 4],
    /// Previous stem set retained during a theme-change crossfade. The
    /// callback mixes old and new by a linear ramp driven by
    /// [`crossfade_samples_remaining`]; when the ramp reaches zero
    /// this is cleared and the old buffers drop on the audio thread.
    /// (Dropping a `Vec` does call the deallocator, but it happens at
    /// most once per theme change, on the frame the fade ends — an
    /// accepted, amortized exception to the no-allocator rule.)
    fading_out: Option<[PreloadedStem; 4]>,
    /// Frames (output samples) remaining in the active crossfade. When
    /// `fading_out` is `Some`, this counts down from
    /// [`crossfade_total_samples`] to zero.
    crossfade_samples_remaining: usize,
    /// Total frames in the crossfade ramp. Used to compute the mix
    /// ratio: `t = 1 - remaining / total`.
    crossfade_total_samples: usize,
    /// Per-stem target gains shared with the handle (loaded from the
    /// atomics each callback).
    stem_gains: Arc<[AtomicF32; 4]>,
    master_gain: Arc<AtomicF32>,
    /// TTS playback queue: at most a few clips at a time. Each item
    /// is a fully-decoded interleaved `Vec<f32>` so the audio thread
    /// never has to invoke a codec.
    tts_rx: Receiver<Vec<f32>>,
    /// Stem swap queue. The handle sends four new stem buffers; the
    /// realtime callback drains on its next pull and starts a
    /// crossfade.
    stem_swap_rx: Receiver<[Vec<f32>; 4]>,
    /// Captured at construction so the crossfade length can be
    /// computed from [`CROSSFADE_SECONDS`]. Single-write/multi-read.
    target_rate: u32,
    target_channels: u16,
    /// Currently playing TTS clip, if any. TTS clips are decoded once
    /// into a `Vec<f32>` when they arrive — no per-sample decoder calls.
    active_tts: Option<TtsClip>,
    speaking_now: Arc<std::sync::atomic::AtomicBool>,
    /// Set true by the handle's `stop_tts()` for barge-in. The pull()
    /// realtime callback notices it, drops the active clip, and drains
    /// any queued clips so they don't resume immediately.
    stop_tts_flag: Arc<std::sync::atomic::AtomicBool>,
}

/// Duration of the crossfade between stem sets when a theme swap
/// arrives. Long enough to feel musical, short enough that the new
/// bed doesn't feel delayed.
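///
/// Worked numbers (illustrative): at a 48 kHz stereo output bus this is
/// 48_000 × 0.75 = 36_000 frames, i.e. 72_000 interleaved samples for
/// `pull` to count down.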
const CROSSFADE_SECONDS: f32 = 0.75;

/// A bed stem fully decoded and resampled to the mixer's target
/// sample rate / channel count. Played with a wrapping cursor so the
/// realtime callback never has to touch the filesystem.
struct PreloadedStem {
    samples: Vec<f32>,
    cursor: usize,
}

impl PreloadedStem {
    fn next_sample(&mut self) -> f32 {
        if self.samples.is_empty() {
            return 0.0;
        }
        let s = self.samples[self.cursor];
        self.cursor += 1;
        if self.cursor >= self.samples.len() {
            self.cursor = 0;
        }
        s
    }
}

/// A decoded TTS utterance. We store the full sample buffer up front
/// so playback is just a cursor advance.
struct TtsClip {
    samples: Vec<f32>,
    cursor: usize,
}

impl TtsClip {
    fn next_sample(&mut self) -> Option<f32> {
        if self.cursor >= self.samples.len() {
            return None;
        }
        let s = self.samples[self.cursor];
        self.cursor += 1;
        Some(s)
    }
}

impl VoiceAudioMixer {
    /// Construct a mixer + handle pair. **Decodes all four bed stems
    /// fully into memory** at this point — that's the only way to make
    /// the realtime callback truly allocation- and I/O-free, which is
    /// the precondition for VPIO's AEC reference signal to be clean
    /// enough to actually cancel the bed from the mic.
    ///
    /// `speaking_now` is the shared "Tokhn is currently playing TTS"
    /// flag. Pass an `Arc::new(AtomicBool::new(false))` you've also
    /// shared with the capture worker (via
    /// `VoiceProcessingListener::with_speaking_flag`) so that the
    /// listener can suppress mic capture during Tokhn's own playback.
    pub fn new(
        asset_root: &Path,
        theme_hint: &str,
        target_rate: u32,
        target_channels: u16,
        speaking_now: Arc<std::sync::atomic::AtomicBool>,
    ) -> Result<(Self, VoiceMixerHandle)> {
        let stems = preload_stems(asset_root, theme_hint, target_rate, target_channels)?;

        let stem_gains = Arc::new([
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
        ]);
        let master_gain = Arc::new(AtomicF32::new(1.0));
        let stop_tts_flag = Arc::new(std::sync::atomic::AtomicBool::new(false));
        let (tts_tx, tts_rx) = crossbeam_channel::unbounded::<Vec<f32>>();
        let (stem_swap_tx, stem_swap_rx) = crossbeam_channel::unbounded::<[Vec<f32>; 4]>();

        let handle = VoiceMixerHandle {
            stem_gains: Arc::clone(&stem_gains),
            master_gain: Arc::clone(&master_gain),
            tts_tx,
            stem_swap_tx,
            speaking_now: Arc::clone(&speaking_now),
            stop_tts_flag: Arc::clone(&stop_tts_flag),
            target_rate,
            target_channels,
        };

        let mixer = Self {
            stems,
            fading_out: None,
            crossfade_samples_remaining: 0,
            crossfade_total_samples: 0,
            stem_gains,
            master_gain,
            tts_rx,
            stem_swap_rx,
            target_rate,
            target_channels,
            active_tts: None,
            speaking_now,
            stop_tts_flag,
        };

        Ok((mixer, handle))
    }

    /// Construct a mixer from pre-decoded, already-interleaved stem
    /// buffers. Use this when stems come from procedural synthesis (or
    /// any other in-memory source) instead of on-disk MP3s.
    ///
    /// Each element of `stems` is an interleaved f32 buffer at
    /// `target_rate` with `target_channels` channels that will be played
    /// with a wrapping cursor. For seamless looping the caller should
    /// supply buffers whose start and end samples match (zero-crossings
    /// or an integer number of cycles of the lowest frequency).
    ///
    /// Order matches [`StemRole::ALL`]: `[Pad, Texture, Harmonic,
    /// Percussive]`.
    pub fn from_stem_samples(
        stems: [Vec<f32>; 4],
        target_rate: u32,
        target_channels: u16,
        speaking_now: Arc<std::sync::atomic::AtomicBool>,
    ) -> (Self, VoiceMixerHandle) {
        let [pad, texture, harmonic, percussive] = stems;
        let stems = [
            PreloadedStem {
                samples: pad,
                cursor: 0,
            },
            PreloadedStem {
                samples: texture,
                cursor: 0,
            },
            PreloadedStem {
                samples: harmonic,
                cursor: 0,
            },
            PreloadedStem {
                samples: percussive,
                cursor: 0,
            },
        ];

        let stem_gains = Arc::new([
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
            AtomicF32::new(0.0),
        ]);
        let master_gain = Arc::new(AtomicF32::new(1.0));
        let stop_tts_flag = Arc::new(std::sync::atomic::AtomicBool::new(false));
        let (tts_tx, tts_rx) = crossbeam_channel::unbounded::<Vec<f32>>();
        let (stem_swap_tx, stem_swap_rx) = crossbeam_channel::unbounded::<[Vec<f32>; 4]>();

        let handle = VoiceMixerHandle {
            stem_gains: Arc::clone(&stem_gains),
            master_gain: Arc::clone(&master_gain),
            tts_tx,
            stem_swap_tx,
            speaking_now: Arc::clone(&speaking_now),
            stop_tts_flag: Arc::clone(&stop_tts_flag),
            target_rate,
            target_channels,
        };

        let mixer = Self {
            stems,
            fading_out: None,
            crossfade_samples_remaining: 0,
            crossfade_total_samples: 0,
            stem_gains,
            master_gain,
            tts_rx,
            stem_swap_rx,
            target_rate,
            target_channels,
            active_tts: None,
            speaking_now,
            stop_tts_flag,
        };

        (mixer, handle)
    }

    /// Pull interleaved samples into `out` from the realtime audio
    /// callback. **Allocation-free, I/O-free, lock-free.** Every frame
    /// of `out` gets a value — silence is `0.0`, never a missing slot.
    /// VPIO's AEC depends on this contract: gaps in the reference
    /// signal poison the echo path estimator and Tokhn's voice leaks
    /// back into the mic.
    pub fn pull(&mut self, out: &mut [f32]) {
        // Barge-in: if the handle asked us to stop, drop the active
        // clip and drain any pending ones so they don't resume the
        // moment the user stops talking.
        if self.stop_tts_flag.swap(false, Ordering::Relaxed) {
            self.active_tts = None;
            self.speaking_now.store(false, Ordering::Relaxed);
            while self.tts_rx.try_recv().is_ok() {}
        }

        // Drain the stem-swap channel. If multiple swaps are queued
        // (rare), the latest wins — earlier arrivals are silently
        // discarded, which avoids cascading crossfades. The previous
        // stem set is stashed in `fading_out` so the crossfade can mix
        // it down.
        let mut latest_swap: Option<[Vec<f32>; 4]> = None;
        while let Ok(new_stems) = self.stem_swap_rx.try_recv() {
            latest_swap = Some(new_stems);
        }
        if let Some([pad, tex, harm, perc]) = latest_swap {
            let new_stems = [
                PreloadedStem {
                    samples: pad,
                    cursor: 0,
                },
                PreloadedStem {
                    samples: tex,
                    cursor: 0,
                },
                PreloadedStem {
                    samples: harm,
                    cursor: 0,
                },
                PreloadedStem {
                    samples: perc,
                    cursor: 0,
                },
            ];
            let prior = std::mem::replace(&mut self.stems, new_stems);
            self.fading_out = Some(prior);
            let frames = (self.target_rate as f32 * CROSSFADE_SECONDS) as usize;
            self.crossfade_total_samples = frames.saturating_mul(self.target_channels as usize);
            self.crossfade_samples_remaining = self.crossfade_total_samples;
        }

        // Drain the TTS queue and start the next clip if we're idle.
        // The pop is wait-free (`try_recv`) and the popped value is
        // already decoded — no codec calls in this function.
        if self.active_tts.is_none() {
            if let Ok(samples) = self.tts_rx.try_recv() {
                self.active_tts = Some(TtsClip { samples, cursor: 0 });
                self.speaking_now.store(true, Ordering::Relaxed);
            }
        }

        let master = self.master_gain.load();
        let gains = [
            self.stem_gains[0].load(),
            self.stem_gains[1].load(),
            self.stem_gains[2].load(),
            self.stem_gains[3].load(),
        ];

        for slot in out.iter_mut() {
            // Always advance every stem cursor so the layers stay
            // phase-locked even when one is muted.
            let s0 = self.stems[0].next_sample();
            let s1 = self.stems[1].next_sample();
            let s2 = self.stems[2].next_sample();
            let s3 = self.stems[3].next_sample();
            let new_mix = s0 * gains[0] + s1 * gains[1] + s2 * gains[2] + s3 * gains[3];

            // Crossfade with the outgoing stems if a swap is in
            // progress. `t` ramps 0 → 1 over the full crossfade: old
            // at full for the first sample, new at full by the last.
            let bed_mix = if let Some(out_stems) = self.fading_out.as_mut() {
                let o0 = out_stems[0].next_sample();
                let o1 = out_stems[1].next_sample();
                let o2 = out_stems[2].next_sample();
                let o3 = out_stems[3].next_sample();
                let old_mix = o0 * gains[0] + o1 * gains[1] + o2 * gains[2] + o3 * gains[3];

                let t = if self.crossfade_total_samples > 0 {
                    1.0 - (self.crossfade_samples_remaining as f32
                        / self.crossfade_total_samples as f32)
                } else {
                    1.0
                };
                let t = t.clamp(0.0, 1.0);

                if self.crossfade_samples_remaining > 0 {
                    self.crossfade_samples_remaining -= 1;
                }
                if self.crossfade_samples_remaining == 0 {
                    self.fading_out = None;
                    self.crossfade_total_samples = 0;
                }
                (1.0 - t) * old_mix + t * new_mix
            } else {
                new_mix
            };

            let mut sample = bed_mix;

            // Layer TTS on top.
            if let Some(tts) = self.active_tts.as_mut() {
                if let Some(t) = tts.next_sample() {
                    sample += t;
                } else {
                    self.active_tts = None;
                    self.speaking_now.store(false, Ordering::Relaxed);
                }
            }

            *slot = (sample * master).clamp(-1.0, 1.0);
        }
    }
}
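
#[cfg(test)]
mod mixer_tests {
    use super::*;
    use std::sync::atomic::{AtomicBool, Ordering};

    // End-to-end check of the realtime contract on tiny buffers: every
    // slot of `out` is written, stem gains scale the bed, a queued TTS
    // clip layers on top, and the speaking flag clears once the clip is
    // exhausted. The clip is pushed straight down the module-private
    // sample channel to avoid needing real MP3 bytes in a unit test.
    #[test]
    fn pull_mixes_bed_and_tts() {
        let speaking = Arc::new(AtomicBool::new(false));
        let stems = [vec![0.5f32; 8], vec![0.0; 8], vec![0.0; 8], vec![0.0; 8]];
        let (mut mixer, handle) =
            VoiceAudioMixer::from_stem_samples(stems, 8, 1, Arc::clone(&speaking));

        handle.set_stem_gain(StemRole::Pad, 1.0);
        let mut out = [0.0f32; 4];
        mixer.pull(&mut out);
        assert!(out.iter().all(|s| (s - 0.5).abs() < 1e-6));

        // Two samples of "speech" at +0.25 layered over the 0.5 bed.
        handle.tts_tx.send(vec![0.25f32; 2]).unwrap();
        mixer.pull(&mut out);
        assert!((out[0] - 0.75).abs() < 1e-6);
        assert!((out[1] - 0.75).abs() < 1e-6);
        assert!((out[2] - 0.5).abs() < 1e-6);
        assert!((out[3] - 0.5).abs() < 1e-6);
        assert!(!speaking.load(Ordering::Relaxed));
    }
}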

// ─────────────────────────────────────────────────────────────────────
// Stem preloading
// ─────────────────────────────────────────────────────────────────────

/// Decode + resample all four bed stems into memory at startup. Each
/// returned [`PreloadedStem`] owns a `Vec<f32>` of interleaved samples
/// at the mixer's target rate / channel count, so the realtime callback
/// can play them with nothing more than a cursor advance.
///
/// The decoded buffers are then **seam-stitched** so the wrap from the
/// last sample to the first is silent. The MP3 stem files weren't
/// authored as seamless loops — without this stitch step the cursor
/// produces an audible click every loop.
fn preload_stems(
    asset_root: &Path,
    theme_hint: &str,
    target_rate: u32,
    target_channels: u16,
) -> Result<[PreloadedStem; 4]> {
    let stems_dir = asset_root
        .join("audio")
        .join("themes")
        .join(theme_hint)
        .join("stems");
    let mut stems: Vec<PreloadedStem> = Vec::with_capacity(4);
    for role in StemRole::ALL {
        let path = stems_dir.join(role.file_name());
        let mut samples = decode_to_memory(&path, target_rate, target_channels)?;
        stitch_loop_seam(&mut samples, target_rate, target_channels);
        tracing::info!(
            "[mixer] preloaded stem {} ({} samples, {:.1}s @ {} Hz {} ch)",
            role.file_name(),
            samples.len(),
            samples.len() as f32 / (target_rate as f32 * target_channels as f32),
            target_rate,
            target_channels,
        );
        stems.push(PreloadedStem { samples, cursor: 0 });
    }
    let [pad, texture, harmonic, percussive]: [PreloadedStem; 4] = stems
        .try_into()
        .map_err(|_| VoiceError::Device("expected exactly 4 stems".into()))?;
    Ok([pad, texture, harmonic, percussive])
}

/// Make a buffer loop seamlessly by crossfading the last ~30 ms with
/// the first ~30 ms in place. The pass is a single tail → head
/// crossfade split across the wrap point:
///
/// - The last `fade_len` frames ramp from pure tail material to an even
///   tail/head mix (so the trailing edge slopes toward the head).
/// - The first `fade_len` frames continue that ramp from the even mix
///   to pure head material (so the leading edge slopes away from the
///   tail).
///
/// The wrap-around from `samples[len-1]` back to `samples[0]` lands in
/// the middle of the crossfade instead of on a discontinuity, so the
/// cursor's wrap no longer clicks.
///
/// One-shot at preload time — never called from the realtime thread.
fn stitch_loop_seam(samples: &mut [f32], target_rate: u32, target_channels: u16) {
    if samples.is_empty() {
        return;
    }
    let channels = target_channels.max(1) as usize;
    let frames = samples.len() / channels;
    // 30 ms of crossfade — long enough to hide the seam, short enough
    // to go unnoticed against the rest of the loop.
    let fade_frames = ((target_rate as usize * 30) / 1000).min(frames / 4);
    if fade_frames < 8 {
        // Buffer too short for a meaningful crossfade.
        return;
    }

    // Snapshot the original head + tail before we mutate either.
    let head: Vec<f32> = samples[..fade_frames * channels].to_vec();
    let tail_start = (frames - fade_frames) * channels;
    let tail: Vec<f32> = samples[tail_start..].to_vec();

    // Tail half of the blend: ramp from pure tail to an even tail/head
    // mix. Raised-cosine weights (cos² + sin² = 1) keep the summed gain
    // constant across the fade.
    for f in 0..fade_frames {
        // t goes 0 → 1 across the fade window; theta covers the first
        // quarter turn (0 → π/4) of the full crossfade.
        let t = f as f32 / (fade_frames - 1).max(1) as f32;
        let theta = t * std::f32::consts::FRAC_PI_4;
        let w_tail = theta.cos().powi(2); // 1.0 → 0.5
        let w_head = theta.sin().powi(2); // 0.0 → 0.5
        for c in 0..channels {
            let src = tail[f * channels + c] * w_tail + head[f * channels + c] * w_head;
            samples[(frames - fade_frames + f) * channels + c] = src;
        }
    }

    // Head half: pick up at the even mix the tail ended on (θ = π/4)
    // and finish the ramp to pure head, so the buffer's first samples
    // continue what the just-computed tail ends on.
    for f in 0..fade_frames {
        let t = f as f32 / (fade_frames - 1).max(1) as f32;
        let theta = (1.0 + t) * std::f32::consts::FRAC_PI_4;
        let w_tail = theta.cos().powi(2); // 0.5 → 0.0
        let w_head = theta.sin().powi(2); // 0.5 → 1.0
        for c in 0..channels {
            let src = head[f * channels + c] * w_head + tail[f * channels + c] * w_tail;
            samples[f * channels + c] = src;
        }
    }
}

/// Decode an MP3 file fully into a `Vec<f32>` of interleaved samples
/// at the requested rate / channel count. Used at startup only — never
/// called from the audio thread.
///
/// Avoids `rodio::source::UniformSourceIterator` because rodio 0.20.1's
/// `ChannelCountConverter` panics with `attempt to subtract with overflow`
/// for some MP3 channel layouts. We do channel + sample-rate conversion
/// ourselves with simple, panic-free code: linear interpolation for
/// resampling, average-and-duplicate for channel adaptation.
fn decode_to_memory(path: &Path, target_rate: u32, target_channels: u16) -> Result<Vec<f32>> {
    let file = File::open(path)
        .map_err(|e| VoiceError::Device(format!("open stem {}: {e}", path.display())))?;
    let decoder = Decoder::new(BufReader::new(file))
        .map_err(|e| VoiceError::Device(format!("decode stem {}: {e}", path.display())))?;
    let samples = decode_source_to_interleaved(decoder, target_rate, target_channels);
    if samples.is_empty() {
        return Err(VoiceError::Device(format!(
            "stem {} decoded to zero samples",
            path.display()
        )));
    }
    Ok(samples)
}

/// Convert any rodio `Source` (with `i16` samples — the rodio default)
/// into a `Vec<f32>` of interleaved samples at the target rate and
/// channel count, using a hand-rolled converter that doesn't go through
/// `UniformSourceIterator`.
///
/// Steps:
/// 1. Read source rate + channels.
/// 2. Drain the decoder into per-channel `Vec<f32>` deinterleaved.
/// 3. Resample each channel to `target_rate` via linear interpolation.
/// 4. Map source channels to target channels:
///    - `1 → N`: duplicate the mono channel into all N targets.
///    - `N → 1`: average all source channels.
///    - `N → M (N == M)`: passthrough.
///    - other: take the first `min(N, M)` and zero-pad the rest.
/// 5. Re-interleave.
fn decode_source_to_interleaved<S>(source: S, target_rate: u32, target_channels: u16) -> Vec<f32>
where
    S: Source<Item = i16>,
{
    let src_channels = source.channels().max(1) as usize;
    let src_rate = source.sample_rate().max(1);
    let target_channels = target_channels.max(1) as usize;

    // 1. Drain into deinterleaved per-channel buffers.
    let mut channel_data: Vec<Vec<f32>> = vec![Vec::new(); src_channels];
    let mut idx = 0usize;
    for sample in source {
        let f = (sample as f32) / (i16::MAX as f32);
        channel_data[idx % src_channels].push(f);
        idx += 1;
    }

    if channel_data.iter().all(|c| c.is_empty()) {
        return Vec::new();
    }

    // 2. Resample each channel to the target rate.
    let resampled: Vec<Vec<f32>> = channel_data
        .into_iter()
        .map(|c| linear_resample(&c, src_rate, target_rate))
        .collect();

    // 3. Adapt channel count.
    let target_per_channel = resampled[0].len();
    let target_channels_data: Vec<Vec<f32>> = (0..target_channels)
        .map(|i| {
            if src_channels == 1 {
                // Mono → N: duplicate the only channel into every output.
                resampled[0].clone()
            } else if target_channels == 1 {
                // N → mono: average across source channels.
                let mut mono = vec![0.0f32; target_per_channel];
                for c in resampled.iter() {
                    for (j, s) in c.iter().enumerate() {
                        mono[j] += *s;
                    }
                }
                let inv = 1.0 / src_channels as f32;
                for s in mono.iter_mut() {
                    *s *= inv;
                }
                mono
            } else if i < src_channels {
                // Passthrough for matching index.
                resampled[i].clone()
            } else {
                // Extra target channels: zero-pad.
                vec![0.0; target_per_channel]
            }
        })
        .collect();

    // 4. Re-interleave. Use the shortest channel so a decoder that ends
    //    on a partial frame can't push indexing past a shorter buffer.
    let frames = target_channels_data
        .iter()
        .map(Vec::len)
        .min()
        .unwrap_or(0);
    let total = frames * target_channels;
    let mut out = Vec::with_capacity(total);
    for f in 0..frames {
        for ch in 0..target_channels {
            out.push(target_channels_data[ch][f]);
        }
    }
    out
}
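
#[cfg(test)]
mod interleave_tests {
    use super::decode_source_to_interleaved;
    use rodio::buffer::SamplesBuffer;

    // Channel-mapping rule 1 → N from the doc comment above, driven
    // through rodio's in-memory `SamplesBuffer` so no file or codec is
    // involved. Rates match, so resampling is a passthrough.
    #[test]
    fn mono_to_stereo_duplicates_the_channel() {
        let src = SamplesBuffer::new(1, 48_000, vec![i16::MAX, 0, i16::MAX / 2]);
        let out = decode_source_to_interleaved(src, 48_000, 2);
        assert_eq!(out.len(), 6);
        for frame in out.chunks(2) {
            assert_eq!(frame[0], frame[1]); // both channels carry the mono signal
        }
        assert!((out[0] - 1.0).abs() < 1e-6);
    }
}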

#[cfg(test)]
mod stitch_tests {
    use super::stitch_loop_seam;

    #[test]
    fn empty_buffer_is_a_noop() {
        let mut buf: Vec<f32> = vec![];
        stitch_loop_seam(&mut buf, 44_100, 2);
        assert!(buf.is_empty());
    }

    #[test]
    fn short_buffer_skips_crossfade_safely() {
        // Far too short for the minimum 8-frame fade window — the
        // function should bail out without panicking and leave the
        // buffer alone.
        let mut buf: Vec<f32> = vec![0.5; 16];
        let original = buf.clone();
        stitch_loop_seam(&mut buf, 44_100, 2);
        assert_eq!(buf, original);
    }

    #[test]
    fn seam_value_matches_after_stitch() {
        // Mono sawtooth ramp: the tail value differs sharply from the
        // head value before stitching. After stitching, the LAST frame
        // should land close to the FIRST frame so the wrap is quiet.
        let frames = 44_100; // 1s at 44.1k mono
        let channels = 1usize;
        let mut buf: Vec<f32> = (0..frames)
            .map(|i| (i as f32 / frames as f32) - 0.5) // -0.5 → +0.5 ramp
            .collect();
        // Pre-stitch: last sample is ~+0.5, first is -0.5. Big jump.
        let pre_jump = (buf[frames - 1] - buf[0]).abs();
        assert!(pre_jump > 0.9);
        stitch_loop_seam(&mut buf, 44_100, channels as u16);
        // Post-stitch: last sample should be very close to first.
        let post_jump = (buf[frames - 1] - buf[0]).abs();
        assert!(
            post_jump < pre_jump,
            "stitch should reduce wrap discontinuity (was {pre_jump}, now {post_jump})"
        );
    }
}

#[cfg(test)]
mod resample_tests {
    use super::linear_resample;

    #[test]
    fn passthrough_when_rates_match() {
        let s = vec![0.1, 0.2, 0.3, 0.4];
        let out = linear_resample(&s, 44100, 44100);
        assert_eq!(out, s);
    }

    #[test]
    fn upsample_doubles_length_approx() {
        let s = vec![0.0, 1.0, 0.0, 1.0];
        let out = linear_resample(&s, 22050, 44100);
        assert_eq!(out.len(), 8);
    }

    #[test]
    fn downsample_halves_length_approx() {
        let s = vec![0.0, 0.25, 0.5, 0.75, 1.0, 0.75, 0.5, 0.25];
        let out = linear_resample(&s, 44100, 22050);
        assert_eq!(out.len(), 4);
    }

    #[test]
    fn empty_input_yields_empty_output() {
        assert!(linear_resample(&[], 44100, 22050).is_empty());
    }
}

/// Linear-interpolate `samples` (one channel, at `src_rate` Hz) to
/// `target_rate` Hz. Returns a new `Vec<f32>`. Pass-through for the
/// equal-rate case so resampling has zero cost when the bed already
/// matches VPIO's negotiated output rate.
fn linear_resample(samples: &[f32], src_rate: u32, target_rate: u32) -> Vec<f32> {
    if samples.is_empty() {
        return Vec::new();
    }
    if src_rate == target_rate {
        return samples.to_vec();
    }
    let ratio = src_rate as f64 / target_rate as f64;
    let out_len = ((samples.len() as f64) / ratio).floor() as usize;
    let mut out = Vec::with_capacity(out_len);
    for i in 0..out_len {
        let src_pos = (i as f64) * ratio;
        let lo = src_pos.floor() as usize;
        let hi = (lo + 1).min(samples.len() - 1);
        let t = (src_pos - lo as f64) as f32;
        let s = samples[lo] * (1.0 - t) + samples[hi] * t;
        out.push(s);
    }
    out
}