tunes 1.1.0 - Docs.rs

/// SIMD abstraction layer for portable vectorized audio processing.
///
/// This module provides a trait-based abstraction over different SIMD lane widths
/// (f32x8, f32x4) with runtime CPU detection to automatically select the
/// best available instruction set.
///
/// The architecture is inspired by FunDSP's approach: write generic code once,
/// dispatch to the optimal SIMD width at runtime.
///
/// # Performance Model
///
/// - CPU detection happens ONCE, on first access to the `SIMD` static (lazy_static initializes lazily)
/// - Match/dispatch overhead: ~3 CPU cycles per call
/// - Actual DSP math: ~500-1000 cycles per 8 samples
/// - Overhead percentage: < 0.5% (negligible)
///
/// # Usage
///
/// ```rust
/// use tunes::synthesis::simd::SIMD;
///
/// // Detect SIMD width once
/// let width = SIMD.width();
/// println!("Using {}-wide SIMD", width);
/// ```
use lazy_static::lazy_static;
use wide::{f32x4, f32x8};

lazy_static! {
    /// Global SIMD dispatcher shared by the whole process.
    ///
    /// `lazy_static` initializes this lazily: `SimdDispatcher::detect()` runs the
    /// first time `SIMD` is dereferenced (not at program startup) and the result
    /// is cached for every later access.
    ///
    /// Use this instead of calling `SimdDispatcher::detect()` repeatedly.
    pub static ref SIMD: SimdDispatcher = SimdDispatcher::detect();
}

/// Trait abstracting over SIMD lane widths for audio processing.
///
/// This allows writing generic DSP code that works with any SIMD width,
/// from scalar (f32) up to 8-wide vectors (f32x8/AVX2).
///
/// Implemented in this file for `f32x8`, `f32x4` and plain `f32`.
pub trait SimdLanes: Copy + Clone + Sized {
    /// Number of f32 samples processed in parallel
    const LANES: usize;

    // Construction

    /// Builds a value with every lane set to `val`.
    fn splat(val: f32) -> Self;
    /// Loads the first `LANES` elements of `arr`.
    ///
    /// The implementations in this file index into `arr`, so a slice shorter
    /// than `LANES` panics.
    fn from_array(arr: &[f32]) -> Self;

    // Arithmetic (lane-wise)

    /// Lane-wise `self + other`.
    fn add(self, other: Self) -> Self;
    /// Lane-wise `self - other`.
    fn sub(self, other: Self) -> Self;
    /// Lane-wise `self * other`.
    fn mul(self, other: Self) -> Self;
    /// Lane-wise `self / other`.
    fn div(self, other: Self) -> Self;

    // Math functions (lane-wise)

    /// Lane-wise absolute value.
    fn abs(self) -> Self;
    /// Lane-wise minimum of `self` and `other`.
    fn min(self, other: Self) -> Self;
    /// Lane-wise maximum of `self` and `other`.
    fn max(self, other: Self) -> Self;
    /// Lane-wise square root.
    fn sqrt(self) -> Self;

    /// Multiply-add: computes `self * b + c` per lane.
    ///
    /// Whether this is a single fused instruction depends on the underlying
    /// type's `mul_add`, which each implementation forwards to.
    fn mul_add(self, b: Self, c: Self) -> Self;

    // Fast approximations for audio (trading precision for speed)

    /// Approximate hyperbolic tangent, clamped to `[-1, 1]` by the implementations here.
    fn fast_tanh(self) -> Self;
    /// Approximate sine (truncated Taylor series in the implementations here).
    fn fast_sin(self) -> Self;
    /// Approximate cosine (truncated Taylor series in the implementations here).
    fn fast_cos(self) -> Self;
    /// Approximate `atan2(y, x)`; note the argument order is `(y, x)`.
    fn fast_atan2(y: Self, x: Self) -> Self;
    /// Restricts each lane to `[min, max]`; implemented here as `self.max(min).min(max)`.
    fn clamp(self, min: Self, max: Self) -> Self;

    // Conversion

    /// Stores the lanes into the first `LANES` elements of `slice`.
    ///
    /// The implementations in this file index into `slice`, so a slice shorter
    /// than `LANES` panics.
    fn write_to_slice(self, slice: &mut [f32]);
    /// Returns the value held in lane `lane`.
    ///
    /// The vector implementations index an array of `LANES` elements (out-of-range
    /// panics); the scalar implementation ignores `lane`.
    fn extract_lane(self, lane: usize) -> f32;
}

// Macro to implement SimdLanes for wide SIMD types.
// This avoids writing the same implementation twice (once for f32x4, once for f32x8).
//
// `$type` must provide inherent `splat`, `abs`, `min`, `max`, `sqrt`, `mul_add`
// and `to_array` methods, the four arithmetic operators, and
// `From<[f32; $lanes]>`; `$lanes` is the number of f32 lanes in `$type`.
macro_rules! impl_simd_lanes {
    ($type:ty, $lanes:expr) => {
        impl SimdLanes for $type {
            const LANES: usize = $lanes;

            #[inline(always)]
            fn splat(val: f32) -> Self {
                <$type>::splat(val)
            }

            /// Loads the first `$lanes` elements of `arr`; panics if `arr` is shorter.
            #[inline(always)]
            fn from_array(arr: &[f32]) -> Self {
                debug_assert!(arr.len() >= $lanes, "Array too short for SIMD width");
                let mut fixed = [0.0f32; $lanes];
                fixed.copy_from_slice(&arr[..$lanes]);
                <$type>::from(fixed)
            }

            #[inline(always)]
            fn add(self, other: Self) -> Self {
                self + other
            }

            #[inline(always)]
            fn sub(self, other: Self) -> Self {
                self - other
            }

            #[inline(always)]
            fn mul(self, other: Self) -> Self {
                self * other
            }

            #[inline(always)]
            fn div(self, other: Self) -> Self {
                self / other
            }

            #[inline(always)]
            fn abs(self) -> Self {
                // Resolves to the inherent method of `$type`, not to this trait method.
                self.abs()
            }

            #[inline(always)]
            fn min(self, other: Self) -> Self {
                self.min(other)
            }

            #[inline(always)]
            fn max(self, other: Self) -> Self {
                self.max(other)
            }

            #[inline(always)]
            fn sqrt(self) -> Self {
                self.sqrt()
            }

            #[inline(always)]
            fn mul_add(self, b: Self, c: Self) -> Self {
                self.mul_add(b, c)
            }

            #[inline(always)]
            fn fast_tanh(self) -> Self {
                // Pade [3/2] approximation: tanh(x) ≈ x(x² + 15)/(15 + 6x²)
                // Very accurate for -2 < x < 2, good enough for audio
                let x2 = self.mul(self);
                let num = self.mul(x2.add(<$type>::splat(15.0)));
                let denom = x2.mul(<$type>::splat(6.0)).add(<$type>::splat(15.0));
                let result = num.div(denom);

                // The rational function overshoots ±1 for large |x|; clamp to the tanh range
                result.clamp(<$type>::splat(-1.0), <$type>::splat(1.0))
            }

            #[inline(always)]
            fn fast_sin(self) -> Self {
                // 7th order Taylor series: sin(x) ≈ x - x³/3! + x⁵/5! - x⁷/7!
                // No range reduction is performed. Truncation error is about 1.6e-4
                // at |x| = π/2 and grows to about 0.075 at |x| = π.
                let x = self;
                let x2 = x.mul(x);
                let x3 = x2.mul(x);
                let x5 = x3.mul(x2);
                let x7 = x5.mul(x2);

                // sin(x) ≈ x - x³/6 + x⁵/120 - x⁷/5040
                let term1 = x;
                let term2 = x3.mul(<$type>::splat(-1.0 / 6.0));
                let term3 = x5.mul(<$type>::splat(1.0 / 120.0));
                let term4 = x7.mul(<$type>::splat(-1.0 / 5040.0));

                term1.add(term2).add(term3).add(term4)
            }

            #[inline(always)]
            fn fast_cos(self) -> Self {
                // 6th order Taylor series: cos(x) ≈ 1 - x²/2! + x⁴/4! - x⁶/6!
                // No range reduction is performed. Truncation error is about 9e-4
                // at |x| = π/2 and grows to about 0.21 at |x| = π.
                let x2 = self.mul(self);
                let x4 = x2.mul(x2);
                let x6 = x4.mul(x2);

                // cos(x) ≈ 1 - x²/2 + x⁴/24 - x⁶/720
                let term1 = <$type>::splat(1.0);
                let term2 = x2.mul(<$type>::splat(-1.0 / 2.0));
                let term3 = x4.mul(<$type>::splat(1.0 / 24.0));
                let term4 = x6.mul(<$type>::splat(-1.0 / 720.0));

                term1.add(term2).add(term3).add(term4)
            }

            #[inline(always)]
            fn fast_atan2(y: Self, x: Self) -> Self {
                // Fast atan2: odd minimax polynomial for atan on [0, 1] followed by
                // octant/quadrant fix-ups. The polynomial (the expensive part) is
                // evaluated on whole vectors; the cheap fix-ups are applied per lane
                // with ordinary comparisons so every lane gets an exact select.

                let abs_x = x.abs();
                let abs_y = y.abs();

                // a = min(|x|, |y|) / max(|x|, |y|) ∈ [0, 1].
                // The 1e-10 floor avoids 0/0 when both inputs are zero.
                let a = abs_y.min(abs_x).div(abs_x.max(abs_y).max(<$type>::splat(1e-10)));

                // atan(a) ≈ a * (c1 + c3·s + c5·s² + c7·s³ + c9·s⁴ + c11·s⁵), s = a²,
                // evaluated with Horner's scheme (only odd powers of `a` appear).
                let s = a.mul(a);
                let poly = <$type>::splat(-0.011_721_2)
                    .mul_add(s, <$type>::splat(0.05265332))
                    .mul_add(s, <$type>::splat(-0.11643287))
                    .mul_add(s, <$type>::splat(0.19354346))
                    .mul_add(s, <$type>::splat(-0.33262347))
                    .mul_add(s, <$type>::splat(0.99997726));
                let r = a.mul(poly);

                let xs = x.to_array();
                let ys = y.to_array();
                let mut out = r.to_array();
                for ((angle, &xi), &yi) in out.iter_mut().zip(xs.iter()).zip(ys.iter()) {
                    let mut v = *angle;
                    // |y| > |x|: the ratio was inverted, so atan(|y|/|x|) = π/2 - atan(|x|/|y|)
                    if yi.abs() > xi.abs() {
                        v = std::f32::consts::FRAC_PI_2 - v;
                    }
                    // Left half-plane: mirror the first-quadrant angle around π/2
                    if xi < 0.0 {
                        v = std::f32::consts::PI - v;
                    }
                    // atan2 is odd in y, so the lower half-plane is the negated angle
                    if yi < 0.0 {
                        v = -v;
                    }
                    *angle = v;
                }
                <$type>::from(out)
            }

            #[inline(always)]
            fn clamp(self, min: Self, max: Self) -> Self {
                self.max(min).min(max)
            }

            /// Stores all lanes into `slice[..$lanes]`; panics if `slice` is shorter.
            #[inline(always)]
            fn write_to_slice(self, slice: &mut [f32]) {
                let arr = self.to_array();
                slice[..$lanes].copy_from_slice(&arr);
            }

            /// Returns lane `lane`; panics if `lane >= $lanes`.
            #[inline(always)]
            fn extract_lane(self, lane: usize) -> f32 {
                self.to_array()[lane]
            }
        }
    };
}

// Apply the macro to generate `SimdLanes` implementations for each `wide` vector
// type. The second argument is the lane count and becomes `SimdLanes::LANES`.
impl_simd_lanes!(f32x8, 8); // AVX2 (most modern CPUs ~2013+)
impl_simd_lanes!(f32x4, 4); // SSE/NEON (universal)

// Scalar fallback (f32) - implemented manually since a single float has no
// vector API; every "lane-wise" operation is just the plain f32 operation.
impl SimdLanes for f32 {
    const LANES: usize = 1;

    #[inline(always)]
    fn splat(val: f32) -> Self {
        val
    }

    /// Takes the first element of `arr`; panics if `arr` is empty.
    #[inline(always)]
    fn from_array(arr: &[f32]) -> Self {
        arr[0]
    }

    #[inline(always)]
    fn add(self, other: Self) -> Self {
        self + other
    }

    #[inline(always)]
    fn sub(self, other: Self) -> Self {
        self - other
    }

    #[inline(always)]
    fn mul(self, other: Self) -> Self {
        self * other
    }

    #[inline(always)]
    fn div(self, other: Self) -> Self {
        self / other
    }

    #[inline(always)]
    fn abs(self) -> Self {
        // Resolves to the inherent `f32::abs`, not to this trait method.
        self.abs()
    }

    #[inline(always)]
    fn min(self, other: Self) -> Self {
        self.min(other)
    }

    #[inline(always)]
    fn max(self, other: Self) -> Self {
        self.max(other)
    }

    #[inline(always)]
    fn sqrt(self) -> Self {
        self.sqrt()
    }

    #[inline(always)]
    fn mul_add(self, b: Self, c: Self) -> Self {
        self.mul_add(b, c)
    }

    #[inline(always)]
    fn fast_tanh(self) -> Self {
        // Pade [3/2] approximation: tanh(x) ≈ x(x² + 15)/(15 + 6x²)
        let x2 = self * self;
        let num = self * (x2 + 15.0);
        let denom = 15.0 + 6.0 * x2;
        let result = num / denom;
        // The rational function overshoots ±1 for large |x|; clamp to the tanh range
        result.clamp(-1.0, 1.0)
    }

    #[inline(always)]
    fn fast_sin(self) -> Self {
        // 7th order Taylor series, no range reduction.
        // Truncation error is about 1.6e-4 at |x| = π/2 and about 0.075 at |x| = π.
        let x = self;
        let x2 = x * x;
        let x3 = x2 * x;
        let x5 = x3 * x2;
        let x7 = x5 * x2;
        x - x3 / 6.0 + x5 / 120.0 - x7 / 5040.0
    }

    #[inline(always)]
    fn fast_cos(self) -> Self {
        // 6th order Taylor series, no range reduction.
        // Truncation error is about 9e-4 at |x| = π/2 and about 0.21 at |x| = π.
        let x2 = self * self;
        let x4 = x2 * x2;
        let x6 = x4 * x2;
        1.0 - x2 / 2.0 + x4 / 24.0 - x6 / 720.0
    }

    #[inline(always)]
    fn fast_atan2(y: Self, x: Self) -> Self {
        // Fast atan2: odd minimax polynomial for atan on [0, 1] followed by
        // octant/quadrant fix-ups.

        let abs_x = x.abs();
        let abs_y = y.abs();

        // a = min(|x|, |y|) / max(|x|, |y|) ∈ [0, 1].
        // The 1e-10 floor avoids 0/0 when both inputs are zero.
        let a = abs_y.min(abs_x) / abs_x.max(abs_y).max(1e-10);

        // atan(a) ≈ a * (c1 + c3·s + c5·s² + c7·s³ + c9·s⁴ + c11·s⁵), s = a²
        // (Horner form; only odd powers of `a` appear).
        let s = a * a;
        let poly = 0.99997726
            + s * (-0.33262347
                + s * (0.19354346 + s * (-0.11643287 + s * (0.05265332 + s * -0.011_721_2))));
        let mut r = a * poly;

        // |y| > |x|: the ratio was inverted, so atan(|y|/|x|) = π/2 - atan(|x|/|y|)
        if abs_y > abs_x {
            r = std::f32::consts::FRAC_PI_2 - r;
        }

        // Left half-plane: mirror the first-quadrant angle around π/2.
        // The sign for y < 0 is applied exactly once, below.
        if x < 0.0 {
            r = std::f32::consts::PI - r;
        }

        // atan2 is odd in y, so the lower half-plane is the negated angle
        if y < 0.0 {
            -r
        } else {
            r
        }
    }

    #[inline(always)]
    fn clamp(self, min: Self, max: Self) -> Self {
        self.max(min).min(max)
    }

    /// Writes the value to `slice[0]`; panics if `slice` is empty.
    #[inline(always)]
    fn write_to_slice(self, slice: &mut [f32]) {
        slice[0] = self;
    }

    /// There is only one lane, so the index is ignored.
    #[inline(always)]
    fn extract_lane(self, _lane: usize) -> f32 {
        self
    }
}

/// SIMD width selection based on runtime CPU detection.
///
/// `SimdDispatcher` matches on this value to pick which `SimdLanes`
/// implementation (`f32x8`, `f32x4` or `f32`) a processing call is
/// instantiated with.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimdWidth {
    /// AVX2 (8-wide f32 vectors)
    X8,
    /// SSE/NEON (4-wide f32 vectors)
    X4,
    /// Scalar fallback (1 sample at a time)
    ///
    /// Note: `SimdDispatcher::detect` as written in this file never returns
    /// this variant; it exists for the dispatch `match` arms.
    Scalar,
}

/// Runtime dispatcher that detects CPU capabilities and selects optimal SIMD width.
///
/// This struct is designed to be constructed once (ideally via lazy_static)
/// and reused throughout the application lifetime. It stores only the selected
/// [`SimdWidth`]; every processing method matches on that value and forwards to
/// a generic implementation instantiated for the corresponding lane type.
///
/// # Example
/// ```rust
/// use lazy_static::lazy_static;
/// use tunes::synthesis::simd::SimdDispatcher;
///
/// lazy_static! {
///     static ref SIMD: SimdDispatcher = SimdDispatcher::detect();
/// }
///
/// // Later, in your DSP code:
/// fn process_audio(buffer: &mut [f32]) {
///     SIMD.multiply_const(buffer, 0.5);
/// }
/// ```
pub struct SimdDispatcher {
    // Width chosen by `detect()`; read by every dispatching method.
    width: SimdWidth,
}

impl SimdDispatcher {
    /// Pick the SIMD width to use on this machine.
    ///
    /// The CPU is queried here, at construction time; the processing methods
    /// afterwards only read the stored width.
    ///
    /// Selection:
    /// 1. x86_64 with AVX2 reported at runtime: 8-wide
    /// 2. x86_64 without AVX2: 4-wide (SSE is guaranteed on x86_64)
    /// 3. Any other architecture: 4-wide (NEON on ARM, LLVM auto-vec elsewhere)
    ///
    /// `SimdWidth::Scalar` is never selected by this function.
    pub fn detect() -> Self {
        #[cfg(target_arch = "x86_64")]
        let width = if is_x86_feature_detected!("avx2") {
            SimdWidth::X8
        } else {
            SimdWidth::X4
        };

        #[cfg(not(target_arch = "x86_64"))]
        let width = SimdWidth::X4;

        Self { width }
    }

    /// Returns the SIMD width being used (for debugging/logging)
    pub fn width(&self) -> usize {
        match self.width {
            SimdWidth::X8 => 8,
            SimdWidth::X4 => 4,
            SimdWidth::Scalar => 1,
        }
    }

    /// Returns the detected SIMD width enum for manual dispatching.
    ///
    /// `SimdWidth` is `Copy`, so the stored value is returned by value.
    pub fn simd_width(&self) -> SimdWidth {
        self.width
    }

    /// Multiply all samples in buffer by a constant using SIMD.
    ///
    /// `buffer` is modified in place. The call is forwarded to
    /// `multiply_const_impl`, instantiated for the lane type that matches the
    /// stored width.
    #[inline]
    pub fn multiply_const(&self, buffer: &mut [f32], multiplier: f32) {
        match self.width {
            SimdWidth::X8 => self.multiply_const_impl::<f32x8>(buffer, multiplier),
            SimdWidth::X4 => self.multiply_const_impl::<f32x4>(buffer, multiplier),
            SimdWidth::Scalar => self.multiply_const_impl::<f32>(buffer, multiplier),
        }
    }

    /// Scales `buffer` in place by `multiplier`, `V::LANES` samples at a time.
    ///
    /// Whole blocks go through `V`; the leftover samples at the end (fewer
    /// than `V::LANES`) are scaled one by one.
    #[inline(always)]
    fn multiply_const_impl<V: SimdLanes>(&self, buffer: &mut [f32], multiplier: f32) {
        let gain = V::splat(multiplier);
        let mut blocks = buffer.chunks_exact_mut(V::LANES);

        for block in blocks.by_ref() {
            V::from_array(block).mul(gain).write_to_slice(block);
        }

        // Tail that did not fill a whole vector
        for leftover in blocks.into_remainder() {
            *leftover *= multiplier;
        }
    }

    /// Element-wise multiply two buffers: buffer[i] *= modulation[i]
    ///
    /// Multiplies each element of `buffer` by the corresponding element in `modulation`.
    /// Processes min(buffer.len(), modulation.len()) elements; any elements of
    /// `buffer` beyond that length are left unchanged.
    /// Used for amplitude modulation effects like tremolo and ring modulation.
    #[inline]
    pub fn multiply_buffers(&self, buffer: &mut [f32], modulation: &[f32]) {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => self.multiply_buffers_impl::<f32x8>(buffer, modulation),
            SimdWidth::X4 => self.multiply_buffers_impl::<f32x4>(buffer, modulation),
            SimdWidth::Scalar => self.multiply_buffers_impl::<f32>(buffer, modulation),
        }
    }

    /// Multiplies `buffer` by `modulation` element-wise over their common prefix.
    ///
    /// Whole blocks of `V::LANES` samples go through `V`; the leftover samples
    /// are multiplied one by one.
    #[inline(always)]
    fn multiply_buffers_impl<V: SimdLanes>(&self, buffer: &mut [f32], modulation: &[f32]) {
        let common = buffer.len().min(modulation.len());
        let mut dst_blocks = buffer[..common].chunks_exact_mut(V::LANES);
        let mut gain_blocks = modulation[..common].chunks_exact(V::LANES);

        // Vector part: both iterators yield the same number of blocks
        for (dst, gains) in dst_blocks.by_ref().zip(gain_blocks.by_ref()) {
            V::from_array(dst)
                .mul(V::from_array(gains))
                .write_to_slice(dst);
        }

        // Leftover samples
        let dst_tail = dst_blocks.into_remainder();
        let gain_tail = gain_blocks.remainder();
        for (sample, &gain) in dst_tail.iter_mut().zip(gain_tail) {
            *sample *= gain;
        }
    }

    /// FMA (fused multiply-add): buffer = buffer * mul + add, using SIMD
    ///
    /// `buffer` is modified in place; `mul` and `add` are applied to every sample.
    #[inline]
    pub fn fma(&self, buffer: &mut [f32], mul: f32, add: f32) {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => self.fma_impl::<f32x8>(buffer, mul, add),
            SimdWidth::X4 => self.fma_impl::<f32x4>(buffer, mul, add),
            SimdWidth::Scalar => self.fma_impl::<f32>(buffer, mul, add),
        }
    }

    /// Applies `sample * mul + add` to every sample of `buffer` in place.
    ///
    /// Whole blocks of `V::LANES` samples use `V::mul_add`; the leftover
    /// samples use `f32::mul_add`.
    #[inline(always)]
    fn fma_impl<V: SimdLanes>(&self, buffer: &mut [f32], mul: f32, add: f32) {
        let scale = V::splat(mul);
        let offset = V::splat(add);
        let mut blocks = buffer.chunks_exact_mut(V::LANES);

        for block in blocks.by_ref() {
            V::from_array(block)
                .mul_add(scale, offset)
                .write_to_slice(block);
        }

        // Tail that did not fill a whole vector
        for leftover in blocks.into_remainder() {
            *leftover = leftover.mul_add(mul, add);
        }
    }

    /// Apply fast_tanh to all samples using SIMD
    ///
    /// `buffer` is modified in place; each sample is replaced by
    /// `SimdLanes::fast_tanh` of itself.
    #[inline]
    pub fn apply_fast_tanh(&self, buffer: &mut [f32]) {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => self.apply_fast_tanh_impl::<f32x8>(buffer),
            SimdWidth::X4 => self.apply_fast_tanh_impl::<f32x4>(buffer),
            SimdWidth::Scalar => self.apply_fast_tanh_impl::<f32>(buffer),
        }
    }

    /// Replaces every sample of `buffer` with its `fast_tanh`.
    ///
    /// Whole blocks of `V::LANES` samples go through `V`; the leftover samples
    /// use the scalar `fast_tanh`.
    #[inline(always)]
    fn apply_fast_tanh_impl<V: SimdLanes>(&self, buffer: &mut [f32]) {
        let mut blocks = buffer.chunks_exact_mut(V::LANES);

        for block in blocks.by_ref() {
            V::from_array(block).fast_tanh().write_to_slice(block);
        }

        // Tail that did not fill a whole vector
        for leftover in blocks.into_remainder() {
            *leftover = leftover.fast_tanh();
        }
    }

    /// Linear interpolation: out = a + (b - a) * t, using SIMD
    /// Processes two buffers and a fractional buffer
    ///
    /// Only the first `min(out.len(), a.len(), b.len(), t.len())` elements of
    /// `out` are written; `t[i]` is the blend factor for element `i`.
    #[inline]
    pub fn lerp_buffers(&self, out: &mut [f32], a: &[f32], b: &[f32], t: &[f32]) {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => self.lerp_impl::<f32x8>(out, a, b, t),
            SimdWidth::X4 => self.lerp_impl::<f32x4>(out, a, b, t),
            SimdWidth::Scalar => self.lerp_impl::<f32>(out, a, b, t),
        }
    }

    /// Writes `a + (b - a) * t` into `out` over the prefix common to all four slices.
    ///
    /// Whole blocks of `V::LANES` elements are computed with `V::mul_add`; the
    /// leftover elements use plain scalar arithmetic.
    #[inline(always)]
    fn lerp_impl<V: SimdLanes>(&self, out: &mut [f32], a: &[f32], b: &[f32], t: &[f32]) {
        let common = out.len().min(a.len()).min(b.len()).min(t.len());
        let vector_len = common - common % V::LANES;

        let (out_body, out_tail) = out[..common].split_at_mut(vector_len);
        let (a_body, a_tail) = a[..common].split_at(vector_len);
        let (b_body, b_tail) = b[..common].split_at(vector_len);
        let (t_body, t_tail) = t[..common].split_at(vector_len);

        // Vector part
        let sources = a_body
            .chunks_exact(V::LANES)
            .zip(b_body.chunks_exact(V::LANES))
            .zip(t_body.chunks_exact(V::LANES));
        for (dst, ((from, to), frac)) in out_body.chunks_exact_mut(V::LANES).zip(sources) {
            let start = V::from_array(from);
            let end = V::from_array(to);
            let weight = V::from_array(frac);

            // (end - start) * weight + start
            end.sub(start).mul_add(weight, start).write_to_slice(dst);
        }

        // Leftover elements
        let tail_sources = a_tail.iter().zip(b_tail).zip(t_tail);
        for (dst, ((&from, &to), &frac)) in out_tail.iter_mut().zip(tail_sources) {
            *dst = from + (to - from) * frac;
        }
    }

    /// Calculate sum of squares for a buffer (used for RMS envelope calculation)
    ///
    /// Returns the sum of all squared samples in the buffer (0.0 for an empty buffer).
    /// Commonly used for: RMS = sqrt(sum_of_squares / num_samples)
    #[inline]
    pub fn sum_of_squares(&self, buffer: &[f32]) -> f32 {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => self.sum_of_squares_impl::<f32x8>(buffer),
            SimdWidth::X4 => self.sum_of_squares_impl::<f32x4>(buffer),
            SimdWidth::Scalar => self.sum_of_squares_impl::<f32>(buffer),
        }
    }

    /// Sums `sample * sample` over `buffer`.
    ///
    /// Whole blocks of `V::LANES` samples are accumulated per lane in a vector,
    /// the lanes are then added up in index order, and finally the leftover
    /// samples are added one by one.
    #[inline(always)]
    fn sum_of_squares_impl<V: SimdLanes>(&self, buffer: &[f32]) -> f32 {
        let blocks = buffer.chunks_exact(V::LANES);
        let tail = blocks.remainder();

        // Per-lane running totals: acc = block * block + acc
        let lane_totals = blocks.fold(V::splat(0.0), |acc, block| {
            let samples = V::from_array(block);
            samples.mul_add(samples, acc)
        });

        // Horizontal sum, lane 0 first
        let vector_sum =
            (0..V::LANES).fold(0.0f32, |total, lane| total + lane_totals.extract_lane(lane));

        // Leftover samples
        tail.iter()
            .fold(vector_sum, |total, &sample| total + sample * sample)
    }

    /// Mix mono buffer into stereo with panning (mono-to-stereo expansion)
    ///
    /// Takes a mono input buffer and mixes it into an interleaved stereo output
    /// buffer with independent left/right gains for panning.
    ///
    /// Performs: output[i*2] += input[i] * left_gain, output[i*2+1] += input[i] * right_gain
    ///
    /// The mix is additive (existing `output` content is kept) and covers
    /// `min(input.len(), output.len() / 2)` frames.
    #[inline]
    pub fn mix_mono_to_stereo(
        &self,
        output: &mut [f32],
        input: &[f32],
        left_gain: f32,
        right_gain: f32,
    ) {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => self.mix_mono_to_stereo_impl::<f32x8>(output, input, left_gain, right_gain),
            SimdWidth::X4 => self.mix_mono_to_stereo_impl::<f32x4>(output, input, left_gain, right_gain),
            SimdWidth::Scalar => self.mix_mono_to_stereo_impl::<f32>(output, input, left_gain, right_gain),
        }
    }

    /// Adds `input` into interleaved stereo `output` with per-channel gains.
    ///
    /// Handles `min(input.len(), output.len() / 2)` frames. Whole blocks of
    /// `V::LANES` frames are de-interleaved into stack arrays (sized for at most
    /// 8 lanes), mixed with `V::mul_add` and re-interleaved; the leftover frames
    /// are mixed with scalar arithmetic.
    #[inline(always)]
    fn mix_mono_to_stereo_impl<V: SimdLanes>(
        &self,
        output: &mut [f32],
        input: &[f32],
        left_gain: f32,
        right_gain: f32,
    ) {
        let frames = input.len().min(output.len() / 2);
        let width = V::LANES;

        let gain_l = V::splat(left_gain);
        let gain_r = V::splat(right_gain);

        // One stereo block (2 * width samples) per mono block (width samples)
        let mut stereo_blocks = output[..frames * 2].chunks_exact_mut(width * 2);
        let mut mono_blocks = input[..frames].chunks_exact(width);

        for (stereo, mono) in stereo_blocks.by_ref().zip(mono_blocks.by_ref()) {
            let mut left = [0.0f32; 8];
            let mut right = [0.0f32; 8];

            // Split the current output block into its two channels
            for (i, frame) in stereo.chunks_exact(2).enumerate() {
                left[i] = frame[0];
                right[i] = frame[1];
            }

            // channel += mono * gain
            let source = V::from_array(mono);
            let mixed_l = source.mul_add(gain_l, V::from_array(&left[..width]));
            let mixed_r = source.mul_add(gain_r, V::from_array(&right[..width]));
            mixed_l.write_to_slice(&mut left[..width]);
            mixed_r.write_to_slice(&mut right[..width]);

            // Merge the channels back into the interleaved block
            for (i, frame) in stereo.chunks_exact_mut(2).enumerate() {
                frame[0] = left[i];
                frame[1] = right[i];
            }
        }

        // Leftover frames
        let stereo_tail = stereo_blocks.into_remainder();
        let mono_tail = mono_blocks.remainder();
        for (frame, &sample) in stereo_tail.chunks_exact_mut(2).zip(mono_tail) {
            frame[0] += sample * left_gain;
            frame[1] += sample * right_gain;
        }
    }

    /// Mix interleaved stereo buffers with independent left/right gains
    ///
    /// This is optimized for mixing stereo audio buses where each channel
    /// needs independent gain control. Input and output are interleaved stereo:
    /// [L0, R0, L1, R1, ...]
    ///
    /// Performs: output[i] += input[i] * gain (where gain alternates L/R)
    ///
    /// The mix is additive and covers `min(output.len(), input.len()) / 2`
    /// whole frames; a trailing unpaired sample is left untouched.
    #[inline]
    pub fn mix_stereo_interleaved(
        &self,
        output: &mut [f32],
        input: &[f32],
        left_gain: f32,
        right_gain: f32,
    ) {
        // Forward to the generic implementation for the stored width
        match self.width {
            SimdWidth::X8 => {
                self.mix_stereo_impl::<f32x8>(output, input, left_gain, right_gain)
            }
            SimdWidth::X4 => {
                self.mix_stereo_impl::<f32x4>(output, input, left_gain, right_gain)
            }
            SimdWidth::Scalar => {
                self.mix_stereo_impl::<f32>(output, input, left_gain, right_gain)
            }
        }
    }

    /// Adds interleaved stereo `input` into interleaved stereo `output` with
    /// per-channel gains.
    ///
    /// Handles `min(output.len(), input.len()) / 2` frames. Whole blocks of
    /// `V::LANES` frames are de-interleaved into stack arrays (sized for at most
    /// 8 lanes), mixed with `V::mul_add` and re-interleaved; the leftover frames
    /// are mixed with scalar arithmetic.
    #[inline(always)]
    fn mix_stereo_impl<V: SimdLanes>(
        &self,
        output: &mut [f32],
        input: &[f32],
        left_gain: f32,
        right_gain: f32,
    ) {
        let frames = output.len().min(input.len()) / 2;
        let width = V::LANES;

        let gain_l = V::splat(left_gain);
        let gain_r = V::splat(right_gain);

        // Walk both buffers in blocks of `width` frames (2 * width samples)
        let mut dst_blocks = output[..frames * 2].chunks_exact_mut(width * 2);
        let mut src_blocks = input[..frames * 2].chunks_exact(width * 2);

        for (dst, src) in dst_blocks.by_ref().zip(src_blocks.by_ref()) {
            // Per-channel scratch space on the stack (max 8 lanes)
            let mut src_l = [0.0f32; 8];
            let mut src_r = [0.0f32; 8];
            let mut dst_l = [0.0f32; 8];
            let mut dst_r = [0.0f32; 8];

            // Split both blocks into their channels
            for (i, (s, d)) in src.chunks_exact(2).zip(dst.chunks_exact(2)).enumerate() {
                src_l[i] = s[0];
                src_r[i] = s[1];
                dst_l[i] = d[0];
                dst_r[i] = d[1];
            }

            // channel += input * gain
            let mixed_l =
                V::from_array(&src_l[..width]).mul_add(gain_l, V::from_array(&dst_l[..width]));
            let mixed_r =
                V::from_array(&src_r[..width]).mul_add(gain_r, V::from_array(&dst_r[..width]));
            mixed_l.write_to_slice(&mut dst_l[..width]);
            mixed_r.write_to_slice(&mut dst_r[..width]);

            // Merge the channels back into the interleaved block
            for (i, d) in dst.chunks_exact_mut(2).enumerate() {
                d[0] = dst_l[i];
                d[1] = dst_r[i];
            }
        }

        // Leftover frames
        let dst_tail = dst_blocks.into_remainder();
        let src_tail = src_blocks.remainder();
        for (d, s) in dst_tail.chunks_exact_mut(2).zip(src_tail.chunks_exact(2)) {
            d[0] += s[0] * left_gain;
            d[1] += s[1] * right_gain;
        }
    }

    /// Deinterleave one SIMD-width block of stereo samples into separate left/right slices.
    ///
    /// Automatically dispatches on the detected width (f32x8, f32x4, or scalar).
    /// Only the first `width` frames of `interleaved` are processed per call.
    /// NOTE(review): the implementation visible in this file copies samples with
    /// scalar loops; it does not issue explicit AVX2/SSE shuffle intrinsics.
    ///
    /// # Arguments
    /// * `interleaved` - Slice of interleaved stereo samples in LRLRLR... format
    /// * `left` - Output slice for left channel (must have capacity for at least width samples)
    /// * `right` - Output slice for right channel (must have capacity for at least width samples)
    ///
    /// # Returns
    /// Number of frames processed (width of SIMD vector), or 0 when `interleaved`
    /// holds fewer than `width * 2` samples.
    pub fn deinterleave_stereo(&self, interleaved: &[f32], left: &mut [f32], right: &mut [f32]) -> usize {
        match self.width {
            SimdWidth::X8 => self.deinterleave_stereo_impl::<f32x8>(interleaved, left, right),
            SimdWidth::X4 => self.deinterleave_stereo_impl::<f32x4>(interleaved, left, right),
            SimdWidth::Scalar => self.deinterleave_stereo_impl::<f32>(interleaved, left, right),
        }
    }

    /// Splits the first `V::LANES` stereo frames of `interleaved` into `left` and `right`.
    ///
    /// Returns `V::LANES` on success. Returns 0 without writing anything when
    /// `interleaved` holds fewer than `V::LANES * 2` samples or when either
    /// output slice is shorter than `V::LANES` (the same "too short means 0"
    /// contract as `interleave_stereo_impl`, instead of panicking on a short
    /// output slice).
    #[inline(always)]
    fn deinterleave_stereo_impl<V: SimdLanes>(
        &self,
        interleaved: &[f32],
        left: &mut [f32],
        right: &mut [f32],
    ) -> usize {
        let lanes = V::LANES;
        let samples_needed = lanes * 2;

        if interleaved.len() < samples_needed || left.len() < lanes || right.len() < lanes {
            return 0;
        }

        // A plain copy is all that is needed: routing the samples through a
        // vector load/store would not change them. The frame iterator yields
        // exactly `lanes` frames, which bounds both zipped outputs.
        let frames = interleaved[..samples_needed].chunks_exact(2);
        for ((frame, l), r) in frames.zip(left.iter_mut()).zip(right.iter_mut()) {
            *l = frame[0];
            *r = frame[1];
        }

        lanes
    }

    /// Interleave one SIMD-width block of separate left/right samples into stereo frames.
    ///
    /// Automatically dispatches on the detected width (f32x8, f32x4, or scalar).
    /// Only the first `width` samples of each channel are processed per call.
    /// NOTE(review): the implementation visible in this file copies samples with
    /// scalar loops; it does not issue explicit AVX2/SSE shuffle intrinsics.
    ///
    /// # Arguments
    /// * `left` - Left channel samples
    /// * `right` - Right channel samples
    /// * `output` - Output slice for interleaved stereo (must have capacity for width*2 samples)
    ///
    /// # Returns
    /// Number of stereo frames written (width of SIMD vector), or 0 when any of
    /// the three slices is too short for one block.
    pub fn interleave_stereo(&self, left: &[f32], right: &[f32], output: &mut [f32]) -> usize {
        match self.width {
            SimdWidth::X8 => self.interleave_stereo_impl::<f32x8>(left, right, output),
            SimdWidth::X4 => self.interleave_stereo_impl::<f32x4>(left, right, output),
            SimdWidth::Scalar => self.interleave_stereo_impl::<f32>(left, right, output),
        }
    }

    /// Merges the first `V::LANES` samples of `left` and `right` into
    /// interleaved frames at the start of `output`.
    ///
    /// Returns `V::LANES` on success. Returns 0 without writing anything when
    /// either input is shorter than `V::LANES` or `output` is shorter than
    /// `V::LANES * 2`.
    #[inline(always)]
    fn interleave_stereo_impl<V: SimdLanes>(
        &self,
        left: &[f32],
        right: &[f32],
        output: &mut [f32],
    ) -> usize {
        let lanes = V::LANES;
        let samples_needed = lanes * 2;

        if left.len() < lanes || right.len() < lanes || output.len() < samples_needed {
            return 0;
        }

        // A plain copy is all that is needed: routing the samples through a
        // vector load/store and stack temporaries would not change them. The
        // frame iterator yields exactly `lanes` frames, which bounds both inputs.
        let frames = output[..samples_needed].chunks_exact_mut(2);
        for ((frame, &l), &r) in frames.zip(left).zip(right) {
            frame[0] = l;
            frame[1] = r;
        }

        lanes
    }
}

impl Default for SimdDispatcher {
    fn default() -> Self {
        Self::detect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Detection must report a width between 1 and 8 inclusive.
    #[test]
    fn test_simd_detection() {
        let dispatcher = SimdDispatcher::detect();
        let lanes = dispatcher.width();
        // Should detect at least scalar (1) or better, and never more than 8
        assert!((1..=8).contains(&lanes));
        println!("Detected SIMD width: {}", lanes);
    }

    /// Scalar `f32` add and mul give the plain arithmetic results.
    #[test]
    fn test_scalar_lanes() {
        let (two, three) = (f32::splat(2.0), f32::splat(3.0));
        assert_eq!(two.add(three), 5.0);
        assert_eq!(two.mul(three), 6.0);
    }

    /// Adding two splatted `f32x4` vectors yields the sum in every lane.
    #[test]
    fn test_f32x4_lanes() {
        let sum = f32x4::splat(2.0).add(f32x4::splat(3.0));
        assert_eq!(sum.to_array(), [5.0; 4]);
    }

    /// Multiplying two splatted `f32x8` vectors yields the product in every lane.
    #[test]
    fn test_f32x8_lanes() {
        let product = f32x8::splat(2.0).mul(f32x8::splat(3.0));
        assert_eq!(product.to_array(), [6.0; 8]);
    }

    /// `write_to_slice` stores every lane of the vector into the destination.
    #[test]
    fn test_write_to_slice() {
        let mut out = vec![0.0; 4];
        f32x4::splat(42.0).write_to_slice(&mut out);
        assert_eq!(out, vec![42.0; 4]);
    }

    /// `sqrt` of a splatted 16.0 is 4.0 in every lane.
    #[test]
    fn test_sqrt() {
        let roots = f32x4::splat(16.0).sqrt().to_array();
        assert_eq!(roots, [4.0; 4]);
    }

    /// `mul_add` computes `a * b + c` per lane.
    #[test]
    fn test_mul_add() {
        let (factor, addend) = (f32x4::splat(3.0), f32x4::splat(1.0));
        // 2 * 3 + 1 = 7
        let fused = f32x4::splat(2.0).mul_add(factor, addend);
        assert_eq!(fused.to_array(), [7.0; 4]);
    }

    /// Every lane of `fast_tanh(1.0)` is within 0.01 of the standard library `tanh`.
    #[test]
    fn test_fast_tanh() {
        // Reference value: tanh(1.0) ≈ 0.7616
        let reference = 1.0f32.tanh();
        let approx = f32x4::splat(1.0).fast_tanh().to_array();
        assert!(
            approx.iter().all(|&lane| (lane - reference).abs() < 0.01),
            "fast_tanh accuracy check"
        );
    }

    /// `clamp` leaves in-range lanes alone and pins out-of-range lanes to the bounds.
    #[test]
    fn test_clamp() {
        let (lo, hi) = (f32x4::splat(0.0), f32x4::splat(2.0));
        let clamped = f32x4::from([0.5, 1.5, -0.5, 2.5]).clamp(lo, hi);
        // -0.5 is raised to the lower bound, 2.5 is lowered to the upper bound.
        assert_eq!(clamped.to_array(), [0.5, 1.5, 0.0, 2.0]);
    }

    /// The global dispatcher scales an 8-sample buffer in place.
    #[test]
    fn test_simd_multiply_const() {
        use crate::synthesis::simd::SIMD;

        let mut samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        // Double every sample through the dispatcher
        SIMD.multiply_const(&mut samples, 2.0);

        assert_eq!(samples, [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]);
    }

    /// The global dispatcher applies `x * 2.0 + 1.0` to every sample in place.
    #[test]
    fn test_simd_fma() {
        use crate::synthesis::simd::SIMD;

        let mut samples = vec![1.0, 2.0, 3.0, 4.0];

        // FMA: samples = samples * 2.0 + 1.0
        SIMD.fma(&mut samples, 2.0, 1.0);

        assert_eq!(samples, [3.0, 5.0, 7.0, 9.0]);
    }

    /// In-place `apply_fast_tanh` stays within 0.01 of the standard library `tanh`.
    #[test]
    fn test_simd_fast_tanh() {
        use crate::synthesis::simd::SIMD;

        let inputs = [0.0f32, 1.0, -1.0, 0.5];
        let mut shaped = inputs.to_vec();

        SIMD.apply_fast_tanh(&mut shaped);

        // Each output is compared with the exact tanh of its own input
        for (&x, &y) in inputs.iter().zip(&shaped) {
            assert!((y - x.tanh()).abs() < 0.01);
        }
    }

    /// A buffer whose length is not a multiple of 4 or 8 is still fully processed.
    #[test]
    fn test_simd_non_aligned_buffer() {
        use crate::synthesis::simd::SIMD;

        // Three samples: not a multiple of the 4- or 8-lane widths
        let mut samples = vec![1.0, 2.0, 3.0];

        SIMD.multiply_const(&mut samples, 2.0);

        assert_eq!(samples, [2.0, 4.0, 6.0]);
    }

    /// `fast_sin` is within 0.001 of the reference at 0, ±π/2 and π/4, for both
    /// the scalar and the `f32x4` implementations.
    #[test]
    fn test_fast_sin_accuracy() {
        use std::f32::consts::PI;

        let inputs: [f32; 4] = [0.0, PI / 2.0, -PI / 2.0, PI / 4.0];
        let expected: [f32; 4] = [0.0, 1.0, -1.0, (PI / 4.0).sin()];

        // Scalar version, one input at a time
        for (&x, &want) in inputs.iter().zip(&expected) {
            assert!((f32::fast_sin(x) - want).abs() < 0.001);
        }

        // f32x4 version: the same four inputs in one vector
        let lanes = f32x4::from_array(&inputs).fast_sin().to_array();
        for (&got, &want) in lanes.iter().zip(&expected) {
            assert!((got - want).abs() < 0.001);
        }
    }

    /// `fast_cos` is checked at 0, π, π/2 and π/4 for both the scalar and the
    /// `f32x8` implementations, with a loose tolerance at π.
    #[test]
    fn test_fast_cos_accuracy() {
        use std::f32::consts::PI;

        // (input, expected, tolerance)
        let cases: [(f32, f32, f32); 4] = [
            (0.0, 1.0, 0.001),
            (PI, -1.0, 0.25), // Taylor series ~21% error at π
            (PI / 2.0, 0.0, 0.001),
            (PI / 4.0, (PI / 4.0).cos(), 0.001),
        ];

        // Scalar version
        for &(x, want, tol) in &cases {
            assert!((f32::fast_cos(x) - want).abs() < tol);
        }

        // f32x8 version: the first four lanes mirror `cases`; the zip below stops
        // after them, so only those lanes are asserted.
        let inputs = [0.0, PI, PI / 2.0, PI / 4.0, -PI / 4.0, PI / 6.0, 0.5, 1.0];
        let lanes = f32x8::from_array(&inputs).fast_cos().to_array();
        for (&got, &(_, want, tol)) in lanes.iter().zip(&cases) {
            assert!((got - want).abs() < tol);
        }
    }

    /// Compares the scalar `fast_sin`/`fast_cos` with the standard library.
    ///
    /// Inputs in `[-π/2, π/2]` must agree within 0.002; the values at ±π are
    /// checked separately with looser bounds.
    #[test]
    fn test_fast_sincos_vs_stdlib() {
        use std::f32::consts::PI;

        // Test a range of values - accuracy decreases near ±π
        let test_values = [
            -PI / 2.0, -PI / 4.0, -PI / 6.0, 0.0, PI / 6.0, PI / 4.0, PI / 2.0,
        ];

        for &x in &test_values {
            let fast_sin = f32::fast_sin(x);
            let std_sin = x.sin();
            // Argument order follows the placeholders: input, fast value, input, std value.
            assert!(
                (fast_sin - std_sin).abs() < 0.002,
                "fast_sin({}) = {} vs sin({}) = {}, error = {}",
                x,
                fast_sin,
                x,
                std_sin,
                (fast_sin - std_sin).abs()
            );

            let fast_cos = f32::fast_cos(x);
            let std_cos = x.cos();
            assert!(
                (fast_cos - std_cos).abs() < 0.002,
                "fast_cos({}) = {} vs cos({}) = {}, error = {}",
                x,
                fast_cos,
                x,
                std_cos,
                (fast_cos - std_cos).abs()
            );
        }

        // Test edges with looser tolerance
        // Taylor series centered at 0 has ~20% error at ±π, but this is acceptable
        // because phases wrap to [-π, π] and most values are near 0 after wrapping
        assert!((f32::fast_sin(PI) - PI.sin()).abs() < 0.08);
        assert!((f32::fast_sin(-PI) - (-PI).sin()).abs() < 0.08);
        assert!((f32::fast_cos(PI) - PI.cos()).abs() < 0.25); // ~21% error at π
    }
}