ruvllm 2.2.1

LLM serving runtime with Ruvector integration - Paged attention, KV cache, and SONA learning
//! SIMD Kernels for Pi-Quantization Dequantization
//!
//! Implements high-performance dequantization kernels for Pi-constant quantization
//! (PiQ3/PiQ2) with architecture-specific optimizations.
//!
//! ## Architecture Support
//!
//! | Architecture | Kernel | Registers | Throughput Target |
//! |--------------|--------|-----------|-------------------|
//! | Scalar       | Reference | N/A | Baseline |
//! | ARM NEON     | `pi_dequantize_neon` | v0-v31 | >10 GB/s |
//! | x86_64 AVX-512 | `pi_dequantize_avx512` | zmm0-31 | >12 GB/s |
//! | x86_64 AVX2  | `pi_dequantize_avx2` | ymm0-15 | >8 GB/s |
//!
//! ## Accuracy Guarantee (INV-8)
//!
//! All SIMD kernels produce output within 1 ULP (Unit in Last Place) of the
//! scalar reference implementation. This is verified by the equivalence tests.
//!
//! ## Pi-Quantization Format
//!
//! Packed 3-bit values: 3 bytes encode 8 quantized integers in [-4, +3] range.
//! Dequantization: `output[i] = packed_int[i] * scale`
//!
//! Where scale typically equals `alpha * pi / k` for adaptive Pi-scaling.
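//!
//! ## Quick Example
//!
//! A minimal round-trip sketch using the dispatch entry points in this module
//! (`pi_quantize`, `pi_dequantize`, `pi_scale`), quantizing one 8-value group:
//!
//! ```rust,ignore
//! use ruvllm::quantize::pi_quant_simd::{pi_dequantize, pi_quantize, pi_scale};
//!
//! // One group: 8 f32 values pack into 3 bytes.
//! let scale = pi_scale(4); // step size pi / 4
//! let weights: Vec<f32> = (-4..=3).map(|v| v as f32 * scale).collect();
//! let mut packed = vec![0u8; 3];
//! let mut restored = vec![0.0f32; 8];
//!
//! pi_quantize(&weights, scale, &mut packed);    // best available quantize kernel
//! pi_dequantize(&packed, scale, &mut restored); // best available dequantize kernel
//!
//! for (w, r) in weights.iter().zip(&restored) {
//!     assert!((w - r).abs() < 1e-6); // exact on the quantization grid
//! }
//! ```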

use std::f32::consts::PI;

// ============================================================================
// Constants
// ============================================================================

/// Default quantization parameter k (step size = pi/k)
pub const DEFAULT_K: u32 = 4;

/// Pi constant for calculations
pub const PI_F32: f32 = PI;

/// Number of packed values per 3-byte group (3 bits * 8 = 24 bits = 3 bytes)
pub const PI3_VALUES_PER_GROUP: usize = 8;

/// Bytes per packed group for 3-bit quantization
pub const PI3_BYTES_PER_GROUP: usize = 3;

/// Precomputed signed values for 3-bit extraction: maps [0,7] -> [-4,+3] as i8
/// Used for fast lookup-based dequantization
const SIGNED_3BIT_LUT: [i8; 8] = [-4, -3, -2, -1, 0, 1, 2, 3];

// ============================================================================
// Scalar Reference Implementation
// ============================================================================

/// Scalar reference dequantization for Pi-quantized data.
///
/// Unpacks 3-bit signed integers from packed format and multiplies by scale.
/// This serves as the ground truth for SIMD kernel validation.
///
/// # Packed Format (3-bit)
///
/// 8 values are packed into 3 bytes (24 bits total), LSB-first: value `i`
/// occupies bits `3*i .. 3*i+2` of the little-endian word `b0 | (b1 << 8) | (b2 << 16)`.
/// ```text
/// Byte 0: bits 0-2: v0, bits 3-5: v1, bits 6-7: v2 (low 2 bits)
/// Byte 1: bit 0: v2 (high bit), bits 1-3: v3, bits 4-6: v4, bit 7: v5 (low bit)
/// Byte 2: bits 0-1: v5 (high 2 bits), bits 2-4: v6, bits 5-7: v7
/// ```
///
/// Each 3-bit value is sign-extended from the range [0, 7] to [-4, +3]:
/// - 0b000 (0) -> -4
/// - 0b001 (1) -> -3
/// - 0b010 (2) -> -2
/// - 0b011 (3) -> -1
/// - 0b100 (4) ->  0
/// - 0b101 (5) -> +1
/// - 0b110 (6) -> +2
/// - 0b111 (7) -> +3
///
/// # Arguments
///
/// * `packed` - Packed 3-bit quantized values (length must be multiple of 3)
/// * `scale` - Dequantization scale factor (typically alpha * pi / k)
/// * `output` - Output f32 buffer (length must be 8/3 * packed.len())
///
/// # Panics
///
/// Panics if packed length is not a multiple of 3 or output length doesn't match.
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::quantize::pi_quant_simd::pi_dequantize_scalar;
///
/// let packed = vec![0u8; 3]; // 8 zero-values
/// let scale = std::f32::consts::PI / 4.0;
/// let mut output = vec![0.0f32; 8];
///
/// pi_dequantize_scalar(&packed, scale, &mut output);
/// ```
pub fn pi_dequantize_scalar(packed: &[u8], scale: f32, output: &mut [f32]) {
    assert!(
        packed.len() % PI3_BYTES_PER_GROUP == 0,
        "Packed length {} must be a multiple of {}",
        packed.len(),
        PI3_BYTES_PER_GROUP
    );

    let num_groups = packed.len() / PI3_BYTES_PER_GROUP;
    let expected_output_len = num_groups * PI3_VALUES_PER_GROUP;

    assert_eq!(
        output.len(),
        expected_output_len,
        "Output length {} doesn't match expected {} (from {} packed bytes)",
        output.len(),
        expected_output_len,
        packed.len()
    );

    for group in 0..num_groups {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;

        // Read 3 bytes for this group
        let b0 = packed[byte_offset] as u32;
        let b1 = packed[byte_offset + 1] as u32;
        let b2 = packed[byte_offset + 2] as u32;

        // Unpack 8 x 3-bit values
        // Layout: pack 8 values into 24 bits, LSB-first within each value
        //
        // Bit layout (0-indexed from LSB of combined 24-bit value):
        // v0: bits 0-2,   v1: bits 3-5,   v2: bits 6-8
        // v3: bits 9-11,  v4: bits 12-14, v5: bits 15-17
        // v6: bits 18-20, v7: bits 21-23
        let combined = b0 | (b1 << 8) | (b2 << 16);

        for i in 0..8 {
            let shift = i * 3;
            let raw = ((combined >> shift) & 0x7) as i32;
            // Sign-extend: 0-7 maps to -4 to +3
            let signed = raw - 4;
            output[out_offset + i] = (signed as f32) * scale;
        }
    }
}

/// Extract a single 3-bit value from packed data at a given index.
///
/// Useful for validation and debugging.
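///
/// # Example
///
/// A quick sketch: an all-zero group stores the raw code 0 in every slot,
/// which decodes to -4 after the bias.
///
/// ```rust,ignore
/// let packed = vec![0u8; 3];
/// assert_eq!(extract_pi3_value(&packed, 0), -4);
/// ```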
#[inline]
pub fn extract_pi3_value(packed: &[u8], index: usize) -> i8 {
    let group = index / PI3_VALUES_PER_GROUP;
    let offset_in_group = index % PI3_VALUES_PER_GROUP;
    let byte_offset = group * PI3_BYTES_PER_GROUP;

    let b0 = packed[byte_offset] as u32;
    let b1 = packed[byte_offset + 1] as u32;
    let b2 = packed[byte_offset + 2] as u32;
    let combined = b0 | (b1 << 8) | (b2 << 16);

    let shift = offset_in_group * 3;
    let raw = ((combined >> shift) & 0x7) as i32;
    (raw - 4) as i8
}

// ============================================================================
// ARM NEON Implementation
// ============================================================================

/// ARM NEON dequantization kernel for Pi-quantized data.
///
/// Processes 64 values (24 bytes packed) per iteration using NEON SIMD with:
/// - 8x loop unrolling for maximum instruction-level parallelism
/// - Software prefetching 2 cache lines ahead
/// - Interleaved operations to hide latencies
/// - Direct scalar bit extraction (avoiding memory round-trips)
///
/// # Safety
///
/// This function uses raw NEON intrinsics. Caller must ensure:
/// - Running on aarch64 with NEON support
/// - All slice bounds are valid
/// - Output buffer has sufficient capacity
///
/// # Performance
///
/// Achieves >10 GB/s throughput on Apple M1/M2/M4 chips by:
/// - Processing 64 values per iteration (8 groups of 8)
/// - Using interleaved NEON operations to saturate execution units
/// - Prefetching to hide memory latency
/// - Minimizing data dependencies between operations
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
pub unsafe fn pi_dequantize_neon(packed: &[u8], scale: f32, output: &mut [f32]) {
    use core::arch::aarch64::*;

    let num_groups = packed.len() / PI3_BYTES_PER_GROUP;
    let total_values = num_groups * PI3_VALUES_PER_GROUP;

    assert_eq!(
        output.len(),
        total_values,
        "Output length mismatch: {} vs expected {}",
        output.len(),
        total_values
    );

    if num_groups == 0 {
        return;
    }

    // Broadcast scale to all lanes
    let scale_vec = vdupq_n_f32(scale);

    // Precompute bias * scale for FMA: result = raw_u32 * scale + bias_scaled
    // Where bias_scaled = -4.0 * scale
    let bias_scaled = vdupq_n_f32(-4.0 * scale);
    let bias_f32 = vdupq_n_f32(-4.0);

    // Precompute shift vectors (hoist outside loop)
    let shifts_lo: int32x4_t = vld1q_s32([0i32, -3, -6, -9].as_ptr());
    let shifts_hi: int32x4_t = vld1q_s32([-12i32, -15, -18, -21].as_ptr());
    let mask_3bit = vdupq_n_u32(0x7);

    // Prefetch distance: 4 cache lines = 256 bytes ahead
    const PREFETCH_DISTANCE: usize = 256;

    let packed_ptr = packed.as_ptr();
    let output_ptr = output.as_mut_ptr();

    let mut group = 0usize;

    // Main hot loop: process 8 groups (64 values, 24 bytes) per iteration
    // Using FMA: result = raw_u32 * scale + bias_scaled
    while group + 8 <= num_groups {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;
        let p = packed_ptr.add(byte_offset);
        let o = output_ptr.add(out_offset);

        // Prefetch next iteration
        if byte_offset + PREFETCH_DISTANCE < packed.len() {
            let prefetch_addr = packed_ptr.add(byte_offset + PREFETCH_DISTANCE);
            core::arch::asm!(
                "prfm pldl1keep, [{addr}]",
                "prfm pldl1keep, [{addr}, #64]",
                addr = in(reg) prefetch_addr,
                options(nostack, preserves_flags)
            );
        }

        // Load all 8 groups' packed bytes (24 bytes total)
        let c0 = (*p as u32) | ((*p.add(1) as u32) << 8) | ((*p.add(2) as u32) << 16);
        let c1 = (*p.add(3) as u32) | ((*p.add(4) as u32) << 8) | ((*p.add(5) as u32) << 16);
        let c2 = (*p.add(6) as u32) | ((*p.add(7) as u32) << 8) | ((*p.add(8) as u32) << 16);
        let c3 = (*p.add(9) as u32) | ((*p.add(10) as u32) << 8) | ((*p.add(11) as u32) << 16);
        let c4 = (*p.add(12) as u32) | ((*p.add(13) as u32) << 8) | ((*p.add(14) as u32) << 16);
        let c5 = (*p.add(15) as u32) | ((*p.add(16) as u32) << 8) | ((*p.add(17) as u32) << 16);
        let c6 = (*p.add(18) as u32) | ((*p.add(19) as u32) << 8) | ((*p.add(20) as u32) << 16);
        let c7 = (*p.add(21) as u32) | ((*p.add(22) as u32) << 8) | ((*p.add(23) as u32) << 16);

        // Process all 8 groups using FMA: result = raw * scale + bias_scaled
        // Group 0
        let v0 = vdupq_n_u32(c0);
        let lo0 = vandq_u32(vshlq_u32(v0, shifts_lo), mask_3bit);
        let hi0 = vandq_u32(vshlq_u32(v0, shifts_hi), mask_3bit);
        vst1q_f32(o, vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo0), scale_vec));
        vst1q_f32(
            o.add(4),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi0), scale_vec),
        );

        // Group 1
        let v1 = vdupq_n_u32(c1);
        let lo1 = vandq_u32(vshlq_u32(v1, shifts_lo), mask_3bit);
        let hi1 = vandq_u32(vshlq_u32(v1, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(8),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo1), scale_vec),
        );
        vst1q_f32(
            o.add(12),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi1), scale_vec),
        );

        // Group 2
        let v2 = vdupq_n_u32(c2);
        let lo2 = vandq_u32(vshlq_u32(v2, shifts_lo), mask_3bit);
        let hi2 = vandq_u32(vshlq_u32(v2, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(16),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo2), scale_vec),
        );
        vst1q_f32(
            o.add(20),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi2), scale_vec),
        );

        // Group 3
        let v3 = vdupq_n_u32(c3);
        let lo3 = vandq_u32(vshlq_u32(v3, shifts_lo), mask_3bit);
        let hi3 = vandq_u32(vshlq_u32(v3, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(24),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo3), scale_vec),
        );
        vst1q_f32(
            o.add(28),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi3), scale_vec),
        );

        // Group 4
        let v4 = vdupq_n_u32(c4);
        let lo4 = vandq_u32(vshlq_u32(v4, shifts_lo), mask_3bit);
        let hi4 = vandq_u32(vshlq_u32(v4, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(32),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo4), scale_vec),
        );
        vst1q_f32(
            o.add(36),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi4), scale_vec),
        );

        // Group 5
        let v5 = vdupq_n_u32(c5);
        let lo5 = vandq_u32(vshlq_u32(v5, shifts_lo), mask_3bit);
        let hi5 = vandq_u32(vshlq_u32(v5, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(40),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo5), scale_vec),
        );
        vst1q_f32(
            o.add(44),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi5), scale_vec),
        );

        // Group 6
        let v6 = vdupq_n_u32(c6);
        let lo6 = vandq_u32(vshlq_u32(v6, shifts_lo), mask_3bit);
        let hi6 = vandq_u32(vshlq_u32(v6, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(48),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo6), scale_vec),
        );
        vst1q_f32(
            o.add(52),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi6), scale_vec),
        );

        // Group 7
        let v7 = vdupq_n_u32(c7);
        let lo7 = vandq_u32(vshlq_u32(v7, shifts_lo), mask_3bit);
        let hi7 = vandq_u32(vshlq_u32(v7, shifts_hi), mask_3bit);
        vst1q_f32(
            o.add(56),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(lo7), scale_vec),
        );
        vst1q_f32(
            o.add(60),
            vfmaq_f32(bias_scaled, vcvtq_f32_u32(hi7), scale_vec),
        );

        group += 8;
    }

    // Handle remaining groups
    while group < num_groups {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;

        let combined = neon_load_combined_24bit(packed_ptr.add(byte_offset));
        let (lo, hi) = neon_extract_and_convert(combined, bias_f32, scale_vec);

        vst1q_f32(output_ptr.add(out_offset), lo);
        vst1q_f32(output_ptr.add(out_offset + 4), hi);

        group += 1;
    }
}

/// Load 3 bytes as a combined 24-bit value (optimized for ARM)
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_load_combined_24bit(ptr: *const u8) -> u32 {
    // Use unaligned load - ARM handles this efficiently
    let b0 = *ptr as u32;
    let b1 = *ptr.add(1) as u32;
    let b2 = *ptr.add(2) as u32;
    b0 | (b1 << 8) | (b2 << 16)
}

/// Extract 8 x 3-bit values and convert to f32 with bias and scale
/// Returns (low 4 floats, high 4 floats) as float32x4_t
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_extract_and_convert(
    combined: u32,
    bias_f32: core::arch::aarch64::float32x4_t,
    scale_vec: core::arch::aarch64::float32x4_t,
) -> (
    core::arch::aarch64::float32x4_t,
    core::arch::aarch64::float32x4_t,
) {
    use core::arch::aarch64::*;

    // OPTIMIZED: Use NEON operations instead of scalar extraction
    let c_vec = vdupq_n_u32(combined);
    let mask_3bit = vdupq_n_u32(0x7);

    // Shift amounts for each lane (negate for right shift via vshlq)
    let shifts_lo = vld1q_s32([0i32, -3, -6, -9].as_ptr());
    let shifts_hi = vld1q_s32([-12i32, -15, -18, -21].as_ptr());

    // Extract 3-bit values using variable shifts and mask
    let lo_u32 = vandq_u32(vshlq_u32(c_vec, shifts_lo), mask_3bit);
    let hi_u32 = vandq_u32(vshlq_u32(c_vec, shifts_hi), mask_3bit);

    // Convert to float and apply bias+scale
    let lo_f32 = vcvtq_f32_u32(lo_u32);
    let hi_f32 = vcvtq_f32_u32(hi_u32);

    let biased_lo = vaddq_f32(lo_f32, bias_f32);
    let biased_hi = vaddq_f32(hi_f32, bias_f32);

    let result_lo = vmulq_f32(biased_lo, scale_vec);
    let result_hi = vmulq_f32(biased_hi, scale_vec);

    (result_lo, result_hi)
}

// ============================================================================
// x86_64 AVX-512 Implementation
// ============================================================================

/// x86_64 AVX-512 dequantization kernel for Pi-quantized data.
///
/// Processes 64 values (24 bytes packed) per iteration using AVX-512 SIMD.
/// Falls back to scalar for non-aligned remainders.
///
/// # Safety
///
/// This function uses raw AVX-512 intrinsics. Caller must ensure:
/// - Running on x86_64 with AVX-512F support (checked at runtime via dispatch)
/// - All slice bounds are valid
/// - Output buffer has sufficient capacity
///
/// # Performance
///
/// Achieves >12 GB/s throughput on modern Intel/AMD CPUs with AVX-512 by:
/// - Processing 16 values per AVX-512 vector (512-bit)
/// - Using _mm512_cvtepi32_ps for fast int-to-float conversion
/// - Scaling with a single _mm512_mul_ps per vector
/// - 2x theoretical throughput over AVX2 with wider registers
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
pub unsafe fn pi_dequantize_avx512(packed: &[u8], scale: f32, output: &mut [f32]) {
    use core::arch::x86_64::*;

    let num_groups = packed.len() / PI3_BYTES_PER_GROUP;
    let total_values = num_groups * PI3_VALUES_PER_GROUP;

    assert_eq!(
        output.len(),
        total_values,
        "Output length mismatch: {} vs expected {}",
        output.len(),
        total_values
    );

    if num_groups == 0 {
        return;
    }

    // Broadcast scale to all 16 lanes (512-bit register)
    let scale_vec = _mm512_set1_ps(scale);

    // Bias for sign extension: -4 in all 16 lanes
    let bias_vec = _mm512_set1_epi32(-4);

    // Process 8 groups (64 values) at a time for maximum AVX-512 throughput
    // 8 groups * 8 values = 64 values = 4 * 16-wide vectors
    let simd_groups = num_groups / 8;
    let mut group = 0usize;

    while group < simd_groups * 8 {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;

        // Process 8 groups = 24 bytes = 64 values
        // We'll process in 4 iterations of 16 values each (2 groups per 16-wide vector)
        for batch in 0..4 {
            let g0 = batch * 2;
            let g1 = batch * 2 + 1;

            // First group of the pair
            let gb0 = byte_offset + g0 * 3;
            let b0_0 = *packed.get_unchecked(gb0) as u32;
            let b0_1 = *packed.get_unchecked(gb0 + 1) as u32;
            let b0_2 = *packed.get_unchecked(gb0 + 2) as u32;
            let combined0 = b0_0 | (b0_1 << 8) | (b0_2 << 16);

            // Extract 8 x 3-bit values from first group
            let v0_0 = (combined0 & 0x7) as i32;
            let v0_1 = ((combined0 >> 3) & 0x7) as i32;
            let v0_2 = ((combined0 >> 6) & 0x7) as i32;
            let v0_3 = ((combined0 >> 9) & 0x7) as i32;
            let v0_4 = ((combined0 >> 12) & 0x7) as i32;
            let v0_5 = ((combined0 >> 15) & 0x7) as i32;
            let v0_6 = ((combined0 >> 18) & 0x7) as i32;
            let v0_7 = ((combined0 >> 21) & 0x7) as i32;

            // Second group of the pair
            let gb1 = byte_offset + g1 * 3;
            let b1_0 = *packed.get_unchecked(gb1) as u32;
            let b1_1 = *packed.get_unchecked(gb1 + 1) as u32;
            let b1_2 = *packed.get_unchecked(gb1 + 2) as u32;
            let combined1 = b1_0 | (b1_1 << 8) | (b1_2 << 16);

            // Extract 8 x 3-bit values from second group
            let v1_0 = (combined1 & 0x7) as i32;
            let v1_1 = ((combined1 >> 3) & 0x7) as i32;
            let v1_2 = ((combined1 >> 6) & 0x7) as i32;
            let v1_3 = ((combined1 >> 9) & 0x7) as i32;
            let v1_4 = ((combined1 >> 12) & 0x7) as i32;
            let v1_5 = ((combined1 >> 15) & 0x7) as i32;
            let v1_6 = ((combined1 >> 18) & 0x7) as i32;
            let v1_7 = ((combined1 >> 21) & 0x7) as i32;

            // Load all 16 values into AVX-512 vector
            let raw_vec = _mm512_setr_epi32(
                v0_0, v0_1, v0_2, v0_3, v0_4, v0_5, v0_6, v0_7, v1_0, v1_1, v1_2, v1_3, v1_4, v1_5,
                v1_6, v1_7,
            );

            // Apply bias (sign extension: raw - 4)
            let signed_vec = _mm512_add_epi32(raw_vec, bias_vec);

            // Convert i32 to f32
            let float_vec = _mm512_cvtepi32_ps(signed_vec);

            // Multiply by scale
            let result_vec = _mm512_mul_ps(float_vec, scale_vec);

            // Store results (unaligned store for safety)
            let go = out_offset + batch * 16;
            _mm512_storeu_ps(output.as_mut_ptr().add(go), result_vec);
        }

        group += 8;
    }

    // Handle remaining groups with scalar fallback
    while group < num_groups {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;

        let b0 = *packed.get_unchecked(byte_offset) as u32;
        let b1 = *packed.get_unchecked(byte_offset + 1) as u32;
        let b2 = *packed.get_unchecked(byte_offset + 2) as u32;
        let combined = b0 | (b1 << 8) | (b2 << 16);

        for i in 0..8 {
            let shift = i * 3;
            let raw = ((combined >> shift) & 0x7) as i32;
            let signed = raw - 4;
            *output.get_unchecked_mut(out_offset + i) = (signed as f32) * scale;
        }

        group += 1;
    }
}

/// x86_64 AVX-512 quantization kernel for Pi-quantized data.
///
/// Quantizes f32 values to packed 3-bit format using AVX-512 SIMD.
///
/// # Safety
///
/// This function uses raw AVX-512 intrinsics. Caller must ensure:
/// - Running on x86_64 with AVX-512F support (checked at runtime via dispatch)
/// - All slice bounds are valid
/// - Output buffer has sufficient capacity
///
/// # Performance
///
/// Uses 512-bit wide operations for efficient quantization of large weight tensors.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
pub unsafe fn pi_quantize_avx512(weights: &[f32], scale: f32, output: &mut [u8]) {
    use core::arch::x86_64::*;

    assert!(
        weights.len() % PI3_VALUES_PER_GROUP == 0,
        "Weights length must be multiple of 8"
    );

    let num_groups = weights.len() / PI3_VALUES_PER_GROUP;
    assert_eq!(
        output.len(),
        num_groups * PI3_BYTES_PER_GROUP,
        "Output buffer size mismatch"
    );

    if num_groups == 0 {
        return;
    }

    let inv_scale = if scale.abs() > 1e-10 {
        1.0 / scale
    } else {
        0.0
    };

    // Broadcast inverse scale to all 16 lanes
    let inv_scale_vec = _mm512_set1_ps(inv_scale);

    // Bias for conversion: add 4 to shift [-4, +3] to [0, 7]
    let bias_vec = _mm512_set1_epi32(4);

    // Clamp bounds
    let min_vec = _mm512_set1_epi32(0);
    let max_vec = _mm512_set1_epi32(7);

    // Rounding mode constant (nearest) - must be const for AVX-512 intrinsics
    const ROUNDING: i32 = 0x08; // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC

    // Process 2 groups (16 values) at a time
    let simd_groups = num_groups / 2;
    let mut group = 0usize;

    while group < simd_groups * 2 {
        let val_offset = group * PI3_VALUES_PER_GROUP;
        let byte_offset = group * PI3_BYTES_PER_GROUP;

        // Load 16 floats
        let weights_vec = _mm512_loadu_ps(weights.as_ptr().add(val_offset));

        // Quantize: q = round(w * inv_scale)
        let scaled_vec = _mm512_mul_ps(weights_vec, inv_scale_vec);
        // IMM8 is a const-generic parameter on this intrinsic, so pass it via turbofish
        let rounded_vec = _mm512_roundscale_ps::<ROUNDING>(scaled_vec);
        let quantized_vec = _mm512_cvtps_epi32(rounded_vec);

        // Add bias: [−4, +3] -> [0, 7]
        let biased_vec = _mm512_add_epi32(quantized_vec, bias_vec);

        // Clamp to [0, 7]
        let clamped_vec = _mm512_max_epi32(_mm512_min_epi32(biased_vec, max_vec), min_vec);

        // Extract values to array for packing
        let mut values = [0i32; 16];
        _mm512_storeu_si512(values.as_mut_ptr() as *mut __m512i, clamped_vec);

        // Pack first group (values 0-7)
        let combined0: u32 = (values[0] as u32 & 0x7)
            | ((values[1] as u32 & 0x7) << 3)
            | ((values[2] as u32 & 0x7) << 6)
            | ((values[3] as u32 & 0x7) << 9)
            | ((values[4] as u32 & 0x7) << 12)
            | ((values[5] as u32 & 0x7) << 15)
            | ((values[6] as u32 & 0x7) << 18)
            | ((values[7] as u32 & 0x7) << 21);

        *output.get_unchecked_mut(byte_offset) = (combined0 & 0xFF) as u8;
        *output.get_unchecked_mut(byte_offset + 1) = ((combined0 >> 8) & 0xFF) as u8;
        *output.get_unchecked_mut(byte_offset + 2) = ((combined0 >> 16) & 0xFF) as u8;

        // Pack second group (values 8-15)
        let combined1: u32 = (values[8] as u32 & 0x7)
            | ((values[9] as u32 & 0x7) << 3)
            | ((values[10] as u32 & 0x7) << 6)
            | ((values[11] as u32 & 0x7) << 9)
            | ((values[12] as u32 & 0x7) << 12)
            | ((values[13] as u32 & 0x7) << 15)
            | ((values[14] as u32 & 0x7) << 18)
            | ((values[15] as u32 & 0x7) << 21);

        *output.get_unchecked_mut(byte_offset + 3) = (combined1 & 0xFF) as u8;
        *output.get_unchecked_mut(byte_offset + 4) = ((combined1 >> 8) & 0xFF) as u8;
        *output.get_unchecked_mut(byte_offset + 5) = ((combined1 >> 16) & 0xFF) as u8;

        group += 2;
    }

    // Handle remaining group with scalar fallback
    while group < num_groups {
        let val_offset = group * PI3_VALUES_PER_GROUP;
        let byte_offset = group * PI3_BYTES_PER_GROUP;

        let mut combined: u32 = 0;

        for i in 0..8 {
            let v = *weights.get_unchecked(val_offset + i);
            // Quantize: round(v / scale) then clamp to [-4, +3]
            let quantized = (v * inv_scale).round() as i32;
            let clamped = quantized.clamp(-4, 3);
            // Convert to unsigned 3-bit: add 4 to get [0, 7]
            let unsigned = (clamped + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }

        *output.get_unchecked_mut(byte_offset) = (combined & 0xFF) as u8;
        *output.get_unchecked_mut(byte_offset + 1) = ((combined >> 8) & 0xFF) as u8;
        *output.get_unchecked_mut(byte_offset + 2) = ((combined >> 16) & 0xFF) as u8;

        group += 1;
    }
}

// ============================================================================
// x86_64 AVX2 Implementation
// ============================================================================

/// x86_64 AVX2 dequantization kernel for Pi-quantized data.
///
/// Processes 32 values (12 bytes packed) per iteration using AVX2 SIMD.
/// Falls back to scalar for non-aligned remainders.
///
/// # Safety
///
/// This function uses raw AVX2 intrinsics. Caller must ensure:
/// - Running on x86_64 with AVX2 support (checked at runtime via dispatch)
/// - All slice bounds are valid
/// - Output buffer has sufficient capacity
///
/// # Performance
///
/// Achieves >8 GB/s throughput on modern Intel/AMD CPUs by:
/// - Processing 8 values per AVX2 vector
/// - Using _mm256_cvtepi32_ps for fast int-to-float conversion
/// - Scaling with a single _mm256_mul_ps per vector
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
pub unsafe fn pi_dequantize_avx2(packed: &[u8], scale: f32, output: &mut [f32]) {
    use core::arch::x86_64::*;

    let num_groups = packed.len() / PI3_BYTES_PER_GROUP;
    let total_values = num_groups * PI3_VALUES_PER_GROUP;

    assert_eq!(
        output.len(),
        total_values,
        "Output length mismatch: {} vs expected {}",
        output.len(),
        total_values
    );

    if num_groups == 0 {
        return;
    }

    // Broadcast scale to all 8 lanes
    let scale_vec = _mm256_set1_ps(scale);

    // Bias for sign extension: -4 in all lanes
    let bias_vec = _mm256_set1_epi32(-4);

    // Process 4 groups (32 values) at a time
    let simd_groups = num_groups / 4;
    let mut group = 0usize;

    while group < simd_groups * 4 {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;

        // Process 4 groups
        for g in 0..4 {
            let gb = byte_offset + g * 3;
            let go = out_offset + g * 8;

            let b0 = *packed.get_unchecked(gb) as u32;
            let b1 = *packed.get_unchecked(gb + 1) as u32;
            let b2 = *packed.get_unchecked(gb + 2) as u32;
            let combined = b0 | (b1 << 8) | (b2 << 16);

            // Extract 8 x 3-bit values
            let v0 = (combined & 0x7) as i32;
            let v1 = ((combined >> 3) & 0x7) as i32;
            let v2 = ((combined >> 6) & 0x7) as i32;
            let v3 = ((combined >> 9) & 0x7) as i32;
            let v4 = ((combined >> 12) & 0x7) as i32;
            let v5 = ((combined >> 15) & 0x7) as i32;
            let v6 = ((combined >> 18) & 0x7) as i32;
            let v7 = ((combined >> 21) & 0x7) as i32;

            // Load into AVX2 vector
            let raw_vec = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);

            // Apply bias (sign extension: raw - 4)
            let signed_vec = _mm256_add_epi32(raw_vec, bias_vec);

            // Convert i32 to f32
            let float_vec = _mm256_cvtepi32_ps(signed_vec);

            // Multiply by scale
            let result_vec = _mm256_mul_ps(float_vec, scale_vec);

            // Store results (unaligned store for safety)
            _mm256_storeu_ps(output.as_mut_ptr().add(go), result_vec);
        }

        group += 4;
    }

    // Handle remaining groups with scalar fallback
    while group < num_groups {
        let byte_offset = group * PI3_BYTES_PER_GROUP;
        let out_offset = group * PI3_VALUES_PER_GROUP;

        let b0 = *packed.get_unchecked(byte_offset) as u32;
        let b1 = *packed.get_unchecked(byte_offset + 1) as u32;
        let b2 = *packed.get_unchecked(byte_offset + 2) as u32;
        let combined = b0 | (b1 << 8) | (b2 << 16);

        for i in 0..8 {
            let shift = i * 3;
            let raw = ((combined >> shift) & 0x7) as i32;
            let signed = raw - 4;
            *output.get_unchecked_mut(out_offset + i) = (signed as f32) * scale;
        }

        group += 1;
    }
}

// ============================================================================
// Runtime Dispatch
// ============================================================================

/// Dispatch dequantization to the best available kernel.
///
/// Automatically selects the optimal SIMD kernel at runtime:
/// - ARM NEON on aarch64
/// - AVX-512 on x86_64 (preferred when available, >12 GB/s)
/// - AVX2 on x86_64 (fallback, >8 GB/s)
/// - Scalar fallback on all other architectures
///
/// # Arguments
///
/// * `packed` - Packed 3-bit quantized values
/// * `scale` - Dequantization scale factor
/// * `output` - Output f32 buffer
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::quantize::pi_quant_simd::pi_dequantize;
///
/// let packed = vec![0u8; 12]; // 32 values
/// let scale = std::f32::consts::PI / 4.0;
/// let mut output = vec![0.0f32; 32];
///
/// pi_dequantize(&packed, scale, &mut output);
/// ```
pub fn pi_dequantize(packed: &[u8], scale: f32, output: &mut [f32]) {
    #[cfg(target_arch = "aarch64")]
    {
        // NEON is always available on aarch64
        // SAFETY: aarch64 guarantees NEON support
        unsafe {
            pi_dequantize_neon(packed, scale, output);
        }
        return;
    }

    #[cfg(target_arch = "x86_64")]
    {
        // Prefer AVX-512 when available (highest throughput)
        if is_x86_feature_detected!("avx512f") {
            // SAFETY: AVX-512F feature detected at runtime
            unsafe {
                pi_dequantize_avx512(packed, scale, output);
            }
            return;
        }

        // Fall back to AVX2
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 feature detected at runtime
            unsafe {
                pi_dequantize_avx2(packed, scale, output);
            }
            return;
        }

        // Fall back to scalar for x86_64 without AVX2
        pi_dequantize_scalar(packed, scale, output);
        return;
    }

    // Fallback to scalar for other architectures
    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
    {
        pi_dequantize_scalar(packed, scale, output);
    }
}

/// Get the name of the kernel that will be used for dispatch.
///
/// Useful for logging and diagnostics.
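///
/// A minimal sketch:
///
/// ```rust,ignore
/// println!("pi-dequantize kernel: {}", pi_dequantize_kernel_name());
/// ```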
pub fn pi_dequantize_kernel_name() -> &'static str {
    #[cfg(target_arch = "aarch64")]
    {
        return "neon";
    }

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") {
            return "avx512";
        }
        if is_x86_feature_detected!("avx2") {
            return "avx2";
        }
    }

    "scalar"
}

/// Dispatch quantization to the best available kernel.
///
/// Automatically selects the optimal SIMD kernel at runtime:
/// - AVX-512 on x86_64 (preferred when available)
/// - Scalar fallback on all other architectures
///
/// # Arguments
///
/// * `weights` - Input f32 values (length must be multiple of 8)
/// * `scale` - Quantization scale factor
/// * `output` - Output packed buffer (length must be weights.len() * 3 / 8)
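///
/// # Example
///
/// A minimal sketch, mirroring the dequantize example above:
///
/// ```rust,ignore
/// use ruvllm::quantize::pi_quant_simd::{pi_quantize, pi_scale};
///
/// let scale = pi_scale(4);
/// let weights = vec![0.0f32; 16]; // 2 groups of 8 values
/// let mut packed = vec![0u8; 6];  // 3 bytes per group
///
/// pi_quantize(&weights, scale, &mut packed);
/// // 0.0 quantizes to level 0, stored as raw code 4 (0b100) in each slot.
/// ```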
pub fn pi_quantize(weights: &[f32], scale: f32, output: &mut [u8]) {
    #[cfg(target_arch = "x86_64")]
    {
        // Prefer AVX-512 when available
        if is_x86_feature_detected!("avx512f") {
            // SAFETY: AVX-512F feature detected at runtime
            unsafe {
                pi_quantize_avx512(weights, scale, output);
            }
            return;
        }
    }

    // Fallback to scalar
    pi_quantize_scalar(weights, scale, output);
}

/// Get the name of the quantization kernel that will be used for dispatch.
///
/// Useful for logging and diagnostics.
pub fn pi_quantize_kernel_name() -> &'static str {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") {
            return "avx512";
        }
    }

    "scalar"
}

// ============================================================================
// Utility Functions
// ============================================================================

/// Pack f32 values into Pi-quantized 3-bit format.
///
/// This is the inverse of dequantization and serves as the scalar reference
/// (and fallback path) for the SIMD quantization kernels.
///
/// # Arguments
///
/// * `values` - Input f32 values (length must be multiple of 8)
/// * `scale` - Quantization scale factor
/// * `output` - Output packed buffer (length must be values.len() * 3 / 8)
pub fn pi_quantize_scalar(values: &[f32], scale: f32, output: &mut [u8]) {
    assert!(
        values.len() % PI3_VALUES_PER_GROUP == 0,
        "Values length must be multiple of 8"
    );

    let num_groups = values.len() / PI3_VALUES_PER_GROUP;
    assert_eq!(
        output.len(),
        num_groups * PI3_BYTES_PER_GROUP,
        "Output buffer size mismatch"
    );

    let inv_scale = if scale.abs() > 1e-10 {
        1.0 / scale
    } else {
        0.0
    };

    for group in 0..num_groups {
        let val_offset = group * PI3_VALUES_PER_GROUP;
        let byte_offset = group * PI3_BYTES_PER_GROUP;

        let mut combined: u32 = 0;

        for i in 0..8 {
            let v = values[val_offset + i];
            // Quantize: round(v / scale) then clamp to [-4, +3]
            let quantized = (v * inv_scale).round() as i32;
            let clamped = quantized.clamp(-4, 3);
            // Convert to unsigned 3-bit: add 4 to get [0, 7]
            let unsigned = (clamped + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }

        output[byte_offset] = (combined & 0xFF) as u8;
        output[byte_offset + 1] = ((combined >> 8) & 0xFF) as u8;
        output[byte_offset + 2] = ((combined >> 16) & 0xFF) as u8;
    }
}

/// Calculate the default Pi-quantization scale for a given k value.
///
/// Returns pi / k, the step size for Pi-constant quantization.
#[inline]
pub fn pi_scale(k: u32) -> f32 {
    PI_F32 / (k as f32)
}

/// Calculate the adaptive Pi-quantization scale.
///
/// Returns alpha * pi / k, combining per-channel adaptation with Pi-scaling.
#[inline]
pub fn pi_scale_adaptive(alpha: f32, k: u32) -> f32 {
    alpha * PI_F32 / (k as f32)
}

/// Compute per-block adaptive scale from max absolute value.
///
/// This is used during quantization to find the optimal scale for a block.
/// The scale is computed as: max_abs / 3.0 (for 3-bit) so that values map to [-4, +3].
///
/// # Arguments
///
/// * `max_abs` - Maximum absolute value in the block
///
/// # Returns
///
/// Adaptive scale factor. If max_abs is zero, returns a default scale of pi/4.
#[inline]
pub fn pi_scale_from_max(max_abs: f32) -> f32 {
    if max_abs < 1e-10 {
        // Return default pi/4 scale for zero blocks
        PI_F32 / 4.0
    } else {
        // Scale so max value maps to +3 (or min to -4)
        max_abs / 3.0
    }
}

/// Quantize a single f32 value to a 3-bit signed integer [-4, +3].
///
/// This is the per-element quantization used during block packing.
///
/// # Arguments
///
/// * `value` - The f32 value to quantize
/// * `scale` - The quantization scale (from pi_scale_from_max)
///
/// # Returns
///
/// Quantized value as i8 in range [-4, +3]
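///
/// # Example
///
/// A per-block sketch combining `pi_scale_from_max` and `pi_quantize_value`
/// (values chosen for illustration):
///
/// ```rust,ignore
/// let block = [0.9f32, -1.2, 0.3, -0.1];
/// let max_abs = block.iter().fold(0.0f32, |m, w| m.max(w.abs()));
/// let scale = pi_scale_from_max(max_abs); // 1.2 / 3.0 = 0.4
/// let q: Vec<i8> = block.iter().map(|&w| pi_quantize_value(w, scale)).collect();
/// assert_eq!(q, vec![2, -3, 1, 0]);
/// ```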
#[inline]
pub fn pi_quantize_value(value: f32, scale: f32) -> i8 {
    if scale < 1e-10 {
        return 0;
    }
    let quantized = (value / scale).round() as i32;
    quantized.clamp(-4, 3) as i8
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    const EPSILON: f32 = 1e-6;

    // Helper: compute ULP distance between two f32 values
    fn ulp_distance(a: f32, b: f32) -> u32 {
        if a == b {
            return 0;
        }
        if a.is_nan() || b.is_nan() {
            return u32::MAX;
        }

        let a_bits = a.to_bits() as i32;
        let b_bits = b.to_bits() as i32;

        // Handle sign differences
        let a_signed = if a_bits < 0 {
            i32::MIN - a_bits
        } else {
            a_bits
        };
        let b_signed = if b_bits < 0 {
            i32::MIN - b_bits
        } else {
            b_bits
        };

        (a_signed - b_signed).unsigned_abs()
    }

    // -------------------------------------------------------------------------
    // Scalar reference tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_pi_dequantize_scalar_zeros() {
        // All zeros: should decode to -4 * scale for each value
        // (since 0b000 = 0, and 0 - 4 = -4)
        let packed = vec![0u8; 3];
        let scale = 1.0;
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);

        for &v in &output {
            assert!((v - (-4.0)).abs() < EPSILON, "Expected -4.0, got {}", v);
        }
    }

    #[test]
    fn test_pi_dequantize_scalar_all_4s() {
        // All 4s (0b100 = 4 for each value): 4 - 4 = 0
        // Packed: 8 values of 4 = 8 * 3 bits = 24 bits
        // 4 = 0b100, so the packed 24-bit word is the pattern 100 repeated 8 times,
        // i.e. combined = sum(4 << (i*3)) for i in 0..8
        let combined: u32 = (0..8).map(|i| 4u32 << (i * 3)).sum();
        let packed = vec![
            (combined & 0xFF) as u8,
            ((combined >> 8) & 0xFF) as u8,
            ((combined >> 16) & 0xFF) as u8,
        ];
        let scale = 1.0;
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);

        for &v in &output {
            assert!((v - 0.0).abs() < EPSILON, "Expected 0.0, got {}", v);
        }
    }

    #[test]
    fn test_pi_dequantize_scalar_range() {
        // Test all values from -4 to +3
        let values: Vec<i32> = (-4..=3).collect();
        let mut combined: u32 = 0;
        for (i, &v) in values.iter().enumerate() {
            let unsigned = (v + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }
        let packed = vec![
            (combined & 0xFF) as u8,
            ((combined >> 8) & 0xFF) as u8,
            ((combined >> 16) & 0xFF) as u8,
        ];
        let scale = 0.5;
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);

        for (i, &v) in values.iter().enumerate() {
            let expected = (v as f32) * scale;
            assert!(
                (output[i] - expected).abs() < EPSILON,
                "Index {}: expected {}, got {}",
                i,
                expected,
                output[i]
            );
        }
    }

    #[test]
    fn test_pi_dequantize_scalar_pi_scale() {
        // Use actual pi/4 scale
        let scale = pi_scale(4);
        let values = [-4i32, -3, -2, -1, 0, 1, 2, 3];
        let mut combined: u32 = 0;
        for (i, &v) in values.iter().enumerate() {
            let unsigned = (v + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }
        let packed = vec![
            (combined & 0xFF) as u8,
            ((combined >> 8) & 0xFF) as u8,
            ((combined >> 16) & 0xFF) as u8,
        ];
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);

        for (i, &v) in values.iter().enumerate() {
            let expected = (v as f32) * scale;
            assert!(
                (output[i] - expected).abs() < EPSILON,
                "Index {}: expected {}, got {}",
                i,
                expected,
                output[i]
            );
        }
    }

    #[test]
    fn test_pi_dequantize_scalar_multiple_groups() {
        // 4 groups = 12 bytes = 32 values
        let scale = 1.0;
        let mut packed = vec![0u8; 12];

        // Fill each group with sequential pattern
        for group in 0..4 {
            let base_val = group as i32;
            let clamped = base_val.clamp(-4, 3);
            let unsigned = (clamped + 4) as u32;
            // Use the same value for all 8 positions in this group
            let combined: u32 = (0..8).map(|i| unsigned << (i * 3)).sum();
            packed[group * 3] = (combined & 0xFF) as u8;
            packed[group * 3 + 1] = ((combined >> 8) & 0xFF) as u8;
            packed[group * 3 + 2] = ((combined >> 16) & 0xFF) as u8;
        }

        let mut output = vec![0.0f32; 32];
        pi_dequantize_scalar(&packed, scale, &mut output);

        // Verify each group
        for group in 0..4 {
            let expected_val = (group as i32).clamp(-4, 3) as f32;
            for i in 0..8 {
                let idx = group * 8 + i;
                assert!(
                    (output[idx] - expected_val).abs() < EPSILON,
                    "Group {}, index {}: expected {}, got {}",
                    group,
                    i,
                    expected_val,
                    output[idx]
                );
            }
        }
    }

    #[test]
    fn test_extract_pi3_value() {
        let values = [-4i32, -3, -2, -1, 0, 1, 2, 3];
        let mut combined: u32 = 0;
        for (i, &v) in values.iter().enumerate() {
            let unsigned = (v + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }
        let packed = vec![
            (combined & 0xFF) as u8,
            ((combined >> 8) & 0xFF) as u8,
            ((combined >> 16) & 0xFF) as u8,
        ];

        for (i, &expected) in values.iter().enumerate() {
            let actual = extract_pi3_value(&packed, i);
            assert_eq!(
                actual, expected as i8,
                "Index {}: expected {}, got {}",
                i, expected, actual
            );
        }
    }

    // -------------------------------------------------------------------------
    // Quantize/Dequantize roundtrip tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_quantize_dequantize_roundtrip() {
        let scale = pi_scale(4);
        let original: Vec<f32> = (-4..=3).map(|v| (v as f32) * scale).collect();
        let mut packed = vec![0u8; 3];
        let mut reconstructed = vec![0.0f32; 8];

        pi_quantize_scalar(&original, scale, &mut packed);
        pi_dequantize_scalar(&packed, scale, &mut reconstructed);

        for (i, (&orig, &recon)) in original.iter().zip(reconstructed.iter()).enumerate() {
            let ulp = ulp_distance(orig, recon);
            assert!(
                ulp <= 1,
                "Index {}: roundtrip error > 1 ULP: orig={}, recon={}, ulp={}",
                i,
                orig,
                recon,
                ulp
            );
        }
    }

    #[test]
    fn test_quantize_clipping() {
        let scale = 1.0;
        // Values outside [-4, 3] should be clipped
        let values = vec![-10.0, -5.0, -4.0, 0.0, 3.0, 5.0, 10.0, 100.0];
        let mut packed = vec![0u8; 3];

        pi_quantize_scalar(&values, scale, &mut packed);

        let mut output = vec![0.0f32; 8];
        pi_dequantize_scalar(&packed, scale, &mut output);

        // Verify clipping
        assert!((output[0] - (-4.0)).abs() < EPSILON); // -10 -> -4
        assert!((output[1] - (-4.0)).abs() < EPSILON); // -5 -> -4
        assert!((output[2] - (-4.0)).abs() < EPSILON); // -4 -> -4
        assert!((output[3] - 0.0).abs() < EPSILON); // 0 -> 0
        assert!((output[4] - 3.0).abs() < EPSILON); // 3 -> 3
        assert!((output[5] - 3.0).abs() < EPSILON); // 5 -> 3
        assert!((output[6] - 3.0).abs() < EPSILON); // 10 -> 3
        assert!((output[7] - 3.0).abs() < EPSILON); // 100 -> 3
    }

    // -------------------------------------------------------------------------
    // SIMD equivalence tests (INV-8: <= 1 ULP)
    // -------------------------------------------------------------------------

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn test_neon_equivalence_to_scalar() {
        // Test various sizes
        for num_groups in [1, 4, 16, 100] {
            let packed: Vec<u8> = (0..num_groups * 3)
                .map(|i| (i * 17) as u8) // Pseudo-random pattern
                .collect();
            let scale = pi_scale(4);

            let mut scalar_output = vec![0.0f32; num_groups * 8];
            let mut neon_output = vec![0.0f32; num_groups * 8];

            pi_dequantize_scalar(&packed, scale, &mut scalar_output);
            unsafe {
                pi_dequantize_neon(&packed, scale, &mut neon_output);
            }

            for i in 0..scalar_output.len() {
                let ulp = ulp_distance(scalar_output[i], neon_output[i]);
                assert!(
                    ulp <= 1,
                    "NEON divergence at index {} (groups={}): scalar={}, neon={}, ulp={}",
                    i,
                    num_groups,
                    scalar_output[i],
                    neon_output[i],
                    ulp
                );
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx2_equivalence_to_scalar() {
        if !is_x86_feature_detected!("avx2") {
            println!("Skipping AVX2 test: feature not available");
            return;
        }

        // Test various sizes
        for num_groups in [1, 4, 16, 100] {
            let packed: Vec<u8> = (0..num_groups * 3)
                .map(|i| (i * 17) as u8) // Pseudo-random pattern
                .collect();
            let scale = pi_scale(4);

            let mut scalar_output = vec![0.0f32; num_groups * 8];
            let mut avx2_output = vec![0.0f32; num_groups * 8];

            pi_dequantize_scalar(&packed, scale, &mut scalar_output);
            unsafe {
                pi_dequantize_avx2(&packed, scale, &mut avx2_output);
            }

            for i in 0..scalar_output.len() {
                let ulp = ulp_distance(scalar_output[i], avx2_output[i]);
                assert!(
                    ulp <= 1,
                    "AVX2 divergence at index {} (groups={}): scalar={}, avx2={}, ulp={}",
                    i,
                    num_groups,
                    scalar_output[i],
                    avx2_output[i],
                    ulp
                );
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx512_dequantize_equivalence_to_scalar() {
        if !is_x86_feature_detected!("avx512f") {
            println!("Skipping AVX-512 dequantize test: feature not available");
            return;
        }

        // Test various sizes including edge cases for AVX-512 processing
        // AVX-512 processes 8 groups (64 values) at a time
        for num_groups in [1, 4, 8, 16, 32, 100, 123] {
            let packed: Vec<u8> = (0..num_groups * 3)
                .map(|i| (i * 17) as u8) // Pseudo-random pattern
                .collect();
            let scale = pi_scale(4);

            let mut scalar_output = vec![0.0f32; num_groups * 8];
            let mut avx512_output = vec![0.0f32; num_groups * 8];

            pi_dequantize_scalar(&packed, scale, &mut scalar_output);
            unsafe {
                pi_dequantize_avx512(&packed, scale, &mut avx512_output);
            }

            for i in 0..scalar_output.len() {
                let ulp = ulp_distance(scalar_output[i], avx512_output[i]);
                assert!(
                    ulp <= 1,
                    "AVX-512 dequantize divergence at index {} (groups={}): scalar={}, avx512={}, ulp={}",
                    i,
                    num_groups,
                    scalar_output[i],
                    avx512_output[i],
                    ulp
                );
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx512_quantize_equivalence_to_scalar() {
        if !is_x86_feature_detected!("avx512f") {
            println!("Skipping AVX-512 quantize test: feature not available");
            return;
        }

        // Test various sizes including edge cases
        for num_groups in [1, 2, 4, 8, 16, 32, 100, 123] {
            let num_values = num_groups * 8;
            let scale = pi_scale(4);

            // Generate test weights in the valid range [-4*scale, 3*scale]
            let weights: Vec<f32> = (0..num_values)
                .map(|i| {
                    let t = (i as f32) / (num_values as f32);
                    // Map [0, 1] to [-4*scale, 3*scale]
                    -4.0 * scale + t * 7.0 * scale
                })
                .collect();

            let num_bytes = num_groups * 3;
            let mut scalar_output = vec![0u8; num_bytes];
            let mut avx512_output = vec![0u8; num_bytes];

            pi_quantize_scalar(&weights, scale, &mut scalar_output);
            unsafe {
                pi_quantize_avx512(&weights, scale, &mut avx512_output);
            }

            // Compare packed outputs byte by byte
            for i in 0..num_bytes {
                assert_eq!(
                    scalar_output[i], avx512_output[i],
                    "AVX-512 quantize divergence at byte {} (groups={}): scalar={:#04x}, avx512={:#04x}",
                    i, num_groups, scalar_output[i], avx512_output[i]
                );
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx512_quantize_dequantize_roundtrip() {
        if !is_x86_feature_detected!("avx512f") {
            println!("Skipping AVX-512 roundtrip test: feature not available");
            return;
        }

        let scale = pi_scale(4);

        // Test with various input patterns
        for num_groups in [1, 2, 8, 16, 100] {
            let num_values = num_groups * 8;

            // Generate weights that map exactly to quantization levels
            let original: Vec<f32> = (0..num_values)
                .map(|i| {
                    // Cycle through all 8 valid quantization levels: -4 to +3
                    let level = ((i % 8) as i32) - 4;
                    (level as f32) * scale
                })
                .collect();

            let num_bytes = num_groups * 3;
            let mut packed = vec![0u8; num_bytes];
            let mut reconstructed = vec![0.0f32; num_values];

            // Quantize with AVX-512
            unsafe {
                pi_quantize_avx512(&original, scale, &mut packed);
                pi_dequantize_avx512(&packed, scale, &mut reconstructed);
            }

            // Verify roundtrip accuracy (should be exact for values on quantization grid)
            for (i, (&orig, &recon)) in original.iter().zip(reconstructed.iter()).enumerate() {
                let ulp = ulp_distance(orig, recon);
                assert!(
                    ulp <= 1,
                    "AVX-512 roundtrip error > 1 ULP at index {}: orig={}, recon={}, ulp={}",
                    i,
                    orig,
                    recon,
                    ulp
                );
            }
        }
    }
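
    // Scalar companion sketch for the AVX-512 roundtrip above: values placed
    // exactly on the quantization grid (level * scale, level in [-4, 3]) should
    // also survive a quantize -> dequantize roundtrip through the scalar kernels
    // within the same 1-ULP bound. Reuses only helpers that already appear in
    // this module.
    #[test]
    fn test_scalar_quantize_dequantize_roundtrip_on_grid() {
        let scale = pi_scale(4);
        let num_groups = 4;
        let num_values = num_groups * 8;

        // Cycle through all 8 quantization levels, scaled onto the grid.
        let original: Vec<f32> = (0..num_values)
            .map(|i| ((i % 8) as i32 - 4) as f32 * scale)
            .collect();

        let mut packed = vec![0u8; num_groups * 3];
        let mut reconstructed = vec![0.0f32; num_values];

        pi_quantize_scalar(&original, scale, &mut packed);
        pi_dequantize_scalar(&packed, scale, &mut reconstructed);

        for (i, (&orig, &recon)) in original.iter().zip(reconstructed.iter()).enumerate() {
            let ulp = ulp_distance(orig, recon);
            assert!(
                ulp <= 1,
                "Scalar roundtrip error > 1 ULP at index {}: orig={}, recon={}, ulp={}",
                i,
                orig,
                recon,
                ulp
            );
        }
    }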

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx512_all_value_combinations() {
        if !is_x86_feature_detected!("avx512f") {
            println!("Skipping AVX-512 all values test: feature not available");
            return;
        }

        let scale = pi_scale(4);

        // Test all possible 3-bit value combinations
        // Create packed data with all values from -4 to +3
        let values: [i32; 8] = [-4, -3, -2, -1, 0, 1, 2, 3];
        let mut combined: u32 = 0;
        for (i, &v) in values.iter().enumerate() {
            let unsigned = (v + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }
        let packed = vec![
            (combined & 0xFF) as u8,
            ((combined >> 8) & 0xFF) as u8,
            ((combined >> 16) & 0xFF) as u8,
        ];

        let mut scalar_output = vec![0.0f32; 8];
        let mut avx512_output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut scalar_output);
        unsafe {
            pi_dequantize_avx512(&packed, scale, &mut avx512_output);
        }

        for i in 0..8 {
            let ulp = ulp_distance(scalar_output[i], avx512_output[i]);
            assert!(
                ulp <= 1,
                "AVX-512 all values divergence at index {}: scalar={}, avx512={}, ulp={}",
                i,
                scalar_output[i],
                avx512_output[i],
                ulp
            );

            // Also verify the expected value
            let expected = (values[i] as f32) * scale;
            let ulp_expected = ulp_distance(expected, avx512_output[i]);
            assert!(
                ulp_expected <= 1,
                "AVX-512 expected value divergence at index {}: expected={}, avx512={}, ulp={}",
                i,
                expected,
                avx512_output[i],
                ulp_expected
            );
        }
    }
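
    // Standalone sketch of the 3-bit packing arithmetic the test above relies
    // on: 8 signed levels in [-4, 3] are biased by +4 into [0, 7], packed
    // little-endian into 24 bits (3 bytes), then recovered by shifting, masking,
    // and removing the bias. This only exercises the encoding used to build
    // test inputs; it does not call any kernel under test.
    #[test]
    fn test_manual_3bit_pack_unpack_sketch() {
        let levels: [i32; 8] = [-4, -3, -2, -1, 0, 1, 2, 3];

        // Pack: bias each level to unsigned and place it at bit offset i * 3.
        let mut combined: u32 = 0;
        for (i, &v) in levels.iter().enumerate() {
            combined |= (((v + 4) as u32) & 0x7) << (i * 3);
        }

        // Unpack: extract each 3-bit field and undo the +4 bias.
        for (i, &expected) in levels.iter().enumerate() {
            let recovered = ((combined >> (i * 3)) & 0x7) as i32 - 4;
            assert_eq!(recovered, expected, "level {} did not survive packing", i);
        }
    }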

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_avx512_edge_cases() {
        if !is_x86_feature_detected!("avx512f") {
            println!("Skipping AVX-512 edge cases test: feature not available");
            return;
        }

        // Test with edge case scales
        let test_scales = [
            1.0f32,            // Unit scale
            0.001,             // Very small scale
            1000.0,            // Large scale
            -1.0,              // Negative scale
            PI / 4.0,          // Typical pi-quantization scale
            PI / 2.0,          // Another pi-based scale
            f32::MIN_POSITIVE, // Smallest positive normal
        ];

        for &scale in &test_scales {
            // Generate packed data
            let num_groups = 8;
            let packed: Vec<u8> = (0..num_groups * 3).map(|i| (i * 31) as u8).collect();

            let mut scalar_output = vec![0.0f32; num_groups * 8];
            let mut avx512_output = vec![0.0f32; num_groups * 8];

            pi_dequantize_scalar(&packed, scale, &mut scalar_output);
            unsafe {
                pi_dequantize_avx512(&packed, scale, &mut avx512_output);
            }

            for i in 0..scalar_output.len() {
                let ulp = ulp_distance(scalar_output[i], avx512_output[i]);
                assert!(
                    ulp <= 1,
                    "AVX-512 edge case (scale={}) divergence at index {}: scalar={}, avx512={}, ulp={}",
                    scale,
                    i,
                    scalar_output[i],
                    avx512_output[i],
                    ulp
                );
            }
        }
    }

    #[test]
    fn test_dispatch_equivalence() {
        // Ensure dispatch produces same results as scalar
        // Test sizes that exercise all paths including AVX-512's 8-group batching
        for num_groups in [1, 4, 8, 16, 32, 100, 123] {
            let packed: Vec<u8> = (0..num_groups * 3).map(|i| (i * 23) as u8).collect();
            let scale = pi_scale(4);

            let mut scalar_output = vec![0.0f32; num_groups * 8];
            let mut dispatch_output = vec![0.0f32; num_groups * 8];

            pi_dequantize_scalar(&packed, scale, &mut scalar_output);
            pi_dequantize(&packed, scale, &mut dispatch_output);

            for i in 0..scalar_output.len() {
                let ulp = ulp_distance(scalar_output[i], dispatch_output[i]);
                assert!(
                    ulp <= 1,
                    "Dispatch ({}) divergence at index {}: scalar={}, dispatch={}, ulp={}",
                    pi_dequantize_kernel_name(),
                    i,
                    scalar_output[i],
                    dispatch_output[i],
                    ulp
                );
            }
        }
    }

    #[test]
    fn test_quantize_dispatch_equivalence() {
        // Ensure quantize dispatch produces same results as scalar
        for num_groups in [1, 2, 4, 8, 16, 100] {
            let num_values = num_groups * 8;
            let scale = pi_scale(4);

            // Generate test weights
            let weights: Vec<f32> = (0..num_values)
                .map(|i| {
                    let t = (i as f32) / (num_values as f32);
                    -4.0 * scale + t * 7.0 * scale
                })
                .collect();

            let num_bytes = num_groups * 3;
            let mut scalar_output = vec![0u8; num_bytes];
            let mut dispatch_output = vec![0u8; num_bytes];

            pi_quantize_scalar(&weights, scale, &mut scalar_output);
            pi_quantize(&weights, scale, &mut dispatch_output);

            for i in 0..num_bytes {
                assert_eq!(
                    scalar_output[i], dispatch_output[i],
                    "Quantize dispatch ({}) divergence at byte {}: scalar={:#04x}, dispatch={:#04x}",
                    pi_quantize_kernel_name(),
                    i,
                    scalar_output[i],
                    dispatch_output[i]
                );
            }
        }
    }
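
    // Minimal usage sketch for the dispatching entry points: callers need only
    // `pi_scale`, `pi_quantize`, and `pi_dequantize`; the kernel (scalar, NEON,
    // AVX2, or AVX-512) is selected internally and can be inspected with the
    // *_kernel_name helpers. Only shapes are asserted here; numerical
    // equivalence is covered by the dispatch tests above.
    #[test]
    fn test_dispatch_usage_sketch() {
        let scale = pi_scale(4);
        let weights = vec![0.25f32; 16]; // 2 groups of 8 values
        let mut packed = vec![0u8; 2 * 3];
        let mut restored = vec![0.0f32; 16];

        pi_quantize(&weights, scale, &mut packed);
        pi_dequantize(&packed, scale, &mut restored);

        println!(
            "quantize kernel: {}, dequantize kernel: {}",
            pi_quantize_kernel_name(),
            pi_dequantize_kernel_name()
        );
        assert_eq!(restored.len(), weights.len());
    }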

    // -------------------------------------------------------------------------
    // Edge case tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_empty_input() {
        let packed: Vec<u8> = vec![];
        let scale = 1.0;
        let mut output: Vec<f32> = vec![];

        pi_dequantize_scalar(&packed, scale, &mut output);
        // Should not panic, output remains empty
        assert!(output.is_empty());
    }

    #[test]
    fn test_zero_scale() {
        let packed = vec![0xFFu8; 3]; // Every 3-bit field is 7, which decodes to +3 after the +4 bias is removed
        let scale = 0.0;
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);

        for &v in &output {
            assert!((v - 0.0).abs() < EPSILON, "Zero scale should give zeros");
        }
    }

    #[test]
    fn test_negative_scale() {
        let values = [1i32; 8];
        let mut combined: u32 = 0;
        for (i, &v) in values.iter().enumerate() {
            let unsigned = (v + 4) as u32;
            combined |= (unsigned & 0x7) << (i * 3);
        }
        let packed = vec![
            (combined & 0xFF) as u8,
            ((combined >> 8) & 0xFF) as u8,
            ((combined >> 16) & 0xFF) as u8,
        ];
        let scale = -1.0;
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);

        // 1 * -1.0 = -1.0
        for &v in &output {
            assert!((v - (-1.0)).abs() < EPSILON, "Expected -1.0, got {}", v);
        }
    }

    #[test]
    #[should_panic(expected = "Packed length")]
    fn test_invalid_packed_length() {
        let packed = vec![0u8; 4]; // Not multiple of 3
        let scale = 1.0;
        let mut output = vec![0.0f32; 8];

        pi_dequantize_scalar(&packed, scale, &mut output);
    }

    #[test]
    #[should_panic(expected = "Output length")]
    fn test_output_length_mismatch() {
        let packed = vec![0u8; 3];
        let scale = 1.0;
        let mut output = vec![0.0f32; 4]; // Should be 8

        pi_dequantize_scalar(&packed, scale, &mut output);
    }
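
    // Size-relationship sketch behind the two panic tests above: every 3 packed
    // bytes decode to one group of 8 f32 values, so valid buffers satisfy
    // packed.len() % 3 == 0 and output.len() == packed.len() / 3 * 8.
    #[test]
    fn test_buffer_size_relationship() {
        for num_groups in [1usize, 2, 5] {
            let packed = vec![0u8; num_groups * 3];
            let mut output = vec![0.0f32; packed.len() / 3 * 8];

            // Correctly sized buffers must not trigger the length assertions.
            pi_dequantize_scalar(&packed, 1.0, &mut output);
            assert_eq!(output.len(), num_groups * 8);
        }
    }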

    // -------------------------------------------------------------------------
    // Utility function tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_pi_scale() {
        assert!((pi_scale(4) - PI / 4.0).abs() < EPSILON);
        assert!((pi_scale(2) - PI / 2.0).abs() < EPSILON);
        assert!((pi_scale(8) - PI / 8.0).abs() < EPSILON);
    }

    #[test]
    fn test_pi_scale_adaptive() {
        assert!((pi_scale_adaptive(2.0, 4) - 2.0 * PI / 4.0).abs() < EPSILON);
        assert!((pi_scale_adaptive(0.5, 4) - 0.5 * PI / 4.0).abs() < EPSILON);
    }

    #[test]
    fn test_kernel_name() {
        let name = pi_dequantize_kernel_name();
        assert!(
            name == "neon" || name == "avx512" || name == "avx2" || name == "scalar",
            "Unknown kernel name: {}",
            name
        );

        let quant_name = pi_quantize_kernel_name();
        assert!(
            quant_name == "avx512" || quant_name == "scalar",
            "Unknown quantize kernel name: {}",
            quant_name
        );
    }

    // -------------------------------------------------------------------------
    // Large data tests for SIMD path coverage
    // -------------------------------------------------------------------------

    #[test]
    fn test_large_data_simd_path() {
        // 1000 groups = 3000 bytes = 8000 values
        // Exercises SIMD main loop + remainder handling
        let num_groups = 1000;
        let packed: Vec<u8> = (0..num_groups * 3).map(|i| (i % 256) as u8).collect();
        let scale = pi_scale(4);

        let mut output = vec![0.0f32; num_groups * 8];
        pi_dequantize(&packed, scale, &mut output);

        // Verify no NaN or Inf values
        for (i, &v) in output.iter().enumerate() {
            assert!(v.is_finite(), "Non-finite value at index {}: {}", i, v);
            // Values should be in range [-4*scale, 3*scale]
            let min_val = -4.0 * scale;
            let max_val = 3.0 * scale;
            assert!(
                v >= min_val && v <= max_val,
                "Value {} at index {} out of range [{}, {}]",
                v,
                i,
                min_val,
                max_val
            );
        }
    }

    #[test]
    fn test_simd_remainder_handling() {
        // Test cases with various remainder sizes after the SIMD main loop.
        // The narrower SIMD kernels process 4 groups at a time and AVX-512
        // processes 8, so use group counts that leave a remainder for every kernel.
        for num_groups in [1, 2, 3, 5, 6, 7, 9, 13, 17] {
            let packed: Vec<u8> = (0..num_groups * 3).map(|i| (i * 37) as u8).collect();
            let scale = 1.0;

            let mut scalar_output = vec![0.0f32; num_groups * 8];
            let mut dispatch_output = vec![0.0f32; num_groups * 8];

            pi_dequantize_scalar(&packed, scale, &mut scalar_output);
            pi_dequantize(&packed, scale, &mut dispatch_output);

            for i in 0..scalar_output.len() {
                assert!(
                    (scalar_output[i] - dispatch_output[i]).abs() < EPSILON,
                    "Remainder mismatch at index {} (groups={}): {} vs {}",
                    i,
                    num_groups,
                    scalar_output[i],
                    dispatch_output[i]
                );
            }
        }
    }
}