lattice-embed 0.2.4

//! INT8 quantization for efficient embedding storage and similarity computation.
//!
//! Quantized vectors provide ~3x speedup and 4x memory reduction with 99%+ accuracy.

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

use std::sync::OnceLock;

use super::simd_config;

/// **Unstable**: INT8 quantization parameters; scale/bias scheme may change.
///
/// Quantization parameters for int8 conversion.
#[derive(Debug, Clone, Copy)]
pub struct QuantizationParams {
    /// **Unstable**: scale factor; formula may change with scheme update.
    pub scale: f32,
    /// **Unstable**: zero point offset; may be removed for symmetric-only quantization.
    pub zero_point: i8,
    /// **Unstable**: min float value; may be removed.
    pub min_val: f32,
    /// **Unstable**: max float value; may be removed.
    pub max_val: f32,
}

impl QuantizationParams {
    /// **Unstable**: parameter computation; may be folded into `QuantizedVector::from_f32`.
    ///
    /// Handles edge cases: empty vectors, NaN, Inf, near-zero vectors.
    pub fn from_vector(vector: &[f32]) -> Self {
        // Single pass over finite values to handle NaN/Inf gracefully.
        let mut min_val = f32::INFINITY;
        let mut max_val = f32::NEG_INFINITY;

        for &v in vector {
            if v.is_finite() {
                min_val = min_val.min(v);
                max_val = max_val.max(v);
            }
        }

        // Handle edge case: empty or all non-finite.
        if !min_val.is_finite() || !max_val.is_finite() {
            min_val = 0.0;
            max_val = 0.0;
        }

        // Symmetric quantization: map [-max_abs, max_abs] to [-127, 127]
        let max_abs = min_val.abs().max(max_val.abs());

        // Epsilon guard to avoid division by near-zero
        let scale = if max_abs > 1e-10 {
            127.0 / max_abs
        } else {
            1.0 // All zeros or near-zero case
        };

        Self {
            scale,
            zero_point: 0,
            min_val,
            max_val,
        }
    }
}

/// **Unstable**: INT8 quantized vector; struct layout and invariants may change.
///
/// Quantized int8 vector with its parameters.
#[derive(Debug, Clone)]
pub struct QuantizedVector {
    /// Invariant: all values in `[-127, 127]`. Enforced by `from_f32` clamping.
    /// Private — the invariant makes release-mode assert scans unnecessary.
    data: Vec<i8>,
    /// **Unstable**: quantization parameters; may be separated from the vector.
    pub params: QuantizationParams,
    /// **Unstable**: L2 norm; may be removed or moved.
    pub norm: f32,
}

impl QuantizedVector {
    /// Returns the quantized data as a slice. All values are in `[-127, 127]`.
    #[inline]
    pub fn data(&self) -> &[i8] {
        &self.data
    }

    /// Returns the number of quantized elements.
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns `true` if the quantized vector has no elements.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}

impl QuantizedVector {
    /// **Unstable**: quantization constructor; clamping behavior may change.
    pub fn from_f32(vector: &[f32]) -> Self {
        let mut params = QuantizationParams::from_vector(vector);

        // Defensive guard: avoid NaN/Inf/zero scale.
        if !params.scale.is_finite() || params.scale == 0.0 {
            params.scale = 1.0;
        }

        // Compute L2 norm of finite values (NaN/Inf are treated as 0.0).
        let mut norm_sq = 0.0f32;
        for &v in vector {
            if v.is_finite() {
                norm_sq += v * v;
            }
        }
        let norm = norm_sq.sqrt();

        let data: Vec<i8> = vector
            .iter()
            .map(|&v| {
                if !v.is_finite() {
                    0
                } else {
                    (v * params.scale).round().clamp(-127.0, 127.0) as i8
                }
            })
            .collect();

        Self { data, params, norm }
    }

    /// **Unstable**: dequantization; output precision may change with scheme update.
    ///
    /// # Precision
    ///
    /// INT8 symmetric quantization maps `[-max_abs, max_abs]` to `[-127, 127]`,
    /// so the quantization step size is `max_abs / 127`. The maximum per-element
    /// round-trip error is bounded by half a quantization step: `max_abs / 254`.
    ///
    /// For a 384-dim unit-norm embedding (`max_abs` ≈ 1.0), expect element-wise
    /// absolute error ≤ 0.004 and cosine-similarity error ≤ 0.5%.
    pub fn to_f32(&self) -> Vec<f32> {
        let scale = if self.params.scale.is_finite() && self.params.scale != 0.0 {
            self.params.scale
        } else {
            1.0
        };

        self.data.iter().map(|&v| v as f32 / scale).collect()
    }

    /// **Unstable**: delegates to `dot_product_i8`; SIMD dispatch may change.
    #[inline]
    pub fn dot_product(&self, other: &QuantizedVector) -> f32 {
        dot_product_i8(self, other)
    }

    /// **Unstable**: delegates to `cosine_similarity_i8`; SIMD dispatch may change.
    #[inline]
    pub fn cosine_similarity(&self, other: &QuantizedVector) -> f32 {
        cosine_similarity_i8(self, other)
    }
}

/// **Unstable**: SIMD INT8 dot product; VNNI/AVX2/NEON dispatch may change.
///
/// Returns the approximate float dot product.
/// Returns 0.0 if vectors have different lengths.
///
/// # Invariant
///
/// Both vectors must satisfy the `[-127, 127]` range invariant. The value `-128`
/// causes silent corruption in AVX2 (`_mm256_sign_epi8` saturation) and AVX-512
/// VNNI (`_mm512_dpbusd_epi32` after `vpabsb`). The `data` field is private, so
/// only `from_f32` (which clamps) can populate it — `debug_assert!` is sufficient.
#[inline]
pub fn dot_product_i8(a: &QuantizedVector, b: &QuantizedVector) -> f32 {
    debug_assert!(a.data.iter().all(|&v| v != -128i8));
    debug_assert!(b.data.iter().all(|&v| v != -128i8));

    if a.data.len() != b.data.len() {
        return 0.0;
    }

    let denom = a.params.scale * b.params.scale;
    if denom == 0.0 || !denom.is_finite() {
        return 0.0;
    }

    dot_product_i8_dispatch(&a.data, &b.data) / denom
}

/// Trusted INT8 dot product for constructor-owned vectors in prepared-query paths.
///
/// Uses `debug_assert!` instead of `assert!`; callers must guarantee vectors
/// were produced by `QuantizedVector::from_f32` or equivalent (clamped to [-127,127]).
#[inline]
pub(crate) fn dot_product_i8_trusted(a: &QuantizedVector, b: &QuantizedVector) -> f32 {
    if a.data.len() != b.data.len() {
        return 0.0;
    }
    let denom = a.params.scale * b.params.scale;
    if denom == 0.0 || !denom.is_finite() {
        return 0.0;
    }
    debug_assert!(a.data.iter().all(|&v| v != i8::MIN));
    debug_assert!(b.data.iter().all(|&v| v != i8::MIN));
    dot_product_i8_dispatch(&a.data, &b.data) / denom
}

/// **Unstable**: SIMD INT8 cosine similarity; norm storage approach may change.
///
/// Uses pre-computed norms for efficiency.
#[inline]
pub fn cosine_similarity_i8(a: &QuantizedVector, b: &QuantizedVector) -> f32 {
    let denom = a.norm * b.norm;
    if denom == 0.0 || !denom.is_finite() {
        return 0.0;
    }
    dot_product_i8(a, b) / denom
}

/// Trusted INT8 cosine similarity for constructor-owned vectors in prepared-query paths.
///
/// Uses `dot_product_i8_trusted` instead of `dot_product_i8` to skip release-mode
/// O(N) invariant scans. Callers must guarantee vectors were produced by
/// `QuantizedVector::from_f32` or equivalent (clamped to [-127, 127]).
#[inline]
pub(crate) fn cosine_similarity_i8_trusted(a: &QuantizedVector, b: &QuantizedVector) -> f32 {
    let denom = a.norm * b.norm;
    if denom == 0.0 || !denom.is_finite() {
        return 0.0;
    }
    dot_product_i8_trusted(a, b) / denom
}

/// NEON int8 dot product using vmull/vpadal with 4x unrolling and software prefetch.
///
/// Processes 64 int8s per iteration with 4 accumulators. Prefetches the next
/// cache line (64 bytes) one iteration ahead to hide DRAM latency for large dims.
///
/// # Safety
///
/// Caller must ensure:
/// - Running on aarch64 with FEAT_DotProd (uses SDOT instruction)
/// - `a` and `b` have equal length (checked by caller)
/// - No element is i8::MIN (-128) — see SIMD invariant docs
///
/// Memory safety:
/// - Uses `vld1q_s8` for loads (handles any alignment)
/// - Pointer arithmetic stays within slice bounds via chunk calculation
/// - Remainder handled via safe slice iteration
/// - Prefetch pointers are bounds-checked before issuing (only prefetch if within slice)
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "dotprod")]
unsafe fn dot_product_i8_neon_unrolled(a: &[i8], b: &[i8]) -> f32 {
    const SIMD_WIDTH: usize = 16;
    const UNROLL: usize = 4;
    const CHUNK_SIZE: usize = SIMD_WIDTH * UNROLL;
    const PREFETCH_DISTANCE: usize = CHUNK_SIZE;
    let n = a.len();
    debug_assert_eq!(n, b.len());
    let chunks = n / CHUNK_SIZE;

    let mut sum0 = vdupq_n_s32(0);
    let mut sum1 = vdupq_n_s32(0);
    let mut sum2 = vdupq_n_s32(0);
    let mut sum3 = vdupq_n_s32(0);

    // Use SDOT (ARMv8.2 dotprod) via inline asm — 1 instruction per 16 i8 elements
    // vs 6 instructions (split + vmull×2 + vpadalq×2) with the widening approach.
    // Apple Silicon (M1+) supports dotprod; runtime check at dispatch level.
    for i in 0..chunks {
        let base = i * CHUNK_SIZE;

        let next_base = base + PREFETCH_DISTANCE;
        if next_base + CHUNK_SIZE <= n {
            core::arch::asm!(
                "prfm pldl1keep, [{ptr}]",
                ptr = in(reg) a.as_ptr().add(next_base),
                options(nostack, readonly, preserves_flags)
            );
            core::arch::asm!(
                "prfm pldl1keep, [{ptr}]",
                ptr = in(reg) b.as_ptr().add(next_base),
                options(nostack, readonly, preserves_flags)
            );
        }

        let a0 = vld1q_s8(a.as_ptr().add(base));
        let b0 = vld1q_s8(b.as_ptr().add(base));
        let a1 = vld1q_s8(a.as_ptr().add(base + SIMD_WIDTH));
        let b1 = vld1q_s8(b.as_ptr().add(base + SIMD_WIDTH));
        let a2 = vld1q_s8(a.as_ptr().add(base + SIMD_WIDTH * 2));
        let b2 = vld1q_s8(b.as_ptr().add(base + SIMD_WIDTH * 2));
        let a3 = vld1q_s8(a.as_ptr().add(base + SIMD_WIDTH * 3));
        let b3 = vld1q_s8(b.as_ptr().add(base + SIMD_WIDTH * 3));

        core::arch::asm!(
            "sdot {s0:v}.4s, {a0:v}.16b, {b0:v}.16b",
            "sdot {s1:v}.4s, {a1:v}.16b, {b1:v}.16b",
            "sdot {s2:v}.4s, {a2:v}.16b, {b2:v}.16b",
            "sdot {s3:v}.4s, {a3:v}.16b, {b3:v}.16b",
            s0 = inout(vreg) sum0,
            a0 = in(vreg) a0,
            b0 = in(vreg) b0,
            s1 = inout(vreg) sum1,
            a1 = in(vreg) a1,
            b1 = in(vreg) b1,
            s2 = inout(vreg) sum2,
            a2 = in(vreg) a2,
            b2 = in(vreg) b2,
            s3 = inout(vreg) sum3,
            a3 = in(vreg) a3,
            b3 = in(vreg) b3,
            options(nomem, nostack, preserves_flags)
        );
    }

    let sum01 = vaddq_s32(sum0, sum1);
    let sum23 = vaddq_s32(sum2, sum3);
    let mut sum_vec = vaddq_s32(sum01, sum23);

    // Tail: remaining full 16-byte vectors using sdot
    let tail_start = chunks * CHUNK_SIZE;
    let tail_chunks = (n - tail_start) / SIMD_WIDTH;
    for j in 0..tail_chunks {
        let base = tail_start + j * SIMD_WIDTH;
        let at = vld1q_s8(a.as_ptr().add(base));
        let bt = vld1q_s8(b.as_ptr().add(base));
        core::arch::asm!(
            "sdot {acc:v}.4s, {a:v}.16b, {b:v}.16b",
            acc = inout(vreg) sum_vec,
            a = in(vreg) at,
            b = in(vreg) bt,
            options(nomem, nostack, preserves_flags)
        );
    }

    let sum = vaddvq_s32(sum_vec);

    // Scalar tail: only the final < SIMD_WIDTH elements
    let remainder_start = tail_start + tail_chunks * SIMD_WIDTH;
    let remainder: i32 = a[remainder_start..]
        .iter()
        .zip(b[remainder_start..].iter())
        .map(|(&x, &y)| x as i32 * y as i32)
        .sum();

    (sum + remainder) as f32
}

/// Emulate `mm512_sign_epi8(b, a)` which doesn't exist in AVX-512.
///
/// Returns: b[i] if a[i] > 0, -b[i] if a[i] < 0, 0 if a[i] == 0.
///
/// # Safety
/// Requires AVX-512BW.
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
#[target_feature(enable = "avx512f", enable = "avx512bw")]
#[inline]
unsafe fn mm512_sign_epi8(b: __m512i, a: __m512i) -> __m512i {
    let zero = _mm512_setzero_si512();
    let neg_b = _mm512_sub_epi8(zero, b);
    // mask where a < 0
    let mask_neg = _mm512_cmplt_epi8_mask(a, zero);
    // mask where a == 0
    let mask_zero = _mm512_cmpeq_epi8_mask(a, zero);
    // Start with b, replace with -b where a < 0
    let result = _mm512_mask_blend_epi8(mask_neg, b, neg_b);
    // Replace with 0 where a == 0
    _mm512_mask_blend_epi8(mask_zero, result, zero)
}

/// AVX-512 VNNI int8 dot product using _mm512_dpbusd_epi32.
///
/// Processes 256 int8s per iteration (4x64 with 4 accumulators).
/// Note: VNNI expects unsigned x signed, so we handle signs carefully.
///
/// # Safety
///
/// Caller must ensure:
/// - CPU supports AVX-512F, AVX-512VNNI, and AVX-512BW (verified via `simd_config()`)
/// - `a` and `b` have equal length (checked by caller)
///
/// Memory safety:
/// - Uses `_mm512_loadu_si512` for unaligned loads (safe for any alignment)
/// - Pointer arithmetic stays within slice bounds via chunk calculation
/// - Remainder handled via safe slice iteration
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
#[target_feature(enable = "avx512f", enable = "avx512vnni", enable = "avx512bw")]
unsafe fn dot_product_i8_avx512vnni(a: &[i8], b: &[i8]) -> f32 {
    const SIMD_WIDTH: usize = 64; // 64 int8s per 512-bit register
    const UNROLL: usize = 4;
    const CHUNK_SIZE: usize = SIMD_WIDTH * UNROLL;
    let n = a.len();
    debug_assert_eq!(n, b.len());
    debug_assert!(a.iter().all(|&v| v != i8::MIN));
    debug_assert!(b.iter().all(|&v| v != i8::MIN));
    let chunks = n / CHUNK_SIZE;

    // 4 independent int32 accumulators (16 int32s each)
    let mut sum0 = _mm512_setzero_si512();
    let mut sum1 = _mm512_setzero_si512();
    let mut sum2 = _mm512_setzero_si512();
    let mut sum3 = _mm512_setzero_si512();

    for i in 0..chunks {
        let base = i * CHUNK_SIZE;

        // VNNI: dpbusd computes sum += a[unsigned] * b[signed]
        // For signed * signed, we use: abs(a) * sign(b, a)
        let a0 = _mm512_loadu_si512(a.as_ptr().add(base) as *const __m512i);
        let b0 = _mm512_loadu_si512(b.as_ptr().add(base) as *const __m512i);
        let a0_abs = _mm512_abs_epi8(a0);
        let b0_signed = mm512_sign_epi8(b0, a0);
        sum0 = _mm512_dpbusd_epi32(sum0, a0_abs, b0_signed);

        let a1 = _mm512_loadu_si512(a.as_ptr().add(base + SIMD_WIDTH) as *const __m512i);
        let b1 = _mm512_loadu_si512(b.as_ptr().add(base + SIMD_WIDTH) as *const __m512i);
        let a1_abs = _mm512_abs_epi8(a1);
        let b1_signed = mm512_sign_epi8(b1, a1);
        sum1 = _mm512_dpbusd_epi32(sum1, a1_abs, b1_signed);

        let a2 = _mm512_loadu_si512(a.as_ptr().add(base + SIMD_WIDTH * 2) as *const __m512i);
        let b2 = _mm512_loadu_si512(b.as_ptr().add(base + SIMD_WIDTH * 2) as *const __m512i);
        let a2_abs = _mm512_abs_epi8(a2);
        let b2_signed = mm512_sign_epi8(b2, a2);
        sum2 = _mm512_dpbusd_epi32(sum2, a2_abs, b2_signed);

        let a3 = _mm512_loadu_si512(a.as_ptr().add(base + SIMD_WIDTH * 3) as *const __m512i);
        let b3 = _mm512_loadu_si512(b.as_ptr().add(base + SIMD_WIDTH * 3) as *const __m512i);
        let a3_abs = _mm512_abs_epi8(a3);
        let b3_signed = mm512_sign_epi8(b3, a3);
        sum3 = _mm512_dpbusd_epi32(sum3, a3_abs, b3_signed);
    }

    // Combine accumulators
    let sum01 = _mm512_add_epi32(sum0, sum1);
    let sum23 = _mm512_add_epi32(sum2, sum3);
    let sum_vec = _mm512_add_epi32(sum01, sum23);

    // Horizontal sum of 16 int32s
    let sum = _mm512_reduce_add_epi32(sum_vec);

    // Handle remainder with scalar
    let remainder_start = chunks * CHUNK_SIZE;
    let remainder: i32 = a[remainder_start..]
        .iter()
        .zip(b[remainder_start..].iter())
        .map(|(&x, &y)| x as i32 * y as i32)
        .sum();

    (sum + remainder) as f32
}

/// AVX2 int8 dot product with 4x unrolling and software prefetch.
///
/// # Safety
///
/// Caller must ensure:
/// - CPU supports AVX2 (verified via `simd_config()`)
/// - `a` and `b` have equal length (checked by caller)
///
/// Memory safety:
/// - Uses `_mm256_loadu_si256` for unaligned loads (safe for any alignment)
/// - Pointer arithmetic stays within slice bounds via chunk calculation
/// - Remainder handled via safe slice iteration
/// - Prefetch hint issues `_MM_HINT_T0` only when next chunk is within slice bounds
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn dot_product_i8_avx2_unrolled(a: &[i8], b: &[i8]) -> f32 {
    const SIMD_WIDTH: usize = 32;
    const UNROLL: usize = 4;
    const CHUNK_SIZE: usize = SIMD_WIDTH * UNROLL;
    // Prefetch one full chunk ahead for both input arrays.
    const PREFETCH_DISTANCE: usize = CHUNK_SIZE;
    let n = a.len();
    debug_assert_eq!(n, b.len());
    debug_assert!(a.iter().all(|&v| v != i8::MIN));
    debug_assert!(b.iter().all(|&v| v != i8::MIN));
    let chunks = n / CHUNK_SIZE;

    // 4 independent int32 accumulators
    let mut sum0 = _mm256_setzero_si256();
    let mut sum1 = _mm256_setzero_si256();
    let mut sum2 = _mm256_setzero_si256();
    let mut sum3 = _mm256_setzero_si256();

    let ones = _mm256_set1_epi16(1);

    for i in 0..chunks {
        let base = i * CHUNK_SIZE;

        // Software prefetch for the next chunk.
        let next_base = base + PREFETCH_DISTANCE;
        if next_base + CHUNK_SIZE <= n {
            _mm_prefetch(a.as_ptr().add(next_base), _MM_HINT_T0);
            _mm_prefetch(b.as_ptr().add(next_base), _MM_HINT_T0);
        }

        // Unroll 0
        let a0 = _mm256_loadu_si256(a.as_ptr().add(base) as *const __m256i);
        let b0 = _mm256_loadu_si256(b.as_ptr().add(base) as *const __m256i);
        let prod0 = _mm256_maddubs_epi16(_mm256_abs_epi8(a0), _mm256_sign_epi8(b0, a0));
        let prod0_32 = _mm256_madd_epi16(prod0, ones);
        sum0 = _mm256_add_epi32(sum0, prod0_32);

        // Unroll 1
        let a1 = _mm256_loadu_si256(a.as_ptr().add(base + SIMD_WIDTH) as *const __m256i);
        let b1 = _mm256_loadu_si256(b.as_ptr().add(base + SIMD_WIDTH) as *const __m256i);
        let prod1 = _mm256_maddubs_epi16(_mm256_abs_epi8(a1), _mm256_sign_epi8(b1, a1));
        let prod1_32 = _mm256_madd_epi16(prod1, ones);
        sum1 = _mm256_add_epi32(sum1, prod1_32);

        // Unroll 2
        let a2 = _mm256_loadu_si256(a.as_ptr().add(base + SIMD_WIDTH * 2) as *const __m256i);
        let b2 = _mm256_loadu_si256(b.as_ptr().add(base + SIMD_WIDTH * 2) as *const __m256i);
        let prod2 = _mm256_maddubs_epi16(_mm256_abs_epi8(a2), _mm256_sign_epi8(b2, a2));
        let prod2_32 = _mm256_madd_epi16(prod2, ones);
        sum2 = _mm256_add_epi32(sum2, prod2_32);

        // Unroll 3
        let a3 = _mm256_loadu_si256(a.as_ptr().add(base + SIMD_WIDTH * 3) as *const __m256i);
        let b3 = _mm256_loadu_si256(b.as_ptr().add(base + SIMD_WIDTH * 3) as *const __m256i);
        let prod3 = _mm256_maddubs_epi16(_mm256_abs_epi8(a3), _mm256_sign_epi8(b3, a3));
        let prod3_32 = _mm256_madd_epi16(prod3, ones);
        sum3 = _mm256_add_epi32(sum3, prod3_32);
    }

    // Combine accumulators
    let sum01 = _mm256_add_epi32(sum0, sum1);
    let sum23 = _mm256_add_epi32(sum2, sum3);
    let sum_vec = _mm256_add_epi32(sum01, sum23);

    // Horizontal sum
    let sum128_lo = _mm256_castsi256_si128(sum_vec);
    let sum128_hi = _mm256_extracti128_si256(sum_vec, 1);
    let sum128 = _mm_add_epi32(sum128_lo, sum128_hi);
    let sum64 = _mm_add_epi32(sum128, _mm_srli_si128(sum128, 8));
    let sum32 = _mm_add_epi32(sum64, _mm_srli_si128(sum64, 4));
    let sum = _mm_cvtsi128_si32(sum32);

    // Handle remainder
    let remainder_start = chunks * CHUNK_SIZE;
    let remainder: i32 = a[remainder_start..]
        .iter()
        .zip(b[remainder_start..].iter())
        .map(|(&x, &y)| x as i32 * y as i32)
        .sum();

    (sum + remainder) as f32
}

// ============================================================================
// INT8 kernel dispatch cache (mirrors f32 DotKernel pattern in dot_product.rs)
// ============================================================================

/// INT8 dot-product kernel function pointer type.
pub type I8DotKernel = fn(&[i8], &[i8]) -> f32;

static I8_DOT_KERNEL: OnceLock<I8DotKernel> = OnceLock::new();

/// Return the cached INT8 dot-product kernel.
///
/// Callers that invoke INT8 dot product in a tight loop can hoist this call
/// outside the loop so the OnceLock check runs once, not per-iteration.
#[inline]
pub fn resolved_i8_dot_kernel() -> I8DotKernel {
    *I8_DOT_KERNEL.get_or_init(resolve_i8_dot_kernel)
}

fn resolve_i8_dot_kernel() -> I8DotKernel {
    let config = simd_config();

    #[cfg(target_arch = "aarch64")]
    {
        // The NEON kernel uses SDOT (ARMv8.2 FEAT_DotProd), which is optional on
        // Armv8.2/v8.3. Only dispatch when dotprod_enabled is confirmed at runtime.
        if config.neon_enabled && config.dotprod_enabled {
            return dot_product_i8_neon_kernel;
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        #[cfg(feature = "avx512")]
        {
            if config.avx512vnni_enabled {
                return dot_product_i8_avx512vnni_kernel;
            }
        }
        if config.avx2_enabled {
            return dot_product_i8_avx2_kernel;
        }
    }

    dot_product_i8_scalar_kernel
}

#[cfg(target_arch = "aarch64")]
fn dot_product_i8_neon_kernel(a: &[i8], b: &[i8]) -> f32 {
    // SAFETY: stored only when NEON+dotprod detected at init time.
    unsafe { dot_product_i8_neon_unrolled(a, b) }
}

#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
fn dot_product_i8_avx512vnni_kernel(a: &[i8], b: &[i8]) -> f32 {
    debug_assert!(a.iter().all(|&v| v != i8::MIN));
    debug_assert!(b.iter().all(|&v| v != i8::MIN));
    // SAFETY: stored only when AVX-512F+VNNI+BW were detected at init time.
    unsafe { dot_product_i8_avx512vnni(a, b) }
}

#[cfg(target_arch = "x86_64")]
fn dot_product_i8_avx2_kernel(a: &[i8], b: &[i8]) -> f32 {
    debug_assert!(a.iter().all(|&v| v != i8::MIN));
    debug_assert!(b.iter().all(|&v| v != i8::MIN));
    // SAFETY: stored only when AVX2 was detected at init time.
    unsafe { dot_product_i8_avx2_unrolled(a, b) }
}

fn dot_product_i8_scalar_kernel(a: &[i8], b: &[i8]) -> f32 {
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| x as i32 * y as i32)
        .sum::<i32>() as f32
}

/// **Unstable**: raw SIMD INT8 hot path; signature and scaling semantics may change.
///
/// This is the hot-path function for HNSW quantized search. Unlike `dot_product_i8`,
/// it takes raw `&[i8]` slices and does NOT divide by scale factors -- the caller
/// handles scaling. This avoids allocating `QuantizedVector` wrappers.
///
/// Returns 0.0 if slices have different lengths.
///
/// # Invariant
///
/// Both slices must satisfy the `[-127, 127]` range invariant. The value `-128`
/// causes silent corruption in AVX2 and AVX-512 VNNI SIMD paths. This function
/// enforces the invariant with a release-mode `assert!` at the public boundary.
///
/// # Performance
///
#[inline]
fn dot_product_i8_dispatch(a: &[i8], b: &[i8]) -> f32 {
    resolved_i8_dot_kernel()(a, b)
}

/// **Unstable**: raw INT8 dot product on slices; SIMD strategy may change.
///
/// Uses the same SIMD paths as `dot_product_i8`:
/// - aarch64: NEON with 4x unrolling + tail SIMD chunks
/// - x86_64: AVX-512 VNNI > AVX2 > scalar
///
/// The key difference is zero allocation overhead: no `Vec<i8>`, no
/// `QuantizedVector`, no `QuantizationParams`. Just raw slices in, f32 out.
#[inline]
pub fn dot_product_i8_raw(a: &[i8], b: &[i8]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }
    debug_assert!(
        a.iter().all(|&v| v != -128i8),
        "dot_product_i8_raw: slice a contains -128, violating the [-127, 127] SIMD invariant"
    );
    debug_assert!(
        b.iter().all(|&v| v != -128i8),
        "dot_product_i8_raw: slice b contains -128, violating the [-127, 127] SIMD invariant"
    );
    dot_product_i8_dispatch(a, b)
}

#[cfg(test)]
mod simd_parity_tests {
    use super::*;

    fn gen_vec(dim: usize, seed: u64) -> Vec<f32> {
        let mut state = seed ^ ((dim as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15));
        (0..dim)
            .map(|i| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407)
                    .wrapping_add(i as u64);
                let unit = ((state >> 32) as u32) as f32 / u32::MAX as f32;
                unit * 2.0 - 1.0
            })
            .collect()
    }

    // FP-034: NEON SDOT vs scalar parity for INT8 dot product.
    // Gated on dotprod: SDOT is FEAT_DotProd, not baseline NEON.
    #[test]
    fn test_i8_neon_scalar_parity() {
        #[cfg(target_arch = "aarch64")]
        {
            if !super::super::SimdConfig::detect().dotprod_enabled {
                eprintln!("skipping SDOT parity test: dotprod not available");
                return;
            }
        }
        #[cfg(target_arch = "aarch64")]
        for dim in [7usize, 16, 64, 128, 384, 768] {
            let a_q = QuantizedVector::from_f32(&gen_vec(dim, 200 + dim as u64));
            let b_q = QuantizedVector::from_f32(&gen_vec(dim, 300 + dim as u64));

            // SAFETY: dotprod confirmed above; slices have equal length from from_f32.
            let neon = unsafe { dot_product_i8_neon_unrolled(&a_q.data, &b_q.data) };
            let scalar: f32 = a_q
                .data
                .iter()
                .zip(b_q.data.iter())
                .map(|(&x, &y)| x as i32 * y as i32)
                .sum::<i32>() as f32;

            let diff = (neon - scalar).abs();
            assert!(
                diff <= 1.0,
                "NEON vs scalar i8 dot product dim={dim}: neon={neon} scalar={scalar} diff={diff}"
            );
        }
    }

    // FP-034: AVX2 vs scalar parity for INT8 dot product.
    #[test]
    fn test_i8_avx2_scalar_parity() {
        #[cfg(target_arch = "x86_64")]
        if std::arch::is_x86_feature_detected!("avx2") {
            for dim in [7usize, 16, 64, 128, 384, 768] {
                let a_q = QuantizedVector::from_f32(&gen_vec(dim, 400 + dim as u64));
                let b_q = QuantizedVector::from_f32(&gen_vec(dim, 500 + dim as u64));

                // SAFETY: AVX2 verified by is_x86_feature_detected! above; slices have equal length.
                let avx2 = unsafe { dot_product_i8_avx2_unrolled(&a_q.data, &b_q.data) };
                let scalar: f32 = a_q
                    .data
                    .iter()
                    .zip(b_q.data.iter())
                    .map(|(&x, &y)| x as i32 * y as i32)
                    .sum::<i32>() as f32;

                let diff = (avx2 - scalar).abs();
                assert!(
                    diff <= 1.0,
                    "AVX2 vs scalar i8 dot product dim={dim}: avx2={avx2} scalar={scalar} diff={diff}"
                );
            }
        }
    }
}