velesdb-core 1.7.1

//! NEON SIMD implementations for ARM64 (EPIC-054 US-001).
//!
//! This module provides NEON-optimized distance calculations for aarch64 targets.
//! Performance is comparable to x86_64 AVX2 (≥95% parity).

#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

/// NEON-optimized dot product for f32 vectors.
///
/// # Safety
/// Requires aarch64 target with NEON support.
/// Input slices must have equal length.
///
/// # Performance
/// - Uses vfmaq_f32 (fused multiply-add)
/// - Processes 4 elements per iteration
/// - ~3-4x faster than scalar on M1/M2
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn dot_product_neon(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len());

    let len = a.len();
    if len == 0 {
        return 0.0;
    }

    let chunks = len / 4;
    let remainder = len % 4;

    // Main SIMD loop
    let mut sum = vdupq_n_f32(0.0);

    for i in 0..chunks {
        let offset = i * 4;
        // SAFETY: NEON load and FMA require in-bounds pointers.
        // - Condition 1: Loop invariant `offset + 4 <= chunks * 4 <= len` keeps loads in bounds.
        // - Condition 2: `a` and `b` have equal length (debug assertion at entry).
        // Reason: Use NEON intrinsics for vectorized multiply-accumulate.
        let va = vld1q_f32(a.as_ptr().add(offset));
        let vb = vld1q_f32(b.as_ptr().add(offset));
        sum = vfmaq_f32(sum, va, vb); // sum += va * vb
    }

    // Horizontal sum of SIMD register
    let mut result = vaddvq_f32(sum);

    // Handle remainder (if len not divisible by 4) - unrolled for performance
    let base = chunks * 4;
    if remainder == 3 {
        result += a[base] * b[base] + a[base + 1] * b[base + 1] + a[base + 2] * b[base + 2];
    } else if remainder == 2 {
        result += a[base] * b[base] + a[base + 1] * b[base + 1];
    } else if remainder == 1 {
        result += a[base] * b[base];
    }

    result
}

/// NEON-optimized squared Euclidean distance.
///
/// # Safety
/// Requires aarch64 target with NEON support.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn euclidean_squared_neon(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len());

    let len = a.len();
    if len == 0 {
        return 0.0;
    }

    let chunks = len / 4;
    let remainder = len % 4;

    let mut sum = vdupq_n_f32(0.0);

    for i in 0..chunks {
        let offset = i * 4;
        // SAFETY: NEON load/sub/FMA require in-bounds pointers.
        // - Condition 1: Loop invariant `offset + 4 <= chunks * 4 <= len` keeps loads in bounds.
        // - Condition 2: `a` and `b` have equal length (debug assertion at entry).
        // Reason: SIMD distance accumulation is required for NEON fast path.
        let va = vld1q_f32(a.as_ptr().add(offset));
        let vb = vld1q_f32(b.as_ptr().add(offset));
        let diff = vsubq_f32(va, vb);
        sum = vfmaq_f32(sum, diff, diff); // sum += diff * diff
    }

    let mut result = vaddvq_f32(sum);

    let base = chunks * 4;
    if remainder == 3 {
        let d0 = a[base] - b[base];
        let d1 = a[base + 1] - b[base + 1];
        let d2 = a[base + 2] - b[base + 2];
        result += d0 * d0 + d1 * d1 + d2 * d2;
    } else if remainder == 2 {
        let d0 = a[base] - b[base];
        let d1 = a[base + 1] - b[base + 1];
        result += d0 * d0 + d1 * d1;
    } else if remainder == 1 {
        let d = a[base] - b[base];
        result += d * d;
    }

    result
}

/// NEON-optimized Euclidean distance (with sqrt).
///
/// # Safety
/// Requires aarch64 target with NEON support.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn euclidean_neon(a: &[f32], b: &[f32]) -> f32 {
    euclidean_squared_neon(a, b).sqrt()
}

/// NEON-optimized cosine similarity.
///
/// # Safety
/// Requires aarch64 target with NEON support.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn cosine_neon(a: &[f32], b: &[f32]) -> f32 {
    let dot = dot_product_neon(a, b);
    let norm_a = dot_product_neon(a, a).sqrt();
    let norm_b = dot_product_neon(b, b).sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        dot / (norm_a * norm_b)
    }
}

/// NEON-optimized cosine similarity for pre-normalized vectors.
///
/// # Safety
/// Requires aarch64 target with NEON support.
/// Vectors must be pre-normalized to unit length.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn cosine_normalized_neon(a: &[f32], b: &[f32]) -> f32 {
    // For normalized vectors, cosine = dot product
    dot_product_neon(a, b)
}

// =============================================================================
// Wrapper functions for dispatch (safe API)
// =============================================================================

/// Safe wrapper for dot product NEON.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn dot_product_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // SAFETY: Calling `dot_product_neon` requires NEON target support.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // Reason: Safe wrapper delegates to NEON implementation without repeating checks.
    unsafe { dot_product_neon(a, b) }
}

/// Safe wrapper for euclidean NEON.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn euclidean_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // SAFETY: Calling `euclidean_neon` requires NEON target support.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // Reason: Safe wrapper delegates to NEON implementation without repeating checks.
    unsafe { euclidean_neon(a, b) }
}

/// Safe wrapper for cosine NEON.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn cosine_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // SAFETY: Calling `cosine_neon` requires NEON target support.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // Reason: Safe wrapper delegates to NEON implementation without repeating checks.
    unsafe { cosine_neon(a, b) }
}

/// Safe wrapper for cosine normalized NEON.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn cosine_normalized_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // SAFETY: Calling `cosine_normalized_neon` requires NEON target support.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // Reason: Safe wrapper delegates to NEON implementation without repeating checks.
    unsafe { cosine_normalized_neon(a, b) }
}

// =============================================================================
// Tests
// =============================================================================

#[cfg(all(test, target_arch = "aarch64"))]
mod tests {
    use super::*;

    #[test]
    fn test_dot_product_neon_basic() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![1.0f32, 1.0, 1.0, 1.0];

        let result = dot_product_neon_safe(&a, &b);
        assert!((result - 10.0).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];

        let result = dot_product_neon_safe(&a, &b);
        assert!((result - 0.0).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_non_aligned() {
        // 7 elements - not divisible by 4
        let a = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
        let b = vec![1.0f32, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];

        let result = dot_product_neon_safe(&a, &b);
        assert!((result - 28.0).abs() < 1e-5);
    }

    #[test]
    fn test_euclidean_neon_basic() {
        let a = vec![0.0f32, 0.0, 0.0, 0.0];
        let b = vec![3.0f32, 4.0, 0.0, 0.0];

        let result = euclidean_neon_safe(&a, &b);
        assert!((result - 5.0).abs() < 1e-5);
    }

    #[test]
    fn test_cosine_neon_identical() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];

        let result = cosine_neon_safe(&a, &a);
        assert!((result - 1.0).abs() < 1e-5);
    }

    #[test]
    fn test_cosine_neon_orthogonal() {
        let a = vec![1.0f32, 0.0, 0.0, 0.0];
        let b = vec![0.0f32, 1.0, 0.0, 0.0];

        let result = cosine_neon_safe(&a, &b);
        assert!(result.abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_768d() {
        // Test with typical embedding dimension
        let a: Vec<f32> = (0..768).map(|i| (i as f32) * 0.001).collect();
        let b: Vec<f32> = (0..768).map(|i| (i as f32) * 0.002).collect();

        let neon_result = dot_product_neon_safe(&a, &b);

        // Compare with scalar
        let scalar_result: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();

        assert!(
            (neon_result - scalar_result).abs() < 1e-3,
            "NEON: {}, Scalar: {}",
            neon_result,
            scalar_result
        );
    }
}