// lcpfs 2026.1.102

// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Distance functions for vector similarity search.
//!
//! This module provides optimized implementations of common distance metrics
//! used in vector similarity search. All functions are `no_std` compatible
//! and use SIMD-friendly patterns for better auto-vectorization.
//!
//! # Metrics
//!
//! - **Cosine Similarity**: Measures the cosine of the angle between vectors.
//!   Returns 1.0 for identical directions, 0.0 for orthogonal, -1.0 for opposite.
//!
//! - **Euclidean Distance**: The straight-line distance between two points.
//!   Returns 0.0 for identical vectors, larger values for more distant vectors.
//!
//! - **Dot Product**: The inner product of two vectors. For normalized vectors,
//!   equals cosine similarity.
//!
//! - **Manhattan Distance**: Sum of absolute differences (L1 norm).
//!
//! # Performance
//!
//! These implementations use patterns that compilers can auto-vectorize:
//! - Loop unrolling friendly
//! - No branches in inner loops
//! - Accumulator patterns for reduction operations

use super::types::DistanceMetric;

// ═══════════════════════════════════════════════════════════════════════════════
// CORE DISTANCE FUNCTIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// Compute cosine similarity between two vectors.
///
/// Cosine similarity is the cosine of the angle between the two vectors:
/// `1.0` for the same direction, `0.0` for orthogonal vectors and `-1.0`
/// for opposite directions. For unit-length inputs it equals the dot product.
///
/// # Formula
///
/// ```text
/// cos(θ) = (a · b) / (||a|| × ||b||)
/// ```
///
/// # Returns
///
/// A value clamped to `[-1.0, 1.0]`. When the product of the two norms is
/// not greater than `1e-10` (for example a zero vector or empty input), the
/// angle is treated as undefined and `0.0` is returned.
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds. In release
/// builds the assertion is compiled out: a `b` shorter than `a` still panics
/// on indexing, while extra trailing elements of a longer `b` are ignored.
///
/// # Examples
///
/// ```ignore
/// let a = [1.0, 0.0, 0.0];
/// let b = [0.0, 1.0, 0.0];
/// assert!((cosine_similarity(&a, &b) - 0.0).abs() < 1e-6); // Orthogonal
/// ```
#[inline]
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "vectors must have same length");

    let mut dot = 0.0f32;
    let mut norm_a = 0.0f32;
    let mut norm_b = 0.0f32;

    // Accumulate the dot product and both squared norms in a single pass.
    for (i, &x) in a.iter().enumerate() {
        let y = b[i];
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let denom = libm::sqrtf(norm_a) * libm::sqrtf(norm_b);
    if denom > 1e-10 {
        // Rounding in the two square roots can leave `denom` a hair smaller
        // than `|dot|` for (anti-)parallel vectors, e.g. [1, 1] against itself
        // yields 2 / 1.9999999. Clamp so callers never see a cosine outside
        // [-1, 1] (and therefore never a negative cosine distance).
        (dot / denom).clamp(-1.0, 1.0)
    } else {
        0.0
    }
}

/// Compute cosine distance, defined as `1 - cosine_similarity(a, b)`.
///
/// Nominally `0` for vectors pointing the same way, `1` for orthogonal
/// vectors and `2` for opposite vectors. Handy for algorithms that want a
/// "smaller is closer" score instead of a similarity.
#[inline]
pub fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let similarity = cosine_similarity(a, b);
    1.0 - similarity
}

/// Compute Euclidean (L2) distance between two vectors.
///
/// This is the length of the straight line joining the two points in
/// n-dimensional space.
///
/// # Formula
///
/// ```text
/// d(a, b) = sqrt(Σ(aᵢ - bᵢ)²)
/// ```
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds.
#[inline]
pub fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "vectors must have same length");

    // `b` is indexed (not zipped) so a too-short `b` still panics when the
    // debug assertion is compiled out, rather than being silently truncated.
    let squared = a.iter().enumerate().fold(0.0f32, |acc, (i, &x)| {
        let delta = x - b[i];
        acc + delta * delta
    });

    libm::sqrtf(squared)
}

/// Compute squared Euclidean distance (no square root).
///
/// Squaring preserves ordering, so when distances are only compared against
/// each other this cheaper form can stand in for the true L2 distance.
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds.
#[inline]
pub fn euclidean_distance_squared(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "vectors must have same length");

    let mut total = 0.0f32;
    // Index `b` by position so a too-short `b` panics instead of truncating.
    for (i, &x) in a.iter().enumerate() {
        let delta = x - b[i];
        total += delta * delta;
    }
    total
}

/// Compute dot product (inner product) of two vectors.
///
/// For unit-length vectors the dot product coincides with cosine similarity.
/// It also ties into Euclidean distance through the identity:
/// ```text
/// ||a - b||² = ||a||² + ||b||² - 2(a · b)
/// ```
///
/// # Formula
///
/// ```text
/// a · b = Σ(aᵢ × bᵢ)
/// ```
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds.
#[inline]
pub fn dot_product(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "vectors must have same length");

    // Left-to-right accumulation starting from 0.0; `b` is indexed so that a
    // too-short `b` panics rather than shortening the sum.
    a.iter()
        .enumerate()
        .fold(0.0f32, |acc, (i, &x)| acc + x * b[i])
}

/// Compute the negated dot product, usable as a distance.
///
/// Searching for the largest dot product is the same as searching for the
/// smallest negated dot product, which lets dot-product similarity plug into
/// code that minimises a distance.
#[inline]
pub fn negative_dot_product(a: &[f32], b: &[f32]) -> f32 {
    let similarity = dot_product(a, b);
    -similarity
}

/// Compute Manhattan (L1) distance between two vectors.
///
/// Also called taxicab distance: the sum of the absolute per-component
/// differences.
///
/// # Formula
///
/// ```text
/// d(a, b) = Σ|aᵢ - bᵢ|
/// ```
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds.
#[inline]
pub fn manhattan_distance(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "vectors must have same length");

    let mut total = 0.0f32;
    // Index `b` by position so a too-short `b` panics instead of truncating.
    for (i, &x) in a.iter().enumerate() {
        total += libm::fabsf(x - b[i]);
    }
    total
}

/// Compute Hamming distance between bit-packed binary vectors.
///
/// XORs the inputs byte by byte and counts the set bits, i.e. the number of
/// bit positions in which the two vectors disagree.
///
/// # Arguments
///
/// * `a` - First binary vector (each `u8` packs 8 bits)
/// * `b` - Second binary vector
///
/// # Returns
///
/// Number of differing bits.
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds.
#[inline]
pub fn hamming_distance(a: &[u8], b: &[u8]) -> u32 {
    debug_assert_eq!(a.len(), b.len(), "vectors must have same length");

    // `b` is indexed (not zipped) so a too-short `b` still panics when the
    // debug assertion is compiled out.
    a.iter()
        .enumerate()
        .fold(0u32, |bits, (i, &byte)| bits + (byte ^ b[i]).count_ones())
}

// ═══════════════════════════════════════════════════════════════════════════════
// DISTANCE METRIC DISPATCH
// ═══════════════════════════════════════════════════════════════════════════════

/// Compute the distance between `a` and `b` under the given metric.
///
/// Dispatches on `metric` to one of the distance functions in this module.
///
/// # Returns
///
/// A score where lower means more similar:
///
/// - `Cosine`: cosine distance (`1 - cosine similarity`).
/// - `DotProduct`: the negated dot product.
/// - `Euclidean` / `Manhattan`: the respective distance, unchanged.
/// - `Hamming`: Euclidean distance, used as a stand-in because Hamming
///   distance is not defined on `f32` data without binary quantization.
#[inline]
pub fn compute_distance(a: &[f32], b: &[f32], metric: DistanceMetric) -> f32 {
    match metric {
        // Hamming needs bit-packed input; for raw f32 vectors it shares the
        // Euclidean implementation.
        DistanceMetric::Euclidean | DistanceMetric::Hamming => euclidean_distance(a, b),
        DistanceMetric::Manhattan => manhattan_distance(a, b),
        DistanceMetric::Cosine => cosine_distance(a, b),
        DistanceMetric::DotProduct => negative_dot_product(a, b),
    }
}

/// Compute a similarity score between `a` and `b` under the given metric.
///
/// Higher means more similar. Each metric is squashed towards `[0, 1]`:
///
/// - `Cosine`: `(cos + 1) / 2`, mapping `[-1, 1]` onto `[0, 1]`.
/// - `Euclidean`: `exp(-distance)`.
/// - `DotProduct`: logistic sigmoid of the dot product.
/// - `Manhattan`: `exp(-distance / dimensions)`; empty vectors score `1.0`
///   (zero distance), matching what the Euclidean branch gives for them.
/// - `Hamming`: not applicable to `f32` data, so the cosine mapping is used.
///   NOTE(review): `compute_distance` falls back to Euclidean for `Hamming`,
///   so the two functions may rank neighbours differently for that metric —
///   confirm this is intended.
///
/// # Panics
///
/// Panics if vectors have different lengths in debug builds (via the
/// underlying distance functions).
#[inline]
pub fn compute_similarity(a: &[f32], b: &[f32], metric: DistanceMetric) -> f32 {
    match metric {
        DistanceMetric::Cosine => {
            // Cosine similarity is already in [-1, 1], map to [0, 1]
            (cosine_similarity(a, b) + 1.0) / 2.0
        }
        DistanceMetric::Euclidean => {
            // Convert distance to similarity using exponential decay
            let dist = euclidean_distance(a, b);
            libm::expf(-dist)
        }
        DistanceMetric::DotProduct => {
            // Dot product can be any value, sigmoid to [0, 1]
            let dp = dot_product(a, b);
            1.0 / (1.0 + libm::expf(-dp))
        }
        DistanceMetric::Manhattan => {
            // Exponential decay of the per-dimension average difference.
            // Guard the divisor: with empty input the distance is 0 and
            // dividing by a length of 0 would produce NaN instead of 1.0.
            let dist = manhattan_distance(a, b);
            let dims = a.len().max(1) as f32;
            libm::expf(-dist / dims)
        }
        DistanceMetric::Hamming => {
            // Not directly applicable to f32 vectors
            (cosine_similarity(a, b) + 1.0) / 2.0
        }
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// VECTOR OPERATIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// Compute the L2 norm (Euclidean magnitude) of a vector.
///
/// Returns the square root of the sum of squared components; `0.0` for an
/// empty slice.
#[inline]
pub fn l2_norm(v: &[f32]) -> f32 {
    let sum_of_squares = v.iter().fold(0.0f32, |acc, &x| acc + x * x);
    libm::sqrtf(sum_of_squares)
}

/// Compute the L1 norm of a vector.
///
/// Returns the sum of the absolute values of the components; `0.0` for an
/// empty slice.
#[inline]
pub fn l1_norm(v: &[f32]) -> f32 {
    v.iter().fold(0.0f32, |acc, &x| acc + libm::fabsf(x))
}

/// Normalize a vector in-place to unit length.
///
/// Every component is divided by the vector's L2 norm. If the norm is not
/// greater than `1e-10` (e.g. a zero vector), the slice is left untouched.
#[inline]
pub fn normalize_inplace(v: &mut [f32]) {
    let magnitude = l2_norm(v);
    if magnitude > 1e-10 {
        v.iter_mut().for_each(|component| *component /= magnitude);
    }
}

/// Normalize a vector, returning a new vector.
///
/// The result holds each component divided by the input's L2 norm. If the
/// norm is not greater than `1e-10` (e.g. a zero vector), an unmodified copy
/// of the input is returned.
#[inline]
pub fn normalize(v: &[f32]) -> alloc::vec::Vec<f32> {
    let magnitude = l2_norm(v);
    let mut unit = v.to_vec();
    if magnitude > 1e-10 {
        for component in unit.iter_mut() {
            *component /= magnitude;
        }
    }
    unit
}

/// Add two vectors element-wise, returning `a + b` as a new vector.
///
/// Lengths are checked with a debug assertion only; when that is compiled
/// out, the result is as long as the shorter input.
#[inline]
pub fn vector_add(a: &[f32], b: &[f32]) -> alloc::vec::Vec<f32> {
    debug_assert_eq!(a.len(), b.len());
    let mut sum = alloc::vec::Vec::with_capacity(a.len().min(b.len()));
    for (&x, &y) in a.iter().zip(b) {
        sum.push(x + y);
    }
    sum
}

/// Subtract two vectors element-wise, returning `a - b` as a new vector.
///
/// Lengths are checked with a debug assertion only; when that is compiled
/// out, the result is as long as the shorter input.
#[inline]
pub fn vector_sub(a: &[f32], b: &[f32]) -> alloc::vec::Vec<f32> {
    debug_assert_eq!(a.len(), b.len());
    let mut difference = alloc::vec::Vec::with_capacity(a.len().min(b.len()));
    difference.extend(a.iter().zip(b).map(|(&x, &y)| x - y));
    difference
}

/// Scale a vector by a scalar, returning a new vector with every component
/// of `v` multiplied by `s`.
#[inline]
pub fn vector_scale(v: &[f32], s: f32) -> alloc::vec::Vec<f32> {
    let mut scaled = v.to_vec();
    for component in scaled.iter_mut() {
        *component *= s;
    }
    scaled
}

/// Compute the centroid (component-wise mean) of multiple vectors.
///
/// The dimensionality is taken from the first vector. An empty input yields
/// an empty vector.
///
/// # Panics
///
/// Panics in debug builds if any vector's length differs from the first
/// one's. With the assertion compiled out, a longer vector still panics on
/// indexing, while a shorter one only contributes to its leading components.
pub fn centroid(vectors: &[&[f32]]) -> alloc::vec::Vec<f32> {
    let dim = match vectors.first() {
        Some(first) => first.len(),
        None => return alloc::vec::Vec::new(),
    };

    // Sum every vector into the accumulator, component by component.
    let mut mean = alloc::vec![0.0f32; dim];
    for v in vectors {
        debug_assert_eq!(v.len(), dim);
        v.iter().enumerate().for_each(|(i, &x)| mean[i] += x);
    }

    // Turn the sums into averages.
    let count = vectors.len() as f32;
    mean.iter_mut().for_each(|component| *component /= count);

    mean
}

// ═══════════════════════════════════════════════════════════════════════════════
// TESTS
// ═══════════════════════════════════════════════════════════════════════════════

#[cfg(test)]
mod tests {
    use super::*;

    // Absolute tolerance used for every floating-point comparison below.
    const EPSILON: f32 = 1e-6;

    // True when `a` and `b` differ by less than `EPSILON`.
    fn approx_eq(a: f32, b: f32) -> bool {
        (a - b).abs() < EPSILON
    }

    // ── cosine similarity ────────────────────────────────────────────────

    #[test]
    fn test_cosine_similarity_identical() {
        let v = [1.0f32, 2.0, 3.0];
        let same = [1.0f32, 2.0, 3.0];
        let cos = cosine_similarity(&v, &same);
        assert!(approx_eq(cos, 1.0));
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let x_axis = [1.0f32, 0.0, 0.0];
        let y_axis = [0.0f32, 1.0, 0.0];
        let cos = cosine_similarity(&x_axis, &y_axis);
        assert!(approx_eq(cos, 0.0));
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let forward = [1.0f32, 0.0, 0.0];
        let backward = [-1.0f32, 0.0, 0.0];
        let cos = cosine_similarity(&forward, &backward);
        assert!(approx_eq(cos, -1.0));
    }

    #[test]
    fn test_cosine_similarity_scaled() {
        // Doubling the magnitude must not change the direction.
        let base = [1.0f32, 2.0, 3.0];
        let doubled = [2.0f32, 4.0, 6.0];
        let cos = cosine_similarity(&base, &doubled);
        assert!(approx_eq(cos, 1.0));
    }

    // ── euclidean distance ───────────────────────────────────────────────

    #[test]
    fn test_euclidean_distance_identical() {
        let p = [1.0f32, 2.0, 3.0];
        let q = [1.0f32, 2.0, 3.0];
        let dist = euclidean_distance(&p, &q);
        assert!(approx_eq(dist, 0.0));
    }

    #[test]
    fn test_euclidean_distance_known() {
        // 3-4-5 right triangle.
        let origin = [0.0f32, 0.0];
        let corner = [3.0f32, 4.0];
        let dist = euclidean_distance(&origin, &corner);
        assert!(approx_eq(dist, 5.0));
    }

    #[test]
    fn test_euclidean_distance_unit() {
        let origin = [0.0f32, 0.0, 0.0];
        let unit_x = [1.0f32, 0.0, 0.0];
        let dist = euclidean_distance(&origin, &unit_x);
        assert!(approx_eq(dist, 1.0));
    }

    // ── dot product ──────────────────────────────────────────────────────

    #[test]
    fn test_dot_product_orthogonal() {
        let x_axis = [1.0f32, 0.0, 0.0];
        let y_axis = [0.0f32, 1.0, 0.0];
        let dp = dot_product(&x_axis, &y_axis);
        assert!(approx_eq(dp, 0.0));
    }

    #[test]
    fn test_dot_product_parallel() {
        // 1 + 4 + 9
        let v = [1.0f32, 2.0, 3.0];
        let same = [1.0f32, 2.0, 3.0];
        let dp = dot_product(&v, &same);
        assert!(approx_eq(dp, 14.0));
    }

    // ── manhattan / hamming ──────────────────────────────────────────────

    #[test]
    fn test_manhattan_distance_known() {
        // 3 + 4
        let origin = [0.0f32, 0.0];
        let corner = [3.0f32, 4.0];
        let dist = manhattan_distance(&origin, &corner);
        assert!(approx_eq(dist, 7.0));
    }

    #[test]
    fn test_hamming_distance() {
        // First byte matches; every bit of the second byte differs.
        let lhs = [0b10101010u8, 0b11110000];
        let rhs = [0b10101010u8, 0b00001111];
        assert_eq!(hamming_distance(&lhs, &rhs), 8);
    }

    // ── vector helpers ───────────────────────────────────────────────────

    #[test]
    fn test_l2_norm() {
        let v = [3.0f32, 4.0];
        let magnitude = l2_norm(&v);
        assert!(approx_eq(magnitude, 5.0));
    }

    #[test]
    fn test_normalize() {
        let v = [3.0f32, 4.0];
        let unit = normalize(&v);
        assert!(approx_eq(unit[0], 0.6));
        assert!(approx_eq(unit[1], 0.8));
        assert!(approx_eq(l2_norm(&unit), 1.0));
    }

    #[test]
    fn test_centroid() {
        let p1 = [1.0f32, 0.0];
        let p2 = [0.0f32, 1.0];
        let p3 = [1.0f32, 1.0];
        let mean = centroid(&[&p1[..], &p2[..], &p3[..]]);
        assert!(approx_eq(mean[0], 2.0 / 3.0));
        assert!(approx_eq(mean[1], 2.0 / 3.0));
    }

    // ── metric dispatch ──────────────────────────────────────────────────

    #[test]
    fn test_compute_distance_cosine() {
        // Orthogonal vectors: cosine distance = 1 - 0 = 1
        let x_axis = [1.0f32, 0.0];
        let y_axis = [0.0f32, 1.0];
        let dist = compute_distance(&x_axis, &y_axis, DistanceMetric::Cosine);
        assert!(approx_eq(dist, 1.0));
    }

    #[test]
    fn test_compute_similarity_cosine() {
        // Identical vectors: (1 + 1) / 2 = 1
        let v = [1.0f32, 0.0];
        let same = [1.0f32, 0.0];
        let sim = compute_similarity(&v, &same, DistanceMetric::Cosine);
        assert!(approx_eq(sim, 1.0));
    }

    // ── larger inputs ────────────────────────────────────────────────────

    #[test]
    fn test_high_dimensional_vectors() {
        // Two independently built, identical 512-dimensional ramps.
        let dim = 512;
        let ramp = || -> alloc::vec::Vec<f32> {
            (0..dim).map(|i| (i as f32) / dim as f32).collect()
        };
        let a = ramp();
        let b = ramp();

        assert!(approx_eq(cosine_similarity(&a, &b), 1.0));
        assert!(approx_eq(euclidean_distance(&a, &b), 0.0));
    }
}