velesdb-core 1.9.0

High-performance vector database engine written in Rust
Documentation
//! Distance metrics for vector similarity calculations.
//!
//! # Performance
//!
//! All distance calculations use direct SIMD dispatch via `simd_native` module,
//! eliminating intermediate dispatch overhead for maximum performance:
//! - **Cosine**: Direct AVX-512/AVX2/NEON intrinsics
//! - **Euclidean**: Direct native intrinsics with 4-acc unrolling
//! - **Dot Product**: Direct FMA-optimized intrinsics
//! - **Hamming (binary)**: POPCNT on packed u64 (48x faster than f32)
//! - **Jaccard**: Set similarity with SIMD acceleration

use crate::simd_native;
use serde::{Deserialize, Serialize};
use std::str::FromStr;

/// Distance metric for vector similarity calculations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum DistanceMetric {
    /// Cosine similarity (1 - `cosine_distance`).
    /// Best for normalized vectors, commonly used with text embeddings.
    Cosine,

    /// Euclidean distance (L2 norm).
    /// Best for spatial data and when magnitude matters.
    Euclidean,

    /// Dot product (inner product).
    /// Best for maximum inner product search (MIPS).
    DotProduct,

    /// Hamming distance for binary vectors.
    /// Counts the number of positions where bits differ.
    /// Best for binary embeddings and locality-sensitive hashing.
    Hamming,

    /// Jaccard similarity for set-like vectors.
    /// Measures intersection over union of non-zero elements.
    /// Best for sparse vectors, tags, and set membership.
    Jaccard,
}

impl DistanceMetric {
    /// Returns the canonical metric name used by user-facing APIs.
    #[must_use]
    pub const fn canonical_name(self) -> &'static str {
        match self {
            Self::Cosine => "cosine",
            Self::Euclidean => "euclidean",
            Self::DotProduct => "dot",
            Self::Hamming => "hamming",
            Self::Jaccard => "jaccard",
        }
    }

    /// Parses a metric name/alias into a [`DistanceMetric`].
    ///
    /// Supported aliases:
    /// - cosine
    /// - euclidean, l2
    /// - dot, dotproduct, inner
    /// - hamming
    /// - jaccard
    #[must_use]
    pub fn parse_alias(value: &str) -> Option<Self> {
        match value.trim().to_lowercase().as_str() {
            "cosine" => Some(Self::Cosine),
            "euclidean" | "l2" => Some(Self::Euclidean),
            "dot" | "dotproduct" | "inner" => Some(Self::DotProduct),
            "hamming" => Some(Self::Hamming),
            "jaccard" => Some(Self::Jaccard),
            _ => None,
        }
    }

    /// Calculates the distance between two vectors using the specified metric.
    ///
    /// # Arguments
    ///
    /// * `a` - First vector
    /// * `b` - Second vector
    ///
    /// # Returns
    ///
    /// Distance value (lower is more similar for Euclidean, higher for Cosine/DotProduct).
    ///
    /// # Panics
    ///
    /// Panics if vectors have different dimensions.
    ///
    /// # Performance
    ///
    /// Uses SIMD-optimized implementations. Typical latencies for 768d vectors:
    /// - Cosine: ~32ns
    /// - Euclidean: ~20ns
    /// - Dot Product: ~18ns
    #[must_use]
    #[inline]
    pub fn calculate(&self, a: &[f32], b: &[f32]) -> f32 {
        match self {
            Self::Cosine => simd_native::cosine_similarity_native(a, b),
            Self::Euclidean => simd_native::euclidean_native(a, b),
            Self::DotProduct => simd_native::dot_product_native(a, b),
            Self::Hamming => simd_native::hamming_distance_native(a, b) as f32,
            Self::Jaccard => simd_native::jaccard_similarity_native(a, b),
        }
    }

    /// Returns whether higher values indicate more similarity.
    #[must_use]
    pub const fn higher_is_better(&self) -> bool {
        match self {
            Self::Cosine | Self::DotProduct | Self::Jaccard => true,
            Self::Euclidean | Self::Hamming => false,
        }
    }

    /// Sorts search results by distance/similarity according to the metric.
    ///
    /// - **Similarity metrics** (`Cosine`, `DotProduct`, `Jaccard`): sorts descending (higher = better)
    /// - **Distance metrics** (`Euclidean`, `Hamming`): sorts ascending (lower = better)
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// let mut results = vec![(1, 0.9), (2, 0.7), (3, 0.8)];
    /// DistanceMetric::Cosine.sort_results(&mut results);
    /// assert_eq!(results[0].0, 1); // Highest similarity first
    /// ```
    pub fn sort_results(&self, results: &mut [(u64, f32)]) {
        if self.higher_is_better() {
            // Similarity metrics: descending order (higher = better)
            results.sort_by(|a, b| b.1.total_cmp(&a.1));
        } else {
            // Distance metrics: ascending order (lower = better)
            results.sort_by(|a, b| a.1.total_cmp(&b.1));
        }
    }

    /// Sorts scored results by the distance metric semantics.
    ///
    /// Same as [`sort_results`](Self::sort_results) but operates on
    /// [`ScoredResult`](crate::scored_result::ScoredResult) slices.
    pub fn sort_scored_results(&self, results: &mut [crate::scored_result::ScoredResult]) {
        if self.higher_is_better() {
            results.sort_by(|a, b| b.score.total_cmp(&a.score));
        } else {
            results.sort_by(|a, b| a.score.total_cmp(&b.score));
        }
    }
}

impl FromStr for DistanceMetric {
    type Err = &'static str;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::parse_alias(s).ok_or("Unknown metric. Use: cosine, euclidean, dot, hamming, jaccard")
    }
}