velesdb-core 3.0.0

High-performance vector database engine written in Rust
Documentation
//! Distance metrics for vector similarity calculations.
//!
//! # Performance
//!
//! All distance calculations use direct SIMD dispatch via `simd_native` module,
//! eliminating intermediate dispatch overhead for maximum performance:
//! - **Cosine**: Direct AVX-512/AVX2/NEON intrinsics
//! - **Euclidean**: Direct native intrinsics with 4-acc unrolling
//! - **Dot Product**: Direct FMA-optimized intrinsics
//! - **Hamming (binary)**: `DistanceMetric::calculate` uses the f32 variant
//!   (0.5 threshold per component); the POPCNT-on-packed-u64 fast path
//!   (~48x faster) is a separate API consumed by the `RaBitQ` pipeline
//! - **Jaccard**: Set similarity with SIMD acceleration

use crate::simd_native;
use serde::{Deserialize, Serialize};
use std::str::FromStr;

/// Canonical names of every [`DistanceMetric`] variant, in declaration order.
///
/// Single source of truth for the metric name set exported to downstream
/// crates and bindings (Python `velesdb.DISTANCE_METRICS`, the integrations
/// security guard). Each entry is the variant's
/// [`canonical_name`](DistanceMetric::canonical_name); a unit test asserts the
/// slice stays exhaustive so adding a variant without updating it fails CI.
pub const DISTANCE_METRIC_NAMES: &[&str] = &["cosine", "euclidean", "dot", "hamming", "jaccard"];

/// Distance metric for vector similarity calculations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum DistanceMetric {
    /// Cosine similarity (1 - `cosine_distance`).
    /// Best for normalized vectors, commonly used with text embeddings.
    Cosine,

    /// Euclidean distance (L2 norm).
    /// Best for spatial data and when magnitude matters.
    Euclidean,

    /// Dot product (inner product).
    /// Best for maximum inner product search (MIPS).
    DotProduct,

    /// Hamming distance for binary vectors.
    /// Counts the number of positions where bits differ.
    /// Best for binary embeddings and locality-sensitive hashing.
    Hamming,

    /// Jaccard similarity for set-like vectors.
    /// Measures intersection over union of non-zero elements.
    /// Best for sparse vectors, tags, and set membership.
    Jaccard,
}

impl DistanceMetric {
    /// Returns the canonical metric name used by user-facing APIs.
    #[must_use]
    pub const fn canonical_name(self) -> &'static str {
        match self {
            Self::Cosine => "cosine",
            Self::Euclidean => "euclidean",
            Self::DotProduct => "dot",
            Self::Hamming => "hamming",
            Self::Jaccard => "jaccard",
        }
    }

    /// Parses a metric name/alias into a [`DistanceMetric`].
    ///
    /// Supported aliases:
    /// - cosine
    /// - euclidean, l2
    /// - dot, dotproduct, inner, ip
    /// - hamming
    /// - jaccard
    #[must_use]
    pub fn parse_alias(value: &str) -> Option<Self> {
        match value.trim().to_lowercase().as_str() {
            "cosine" => Some(Self::Cosine),
            "euclidean" | "l2" => Some(Self::Euclidean),
            "dot" | "dotproduct" | "inner" | "ip" => Some(Self::DotProduct),
            "hamming" => Some(Self::Hamming),
            "jaccard" => Some(Self::Jaccard),
            _ => None,
        }
    }

    /// Calculates the distance between two vectors using the specified metric.
    ///
    /// # Arguments
    ///
    /// * `a` - First vector
    /// * `b` - Second vector
    ///
    /// # Returns
    ///
    /// Distance value (lower is more similar for Euclidean, higher for Cosine/DotProduct).
    ///
    /// # Panics
    ///
    /// Panics if vectors have different dimensions.
    ///
    /// # Performance
    ///
    /// Uses SIMD-optimized implementations. Typical latencies for 768d vectors:
    /// - Cosine: ~32ns
    /// - Euclidean: ~20ns
    /// - Dot Product: ~18ns
    #[must_use]
    #[inline]
    pub fn calculate(&self, a: &[f32], b: &[f32]) -> f32 {
        match self {
            Self::Cosine => simd_native::cosine_similarity_native(a, b),
            Self::Euclidean => simd_native::euclidean_native(a, b),
            Self::DotProduct => simd_native::dot_product_native(a, b),
            Self::Hamming => simd_native::hamming_distance_native(a, b),
            Self::Jaccard => simd_native::jaccard_similarity_native(a, b),
        }
    }

    /// Returns whether higher values indicate more similarity.
    #[must_use]
    pub const fn higher_is_better(&self) -> bool {
        match self {
            Self::Cosine | Self::DotProduct | Self::Jaccard => true,
            Self::Euclidean | Self::Hamming => false,
        }
    }

    /// Sorts search results by distance/similarity according to the metric.
    ///
    /// - **Similarity metrics** (`Cosine`, `DotProduct`, `Jaccard`): sorts descending (higher = better)
    /// - **Distance metrics** (`Euclidean`, `Hamming`): sorts ascending (lower = better)
    ///
    /// Generic over the id type so both external ids (`u64`) and internal
    /// HNSW node ids (`usize`) can be sorted with the same metric semantics.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// let mut results = vec![(1, 0.9), (2, 0.7), (3, 0.8)];
    /// DistanceMetric::Cosine.sort_results(&mut results);
    /// assert_eq!(results[0].0, 1); // Highest similarity first
    /// ```
    pub fn sort_results<Id>(&self, results: &mut [(Id, f32)]) {
        if self.higher_is_better() {
            // Similarity metrics: descending order (higher = better)
            results.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
        } else {
            // Distance metrics: ascending order (lower = better)
            results.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
        }
    }

    /// Sorts scored results by the distance metric semantics.
    ///
    /// Same as [`sort_results`](Self::sort_results) but operates on
    /// [`ScoredResult`](crate::scored_result::ScoredResult) slices.
    pub fn sort_scored_results(&self, results: &mut [crate::scored_result::ScoredResult]) {
        if self.higher_is_better() {
            results.sort_unstable_by(|a, b| b.score.total_cmp(&a.score));
        } else {
            results.sort_unstable_by(|a, b| a.score.total_cmp(&b.score));
        }
    }
}

impl FromStr for DistanceMetric {
    type Err = &'static str;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::parse_alias(s)
            .ok_or("Unknown metric. Use: cosine, euclidean, l2, dot, dotproduct, inner, ip, hamming, jaccard")
    }
}

#[cfg(test)]
mod distance_metric_names_tests {
    use super::{DistanceMetric, DISTANCE_METRIC_NAMES};

    /// Forces this test to be revisited whenever a variant is added: the
    /// exhaustive `match` (no wildcard arm) fails to compile until the new
    /// variant is listed here, which in turn flags the missing const entry.
    fn ordinal(metric: DistanceMetric) -> usize {
        match metric {
            DistanceMetric::Cosine => 0,
            DistanceMetric::Euclidean => 1,
            DistanceMetric::DotProduct => 2,
            DistanceMetric::Hamming => 3,
            DistanceMetric::Jaccard => 4,
        }
    }

    #[test]
    fn distance_metric_names_is_exhaustive_and_canonical() {
        let variants = [
            DistanceMetric::Cosine,
            DistanceMetric::Euclidean,
            DistanceMetric::DotProduct,
            DistanceMetric::Hamming,
            DistanceMetric::Jaccard,
        ];
        // Tie the variant list to the `ordinal` tripwire so a new variant
        // cannot silently skip this assertion.
        assert_eq!(variants.len(), DISTANCE_METRIC_NAMES.len());
        for (i, variant) in variants.into_iter().enumerate() {
            assert_eq!(ordinal(variant), i);
            assert_eq!(DISTANCE_METRIC_NAMES[i], variant.canonical_name());
        }
    }
}