1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
//! Distance metrics for vector similarity calculations.
//!
//! # Performance
//!
//! All distance calculations use direct SIMD dispatch via the `simd_native` module,
//! eliminating intermediate dispatch overhead for maximum performance:
//! - **Cosine**: Direct AVX-512/AVX2/NEON intrinsics
//! - **Euclidean**: Direct native intrinsics with 4-acc unrolling
//! - **Dot Product**: Direct FMA-optimized intrinsics
//! - **Hamming (binary)**: POPCNT on packed u64 (48x faster than f32)
//! - **Jaccard**: Set similarity with SIMD acceleration
use crate::simd_native;
use serde::{Deserialize, Serialize};
use std::str::FromStr;
/// Distance metric for vector similarity calculations.
///
/// The variants fall into two groups, as reported by
/// [`DistanceMetric::higher_is_better`]:
/// - similarity-style metrics (`Cosine`, `DotProduct`, `Jaccard`), where a
///   higher value means a closer match;
/// - distance-style metrics (`Euclidean`, `Hamming`), where a lower value
///   means a closer match.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum DistanceMetric {
/// Cosine similarity (1 - `cosine_distance`).
/// Best for normalized vectors, commonly used with text embeddings.
/// Canonical name: `cosine`.
Cosine,
/// Euclidean distance (L2 norm).
/// Best for spatial data and when magnitude matters.
/// Canonical name: `euclidean`.
Euclidean,
/// Dot product (inner product).
/// Best for maximum inner product search (MIPS).
/// Canonical name: `dot`.
DotProduct,
/// Hamming distance for binary vectors.
/// Counts the number of positions where bits differ.
/// Best for binary embeddings and locality-sensitive hashing.
/// Canonical name: `hamming`.
Hamming,
/// Jaccard similarity for set-like vectors.
/// Measures intersection over union of non-zero elements.
/// Best for sparse vectors, tags, and set membership.
/// Canonical name: `jaccard`.
Jaccard,
}
impl DistanceMetric {
/// Returns the canonical metric name used by user-facing APIs.
#[must_use]
pub const fn canonical_name(self) -> &'static str {
match self {
Self::Cosine => "cosine",
Self::Euclidean => "euclidean",
Self::DotProduct => "dot",
Self::Hamming => "hamming",
Self::Jaccard => "jaccard",
}
}
/// Parses a metric name/alias into a [`DistanceMetric`].
///
/// Supported aliases:
/// - cosine
/// - euclidean, l2
/// - dot, dotproduct, inner
/// - hamming
/// - jaccard
#[must_use]
pub fn parse_alias(value: &str) -> Option<Self> {
match value.trim().to_lowercase().as_str() {
"cosine" => Some(Self::Cosine),
"euclidean" | "l2" => Some(Self::Euclidean),
"dot" | "dotproduct" | "inner" => Some(Self::DotProduct),
"hamming" => Some(Self::Hamming),
"jaccard" => Some(Self::Jaccard),
_ => None,
}
}
/// Calculates the distance between two vectors using the specified metric.
///
/// # Arguments
///
/// * `a` - First vector
/// * `b` - Second vector
///
/// # Returns
///
/// Distance value (lower is more similar for Euclidean, higher for Cosine/DotProduct).
///
/// # Panics
///
/// Panics if vectors have different dimensions.
///
/// # Performance
///
/// Uses SIMD-optimized implementations. Typical latencies for 768d vectors:
/// - Cosine: ~32ns
/// - Euclidean: ~20ns
/// - Dot Product: ~18ns
#[must_use]
#[inline]
pub fn calculate(&self, a: &[f32], b: &[f32]) -> f32 {
match self {
Self::Cosine => simd_native::cosine_similarity_native(a, b),
Self::Euclidean => simd_native::euclidean_native(a, b),
Self::DotProduct => simd_native::dot_product_native(a, b),
Self::Hamming => simd_native::hamming_distance_native(a, b) as f32,
Self::Jaccard => simd_native::jaccard_similarity_native(a, b),
}
}
/// Returns whether higher values indicate more similarity.
#[must_use]
pub const fn higher_is_better(&self) -> bool {
match self {
Self::Cosine | Self::DotProduct | Self::Jaccard => true,
Self::Euclidean | Self::Hamming => false,
}
}
/// Sorts search results by distance/similarity according to the metric.
///
/// - **Similarity metrics** (`Cosine`, `DotProduct`, `Jaccard`): sorts descending (higher = better)
/// - **Distance metrics** (`Euclidean`, `Hamming`): sorts ascending (lower = better)
///
/// # Example
///
/// ```rust,ignore
/// let mut results = vec![(1, 0.9), (2, 0.7), (3, 0.8)];
/// DistanceMetric::Cosine.sort_results(&mut results);
/// assert_eq!(results[0].0, 1); // Highest similarity first
/// ```
pub fn sort_results(&self, results: &mut [(u64, f32)]) {
if self.higher_is_better() {
// Similarity metrics: descending order (higher = better)
results.sort_by(|a, b| b.1.total_cmp(&a.1));
} else {
// Distance metrics: ascending order (lower = better)
results.sort_by(|a, b| a.1.total_cmp(&b.1));
}
}
/// Sorts scored results by the distance metric semantics.
///
/// Same as [`sort_results`](Self::sort_results) but operates on
/// [`ScoredResult`](crate::scored_result::ScoredResult) slices.
pub fn sort_scored_results(&self, results: &mut [crate::scored_result::ScoredResult]) {
if self.higher_is_better() {
results.sort_by(|a, b| b.score.total_cmp(&a.score));
} else {
results.sort_by(|a, b| a.score.total_cmp(&b.score));
}
}
}
impl FromStr for DistanceMetric {
    /// Fixed message listing the canonical metric names.
    type Err = &'static str;

    /// Parses a metric name or alias by delegating to
    /// [`DistanceMetric::parse_alias`].
    ///
    /// # Errors
    ///
    /// Returns a static message naming the canonical metrics when the input
    /// matches no known alias.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match Self::parse_alias(s) {
            Some(metric) => Ok(metric),
            None => Err("Unknown metric. Use: cosine, euclidean, dot, hamming, jaccard"),
        }
    }
}