Skip to main content

grafeo_core/index/vector/
mod.rs

1//! Vector similarity search support.
2//!
3//! This module provides infrastructure for storing and searching vector embeddings,
4//! enabling AI/ML use cases like RAG, semantic search, and recommendations.
5//!
6//! # Distance Metrics
7//!
8//! Choose the metric based on your embedding type:
9//!
10//! | Metric | Best For | Range |
11//! |--------|----------|-------|
12//! | [`Cosine`](DistanceMetric::Cosine) | Normalized embeddings (text) | [0, 2] |
13//! | [`Euclidean`](DistanceMetric::Euclidean) | Raw embeddings | [0, inf) |
14//! | [`DotProduct`](DistanceMetric::DotProduct) | Max inner product search | (-inf, inf) |
15//! | [`Manhattan`](DistanceMetric::Manhattan) | Outlier-resistant | [0, inf) |
16//!
17//! # Index Types
18//!
19//! | Index | Complexity | Use Case |
20//! |-------|------------|----------|
21//! | [`brute_force_knn`] | O(n) | Small datasets, exact results |
22//! | [`HnswIndex`] | O(log n) | Large datasets, approximate results |
23//!
24//! # Example
25//!
26//! ```
27//! use grafeo_core::index::vector::{compute_distance, DistanceMetric, brute_force_knn};
28//! use grafeo_common::types::NodeId;
29//!
30//! // Compute distance between two vectors
31//! let query = [0.1f32, 0.2, 0.3];
32//! let doc1 = [0.1f32, 0.2, 0.35];
33//! let doc2 = [0.5f32, 0.6, 0.7];
34//!
35//! let dist1 = compute_distance(&query, &doc1, DistanceMetric::Cosine);
36//! let dist2 = compute_distance(&query, &doc2, DistanceMetric::Cosine);
37//!
38//! // doc1 is more similar (smaller distance)
39//! assert!(dist1 < dist2);
40//!
41//! // Brute-force k-NN search
42//! let vectors = vec![
43//!     (NodeId::new(1), doc1.as_slice()),
44//!     (NodeId::new(2), doc2.as_slice()),
45//! ];
46//!
47//! let results = brute_force_knn(vectors.into_iter(), &query, 1, DistanceMetric::Cosine);
48//! assert_eq!(results[0].0, NodeId::new(1)); // doc1 is closest
49//! ```
50//!
51//! # HNSW Index (requires `vector-index` feature)
52//!
53//! For larger datasets, use the HNSW approximate nearest neighbor index:
54//!
55//! ```ignore
56//! use grafeo_core::index::vector::{HnswIndex, HnswConfig, DistanceMetric};
57//! use grafeo_common::types::NodeId;
58//!
59//! let config = HnswConfig::new(384, DistanceMetric::Cosine);
60//! let index = HnswIndex::new(config);
61//!
62//! // Insert vectors
63//! index.insert(NodeId::new(1), &embedding);
64//!
65//! // Search (O(log n))
66//! let results = index.search(&query, 10);
67//! ```
68
69mod accessor;
70mod distance;
71mod mmr;
72pub mod quantization;
73mod simd;
74pub mod storage;
75pub mod zone_map;
76
77#[cfg(feature = "vector-index")]
78mod config;
79#[cfg(feature = "vector-index")]
80mod hnsw;
81#[cfg(feature = "vector-index")]
82mod quantized_hnsw;
83
84pub use accessor::{PropertyVectorAccessor, VectorAccessor};
85pub use distance::{
86    DistanceMetric, compute_distance, cosine_distance, cosine_similarity, dot_product,
87    euclidean_distance, euclidean_distance_squared, l2_norm, manhattan_distance, normalize,
88    simd_support,
89};
90pub use mmr::mmr_select;
91pub use quantization::{BinaryQuantizer, ProductQuantizer, QuantizationType, ScalarQuantizer};
92#[cfg(feature = "mmap")]
93pub use storage::MmapStorage;
94pub use storage::{RamStorage, StorageBackend, VectorStorage};
95pub use zone_map::VectorZoneMap;
96
97#[cfg(feature = "vector-index")]
98pub use config::HnswConfig;
99#[cfg(feature = "vector-index")]
100pub use hnsw::HnswIndex;
101#[cfg(feature = "vector-index")]
102pub use quantized_hnsw::QuantizedHnswIndex;
103
104use grafeo_common::types::NodeId;
105
106/// Configuration for vector search operations.
107#[derive(Debug, Clone)]
108pub struct VectorConfig {
109    /// Expected vector dimensions (for validation).
110    pub dimensions: usize,
111    /// Distance metric for similarity computation.
112    pub metric: DistanceMetric,
113}
114
115impl VectorConfig {
116    /// Creates a new vector configuration.
117    #[must_use]
118    pub const fn new(dimensions: usize, metric: DistanceMetric) -> Self {
119        Self { dimensions, metric }
120    }
121
122    /// Creates a configuration for cosine similarity with the given dimensions.
123    #[must_use]
124    pub const fn cosine(dimensions: usize) -> Self {
125        Self::new(dimensions, DistanceMetric::Cosine)
126    }
127
128    /// Creates a configuration for Euclidean distance with the given dimensions.
129    #[must_use]
130    pub const fn euclidean(dimensions: usize) -> Self {
131        Self::new(dimensions, DistanceMetric::Euclidean)
132    }
133}
134
135impl Default for VectorConfig {
136    fn default() -> Self {
137        Self {
138            dimensions: 384, // Common embedding size (MiniLM, etc.)
139            metric: DistanceMetric::default(),
140        }
141    }
142}
143
144/// Performs brute-force k-nearest neighbor search.
145///
146/// This is O(n) where n is the number of vectors. Use this for:
147/// - Small datasets (< 10K vectors)
148/// - Baseline comparisons
149/// - Exact nearest neighbor search
150///
151/// For larger datasets, use an approximate index like HNSW.
152///
153/// # Arguments
154///
155/// * `vectors` - Iterator of (id, vector) pairs to search
156/// * `query` - The query vector
157/// * `k` - Number of nearest neighbors to return
158/// * `metric` - Distance metric to use
159///
160/// # Returns
161///
162/// Vector of (id, distance) pairs sorted by distance (ascending).
163///
164/// # Example
165///
166/// ```
167/// use grafeo_core::index::vector::{brute_force_knn, DistanceMetric};
168/// use grafeo_common::types::NodeId;
169///
170/// let vectors = vec![
171///     (NodeId::new(1), [0.1f32, 0.2, 0.3].as_slice()),
172///     (NodeId::new(2), [0.4f32, 0.5, 0.6].as_slice()),
173///     (NodeId::new(3), [0.7f32, 0.8, 0.9].as_slice()),
174/// ];
175///
176/// let query = [0.15f32, 0.25, 0.35];
177/// let results = brute_force_knn(vectors.into_iter(), &query, 2, DistanceMetric::Euclidean);
178///
179/// assert_eq!(results.len(), 2);
180/// assert_eq!(results[0].0, NodeId::new(1)); // Closest
181/// ```
182pub fn brute_force_knn<'a, I>(
183    vectors: I,
184    query: &[f32],
185    k: usize,
186    metric: DistanceMetric,
187) -> Vec<(NodeId, f32)>
188where
189    I: Iterator<Item = (NodeId, &'a [f32])>,
190{
191    let mut results: Vec<(NodeId, f32)> = vectors
192        .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
193        .collect();
194
195    // Sort by distance (ascending)
196    results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
197
198    // Truncate to k
199    results.truncate(k);
200    results
201}
202
203/// Performs brute-force k-nearest neighbor search with a filter predicate.
204///
205/// Only considers vectors where the predicate returns true.
206///
207/// # Arguments
208///
209/// * `vectors` - Iterator of (id, vector) pairs to search
210/// * `query` - The query vector
211/// * `k` - Number of nearest neighbors to return
212/// * `metric` - Distance metric to use
213/// * `predicate` - Filter function; only vectors where this returns true are considered
214///
215/// # Returns
216///
217/// Vector of (id, distance) pairs sorted by distance (ascending).
218pub fn brute_force_knn_filtered<'a, I, F>(
219    vectors: I,
220    query: &[f32],
221    k: usize,
222    metric: DistanceMetric,
223    predicate: F,
224) -> Vec<(NodeId, f32)>
225where
226    I: Iterator<Item = (NodeId, &'a [f32])>,
227    F: Fn(NodeId) -> bool,
228{
229    let mut results: Vec<(NodeId, f32)> = vectors
230        .filter(|(id, _)| predicate(*id))
231        .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
232        .collect();
233
234    results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
235    results.truncate(k);
236    results
237}
238
239/// Computes the distance between a query and multiple vectors in batch.
240///
241/// More efficient than computing distances one by one for large batches.
242///
243/// # Returns
244///
245/// Vector of (id, distance) pairs in the same order as input.
246pub fn batch_distances<'a, I>(
247    vectors: I,
248    query: &[f32],
249    metric: DistanceMetric,
250) -> Vec<(NodeId, f32)>
251where
252    I: Iterator<Item = (NodeId, &'a [f32])>,
253{
254    vectors
255        .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
256        .collect()
257}
258
259#[cfg(test)]
260mod tests {
261    use super::*;
262
263    #[test]
264    fn test_vector_config_default() {
265        let config = VectorConfig::default();
266        assert_eq!(config.dimensions, 384);
267        assert_eq!(config.metric, DistanceMetric::Cosine);
268    }
269
270    #[test]
271    fn test_vector_config_constructors() {
272        let cosine = VectorConfig::cosine(768);
273        assert_eq!(cosine.dimensions, 768);
274        assert_eq!(cosine.metric, DistanceMetric::Cosine);
275
276        let euclidean = VectorConfig::euclidean(1536);
277        assert_eq!(euclidean.dimensions, 1536);
278        assert_eq!(euclidean.metric, DistanceMetric::Euclidean);
279    }
280
281    #[test]
282    fn test_brute_force_knn() {
283        let vectors = vec![
284            (NodeId::new(1), [0.0f32, 0.0, 0.0].as_slice()),
285            (NodeId::new(2), [1.0f32, 0.0, 0.0].as_slice()),
286            (NodeId::new(3), [2.0f32, 0.0, 0.0].as_slice()),
287            (NodeId::new(4), [3.0f32, 0.0, 0.0].as_slice()),
288        ];
289
290        let query = [0.5f32, 0.0, 0.0];
291        let results = brute_force_knn(vectors.into_iter(), &query, 2, DistanceMetric::Euclidean);
292
293        assert_eq!(results.len(), 2);
294        // Closest should be node 1 (dist 0.5) or node 2 (dist 0.5)
295        assert!(results[0].0 == NodeId::new(1) || results[0].0 == NodeId::new(2));
296    }
297
298    #[test]
299    fn test_brute_force_knn_empty() {
300        let vectors: Vec<(NodeId, &[f32])> = vec![];
301        let query = [0.0f32, 0.0];
302        let results = brute_force_knn(vectors.into_iter(), &query, 10, DistanceMetric::Cosine);
303        assert!(results.is_empty());
304    }
305
306    #[test]
307    fn test_brute_force_knn_k_larger_than_n() {
308        let vectors = vec![
309            (NodeId::new(1), [0.0f32, 0.0].as_slice()),
310            (NodeId::new(2), [1.0f32, 0.0].as_slice()),
311        ];
312
313        let query = [0.0f32, 0.0];
314        let results = brute_force_knn(vectors.into_iter(), &query, 10, DistanceMetric::Euclidean);
315
316        // Should return all 2 vectors, not 10
317        assert_eq!(results.len(), 2);
318    }
319
320    #[test]
321    fn test_brute_force_knn_filtered() {
322        let vectors = vec![
323            (NodeId::new(1), [0.0f32, 0.0].as_slice()),
324            (NodeId::new(2), [1.0f32, 0.0].as_slice()),
325            (NodeId::new(3), [2.0f32, 0.0].as_slice()),
326        ];
327
328        let query = [0.0f32, 0.0];
329
330        // Only consider even IDs
331        let results = brute_force_knn_filtered(
332            vectors.into_iter(),
333            &query,
334            10,
335            DistanceMetric::Euclidean,
336            |id| id.as_u64() % 2 == 0,
337        );
338
339        assert_eq!(results.len(), 1);
340        assert_eq!(results[0].0, NodeId::new(2));
341    }
342
343    #[test]
344    fn test_batch_distances() {
345        let vectors = vec![
346            (NodeId::new(1), [0.0f32, 0.0].as_slice()),
347            (NodeId::new(2), [3.0f32, 4.0].as_slice()),
348        ];
349
350        let query = [0.0f32, 0.0];
351        let results = batch_distances(vectors.into_iter(), &query, DistanceMetric::Euclidean);
352
353        assert_eq!(results.len(), 2);
354        assert_eq!(results[0].0, NodeId::new(1));
355        assert!((results[0].1 - 0.0).abs() < 0.001);
356        assert_eq!(results[1].0, NodeId::new(2));
357        assert!((results[1].1 - 5.0).abs() < 0.001); // 3-4-5 triangle
358    }
359}