Skip to main content

grafeo_core/index/vector/
mod.rs

1//! Vector similarity search support.
2//!
3//! This module provides infrastructure for storing and searching vector embeddings,
4//! enabling AI/ML use cases like RAG, semantic search, and recommendations.
5//!
6//! # Distance Metrics
7//!
8//! Choose the metric based on your embedding type:
9//!
10//! | Metric | Best For | Range |
11//! |--------|----------|-------|
12//! | [`Cosine`](DistanceMetric::Cosine) | Normalized embeddings (text) | [0, 2] |
13//! | [`Euclidean`](DistanceMetric::Euclidean) | Raw embeddings | [0, inf) |
14//! | [`DotProduct`](DistanceMetric::DotProduct) | Max inner product search | (-inf, inf) |
15//! | [`Manhattan`](DistanceMetric::Manhattan) | Outlier-resistant | [0, inf) |
16//!
17//! # Index Types
18//!
19//! | Index | Complexity | Use Case |
20//! |-------|------------|----------|
21//! | [`brute_force_knn`] | O(n) | Small datasets, exact results |
22//! | [`HnswIndex`] | O(log n) | Large datasets, approximate results |
23//!
24//! # Example
25//!
26//! ```
27//! use grafeo_core::index::vector::{compute_distance, DistanceMetric, brute_force_knn};
28//! use grafeo_common::types::NodeId;
29//!
30//! // Compute distance between two vectors
31//! let query = [0.1f32, 0.2, 0.3];
32//! let doc1 = [0.1f32, 0.2, 0.35];
33//! let doc2 = [0.5f32, 0.6, 0.7];
34//!
35//! let dist1 = compute_distance(&query, &doc1, DistanceMetric::Cosine);
36//! let dist2 = compute_distance(&query, &doc2, DistanceMetric::Cosine);
37//!
38//! // doc1 is more similar (smaller distance)
39//! assert!(dist1 < dist2);
40//!
41//! // Brute-force k-NN search
42//! let vectors = vec![
43//!     (NodeId::new(1), doc1.as_slice()),
44//!     (NodeId::new(2), doc2.as_slice()),
45//! ];
46//!
47//! let results = brute_force_knn(vectors.into_iter(), &query, 1, DistanceMetric::Cosine);
48//! assert_eq!(results[0].0, NodeId::new(1)); // doc1 is closest
49//! ```
50//!
51//! # HNSW Index (requires `vector-index` feature)
52//!
53//! For larger datasets, use the HNSW approximate nearest neighbor index:
54//!
55//! ```ignore
56//! use grafeo_core::index::vector::{HnswIndex, HnswConfig, DistanceMetric};
57//! use grafeo_common::types::NodeId;
58//!
59//! let config = HnswConfig::new(384, DistanceMetric::Cosine);
60//! let index = HnswIndex::new(config);
61//!
62//! // Insert vectors
63//! index.insert(NodeId::new(1), &embedding);
64//!
65//! // Search (O(log n))
66//! let results = index.search(&query, 10);
67//! ```
68
69mod distance;
70pub mod quantization;
71mod simd;
72pub mod storage;
73pub mod zone_map;
74
75#[cfg(feature = "vector-index")]
76mod config;
77#[cfg(feature = "vector-index")]
78mod hnsw;
79#[cfg(feature = "vector-index")]
80mod quantized_hnsw;
81
82pub use distance::{
83    DistanceMetric, compute_distance, cosine_distance, cosine_similarity, dot_product,
84    euclidean_distance, euclidean_distance_squared, l2_norm, manhattan_distance, normalize,
85    simd_support,
86};
87pub use quantization::{BinaryQuantizer, ProductQuantizer, QuantizationType, ScalarQuantizer};
88#[cfg(feature = "mmap")]
89pub use storage::MmapStorage;
90pub use storage::{RamStorage, StorageBackend, VectorStorage};
91pub use zone_map::VectorZoneMap;
92
93#[cfg(feature = "vector-index")]
94pub use config::HnswConfig;
95#[cfg(feature = "vector-index")]
96pub use hnsw::HnswIndex;
97#[cfg(feature = "vector-index")]
98pub use quantized_hnsw::QuantizedHnswIndex;
99
100use grafeo_common::types::NodeId;
101
102/// Configuration for vector search operations.
103#[derive(Debug, Clone)]
104pub struct VectorConfig {
105    /// Expected vector dimensions (for validation).
106    pub dimensions: usize,
107    /// Distance metric for similarity computation.
108    pub metric: DistanceMetric,
109}
110
111impl VectorConfig {
112    /// Creates a new vector configuration.
113    #[must_use]
114    pub const fn new(dimensions: usize, metric: DistanceMetric) -> Self {
115        Self { dimensions, metric }
116    }
117
118    /// Creates a configuration for cosine similarity with the given dimensions.
119    #[must_use]
120    pub const fn cosine(dimensions: usize) -> Self {
121        Self::new(dimensions, DistanceMetric::Cosine)
122    }
123
124    /// Creates a configuration for Euclidean distance with the given dimensions.
125    #[must_use]
126    pub const fn euclidean(dimensions: usize) -> Self {
127        Self::new(dimensions, DistanceMetric::Euclidean)
128    }
129}
130
131impl Default for VectorConfig {
132    fn default() -> Self {
133        Self {
134            dimensions: 384, // Common embedding size (MiniLM, etc.)
135            metric: DistanceMetric::default(),
136        }
137    }
138}
139
140/// Performs brute-force k-nearest neighbor search.
141///
142/// This is O(n) where n is the number of vectors. Use this for:
143/// - Small datasets (< 10K vectors)
144/// - Baseline comparisons
145/// - Exact nearest neighbor search
146///
147/// For larger datasets, use an approximate index like HNSW.
148///
149/// # Arguments
150///
151/// * `vectors` - Iterator of (id, vector) pairs to search
152/// * `query` - The query vector
153/// * `k` - Number of nearest neighbors to return
154/// * `metric` - Distance metric to use
155///
156/// # Returns
157///
158/// Vector of (id, distance) pairs sorted by distance (ascending).
159///
160/// # Example
161///
162/// ```
163/// use grafeo_core::index::vector::{brute_force_knn, DistanceMetric};
164/// use grafeo_common::types::NodeId;
165///
166/// let vectors = vec![
167///     (NodeId::new(1), [0.1f32, 0.2, 0.3].as_slice()),
168///     (NodeId::new(2), [0.4f32, 0.5, 0.6].as_slice()),
169///     (NodeId::new(3), [0.7f32, 0.8, 0.9].as_slice()),
170/// ];
171///
172/// let query = [0.15f32, 0.25, 0.35];
173/// let results = brute_force_knn(vectors.into_iter(), &query, 2, DistanceMetric::Euclidean);
174///
175/// assert_eq!(results.len(), 2);
176/// assert_eq!(results[0].0, NodeId::new(1)); // Closest
177/// ```
178pub fn brute_force_knn<'a, I>(
179    vectors: I,
180    query: &[f32],
181    k: usize,
182    metric: DistanceMetric,
183) -> Vec<(NodeId, f32)>
184where
185    I: Iterator<Item = (NodeId, &'a [f32])>,
186{
187    let mut results: Vec<(NodeId, f32)> = vectors
188        .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
189        .collect();
190
191    // Sort by distance (ascending)
192    results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
193
194    // Truncate to k
195    results.truncate(k);
196    results
197}
198
199/// Performs brute-force k-nearest neighbor search with a filter predicate.
200///
201/// Only considers vectors where the predicate returns true.
202///
203/// # Arguments
204///
205/// * `vectors` - Iterator of (id, vector) pairs to search
206/// * `query` - The query vector
207/// * `k` - Number of nearest neighbors to return
208/// * `metric` - Distance metric to use
209/// * `predicate` - Filter function; only vectors where this returns true are considered
210///
211/// # Returns
212///
213/// Vector of (id, distance) pairs sorted by distance (ascending).
214pub fn brute_force_knn_filtered<'a, I, F>(
215    vectors: I,
216    query: &[f32],
217    k: usize,
218    metric: DistanceMetric,
219    predicate: F,
220) -> Vec<(NodeId, f32)>
221where
222    I: Iterator<Item = (NodeId, &'a [f32])>,
223    F: Fn(NodeId) -> bool,
224{
225    let mut results: Vec<(NodeId, f32)> = vectors
226        .filter(|(id, _)| predicate(*id))
227        .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
228        .collect();
229
230    results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
231    results.truncate(k);
232    results
233}
234
235/// Computes the distance between a query and multiple vectors in batch.
236///
237/// More efficient than computing distances one by one for large batches.
238///
239/// # Returns
240///
241/// Vector of (id, distance) pairs in the same order as input.
242pub fn batch_distances<'a, I>(
243    vectors: I,
244    query: &[f32],
245    metric: DistanceMetric,
246) -> Vec<(NodeId, f32)>
247where
248    I: Iterator<Item = (NodeId, &'a [f32])>,
249{
250    vectors
251        .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
252        .collect()
253}
254
255#[cfg(test)]
256mod tests {
257    use super::*;
258
259    #[test]
260    fn test_vector_config_default() {
261        let config = VectorConfig::default();
262        assert_eq!(config.dimensions, 384);
263        assert_eq!(config.metric, DistanceMetric::Cosine);
264    }
265
266    #[test]
267    fn test_vector_config_constructors() {
268        let cosine = VectorConfig::cosine(768);
269        assert_eq!(cosine.dimensions, 768);
270        assert_eq!(cosine.metric, DistanceMetric::Cosine);
271
272        let euclidean = VectorConfig::euclidean(1536);
273        assert_eq!(euclidean.dimensions, 1536);
274        assert_eq!(euclidean.metric, DistanceMetric::Euclidean);
275    }
276
277    #[test]
278    fn test_brute_force_knn() {
279        let vectors = vec![
280            (NodeId::new(1), [0.0f32, 0.0, 0.0].as_slice()),
281            (NodeId::new(2), [1.0f32, 0.0, 0.0].as_slice()),
282            (NodeId::new(3), [2.0f32, 0.0, 0.0].as_slice()),
283            (NodeId::new(4), [3.0f32, 0.0, 0.0].as_slice()),
284        ];
285
286        let query = [0.5f32, 0.0, 0.0];
287        let results = brute_force_knn(vectors.into_iter(), &query, 2, DistanceMetric::Euclidean);
288
289        assert_eq!(results.len(), 2);
290        // Closest should be node 1 (dist 0.5) or node 2 (dist 0.5)
291        assert!(results[0].0 == NodeId::new(1) || results[0].0 == NodeId::new(2));
292    }
293
294    #[test]
295    fn test_brute_force_knn_empty() {
296        let vectors: Vec<(NodeId, &[f32])> = vec![];
297        let query = [0.0f32, 0.0];
298        let results = brute_force_knn(vectors.into_iter(), &query, 10, DistanceMetric::Cosine);
299        assert!(results.is_empty());
300    }
301
302    #[test]
303    fn test_brute_force_knn_k_larger_than_n() {
304        let vectors = vec![
305            (NodeId::new(1), [0.0f32, 0.0].as_slice()),
306            (NodeId::new(2), [1.0f32, 0.0].as_slice()),
307        ];
308
309        let query = [0.0f32, 0.0];
310        let results = brute_force_knn(vectors.into_iter(), &query, 10, DistanceMetric::Euclidean);
311
312        // Should return all 2 vectors, not 10
313        assert_eq!(results.len(), 2);
314    }
315
316    #[test]
317    fn test_brute_force_knn_filtered() {
318        let vectors = vec![
319            (NodeId::new(1), [0.0f32, 0.0].as_slice()),
320            (NodeId::new(2), [1.0f32, 0.0].as_slice()),
321            (NodeId::new(3), [2.0f32, 0.0].as_slice()),
322        ];
323
324        let query = [0.0f32, 0.0];
325
326        // Only consider even IDs
327        let results = brute_force_knn_filtered(
328            vectors.into_iter(),
329            &query,
330            10,
331            DistanceMetric::Euclidean,
332            |id| id.as_u64() % 2 == 0,
333        );
334
335        assert_eq!(results.len(), 1);
336        assert_eq!(results[0].0, NodeId::new(2));
337    }
338
339    #[test]
340    fn test_batch_distances() {
341        let vectors = vec![
342            (NodeId::new(1), [0.0f32, 0.0].as_slice()),
343            (NodeId::new(2), [3.0f32, 4.0].as_slice()),
344        ];
345
346        let query = [0.0f32, 0.0];
347        let results = batch_distances(vectors.into_iter(), &query, DistanceMetric::Euclidean);
348
349        assert_eq!(results.len(), 2);
350        assert_eq!(results[0].0, NodeId::new(1));
351        assert!((results[0].1 - 0.0).abs() < 0.001);
352        assert_eq!(results[1].0, NodeId::new(2));
353        assert!((results[1].1 - 5.0).abs() < 0.001); // 3-4-5 triangle
354    }
355}