grafeo_core/index/vector/mod.rs
1//! Vector similarity search support.
2//!
3//! This module provides infrastructure for storing and searching vector embeddings,
4//! enabling AI/ML use cases like RAG, semantic search, and recommendations.
5//!
6//! # Distance Metrics
7//!
8//! Choose the metric based on your embedding type:
9//!
10//! | Metric | Best For | Range |
11//! |--------|----------|-------|
12//! | [`Cosine`](DistanceMetric::Cosine) | Normalized embeddings (text) | [0, 2] |
13//! | [`Euclidean`](DistanceMetric::Euclidean) | Raw embeddings | [0, inf) |
14//! | [`DotProduct`](DistanceMetric::DotProduct) | Max inner product search | (-inf, inf) |
15//! | [`Manhattan`](DistanceMetric::Manhattan) | Outlier-resistant | [0, inf) |
16//!
17//! # Index Types
18//!
19//! | Index | Complexity | Use Case |
20//! |-------|------------|----------|
21//! | [`brute_force_knn`] | O(n) | Small datasets, exact results |
22//! | [`HnswIndex`] | O(log n) | Large datasets, approximate results |
23//!
24//! # Example
25//!
26//! ```
27//! use grafeo_core::index::vector::{compute_distance, DistanceMetric, brute_force_knn};
28//! use grafeo_common::types::NodeId;
29//!
30//! // Compute distance between two vectors
31//! let query = [0.1f32, 0.2, 0.3];
32//! let doc1 = [0.1f32, 0.2, 0.35];
33//! let doc2 = [0.5f32, 0.6, 0.7];
34//!
35//! let dist1 = compute_distance(&query, &doc1, DistanceMetric::Cosine);
36//! let dist2 = compute_distance(&query, &doc2, DistanceMetric::Cosine);
37//!
38//! // doc1 is more similar (smaller distance)
39//! assert!(dist1 < dist2);
40//!
41//! // Brute-force k-NN search
42//! let vectors = vec![
43//! (NodeId::new(1), doc1.as_slice()),
44//! (NodeId::new(2), doc2.as_slice()),
45//! ];
46//!
47//! let results = brute_force_knn(vectors.into_iter(), &query, 1, DistanceMetric::Cosine);
48//! assert_eq!(results[0].0, NodeId::new(1)); // doc1 is closest
49//! ```
50//!
51//! # HNSW Index (requires `vector-index` feature)
52//!
53//! For larger datasets, use the HNSW approximate nearest neighbor index:
54//!
55//! ```ignore
56//! use grafeo_core::index::vector::{HnswIndex, HnswConfig, DistanceMetric};
57//! use grafeo_common::types::NodeId;
58//!
59//! let config = HnswConfig::new(384, DistanceMetric::Cosine);
60//! let index = HnswIndex::new(config);
61//!
62//! // Insert vectors
63//! index.insert(NodeId::new(1), &embedding);
64//!
65//! // Search (O(log n))
66//! let results = index.search(&query, 10);
67//! ```
68
69mod distance;
70pub mod quantization;
71mod simd;
72pub mod storage;
73pub mod zone_map;
74
75#[cfg(feature = "vector-index")]
76mod config;
77#[cfg(feature = "vector-index")]
78mod hnsw;
79#[cfg(feature = "vector-index")]
80mod quantized_hnsw;
81
82pub use distance::{
83 DistanceMetric, compute_distance, cosine_distance, cosine_similarity, dot_product,
84 euclidean_distance, euclidean_distance_squared, l2_norm, manhattan_distance, normalize,
85 simd_support,
86};
87pub use quantization::{BinaryQuantizer, ProductQuantizer, QuantizationType, ScalarQuantizer};
88pub use storage::{MmapStorage, RamStorage, StorageBackend, VectorStorage};
89pub use zone_map::VectorZoneMap;
90
91#[cfg(feature = "vector-index")]
92pub use config::HnswConfig;
93#[cfg(feature = "vector-index")]
94pub use hnsw::HnswIndex;
95#[cfg(feature = "vector-index")]
96pub use quantized_hnsw::QuantizedHnswIndex;
97
98use grafeo_common::types::NodeId;
99
100/// Configuration for vector search operations.
101#[derive(Debug, Clone)]
102pub struct VectorConfig {
103 /// Expected vector dimensions (for validation).
104 pub dimensions: usize,
105 /// Distance metric for similarity computation.
106 pub metric: DistanceMetric,
107}
108
109impl VectorConfig {
110 /// Creates a new vector configuration.
111 #[must_use]
112 pub const fn new(dimensions: usize, metric: DistanceMetric) -> Self {
113 Self { dimensions, metric }
114 }
115
116 /// Creates a configuration for cosine similarity with the given dimensions.
117 #[must_use]
118 pub const fn cosine(dimensions: usize) -> Self {
119 Self::new(dimensions, DistanceMetric::Cosine)
120 }
121
122 /// Creates a configuration for Euclidean distance with the given dimensions.
123 #[must_use]
124 pub const fn euclidean(dimensions: usize) -> Self {
125 Self::new(dimensions, DistanceMetric::Euclidean)
126 }
127}
128
129impl Default for VectorConfig {
130 fn default() -> Self {
131 Self {
132 dimensions: 384, // Common embedding size (MiniLM, etc.)
133 metric: DistanceMetric::default(),
134 }
135 }
136}
137
138/// Performs brute-force k-nearest neighbor search.
139///
140/// This is O(n) where n is the number of vectors. Use this for:
141/// - Small datasets (< 10K vectors)
142/// - Baseline comparisons
143/// - Exact nearest neighbor search
144///
145/// For larger datasets, use an approximate index like HNSW.
146///
147/// # Arguments
148///
149/// * `vectors` - Iterator of (id, vector) pairs to search
150/// * `query` - The query vector
151/// * `k` - Number of nearest neighbors to return
152/// * `metric` - Distance metric to use
153///
154/// # Returns
155///
156/// Vector of (id, distance) pairs sorted by distance (ascending).
157///
158/// # Example
159///
160/// ```
161/// use grafeo_core::index::vector::{brute_force_knn, DistanceMetric};
162/// use grafeo_common::types::NodeId;
163///
164/// let vectors = vec![
165/// (NodeId::new(1), [0.1f32, 0.2, 0.3].as_slice()),
166/// (NodeId::new(2), [0.4f32, 0.5, 0.6].as_slice()),
167/// (NodeId::new(3), [0.7f32, 0.8, 0.9].as_slice()),
168/// ];
169///
170/// let query = [0.15f32, 0.25, 0.35];
171/// let results = brute_force_knn(vectors.into_iter(), &query, 2, DistanceMetric::Euclidean);
172///
173/// assert_eq!(results.len(), 2);
174/// assert_eq!(results[0].0, NodeId::new(1)); // Closest
175/// ```
176pub fn brute_force_knn<'a, I>(
177 vectors: I,
178 query: &[f32],
179 k: usize,
180 metric: DistanceMetric,
181) -> Vec<(NodeId, f32)>
182where
183 I: Iterator<Item = (NodeId, &'a [f32])>,
184{
185 let mut results: Vec<(NodeId, f32)> = vectors
186 .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
187 .collect();
188
189 // Sort by distance (ascending)
190 results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
191
192 // Truncate to k
193 results.truncate(k);
194 results
195}
196
197/// Performs brute-force k-nearest neighbor search with a filter predicate.
198///
199/// Only considers vectors where the predicate returns true.
200///
201/// # Arguments
202///
203/// * `vectors` - Iterator of (id, vector) pairs to search
204/// * `query` - The query vector
205/// * `k` - Number of nearest neighbors to return
206/// * `metric` - Distance metric to use
207/// * `predicate` - Filter function; only vectors where this returns true are considered
208///
209/// # Returns
210///
211/// Vector of (id, distance) pairs sorted by distance (ascending).
212pub fn brute_force_knn_filtered<'a, I, F>(
213 vectors: I,
214 query: &[f32],
215 k: usize,
216 metric: DistanceMetric,
217 predicate: F,
218) -> Vec<(NodeId, f32)>
219where
220 I: Iterator<Item = (NodeId, &'a [f32])>,
221 F: Fn(NodeId) -> bool,
222{
223 let mut results: Vec<(NodeId, f32)> = vectors
224 .filter(|(id, _)| predicate(*id))
225 .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
226 .collect();
227
228 results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
229 results.truncate(k);
230 results
231}
232
233/// Computes the distance between a query and multiple vectors in batch.
234///
235/// More efficient than computing distances one by one for large batches.
236///
237/// # Returns
238///
239/// Vector of (id, distance) pairs in the same order as input.
240pub fn batch_distances<'a, I>(
241 vectors: I,
242 query: &[f32],
243 metric: DistanceMetric,
244) -> Vec<(NodeId, f32)>
245where
246 I: Iterator<Item = (NodeId, &'a [f32])>,
247{
248 vectors
249 .map(|(id, vec)| (id, compute_distance(query, vec, metric)))
250 .collect()
251}
252
253#[cfg(test)]
254mod tests {
255 use super::*;
256
257 #[test]
258 fn test_vector_config_default() {
259 let config = VectorConfig::default();
260 assert_eq!(config.dimensions, 384);
261 assert_eq!(config.metric, DistanceMetric::Cosine);
262 }
263
264 #[test]
265 fn test_vector_config_constructors() {
266 let cosine = VectorConfig::cosine(768);
267 assert_eq!(cosine.dimensions, 768);
268 assert_eq!(cosine.metric, DistanceMetric::Cosine);
269
270 let euclidean = VectorConfig::euclidean(1536);
271 assert_eq!(euclidean.dimensions, 1536);
272 assert_eq!(euclidean.metric, DistanceMetric::Euclidean);
273 }
274
275 #[test]
276 fn test_brute_force_knn() {
277 let vectors = vec![
278 (NodeId::new(1), [0.0f32, 0.0, 0.0].as_slice()),
279 (NodeId::new(2), [1.0f32, 0.0, 0.0].as_slice()),
280 (NodeId::new(3), [2.0f32, 0.0, 0.0].as_slice()),
281 (NodeId::new(4), [3.0f32, 0.0, 0.0].as_slice()),
282 ];
283
284 let query = [0.5f32, 0.0, 0.0];
285 let results = brute_force_knn(vectors.into_iter(), &query, 2, DistanceMetric::Euclidean);
286
287 assert_eq!(results.len(), 2);
288 // Closest should be node 1 (dist 0.5) or node 2 (dist 0.5)
289 assert!(results[0].0 == NodeId::new(1) || results[0].0 == NodeId::new(2));
290 }
291
292 #[test]
293 fn test_brute_force_knn_empty() {
294 let vectors: Vec<(NodeId, &[f32])> = vec![];
295 let query = [0.0f32, 0.0];
296 let results = brute_force_knn(vectors.into_iter(), &query, 10, DistanceMetric::Cosine);
297 assert!(results.is_empty());
298 }
299
300 #[test]
301 fn test_brute_force_knn_k_larger_than_n() {
302 let vectors = vec![
303 (NodeId::new(1), [0.0f32, 0.0].as_slice()),
304 (NodeId::new(2), [1.0f32, 0.0].as_slice()),
305 ];
306
307 let query = [0.0f32, 0.0];
308 let results = brute_force_knn(vectors.into_iter(), &query, 10, DistanceMetric::Euclidean);
309
310 // Should return all 2 vectors, not 10
311 assert_eq!(results.len(), 2);
312 }
313
314 #[test]
315 fn test_brute_force_knn_filtered() {
316 let vectors = vec![
317 (NodeId::new(1), [0.0f32, 0.0].as_slice()),
318 (NodeId::new(2), [1.0f32, 0.0].as_slice()),
319 (NodeId::new(3), [2.0f32, 0.0].as_slice()),
320 ];
321
322 let query = [0.0f32, 0.0];
323
324 // Only consider even IDs
325 let results = brute_force_knn_filtered(
326 vectors.into_iter(),
327 &query,
328 10,
329 DistanceMetric::Euclidean,
330 |id| id.as_u64() % 2 == 0,
331 );
332
333 assert_eq!(results.len(), 1);
334 assert_eq!(results[0].0, NodeId::new(2));
335 }
336
337 #[test]
338 fn test_batch_distances() {
339 let vectors = vec![
340 (NodeId::new(1), [0.0f32, 0.0].as_slice()),
341 (NodeId::new(2), [3.0f32, 4.0].as_slice()),
342 ];
343
344 let query = [0.0f32, 0.0];
345 let results = batch_distances(vectors.into_iter(), &query, DistanceMetric::Euclidean);
346
347 assert_eq!(results.len(), 2);
348 assert_eq!(results[0].0, NodeId::new(1));
349 assert!((results[0].1 - 0.0).abs() < 0.001);
350 assert_eq!(results[1].0, NodeId::new(2));
351 assert!((results[1].1 - 5.0).abs() < 0.001); // 3-4-5 triangle
352 }
353}