manifoldb_vector/index/traits.rs
1//! Traits for vector indexes.
2
3use manifoldb_core::EntityId;
4
5use crate::error::VectorError;
6use crate::types::Embedding;
7
8/// Result of a similarity search.
9#[derive(Debug, Clone)]
10pub struct SearchResult {
11 /// The entity ID of the matching embedding.
12 pub entity_id: EntityId,
13 /// The distance to the query vector (lower is more similar for most metrics).
14 pub distance: f32,
15}
16
17impl SearchResult {
18 /// Create a new search result.
19 #[must_use]
20 pub const fn new(entity_id: EntityId, distance: f32) -> Self {
21 Self { entity_id, distance }
22 }
23}
24
25/// Configuration for filtered search operations.
26#[derive(Debug, Clone)]
27pub struct FilteredSearchConfig {
28 /// The minimum ef_search to use when filtering.
29 /// When filters are selective, ef_search may need to be increased
30 /// to maintain recall.
31 pub min_ef_search: usize,
32
33 /// The maximum ef_search to use when filtering.
34 pub max_ef_search: usize,
35
36 /// The ef_search multiplier to apply when filtering.
37 /// When a filter is applied, ef_search is multiplied by this factor
38 /// to account for nodes that may be filtered out during traversal.
39 pub ef_multiplier: f32,
40}
41
42impl Default for FilteredSearchConfig {
43 fn default() -> Self {
44 Self { min_ef_search: 40, max_ef_search: 500, ef_multiplier: 2.0 }
45 }
46}
47
48impl FilteredSearchConfig {
49 /// Create a new filtered search configuration.
50 #[must_use]
51 pub const fn new() -> Self {
52 Self { min_ef_search: 40, max_ef_search: 500, ef_multiplier: 2.0 }
53 }
54
55 /// Set the minimum ef_search.
56 #[must_use]
57 pub const fn with_min_ef_search(mut self, min_ef: usize) -> Self {
58 self.min_ef_search = min_ef;
59 self
60 }
61
62 /// Set the maximum ef_search.
63 #[must_use]
64 pub const fn with_max_ef_search(mut self, max_ef: usize) -> Self {
65 self.max_ef_search = max_ef;
66 self
67 }
68
69 /// Set the ef_search multiplier.
70 #[must_use]
71 pub const fn with_ef_multiplier(mut self, multiplier: f32) -> Self {
72 self.ef_multiplier = multiplier;
73 self
74 }
75
76 /// Calculate the adjusted ef_search based on filter selectivity.
77 ///
78 /// # Arguments
79 ///
80 /// * `base_ef` - The base ef_search value
81 /// * `selectivity` - Optional estimated filter selectivity (0.0 to 1.0).
82 /// 1.0 means all nodes pass, 0.0 means no nodes pass.
83 /// If None, the default multiplier is applied.
84 #[must_use]
85 #[allow(clippy::cast_possible_truncation)]
86 #[allow(clippy::cast_sign_loss)]
87 pub fn adjusted_ef(&self, base_ef: usize, selectivity: Option<f32>) -> usize {
88 let multiplier = match selectivity {
89 Some(s) if s > 0.0 && s < 1.0 => {
90 // More aggressive expansion for more selective filters
91 // For selectivity 0.1 (10% pass), expand by 10x
92 // For selectivity 0.5 (50% pass), expand by 2x
93 (1.0 / s).min(self.ef_multiplier * 5.0)
94 }
95 Some(_) => 1.0, // No multiplier needed if all/none pass
96 None => self.ef_multiplier,
97 };
98
99 let adjusted = ((base_ef as f32) * multiplier) as usize;
100 adjusted.clamp(self.min_ef_search, self.max_ef_search)
101 }
102}
103
104/// Trait for vector similarity search indexes.
105///
106/// This trait defines the interface for approximate nearest neighbor (ANN)
107/// indexes that support insert, delete, and k-NN search operations.
108pub trait VectorIndex {
109 /// Insert an embedding into the index.
110 ///
111 /// If an embedding already exists for the given entity, it will be updated.
112 ///
113 /// # Errors
114 ///
115 /// Returns an error if the embedding dimension doesn't match the index dimension,
116 /// or if there's a storage error.
117 fn insert(&mut self, entity_id: EntityId, embedding: &Embedding) -> Result<(), VectorError>;
118
119 /// Insert multiple embeddings into the index in a batch operation.
120 ///
121 /// This is significantly more efficient than calling `insert` multiple times
122 /// as it minimizes transaction overhead and optimizes HNSW graph construction.
123 /// All embeddings are inserted within a single operation.
124 ///
125 /// If an embedding already exists for any entity, it will be updated.
126 ///
127 /// # Performance
128 ///
129 /// For bulk loading, this can provide up to 10x throughput improvement over
130 /// individual `insert` calls.
131 ///
132 /// # Arguments
133 ///
134 /// * `embeddings` - Slice of (EntityId, Embedding reference) pairs to insert
135 ///
136 /// # Errors
137 ///
138 /// Returns an error if any embedding dimension doesn't match the index dimension,
139 /// or if there's a storage error.
140 fn insert_batch(&mut self, embeddings: &[(EntityId, &Embedding)]) -> Result<(), VectorError> {
141 // Default implementation: insert one at a time
142 // Implementations can override for better performance
143 for (entity_id, embedding) in embeddings {
144 self.insert(*entity_id, embedding)?;
145 }
146 Ok(())
147 }
148
149 /// Remove an embedding from the index.
150 ///
151 /// # Returns
152 ///
153 /// Returns `true` if the embedding was removed, `false` if it didn't exist.
154 ///
155 /// # Errors
156 ///
157 /// Returns an error if there's a storage error.
158 fn delete(&mut self, entity_id: EntityId) -> Result<bool, VectorError>;
159
160 /// Search for the k nearest neighbors of a query vector.
161 ///
162 /// # Arguments
163 ///
164 /// * `query` - The query embedding
165 /// * `k` - The number of nearest neighbors to return
166 /// * `ef_search` - Optional beam width for search (uses default if None)
167 ///
168 /// # Returns
169 ///
170 /// A vector of search results, sorted by distance (closest first).
171 ///
172 /// # Errors
173 ///
174 /// Returns an error if the query dimension doesn't match the index dimension,
175 /// or if there's a storage error.
176 fn search(
177 &self,
178 query: &Embedding,
179 k: usize,
180 ef_search: Option<usize>,
181 ) -> Result<Vec<SearchResult>, VectorError>;
182
183 /// Search for the k nearest neighbors with a filter predicate.
184 ///
185 /// This performs filtered search where the predicate is applied during
186 /// graph traversal, not as a post-filter. This is more efficient when
187 /// the filter is selective, as it avoids exploring many non-matching paths.
188 ///
189 /// # Arguments
190 ///
191 /// * `query` - The query embedding
192 /// * `k` - The number of nearest neighbors to return
193 /// * `predicate` - A function that returns true for entity IDs that should be included
194 /// * `ef_search` - Optional beam width for search (uses adjusted default if None)
195 /// * `config` - Optional configuration for filtered search (uses defaults if None)
196 ///
197 /// # Returns
198 ///
199 /// A vector of search results that pass the predicate, sorted by distance (closest first).
200 ///
201 /// # Errors
202 ///
203 /// Returns an error if the query dimension doesn't match the index dimension,
204 /// or if there's a storage error.
205 fn search_with_filter<F>(
206 &self,
207 query: &Embedding,
208 k: usize,
209 predicate: F,
210 ef_search: Option<usize>,
211 config: Option<FilteredSearchConfig>,
212 ) -> Result<Vec<SearchResult>, VectorError>
213 where
214 F: Fn(EntityId) -> bool;
215
216 /// Check if an embedding exists in the index.
217 fn contains(&self, entity_id: EntityId) -> Result<bool, VectorError>;
218
219 /// Get the number of embeddings in the index.
220 fn len(&self) -> Result<usize, VectorError>;
221
222 /// Check if the index is empty.
223 fn is_empty(&self) -> Result<bool, VectorError> {
224 self.len().map(|n| n == 0)
225 }
226
227 /// Get the dimension of embeddings in this index.
228 ///
229 /// # Errors
230 ///
231 /// Returns an error if there's a concurrency error (lock poisoning).
232 fn dimension(&self) -> Result<usize, VectorError>;
233}