manifoldb_vector/index/
traits.rs

1//! Traits for vector indexes.
2
3use manifoldb_core::EntityId;
4
5use crate::error::VectorError;
6use crate::types::Embedding;
7
8/// Result of a similarity search.
9#[derive(Debug, Clone)]
10pub struct SearchResult {
11    /// The entity ID of the matching embedding.
12    pub entity_id: EntityId,
13    /// The distance to the query vector (lower is more similar for most metrics).
14    pub distance: f32,
15}
16
17impl SearchResult {
18    /// Create a new search result.
19    #[must_use]
20    pub const fn new(entity_id: EntityId, distance: f32) -> Self {
21        Self { entity_id, distance }
22    }
23}
24
25/// Configuration for filtered search operations.
26#[derive(Debug, Clone)]
27pub struct FilteredSearchConfig {
28    /// The minimum ef_search to use when filtering.
29    /// When filters are selective, ef_search may need to be increased
30    /// to maintain recall.
31    pub min_ef_search: usize,
32
33    /// The maximum ef_search to use when filtering.
34    pub max_ef_search: usize,
35
36    /// The ef_search multiplier to apply when filtering.
37    /// When a filter is applied, ef_search is multiplied by this factor
38    /// to account for nodes that may be filtered out during traversal.
39    pub ef_multiplier: f32,
40}
41
42impl Default for FilteredSearchConfig {
43    fn default() -> Self {
44        Self { min_ef_search: 40, max_ef_search: 500, ef_multiplier: 2.0 }
45    }
46}
47
48impl FilteredSearchConfig {
49    /// Create a new filtered search configuration.
50    #[must_use]
51    pub const fn new() -> Self {
52        Self { min_ef_search: 40, max_ef_search: 500, ef_multiplier: 2.0 }
53    }
54
55    /// Set the minimum ef_search.
56    #[must_use]
57    pub const fn with_min_ef_search(mut self, min_ef: usize) -> Self {
58        self.min_ef_search = min_ef;
59        self
60    }
61
62    /// Set the maximum ef_search.
63    #[must_use]
64    pub const fn with_max_ef_search(mut self, max_ef: usize) -> Self {
65        self.max_ef_search = max_ef;
66        self
67    }
68
69    /// Set the ef_search multiplier.
70    #[must_use]
71    pub const fn with_ef_multiplier(mut self, multiplier: f32) -> Self {
72        self.ef_multiplier = multiplier;
73        self
74    }
75
76    /// Calculate the adjusted ef_search based on filter selectivity.
77    ///
78    /// # Arguments
79    ///
80    /// * `base_ef` - The base ef_search value
81    /// * `selectivity` - Optional estimated filter selectivity (0.0 to 1.0).
82    ///   1.0 means all nodes pass, 0.0 means no nodes pass.
83    ///   If None, the default multiplier is applied.
84    #[must_use]
85    #[allow(clippy::cast_possible_truncation)]
86    #[allow(clippy::cast_sign_loss)]
87    pub fn adjusted_ef(&self, base_ef: usize, selectivity: Option<f32>) -> usize {
88        let multiplier = match selectivity {
89            Some(s) if s > 0.0 && s < 1.0 => {
90                // More aggressive expansion for more selective filters
91                // For selectivity 0.1 (10% pass), expand by 10x
92                // For selectivity 0.5 (50% pass), expand by 2x
93                (1.0 / s).min(self.ef_multiplier * 5.0)
94            }
95            Some(_) => 1.0, // No multiplier needed if all/none pass
96            None => self.ef_multiplier,
97        };
98
99        let adjusted = ((base_ef as f32) * multiplier) as usize;
100        adjusted.clamp(self.min_ef_search, self.max_ef_search)
101    }
102}
103
104/// Trait for vector similarity search indexes.
105///
106/// This trait defines the interface for approximate nearest neighbor (ANN)
107/// indexes that support insert, delete, and k-NN search operations.
108pub trait VectorIndex {
109    /// Insert an embedding into the index.
110    ///
111    /// If an embedding already exists for the given entity, it will be updated.
112    ///
113    /// # Errors
114    ///
115    /// Returns an error if the embedding dimension doesn't match the index dimension,
116    /// or if there's a storage error.
117    fn insert(&mut self, entity_id: EntityId, embedding: &Embedding) -> Result<(), VectorError>;
118
119    /// Insert multiple embeddings into the index in a batch operation.
120    ///
121    /// This is significantly more efficient than calling `insert` multiple times
122    /// as it minimizes transaction overhead and optimizes HNSW graph construction.
123    /// All embeddings are inserted within a single operation.
124    ///
125    /// If an embedding already exists for any entity, it will be updated.
126    ///
127    /// # Performance
128    ///
129    /// For bulk loading, this can provide up to 10x throughput improvement over
130    /// individual `insert` calls.
131    ///
132    /// # Arguments
133    ///
134    /// * `embeddings` - Slice of (EntityId, Embedding reference) pairs to insert
135    ///
136    /// # Errors
137    ///
138    /// Returns an error if any embedding dimension doesn't match the index dimension,
139    /// or if there's a storage error.
140    fn insert_batch(&mut self, embeddings: &[(EntityId, &Embedding)]) -> Result<(), VectorError> {
141        // Default implementation: insert one at a time
142        // Implementations can override for better performance
143        for (entity_id, embedding) in embeddings {
144            self.insert(*entity_id, embedding)?;
145        }
146        Ok(())
147    }
148
149    /// Remove an embedding from the index.
150    ///
151    /// # Returns
152    ///
153    /// Returns `true` if the embedding was removed, `false` if it didn't exist.
154    ///
155    /// # Errors
156    ///
157    /// Returns an error if there's a storage error.
158    fn delete(&mut self, entity_id: EntityId) -> Result<bool, VectorError>;
159
160    /// Search for the k nearest neighbors of a query vector.
161    ///
162    /// # Arguments
163    ///
164    /// * `query` - The query embedding
165    /// * `k` - The number of nearest neighbors to return
166    /// * `ef_search` - Optional beam width for search (uses default if None)
167    ///
168    /// # Returns
169    ///
170    /// A vector of search results, sorted by distance (closest first).
171    ///
172    /// # Errors
173    ///
174    /// Returns an error if the query dimension doesn't match the index dimension,
175    /// or if there's a storage error.
176    fn search(
177        &self,
178        query: &Embedding,
179        k: usize,
180        ef_search: Option<usize>,
181    ) -> Result<Vec<SearchResult>, VectorError>;
182
183    /// Search for the k nearest neighbors with a filter predicate.
184    ///
185    /// This performs filtered search where the predicate is applied during
186    /// graph traversal, not as a post-filter. This is more efficient when
187    /// the filter is selective, as it avoids exploring many non-matching paths.
188    ///
189    /// # Arguments
190    ///
191    /// * `query` - The query embedding
192    /// * `k` - The number of nearest neighbors to return
193    /// * `predicate` - A function that returns true for entity IDs that should be included
194    /// * `ef_search` - Optional beam width for search (uses adjusted default if None)
195    /// * `config` - Optional configuration for filtered search (uses defaults if None)
196    ///
197    /// # Returns
198    ///
199    /// A vector of search results that pass the predicate, sorted by distance (closest first).
200    ///
201    /// # Errors
202    ///
203    /// Returns an error if the query dimension doesn't match the index dimension,
204    /// or if there's a storage error.
205    fn search_with_filter<F>(
206        &self,
207        query: &Embedding,
208        k: usize,
209        predicate: F,
210        ef_search: Option<usize>,
211        config: Option<FilteredSearchConfig>,
212    ) -> Result<Vec<SearchResult>, VectorError>
213    where
214        F: Fn(EntityId) -> bool;
215
216    /// Check if an embedding exists in the index.
217    fn contains(&self, entity_id: EntityId) -> Result<bool, VectorError>;
218
219    /// Get the number of embeddings in the index.
220    fn len(&self) -> Result<usize, VectorError>;
221
222    /// Check if the index is empty.
223    fn is_empty(&self) -> Result<bool, VectorError> {
224        self.len().map(|n| n == 0)
225    }
226
227    /// Get the dimension of embeddings in this index.
228    ///
229    /// # Errors
230    ///
231    /// Returns an error if there's a concurrency error (lock poisoning).
232    fn dimension(&self) -> Result<usize, VectorError>;
233}