Skip to main content

ruvector_core/
vector_db.rs

1//! Main VectorDB interface
2
3use crate::error::Result;
4use crate::index::flat::FlatIndex;
5
6#[cfg(feature = "hnsw")]
7use crate::index::hnsw::HnswIndex;
8
9use crate::index::VectorIndex;
10use crate::types::*;
11use parking_lot::RwLock;
12use std::sync::Arc;
13
14// Import appropriate storage backend based on features
15#[cfg(feature = "storage")]
16use crate::storage::VectorStorage;
17
18#[cfg(not(feature = "storage"))]
19use crate::storage_memory::MemoryStorage as VectorStorage;
20
/// Main vector database
///
/// Pairs a vector storage backend with a search index built over the stored
/// vectors, plus the options the instance was opened with.
pub struct VectorDB {
    /// Storage backend holding the vector entries. The concrete type is chosen
    /// at compile time: `crate::storage::VectorStorage` with the `storage`
    /// feature, `crate::storage_memory::MemoryStorage` without it.
    storage: Arc<VectorStorage>,
    /// Search index, guarded by a read/write lock. Which implementation sits
    /// behind the trait object is decided in `VectorDB::new`.
    index: Arc<RwLock<Box<dyn VectorIndex>>>,
    /// Options in effect for this instance (exposed through `options()`).
    options: DbOptions,
}
27
28impl VectorDB {
29    /// Create a new vector database with the given options
30    ///
31    /// If a storage path is provided and contains persisted vectors,
32    /// the HNSW index will be automatically rebuilt from storage.
33    /// If opening an existing database, the stored configuration (dimensions,
34    /// distance metric, etc.) will be used instead of the provided options.
35    #[allow(unused_mut)] // `options` is mutated only when feature = "storage"
36    pub fn new(mut options: DbOptions) -> Result<Self> {
37        #[cfg(feature = "storage")]
38        let storage = {
39            // First, try to load existing configuration from the database
40            // We create a temporary storage to check for config
41            let temp_storage = VectorStorage::new(&options.storage_path, options.dimensions)?;
42
43            let stored_config = temp_storage.load_config()?;
44
45            if let Some(config) = stored_config {
46                // Existing database - use stored configuration
47                tracing::info!(
48                    "Loading existing database with {} dimensions",
49                    config.dimensions
50                );
51                options = DbOptions {
52                    // Keep the provided storage path (may have changed)
53                    storage_path: options.storage_path.clone(),
54                    // Use stored configuration for everything else
55                    dimensions: config.dimensions,
56                    distance_metric: config.distance_metric,
57                    hnsw_config: config.hnsw_config,
58                    quantization: config.quantization,
59                };
60                // Recreate storage with correct dimensions
61                Arc::new(VectorStorage::new(
62                    &options.storage_path,
63                    options.dimensions,
64                )?)
65            } else {
66                // New database - save the configuration
67                tracing::info!(
68                    "Creating new database with {} dimensions",
69                    options.dimensions
70                );
71                temp_storage.save_config(&options)?;
72                Arc::new(temp_storage)
73            }
74        };
75
76        #[cfg(not(feature = "storage"))]
77        let storage = Arc::new(VectorStorage::new(options.dimensions)?);
78
79        // Choose index based on configuration and available features
80        #[allow(unused_mut)] // `index` is mutated only when feature = "storage"
81        let mut index: Box<dyn VectorIndex> = if let Some(hnsw_config) = &options.hnsw_config {
82            #[cfg(feature = "hnsw")]
83            {
84                Box::new(HnswIndex::new(
85                    options.dimensions,
86                    options.distance_metric,
87                    hnsw_config.clone(),
88                )?)
89            }
90            #[cfg(not(feature = "hnsw"))]
91            {
92                // Fall back to flat index if HNSW is not available
93                tracing::warn!("HNSW requested but not available (WASM build), using flat index");
94                Box::new(FlatIndex::new(options.dimensions, options.distance_metric))
95            }
96        } else {
97            Box::new(FlatIndex::new(options.dimensions, options.distance_metric))
98        };
99
100        // `DbOptions.quantization` is persisted/restored but not yet applied to
101        // the index or storage representation (issue #563). Warn loudly rather
102        // than silently ignoring a requested quantization so callers don't
103        // assume a memory reduction that isn't happening.
104        if !matches!(
105            options.quantization,
106            None | Some(crate::types::QuantizationConfig::None)
107        ) {
108            tracing::warn!(
109                "DbOptions.quantization = {:?} is set but not yet applied — the \
110                 index is stored unquantized (no compression / memory reduction). \
111                 See issue #563.",
112                options.quantization
113            );
114        }
115
116        // Rebuild index from persisted vectors if storage is not empty
117        // This fixes the bug where search() returns empty results after restart
118        #[cfg(feature = "storage")]
119        {
120            let stored_ids = storage.all_ids()?;
121            if !stored_ids.is_empty() {
122                tracing::info!(
123                    "Rebuilding index from {} persisted vectors",
124                    stored_ids.len()
125                );
126
127                // Batch load all vectors for efficient index rebuilding
128                let mut entries = Vec::with_capacity(stored_ids.len());
129                for id in stored_ids {
130                    if let Some(entry) = storage.get(&id)? {
131                        entries.push((id, entry.vector));
132                    }
133                }
134
135                // Add all vectors to index in batch for better performance
136                index.add_batch(entries)?;
137
138                tracing::info!("Index rebuilt successfully");
139            }
140        }
141
142        Ok(Self {
143            storage,
144            index: Arc::new(RwLock::new(index)),
145            options,
146        })
147    }
148
149    /// Create with default options
150    pub fn with_dimensions(dimensions: usize) -> Result<Self> {
151        let options = DbOptions {
152            dimensions,
153            ..DbOptions::default()
154        };
155        Self::new(options)
156    }
157
158    /// Insert a vector entry
159    pub fn insert(&self, entry: VectorEntry) -> Result<VectorId> {
160        let id = self.storage.insert(&entry)?;
161
162        // Add to index
163        let mut index = self.index.write();
164        index.add(id.clone(), entry.vector)?;
165
166        Ok(id)
167    }
168
169    /// Insert multiple vectors in a batch
170    pub fn insert_batch(&self, entries: impl AsRef<[VectorEntry]>) -> Result<Vec<VectorId>> {
171        let entries = entries.as_ref();
172        let ids = self.storage.insert_batch(entries)?;
173
174        // Add to index
175        let mut index = self.index.write();
176        let index_entries: Vec<_> = ids
177            .iter()
178            .zip(entries.iter())
179            .map(|(id, entry)| (id.clone(), entry.vector.clone()))
180            .collect();
181
182        index.add_batch(index_entries)?;
183
184        Ok(ids)
185    }
186
187    /// Search for similar vectors
188    pub fn search(&self, query: SearchQuery) -> Result<Vec<SearchResult>> {
189        let index = self.index.read();
190        let mut results = index.search(&query.vector, query.k)?;
191
192        // Enrich results with full data if needed
193        for result in &mut results {
194            if let Ok(Some(entry)) = self.storage.get(&result.id) {
195                result.vector = Some(entry.vector);
196                result.metadata = entry.metadata;
197            }
198        }
199
200        // Apply metadata filters if specified
201        if let Some(filter) = &query.filter {
202            results.retain(|r| {
203                if let Some(metadata) = &r.metadata {
204                    filter
205                        .iter()
206                        .all(|(key, value)| metadata.get(key).is_some_and(|v| v == value))
207                } else {
208                    false
209                }
210            });
211        }
212
213        Ok(results)
214    }
215
216    /// Delete a vector by ID
217    pub fn delete(&self, id: &str) -> Result<bool> {
218        let deleted_storage = self.storage.delete(id)?;
219
220        if deleted_storage {
221            let mut index = self.index.write();
222            let _ = index.remove(&id.to_string())?;
223        }
224
225        Ok(deleted_storage)
226    }
227
    /// Get a vector by ID
    ///
    /// Delegates directly to the storage backend; the index is not consulted.
    /// The `Option` and any error are whatever the backend returns.
    pub fn get(&self, id: &str) -> Result<Option<VectorEntry>> {
        self.storage.get(id)
    }
232
    /// Get the number of vectors
    ///
    /// This is the count reported by the storage backend, not by the index.
    pub fn len(&self) -> Result<usize> {
        self.storage.len()
    }
237
    /// Check if database is empty
    ///
    /// Delegates to the storage backend's own `is_empty`.
    pub fn is_empty(&self) -> Result<bool> {
        self.storage.is_empty()
    }
242
    /// Get database options
    ///
    /// These are the options held by this instance after `new` finished, which
    /// may differ from the ones passed in when an existing database's stored
    /// configuration was loaded.
    pub fn options(&self) -> &DbOptions {
        &self.options
    }
247
    /// Get all vector IDs (for iteration/serialization)
    ///
    /// Returns the ID list reported by the storage backend.
    // NOTE(review): ordering of the returned IDs depends on the backend's
    // `all_ids` — not visible from here; confirm before relying on it.
    pub fn keys(&self) -> Result<Vec<String>> {
        self.storage.all_ids()
    }
252}
253
254#[cfg(test)]
255mod tests {
256    use super::*;
257    use std::path::Path;
258    use tempfile::tempdir;
259
260    #[test]
261    fn test_vector_db_creation() -> Result<()> {
262        let dir = tempdir().unwrap();
263        let mut options = DbOptions::default();
264        options.storage_path = dir.path().join("test.db").to_string_lossy().to_string();
265        options.dimensions = 3;
266
267        let db = VectorDB::new(options)?;
268        assert!(db.is_empty()?);
269
270        Ok(())
271    }
272
273    #[test]
274    fn test_insert_and_search() -> Result<()> {
275        let dir = tempdir().unwrap();
276        let mut options = DbOptions::default();
277        options.storage_path = dir.path().join("test.db").to_string_lossy().to_string();
278        options.dimensions = 3;
279        options.distance_metric = DistanceMetric::Euclidean; // Use Euclidean for clearer test
280        options.hnsw_config = None; // Use flat index for testing
281
282        let db = VectorDB::new(options)?;
283
284        // Insert vectors
285        db.insert(VectorEntry {
286            id: Some("v1".to_string()),
287            vector: vec![1.0, 0.0, 0.0],
288            metadata: None,
289        })?;
290
291        db.insert(VectorEntry {
292            id: Some("v2".to_string()),
293            vector: vec![0.0, 1.0, 0.0],
294            metadata: None,
295        })?;
296
297        db.insert(VectorEntry {
298            id: Some("v3".to_string()),
299            vector: vec![0.0, 0.0, 1.0],
300            metadata: None,
301        })?;
302
303        // Search for exact match
304        let results = db.search(SearchQuery {
305            vector: vec![1.0, 0.0, 0.0],
306            k: 2,
307            filter: None,
308            ef_search: None,
309        })?;
310
311        assert!(results.len() >= 1);
312        assert_eq!(results[0].id, "v1", "First result should be exact match");
313        assert!(
314            results[0].score < 0.01,
315            "Exact match should have ~0 distance"
316        );
317
318        Ok(())
319    }
320
321    /// Test that search works after simulated restart (new VectorDB instance)
322    /// This verifies the fix for issue #30: HNSW index not rebuilt from storage
323    #[test]
324    #[cfg(feature = "storage")]
325    fn test_search_after_restart() -> Result<()> {
326        let dir = tempdir().unwrap();
327        let db_path = dir.path().join("persist.db").to_string_lossy().to_string();
328
329        // Phase 1: Create database and insert vectors
330        {
331            let mut options = DbOptions::default();
332            options.storage_path = db_path.clone();
333            options.dimensions = 3;
334            options.distance_metric = DistanceMetric::Euclidean;
335            options.hnsw_config = None;
336
337            let db = VectorDB::new(options)?;
338
339            db.insert(VectorEntry {
340                id: Some("v1".to_string()),
341                vector: vec![1.0, 0.0, 0.0],
342                metadata: None,
343            })?;
344
345            db.insert(VectorEntry {
346                id: Some("v2".to_string()),
347                vector: vec![0.0, 1.0, 0.0],
348                metadata: None,
349            })?;
350
351            db.insert(VectorEntry {
352                id: Some("v3".to_string()),
353                vector: vec![0.7, 0.7, 0.0],
354                metadata: None,
355            })?;
356
357            // Verify search works before "restart"
358            let results = db.search(SearchQuery {
359                vector: vec![0.8, 0.6, 0.0],
360                k: 3,
361                filter: None,
362                ef_search: None,
363            })?;
364            assert_eq!(results.len(), 3, "Should find all 3 vectors before restart");
365        }
366        // db is dropped here, simulating application shutdown
367
368        // Phase 2: Create new database instance (simulates restart)
369        {
370            let mut options = DbOptions::default();
371            options.storage_path = db_path.clone();
372            options.dimensions = 3;
373            options.distance_metric = DistanceMetric::Euclidean;
374            options.hnsw_config = None;
375
376            let db = VectorDB::new(options)?;
377
378            // Verify vectors are still accessible
379            assert_eq!(db.len()?, 3, "Should have 3 vectors after restart");
380
381            // Verify get() works
382            let v1 = db.get("v1")?;
383            assert!(v1.is_some(), "get() should work after restart");
384
385            // Verify search() works - THIS WAS THE BUG
386            let results = db.search(SearchQuery {
387                vector: vec![0.8, 0.6, 0.0],
388                k: 3,
389                filter: None,
390                ef_search: None,
391            })?;
392
393            assert_eq!(
394                results.len(),
395                3,
396                "search() should return results after restart (was returning 0 before fix)"
397            );
398
399            // v3 should be closest to query [0.8, 0.6, 0.0]
400            assert_eq!(
401                results[0].id, "v3",
402                "v3 [0.7, 0.7, 0.0] should be closest to query [0.8, 0.6, 0.0]"
403            );
404        }
405
406        Ok(())
407    }
408}