Skip to main content

oxirs_vec/
vector_store.rs

1//! Enhanced vector store with embedding management, advanced features, and persistence.
2
3use anyhow::Result;
4use std::collections::HashMap;
5
6use crate::embeddings;
7use crate::vector_index::{MemoryVectorIndex, VectorIndex};
8use crate::{BatchSearchResult, Vector, VectorId, VectorStoreTrait};
9
/// Configuration for a [`VectorStore`].
///
/// The whole struct is serialized under the `"config"` key of
/// `metadata.json` by `VectorStore::save_to_disk` and read back by
/// `VectorStore::load_from_disk`.
///
/// None of the fields are read by the store logic in this file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct VectorStoreConfig {
    /// Defaults to `true`.
    /// NOTE(review): presumably toggles automatic embedding generation — confirm against callers.
    pub auto_embed: bool,
    /// Defaults to `true`.
    /// NOTE(review): presumably toggles the embedding cache — confirm against callers.
    pub cache_embeddings: bool,
    /// Defaults to `0.7`.
    /// NOTE(review): presumably a default cut-off for threshold searches — confirm against callers.
    pub similarity_threshold: f32,
    /// Defaults to `100`.
    /// NOTE(review): presumably an upper bound on returned results — confirm against callers.
    pub max_results: usize,
}
18
19impl Default for VectorStoreConfig {
20    fn default() -> Self {
21        Self {
22            auto_embed: true,
23            cache_embeddings: true,
24            similarity_threshold: 0.7,
25            max_results: 100,
26        }
27    }
28}
29
/// Enhanced vector store with embedding management and advanced features.
///
/// Wraps a boxed [`VectorIndex`] and, optionally, an embedding manager used
/// to turn text / RDF content into vectors at indexing time.
pub struct VectorStore {
    /// Backing index; every insert, lookup and search is delegated to it.
    index: Box<dyn VectorIndex>,
    /// Present only when the store was built through one of the
    /// `*_embedding*` constructors; `None` otherwise.
    embedding_manager: Option<embeddings::EmbeddingManager>,
    /// Store configuration; persisted by `save_to_disk` and restored by
    /// `load_from_disk`.
    config: VectorStoreConfig,
}
36
37impl VectorStore {
38    /// Create a new vector store with default memory index
39    pub fn new() -> Self {
40        Self {
41            index: Box::new(MemoryVectorIndex::new()),
42            embedding_manager: None,
43            config: VectorStoreConfig::default(),
44        }
45    }
46
47    /// Create vector store with specific embedding strategy
48    pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
49        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
50
51        Ok(Self {
52            index: Box::new(MemoryVectorIndex::new()),
53            embedding_manager: Some(embedding_manager),
54            config: VectorStoreConfig::default(),
55        })
56    }
57
58    /// Create vector store with custom index
59    pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
60        Self {
61            index,
62            embedding_manager: None,
63            config: VectorStoreConfig::default(),
64        }
65    }
66
67    /// Create vector store with custom index and embedding strategy
68    pub fn with_index_and_embeddings(
69        index: Box<dyn VectorIndex>,
70        strategy: embeddings::EmbeddingStrategy,
71    ) -> Result<Self> {
72        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
73
74        Ok(Self {
75            index,
76            embedding_manager: Some(embedding_manager),
77            config: VectorStoreConfig::default(),
78        })
79    }
80
81    /// Set vector store configuration
82    pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
83        self.config = config;
84        self
85    }
86
87    /// Index a resource with automatic embedding generation
88    pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
89        if let Some(ref mut embedding_manager) = self.embedding_manager {
90            let embeddable_content = embeddings::EmbeddableContent::Text(content.to_string());
91            let vector = embedding_manager.get_embedding(&embeddable_content)?;
92            self.index.insert(uri, vector)
93        } else {
94            // Generate a simple hash-based vector as fallback
95            let vector = self.generate_fallback_vector(content);
96            self.index.insert(uri, vector)
97        }
98    }
99
100    /// Index an RDF resource with structured content
101    pub fn index_rdf_resource(
102        &mut self,
103        uri: String,
104        label: Option<String>,
105        description: Option<String>,
106        properties: std::collections::HashMap<String, Vec<String>>,
107    ) -> Result<()> {
108        if let Some(ref mut embedding_manager) = self.embedding_manager {
109            let embeddable_content = embeddings::EmbeddableContent::RdfResource {
110                uri: uri.clone(),
111                label,
112                description,
113                properties,
114            };
115            let vector = embedding_manager.get_embedding(&embeddable_content)?;
116            self.index.insert(uri, vector)
117        } else {
118            Err(anyhow::anyhow!(
119                "Embedding manager required for RDF resource indexing"
120            ))
121        }
122    }
123
124    /// Index a pre-computed vector
125    pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
126        self.index.insert(uri, vector)
127    }
128
129    /// Search for similar resources using text query
130    pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
131        let query_vector = if let Some(ref _embedding_manager) = self.embedding_manager {
132            let _embeddable_content = embeddings::EmbeddableContent::Text(query.to_string());
133            // We need a mutable reference, but we only have an immutable one
134            // For now, generate a fallback vector
135            self.generate_fallback_vector(query)
136        } else {
137            self.generate_fallback_vector(query)
138        };
139
140        self.index.search_knn(&query_vector, limit)
141    }
142
143    /// Search for similar resources using a vector query
144    pub fn similarity_search_vector(
145        &self,
146        query: &Vector,
147        limit: usize,
148    ) -> Result<Vec<(String, f32)>> {
149        self.index.search_knn(query, limit)
150    }
151
152    /// Find resources within similarity threshold
153    pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
154        let query_vector = self.generate_fallback_vector(query);
155        self.index.search_threshold(&query_vector, threshold)
156    }
157
158    /// Advanced search with multiple options
159    pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
160        let query_vector = match options.query {
161            SearchQuery::Text(text) => self.generate_fallback_vector(&text),
162            SearchQuery::Vector(vector) => vector,
163        };
164
165        let results = match options.search_type {
166            SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
167            SearchType::Threshold(threshold) => {
168                self.index.search_threshold(&query_vector, threshold)?
169            }
170        };
171
172        Ok(results)
173    }
174
175    fn generate_fallback_vector(&self, text: &str) -> Vector {
176        // Simple hash-based vector generation for fallback
177        use std::collections::hash_map::DefaultHasher;
178        use std::hash::{Hash, Hasher};
179
180        let mut hasher = DefaultHasher::new();
181        text.hash(&mut hasher);
182        let hash = hasher.finish();
183
184        let mut values = Vec::with_capacity(384); // Standard embedding size
185        let mut seed = hash;
186
187        for _ in 0..384 {
188            seed = seed.wrapping_mul(1103515245).wrapping_add(12345);
189            let normalized = (seed as f32) / (u64::MAX as f32);
190            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
191        }
192
193        Vector::new(values)
194    }
195
196    /// Get embedding manager statistics
197    pub fn embedding_stats(&self) -> Option<(usize, usize)> {
198        self.embedding_manager.as_ref().map(|em| em.cache_stats())
199    }
200
201    /// Build vocabulary for TF-IDF embeddings
202    pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
203        if let Some(ref mut embedding_manager) = self.embedding_manager {
204            embedding_manager.build_vocabulary(documents)
205        } else {
206            Ok(()) // No-op if no embedding manager
207        }
208    }
209
210    /// Calculate similarity between two resources by their URIs
211    pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
212        // If the URIs are identical, return perfect similarity
213        if uri1 == uri2 {
214            return Ok(1.0);
215        }
216
217        // Get the vectors for both URIs
218        let vector1 = self
219            .index
220            .get_vector(uri1)
221            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri1))?;
222
223        let vector2 = self
224            .index
225            .get_vector(uri2)
226            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri2))?;
227
228        // Calculate cosine similarity between the vectors
229        vector1.cosine_similarity(vector2)
230    }
231
232    /// Get a vector by its ID (delegates to VectorIndex)
233    pub fn get_vector(&self, id: &str) -> Option<&Vector> {
234        self.index.get_vector(id)
235    }
236
237    /// Iterate all (id, vector) pairs stored in the underlying index.
238    ///
239    /// Only index types that override [`VectorIndex::iter_vectors`]
240    /// (e.g. `MemoryVectorIndex`) return a non-empty list; other
241    /// implementations return an empty `Vec` by default.
242    pub fn iter_vectors(&self) -> Vec<(String, Vector)> {
243        self.index.iter_vectors()
244    }
245
246    /// Index a vector with metadata (stub)
247    pub fn index_vector_with_metadata(
248        &mut self,
249        uri: String,
250        vector: Vector,
251        _metadata: HashMap<String, String>,
252    ) -> Result<()> {
253        // For now, just delegate to index_vector, ignoring metadata
254        // Future: Extend VectorIndex trait to support metadata
255        self.index_vector(uri, vector)
256    }
257
258    /// Index a resource with metadata (stub)
259    pub fn index_resource_with_metadata(
260        &mut self,
261        uri: String,
262        content: &str,
263        _metadata: HashMap<String, String>,
264    ) -> Result<()> {
265        // For now, just delegate to index_resource, ignoring metadata
266        // Future: Store and utilize metadata
267        self.index_resource(uri, content)
268    }
269
270    /// Search with additional parameters (stub)
271    pub fn similarity_search_with_params(
272        &self,
273        query: &str,
274        limit: usize,
275        _params: HashMap<String, String>,
276    ) -> Result<Vec<(String, f32)>> {
277        // For now, just delegate to similarity_search, ignoring params
278        // Future: Use params for filtering, threshold, etc.
279        self.similarity_search(query, limit)
280    }
281
282    /// Vector search with additional parameters (stub)
283    pub fn vector_search_with_params(
284        &self,
285        query: &Vector,
286        limit: usize,
287        _params: HashMap<String, String>,
288    ) -> Result<Vec<(String, f32)>> {
289        // For now, just delegate to similarity_search_vector, ignoring params
290        // Future: Use params for filtering, distance metric selection, etc.
291        self.similarity_search_vector(query, limit)
292    }
293
294    /// Get all vector IDs (stub)
295    pub fn get_vector_ids(&self) -> Result<Vec<String>> {
296        // VectorIndex trait doesn't provide this method yet
297        // Future: Add to VectorIndex trait or track separately
298        Ok(Vec::new())
299    }
300
301    /// Remove a vector by its URI (stub)
302    pub fn remove_vector(&mut self, uri: &str) -> Result<()> {
303        // Delegate to VectorIndex trait's remove_vector method
304        self.index.remove_vector(uri.to_string())
305    }
306
307    /// Get store statistics (stub)
308    pub fn get_statistics(&self) -> Result<HashMap<String, String>> {
309        // Return basic statistics as a map
310        // Future: Provide comprehensive stats from index
311        let mut stats = HashMap::new();
312        stats.insert("type".to_string(), "VectorStore".to_string());
313
314        if let Some((cache_size, cache_capacity)) = self.embedding_stats() {
315            stats.insert("embedding_cache_size".to_string(), cache_size.to_string());
316            stats.insert(
317                "embedding_cache_capacity".to_string(),
318                cache_capacity.to_string(),
319            );
320        }
321
322        Ok(stats)
323    }
324
325    /// Save store to disk.
326    ///
327    /// Creates `{path}/metadata.json` (config + vector count) and
328    /// `{path}/vectors.json` (all `(id, Vector)` pairs).  The embedding
329    /// manager (in-memory cache only) is **not** persisted; call
330    /// `with_embedding_strategy` again after loading if needed.
331    ///
332    /// Only vectors held by index types that override
333    /// [`VectorIndex::iter_vectors`] (e.g. `MemoryVectorIndex`) are saved;
334    /// other index implementations return an empty list by default.
335    pub fn save_to_disk(&self, path: &str) -> Result<()> {
336        use anyhow::Context as _;
337
338        std::fs::create_dir_all(path)
339            .with_context(|| format!("Failed to create directory: {}", path))?;
340
341        // --- metadata ---
342        let vectors = self.index.iter_vectors();
343        let metadata = serde_json::json!({
344            "config": self.config,
345            "vector_count": vectors.len(),
346            "index_type": "memory",
347        });
348        let metadata_path = std::path::Path::new(path).join("metadata.json");
349        let metadata_str = serde_json::to_string_pretty(&metadata)
350            .with_context(|| "Failed to serialize VectorStore metadata")?;
351        std::fs::write(&metadata_path, metadata_str)
352            .with_context(|| format!("Failed to write {}", metadata_path.display()))?;
353
354        // --- vectors ---
355        let vectors_path = std::path::Path::new(path).join("vectors.json");
356        let vectors_str = serde_json::to_string_pretty(&vectors)
357            .with_context(|| "Failed to serialize VectorStore vectors")?;
358        std::fs::write(&vectors_path, vectors_str)
359            .with_context(|| format!("Failed to write {}", vectors_path.display()))?;
360
361        Ok(())
362    }
363
364    /// Load a store that was previously saved with [`save_to_disk`].
365    ///
366    /// Reconstructs a `VectorStore` backed by a fresh `MemoryVectorIndex` and
367    /// re-inserts all vectors from the saved snapshot.  The embedding manager
368    /// is not restored; create a new one with `with_embedding_strategy` if
369    /// needed.
370    pub fn load_from_disk(path: &str) -> Result<Self> {
371        use anyhow::Context as _;
372
373        // --- read metadata ---
374        let metadata_path = std::path::Path::new(path).join("metadata.json");
375        let metadata_str = std::fs::read_to_string(&metadata_path)
376            .with_context(|| format!("Failed to read {}", metadata_path.display()))?;
377        let metadata: serde_json::Value = serde_json::from_str(&metadata_str)
378            .with_context(|| "Failed to parse VectorStore metadata")?;
379
380        let config: VectorStoreConfig = serde_json::from_value(metadata["config"].clone())
381            .with_context(|| "Failed to deserialize VectorStoreConfig from metadata")?;
382
383        // --- read vectors ---
384        let vectors_path = std::path::Path::new(path).join("vectors.json");
385        let vectors_str = std::fs::read_to_string(&vectors_path)
386            .with_context(|| format!("Failed to read {}", vectors_path.display()))?;
387        let entries: Vec<(String, Vector)> = serde_json::from_str(&vectors_str)
388            .with_context(|| "Failed to deserialize VectorStore vectors")?;
389
390        // --- reconstruct ---
391        let mut store = Self {
392            index: Box::new(MemoryVectorIndex::new()),
393            embedding_manager: None,
394            config,
395        };
396
397        for (id, vector) in entries {
398            store
399                .index
400                .insert(id.clone(), vector)
401                .with_context(|| format!("Failed to re-insert vector '{}'", id))?;
402        }
403
404        Ok(store)
405    }
406
    /// Optimize the underlying index (stub).
    ///
    /// Currently a no-op: the index is not touched and `Ok(())` is returned
    /// unconditionally.
    pub fn optimize_index(&mut self) -> Result<()> {
        // Stub implementation - optimization not yet implemented
        // Future: Trigger index compaction, rebalancing, etc.
        Ok(())
    }
413}
414
415impl Default for VectorStore {
416    fn default() -> Self {
417        Self::new()
418    }
419}
420
421impl VectorStoreTrait for VectorStore {
422    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
423        self.index.insert(id, vector)
424    }
425
426    fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
427        // Generate a unique ID for the vector
428        let id = format!("vec_{}", uuid::Uuid::new_v4());
429        self.index.insert(id.clone(), vector)?;
430        Ok(id)
431    }
432
433    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
434        Ok(self.index.get_vector(id).cloned())
435    }
436
437    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
438        // For now, return empty vec as VectorIndex doesn't provide this method
439        // This could be enhanced if the underlying index supports it
440        Ok(Vec::new())
441    }
442
443    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
444        self.index.search_knn(query, k)
445    }
446
447    fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
448        // VectorIndex trait doesn't have remove, so we'll return false for now
449        // This could be enhanced in the future if needed
450        let _ = id;
451        Ok(false)
452    }
453
454    fn len(&self) -> usize {
455        // VectorIndex trait doesn't have len, so we'll return 0 for now
456        // This could be enhanced in the future if needed
457        0
458    }
459}
460
/// Search query types accepted by `VectorStore::advanced_search`.
#[derive(Debug, Clone)]
pub enum SearchQuery {
    /// Free text; converted to the store's hash-based fallback vector before
    /// searching.
    Text(String),
    /// A ready-made query vector, used as-is.
    Vector(Vector),
}
467
/// Search operation types accepted by `VectorStore::advanced_search`.
#[derive(Debug, Clone)]
pub enum SearchType {
    /// k-nearest-neighbour search; the payload is passed as the result limit
    /// to the index's `search_knn`.
    KNN(usize),
    /// Threshold search; the payload is passed as the threshold to the
    /// index's `search_threshold`.
    Threshold(f32),
}
474
/// Advanced search options consumed by `VectorStore::advanced_search`.
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// What to search for (text or a pre-computed vector).
    pub query: SearchQuery,
    /// How to search (k-nearest-neighbour or threshold).
    pub search_type: SearchType,
}
481
/// Vector operation results with enhanced metadata.
///
/// Not constructed anywhere in this file; the field notes below are
/// inferred from names and types only.
#[derive(Debug, Clone)]
pub struct VectorOperationResult {
    /// NOTE(review): presumably the URI / id of the matched resource — confirm.
    pub uri: String,
    /// NOTE(review): presumably the similarity score of the match — confirm.
    pub similarity: f32,
    /// Optional vector payload.
    pub vector: Option<Vector>,
    /// Optional string-to-string metadata map.
    pub metadata: Option<std::collections::HashMap<String, String>>,
    /// NOTE(review): presumably the position in the result list; whether it
    /// is 0- or 1-based is not visible here — confirm.
    pub rank: usize,
}
491
492/// Document batch processing utilities
493pub struct DocumentBatchProcessor;
494
495impl DocumentBatchProcessor {
496    /// Process multiple documents in batch for efficient indexing
497    pub fn batch_index(
498        store: &mut VectorStore,
499        documents: &[(String, String)], // (uri, content) pairs
500    ) -> Result<Vec<Result<()>>> {
501        let mut results = Vec::new();
502
503        for (uri, content) in documents {
504            let result = store.index_resource(uri.clone(), content);
505            results.push(result);
506        }
507
508        Ok(results)
509    }
510
511    /// Process multiple queries in batch
512    pub fn batch_search(
513        store: &VectorStore,
514        queries: &[String],
515        limit: usize,
516    ) -> Result<BatchSearchResult> {
517        let mut results = Vec::new();
518
519        for query in queries {
520            let result = store.similarity_search(query, limit);
521            results.push(result);
522        }
523
524        Ok(results)
525    }
526}