Skip to main content

mockforge_data/rag/
storage.rs

1//! Document storage and vector indexing
2//!
3//! This module provides storage backends for documents and their vector embeddings,
4//! supporting various indexing strategies and similarity search algorithms.
5
6use crate::rag::engine::DocumentChunk;
7use crate::Result;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11use tokio::sync::RwLock;
12
/// Shared, lock-guarded list of `(chunk ID, embedding)` pairs used as the
/// search index by [`InMemoryStorage`].
type VectorStore = Arc<RwLock<Vec<(String, Vec<f32>)>>>;
14
/// Vector index for similarity search
///
/// This is a descriptor only: it records the index's identity, shape and
/// bookkeeping timestamps, and does not hold any vectors itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorIndex {
    /// Index ID
    pub id: String,
    /// Index name
    pub name: String,
    /// Index type
    pub index_type: IndexType,
    /// Vector dimensions (number of components per vector)
    pub dimensions: usize,
    /// Number of vectors indexed
    pub vector_count: usize,
    /// Index metadata (free-form string key/value pairs)
    pub metadata: HashMap<String, String>,
    /// Creation timestamp
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Last updated timestamp
    pub updated_at: chrono::DateTime<chrono::Utc>,
}
35
36impl VectorIndex {
37    /// Create a new vector index
38    pub fn new(id: String, name: String, index_type: IndexType, dimensions: usize) -> Self {
39        let now = chrono::Utc::now();
40        Self {
41            id,
42            name,
43            index_type,
44            dimensions,
45            vector_count: 0,
46            metadata: HashMap::new(),
47            created_at: now,
48            updated_at: now,
49        }
50    }
51
52    /// Add metadata to index
53    pub fn add_metadata(&mut self, key: String, value: String) {
54        self.metadata.insert(key, value);
55        self.updated_at = chrono::Utc::now();
56    }
57
58    /// Get metadata value
59    pub fn get_metadata(&self, key: &str) -> Option<&String> {
60        self.metadata.get(key)
61    }
62
63    /// Remove metadata
64    pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
65        let result = self.metadata.remove(key);
66        if result.is_some() {
67            self.updated_at = chrono::Utc::now();
68        }
69        result
70    }
71
72    /// Update vector count
73    pub fn update_vector_count(&mut self, count: usize) {
74        self.vector_count = count;
75        self.updated_at = chrono::Utc::now();
76    }
77
78    /// Get index size estimate in bytes
79    pub fn estimated_size_bytes(&self) -> u64 {
80        // Rough estimate: each vector takes ~4 bytes per dimension + overhead
81        (self.vector_count * self.dimensions * 4 + 1024) as u64
82    }
83
84    /// Check if index is empty
85    pub fn is_empty(&self) -> bool {
86        self.vector_count == 0
87    }
88
89    /// Get index statistics
90    pub fn stats(&self) -> IndexStats {
91        IndexStats {
92            id: self.id.clone(),
93            name: self.name.clone(),
94            index_type: self.index_type.clone(),
95            dimensions: self.dimensions,
96            vector_count: self.vector_count,
97            estimated_size_bytes: self.estimated_size_bytes(),
98            metadata_count: self.metadata.len(),
99            created_at: self.created_at,
100            updated_at: self.updated_at,
101        }
102    }
103}
104
/// Index statistics
///
/// Point-in-time snapshot of a [`VectorIndex`], as produced by
/// `VectorIndex::stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexStats {
    /// Index ID
    pub id: String,
    /// Index name
    pub name: String,
    /// Index type
    pub index_type: IndexType,
    /// Vector dimensions
    pub dimensions: usize,
    /// Number of vectors
    pub vector_count: usize,
    /// Estimated size in bytes
    pub estimated_size_bytes: u64,
    /// Number of metadata entries
    pub metadata_count: usize,
    /// Creation timestamp
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Last updated timestamp
    pub updated_at: chrono::DateTime<chrono::Utc>,
}
127
/// Index type enumeration
///
/// Labels the indexing strategy of a [`VectorIndex`]. `Flat` is the default.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub enum IndexType {
    /// Flat index - brute force search
    #[default]
    Flat,
    /// IVF (Inverted File) index - for large datasets
    IVF,
    /// HNSW (Hierarchical Navigable Small World) index - for high performance
    HNSW,
    /// PQ (Product Quantization) index - for memory efficiency
    PQ,
    /// Custom index type, identified by a free-form name
    Custom(String),
}
143
/// Search parameters for vector search
///
/// Passed to `DocumentStorage::search_with_params`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchParams {
    /// Maximum number of results to return
    pub top_k: usize,
    /// Similarity threshold (0.0 to 1.0); lower-scoring chunks are excluded
    pub threshold: f32,
    /// Search method to use
    // NOTE(review): the in-memory backend in this file always scores with
    // cosine similarity and does not read this field.
    pub search_method: SearchMethod,
    /// Include metadata in results
    pub include_metadata: bool,
    /// Filter by document ID (only chunks of this document are returned)
    pub document_filter: Option<String>,
    /// Filter by metadata (every listed key must match the given value)
    pub metadata_filter: Option<HashMap<String, String>>,
}
160
161impl Default for SearchParams {
162    fn default() -> Self {
163        Self {
164            top_k: 10,
165            threshold: 0.7,
166            search_method: SearchMethod::Cosine,
167            include_metadata: true,
168            document_filter: None,
169            metadata_filter: None,
170        }
171    }
172}
173
/// Search method enumeration
///
/// Names the scoring function requested for a search. `Cosine` is the default.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub enum SearchMethod {
    /// Cosine similarity
    #[default]
    Cosine,
    /// Euclidean distance
    Euclidean,
    /// Dot product
    DotProduct,
    /// Manhattan distance
    Manhattan,
}
187
/// Storage backend trait for documents and vectors
///
/// All operations are async and take `&self`, so implementations are expected
/// to handle their own interior synchronization. Implementors must be
/// `Send + Sync` so a backend can be shared across tasks.
#[async_trait::async_trait]
pub trait DocumentStorage: Send + Sync {
    /// Store document chunks
    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()>;

    /// Search for similar chunks
    ///
    /// Returns at most `top_k` chunks ranked against `query_embedding`.
    async fn search_similar(
        &self,
        query_embedding: &[f32],
        top_k: usize,
    ) -> Result<Vec<DocumentChunk>>;

    /// Search with custom parameters
    ///
    /// Like [`search_similar`](Self::search_similar), but driven by a
    /// [`SearchParams`] value (result limit, threshold and filters).
    async fn search_with_params(
        &self,
        query_embedding: &[f32],
        params: SearchParams,
    ) -> Result<Vec<DocumentChunk>>;

    /// Get chunk by ID
    ///
    /// Returns `Ok(None)` when no chunk has the given ID.
    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>>;

    /// Delete chunk by ID
    ///
    /// The boolean reports whether a chunk was removed.
    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool>;

    /// Get chunks by document ID
    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>>;

    /// Delete all chunks for a document
    ///
    /// Returns the number of chunks removed.
    async fn delete_document(&self, document_id: &str) -> Result<usize>;

    /// Get storage statistics
    async fn get_stats(&self) -> Result<StorageStats>;

    /// List all document IDs
    async fn list_documents(&self) -> Result<Vec<String>>;

    /// Get total number of chunks
    async fn get_total_chunks(&self) -> Result<usize>;

    /// Clear all data
    async fn clear(&self) -> Result<()>;

    /// Optimize storage (rebuild indexes, compact data)
    async fn optimize(&self) -> Result<()>;

    /// Create backup
    async fn create_backup(&self, path: &str) -> Result<()>;

    /// Restore from backup
    async fn restore_backup(&self, path: &str) -> Result<()>;

    /// Check storage health
    async fn health_check(&self) -> Result<StorageHealth>;
}
244
/// Storage statistics
///
/// Returned by `DocumentStorage::get_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageStats {
    /// Total number of documents
    pub total_documents: usize,
    /// Total number of chunks
    pub total_chunks: usize,
    /// Index size in bytes
    pub index_size_bytes: u64,
    /// Last updated timestamp
    pub last_updated: chrono::DateTime<chrono::Utc>,
    /// Storage backend type (a free-form label such as `"memory"`)
    pub backend_type: String,
    /// Available disk space in bytes
    pub available_space_bytes: u64,
    /// Used space in bytes
    pub used_space_bytes: u64,
}
263
/// Storage health information
///
/// Returned by `DocumentStorage::health_check`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageHealth {
    /// Overall health status
    pub status: HealthStatus,
    /// Health check timestamp
    pub checked_at: chrono::DateTime<chrono::Utc>,
    /// Detailed health information (free-form string key/value pairs)
    pub details: HashMap<String, String>,
    /// Performance metrics, when the backend collects them
    pub metrics: Option<StorageMetrics>,
}
276
/// Health status enumeration
///
/// Overall verdict carried by [`StorageHealth`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatus {
    /// Storage is healthy
    Healthy,
    /// Storage has warnings
    Warning,
    /// Storage is unhealthy
    Unhealthy,
    /// Storage is unavailable
    Unavailable,
}
289
/// Storage performance metrics
///
/// Optional payload of [`StorageHealth`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageMetrics {
    /// Average search time in milliseconds
    pub average_search_time_ms: f64,
    /// Average insert time in milliseconds
    pub average_insert_time_ms: f64,
    /// Index fragmentation as a ratio (0.0 to 1.0)
    pub fragmentation_ratio: f32,
    /// Cache hit rate (0.0 to 1.0)
    pub cache_hit_rate: f32,
    /// Memory usage in bytes
    pub memory_usage_bytes: u64,
    /// Disk usage in bytes
    pub disk_usage_bytes: u64,
}
306
/// In-memory storage implementation for development and testing
///
/// Nothing is persisted: all state lives behind `Arc<RwLock<..>>` fields.
pub struct InMemoryStorage {
    /// Stored chunks, keyed by chunk ID.
    chunks: Arc<RwLock<HashMap<String, DocumentChunk>>>,
    /// `(chunk ID, embedding)` pairs scanned during similarity search.
    vectors: VectorStore,
    /// Running statistics reported by `get_stats`.
    stats: Arc<RwLock<StorageStats>>,
}
313
314impl InMemoryStorage {
315    /// Create a new in-memory storage
316    pub fn new() -> Self {
317        Self::new_with_backend_type("memory")
318    }
319
320    /// Create in-memory storage with a specific backend label.
321    /// This is used when a persistent backend is configured but running with an in-memory fallback.
322    pub fn new_with_backend_type(backend_type: &str) -> Self {
323        let now = chrono::Utc::now();
324        Self {
325            chunks: Arc::new(RwLock::new(HashMap::new())),
326            vectors: Arc::new(RwLock::new(Vec::new())),
327            stats: Arc::new(RwLock::new(StorageStats {
328                total_documents: 0,
329                total_chunks: 0,
330                index_size_bytes: 0,
331                last_updated: now,
332                backend_type: backend_type.to_string(),
333                available_space_bytes: u64::MAX,
334                used_space_bytes: 0,
335            })),
336        }
337    }
338
339    /// Calculate cosine similarity
340    fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
341        if a.len() != b.len() {
342            return 0.0;
343        }
344
345        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
346        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
347        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
348
349        if norm_a == 0.0 || norm_b == 0.0 {
350            0.0
351        } else {
352            dot_product / (norm_a * norm_b)
353        }
354    }
355}
356
357impl Default for InMemoryStorage {
358    fn default() -> Self {
359        Self::new()
360    }
361}
362
363#[async_trait::async_trait]
364impl DocumentStorage for InMemoryStorage {
365    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()> {
366        let mut chunks_map = self.chunks.write().await;
367        let mut vectors = self.vectors.write().await;
368        let mut stats = self.stats.write().await;
369
370        for chunk in chunks {
371            chunks_map.insert(chunk.id.clone(), chunk.clone());
372
373            // Store vector for similarity search
374            vectors.push((chunk.id.clone(), chunk.embedding.clone()));
375
376            stats.total_chunks += 1;
377        }
378
379        stats.last_updated = chrono::Utc::now();
380        stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64; // Rough estimate
381        stats.used_space_bytes = stats.index_size_bytes;
382
383        Ok(())
384    }
385
386    async fn search_similar(
387        &self,
388        query_embedding: &[f32],
389        top_k: usize,
390    ) -> Result<Vec<DocumentChunk>> {
391        let vectors = self.vectors.read().await;
392        let chunks = self.chunks.read().await;
393
394        let mut similarities: Vec<(String, f32)> = vectors
395            .iter()
396            .map(|(chunk_id, embedding)| {
397                let similarity = self.cosine_similarity(query_embedding, embedding);
398                (chunk_id.clone(), similarity)
399            })
400            .collect();
401
402        // Sort by similarity (descending)
403        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
404
405        // Take top-k results
406        let mut results = Vec::new();
407        for (chunk_id, _) in similarities.iter().take(top_k) {
408            if let Some(chunk) = chunks.get(chunk_id) {
409                results.push(chunk.clone());
410            }
411        }
412
413        Ok(results)
414    }
415
416    async fn search_with_params(
417        &self,
418        query_embedding: &[f32],
419        params: SearchParams,
420    ) -> Result<Vec<DocumentChunk>> {
421        let mut results = self.search_similar(query_embedding, params.top_k * 2).await?; // Get more candidates
422
423        // Apply filters
424        if let Some(document_filter) = &params.document_filter {
425            results.retain(|chunk| chunk.document_id == *document_filter);
426        }
427
428        if let Some(metadata_filter) = &params.metadata_filter {
429            results.retain(|chunk| {
430                metadata_filter.iter().all(|(key, value)| {
431                    chunk.get_metadata(key).map(|v| v == value).unwrap_or(false)
432                })
433            });
434        }
435
436        // Apply threshold filter
437        results.retain(|chunk| {
438            let similarity = self.cosine_similarity(query_embedding, &chunk.embedding);
439            similarity >= params.threshold
440        });
441
442        // Sort by similarity
443        results.sort_by(|a, b| {
444            let sim_a = self.cosine_similarity(query_embedding, &a.embedding);
445            let sim_b = self.cosine_similarity(query_embedding, &b.embedding);
446            sim_b.partial_cmp(&sim_a).unwrap_or(std::cmp::Ordering::Equal)
447        });
448
449        // Take top-k
450        results.truncate(params.top_k);
451
452        Ok(results)
453    }
454
455    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>> {
456        let chunks = self.chunks.read().await;
457        Ok(chunks.get(chunk_id).cloned())
458    }
459
460    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool> {
461        let mut chunks = self.chunks.write().await;
462        let mut vectors = self.vectors.write().await;
463        let mut stats = self.stats.write().await;
464
465        let chunk_removed = chunks.remove(chunk_id).is_some();
466        let _vector_removed = vectors.retain(|(id, _)| id != chunk_id);
467
468        if chunk_removed {
469            stats.total_chunks = stats.total_chunks.saturating_sub(1);
470            stats.last_updated = chrono::Utc::now();
471            stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
472            stats.used_space_bytes = stats.index_size_bytes;
473        }
474
475        Ok(chunk_removed)
476    }
477
478    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>> {
479        let chunks = self.chunks.read().await;
480        let results = chunks
481            .values()
482            .filter(|chunk| chunk.document_id == document_id)
483            .cloned()
484            .collect();
485        Ok(results)
486    }
487
488    async fn delete_document(&self, document_id: &str) -> Result<usize> {
489        let mut chunks = self.chunks.write().await;
490        let mut vectors = self.vectors.write().await;
491        let mut stats = self.stats.write().await;
492
493        let initial_count = chunks.len();
494        chunks.retain(|_, chunk| chunk.document_id != document_id);
495        vectors.retain(|(id, _)| {
496            // Remove vectors for chunks that were removed
497            chunks.contains_key(id)
498        });
499
500        let removed_count = initial_count - chunks.len();
501        if removed_count > 0 {
502            stats.total_chunks = stats.total_chunks.saturating_sub(removed_count);
503            stats.last_updated = chrono::Utc::now();
504            stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
505            stats.used_space_bytes = stats.index_size_bytes;
506        }
507
508        Ok(removed_count)
509    }
510
511    async fn get_stats(&self) -> Result<StorageStats> {
512        let stats = self.stats.read().await;
513        Ok(stats.clone())
514    }
515
516    async fn list_documents(&self) -> Result<Vec<String>> {
517        let chunks = self.chunks.read().await;
518        let documents: std::collections::HashSet<String> =
519            chunks.values().map(|chunk| chunk.document_id.clone()).collect();
520        Ok(documents.into_iter().collect())
521    }
522
523    async fn get_total_chunks(&self) -> Result<usize> {
524        let stats = self.stats.read().await;
525        Ok(stats.total_chunks)
526    }
527
528    async fn clear(&self) -> Result<()> {
529        let mut chunks = self.chunks.write().await;
530        let mut vectors = self.vectors.write().await;
531        let mut stats = self.stats.write().await;
532
533        chunks.clear();
534        vectors.clear();
535
536        stats.total_documents = 0;
537        stats.total_chunks = 0;
538        stats.index_size_bytes = 0;
539        stats.used_space_bytes = 0;
540        stats.last_updated = chrono::Utc::now();
541
542        Ok(())
543    }
544
545    async fn optimize(&self) -> Result<()> {
546        // In-memory storage doesn't need optimization
547        Ok(())
548    }
549
550    async fn create_backup(&self, _path: &str) -> Result<()> {
551        // Placeholder implementation
552        Ok(())
553    }
554
555    async fn restore_backup(&self, _path: &str) -> Result<()> {
556        // Placeholder implementation
557        Ok(())
558    }
559
560    async fn health_check(&self) -> Result<StorageHealth> {
561        let chunks = self.chunks.read().await;
562        let vectors = self.vectors.read().await;
563
564        let mut details = HashMap::new();
565        details.insert("chunk_count".to_string(), chunks.len().to_string());
566        details.insert("vector_count".to_string(), vectors.len().to_string());
567        details.insert("memory_usage".to_string(), "unknown".to_string());
568
569        let status = if chunks.len() == vectors.len() {
570            HealthStatus::Healthy
571        } else {
572            details.insert("error".to_string(), "Chunk/vector count mismatch".to_string());
573            HealthStatus::Unhealthy
574        };
575
576        Ok(StorageHealth {
577            status,
578            checked_at: chrono::Utc::now(),
579            details,
580            metrics: None,
581        })
582    }
583}
584
585/// Storage factory for creating different storage backends
586pub struct StorageFactory;
587
588impl StorageFactory {
589    /// Create in-memory storage
590    pub fn create_memory() -> Box<dyn DocumentStorage> {
591        Box::new(InMemoryStorage::new())
592    }
593
594    /// Create file-based storage
595    pub fn create_file(path: &str) -> Result<Box<dyn DocumentStorage>> {
596        if path.trim().is_empty() {
597            return Err(crate::Error::generic("File storage path cannot be empty"));
598        }
599
600        std::fs::create_dir_all(path)?;
601        Ok(Box::new(InMemoryStorage::new_with_backend_type("file")))
602    }
603
604    /// Create database storage
605    pub fn create_database(connection_string: &str) -> Result<Box<dyn DocumentStorage>> {
606        if connection_string.trim().is_empty() {
607            return Err(crate::Error::generic("Database connection string cannot be empty"));
608        }
609
610        Ok(Box::new(InMemoryStorage::new_with_backend_type("database")))
611    }
612
613    /// Create vector database storage
614    pub fn create_vector_db(config: HashMap<String, String>) -> Result<Box<dyn DocumentStorage>> {
615        if config.is_empty() {
616            return Err(crate::Error::generic("Vector database configuration cannot be empty"));
617        }
618
619        Ok(Box::new(InMemoryStorage::new_with_backend_type("vector-db")))
620    }
621}
622
#[cfg(test)]
mod tests {
    use super::StorageFactory;
    use std::collections::HashMap;

    /// Smoke test: passes as long as the module builds.
    #[test]
    fn test_module_compiles() {}

    /// The file factory returns a store that reports the `"file"` label.
    #[tokio::test]
    async fn test_create_file_storage_fallback_backend_type() {
        let scratch_name = format!("mockforge-data-storage-{}", std::process::id());
        let scratch_dir = std::env::temp_dir().join(scratch_name);
        // Best-effort cleanup of leftovers from an earlier run.
        let _ = std::fs::remove_dir_all(&scratch_dir);

        let scratch_path = scratch_dir.to_str().expect("path");
        let backend = StorageFactory::create_file(scratch_path).expect("create");
        let reported = backend.get_stats().await.expect("stats");
        assert_eq!(reported.backend_type, "file");

        let _ = std::fs::remove_dir_all(&scratch_dir);
    }

    /// The database factory returns a store that reports the `"database"` label.
    #[tokio::test]
    async fn test_create_database_storage_fallback_backend_type() {
        let connection = "postgres://user:pass@localhost/db";
        let backend = StorageFactory::create_database(connection).expect("create");
        let reported = backend.get_stats().await.expect("stats");
        assert_eq!(reported.backend_type, "database");
    }

    /// The vector-db factory returns a store that reports the `"vector-db"` label.
    #[tokio::test]
    async fn test_create_vector_storage_fallback_backend_type() {
        let settings: HashMap<String, String> =
            HashMap::from([("provider".to_string(), "qdrant".to_string())]);
        let backend = StorageFactory::create_vector_db(settings).expect("create");
        let reported = backend.get_stats().await.expect("stats");
        assert_eq!(reported.backend_type, "vector-db");
    }
}