mockforge_data/rag/
storage.rs

1//! Document storage and vector indexing
2//!
3//! This module provides storage backends for documents and their vector embeddings,
4//! supporting various indexing strategies and similarity search algorithms.
5
6use crate::rag::engine::DocumentChunk;
7use crate::Result;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11use tokio::sync::RwLock;
12
13type VectorStore = Arc<RwLock<Vec<(String, Vec<f32>)>>>;
14
15/// Vector index for similarity search
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct VectorIndex {
18    /// Index ID
19    pub id: String,
20    /// Index name
21    pub name: String,
22    /// Index type
23    pub index_type: IndexType,
24    /// Vector dimensions
25    pub dimensions: usize,
26    /// Number of vectors indexed
27    pub vector_count: usize,
28    /// Index metadata
29    pub metadata: HashMap<String, String>,
30    /// Creation timestamp
31    pub created_at: chrono::DateTime<chrono::Utc>,
32    /// Last updated timestamp
33    pub updated_at: chrono::DateTime<chrono::Utc>,
34}
35
36impl VectorIndex {
37    /// Create a new vector index
38    pub fn new(id: String, name: String, index_type: IndexType, dimensions: usize) -> Self {
39        let now = chrono::Utc::now();
40        Self {
41            id,
42            name,
43            index_type,
44            dimensions,
45            vector_count: 0,
46            metadata: HashMap::new(),
47            created_at: now,
48            updated_at: now,
49        }
50    }
51
52    /// Add metadata to index
53    pub fn add_metadata(&mut self, key: String, value: String) {
54        self.metadata.insert(key, value);
55        self.updated_at = chrono::Utc::now();
56    }
57
58    /// Get metadata value
59    pub fn get_metadata(&self, key: &str) -> Option<&String> {
60        self.metadata.get(key)
61    }
62
63    /// Remove metadata
64    pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
65        let result = self.metadata.remove(key);
66        if result.is_some() {
67            self.updated_at = chrono::Utc::now();
68        }
69        result
70    }
71
72    /// Update vector count
73    pub fn update_vector_count(&mut self, count: usize) {
74        self.vector_count = count;
75        self.updated_at = chrono::Utc::now();
76    }
77
78    /// Get index size estimate in bytes
79    pub fn estimated_size_bytes(&self) -> u64 {
80        // Rough estimate: each vector takes ~4 bytes per dimension + overhead
81        (self.vector_count * self.dimensions * 4 + 1024) as u64
82    }
83
84    /// Check if index is empty
85    pub fn is_empty(&self) -> bool {
86        self.vector_count == 0
87    }
88
89    /// Get index statistics
90    pub fn stats(&self) -> IndexStats {
91        IndexStats {
92            id: self.id.clone(),
93            name: self.name.clone(),
94            index_type: self.index_type.clone(),
95            dimensions: self.dimensions,
96            vector_count: self.vector_count,
97            estimated_size_bytes: self.estimated_size_bytes(),
98            metadata_count: self.metadata.len(),
99            created_at: self.created_at,
100            updated_at: self.updated_at,
101        }
102    }
103}
104
105/// Index statistics
106#[derive(Debug, Clone, Serialize, Deserialize)]
107pub struct IndexStats {
108    /// Index ID
109    pub id: String,
110    /// Index name
111    pub name: String,
112    /// Index type
113    pub index_type: IndexType,
114    /// Vector dimensions
115    pub dimensions: usize,
116    /// Number of vectors
117    pub vector_count: usize,
118    /// Estimated size in bytes
119    pub estimated_size_bytes: u64,
120    /// Number of metadata entries
121    pub metadata_count: usize,
122    /// Creation timestamp
123    pub created_at: chrono::DateTime<chrono::Utc>,
124    /// Last updated timestamp
125    pub updated_at: chrono::DateTime<chrono::Utc>,
126}
127
128/// Index type enumeration
129#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
130pub enum IndexType {
131    /// Flat index - brute force search
132    Flat,
133    /// IVF (Inverted File) index - for large datasets
134    IVF,
135    /// HNSW (Hierarchical Navigable Small World) index - for high performance
136    HNSW,
137    /// PQ (Product Quantization) index - for memory efficiency
138    PQ,
139    /// Custom index type
140    Custom(String),
141}
142
143impl Default for IndexType {
144    fn default() -> Self {
145        Self::Flat
146    }
147}
148
149/// Search parameters for vector search
150#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct SearchParams {
152    /// Number of results to return
153    pub top_k: usize,
154    /// Similarity threshold (0.0 to 1.0)
155    pub threshold: f32,
156    /// Search method to use
157    pub search_method: SearchMethod,
158    /// Include metadata in results
159    pub include_metadata: bool,
160    /// Filter by document ID
161    pub document_filter: Option<String>,
162    /// Filter by metadata
163    pub metadata_filter: Option<HashMap<String, String>>,
164}
165
166impl Default for SearchParams {
167    fn default() -> Self {
168        Self {
169            top_k: 10,
170            threshold: 0.7,
171            search_method: SearchMethod::Cosine,
172            include_metadata: true,
173            document_filter: None,
174            metadata_filter: None,
175        }
176    }
177}
178
179/// Search method enumeration
180#[derive(Debug, Clone, Serialize, Deserialize)]
181pub enum SearchMethod {
182    /// Cosine similarity
183    Cosine,
184    /// Euclidean distance
185    Euclidean,
186    /// Dot product
187    DotProduct,
188    /// Manhattan distance
189    Manhattan,
190}
191
192impl Default for SearchMethod {
193    fn default() -> Self {
194        Self::Cosine
195    }
196}
197
198/// Storage backend trait for documents and vectors
199#[async_trait::async_trait]
200pub trait DocumentStorage: Send + Sync {
201    /// Store document chunks
202    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()>;
203
204    /// Search for similar chunks
205    async fn search_similar(
206        &self,
207        query_embedding: &[f32],
208        top_k: usize,
209    ) -> Result<Vec<DocumentChunk>>;
210
211    /// Search with custom parameters
212    async fn search_with_params(
213        &self,
214        query_embedding: &[f32],
215        params: SearchParams,
216    ) -> Result<Vec<DocumentChunk>>;
217
218    /// Get chunk by ID
219    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>>;
220
221    /// Delete chunk by ID
222    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool>;
223
224    /// Get chunks by document ID
225    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>>;
226
227    /// Delete all chunks for a document
228    async fn delete_document(&self, document_id: &str) -> Result<usize>;
229
230    /// Get storage statistics
231    async fn get_stats(&self) -> Result<StorageStats>;
232
233    /// List all document IDs
234    async fn list_documents(&self) -> Result<Vec<String>>;
235
236    /// Get total number of chunks
237    async fn get_total_chunks(&self) -> Result<usize>;
238
239    /// Clear all data
240    async fn clear(&self) -> Result<()>;
241
242    /// Optimize storage (rebuild indexes, compact data)
243    async fn optimize(&self) -> Result<()>;
244
245    /// Create backup
246    async fn create_backup(&self, path: &str) -> Result<()>;
247
248    /// Restore from backup
249    async fn restore_backup(&self, path: &str) -> Result<()>;
250
251    /// Check storage health
252    async fn health_check(&self) -> Result<StorageHealth>;
253}
254
255/// Storage statistics
256#[derive(Debug, Clone, Serialize, Deserialize)]
257pub struct StorageStats {
258    /// Total number of documents
259    pub total_documents: usize,
260    /// Total number of chunks
261    pub total_chunks: usize,
262    /// Index size in bytes
263    pub index_size_bytes: u64,
264    /// Last updated timestamp
265    pub last_updated: chrono::DateTime<chrono::Utc>,
266    /// Storage backend type
267    pub backend_type: String,
268    /// Available disk space in bytes
269    pub available_space_bytes: u64,
270    /// Used space in bytes
271    pub used_space_bytes: u64,
272}
273
274/// Storage health information
275#[derive(Debug, Clone, Serialize, Deserialize)]
276pub struct StorageHealth {
277    /// Overall health status
278    pub status: HealthStatus,
279    /// Health check timestamp
280    pub checked_at: chrono::DateTime<chrono::Utc>,
281    /// Detailed health information
282    pub details: HashMap<String, String>,
283    /// Performance metrics
284    pub metrics: Option<StorageMetrics>,
285}
286
287/// Health status enumeration
288#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
289pub enum HealthStatus {
290    /// Storage is healthy
291    Healthy,
292    /// Storage has warnings
293    Warning,
294    /// Storage is unhealthy
295    Unhealthy,
296    /// Storage is unavailable
297    Unavailable,
298}
299
300/// Storage performance metrics
301#[derive(Debug, Clone, Serialize, Deserialize)]
302pub struct StorageMetrics {
303    /// Average search time in milliseconds
304    pub average_search_time_ms: f64,
305    /// Average insert time in milliseconds
306    pub average_insert_time_ms: f64,
307    /// Index fragmentation percentage (0.0 to 1.0)
308    pub fragmentation_ratio: f32,
309    /// Cache hit rate (0.0 to 1.0)
310    pub cache_hit_rate: f32,
311    /// Memory usage in bytes
312    pub memory_usage_bytes: u64,
313    /// Disk usage in bytes
314    pub disk_usage_bytes: u64,
315}
316
317/// In-memory storage implementation for development and testing
318pub struct InMemoryStorage {
319    chunks: Arc<RwLock<HashMap<String, DocumentChunk>>>,
320    vectors: VectorStore,
321    stats: Arc<RwLock<StorageStats>>,
322}
323
324impl InMemoryStorage {
325    /// Create a new in-memory storage
326    pub fn new() -> Self {
327        let now = chrono::Utc::now();
328        Self {
329            chunks: Arc::new(RwLock::new(HashMap::new())),
330            vectors: Arc::new(RwLock::new(Vec::new())),
331            stats: Arc::new(RwLock::new(StorageStats {
332                total_documents: 0,
333                total_chunks: 0,
334                index_size_bytes: 0,
335                last_updated: now,
336                backend_type: "memory".to_string(),
337                available_space_bytes: u64::MAX,
338                used_space_bytes: 0,
339            })),
340        }
341    }
342
343    /// Calculate cosine similarity
344    fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
345        if a.len() != b.len() {
346            return 0.0;
347        }
348
349        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
350        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
351        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
352
353        if norm_a == 0.0 || norm_b == 0.0 {
354            0.0
355        } else {
356            dot_product / (norm_a * norm_b)
357        }
358    }
359}
360
361impl Default for InMemoryStorage {
362    fn default() -> Self {
363        Self::new()
364    }
365}
366
367#[async_trait::async_trait]
368impl DocumentStorage for InMemoryStorage {
369    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()> {
370        let mut chunks_map = self.chunks.write().await;
371        let mut vectors = self.vectors.write().await;
372        let mut stats = self.stats.write().await;
373
374        for chunk in chunks {
375            chunks_map.insert(chunk.id.clone(), chunk.clone());
376
377            // Store vector for similarity search
378            vectors.push((chunk.id.clone(), chunk.embedding.clone()));
379
380            stats.total_chunks += 1;
381        }
382
383        stats.last_updated = chrono::Utc::now();
384        stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64; // Rough estimate
385        stats.used_space_bytes = stats.index_size_bytes;
386
387        Ok(())
388    }
389
390    async fn search_similar(
391        &self,
392        query_embedding: &[f32],
393        top_k: usize,
394    ) -> Result<Vec<DocumentChunk>> {
395        let vectors = self.vectors.read().await;
396        let chunks = self.chunks.read().await;
397
398        let mut similarities: Vec<(String, f32)> = vectors
399            .iter()
400            .map(|(chunk_id, embedding)| {
401                let similarity = self.cosine_similarity(query_embedding, embedding);
402                (chunk_id.clone(), similarity)
403            })
404            .collect();
405
406        // Sort by similarity (descending)
407        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
408
409        // Take top-k results
410        let mut results = Vec::new();
411        for (chunk_id, _) in similarities.iter().take(top_k) {
412            if let Some(chunk) = chunks.get(chunk_id) {
413                results.push(chunk.clone());
414            }
415        }
416
417        Ok(results)
418    }
419
420    async fn search_with_params(
421        &self,
422        query_embedding: &[f32],
423        params: SearchParams,
424    ) -> Result<Vec<DocumentChunk>> {
425        let mut results = self.search_similar(query_embedding, params.top_k * 2).await?; // Get more candidates
426
427        // Apply filters
428        if let Some(document_filter) = &params.document_filter {
429            results.retain(|chunk| chunk.document_id == *document_filter);
430        }
431
432        if let Some(metadata_filter) = &params.metadata_filter {
433            results.retain(|chunk| {
434                metadata_filter.iter().all(|(key, value)| {
435                    chunk.get_metadata(key).map(|v| v == value).unwrap_or(false)
436                })
437            });
438        }
439
440        // Apply threshold filter
441        results.retain(|chunk| {
442            let similarity = self.cosine_similarity(query_embedding, &chunk.embedding);
443            similarity >= params.threshold
444        });
445
446        // Sort by similarity
447        results.sort_by(|a, b| {
448            let sim_a = self.cosine_similarity(query_embedding, &a.embedding);
449            let sim_b = self.cosine_similarity(query_embedding, &b.embedding);
450            sim_b.partial_cmp(&sim_a).unwrap_or(std::cmp::Ordering::Equal)
451        });
452
453        // Take top-k
454        results.truncate(params.top_k);
455
456        Ok(results)
457    }
458
459    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>> {
460        let chunks = self.chunks.read().await;
461        Ok(chunks.get(chunk_id).cloned())
462    }
463
464    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool> {
465        let mut chunks = self.chunks.write().await;
466        let mut vectors = self.vectors.write().await;
467        let mut stats = self.stats.write().await;
468
469        let chunk_removed = chunks.remove(chunk_id).is_some();
470        let _vector_removed = vectors.retain(|(id, _)| id != chunk_id);
471
472        if chunk_removed {
473            stats.total_chunks = stats.total_chunks.saturating_sub(1);
474            stats.last_updated = chrono::Utc::now();
475            stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
476            stats.used_space_bytes = stats.index_size_bytes;
477        }
478
479        Ok(chunk_removed)
480    }
481
482    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>> {
483        let chunks = self.chunks.read().await;
484        let results = chunks
485            .values()
486            .filter(|chunk| chunk.document_id == document_id)
487            .cloned()
488            .collect();
489        Ok(results)
490    }
491
492    async fn delete_document(&self, document_id: &str) -> Result<usize> {
493        let mut chunks = self.chunks.write().await;
494        let mut vectors = self.vectors.write().await;
495        let mut stats = self.stats.write().await;
496
497        let initial_count = chunks.len();
498        chunks.retain(|_, chunk| chunk.document_id != document_id);
499        vectors.retain(|(id, _)| {
500            // Remove vectors for chunks that were removed
501            chunks.contains_key(id)
502        });
503
504        let removed_count = initial_count - chunks.len();
505        if removed_count > 0 {
506            stats.total_chunks = stats.total_chunks.saturating_sub(removed_count);
507            stats.last_updated = chrono::Utc::now();
508            stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
509            stats.used_space_bytes = stats.index_size_bytes;
510        }
511
512        Ok(removed_count)
513    }
514
515    async fn get_stats(&self) -> Result<StorageStats> {
516        let stats = self.stats.read().await;
517        Ok(stats.clone())
518    }
519
520    async fn list_documents(&self) -> Result<Vec<String>> {
521        let chunks = self.chunks.read().await;
522        let documents: std::collections::HashSet<String> =
523            chunks.values().map(|chunk| chunk.document_id.clone()).collect();
524        Ok(documents.into_iter().collect())
525    }
526
527    async fn get_total_chunks(&self) -> Result<usize> {
528        let stats = self.stats.read().await;
529        Ok(stats.total_chunks)
530    }
531
532    async fn clear(&self) -> Result<()> {
533        let mut chunks = self.chunks.write().await;
534        let mut vectors = self.vectors.write().await;
535        let mut stats = self.stats.write().await;
536
537        chunks.clear();
538        vectors.clear();
539
540        stats.total_documents = 0;
541        stats.total_chunks = 0;
542        stats.index_size_bytes = 0;
543        stats.used_space_bytes = 0;
544        stats.last_updated = chrono::Utc::now();
545
546        Ok(())
547    }
548
549    async fn optimize(&self) -> Result<()> {
550        // In-memory storage doesn't need optimization
551        Ok(())
552    }
553
554    async fn create_backup(&self, _path: &str) -> Result<()> {
555        // Placeholder implementation
556        Ok(())
557    }
558
559    async fn restore_backup(&self, _path: &str) -> Result<()> {
560        // Placeholder implementation
561        Ok(())
562    }
563
564    async fn health_check(&self) -> Result<StorageHealth> {
565        let chunks = self.chunks.read().await;
566        let vectors = self.vectors.read().await;
567
568        let mut details = HashMap::new();
569        details.insert("chunk_count".to_string(), chunks.len().to_string());
570        details.insert("vector_count".to_string(), vectors.len().to_string());
571        details.insert("memory_usage".to_string(), "unknown".to_string());
572
573        let status = if chunks.len() == vectors.len() {
574            HealthStatus::Healthy
575        } else {
576            details.insert("error".to_string(), "Chunk/vector count mismatch".to_string());
577            HealthStatus::Unhealthy
578        };
579
580        Ok(StorageHealth {
581            status,
582            checked_at: chrono::Utc::now(),
583            details,
584            metrics: None,
585        })
586    }
587}
588
589/// Storage factory for creating different storage backends
590pub struct StorageFactory;
591
592impl StorageFactory {
593    /// Create in-memory storage
594    pub fn create_memory() -> Box<dyn DocumentStorage> {
595        Box::new(InMemoryStorage::new())
596    }
597
598    /// Create file-based storage
599    pub fn create_file(_path: &str) -> Result<Box<dyn DocumentStorage>> {
600        // Placeholder for file-based storage implementation
601        Err(crate::Error::generic("File storage not yet implemented"))
602    }
603
604    /// Create database storage
605    pub fn create_database(_connection_string: &str) -> Result<Box<dyn DocumentStorage>> {
606        // Placeholder for database storage implementation
607        Err(crate::Error::generic("Database storage not yet implemented"))
608    }
609
610    /// Create vector database storage
611    pub fn create_vector_db(_config: HashMap<String, String>) -> Result<Box<dyn DocumentStorage>> {
612        // Placeholder for vector database storage implementation
613        Err(crate::Error::generic("Vector database storage not yet implemented"))
614    }
615}
616
#[cfg(test)]
mod tests {
    //! Synchronous unit tests for the plain data types and helpers in this
    //! module. Async storage behaviour is not covered here.

    use super::*;

    /// A flat, 128-dimensional index used as the common fixture.
    fn sample_index() -> VectorIndex {
        VectorIndex::new("idx-1".to_string(), "primary".to_string(), IndexType::Flat, 128)
    }

    #[test]
    fn new_index_starts_empty() {
        let index = sample_index();

        assert!(index.is_empty());
        assert_eq!(index.vector_count, 0);
        assert_eq!(index.dimensions, 128);
        assert!(index.metadata.is_empty());
        assert_eq!(index.created_at, index.updated_at);
    }

    #[test]
    fn metadata_round_trip() {
        let mut index = sample_index();

        index.add_metadata("owner".to_string(), "search-team".to_string());
        assert_eq!(index.get_metadata("owner").map(String::as_str), Some("search-team"));

        assert_eq!(index.remove_metadata("owner"), Some("search-team".to_string()));
        assert_eq!(index.remove_metadata("owner"), None);
        assert!(index.get_metadata("owner").is_none());
    }

    #[test]
    fn size_estimate_tracks_vector_count() {
        let mut index = sample_index();
        // Empty index: only the fixed overhead.
        assert_eq!(index.estimated_size_bytes(), 1024);

        index.update_vector_count(10);
        assert!(!index.is_empty());
        // 10 vectors * 128 dims * 4 bytes + 1024 overhead.
        assert_eq!(index.estimated_size_bytes(), 6144);
    }

    #[test]
    fn stats_mirror_index_fields() {
        let mut index = sample_index();
        index.add_metadata("k".to_string(), "v".to_string());
        index.update_vector_count(3);

        let stats = index.stats();
        assert_eq!(stats.id, "idx-1");
        assert_eq!(stats.name, "primary");
        assert_eq!(stats.index_type, IndexType::Flat);
        assert_eq!(stats.dimensions, 128);
        assert_eq!(stats.vector_count, 3);
        assert_eq!(stats.metadata_count, 1);
        assert_eq!(stats.estimated_size_bytes, index.estimated_size_bytes());
    }

    #[test]
    fn defaults_match_documented_values() {
        assert_eq!(IndexType::default(), IndexType::Flat);
        assert!(matches!(SearchMethod::default(), SearchMethod::Cosine));

        let params = SearchParams::default();
        assert_eq!(params.top_k, 10);
        assert!((params.threshold - 0.7).abs() < f32::EPSILON);
        assert!(matches!(params.search_method, SearchMethod::Cosine));
        assert!(params.include_metadata);
        assert!(params.document_filter.is_none());
        assert!(params.metadata_filter.is_none());
    }

    #[test]
    fn cosine_similarity_edge_cases() {
        let storage = InMemoryStorage::new();

        // Identical unit vectors.
        assert_eq!(storage.cosine_similarity(&[1.0, 0.0], &[1.0, 0.0]), 1.0);
        // Orthogonal vectors.
        assert_eq!(storage.cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]), 0.0);
        // Opposite vectors.
        assert_eq!(storage.cosine_similarity(&[1.0, 0.0], &[-1.0, 0.0]), -1.0);
        // Length mismatch and zero-norm inputs both yield 0.0.
        assert_eq!(storage.cosine_similarity(&[1.0], &[1.0, 0.0]), 0.0);
        assert_eq!(storage.cosine_similarity(&[0.0, 0.0], &[1.0, 0.0]), 0.0);
        assert_eq!(storage.cosine_similarity(&[], &[]), 0.0);
    }
}