1use crate::rag::engine::DocumentChunk;
7use crate::Result;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11use tokio::sync::RwLock;
12
/// Shared, async-guarded list of `(chunk id, embedding)` pairs backing the in-memory index.
type VectorStore = Arc<RwLock<Vec<(String, Vec<f32>)>>>;
14
/// Descriptor for a vector index: identity, layout, and bookkeeping metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorIndex {
    /// Unique identifier of the index.
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Algorithm/layout used by the index (flat, IVF, HNSW, ...).
    pub index_type: IndexType,
    /// Dimensionality every stored vector is expected to have.
    pub dimensions: usize,
    /// Number of vectors currently tracked by the index.
    pub vector_count: usize,
    /// Free-form key/value annotations.
    pub metadata: HashMap<String, String>,
    /// Creation timestamp (UTC).
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Timestamp of the most recent mutation (UTC).
    pub updated_at: chrono::DateTime<chrono::Utc>,
}
35
36impl VectorIndex {
37 pub fn new(id: String, name: String, index_type: IndexType, dimensions: usize) -> Self {
39 let now = chrono::Utc::now();
40 Self {
41 id,
42 name,
43 index_type,
44 dimensions,
45 vector_count: 0,
46 metadata: HashMap::new(),
47 created_at: now,
48 updated_at: now,
49 }
50 }
51
52 pub fn add_metadata(&mut self, key: String, value: String) {
54 self.metadata.insert(key, value);
55 self.updated_at = chrono::Utc::now();
56 }
57
58 pub fn get_metadata(&self, key: &str) -> Option<&String> {
60 self.metadata.get(key)
61 }
62
63 pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
65 let result = self.metadata.remove(key);
66 if result.is_some() {
67 self.updated_at = chrono::Utc::now();
68 }
69 result
70 }
71
72 pub fn update_vector_count(&mut self, count: usize) {
74 self.vector_count = count;
75 self.updated_at = chrono::Utc::now();
76 }
77
78 pub fn estimated_size_bytes(&self) -> u64 {
80 (self.vector_count * self.dimensions * 4 + 1024) as u64
82 }
83
84 pub fn is_empty(&self) -> bool {
86 self.vector_count == 0
87 }
88
89 pub fn stats(&self) -> IndexStats {
91 IndexStats {
92 id: self.id.clone(),
93 name: self.name.clone(),
94 index_type: self.index_type.clone(),
95 dimensions: self.dimensions,
96 vector_count: self.vector_count,
97 estimated_size_bytes: self.estimated_size_bytes(),
98 metadata_count: self.metadata.len(),
99 created_at: self.created_at,
100 updated_at: self.updated_at,
101 }
102 }
103}
104
/// Point-in-time snapshot of a [`VectorIndex`]'s statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexStats {
    /// Identifier of the index this snapshot describes.
    pub id: String,
    /// Human-readable index name.
    pub name: String,
    /// Index algorithm/layout at snapshot time.
    pub index_type: IndexType,
    /// Dimensionality of the stored vectors.
    pub dimensions: usize,
    /// Number of vectors at snapshot time.
    pub vector_count: usize,
    /// Estimated memory footprint in bytes (see `VectorIndex::estimated_size_bytes`).
    pub estimated_size_bytes: u64,
    /// Number of metadata entries attached to the index.
    pub metadata_count: usize,
    /// Index creation timestamp (UTC).
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Timestamp of the index's last mutation (UTC).
    pub updated_at: chrono::DateTime<chrono::Utc>,
}
127
/// Supported vector-index algorithms/layouts.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub enum IndexType {
    /// Brute-force exact search over all vectors (the default).
    #[default]
    Flat,
    /// Inverted-file index.
    IVF,
    /// Hierarchical navigable small world graph.
    HNSW,
    /// Product quantization.
    PQ,
    /// Backend-specific index identified by name.
    Custom(String),
}
143
/// Options controlling a similarity search (see `DocumentStorage::search_with_params`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchParams {
    /// Maximum number of results to return.
    pub top_k: usize,
    /// Minimum similarity score a result must reach to be included.
    pub threshold: f32,
    /// Distance/similarity function to use.
    pub search_method: SearchMethod,
    /// Whether result chunks should carry their metadata.
    pub include_metadata: bool,
    /// When set, restrict results to chunks of this document id.
    pub document_filter: Option<String>,
    /// When set, results must match every key/value pair in this map.
    pub metadata_filter: Option<HashMap<String, String>>,
}
160
161impl Default for SearchParams {
162 fn default() -> Self {
163 Self {
164 top_k: 10,
165 threshold: 0.7,
166 search_method: SearchMethod::Cosine,
167 include_metadata: true,
168 document_filter: None,
169 metadata_filter: None,
170 }
171 }
172}
173
/// Similarity/distance functions available for vector search.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub enum SearchMethod {
    /// Cosine similarity (the default).
    #[default]
    Cosine,
    /// Euclidean (L2) distance.
    Euclidean,
    /// Dot product.
    DotProduct,
    /// Manhattan (L1) distance.
    Manhattan,
}
187
/// Abstract persistence layer for document chunks and their embeddings.
#[async_trait::async_trait]
pub trait DocumentStorage: Send + Sync {
    /// Persists a batch of chunks (including their embeddings).
    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()>;

    /// Returns up to `top_k` chunks most similar to `query_embedding`.
    async fn search_similar(
        &self,
        query_embedding: &[f32],
        top_k: usize,
    ) -> Result<Vec<DocumentChunk>>;

    /// Similarity search with threshold and filtering options; see [`SearchParams`].
    async fn search_with_params(
        &self,
        query_embedding: &[f32],
        params: SearchParams,
    ) -> Result<Vec<DocumentChunk>>;

    /// Fetches a single chunk by id, if present.
    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>>;

    /// Deletes a chunk by id; returns whether anything was removed.
    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool>;

    /// Returns all chunks belonging to `document_id`.
    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>>;

    /// Deletes every chunk of a document; returns the number of chunks removed.
    async fn delete_document(&self, document_id: &str) -> Result<usize>;

    /// Returns aggregate storage statistics.
    async fn get_stats(&self) -> Result<StorageStats>;

    /// Lists the distinct document ids currently stored.
    async fn list_documents(&self) -> Result<Vec<String>>;

    /// Returns the total number of stored chunks.
    async fn get_total_chunks(&self) -> Result<usize>;

    /// Removes all chunks and vectors and resets statistics.
    async fn clear(&self) -> Result<()>;

    /// Backend-specific maintenance hook (compaction, reindexing, ...);
    /// a no-op for the in-memory backend.
    async fn optimize(&self) -> Result<()>;

    /// Writes a backup to `path` (a no-op for the in-memory backend).
    async fn create_backup(&self, path: &str) -> Result<()>;

    /// Restores state from a backup at `path` (a no-op for the in-memory backend).
    async fn restore_backup(&self, path: &str) -> Result<()>;

    /// Reports the backend's health status and optional performance metrics.
    async fn health_check(&self) -> Result<StorageHealth>;
}
244
/// Aggregate statistics reported by a storage backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageStats {
    /// Number of distinct documents stored.
    pub total_documents: usize,
    /// Number of chunks stored.
    pub total_chunks: usize,
    /// Estimated size of the vector index in bytes.
    pub index_size_bytes: u64,
    /// Timestamp of the last mutation (UTC).
    pub last_updated: chrono::DateTime<chrono::Utc>,
    /// Label describing the backend ("memory", "file", "database", ...).
    pub backend_type: String,
    /// Remaining capacity in bytes (u64::MAX for the unbounded in-memory backend).
    pub available_space_bytes: u64,
    /// Bytes currently consumed by stored data.
    pub used_space_bytes: u64,
}
263
/// Result of a storage health check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageHealth {
    /// Overall health verdict.
    pub status: HealthStatus,
    /// When the check was performed (UTC).
    pub checked_at: chrono::DateTime<chrono::Utc>,
    /// Free-form diagnostic details (counts, error descriptions, ...).
    pub details: HashMap<String, String>,
    /// Optional performance metrics, when the backend can supply them.
    pub metrics: Option<StorageMetrics>,
}
276
/// Coarse health verdict for a storage backend.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatus {
    /// Operating normally.
    Healthy,
    /// Operational but with issues worth attention.
    Warning,
    /// Operational but in a bad state (e.g. internal inconsistency).
    Unhealthy,
    /// Backend cannot be reached at all.
    Unavailable,
}
289
/// Optional performance metrics attached to a health check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageMetrics {
    /// Mean similarity-search latency in milliseconds.
    pub average_search_time_ms: f64,
    /// Mean insert latency in milliseconds.
    pub average_insert_time_ms: f64,
    /// Fraction of index space lost to fragmentation (0.0..=1.0).
    pub fragmentation_ratio: f32,
    /// Fraction of lookups served from cache (0.0..=1.0).
    pub cache_hit_rate: f32,
    /// Resident memory usage in bytes.
    pub memory_usage_bytes: u64,
    /// On-disk usage in bytes.
    pub disk_usage_bytes: u64,
}
306
/// Simple in-memory [`DocumentStorage`] backend: chunks keyed by id, a flat
/// list of embeddings for brute-force similarity search, and cached stats.
pub struct InMemoryStorage {
    /// Chunk id -> chunk payload.
    chunks: Arc<RwLock<HashMap<String, DocumentChunk>>>,
    /// Flat (chunk id, embedding) pairs scanned linearly during search.
    vectors: VectorStore,
    /// Aggregate statistics, refreshed on every mutation.
    stats: Arc<RwLock<StorageStats>>,
}
313
314impl InMemoryStorage {
315 pub fn new() -> Self {
317 Self::new_with_backend_type("memory")
318 }
319
320 pub fn new_with_backend_type(backend_type: &str) -> Self {
323 let now = chrono::Utc::now();
324 Self {
325 chunks: Arc::new(RwLock::new(HashMap::new())),
326 vectors: Arc::new(RwLock::new(Vec::new())),
327 stats: Arc::new(RwLock::new(StorageStats {
328 total_documents: 0,
329 total_chunks: 0,
330 index_size_bytes: 0,
331 last_updated: now,
332 backend_type: backend_type.to_string(),
333 available_space_bytes: u64::MAX,
334 used_space_bytes: 0,
335 })),
336 }
337 }
338
339 fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
341 if a.len() != b.len() {
342 return 0.0;
343 }
344
345 let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
346 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
347 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
348
349 if norm_a == 0.0 || norm_b == 0.0 {
350 0.0
351 } else {
352 dot_product / (norm_a * norm_b)
353 }
354 }
355}
356
357impl Default for InMemoryStorage {
358 fn default() -> Self {
359 Self::new()
360 }
361}
362
363#[async_trait::async_trait]
364impl DocumentStorage for InMemoryStorage {
365 async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()> {
366 let mut chunks_map = self.chunks.write().await;
367 let mut vectors = self.vectors.write().await;
368 let mut stats = self.stats.write().await;
369
370 for chunk in chunks {
371 chunks_map.insert(chunk.id.clone(), chunk.clone());
372
373 vectors.push((chunk.id.clone(), chunk.embedding.clone()));
375
376 stats.total_chunks += 1;
377 }
378
379 stats.last_updated = chrono::Utc::now();
380 stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64; stats.used_space_bytes = stats.index_size_bytes;
382
383 Ok(())
384 }
385
386 async fn search_similar(
387 &self,
388 query_embedding: &[f32],
389 top_k: usize,
390 ) -> Result<Vec<DocumentChunk>> {
391 let vectors = self.vectors.read().await;
392 let chunks = self.chunks.read().await;
393
394 let mut similarities: Vec<(String, f32)> = vectors
395 .iter()
396 .map(|(chunk_id, embedding)| {
397 let similarity = self.cosine_similarity(query_embedding, embedding);
398 (chunk_id.clone(), similarity)
399 })
400 .collect();
401
402 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
404
405 let mut results = Vec::new();
407 for (chunk_id, _) in similarities.iter().take(top_k) {
408 if let Some(chunk) = chunks.get(chunk_id) {
409 results.push(chunk.clone());
410 }
411 }
412
413 Ok(results)
414 }
415
416 async fn search_with_params(
417 &self,
418 query_embedding: &[f32],
419 params: SearchParams,
420 ) -> Result<Vec<DocumentChunk>> {
421 let mut results = self.search_similar(query_embedding, params.top_k * 2).await?; if let Some(document_filter) = ¶ms.document_filter {
425 results.retain(|chunk| chunk.document_id == *document_filter);
426 }
427
428 if let Some(metadata_filter) = ¶ms.metadata_filter {
429 results.retain(|chunk| {
430 metadata_filter.iter().all(|(key, value)| {
431 chunk.get_metadata(key).map(|v| v == value).unwrap_or(false)
432 })
433 });
434 }
435
436 results.retain(|chunk| {
438 let similarity = self.cosine_similarity(query_embedding, &chunk.embedding);
439 similarity >= params.threshold
440 });
441
442 results.sort_by(|a, b| {
444 let sim_a = self.cosine_similarity(query_embedding, &a.embedding);
445 let sim_b = self.cosine_similarity(query_embedding, &b.embedding);
446 sim_b.partial_cmp(&sim_a).unwrap_or(std::cmp::Ordering::Equal)
447 });
448
449 results.truncate(params.top_k);
451
452 Ok(results)
453 }
454
455 async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>> {
456 let chunks = self.chunks.read().await;
457 Ok(chunks.get(chunk_id).cloned())
458 }
459
460 async fn delete_chunk(&self, chunk_id: &str) -> Result<bool> {
461 let mut chunks = self.chunks.write().await;
462 let mut vectors = self.vectors.write().await;
463 let mut stats = self.stats.write().await;
464
465 let chunk_removed = chunks.remove(chunk_id).is_some();
466 let _vector_removed = vectors.retain(|(id, _)| id != chunk_id);
467
468 if chunk_removed {
469 stats.total_chunks = stats.total_chunks.saturating_sub(1);
470 stats.last_updated = chrono::Utc::now();
471 stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
472 stats.used_space_bytes = stats.index_size_bytes;
473 }
474
475 Ok(chunk_removed)
476 }
477
478 async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>> {
479 let chunks = self.chunks.read().await;
480 let results = chunks
481 .values()
482 .filter(|chunk| chunk.document_id == document_id)
483 .cloned()
484 .collect();
485 Ok(results)
486 }
487
488 async fn delete_document(&self, document_id: &str) -> Result<usize> {
489 let mut chunks = self.chunks.write().await;
490 let mut vectors = self.vectors.write().await;
491 let mut stats = self.stats.write().await;
492
493 let initial_count = chunks.len();
494 chunks.retain(|_, chunk| chunk.document_id != document_id);
495 vectors.retain(|(id, _)| {
496 chunks.contains_key(id)
498 });
499
500 let removed_count = initial_count - chunks.len();
501 if removed_count > 0 {
502 stats.total_chunks = stats.total_chunks.saturating_sub(removed_count);
503 stats.last_updated = chrono::Utc::now();
504 stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
505 stats.used_space_bytes = stats.index_size_bytes;
506 }
507
508 Ok(removed_count)
509 }
510
511 async fn get_stats(&self) -> Result<StorageStats> {
512 let stats = self.stats.read().await;
513 Ok(stats.clone())
514 }
515
516 async fn list_documents(&self) -> Result<Vec<String>> {
517 let chunks = self.chunks.read().await;
518 let documents: std::collections::HashSet<String> =
519 chunks.values().map(|chunk| chunk.document_id.clone()).collect();
520 Ok(documents.into_iter().collect())
521 }
522
523 async fn get_total_chunks(&self) -> Result<usize> {
524 let stats = self.stats.read().await;
525 Ok(stats.total_chunks)
526 }
527
528 async fn clear(&self) -> Result<()> {
529 let mut chunks = self.chunks.write().await;
530 let mut vectors = self.vectors.write().await;
531 let mut stats = self.stats.write().await;
532
533 chunks.clear();
534 vectors.clear();
535
536 stats.total_documents = 0;
537 stats.total_chunks = 0;
538 stats.index_size_bytes = 0;
539 stats.used_space_bytes = 0;
540 stats.last_updated = chrono::Utc::now();
541
542 Ok(())
543 }
544
545 async fn optimize(&self) -> Result<()> {
546 Ok(())
548 }
549
550 async fn create_backup(&self, _path: &str) -> Result<()> {
551 Ok(())
553 }
554
555 async fn restore_backup(&self, _path: &str) -> Result<()> {
556 Ok(())
558 }
559
560 async fn health_check(&self) -> Result<StorageHealth> {
561 let chunks = self.chunks.read().await;
562 let vectors = self.vectors.read().await;
563
564 let mut details = HashMap::new();
565 details.insert("chunk_count".to_string(), chunks.len().to_string());
566 details.insert("vector_count".to_string(), vectors.len().to_string());
567 details.insert("memory_usage".to_string(), "unknown".to_string());
568
569 let status = if chunks.len() == vectors.len() {
570 HealthStatus::Healthy
571 } else {
572 details.insert("error".to_string(), "Chunk/vector count mismatch".to_string());
573 HealthStatus::Unhealthy
574 };
575
576 Ok(StorageHealth {
577 status,
578 checked_at: chrono::Utc::now(),
579 details,
580 metrics: None,
581 })
582 }
583}
584
585pub struct StorageFactory;
587
588impl StorageFactory {
589 pub fn create_memory() -> Box<dyn DocumentStorage> {
591 Box::new(InMemoryStorage::new())
592 }
593
594 pub fn create_file(path: &str) -> Result<Box<dyn DocumentStorage>> {
596 if path.trim().is_empty() {
597 return Err(crate::Error::generic("File storage path cannot be empty"));
598 }
599
600 std::fs::create_dir_all(path)?;
601 Ok(Box::new(InMemoryStorage::new_with_backend_type("file")))
602 }
603
604 pub fn create_database(connection_string: &str) -> Result<Box<dyn DocumentStorage>> {
606 if connection_string.trim().is_empty() {
607 return Err(crate::Error::generic("Database connection string cannot be empty"));
608 }
609
610 Ok(Box::new(InMemoryStorage::new_with_backend_type("database")))
611 }
612
613 pub fn create_vector_db(config: HashMap<String, String>) -> Result<Box<dyn DocumentStorage>> {
615 if config.is_empty() {
616 return Err(crate::Error::generic("Vector database configuration cannot be empty"));
617 }
618
619 Ok(Box::new(InMemoryStorage::new_with_backend_type("vector-db")))
620 }
621}
622
#[cfg(test)]
mod tests {
    use super::StorageFactory;
    use std::collections::HashMap;

    // Smoke test: merely ensures the module compiles and links.
    #[test]
    fn test_module_compiles() {
    }

    // The file backend currently falls back to the in-memory implementation;
    // verify the reported backend type still says "file".
    #[tokio::test]
    async fn test_create_file_storage_fallback_backend_type() {
        let dir =
            std::env::temp_dir().join(format!("mockforge-data-storage-{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&dir);
        let storage = StorageFactory::create_file(dir.to_str().expect("path")).expect("create");
        let stats = storage.get_stats().await.expect("stats");
        assert_eq!(stats.backend_type, "file");
        let _ = std::fs::remove_dir_all(&dir);
    }

    // Same fallback check for the database-backed constructor.
    #[tokio::test]
    async fn test_create_database_storage_fallback_backend_type() {
        let storage =
            StorageFactory::create_database("postgres://user:pass@localhost/db").expect("create");
        let stats = storage.get_stats().await.expect("stats");
        assert_eq!(stats.backend_type, "database");
    }

    // Same fallback check for the vector-database constructor.
    #[tokio::test]
    async fn test_create_vector_storage_fallback_backend_type() {
        let mut cfg = HashMap::new();
        cfg.insert("provider".to_string(), "qdrant".to_string());
        let storage = StorageFactory::create_vector_db(cfg).expect("create");
        let stats = storage.get_stats().await.expect("stats");
        assert_eq!(stats.backend_type, "vector-db");
    }
}