use crate::rag::engine::DocumentChunk;
use crate::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

type VectorStore = Arc<RwLock<Vec<(String, Vec<f32>)>>>;

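/// Descriptive metadata for a vector index: name, layout type, dimensionality,
/// vector count, and free-form key/value metadata.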
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorIndex {
    pub id: String,
    pub name: String,
    pub index_type: IndexType,
    pub dimensions: usize,
    pub vector_count: usize,
    pub metadata: HashMap<String, String>,
    pub created_at: chrono::DateTime<chrono::Utc>,
    pub updated_at: chrono::DateTime<chrono::Utc>,
}

impl VectorIndex {
    pub fn new(id: String, name: String, index_type: IndexType, dimensions: usize) -> Self {
        let now = chrono::Utc::now();
        Self {
            id,
            name,
            index_type,
            dimensions,
            vector_count: 0,
            metadata: HashMap::new(),
            created_at: now,
            updated_at: now,
        }
    }

    pub fn add_metadata(&mut self, key: String, value: String) {
        self.metadata.insert(key, value);
        self.updated_at = chrono::Utc::now();
    }

    pub fn get_metadata(&self, key: &str) -> Option<&String> {
        self.metadata.get(key)
    }

    pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
        let result = self.metadata.remove(key);
        if result.is_some() {
            self.updated_at = chrono::Utc::now();
        }
        result
    }

    pub fn update_vector_count(&mut self, count: usize) {
        self.vector_count = count;
        self.updated_at = chrono::Utc::now();
    }

    pub fn estimated_size_bytes(&self) -> u64 {
        // 4 bytes per f32 component, plus roughly 1 KiB of bookkeeping overhead.
        (self.vector_count * self.dimensions * 4 + 1024) as u64
    }

    pub fn is_empty(&self) -> bool {
        self.vector_count == 0
    }

    pub fn stats(&self) -> IndexStats {
        IndexStats {
            id: self.id.clone(),
            name: self.name.clone(),
            index_type: self.index_type.clone(),
            dimensions: self.dimensions,
            vector_count: self.vector_count,
            estimated_size_bytes: self.estimated_size_bytes(),
            metadata_count: self.metadata.len(),
            created_at: self.created_at,
            updated_at: self.updated_at,
        }
    }
}

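/// Point-in-time statistics snapshot produced by [`VectorIndex::stats`].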
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexStats {
    pub id: String,
    pub name: String,
    pub index_type: IndexType,
    pub dimensions: usize,
    pub vector_count: usize,
    pub estimated_size_bytes: u64,
    pub metadata_count: usize,
    pub created_at: chrono::DateTime<chrono::Utc>,
    pub updated_at: chrono::DateTime<chrono::Utc>,
}

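/// Index layout used by a vector index. `Flat` typically denotes exact
/// brute-force search, while `IVF`, `HNSW`, and `PQ` name common
/// approximate-nearest-neighbour index families; `Custom` carries a
/// backend-specific label.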
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum IndexType {
    Flat,
    IVF,
    HNSW,
    PQ,
    Custom(String),
}

impl Default for IndexType {
    fn default() -> Self {
        Self::Flat
    }
}

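/// Tunable parameters for similarity search: number of results (`top_k`),
/// minimum similarity `threshold`, distance `search_method`, and optional
/// document and metadata filters.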
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchParams {
    pub top_k: usize,
    pub threshold: f32,
    pub search_method: SearchMethod,
    pub include_metadata: bool,
    pub document_filter: Option<String>,
    pub metadata_filter: Option<HashMap<String, String>>,
}

impl Default for SearchParams {
    fn default() -> Self {
        Self {
            top_k: 10,
            threshold: 0.7,
            search_method: SearchMethod::Cosine,
            include_metadata: true,
            document_filter: None,
            metadata_filter: None,
        }
    }
}

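/// Similarity/distance measure used to rank search results.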
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SearchMethod {
    Cosine,
    Euclidean,
    DotProduct,
    Manhattan,
}

impl Default for SearchMethod {
    fn default() -> Self {
        Self::Cosine
    }
}

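/// Abstraction over storage backends for document chunks and their embeddings.
/// Implementations must be `Send + Sync` and expose async CRUD, search,
/// maintenance, and health-check operations.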
#[async_trait::async_trait]
pub trait DocumentStorage: Send + Sync {
    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()>;

    async fn search_similar(
        &self,
        query_embedding: &[f32],
        top_k: usize,
    ) -> Result<Vec<DocumentChunk>>;

    async fn search_with_params(
        &self,
        query_embedding: &[f32],
        params: SearchParams,
    ) -> Result<Vec<DocumentChunk>>;

    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>>;

    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool>;

    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>>;

    async fn delete_document(&self, document_id: &str) -> Result<usize>;

    async fn get_stats(&self) -> Result<StorageStats>;

    async fn list_documents(&self) -> Result<Vec<String>>;

    async fn get_total_chunks(&self) -> Result<usize>;

    async fn clear(&self) -> Result<()>;

    async fn optimize(&self) -> Result<()>;

    async fn create_backup(&self, path: &str) -> Result<()>;

    async fn restore_backup(&self, path: &str) -> Result<()>;

    async fn health_check(&self) -> Result<StorageHealth>;
}

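/// Aggregate statistics reported by a storage backend.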
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageStats {
    pub total_documents: usize,
    pub total_chunks: usize,
    pub index_size_bytes: u64,
    pub last_updated: chrono::DateTime<chrono::Utc>,
    pub backend_type: String,
    pub available_space_bytes: u64,
    pub used_space_bytes: u64,
}

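/// Health report returned by [`DocumentStorage::health_check`].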
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageHealth {
    pub status: HealthStatus,
    pub checked_at: chrono::DateTime<chrono::Utc>,
    pub details: HashMap<String, String>,
    pub metrics: Option<StorageMetrics>,
}

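/// Coarse health classification for a storage backend.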
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatus {
    Healthy,
    Warning,
    Unhealthy,
    Unavailable,
}

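/// Optional performance metrics attached to a [`StorageHealth`] report.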
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageMetrics {
    pub average_search_time_ms: f64,
    pub average_insert_time_ms: f64,
    pub fragmentation_ratio: f32,
    pub cache_hit_rate: f32,
    pub memory_usage_bytes: u64,
    pub disk_usage_bytes: u64,
}

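/// In-memory [`DocumentStorage`] backed by a `HashMap` of chunks and a flat
/// list of `(chunk id, embedding)` pairs, searched by brute-force cosine
/// similarity. Nothing is persisted across restarts.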
pub struct InMemoryStorage {
    chunks: Arc<RwLock<HashMap<String, DocumentChunk>>>,
    vectors: VectorStore,
    stats: Arc<RwLock<StorageStats>>,
}

impl InMemoryStorage {
    pub fn new() -> Self {
        let now = chrono::Utc::now();
        Self {
            chunks: Arc::new(RwLock::new(HashMap::new())),
            vectors: Arc::new(RwLock::new(Vec::new())),
            stats: Arc::new(RwLock::new(StorageStats {
                total_documents: 0,
                total_chunks: 0,
                index_size_bytes: 0,
                last_updated: now,
                backend_type: "memory".to_string(),
                available_space_bytes: u64::MAX,
                used_space_bytes: 0,
            })),
        }
    }

    /// Cosine similarity between two vectors; returns 0.0 for mismatched
    /// lengths or zero-magnitude inputs.
    fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
        if a.len() != b.len() {
            return 0.0;
        }

        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

        if norm_a == 0.0 || norm_b == 0.0 {
            0.0
        } else {
            dot_product / (norm_a * norm_b)
        }
    }
}

impl Default for InMemoryStorage {
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait::async_trait]
impl DocumentStorage for InMemoryStorage {
    async fn store_chunks(&self, chunks: Vec<DocumentChunk>) -> Result<()> {
        let mut chunks_map = self.chunks.write().await;
        let mut vectors = self.vectors.write().await;
        let mut stats = self.stats.write().await;

        for chunk in chunks {
            chunks_map.insert(chunk.id.clone(), chunk.clone());

            vectors.push((chunk.id.clone(), chunk.embedding.clone()));

            stats.total_chunks += 1;
        }

        stats.last_updated = chrono::Utc::now();
        // Rough size estimate assuming 1536-dimensional f32 embeddings.
        stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
        stats.used_space_bytes = stats.index_size_bytes;

        Ok(())
    }

    async fn search_similar(
        &self,
        query_embedding: &[f32],
        top_k: usize,
    ) -> Result<Vec<DocumentChunk>> {
        let vectors = self.vectors.read().await;
        let chunks = self.chunks.read().await;

        let mut similarities: Vec<(String, f32)> = vectors
            .iter()
            .map(|(chunk_id, embedding)| {
                let similarity = self.cosine_similarity(query_embedding, embedding);
                (chunk_id.clone(), similarity)
            })
            .collect();

        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        let mut results = Vec::new();
        for (chunk_id, _) in similarities.iter().take(top_k) {
            if let Some(chunk) = chunks.get(chunk_id) {
                results.push(chunk.clone());
            }
        }

        Ok(results)
    }

    async fn search_with_params(
        &self,
        query_embedding: &[f32],
        params: SearchParams,
    ) -> Result<Vec<DocumentChunk>> {
        // Over-fetch candidates so the filters below still leave enough results.
        let mut results = self.search_similar(query_embedding, params.top_k * 2).await?;

        if let Some(document_filter) = &params.document_filter {
            results.retain(|chunk| chunk.document_id == *document_filter);
        }

        if let Some(metadata_filter) = &params.metadata_filter {
            results.retain(|chunk| {
                metadata_filter.iter().all(|(key, value)| {
                    chunk.get_metadata(key).map(|v| v == value).unwrap_or(false)
                })
            });
        }

        results.retain(|chunk| {
            let similarity = self.cosine_similarity(query_embedding, &chunk.embedding);
            similarity >= params.threshold
        });

        results.sort_by(|a, b| {
            let sim_a = self.cosine_similarity(query_embedding, &a.embedding);
            let sim_b = self.cosine_similarity(query_embedding, &b.embedding);
            sim_b.partial_cmp(&sim_a).unwrap_or(std::cmp::Ordering::Equal)
        });

        results.truncate(params.top_k);

        Ok(results)
    }

    async fn get_chunk(&self, chunk_id: &str) -> Result<Option<DocumentChunk>> {
        let chunks = self.chunks.read().await;
        Ok(chunks.get(chunk_id).cloned())
    }

    async fn delete_chunk(&self, chunk_id: &str) -> Result<bool> {
        let mut chunks = self.chunks.write().await;
        let mut vectors = self.vectors.write().await;
        let mut stats = self.stats.write().await;

        let chunk_removed = chunks.remove(chunk_id).is_some();
        vectors.retain(|(id, _)| id != chunk_id);

        if chunk_removed {
            stats.total_chunks = stats.total_chunks.saturating_sub(1);
            stats.last_updated = chrono::Utc::now();
            stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
            stats.used_space_bytes = stats.index_size_bytes;
        }

        Ok(chunk_removed)
    }

    async fn get_chunks_by_document(&self, document_id: &str) -> Result<Vec<DocumentChunk>> {
        let chunks = self.chunks.read().await;
        let results = chunks
            .values()
            .filter(|chunk| chunk.document_id == document_id)
            .cloned()
            .collect();
        Ok(results)
    }

    async fn delete_document(&self, document_id: &str) -> Result<usize> {
        let mut chunks = self.chunks.write().await;
        let mut vectors = self.vectors.write().await;
        let mut stats = self.stats.write().await;

        let initial_count = chunks.len();
        chunks.retain(|_, chunk| chunk.document_id != document_id);
        vectors.retain(|(id, _)| chunks.contains_key(id));

        let removed_count = initial_count - chunks.len();
        if removed_count > 0 {
            stats.total_chunks = stats.total_chunks.saturating_sub(removed_count);
            stats.last_updated = chrono::Utc::now();
            stats.index_size_bytes = (stats.total_chunks * 1536 * 4) as u64;
            stats.used_space_bytes = stats.index_size_bytes;
        }

        Ok(removed_count)
    }

    async fn get_stats(&self) -> Result<StorageStats> {
        let stats = self.stats.read().await;
        Ok(stats.clone())
    }

    async fn list_documents(&self) -> Result<Vec<String>> {
        let chunks = self.chunks.read().await;
        let documents: std::collections::HashSet<String> =
            chunks.values().map(|chunk| chunk.document_id.clone()).collect();
        Ok(documents.into_iter().collect())
    }

    async fn get_total_chunks(&self) -> Result<usize> {
        let stats = self.stats.read().await;
        Ok(stats.total_chunks)
    }

    async fn clear(&self) -> Result<()> {
        let mut chunks = self.chunks.write().await;
        let mut vectors = self.vectors.write().await;
        let mut stats = self.stats.write().await;

        chunks.clear();
        vectors.clear();

        stats.total_documents = 0;
        stats.total_chunks = 0;
        stats.index_size_bytes = 0;
        stats.used_space_bytes = 0;
        stats.last_updated = chrono::Utc::now();

        Ok(())
    }

    async fn optimize(&self) -> Result<()> {
        // Nothing to optimize for the in-memory backend.
        Ok(())
    }

    async fn create_backup(&self, _path: &str) -> Result<()> {
        // Backups are a no-op for the in-memory backend.
        Ok(())
    }

    async fn restore_backup(&self, _path: &str) -> Result<()> {
        // Restore is a no-op for the in-memory backend.
        Ok(())
    }

    async fn health_check(&self) -> Result<StorageHealth> {
        let chunks = self.chunks.read().await;
        let vectors = self.vectors.read().await;

        let mut details = HashMap::new();
        details.insert("chunk_count".to_string(), chunks.len().to_string());
        details.insert("vector_count".to_string(), vectors.len().to_string());
        details.insert("memory_usage".to_string(), "unknown".to_string());

        let status = if chunks.len() == vectors.len() {
            HealthStatus::Healthy
        } else {
            details.insert("error".to_string(), "Chunk/vector count mismatch".to_string());
            HealthStatus::Unhealthy
        };

        Ok(StorageHealth {
            status,
            checked_at: chrono::Utc::now(),
            details,
            metrics: None,
        })
    }
}

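/// Factory helpers for constructing [`DocumentStorage`] backends. Only the
/// in-memory backend is implemented here; the file, database, and
/// vector-database constructors currently return "not yet implemented" errors.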
pub struct StorageFactory;

impl StorageFactory {
    pub fn create_memory() -> Box<dyn DocumentStorage> {
        Box::new(InMemoryStorage::new())
    }

    pub fn create_file(_path: &str) -> Result<Box<dyn DocumentStorage>> {
        Err(crate::Error::generic("File storage not yet implemented"))
    }

    pub fn create_database(_connection_string: &str) -> Result<Box<dyn DocumentStorage>> {
        Err(crate::Error::generic("Database storage not yet implemented"))
    }

    pub fn create_vector_db(_config: HashMap<String, String>) -> Result<Box<dyn DocumentStorage>> {
        Err(crate::Error::generic("Vector database storage not yet implemented"))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_module_compiles() {}
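
    // Sketch tests added for the synchronous helpers defined in this module.
    // They intentionally avoid the async `DocumentStorage` methods, since
    // constructing a `DocumentChunk` depends on the engine module's API.
    #[test]
    fn vector_index_metadata_roundtrip() {
        let mut index = VectorIndex::new(
            "idx-1".to_string(),
            "test-index".to_string(),
            IndexType::default(),
            1536,
        );
        assert!(index.is_empty());

        index.add_metadata("source".to_string(), "unit-test".to_string());
        assert_eq!(index.get_metadata("source"), Some(&"unit-test".to_string()));
        assert_eq!(index.remove_metadata("source"), Some("unit-test".to_string()));
        assert_eq!(index.get_metadata("source"), None);

        index.update_vector_count(10);
        let stats = index.stats();
        assert_eq!(stats.vector_count, 10);
        // 10 vectors * 1536 dims * 4 bytes each, plus 1024 bytes of overhead.
        assert_eq!(stats.estimated_size_bytes, (10 * 1536 * 4 + 1024) as u64);
    }

    #[test]
    fn cosine_similarity_basic_properties() {
        let storage = InMemoryStorage::new();
        let a = vec![1.0_f32, 0.0, 0.0];
        let b = vec![0.0_f32, 1.0, 0.0];
        // Identical vectors score 1.0, orthogonal vectors 0.0, and
        // mismatched lengths fall back to 0.0 by design.
        assert!((storage.cosine_similarity(&a, &a) - 1.0).abs() < 1e-6);
        assert!(storage.cosine_similarity(&a, &b).abs() < 1e-6);
        assert_eq!(storage.cosine_similarity(&a, &[1.0, 0.0]), 0.0);
    }

    #[test]
    fn search_params_defaults() {
        let params = SearchParams::default();
        assert_eq!(params.top_k, 10);
        assert!((params.threshold - 0.7).abs() < f32::EPSILON);
        assert!(params.include_metadata);
        assert!(matches!(params.search_method, SearchMethod::Cosine));
    }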
}