1use anyhow::{Context, Result};
33use async_trait::async_trait;
34use chrono::{DateTime, Utc};
35use serde::{Deserialize, Serialize};
36use std::collections::HashMap;
37use std::fs::{self, File};
38use std::io::{BufReader, BufWriter};
39use std::path::{Path, PathBuf};
40use std::sync::RwLock;
41use std::time::Instant;
42
43use super::{
44 cosine_similarity, euclidean_distance, DeleteStats, DistanceMetric, EmbeddedDocument, Filter,
45 HealthStatus, SearchResult, UpsertStats, VectorStore,
46};
47
/// Persisted header describing the contents of the store file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StoreMetadata {
    // Serialization format version stamp; written as 1 for new stores and
    // logged on load.
    version: u32,
    // When the store was first created.
    created_at: DateTime<Utc>,
    // Last time the store was persisted to disk.
    updated_at: DateTime<Utc>,
    // Number of documents at the last persist.
    document_count: usize,
    // Embedding dimensionality; `None` until the first document is inserted.
    dimensions: Option<usize>,
}
62
/// The complete payload serialized to disk (via bincode): metadata, every
/// document, and the distance metric the store was created with.
#[derive(Serialize, Deserialize)]
struct FileStoreData {
    metadata: StoreMetadata,
    // Documents keyed by their id.
    documents: HashMap<String, EmbeddedDocument>,
    // Metric used for scoring searches; set once at store creation.
    distance_metric: DistanceMetric,
}
73
/// A simple file-backed vector store: all documents are held in memory
/// behind an `RwLock` and the whole store is rewritten to a single bincode
/// file after every mutation.
pub struct FileVectorStore {
    // In-memory copy of the store; the source of truth between persists.
    data: RwLock<FileStoreData>,
    // Path of the bincode file the store is persisted to.
    file_path: PathBuf,
}
84
85impl FileVectorStore {
86 pub fn new(config: FileConfig) -> Result<Self> {
91 let file_path = config.storage_path();
92
93 if let Some(parent) = file_path.parent() {
95 fs::create_dir_all(parent)
96 .with_context(|| format!("Failed to create directory: {}", parent.display()))?;
97 }
98
99 let data = if file_path.exists() {
101 tracing::info!("Loading vector store from {}", file_path.display());
102 Self::load_from_disk(&file_path)?
103 } else {
104 tracing::info!("Creating new vector store at {}", file_path.display());
105 FileStoreData {
106 metadata: StoreMetadata {
107 version: 1,
108 created_at: Utc::now(),
109 updated_at: Utc::now(),
110 document_count: 0,
111 dimensions: None,
112 },
113 documents: HashMap::new(),
114 distance_metric: config.distance_metric,
115 }
116 };
117
118 Ok(Self {
119 data: RwLock::new(data),
120 file_path,
121 })
122 }
123
124 fn load_from_disk(path: &Path) -> Result<FileStoreData> {
126 let file = File::open(path)
127 .with_context(|| format!("Failed to open vector store file: {}", path.display()))?;
128 let reader = BufReader::new(file);
129 let data: FileStoreData = bincode::deserialize_from(reader)
130 .context("Failed to deserialize vector store")?;
131
132 tracing::info!(
133 "Loaded {} documents from vector store (version {})",
134 data.documents.len(),
135 data.metadata.version
136 );
137
138 Ok(data)
139 }
140
141 fn save_to_disk(&self) -> Result<()> {
146 let data = self.data.read().unwrap();
147
148 let temp_path = self.file_path.with_extension("tmp");
150 let file = File::create(&temp_path)
151 .with_context(|| format!("Failed to create temp file: {}", temp_path.display()))?;
152 let writer = BufWriter::new(file);
153
154 bincode::serialize_into(writer, &*data).context("Failed to serialize vector store")?;
155
156 fs::rename(&temp_path, &self.file_path).with_context(|| {
158 format!(
159 "Failed to rename {} to {}",
160 temp_path.display(),
161 self.file_path.display()
162 )
163 })?;
164
165 tracing::debug!(
166 "Persisted vector store with {} documents to {}",
167 data.documents.len(),
168 self.file_path.display()
169 );
170
171 Ok(())
172 }
173
174 fn persist(&self) -> Result<()> {
178 {
180 let mut data = self.data.write().unwrap();
181 data.metadata.updated_at = Utc::now();
182 data.metadata.document_count = data.documents.len();
183 if let Some(first_doc) = data.documents.values().next() {
185 let dims = first_doc.embedding.len();
186 if data.metadata.dimensions != Some(dims) {
187 data.metadata.dimensions = Some(dims);
188 }
189 }
190 }
191
192 self.save_to_disk()
193 }
194
195 fn calculate_score(&self, embedding_a: &[f32], embedding_b: &[f32]) -> f32 {
197 let data = self.data.read().unwrap();
198 match data.distance_metric {
199 DistanceMetric::Cosine => {
200 let similarity = cosine_similarity(embedding_a, embedding_b);
202 (similarity + 1.0) / 2.0 }
204 DistanceMetric::Euclidean => {
205 let distance = euclidean_distance(embedding_a, embedding_b);
207 1.0 / (1.0 + distance) }
209 DistanceMetric::DotProduct => {
210 embedding_a
212 .iter()
213 .zip(embedding_b.iter())
214 .map(|(a, b)| a * b)
215 .sum::<f32>()
216 .max(0.0) .min(1.0)
218 }
219 }
220 }
221}
222
223#[async_trait]
224impl VectorStore for FileVectorStore {
225 async fn upsert(&self, documents: Vec<EmbeddedDocument>) -> Result<UpsertStats> {
226 let start = Instant::now();
227 let mut inserted = 0;
228 let mut updated = 0;
229
230 {
231 let mut data = self.data.write().unwrap();
232
233 if data.metadata.dimensions.is_none() && !documents.is_empty() {
235 data.metadata.dimensions = Some(documents[0].embedding.len());
236 }
237
238 for doc in documents {
239 if let Some(expected_dims) = data.metadata.dimensions {
241 if doc.embedding.len() != expected_dims {
242 anyhow::bail!(
243 "Document {} has {} dimensions, expected {}",
244 doc.id,
245 doc.embedding.len(),
246 expected_dims
247 );
248 }
249 }
250
251 if data.documents.contains_key(&doc.id) {
253 updated += 1;
254 } else {
255 inserted += 1;
256 }
257
258 data.documents.insert(doc.id.clone(), doc);
259 }
260 }
261
262 self.persist()?;
264
265 let duration_ms = start.elapsed().as_millis() as u64;
266
267 tracing::debug!(
268 "Upserted {} documents ({} inserted, {} updated) in {}ms",
269 inserted + updated,
270 inserted,
271 updated,
272 duration_ms
273 );
274
275 Ok(UpsertStats::new(inserted, updated, duration_ms))
276 }
277
278 async fn search(
279 &self,
280 query_embedding: Vec<f32>,
281 filter: Option<Filter>,
282 top_k: usize,
283 ) -> Result<Vec<SearchResult>> {
284 let data = self.data.read().unwrap();
285
286 let mut scored_results: Vec<(String, f32, &EmbeddedDocument)> = data
288 .documents
289 .iter()
290 .filter_map(|(id, doc)| {
291 if let Some(ref f) = filter {
293 if !f.matches(&doc.metadata) {
294 return None;
295 }
296 }
297
298 let score = self.calculate_score(&query_embedding, &doc.embedding);
300
301 if let Some(ref f) = filter {
303 if let Some(min_score) = f.min_score {
304 if score < min_score {
305 return None;
306 }
307 }
308 }
309
310 Some((id.clone(), score, doc))
311 })
312 .collect();
313
314 scored_results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
316
317 let results: Vec<SearchResult> = scored_results
319 .into_iter()
320 .take(top_k)
321 .map(|(id, score, doc)| SearchResult {
322 id,
323 score,
324 metadata: doc.metadata.clone(),
325 content: doc.content.clone(),
326 embedding: None, })
328 .collect();
329
330 tracing::debug!(
331 "Search completed: {} results out of {} documents",
332 results.len(),
333 data.documents.len()
334 );
335
336 Ok(results)
337 }
338
339 async fn delete(&self, ids: Vec<String>) -> Result<DeleteStats> {
340 let start = Instant::now();
341 let mut deleted = 0;
342 let mut not_found = 0;
343
344 {
345 let mut data = self.data.write().unwrap();
346 for id in &ids {
347 if data.documents.remove(id).is_some() {
348 deleted += 1;
349 } else {
350 not_found += 1;
351 }
352 }
353 }
354
355 if deleted > 0 {
357 self.persist()?;
358 }
359
360 let duration_ms = start.elapsed().as_millis() as u64;
361
362 tracing::debug!(
363 "Deleted {} documents ({} not found) in {}ms",
364 deleted,
365 not_found,
366 duration_ms
367 );
368
369 Ok(DeleteStats::new(deleted, not_found, duration_ms))
370 }
371
372 async fn get(&self, ids: Vec<String>) -> Result<Vec<EmbeddedDocument>> {
373 let data = self.data.read().unwrap();
374 let docs: Vec<EmbeddedDocument> = ids
375 .iter()
376 .filter_map(|id| data.documents.get(id).cloned())
377 .collect();
378
379 Ok(docs)
380 }
381
382 async fn count(&self, filter: Option<Filter>) -> Result<usize> {
383 let data = self.data.read().unwrap();
384
385 if let Some(f) = filter {
386 let count = data
388 .documents
389 .values()
390 .filter(|doc| f.matches(&doc.metadata))
391 .count();
392 Ok(count)
393 } else {
394 Ok(data.documents.len())
396 }
397 }
398
399 async fn health_check(&self) -> Result<HealthStatus> {
400 let start = Instant::now();
401
402 let count = {
404 let data = self.data.read().unwrap();
405 data.documents.len()
406 };
407
408 let file_exists = self.file_path.exists();
410 let latency_ms = start.elapsed().as_millis() as u64;
411
412 if file_exists {
413 Ok(HealthStatus::healthy("file", latency_ms).with_document_count(count))
414 } else {
415 Ok(HealthStatus::unhealthy(
416 "file",
417 format!("Store file not found: {}", self.file_path.display()),
418 latency_ms,
419 ))
420 }
421 }
422
423 fn backend_name(&self) -> &'static str {
424 "file"
425 }
426
427 fn dimensions(&self) -> Option<usize> {
428 let data = self.data.read().unwrap();
429 data.metadata.dimensions
430 }
431}
432
/// Configuration for `FileVectorStore`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileConfig {
    // Despite the name, this holds the full path of the store FILE (see
    // `storage_path`, which returns it verbatim). `None` falls back to
    // `~/.skill-engine/vectors/store.bin`.
    pub storage_dir: Option<PathBuf>,
    // Distance metric used to score search results.
    pub distance_metric: DistanceMetric,
}
441
442impl FileConfig {
443 pub fn storage_path(&self) -> PathBuf {
445 self.storage_dir.clone().unwrap_or_else(|| {
446 let home = dirs::home_dir().expect("Could not determine home directory");
447 home.join(".skill-engine/vectors/store.bin")
448 })
449 }
450
451 pub fn with_storage_path(mut self, path: PathBuf) -> Self {
453 self.storage_dir = Some(path);
454 self
455 }
456
457 pub fn with_distance_metric(mut self, metric: DistanceMetric) -> Self {
459 self.distance_metric = metric;
460 self
461 }
462}
463
464impl Default for FileConfig {
465 fn default() -> Self {
466 Self {
467 storage_dir: None,
468 distance_metric: DistanceMetric::Cosine,
469 }
470 }
471}
472
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // Round-trip: documents upserted by one instance must be readable by a
    // fresh instance created over the same file.
    #[tokio::test]
    async fn test_file_vector_store_persistence() {
        let temp_dir = tempdir().unwrap();
        let storage_path = temp_dir.path().join("test_store.bin");

        let config = FileConfig::default().with_storage_path(storage_path.clone());

        let store = FileVectorStore::new(config.clone()).unwrap();

        let docs = vec![
            EmbeddedDocument::new("doc1", vec![0.1, 0.2, 0.3])
                .with_skill_name("test")
                .with_content("Test document 1"),
            EmbeddedDocument::new("doc2", vec![0.4, 0.5, 0.6])
                .with_skill_name("test")
                .with_content("Test document 2"),
        ];

        store.upsert(docs).await.unwrap();

        assert_eq!(store.count(None).await.unwrap(), 2);

        // Drop the first store so the second instance re-reads from disk.
        drop(store);

        let store2 = FileVectorStore::new(config).unwrap();
        assert_eq!(store2.count(None).await.unwrap(), 2);

        // get() preserves the order of the requested ids.
        let loaded_docs = store2.get(vec!["doc1".to_string(), "doc2".to_string()]).await.unwrap();
        assert_eq!(loaded_docs.len(), 2);
        assert_eq!(loaded_docs[0].id, "doc1");
        assert_eq!(loaded_docs[0].embedding, vec![0.1, 0.2, 0.3]);
    }

    // Results must come back ordered by descending similarity to the query.
    #[tokio::test]
    async fn test_file_vector_store_search() {
        let temp_dir = tempdir().unwrap();
        let config = FileConfig::default().with_storage_path(temp_dir.path().join("search_test.bin"));

        let store = FileVectorStore::new(config).unwrap();

        let docs = vec![
            EmbeddedDocument::new("doc1", vec![1.0, 0.0, 0.0])
                .with_skill_name("skill1")
                .with_content("Document 1"),
            EmbeddedDocument::new("doc2", vec![0.0, 1.0, 0.0])
                .with_skill_name("skill2")
                .with_content("Document 2"),
            EmbeddedDocument::new("doc3", vec![0.9, 0.1, 0.0])
                .with_skill_name("skill1")
                .with_content("Document 3"),
        ];

        store.upsert(docs).await.unwrap();

        // Query along the x-axis: doc1 is an exact match, doc3 is close,
        // doc2 is orthogonal and must be cut by top_k = 2.
        let results = store
            .search(vec![1.0, 0.0, 0.0], None, 2)
            .await
            .unwrap();

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].id, "doc1"); assert_eq!(results[1].id, "doc3"); assert!(results[0].score > results[1].score);
    }

    // A skill filter must exclude non-matching documents from results.
    #[tokio::test]
    async fn test_file_vector_store_filter() {
        let temp_dir = tempdir().unwrap();
        let config = FileConfig::default().with_storage_path(temp_dir.path().join("filter_test.bin"));

        let store = FileVectorStore::new(config).unwrap();

        let docs = vec![
            EmbeddedDocument::new("doc1", vec![1.0, 0.0])
                .with_skill_name("skill1")
                .with_content("Document 1"),
            EmbeddedDocument::new("doc2", vec![0.9, 0.1])
                .with_skill_name("skill2")
                .with_content("Document 2"),
            EmbeddedDocument::new("doc3", vec![0.8, 0.2])
                .with_skill_name("skill1")
                .with_content("Document 3"),
        ];

        store.upsert(docs).await.unwrap();

        // doc2 belongs to skill2 and must be filtered out despite its score.
        let filter = Filter::new().skill("skill1");
        let results = store
            .search(vec![1.0, 0.0], Some(filter), 10)
            .await
            .unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().all(|r| r.metadata.skill_name.as_deref() == Some("skill1")));
    }

    // delete() reports found vs. missing ids separately.
    #[tokio::test]
    async fn test_file_vector_store_delete() {
        let temp_dir = tempdir().unwrap();
        let config = FileConfig::default().with_storage_path(temp_dir.path().join("delete_test.bin"));

        let store = FileVectorStore::new(config).unwrap();

        let docs = vec![
            EmbeddedDocument::new("doc1", vec![1.0, 0.0]),
            EmbeddedDocument::new("doc2", vec![0.0, 1.0]),
        ];

        store.upsert(docs).await.unwrap();
        assert_eq!(store.count(None).await.unwrap(), 2);

        // Existing id: counted as deleted and removed from the store.
        let stats = store.delete(vec!["doc1".to_string()]).await.unwrap();
        assert_eq!(stats.deleted, 1);
        assert_eq!(stats.not_found, 0);
        assert_eq!(store.count(None).await.unwrap(), 1);

        // Unknown id: counted as not_found, no error.
        let stats = store.delete(vec!["doc3".to_string()]).await.unwrap();
        assert_eq!(stats.deleted, 0);
        assert_eq!(stats.not_found, 1);
    }

    // Health check should be healthy once the store file has been written
    // (the first upsert persists it) and report the document count.
    #[tokio::test]
    async fn test_file_vector_store_health_check() {
        let temp_dir = tempdir().unwrap();
        let config = FileConfig::default().with_storage_path(temp_dir.path().join("health_test.bin"));

        let store = FileVectorStore::new(config).unwrap();

        store
            .upsert(vec![EmbeddedDocument::new("doc1", vec![1.0, 0.0])])
            .await
            .unwrap();

        let health = store.health_check().await.unwrap();
        assert!(health.healthy);
        assert_eq!(health.backend, "file");
        assert_eq!(health.document_count, Some(1));
    }
}