skill_runtime/vector_store/
file.rs

1//! File-based vector store with persistence to local disk
2//!
3//! Stores vectors in bincode format at ~/.skill-engine/vectors/store.bin
4//! Provides atomic writes and automatic persistence.
5//!
6//! # Features
7//!
8//! - **Persistent storage**: Vectors survive server restarts
9//! - **Atomic writes**: Uses temp file + rename for safe persistence
10//! - **Lazy loading**: Loads from disk on first access
11//! - **Auto-save**: Persists after each modification
12//! - **Simple and fast**: Binary serialization with bincode
13//!
14//! # Performance
15//!
16//! - Write latency: ~5-20ms for 1000 documents
17//! - Search: O(n) linear scan (acceptable for <10k documents)
18//! - File size: ~4-8 bytes per dimension per document
19//!
20//! # Example
21//!
22//! ```ignore
23//! use skill_runtime::vector_store::{FileVectorStore, FileConfig};
24//!
25//! let config = FileConfig::default(); // Uses ~/.skill-engine/vectors/store.bin
26//! let store = FileVectorStore::new(config)?;
27//!
28//! // Data persists to disk automatically
29//! store.upsert(documents).await?;
30//! ```
31
32use anyhow::{Context, Result};
33use async_trait::async_trait;
34use chrono::{DateTime, Utc};
35use serde::{Deserialize, Serialize};
36use std::collections::HashMap;
37use std::fs::{self, File};
38use std::io::{BufReader, BufWriter};
39use std::path::{Path, PathBuf};
40use std::sync::RwLock;
41use std::time::Instant;
42
43use super::{
44    cosine_similarity, euclidean_distance, DeleteStats, DistanceMetric, EmbeddedDocument, Filter,
45    HealthStatus, SearchResult, UpsertStats, VectorStore,
46};
47
48/// Metadata about the vector store file
49#[derive(Debug, Clone, Serialize, Deserialize)]
50struct StoreMetadata {
51    /// Format version for forward compatibility
52    version: u32,
53    /// When the store was first created
54    created_at: DateTime<Utc>,
55    /// Last modification time
56    updated_at: DateTime<Utc>,
57    /// Number of documents currently stored
58    document_count: usize,
59    /// Embedding dimensions (if known)
60    dimensions: Option<usize>,
61}
62
63/// Serializable container for the vector store
64#[derive(Serialize, Deserialize)]
65struct FileStoreData {
66    /// Store metadata
67    metadata: StoreMetadata,
68    /// Documents indexed by ID
69    documents: HashMap<String, EmbeddedDocument>,
70    /// Distance metric for similarity calculation
71    distance_metric: DistanceMetric,
72}
73
74/// File-based vector store with automatic persistence
75///
76/// This implementation provides persistent vector storage using local files.
77/// Data is serialized with bincode for performance and compactness.
78pub struct FileVectorStore {
79    /// The store data (protected by RwLock for thread safety)
80    data: RwLock<FileStoreData>,
81    /// Path to the storage file
82    file_path: PathBuf,
83}
84
85impl FileVectorStore {
86    /// Create new file-based vector store
87    ///
88    /// If the file exists, loads data from disk. Otherwise, creates a new empty store.
89    /// The parent directory will be created if it doesn't exist.
90    pub fn new(config: FileConfig) -> Result<Self> {
91        let file_path = config.storage_path();
92
93        // Create directory if needed
94        if let Some(parent) = file_path.parent() {
95            fs::create_dir_all(parent)
96                .with_context(|| format!("Failed to create directory: {}", parent.display()))?;
97        }
98
99        // Load existing data or create new
100        let data = if file_path.exists() {
101            tracing::info!("Loading vector store from {}", file_path.display());
102            Self::load_from_disk(&file_path)?
103        } else {
104            tracing::info!("Creating new vector store at {}", file_path.display());
105            FileStoreData {
106                metadata: StoreMetadata {
107                    version: 1,
108                    created_at: Utc::now(),
109                    updated_at: Utc::now(),
110                    document_count: 0,
111                    dimensions: None,
112                },
113                documents: HashMap::new(),
114                distance_metric: config.distance_metric,
115            }
116        };
117
118        Ok(Self {
119            data: RwLock::new(data),
120            file_path,
121        })
122    }
123
124    /// Load store data from disk
125    fn load_from_disk(path: &Path) -> Result<FileStoreData> {
126        let file = File::open(path)
127            .with_context(|| format!("Failed to open vector store file: {}", path.display()))?;
128        let reader = BufReader::new(file);
129        let data: FileStoreData = bincode::deserialize_from(reader)
130            .context("Failed to deserialize vector store")?;
131
132        tracing::info!(
133            "Loaded {} documents from vector store (version {})",
134            data.documents.len(),
135            data.metadata.version
136        );
137
138        Ok(data)
139    }
140
141    /// Save store data to disk atomically
142    ///
143    /// Writes to a temporary file first, then renames it to the target path.
144    /// This ensures the store is never left in a corrupted state.
145    fn save_to_disk(&self) -> Result<()> {
146        let data = self.data.read().unwrap();
147
148        // Write to temp file first (atomic operation)
149        let temp_path = self.file_path.with_extension("tmp");
150        let file = File::create(&temp_path)
151            .with_context(|| format!("Failed to create temp file: {}", temp_path.display()))?;
152        let writer = BufWriter::new(file);
153
154        bincode::serialize_into(writer, &*data).context("Failed to serialize vector store")?;
155
156        // Rename temp file to actual file (atomic on Unix)
157        fs::rename(&temp_path, &self.file_path).with_context(|| {
158            format!(
159                "Failed to rename {} to {}",
160                temp_path.display(),
161                self.file_path.display()
162            )
163        })?;
164
165        tracing::debug!(
166            "Persisted vector store with {} documents to {}",
167            data.documents.len(),
168            self.file_path.display()
169        );
170
171        Ok(())
172    }
173
174    /// Auto-persist after modification
175    ///
176    /// Updates metadata and saves to disk.
177    fn persist(&self) -> Result<()> {
178        // Update metadata
179        {
180            let mut data = self.data.write().unwrap();
181            data.metadata.updated_at = Utc::now();
182            data.metadata.document_count = data.documents.len();
183            // Update dimensions if they changed
184            if let Some(first_doc) = data.documents.values().next() {
185                let dims = first_doc.embedding.len();
186                if data.metadata.dimensions != Some(dims) {
187                    data.metadata.dimensions = Some(dims);
188                }
189            }
190        }
191
192        self.save_to_disk()
193    }
194
195    /// Calculate similarity score between two embeddings
196    fn calculate_score(&self, embedding_a: &[f32], embedding_b: &[f32]) -> f32 {
197        let data = self.data.read().unwrap();
198        match data.distance_metric {
199            DistanceMetric::Cosine => {
200                // Convert cosine similarity to score (0-1 range, higher is better)
201                let similarity = cosine_similarity(embedding_a, embedding_b);
202                (similarity + 1.0) / 2.0 // Map [-1, 1] to [0, 1]
203            }
204            DistanceMetric::Euclidean => {
205                // Convert euclidean distance to score (higher is better)
206                let distance = euclidean_distance(embedding_a, embedding_b);
207                1.0 / (1.0 + distance) // Closer distances = higher scores
208            }
209            DistanceMetric::DotProduct => {
210                // Dot product (assumes normalized vectors)
211                embedding_a
212                    .iter()
213                    .zip(embedding_b.iter())
214                    .map(|(a, b)| a * b)
215                    .sum::<f32>()
216                    .max(0.0) // Clamp to 0-1 for score
217                    .min(1.0)
218            }
219        }
220    }
221}
222
223#[async_trait]
224impl VectorStore for FileVectorStore {
225    async fn upsert(&self, documents: Vec<EmbeddedDocument>) -> Result<UpsertStats> {
226        let start = Instant::now();
227        let mut inserted = 0;
228        let mut updated = 0;
229
230        {
231            let mut data = self.data.write().unwrap();
232
233            // Set dimensions from first document if not set
234            if data.metadata.dimensions.is_none() && !documents.is_empty() {
235                data.metadata.dimensions = Some(documents[0].embedding.len());
236            }
237
238            for doc in documents {
239                // Validate dimensions match
240                if let Some(expected_dims) = data.metadata.dimensions {
241                    if doc.embedding.len() != expected_dims {
242                        anyhow::bail!(
243                            "Document {} has {} dimensions, expected {}",
244                            doc.id,
245                            doc.embedding.len(),
246                            expected_dims
247                        );
248                    }
249                }
250
251                // Track insert vs update
252                if data.documents.contains_key(&doc.id) {
253                    updated += 1;
254                } else {
255                    inserted += 1;
256                }
257
258                data.documents.insert(doc.id.clone(), doc);
259            }
260        }
261
262        // Persist to disk
263        self.persist()?;
264
265        let duration_ms = start.elapsed().as_millis() as u64;
266
267        tracing::debug!(
268            "Upserted {} documents ({} inserted, {} updated) in {}ms",
269            inserted + updated,
270            inserted,
271            updated,
272            duration_ms
273        );
274
275        Ok(UpsertStats::new(inserted, updated, duration_ms))
276    }
277
278    async fn search(
279        &self,
280        query_embedding: Vec<f32>,
281        filter: Option<Filter>,
282        top_k: usize,
283    ) -> Result<Vec<SearchResult>> {
284        let data = self.data.read().unwrap();
285
286        // Calculate scores for all documents
287        let mut scored_results: Vec<(String, f32, &EmbeddedDocument)> = data
288            .documents
289            .iter()
290            .filter_map(|(id, doc)| {
291                // Apply metadata filter if provided
292                if let Some(ref f) = filter {
293                    if !f.matches(&doc.metadata) {
294                        return None;
295                    }
296                }
297
298                // Calculate similarity score
299                let score = self.calculate_score(&query_embedding, &doc.embedding);
300
301                // Apply minimum score filter if provided
302                if let Some(ref f) = filter {
303                    if let Some(min_score) = f.min_score {
304                        if score < min_score {
305                            return None;
306                        }
307                    }
308                }
309
310                Some((id.clone(), score, doc))
311            })
312            .collect();
313
314        // Sort by score (descending - higher scores first)
315        scored_results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
316
317        // Take top_k and convert to SearchResult
318        let results: Vec<SearchResult> = scored_results
319            .into_iter()
320            .take(top_k)
321            .map(|(id, score, doc)| SearchResult {
322                id,
323                score,
324                metadata: doc.metadata.clone(),
325                content: doc.content.clone(),
326                embedding: None, // Don't include embedding in results for efficiency
327            })
328            .collect();
329
330        tracing::debug!(
331            "Search completed: {} results out of {} documents",
332            results.len(),
333            data.documents.len()
334        );
335
336        Ok(results)
337    }
338
339    async fn delete(&self, ids: Vec<String>) -> Result<DeleteStats> {
340        let start = Instant::now();
341        let mut deleted = 0;
342        let mut not_found = 0;
343
344        {
345            let mut data = self.data.write().unwrap();
346            for id in &ids {
347                if data.documents.remove(id).is_some() {
348                    deleted += 1;
349                } else {
350                    not_found += 1;
351                }
352            }
353        }
354
355        // Persist to disk
356        if deleted > 0 {
357            self.persist()?;
358        }
359
360        let duration_ms = start.elapsed().as_millis() as u64;
361
362        tracing::debug!(
363            "Deleted {} documents ({} not found) in {}ms",
364            deleted,
365            not_found,
366            duration_ms
367        );
368
369        Ok(DeleteStats::new(deleted, not_found, duration_ms))
370    }
371
372    async fn get(&self, ids: Vec<String>) -> Result<Vec<EmbeddedDocument>> {
373        let data = self.data.read().unwrap();
374        let docs: Vec<EmbeddedDocument> = ids
375            .iter()
376            .filter_map(|id| data.documents.get(id).cloned())
377            .collect();
378
379        Ok(docs)
380    }
381
382    async fn count(&self, filter: Option<Filter>) -> Result<usize> {
383        let data = self.data.read().unwrap();
384
385        if let Some(f) = filter {
386            // Count with filter
387            let count = data
388                .documents
389                .values()
390                .filter(|doc| f.matches(&doc.metadata))
391                .count();
392            Ok(count)
393        } else {
394            // Total count
395            Ok(data.documents.len())
396        }
397    }
398
399    async fn health_check(&self) -> Result<HealthStatus> {
400        let start = Instant::now();
401
402        // Check if we can read the data
403        let count = {
404            let data = self.data.read().unwrap();
405            data.documents.len()
406        };
407
408        // Check if file exists and is readable
409        let file_exists = self.file_path.exists();
410        let latency_ms = start.elapsed().as_millis() as u64;
411
412        if file_exists {
413            Ok(HealthStatus::healthy("file", latency_ms).with_document_count(count))
414        } else {
415            Ok(HealthStatus::unhealthy(
416                "file",
417                format!("Store file not found: {}", self.file_path.display()),
418                latency_ms,
419            ))
420        }
421    }
422
423    fn backend_name(&self) -> &'static str {
424        "file"
425    }
426
427    fn dimensions(&self) -> Option<usize> {
428        let data = self.data.read().unwrap();
429        data.metadata.dimensions
430    }
431}
432
433/// Configuration for file-based vector store
434#[derive(Debug, Clone, Serialize, Deserialize)]
435pub struct FileConfig {
436    /// Custom storage directory (if None, uses default ~/.skill-engine/vectors/store.bin)
437    pub storage_dir: Option<PathBuf>,
438    /// Distance metric for similarity calculation
439    pub distance_metric: DistanceMetric,
440}
441
442impl FileConfig {
443    /// Get the storage path, defaulting to ~/.skill-engine/vectors/store.bin
444    pub fn storage_path(&self) -> PathBuf {
445        self.storage_dir.clone().unwrap_or_else(|| {
446            let home = dirs::home_dir().expect("Could not determine home directory");
447            home.join(".skill-engine/vectors/store.bin")
448        })
449    }
450
451    /// Create config with custom storage path
452    pub fn with_storage_path(mut self, path: PathBuf) -> Self {
453        self.storage_dir = Some(path);
454        self
455    }
456
457    /// Create config with custom distance metric
458    pub fn with_distance_metric(mut self, metric: DistanceMetric) -> Self {
459        self.distance_metric = metric;
460        self
461    }
462}
463
464impl Default for FileConfig {
465    fn default() -> Self {
466        Self {
467            storage_dir: None,
468            distance_metric: DistanceMetric::Cosine,
469        }
470    }
471}
472
473#[cfg(test)]
474mod tests {
475    use super::*;
476    use tempfile::tempdir;
477
478    #[tokio::test]
479    async fn test_file_vector_store_persistence() {
480        let temp_dir = tempdir().unwrap();
481        let storage_path = temp_dir.path().join("test_store.bin");
482
483        let config = FileConfig::default().with_storage_path(storage_path.clone());
484
485        // Create store and add documents
486        let store = FileVectorStore::new(config.clone()).unwrap();
487
488        let docs = vec![
489            EmbeddedDocument::new("doc1", vec![0.1, 0.2, 0.3])
490                .with_skill_name("test")
491                .with_content("Test document 1"),
492            EmbeddedDocument::new("doc2", vec![0.4, 0.5, 0.6])
493                .with_skill_name("test")
494                .with_content("Test document 2"),
495        ];
496
497        store.upsert(docs).await.unwrap();
498
499        // Verify count
500        assert_eq!(store.count(None).await.unwrap(), 2);
501
502        // Drop store (simulating server restart)
503        drop(store);
504
505        // Create new store - should load persisted data
506        let store2 = FileVectorStore::new(config).unwrap();
507        assert_eq!(store2.count(None).await.unwrap(), 2);
508
509        // Verify documents are intact
510        let loaded_docs = store2.get(vec!["doc1".to_string(), "doc2".to_string()]).await.unwrap();
511        assert_eq!(loaded_docs.len(), 2);
512        assert_eq!(loaded_docs[0].id, "doc1");
513        assert_eq!(loaded_docs[0].embedding, vec![0.1, 0.2, 0.3]);
514    }
515
516    #[tokio::test]
517    async fn test_file_vector_store_search() {
518        let temp_dir = tempdir().unwrap();
519        let config = FileConfig::default().with_storage_path(temp_dir.path().join("search_test.bin"));
520
521        let store = FileVectorStore::new(config).unwrap();
522
523        let docs = vec![
524            EmbeddedDocument::new("doc1", vec![1.0, 0.0, 0.0])
525                .with_skill_name("skill1")
526                .with_content("Document 1"),
527            EmbeddedDocument::new("doc2", vec![0.0, 1.0, 0.0])
528                .with_skill_name("skill2")
529                .with_content("Document 2"),
530            EmbeddedDocument::new("doc3", vec![0.9, 0.1, 0.0])
531                .with_skill_name("skill1")
532                .with_content("Document 3"),
533        ];
534
535        store.upsert(docs).await.unwrap();
536
537        // Search for vectors similar to [1, 0, 0]
538        let results = store
539            .search(vec![1.0, 0.0, 0.0], None, 2)
540            .await
541            .unwrap();
542
543        assert_eq!(results.len(), 2);
544        assert_eq!(results[0].id, "doc1"); // Exact match should be first
545        assert_eq!(results[1].id, "doc3"); // Similar vector should be second
546        assert!(results[0].score > results[1].score);
547    }
548
549    #[tokio::test]
550    async fn test_file_vector_store_filter() {
551        let temp_dir = tempdir().unwrap();
552        let config = FileConfig::default().with_storage_path(temp_dir.path().join("filter_test.bin"));
553
554        let store = FileVectorStore::new(config).unwrap();
555
556        let docs = vec![
557            EmbeddedDocument::new("doc1", vec![1.0, 0.0])
558                .with_skill_name("skill1")
559                .with_content("Document 1"),
560            EmbeddedDocument::new("doc2", vec![0.9, 0.1])
561                .with_skill_name("skill2")
562                .with_content("Document 2"),
563            EmbeddedDocument::new("doc3", vec![0.8, 0.2])
564                .with_skill_name("skill1")
565                .with_content("Document 3"),
566        ];
567
568        store.upsert(docs).await.unwrap();
569
570        // Search with filter
571        let filter = Filter::new().skill("skill1");
572        let results = store
573            .search(vec![1.0, 0.0], Some(filter), 10)
574            .await
575            .unwrap();
576
577        assert_eq!(results.len(), 2);
578        assert!(results.iter().all(|r| r.metadata.skill_name.as_deref() == Some("skill1")));
579    }
580
581    #[tokio::test]
582    async fn test_file_vector_store_delete() {
583        let temp_dir = tempdir().unwrap();
584        let config = FileConfig::default().with_storage_path(temp_dir.path().join("delete_test.bin"));
585
586        let store = FileVectorStore::new(config).unwrap();
587
588        let docs = vec![
589            EmbeddedDocument::new("doc1", vec![1.0, 0.0]),
590            EmbeddedDocument::new("doc2", vec![0.0, 1.0]),
591        ];
592
593        store.upsert(docs).await.unwrap();
594        assert_eq!(store.count(None).await.unwrap(), 2);
595
596        // Delete one document
597        let stats = store.delete(vec!["doc1".to_string()]).await.unwrap();
598        assert_eq!(stats.deleted, 1);
599        assert_eq!(stats.not_found, 0);
600        assert_eq!(store.count(None).await.unwrap(), 1);
601
602        // Try to delete non-existent document
603        let stats = store.delete(vec!["doc3".to_string()]).await.unwrap();
604        assert_eq!(stats.deleted, 0);
605        assert_eq!(stats.not_found, 1);
606    }
607
608    #[tokio::test]
609    async fn test_file_vector_store_health_check() {
610        let temp_dir = tempdir().unwrap();
611        let config = FileConfig::default().with_storage_path(temp_dir.path().join("health_test.bin"));
612
613        let store = FileVectorStore::new(config).unwrap();
614
615        // Add some documents
616        store
617            .upsert(vec![EmbeddedDocument::new("doc1", vec![1.0, 0.0])])
618            .await
619            .unwrap();
620
621        let health = store.health_check().await.unwrap();
622        assert!(health.healthy);
623        assert_eq!(health.backend, "file");
624        assert_eq!(health.document_count, Some(1));
625    }
626}