skill_runtime/search/
index_manager.rs

//! Persistent index manager with incremental updates.
//!
//! Provides index management for persistent storage, incremental updates,
//! and automatic synchronization of skill embeddings.
5
6use anyhow::{Context, Result};
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::fs;
11use std::path::{Path, PathBuf};
12
13/// Configuration for the index manager
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct IndexConfig {
16    /// Directory for index storage
17    pub index_path: PathBuf,
18    /// Embedding model name
19    pub embedding_model: String,
20    /// Embedding dimensions
21    pub embedding_dimensions: usize,
22    /// Batch size for embedding generation
23    pub chunk_size: usize,
24    /// Whether to index on startup
25    pub index_on_startup: bool,
26    /// Whether to watch for changes
27    pub watch_for_changes: bool,
28}
29
30impl Default for IndexConfig {
31    fn default() -> Self {
32        let default_path = dirs::home_dir()
33            .map(|p| p.join(".skill-engine").join("index"))
34            .unwrap_or_else(|| PathBuf::from(".skill-engine/index"));
35
36        Self {
37            index_path: default_path,
38            embedding_model: "all-minilm".to_string(),
39            embedding_dimensions: 384,
40            chunk_size: 32,
41            index_on_startup: true,
42            watch_for_changes: false,
43        }
44    }
45}
46
47impl IndexConfig {
48    /// Create config with custom path
49    pub fn with_path(path: impl Into<PathBuf>) -> Self {
50        Self {
51            index_path: path.into(),
52            ..Default::default()
53        }
54    }
55
56    /// Set embedding model
57    pub fn with_model(mut self, model: impl Into<String>, dimensions: usize) -> Self {
58        self.embedding_model = model.into();
59        self.embedding_dimensions = dimensions;
60        self
61    }
62
63    /// Set chunk size
64    pub fn with_chunk_size(mut self, size: usize) -> Self {
65        self.chunk_size = size;
66        self
67    }
68
69    /// Disable startup indexing
70    pub fn no_startup_index(mut self) -> Self {
71        self.index_on_startup = false;
72        self
73    }
74}
75
76/// Checksum for a skill to detect changes
77#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
78pub struct SkillChecksum {
79    /// SKILL.md content hash
80    pub skill_md_hash: String,
81    /// WASM binary hash (if exists)
82    pub wasm_hash: Option<String>,
83    /// Manifest hash
84    pub manifest_hash: Option<String>,
85    /// Last indexed timestamp
86    pub indexed_at: DateTime<Utc>,
87}
88
89/// Index metadata stored on disk
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct IndexMetadata {
92    /// Metadata version
93    pub version: u32,
94    /// Embedding model used
95    pub embedding_model: String,
96    /// Embedding dimensions
97    pub dimensions: usize,
98    /// Index creation timestamp
99    pub created_at: DateTime<Utc>,
100    /// Last modification timestamp
101    pub last_modified: DateTime<Utc>,
102    /// Total document count
103    pub document_count: usize,
104    /// Checksums for indexed skills
105    pub skill_checksums: HashMap<String, SkillChecksum>,
106}
107
108impl IndexMetadata {
109    const CURRENT_VERSION: u32 = 1;
110    const METADATA_FILE: &'static str = "index_metadata.json";
111
112    /// Create new metadata
113    pub fn new(embedding_model: impl Into<String>, dimensions: usize) -> Self {
114        let now = Utc::now();
115        Self {
116            version: Self::CURRENT_VERSION,
117            embedding_model: embedding_model.into(),
118            dimensions,
119            created_at: now,
120            last_modified: now,
121            document_count: 0,
122            skill_checksums: HashMap::new(),
123        }
124    }
125
126    /// Load metadata from disk
127    pub fn load(index_path: &Path) -> Result<Option<Self>> {
128        let metadata_path = index_path.join(Self::METADATA_FILE);
129        if !metadata_path.exists() {
130            return Ok(None);
131        }
132
133        let content = fs::read_to_string(&metadata_path)
134            .context("Failed to read index metadata")?;
135        let metadata: Self = serde_json::from_str(&content)
136            .context("Failed to parse index metadata")?;
137
138        Ok(Some(metadata))
139    }
140
141    /// Save metadata to disk
142    pub fn save(&self, index_path: &Path) -> Result<()> {
143        fs::create_dir_all(index_path)
144            .context("Failed to create index directory")?;
145
146        let metadata_path = index_path.join(Self::METADATA_FILE);
147        let content = serde_json::to_string_pretty(self)
148            .context("Failed to serialize index metadata")?;
149        fs::write(&metadata_path, content)
150            .context("Failed to write index metadata")?;
151
152        Ok(())
153    }
154
155    /// Check if metadata is compatible with config
156    pub fn is_compatible(&self, config: &IndexConfig) -> bool {
157        self.version == Self::CURRENT_VERSION &&
158        self.embedding_model == config.embedding_model &&
159        self.dimensions == config.embedding_dimensions
160    }
161
162    /// Update last modified time
163    pub fn touch(&mut self) {
164        self.last_modified = Utc::now();
165    }
166}
167
/// Point-in-time summary of the index.
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Number of skills currently indexed.
    pub total_skills: usize,
    /// Number of documents (tools) currently indexed.
    pub total_documents: usize,
    /// Number of skills that need re-indexing.
    pub stale_skills: usize,
    /// Size of the index on disk, in bytes.
    pub index_size_bytes: u64,
}
180
/// Outcome of a sync operation, listed per skill name.
#[derive(Debug, Clone, Default)]
pub struct SyncResult {
    /// Names of skills that were added.
    pub added: Vec<String>,
    /// Names of skills that were updated.
    pub updated: Vec<String>,
    /// Names of skills that were removed.
    pub removed: Vec<String>,
    /// Count of skills skipped because they were unchanged.
    pub skipped: usize,
    /// Whether a full reindex was required.
    pub full_reindex: bool,
}
195
196impl SyncResult {
197    /// Check if any changes were made
198    pub fn has_changes(&self) -> bool {
199        !self.added.is_empty() || !self.updated.is_empty() || !self.removed.is_empty()
200    }
201
202    /// Total number of skills processed
203    pub fn total_processed(&self) -> usize {
204        self.added.len() + self.updated.len() + self.removed.len() + self.skipped
205    }
206}
207
208/// Index manager for persistent skill indexing
209pub struct IndexManager {
210    config: IndexConfig,
211    metadata: IndexMetadata,
212}
213
214impl IndexManager {
215    /// Create a new index manager
216    pub fn new(config: IndexConfig) -> Result<Self> {
217        // Load or create metadata
218        let metadata = match IndexMetadata::load(&config.index_path)? {
219            Some(meta) if meta.is_compatible(&config) => meta,
220            _ => IndexMetadata::new(&config.embedding_model, config.embedding_dimensions),
221        };
222
223        Ok(Self { config, metadata })
224    }
225
226    /// Get the config
227    pub fn config(&self) -> &IndexConfig {
228        &self.config
229    }
230
231    /// Get the metadata
232    pub fn metadata(&self) -> &IndexMetadata {
233        &self.metadata
234    }
235
236    /// Get index statistics
237    pub fn stats(&self) -> IndexStats {
238        let index_size_bytes = self.calculate_index_size();
239
240        IndexStats {
241            total_skills: self.metadata.skill_checksums.len(),
242            total_documents: self.metadata.document_count,
243            stale_skills: 0, // Would be calculated during sync
244            index_size_bytes,
245        }
246    }
247
248    /// Compute checksum for a skill
249    pub fn compute_skill_checksum(&self, skill_path: &Path) -> Result<SkillChecksum> {
250        let mut skill_md_hash = String::new();
251        let mut wasm_hash = None;
252        let mut manifest_hash = None;
253
254        // Hash SKILL.md
255        let skill_md_path = skill_path.join("SKILL.md");
256        if skill_md_path.exists() {
257            let content = fs::read(&skill_md_path)
258                .context("Failed to read SKILL.md")?;
259            skill_md_hash = self.hash_content(&content);
260        }
261
262        // Hash WASM file (if exists)
263        for entry in fs::read_dir(skill_path).into_iter().flatten() {
264            if let Ok(entry) = entry {
265                if entry.path().extension().map_or(false, |e| e == "wasm") {
266                    let content = fs::read(entry.path())
267                        .context("Failed to read WASM file")?;
268                    wasm_hash = Some(self.hash_content(&content));
269                    break;
270                }
271            }
272        }
273
274        // Hash manifest (skill.toml or skill.json)
275        for filename in ["skill.toml", "skill.json"] {
276            let manifest_path = skill_path.join(filename);
277            if manifest_path.exists() {
278                let content = fs::read(&manifest_path)
279                    .context("Failed to read manifest")?;
280                manifest_hash = Some(self.hash_content(&content));
281                break;
282            }
283        }
284
285        Ok(SkillChecksum {
286            skill_md_hash,
287            wasm_hash,
288            manifest_hash,
289            indexed_at: Utc::now(),
290        })
291    }
292
293    /// Check if a skill needs re-indexing
294    pub fn needs_reindex(&self, skill_name: &str, skill_path: &Path) -> Result<bool> {
295        // Check if skill exists in metadata
296        let existing = match self.metadata.skill_checksums.get(skill_name) {
297            Some(checksum) => checksum,
298            None => return Ok(true), // New skill
299        };
300
301        // Compute current checksum
302        let current = self.compute_skill_checksum(skill_path)?;
303
304        // Compare checksums
305        Ok(existing.skill_md_hash != current.skill_md_hash ||
306           existing.wasm_hash != current.wasm_hash ||
307           existing.manifest_hash != current.manifest_hash)
308    }
309
310    /// Record that a skill was indexed
311    pub fn record_indexed(&mut self, skill_name: &str, checksum: SkillChecksum, doc_count: usize) -> Result<()> {
312        self.metadata.skill_checksums.insert(skill_name.to_string(), checksum);
313        self.metadata.document_count = self.metadata.document_count.saturating_add(doc_count);
314        self.metadata.touch();
315        self.save_metadata()
316    }
317
318    /// Record that a skill was removed
319    pub fn record_removed(&mut self, skill_name: &str, doc_count: usize) -> Result<()> {
320        self.metadata.skill_checksums.remove(skill_name);
321        self.metadata.document_count = self.metadata.document_count.saturating_sub(doc_count);
322        self.metadata.touch();
323        self.save_metadata()
324    }
325
326    /// Determine what sync operations are needed
327    pub fn plan_sync(&self, current_skills: &HashMap<String, PathBuf>) -> Result<SyncResult> {
328        let mut result = SyncResult::default();
329
330        // Check for skills that need to be added or updated
331        for (skill_name, skill_path) in current_skills {
332            if !self.metadata.skill_checksums.contains_key(skill_name) {
333                result.added.push(skill_name.clone());
334            } else if self.needs_reindex(skill_name, skill_path)? {
335                result.updated.push(skill_name.clone());
336            } else {
337                result.skipped += 1;
338            }
339        }
340
341        // Check for skills that need to be removed
342        for skill_name in self.metadata.skill_checksums.keys() {
343            if !current_skills.contains_key(skill_name) {
344                result.removed.push(skill_name.clone());
345            }
346        }
347
348        Ok(result)
349    }
350
351    /// Check if full reindex is needed
352    pub fn needs_full_reindex(&self, config: &IndexConfig) -> bool {
353        // Load existing metadata if any
354        match IndexMetadata::load(&config.index_path) {
355            Ok(Some(meta)) => !meta.is_compatible(config),
356            Ok(None) => true, // No existing index
357            Err(_) => true,   // Corrupted metadata
358        }
359    }
360
361    /// Clear all index data
362    pub fn clear(&mut self) -> Result<()> {
363        self.metadata = IndexMetadata::new(&self.config.embedding_model, self.config.embedding_dimensions);
364        self.save_metadata()?;
365
366        // Clear data files (keep metadata)
367        let data_dir = self.config.index_path.join("data");
368        if data_dir.exists() {
369            fs::remove_dir_all(&data_dir)
370                .context("Failed to remove index data")?;
371        }
372
373        Ok(())
374    }
375
376    /// Save metadata to disk
377    fn save_metadata(&self) -> Result<()> {
378        self.metadata.save(&self.config.index_path)
379    }
380
381    /// Calculate total index size on disk
382    fn calculate_index_size(&self) -> u64 {
383        if !self.config.index_path.exists() {
384            return 0;
385        }
386
387        walkdir::WalkDir::new(&self.config.index_path)
388            .into_iter()
389            .filter_map(|e| e.ok())
390            .filter_map(|e| e.metadata().ok())
391            .map(|m| m.len())
392            .sum()
393    }
394
395    /// Hash content using blake3
396    fn hash_content(&self, content: &[u8]) -> String {
397        use std::io::Write;
398        let mut hasher = blake3::Hasher::new();
399        hasher.write_all(content).expect("write to hasher");
400        hasher.finalize().to_hex().to_string()
401    }
402}
403
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Returns a config rooted at `<tmp>/index` plus the guard that keeps the
    /// temporary directory alive for the duration of the test.
    fn temp_config() -> (IndexConfig, TempDir) {
        let guard = TempDir::new().unwrap();
        let index_dir = guard.path().join("index");
        (IndexConfig::with_path(index_dir), guard)
    }

    /// Builds a checksum that carries only a SKILL.md hash.
    fn md_only_checksum(hash: &str) -> SkillChecksum {
        SkillChecksum {
            skill_md_hash: hash.to_string(),
            wasm_hash: None,
            manifest_hash: None,
            indexed_at: Utc::now(),
        }
    }

    /// Creates `<root>/<name>/SKILL.md` with `body` and returns the skill directory.
    fn write_skill(root: &Path, name: &str, body: &str) -> PathBuf {
        let dir = root.join(name);
        fs::create_dir_all(&dir).unwrap();
        fs::write(dir.join("SKILL.md"), body).unwrap();
        dir
    }

    #[test]
    fn test_config_default() {
        let defaults = IndexConfig::default();
        let path_text = defaults.index_path.to_str().unwrap();

        assert!(path_text.contains(".skill-engine"));
        assert_eq!(defaults.embedding_model, "all-minilm");
        assert_eq!(defaults.embedding_dimensions, 384);
        assert_eq!(defaults.chunk_size, 32);
        assert!(defaults.index_on_startup);
    }

    #[test]
    fn test_config_builder() {
        let built = IndexConfig::with_path("/tmp/test")
            .with_model("bge-small", 384)
            .with_chunk_size(64)
            .no_startup_index();

        assert_eq!(built.index_path, PathBuf::from("/tmp/test"));
        assert_eq!(built.embedding_model, "bge-small");
        assert_eq!(built.chunk_size, 64);
        assert!(!built.index_on_startup);
    }

    #[test]
    fn test_metadata_new() {
        let fresh = IndexMetadata::new("test-model", 384);

        assert_eq!(fresh.version, IndexMetadata::CURRENT_VERSION);
        assert_eq!(fresh.embedding_model, "test-model");
        assert_eq!(fresh.dimensions, 384);
        assert_eq!(fresh.document_count, 0);
        assert!(fresh.skill_checksums.is_empty());
    }

    #[test]
    fn test_metadata_save_load() {
        let (config, _temp) = temp_config();

        let stored_checksum = SkillChecksum {
            skill_md_hash: "abc123".to_string(),
            wasm_hash: Some("def456".to_string()),
            manifest_hash: None,
            indexed_at: Utc::now(),
        };
        let mut original = IndexMetadata::new(&config.embedding_model, config.embedding_dimensions);
        original.document_count = 42;
        original
            .skill_checksums
            .insert("test-skill".to_string(), stored_checksum);

        original.save(&config.index_path).unwrap();
        let reloaded = IndexMetadata::load(&config.index_path).unwrap().unwrap();

        assert_eq!(reloaded.document_count, 42);
        assert!(reloaded.skill_checksums.contains_key("test-skill"));
    }

    #[test]
    fn test_metadata_compatibility() {
        let matching = IndexConfig::default();
        let meta = IndexMetadata::new(&matching.embedding_model, matching.embedding_dimensions);
        assert!(meta.is_compatible(&matching));

        let mut mismatched = matching.clone();
        mismatched.embedding_model = "different-model".to_string();
        assert!(!meta.is_compatible(&mismatched));
    }

    #[test]
    fn test_index_manager_creation() {
        let (config, _temp) = temp_config();
        let expected_model = config.embedding_model.clone();

        let manager = IndexManager::new(config).unwrap();

        assert_eq!(manager.metadata().embedding_model, expected_model);
    }

    #[test]
    fn test_skill_checksum() {
        let (config, temp) = temp_config();
        let manager = IndexManager::new(config).unwrap();

        // A skill with SKILL.md and a manifest, but no WASM binary.
        let skill_dir = write_skill(temp.path(), "test-skill", "# Test Skill\n");
        fs::write(skill_dir.join("skill.toml"), "name = \"test\"").unwrap();

        let checksum = manager.compute_skill_checksum(&skill_dir).unwrap();

        assert!(!checksum.skill_md_hash.is_empty());
        assert!(checksum.manifest_hash.is_some());
        assert!(checksum.wasm_hash.is_none());
    }

    #[test]
    fn test_needs_reindex() {
        let (config, temp) = temp_config();
        let mut manager = IndexManager::new(config).unwrap();
        let skill_dir = write_skill(temp.path(), "test-skill", "# Test Skill v1\n");

        // Never indexed: must be indexed.
        assert!(manager.needs_reindex("test-skill", &skill_dir).unwrap());

        // Once recorded, unchanged content needs no work.
        let recorded = manager.compute_skill_checksum(&skill_dir).unwrap();
        manager.record_indexed("test-skill", recorded, 5).unwrap();
        assert!(!manager.needs_reindex("test-skill", &skill_dir).unwrap());

        // Changing SKILL.md makes the skill stale again.
        fs::write(skill_dir.join("SKILL.md"), "# Test Skill v2\n").unwrap();
        assert!(manager.needs_reindex("test-skill", &skill_dir).unwrap());
    }

    #[test]
    fn test_plan_sync() {
        let (config, temp) = temp_config();
        let mut manager = IndexManager::new(config).unwrap();

        // Two skills are already recorded, both with a hash that matches nothing on disk.
        manager
            .record_indexed("existing-skill", md_only_checksum("old_hash"), 3)
            .unwrap();
        manager
            .record_indexed("removed-skill", md_only_checksum("old_hash"), 2)
            .unwrap();

        // On disk: the existing skill (changed) and a brand-new one; the removed skill is absent.
        let existing_skill_dir = write_skill(temp.path(), "existing-skill", "# Existing\n");
        let new_skill_dir = write_skill(temp.path(), "new-skill", "# New\n");

        let mut current_skills = HashMap::new();
        current_skills.insert("existing-skill".to_string(), existing_skill_dir);
        current_skills.insert("new-skill".to_string(), new_skill_dir);

        let plan = manager.plan_sync(&current_skills).unwrap();

        assert!(plan.added.contains(&"new-skill".to_string()));
        assert!(plan.updated.contains(&"existing-skill".to_string())); // Hash changed
        assert!(plan.removed.contains(&"removed-skill".to_string()));
    }

    #[test]
    fn test_sync_result() {
        let mut outcome = SyncResult::default();
        assert!(!outcome.has_changes());
        assert_eq!(outcome.total_processed(), 0);

        outcome.added.push("skill-1".to_string());
        outcome.skipped = 2;
        assert!(outcome.has_changes());
        assert_eq!(outcome.total_processed(), 3);
    }

    #[test]
    fn test_clear_index() {
        let (config, _temp) = temp_config();
        let mut manager = IndexManager::new(config).unwrap();

        // Populate the index with one skill.
        manager
            .record_indexed("test-skill", md_only_checksum("hash"), 10)
            .unwrap();
        assert!(!manager.metadata().skill_checksums.is_empty());

        // Clearing wipes both the checksums and the document count.
        manager.clear().unwrap();
        assert!(manager.metadata().skill_checksums.is_empty());
        assert_eq!(manager.metadata().document_count, 0);
    }
}
600}