infiniloom_engine/incremental.rs

//! Incremental scanning with file watching and caching
//!
//! Provides efficient re-scanning by caching results and only processing changed files.
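//!
//! A minimal usage sketch (the `build_cached_file` helper is hypothetical,
//! standing in for whatever scan/tokenize step produces a `CachedFile`):
//!
//! ```ignore
//! let mut scanner = IncrementalScanner::new(repo_root);
//! for path in discovered_files {
//!     if scanner.needs_rescan(&path) {
//!         let file = build_cached_file(&path)?; // hypothetical scan + tokenize step
//!         scanner.update(file);
//!     }
//! }
//! scanner.save()?; // persists only if something changed
//! ```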

use bincode::Options;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use thiserror::Error;

use crate::bincode_safe::deserialize_with_limit;
use crate::tokenizer::TokenCounts;
use crate::types::Symbol;

/// Cache entry for a single file
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedFile {
    /// Relative path
    pub path: String,
    /// Last modified time (Unix timestamp)
    pub mtime: u64,
    /// File size in bytes
    pub size: u64,
    /// Content hash (for change detection)
    pub hash: u64,
    /// Token counts
    pub tokens: TokenCounts,
    /// Extracted symbols
    pub symbols: Vec<CachedSymbol>,
    /// Whether symbols were extracted for this file
    pub symbols_extracted: bool,
    /// Detected language
    pub language: Option<String>,
    /// Line count
    pub lines: usize,
}

/// Cached symbol (simplified for storage)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedSymbol {
    pub name: String,
    pub kind: String,
    pub start_line: u32,
    pub end_line: u32,
    pub signature: Option<String>,
}

impl From<&Symbol> for CachedSymbol {
    fn from(s: &Symbol) -> Self {
        Self {
            name: s.name.clone(),
            kind: s.kind.name().to_owned(),
            start_line: s.start_line,
            end_line: s.end_line,
            signature: s.signature.clone(),
        }
    }
}

impl From<&CachedSymbol> for Symbol {
    fn from(s: &CachedSymbol) -> Self {
        use crate::types::{SymbolKind, Visibility};
        Self {
            name: s.name.clone(),
            kind: SymbolKind::from_str(&s.kind).unwrap_or(SymbolKind::Variable),
            start_line: s.start_line,
            end_line: s.end_line,
            signature: s.signature.clone(),
            docstring: None,
            visibility: Visibility::Public,
            references: 0,
            importance: 0.5,
            parent: None,
            calls: Vec::new(),
            extends: None,
            implements: Vec::new(),
        }
    }
}

/// Repository cache
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepoCache {
    /// Cache version (for compatibility)
    pub version: u32,
    /// Repository root path
    pub root_path: String,
    /// Cache creation time
    pub created_at: u64,
    /// Last update time
    pub updated_at: u64,
    /// Cached files
    pub files: HashMap<String, CachedFile>,
    /// Total token count
    pub total_tokens: TokenCounts,
    /// External dependencies detected
    pub external_deps: Vec<String>,
}

impl RepoCache {
    /// Current cache version
    pub const VERSION: u32 = 2;

    /// Create a new empty cache
    pub fn new(root_path: &str) -> Self {
        let now = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);

        Self {
            version: Self::VERSION,
            root_path: root_path.to_owned(),
            created_at: now,
            updated_at: now,
            files: HashMap::new(),
            total_tokens: TokenCounts::default(),
            external_deps: Vec::new(),
        }
    }

    /// Load cache from file
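    ///
    /// Typical load-or-fallback usage, mirroring what `IncrementalScanner::new`
    /// does internally (the `/repo` path is illustrative):
    ///
    /// ```ignore
    /// let cache_path = RepoCache::default_cache_path(Path::new("/repo"));
    /// let cache = RepoCache::load(&cache_path)
    ///     .unwrap_or_else(|_| RepoCache::new("/repo"));
    /// ```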
    pub fn load(cache_path: &Path) -> Result<Self, CacheError> {
        let content = fs::read(cache_path).map_err(|e| CacheError::IoError(e.to_string()))?;

        let cache: Self = deserialize_with_limit(&content)
            .map_err(|e| CacheError::DeserializeError(e.to_string()))?;

        // Check version compatibility
        if cache.version != Self::VERSION {
            return Err(CacheError::VersionMismatch {
                expected: Self::VERSION,
                found: cache.version,
            });
        }

        Ok(cache)
    }

    /// Save cache to file
    pub fn save(&self, cache_path: &Path) -> Result<(), CacheError> {
        // Ensure parent directory exists
        if let Some(parent) = cache_path.parent() {
            fs::create_dir_all(parent).map_err(|e| CacheError::IoError(e.to_string()))?;
        }

        // Note: Must use bincode::options() to match deserialize_with_limit() in load()
        let content = bincode::options()
            .serialize(self)
            .map_err(|e| CacheError::SerializeError(e.to_string()))?;

        fs::write(cache_path, content).map_err(|e| CacheError::IoError(e.to_string()))?;

        Ok(())
    }

    /// Get default cache path for a repository
    pub fn default_cache_path(repo_path: &Path) -> PathBuf {
        repo_path.join(".infiniloom/cache/repo.cache")
    }

    /// Check if a file needs rescanning based on mtime and size
    pub fn needs_rescan(&self, path: &str, current_mtime: u64, current_size: u64) -> bool {
        match self.files.get(path) {
            Some(cached) => cached.mtime != current_mtime || cached.size != current_size,
            None => true,
        }
    }

    /// Check if a file needs rescanning, including a content-hash comparison.
    /// This catches changes that mtime/size alone would miss (e.g., a same-size
    /// edit landing within the mtime's one-second resolution).
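    ///
    /// Intended call pattern (a sketch; the surrounding scan loop supplies
    /// `rel_path`, `mtime`, and `size`):
    ///
    /// ```ignore
    /// let content = std::fs::read(&abs_path)?;
    /// if cache.needs_rescan_with_hash(rel_path, mtime, size, hash_content(&content)) {
    ///     // re-scan the file and call update_file() with fresh data
    /// }
    /// ```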
    pub fn needs_rescan_with_hash(
        &self,
        path: &str,
        current_mtime: u64,
        current_size: u64,
        current_hash: u64,
    ) -> bool {
        match self.files.get(path) {
            Some(cached) => {
                cached.mtime != current_mtime
                    || cached.size != current_size
                    || (cached.hash != 0 && current_hash != 0 && cached.hash != current_hash)
            },
            None => true,
        }
    }

    /// Get a cached file by path
    pub fn get(&self, path: &str) -> Option<&CachedFile> {
        self.files.get(path)
    }

    /// Add or update a file in the cache
    pub fn update_file(&mut self, file: CachedFile) {
        self.files.insert(file.path.clone(), file);
        self.updated_at = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);
    }

    /// Remove a file from the cache
    pub fn remove_file(&mut self, path: &str) {
        self.files.remove(path);
    }

    /// Get cached paths that are absent from the current file list (i.e., deleted files)
    pub fn find_deleted_files(&self, current_files: &[&str]) -> Vec<String> {
        let current_set: std::collections::HashSet<&str> = current_files.iter().copied().collect();
        self.files
            .keys()
            .filter(|p| !current_set.contains(p.as_str()))
            .cloned()
            .collect()
    }

    /// Recalculate total tokens
    pub fn recalculate_totals(&mut self) {
        self.total_tokens = self.files.values().map(|f| f.tokens).sum();
    }

    /// Get cache statistics
    pub fn stats(&self) -> CacheStats {
        CacheStats {
            file_count: self.files.len(),
            total_tokens: self.total_tokens,
            total_bytes: self.files.values().map(|f| f.size).sum(),
            age_seconds: SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .map(|d| d.as_secs())
                .unwrap_or(0)
                .saturating_sub(self.updated_at),
        }
    }
}

/// Cache statistics
#[derive(Debug, Clone)]
pub struct CacheStats {
    pub file_count: usize,
    pub total_tokens: TokenCounts,
    pub total_bytes: u64,
    pub age_seconds: u64,
}

/// Cache errors
#[derive(Debug, Error)]
pub enum CacheError {
    #[error("I/O error: {0}")]
    IoError(String),
    #[error("Serialization error: {0}")]
    SerializeError(String),
    #[error("Deserialization error: {0}")]
    DeserializeError(String),
    #[error("Cache version mismatch: expected {expected}, found {found}")]
    VersionMismatch { expected: u32, found: u32 },
}

/// Incremental scanner that uses caching
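///
/// Persistence lifecycle sketch: `update()`/`remove()` mark the cache dirty,
/// `save()` writes only when dirty, and `force_save()` always writes:
///
/// ```ignore
/// let mut scanner = IncrementalScanner::new(Path::new("/repo"));
/// scanner.remove("deleted.py");   // marks the cache dirty
/// scanner.save()?;                // recalculates totals, writes to disk
/// scanner.save()?;                // no-op: cache is clean again
/// ```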
pub struct IncrementalScanner {
    cache: RepoCache,
    cache_path: PathBuf,
    dirty: bool,
}

impl IncrementalScanner {
    /// Create or load an incremental scanner for a repository
    pub fn new(repo_path: &Path) -> Self {
        let cache_path = RepoCache::default_cache_path(repo_path);

        let cache = RepoCache::load(&cache_path)
            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));

        Self { cache, cache_path, dirty: false }
    }

    /// Create with custom cache path
    pub fn with_cache_path(repo_path: &Path, cache_path: PathBuf) -> Self {
        let cache = RepoCache::load(&cache_path)
            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));

        Self { cache, cache_path, dirty: false }
    }

    /// Check if a file needs to be rescanned (fast check using mtime/size only)
    pub fn needs_rescan(&self, path: &Path) -> bool {
        let metadata = match path.metadata() {
            Ok(m) => m,
            Err(_) => return true,
        };

        let mtime = metadata
            .modified()
            .ok()
            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
            .map(|d| d.as_secs())
            .unwrap_or(0);

        let relative_path = path.to_string_lossy();
        self.cache
            .needs_rescan(&relative_path, mtime, metadata.len())
    }

    /// Check if a file needs to be rescanned, including a content-hash check.
    /// This is more accurate but requires reading the file content.
    pub fn needs_rescan_with_content(&self, path: &Path, content: &[u8]) -> bool {
        let metadata = match path.metadata() {
            Ok(m) => m,
            Err(_) => return true,
        };

        let mtime = metadata
            .modified()
            .ok()
            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
            .map(|d| d.as_secs())
            .unwrap_or(0);

        let content_hash = hash_content(content);
        let relative_path = path.to_string_lossy();
        self.cache
            .needs_rescan_with_hash(&relative_path, mtime, metadata.len(), content_hash)
    }

    /// Get the cached entry for a path, if present (freshness is not verified)
    pub fn get_cached(&self, path: &str) -> Option<&CachedFile> {
        self.cache.files.get(path)
    }

    /// Update cache with new file data
    pub fn update(&mut self, file: CachedFile) {
        self.cache.update_file(file);
        self.dirty = true;
    }

    /// Remove a deleted file from cache
    pub fn remove(&mut self, path: &str) {
        self.cache.remove_file(path);
        self.dirty = true;
    }

    /// Save cache if modified
    pub fn save(&mut self) -> Result<(), CacheError> {
        if self.dirty {
            self.cache.recalculate_totals();
            self.cache.save(&self.cache_path)?;
            self.dirty = false;
        }
        Ok(())
    }

    /// Force save cache
    pub fn force_save(&mut self) -> Result<(), CacheError> {
        self.cache.recalculate_totals();
        self.cache.save(&self.cache_path)?;
        self.dirty = false;
        Ok(())
    }

    /// Get cache statistics
    pub fn stats(&self) -> CacheStats {
        self.cache.stats()
    }

    /// Clear the cache
    pub fn clear(&mut self) {
        self.cache = RepoCache::new(&self.cache.root_path);
        self.dirty = true;
    }

    /// Get the files from `current_files` whose cached mtime/size no longer match
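    ///
    /// Input is a slice of `(path, mtime, size)` tuples; values below are
    /// illustrative:
    ///
    /// ```ignore
    /// let current = vec![(PathBuf::from("src/lib.rs"), 1_700_000_000, 4096)];
    /// let changed = scanner.get_changed_files(&current);
    /// ```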
    pub fn get_changed_files<'a>(
        &self,
        current_files: &'a [(PathBuf, u64, u64)],
    ) -> Vec<&'a PathBuf> {
        current_files
            .iter()
            .filter(|(path, mtime, size)| {
                let relative = path.to_string_lossy();
                self.cache.needs_rescan(&relative, *mtime, *size)
            })
            .map(|(path, _, _)| path)
            .collect()
    }
}

/// File change event for watching
#[derive(Debug, Clone)]
pub enum FileChange {
    Created(PathBuf),
    Modified(PathBuf),
    Deleted(PathBuf),
    Renamed { from: PathBuf, to: PathBuf },
}

/// File watching via the notify crate (available when the 'watch' feature is enabled)
#[cfg(feature = "watch")]
pub mod watcher {
    use super::*;
    use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher};
    use std::sync::mpsc::{channel, Receiver};

    /// File system watcher for incremental updates
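    ///
    /// Event-loop sketch (requires the 'watch' feature; note that `next()`
    /// returns `None` on a channel error or an unmapped event, ending the loop):
    ///
    /// ```ignore
    /// let watcher = FileWatcher::new(Path::new("/repo"))?;
    /// while let Some(change) = watcher.next() {
    ///     match change {
    ///         FileChange::Deleted(p) => { /* drop p from the cache */ },
    ///         _ => { /* mark for rescan */ },
    ///     }
    /// }
    /// ```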
    pub struct FileWatcher {
        watcher: RecommendedWatcher,
        receiver: Receiver<Result<Event, notify::Error>>,
        root_path: PathBuf,
    }

    impl FileWatcher {
        /// Create a new file watcher for a directory
        pub fn new(path: &Path) -> Result<Self, notify::Error> {
            let (tx, rx) = channel();

            let watcher = RecommendedWatcher::new(
                move |res| {
                    let _ = tx.send(res);
                },
                Config::default(),
            )?;

            let mut fw = Self { watcher, receiver: rx, root_path: path.to_path_buf() };

            fw.watcher.watch(path, RecursiveMode::Recursive)?;

            Ok(fw)
        }

        /// Get next file change event (non-blocking)
        pub fn try_next(&self) -> Option<FileChange> {
            match self.receiver.try_recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Wait for next file change event (blocking)
        pub fn next(&self) -> Option<FileChange> {
            match self.receiver.recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Convert notify event to FileChange
        fn event_to_change(&self, event: Event) -> Option<FileChange> {
            let path = event.paths.first()?.clone();

            match event.kind {
                EventKind::Create(_) => Some(FileChange::Created(path)),
                EventKind::Modify(_) => Some(FileChange::Modified(path)),
                EventKind::Remove(_) => Some(FileChange::Deleted(path)),
                _ => None,
            }
        }

        /// Stop watching
        pub fn stop(mut self) -> Result<(), notify::Error> {
            self.watcher.unwatch(&self.root_path)
        }
    }
}

/// Compute a simple content hash for change detection.
///
/// Note: `DefaultHasher` does not guarantee a stable algorithm across Rust
/// releases, so hashes persisted in the cache may change after a toolchain
/// upgrade; the worst case is a spurious rescan, not a missed change.
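///
/// Equal inputs hash equal; distinct inputs almost certainly differ:
///
/// ```ignore
/// assert_eq!(hash_content(b"abc"), hash_content(b"abc"));
/// assert_ne!(hash_content(b"abc"), hash_content(b"xyz"));
/// ```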
pub fn hash_content(content: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    content.hash(&mut hasher);
    hasher.finish()
}

/// Get file modification time as Unix timestamp
pub fn get_mtime(path: &Path) -> Option<u64> {
    path.metadata()
        .ok()?
        .modified()
        .ok()?
        .duration_since(SystemTime::UNIX_EPOCH)
        .ok()
        .map(|d| d.as_secs())
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[test]
    fn test_cache_create_save_load() {
        let temp = TempDir::new().unwrap();
        let cache_path = temp.path().join("test.cache");

        let mut cache = RepoCache::new("/test/repo");
        cache.files.insert(
            "test.py".to_string(),
            CachedFile {
                path: "test.py".to_string(),
                mtime: 12345,
                size: 100,
                hash: 0,
                tokens: TokenCounts {
                    o200k: 45,
                    cl100k: 48,
                    claude: 50,
                    gemini: 46,
                    llama: 50,
                    mistral: 50,
                    deepseek: 50,
                    qwen: 50,
                    cohere: 48,
                    grok: 50,
                },
                symbols: vec![],
                symbols_extracted: false,
                language: Some("python".to_string()),
                lines: 10,
            },
        );

        cache.save(&cache_path).unwrap();

        let loaded = RepoCache::load(&cache_path).unwrap();
        assert_eq!(loaded.files.len(), 1);
        assert!(loaded.files.contains_key("test.py"));
    }

    #[test]
    fn test_needs_rescan() {
        let cache = RepoCache::new("/test");
        assert!(cache.needs_rescan("new_file.py", 0, 0));

        let mut cache = RepoCache::new("/test");
        cache.files.insert(
            "existing.py".to_string(),
            CachedFile {
                path: "existing.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: 0,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        assert!(!cache.needs_rescan("existing.py", 1000, 500));
        assert!(cache.needs_rescan("existing.py", 2000, 500)); // mtime changed
        assert!(cache.needs_rescan("existing.py", 1000, 600)); // size changed
    }

    #[test]
    fn test_incremental_scanner() {
        let temp = TempDir::new().unwrap();

        let mut scanner = IncrementalScanner::new(temp.path());
        assert!(scanner.needs_rescan(&temp.path().join("test.py")));

        scanner.update(CachedFile {
            path: "test.py".to_string(),
            mtime: 1000,
            size: 100,
            hash: 0,
            tokens: TokenCounts::default(),
            symbols: vec![],
            symbols_extracted: false,
            language: Some("python".to_string()),
            lines: 5,
        });

        assert!(scanner.get_cached("test.py").is_some());
    }

    #[test]
    fn test_hash_content() {
        let h1 = hash_content(b"hello world");
        let h2 = hash_content(b"hello world");
        let h3 = hash_content(b"different");

        assert_eq!(h1, h2);
        assert_ne!(h1, h3);
    }

    #[test]
    fn test_needs_rescan_with_hash() {
        let mut cache = RepoCache::new("/test");
        let original_hash = hash_content(b"original content");
        let modified_hash = hash_content(b"modified content");

        cache.files.insert(
            "file.py".to_string(),
            CachedFile {
                path: "file.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: original_hash,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        // Same mtime/size/hash - no rescan needed
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, original_hash));

        // Same mtime/size but different hash - rescan needed
        assert!(cache.needs_rescan_with_hash("file.py", 1000, 500, modified_hash));

        // Different mtime - rescan needed regardless of hash
        assert!(cache.needs_rescan_with_hash("file.py", 2000, 500, original_hash));

        // Hash of 0 is ignored (backwards compatibility)
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, 0));
    }
}