// infiniloom_engine/incremental.rs

//! Incremental scanning with file watching and caching
//!
//! Provides efficient re-scanning by caching results and only processing changed files.
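//!
//! A typical incremental pass looks like the sketch below (a minimal outline;
//! the doctest path assumes this module is exported as
//! `infiniloom_engine::incremental`, and the actual scan/tokenize step is
//! elided because it belongs to the caller):
//!
//! ```no_run
//! use std::path::Path;
//! use infiniloom_engine::incremental::IncrementalScanner;
//!
//! let repo = Path::new("./my_repo"); // hypothetical repository root
//! let mut scanner = IncrementalScanner::new(repo);
//!
//! let file = repo.join("src/main.rs");
//! if scanner.needs_rescan(&file) {
//!     // Re-scan `file`, build a `CachedFile`, then record it:
//!     // scanner.update(cached_file);
//! }
//!
//! // Persists only if the cache was actually modified.
//! scanner.save().expect("failed to write cache");
//! ```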

use bincode::Options;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use thiserror::Error;

use crate::bincode_safe::deserialize_with_limit;
use crate::tokenizer::TokenCounts;
use crate::types::Symbol;

/// Cache entry for a single file
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedFile {
    /// Relative path
    pub path: String,
    /// Last modified time (Unix timestamp)
    pub mtime: u64,
    /// File size in bytes
    pub size: u64,
    /// Content hash (for change detection)
    pub hash: u64,
    /// Token counts
    pub tokens: TokenCounts,
    /// Extracted symbols
    pub symbols: Vec<CachedSymbol>,
    /// Whether symbols were extracted for this file
    pub symbols_extracted: bool,
    /// Detected language
    pub language: Option<String>,
    /// Line count
    pub lines: usize,
}

/// Cached symbol (simplified for storage)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedSymbol {
    pub name: String,
    pub kind: String,
    pub start_line: u32,
    pub end_line: u32,
    pub signature: Option<String>,
}

impl From<&Symbol> for CachedSymbol {
    fn from(s: &Symbol) -> Self {
        Self {
            name: s.name.clone(),
            kind: s.kind.name().to_owned(),
            start_line: s.start_line,
            end_line: s.end_line,
            signature: s.signature.clone(),
        }
    }
}

impl From<&CachedSymbol> for Symbol {
    fn from(s: &CachedSymbol) -> Self {
        use crate::types::{SymbolKind, Visibility};
        // Reconstruction is lossy: only name, kind, line range, and signature
        // survive the round-trip; docstring, visibility, reference counts, and
        // call-graph fields are reset to defaults.
        Self {
            name: s.name.clone(),
            kind: SymbolKind::from_str(&s.kind).unwrap_or(SymbolKind::Variable),
            start_line: s.start_line,
            end_line: s.end_line,
            signature: s.signature.clone(),
            docstring: None,
            visibility: Visibility::Public,
            references: 0,
            importance: 0.5,
            parent: None,
            calls: Vec::new(),
            extends: None,
            implements: Vec::new(),
        }
    }
}

/// Repository cache
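///
/// A save/load round-trip (a sketch; the doctest path assumes this module is
/// exported as `infiniloom_engine::incremental`, and `/tmp/repo.cache` is a
/// hypothetical location):
///
/// ```no_run
/// use std::path::Path;
/// use infiniloom_engine::incremental::RepoCache;
///
/// let cache = RepoCache::new("/repo");
/// cache.save(Path::new("/tmp/repo.cache")).unwrap();
/// let loaded = RepoCache::load(Path::new("/tmp/repo.cache")).unwrap();
/// assert_eq!(loaded.version, RepoCache::VERSION);
/// ```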
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepoCache {
    /// Cache version (for compatibility)
    pub version: u32,
    /// Repository root path
    pub root_path: String,
    /// Cache creation time
    pub created_at: u64,
    /// Last update time
    pub updated_at: u64,
    /// Cached files
    pub files: HashMap<String, CachedFile>,
    /// Total token count
    pub total_tokens: TokenCounts,
    /// External dependencies detected
    pub external_deps: Vec<String>,
}

impl RepoCache {
    /// Current cache version
    pub const VERSION: u32 = 2;

    /// Create a new empty cache
    pub fn new(root_path: &str) -> Self {
        let now = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);

        Self {
            version: Self::VERSION,
            root_path: root_path.to_owned(),
            created_at: now,
            updated_at: now,
            files: HashMap::new(),
            total_tokens: TokenCounts::default(),
            external_deps: Vec::new(),
        }
    }

    /// Load cache from file
    pub fn load(cache_path: &Path) -> Result<Self, CacheError> {
        let content = fs::read(cache_path).map_err(|e| CacheError::IoError(e.to_string()))?;

        let cache: Self = deserialize_with_limit(&content)
            .map_err(|e| CacheError::DeserializeError(e.to_string()))?;

        // Check version compatibility
        if cache.version != Self::VERSION {
            return Err(CacheError::VersionMismatch {
                expected: Self::VERSION,
                found: cache.version,
            });
        }

        Ok(cache)
    }

    /// Save cache to file
    pub fn save(&self, cache_path: &Path) -> Result<(), CacheError> {
        // Ensure parent directory exists
        if let Some(parent) = cache_path.parent() {
            fs::create_dir_all(parent).map_err(|e| CacheError::IoError(e.to_string()))?;
        }

        // Note: Must use bincode::options() to match deserialize_with_limit() in load()
        let content = bincode::options()
            .serialize(self)
            .map_err(|e| CacheError::SerializeError(e.to_string()))?;

        fs::write(cache_path, content).map_err(|e| CacheError::IoError(e.to_string()))?;

        Ok(())
    }

    /// Get default cache path for a repository
    pub fn default_cache_path(repo_path: &Path) -> PathBuf {
        repo_path.join(".infiniloom/cache/repo.cache")
    }

    /// Check if a file needs rescanning based on mtime and size
    pub fn needs_rescan(&self, path: &str, current_mtime: u64, current_size: u64) -> bool {
        match self.files.get(path) {
            Some(cached) => cached.mtime != current_mtime || cached.size != current_size,
            None => true,
        }
    }

    /// Check if a file needs rescanning, including content hash comparison.
    ///
    /// This catches edits that mtime/size alone miss, e.g. a same-size change
    /// within the same second, since mtimes are stored at one-second
    /// granularity. A stored or current hash of 0 means "unknown" and skips
    /// the comparison, so entries written before hashing existed are not
    /// forced to rescan.
    pub fn needs_rescan_with_hash(
        &self,
        path: &str,
        current_mtime: u64,
        current_size: u64,
        current_hash: u64,
    ) -> bool {
        match self.files.get(path) {
            Some(cached) => {
                cached.mtime != current_mtime
                    || cached.size != current_size
                    || (cached.hash != 0 && current_hash != 0 && cached.hash != current_hash)
            },
            None => true,
        }
    }

    /// Get a cached file by path
    pub fn get(&self, path: &str) -> Option<&CachedFile> {
        self.files.get(path)
    }

    /// Add or update a file in the cache
    pub fn update_file(&mut self, file: CachedFile) {
        self.files.insert(file.path.clone(), file);
        self.updated_at = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);
    }

    /// Remove a file from the cache
    pub fn remove_file(&mut self, path: &str) {
        self.files.remove(path);
    }

    /// Get cached paths that are absent from the current file list
    pub fn find_deleted_files(&self, current_files: &[&str]) -> Vec<String> {
        let current_set: std::collections::HashSet<&str> = current_files.iter().copied().collect();
        self.files
            .keys()
            .filter(|p| !current_set.contains(p.as_str()))
            .cloned()
            .collect()
    }

    /// Recalculate total tokens
    pub fn recalculate_totals(&mut self) {
        self.total_tokens = self.files.values().map(|f| f.tokens).sum();
    }

    /// Get cache statistics
    pub fn stats(&self) -> CacheStats {
        CacheStats {
            file_count: self.files.len(),
            total_tokens: self.total_tokens,
            total_bytes: self.files.values().map(|f| f.size).sum(),
            age_seconds: SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .map(|d| d.as_secs())
                .unwrap_or(0)
                .saturating_sub(self.updated_at),
        }
    }
}

/// Cache statistics
#[derive(Debug, Clone)]
pub struct CacheStats {
    pub file_count: usize,
    pub total_tokens: TokenCounts,
    pub total_bytes: u64,
    pub age_seconds: u64,
}

/// Cache errors
#[derive(Debug, Error)]
pub enum CacheError {
    #[error("I/O error: {0}")]
    IoError(String),
    #[error("Serialization error: {0}")]
    SerializeError(String),
    #[error("Deserialization error: {0}")]
    DeserializeError(String),
    #[error("Cache version mismatch: expected {expected}, found {found}")]
    VersionMismatch { expected: u32, found: u32 },
}

/// Incremental scanner that uses caching
pub struct IncrementalScanner {
    cache: RepoCache,
    cache_path: PathBuf,
    dirty: bool,
}

impl IncrementalScanner {
    /// Create or load an incremental scanner for a repository.
    ///
    /// A missing, corrupt, or version-mismatched cache is silently replaced
    /// with an empty one, so this never fails.
    pub fn new(repo_path: &Path) -> Self {
        let cache_path = RepoCache::default_cache_path(repo_path);

        let cache = RepoCache::load(&cache_path)
            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));

        Self { cache, cache_path, dirty: false }
    }

    /// Create with custom cache path
    pub fn with_cache_path(repo_path: &Path, cache_path: PathBuf) -> Self {
        let cache = RepoCache::load(&cache_path)
            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));

        Self { cache, cache_path, dirty: false }
    }

    /// Check if a file needs to be rescanned (fast check using mtime/size only).
    ///
    /// Note: the cache key is the lossy string form of `path`, so callers must
    /// use the same path form (e.g. relative to the repo root) here as when
    /// the cache was populated.
    pub fn needs_rescan(&self, path: &Path) -> bool {
        let metadata = match path.metadata() {
            Ok(m) => m,
            Err(_) => return true,
        };

        let mtime = metadata
            .modified()
            .ok()
            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
            .map_or(0, |d| d.as_secs());

        let relative_path = path.to_string_lossy();
        self.cache
            .needs_rescan(&relative_path, mtime, metadata.len())
    }

    /// Check if a file needs to be rescanned, including a content hash check.
    /// This is more accurate but requires the caller to have read the file content.
    pub fn needs_rescan_with_content(&self, path: &Path, content: &[u8]) -> bool {
        let metadata = match path.metadata() {
            Ok(m) => m,
            Err(_) => return true,
        };

        let mtime = metadata
            .modified()
            .ok()
            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
            .map_or(0, |d| d.as_secs());

        let content_hash = hash_content(content);
        let relative_path = path.to_string_lossy();
        self.cache
            .needs_rescan_with_hash(&relative_path, mtime, metadata.len(), content_hash)
    }

    /// Get the cached entry for a path, if present (freshness is not checked;
    /// use `needs_rescan` for that)
    pub fn get_cached(&self, path: &str) -> Option<&CachedFile> {
        self.cache.files.get(path)
    }

    /// Update cache with new file data
    pub fn update(&mut self, file: CachedFile) {
        self.cache.update_file(file);
        self.dirty = true;
    }

    /// Remove a deleted file from cache
    pub fn remove(&mut self, path: &str) {
        self.cache.remove_file(path);
        self.dirty = true;
    }

    /// Save cache if modified
    pub fn save(&mut self) -> Result<(), CacheError> {
        if self.dirty {
            self.cache.recalculate_totals();
            self.cache.save(&self.cache_path)?;
            self.dirty = false;
        }
        Ok(())
    }

    /// Force save cache
    pub fn force_save(&mut self) -> Result<(), CacheError> {
        self.cache.recalculate_totals();
        self.cache.save(&self.cache_path)?;
        self.dirty = false;
        Ok(())
    }

    /// Get cache statistics
    pub fn stats(&self) -> CacheStats {
        self.cache.stats()
    }

    /// Clear the cache
    pub fn clear(&mut self) {
        self.cache = RepoCache::new(&self.cache.root_path);
        self.dirty = true;
    }

    /// Get the subset of `current_files` that changed since the cache was
    /// built; each tuple is `(path, mtime_secs, size_bytes)`.
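    ///
    /// A hypothetical call (values are illustrative):
    ///
    /// ```no_run
    /// # use std::path::PathBuf;
    /// # use infiniloom_engine::incremental::IncrementalScanner;
    /// # let scanner = IncrementalScanner::new(std::path::Path::new("."));
    /// let listing = vec![(PathBuf::from("src/main.rs"), 1_700_000_000, 1024)];
    /// let changed = scanner.get_changed_files(&listing);
    /// println!("{} file(s) need rescanning", changed.len());
    /// ```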
    pub fn get_changed_files<'a>(
        &self,
        current_files: &'a [(PathBuf, u64, u64)],
    ) -> Vec<&'a PathBuf> {
        current_files
            .iter()
            .filter(|(path, mtime, size)| {
                let relative = path.to_string_lossy();
                self.cache.needs_rescan(&relative, *mtime, *size)
            })
            .map(|(path, _, _)| path)
            .collect()
    }
}

/// File change event for watching
#[derive(Debug, Clone)]
pub enum FileChange {
    Created(PathBuf),
    Modified(PathBuf),
    Deleted(PathBuf),
    /// Note: the bundled `watcher` module never emits this variant (notify
    /// reports renames as `Modify` events); it is available for other producers.
    Renamed { from: PathBuf, to: PathBuf },
}

/// File watcher using the `notify` crate (when the `watch` feature is enabled)
#[cfg(feature = "watch")]
pub mod watcher {
    use super::*;
    use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher};
    use std::sync::mpsc::{channel, Receiver};

    /// File system watcher for incremental updates
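    ///
    /// A minimal event loop (a sketch; assumes the `watch` feature is enabled
    /// and that `./repo` exists; note that `next()` also returns `None` for
    /// event kinds this wrapper ignores, so a long-lived consumer may want to
    /// keep polling rather than stop at the first `None`):
    ///
    /// ```no_run
    /// # use infiniloom_engine::incremental::{watcher::FileWatcher, FileChange};
    /// let watcher = FileWatcher::new(std::path::Path::new("./repo")).unwrap();
    /// while let Some(change) = watcher.next() {
    ///     match change {
    ///         FileChange::Created(p) | FileChange::Modified(p) => {
    ///             println!("changed: {}", p.display());
    ///         },
    ///         FileChange::Deleted(p) => println!("deleted: {}", p.display()),
    ///         FileChange::Renamed { from, to } => {
    ///             println!("renamed: {} -> {}", from.display(), to.display());
    ///         },
    ///     }
    /// }
    /// ```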
    pub struct FileWatcher {
        watcher: RecommendedWatcher,
        receiver: Receiver<Result<Event, notify::Error>>,
        root_path: PathBuf,
    }

    impl FileWatcher {
        /// Create a new file watcher for a directory (watched recursively)
        pub fn new(path: &Path) -> Result<Self, notify::Error> {
            let (tx, rx) = channel();

            let watcher = RecommendedWatcher::new(
                move |res| {
                    let _ = tx.send(res);
                },
                Config::default(),
            )?;

            let mut fw = Self { watcher, receiver: rx, root_path: path.to_path_buf() };

            fw.watcher.watch(path, RecursiveMode::Recursive)?;

            Ok(fw)
        }

        /// Get next file change event (non-blocking)
        pub fn try_next(&self) -> Option<FileChange> {
            match self.receiver.try_recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Wait for next file change event (blocking).
        ///
        /// Returns `None` if the channel is disconnected or the event kind is
        /// not one this wrapper reports.
        pub fn next(&self) -> Option<FileChange> {
            match self.receiver.recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Convert a notify event to a `FileChange`.
        ///
        /// Only the first path of the event is used; renames arrive from
        /// notify as `Modify` events and are therefore reported as `Modified`.
        fn event_to_change(&self, event: Event) -> Option<FileChange> {
            let path = event.paths.first()?.clone();

            match event.kind {
                EventKind::Create(_) => Some(FileChange::Created(path)),
                EventKind::Modify(_) => Some(FileChange::Modified(path)),
                EventKind::Remove(_) => Some(FileChange::Deleted(path)),
                _ => None,
            }
        }

        /// Stop watching
        pub fn stop(mut self) -> Result<(), notify::Error> {
            self.watcher.unwatch(&self.root_path)
        }
    }
}

/// Compute a simple hash for change detection
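///
/// Deterministic within a given build, but `DefaultHasher`'s algorithm is not
/// guaranteed stable across Rust releases, so persisted hashes may all
/// mismatch (triggering benign rescans) after a toolchain upgrade. The
/// doctest path assumes this module is exported as
/// `infiniloom_engine::incremental`:
///
/// ```
/// # use infiniloom_engine::incremental::hash_content;
/// assert_eq!(hash_content(b"abc"), hash_content(b"abc"));
/// assert_ne!(hash_content(b"abc"), hash_content(b"abd"));
/// ```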
pub fn hash_content(content: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    content.hash(&mut hasher);
    hasher.finish()
}

/// Get file modification time as Unix timestamp
pub fn get_mtime(path: &Path) -> Option<u64> {
    path.metadata()
        .ok()?
        .modified()
        .ok()?
        .duration_since(SystemTime::UNIX_EPOCH)
        .ok()
        .map(|d| d.as_secs())
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[test]
    fn test_cache_create_save_load() {
        let temp = TempDir::new().unwrap();
        let cache_path = temp.path().join("test.cache");

        let mut cache = RepoCache::new("/test/repo");
        cache.files.insert(
            "test.py".to_string(),
            CachedFile {
                path: "test.py".to_string(),
                mtime: 12345,
                size: 100,
                hash: 0,
                tokens: TokenCounts {
                    o200k: 45,
                    cl100k: 48,
                    claude: 50,
                    gemini: 46,
                    llama: 50,
                    mistral: 50,
                    deepseek: 50,
                    qwen: 50,
                    cohere: 48,
                    grok: 50,
                },
                symbols: vec![],
                symbols_extracted: false,
                language: Some("python".to_string()),
                lines: 10,
            },
        );

        cache.save(&cache_path).unwrap();

        let loaded = RepoCache::load(&cache_path).unwrap();
        assert_eq!(loaded.files.len(), 1);
        assert!(loaded.files.contains_key("test.py"));
    }

    #[test]
    fn test_needs_rescan() {
        let cache = RepoCache::new("/test");
        assert!(cache.needs_rescan("new_file.py", 0, 0));

        let mut cache = RepoCache::new("/test");
        cache.files.insert(
            "existing.py".to_string(),
            CachedFile {
                path: "existing.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: 0,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        assert!(!cache.needs_rescan("existing.py", 1000, 500));
        assert!(cache.needs_rescan("existing.py", 2000, 500)); // mtime changed
        assert!(cache.needs_rescan("existing.py", 1000, 600)); // size changed
    }

    #[test]
    fn test_incremental_scanner() {
        let temp = TempDir::new().unwrap();

        let mut scanner = IncrementalScanner::new(temp.path());
        assert!(scanner.needs_rescan(&temp.path().join("test.py")));

        scanner.update(CachedFile {
            path: "test.py".to_string(),
            mtime: 1000,
            size: 100,
            hash: 0,
            tokens: TokenCounts::default(),
            symbols: vec![],
            symbols_extracted: false,
            language: Some("python".to_string()),
            lines: 5,
        });

        assert!(scanner.get_cached("test.py").is_some());
    }

    #[test]
    fn test_hash_content() {
        let h1 = hash_content(b"hello world");
        let h2 = hash_content(b"hello world");
        let h3 = hash_content(b"different");

        assert_eq!(h1, h2);
        assert_ne!(h1, h3);
    }

    #[test]
    fn test_needs_rescan_with_hash() {
        let mut cache = RepoCache::new("/test");
        let original_hash = hash_content(b"original content");
        let modified_hash = hash_content(b"modified content");

        cache.files.insert(
            "file.py".to_string(),
            CachedFile {
                path: "file.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: original_hash,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        // Same mtime/size/hash - no rescan needed
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, original_hash));

        // Same mtime/size but different hash - rescan needed
        assert!(cache.needs_rescan_with_hash("file.py", 1000, 500, modified_hash));

        // Different mtime - rescan needed regardless of hash
        assert!(cache.needs_rescan_with_hash("file.py", 2000, 500, original_hash));

        // Hash of 0 is ignored (backwards compatibility)
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, 0));
    }
}
622}