// infiniloom_engine/incremental.rs
1//! Incremental scanning with file watching and caching
2//!
3//! Provides efficient re-scanning by caching results and only processing changed files.
4
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::time::SystemTime;
10use thiserror::Error;
11
12use crate::bincode_safe::{deserialize_with_limit, serialize};
13use crate::tokenizer::TokenCounts;
14use crate::types::Symbol;
15
/// Cache entry for a single file
///
/// NOTE(review): serialized via `crate::bincode_safe` (presumably bincode), in
/// which case field order is part of the on-disk format — reordering fields
/// would require bumping `RepoCache::VERSION`. Confirm against `bincode_safe`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedFile {
    /// Relative path
    pub path: String,
    /// Last modified time (Unix timestamp)
    pub mtime: u64,
    /// File size in bytes
    pub size: u64,
    /// Content hash (for change detection); 0 means "not computed" and is
    /// ignored by `RepoCache::needs_rescan_with_hash`
    pub hash: u64,
    /// Token counts
    pub tokens: TokenCounts,
    /// Extracted symbols
    pub symbols: Vec<CachedSymbol>,
    /// Whether symbols were extracted for this file
    pub symbols_extracted: bool,
    /// Detected language
    pub language: Option<String>,
    /// Line count
    pub lines: usize,
}
38
/// Cached symbol (simplified for storage)
///
/// Lossy projection of `Symbol`: only identity and location are persisted;
/// everything else is reset to neutral defaults on conversion back.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedSymbol {
    /// Symbol identifier as extracted from source
    pub name: String,
    /// Symbol kind name (string form of `SymbolKind`)
    pub kind: String,
    /// 1st line of the symbol's definition (as stored by the extractor)
    pub start_line: u32,
    /// Last line of the symbol's definition
    pub end_line: u32,
    /// Optional signature text, if one was extracted
    pub signature: Option<String>,
}
48
49impl From<&Symbol> for CachedSymbol {
50    fn from(s: &Symbol) -> Self {
51        Self {
52            name: s.name.clone(),
53            kind: s.kind.name().to_owned(),
54            start_line: s.start_line,
55            end_line: s.end_line,
56            signature: s.signature.clone(),
57        }
58    }
59}
60
61impl From<&CachedSymbol> for Symbol {
62    fn from(s: &CachedSymbol) -> Self {
63        use crate::types::{SymbolKind, Visibility};
64        Self {
65            name: s.name.clone(),
66            kind: SymbolKind::from_str(&s.kind).unwrap_or(SymbolKind::Variable),
67            start_line: s.start_line,
68            end_line: s.end_line,
69            signature: s.signature.clone(),
70            docstring: None,
71            visibility: Visibility::Public,
72            references: 0,
73            importance: 0.5,
74            parent: None,
75            calls: Vec::new(),
76            extends: None,
77            implements: Vec::new(),
78        }
79    }
80}
81
/// Repository cache
///
/// Persisted snapshot of a scanned repository. `version` is checked on load
/// and a mismatch rejects the whole cache (see `RepoCache::load`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepoCache {
    /// Cache version (for compatibility)
    pub version: u32,
    /// Repository root path
    pub root_path: String,
    /// Cache creation time (Unix seconds)
    pub created_at: u64,
    /// Last update time (Unix seconds); refreshed by `update_file`
    pub updated_at: u64,
    /// Cached files, keyed by the same path string stored in `CachedFile::path`
    pub files: HashMap<String, CachedFile>,
    /// Total token count (recomputed by `recalculate_totals`)
    pub total_tokens: TokenCounts,
    /// External dependencies detected
    pub external_deps: Vec<String>,
}
100
101impl RepoCache {
102    /// Current cache version
103    pub const VERSION: u32 = 4;
104
105    /// Create a new empty cache
106    pub fn new(root_path: &str) -> Self {
107        let now = SystemTime::now()
108            .duration_since(SystemTime::UNIX_EPOCH)
109            .map(|d| d.as_secs())
110            .unwrap_or(0);
111
112        Self {
113            version: Self::VERSION,
114            root_path: root_path.to_owned(),
115            created_at: now,
116            updated_at: now,
117            files: HashMap::new(),
118            total_tokens: TokenCounts::default(),
119            external_deps: Vec::new(),
120        }
121    }
122
123    /// Load cache from file
124    pub fn load(cache_path: &Path) -> Result<Self, CacheError> {
125        let content = fs::read(cache_path).map_err(|e| CacheError::IoError(e.to_string()))?;
126
127        let cache: Self = deserialize_with_limit(&content)
128            .map_err(|e| CacheError::DeserializeError(e.to_string()))?;
129
130        // Check version compatibility
131        if cache.version != Self::VERSION {
132            return Err(CacheError::VersionMismatch {
133                expected: Self::VERSION,
134                found: cache.version,
135            });
136        }
137
138        Ok(cache)
139    }
140
141    /// Save cache to file atomically (write to temp file, then rename).
142    pub fn save(&self, cache_path: &Path) -> Result<(), CacheError> {
143        // Ensure parent directory exists
144        if let Some(parent) = cache_path.parent() {
145            fs::create_dir_all(parent).map_err(|e| CacheError::IoError(e.to_string()))?;
146        }
147
148        let content = serialize(self).map_err(|e| CacheError::SerializeError(e.to_string()))?;
149
150        // Write to temp file first for atomicity (prevents corruption on crash)
151        let tmp_path = cache_path.with_extension("tmp");
152        fs::write(&tmp_path, content).map_err(|e| CacheError::IoError(e.to_string()))?;
153        fs::rename(&tmp_path, cache_path).map_err(|e| CacheError::IoError(e.to_string()))?;
154
155        Ok(())
156    }
157
158    /// Get default cache path for a repository
159    pub fn default_cache_path(repo_path: &Path) -> PathBuf {
160        repo_path.join(".infiniloom/cache/repo.cache")
161    }
162
163    /// Check if a file needs rescanning based on mtime and size
164    pub fn needs_rescan(&self, path: &str, current_mtime: u64, current_size: u64) -> bool {
165        match self.files.get(path) {
166            Some(cached) => cached.mtime != current_mtime || cached.size != current_size,
167            None => true,
168        }
169    }
170
171    /// Check if a file needs rescanning, including content hash comparison
172    /// This catches changes that don't modify mtime/size (e.g., touch followed by edit)
173    pub fn needs_rescan_with_hash(
174        &self,
175        path: &str,
176        current_mtime: u64,
177        current_size: u64,
178        current_hash: u64,
179    ) -> bool {
180        match self.files.get(path) {
181            Some(cached) => {
182                cached.mtime != current_mtime
183                    || cached.size != current_size
184                    || (cached.hash != 0 && current_hash != 0 && cached.hash != current_hash)
185            },
186            None => true,
187        }
188    }
189
190    /// Get a cached file by path
191    pub fn get(&self, path: &str) -> Option<&CachedFile> {
192        self.files.get(path)
193    }
194
195    /// Add or update a file in the cache
196    pub fn update_file(&mut self, file: CachedFile) {
197        self.files.insert(file.path.clone(), file);
198        self.updated_at = SystemTime::now()
199            .duration_since(SystemTime::UNIX_EPOCH)
200            .map(|d| d.as_secs())
201            .unwrap_or(0);
202    }
203
204    /// Remove a file from the cache
205    pub fn remove_file(&mut self, path: &str) {
206        self.files.remove(path);
207    }
208
209    /// Get files that no longer exist
210    pub fn find_deleted_files(&self, current_files: &[&str]) -> Vec<String> {
211        let current_set: std::collections::HashSet<&str> = current_files.iter().copied().collect();
212        self.files
213            .keys()
214            .filter(|p| !current_set.contains(p.as_str()))
215            .cloned()
216            .collect()
217    }
218
219    /// Recalculate total tokens
220    pub fn recalculate_totals(&mut self) {
221        self.total_tokens = self.files.values().map(|f| f.tokens).sum();
222    }
223
224    /// Get cache statistics
225    pub fn stats(&self) -> CacheStats {
226        CacheStats {
227            file_count: self.files.len(),
228            total_tokens: self.total_tokens,
229            total_bytes: self.files.values().map(|f| f.size).sum(),
230            age_seconds: SystemTime::now()
231                .duration_since(SystemTime::UNIX_EPOCH)
232                .map(|d| d.as_secs())
233                .unwrap_or(0)
234                .saturating_sub(self.updated_at),
235        }
236    }
237}
238
/// Cache statistics
///
/// Snapshot produced by `RepoCache::stats`; not persisted.
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Number of cached files
    pub file_count: usize,
    /// Aggregate token counts across all cached files
    pub total_tokens: TokenCounts,
    /// Sum of cached file sizes in bytes
    pub total_bytes: u64,
    /// Seconds elapsed since the cache was last updated
    pub age_seconds: u64,
}
247
/// Cache errors
#[derive(Debug, Error)]
pub enum CacheError {
    /// Filesystem read/write/rename failure (stringified `std::io::Error`).
    #[error("I/O error: {0}")]
    IoError(String),
    /// Encoding the cache for storage failed.
    #[error("Serialization error: {0}")]
    SerializeError(String),
    /// Decoding a stored cache failed (corrupt or truncated data).
    #[error("Deserialization error: {0}. Try clearing the cache by deleting `.infiniloom/cache/` and re-running.")]
    DeserializeError(String),
    /// Cache was written by a different `RepoCache::VERSION`.
    #[error("Cache version mismatch: expected {expected}, found {found}")]
    VersionMismatch { expected: u32, found: u32 },
}
260
/// Incremental scanner that uses caching
pub struct IncrementalScanner {
    // In-memory cache state for the repository.
    cache: RepoCache,
    // Where the cache is persisted on `save`/`force_save`.
    cache_path: PathBuf,
    // True when the in-memory cache differs from what is on disk;
    // `save` is a no-op while this is false.
    dirty: bool,
}
267
268impl IncrementalScanner {
269    /// Create or load an incremental scanner for a repository
270    pub fn new(repo_path: &Path) -> Self {
271        let cache_path = RepoCache::default_cache_path(repo_path);
272
273        let cache = RepoCache::load(&cache_path)
274            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));
275
276        Self { cache, cache_path, dirty: false }
277    }
278
279    /// Create with custom cache path
280    pub fn with_cache_path(repo_path: &Path, cache_path: PathBuf) -> Self {
281        let cache = RepoCache::load(&cache_path)
282            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));
283
284        Self { cache, cache_path, dirty: false }
285    }
286
287    /// Check if a file needs to be rescanned (fast check using mtime/size only)
288    pub fn needs_rescan(&self, path: &Path) -> bool {
289        let metadata = match path.metadata() {
290            Ok(m) => m,
291            Err(_) => return true,
292        };
293
294        let mtime = metadata
295            .modified()
296            .ok()
297            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
298            .map_or(0, |d| d.as_secs());
299
300        let relative_path = path.to_string_lossy();
301        self.cache
302            .needs_rescan(&relative_path, mtime, metadata.len())
303    }
304
305    /// Check if a file needs to be rescanned, including content hash check
306    /// This is more accurate but requires reading the file content
307    pub fn needs_rescan_with_content(&self, path: &Path, content: &[u8]) -> bool {
308        let metadata = match path.metadata() {
309            Ok(m) => m,
310            Err(_) => return true,
311        };
312
313        let mtime = metadata
314            .modified()
315            .ok()
316            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
317            .map_or(0, |d| d.as_secs());
318
319        let content_hash = hash_content(content);
320        let relative_path = path.to_string_lossy();
321        self.cache
322            .needs_rescan_with_hash(&relative_path, mtime, metadata.len(), content_hash)
323    }
324
325    /// Get cached file if available and up-to-date
326    pub fn get_cached(&self, path: &str) -> Option<&CachedFile> {
327        self.cache.files.get(path)
328    }
329
330    /// Update cache with new file data
331    pub fn update(&mut self, file: CachedFile) {
332        self.cache.update_file(file);
333        self.dirty = true;
334    }
335
336    /// Remove a deleted file from cache
337    pub fn remove(&mut self, path: &str) {
338        self.cache.remove_file(path);
339        self.dirty = true;
340    }
341
342    /// Save cache if modified
343    pub fn save(&mut self) -> Result<(), CacheError> {
344        if self.dirty {
345            self.cache.recalculate_totals();
346            self.cache.save(&self.cache_path)?;
347            self.dirty = false;
348        }
349        Ok(())
350    }
351
352    /// Force save cache
353    pub fn force_save(&mut self) -> Result<(), CacheError> {
354        self.cache.recalculate_totals();
355        self.cache.save(&self.cache_path)?;
356        self.dirty = false;
357        Ok(())
358    }
359
360    /// Get cache statistics
361    pub fn stats(&self) -> CacheStats {
362        self.cache.stats()
363    }
364
365    /// Clear the cache
366    pub fn clear(&mut self) {
367        self.cache = RepoCache::new(&self.cache.root_path);
368        self.dirty = true;
369    }
370
371    /// Get list of changed files compared to current state
372    pub fn get_changed_files<'a>(
373        &self,
374        current_files: &'a [(PathBuf, u64, u64)],
375    ) -> Vec<&'a PathBuf> {
376        current_files
377            .iter()
378            .filter(|(path, mtime, size)| {
379                let relative = path.to_string_lossy();
380                self.cache.needs_rescan(&relative, *mtime, *size)
381            })
382            .map(|(path, _, _)| path)
383            .collect()
384    }
385}
386
/// File change event for watching
#[derive(Debug, Clone)]
pub enum FileChange {
    /// A new file appeared at the path.
    Created(PathBuf),
    /// An existing file was modified.
    Modified(PathBuf),
    /// A file was removed.
    Deleted(PathBuf),
    /// A file moved from one path to another.
    /// NOTE(review): never produced by `watcher::FileWatcher::event_to_change`
    /// (its match has no rename arm) — kept for external constructors.
    Renamed { from: PathBuf, to: PathBuf },
}
395
/// File watcher using notify crate (when 'watch' feature enabled)
#[cfg(feature = "watch")]
pub mod watcher {
    use super::*;
    use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher};
    use std::sync::mpsc::{channel, Receiver};

    /// File system watcher for incremental updates
    ///
    /// Wraps a `notify::RecommendedWatcher` whose callback forwards raw events
    /// into an mpsc channel; callers drain them as [`FileChange`] values via
    /// `try_next`/`next`.
    pub struct FileWatcher {
        // Held to keep the watch alive; also used by `stop` to unwatch.
        watcher: RecommendedWatcher,
        // Receiving end of the channel the notify callback sends into.
        receiver: Receiver<Result<Event, notify::Error>>,
        // Root registered for recursive watching (needed by `stop`).
        root_path: PathBuf,
    }

    impl FileWatcher {
        /// Create a new file watcher for a directory (recursive).
        ///
        /// Send errors from the callback are ignored: if the receiver is gone,
        /// there is no one left to notify.
        pub fn new(path: &Path) -> Result<Self, notify::Error> {
            let (tx, rx) = channel();

            let watcher = RecommendedWatcher::new(
                move |res| {
                    let _ = tx.send(res);
                },
                Config::default(),
            )?;

            let mut fw = Self { watcher, receiver: rx, root_path: path.to_path_buf() };

            fw.watcher.watch(path, RecursiveMode::Recursive)?;

            Ok(fw)
        }

        /// Get next file change event (non-blocking).
        /// Returns `None` when the queue is empty, on a watcher error, or for
        /// event kinds this module does not translate.
        pub fn try_next(&self) -> Option<FileChange> {
            match self.receiver.try_recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Wait for next file change event (blocking).
        /// Returns `None` if the channel is disconnected or the event is
        /// an error/untranslated kind.
        pub fn next(&self) -> Option<FileChange> {
            match self.receiver.recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Convert notify event to FileChange.
        ///
        /// Only the first path of a multi-path event is used; create/modify/
        /// remove kinds map directly, everything else (including renames) is
        /// dropped — `FileChange::Renamed` is never produced here.
        fn event_to_change(&self, event: Event) -> Option<FileChange> {
            let path = event.paths.first()?.clone();

            match event.kind {
                EventKind::Create(_) => Some(FileChange::Created(path)),
                EventKind::Modify(_) => Some(FileChange::Modified(path)),
                EventKind::Remove(_) => Some(FileChange::Deleted(path)),
                _ => None,
            }
        }

        /// Stop watching the root path, consuming the watcher.
        pub fn stop(mut self) -> Result<(), notify::Error> {
            self.watcher.unwatch(&self.root_path)
        }
    }
}
463
464/// Compute a cryptographic hash for change detection
465///
466/// Uses BLAKE3 for collision resistance (truncated to 64 bits for API compatibility).
467/// This is significantly more collision-resistant than DefaultHasher (SipHash-1-3)
468/// which has known collision attacks.
469pub fn hash_content(content: &[u8]) -> u64 {
470    let hash = blake3::hash(content);
471    let bytes = hash.as_bytes();
472    // Take first 8 bytes as u64 (little-endian)
473    u64::from_le_bytes([
474        bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
475    ])
476}
477
/// Get file modification time as Unix timestamp
///
/// Returns `None` if the file's metadata or mtime cannot be read, or if the
/// mtime is before the Unix epoch.
pub fn get_mtime(path: &Path) -> Option<u64> {
    let modified = fs::metadata(path).ok()?.modified().ok()?;
    modified
        .duration_since(SystemTime::UNIX_EPOCH)
        .ok()
        .map(|d| d.as_secs())
}
488
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // Round-trip: a cache with one entry survives save + load intact.
    #[test]
    fn test_cache_create_save_load() {
        let temp = TempDir::new().unwrap();
        let cache_path = temp.path().join("test.cache");

        let mut cache = RepoCache::new("/test/repo");
        cache.files.insert(
            "test.py".to_string(),
            CachedFile {
                path: "test.py".to_string(),
                mtime: 12345,
                size: 100,
                hash: 0,
                // Distinct per-tokenizer counts so a field mix-up during
                // (de)serialization would be detectable.
                tokens: TokenCounts {
                    o200k: 45,
                    cl100k: 48,
                    claude: 50,
                    gemini: 46,
                    llama: 50,
                    mistral: 50,
                    deepseek: 50,
                    qwen: 50,
                    cohere: 48,
                    grok: 50,
                },
                symbols: vec![],
                symbols_extracted: false,
                language: Some("python".to_string()),
                lines: 10,
            },
        );

        cache.save(&cache_path).unwrap();

        let loaded = RepoCache::load(&cache_path).unwrap();
        assert_eq!(loaded.files.len(), 1);
        assert!(loaded.files.contains_key("test.py"));
    }

    // mtime/size comparison: unknown paths rescan; any metadata change rescans.
    #[test]
    fn test_needs_rescan() {
        let cache = RepoCache::new("/test");
        assert!(cache.needs_rescan("new_file.py", 0, 0));

        let mut cache = RepoCache::new("/test");
        cache.files.insert(
            "existing.py".to_string(),
            CachedFile {
                path: "existing.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: 0,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        assert!(!cache.needs_rescan("existing.py", 1000, 500));
        assert!(cache.needs_rescan("existing.py", 2000, 500)); // mtime changed
        assert!(cache.needs_rescan("existing.py", 1000, 600)); // size changed
    }

    // Scanner wrapper: unseen file needs rescan; update() makes it retrievable.
    #[test]
    fn test_incremental_scanner() {
        let temp = TempDir::new().unwrap();

        let mut scanner = IncrementalScanner::new(temp.path());
        assert!(scanner.needs_rescan(&temp.path().join("test.py")));

        scanner.update(CachedFile {
            path: "test.py".to_string(),
            mtime: 1000,
            size: 100,
            hash: 0,
            tokens: TokenCounts::default(),
            symbols: vec![],
            symbols_extracted: false,
            language: Some("python".to_string()),
            lines: 5,
        });

        assert!(scanner.get_cached("test.py").is_some());
    }

    // Hash is deterministic and input-sensitive.
    #[test]
    fn test_hash_content() {
        let h1 = hash_content(b"hello world");
        let h2 = hash_content(b"hello world");
        let h3 = hash_content(b"different");

        assert_eq!(h1, h2);
        assert_ne!(h1, h3);
    }

    // Hash-aware rescan: hash differences force a rescan even when mtime and
    // size match; a zero hash on either side disables the hash comparison.
    #[test]
    fn test_needs_rescan_with_hash() {
        let mut cache = RepoCache::new("/test");
        let original_hash = hash_content(b"original content");
        let modified_hash = hash_content(b"modified content");

        cache.files.insert(
            "file.py".to_string(),
            CachedFile {
                path: "file.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: original_hash,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        // Same mtime/size/hash - no rescan needed
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, original_hash));

        // Same mtime/size but different hash - rescan needed
        assert!(cache.needs_rescan_with_hash("file.py", 1000, 500, modified_hash));

        // Different mtime - rescan needed regardless of hash
        assert!(cache.needs_rescan_with_hash("file.py", 2000, 500, original_hash));

        // Hash of 0 is ignored (backwards compatibility)
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, 0));
    }
}