// infiniloom_engine/incremental.rs

//! Incremental scanning with file watching and caching
//!
//! Provides efficient re-scanning by caching results and only processing changed files.
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::SystemTime;

use bincode::Options;
use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::bincode_safe::deserialize_with_limit;
use crate::tokenizer::TokenCounts;
use crate::types::Symbol;

17/// Cache entry for a single file
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct CachedFile {
20    /// Relative path
21    pub path: String,
22    /// Last modified time (Unix timestamp)
23    pub mtime: u64,
24    /// File size in bytes
25    pub size: u64,
26    /// Content hash (for change detection)
27    pub hash: u64,
28    /// Token counts
29    pub tokens: TokenCounts,
30    /// Extracted symbols
31    pub symbols: Vec<CachedSymbol>,
32    /// Whether symbols were extracted for this file
33    pub symbols_extracted: bool,
34    /// Detected language
35    pub language: Option<String>,
36    /// Line count
37    pub lines: usize,
38}
39
40/// Cached symbol (simplified for storage)
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct CachedSymbol {
43    pub name: String,
44    pub kind: String,
45    pub start_line: u32,
46    pub end_line: u32,
47    pub signature: Option<String>,
48}
49
50impl From<&Symbol> for CachedSymbol {
51    fn from(s: &Symbol) -> Self {
52        Self {
53            name: s.name.clone(),
54            kind: s.kind.name().to_owned(),
55            start_line: s.start_line,
56            end_line: s.end_line,
57            signature: s.signature.clone(),
58        }
59    }
60}
61
62impl From<&CachedSymbol> for Symbol {
63    fn from(s: &CachedSymbol) -> Self {
64        use crate::types::{SymbolKind, Visibility};
65        Self {
66            name: s.name.clone(),
67            kind: SymbolKind::from_str(&s.kind).unwrap_or(SymbolKind::Variable),
68            start_line: s.start_line,
69            end_line: s.end_line,
70            signature: s.signature.clone(),
71            docstring: None,
72            visibility: Visibility::Public,
73            references: 0,
74            importance: 0.5,
75            parent: None,
76            calls: Vec::new(),
77            extends: None,
78            implements: Vec::new(),
79        }
80    }
81}
82
83/// Repository cache
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct RepoCache {
86    /// Cache version (for compatibility)
87    pub version: u32,
88    /// Repository root path
89    pub root_path: String,
90    /// Cache creation time
91    pub created_at: u64,
92    /// Last update time
93    pub updated_at: u64,
94    /// Cached files
95    pub files: HashMap<String, CachedFile>,
96    /// Total token count
97    pub total_tokens: TokenCounts,
98    /// External dependencies detected
99    pub external_deps: Vec<String>,
100}
101
102impl RepoCache {
103    /// Current cache version
104    pub const VERSION: u32 = 2;
105
106    /// Create a new empty cache
107    pub fn new(root_path: &str) -> Self {
108        let now = SystemTime::now()
109            .duration_since(SystemTime::UNIX_EPOCH)
110            .map(|d| d.as_secs())
111            .unwrap_or(0);
112
113        Self {
114            version: Self::VERSION,
115            root_path: root_path.to_owned(),
116            created_at: now,
117            updated_at: now,
118            files: HashMap::new(),
119            total_tokens: TokenCounts::default(),
120            external_deps: Vec::new(),
121        }
122    }
123
124    /// Load cache from file
125    pub fn load(cache_path: &Path) -> Result<Self, CacheError> {
126        let content = fs::read(cache_path).map_err(|e| CacheError::IoError(e.to_string()))?;
127
128        let cache: Self = deserialize_with_limit(&content)
129            .map_err(|e| CacheError::DeserializeError(e.to_string()))?;
130
131        // Check version compatibility
132        if cache.version != Self::VERSION {
133            return Err(CacheError::VersionMismatch {
134                expected: Self::VERSION,
135                found: cache.version,
136            });
137        }
138
139        Ok(cache)
140    }
141
142    /// Save cache to file
143    pub fn save(&self, cache_path: &Path) -> Result<(), CacheError> {
144        // Ensure parent directory exists
145        if let Some(parent) = cache_path.parent() {
146            fs::create_dir_all(parent).map_err(|e| CacheError::IoError(e.to_string()))?;
147        }
148
149        // Note: Must use bincode::options() to match deserialize_with_limit() in load()
150        let content = bincode::options()
151            .serialize(self)
152            .map_err(|e| CacheError::SerializeError(e.to_string()))?;
153
154        fs::write(cache_path, content).map_err(|e| CacheError::IoError(e.to_string()))?;
155
156        Ok(())
157    }
158
159    /// Get default cache path for a repository
160    pub fn default_cache_path(repo_path: &Path) -> PathBuf {
161        repo_path.join(".infiniloom/cache/repo.cache")
162    }
163
164    /// Check if a file needs rescanning based on mtime and size
165    pub fn needs_rescan(&self, path: &str, current_mtime: u64, current_size: u64) -> bool {
166        match self.files.get(path) {
167            Some(cached) => cached.mtime != current_mtime || cached.size != current_size,
168            None => true,
169        }
170    }
171
172    /// Check if a file needs rescanning, including content hash comparison
173    /// This catches changes that don't modify mtime/size (e.g., touch followed by edit)
174    pub fn needs_rescan_with_hash(
175        &self,
176        path: &str,
177        current_mtime: u64,
178        current_size: u64,
179        current_hash: u64,
180    ) -> bool {
181        match self.files.get(path) {
182            Some(cached) => {
183                cached.mtime != current_mtime
184                    || cached.size != current_size
185                    || (cached.hash != 0 && current_hash != 0 && cached.hash != current_hash)
186            },
187            None => true,
188        }
189    }
190
191    /// Get a cached file by path
192    pub fn get(&self, path: &str) -> Option<&CachedFile> {
193        self.files.get(path)
194    }
195
196    /// Add or update a file in the cache
197    pub fn update_file(&mut self, file: CachedFile) {
198        self.files.insert(file.path.clone(), file);
199        self.updated_at = SystemTime::now()
200            .duration_since(SystemTime::UNIX_EPOCH)
201            .map(|d| d.as_secs())
202            .unwrap_or(0);
203    }
204
205    /// Remove a file from the cache
206    pub fn remove_file(&mut self, path: &str) {
207        self.files.remove(path);
208    }
209
210    /// Get files that no longer exist
211    pub fn find_deleted_files(&self, current_files: &[&str]) -> Vec<String> {
212        let current_set: std::collections::HashSet<&str> = current_files.iter().copied().collect();
213        self.files
214            .keys()
215            .filter(|p| !current_set.contains(p.as_str()))
216            .cloned()
217            .collect()
218    }
219
220    /// Recalculate total tokens
221    pub fn recalculate_totals(&mut self) {
222        self.total_tokens = self.files.values().map(|f| f.tokens).sum();
223    }
224
225    /// Get cache statistics
226    pub fn stats(&self) -> CacheStats {
227        CacheStats {
228            file_count: self.files.len(),
229            total_tokens: self.total_tokens,
230            total_bytes: self.files.values().map(|f| f.size).sum(),
231            age_seconds: SystemTime::now()
232                .duration_since(SystemTime::UNIX_EPOCH)
233                .map(|d| d.as_secs())
234                .unwrap_or(0)
235                .saturating_sub(self.updated_at),
236        }
237    }
238}
239
240/// Cache statistics
241#[derive(Debug, Clone)]
242pub struct CacheStats {
243    pub file_count: usize,
244    pub total_tokens: TokenCounts,
245    pub total_bytes: u64,
246    pub age_seconds: u64,
247}
248
249/// Cache errors
250#[derive(Debug, Error)]
251pub enum CacheError {
252    #[error("I/O error: {0}")]
253    IoError(String),
254    #[error("Serialization error: {0}")]
255    SerializeError(String),
256    #[error("Deserialization error: {0}")]
257    DeserializeError(String),
258    #[error("Cache version mismatch: expected {expected}, found {found}")]
259    VersionMismatch { expected: u32, found: u32 },
260}
261
262/// Incremental scanner that uses caching
263pub struct IncrementalScanner {
264    cache: RepoCache,
265    cache_path: PathBuf,
266    dirty: bool,
267}
268
269impl IncrementalScanner {
270    /// Create or load an incremental scanner for a repository
271    pub fn new(repo_path: &Path) -> Self {
272        let cache_path = RepoCache::default_cache_path(repo_path);
273
274        let cache = RepoCache::load(&cache_path)
275            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));
276
277        Self { cache, cache_path, dirty: false }
278    }
279
280    /// Create with custom cache path
281    pub fn with_cache_path(repo_path: &Path, cache_path: PathBuf) -> Self {
282        let cache = RepoCache::load(&cache_path)
283            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));
284
285        Self { cache, cache_path, dirty: false }
286    }
287
288    /// Check if a file needs to be rescanned (fast check using mtime/size only)
289    pub fn needs_rescan(&self, path: &Path) -> bool {
290        let metadata = match path.metadata() {
291            Ok(m) => m,
292            Err(_) => return true,
293        };
294
295        let mtime = metadata
296            .modified()
297            .ok()
298            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
299            .map_or(0, |d| d.as_secs());
300
301        let relative_path = path.to_string_lossy();
302        self.cache
303            .needs_rescan(&relative_path, mtime, metadata.len())
304    }
305
306    /// Check if a file needs to be rescanned, including content hash check
307    /// This is more accurate but requires reading the file content
308    pub fn needs_rescan_with_content(&self, path: &Path, content: &[u8]) -> bool {
309        let metadata = match path.metadata() {
310            Ok(m) => m,
311            Err(_) => return true,
312        };
313
314        let mtime = metadata
315            .modified()
316            .ok()
317            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
318            .map_or(0, |d| d.as_secs());
319
320        let content_hash = hash_content(content);
321        let relative_path = path.to_string_lossy();
322        self.cache
323            .needs_rescan_with_hash(&relative_path, mtime, metadata.len(), content_hash)
324    }
325
326    /// Get cached file if available and up-to-date
327    pub fn get_cached(&self, path: &str) -> Option<&CachedFile> {
328        self.cache.files.get(path)
329    }
330
331    /// Update cache with new file data
332    pub fn update(&mut self, file: CachedFile) {
333        self.cache.update_file(file);
334        self.dirty = true;
335    }
336
337    /// Remove a deleted file from cache
338    pub fn remove(&mut self, path: &str) {
339        self.cache.remove_file(path);
340        self.dirty = true;
341    }
342
343    /// Save cache if modified
344    pub fn save(&mut self) -> Result<(), CacheError> {
345        if self.dirty {
346            self.cache.recalculate_totals();
347            self.cache.save(&self.cache_path)?;
348            self.dirty = false;
349        }
350        Ok(())
351    }
352
353    /// Force save cache
354    pub fn force_save(&mut self) -> Result<(), CacheError> {
355        self.cache.recalculate_totals();
356        self.cache.save(&self.cache_path)?;
357        self.dirty = false;
358        Ok(())
359    }
360
361    /// Get cache statistics
362    pub fn stats(&self) -> CacheStats {
363        self.cache.stats()
364    }
365
366    /// Clear the cache
367    pub fn clear(&mut self) {
368        self.cache = RepoCache::new(&self.cache.root_path);
369        self.dirty = true;
370    }
371
372    /// Get list of changed files compared to current state
373    pub fn get_changed_files<'a>(
374        &self,
375        current_files: &'a [(PathBuf, u64, u64)],
376    ) -> Vec<&'a PathBuf> {
377        current_files
378            .iter()
379            .filter(|(path, mtime, size)| {
380                let relative = path.to_string_lossy();
381                self.cache.needs_rescan(&relative, *mtime, *size)
382            })
383            .map(|(path, _, _)| path)
384            .collect()
385    }
386}
387
/// A single file-system change observed while watching a repository.
#[derive(Debug, Clone)]
pub enum FileChange {
    /// A new file appeared
    Created(PathBuf),
    /// An existing file's contents changed
    Modified(PathBuf),
    /// A file was removed
    Deleted(PathBuf),
    /// A file moved from one path to another
    Renamed { from: PathBuf, to: PathBuf },
}

/// File watcher built on the `notify` crate (enabled by the `watch` feature).
#[cfg(feature = "watch")]
pub mod watcher {
    use super::*;
    use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher};
    use std::sync::mpsc::{channel, Receiver};

    /// Watches a directory tree and surfaces [`FileChange`] events.
    pub struct FileWatcher {
        watcher: RecommendedWatcher,
        receiver: Receiver<Result<Event, notify::Error>>,
        root_path: PathBuf,
    }

    impl FileWatcher {
        /// Start watching `path` recursively.
        pub fn new(path: &Path) -> Result<Self, notify::Error> {
            let (sender, receiver) = channel();

            let watcher = RecommendedWatcher::new(
                move |result| {
                    // The receiver may already be gone during shutdown;
                    // dropping events at that point is fine.
                    let _ = sender.send(result);
                },
                Config::default(),
            )?;

            let mut this = Self { watcher, receiver, root_path: path.to_path_buf() };
            this.watcher.watch(path, RecursiveMode::Recursive)?;
            Ok(this)
        }

        /// Poll for the next change without blocking.
        pub fn try_next(&self) -> Option<FileChange> {
            match self.receiver.try_recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Block until the next change; returns `None` if the channel is
        /// closed or the event carried an error.
        pub fn next(&self) -> Option<FileChange> {
            match self.receiver.recv() {
                Ok(Ok(event)) => self.event_to_change(event),
                _ => None,
            }
        }

        /// Map a raw notify event onto our [`FileChange`] model.
        ///
        /// NOTE(review): only the event's first path is used and rename
        /// kinds are not mapped, so `FileChange::Renamed` is never produced
        /// here — confirm whether that is intentional.
        fn event_to_change(&self, event: Event) -> Option<FileChange> {
            let path = event.paths.first()?.clone();

            match event.kind {
                EventKind::Create(_) => Some(FileChange::Created(path)),
                EventKind::Modify(_) => Some(FileChange::Modified(path)),
                EventKind::Remove(_) => Some(FileChange::Deleted(path)),
                _ => None,
            }
        }

        /// Stop watching the root path.
        pub fn stop(mut self) -> Result<(), notify::Error> {
            self.watcher.unwatch(&self.root_path)
        }
    }
}

465/// Compute a cryptographic hash for change detection
466///
467/// Uses BLAKE3 for collision resistance (truncated to 64 bits for API compatibility).
468/// This is significantly more collision-resistant than DefaultHasher (SipHash-1-3)
469/// which has known collision attacks.
470pub fn hash_content(content: &[u8]) -> u64 {
471    let hash = blake3::hash(content);
472    let bytes = hash.as_bytes();
473    // Take first 8 bytes as u64 (little-endian)
474    u64::from_le_bytes([
475        bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
476    ])
477}
478
/// Modification time of `path` as Unix seconds.
///
/// Returns `None` when the metadata or mtime cannot be read, or when the
/// mtime predates the Unix epoch.
pub fn get_mtime(path: &Path) -> Option<u64> {
    let modified = path.metadata().ok()?.modified().ok()?;
    let since_epoch = modified.duration_since(SystemTime::UNIX_EPOCH).ok()?;
    Some(since_epoch.as_secs())
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Build a minimal cache entry with default token counts.
    fn entry(path: &str, mtime: u64, size: u64, hash: u64) -> CachedFile {
        CachedFile {
            path: path.to_string(),
            mtime,
            size,
            hash,
            tokens: TokenCounts::default(),
            symbols: vec![],
            symbols_extracted: false,
            language: None,
            lines: 0,
        }
    }

    #[test]
    fn test_cache_create_save_load() {
        let tmp = TempDir::new().unwrap();
        let cache_file = tmp.path().join("test.cache");

        let mut cache = RepoCache::new("/test/repo");
        cache.files.insert(
            "test.py".to_string(),
            CachedFile {
                path: "test.py".to_string(),
                mtime: 12345,
                size: 100,
                hash: 0,
                tokens: TokenCounts {
                    o200k: 45,
                    cl100k: 48,
                    claude: 50,
                    gemini: 46,
                    llama: 50,
                    mistral: 50,
                    deepseek: 50,
                    qwen: 50,
                    cohere: 48,
                    grok: 50,
                },
                symbols: vec![],
                symbols_extracted: false,
                language: Some("python".to_string()),
                lines: 10,
            },
        );

        cache.save(&cache_file).unwrap();

        // Round-trip: what we saved must come back intact.
        let reloaded = RepoCache::load(&cache_file).unwrap();
        assert_eq!(reloaded.files.len(), 1);
        assert!(reloaded.files.contains_key("test.py"));
    }

    #[test]
    fn test_needs_rescan() {
        // Unknown files always need a scan.
        let cache = RepoCache::new("/test");
        assert!(cache.needs_rescan("new_file.py", 0, 0));

        let mut cache = RepoCache::new("/test");
        cache
            .files
            .insert("existing.py".to_string(), entry("existing.py", 1000, 500, 0));

        assert!(!cache.needs_rescan("existing.py", 1000, 500));
        assert!(cache.needs_rescan("existing.py", 2000, 500)); // mtime changed
        assert!(cache.needs_rescan("existing.py", 1000, 600)); // size changed
    }

    #[test]
    fn test_incremental_scanner() {
        let tmp = TempDir::new().unwrap();

        let mut scanner = IncrementalScanner::new(tmp.path());
        assert!(scanner.needs_rescan(&tmp.path().join("test.py")));

        let mut file = entry("test.py", 1000, 100, 0);
        file.language = Some("python".to_string());
        file.lines = 5;
        scanner.update(file);

        assert!(scanner.get_cached("test.py").is_some());
    }

    #[test]
    fn test_hash_content() {
        let a = hash_content(b"hello world");
        let b = hash_content(b"hello world");
        let c = hash_content(b"different");

        assert_eq!(a, b); // deterministic
        assert_ne!(a, c); // input-sensitive
    }

    #[test]
    fn test_needs_rescan_with_hash() {
        let mut cache = RepoCache::new("/test");
        let original_hash = hash_content(b"original content");
        let modified_hash = hash_content(b"modified content");

        cache
            .files
            .insert("file.py".to_string(), entry("file.py", 1000, 500, original_hash));

        // Same mtime/size/hash - no rescan needed
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, original_hash));

        // Same mtime/size but different hash - rescan needed
        assert!(cache.needs_rescan_with_hash("file.py", 1000, 500, modified_hash));

        // Different mtime - rescan needed regardless of hash
        assert!(cache.needs_rescan_with_hash("file.py", 2000, 500, original_hash));

        // Hash of 0 is ignored (backwards compatibility)
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, 0));
    }
}