infiniloom_engine/
incremental.rs

1//! Incremental scanning with file watching and caching
2//!
3//! Provides efficient re-scanning by caching results and only processing changed files.
4
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::time::SystemTime;
10use thiserror::Error;
11
12use crate::tokenizer::TokenCounts;
13use crate::types::Symbol;
14
/// Cache entry for a single file
///
/// NOTE(review): serialized with `bincode`, which encodes struct fields
/// positionally — adding, removing, or reordering fields requires bumping
/// `RepoCache::VERSION` or old cache files will misparse.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedFile {
    /// Relative path (also used as the key in `RepoCache::files`)
    pub path: String,
    /// Last modified time (Unix timestamp, seconds)
    pub mtime: u64,
    /// File size in bytes
    pub size: u64,
    /// Content hash (for change detection); 0 means "not computed" and is
    /// ignored by `RepoCache::needs_rescan_with_hash`
    pub hash: u64,
    /// Token counts
    pub tokens: TokenCounts,
    /// Extracted symbols (empty when `symbols_extracted` is false)
    pub symbols: Vec<CachedSymbol>,
    /// Whether symbols were extracted for this file
    pub symbols_extracted: bool,
    /// Detected language (e.g. "python"), if any
    pub language: Option<String>,
    /// Line count
    pub lines: usize,
}
37
/// Cached symbol (simplified for storage)
///
/// Only the fields needed to rebuild a skeleton `Symbol` are kept; docstring,
/// visibility, references, and relations are dropped and restored with
/// defaults on load (see the `From<&CachedSymbol> for Symbol` impl).
/// Serialized with `bincode` — field order is part of the cache format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedSymbol {
    /// Symbol name
    pub name: String,
    /// Symbol kind as a string (round-tripped through `SymbolKind::from_str`)
    pub kind: String,
    /// First line of the symbol
    pub start_line: u32,
    /// Last line of the symbol
    pub end_line: u32,
    /// Signature text, if one was extracted
    pub signature: Option<String>,
}
47
48impl From<&Symbol> for CachedSymbol {
49    fn from(s: &Symbol) -> Self {
50        Self {
51            name: s.name.clone(),
52            kind: s.kind.name().to_owned(),
53            start_line: s.start_line,
54            end_line: s.end_line,
55            signature: s.signature.clone(),
56        }
57    }
58}
59
impl From<&CachedSymbol> for Symbol {
    /// Rebuild a `Symbol` from its cached form.
    ///
    /// Fields that are not persisted in the cache are restored with neutral
    /// defaults: no docstring/parent/relations, `Visibility::Public`, zero
    /// references, and a mid-range importance of 0.5. An unrecognized kind
    /// string falls back to `SymbolKind::Variable`.
    fn from(s: &CachedSymbol) -> Self {
        use crate::types::{SymbolKind, Visibility};
        Self {
            name: s.name.clone(),
            kind: SymbolKind::from_str(&s.kind).unwrap_or(SymbolKind::Variable),
            start_line: s.start_line,
            end_line: s.end_line,
            signature: s.signature.clone(),
            docstring: None,
            visibility: Visibility::Public,
            references: 0,
            importance: 0.5,
            parent: None,
            calls: Vec::new(),
            extends: None,
            implements: Vec::new(),
        }
    }
}
80
/// Repository cache
///
/// NOTE(review): serialized with `bincode` (positional encoding) — any field
/// change requires bumping `VERSION`; `load` rejects mismatched versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepoCache {
    /// Cache version (for compatibility); must equal `Self::VERSION` on load
    pub version: u32,
    /// Repository root path as given at construction time
    pub root_path: String,
    /// Cache creation time (Unix timestamp, seconds)
    pub created_at: u64,
    /// Last update time (Unix timestamp, seconds), refreshed by `update_file`
    pub updated_at: u64,
    /// Cached files keyed by their `CachedFile::path`
    pub files: HashMap<String, CachedFile>,
    /// Total token count across all files (see `recalculate_totals`)
    pub total_tokens: TokenCounts,
    /// External dependencies detected
    pub external_deps: Vec<String>,
}
99
100impl RepoCache {
101    /// Current cache version
102    pub const VERSION: u32 = 2;
103
104    /// Create a new empty cache
105    pub fn new(root_path: &str) -> Self {
106        let now = SystemTime::now()
107            .duration_since(SystemTime::UNIX_EPOCH)
108            .map(|d| d.as_secs())
109            .unwrap_or(0);
110
111        Self {
112            version: Self::VERSION,
113            root_path: root_path.to_owned(),
114            created_at: now,
115            updated_at: now,
116            files: HashMap::new(),
117            total_tokens: TokenCounts::default(),
118            external_deps: Vec::new(),
119        }
120    }
121
122    /// Load cache from file
123    pub fn load(cache_path: &Path) -> Result<Self, CacheError> {
124        let content = fs::read(cache_path).map_err(|e| CacheError::IoError(e.to_string()))?;
125
126        let cache: Self = bincode::deserialize(&content)
127            .map_err(|e| CacheError::DeserializeError(e.to_string()))?;
128
129        // Check version compatibility
130        if cache.version != Self::VERSION {
131            return Err(CacheError::VersionMismatch {
132                expected: Self::VERSION,
133                found: cache.version,
134            });
135        }
136
137        Ok(cache)
138    }
139
140    /// Save cache to file
141    pub fn save(&self, cache_path: &Path) -> Result<(), CacheError> {
142        // Ensure parent directory exists
143        if let Some(parent) = cache_path.parent() {
144            fs::create_dir_all(parent).map_err(|e| CacheError::IoError(e.to_string()))?;
145        }
146
147        let content =
148            bincode::serialize(self).map_err(|e| CacheError::SerializeError(e.to_string()))?;
149
150        fs::write(cache_path, content).map_err(|e| CacheError::IoError(e.to_string()))?;
151
152        Ok(())
153    }
154
155    /// Get default cache path for a repository
156    pub fn default_cache_path(repo_path: &Path) -> PathBuf {
157        repo_path.join(".infiniloom/cache/repo.cache")
158    }
159
160    /// Check if a file needs rescanning based on mtime and size
161    pub fn needs_rescan(&self, path: &str, current_mtime: u64, current_size: u64) -> bool {
162        match self.files.get(path) {
163            Some(cached) => cached.mtime != current_mtime || cached.size != current_size,
164            None => true,
165        }
166    }
167
168    /// Check if a file needs rescanning, including content hash comparison
169    /// This catches changes that don't modify mtime/size (e.g., touch followed by edit)
170    pub fn needs_rescan_with_hash(
171        &self,
172        path: &str,
173        current_mtime: u64,
174        current_size: u64,
175        current_hash: u64,
176    ) -> bool {
177        match self.files.get(path) {
178            Some(cached) => {
179                cached.mtime != current_mtime
180                    || cached.size != current_size
181                    || (cached.hash != 0 && current_hash != 0 && cached.hash != current_hash)
182            },
183            None => true,
184        }
185    }
186
187    /// Get a cached file by path
188    pub fn get(&self, path: &str) -> Option<&CachedFile> {
189        self.files.get(path)
190    }
191
192    /// Add or update a file in the cache
193    pub fn update_file(&mut self, file: CachedFile) {
194        self.files.insert(file.path.clone(), file);
195        self.updated_at = SystemTime::now()
196            .duration_since(SystemTime::UNIX_EPOCH)
197            .map(|d| d.as_secs())
198            .unwrap_or(0);
199    }
200
201    /// Remove a file from the cache
202    pub fn remove_file(&mut self, path: &str) {
203        self.files.remove(path);
204    }
205
206    /// Get files that no longer exist
207    pub fn find_deleted_files(&self, current_files: &[&str]) -> Vec<String> {
208        let current_set: std::collections::HashSet<&str> = current_files.iter().copied().collect();
209        self.files
210            .keys()
211            .filter(|p| !current_set.contains(p.as_str()))
212            .cloned()
213            .collect()
214    }
215
216    /// Recalculate total tokens
217    pub fn recalculate_totals(&mut self) {
218        self.total_tokens = self.files.values().map(|f| f.tokens).sum();
219    }
220
221    /// Get cache statistics
222    pub fn stats(&self) -> CacheStats {
223        CacheStats {
224            file_count: self.files.len(),
225            total_tokens: self.total_tokens,
226            total_bytes: self.files.values().map(|f| f.size).sum(),
227            age_seconds: SystemTime::now()
228                .duration_since(SystemTime::UNIX_EPOCH)
229                .map(|d| d.as_secs())
230                .unwrap_or(0)
231                .saturating_sub(self.updated_at),
232        }
233    }
234}
235
/// Cache statistics (snapshot produced by `RepoCache::stats`)
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Number of cached files
    pub file_count: usize,
    /// Total token counts as last computed by `recalculate_totals`
    pub total_tokens: TokenCounts,
    /// Sum of cached file sizes in bytes
    pub total_bytes: u64,
    /// Seconds elapsed since the cache was last updated
    pub age_seconds: u64,
}
244
/// Cache errors
#[derive(Debug, Error)]
pub enum CacheError {
    /// Filesystem read/write failure (message from the underlying io::Error)
    #[error("I/O error: {0}")]
    IoError(String),
    /// bincode encoding failure
    #[error("Serialization error: {0}")]
    SerializeError(String),
    /// bincode decoding failure (corrupt or truncated cache file)
    #[error("Deserialization error: {0}")]
    DeserializeError(String),
    /// Cache file was written by an incompatible cache layout version
    #[error("Cache version mismatch: expected {expected}, found {found}")]
    VersionMismatch { expected: u32, found: u32 },
}
257
/// Incremental scanner that uses caching
pub struct IncrementalScanner {
    /// In-memory cache of previously scanned files
    cache: RepoCache,
    /// Location where the cache is persisted by `save`/`force_save`
    cache_path: PathBuf,
    /// True when the in-memory cache has diverged from the file on disk
    dirty: bool,
}
264
265impl IncrementalScanner {
266    /// Create or load an incremental scanner for a repository
267    pub fn new(repo_path: &Path) -> Self {
268        let cache_path = RepoCache::default_cache_path(repo_path);
269
270        let cache = RepoCache::load(&cache_path)
271            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));
272
273        Self { cache, cache_path, dirty: false }
274    }
275
276    /// Create with custom cache path
277    pub fn with_cache_path(repo_path: &Path, cache_path: PathBuf) -> Self {
278        let cache = RepoCache::load(&cache_path)
279            .unwrap_or_else(|_| RepoCache::new(&repo_path.to_string_lossy()));
280
281        Self { cache, cache_path, dirty: false }
282    }
283
284    /// Check if a file needs to be rescanned (fast check using mtime/size only)
285    pub fn needs_rescan(&self, path: &Path) -> bool {
286        let metadata = match path.metadata() {
287            Ok(m) => m,
288            Err(_) => return true,
289        };
290
291        let mtime = metadata
292            .modified()
293            .ok()
294            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
295            .map(|d| d.as_secs())
296            .unwrap_or(0);
297
298        let relative_path = path.to_string_lossy();
299        self.cache
300            .needs_rescan(&relative_path, mtime, metadata.len())
301    }
302
303    /// Check if a file needs to be rescanned, including content hash check
304    /// This is more accurate but requires reading the file content
305    pub fn needs_rescan_with_content(&self, path: &Path, content: &[u8]) -> bool {
306        let metadata = match path.metadata() {
307            Ok(m) => m,
308            Err(_) => return true,
309        };
310
311        let mtime = metadata
312            .modified()
313            .ok()
314            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
315            .map(|d| d.as_secs())
316            .unwrap_or(0);
317
318        let content_hash = hash_content(content);
319        let relative_path = path.to_string_lossy();
320        self.cache
321            .needs_rescan_with_hash(&relative_path, mtime, metadata.len(), content_hash)
322    }
323
324    /// Get cached file if available and up-to-date
325    pub fn get_cached(&self, path: &str) -> Option<&CachedFile> {
326        self.cache.files.get(path)
327    }
328
329    /// Update cache with new file data
330    pub fn update(&mut self, file: CachedFile) {
331        self.cache.update_file(file);
332        self.dirty = true;
333    }
334
335    /// Remove a deleted file from cache
336    pub fn remove(&mut self, path: &str) {
337        self.cache.remove_file(path);
338        self.dirty = true;
339    }
340
341    /// Save cache if modified
342    pub fn save(&mut self) -> Result<(), CacheError> {
343        if self.dirty {
344            self.cache.recalculate_totals();
345            self.cache.save(&self.cache_path)?;
346            self.dirty = false;
347        }
348        Ok(())
349    }
350
351    /// Force save cache
352    pub fn force_save(&mut self) -> Result<(), CacheError> {
353        self.cache.recalculate_totals();
354        self.cache.save(&self.cache_path)?;
355        self.dirty = false;
356        Ok(())
357    }
358
359    /// Get cache statistics
360    pub fn stats(&self) -> CacheStats {
361        self.cache.stats()
362    }
363
364    /// Clear the cache
365    pub fn clear(&mut self) {
366        self.cache = RepoCache::new(&self.cache.root_path);
367        self.dirty = true;
368    }
369
370    /// Get list of changed files compared to current state
371    pub fn get_changed_files<'a>(
372        &self,
373        current_files: &'a [(PathBuf, u64, u64)],
374    ) -> Vec<&'a PathBuf> {
375        current_files
376            .iter()
377            .filter(|(path, mtime, size)| {
378                let relative = path.to_string_lossy();
379                self.cache.needs_rescan(&relative, *mtime, *size)
380            })
381            .map(|(path, _, _)| path)
382            .collect()
383    }
384}
385
/// File change event for watching
#[derive(Debug, Clone)]
pub enum FileChange {
    /// A new file appeared at the path
    Created(PathBuf),
    /// An existing file changed
    Modified(PathBuf),
    /// A file was removed
    Deleted(PathBuf),
    /// A file moved from one path to another
    Renamed { from: PathBuf, to: PathBuf },
}
394
/// File watcher using notify crate (when 'watch' feature enabled)
#[cfg(feature = "watch")]
pub mod watcher {
    use super::*;
    use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher};
    use std::sync::mpsc::{channel, Receiver};

    /// File system watcher for incremental updates
    pub struct FileWatcher {
        watcher: RecommendedWatcher,
        receiver: Receiver<Result<Event, notify::Error>>,
        root_path: PathBuf,
    }

    impl FileWatcher {
        /// Create a new recursive file watcher rooted at `path`
        ///
        /// # Errors
        /// Returns `notify::Error` if the backend cannot be initialized or the
        /// path cannot be watched.
        pub fn new(path: &Path) -> Result<Self, notify::Error> {
            let (tx, rx) = channel();

            // Events are forwarded through an mpsc channel so callers can poll
            // via `try_next` or block via `next`; a send error just means the
            // receiver side is gone, so it is deliberately ignored.
            let watcher = RecommendedWatcher::new(
                move |res| {
                    let _ = tx.send(res);
                },
                Config::default(),
            )?;

            let mut fw = Self { watcher, receiver: rx, root_path: path.to_path_buf() };

            fw.watcher.watch(path, RecursiveMode::Recursive)?;

            Ok(fw)
        }

        /// Get next file change event (non-blocking). `None` when no event is
        /// pending, the backend reported an error, or the event kind is not
        /// tracked.
        pub fn try_next(&self) -> Option<FileChange> {
            match self.receiver.try_recv() {
                Ok(Ok(event)) => Self::event_to_change(event),
                _ => None,
            }
        }

        /// Wait for next file change event (blocking). `None` when the channel
        /// is closed, the backend reported an error, or the event kind is not
        /// tracked.
        pub fn next(&self) -> Option<FileChange> {
            match self.receiver.recv() {
                Ok(Ok(event)) => Self::event_to_change(event),
                _ => None,
            }
        }

        /// Convert notify event to FileChange
        ///
        /// Fix: rename events (`Modify(Name(_))` carrying both endpoints) are
        /// now mapped to `FileChange::Renamed` instead of being collapsed into
        /// `Modified` on the old path — previously the `Renamed` variant was
        /// never produced at all.
        fn event_to_change(event: Event) -> Option<FileChange> {
            use notify::event::ModifyKind;

            match event.kind {
                EventKind::Create(_) => event.paths.first().cloned().map(FileChange::Created),
                // A rename reported with both endpoints carries [from, to].
                EventKind::Modify(ModifyKind::Name(_)) if event.paths.len() >= 2 => {
                    Some(FileChange::Renamed {
                        from: event.paths[0].clone(),
                        to: event.paths[1].clone(),
                    })
                },
                EventKind::Modify(_) => event.paths.first().cloned().map(FileChange::Modified),
                EventKind::Remove(_) => event.paths.first().cloned().map(FileChange::Deleted),
                _ => None,
            }
        }

        /// Stop watching the root path
        pub fn stop(mut self) -> Result<(), notify::Error> {
            self.watcher.unwatch(&self.root_path)
        }
    }
}
462
/// Compute a simple, non-cryptographic hash of `content` for change detection.
///
/// Uses the standard library's `DefaultHasher`, whose algorithm is not
/// guaranteed to be stable across Rust releases — persisted hashes may differ
/// after a toolchain upgrade, which at worst forces a one-time rescan.
pub fn hash_content(content: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    Hash::hash(content, &mut state);
    Hasher::finish(&state)
}
472
/// Get file modification time as a Unix timestamp (seconds).
///
/// Returns `None` when the file is inaccessible, the platform does not report
/// modification times, or the mtime predates the Unix epoch.
pub fn get_mtime(path: &Path) -> Option<u64> {
    let modified = path.metadata().ok()?.modified().ok()?;
    let since_epoch = modified.duration_since(SystemTime::UNIX_EPOCH).ok()?;
    Some(since_epoch.as_secs())
}
483
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // Round-trip: a cache saved to disk deserializes with identical contents.
    #[test]
    fn test_cache_create_save_load() {
        let temp = TempDir::new().unwrap();
        let cache_path = temp.path().join("test.cache");

        let mut cache = RepoCache::new("/test/repo");
        cache.files.insert(
            "test.py".to_string(),
            CachedFile {
                path: "test.py".to_string(),
                mtime: 12345,
                size: 100,
                hash: 0,
                tokens: TokenCounts {
                    o200k: 45,
                    cl100k: 48,
                    claude: 50,
                    gemini: 46,
                    llama: 50,
                    mistral: 50,
                    deepseek: 50,
                    qwen: 50,
                    cohere: 48,
                    grok: 50,
                },
                symbols: vec![],
                symbols_extracted: false,
                language: Some("python".to_string()),
                lines: 10,
            },
        );

        cache.save(&cache_path).unwrap();

        let loaded = RepoCache::load(&cache_path).unwrap();
        assert_eq!(loaded.files.len(), 1);
        assert!(loaded.files.contains_key("test.py"));
    }

    // mtime/size staleness rules: unknown paths always rescan; a cached entry
    // rescans only when mtime or size differs.
    #[test]
    fn test_needs_rescan() {
        let cache = RepoCache::new("/test");
        assert!(cache.needs_rescan("new_file.py", 0, 0));

        let mut cache = RepoCache::new("/test");
        cache.files.insert(
            "existing.py".to_string(),
            CachedFile {
                path: "existing.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: 0,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        assert!(!cache.needs_rescan("existing.py", 1000, 500));
        assert!(cache.needs_rescan("existing.py", 2000, 500)); // mtime changed
        assert!(cache.needs_rescan("existing.py", 1000, 600)); // size changed
    }

    // Scanner starts empty (everything needs rescan) and serves entries that
    // were pushed via update(). Note: needs_rescan here is keyed by the full
    // path string while update() keys by CachedFile::path.
    #[test]
    fn test_incremental_scanner() {
        let temp = TempDir::new().unwrap();

        let mut scanner = IncrementalScanner::new(temp.path());
        assert!(scanner.needs_rescan(&temp.path().join("test.py")));

        scanner.update(CachedFile {
            path: "test.py".to_string(),
            mtime: 1000,
            size: 100,
            hash: 0,
            tokens: TokenCounts::default(),
            symbols: vec![],
            symbols_extracted: false,
            language: Some("python".to_string()),
            lines: 5,
        });

        assert!(scanner.get_cached("test.py").is_some());
    }

    // Hash is deterministic for equal input and (with overwhelming
    // probability) differs for different input.
    #[test]
    fn test_hash_content() {
        let h1 = hash_content(b"hello world");
        let h2 = hash_content(b"hello world");
        let h3 = hash_content(b"different");

        assert_eq!(h1, h2);
        assert_ne!(h1, h3);
    }

    // Hash-aware staleness: hash differences trigger a rescan even when
    // mtime/size match, and a hash of 0 disables the comparison entirely.
    #[test]
    fn test_needs_rescan_with_hash() {
        let mut cache = RepoCache::new("/test");
        let original_hash = hash_content(b"original content");
        let modified_hash = hash_content(b"modified content");

        cache.files.insert(
            "file.py".to_string(),
            CachedFile {
                path: "file.py".to_string(),
                mtime: 1000,
                size: 500,
                hash: original_hash,
                tokens: TokenCounts::default(),
                symbols: vec![],
                symbols_extracted: false,
                language: None,
                lines: 0,
            },
        );

        // Same mtime/size/hash - no rescan needed
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, original_hash));

        // Same mtime/size but different hash - rescan needed
        assert!(cache.needs_rescan_with_hash("file.py", 1000, 500, modified_hash));

        // Different mtime - rescan needed regardless of hash
        assert!(cache.needs_rescan_with_hash("file.py", 2000, 500, original_hash));

        // Hash of 0 is ignored (backwards compatibility)
        assert!(!cache.needs_rescan_with_hash("file.py", 1000, 500, 0));
    }
}