// kardo_core/scanner/cache.rs
1//! Content-hash caching for incremental scanning.
2//!
3//! Before processing a file, check if its SHA-256 hash matches the stored hash.
4//! If unchanged, skip processing.
5
6use sha2::{Digest, Sha256};
7use std::collections::HashMap;
8use std::time::Instant;
9
/// Result of an incremental scan.
///
/// `PartialEq` is derived so results can be compared in tests and
/// change-detection logic (`Eq` is not derivable because of the `f64`).
#[derive(Debug, Clone, PartialEq)]
pub struct ScanResult {
    /// Number of files that were new or changed and therefore processed.
    pub files_scanned: usize,
    /// Number of files skipped because their content hash was unchanged.
    pub files_skipped: usize,
    /// Wall-clock duration of the batch, in milliseconds.
    pub duration_ms: u64,
    /// Aggregate score; `ContentCache::process_batch` always leaves this
    /// `None` — presumably filled in by callers (confirm against call sites).
    pub score: Option<f64>,
}
18
/// Cache of content hashes for incremental processing.
///
/// `Debug` is derived so the cache can be inspected in logs and tests.
#[derive(Debug)]
pub struct ContentCache {
    /// Maps a file's relative path to the lowercase-hex SHA-256 of its
    /// content as produced by [`ContentCache::compute_hash`].
    hashes: HashMap<String, String>,
}
23
24impl ContentCache {
25    pub fn new() -> Self {
26        Self {
27            hashes: HashMap::new(),
28        }
29    }
30
31    /// Load cache from a list of (path, hash) pairs (e.g., from DB).
32    pub fn from_entries(entries: Vec<(String, String)>) -> Self {
33        Self {
34            hashes: entries.into_iter().collect(),
35        }
36    }
37
38    /// Compute SHA-256 hash of file content.
39    pub fn compute_hash(content: &[u8]) -> String {
40        let mut hasher = Sha256::new();
41        hasher.update(content);
42        format!("{:x}", hasher.finalize())
43    }
44
45    /// Check if a file has changed since last scan.
46    /// Returns true if the file is new or changed (needs processing).
47    pub fn needs_processing(&self, relative_path: &str, content: &[u8]) -> bool {
48        let new_hash = Self::compute_hash(content);
49        match self.hashes.get(relative_path) {
50            Some(stored) => stored != &new_hash,
51            None => true, // New file
52        }
53    }
54
55    /// Update the stored hash for a file.
56    pub fn update(&mut self, relative_path: String, content: &[u8]) {
57        let hash = Self::compute_hash(content);
58        self.hashes.insert(relative_path, hash);
59    }
60
61    /// Get all current cache entries (for persisting to DB).
62    pub fn entries(&self) -> Vec<(&str, &str)> {
63        self.hashes
64            .iter()
65            .map(|(k, v)| (k.as_str(), v.as_str()))
66            .collect()
67    }
68
69    /// Process a batch of files, skipping unchanged ones.
70    ///
71    /// `files`: (relative_path, content) pairs
72    /// `processor`: function to call for each file that needs processing
73    ///
74    /// Returns ScanResult with counts and timing.
75    pub fn process_batch<F>(
76        &mut self,
77        files: &[(String, Vec<u8>)],
78        mut processor: F,
79    ) -> ScanResult
80    where
81        F: FnMut(&str, &[u8]),
82    {
83        let start = Instant::now();
84        let mut scanned = 0;
85        let mut skipped = 0;
86
87        for (path, content) in files {
88            if self.needs_processing(path, content) {
89                processor(path, content);
90                self.update(path.clone(), content);
91                scanned += 1;
92            } else {
93                skipped += 1;
94            }
95        }
96
97        ScanResult {
98            files_scanned: scanned,
99            files_skipped: skipped,
100            duration_ms: start.elapsed().as_millis() as u64,
101            score: None,
102        }
103    }
104}
105
106impl Default for ContentCache {
107    fn default() -> Self {
108        Self::new()
109    }
110}
111
#[cfg(test)]
mod tests {
    use super::*;

    /// Known-answer test against the published SHA-256 of "hello world".
    #[test]
    fn test_compute_hash() {
        let digest = ContentCache::compute_hash(b"hello world");
        assert!(!digest.is_empty());
        assert_eq!(
            digest,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    /// A path never seen before must be processed.
    #[test]
    fn test_needs_processing_new_file() {
        assert!(ContentCache::new().needs_processing("new.md", b"content"));
    }

    /// Identical content hashes to the stored value, so no work is needed.
    #[test]
    fn test_needs_processing_unchanged() {
        let mut cache = ContentCache::new();
        cache.update(String::from("file.md"), b"content");
        assert!(!cache.needs_processing("file.md", b"content"));
    }

    /// Different content must trigger reprocessing.
    #[test]
    fn test_needs_processing_changed() {
        let mut cache = ContentCache::new();
        cache.update(String::from("file.md"), b"old content");
        assert!(cache.needs_processing("file.md", b"new content"));
    }

    /// Mixed batch: the pre-cached file is skipped, the new one is processed.
    #[test]
    fn test_process_batch_skips_unchanged() {
        let mut cache = ContentCache::new();
        cache.update(String::from("unchanged.md"), b"same");

        let batch = vec![
            (String::from("unchanged.md"), b"same".to_vec()),
            (String::from("new.md"), b"new content".to_vec()),
        ];

        let mut seen = Vec::new();
        let outcome = cache.process_batch(&batch, |p, _| seen.push(p.to_string()));

        assert_eq!(outcome.files_scanned, 1);
        assert_eq!(outcome.files_skipped, 1);
        assert_eq!(seen, vec!["new.md"]);
    }

    /// An empty cache processes everything.
    #[test]
    fn test_process_batch_all_new() {
        let mut cache = ContentCache::new();
        let batch = vec![
            (String::from("a.md"), b"aaa".to_vec()),
            (String::from("b.md"), b"bbb".to_vec()),
        ];

        let outcome = cache.process_batch(&batch, |_, _| {});
        assert_eq!(outcome.files_scanned, 2);
        assert_eq!(outcome.files_skipped, 0);
    }

    /// Seeding from persisted entries behaves like a fresh `update`.
    #[test]
    fn test_from_entries() {
        let known = ContentCache::compute_hash(b"content");
        let cache = ContentCache::from_entries(vec![(String::from("file.md"), known)]);
        assert!(!cache.needs_processing("file.md", b"content"));
    }

    /// Entries written via `update` come back out via `entries`.
    #[test]
    fn test_entries_roundtrip() {
        let mut cache = ContentCache::new();
        cache.update(String::from("a.md"), b"aaa");
        cache.update(String::from("b.md"), b"bbb");
        assert_eq!(cache.entries().len(), 2);
    }
}