Skip to main content

mir_analyzer/
cache.rs

1/// Per-file analysis result cache backed by a JSON file on disk.
2///
3/// Cache key: file path.  Cache validity: SHA-256 hash of file content.
4/// If the content hash matches what was stored, the cached issues are returned
5/// and Pass 2 analysis is skipped for that file.
6use std::collections::{HashMap, HashSet};
7use std::path::{Path, PathBuf};
8use std::sync::Mutex;
9
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12
13use mir_issues::Issue;
14
15/// Cached analysis result returned on a cache hit: issues and reference location
16/// triples `(symbol_key, start_byte, end_byte)`.
17pub type CacheHit = (Vec<Issue>, Vec<(String, u32, u32)>);
18
19// ---------------------------------------------------------------------------
20// Hash helper
21// ---------------------------------------------------------------------------
22
23/// Compute the SHA-256 hex digest of `content`.
24pub fn hash_content(content: &str) -> String {
25    let mut h = Sha256::new();
26    h.update(content.as_bytes());
27    h.finalize().iter().fold(String::new(), |mut acc, b| {
28        use std::fmt::Write;
29        write!(acc, "{:02x}", b).unwrap();
30        acc
31    })
32}
33
34// ---------------------------------------------------------------------------
35// CacheEntry
36// ---------------------------------------------------------------------------
37
38#[derive(Debug, Clone, Serialize, Deserialize)]
39struct CacheEntry {
40    content_hash: String,
41    issues: Vec<Issue>,
42    /// Reference locations recorded during Pass 2: (symbol_key, start_byte, end_byte).
43    /// Stored so that cache hits can replay symbol_reference_locations without re-running
44    /// analyze_bodies.
45    #[serde(default)]
46    reference_locations: Vec<(String, u32, u32)>,
47}
48
49// ---------------------------------------------------------------------------
50// AnalysisCache
51// ---------------------------------------------------------------------------
52
53/// Serialized form of the full cache file.
54#[derive(Debug, Default, Serialize, Deserialize)]
55struct CacheFile {
56    #[serde(default)]
57    entries: HashMap<String, CacheEntry>,
58    /// Reverse dependency graph: defining_file → [files that depend on it].
59    /// Persisted so that the next run can invalidate dependents before Pass 1.
60    #[serde(default)]
61    reverse_deps: HashMap<String, HashSet<String>>,
62}
63
64/// Thread-safe, disk-backed cache for per-file analysis results.
65pub struct AnalysisCache {
66    cache_dir: PathBuf,
67    entries: Mutex<HashMap<String, CacheEntry>>,
68    /// Reverse dependency graph loaded from disk (from the previous run).
69    reverse_deps: Mutex<HashMap<String, HashSet<String>>>,
70    dirty: Mutex<bool>,
71}
72
73impl AnalysisCache {
74    /// Open or create a cache stored under `cache_dir`.
75    /// If the directory or cache file do not exist they are created lazily on
76    /// the first `flush()` call.
77    pub fn open(cache_dir: &Path) -> Self {
78        std::fs::create_dir_all(cache_dir).ok();
79        let file = Self::load(cache_dir);
80        Self {
81            cache_dir: cache_dir.to_path_buf(),
82            entries: Mutex::new(file.entries),
83            reverse_deps: Mutex::new(file.reverse_deps),
84            dirty: Mutex::new(false),
85        }
86    }
87
88    /// Open the default cache directory: `{project_root}/.mir-cache/`.
89    pub fn open_default(project_root: &Path) -> Self {
90        Self::open(&project_root.join(".mir-cache"))
91    }
92
93    /// Return cached issues and reference locations for `file_path` if its
94    /// `content_hash` matches. Returns `None` if there is no entry or the file
95    /// has changed. The second element of the tuple is the list of
96    /// `(symbol_key, start_byte, end_byte)` entries to replay into
97    /// `Codebase::symbol_reference_locations`.
98    pub fn get(&self, file_path: &str, content_hash: &str) -> Option<CacheHit> {
99        let entries = self.entries.lock().unwrap();
100        entries.get(file_path).and_then(|e| {
101            if e.content_hash == content_hash {
102                Some((e.issues.clone(), e.reference_locations.clone()))
103            } else {
104                None
105            }
106        })
107    }
108
109    /// Store `issues` and `reference_locations` for `file_path` with the given
110    /// `content_hash`. `reference_locations` is a list of
111    /// `(symbol_key, start_byte, end_byte)` recorded during Pass 2.
112    pub fn put(
113        &self,
114        file_path: &str,
115        content_hash: String,
116        issues: Vec<Issue>,
117        reference_locations: Vec<(String, u32, u32)>,
118    ) {
119        let mut entries = self.entries.lock().unwrap();
120        entries.insert(
121            file_path.to_string(),
122            CacheEntry {
123                content_hash,
124                issues,
125                reference_locations,
126            },
127        );
128        *self.dirty.lock().unwrap() = true;
129    }
130
131    /// Persist the in-memory cache to `{cache_dir}/cache.json`.
132    /// This is a no-op if nothing changed since the last flush.
133    pub fn flush(&self) {
134        let dirty = {
135            let mut d = self.dirty.lock().unwrap();
136            let was = *d;
137            *d = false;
138            was
139        };
140        if !dirty {
141            return;
142        }
143        let cache_file = self.cache_dir.join("cache.json");
144        let file = CacheFile {
145            entries: self.entries.lock().unwrap().clone(),
146            reverse_deps: self.reverse_deps.lock().unwrap().clone(),
147        };
148        if let Ok(json) = serde_json::to_string(&file) {
149            std::fs::write(cache_file, json).ok();
150        }
151    }
152
153    /// Replace the reverse dependency graph (called after each Pass 1).
154    pub fn set_reverse_deps(&self, deps: HashMap<String, HashSet<String>>) {
155        *self.reverse_deps.lock().unwrap() = deps;
156        *self.dirty.lock().unwrap() = true;
157    }
158
159    /// BFS from each changed file through the reverse dep graph.
160    /// Evicts every reachable dependent's cache entry.
161    /// Returns the number of entries evicted.
162    pub fn evict_with_dependents(&self, changed_files: &[String]) -> usize {
163        // Phase 1: collect all dependents to evict via BFS (lock held only here).
164        let to_evict: Vec<String> = {
165            let deps = self.reverse_deps.lock().unwrap();
166            let mut visited: HashSet<String> = changed_files.iter().cloned().collect();
167            let mut queue: std::collections::VecDeque<String> =
168                changed_files.iter().cloned().collect();
169            let mut result = Vec::new();
170
171            while let Some(file) = queue.pop_front() {
172                if let Some(dependents) = deps.get(&file) {
173                    for dep in dependents {
174                        if visited.insert(dep.clone()) {
175                            queue.push_back(dep.clone());
176                            result.push(dep.clone());
177                        }
178                    }
179                }
180            }
181            result
182        };
183
184        // Phase 2: evict (reverse_deps lock released above, entries lock taken per file).
185        let count = to_evict.len();
186        for file in &to_evict {
187            self.evict(file);
188        }
189        count
190    }
191
192    /// Remove a single file's cache entry.
193    pub fn evict(&self, file_path: &str) {
194        let mut entries = self.entries.lock().unwrap();
195        entries.remove(file_path);
196        *self.dirty.lock().unwrap() = true;
197    }
198
199    // -----------------------------------------------------------------------
200
201    fn load(cache_dir: &Path) -> CacheFile {
202        let cache_file = cache_dir.join("cache.json");
203        let Ok(bytes) = std::fs::read(&cache_file) else {
204            return CacheFile::default();
205        };
206        serde_json::from_slice(&bytes).unwrap_or_default()
207    }
208}
209
210// ---------------------------------------------------------------------------
211// Tests
212// ---------------------------------------------------------------------------
213
214#[cfg(test)]
215mod tests {
216    use super::*;
217    use tempfile::TempDir;
218
219    fn make_cache(dir: &TempDir) -> AnalysisCache {
220        AnalysisCache::open(dir.path())
221    }
222
223    fn seed(cache: &AnalysisCache, file: &str) {
224        cache.put(file, "hash".to_string(), vec![], vec![]);
225    }
226
227    #[test]
228    fn evict_with_dependents_linear_chain() {
229        // reverse_deps: A → [B], B → [C]
230        // Changing A must evict B and C.
231        let dir = TempDir::new().unwrap();
232        let cache = make_cache(&dir);
233        seed(&cache, "A");
234        seed(&cache, "B");
235        seed(&cache, "C");
236
237        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
238        deps.entry("A".into()).or_default().insert("B".into());
239        deps.entry("B".into()).or_default().insert("C".into());
240        cache.set_reverse_deps(deps);
241
242        let evicted = cache.evict_with_dependents(&["A".to_string()]);
243
244        assert_eq!(evicted, 2, "B and C should be evicted");
245        assert!(cache.get("A", "hash").is_some(), "A itself is not evicted");
246        assert!(cache.get("B", "hash").is_none(), "B should be evicted");
247        assert!(cache.get("C", "hash").is_none(), "C should be evicted");
248    }
249
250    #[test]
251    fn evict_with_dependents_diamond() {
252        // reverse_deps: A → [B, C], B → [D], C → [D]
253        // D should be evicted exactly once (visited set prevents double-eviction).
254        let dir = TempDir::new().unwrap();
255        let cache = make_cache(&dir);
256        seed(&cache, "A");
257        seed(&cache, "B");
258        seed(&cache, "C");
259        seed(&cache, "D");
260
261        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
262        deps.entry("A".into()).or_default().insert("B".into());
263        deps.entry("A".into()).or_default().insert("C".into());
264        deps.entry("B".into()).or_default().insert("D".into());
265        deps.entry("C".into()).or_default().insert("D".into());
266        cache.set_reverse_deps(deps);
267
268        let evicted = cache.evict_with_dependents(&["A".to_string()]);
269
270        assert_eq!(evicted, 3, "B, C, D each evicted once");
271        assert!(cache.get("D", "hash").is_none());
272    }
273
274    #[test]
275    fn evict_with_dependents_cycle_safety() {
276        // reverse_deps: A → [B], B → [A]  (circular)
277        // Must not loop forever; B should be evicted.
278        let dir = TempDir::new().unwrap();
279        let cache = make_cache(&dir);
280        seed(&cache, "A");
281        seed(&cache, "B");
282
283        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
284        deps.entry("A".into()).or_default().insert("B".into());
285        deps.entry("B".into()).or_default().insert("A".into());
286        cache.set_reverse_deps(deps);
287
288        let evicted = cache.evict_with_dependents(&["A".to_string()]);
289
290        // B is a dependent of A; A is the seed (not counted as "evicted dependent")
291        assert_eq!(evicted, 1);
292        assert!(cache.get("B", "hash").is_none());
293    }
294
295    #[test]
296    fn evict_with_dependents_unrelated_file_untouched() {
297        // Changing C should not evict B (which depends on A, not C).
298        let dir = TempDir::new().unwrap();
299        let cache = make_cache(&dir);
300        seed(&cache, "A");
301        seed(&cache, "B");
302        seed(&cache, "C");
303
304        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
305        deps.entry("A".into()).or_default().insert("B".into());
306        cache.set_reverse_deps(deps);
307
308        let evicted = cache.evict_with_dependents(&["C".to_string()]);
309
310        assert_eq!(evicted, 0);
311        assert!(
312            cache.get("B", "hash").is_some(),
313            "B unrelated, should survive"
314        );
315    }
316
317    #[test]
318    fn old_cache_without_reference_locations_deserializes_to_empty() {
319        // Cache entries written before the reference_locations field was added
320        // must still be readable. The #[serde(default)] attribute covers this,
321        // but we verify it explicitly so a future refactor can't silently break it.
322        let dir = TempDir::new().unwrap();
323        let cache_file = dir.path().join("cache.json");
324
325        // Write a cache file in the old format (no reference_locations field).
326        std::fs::write(
327            &cache_file,
328            r#"{"entries":{"a.php":{"content_hash":"abc","issues":[]}},"reverse_deps":{}}"#,
329        )
330        .unwrap();
331
332        let cache = AnalysisCache::open(dir.path());
333        let hit = cache
334            .get("a.php", "abc")
335            .expect("old cache entry should deserialize successfully");
336
337        assert!(hit.0.is_empty(), "no issues");
338        assert!(
339            hit.1.is_empty(),
340            "reference_locations should default to empty vec, not fail"
341        );
342    }
343}