Skip to main content

mir_analyzer/
cache.rs

1/// Per-file analysis result cache backed by a JSON file on disk.
2///
3/// Cache key: file path.  Cache validity: BLAKE3 hash of file content.
4/// If the content hash matches what was stored, the cached issues are returned
5/// and Pass 2 analysis is skipped for that file.
6use std::collections::{HashMap, HashSet};
7use std::path::{Path, PathBuf};
8use std::sync::Mutex;
9
10use serde::{Deserialize, Serialize};
11
12use mir_issues::Issue;
13
14/// Cached analysis result returned on a cache hit: issues and reference location
15/// triples `(symbol_key, start_byte, end_byte)`.
16pub type CacheHit = (Vec<Issue>, Vec<(String, u32, u32)>);
17
18// ---------------------------------------------------------------------------
19// Hash helper
20// ---------------------------------------------------------------------------
21
22/// Compute the BLAKE3 hex digest of `content`.
23pub fn hash_content(content: &str) -> String {
24    blake3::hash(content.as_bytes()).to_hex().to_string()
25}
26
27// ---------------------------------------------------------------------------
28// CacheEntry
29// ---------------------------------------------------------------------------
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
32struct CacheEntry {
33    content_hash: String,
34    issues: Vec<Issue>,
35    /// Reference locations recorded during Pass 2: (symbol_key, start_byte, end_byte).
36    /// Stored so that cache hits can replay symbol_reference_locations without re-running
37    /// analyze_bodies.
38    #[serde(default)]
39    reference_locations: Vec<(String, u32, u32)>,
40}
41
42// ---------------------------------------------------------------------------
43// AnalysisCache
44// ---------------------------------------------------------------------------
45
46/// Serialized form of the full cache file.
47#[derive(Debug, Default, Serialize, Deserialize)]
48struct CacheFile {
49    #[serde(default)]
50    entries: HashMap<String, CacheEntry>,
51    /// Reverse dependency graph: defining_file → [files that depend on it].
52    /// Persisted so that the next run can invalidate dependents before Pass 1.
53    #[serde(default)]
54    reverse_deps: HashMap<String, HashSet<String>>,
55}
56
57/// Thread-safe, disk-backed cache for per-file analysis results.
58pub struct AnalysisCache {
59    cache_dir: PathBuf,
60    entries: Mutex<HashMap<String, CacheEntry>>,
61    /// Reverse dependency graph loaded from disk (from the previous run).
62    reverse_deps: Mutex<HashMap<String, HashSet<String>>>,
63    dirty: Mutex<bool>,
64}
65
66impl AnalysisCache {
67    /// Open or create a cache stored under `cache_dir`.
68    /// If the directory or cache file do not exist they are created lazily on
69    /// the first `flush()` call.
70    pub fn open(cache_dir: &Path) -> Self {
71        std::fs::create_dir_all(cache_dir).ok();
72        let file = Self::load(cache_dir);
73        Self {
74            cache_dir: cache_dir.to_path_buf(),
75            entries: Mutex::new(file.entries),
76            reverse_deps: Mutex::new(file.reverse_deps),
77            dirty: Mutex::new(false),
78        }
79    }
80
81    /// Open the default cache directory: `{project_root}/.mir-cache/`.
82    pub fn open_default(project_root: &Path) -> Self {
83        Self::open(&project_root.join(".mir-cache"))
84    }
85
86    /// Return cached issues and reference locations for `file_path` if its
87    /// `content_hash` matches. Returns `None` if there is no entry or the file
88    /// has changed. The second element of the tuple is the list of
89    /// `(symbol_key, start_byte, end_byte)` entries to replay into
90    /// `Codebase::symbol_reference_locations`.
91    pub fn get(&self, file_path: &str, content_hash: &str) -> Option<CacheHit> {
92        let entries = self.entries.lock().unwrap();
93        entries.get(file_path).and_then(|e| {
94            if e.content_hash == content_hash {
95                Some((e.issues.clone(), e.reference_locations.clone()))
96            } else {
97                None
98            }
99        })
100    }
101
102    /// Store `issues` and `reference_locations` for `file_path` with the given
103    /// `content_hash`. `reference_locations` is a list of
104    /// `(symbol_key, start_byte, end_byte)` recorded during Pass 2.
105    pub fn put(
106        &self,
107        file_path: &str,
108        content_hash: String,
109        issues: Vec<Issue>,
110        reference_locations: Vec<(String, u32, u32)>,
111    ) {
112        let mut entries = self.entries.lock().unwrap();
113        entries.insert(
114            file_path.to_string(),
115            CacheEntry {
116                content_hash,
117                issues,
118                reference_locations,
119            },
120        );
121        *self.dirty.lock().unwrap() = true;
122    }
123
124    /// Persist the in-memory cache to `{cache_dir}/cache.json`.
125    /// This is a no-op if nothing changed since the last flush.
126    pub fn flush(&self) {
127        let dirty = {
128            let mut d = self.dirty.lock().unwrap();
129            let was = *d;
130            *d = false;
131            was
132        };
133        if !dirty {
134            return;
135        }
136        let cache_file = self.cache_dir.join("cache.json");
137        let file = CacheFile {
138            entries: self.entries.lock().unwrap().clone(),
139            reverse_deps: self.reverse_deps.lock().unwrap().clone(),
140        };
141        if let Ok(json) = serde_json::to_string(&file) {
142            std::fs::write(cache_file, json).ok();
143        }
144    }
145
146    /// Replace the reverse dependency graph (called after each Pass 1).
147    pub fn set_reverse_deps(&self, deps: HashMap<String, HashSet<String>>) {
148        *self.reverse_deps.lock().unwrap() = deps;
149        *self.dirty.lock().unwrap() = true;
150    }
151
152    /// BFS from each changed file through the reverse dep graph.
153    /// Evicts every reachable dependent's cache entry.
154    /// Returns the number of entries evicted.
155    pub fn evict_with_dependents(&self, changed_files: &[String]) -> usize {
156        // Phase 1: collect all dependents to evict via BFS (lock held only here).
157        let to_evict: Vec<String> = {
158            let deps = self.reverse_deps.lock().unwrap();
159            let mut visited: HashSet<String> = changed_files.iter().cloned().collect();
160            let mut queue: std::collections::VecDeque<String> =
161                changed_files.iter().cloned().collect();
162            let mut result = Vec::new();
163
164            while let Some(file) = queue.pop_front() {
165                if let Some(dependents) = deps.get(&file) {
166                    for dep in dependents {
167                        if visited.insert(dep.clone()) {
168                            queue.push_back(dep.clone());
169                            result.push(dep.clone());
170                        }
171                    }
172                }
173            }
174            result
175        };
176
177        // Phase 2: evict (reverse_deps lock released above, entries lock taken per file).
178        let count = to_evict.len();
179        for file in &to_evict {
180            self.evict(file);
181        }
182        count
183    }
184
185    /// Remove a single file's cache entry.
186    pub fn evict(&self, file_path: &str) {
187        let mut entries = self.entries.lock().unwrap();
188        entries.remove(file_path);
189        *self.dirty.lock().unwrap() = true;
190    }
191
192    // -----------------------------------------------------------------------
193
194    fn load(cache_dir: &Path) -> CacheFile {
195        let cache_file = cache_dir.join("cache.json");
196        let Ok(bytes) = std::fs::read(&cache_file) else {
197            return CacheFile::default();
198        };
199        serde_json::from_slice(&bytes).unwrap_or_default()
200    }
201}
202
203// ---------------------------------------------------------------------------
204// Tests
205// ---------------------------------------------------------------------------
206
207#[cfg(test)]
208mod tests {
209    use super::*;
210    use tempfile::TempDir;
211
212    fn make_cache(dir: &TempDir) -> AnalysisCache {
213        AnalysisCache::open(dir.path())
214    }
215
216    fn seed(cache: &AnalysisCache, file: &str) {
217        cache.put(file, "hash".to_string(), vec![], vec![]);
218    }
219
220    #[test]
221    fn evict_with_dependents_linear_chain() {
222        // reverse_deps: A → [B], B → [C]
223        // Changing A must evict B and C.
224        let dir = TempDir::new().unwrap();
225        let cache = make_cache(&dir);
226        seed(&cache, "A");
227        seed(&cache, "B");
228        seed(&cache, "C");
229
230        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
231        deps.entry("A".into()).or_default().insert("B".into());
232        deps.entry("B".into()).or_default().insert("C".into());
233        cache.set_reverse_deps(deps);
234
235        let evicted = cache.evict_with_dependents(&["A".to_string()]);
236
237        assert_eq!(evicted, 2, "B and C should be evicted");
238        assert!(cache.get("A", "hash").is_some(), "A itself is not evicted");
239        assert!(cache.get("B", "hash").is_none(), "B should be evicted");
240        assert!(cache.get("C", "hash").is_none(), "C should be evicted");
241    }
242
243    #[test]
244    fn evict_with_dependents_diamond() {
245        // reverse_deps: A → [B, C], B → [D], C → [D]
246        // D should be evicted exactly once (visited set prevents double-eviction).
247        let dir = TempDir::new().unwrap();
248        let cache = make_cache(&dir);
249        seed(&cache, "A");
250        seed(&cache, "B");
251        seed(&cache, "C");
252        seed(&cache, "D");
253
254        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
255        deps.entry("A".into()).or_default().insert("B".into());
256        deps.entry("A".into()).or_default().insert("C".into());
257        deps.entry("B".into()).or_default().insert("D".into());
258        deps.entry("C".into()).or_default().insert("D".into());
259        cache.set_reverse_deps(deps);
260
261        let evicted = cache.evict_with_dependents(&["A".to_string()]);
262
263        assert_eq!(evicted, 3, "B, C, D each evicted once");
264        assert!(cache.get("D", "hash").is_none());
265    }
266
267    #[test]
268    fn evict_with_dependents_cycle_safety() {
269        // reverse_deps: A → [B], B → [A]  (circular)
270        // Must not loop forever; B should be evicted.
271        let dir = TempDir::new().unwrap();
272        let cache = make_cache(&dir);
273        seed(&cache, "A");
274        seed(&cache, "B");
275
276        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
277        deps.entry("A".into()).or_default().insert("B".into());
278        deps.entry("B".into()).or_default().insert("A".into());
279        cache.set_reverse_deps(deps);
280
281        let evicted = cache.evict_with_dependents(&["A".to_string()]);
282
283        // B is a dependent of A; A is the seed (not counted as "evicted dependent")
284        assert_eq!(evicted, 1);
285        assert!(cache.get("B", "hash").is_none());
286    }
287
288    #[test]
289    fn evict_with_dependents_unrelated_file_untouched() {
290        // Changing C should not evict B (which depends on A, not C).
291        let dir = TempDir::new().unwrap();
292        let cache = make_cache(&dir);
293        seed(&cache, "A");
294        seed(&cache, "B");
295        seed(&cache, "C");
296
297        let mut deps: HashMap<String, HashSet<String>> = HashMap::new();
298        deps.entry("A".into()).or_default().insert("B".into());
299        cache.set_reverse_deps(deps);
300
301        let evicted = cache.evict_with_dependents(&["C".to_string()]);
302
303        assert_eq!(evicted, 0);
304        assert!(
305            cache.get("B", "hash").is_some(),
306            "B unrelated, should survive"
307        );
308    }
309
310    #[test]
311    fn old_cache_without_reference_locations_deserializes_to_empty() {
312        // Cache entries written before the reference_locations field was added
313        // must still be readable. The #[serde(default)] attribute covers this,
314        // but we verify it explicitly so a future refactor can't silently break it.
315        let dir = TempDir::new().unwrap();
316        let cache_file = dir.path().join("cache.json");
317
318        // Write a cache file in the old format (no reference_locations field).
319        std::fs::write(
320            &cache_file,
321            r#"{"entries":{"a.php":{"content_hash":"abc","issues":[]}},"reverse_deps":{}}"#,
322        )
323        .unwrap();
324
325        let cache = AnalysisCache::open(dir.path());
326        let hit = cache
327            .get("a.php", "abc")
328            .expect("old cache entry should deserialize successfully");
329
330        assert!(hit.0.is_empty(), "no issues");
331        assert!(
332            hit.1.is_empty(),
333            "reference_locations should default to empty vec, not fail"
334        );
335    }
336}