Skip to main content

cha_core/
cache.rs

1use crate::{Finding, SourceModel, SymbolIndex};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4use std::path::{Path, PathBuf};
5
6/// Per-file cache metadata.
7#[derive(Debug, Serialize, Deserialize)]
8struct FileEntry {
9    mtime_secs: u64,
10    size: u64,
11    content_hash: u64,
12    /// Cached import sources for fast unstable_dependency analysis.
13    #[serde(default)]
14    imports: Vec<String>,
15}
16
17/// Per-file findings cache entry.
18#[derive(Debug, Serialize, Deserialize)]
19struct FindingsEntry {
20    content_hash: u64,
21    findings: Vec<Finding>,
22}
23
24/// On-disk cache metadata.
25#[derive(Debug, Serialize, Deserialize, Default)]
26struct CacheMeta {
27    env_hash: u64,
28    files: HashMap<String, FileEntry>,
29}
30
31/// Unified project cache: parse results + findings + symbol summaries.
32/// Facade over three cache layers (models, symbols, findings) plus the
33/// meta bookkeeping they share. Method count ≥ 10 is structural, not a
34/// smell — each layer exposes get/put atop the layered read/write helpers.
35// cha:ignore large_class
36pub struct ProjectCache {
37    root: PathBuf,
38    meta: CacheMeta,
39    dirty: bool,
40    /// L1 in-memory parse cache (avoids repeated disk reads within same process).
41    mem_models: HashMap<u64, SourceModel>,
42    /// L1 symbol-level summaries. Small enough to keep alongside models; lets
43    /// `cached_symbols` skip the `SourceModel` deserialisation entirely.
44    mem_symbols: HashMap<u64, SymbolIndex>,
45}
46
/// Feeds the content of every `.cha.toml` found under `dir` into `h`.
///
/// The config in `dir` itself (if readable) is hashed first, then each
/// subdirectory is visited recursively. Directories whose name starts with
/// `.` and the build/vendor directories `target`, `node_modules` and `dist`
/// are skipped. Unreadable directories and files are silently ignored.
///
/// Subdirectories are visited in sorted path order. `std::fs::read_dir`
/// makes no ordering guarantee, and because the hasher is fed sequentially
/// a different enumeration order would yield a different hash for the very
/// same set of config files — which would needlessly invalidate the cache.
fn hash_all_configs(dir: &Path, h: &mut impl std::hash::Hasher) {
    use std::hash::Hash;
    if let Ok(content) = std::fs::read_to_string(dir.join(".cha.toml")) {
        content.hash(h);
    }
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    let mut subdirs: Vec<PathBuf> = entries
        .flatten()
        .filter(|entry| {
            let name = entry.file_name();
            let name = name.to_string_lossy();
            !name.starts_with('.')
                && !matches!(name.as_ref(), "target" | "node_modules" | "dist")
        })
        .map(|entry| entry.path())
        .filter(|path| path.is_dir())
        .collect();
    // Fix the traversal order so the resulting hash is reproducible.
    subdirs.sort_unstable();
    for sub in &subdirs {
        hash_all_configs(sub, h);
    }
}
67
/// Location of the cache directory for the project rooted at `root`.
fn cache_dir(root: &Path) -> PathBuf {
    let mut dir = root.to_path_buf();
    dir.push(".cha/cache");
    dir
}
71
/// Hashes `content` with the standard library's `DefaultHasher` in its
/// default state and returns the 64-bit digest.
fn content_hash(content: &str) -> u64 {
    use std::hash::{BuildHasher, BuildHasherDefault};
    type Sip = std::collections::hash_map::DefaultHasher;
    BuildHasherDefault::<Sip>::default().hash_one(content)
}
78
79/// Layered L1-memory + L2-disk cache read. Templated over any
80/// bincode-serialisable value; SourceModel and SymbolIndex share this.
81fn get_layered<T: serde::de::DeserializeOwned + Clone>(
82    mem: &mut HashMap<u64, T>,
83    root: &Path,
84    subdir: &str,
85    chash: u64,
86) -> Option<T> {
87    if let Some(v) = mem.get(&chash) {
88        return Some(v.clone());
89    }
90    let path = cache_dir(root)
91        .join(subdir)
92        .join(format!("{chash:016x}.bin"));
93    let bytes = std::fs::read(&path).ok()?;
94    let val: T = bincode::deserialize(&bytes).ok()?;
95    mem.insert(chash, val.clone());
96    Some(val)
97}
98
99/// Layered L1 + L2 write. Silently ignores serialisation or filesystem
100/// errors — cache is a speed-up, not a correctness layer.
101fn put_layered<T: serde::Serialize + Clone>(
102    mem: &mut HashMap<u64, T>,
103    root: &Path,
104    subdir: &str,
105    chash: u64,
106    value: &T,
107) {
108    mem.insert(chash, value.clone());
109    let dir = cache_dir(root).join(subdir);
110    let _ = std::fs::create_dir_all(&dir);
111    if let Ok(bytes) = bincode::serialize(value) {
112        let _ = std::fs::write(dir.join(format!("{chash:016x}.bin")), bytes);
113    }
114}
115
/// Returns `(mtime, size)` for `path`: the modification time in whole
/// seconds since the Unix epoch and the length in bytes.
///
/// Yields `None` when the metadata cannot be read, the platform reports no
/// modification time, or that time lies before the Unix epoch.
fn file_mtime_and_size(path: &Path) -> Option<(u64, u64)> {
    let Ok(meta) = std::fs::metadata(path) else {
        return None;
    };
    let modified = meta.modified().ok()?;
    let since_epoch = modified.duration_since(std::time::UNIX_EPOCH).ok()?;
    Some((since_epoch.as_secs(), meta.len()))
}
126
127impl ProjectCache {
128    /// Open or create a cache for the given project root.
129    pub fn open(project_root: &Path, env_hash: u64) -> Self {
130        let dir = cache_dir(project_root);
131        let meta_path = dir.join("meta.bin");
132        let meta = std::fs::read(&meta_path)
133            .ok()
134            .and_then(|b| bincode::deserialize::<CacheMeta>(&b).ok())
135            .unwrap_or_default();
136        let meta = if meta.env_hash != env_hash {
137            // Environment changed — full invalidation
138            let _ = std::fs::remove_dir_all(&dir);
139            CacheMeta {
140                env_hash,
141                ..Default::default()
142            }
143        } else {
144            meta
145        };
146        Self {
147            root: project_root.to_path_buf(),
148            meta,
149            dirty: false,
150            mem_models: HashMap::new(),
151            mem_symbols: HashMap::new(),
152        }
153    }
154
155    /// Check if a file is unchanged (mtime + size match).
156    /// Returns (is_unchanged, content_hash) — hash is 0 if unchanged and not yet computed.
157    pub fn check_file(&self, rel_path: &str, path: &Path) -> FileStatus {
158        let Some(entry) = self.meta.files.get(rel_path) else {
159            return FileStatus::Changed;
160        };
161        if let Some((mtime, size)) = file_mtime_and_size(path)
162            && mtime == entry.mtime_secs
163            && size == entry.size
164        {
165            return FileStatus::Unchanged(entry.content_hash);
166        }
167        FileStatus::Changed
168    }
169
170    /// Get cached SourceModel: L1 memory → L2 disk.
171    pub fn get_model(&mut self, chash: u64) -> Option<SourceModel> {
172        get_layered(&mut self.mem_models, &self.root, "parse", chash)
173    }
174
175    /// Store a SourceModel in L1 + L2.
176    pub fn put_model(&mut self, chash: u64, model: &SourceModel) {
177        put_layered(&mut self.mem_models, &self.root, "parse", chash, model);
178    }
179
180    /// Get a cached `SymbolIndex`: L1 memory → L2 disk.
181    /// Fast path for `cha deps`, LSP workspace-symbols and any other
182    /// consumer that only needs structural information (names, relations,
183    /// positions) without per-function-body analyse data.
184    pub fn get_symbols(&mut self, chash: u64) -> Option<SymbolIndex> {
185        get_layered(&mut self.mem_symbols, &self.root, "symbols", chash)
186    }
187
188    /// Store a `SymbolIndex` in L1 + L2. Called alongside `put_model` so
189    /// both caches stay in lockstep.
190    pub fn put_symbols(&mut self, chash: u64, idx: &SymbolIndex) {
191        put_layered(&mut self.mem_symbols, &self.root, "symbols", chash, idx);
192    }
193
194    /// Get cached findings for a file.
195    pub fn get_findings(&self, chash: u64) -> Option<Vec<Finding>> {
196        let path = cache_dir(&self.root)
197            .join("findings")
198            .join(format!("{chash:016x}.bin"));
199        let bytes = std::fs::read(&path).ok()?;
200        let entry: FindingsEntry = bincode::deserialize(&bytes).ok()?;
201        (entry.content_hash == chash).then_some(entry.findings)
202    }
203
204    /// Store findings for a file.
205    pub fn put_findings(&mut self, chash: u64, findings: &[Finding]) {
206        let dir = cache_dir(&self.root).join("findings");
207        let _ = std::fs::create_dir_all(&dir);
208        let entry = FindingsEntry {
209            content_hash: chash,
210            findings: findings.to_vec(),
211        };
212        if let Ok(bytes) = bincode::serialize(&entry) {
213            let _ = std::fs::write(dir.join(format!("{chash:016x}.bin")), bytes);
214        }
215    }
216
217    /// Update file metadata after processing.
218    pub fn update_file_entry(
219        &mut self,
220        rel_path: String,
221        path: &Path,
222        chash: u64,
223        imports: Vec<String>,
224    ) {
225        let (mtime_secs, size) = file_mtime_and_size(path).unwrap_or((0, 0));
226        self.meta.files.insert(
227            rel_path,
228            FileEntry {
229                mtime_secs,
230                size,
231                content_hash: chash,
232                imports,
233            },
234        );
235        self.dirty = true;
236    }
237
238    /// Get cached imports for a file (from meta, no disk I/O).
239    pub fn get_imports(&self, rel_path: &str) -> Option<&[String]> {
240        self.meta.files.get(rel_path).map(|e| e.imports.as_slice())
241    }
242
243    /// Flush metadata to disk and clean up orphan cache files.
244    pub fn flush(&self) {
245        if !self.dirty {
246            return;
247        }
248        let dir = cache_dir(&self.root);
249        let _ = std::fs::create_dir_all(&dir);
250        if let Ok(bytes) = bincode::serialize(&self.meta) {
251            let _ = std::fs::write(dir.join("meta.bin"), bytes);
252        }
253        self.gc();
254    }
255
256    /// Remove orphan cache files not referenced by meta.
257    fn gc(&self) {
258        let hashes: std::collections::HashSet<String> = self
259            .meta
260            .files
261            .values()
262            .map(|e| format!("{:016x}.bin", e.content_hash))
263            .collect();
264        for subdir in &["parse", "findings", "symbols"] {
265            let dir = cache_dir(&self.root).join(subdir);
266            let Ok(entries) = std::fs::read_dir(&dir) else {
267                continue;
268            };
269            for entry in entries.flatten() {
270                let name = entry.file_name().to_string_lossy().to_string();
271                if name.ends_with(".bin") && !hashes.contains(&name) {
272                    let _ = std::fs::remove_file(entry.path());
273                }
274            }
275        }
276        // Remove legacy analysis.json
277        let legacy = cache_dir(&self.root).join("analysis.json");
278        let _ = std::fs::remove_file(legacy);
279    }
280}
281
/// Result of checking a file against cache.
///
/// Small and plain data, so it derives the usual value-type traits: it can
/// be printed for diagnostics, copied freely and compared in assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// File unchanged — content hash from cache.
    Unchanged(u64),
    /// File changed or not in cache.
    Changed,
}
289
290/// Compute a content hash.
291pub fn hash_content(s: &str) -> u64 {
292    content_hash(s)
293}
294
295/// Compute environment hash from config + plugins + cha binary fingerprint.
296///
297/// The binary fingerprint covers both cases that make cached SourceModels
298/// stale:
299/// - developer rebuilds cha after editing parser code,
300/// - end user upgrades to a new cha release.
301///
302/// Both produce a different on-disk binary, so the binary's modification
303/// time is sufficient. `CARGO_PKG_VERSION` was the old key, but it was
304/// a strict subset of this: every release-version bump necessarily writes
305/// a new binary (new mtime), and no parser change ever happens without a
306/// rebuild (new mtime). Version-only tracking missed parser-behaviour
307/// changes that shipped without a `cargo xtask bump` — this is what let
308/// the header-declaration parser fix silently fail against stale caches.
309pub fn env_hash(project_root: &Path, plugin_dirs: &[PathBuf]) -> u64 {
310    use std::hash::{Hash, Hasher};
311    let mut h = std::collections::hash_map::DefaultHasher::new();
312    hash_cha_binary(&mut h);
313    hash_all_configs(project_root, &mut h);
314    for dir in plugin_dirs {
315        if let Ok(entries) = std::fs::read_dir(dir) {
316            for entry in entries.flatten() {
317                if let Ok(mtime) = entry.metadata().and_then(|m| m.modified()) {
318                    mtime.hash(&mut h);
319                }
320                entry.file_name().hash(&mut h);
321            }
322        }
323    }
324    h.finish()
325}
326
327/// Hash the cha binary's identity. Uses the running executable's mtime;
328/// falls back to `CARGO_PKG_VERSION` if the executable path isn't
329/// discoverable (unusual — sandboxed runners, embedded contexts). Either
330/// path invalidates the cache on every new binary.
331fn hash_cha_binary(h: &mut impl std::hash::Hasher) {
332    use std::hash::Hash;
333    match std::env::current_exe().and_then(|p| p.metadata()?.modified()) {
334        Ok(mtime) => mtime.hash(h),
335        Err(_) => env!("CARGO_PKG_VERSION").hash(h),
336    }
337}
338
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{SourceModel, TypeRef};
    use std::path::{Path, PathBuf};

    /// Scratch directory for one test. The directory (and everything the
    /// cache wrote below it) is removed again when the guard is dropped, so
    /// test runs do not pile up directories under the system temp dir.
    struct TmpDir(PathBuf);

    impl TmpDir {
        /// Creates a fresh, uniquely named directory under the system
        /// temp dir.
        fn new() -> Self {
            use std::sync::atomic::{AtomicU64, Ordering};
            // Per-thread nanos can collide under parallel test execution —
            // fold in a monotonic counter so each call is unique.
            static SEQ: AtomicU64 = AtomicU64::new(0);
            let base = std::env::temp_dir().join(format!(
                "cha-cache-test-{}-{}-{}",
                std::process::id(),
                std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)
                    .map(|d| d.as_nanos())
                    .unwrap_or(0),
                SEQ.fetch_add(1, Ordering::Relaxed),
            ));
            std::fs::create_dir_all(&base).unwrap();
            Self(base)
        }

        /// The directory's path.
        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TmpDir {
        fn drop(&mut self) {
            // Best effort: a failed clean-up must not fail the test.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    fn sample_model() -> SourceModel {
        SourceModel {
            language: "c".into(),
            total_lines: 10,
            functions: vec![],
            classes: vec![],
            imports: vec![],
            comments: vec![],
            type_aliases: vec![
                ("MyId".into(), "uint32_t".into()),
                ("Handle".into(), "void*".into()),
            ],
        }
    }

    /// Regression: `boundary_leak` used to parse files fresh because the
    /// cache appeared to drop typedef aliases on some C projects. After
    /// v1.11.0 tied the cache key to the binary's mtime, put/get should
    /// round-trip SourceModel faithfully — including `type_aliases`.
    #[test]
    fn cache_roundtrip_preserves_type_aliases() {
        let tmp = TmpDir::new();
        let mut cache = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let model = sample_model();
        let chash: u64 = 0xdead_beef_1234_5678;
        // Register a file entry so flush()->gc() keeps the parse blob.
        cache.update_file_entry("x.c".into(), &tmp.path().join("nope"), chash, vec![]);
        cache.put_model(chash, &model);
        let got = cache.get_model(chash).expect("cached model present");
        assert_eq!(got.type_aliases, model.type_aliases);
        // Persist meta so reopening with the same env_hash doesn't
        // trigger the full-invalidation branch.
        cache.flush();
        drop(cache);
        let mut fresh = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let from_disk = fresh.get_model(chash).expect("on-disk model present");
        assert_eq!(from_disk.type_aliases, model.type_aliases);
    }

    /// TypeRef origin information in parameter / return types also has to
    /// survive serialisation; boundary_leak's "interesting" check keys on
    /// `TypeOrigin::External`.
    #[test]
    fn cache_roundtrip_preserves_typeref_origin() {
        use crate::{FunctionInfo, TypeOrigin};
        let tmp = TmpDir::new();
        let mut cache = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let model = SourceModel {
            language: "rust".into(),
            total_lines: 5,
            functions: vec![FunctionInfo {
                name: "f".into(),
                parameter_types: vec![TypeRef {
                    name: "ExtThing".into(),
                    raw: "ext::ExtThing".into(),
                    origin: TypeOrigin::External("ext".into()),
                }],
                ..Default::default()
            }],
            classes: vec![],
            imports: vec![],
            comments: vec![],
            type_aliases: vec![],
        };
        cache.put_model(99, &model);
        let got = cache.get_model(99).unwrap();
        let p = &got.functions[0].parameter_types[0];
        assert_eq!(p.name, "ExtThing");
        assert!(matches!(&p.origin, TypeOrigin::External(m) if m == "ext"));
    }

    /// `SymbolIndex` cache writes land in `symbols/{chash}.bin` independent
    /// of `parse/{chash}.bin`. Reopening the cache must recover them byte-
    /// for-byte — this is the invariant that lets `cached_symbols` skip
    /// `SourceModel` deserialisation entirely on warm runs.
    #[test]
    fn symbol_index_roundtrip_preserves_classes_and_functions() {
        use crate::{ClassSymbol, FunctionSymbol, SymbolIndex};
        let tmp = TmpDir::new();
        let mut cache = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let idx = SymbolIndex {
            language: "c".into(),
            total_lines: 42,
            imports: vec![],
            classes: vec![ClassSymbol {
                name: "Foo".into(),
                parent_name: Some("Base".into()),
                is_interface: false,
                is_exported: true,
                method_count: 3,
                has_behavior: true,
                field_names: vec!["x".into()],
                field_types: vec!["int".into()],
                start_line: 10,
                end_line: 20,
                ..Default::default()
            }],
            functions: vec![FunctionSymbol {
                name: "bar".into(),
                is_exported: true,
                parameter_count: 2,
                called_functions: vec!["helper".into(), "log".into()],
                start_line: 30,
                end_line: 40,
                ..Default::default()
            }],
            type_aliases: vec![("Handle".into(), "void*".into())],
        };
        let chash = 0x1234_abcd_u64;
        cache.update_file_entry("t.c".into(), &tmp.path().join("nope"), chash, vec![]);
        cache.put_symbols(chash, &idx);
        let got_l1 = cache.get_symbols(chash).expect("L1 hit");
        assert_eq!(got_l1.classes[0].name, "Foo");
        assert_eq!(got_l1.functions[0].called_functions.len(), 2);
        assert_eq!(got_l1.type_aliases[0].0, "Handle");
        // Force L2 round-trip.
        cache.flush();
        drop(cache);
        let mut fresh = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let from_disk = fresh.get_symbols(chash).expect("L2 hit");
        assert_eq!(from_disk.classes[0].parent_name.as_deref(), Some("Base"));
        assert_eq!(from_disk.functions[0].parameter_count, 2);
    }
}