Skip to main content

cha_core/
cache.rs

1use crate::{Finding, SourceModel, SymbolIndex};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4use std::path::{Path, PathBuf};
5
6/// Per-file cache metadata.
7#[derive(Debug, Serialize, Deserialize)]
8struct FileEntry {
9    mtime_secs: u64,
10    size: u64,
11    content_hash: u64,
12    /// Cached import sources for fast unstable_dependency analysis.
13    #[serde(default)]
14    imports: Vec<String>,
15}
16
17/// Per-file findings cache entry.
18#[derive(Debug, Serialize, Deserialize)]
19struct FindingsEntry {
20    content_hash: u64,
21    findings: Vec<Finding>,
22}
23
24/// On-disk cache metadata.
25#[derive(Debug, Serialize, Deserialize, Default)]
26struct CacheMeta {
27    env_hash: u64,
28    files: HashMap<String, FileEntry>,
29}
30
31/// Unified project cache: parse results + findings + symbol summaries.
32/// Facade over three cache layers (models, symbols, findings) plus the
33/// meta bookkeeping they share. Method count ≥ 10 is structural, not a
34/// smell — each layer exposes get/put atop the layered read/write helpers.
35// cha:ignore large_class
36pub struct ProjectCache {
37    root: PathBuf,
38    meta: CacheMeta,
39    dirty: bool,
40    /// L1 in-memory parse cache (avoids repeated disk reads within same process).
41    mem_models: HashMap<u64, SourceModel>,
42    /// L1 symbol-level summaries. Small enough to keep alongside models; lets
43    /// `cached_symbols` skip the `SourceModel` deserialisation entirely.
44    mem_symbols: HashMap<u64, SymbolIndex>,
45}
46
47fn hash_all_configs(dir: &Path, h: &mut impl std::hash::Hasher) {
48    use std::hash::Hash;
49    let cfg = dir.join(".cha.toml");
50    if let Ok(content) = std::fs::read_to_string(&cfg) {
51        content.hash(h);
52    }
53    let Ok(entries) = std::fs::read_dir(dir) else {
54        return;
55    };
56    for entry in entries.flatten() {
57        let path = entry.path();
58        if path.is_dir() {
59            let name = entry.file_name();
60            let s = name.to_string_lossy();
61            if !s.starts_with('.') && !is_skip_dir(&s) {
62                hash_all_configs(&path, h);
63            }
64        }
65    }
66}
67
/// Location of the cache directory for the project rooted at `root`.
fn cache_dir(root: &Path) -> PathBuf {
    let mut dir = root.to_path_buf();
    dir.push(".cha/cache");
    dir
}
71
/// Whether `name` is one of the directory names that are never descended
/// into (build output, dependency and virtual-environment folders).
/// The comparison is exact and case-sensitive.
fn is_skip_dir(name: &str) -> bool {
    const SKIPPED: [&str; 9] = [
        "target",
        "node_modules",
        "dist",
        "build",
        "out",
        "__pycache__",
        "venv",
        ".venv",
        "vendor",
    ];
    SKIPPED.contains(&name)
}
86
/// Hash `content` with the standard library's `DefaultHasher`.
fn content_hash(content: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut state = DefaultHasher::new();
    Hash::hash(content, &mut state);
    state.finish()
}
93
94/// Layered L1-memory + L2-disk cache read. Templated over any
95/// bincode-serialisable value; SourceModel and SymbolIndex share this.
96fn get_layered<T: serde::de::DeserializeOwned + Clone>(
97    mem: &mut HashMap<u64, T>,
98    root: &Path,
99    subdir: &str,
100    chash: u64,
101) -> Option<T> {
102    if let Some(v) = mem.get(&chash) {
103        return Some(v.clone());
104    }
105    let path = cache_dir(root)
106        .join(subdir)
107        .join(format!("{chash:016x}.bin"));
108    let bytes = std::fs::read(&path).ok()?;
109    let val: T = bincode::deserialize(&bytes).ok()?;
110    mem.insert(chash, val.clone());
111    Some(val)
112}
113
114/// Layered L1 + L2 write. Silently ignores serialisation or filesystem
115/// errors — cache is a speed-up, not a correctness layer.
116fn put_layered<T: serde::Serialize + Clone>(
117    mem: &mut HashMap<u64, T>,
118    root: &Path,
119    subdir: &str,
120    chash: u64,
121    value: &T,
122) {
123    mem.insert(chash, value.clone());
124    let dir = cache_dir(root).join(subdir);
125    let _ = std::fs::create_dir_all(&dir);
126    if let Ok(bytes) = bincode::serialize(value) {
127        let _ = std::fs::write(dir.join(format!("{chash:016x}.bin")), bytes);
128    }
129}
130
/// Return `(mtime, size)` for `path`: the modification time in whole
/// seconds since the Unix epoch and the length in bytes.
///
/// Yields `None` when the metadata cannot be read, the platform does not
/// report a modification time, or that time lies before the epoch.
fn file_mtime_and_size(path: &Path) -> Option<(u64, u64)> {
    let info = std::fs::metadata(path).ok()?;
    let modified = info.modified().ok()?;
    let since_epoch = modified.duration_since(std::time::UNIX_EPOCH).ok()?;
    Some((since_epoch.as_secs(), info.len()))
}
141
142impl ProjectCache {
143    /// Open or create a cache for the given project root.
144    pub fn open(project_root: &Path, env_hash: u64) -> Self {
145        let dir = cache_dir(project_root);
146        let meta_path = dir.join("meta.bin");
147        let meta = std::fs::read(&meta_path)
148            .ok()
149            .and_then(|b| bincode::deserialize::<CacheMeta>(&b).ok())
150            .unwrap_or_default();
151        let meta = if meta.env_hash != env_hash {
152            // Environment changed — full invalidation
153            let _ = std::fs::remove_dir_all(&dir);
154            CacheMeta {
155                env_hash,
156                ..Default::default()
157            }
158        } else {
159            meta
160        };
161        Self {
162            root: project_root.to_path_buf(),
163            meta,
164            dirty: false,
165            mem_models: HashMap::new(),
166            mem_symbols: HashMap::new(),
167        }
168    }
169
170    /// Check if a file is unchanged (mtime + size match).
171    /// Returns (is_unchanged, content_hash) — hash is 0 if unchanged and not yet computed.
172    pub fn check_file(&self, rel_path: &str, path: &Path) -> FileStatus {
173        let Some(entry) = self.meta.files.get(rel_path) else {
174            return FileStatus::Changed;
175        };
176        if let Some((mtime, size)) = file_mtime_and_size(path)
177            && mtime == entry.mtime_secs
178            && size == entry.size
179        {
180            return FileStatus::Unchanged(entry.content_hash);
181        }
182        FileStatus::Changed
183    }
184
185    /// Get cached SourceModel: L1 memory → L2 disk.
186    pub fn get_model(&mut self, chash: u64) -> Option<SourceModel> {
187        get_layered(&mut self.mem_models, &self.root, "parse", chash)
188    }
189
190    /// Store a SourceModel in L1 + L2.
191    pub fn put_model(&mut self, chash: u64, model: &SourceModel) {
192        put_layered(&mut self.mem_models, &self.root, "parse", chash, model);
193    }
194
195    /// Get a cached `SymbolIndex`: L1 memory → L2 disk.
196    /// Fast path for `cha deps`, LSP workspace-symbols and any other
197    /// consumer that only needs structural information (names, relations,
198    /// positions) without per-function-body analyse data.
199    pub fn get_symbols(&mut self, chash: u64) -> Option<SymbolIndex> {
200        get_layered(&mut self.mem_symbols, &self.root, "symbols", chash)
201    }
202
203    /// Store a `SymbolIndex` in L1 + L2. Called alongside `put_model` so
204    /// both caches stay in lockstep.
205    pub fn put_symbols(&mut self, chash: u64, idx: &SymbolIndex) {
206        put_layered(&mut self.mem_symbols, &self.root, "symbols", chash, idx);
207    }
208
209    /// Get cached findings for a file.
210    pub fn get_findings(&self, chash: u64) -> Option<Vec<Finding>> {
211        let path = cache_dir(&self.root)
212            .join("findings")
213            .join(format!("{chash:016x}.bin"));
214        let bytes = std::fs::read(&path).ok()?;
215        let entry: FindingsEntry = bincode::deserialize(&bytes).ok()?;
216        (entry.content_hash == chash).then_some(entry.findings)
217    }
218
219    /// Store findings for a file.
220    pub fn put_findings(&mut self, chash: u64, findings: &[Finding]) {
221        let dir = cache_dir(&self.root).join("findings");
222        let _ = std::fs::create_dir_all(&dir);
223        let entry = FindingsEntry {
224            content_hash: chash,
225            findings: findings.to_vec(),
226        };
227        if let Ok(bytes) = bincode::serialize(&entry) {
228            let _ = std::fs::write(dir.join(format!("{chash:016x}.bin")), bytes);
229        }
230    }
231
232    /// Update file metadata after processing.
233    pub fn update_file_entry(
234        &mut self,
235        rel_path: String,
236        path: &Path,
237        chash: u64,
238        imports: Vec<String>,
239    ) {
240        let (mtime_secs, size) = file_mtime_and_size(path).unwrap_or((0, 0));
241        self.meta.files.insert(
242            rel_path,
243            FileEntry {
244                mtime_secs,
245                size,
246                content_hash: chash,
247                imports,
248            },
249        );
250        self.dirty = true;
251    }
252
253    /// Get cached imports for a file (from meta, no disk I/O).
254    pub fn get_imports(&self, rel_path: &str) -> Option<&[String]> {
255        self.meta.files.get(rel_path).map(|e| e.imports.as_slice())
256    }
257
258    /// Flush metadata to disk and clean up orphan cache files.
259    pub fn flush(&self) {
260        if !self.dirty {
261            return;
262        }
263        let dir = cache_dir(&self.root);
264        let _ = std::fs::create_dir_all(&dir);
265        if let Ok(bytes) = bincode::serialize(&self.meta) {
266            let _ = std::fs::write(dir.join("meta.bin"), bytes);
267        }
268        self.gc();
269    }
270
271    /// Remove orphan cache files not referenced by meta.
272    fn gc(&self) {
273        let hashes: std::collections::HashSet<String> = self
274            .meta
275            .files
276            .values()
277            .map(|e| format!("{:016x}.bin", e.content_hash))
278            .collect();
279        for subdir in &["parse", "findings", "symbols"] {
280            let dir = cache_dir(&self.root).join(subdir);
281            let Ok(entries) = std::fs::read_dir(&dir) else {
282                continue;
283            };
284            for entry in entries.flatten() {
285                let name = entry.file_name().to_string_lossy().to_string();
286                if name.ends_with(".bin") && !hashes.contains(&name) {
287                    let _ = std::fs::remove_file(entry.path());
288                }
289            }
290        }
291        // Remove legacy analysis.json
292        let legacy = cache_dir(&self.root).join("analysis.json");
293        let _ = std::fs::remove_file(legacy);
294    }
295}
296
/// Result of checking a file against the cache.
///
/// Derives the common value traits so callers can log, copy and compare
/// a status directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// File unchanged — carries the content hash recorded in the cache.
    Unchanged(u64),
    /// File changed or not in cache.
    Changed,
}
304
305/// Compute a content hash.
306pub fn hash_content(s: &str) -> u64 {
307    content_hash(s)
308}
309
310/// Compute environment hash from config + plugins + cha binary fingerprint.
311///
312/// The binary fingerprint covers both cases that make cached SourceModels
313/// stale:
314/// - developer rebuilds cha after editing parser code,
315/// - end user upgrades to a new cha release.
316///
317/// Both produce a different on-disk binary, so the binary's modification
318/// time is sufficient. `CARGO_PKG_VERSION` was the old key, but it was
319/// a strict subset of this: every release-version bump necessarily writes
320/// a new binary (new mtime), and no parser change ever happens without a
321/// rebuild (new mtime). Version-only tracking missed parser-behaviour
322/// changes that shipped without a `cargo xtask bump` — this is what let
323/// the header-declaration parser fix silently fail against stale caches.
324pub fn env_hash(project_root: &Path, plugin_dirs: &[PathBuf]) -> u64 {
325    use std::hash::{Hash, Hasher};
326    let mut h = std::collections::hash_map::DefaultHasher::new();
327    hash_cha_binary(&mut h);
328    hash_all_configs(project_root, &mut h);
329    for dir in plugin_dirs {
330        if let Ok(entries) = std::fs::read_dir(dir) {
331            for entry in entries.flatten() {
332                if let Ok(mtime) = entry.metadata().and_then(|m| m.modified()) {
333                    mtime.hash(&mut h);
334                }
335                entry.file_name().hash(&mut h);
336            }
337        }
338    }
339    h.finish()
340}
341
342/// Hash the cha binary's identity. Uses the running executable's mtime;
343/// falls back to `CARGO_PKG_VERSION` if the executable path isn't
344/// discoverable (unusual — sandboxed runners, embedded contexts). Either
345/// path invalidates the cache on every new binary.
346fn hash_cha_binary(h: &mut impl std::hash::Hasher) {
347    use std::hash::Hash;
348    match std::env::current_exe().and_then(|p| p.metadata()?.modified()) {
349        Ok(mtime) => mtime.hash(h),
350        Err(_) => env!("CARGO_PKG_VERSION").hash(h),
351    }
352}
353
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{SourceModel, TypeRef};
    use std::path::{Path, PathBuf};

    /// Scratch directory that is removed again when the guard is dropped,
    /// so neither passing nor panicking tests leave directories behind
    /// under the system temp dir.
    struct TmpDir(PathBuf);

    impl TmpDir {
        /// Path of the scratch directory.
        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TmpDir {
        fn drop(&mut self) {
            // Best effort: a failed cleanup must not mask the test result.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    /// Create a fresh, uniquely named scratch directory for one test.
    fn unique_tmp_dir() -> TmpDir {
        use std::sync::atomic::{AtomicU64, Ordering};
        // Per-thread nanos can collide under parallel test execution —
        // fold in a monotonic counter so each call is unique.
        static SEQ: AtomicU64 = AtomicU64::new(0);
        let base = std::env::temp_dir().join(format!(
            "cha-cache-test-{}-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map(|d| d.as_nanos())
                .unwrap_or(0),
            SEQ.fetch_add(1, Ordering::Relaxed),
        ));
        std::fs::create_dir_all(&base).unwrap();
        TmpDir(base)
    }

    /// Minimal C model carrying two typedef aliases.
    fn sample_model() -> SourceModel {
        SourceModel {
            language: "c".into(),
            total_lines: 10,
            functions: vec![],
            classes: vec![],
            imports: vec![],
            comments: vec![],
            type_aliases: vec![
                ("MyId".into(), "uint32_t".into()),
                ("Handle".into(), "void*".into()),
            ],
        }
    }

    /// Regression: `boundary_leak` used to parse files fresh because the
    /// cache appeared to drop typedef aliases on some C projects. After
    /// v1.11.0 tied the cache key to the binary's mtime, put/get should
    /// round-trip SourceModel faithfully — including `type_aliases`.
    #[test]
    fn cache_roundtrip_preserves_type_aliases() {
        let tmp = unique_tmp_dir();
        let mut cache = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let model = sample_model();
        let chash: u64 = 0xdead_beef_1234_5678;
        // Register a file entry so flush()->gc() keeps the parse blob.
        cache.update_file_entry("x.c".into(), &tmp.path().join("nope"), chash, vec![]);
        cache.put_model(chash, &model);
        let got = cache.get_model(chash).expect("cached model present");
        assert_eq!(got.type_aliases, model.type_aliases);
        // Persist meta so reopening with the same env_hash doesn't
        // trigger the full-invalidation branch.
        cache.flush();
        drop(cache);
        let mut fresh = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let from_disk = fresh.get_model(chash).expect("on-disk model present");
        assert_eq!(from_disk.type_aliases, model.type_aliases);
    }

    /// TypeRef origin information in parameter / return types also has to
    /// survive serialisation; boundary_leak's "interesting" check keys on
    /// `TypeOrigin::External`.
    #[test]
    fn cache_roundtrip_preserves_typeref_origin() {
        use crate::{FunctionInfo, TypeOrigin};
        let tmp = unique_tmp_dir();
        let mut cache = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let model = SourceModel {
            language: "rust".into(),
            total_lines: 5,
            functions: vec![FunctionInfo {
                name: "f".into(),
                parameter_types: vec![TypeRef {
                    name: "ExtThing".into(),
                    raw: "ext::ExtThing".into(),
                    origin: TypeOrigin::External("ext".into()),
                }],
                ..Default::default()
            }],
            classes: vec![],
            imports: vec![],
            comments: vec![],
            type_aliases: vec![],
        };
        cache.put_model(99, &model);
        let got = cache.get_model(99).unwrap();
        let p = &got.functions[0].parameter_types[0];
        assert_eq!(p.name, "ExtThing");
        assert!(matches!(&p.origin, TypeOrigin::External(m) if m == "ext"));
    }

    /// `SymbolIndex` cache writes land in `symbols/{chash}.bin` independent
    /// of `parse/{chash}.bin`. Reopening the cache must recover them byte-
    /// for-byte — this is the invariant that lets `cached_symbols` skip
    /// `SourceModel` deserialisation entirely on warm runs.
    #[test]
    fn symbol_index_roundtrip_preserves_classes_and_functions() {
        use crate::{ClassSymbol, FunctionSymbol, SymbolIndex};
        let tmp = unique_tmp_dir();
        let mut cache = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let idx = SymbolIndex {
            language: "c".into(),
            total_lines: 42,
            imports: vec![],
            classes: vec![ClassSymbol {
                name: "Foo".into(),
                parent_name: Some("Base".into()),
                is_interface: false,
                is_exported: true,
                method_count: 3,
                has_behavior: true,
                field_names: vec!["x".into()],
                field_types: vec!["int".into()],
                start_line: 10,
                end_line: 20,
                ..Default::default()
            }],
            functions: vec![FunctionSymbol {
                name: "bar".into(),
                is_exported: true,
                parameter_count: 2,
                called_functions: vec!["helper".into(), "log".into()],
                start_line: 30,
                end_line: 40,
                ..Default::default()
            }],
            type_aliases: vec![("Handle".into(), "void*".into())],
        };
        let chash = 0x1234_abcd_u64;
        cache.update_file_entry("t.c".into(), &tmp.path().join("nope"), chash, vec![]);
        cache.put_symbols(chash, &idx);
        let got_l1 = cache.get_symbols(chash).expect("L1 hit");
        assert_eq!(got_l1.classes[0].name, "Foo");
        assert_eq!(got_l1.functions[0].called_functions.len(), 2);
        assert_eq!(got_l1.type_aliases[0].0, "Handle");
        // Force L2 round-trip.
        cache.flush();
        drop(cache);
        let mut fresh = ProjectCache::open(tmp.path(), 0xdeadbeef);
        let from_disk = fresh.get_symbols(chash).expect("L2 hit");
        assert_eq!(from_disk.classes[0].parent_name.as_deref(), Some("Base"));
        assert_eq!(from_disk.functions[0].parameter_count, 2);
    }
}