Skip to main content

lean_ctx/core/
content_cache.rs

1//! Resident, bounded file-content cache shared across the search-index build and
2//! `ctx_search` (issue #148).
3//!
4//! Before this module the trigram [`search_index`](crate::core::search_index)
5//! build read *every* file in the corpus to extract trigrams and then threw the
6//! content away, after which `ctx_search` read the narrowed candidate files
7//! **again** to run the regex line-by-line — the corpus was read from disk
8//! twice. This cache lets the first reader (whichever it is) populate file
9//! contents once, keyed by absolute path and validated by `(mtime, size)`, and
10//! every subsequent reader reuse them as an in-memory hit.
11//!
//! Correctness: an entry is only ever served when the file's *current*
//! `(mtime, size)` exactly matches the stored identity, so any edit that
//! changes the mtime or the size is a guaranteed miss. The identity is only as
//! precise as the millisecond mtime: a same-size rewrite that lands on the
//! same millisecond is indistinguishable from the cached version. A miss
//! simply falls back to a disk read.
16//!
17//! Bounds & safety:
18//! - Total resident bytes are capped (`LEAN_CTX_CONTENT_CACHE_MB`, default
19//!   128 MB) with approximate-LRU eviction, so a large corpus cannot grow the
20//!   cache without limit.
21//! - Inserts are skipped while the process is under memory pressure, and the
22//!   eviction orchestrator can [`clear`] the cache on `UnloadIndices` /
23//!   `EmergencyDrop`.
24
25use std::collections::HashMap;
26use std::fs::Metadata;
27use std::path::{Path, PathBuf};
28use std::sync::{Arc, Mutex, OnceLock};
29use std::time::UNIX_EPOCH;
30
/// Default resident byte budget, expressed in mebibytes, used when
/// `LEAN_CTX_CONTENT_CACHE_MB` is unset or does not parse as an unsigned integer.
const DEFAULT_BUDGET_MB: usize = 128;
33
/// Identity of one file *version*. A changed mtime or size ⇒ stale ⇒ cache miss.
/// Mirrors the `(mtime, size)` pair the BM25 index already trusts for staleness.
///
/// Two states compare equal only when both fields are equal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FileState {
    /// Last-modified time in whole milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File length in bytes, as reported by the file's metadata.
    pub size_bytes: u64,
}

impl FileState {
    /// Build from an already-`stat`ed [`Metadata`] (no extra syscall) — callers
    /// in the hot path typically have this in hand from their size/regular-file
    /// checks.
    ///
    /// Returns `None` when the modification time is unavailable, lies before
    /// the Unix epoch, or cannot be expressed as a `u64` millisecond count.
    pub fn from_metadata(meta: &Metadata) -> Option<Self> {
        let since_epoch = meta.modified().ok()?.duration_since(UNIX_EPOCH).ok()?;
        // Checked arithmetic instead of a truncating `u128 as u64` cast: an
        // out-of-range timestamp yields `None` rather than a wrapped value.
        let mtime_ms = since_epoch
            .as_secs()
            .checked_mul(1000)?
            .checked_add(u64::from(since_epoch.subsec_millis()))?;
        Some(Self {
            mtime_ms,
            size_bytes: meta.len(),
        })
    }

    /// Convenience: `stat` the path then build the state. Costs one syscall.
    ///
    /// Returns `None` when the path cannot be `stat`ed or when
    /// [`FileState::from_metadata`] returns `None`.
    pub fn from_path(path: &Path) -> Option<Self> {
        let meta = path.metadata().ok()?;
        Self::from_metadata(&meta)
    }
}
63
64struct Entry {
65    state: FileState,
66    content: Arc<str>,
67    /// Logical clock tick of the last hit/insert — drives approximate LRU.
68    last_used: u64,
69}
70
71struct Cache {
72    map: HashMap<PathBuf, Entry>,
73    total_bytes: usize,
74    budget_bytes: usize,
75    clock: u64,
76    hits: u64,
77    misses: u64,
78    inserts: u64,
79    evictions: u64,
80}
81
82impl Cache {
83    fn new(budget_bytes: usize) -> Self {
84        Self {
85            map: HashMap::new(),
86            total_bytes: 0,
87            budget_bytes,
88            clock: 0,
89            hits: 0,
90            misses: 0,
91            inserts: 0,
92            evictions: 0,
93        }
94    }
95
96    fn tick(&mut self) -> u64 {
97        self.clock += 1;
98        self.clock
99    }
100
101    fn remove_entry(&mut self, path: &Path) {
102        if let Some(old) = self.map.remove(path) {
103            self.total_bytes = self.total_bytes.saturating_sub(old.content.len());
104        }
105    }
106
107    /// Evict approximate-LRU entries until the budget is satisfied. Eviction
108    /// only runs after an over-budget insert, so the `O(n)` min-scan is rare and
109    /// dwarfed by the disk reads it prevents.
110    fn evict_to_budget(&mut self) {
111        while self.total_bytes > self.budget_bytes && !self.map.is_empty() {
112            let Some(victim) = self
113                .map
114                .iter()
115                .min_by_key(|(_, e)| e.last_used)
116                .map(|(p, _)| p.clone())
117            else {
118                break;
119            };
120            self.remove_entry(&victim);
121            self.evictions += 1;
122        }
123    }
124}
125
/// Process-wide cache instance. It starts unset and is created by `cache()` on
/// first use; `clear`, `memory_usage_bytes` and `stats` check `CACHE.get()`
/// first so that they never create it themselves.
static CACHE: OnceLock<Mutex<Cache>> = OnceLock::new();
127
128fn budget_bytes() -> usize {
129    let mb = std::env::var("LEAN_CTX_CONTENT_CACHE_MB")
130        .ok()
131        .and_then(|v| v.trim().parse::<usize>().ok())
132        .unwrap_or(DEFAULT_BUDGET_MB);
133    mb.saturating_mul(1024 * 1024)
134}
135
136fn disabled() -> bool {
137    // A zero byte budget (or the explicit disable flag) turns the cache into a
138    // no-op pass-through — every `get` misses and `insert` is dropped.
139    std::env::var("LEAN_CTX_DISABLE_CONTENT_CACHE")
140        .is_ok_and(|v| v == "1" || v.eq_ignore_ascii_case("true"))
141        || budget_bytes() == 0
142}
143
144fn cache() -> &'static Mutex<Cache> {
145    CACHE.get_or_init(|| Mutex::new(Cache::new(budget_bytes())))
146}
147
148fn lock() -> std::sync::MutexGuard<'static, Cache> {
149    cache()
150        .lock()
151        .unwrap_or_else(std::sync::PoisonError::into_inner)
152}
153
154/// Look up `path`; returns the cached content only when the supplied current
155/// `(mtime, size)` matches the stored identity. A mismatch evicts the stale
156/// entry and reports a miss. `state` is passed in (not re-`stat`ed) because hot
157/// callers already hold the metadata.
158pub fn get(path: &Path, current: FileState) -> Option<Arc<str>> {
159    if disabled() {
160        return None;
161    }
162    let mut c = lock();
163    let Some(entry) = c.map.get(path) else {
164        c.misses += 1;
165        return None;
166    };
167    let matches = entry.state == current;
168    if !matches {
169        // Stale version cached — drop it so we don't keep paying for it.
170        c.remove_entry(path);
171        c.misses += 1;
172        return None;
173    }
174    let tick = c.tick();
175    c.hits += 1;
176    let entry = c
177        .map
178        .get_mut(path)
179        .expect("entry present after fresh match");
180    entry.last_used = tick;
181    Some(Arc::clone(&entry.content))
182}
183
184/// Insert (or replace) the content for `path` at version `state`. Skipped while
185/// the process is under memory pressure or when the cache is disabled, so the
186/// cache never *adds* to a memory problem.
187pub fn insert(path: &Path, state: FileState, content: Arc<str>) {
188    if disabled() || crate::core::memory_guard::is_under_pressure() {
189        return;
190    }
191    let len = content.len();
192    let mut c = lock();
193    // A single file larger than the whole budget would thrash eviction — skip it.
194    if len > c.budget_bytes {
195        return;
196    }
197    c.remove_entry(path);
198    let tick = c.tick();
199    c.map.insert(
200        path.to_path_buf(),
201        Entry {
202            state,
203            content,
204            last_used: tick,
205        },
206    );
207    c.total_bytes += len;
208    c.inserts += 1;
209    if c.total_bytes > c.budget_bytes {
210        c.evict_to_budget();
211    }
212}
213
214/// Read a file through the cache: returns cached content on a fresh hit, else
215/// reads from disk (UTF-8), populates the cache, and returns it. `None` on a
216/// non-UTF-8/unreadable/unstatable file. Convenience for callers without their
217/// own size/special-file gating (the search-index build and `ctx_search` use
218/// the explicit [`get`]/[`insert`] pair so they keep their own skip rules).
219pub fn get_or_read(path: &Path) -> Option<Arc<str>> {
220    let state = FileState::from_path(path)?;
221    if let Some(hit) = get(path, state) {
222        return Some(hit);
223    }
224    let content = std::fs::read_to_string(path).ok()?;
225    let arc: Arc<str> = Arc::from(content);
226    insert(path, state, Arc::clone(&arc));
227    Some(arc)
228}
229
230/// Drop all entries, freeing the heap. Called by the eviction orchestrator under
231/// memory pressure; the cache simply re-warms on subsequent reads.
232pub fn clear() {
233    if CACHE.get().is_none() {
234        return;
235    }
236    let mut c = lock();
237    c.map.clear();
238    c.total_bytes = 0;
239}
240
241/// Approximate resident heap used by cached contents, in bytes.
242pub fn memory_usage_bytes() -> usize {
243    if CACHE.get().is_none() {
244        return 0;
245    }
246    lock().total_bytes
247}
248
/// Observability snapshot:
/// `(hits, misses, entries, bytes, inserts, evictions)`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CacheStats {
    /// Lookups that returned cached content.
    pub hits: u64,
    /// Lookups that found no entry, or found one stored under a different
    /// `(mtime, size)` identity.
    pub misses: u64,
    /// Number of entries currently held.
    pub entries: usize,
    /// Sum of the lengths of the contents currently held.
    pub bytes: usize,
    /// Entries stored via `insert` (replacements included).
    pub inserts: u64,
    /// Entries removed to bring the cache back under its byte budget.
    pub evictions: u64,
}
259
260pub fn stats() -> CacheStats {
261    if CACHE.get().is_none() {
262        return CacheStats::default();
263    }
264    let c = lock();
265    CacheStats {
266        hits: c.hits,
267        misses: c.misses,
268        entries: c.map.len(),
269        bytes: c.total_bytes,
270        inserts: c.inserts,
271        evictions: c.evictions,
272    }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// The cache is a process-wide global and these tests mutate it (and the
    /// budget env var), so they take this lock to avoid observing each other's
    /// state.
    static TEST_LOCK: Mutex<()> = Mutex::new(());

    /// Acquires `TEST_LOCK`, recovering the guard if an earlier test panicked
    /// while holding it.
    fn serialized() -> std::sync::MutexGuard<'static, ()> {
        TEST_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Clears both cache env vars and replaces the global cache with an empty
    /// one that has the given byte budget.
    fn fresh_cache(budget_bytes: usize) {
        for var in [
            "LEAN_CTX_CONTENT_CACHE_MB",
            "LEAN_CTX_DISABLE_CONTENT_CACHE",
        ] {
            std::env::remove_var(var);
        }
        *lock() = Cache::new(budget_bytes);
    }

    /// Writes `body` to `dir/name` and returns the resulting path.
    fn write(dir: &Path, name: &str, body: &str) -> PathBuf {
        let target = dir.join(name);
        std::fs::write(&target, body).unwrap();
        target
    }

    #[test]
    fn hit_after_insert_with_matching_state() {
        let _serial = serialized();
        fresh_cache(1024 * 1024);
        let body = "fn main() {}\n";
        let dir = tempfile::tempdir().unwrap();
        let p = write(dir.path(), "a.rs", body);
        let state = FileState::from_path(&p).unwrap();

        assert!(get(&p, state).is_none(), "cold cache must miss");
        insert(&p, state, Arc::from(body));
        let got = get(&p, state).expect("warm cache must hit");
        assert_eq!(&*got, body);
    }

    #[test]
    fn mtime_or_size_change_invalidates() {
        let _serial = serialized();
        fresh_cache(1024 * 1024);
        let body = "v1\n";
        let dir = tempfile::tempdir().unwrap();
        let p = write(dir.path(), "a.rs", body);
        let s1 = FileState::from_path(&p).unwrap();
        insert(&p, s1, Arc::from(body));
        assert!(get(&p, s1).is_some());

        // A different size is a different state: miss, and the stale entry goes.
        let s_bigger = FileState {
            mtime_ms: s1.mtime_ms,
            size_bytes: s1.size_bytes + 10,
        };
        assert!(get(&p, s_bigger).is_none(), "size change must miss");
        assert!(
            get(&p, s1).is_none(),
            "stale entry must be evicted on mismatch"
        );

        // A different mtime misses as well.
        insert(&p, s1, Arc::from(body));
        let s_newer = FileState {
            mtime_ms: s1.mtime_ms + 1,
            size_bytes: s1.size_bytes,
        };
        assert!(get(&p, s_newer).is_none(), "mtime change must miss");
    }

    #[test]
    fn get_or_read_populates_then_serves_from_cache() {
        let _serial = serialized();
        fresh_cache(1024 * 1024);
        let body = "hello world\n";
        let dir = tempfile::tempdir().unwrap();
        let p = write(dir.path(), "a.rs", body);

        let before = stats();
        let first = get_or_read(&p).unwrap();
        assert_eq!(&*first, body);
        let after_first = stats();
        assert_eq!(
            after_first.inserts,
            before.inserts + 1,
            "first read inserts"
        );

        let second = get_or_read(&p).unwrap();
        assert_eq!(&*second, body);
        let after_second = stats();
        assert_eq!(
            after_second.inserts, after_first.inserts,
            "second read must NOT re-insert (served from cache)"
        );
        assert!(after_second.hits > after_first.hits, "second read is a hit");
    }

    #[test]
    fn eviction_keeps_cache_within_budget() {
        let _serial = serialized();
        // Budget fits two 28-byte files; a third insert must evict the LRU one.
        fresh_cache(64);
        let body_a = "a".repeat(28);
        let body_b = "b".repeat(28);
        let body_c = "c".repeat(28);
        let dir = tempfile::tempdir().unwrap();
        let pa = write(dir.path(), "a", &body_a);
        let pb = write(dir.path(), "b", &body_b);
        let pc = write(dir.path(), "c", &body_c);
        let sa = FileState::from_path(&pa).unwrap();
        let sb = FileState::from_path(&pb).unwrap();
        let sc = FileState::from_path(&pc).unwrap();

        insert(&pa, sa, Arc::from(body_a.as_str()));
        // Keep touching `a` so that `b` ends up as the LRU victim.
        let _ = get(&pa, sa);
        insert(&pb, sb, Arc::from(body_b.as_str()));
        let _ = get(&pa, sa);
        insert(&pc, sc, Arc::from(body_c.as_str()));

        let st = stats();
        assert!(st.bytes <= 64, "cache must respect byte budget: {st:?}");
        assert!(st.evictions >= 1, "an eviction must have occurred: {st:?}");
        assert!(get(&pa, sa).is_some(), "recently-used entry must survive");
    }

    #[test]
    fn disabled_via_zero_budget_is_passthrough() {
        let _serial = serialized();
        fresh_cache(1024 * 1024);
        std::env::set_var("LEAN_CTX_CONTENT_CACHE_MB", "0");
        let body = "x\n";
        let dir = tempfile::tempdir().unwrap();
        let p = write(dir.path(), "a.rs", body);
        let state = FileState::from_path(&p).unwrap();
        insert(&p, state, Arc::from(body));
        assert!(get(&p, state).is_none(), "zero-budget cache is a no-op");
        std::env::remove_var("LEAN_CTX_CONTENT_CACHE_MB");
    }
}
414}