Skip to main content

lean_ctx/core/
bm25_cache.rs

1use std::path::{Path, PathBuf};
2use std::sync::Arc;
3use std::time::{Instant, SystemTime};
4
5use super::bm25_index::BM25Index;
6
/// Fallback cache time-to-live in seconds, used when the
/// `LEAN_CTX_BM25_CACHE_TTL` environment variable is unset or does not parse
/// as an unsigned integer.
const DEFAULT_TTL_SECS: u64 = 60;
8
/// Cheap content fingerprint of the persisted index file: `(mtime, size)`.
///
/// mtime alone is not enough — many filesystems only resolve mtime to 1–2 s, so
/// a background rebuild that lands in the same tick as the load would be missed.
/// Pairing it with the file size catches those same-second rewrites without the
/// cost of hashing a multi-MB index file on every per-query freshness check.
///
/// Two fingerprints compare equal only when both fields match, so a same-tick
/// rewrite that leaves the size unchanged is still not detected.
///
/// The `Default` value (`mtime: None`, `size: 0`) doubles as the fingerprint
/// reported when the index file's metadata cannot be read.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)]
pub struct IndexFingerprint {
    /// Last-modification time reported by the filesystem, or `None` when it
    /// is unavailable.
    mtime: Option<SystemTime>,
    /// File length in bytes.
    size: u64,
}
20
/// A resident BM25 index plus the bookkeeping used to decide whether it may
/// still be served (TTL age and on-disk fingerprint).
pub struct Bm25CacheEntry {
    /// Root path this index was loaded for; lookups for another root miss.
    pub root: PathBuf,
    /// The index itself, behind an `Arc` so callers receive a handle instead
    /// of a copy of the whole index.
    pub index: Arc<BM25Index>,
    /// Monotonic timestamp taken when the entry was stored; drives the TTL.
    pub loaded_at: Instant,
    /// Fingerprint of the persisted index file when this entry was loaded.
    pub fingerprint: IndexFingerprint,
}
28
29impl Bm25CacheEntry {
30    pub fn is_fresh(&self) -> bool {
31        if self.loaded_at.elapsed().as_secs() >= ttl_secs() {
32            return false;
33        }
34        // Precise invalidation: if a background rebuild changed the index file
35        // on disk, the resident copy is stale even within the TTL window.
36        index_fingerprint(&self.root) == self.fingerprint
37    }
38}
39
40/// `(mtime, size)` fingerprint of the persisted BM25 index file for `root`.
41pub(crate) fn index_fingerprint(root: &Path) -> IndexFingerprint {
42    match std::fs::metadata(BM25Index::index_file_path(root)) {
43        Ok(m) => IndexFingerprint {
44            mtime: m.modified().ok(),
45            size: m.len(),
46        },
47        Err(_) => IndexFingerprint::default(),
48    }
49}
50
51fn ttl_secs() -> u64 {
52    std::env::var("LEAN_CTX_BM25_CACHE_TTL")
53        .ok()
54        .and_then(|v| v.parse().ok())
55        .unwrap_or(DEFAULT_TTL_SECS)
56}
57
/// Shared, mutex-guarded slot holding at most one [`Bm25CacheEntry`];
/// `None` means nothing is currently cached.
pub type SharedBm25Cache = std::sync::Arc<std::sync::Mutex<Option<Bm25CacheEntry>>>;
59
60/// Get the BM25 index from cache if available and fresh, otherwise load/build,
61/// cache it, and return. Uses Arc to avoid cloning the entire index.
62pub fn get_or_load(cache: &SharedBm25Cache, root: &Path) -> Arc<BM25Index> {
63    {
64        let guard = cache
65            .lock()
66            .unwrap_or_else(std::sync::PoisonError::into_inner);
67        if let Some(ref entry) = *guard {
68            if entry.root == root && entry.is_fresh() {
69                return Arc::clone(&entry.index);
70            }
71        }
72    }
73
74    let index = Arc::new(BM25Index::load_or_build_fast(root));
75
76    let mut guard = cache
77        .lock()
78        .unwrap_or_else(std::sync::PoisonError::into_inner);
79    *guard = Some(Bm25CacheEntry {
80        root: root.to_path_buf(),
81        index: Arc::clone(&index),
82        loaded_at: Instant::now(),
83        fingerprint: index_fingerprint(root),
84    });
85
86    index
87}
88
89/// Get index from cache (fresh or stale), triggering background rebuild if stale.
90/// Returns None only if no cache entry exists at all.
91pub fn get_or_background(cache: &SharedBm25Cache, root: &Path) -> Option<Arc<BM25Index>> {
92    let guard = cache
93        .lock()
94        .unwrap_or_else(std::sync::PoisonError::into_inner);
95    let entry = guard.as_ref()?;
96    if entry.root != root {
97        return None;
98    }
99
100    let idx = Arc::clone(&entry.index);
101
102    if !entry.is_fresh() {
103        let root_str = root.to_string_lossy().to_string();
104        let cache_clone = cache.clone();
105        let root_clone = root.to_path_buf();
106        std::thread::spawn(move || {
107            // Isolate panics (corrupt index file, FS race): a panic here must not
108            // kill the worker silently — the stale index keeps serving and the
109            // next call retries the refresh.
110            let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
111                let rebuilt = BM25Index::load_or_build(&root_clone);
112                let rebuilt_fp = index_fingerprint(&root_clone);
113                let mut g = cache_clone
114                    .lock()
115                    .unwrap_or_else(std::sync::PoisonError::into_inner);
116                *g = Some(Bm25CacheEntry {
117                    root: root_clone,
118                    index: Arc::new(rebuilt),
119                    loaded_at: Instant::now(),
120                    fingerprint: rebuilt_fp,
121                });
122            }));
123            if result.is_ok() {
124                tracing::debug!("[bm25_cache: background refresh done for {root_str}]");
125            } else {
126                tracing::warn!(
127                    "[bm25_cache: background refresh panicked for {root_str}; serving stale index]"
128                );
129            }
130        });
131    }
132
133    Some(idx)
134}
135
136/// Drops the cached BM25 index, freeing its heap memory.
137/// The index will be rebuilt from disk on the next search.
138pub fn unload(cache: &SharedBm25Cache) {
139    let mut guard = cache
140        .lock()
141        .unwrap_or_else(std::sync::PoisonError::into_inner);
142    if guard.is_some() {
143        *guard = None;
144        tracing::info!("[bm25_cache] unloaded index to free memory");
145    }
146}
147
148/// Returns the approximate heap memory used by the cached BM25 index, or 0.
149pub fn memory_usage(cache: &SharedBm25Cache) -> usize {
150    let guard = cache
151        .lock()
152        .unwrap_or_else(std::sync::PoisonError::into_inner);
153    guard.as_ref().map_or(0, |e| e.index.memory_usage_bytes())
154}
155
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;

    /// Builds a cache slot with nothing in it.
    fn empty_cache() -> SharedBm25Cache {
        Arc::new(std::sync::Mutex::new(None))
    }

    /// Two consecutive loads of the same root report the same document count.
    /// Note: only `doc_count` is compared, not pointer identity.
    #[test]
    fn fresh_cache_returns_same_instance() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("main.rs"), "fn main() {}\n").unwrap();
        let cache = empty_cache();

        let first = get_or_load(&cache, dir.path());
        assert!(first.doc_count > 0);

        let second = get_or_load(&cache, dir.path());
        assert_eq!(first.doc_count, second.doc_count);
    }

    /// Loading a second root replaces the entry cached for the first one.
    #[test]
    fn different_root_invalidates() {
        let dir_a = tempfile::tempdir().unwrap();
        let dir_b = tempfile::tempdir().unwrap();
        std::fs::write(dir_a.path().join("a.rs"), "fn a() {}\n").unwrap();
        std::fs::write(dir_b.path().join("b.rs"), "fn b() {}\n").unwrap();
        let cache = empty_cache();

        let _ = get_or_load(&cache, dir_a.path());
        let from_b = get_or_load(&cache, dir_b.path());

        let slot = cache.lock().unwrap();
        let resident = slot.as_ref().unwrap();
        assert_eq!(resident.root, dir_b.path());
        assert_eq!(resident.index.doc_count, from_b.doc_count);
    }

    /// With nothing cached there is no stale index to hand out.
    #[test]
    fn get_or_background_returns_none_on_empty() {
        let dir = tempfile::tempdir().unwrap();
        let cache = empty_cache();
        assert!(get_or_background(&cache, dir.path()).is_none());
    }

    /// A root with no persisted index file yields the default fingerprint.
    #[test]
    fn fingerprint_default_when_index_file_absent() {
        let dir = tempfile::tempdir().unwrap();
        let observed = index_fingerprint(dir.path());
        assert_eq!(observed, IndexFingerprint::default());
    }

    /// Equal mtime with different size must compare unequal: the size field
    /// is what catches same-second rewrites that mtime alone would miss.
    #[test]
    fn fingerprint_detects_size_change_under_equal_mtime() {
        let mtime = Some(SystemTime::UNIX_EPOCH);
        let small = IndexFingerprint { mtime, size: 100 };
        let large = IndexFingerprint { mtime, size: 200 };
        assert_ne!(small, large);
        assert_eq!(small, IndexFingerprint { mtime, size: 100 });
    }
}
216}