Skip to main content

fallow_extract/cache/
store.rs

1//! Cache store: load, save, and query cached module data.
2
3use std::path::Path;
4
5use rustc_hash::FxHashMap;
6
7use bitcode::{Decode, Encode};
8
9use super::types::{
10    CACHE_VERSION, CachedModule, DEFAULT_CACHE_MAX_SIZE, EVICTION_SIGNIFICANT_BPS,
11    EVICTION_TARGET_BPS, EVICTION_TRIGGER_BPS,
12};
13
14/// Cached module information stored on disk.
15#[derive(Debug, Encode, Decode)]
16pub struct CacheStore {
17    version: u32,
18    /// Stable u64 hash of extraction-affecting config fields (currently the
19    /// active external plugin names + inline framework definition names).
20    /// A mismatch at load time discards the cache, matching how
21    /// `CACHE_VERSION` works but invalidating on a user-driven config change
22    /// rather than on a fallow upgrade. See ADR-009 for the ingredient list.
23    config_hash: u64,
24    /// Map from file path to cached module data.
25    entries: FxHashMap<String, CachedModule>,
26}
27
28impl CacheStore {
29    /// Create a new empty cache.
30    #[must_use]
31    pub fn new() -> Self {
32        Self {
33            version: CACHE_VERSION,
34            config_hash: 0,
35            entries: FxHashMap::default(),
36        }
37    }
38
39    /// Load cache from disk.
40    ///
41    /// Returns `None` when:
42    /// - the file does not exist or cannot be read,
43    /// - the on-disk size exceeds the configured `max_size_bytes` (matches the
44    ///   user's `cache.maxSizeMb` / `FALLOW_CACHE_MAX_SIZE` setting, with the
45    ///   built-in `DEFAULT_CACHE_MAX_SIZE` as the lower bound so a misconfigured
46    ///   tiny cap cannot push a still-valid larger cache into discard),
47    /// - bitcode decoding fails or the decoded version differs from
48    ///   `CACHE_VERSION` (emits a one-line `tracing::info!` so the user sees the
49    ///   one-time rebuild cost; decode failure is the common case across
50    ///   `CACHE_VERSION` bumps because the on-disk schema changes shape and
51    ///   the new struct cannot deserialize the old bytes),
52    /// - the on-disk `config_hash` differs from `expected_config_hash` (silent;
53    ///   config changes are user-driven and routine, no log noise).
54    #[must_use]
55    pub fn load(
56        cache_dir: &Path,
57        expected_config_hash: u64,
58        max_size_bytes: usize,
59    ) -> Option<Self> {
60        let cache_file = cache_dir.join("cache.bin");
61        let data = std::fs::read(&cache_file).ok()?;
62        // Honour both the user's cap AND the built-in default so a
63        // misconfigured tiny cap (e.g. `FALLOW_CACHE_MAX_SIZE=1`) does NOT
64        // throw away a valid existing cache on the load path; the user's
65        // cap takes effect at the NEXT save via the eviction logic.
66        let safety_ceiling = max_size_bytes.max(DEFAULT_CACHE_MAX_SIZE);
67        if data.len() > safety_ceiling {
68            tracing::warn!(
69                size_mb = data.len() / (1024 * 1024),
70                ceiling_mb = safety_ceiling / (1024 * 1024),
71                "Cache file exceeds safety ceiling, ignoring"
72            );
73            return None;
74        }
75        let store: Self = match bitcode::decode(&data) {
76            Ok(s) => s,
77            Err(_) => {
78                tracing::info!(
79                    "Cache format upgraded, rebuilding (one-time cost after version bump)"
80                );
81                return None;
82            }
83        };
84        if store.version != CACHE_VERSION {
85            tracing::info!("Cache format upgraded, rebuilding (one-time cost after version bump)");
86            return None;
87        }
88        if store.config_hash != expected_config_hash {
89            return None;
90        }
91        Some(store)
92    }
93
94    /// Save cache to disk with write-time size enforcement and atomic rename.
95    ///
96    /// Algorithm:
97    /// 1. Set `self.config_hash = config_hash`.
98    /// 2. Encode once.
99    /// 3. If the encoded size exceeds 80% of `max_size_bytes`, evict LRU
100    ///    entries (oldest `last_access_secs` first, path-tiebroken) until
101    ///    the size is below 60% of `max_size_bytes` OR only one entry
102    ///    remains. Re-encode after eviction.
103    /// 4. Write the bytes to `cache.bin.tmp` then `rename` to `cache.bin`.
104    ///    This bounds the partial-truncate window that a plain
105    ///    `std::fs::write` would expose mid-write.
106    ///
107    /// # Errors
108    ///
109    /// Returns an error string when the cache directory cannot be created
110    /// or the temporary file cannot be written or renamed.
111    pub fn save(
112        &mut self,
113        cache_dir: &Path,
114        config_hash: u64,
115        max_size_bytes: usize,
116    ) -> Result<(), String> {
117        std::fs::create_dir_all(cache_dir)
118            .map_err(|e| format!("Failed to create cache dir: {e}"))?;
119        write_cache_gitignore(cache_dir)?;
120
121        self.config_hash = config_hash;
122        let initial_entries = self.entries.len();
123        let mut encoded = bitcode::encode(self);
124
125        // Divide-first ordering keeps the multiplication from saturating at
126        // pathologically-large caps. At most 0.008% rounding error per
127        // operation, negligible for a soft size threshold.
128        let trigger = (max_size_bytes / 10_000).saturating_mul(EVICTION_TRIGGER_BPS);
129        if encoded.len() > trigger {
130            let target = (max_size_bytes / 10_000).saturating_mul(EVICTION_TARGET_BPS);
131            self.evict_lru_to_target(target);
132            encoded = bitcode::encode(self);
133            let evicted = initial_entries.saturating_sub(self.entries.len());
134            let final_size = encoded.len();
135            // `initial_entries` is bounded by the file count, so
136            // `usize` saturation is not a concern here. Use the
137            // multiply-then-divide ordering so small caches (< 10k
138            // entries) still produce a non-zero significance threshold.
139            let significant_evicted =
140                initial_entries.saturating_mul(EVICTION_SIGNIFICANT_BPS) / 10_000;
141            if evicted >= significant_evicted && initial_entries > 0 {
142                tracing::info!(
143                    evicted_entries = evicted,
144                    remaining_entries = self.entries.len(),
145                    final_size_kb = final_size / 1024,
146                    max_size_kb = max_size_bytes / 1024,
147                    "Cache eviction: removed oldest entries to stay under cap"
148                );
149            } else {
150                tracing::debug!(
151                    evicted_entries = evicted,
152                    remaining_entries = self.entries.len(),
153                    final_size_kb = final_size / 1024,
154                    max_size_kb = max_size_bytes / 1024,
155                    "Cache eviction"
156                );
157            }
158        }
159
160        let cache_file = cache_dir.join("cache.bin");
161        atomic_write(&cache_file, &encoded)?;
162        Ok(())
163    }
164
165    /// Evict LRU entries until the re-encoded size is under `target_bytes`
166    /// OR only one entry remains. The single-entry floor exists so the
167    /// cache stays useful under extremely tight caps; if even one entry
168    /// busts the cap, the call site logs a warning and the cap is
169    /// overshot intentionally rather than silently lying about respecting
170    /// it (the alternative is dropping the entry and rebuilding the cache
171    /// from scratch every run, which is worse).
172    fn evict_lru_to_target(&mut self, target_bytes: usize) {
173        // Collect (key, last_access_secs) pairs and sort ascending so the
174        // oldest leave first. Ties break on path string for reproducible
175        // eviction order across runs (FxHashMap iteration order is not
176        // stable across processes).
177        let mut order: Vec<(u64, String)> = self
178            .entries
179            .iter()
180            .map(|(k, v)| (v.last_access_secs, k.clone()))
181            .collect();
182        order.sort();
183
184        // Drop in batches of 100 to amortize the re-encode cost: 100k
185        // entries with one re-encode per eviction would be O(n^2 * encode).
186        const BATCH: usize = 100;
187        let mut idx = 0;
188        while idx < order.len() {
189            let batch_end = (idx + BATCH).min(order.len());
190            for (_, key) in &order[idx..batch_end] {
191                if self.entries.len() <= 1 {
192                    break;
193                }
194                self.entries.remove(key);
195            }
196            idx = batch_end;
197
198            // Cheap progress check: re-encode and bail if we're already
199            // under target. This costs one extra encode per 100 evictions,
200            // but avoids over-evicting when the bulk of the size came from
201            // a small number of large entries near the front.
202            let encoded_size = bitcode::encode(self).len();
203            if encoded_size <= target_bytes || self.entries.len() <= 1 {
204                if encoded_size > target_bytes && self.entries.len() <= 1 {
205                    tracing::warn!(
206                        encoded_kb = encoded_size / 1024,
207                        target_kb = target_bytes / 1024,
208                        "Single cache entry exceeds configured max; cache will overshoot the cap"
209                    );
210                }
211                return;
212            }
213        }
214    }
215
216    /// Look up a cached module by path and content hash.
217    /// Returns None if not cached or hash mismatch.
218    #[must_use]
219    pub fn get(&self, path: &Path, content_hash: u64) -> Option<&CachedModule> {
220        let key = path.to_string_lossy();
221        let entry = self.entries.get(key.as_ref())?;
222        if entry.content_hash == content_hash {
223            Some(entry)
224        } else {
225            None
226        }
227    }
228
229    /// Insert or update a cached module.
230    pub fn insert(&mut self, path: &Path, module: CachedModule) {
231        let key = path.to_string_lossy().into_owned();
232        self.entries.insert(key, module);
233    }
234
235    /// Fast cache lookup using only file metadata (mtime + size).
236    ///
237    /// If the cached entry has matching mtime and size, the file content
238    /// almost certainly has not changed, so we can skip reading the file
239    /// entirely. This turns a cache hit from `stat() + read() + hash`
240    /// into just `stat()`.
241    #[must_use]
242    pub fn get_by_metadata(
243        &self,
244        path: &Path,
245        mtime_secs: u64,
246        file_size: u64,
247    ) -> Option<&CachedModule> {
248        let key = path.to_string_lossy();
249        let entry = self.entries.get(key.as_ref())?;
250        if entry.mtime_secs == mtime_secs && entry.file_size == file_size && mtime_secs > 0 {
251            Some(entry)
252        } else {
253            None
254        }
255    }
256
257    /// Look up a cached module by path only (ignoring hash).
258    /// Used to check whether a module's content hash matches without
259    /// requiring the caller to know the hash upfront.
260    #[must_use]
261    pub fn get_by_path_only(&self, path: &Path) -> Option<&CachedModule> {
262        let key = path.to_string_lossy();
263        self.entries.get(key.as_ref())
264    }
265
266    /// Remove cache entries for files that are no longer in the project.
267    /// Keeps the cache from growing unboundedly as files are deleted.
268    pub fn retain_paths(&mut self, files: &[fallow_types::discover::DiscoveredFile]) {
269        use rustc_hash::FxHashSet;
270        let current_paths: FxHashSet<String> = files
271            .iter()
272            .map(|f| f.path.to_string_lossy().to_string())
273            .collect();
274        self.entries.retain(|key, _| current_paths.contains(key));
275    }
276
277    /// Number of cached entries.
278    #[must_use]
279    pub fn len(&self) -> usize {
280        self.entries.len()
281    }
282
283    /// Whether cache is empty.
284    #[must_use]
285    pub fn is_empty(&self) -> bool {
286        self.entries.is_empty()
287    }
288}
289
290fn write_cache_gitignore(cache_dir: &Path) -> Result<(), String> {
291    std::fs::write(cache_dir.join(".gitignore"), "*\n")
292        .map_err(|e| format!("Failed to write cache .gitignore: {e}"))
293}
294
295/// Write `data` to `cache_file` atomically: write to a sibling `.tmp` file,
296/// best-effort fsync, then rename over the destination. Bounds the
297/// partial-truncate window that a plain `std::fs::write` exposes, which
298/// matters more once the eviction path encodes twice for large caches.
299fn atomic_write(cache_file: &Path, data: &[u8]) -> Result<(), String> {
300    let tmp_file = match cache_file.file_name() {
301        Some(name) => cache_file.with_file_name({
302            let mut s = name.to_os_string();
303            s.push(".tmp");
304            s
305        }),
306        None => return Err("Cache file path has no filename component".to_owned()),
307    };
308
309    {
310        use std::io::Write as _;
311        let mut f = std::fs::File::create(&tmp_file)
312            .map_err(|e| format!("Failed to create cache tmp: {e}"))?;
313        f.write_all(data)
314            .map_err(|e| format!("Failed to write cache tmp: {e}"))?;
315        // Best-effort fsync. Failures here are non-fatal because the
316        // rename below is still atomic on every platform fallow targets;
317        // the fsync just reduces the chance of post-power-loss corruption.
318        let _ = f.sync_all();
319    }
320
321    std::fs::rename(&tmp_file, cache_file)
322        .map_err(|e| format!("Failed to rename cache tmp into place: {e}"))?;
323    Ok(())
324}
325
326impl Default for CacheStore {
327    fn default() -> Self {
328        Self::new()
329    }
330}