fallow_extract/cache/store.rs
1//! Cache store: load, save, and query cached module data.
2
3use std::path::Path;
4
5use rustc_hash::FxHashMap;
6
7use bitcode::{Decode, Encode};
8
9use super::types::{
10 CACHE_VERSION, CachedModule, DEFAULT_CACHE_MAX_SIZE, EVICTION_SIGNIFICANT_BPS,
11 EVICTION_TARGET_BPS, EVICTION_TRIGGER_BPS,
12};
13
14/// Cached module information stored on disk.
15#[derive(Debug, Encode, Decode)]
16pub struct CacheStore {
17 version: u32,
18 /// Stable u64 hash of extraction-affecting config fields (currently the
19 /// active external plugin names + inline framework definition names).
20 /// A mismatch at load time discards the cache, matching how
21 /// `CACHE_VERSION` works but invalidating on a user-driven config change
22 /// rather than on a fallow upgrade. See ADR-009 for the ingredient list.
23 config_hash: u64,
24 /// Map from file path to cached module data.
25 entries: FxHashMap<String, CachedModule>,
26}
27
28impl CacheStore {
29 /// Create a new empty cache.
30 #[must_use]
31 pub fn new() -> Self {
32 Self {
33 version: CACHE_VERSION,
34 config_hash: 0,
35 entries: FxHashMap::default(),
36 }
37 }
38
39 /// Load cache from disk.
40 ///
41 /// Returns `None` when:
42 /// - the file does not exist or cannot be read,
43 /// - the on-disk size exceeds the configured `max_size_bytes` (matches the
44 /// user's `cache.maxSizeMb` / `FALLOW_CACHE_MAX_SIZE` setting, with the
45 /// built-in `DEFAULT_CACHE_MAX_SIZE` as the lower bound so a misconfigured
46 /// tiny cap cannot push a still-valid larger cache into discard),
47 /// - bitcode decoding fails or the decoded version differs from
48 /// `CACHE_VERSION` (emits a one-line `tracing::info!` so the user sees the
49 /// one-time rebuild cost; decode failure is the common case across
50 /// `CACHE_VERSION` bumps because the on-disk schema changes shape and
51 /// the new struct cannot deserialize the old bytes),
52 /// - the on-disk `config_hash` differs from `expected_config_hash` (silent;
53 /// config changes are user-driven and routine, no log noise).
54 #[must_use]
55 pub fn load(
56 cache_dir: &Path,
57 expected_config_hash: u64,
58 max_size_bytes: usize,
59 ) -> Option<Self> {
60 let cache_file = cache_dir.join("cache.bin");
61 let data = std::fs::read(&cache_file).ok()?;
62 // Honour both the user's cap AND the built-in default so a
63 // misconfigured tiny cap (e.g. `FALLOW_CACHE_MAX_SIZE=1`) does NOT
64 // throw away a valid existing cache on the load path; the user's
65 // cap takes effect at the NEXT save via the eviction logic.
66 let safety_ceiling = max_size_bytes.max(DEFAULT_CACHE_MAX_SIZE);
67 if data.len() > safety_ceiling {
68 tracing::warn!(
69 size_mb = data.len() / (1024 * 1024),
70 ceiling_mb = safety_ceiling / (1024 * 1024),
71 "Cache file exceeds safety ceiling, ignoring"
72 );
73 return None;
74 }
75 let store: Self = match bitcode::decode(&data) {
76 Ok(s) => s,
77 Err(_) => {
78 tracing::info!(
79 "Cache format upgraded, rebuilding (one-time cost after version bump)"
80 );
81 return None;
82 }
83 };
84 if store.version != CACHE_VERSION {
85 tracing::info!("Cache format upgraded, rebuilding (one-time cost after version bump)");
86 return None;
87 }
88 if store.config_hash != expected_config_hash {
89 return None;
90 }
91 Some(store)
92 }
93
94 /// Save cache to disk with write-time size enforcement and atomic rename.
95 ///
96 /// Algorithm:
97 /// 1. Set `self.config_hash = config_hash`.
98 /// 2. Encode once.
99 /// 3. If the encoded size exceeds 80% of `max_size_bytes`, evict LRU
100 /// entries (oldest `last_access_secs` first, path-tiebroken) until
101 /// the size is below 60% of `max_size_bytes` OR only one entry
102 /// remains. Re-encode after eviction.
103 /// 4. Write the bytes to `cache.bin.tmp` then `rename` to `cache.bin`.
104 /// This bounds the partial-truncate window that a plain
105 /// `std::fs::write` would expose mid-write.
106 ///
107 /// # Errors
108 ///
109 /// Returns an error string when the cache directory cannot be created
110 /// or the temporary file cannot be written or renamed.
111 pub fn save(
112 &mut self,
113 cache_dir: &Path,
114 config_hash: u64,
115 max_size_bytes: usize,
116 ) -> Result<(), String> {
117 std::fs::create_dir_all(cache_dir)
118 .map_err(|e| format!("Failed to create cache dir: {e}"))?;
119 write_cache_gitignore(cache_dir)?;
120
121 self.config_hash = config_hash;
122 let initial_entries = self.entries.len();
123 let mut encoded = bitcode::encode(self);
124
125 // Divide-first ordering keeps the multiplication from saturating at
126 // pathologically-large caps. At most 0.008% rounding error per
127 // operation, negligible for a soft size threshold.
128 let trigger = (max_size_bytes / 10_000).saturating_mul(EVICTION_TRIGGER_BPS);
129 if encoded.len() > trigger {
130 let target = (max_size_bytes / 10_000).saturating_mul(EVICTION_TARGET_BPS);
131 self.evict_lru_to_target(target);
132 encoded = bitcode::encode(self);
133 let evicted = initial_entries.saturating_sub(self.entries.len());
134 let final_size = encoded.len();
135 // `initial_entries` is bounded by the file count, so
136 // `usize` saturation is not a concern here. Use the
137 // multiply-then-divide ordering so small caches (< 10k
138 // entries) still produce a non-zero significance threshold.
139 let significant_evicted =
140 initial_entries.saturating_mul(EVICTION_SIGNIFICANT_BPS) / 10_000;
141 if evicted >= significant_evicted && initial_entries > 0 {
142 tracing::info!(
143 evicted_entries = evicted,
144 remaining_entries = self.entries.len(),
145 final_size_kb = final_size / 1024,
146 max_size_kb = max_size_bytes / 1024,
147 "Cache eviction: removed oldest entries to stay under cap"
148 );
149 } else {
150 tracing::debug!(
151 evicted_entries = evicted,
152 remaining_entries = self.entries.len(),
153 final_size_kb = final_size / 1024,
154 max_size_kb = max_size_bytes / 1024,
155 "Cache eviction"
156 );
157 }
158 }
159
160 let cache_file = cache_dir.join("cache.bin");
161 atomic_write(&cache_file, &encoded)?;
162 Ok(())
163 }
164
165 /// Evict LRU entries until the re-encoded size is under `target_bytes`
166 /// OR only one entry remains. The single-entry floor exists so the
167 /// cache stays useful under extremely tight caps; if even one entry
168 /// busts the cap, the call site logs a warning and the cap is
169 /// overshot intentionally rather than silently lying about respecting
170 /// it (the alternative is dropping the entry and rebuilding the cache
171 /// from scratch every run, which is worse).
172 fn evict_lru_to_target(&mut self, target_bytes: usize) {
173 // Collect (key, last_access_secs) pairs and sort ascending so the
174 // oldest leave first. Ties break on path string for reproducible
175 // eviction order across runs (FxHashMap iteration order is not
176 // stable across processes).
177 let mut order: Vec<(u64, String)> = self
178 .entries
179 .iter()
180 .map(|(k, v)| (v.last_access_secs, k.clone()))
181 .collect();
182 order.sort();
183
184 // Drop in batches of 100 to amortize the re-encode cost: 100k
185 // entries with one re-encode per eviction would be O(n^2 * encode).
186 const BATCH: usize = 100;
187 let mut idx = 0;
188 while idx < order.len() {
189 let batch_end = (idx + BATCH).min(order.len());
190 for (_, key) in &order[idx..batch_end] {
191 if self.entries.len() <= 1 {
192 break;
193 }
194 self.entries.remove(key);
195 }
196 idx = batch_end;
197
198 // Cheap progress check: re-encode and bail if we're already
199 // under target. This costs one extra encode per 100 evictions,
200 // but avoids over-evicting when the bulk of the size came from
201 // a small number of large entries near the front.
202 let encoded_size = bitcode::encode(self).len();
203 if encoded_size <= target_bytes || self.entries.len() <= 1 {
204 if encoded_size > target_bytes && self.entries.len() <= 1 {
205 tracing::warn!(
206 encoded_kb = encoded_size / 1024,
207 target_kb = target_bytes / 1024,
208 "Single cache entry exceeds configured max; cache will overshoot the cap"
209 );
210 }
211 return;
212 }
213 }
214 }
215
216 /// Look up a cached module by path and content hash.
217 /// Returns None if not cached or hash mismatch.
218 #[must_use]
219 pub fn get(&self, path: &Path, content_hash: u64) -> Option<&CachedModule> {
220 let key = path.to_string_lossy();
221 let entry = self.entries.get(key.as_ref())?;
222 if entry.content_hash == content_hash {
223 Some(entry)
224 } else {
225 None
226 }
227 }
228
229 /// Insert or update a cached module.
230 pub fn insert(&mut self, path: &Path, module: CachedModule) {
231 let key = path.to_string_lossy().into_owned();
232 self.entries.insert(key, module);
233 }
234
235 /// Fast cache lookup using only file metadata (mtime + size).
236 ///
237 /// If the cached entry has matching mtime and size, the file content
238 /// almost certainly has not changed, so we can skip reading the file
239 /// entirely. This turns a cache hit from `stat() + read() + hash`
240 /// into just `stat()`.
241 #[must_use]
242 pub fn get_by_metadata(
243 &self,
244 path: &Path,
245 mtime_secs: u64,
246 file_size: u64,
247 ) -> Option<&CachedModule> {
248 let key = path.to_string_lossy();
249 let entry = self.entries.get(key.as_ref())?;
250 if entry.mtime_secs == mtime_secs && entry.file_size == file_size && mtime_secs > 0 {
251 Some(entry)
252 } else {
253 None
254 }
255 }
256
257 /// Look up a cached module by path only (ignoring hash).
258 /// Used to check whether a module's content hash matches without
259 /// requiring the caller to know the hash upfront.
260 #[must_use]
261 pub fn get_by_path_only(&self, path: &Path) -> Option<&CachedModule> {
262 let key = path.to_string_lossy();
263 self.entries.get(key.as_ref())
264 }
265
266 /// Remove cache entries for files that are no longer in the project.
267 /// Keeps the cache from growing unboundedly as files are deleted.
268 pub fn retain_paths(&mut self, files: &[fallow_types::discover::DiscoveredFile]) {
269 use rustc_hash::FxHashSet;
270 let current_paths: FxHashSet<String> = files
271 .iter()
272 .map(|f| f.path.to_string_lossy().to_string())
273 .collect();
274 self.entries.retain(|key, _| current_paths.contains(key));
275 }
276
277 /// Number of cached entries.
278 #[must_use]
279 pub fn len(&self) -> usize {
280 self.entries.len()
281 }
282
283 /// Whether cache is empty.
284 #[must_use]
285 pub fn is_empty(&self) -> bool {
286 self.entries.is_empty()
287 }
288}
289
290fn write_cache_gitignore(cache_dir: &Path) -> Result<(), String> {
291 std::fs::write(cache_dir.join(".gitignore"), "*\n")
292 .map_err(|e| format!("Failed to write cache .gitignore: {e}"))
293}
294
295/// Write `data` to `cache_file` atomically: write to a sibling `.tmp` file,
296/// best-effort fsync, then rename over the destination. Bounds the
297/// partial-truncate window that a plain `std::fs::write` exposes, which
298/// matters more once the eviction path encodes twice for large caches.
299fn atomic_write(cache_file: &Path, data: &[u8]) -> Result<(), String> {
300 let tmp_file = match cache_file.file_name() {
301 Some(name) => cache_file.with_file_name({
302 let mut s = name.to_os_string();
303 s.push(".tmp");
304 s
305 }),
306 None => return Err("Cache file path has no filename component".to_owned()),
307 };
308
309 {
310 use std::io::Write as _;
311 let mut f = std::fs::File::create(&tmp_file)
312 .map_err(|e| format!("Failed to create cache tmp: {e}"))?;
313 f.write_all(data)
314 .map_err(|e| format!("Failed to write cache tmp: {e}"))?;
315 // Best-effort fsync. Failures here are non-fatal because the
316 // rename below is still atomic on every platform fallow targets;
317 // the fsync just reduces the chance of post-power-loss corruption.
318 let _ = f.sync_all();
319 }
320
321 std::fs::rename(&tmp_file, cache_file)
322 .map_err(|e| format!("Failed to rename cache tmp into place: {e}"))?;
323 Ok(())
324}
325
326impl Default for CacheStore {
327 fn default() -> Self {
328 Self::new()
329 }
330}