Skip to main content

fallow_extract/cache/
store.rs

1//! Cache store: load, save, and query cached module data.
2
3use std::path::Path;
4
5use fallow_types::source_fingerprint::SourceFingerprint;
6use rustc_hash::FxHashMap;
7
8use bitcode::{Decode, Encode};
9
10use super::types::{
11    CACHE_VERSION, CachedModule, DEFAULT_CACHE_MAX_SIZE, EVICTION_SIGNIFICANT_BPS,
12    EVICTION_TARGET_BPS, EVICTION_TRIGGER_BPS,
13};
14
15/// Cached module information stored on disk.
16#[derive(Debug, Encode, Decode)]
17pub struct CacheStore {
18    version: u32,
19    /// Stable hash of extraction-affecting config fields.
20    config_hash: u64,
21    /// Map from file path to cached module data.
22    entries: FxHashMap<String, CachedModule>,
23}
24
25impl CacheStore {
26    /// Create a new empty cache.
27    #[must_use]
28    pub fn new() -> Self {
29        Self {
30            version: CACHE_VERSION,
31            config_hash: 0,
32            entries: FxHashMap::default(),
33        }
34    }
35
36    /// Load cache from disk.
37    ///
38    /// Returns `None` when the file is missing, too large, undecodable, or
39    /// built for a different `config_hash`.
40    #[must_use]
41    pub fn load(
42        cache_dir: &Path,
43        expected_config_hash: u64,
44        max_size_bytes: usize,
45    ) -> Option<Self> {
46        let cache_file = cache_dir.join("cache.bin");
47        let data = std::fs::read(&cache_file).ok()?;
48        let safety_ceiling = max_size_bytes.max(DEFAULT_CACHE_MAX_SIZE);
49        if data.len() > safety_ceiling {
50            tracing::warn!(
51                size_mb = data.len() / (1024 * 1024),
52                ceiling_mb = safety_ceiling / (1024 * 1024),
53                "Cache file exceeds safety ceiling, ignoring"
54            );
55            return None;
56        }
57        let store: Self = match bitcode::decode(&data) {
58            Ok(s) => s,
59            Err(_) => {
60                tracing::info!(
61                    "Cache format upgraded, rebuilding (one-time cost after version bump)"
62                );
63                return None;
64            }
65        };
66        if store.version != CACHE_VERSION {
67            tracing::info!("Cache format upgraded, rebuilding (one-time cost after version bump)");
68            return None;
69        }
70        if store.config_hash != expected_config_hash {
71            return None;
72        }
73        Some(store)
74    }
75
76    /// Save cache to disk with write-time size enforcement and atomic rename.
77    pub fn save(
78        &mut self,
79        cache_dir: &Path,
80        config_hash: u64,
81        max_size_bytes: usize,
82    ) -> Result<(), String> {
83        std::fs::create_dir_all(cache_dir)
84            .map_err(|e| format!("Failed to create cache dir: {e}"))?;
85        write_cache_gitignore(cache_dir)?;
86
87        self.config_hash = config_hash;
88        let initial_entries = self.entries.len();
89        let mut encoded = bitcode::encode(self);
90
91        let trigger = (max_size_bytes / 10_000).saturating_mul(EVICTION_TRIGGER_BPS);
92        if encoded.len() > trigger {
93            let target = (max_size_bytes / 10_000).saturating_mul(EVICTION_TARGET_BPS);
94            self.evict_lru_to_target(target);
95            encoded = bitcode::encode(self);
96            let evicted = initial_entries.saturating_sub(self.entries.len());
97            let final_size = encoded.len();
98            let significant_evicted =
99                initial_entries.saturating_mul(EVICTION_SIGNIFICANT_BPS) / 10_000;
100            if evicted >= significant_evicted && initial_entries > 0 {
101                tracing::info!(
102                    evicted_entries = evicted,
103                    remaining_entries = self.entries.len(),
104                    final_size_kb = final_size / 1024,
105                    max_size_kb = max_size_bytes / 1024,
106                    "Cache eviction: removed oldest entries to stay under cap"
107                );
108            } else {
109                tracing::debug!(
110                    evicted_entries = evicted,
111                    remaining_entries = self.entries.len(),
112                    final_size_kb = final_size / 1024,
113                    max_size_kb = max_size_bytes / 1024,
114                    "Cache eviction"
115                );
116            }
117        }
118
119        let cache_file = cache_dir.join("cache.bin");
120        atomic_write(&cache_file, &encoded)?;
121        Ok(())
122    }
123
124    /// Evict LRU entries until the re-encoded size is under `target_bytes`
125    /// or only one entry remains.
126    fn evict_lru_to_target(&mut self, target_bytes: usize) {
127        let mut order: Vec<(u64, String)> = self
128            .entries
129            .iter()
130            .map(|(k, v)| (v.last_access_secs, k.clone()))
131            .collect();
132        order.sort();
133
134        const BATCH: usize = 100;
135        let mut idx = 0;
136        while idx < order.len() {
137            let batch_end = (idx + BATCH).min(order.len());
138            for (_, key) in &order[idx..batch_end] {
139                if self.entries.len() <= 1 {
140                    break;
141                }
142                self.entries.remove(key);
143            }
144            idx = batch_end;
145
146            let encoded_size = bitcode::encode(self).len();
147            if encoded_size <= target_bytes || self.entries.len() <= 1 {
148                if encoded_size > target_bytes && self.entries.len() <= 1 {
149                    tracing::warn!(
150                        encoded_kb = encoded_size / 1024,
151                        target_kb = target_bytes / 1024,
152                        "Single cache entry exceeds configured max; cache will overshoot the cap"
153                    );
154                }
155                return;
156            }
157        }
158    }
159
160    /// Look up a cached module by path and content hash.
161    /// Returns None if not cached or hash mismatch.
162    #[must_use]
163    pub fn get(&self, path: &Path, content_hash: u64) -> Option<&CachedModule> {
164        let key = path.to_string_lossy();
165        let entry = self.entries.get(key.as_ref())?;
166        if entry.content_hash == content_hash {
167            Some(entry)
168        } else {
169            None
170        }
171    }
172
173    /// Insert or update a cached module.
174    pub fn insert(&mut self, path: &Path, module: CachedModule) {
175        let key = path.to_string_lossy().into_owned();
176        self.entries.insert(key, module);
177    }
178
179    /// Fast cache lookup using only file metadata (mtime + size).
180    #[must_use]
181    pub fn get_by_metadata(
182        &self,
183        path: &Path,
184        fingerprint: SourceFingerprint,
185    ) -> Option<&CachedModule> {
186        let key = path.to_string_lossy();
187        let entry = self.entries.get(key.as_ref())?;
188        if entry.source_fingerprint() == fingerprint && fingerprint.has_known_mtime() {
189            Some(entry)
190        } else {
191            None
192        }
193    }
194
195    /// Look up a cached module by path only (ignoring hash).
196    #[must_use]
197    pub fn get_by_path_only(&self, path: &Path) -> Option<&CachedModule> {
198        let key = path.to_string_lossy();
199        self.entries.get(key.as_ref())
200    }
201
202    /// Remove cache entries for files that are no longer in the project.
203    ///
204    /// Returns `true` when any entry was removed.
205    pub fn retain_paths(&mut self, files: &[fallow_types::discover::DiscoveredFile]) -> bool {
206        use rustc_hash::FxHashSet;
207        let current_paths: FxHashSet<String> = files
208            .iter()
209            .map(|f| f.path.to_string_lossy().to_string())
210            .collect();
211        let before = self.entries.len();
212        self.entries.retain(|key, _| current_paths.contains(key));
213        self.entries.len() != before
214    }
215
216    /// Number of cached entries.
217    #[must_use]
218    pub fn len(&self) -> usize {
219        self.entries.len()
220    }
221
222    /// Whether the cache is empty.
223    #[must_use]
224    pub fn is_empty(&self) -> bool {
225        self.entries.is_empty()
226    }
227}
228
/// Write a `.gitignore` containing a single `*` line into `cache_dir`,
/// overwriting any existing one.
///
/// # Errors
///
/// Returns a message describing the I/O failure when the file cannot be written.
fn write_cache_gitignore(cache_dir: &Path) -> Result<(), String> {
    let gitignore_path = cache_dir.join(".gitignore");
    if let Err(e) = std::fs::write(gitignore_path, "*\n") {
        return Err(format!("Failed to write cache .gitignore: {e}"));
    }
    Ok(())
}
233
/// Write `data` to `cache_file` via a sibling `<name>.tmp` file, a
/// best-effort fsync, and a rename over the destination.
///
/// If any step fails, the temporary file is removed (best-effort) so a
/// failed save does not leave a stray `.tmp` behind.
///
/// # Errors
///
/// Returns a message when `cache_file` has no filename component, or when
/// creating, writing, or renaming the temporary file fails.
fn atomic_write(cache_file: &Path, data: &[u8]) -> Result<(), String> {
    use std::io::Write as _;

    let name = cache_file
        .file_name()
        .ok_or_else(|| "Cache file path has no filename component".to_owned())?;
    let mut tmp_name = name.to_os_string();
    tmp_name.push(".tmp");
    let tmp_file = cache_file.with_file_name(tmp_name);

    let outcome = std::fs::File::create(&tmp_file)
        .map_err(|e| format!("Failed to create cache tmp: {e}"))
        .and_then(|mut f| {
            f.write_all(data)
                .map_err(|e| format!("Failed to write cache tmp: {e}"))?;
            // Durability is best-effort: a failed fsync must not fail the save.
            let _ = f.sync_all();
            // `f` is dropped here, closing the handle before the rename.
            Ok(())
        })
        .and_then(|()| {
            std::fs::rename(&tmp_file, cache_file)
                .map_err(|e| format!("Failed to rename cache tmp into place: {e}"))
        });

    if outcome.is_err() {
        // Nothing useful can be done if cleanup fails too; the original
        // error is what the caller needs to see.
        let _ = std::fs::remove_file(&tmp_file);
    }
    outcome
}
258
259impl Default for CacheStore {
260    fn default() -> Self {
261        Self::new()
262    }
263}