Skip to main content

fallow_extract/cache/
store.rs

1//! Cache store: load, save, and query cached module data.
2
3use std::path::Path;
4
5use rustc_hash::FxHashMap;
6
7use bitcode::{Decode, Encode};
8
9use super::types::{
10    CACHE_VERSION, CachedModule, DEFAULT_CACHE_MAX_SIZE, EVICTION_SIGNIFICANT_BPS,
11    EVICTION_TARGET_BPS, EVICTION_TRIGGER_BPS,
12};
13
14/// Cached module information stored on disk.
15#[derive(Debug, Encode, Decode)]
16pub struct CacheStore {
17    version: u32,
18    /// Stable hash of extraction-affecting config fields.
19    config_hash: u64,
20    /// Map from file path to cached module data.
21    entries: FxHashMap<String, CachedModule>,
22}
23
24impl CacheStore {
25    /// Create a new empty cache.
26    #[must_use]
27    pub fn new() -> Self {
28        Self {
29            version: CACHE_VERSION,
30            config_hash: 0,
31            entries: FxHashMap::default(),
32        }
33    }
34
35    /// Load cache from disk.
36    ///
37    /// Returns `None` when the file is missing, too large, undecodable, or
38    /// built for a different `config_hash`.
39    #[must_use]
40    pub fn load(
41        cache_dir: &Path,
42        expected_config_hash: u64,
43        max_size_bytes: usize,
44    ) -> Option<Self> {
45        let cache_file = cache_dir.join("cache.bin");
46        let data = std::fs::read(&cache_file).ok()?;
47        let safety_ceiling = max_size_bytes.max(DEFAULT_CACHE_MAX_SIZE);
48        if data.len() > safety_ceiling {
49            tracing::warn!(
50                size_mb = data.len() / (1024 * 1024),
51                ceiling_mb = safety_ceiling / (1024 * 1024),
52                "Cache file exceeds safety ceiling, ignoring"
53            );
54            return None;
55        }
56        let store: Self = match bitcode::decode(&data) {
57            Ok(s) => s,
58            Err(_) => {
59                tracing::info!(
60                    "Cache format upgraded, rebuilding (one-time cost after version bump)"
61                );
62                return None;
63            }
64        };
65        if store.version != CACHE_VERSION {
66            tracing::info!("Cache format upgraded, rebuilding (one-time cost after version bump)");
67            return None;
68        }
69        if store.config_hash != expected_config_hash {
70            return None;
71        }
72        Some(store)
73    }
74
75    /// Save cache to disk with write-time size enforcement and atomic rename.
76    pub fn save(
77        &mut self,
78        cache_dir: &Path,
79        config_hash: u64,
80        max_size_bytes: usize,
81    ) -> Result<(), String> {
82        std::fs::create_dir_all(cache_dir)
83            .map_err(|e| format!("Failed to create cache dir: {e}"))?;
84        write_cache_gitignore(cache_dir)?;
85
86        self.config_hash = config_hash;
87        let initial_entries = self.entries.len();
88        let mut encoded = bitcode::encode(self);
89
90        let trigger = (max_size_bytes / 10_000).saturating_mul(EVICTION_TRIGGER_BPS);
91        if encoded.len() > trigger {
92            let target = (max_size_bytes / 10_000).saturating_mul(EVICTION_TARGET_BPS);
93            self.evict_lru_to_target(target);
94            encoded = bitcode::encode(self);
95            let evicted = initial_entries.saturating_sub(self.entries.len());
96            let final_size = encoded.len();
97            let significant_evicted =
98                initial_entries.saturating_mul(EVICTION_SIGNIFICANT_BPS) / 10_000;
99            if evicted >= significant_evicted && initial_entries > 0 {
100                tracing::info!(
101                    evicted_entries = evicted,
102                    remaining_entries = self.entries.len(),
103                    final_size_kb = final_size / 1024,
104                    max_size_kb = max_size_bytes / 1024,
105                    "Cache eviction: removed oldest entries to stay under cap"
106                );
107            } else {
108                tracing::debug!(
109                    evicted_entries = evicted,
110                    remaining_entries = self.entries.len(),
111                    final_size_kb = final_size / 1024,
112                    max_size_kb = max_size_bytes / 1024,
113                    "Cache eviction"
114                );
115            }
116        }
117
118        let cache_file = cache_dir.join("cache.bin");
119        atomic_write(&cache_file, &encoded)?;
120        Ok(())
121    }
122
123    /// Evict LRU entries until the re-encoded size is under `target_bytes`
124    /// or only one entry remains.
125    fn evict_lru_to_target(&mut self, target_bytes: usize) {
126        let mut order: Vec<(u64, String)> = self
127            .entries
128            .iter()
129            .map(|(k, v)| (v.last_access_secs, k.clone()))
130            .collect();
131        order.sort();
132
133        const BATCH: usize = 100;
134        let mut idx = 0;
135        while idx < order.len() {
136            let batch_end = (idx + BATCH).min(order.len());
137            for (_, key) in &order[idx..batch_end] {
138                if self.entries.len() <= 1 {
139                    break;
140                }
141                self.entries.remove(key);
142            }
143            idx = batch_end;
144
145            let encoded_size = bitcode::encode(self).len();
146            if encoded_size <= target_bytes || self.entries.len() <= 1 {
147                if encoded_size > target_bytes && self.entries.len() <= 1 {
148                    tracing::warn!(
149                        encoded_kb = encoded_size / 1024,
150                        target_kb = target_bytes / 1024,
151                        "Single cache entry exceeds configured max; cache will overshoot the cap"
152                    );
153                }
154                return;
155            }
156        }
157    }
158
159    /// Look up a cached module by path and content hash.
160    /// Returns None if not cached or hash mismatch.
161    #[must_use]
162    pub fn get(&self, path: &Path, content_hash: u64) -> Option<&CachedModule> {
163        let key = path.to_string_lossy();
164        let entry = self.entries.get(key.as_ref())?;
165        if entry.content_hash == content_hash {
166            Some(entry)
167        } else {
168            None
169        }
170    }
171
172    /// Insert or update a cached module.
173    pub fn insert(&mut self, path: &Path, module: CachedModule) {
174        let key = path.to_string_lossy().into_owned();
175        self.entries.insert(key, module);
176    }
177
178    /// Fast cache lookup using only file metadata (mtime + size).
179    #[must_use]
180    pub fn get_by_metadata(
181        &self,
182        path: &Path,
183        mtime_secs: u64,
184        file_size: u64,
185    ) -> Option<&CachedModule> {
186        let key = path.to_string_lossy();
187        let entry = self.entries.get(key.as_ref())?;
188        if entry.mtime_secs == mtime_secs && entry.file_size == file_size && mtime_secs > 0 {
189            Some(entry)
190        } else {
191            None
192        }
193    }
194
195    /// Look up a cached module by path only (ignoring hash).
196    #[must_use]
197    pub fn get_by_path_only(&self, path: &Path) -> Option<&CachedModule> {
198        let key = path.to_string_lossy();
199        self.entries.get(key.as_ref())
200    }
201
202    /// Remove cache entries for files that are no longer in the project.
203    pub fn retain_paths(&mut self, files: &[fallow_types::discover::DiscoveredFile]) {
204        use rustc_hash::FxHashSet;
205        let current_paths: FxHashSet<String> = files
206            .iter()
207            .map(|f| f.path.to_string_lossy().to_string())
208            .collect();
209        self.entries.retain(|key, _| current_paths.contains(key));
210    }
211
212    /// Number of cached entries.
213    #[must_use]
214    pub fn len(&self) -> usize {
215        self.entries.len()
216    }
217
218    /// Whether the cache is empty.
219    #[must_use]
220    pub fn is_empty(&self) -> bool {
221        self.entries.is_empty()
222    }
223}
224
/// Write a `.gitignore` containing the single pattern `*` into `cache_dir`.
///
/// The file is (re)written unconditionally on every call.
///
/// # Errors
///
/// Returns a message describing the I/O failure when the file cannot be
/// written (for example because `cache_dir` does not exist).
fn write_cache_gitignore(cache_dir: &Path) -> Result<(), String> {
    let gitignore_path = cache_dir.join(".gitignore");
    match std::fs::write(gitignore_path, "*\n") {
        Ok(()) => Ok(()),
        Err(io_err) => Err(format!("Failed to write cache .gitignore: {io_err}")),
    }
}
229
/// Write `data` atomically via a sibling `.tmp` file, best-effort fsync, then rename.
///
/// The temp file is named after `cache_file` with `.tmp` appended and lives
/// in the same directory, so the final rename stays on one filesystem. If
/// any step after choosing the temp path fails, the temp file is removed
/// (best-effort) so a failed save does not leave stale files behind.
///
/// # Errors
///
/// Returns a message when `cache_file` has no filename component, or when
/// the temp file cannot be created, written, or renamed into place.
fn atomic_write(cache_file: &Path, data: &[u8]) -> Result<(), String> {
    let tmp_file = match cache_file.file_name() {
        Some(name) => {
            let mut tmp_name = name.to_os_string();
            tmp_name.push(".tmp");
            cache_file.with_file_name(tmp_name)
        }
        None => return Err("Cache file path has no filename component".to_owned()),
    };

    let outcome = write_tmp_then_rename(&tmp_file, cache_file, data);
    if outcome.is_err() {
        // Best-effort cleanup; the original error is what the caller needs.
        let _ = std::fs::remove_file(&tmp_file);
    }
    outcome
}

/// Fill `tmp_file` with `data`, sync it best-effort, then rename it over `cache_file`.
fn write_tmp_then_rename(tmp_file: &Path, cache_file: &Path, data: &[u8]) -> Result<(), String> {
    {
        use std::io::Write as _;
        let mut f = std::fs::File::create(tmp_file)
            .map_err(|e| format!("Failed to create cache tmp: {e}"))?;
        f.write_all(data)
            .map_err(|e| format!("Failed to write cache tmp: {e}"))?;
        // Durability is best-effort: a failed fsync must not fail the save.
        let _ = f.sync_all();
        // `f` is dropped here so the handle is closed before the rename.
    }

    std::fs::rename(tmp_file, cache_file)
        .map_err(|e| format!("Failed to rename cache tmp into place: {e}"))
}
254
255impl Default for CacheStore {
256    fn default() -> Self {
257        Self::new()
258    }
259}