Skip to main content

aft/
cache_freshness.rs

1use rayon::prelude::*;
2#[cfg(debug_assertions)]
3use std::cell::Cell;
4use std::fs;
5use std::path::{Path, PathBuf};
6#[cfg(debug_assertions)]
7use std::sync::atomic::{AtomicUsize, Ordering};
8use std::time::{SystemTime, UNIX_EPOCH};
9
/// Largest file size in bytes (4 MiB) that [`hash_file_if_small`] will read and hash.
///
/// Files above this cap are never content-hashed: [`collect`] records
/// [`zero_hash`] for them and the verifiers never confirm them by content.
pub const CONTENT_HASH_SIZE_CAP: u64 = 4 * 1024 * 1024;
11
// Debug-only, process-wide count of `verify_file_strict` invocations.
#[cfg(debug_assertions)]
static STRICT_VERIFY_FILE_CALLS: AtomicUsize = AtomicUsize::new(0);
#[cfg(debug_assertions)]
thread_local! {
    // Debug-only, per-thread count of `hash_file_if_small` calls that passed
    // the size cap and went on to read the file.
    static HASH_FILE_IF_SMALL_CALLS: Cell<usize> = const { Cell::new(0) };
}
18
/// Snapshot of the signals used to decide whether a cached file is still fresh.
///
/// Produced by [`collect`] and compared against the file on disk by
/// [`verify_file`] / [`verify_file_strict`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FileFreshness {
    /// Last-modified time; [`collect`] stores `UNIX_EPOCH` when the
    /// modification time cannot be read from the metadata.
    pub mtime: SystemTime,
    /// File length in bytes.
    pub size: u64,
    /// BLAKE3 hash of the file contents, or [`zero_hash`] when the file was
    /// larger than [`CONTENT_HASH_SIZE_CAP`] at collection time.
    pub content_hash: blake3::Hash,
}
25
/// Result of comparing a file on disk against a cached [`FileFreshness`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FreshnessVerdict {
    /// Size and mtime both match the cache (and, under strict verification,
    /// the content hash matches as well).
    HotFresh,
    /// The mtime differs but the size and content hash are unchanged; carries
    /// the metadata observed on disk.
    ContentFresh {
        new_mtime: SystemTime,
        new_size: u64,
    },
    /// The file differs from the cache, or it could not be confirmed unchanged.
    Stale,
    /// The file's metadata could not be read.
    Deleted,
}
36
/// Returns the BLAKE3 hash of `bytes`.
pub fn hash_bytes(bytes: &[u8]) -> blake3::Hash {
    blake3::hash(bytes)
}
40
41pub fn hash_file_if_small(path: &Path, size: u64) -> std::io::Result<Option<blake3::Hash>> {
42    if size > CONTENT_HASH_SIZE_CAP {
43        return Ok(None);
44    }
45    #[cfg(debug_assertions)]
46    HASH_FILE_IF_SMALL_CALLS.with(|calls| calls.set(calls.get() + 1));
47    fs::read(path).map(|bytes| Some(hash_bytes(&bytes)))
48}
49
50pub fn metadata_matches(path: &Path, cached: &FileFreshness) -> std::io::Result<bool> {
51    let metadata = fs::metadata(path)?;
52    let new_size = metadata.len();
53    let new_mtime = metadata.modified().unwrap_or(UNIX_EPOCH);
54    Ok(new_size == cached.size && new_mtime == cached.mtime)
55}
56
57pub fn zero_hash() -> blake3::Hash {
58    blake3::Hash::from_bytes([0u8; 32])
59}
60
61pub fn collect(path: &Path) -> std::io::Result<FileFreshness> {
62    let metadata = fs::metadata(path)?;
63    let mtime = metadata.modified().unwrap_or(UNIX_EPOCH);
64    let size = metadata.len();
65    let content_hash = hash_file_if_small(path, size)?.unwrap_or_else(zero_hash);
66    Ok(FileFreshness {
67        mtime,
68        size,
69        content_hash,
70    })
71}
72
73pub fn verify_file(path: &Path, cached: &FileFreshness) -> FreshnessVerdict {
74    verify_file_inner(path, cached, false)
75}
76
77pub fn verify_file_strict(path: &Path, cached: &FileFreshness) -> FreshnessVerdict {
78    #[cfg(debug_assertions)]
79    STRICT_VERIFY_FILE_CALLS.fetch_add(1, Ordering::Relaxed);
80    verify_file_inner(path, cached, true)
81}
82
83/// Verify semantic cache file freshness in a private bounded Rayon pool.
84///
85/// Do not use the global pool here: load-time strict verification can hash every
86/// indexed file, and the semantic load/build already runs beside the bridge's
87/// single dispatch thread. Match the half-cores/cap-8 policy used by the search
88/// and callgraph cold-build pools.
89pub(crate) fn verify_files_strict_bounded<K: Send>(
90    files: Vec<(K, PathBuf, FileFreshness)>,
91) -> Vec<(K, PathBuf, FreshnessVerdict)> {
92    fn verify_one<K>(
93        (key, path, cached): (K, PathBuf, FileFreshness),
94    ) -> (K, PathBuf, FreshnessVerdict) {
95        let verdict = verify_file_strict(&path, &cached);
96        (key, path, verdict)
97    }
98
99    if files.len() <= 1 {
100        return files.into_iter().map(verify_one::<K>).collect();
101    }
102
103    match rayon::ThreadPoolBuilder::new()
104        .num_threads(strict_verify_pool_size())
105        .thread_name(|index| format!("aft-semantic-verify-{index}"))
106        .build()
107    {
108        Ok(pool) => pool.install(|| files.into_par_iter().map(verify_one::<K>).collect()),
109        Err(_) => files.into_iter().map(verify_one::<K>).collect(),
110    }
111}
112
/// Returns the thread count for the strict-verify pool: half the available
/// parallelism (rounded up), kept within `1..=8`.
///
/// Falls back to a single core when the available parallelism cannot be
/// determined.
fn strict_verify_pool_size() -> usize {
    let cores = match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    };
    let half_rounded_up = cores.div_ceil(2);
    half_rounded_up.clamp(1, 8)
}
120
/// Resets the debug-only count of [`verify_file_strict`] calls to zero.
#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn reset_verify_file_strict_count_for_debug() {
    STRICT_VERIFY_FILE_CALLS.store(0, Ordering::Relaxed);
}
126
/// Returns the debug-only count of [`verify_file_strict`] calls made since the
/// last reset.
#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn verify_file_strict_count_for_debug() -> usize {
    STRICT_VERIFY_FILE_CALLS.load(Ordering::Relaxed)
}
132
133#[cfg(debug_assertions)]
134#[doc(hidden)]
135pub fn reset_hash_file_if_small_count_for_debug() {
136    HASH_FILE_IF_SMALL_CALLS.with(|calls| calls.set(0));
137}
138
139#[cfg(debug_assertions)]
140#[doc(hidden)]
141pub fn hash_file_if_small_count_for_debug() -> usize {
142    HASH_FILE_IF_SMALL_CALLS.with(Cell::get)
143}
144
145fn verify_file_inner(
146    path: &Path,
147    cached: &FileFreshness,
148    hash_matching_metadata: bool,
149) -> FreshnessVerdict {
150    let Ok(metadata) = fs::metadata(path) else {
151        return FreshnessVerdict::Deleted;
152    };
153    let new_size = metadata.len();
154    let new_mtime = metadata.modified().unwrap_or(UNIX_EPOCH);
155    if new_size == cached.size && new_mtime == cached.mtime {
156        if hash_matching_metadata {
157            if new_size > CONTENT_HASH_SIZE_CAP || cached.content_hash == zero_hash() {
158                return FreshnessVerdict::Stale;
159            }
160            return match hash_file_if_small(path, new_size) {
161                Ok(Some(hash)) if hash == cached.content_hash => FreshnessVerdict::HotFresh,
162                _ => FreshnessVerdict::Stale,
163            };
164        }
165        return FreshnessVerdict::HotFresh;
166    }
167    if new_size != cached.size || new_size > CONTENT_HASH_SIZE_CAP {
168        return FreshnessVerdict::Stale;
169    }
170    match hash_file_if_small(path, new_size) {
171        Ok(Some(hash)) if hash == cached.content_hash => FreshnessVerdict::ContentFresh {
172            new_mtime,
173            new_size,
174        },
175        _ => FreshnessVerdict::Stale,
176    }
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes `bytes` to `path`, panicking on failure.
    fn write(path: &Path, bytes: &[u8]) {
        fs::write(path, bytes).unwrap();
    }

    /// Phase-3 gating benchmark: stat-all (non-strict verify_file) vs hash-all
    /// (strict verify_file_strict) cost across a real repo's file set, with NO
    /// file changed (the steady-state warm freshness pass). Decides Option B:
    /// if stat-all is cheap even at large file counts, replace the per-edit
    /// hash-all with stat-diff-first.
    ///
    ///   AFT_BENCH_REPO=/path/to/repo cargo test -p agent-file-tools --lib \
    ///     --release -- --ignored --nocapture --test-threads=1 \
    ///     freshness_stat_vs_hash_benchmark
    #[test]
    #[ignore = "manual benchmark; needs AFT_BENCH_REPO"]
    fn freshness_stat_vs_hash_benchmark() {
        use std::time::Instant;
        let Ok(repo) = std::env::var("AFT_BENCH_REPO") else {
            eprintln!("AFT_BENCH_REPO unset; skipping");
            return;
        };
        let root = std::path::PathBuf::from(&repo);
        let files: Vec<std::path::PathBuf> = crate::callgraph::walk_project_files(&root).collect();

        // Cold pass: collect freshness records (this is the cold-build cost, not
        // what we're optimizing — just needed to seed the warm comparison).
        let records: Vec<(std::path::PathBuf, FileFreshness)> = files
            .iter()
            .filter_map(|p| collect(p).ok().map(|f| (p.clone(), f)))
            .collect();

        eprintln!(
            "\n=== freshness stat-vs-hash benchmark ===\nrepo: {}\nfiles walked: {}  freshness records: {}",
            root.display(),
            files.len(),
            records.len()
        );

        // 3 iterations each, report medians. Interleave to share cache effects.
        // Samples are recorded in microseconds and converted to milliseconds
        // only for the summary line.
        let mut stat_us = Vec::new();
        let mut hash_us = Vec::new();
        for _ in 0..3 {
            let t = Instant::now();
            let mut stat_hot = 0usize;
            for (path, cached) in &records {
                // Non-strict: stat only; hashes ONLY if (mtime,size) differ.
                // With no file changed, this is pure stat — the Option B cost.
                if matches!(verify_file(path, cached), FreshnessVerdict::HotFresh) {
                    stat_hot += 1;
                }
            }
            stat_us.push(t.elapsed().as_micros());

            let t = Instant::now();
            let mut hash_hot = 0usize;
            for (path, cached) in &records {
                // Strict: stat + content-hash every file (today's per-edit cost).
                if matches!(verify_file_strict(path, cached), FreshnessVerdict::HotFresh) {
                    hash_hot += 1;
                }
            }
            hash_us.push(t.elapsed().as_micros());

            eprintln!("  iter: stat_hot={stat_hot} hash_hot={hash_hot}");
        }
        stat_us.sort_unstable();
        hash_us.sort_unstable();
        // Index 1 is the median of the three sorted samples.
        let stat_med = stat_us[1] as f64 / 1000.0;
        let hash_med = hash_us[1] as f64 / 1000.0;
        eprintln!(
            "SUMMARY  files={}  stat_all_median={:.2}ms  hash_all_median={:.2}ms  speedup={:.1}x",
            records.len(),
            stat_med,
            hash_med,
            hash_med / stat_med.max(0.001)
        );
    }

    /// An untouched file verifies as hot-fresh against its own record.
    #[test]
    fn hot_fresh_when_mtime_size_match() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::HotFresh);
    }

    /// Strict verification catches a same-size, same-mtime content change that
    /// the stat-only check accepts.
    #[test]
    fn strict_hashes_small_file_when_metadata_matches() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
        write(&path, b"alpha");
        filetime::set_file_mtime(&path, original_mtime).unwrap();
        let fresh = collect(&path).unwrap();

        assert_eq!(
            verify_file_strict(&path, &fresh),
            FreshnessVerdict::HotFresh
        );

        write(&path, b"bravo");
        filetime::set_file_mtime(&path, original_mtime).unwrap();

        // Stat-diff freshness intentionally treats same-size, same-mtime edits as
        // fresh; Tier-2's staleness ceiling heals this accepted residual with a
        // periodic strict pass instead of hashing every file on each edit.
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::HotFresh);
        assert_eq!(verify_file_strict(&path, &fresh), FreshnessVerdict::Stale);
    }

    /// Over-cap files carry the placeholder hash, so strict verification
    /// cannot confirm them and reports them stale.
    #[test]
    fn strict_stale_when_large_file_hash_was_not_cached() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("big.bin");
        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
        let file = fs::File::create(&path).unwrap();
        file.set_len(CONTENT_HASH_SIZE_CAP + 1).unwrap();
        filetime::set_file_mtime(&path, original_mtime).unwrap();
        let fresh = collect(&path).unwrap();

        assert_eq!(fresh.size, CONTENT_HASH_SIZE_CAP + 1);
        assert_eq!(fresh.content_hash, zero_hash());
        // Non-strict stat-diff trusts unchanged metadata for over-cap files and
        // avoids strict's needless rescan of large unchanged files.
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::HotFresh);
        assert_eq!(verify_file_strict(&path, &fresh), FreshnessVerdict::Stale);
    }

    /// A changed mtime with identical content is reported as content-fresh.
    #[test]
    fn content_fresh_when_only_mtime_changes() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        let mut file = fs::OpenOptions::new().append(true).open(&path).unwrap();
        file.write_all(b"").unwrap();
        file.sync_all().unwrap();
        filetime::set_file_mtime(&path, filetime::FileTime::from_unix_time(1, 0)).unwrap();
        assert!(matches!(
            verify_file(&path, &fresh),
            FreshnessVerdict::ContentFresh { .. }
        ));
    }

    /// A size change is stale without needing a hash comparison.
    #[test]
    fn stale_when_size_changes() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        write(&path, b"different");
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::Stale);
    }

    /// A missing file is reported as deleted.
    #[test]
    fn deleted_when_missing() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        fs::remove_file(&path).unwrap();
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::Deleted);
    }

    /// `hash_file_if_small` declines to hash when the reported size is over
    /// the cap.
    #[test]
    fn over_cap_hash_is_not_computed() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("big.bin");
        // Extend the file with `set_len` rather than writing a 4 MiB buffer:
        // the size argument alone decides the outcome, and this avoids a
        // `u64 as usize` cast.
        let file = fs::File::create(&path).unwrap();
        file.set_len(CONTENT_HASH_SIZE_CAP + 1).unwrap();
        assert!(hash_file_if_small(&path, CONTENT_HASH_SIZE_CAP + 1)
            .unwrap()
            .is_none());
    }
}