agent-file-tools 0.42.0

Agent File Tools — tree-sitter powered code analysis for AI agents
Documentation
use rayon::prelude::*;
#[cfg(debug_assertions)]
use std::cell::Cell;
use std::fs;
use std::path::{Path, PathBuf};
#[cfg(debug_assertions)]
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};

pub const CONTENT_HASH_SIZE_CAP: u64 = 4 * 1024 * 1024;

#[cfg(debug_assertions)]
static STRICT_VERIFY_FILE_CALLS: AtomicUsize = AtomicUsize::new(0);
#[cfg(debug_assertions)]
thread_local! {
    static HASH_FILE_IF_SMALL_CALLS: Cell<usize> = const { Cell::new(0) };
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FileFreshness {
    pub mtime: SystemTime,
    pub size: u64,
    pub content_hash: blake3::Hash,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FreshnessVerdict {
    HotFresh,
    ContentFresh {
        new_mtime: SystemTime,
        new_size: u64,
    },
    Stale,
    Deleted,
}

pub fn hash_bytes(bytes: &[u8]) -> blake3::Hash {
    blake3::hash(bytes)
}

pub fn hash_file_if_small(path: &Path, size: u64) -> std::io::Result<Option<blake3::Hash>> {
    if size > CONTENT_HASH_SIZE_CAP {
        return Ok(None);
    }
    #[cfg(debug_assertions)]
    HASH_FILE_IF_SMALL_CALLS.with(|calls| calls.set(calls.get() + 1));
    fs::read(path).map(|bytes| Some(hash_bytes(&bytes)))
}

pub fn metadata_matches(path: &Path, cached: &FileFreshness) -> std::io::Result<bool> {
    let metadata = fs::metadata(path)?;
    let new_size = metadata.len();
    let new_mtime = metadata.modified().unwrap_or(UNIX_EPOCH);
    Ok(new_size == cached.size && new_mtime == cached.mtime)
}

pub fn zero_hash() -> blake3::Hash {
    blake3::Hash::from_bytes([0u8; 32])
}

pub fn collect(path: &Path) -> std::io::Result<FileFreshness> {
    let metadata = fs::metadata(path)?;
    let mtime = metadata.modified().unwrap_or(UNIX_EPOCH);
    let size = metadata.len();
    let content_hash = hash_file_if_small(path, size)?.unwrap_or_else(zero_hash);
    Ok(FileFreshness {
        mtime,
        size,
        content_hash,
    })
}

pub fn verify_file(path: &Path, cached: &FileFreshness) -> FreshnessVerdict {
    verify_file_inner(path, cached, false)
}

pub fn verify_file_strict(path: &Path, cached: &FileFreshness) -> FreshnessVerdict {
    #[cfg(debug_assertions)]
    STRICT_VERIFY_FILE_CALLS.fetch_add(1, Ordering::Relaxed);
    verify_file_inner(path, cached, true)
}

/// Verify semantic cache file freshness in a private bounded Rayon pool.
///
/// Do not use the global pool here: load-time strict verification can hash every
/// indexed file, and the semantic load/build already runs beside the bridge's
/// single dispatch thread. Match the half-cores/cap-8 policy used by the search
/// and callgraph cold-build pools.
pub(crate) fn verify_files_strict_bounded<K: Send>(
    files: Vec<(K, PathBuf, FileFreshness)>,
) -> Vec<(K, PathBuf, FreshnessVerdict)> {
    fn verify_one<K>(
        (key, path, cached): (K, PathBuf, FileFreshness),
    ) -> (K, PathBuf, FreshnessVerdict) {
        let verdict = verify_file_strict(&path, &cached);
        (key, path, verdict)
    }

    if files.len() <= 1 {
        return files.into_iter().map(verify_one::<K>).collect();
    }

    match rayon::ThreadPoolBuilder::new()
        .num_threads(strict_verify_pool_size())
        .thread_name(|index| format!("aft-semantic-verify-{index}"))
        .build()
    {
        Ok(pool) => pool.install(|| files.into_par_iter().map(verify_one::<K>).collect()),
        Err(_) => files.into_iter().map(verify_one::<K>).collect(),
    }
}

fn strict_verify_pool_size() -> usize {
    std::thread::available_parallelism()
        .map(|parallelism| parallelism.get())
        .unwrap_or(1)
        .div_ceil(2)
        .clamp(1, 8)
}

#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn reset_verify_file_strict_count_for_debug() {
    STRICT_VERIFY_FILE_CALLS.store(0, Ordering::Relaxed);
}

#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn verify_file_strict_count_for_debug() -> usize {
    STRICT_VERIFY_FILE_CALLS.load(Ordering::Relaxed)
}

#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn reset_hash_file_if_small_count_for_debug() {
    HASH_FILE_IF_SMALL_CALLS.with(|calls| calls.set(0));
}

#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn hash_file_if_small_count_for_debug() -> usize {
    HASH_FILE_IF_SMALL_CALLS.with(Cell::get)
}

fn verify_file_inner(
    path: &Path,
    cached: &FileFreshness,
    hash_matching_metadata: bool,
) -> FreshnessVerdict {
    let Ok(metadata) = fs::metadata(path) else {
        return FreshnessVerdict::Deleted;
    };
    let new_size = metadata.len();
    let new_mtime = metadata.modified().unwrap_or(UNIX_EPOCH);
    if new_size == cached.size && new_mtime == cached.mtime {
        if hash_matching_metadata {
            if new_size > CONTENT_HASH_SIZE_CAP || cached.content_hash == zero_hash() {
                return FreshnessVerdict::Stale;
            }
            return match hash_file_if_small(path, new_size) {
                Ok(Some(hash)) if hash == cached.content_hash => FreshnessVerdict::HotFresh,
                _ => FreshnessVerdict::Stale,
            };
        }
        return FreshnessVerdict::HotFresh;
    }
    if new_size != cached.size || new_size > CONTENT_HASH_SIZE_CAP {
        return FreshnessVerdict::Stale;
    }
    match hash_file_if_small(path, new_size) {
        Ok(Some(hash)) if hash == cached.content_hash => FreshnessVerdict::ContentFresh {
            new_mtime,
            new_size,
        },
        _ => FreshnessVerdict::Stale,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    fn write(path: &Path, bytes: &[u8]) {
        fs::write(path, bytes).unwrap();
    }

    /// Phase-3 gating benchmark: stat-all (non-strict verify_file) vs hash-all
    /// (strict verify_file_strict) cost across a real repo's file set, with NO
    /// file changed (the steady-state warm freshness pass). Decides Option B:
    /// if stat-all is cheap even at large file counts, replace the per-edit
    /// hash-all with stat-diff-first.
    ///
    ///   AFT_BENCH_REPO=/path/to/repo cargo test -p agent-file-tools --lib \
    ///     --release -- --ignored --nocapture --test-threads=1 \
    ///     freshness_stat_vs_hash_benchmark
    #[test]
    #[ignore = "manual benchmark; needs AFT_BENCH_REPO"]
    fn freshness_stat_vs_hash_benchmark() {
        use std::time::Instant;
        let Ok(repo) = std::env::var("AFT_BENCH_REPO") else {
            eprintln!("AFT_BENCH_REPO unset; skipping");
            return;
        };
        let root = std::path::PathBuf::from(&repo);
        let files: Vec<std::path::PathBuf> = crate::callgraph::walk_project_files(&root).collect();

        // Cold pass: collect freshness records (this is the cold-build cost, not
        // what we're optimizing — just needed to seed the warm comparison).
        let records: Vec<(std::path::PathBuf, FileFreshness)> = files
            .iter()
            .filter_map(|p| collect(p).ok().map(|f| (p.clone(), f)))
            .collect();

        eprintln!(
            "\n=== freshness stat-vs-hash benchmark ===\nrepo: {}\nfiles walked: {}  freshness records: {}",
            root.display(),
            files.len(),
            records.len()
        );

        // 3 iterations each, report medians. Interleave to share cache effects.
        let mut stat_ms = Vec::new();
        let mut hash_ms = Vec::new();
        for _ in 0..3 {
            let t = Instant::now();
            let mut stat_hot = 0usize;
            for (path, cached) in &records {
                // Non-strict: stat only; hashes ONLY if (mtime,size) differ.
                // With no file changed, this is pure stat — the Option B cost.
                if matches!(verify_file(path, cached), FreshnessVerdict::HotFresh) {
                    stat_hot += 1;
                }
            }
            stat_ms.push(t.elapsed().as_micros());

            let t = Instant::now();
            let mut hash_hot = 0usize;
            for (path, cached) in &records {
                // Strict: stat + content-hash every file (today's per-edit cost).
                if matches!(verify_file_strict(path, cached), FreshnessVerdict::HotFresh) {
                    hash_hot += 1;
                }
            }
            hash_ms.push(t.elapsed().as_micros());

            eprintln!("  iter: stat_hot={stat_hot} hash_hot={hash_hot}");
        }
        stat_ms.sort_unstable();
        hash_ms.sort_unstable();
        let stat_med = stat_ms[1] as f64 / 1000.0;
        let hash_med = hash_ms[1] as f64 / 1000.0;
        eprintln!(
            "SUMMARY  files={}  stat_all_median={:.2}ms  hash_all_median={:.2}ms  speedup={:.1}x",
            records.len(),
            stat_med,
            hash_med,
            hash_med / stat_med.max(0.001)
        );
    }

    #[test]
    fn hot_fresh_when_mtime_size_match() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::HotFresh);
    }

    #[test]
    fn strict_hashes_small_file_when_metadata_matches() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
        write(&path, b"alpha");
        filetime::set_file_mtime(&path, original_mtime).unwrap();
        let fresh = collect(&path).unwrap();

        assert_eq!(
            verify_file_strict(&path, &fresh),
            FreshnessVerdict::HotFresh
        );

        write(&path, b"bravo");
        filetime::set_file_mtime(&path, original_mtime).unwrap();

        // Stat-diff freshness intentionally treats same-size, same-mtime edits as
        // fresh; Tier-2's staleness ceiling heals this accepted residual with a
        // periodic strict pass instead of hashing every file on each edit.
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::HotFresh);
        assert_eq!(verify_file_strict(&path, &fresh), FreshnessVerdict::Stale);
    }

    #[test]
    fn strict_stale_when_large_file_hash_was_not_cached() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("big.bin");
        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
        let file = fs::File::create(&path).unwrap();
        file.set_len(CONTENT_HASH_SIZE_CAP + 1).unwrap();
        filetime::set_file_mtime(&path, original_mtime).unwrap();
        let fresh = collect(&path).unwrap();

        assert_eq!(fresh.size, CONTENT_HASH_SIZE_CAP + 1);
        assert_eq!(fresh.content_hash, zero_hash());
        // Non-strict stat-diff trusts unchanged metadata for over-cap files and
        // avoids strict's needless rescan of large unchanged files.
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::HotFresh);
        assert_eq!(verify_file_strict(&path, &fresh), FreshnessVerdict::Stale);
    }

    #[test]
    fn content_fresh_when_only_mtime_changes() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        let mut file = fs::OpenOptions::new().append(true).open(&path).unwrap();
        file.write_all(b"").unwrap();
        file.sync_all().unwrap();
        filetime::set_file_mtime(&path, filetime::FileTime::from_unix_time(1, 0)).unwrap();
        assert!(matches!(
            verify_file(&path, &fresh),
            FreshnessVerdict::ContentFresh { .. }
        ));
    }

    #[test]
    fn stale_when_size_changes() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        write(&path, b"different");
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::Stale);
    }

    #[test]
    fn deleted_when_missing() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("a.txt");
        write(&path, b"same");
        let fresh = collect(&path).unwrap();
        fs::remove_file(&path).unwrap();
        assert_eq!(verify_file(&path, &fresh), FreshnessVerdict::Deleted);
    }

    #[test]
    fn over_cap_hash_is_not_computed() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("big.bin");
        fs::write(&path, vec![0u8; CONTENT_HASH_SIZE_CAP as usize + 1]).unwrap();
        assert!(hash_file_if_small(&path, CONTENT_HASH_SIZE_CAP + 1)
            .unwrap()
            .is_none());
    }
}