agent-source-repository 0.1.0

Agent Source Repository: local context registry for coding agents
Documentation
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::io::{BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH;

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};

use crate::bm25::Bm25Index;
use crate::graph::DependencyGraph;
use crate::index::build::MAX_FILE_BYTES;
use crate::model::Chunk;
use crate::source_files::{filter_extensions, walk_source_files};

/// Version tag of the on-disk cache format.
///
/// It is written into every `CachePayload`, compared against the stored value
/// in `load_bm25`, and mixed into the cache file name by `cache_hash`.
const CACHE_VERSION: u32 = 7;

/// Snapshot of a single source file used to detect whether a cached index is stale.
///
/// Manifests are compared with `==` in `load_bm25`, so a difference in any
/// field causes the cache to be rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct FileFingerprint {
    /// Path relative to the indexed root, with `\` replaced by `/`.
    path: String,
    /// File size in bytes as reported by the file's metadata.
    len: u64,
    /// Whole seconds of the modification time, measured from the Unix epoch.
    modified_secs: u64,
    /// Sub-second nanosecond part of the modification time.
    modified_nanos: u32,
    /// `stable_hash` of the complete file contents.
    content_hash: u64,
}

/// Everything that is serialized to (and read back from) the JSON cache file.
#[derive(Serialize, Deserialize)]
struct CachePayload {
    /// Cache format version; must equal `CACHE_VERSION` for the cache to be used.
    version: u32,
    /// Normalized root path (see `root_key`) the index was built for.
    root: String,
    /// Whether the index was built with text files included.
    include_text_files: bool,
    /// Manifest of the files at the time the index was stored.
    files: Vec<FileFingerprint>,
    /// The serialized BM25 index.
    bm25_index: Bm25Index,
    /// The chunks stored alongside the index.
    chunks: Vec<Chunk>,
    /// The dependency graph stored alongside the index.
    graph: DependencyGraph,
}

/// Index data handed back by `load_bm25` when the on-disk cache is still valid.
pub(crate) struct CachedBm25Index {
    /// BM25 index restored from the cache file.
    pub(crate) bm25_index: Bm25Index,
    /// Chunks restored from the cache file.
    pub(crate) chunks: Vec<Chunk>,
    /// Dependency graph restored from the cache file; `load_bm25` calls
    /// `hydrate_sources_from_root` and `resolve_dependencies` on it before returning.
    pub(crate) graph: DependencyGraph,
}

/// Builds the fingerprint manifest for every source file found under `root`.
///
/// Files are discovered with `walk_source_files` using the extension set from
/// `filter_extensions(None, include_text_files)`. Any file for which
/// `fingerprint_file` returns `None` is left out of the manifest.
pub(crate) fn build_manifest(root: &Path, include_text_files: bool) -> Vec<FileFingerprint> {
    let extensions = filter_extensions(None, include_text_files);

    let mut manifest = Vec::new();
    for source in walk_source_files(root, &extensions, None) {
        if let Some(fingerprint) = fingerprint_file(root, &source) {
            manifest.push(fingerprint);
        }
    }
    manifest
}

pub(crate) fn load_bm25(root: &Path, include_text_files: bool) -> Option<CachedBm25Index> {
    let cache_path = cache_path(root, include_text_files)?;
    let file = fs::File::open(cache_path).ok()?;
    let mut payload: CachePayload = serde_json::from_reader(BufReader::new(file)).ok()?;
    let current_files = build_manifest(root, include_text_files);
    let root_key = root_key(root);

    if payload.version != CACHE_VERSION
        || payload.root != root_key
        || payload.include_text_files != include_text_files
        || payload.files != current_files
    {
        return None;
    }

    payload.graph.hydrate_sources_from_root(root);
    payload.graph.resolve_dependencies();
    Some(CachedBm25Index {
        bm25_index: payload.bm25_index,
        chunks: payload.chunks,
        graph: payload.graph,
    })
}

pub(crate) fn store_bm25(
    root: &Path,
    include_text_files: bool,
    files: Vec<FileFingerprint>,
    bm25_index: &Bm25Index,
    chunks: &[Chunk],
    graph: &DependencyGraph,
) -> Result<()> {
    let cache_path =
        cache_path(root, include_text_files).context("failed to resolve cache path")?;
    let payload = CachePayload {
        version: CACHE_VERSION,
        root: root_key(root),
        include_text_files,
        files,
        bm25_index: bm25_index.clone(),
        chunks: chunks.to_vec(),
        graph: graph.clone(),
    };
    if let Some(parent) = cache_path.parent() {
        fs::create_dir_all(parent).context("failed to create index cache directory")?;
    }

    let tmp_path = cache_path.with_extension(format!("json.{}.tmp", std::process::id()));
    let tmp_file = fs::File::create(&tmp_path).context("failed to create temporary index cache")?;
    let mut writer = BufWriter::new(tmp_file);
    serde_json::to_writer(&mut writer, &payload).context("failed to serialize index cache")?;
    writer.flush().context("failed to flush index cache")?;
    fs::rename(&tmp_path, cache_path).context("failed to replace index cache")?;
    Ok(())
}

/// Computes the fingerprint of the file at `path`, expressed relative to `root`.
///
/// Returns `None` when the metadata cannot be read, the file is larger than
/// `MAX_FILE_BYTES`, the modification time is unavailable or lies before the
/// Unix epoch, the contents cannot be read, or `path` is not located under `root`.
fn fingerprint_file(root: &Path, path: &Path) -> Option<FileFingerprint> {
    let meta = path.metadata().ok()?;
    let len = meta.len();
    if len > MAX_FILE_BYTES {
        return None;
    }

    let mtime = meta
        .modified()
        .ok()
        .and_then(|stamp| stamp.duration_since(UNIX_EPOCH).ok())?;
    let bytes = fs::read(path).ok()?;

    // Store paths with forward slashes regardless of the separator in the input.
    let under_root = path.strip_prefix(root).ok()?;
    let relative = under_root.to_string_lossy().replace('\\', "/");

    Some(FileFingerprint {
        path: relative,
        len,
        modified_secs: mtime.as_secs(),
        modified_nanos: mtime.subsec_nanos(),
        content_hash: stable_hash(&bytes),
    })
}

/// Resolves the cache file location for `root` and the given text-file setting.
///
/// The base directory is the `ASR_HOME` environment variable when it is set,
/// otherwise the user's home directory. The result has the form
/// `<base>/.asr/cache/source-index/<16 hex digits>.json`. Returns `None` when
/// `ASR_HOME` is unset and no home directory can be determined.
fn cache_path(root: &Path, include_text_files: bool) -> Option<PathBuf> {
    let mut location = match std::env::var_os("ASR_HOME") {
        Some(home) => PathBuf::from(home),
        None => dirs::home_dir()?,
    };
    for segment in [".asr", "cache", "source-index"] {
        location.push(segment);
    }

    let digest = cache_hash(root, include_text_files);
    location.push(format!("{:016x}.json", digest));
    Some(location)
}

/// Derives the number used in the cache file name from the cache version, the
/// normalized root path and the text-file flag.
///
/// NOTE(review): `DefaultHasher`'s algorithm is not guaranteed to stay the same
/// across Rust releases, so the file name may differ after a toolchain change;
/// as far as this file shows, the only consequence would be a cache miss.
fn cache_hash(root: &Path, include_text_files: bool) -> u64 {
    let normalized_root = root_key(root);

    let mut state = DefaultHasher::new();
    Hash::hash(&CACHE_VERSION, &mut state);
    Hash::hash(&normalized_root, &mut state);
    Hash::hash(&include_text_files, &mut state);
    Hasher::finish(&state)
}

/// Returns `root` as a (lossily converted) string with every `\` turned into `/`.
fn root_key(root: &Path) -> String {
    let lossy = root.to_string_lossy();
    lossy
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}

/// Hashes `bytes` with 64-bit FNV-1a.
///
/// The constants and the xor-then-multiply step are fixed in this function, so
/// the result does not depend on the standard library's hasher implementation.
/// An empty slice hashes to the offset basis.
fn stable_hash(bytes: &[u8]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    bytes.iter().fold(FNV_OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}

#[cfg(test)]
mod tests {
    use std::fs;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};

    use super::{build_manifest, MAX_FILE_BYTES};

    /// Returns a path under the system temp directory whose name combines
    /// `name` with the current time in nanoseconds. The directory is not created.
    fn temp_root(name: &str) -> PathBuf {
        let since_epoch = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("system time should be after unix epoch");
        let dir_name = format!("asr-cache-{name}-{}", since_epoch.as_nanos());
        std::env::temp_dir().join(dir_name)
    }

    /// A file one byte over `MAX_FILE_BYTES` must not appear in the manifest.
    #[test]
    fn manifest_skips_files_larger_than_index_limit() {
        let root = temp_root("large-file-skip");
        fs::create_dir_all(&root).expect("root should be created");

        let small_source = root.join("small.rs");
        fs::write(&small_source, "pub fn small() {}\n").expect("small source should be written");

        let oversized = "a".repeat((MAX_FILE_BYTES + 1) as usize);
        fs::write(root.join("large.rs"), oversized).expect("large source should be written");

        let manifest = build_manifest(&root, false);
        let listed: Vec<&str> = manifest.iter().map(|entry| entry.path.as_str()).collect();
        assert_eq!(listed, ["small.rs"]);

        // Cleanup is best effort; a failure here should not fail the test.
        let _ = fs::remove_dir_all(root);
    }

    /// Rewriting a file with different content of the same length must change
    /// the manifest.
    #[test]
    fn manifest_changes_when_same_length_content_changes() {
        let root = temp_root("same-length-content");
        fs::create_dir_all(&root).expect("root should be created");
        let source = root.join("lib.rs");

        fs::write(&source, "pub fn a() {}\n").expect("source should be written");
        let before = build_manifest(&root, false);

        fs::write(&source, "pub fn b() {}\n").expect("source should be rewritten");
        let after = build_manifest(&root, false);

        assert_eq!(before.len(), 1);
        assert_eq!(after.len(), 1);
        assert_ne!(before, after);

        // Cleanup is best effort; a failure here should not fail the test.
        let _ = fs::remove_dir_all(root);
    }
}