use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::io::{BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH;
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use crate::bm25::Bm25Index;
use crate::graph::DependencyGraph;
use crate::index::build::MAX_FILE_BYTES;
use crate::model::Chunk;
use crate::source_files::{filter_extensions, walk_source_files};
/// Format version of the on-disk cache.
///
/// It is written into every payload, compared on load, and mixed into the cache
/// file name, so changing it makes previously written caches unusable.
const CACHE_VERSION: u32 = 7;
/// Snapshot of a single source file, used to detect changes between runs.
///
/// Manifests are compared with `==`, so every field below takes part in the
/// staleness check performed when a cache is loaded.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct FileFingerprint {
/// Path relative to the index root, with `\` replaced by `/`.
path: String,
/// File size in bytes as reported by the file metadata.
len: u64,
/// Whole seconds of the modification time, measured from the Unix epoch.
modified_secs: u64,
/// Sub-second nanosecond part of the modification time.
modified_nanos: u32,
/// Hash of the file contents as computed by `stable_hash`.
content_hash: u64,
}
/// Everything that is serialized as JSON into one cache file.
///
/// The first four fields describe what the cache was built from and are
/// validated on load; the remaining fields are the cached index data.
#[derive(Serialize, Deserialize)]
struct CachePayload {
/// Value of `CACHE_VERSION` at the time the cache was written.
version: u32,
/// Normalized root path as produced by `root_key`.
root: String,
/// Value of the `include_text_files` flag the cache was stored with.
include_text_files: bool,
/// File manifest supplied when the cache was stored.
files: Vec<FileFingerprint>,
/// Cached BM25 index.
bm25_index: Bm25Index,
/// Cached chunks.
chunks: Vec<Chunk>,
/// Cached dependency graph; it is re-hydrated from the root after loading.
graph: DependencyGraph,
}
/// Index data recovered from a cache entry that passed validation, as returned
/// by `load_bm25`.
pub(crate) struct CachedBm25Index {
/// BM25 index read from the cache file.
pub(crate) bm25_index: Bm25Index,
/// Chunks read from the cache file.
pub(crate) chunks: Vec<Chunk>,
/// Dependency graph read from the cache file, already hydrated and resolved.
pub(crate) graph: DependencyGraph,
}
/// Builds the fingerprint manifest for the source files found under `root`.
///
/// The extension filter is derived from `include_text_files`. Any file for
/// which `fingerprint_file` yields `None` is left out of the manifest.
pub(crate) fn build_manifest(root: &Path, include_text_files: bool) -> Vec<FileFingerprint> {
    let extensions = filter_extensions(None, include_text_files);
    let mut manifest = Vec::new();
    for source in walk_source_files(root, &extensions, None) {
        if let Some(fingerprint) = fingerprint_file(root, &source) {
            manifest.push(fingerprint);
        }
    }
    manifest
}
pub(crate) fn load_bm25(root: &Path, include_text_files: bool) -> Option<CachedBm25Index> {
let cache_path = cache_path(root, include_text_files)?;
let file = fs::File::open(cache_path).ok()?;
let mut payload: CachePayload = serde_json::from_reader(BufReader::new(file)).ok()?;
let current_files = build_manifest(root, include_text_files);
let root_key = root_key(root);
if payload.version != CACHE_VERSION
|| payload.root != root_key
|| payload.include_text_files != include_text_files
|| payload.files != current_files
{
return None;
}
payload.graph.hydrate_sources_from_root(root);
payload.graph.resolve_dependencies();
Some(CachedBm25Index {
bm25_index: payload.bm25_index,
chunks: payload.chunks,
graph: payload.graph,
})
}
pub(crate) fn store_bm25(
root: &Path,
include_text_files: bool,
files: Vec<FileFingerprint>,
bm25_index: &Bm25Index,
chunks: &[Chunk],
graph: &DependencyGraph,
) -> Result<()> {
let cache_path =
cache_path(root, include_text_files).context("failed to resolve cache path")?;
let payload = CachePayload {
version: CACHE_VERSION,
root: root_key(root),
include_text_files,
files,
bm25_index: bm25_index.clone(),
chunks: chunks.to_vec(),
graph: graph.clone(),
};
if let Some(parent) = cache_path.parent() {
fs::create_dir_all(parent).context("failed to create index cache directory")?;
}
let tmp_path = cache_path.with_extension(format!("json.{}.tmp", std::process::id()));
let tmp_file = fs::File::create(&tmp_path).context("failed to create temporary index cache")?;
let mut writer = BufWriter::new(tmp_file);
serde_json::to_writer(&mut writer, &payload).context("failed to serialize index cache")?;
writer.flush().context("failed to flush index cache")?;
fs::rename(&tmp_path, cache_path).context("failed to replace index cache")?;
Ok(())
}
/// Computes the fingerprint of `path`, expressed relative to `root`.
///
/// Returns `None` when the metadata or modification time is unavailable, the
/// file is larger than `MAX_FILE_BYTES`, the modification time precedes the
/// Unix epoch, the file cannot be read, or `path` does not lie under `root`.
fn fingerprint_file(root: &Path, path: &Path) -> Option<FileFingerprint> {
    let metadata = path.metadata().ok()?;
    let len = metadata.len();
    if len > MAX_FILE_BYTES {
        return None;
    }

    let since_epoch = metadata.modified().ok()?.duration_since(UNIX_EPOCH).ok()?;
    let bytes = fs::read(path).ok()?;

    // Store a root-relative path with forward slashes only.
    let relative = path.strip_prefix(root).ok()?;
    let normalized = relative.to_string_lossy().replace('\\', "/");

    Some(FileFingerprint {
        path: normalized,
        len,
        modified_secs: since_epoch.as_secs(),
        modified_nanos: since_epoch.subsec_nanos(),
        content_hash: stable_hash(&bytes),
    })
}
/// Resolves the cache file location for `root` and the text-file flag.
///
/// The base directory is the `ASR_HOME` environment variable when it is set,
/// otherwise the user's home directory. Returns `None` when neither is
/// available. The file lives under `.asr/cache/source-index/` and is named
/// after the 16-digit hexadecimal value of `cache_hash`.
fn cache_path(root: &Path, include_text_files: bool) -> Option<PathBuf> {
    let mut location = match std::env::var_os("ASR_HOME") {
        Some(overridden) => PathBuf::from(overridden),
        None => dirs::home_dir()?,
    };
    for segment in [".asr", "cache", "source-index"] {
        location.push(segment);
    }
    let file_name = format!("{:016x}.json", cache_hash(root, include_text_files));
    location.push(file_name);
    Some(location)
}
/// Derives the number used in the cache file name from the cache version, the
/// normalized root and the text-file flag, fed to the hasher in that order.
///
/// NOTE(review): `DefaultHasher`'s algorithm is not specified across Rust
/// releases, so a toolchain upgrade may change the resulting file name and
/// leave older cache files unused.
fn cache_hash(root: &Path, include_text_files: bool) -> u64 {
    let key = root_key(root);
    let mut state = DefaultHasher::new();
    Hash::hash(&CACHE_VERSION, &mut state);
    Hash::hash(&key, &mut state);
    Hash::hash(&include_text_files, &mut state);
    state.finish()
}
/// Returns `root` as a (lossily converted) string in which every `\` has been
/// replaced by `/`.
fn root_key(root: &Path) -> String {
    root.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
/// Hashes `bytes` with 64-bit FNV-1a: starting from the offset basis, each byte
/// is XOR-ed into the state, which is then multiplied (wrapping) by the prime.
///
/// An empty slice hashes to the offset basis itself.
fn stable_hash(bytes: &[u8]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    bytes.iter().fold(FNV_OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}
#[cfg(test)]
mod tests {
    use std::fs;
    use std::time::{SystemTime, UNIX_EPOCH};

    use super::{build_manifest, MAX_FILE_BYTES};

    /// Returns a scratch directory path (not yet created) under the system
    /// temp dir. The process id and current time keep concurrent runs apart.
    fn temp_root(name: &str) -> std::path::PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("system time should be after unix epoch")
            .as_nanos();
        let pid = std::process::id();
        std::env::temp_dir().join(format!("asr-cache-{name}-{pid}-{unique}"))
    }

    /// Files above `MAX_FILE_BYTES` must not appear in the manifest.
    #[test]
    fn manifest_skips_files_larger_than_index_limit() {
        let root = temp_root("large-file-skip");
        fs::create_dir_all(&root).expect("root should be created");
        fs::write(root.join("small.rs"), "pub fn small() {}\n")
            .expect("small source should be written");
        let oversized =
            usize::try_from(MAX_FILE_BYTES + 1).expect("index limit should fit in usize");
        fs::write(root.join("large.rs"), "a".repeat(oversized))
            .expect("large source should be written");

        let manifest = build_manifest(&root, false);
        let paths: Vec<&str> = manifest.iter().map(|file| file.path.as_str()).collect();
        assert_eq!(paths, vec!["small.rs"]);

        let _ = fs::remove_dir_all(root);
    }

    /// Rewriting a file with different content of the same length must change
    /// its fingerprint through the content hash, not just the timestamp.
    #[test]
    fn manifest_changes_when_same_length_content_changes() {
        let root = temp_root("same-length-content");
        fs::create_dir_all(&root).expect("root should be created");
        let source = root.join("lib.rs");

        fs::write(&source, "pub fn a() {}\n").expect("source should be written");
        let first = build_manifest(&root, false);
        fs::write(&source, "pub fn b() {}\n").expect("source should be rewritten");
        let second = build_manifest(&root, false);

        assert_eq!(first.len(), 1);
        assert_eq!(second.len(), 1);
        // The modification time alone can differ between the two writes, so the
        // content hash is checked explicitly; otherwise this test would pass
        // even without content hashing.
        assert_eq!(first[0].len, second[0].len);
        assert_ne!(first[0].content_hash, second[0].content_hash);
        assert_ne!(first, second);

        let _ = fs::remove_dir_all(root);
    }
}