use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use xxhash_rust::xxh64::xxh64;
use crate::config::buffers::parse_buffer_size;
/// Magic bytes identifying a serialized hash-index file on disk ("SQRYHSH").
const HASH_INDEX_MAGIC: [u8; 7] = *b"SQRYHSH";
/// Envelope format version; bump whenever the envelope/payload layout changes.
const HASH_INDEX_ENVELOPE_VERSION: u16 = 1;
/// On-disk wrapper for a serialized [`HashIndex`]: magic tag, envelope
/// version, the writer's crate version, and the postcard-encoded payload.
///
/// NOTE(review): postcard encodes struct fields in declaration order —
/// reordering these fields would break compatibility with existing files.
#[derive(Serialize, Deserialize)]
struct HashIndexEnvelope {
    /// Must equal `HASH_INDEX_MAGIC`; checked on load.
    magic: [u8; 7],
    /// Must equal `HASH_INDEX_ENVELOPE_VERSION`; checked on load.
    version: u16,
    /// Crate version that wrote the file (informational; not validated).
    sqry_version: String,
    /// postcard-encoded `HashIndex`.
    payload: Vec<u8>,
}
/// Per-file fingerprint used for change detection: xxHash64 of the file's
/// bytes plus the size/mtime metadata snapshot captured at hash time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileHash {
    /// Path this hash was computed for (kept in sync with the index key).
    pub path: PathBuf,
    /// xxHash64 (seed 0) of the full file contents.
    pub hash: u64,
    /// File size in bytes at hash time.
    pub size: u64,
    /// Modification time at hash time.
    pub mtime: SystemTime,
    /// Symbol count for this file; 0 when freshly computed — presumably
    /// filled in by the indexer after parsing (set via callers of `update`).
    pub symbols_count: usize,
    /// Optionally cached file text; never persisted (`serde(skip)`).
    #[serde(skip)]
    pub content: Option<String>,
}
impl FileHash {
    /// Streams the file at `path` through xxHash64 (seed 0), recording its
    /// size and modification time from filesystem metadata.
    ///
    /// `symbols_count` starts at 0 and `content` is left `None`; callers
    /// fill these in after parsing.
    ///
    /// # Errors
    /// Fails when metadata cannot be read, the file cannot be opened, or a
    /// read fails mid-stream.
    pub fn compute(path: &Path) -> Result<Self> {
        use std::io::Read;
        let metadata = fs::metadata(path)
            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
        let size = metadata.len();
        let mtime = metadata
            .modified()
            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
        let mut file = fs::File::open(path)
            .with_context(|| format!("Failed to open file {}", path.display()))?;
        // Guard against a misconfigured zero-sized buffer: `Read::read`
        // into an empty slice always returns Ok(0), which would end the
        // loop immediately and silently hash every file as if it were
        // empty, defeating change detection.
        let buffer_size = parse_buffer_size().max(1);
        let mut buffer = vec![0u8; buffer_size];
        let mut hasher = xxhash_rust::xxh64::Xxh64::new(0);
        loop {
            let bytes_read = file
                .read(&mut buffer)
                .with_context(|| format!("Failed to read file {}", path.display()))?;
            if bytes_read == 0 {
                break; // EOF
            }
            hasher.update(&buffer[..bytes_read]);
        }
        Ok(Self {
            path: path.to_path_buf(),
            hash: hasher.digest(),
            size,
            mtime,
            symbols_count: 0,
            content: None,
        })
    }

    /// Builds a `FileHash` from already-loaded `content`, hashing the given
    /// bytes instead of re-reading the file; only the mtime comes from the
    /// filesystem. `size` reflects `content.len()`, which may differ from
    /// the on-disk size if the file changed after it was read.
    ///
    /// # Errors
    /// Fails when metadata or the modification time cannot be read.
    pub fn from_bytes(path: &Path, content: &[u8]) -> Result<Self> {
        let metadata = fs::metadata(path)
            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
        let mtime = metadata.modified().with_context(|| {
            format!("Failed to get modification time for {}", path.display())
        })?;
        Ok(Self {
            path: path.to_path_buf(),
            hash: xxh64(content, 0),
            size: content.len() as u64,
            mtime,
            symbols_count: 0,
            content: None,
        })
    }

    /// Cheap change probe: `Ok(true)` when the file's current size or mtime
    /// differs from the values captured at hash time. A `true` result does
    /// not prove the bytes changed (mtime-only bumps also fire); callers
    /// re-hash to confirm a real content change.
    ///
    /// # Errors
    /// Fails when metadata cannot be read (e.g. the file was deleted).
    pub fn metadata_changed(&self, path: &Path) -> Result<bool> {
        let metadata = fs::metadata(path)
            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
        let current_size = metadata.len();
        let current_mtime = metadata
            .modified()
            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
        Ok(current_size != self.size || current_mtime != self.mtime)
    }
}
/// In-memory index mapping file paths to their [`FileHash`] entries, with
/// running totals and an optional per-file content-cache size limit.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HashIndex {
    /// Tracked files keyed by path.
    hashes: HashMap<PathBuf, FileHash>,
    /// Number of tracked files; kept in sync with `hashes.len()` by
    /// `update`/`remove`/`clear`.
    pub file_count: usize,
    /// Sum of `symbols_count` across all tracked entries.
    pub total_symbols: usize,
    /// Max bytes of content cached per file; `None` = unlimited.
    /// `serde(default)` keeps indexes serialized before this field existed
    /// loadable.
    #[serde(default)]
    content_cache_max_bytes: Option<usize>,
}
impl HashIndex {
    /// Creates an empty index with no per-file content-cache limit.
    #[must_use]
    pub fn new() -> Self {
        Self::with_content_cache_limit(None)
    }

    /// Creates an empty index. `limit` caps the size (in bytes) of any
    /// single file's content stored by [`Self::cache_content`]; `None`
    /// means unlimited.
    #[must_use]
    pub fn with_content_cache_limit(limit: Option<usize>) -> Self {
        Self {
            hashes: HashMap::new(),
            file_count: 0,
            total_symbols: 0,
            content_cache_max_bytes: limit,
        }
    }

    /// Replaces the content-cache size limit. Affects future
    /// `cache_content` calls only; already-cached content is kept.
    pub fn set_content_cache_limit(&mut self, limit: Option<usize>) {
        self.content_cache_max_bytes = limit;
    }

    /// Returns `Ok(true)` if `path` should be (re-)indexed: it is
    /// untracked, no longer exists, or its content hash differs from the
    /// stored one.
    ///
    /// Size/mtime are compared first as a cheap fast path; the file is
    /// re-hashed only when that metadata differs, so an mtime bump with
    /// identical bytes still reports `false`.
    ///
    /// # Errors
    /// Propagates metadata/IO errors from probing or re-hashing the file.
    pub fn has_changed(&self, path: &Path) -> Result<bool> {
        let Some(stored_hash) = self.hashes.get(path) else {
            return Ok(true);
        };
        // Deleted files count as changed (and must not reach the
        // metadata probe, which would error instead).
        if !path.exists() {
            return Ok(true);
        }
        if !stored_hash.metadata_changed(path)? {
            return Ok(false);
        }
        let current_hash = FileHash::compute(path)?;
        Ok(current_hash.hash != stored_hash.hash)
    }

    /// Inserts or replaces the entry for `path`, keeping `file_count` and
    /// `total_symbols` consistent. The entry's own `path` field is
    /// overwritten with the map key so the two can never disagree.
    pub fn update(&mut self, path: PathBuf, mut file_hash: FileHash) {
        // Retire the old entry's contribution to the running totals first.
        if let Some(old_hash) = self.hashes.remove(&path) {
            self.total_symbols = self.total_symbols.saturating_sub(old_hash.symbols_count);
            self.file_count = self.file_count.saturating_sub(1);
        }
        self.total_symbols += file_hash.symbols_count;
        self.file_count += 1;
        file_hash.path.clone_from(&path);
        self.hashes.insert(path, file_hash);
    }

    /// Removes `path` from the index, adjusting the running totals.
    /// Returns the removed entry, or `None` if it was not tracked.
    pub fn remove(&mut self, path: &Path) -> Option<FileHash> {
        if let Some(removed) = self.hashes.remove(path) {
            self.total_symbols = self.total_symbols.saturating_sub(removed.symbols_count);
            self.file_count = self.file_count.saturating_sub(1);
            Some(removed)
        } else {
            None
        }
    }

    /// Looks up the stored hash entry for `path`.
    #[must_use]
    pub fn get(&self, path: &Path) -> Option<&FileHash> {
        self.hashes.get(path)
    }

    /// Iterates over all tracked `(path, entry)` pairs in arbitrary
    /// (HashMap) order.
    pub fn iter(&self) -> impl Iterator<Item = (&PathBuf, &FileHash)> {
        self.hashes.iter()
    }

    /// Number of tracked files.
    #[must_use]
    pub fn len(&self) -> usize {
        self.file_count
    }

    /// `true` when no files are tracked.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.file_count == 0
    }

    /// Drops all entries and resets the running totals. The content-cache
    /// limit is kept as configured.
    pub fn clear(&mut self) {
        self.hashes.clear();
        self.file_count = 0;
        self.total_symbols = 0;
    }

    /// Returns a clone of the cached content for `path`.
    ///
    /// # Errors
    /// Fails when `path` is untracked or its content was never cached.
    pub fn get_cached_content(&self, path: &Path) -> Result<String> {
        if let Some(file_hash) = self.hashes.get(path)
            && let Some(ref content) = file_hash.content
        {
            return Ok(content.clone());
        }
        anyhow::bail!("Content not cached for {}", path.display())
    }

    /// Attaches `content` to the tracked entry for `path`.
    ///
    /// Skipped (with a trace log) when the content exceeds the configured
    /// size limit; silently a no-op when `path` is not tracked.
    pub fn cache_content(&mut self, path: &Path, content: String) {
        if let Some(limit) = self.content_cache_max_bytes
            && content.len() > limit
        {
            log::trace!(
                "Skipping content cache for {} (size: {} bytes > {} limit)",
                path.display(),
                content.len(),
                limit
            );
            return;
        }
        if let Some(file_hash) = self.hashes.get_mut(path) {
            let size = content.len();
            file_hash.content = Some(content);
            log::trace!("Cached content for {} ({size} bytes)", path.display());
        }
    }

    /// Persists the index to `cache_dir/file_hashes.bin`.
    ///
    /// The postcard-encoded index is wrapped in a versioned envelope,
    /// written to a `.bin.tmp` sibling first, then renamed into place so
    /// readers never observe a partially written file.
    ///
    /// # Errors
    /// Fails on directory creation, serialization, or any filesystem step.
    pub fn save(&self, cache_dir: &Path) -> Result<()> {
        fs::create_dir_all(cache_dir)
            .with_context(|| format!("Failed to create cache directory {}", cache_dir.display()))?;
        let hash_file = cache_dir.join("file_hashes.bin");
        let payload =
            postcard::to_allocvec(self).context("Failed to serialize hash index payload")?;
        let envelope = HashIndexEnvelope {
            magic: HASH_INDEX_MAGIC,
            version: HASH_INDEX_ENVELOPE_VERSION,
            sqry_version: env!("CARGO_PKG_VERSION").to_string(),
            payload,
        };
        let bytes =
            postcard::to_allocvec(&envelope).context("Failed to serialize hash index envelope")?;
        let tmp_hash_index_file_path = hash_file.with_extension("bin.tmp");
        fs::write(&tmp_hash_index_file_path, bytes).with_context(|| {
            format!(
                "Failed to write temp hash index to {}",
                tmp_hash_index_file_path.display()
            )
        })?;
        // Pre-delete the destination before renaming. NOTE(review):
        // presumably for platforms where rename does not overwrite an
        // existing file; this briefly leaves no index on disk — confirm
        // the intended atomicity guarantee.
        if hash_file.exists() {
            let _ = fs::remove_file(&hash_file);
        }
        fs::rename(&tmp_hash_index_file_path, &hash_file).with_context(|| {
            format!(
                "Failed to atomically replace hash index at {} with temp {}",
                hash_file.display(),
                tmp_hash_index_file_path.display()
            )
        })?;
        log::debug!(
            "Saved hash index: {} files, {} symbols to {}",
            self.file_count,
            self.total_symbols,
            hash_file.display()
        );
        Ok(())
    }

    /// Loads the index from `cache_dir/file_hashes.bin`, or returns an
    /// empty index when the file does not exist.
    ///
    /// Validates, in order: an on-disk size cap (guards against slurping a
    /// corrupt/oversized file into memory), the magic bytes, and the
    /// envelope version, before decoding the payload.
    ///
    /// # Errors
    /// Fails on IO errors, an oversized file, bad magic/version, or a
    /// payload that no longer deserializes.
    pub fn load(cache_dir: &Path) -> Result<Self> {
        let hash_file = cache_dir.join("file_hashes.bin");
        if !hash_file.exists() {
            log::debug!(
                "No hash index found at {}, starting fresh",
                hash_file.display()
            );
            return Ok(Self::new());
        }
        const MAX_HASH_INDEX_BYTES: u64 = 256 * 1024 * 1024; let metadata = fs::metadata(&hash_file)
            .with_context(|| format!("Failed to stat hash index: {}", hash_file.display()))?;
        if metadata.len() > MAX_HASH_INDEX_BYTES {
            anyhow::bail!(
                "Hash index file is too large ({} bytes, max {}): {}",
                metadata.len(),
                MAX_HASH_INDEX_BYTES,
                hash_file.display()
            );
        }
        let bytes = fs::read(&hash_file)
            .with_context(|| format!("Failed to read hash index from {}", hash_file.display()))?;
        let env: HashIndexEnvelope =
            postcard::from_bytes(&bytes).context("Failed to deserialize hash index envelope")?;
        if env.magic != HASH_INDEX_MAGIC {
            anyhow::bail!("Invalid hash index magic: expected {HASH_INDEX_MAGIC:?}");
        }
        if env.version != HASH_INDEX_ENVELOPE_VERSION {
            anyhow::bail!(
                "Unsupported hash index version: {} (expected {})",
                env.version,
                HASH_INDEX_ENVELOPE_VERSION
            );
        }
        let index: Self = postcard::from_bytes(&env.payload)
            .context("Failed to deserialize hash index payload")?;
        log::debug!(
            "Loaded hash index: {} files, {} symbols from {}",
            index.file_count,
            index.total_symbols,
            hash_file.display()
        );
        Ok(index)
    }
}
impl Default for HashIndex {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    //! Unit tests covering hashing, change detection, index bookkeeping,
    //! persistence round-trips, and the content cache.
    use super::*;
    use std::io::Write;
    use tempfile::{NamedTempFile, TempDir};

    // Hashing a small file records its exact size and a non-zero digest,
    // and leaves symbols_count at its default of 0.
    #[test]
    fn test_file_hash_compute() {
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(b"test content").unwrap();
        temp_file.flush().unwrap();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        assert_eq!(hash.size, 12); assert!(hash.hash != 0); assert_eq!(hash.symbols_count, 0); }

    // from_bytes hashes the supplied bytes (matching a direct xxh64 call)
    // rather than re-reading the file.
    #[test]
    fn test_file_hash_from_bytes() {
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(b"test").unwrap();
        temp_file.flush().unwrap();
        let content = b"test";
        let hash = FileHash::from_bytes(temp_file.path(), content).unwrap();
        assert_eq!(hash.size, 4);
        assert_eq!(hash.hash, xxh64(content, 0));
    }

    // Hashing the same unchanged file twice yields identical results.
    #[test]
    fn test_file_hash_deterministic() {
        let mut temp_file = NamedTempFile::new().unwrap();
        let content = b"deterministic test content";
        temp_file.write_all(content).unwrap();
        temp_file.flush().unwrap();
        let hash1 = FileHash::compute(temp_file.path()).unwrap();
        let hash2 = FileHash::compute(temp_file.path()).unwrap();
        assert_eq!(hash1.hash, hash2.hash);
        assert_eq!(hash1.size, hash2.size);
    }

    // Different content produces different digests.
    #[test]
    fn test_file_hash_different_content() {
        let mut temp1 = NamedTempFile::new().unwrap();
        temp1.write_all(b"content A").unwrap();
        temp1.flush().unwrap();
        let mut temp2 = NamedTempFile::new().unwrap();
        temp2.write_all(b"content B").unwrap();
        temp2.flush().unwrap();
        let hash1 = FileHash::compute(temp1.path()).unwrap();
        let hash2 = FileHash::compute(temp2.path()).unwrap();
        assert_ne!(hash1.hash, hash2.hash);
    }

    // An untracked path always reports as changed.
    #[test]
    fn test_hash_index_new_file() {
        let index = HashIndex::new();
        let path = Path::new("nonexistent.rs");
        assert!(index.has_changed(path).unwrap());
    }

    // A tracked file with untouched metadata reports as unchanged.
    #[test]
    fn test_hash_index_unchanged_file() {
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(b"unchanged content").unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::new();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        assert!(!index.has_changed(temp_file.path()).unwrap());
    }

    // Appending bytes after indexing is detected as a change.
    // (write_all continues from the handle's current position, so this
    // appends rather than overwrites.)
    #[test]
    fn test_hash_index_changed_content() {
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(b"original content").unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::new();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        temp_file.write_all(b" modified").unwrap();
        temp_file.flush().unwrap();
        assert!(index.has_changed(temp_file.path()).unwrap());
    }

    // update replaces an existing entry (totals swap, count stays 1) and
    // remove retires the entry's contribution entirely.
    #[test]
    fn test_hash_index_update_and_remove() {
        let mut index = HashIndex::new();
        let path = PathBuf::from("test.rs");
        let mut hash = FileHash {
            path: path.clone(),
            hash: 12345,
            size: 100,
            mtime: SystemTime::now(),
            symbols_count: 5,
            content: None,
        };
        index.update(path.clone(), hash.clone());
        assert_eq!(index.len(), 1);
        assert_eq!(index.total_symbols, 5);
        hash.symbols_count = 10;
        index.update(path.clone(), hash.clone());
        assert_eq!(index.len(), 1); assert_eq!(index.total_symbols, 10);
        let removed = index.remove(&path);
        assert!(removed.is_some());
        assert_eq!(index.len(), 0);
        assert_eq!(index.total_symbols, 0);
    }

    // save/load round-trips entries, counts, and totals through the
    // envelope format.
    #[test]
    fn test_hash_index_save_and_load() {
        let tmp_index_dir = TempDir::new().unwrap();
        let cache_dir = tmp_index_dir.path();
        let mut index = HashIndex::new();
        let path = PathBuf::from("test.rs");
        let hash = FileHash {
            path: path.clone(),
            hash: 67890,
            size: 200,
            mtime: SystemTime::now(),
            symbols_count: 15,
            content: None,
        };
        index.update(path, hash);
        index.save(cache_dir).unwrap();
        let loaded = HashIndex::load(cache_dir).unwrap();
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded.total_symbols, 15);
        assert_eq!(loaded.get(Path::new("test.rs")).unwrap().hash, 67890);
    }

    // An mtime bump with identical bytes must NOT count as a change: the
    // metadata fast path fires, but the re-hash matches the stored digest.
    #[test]
    fn test_hash_index_mtime_change_no_content_change() {
        use filetime::{FileTime, set_file_mtime};
        use std::time::Duration;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(b"same content").unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::new();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        let meta = fs::metadata(temp_file.path()).unwrap();
        let orig_mtime = meta.modified().unwrap();
        let new_mtime = FileTime::from_system_time(orig_mtime + Duration::from_secs(60));
        set_file_mtime(temp_file.path(), new_mtime).unwrap();
        assert!(!index.has_changed(temp_file.path()).unwrap());
    }

    // Loading from a directory with no index file starts fresh (empty).
    #[test]
    fn test_hash_index_load_nonexistent() {
        let tmp_index_dir = TempDir::new().unwrap();
        let cache_dir = tmp_index_dir.path().join("nonexistent");
        let index = HashIndex::load(&cache_dir).unwrap();
        assert_eq!(index.len(), 0);
        assert!(index.is_empty());
    }

    // clear drops all entries and resets both running totals.
    #[test]
    fn test_hash_index_clear() {
        let mut index = HashIndex::new();
        for i in 0_u64..5 {
            let path = PathBuf::from(format!("file{i}.rs"));
            let hash = FileHash {
                path: path.clone(),
                hash: i,
                size: 100,
                mtime: SystemTime::now(),
                symbols_count: 3,
                content: None,
            };
            index.update(path, hash);
        }
        assert_eq!(index.len(), 5);
        assert_eq!(index.total_symbols, 15);
        index.clear();
        assert_eq!(index.len(), 0);
        assert_eq!(index.total_symbols, 0);
        assert!(index.is_empty());
    }

    // Coarse sanity bound on hashing throughput (1 MB well under 100 ms).
    #[test]
    fn test_xxhash64_performance_characteristic() {
        let data = vec![0u8; 1_000_000];
        let start = std::time::Instant::now();
        let _hash = xxh64(&data, 0);
        let elapsed = start.elapsed();
        assert!(
            elapsed.as_millis() < 100,
            "XXHash64 took {elapsed:?} to hash 1MB (expected <100ms)"
        );
    }

    // A small file's content is cached and retrievable verbatim.
    #[test]
    fn test_cache_small_file() {
        let mut temp_file = NamedTempFile::new().unwrap();
        let content = "Small file content for caching test";
        temp_file.write_all(content.as_bytes()).unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::new();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        index.cache_content(temp_file.path(), content.to_string());
        let cached = index.get_cached_content(temp_file.path()).unwrap();
        assert_eq!(cached, content);
        let file_hash = index.get(temp_file.path()).unwrap();
        assert!(file_hash.content.is_some());
        assert_eq!(file_hash.content.as_ref().unwrap(), content);
    }

    // Content just over the configured limit (101 000 > 100 000 bytes)
    // is skipped, leaving the entry uncached.
    #[test]
    fn test_skip_large_file_when_limit_configured() {
        let mut temp_file = NamedTempFile::new().unwrap();
        let large_content = "x".repeat(101_000); temp_file.write_all(large_content.as_bytes()).unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::with_content_cache_limit(Some(100_000));
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        index.cache_content(temp_file.path(), large_content.clone());
        let file_hash = index.get(temp_file.path()).unwrap();
        assert!(file_hash.content.is_none());
        assert!(index.get_cached_content(temp_file.path()).is_err());
    }

    // With no limit configured, even large content is cached.
    #[test]
    fn test_large_file_cached_without_limit() {
        let mut temp_file = NamedTempFile::new().unwrap();
        let large_content = "x".repeat(101_000); temp_file.write_all(large_content.as_bytes()).unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::new();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        index.cache_content(temp_file.path(), large_content.clone());
        let cached = index.get_cached_content(temp_file.path()).unwrap();
        assert_eq!(cached.len(), large_content.len());
    }

    // Requesting content that was never cached is an error, not a panic.
    #[test]
    fn test_get_cached_content_error_when_not_cached() {
        let mut temp_file = NamedTempFile::new().unwrap();
        let content = "Test content";
        temp_file.write_all(content.as_bytes()).unwrap();
        temp_file.flush().unwrap();
        let mut index = HashIndex::new();
        let hash = FileHash::compute(temp_file.path()).unwrap();
        index.update(temp_file.path().to_path_buf(), hash);
        assert!(index.get_cached_content(temp_file.path()).is_err());
    }
}