use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use parking_lot::Mutex;
use serde::{Deserialize, Serialize};
use crate::spec::DetectorSpec;
/// Serialized form of a single cache entry inside the on-disk index.
///
/// Carries the same three values as `CacheEntry`, except that the hash is a
/// hex string: `save_inner` produces it with `hex_encode`, and
/// `load_with_spec_inner` parses it back with `hex_to_array`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryV2 {
/// Modification time recorded for the file (presumably nanoseconds, going by
/// the name; the value is stored and compared verbatim).
mtime_ns: u64,
/// Size recorded for the file (presumably bytes; stored and compared verbatim).
size: u64,
/// Hex encoding of the 32-byte hash; entries whose string does not decode are
/// dropped on load.
hash: String,
}
/// Top-level document that is serialized to / deserialized from the cache file
/// as JSON.
#[derive(Debug, Serialize, Deserialize)]
struct OnDisk {
/// Schema version of the file; loading rejects anything other than
/// `SCHEMA_VERSION`.
version: u32,
/// Hex-encoded detector spec hash the entries were saved with. Left out of the
/// output when `None`, and read as `None` when the key is absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
spec_hash: Option<String>,
/// Cache entries keyed by the file path rendered as a string.
entries: HashMap<String, EntryV2>,
}
/// Schema version written by `save_inner`; a file carrying any other version is
/// treated as a cold start on load.
const SCHEMA_VERSION: u32 = 2;
/// Number of independently locked shards the in-memory index is split into.
const MERKLE_SHARDS: usize = 64;
/// Picks the shard responsible for `path`.
///
/// The path is fed through the standard library's `DefaultHasher` and the
/// resulting digest is reduced modulo `MERKLE_SHARDS`, so the result is always
/// a valid index into `MerkleIndex::shards`.
fn shard_index(path: &Path) -> usize {
    let mut hasher = DefaultHasher::new();
    path.hash(&mut hasher);
    let digest = hasher.finish() as usize;
    digest % MERKLE_SHARDS
}
/// In-memory record kept for one path: the metadata pair and the 32-byte hash
/// supplied by the caller of `record` / `record_with_metadata`.
#[derive(Debug, Clone, Copy)]
struct CacheEntry {
/// Modification time recorded for the file (`record` stores 0 here).
mtime_ns: u64,
/// Size recorded for the file (`record` stores 0 here).
size: u64,
/// Raw 32-byte content hash.
hash: [u8; 32],
}
/// Path-keyed cache of file metadata and content hashes, split across
/// `MERKLE_SHARDS` mutex-guarded maps.
///
/// Every operation on a path locks only the shard chosen by `shard_index`, so
/// all methods take `&self`.
#[derive(Debug)]
pub struct MerkleIndex {
/// One locked map per shard; always `MERKLE_SHARDS` elements long when built
/// through `MerkleIndex::empty`.
shards: Vec<Mutex<HashMap<PathBuf, CacheEntry>>>,
}
impl MerkleIndex {
    /// Builds an index made of `MERKLE_SHARDS` empty, individually locked maps.
    pub fn empty() -> Self {
        let mut shards = Vec::with_capacity(MERKLE_SHARDS);
        shards.resize_with(MERKLE_SHARDS, || Mutex::new(HashMap::new()));
        Self { shards }
    }

    /// Loads the index stored at `path` without checking any spec hash.
    ///
    /// Stale temporary siblings of `path` are swept first. Any failure to
    /// read or parse the file yields an empty index.
    pub fn load(path: &Path) -> Self {
        sweep_stale_tmp_files(path);
        Self::load_with_spec_inner(path, None)
    }

    /// Loads the index stored at `path`, keeping its entries only when the
    /// spec hash recorded in the file equals `expected_spec_hash`.
    ///
    /// Stale temporary siblings of `path` are swept first.
    pub fn load_with_spec(path: &Path, expected_spec_hash: &[u8; 32]) -> Self {
        sweep_stale_tmp_files(path);
        Self::load_with_spec_inner(path, Some(expected_spec_hash))
    }

    /// Shared implementation of `load` and `load_with_spec`.
    ///
    /// Returns an empty index when the file cannot be read, is not a valid
    /// `OnDisk` JSON document, carries a version other than
    /// `SCHEMA_VERSION`, or — when `expected_spec_hash` is given — has a
    /// missing, undecodable or different stored spec hash. Entries whose hash
    /// string cannot be decoded are skipped.
    fn load_with_spec_inner(path: &Path, expected_spec_hash: Option<&[u8; 32]>) -> Self {
        // An unreadable (typically: missing) file is simply a cold start.
        let Ok(bytes) = std::fs::read(path) else {
            return Self::empty();
        };

        let on_disk = match serde_json::from_slice::<OnDisk>(&bytes) {
            Ok(parsed) => parsed,
            Err(error) => {
                tracing::warn!(
                    cache = %path.display(),
                    %error,
                    "merkle index parse failed; treating as cold start"
                );
                return Self::empty();
            }
        };

        if on_disk.version != SCHEMA_VERSION {
            tracing::warn!(
                cache = %path.display(),
                version = on_disk.version,
                expected = SCHEMA_VERSION,
                "merkle index schema mismatch; treating as cold start"
            );
            return Self::empty();
        }

        if let Some(expected) = expected_spec_hash {
            // A missing or malformed stored hash counts as a mismatch.
            let stored = on_disk.spec_hash.as_deref().and_then(hex_to_array);
            if stored.as_ref() != Some(expected) {
                tracing::info!(
                    cache = %path.display(),
                    "detector spec changed since last scan; cache invalidated"
                );
                return Self::empty();
            }
        }

        // Decode everything first so the log line reports the number of
        // usable entries before they are distributed over the shards.
        let mut decoded: Vec<(PathBuf, CacheEntry)> = Vec::new();
        for (raw_path, stored) in on_disk.entries {
            let Some(hash) = hex_to_array(&stored.hash) else {
                continue;
            };
            decoded.push((
                PathBuf::from(raw_path),
                CacheEntry {
                    mtime_ns: stored.mtime_ns,
                    size: stored.size,
                    hash,
                },
            ));
        }

        tracing::info!(
            cache = %path.display(),
            count = decoded.len(),
            "merkle index loaded"
        );

        let idx = Self::empty();
        for (entry_path, entry) in decoded {
            let shard = shard_index(&entry_path);
            idx.shards[shard].lock().insert(entry_path, entry);
        }
        idx
    }

    /// Writes the index to `path` without recording a spec hash.
    ///
    /// # Errors
    /// Propagates any I/O or encoding error from `save_inner`.
    pub fn save(&self, path: &Path) -> std::io::Result<()> {
        self.save_inner(path, None)
    }

    /// Writes the index to `path`, tagging the file with `spec_hash`.
    ///
    /// # Errors
    /// Propagates any I/O or encoding error from `save_inner`.
    pub fn save_with_spec(&self, path: &Path, spec_hash: &[u8; 32]) -> std::io::Result<()> {
        self.save_inner(path, Some(spec_hash))
    }

    /// Shared implementation of `save` and `save_with_spec`.
    ///
    /// The file currently at `path` is loaded (under the same spec rules as
    /// the matching `load*` call) and merged with this index; entries held in
    /// memory win over entries read from disk. The merged document is written
    /// to a temporary file in the destination directory, synced, and then
    /// persisted onto `path`.
    ///
    /// # Errors
    /// Returns an error when JSON encoding fails or when creating the
    /// directory, writing, syncing or persisting the temporary file fails.
    fn save_inner(&self, path: &Path, spec_hash: Option<&[u8; 32]>) -> std::io::Result<()> {
        let on_disk_now = if let Some(hash) = spec_hash {
            Self::load_with_spec(path, hash)
        } else {
            Self::load(path)
        };

        // Disk shards are visited first so that in-memory entries overwrite them.
        let mut merged: HashMap<PathBuf, CacheEntry> = HashMap::new();
        for shard in on_disk_now.shards.iter().chain(self.shards.iter()) {
            let guard = shard.lock();
            for (entry_path, entry) in guard.iter() {
                merged.insert(entry_path.clone(), *entry);
            }
        }

        let mut entries: HashMap<String, EntryV2> = HashMap::with_capacity(merged.len());
        for (entry_path, entry) in &merged {
            entries.insert(
                entry_path.display().to_string(),
                EntryV2 {
                    mtime_ns: entry.mtime_ns,
                    size: entry.size,
                    hash: hex_encode(&entry.hash),
                },
            );
        }

        let on_disk = OnDisk {
            version: SCHEMA_VERSION,
            spec_hash: spec_hash.map(hex_encode),
            entries,
        };
        let serialized = serde_json::to_vec_pretty(&on_disk)
            .map_err(|e| std::io::Error::other(format!("merkle index encode: {e}")))?;

        let parent = match path.parent() {
            Some(dir) => dir,
            None => Path::new("."),
        };
        std::fs::create_dir_all(parent)?;

        // Write next to the destination so the final persist stays within one directory.
        let mut tmp = tempfile::NamedTempFile::new_in(parent)?;
        std::io::Write::write_all(&mut tmp, &serialized)?;
        tmp.as_file().sync_all()?;
        tmp.persist(path)
            .map(drop)
            .map_err(|failure| failure.error)
    }

    /// Returns the 32-byte BLAKE3 hash of `content`.
    pub fn hash_content(content: &[u8]) -> [u8; 32] {
        let digest = blake3::hash(content);
        *digest.as_bytes()
    }

    /// Returns `true` when `path` is known and its stored hash equals
    /// `content_hash`; unknown paths return `false`.
    pub fn unchanged(&self, path: &Path, content_hash: &[u8; 32]) -> bool {
        let guard = self.shards[shard_index(path)].lock();
        match guard.get(path) {
            Some(prev) => prev.hash == *content_hash,
            None => false,
        }
    }

    /// Returns `true` when `path` is known and both its stored `mtime_ns` and
    /// `size` equal the given values; unknown paths return `false`.
    pub fn metadata_unchanged(&self, path: &Path, mtime_ns: u64, size: u64) -> bool {
        let guard = self.shards[shard_index(path)].lock();
        matches!(
            guard.get(path),
            Some(prev) if prev.mtime_ns == mtime_ns && prev.size == size
        )
    }

    /// Returns the stored `(mtime_ns, size, hash)` for `path`, if any.
    pub fn lookup(&self, path: &Path) -> Option<(u64, u64, [u8; 32])> {
        let guard = self.shards[shard_index(path)].lock();
        let entry = guard.get(path)?;
        Some((entry.mtime_ns, entry.size, entry.hash))
    }

    /// Records `content_hash` for `path` with both metadata fields set to 0.
    pub fn record(&self, path: PathBuf, content_hash: [u8; 32]) {
        self.record_with_metadata(path, 0, 0, content_hash);
    }

    /// Inserts or replaces the entry for `path`.
    pub fn record_with_metadata(
        &self,
        path: PathBuf,
        mtime_ns: u64,
        size: u64,
        content_hash: [u8; 32],
    ) {
        let entry = CacheEntry {
            mtime_ns,
            size,
            hash: content_hash,
        };
        let shard = shard_index(&path);
        self.shards[shard].lock().insert(path, entry);
    }

    /// Total number of entries, summed shard by shard (each shard is locked
    /// only while it is being counted).
    pub fn len(&self) -> usize {
        let mut total = 0;
        for shard in &self.shards {
            total += shard.lock().len();
        }
        total
    }

    /// Returns `true` when no shard holds an entry.
    pub fn is_empty(&self) -> bool {
        !self.shards.iter().any(|shard| !shard.lock().is_empty())
    }
}
/// `Default` produces the same value as `MerkleIndex::empty`.
impl Default for MerkleIndex {
/// Returns an index with no entries.
fn default() -> Self {
Self::empty()
}
}
/// Returns `<cache dir>/keyhog/merkle.idx`, where the cache directory is
/// whatever `dirs::cache_dir()` reports, or `None` when it reports none.
pub fn default_cache_path() -> Option<PathBuf> {
    let base = dirs::cache_dir()?;
    Some(base.join("keyhog").join("merkle.idx"))
}
/// Minimum age, in seconds (one hour), a temporary file must reach before
/// `sweep_stale_tmp_files` removes it.
const STALE_TMP_CUTOFF_SECS: u64 = 60 * 60;
/// Best-effort removal of old temporary files sitting next to `cache_path`.
///
/// A directory entry is a candidate when its name starts with `<stem>.tmp`
/// (the stem of `cache_path`, falling back to `merkle`) or with `.tmp`.
/// Candidates whose modification time is at least `STALE_TMP_CUTOFF_SECS`
/// old are deleted; younger ones are left alone. Every error along the way
/// is ignored, and a debug line is logged when at least one file was removed.
fn sweep_stale_tmp_files(cache_path: &Path) {
    let Some(parent) = cache_path.parent() else {
        return;
    };
    let Ok(entries) = std::fs::read_dir(parent) else {
        return;
    };

    let stem = cache_path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("merkle");
    // Built once: the prefix does not change between directory entries.
    let stem_tmp_prefix = format!("{stem}.tmp");
    let now = std::time::SystemTime::now();

    let mut swept = 0usize;
    for entry in entries.flatten() {
        let file_name = entry.file_name();
        // Names that are not valid UTF-8 are never treated as tmp siblings.
        let looks_like_tmp = file_name
            .to_str()
            .is_some_and(|n| n.starts_with(&stem_tmp_prefix) || n.starts_with(".tmp"));
        if !looks_like_tmp {
            continue;
        }

        let candidate = entry.path();
        let Ok(modified) = candidate.metadata().and_then(|meta| meta.modified()) else {
            continue;
        };
        // A modification time in the future cannot be aged; skip it.
        let Ok(age) = now.duration_since(modified) else {
            continue;
        };

        if age.as_secs() >= STALE_TMP_CUTOFF_SECS && std::fs::remove_file(&candidate).is_ok() {
            swept += 1;
        }
    }

    if swept > 0 {
        tracing::debug!(
            count = swept,
            dir = %parent.display(),
            "swept stale cache tmp files left by an interrupted save"
        );
    }
}
/// Computes a 32-byte BLAKE3 fingerprint of a detector set.
///
/// For every detector the fingerprint covers its `id`, each pattern's `regex`
/// and `group`, and each companion's `name`, `regex`, `within_lines` and
/// `required` flag. No other detector field is hashed.
///
/// The result does not depend on the order of `detectors`, nor on the order
/// of patterns/companions inside one detector. Unlike a flat, globally sorted
/// key list, each detector is hashed as its own unit, so moving a pattern or
/// companion from one detector to another changes the fingerprint.
///
/// Note: the byte layout fed to the hasher differs from the earlier flat-list
/// scheme, so indexes saved under the old scheme are invalidated once.
pub fn compute_spec_hash(detectors: &[DetectorSpec]) -> [u8; 32] {
    let mut blocks: Vec<String> = detectors
        .iter()
        .map(|d| {
            let mut parts = Vec::with_capacity(d.patterns.len() + d.companions.len());
            for p in &d.patterns {
                parts.push(format!(
                    "p:{}|g:{}",
                    p.regex,
                    p.group.map(|g| g.to_string()).unwrap_or_default()
                ));
            }
            for c in &d.companions {
                parts.push(format!(
                    "c:{}|{}|w:{}|r:{}",
                    c.name, c.regex, c.within_lines, c.required
                ));
            }
            // Order inside a detector must not influence the fingerprint.
            parts.sort_unstable();

            // One canonical block per detector keeps the id bound to its own
            // patterns and companions.
            let mut block = format!("id:{}", d.id);
            for part in &parts {
                block.push('\n');
                block.push_str(part);
            }
            block
        })
        .collect();

    // Order of the detectors themselves must not matter either.
    blocks.sort_unstable();

    let mut hasher = blake3::Hasher::new();
    for block in &blocks {
        // The length prefix makes block boundaries unambiguous even when a
        // regex contains newlines.
        hasher.update(&(block.len() as u64).to_le_bytes());
        hasher.update(block.as_bytes());
    }
    *hasher.finalize().as_bytes()
}
/// Encodes 32 bytes as a 64-character lowercase hexadecimal string.
fn hex_encode(bytes: &[u8; 32]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        out.push(DIGITS[usize::from(byte >> 4)] as char);
        out.push(DIGITS[usize::from(byte & 0x0f)] as char);
    }
    out
}
/// Decodes a 64-character hexadecimal string into 32 bytes.
///
/// Upper- and lowercase digits are both accepted. Returns `None` when the
/// input is not exactly 64 bytes long or contains anything other than hex
/// digits. The input is inspected byte by byte, so arbitrary (including
/// multibyte UTF-8) input is rejected without panicking, and sign characters
/// such as `+` are not accepted.
fn hex_to_array(hex: &str) -> Option<[u8; 32]> {
    /// Value of one ASCII hex digit, or `None` for any other byte.
    fn nibble(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    let raw = hex.as_bytes();
    if raw.len() != 64 {
        return None;
    }

    let mut out = [0u8; 32];
    for (slot, pair) in out.iter_mut().zip(raw.chunks_exact(2)) {
        *slot = (nibble(pair[0])? << 4) | nibble(pair[1])?;
    }
    Some(out)
}
/// Unit tests for the in-memory index, its on-disk round trip, spec-hash
/// invalidation, merge-on-save behaviour and the stale tmp file sweep.
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for hashing test content.
    fn sample_hash(s: &[u8]) -> [u8; 32] {
        MerkleIndex::hash_content(s)
    }

    #[test]
    fn record_and_unchanged_roundtrip() {
        let idx = MerkleIndex::empty();
        let p = PathBuf::from("/tmp/example.env");
        let h = sample_hash(b"DB_PASS=secret123");
        idx.record(p.clone(), h);
        assert!(idx.unchanged(&p, &h));
        let h2 = sample_hash(b"DB_PASS=changed");
        assert!(!idx.unchanged(&p, &h2));
    }

    #[test]
    fn metadata_unchanged_matches_only_on_exact_pair() {
        let idx = MerkleIndex::empty();
        let p = PathBuf::from("/tmp/file");
        idx.record_with_metadata(p.clone(), 1_700_000_000_000_000_000, 4096, sample_hash(b"x"));
        assert!(idx.metadata_unchanged(&p, 1_700_000_000_000_000_000, 4096));
        assert!(!idx.metadata_unchanged(&p, 1_700_000_000_000_000_001, 4096));
        assert!(!idx.metadata_unchanged(&p, 1_700_000_000_000_000_000, 4097));
        assert!(!idx.metadata_unchanged(Path::new("/never/seen"), 0, 0));
    }

    #[test]
    fn lookup_returns_full_tuple() {
        let idx = MerkleIndex::empty();
        let p = PathBuf::from("/tmp/file");
        let h = sample_hash(b"abc");
        idx.record_with_metadata(p.clone(), 42, 99, h);
        assert_eq!(idx.lookup(&p), Some((42, 99, h)));
        assert_eq!(idx.lookup(Path::new("/missing")), None);
    }

    #[test]
    fn unknown_path_is_changed() {
        let idx = MerkleIndex::empty();
        let h = sample_hash(b"x");
        assert!(!idx.unchanged(Path::new("/never/seen"), &h));
    }

    #[test]
    fn save_and_load_preserves_entries() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let idx = MerkleIndex::empty();
        let p = PathBuf::from("/tmp/secrets.env");
        let h = sample_hash(b"hello world");
        idx.record_with_metadata(p.clone(), 12345, 11, h);
        idx.save(&cache_path).expect("save");
        let loaded = MerkleIndex::load(&cache_path);
        assert_eq!(loaded.len(), 1);
        assert!(loaded.unchanged(&p, &h));
        assert!(loaded.metadata_unchanged(&p, 12345, 11));
    }

    #[test]
    fn corrupted_cache_treated_as_cold_start() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        std::fs::write(&cache_path, b"this is not json").unwrap();
        let loaded = MerkleIndex::load(&cache_path);
        assert!(loaded.is_empty());
    }

    #[test]
    fn missing_cache_returns_empty() {
        let loaded = MerkleIndex::load(Path::new("/definitely/does/not/exist.idx"));
        assert!(loaded.is_empty());
    }

    #[test]
    fn schema_version_mismatch_treated_as_cold_start() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let bad = serde_json::json!({
            "version": 99,
            "entries": { "/foo": { "mtime_ns": 0, "size": 0, "hash": "00".repeat(32) } }
        });
        std::fs::write(&cache_path, serde_json::to_vec(&bad).unwrap()).unwrap();
        let loaded = MerkleIndex::load(&cache_path);
        assert!(loaded.is_empty());
    }

    #[test]
    fn v1_legacy_format_treated_as_cold_start() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let v1 = serde_json::json!({
            "version": 1,
            "entries": { "/foo": "ab".repeat(32) }
        });
        std::fs::write(&cache_path, serde_json::to_vec(&v1).unwrap()).unwrap();
        assert!(MerkleIndex::load(&cache_path).is_empty());
    }

    #[test]
    fn save_with_spec_then_load_with_matching_spec_keeps_entries() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let idx = MerkleIndex::empty();
        let p = PathBuf::from("/tmp/x");
        let h = sample_hash(b"x");
        idx.record_with_metadata(p.clone(), 7, 1, h);
        let spec = [42u8; 32];
        idx.save_with_spec(&cache_path, &spec).unwrap();
        let loaded = MerkleIndex::load_with_spec(&cache_path, &spec);
        assert_eq!(loaded.len(), 1);
        assert!(loaded.metadata_unchanged(&p, 7, 1));
    }

    #[test]
    fn load_with_mismatched_spec_invalidates_cache() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let idx = MerkleIndex::empty();
        idx.record_with_metadata(PathBuf::from("/tmp/x"), 7, 1, sample_hash(b"x"));
        idx.save_with_spec(&cache_path, &[42u8; 32]).unwrap();
        let loaded = MerkleIndex::load_with_spec(&cache_path, &[7u8; 32]);
        assert!(loaded.is_empty());
    }

    #[test]
    fn load_with_spec_when_disk_has_no_spec_invalidates() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let idx = MerkleIndex::empty();
        idx.record_with_metadata(PathBuf::from("/tmp/x"), 1, 1, sample_hash(b"x"));
        idx.save(&cache_path).unwrap();
        let loaded = MerkleIndex::load_with_spec(&cache_path, &[1u8; 32]);
        assert!(loaded.is_empty());
    }

    #[test]
    fn compute_spec_hash_is_stable_under_reordering() {
        use crate::spec::{CompanionSpec, DetectorSpec, PatternSpec, Severity};
        let make = |id: &str| DetectorSpec {
            id: id.to_string(),
            name: id.to_string(),
            service: id.to_string(),
            severity: Severity::Medium,
            keywords: vec![],
            patterns: vec![PatternSpec {
                regex: format!("{id}-[A-Z]+"),
                description: None,
                group: None,
            }],
            companions: vec![CompanionSpec {
                name: "k".into(),
                regex: "v=([A-Z]+)".into(),
                within_lines: 3,
                required: false,
            }],
            verify: None,
        };
        let a = compute_spec_hash(&[make("alpha"), make("beta")]);
        let b = compute_spec_hash(&[make("beta"), make("alpha")]);
        assert_eq!(a, b, "spec hash must be order-invariant");
        let c = compute_spec_hash(&[make("alpha"), make("gamma")]);
        assert_ne!(a, c, "different detectors must produce different hashes");
    }

    #[test]
    fn save_merges_with_existing_disk_entries() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let spec = [42u8; 32];
        let idx_a = MerkleIndex::empty();
        idx_a.record_with_metadata(
            PathBuf::from("/a/file"),
            100,
            10,
            sample_hash(b"a contents"),
        );
        idx_a.save_with_spec(&cache_path, &spec).unwrap();
        let idx_b = MerkleIndex::empty();
        idx_b.record_with_metadata(
            PathBuf::from("/b/file"),
            200,
            20,
            sample_hash(b"b contents"),
        );
        idx_b.save_with_spec(&cache_path, &spec).unwrap();
        let loaded = MerkleIndex::load_with_spec(&cache_path, &spec);
        assert_eq!(loaded.len(), 2);
        assert!(loaded.metadata_unchanged(Path::new("/a/file"), 100, 10));
        assert!(loaded.metadata_unchanged(Path::new("/b/file"), 200, 20));
    }

    #[test]
    fn save_overwrites_disk_entry_for_same_path() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let spec = [42u8; 32];
        let idx_old = MerkleIndex::empty();
        idx_old.record_with_metadata(PathBuf::from("/x"), 100, 10, sample_hash(b"old"));
        idx_old.save_with_spec(&cache_path, &spec).unwrap();
        let idx_new = MerkleIndex::empty();
        idx_new.record_with_metadata(PathBuf::from("/x"), 200, 20, sample_hash(b"new"));
        idx_new.save_with_spec(&cache_path, &spec).unwrap();
        let loaded = MerkleIndex::load_with_spec(&cache_path, &spec);
        assert_eq!(loaded.len(), 1);
        assert!(loaded.metadata_unchanged(Path::new("/x"), 200, 20));
        assert!(!loaded.metadata_unchanged(Path::new("/x"), 100, 10));
    }

    #[test]
    fn load_sweeps_stale_tmp_files_left_by_killed_processes() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");

        // A leftover whose mtime is pushed two hours into the past. Backdating
        // is only available on some platforms, so remember whether it worked.
        let old_tmp = dir.path().join(".tmpABCDEF");
        std::fs::write(&old_tmp, b"stale leftover").unwrap();
        let two_hours_ago =
            std::time::SystemTime::now() - std::time::Duration::from_secs(2 * 60 * 60);
        let mtime_backdated = filetime_workaround::set_mtime(&old_tmp, two_hours_ago).is_ok();

        // A tmp file that looks like an in-flight save, and an unrelated sibling.
        let fresh_tmp = dir.path().join(".tmpFRESH");
        std::fs::write(&fresh_tmp, b"in-flight save").unwrap();
        let unrelated = dir.path().join("unrelated.json");
        std::fs::write(&unrelated, b"keep me").unwrap();

        let _ = MerkleIndex::load(&cache_path);

        assert!(
            fresh_tmp.exists(),
            "sweep deleted a fresh tmp file — race with in-flight save"
        );
        assert!(
            unrelated.exists(),
            "sweep deleted an unrelated sibling file"
        );
        // Only meaningful when the file really looks old to the sweep.
        if mtime_backdated {
            assert!(
                !old_tmp.exists(),
                "sweep left a stale tmp file behind"
            );
        }
    }

    /// Minimal mtime setter so the sweep test does not need an extra crate.
    mod filetime_workaround {
        use std::path::Path;
        use std::time::SystemTime;

        /// Sets both timestamps passed to `utimensat` for `path` to `t`.
        #[cfg(unix)]
        pub fn set_mtime(path: &Path, t: SystemTime) -> std::io::Result<()> {
            use std::ffi::CString;
            use std::os::unix::ffi::OsStrExt;
            let dur = t
                .duration_since(SystemTime::UNIX_EPOCH)
                .map_err(std::io::Error::other)?;
            let cpath =
                CString::new(path.as_os_str().as_bytes()).map_err(std::io::Error::other)?;
            let times = [
                libc::timespec {
                    tv_sec: dur.as_secs() as libc::time_t,
                    tv_nsec: dur.subsec_nanos() as libc::c_long,
                },
                libc::timespec {
                    tv_sec: dur.as_secs() as libc::time_t,
                    tv_nsec: dur.subsec_nanos() as libc::c_long,
                },
            ];
            // SAFETY: `cpath` is a valid NUL-terminated string and `times` is a
            // two-element array; both outlive the call, and `utimensat` only
            // reads through the pointers.
            let rc = unsafe {
                libc::utimensat(
                    libc::AT_FDCWD,
                    cpath.as_ptr(),
                    times.as_ptr(),
                    libc::AT_SYMLINK_NOFOLLOW,
                )
            };
            if rc == 0 {
                Ok(())
            } else {
                Err(std::io::Error::last_os_error())
            }
        }

        /// Not supported on this platform; always returns `Unsupported`.
        #[cfg(not(unix))]
        pub fn set_mtime(_path: &Path, _t: SystemTime) -> std::io::Result<()> {
            Err(std::io::ErrorKind::Unsupported.into())
        }
    }

    #[test]
    fn save_drops_stale_spec_entries_on_disk() {
        let dir = tempfile::tempdir().unwrap();
        let cache_path = dir.path().join("merkle.idx");
        let idx_old = MerkleIndex::empty();
        idx_old.record_with_metadata(PathBuf::from("/from-old-spec"), 1, 1, sample_hash(b"x"));
        idx_old.save_with_spec(&cache_path, &[1u8; 32]).unwrap();
        let idx_new = MerkleIndex::empty();
        idx_new.record_with_metadata(PathBuf::from("/from-new-spec"), 2, 2, sample_hash(b"y"));
        idx_new.save_with_spec(&cache_path, &[2u8; 32]).unwrap();
        let loaded = MerkleIndex::load_with_spec(&cache_path, &[2u8; 32]);
        assert_eq!(loaded.len(), 1);
        assert!(loaded.metadata_unchanged(Path::new("/from-new-spec"), 2, 2));
    }
}