use std::{
collections::HashMap,
fs::{File, OpenOptions},
io::{BufRead, BufReader, Write},
path::Path,
sync::Mutex,
};
use anyhow::{Context, Result};
use linprov_common::siphash;
use log::{debug, warn};
use crate::encoding::{b64, escape, unb64};
/// Persistent lookup table from 64-bit keyed hashes back to the strings that
/// produced them.
///
/// The table lives in memory and is mirrored to a tab-separated file: existing
/// rows are loaded by [`HashDb::open`], and each previously unseen string
/// passed to [`HashDb::record`] is appended as a new row.
pub struct HashDb {
/// In-memory index: hash value -> original string.
map: Mutex<HashMap<u64, String>>,
/// Handle to the backing file, opened in append mode by [`HashDb::open`].
writer: Mutex<File>,
/// First key argument passed to `siphash` (presumably the low half of the
/// 128-bit SipHash key -- confirm against `linprov_common::siphash`).
k0: u64,
/// Second key argument passed to `siphash` (presumably the high key half).
k1: u64,
}
impl HashDb {
pub fn open(path: &Path, k0: u64, k1: u64) -> Result<Self> {
if let Some(parent) = path.parent() {
if !parent.as_os_str().is_empty() {
std::fs::create_dir_all(parent)
.with_context(|| format!("creating `{}`", parent.display()))?;
}
}
let mut map = HashMap::new();
match File::open(path) {
Ok(f) => {
for (i, line) in BufReader::new(f).lines().enumerate() {
let line = match line {
Ok(l) => l,
Err(e) => {
warn!("hashdb: read error on line {}: {e}", i + 1);
break;
}
};
let Some((hex, path_str)) = line.split_once('\t') else {
continue; };
if let Ok(hash) = u64::from_str_radix(hex.trim(), 16) {
let raw = match path_str.split_once('\t') {
Some((b64_col, _human)) => match unb64(b64_col) {
Some(p) => p,
None => continue, },
None => path_str.to_string(), };
map.insert(hash, raw);
}
}
debug!(
"hashdb: loaded {} entries from {}",
map.len(),
path.display()
);
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(e).with_context(|| format!("opening `{}`", path.display())),
}
let writer = OpenOptions::new()
.create(true)
.append(true)
.open(path)
.with_context(|| format!("opening `{}` for append", path.display()))?;
Ok(Self {
map: Mutex::new(map),
writer: Mutex::new(writer),
k0,
k1,
})
}
pub fn record(&self, s: &str) -> u64 {
let hash = siphash(self.k0, self.k1, s.as_bytes());
let mut map = self.map.lock().expect("hashdb map mutex poisoned");
if map.contains_key(&hash) {
return hash;
}
map.insert(hash, s.to_string()); let mut w = self.writer.lock().expect("hashdb writer mutex poisoned");
if let Err(e) = writeln!(w, "{hash:016x}\t{}\t{}", b64(s), escape(s)) {
warn!("hashdb: failed to append {hash:016x}: {e}");
}
hash
}
pub fn resolve(&self, hash: u64) -> Option<String> {
if hash == 0 {
return None;
}
self.map
.lock()
.expect("hashdb map mutex poisoned")
.get(&hash)
.cloned()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Fixed, arbitrary key values handed to `HashDb::open` and `siphash`.
    const K0: u64 = 0x1111_2222_3333_4444;
    const K1: u64 = 0x5555_6666_7777_8888;

    /// A recorded string resolves back to itself; hash `0` and an unknown
    /// hash both resolve to `None`.
    #[test]
    fn record_resolve_roundtrip() {
        let tmp = tempdir().unwrap();
        let db_file = tmp.path().join("hashes.tsv");
        let db = HashDb::open(&db_file, K0, K1).unwrap();

        let input = "/home/user/Downloads/";
        let got = db.record(input);

        assert_eq!(got, siphash(K0, K1, input.as_bytes()));
        assert_eq!(db.resolve(got).as_deref(), Some(input));
        assert_eq!(db.resolve(0), None);
        assert_eq!(db.resolve(0xdead_beef), None);
    }

    /// Recording the same string twice writes a single row, and every row
    /// is available again after reopening the file.
    #[test]
    fn dedup_and_persist() {
        let tmp = tempdir().unwrap();
        let db_file = tmp.path().join("hashes.tsv");

        // Scope the first handle so it is dropped before the file is read back.
        {
            let db = HashDb::open(&db_file, K0, K1).unwrap();
            for s in ["/a/b/", "/a/b/", "foo.sh"] {
                db.record(s);
            }
        }

        let body = std::fs::read_to_string(&db_file).unwrap();
        assert_eq!(body.lines().count(), 2, "body was:\n{body}");

        let reopened = HashDb::open(&db_file, K0, K1).unwrap();
        let dir_hash = siphash(K0, K1, b"/a/b/");
        let file_hash = siphash(K0, K1, b"foo.sh");
        assert_eq!(reopened.resolve(dir_hash).as_deref(), Some("/a/b/"));
        assert_eq!(reopened.resolve(file_hash).as_deref(), Some("foo.sh"));
    }

    /// A string containing tabs and newlines is stored as exactly one
    /// three-column row and resolves to the original after a reload.
    #[test]
    fn injection_path_stored_base64_resolves_raw() {
        let tmp = tempdir().unwrap();
        let db_file = tmp.path().join("hashes.tsv");
        let evil = "/tmp/a\tb\nDEADBEEFDEADBEEF\t/etc/shadow";

        let recorded = {
            let db = HashDb::open(&db_file, K0, K1).unwrap();
            db.record(evil)
        };

        let body = std::fs::read_to_string(&db_file).unwrap();
        assert_eq!(body.lines().count(), 1, "expected one row, got:\n{body}");
        let cols: Vec<&str> = body.trim_end().split('\t').collect();
        assert_eq!(cols.len(), 3, "want 3 columns, got {cols:?}");
        assert_eq!(recorded, siphash(K0, K1, evil.as_bytes()));

        let reopened = HashDb::open(&db_file, K0, K1).unwrap();
        assert_eq!(reopened.resolve(recorded).as_deref(), Some(evil));
    }

    /// A hand-written two-column row (`hash<TAB>raw`) is loaded verbatim.
    #[test]
    fn legacy_two_column_rows_still_load() {
        let tmp = tempdir().unwrap();
        let db_file = tmp.path().join("hashes.tsv");
        let raw = "/home/u/Downloads/x";
        let legacy_hash: u64 = 0x0123_4567_89ab_cdef;

        // Write the fixture in one shot; the file is closed before `open` runs.
        std::fs::write(&db_file, format!("{legacy_hash:016x}\t{raw}\n")).unwrap();

        let db = HashDb::open(&db_file, K0, K1).unwrap();
        assert_eq!(db.resolve(legacy_hash).as_deref(), Some(raw));
    }
}