#![allow(dead_code)]
use std::collections::{HashMap, VecDeque};
use std::fmt;
use std::hash::Hash;
use std::path::Path;
use std::time::UNIX_EPOCH;
use crate::{DedupError, DedupResult};
#[derive(Debug, Clone, Default)]
pub struct CacheStats {
pub hits: u64,
pub misses: u64,
pub insertions: u64,
pub evictions: u64,
}
impl CacheStats {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn hit_rate(&self) -> f64 {
let total = self.hits + self.misses;
if total == 0 {
return 0.0;
}
self.hits as f64 / total as f64
}
#[must_use]
pub fn total_lookups(&self) -> u64 {
self.hits + self.misses
}
pub fn reset(&mut self) {
self.hits = 0;
self.misses = 0;
self.insertions = 0;
self.evictions = 0;
}
}
impl fmt::Display for CacheStats {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"hits={}, misses={}, hit_rate={:.2}%, evictions={}",
self.hits,
self.misses,
self.hit_rate() * 100.0,
self.evictions,
)
}
}
struct LruNode<K, V> {
key: K,
value: V,
prev: usize,
next: usize,
}
pub struct LruCache<K, V> {
capacity: usize,
map: HashMap<K, usize>,
nodes: Vec<LruNode<K, V>>,
head: usize,
tail: usize,
free: Vec<usize>,
stats: CacheStats,
}
const NONE: usize = usize::MAX;
impl<K: Clone + Eq + Hash, V> LruCache<K, V> {
#[must_use]
pub fn new(capacity: usize) -> Self {
assert!(capacity > 0, "LRU cache capacity must be > 0");
Self {
capacity,
map: HashMap::with_capacity(capacity),
nodes: Vec::with_capacity(capacity),
head: NONE,
tail: NONE,
free: Vec::new(),
stats: CacheStats::new(),
}
}
pub fn get(&mut self, key: &K) -> Option<&V> {
if let Some(&idx) = self.map.get(key) {
self.stats.hits += 1;
self.move_to_head(idx);
Some(&self.nodes[idx].value)
} else {
self.stats.misses += 1;
None
}
}
#[must_use]
pub fn contains(&self, key: &K) -> bool {
self.map.contains_key(key)
}
pub fn insert(&mut self, key: K, value: V) -> Option<(K, V)> {
if let Some(&idx) = self.map.get(&key) {
self.nodes[idx].value = value;
self.move_to_head(idx);
self.stats.insertions += 1;
return None;
}
self.stats.insertions += 1;
let evicted = if self.map.len() >= self.capacity {
self.evict_tail()
} else {
None
};
let idx = if let Some(free_idx) = self.free.pop() {
self.nodes[free_idx] = LruNode {
key: key.clone(),
value,
prev: NONE,
next: NONE,
};
free_idx
} else {
let idx = self.nodes.len();
self.nodes.push(LruNode {
key: key.clone(),
value,
prev: NONE,
next: NONE,
});
idx
};
self.map.insert(key, idx);
self.push_head(idx);
evicted
}
pub fn remove(&mut self, key: &K) -> Option<V> {
if let Some(idx) = self.map.remove(key) {
self.unlink(idx);
self.free.push(idx);
None
} else {
None
}
}
#[must_use]
pub fn len(&self) -> usize {
self.map.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.map.is_empty()
}
#[must_use]
pub fn capacity(&self) -> usize {
self.capacity
}
#[must_use]
pub fn stats(&self) -> &CacheStats {
&self.stats
}
pub fn clear(&mut self) {
self.map.clear();
self.nodes.clear();
self.free.clear();
self.head = NONE;
self.tail = NONE;
}
fn unlink(&mut self, idx: usize) {
let prev = self.nodes[idx].prev;
let next = self.nodes[idx].next;
if prev != NONE {
self.nodes[prev].next = next;
} else {
self.head = next;
}
if next != NONE {
self.nodes[next].prev = prev;
} else {
self.tail = prev;
}
self.nodes[idx].prev = NONE;
self.nodes[idx].next = NONE;
}
fn push_head(&mut self, idx: usize) {
self.nodes[idx].prev = NONE;
self.nodes[idx].next = self.head;
if self.head != NONE {
self.nodes[self.head].prev = idx;
}
self.head = idx;
if self.tail == NONE {
self.tail = idx;
}
}
fn move_to_head(&mut self, idx: usize) {
if self.head == idx {
return;
}
self.unlink(idx);
self.push_head(idx);
}
fn evict_tail(&mut self) -> Option<(K, V)> {
if self.tail == NONE {
return None;
}
let tail_idx = self.tail;
let evicted_key = self.nodes[tail_idx].key.clone();
self.unlink(tail_idx);
self.map.remove(&evicted_key);
self.free.push(tail_idx);
self.stats.evictions += 1;
None
}
}
impl<K: Clone + Eq + Hash + fmt::Debug, V> fmt::Debug for LruCache<K, V> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("LruCache")
.field("capacity", &self.capacity)
.field("len", &self.map.len())
.field("stats", &self.stats)
.finish()
}
}
pub struct HashCache {
inner: LruCache<String, String>,
}
impl HashCache {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
inner: LruCache::new(capacity),
}
}
pub fn get(&mut self, path: &str) -> Option<&String> {
self.inner.get(&path.to_string())
}
pub fn insert(&mut self, path: String, hash: String) {
self.inner.insert(path, hash);
}
#[must_use]
pub fn contains(&self, path: &str) -> bool {
self.inner.contains(&path.to_string())
}
#[must_use]
pub fn stats(&self) -> &CacheStats {
self.inner.stats()
}
#[must_use]
pub fn len(&self) -> usize {
self.inner.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
pub fn clear(&mut self) {
self.inner.clear();
}
}
impl fmt::Debug for HashCache {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("HashCache")
.field("capacity", &self.inner.capacity())
.field("len", &self.inner.len())
.field("stats", &self.inner.stats())
.finish()
}
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CacheEntry {
pub phash: Option<u64>,
pub fingerprint: Option<Vec<f32>>,
pub mtime_secs: u64,
}
pub struct DedupSessionCache {
capacity: usize,
entries: HashMap<u64, CacheEntry>,
lru_order: VecDeque<u64>,
}
fn fnv1a_64(bytes: &[u8]) -> u64 {
const OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
const PRIME: u64 = 0x0000_0100_0000_01b3;
bytes
.iter()
.fold(OFFSET, |acc, &b| (acc ^ u64::from(b)).wrapping_mul(PRIME))
}
fn mtime_secs(path: &Path) -> DedupResult<u64> {
let meta = std::fs::metadata(path)?;
let mtime = meta
.modified()
.map_err(|e| DedupError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))?;
let secs = mtime
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
Ok(secs)
}
impl DedupSessionCache {
#[must_use]
pub fn new(capacity: usize) -> Self {
let capacity = capacity.max(1);
Self {
capacity,
entries: HashMap::with_capacity(capacity),
lru_order: VecDeque::with_capacity(capacity),
}
}
fn promote(&mut self, key: u64) {
if let Some(pos) = self.lru_order.iter().position(|&k| k == key) {
self.lru_order.remove(pos);
}
self.lru_order.push_front(key);
}
fn evict_if_needed(&mut self) {
while self.entries.len() >= self.capacity {
if let Some(lru_key) = self.lru_order.pop_back() {
self.entries.remove(&lru_key);
} else {
break;
}
}
}
pub fn get_or_compute_phash(
&mut self,
path: &Path,
compute: impl FnOnce() -> DedupResult<u64>,
) -> DedupResult<u64> {
let key = fnv1a_64(path.as_os_str().as_encoded_bytes());
let current_mtime = mtime_secs(path)?;
if let Some(entry) = self.entries.get(&key) {
if entry.mtime_secs == current_mtime {
if let Some(ph) = entry.phash {
self.promote(key);
return Ok(ph);
}
}
}
let ph = compute()?;
self.evict_if_needed();
let entry = self.entries.entry(key).or_insert_with(|| CacheEntry {
phash: None,
fingerprint: None,
mtime_secs: current_mtime,
});
entry.phash = Some(ph);
entry.mtime_secs = current_mtime;
self.promote(key);
Ok(ph)
}
#[must_use]
pub fn len(&self) -> usize {
self.entries.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
pub fn save_to_file(&self, path: &Path) -> DedupResult<()> {
let json = serde_json::to_string(&self.entries)
.map_err(|e| DedupError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))?;
std::fs::write(path, json)?;
Ok(())
}
pub fn load_from_file(path: &Path) -> DedupResult<Self> {
let json = std::fs::read_to_string(path)?;
let entries: HashMap<u64, CacheEntry> = serde_json::from_str(&json)
.map_err(|e| DedupError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))?;
let capacity = entries.len().max(1);
let lru_order = entries.keys().cloned().collect();
Ok(Self {
capacity,
entries,
lru_order,
})
}
}
impl fmt::Debug for DedupSessionCache {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("DedupSessionCache")
.field("capacity", &self.capacity)
.field("len", &self.entries.len())
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cache_stats_default() {
let stats = CacheStats::new();
assert_eq!(stats.hits, 0);
assert_eq!(stats.misses, 0);
assert_eq!(stats.total_lookups(), 0);
assert!((stats.hit_rate() - 0.0).abs() < f64::EPSILON);
}
#[test]
fn test_cache_stats_hit_rate() {
let stats = CacheStats {
hits: 3,
misses: 1,
insertions: 4,
evictions: 0,
};
assert!((stats.hit_rate() - 0.75).abs() < 1e-10);
assert_eq!(stats.total_lookups(), 4);
}
#[test]
fn test_cache_stats_display() {
let stats = CacheStats {
hits: 10,
misses: 5,
insertions: 15,
evictions: 2,
};
let s = stats.to_string();
assert!(s.contains("hits=10"));
assert!(s.contains("misses=5"));
}
#[test]
fn test_cache_stats_reset() {
let mut stats = CacheStats {
hits: 10,
misses: 5,
insertions: 15,
evictions: 2,
};
stats.reset();
assert_eq!(stats.hits, 0);
assert_eq!(stats.misses, 0);
}
#[test]
fn test_lru_cache_insert_and_get() {
let mut cache = LruCache::new(4);
cache.insert("a", 1);
cache.insert("b", 2);
assert_eq!(cache.get(&"a"), Some(&1));
assert_eq!(cache.get(&"b"), Some(&2));
assert_eq!(cache.get(&"c"), None);
}
#[test]
fn test_lru_cache_update_existing() {
let mut cache = LruCache::new(4);
cache.insert("key", 10);
cache.insert("key", 20);
assert_eq!(cache.get(&"key"), Some(&20));
assert_eq!(cache.len(), 1);
}
#[test]
fn test_lru_cache_eviction() {
let mut cache = LruCache::new(3);
cache.insert("a", 1);
cache.insert("b", 2);
cache.insert("c", 3);
cache.insert("d", 4);
assert!(!cache.contains(&"a"));
assert!(cache.contains(&"b"));
assert!(cache.contains(&"c"));
assert!(cache.contains(&"d"));
assert_eq!(cache.stats().evictions, 1);
}
#[test]
fn test_lru_cache_access_promotes() {
let mut cache = LruCache::new(3);
cache.insert("a", 1);
cache.insert("b", 2);
cache.insert("c", 3);
cache.get(&"a");
cache.insert("d", 4);
assert!(cache.contains(&"a"));
assert!(!cache.contains(&"b"));
}
#[test]
fn test_lru_cache_stats() {
let mut cache = LruCache::new(4);
cache.insert("x", 1);
cache.get(&"x"); cache.get(&"y"); assert_eq!(cache.stats().hits, 1);
assert_eq!(cache.stats().misses, 1);
assert_eq!(cache.stats().insertions, 1);
}
#[test]
fn test_lru_cache_clear() {
let mut cache = LruCache::new(4);
cache.insert("a", 1);
cache.insert("b", 2);
assert_eq!(cache.len(), 2);
cache.clear();
assert_eq!(cache.len(), 0);
assert!(cache.is_empty());
}
#[test]
fn test_hash_cache_basic() {
let mut cache = HashCache::new(100);
cache.insert("/video/a.mp4".to_string(), "abc123".to_string());
assert!(cache.contains("/video/a.mp4"));
assert!(!cache.contains("/video/b.mp4"));
assert_eq!(cache.get("/video/a.mp4"), Some(&"abc123".to_string()));
assert_eq!(cache.len(), 1);
}
#[test]
fn test_hash_cache_clear() {
let mut cache = HashCache::new(10);
cache.insert("path".to_string(), "hash".to_string());
assert!(!cache.is_empty());
cache.clear();
assert!(cache.is_empty());
}
#[test]
fn test_hash_cache_eviction() {
let mut cache = HashCache::new(2);
cache.insert("a".to_string(), "h1".to_string());
cache.insert("b".to_string(), "h2".to_string());
cache.insert("c".to_string(), "h3".to_string());
assert!(!cache.contains("a"));
assert!(cache.contains("b"));
assert!(cache.contains("c"));
assert_eq!(cache.stats().evictions, 1);
}
fn write_temp_file(content: &[u8]) -> std::path::PathBuf {
use std::sync::atomic::{AtomicU64, Ordering};
static COUNTER: AtomicU64 = AtomicU64::new(0);
let dir = std::env::temp_dir();
let uid = COUNTER.fetch_add(1, Ordering::Relaxed);
let pid = std::process::id();
let path = dir.join(format!("dedup_cache_test_{pid}_{uid}.bin"));
std::fs::write(&path, content).expect("write temp file");
path
}
#[test]
fn test_cache_hit_returns_cached_value() {
let tmp = write_temp_file(b"hello");
let mut cache = DedupSessionCache::new(16);
let mut calls = 0usize;
let ph1 = cache
.get_or_compute_phash(&tmp, || {
calls += 1;
Ok(0xDEAD_BEEF_u64)
})
.expect("first call should succeed");
assert_eq!(ph1, 0xDEAD_BEEF);
assert_eq!(calls, 1);
let ph2 = cache
.get_or_compute_phash(&tmp, || {
calls += 1;
Ok(0u64)
})
.expect("second call should succeed");
assert_eq!(ph2, 0xDEAD_BEEF, "cached value should be returned on hit");
assert_eq!(calls, 1, "compute closure must not be called on cache hit");
let _ = std::fs::remove_file(&tmp);
}
#[test]
fn test_cache_lru_eviction_at_capacity() {
let tmp_a = write_temp_file(b"file_a");
let tmp_b = write_temp_file(b"file_b");
let tmp_c = write_temp_file(b"file_c");
let mut cache = DedupSessionCache::new(2);
cache
.get_or_compute_phash(&tmp_a, || Ok(1))
.expect("insert a");
cache
.get_or_compute_phash(&tmp_b, || Ok(2))
.expect("insert b");
cache
.get_or_compute_phash(&tmp_c, || Ok(3))
.expect("insert c");
assert_eq!(cache.len(), 2, "cache should not exceed capacity");
let _ = std::fs::remove_file(&tmp_a);
let _ = std::fs::remove_file(&tmp_b);
let _ = std::fs::remove_file(&tmp_c);
}
#[test]
fn test_cache_mtime_invalidation() {
let tmp = write_temp_file(b"original content");
let mut cache = DedupSessionCache::new(16);
let ph1 = cache
.get_or_compute_phash(&tmp, || Ok(0xAAAA_AAAA))
.expect("first compute");
assert_eq!(ph1, 0xAAAA_AAAA);
std::fs::write(&tmp, b"modified content").expect("rewrite temp file");
let key = fnv1a_64(tmp.as_os_str().as_encoded_bytes());
if let Some(e) = cache.entries.get_mut(&key) {
e.mtime_secs = 0; }
let mut recomputed = false;
let ph2 = cache
.get_or_compute_phash(&tmp, || {
recomputed = true;
Ok(0xBBBB_BBBB)
})
.expect("second compute");
assert!(recomputed, "stale mtime should trigger recomputation");
assert_eq!(ph2, 0xBBBB_BBBB);
let _ = std::fs::remove_file(&tmp);
}
#[test]
fn test_cache_save_load_roundtrip() {
use std::sync::atomic::{AtomicU64, Ordering};
static CACHE_COUNTER: AtomicU64 = AtomicU64::new(0);
let dir = std::env::temp_dir();
let tmp_file = write_temp_file(b"roundtrip test");
let pid = std::process::id();
let uid = CACHE_COUNTER.fetch_add(1, Ordering::Relaxed);
let cache_path = dir.join(format!("dedup_session_cache_test_{pid}_{uid}.json"));
let mut cache = DedupSessionCache::new(16);
cache
.get_or_compute_phash(&tmp_file, || Ok(0x1234_5678_9ABC_DEF0))
.expect("compute phash");
cache.save_to_file(&cache_path).expect("save to file");
let loaded = DedupSessionCache::load_from_file(&cache_path).expect("load from file");
assert_eq!(
cache.entries.len(),
loaded.entries.len(),
"entry count must survive round-trip"
);
let key = fnv1a_64(tmp_file.as_os_str().as_encoded_bytes());
let loaded_entry = loaded.entries.get(&key).expect("entry must be present");
assert_eq!(
loaded_entry.phash,
Some(0x1234_5678_9ABC_DEF0),
"phash must survive round-trip"
);
let _ = std::fs::remove_file(&tmp_file);
let _ = std::fs::remove_file(&cache_path);
}
}