#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
use std::path::{Path, PathBuf};
use rayon::prelude::*;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DedupEntry {
pub id: u64,
pub content_hash: Vec<u8>,
pub size_bytes: u64,
pub first_seen_epoch: u64,
pub occurrence_count: u32,
}
impl DedupEntry {
#[must_use]
pub fn is_duplicate(&self) -> bool {
self.occurrence_count > 1
}
#[must_use]
pub fn space_savings(&self) -> u64 {
if self.occurrence_count <= 1 {
return 0;
}
(self.occurrence_count as u64 - 1).saturating_mul(self.size_bytes)
}
#[must_use]
pub fn first_seen(&self) -> u64 {
self.first_seen_epoch
}
#[must_use]
pub fn hash_hex(&self) -> String {
self.content_hash
.iter()
.map(|b| format!("{b:02x}"))
.collect()
}
}
pub struct DedupIndex {
pub entries: Vec<DedupEntry>,
pub next_id: u64,
}
impl DedupIndex {
#[must_use]
pub fn new() -> Self {
Self {
entries: Vec::new(),
next_id: 1,
}
}
pub fn add_or_increment(&mut self, hash: Vec<u8>, size_bytes: u64, epoch: u64) -> u64 {
if let Some(entry) = self.entries.iter_mut().find(|e| e.content_hash == hash) {
entry.occurrence_count += 1;
return entry.id;
}
let id = self.next_id;
self.next_id += 1;
self.entries.push(DedupEntry {
id,
content_hash: hash,
size_bytes,
first_seen_epoch: epoch,
occurrence_count: 1,
});
id
}
#[must_use]
pub fn find_by_hash(&self, hash: &[u8]) -> Option<&DedupEntry> {
self.entries.iter().find(|e| e.content_hash == hash)
}
#[must_use]
pub fn find_by_id(&self, id: u64) -> Option<&DedupEntry> {
self.entries.iter().find(|e| e.id == id)
}
#[must_use]
pub fn find_duplicates(&self) -> Vec<&DedupEntry> {
self.entries.iter().filter(|e| e.is_duplicate()).collect()
}
#[must_use]
pub fn total_space_savings(&self) -> u64 {
self.entries.iter().map(|e| e.space_savings()).sum()
}
#[must_use]
pub fn unique_count(&self) -> usize {
self.entries.len()
}
#[must_use]
pub fn total_insertions(&self) -> u64 {
self.entries.iter().map(|e| e.occurrence_count as u64).sum()
}
pub fn remove_by_hash(&mut self, hash: &[u8]) -> bool {
if let Some(pos) = self.entries.iter().position(|e| e.content_hash == hash) {
self.entries.swap_remove(pos);
true
} else {
false
}
}
pub fn clear(&mut self) {
self.entries.clear();
self.next_id = 1;
}
#[must_use]
pub fn most_common(&self) -> Vec<&DedupEntry> {
let mut sorted: Vec<&DedupEntry> = self.entries.iter().collect();
sorted.sort_by(|a, b| b.occurrence_count.cmp(&a.occurrence_count));
sorted
}
#[must_use]
pub fn duplicate_rate(&self) -> f64 {
let total = self.total_insertions();
if total == 0 {
return 0.0;
}
let unique = self.unique_count() as u64;
let dupes = total.saturating_sub(unique);
dupes as f64 / total as f64
}
}
impl Default for DedupIndex {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct FileFeatures {
pub path: PathBuf,
pub content_hash: Vec<u8>,
pub size_bytes: u64,
pub mtime_epoch: u64,
}
pub fn compute_file_features(path: &Path) -> std::io::Result<FileFeatures> {
use std::io::Read;
use std::time::UNIX_EPOCH;
let meta = std::fs::metadata(path)?;
let size_bytes = meta.len();
let mtime_epoch = meta
.modified()
.unwrap_or(UNIX_EPOCH)
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
let mut hash: u64 = FNV_OFFSET;
let file = std::fs::File::open(path)?;
let mut reader = std::io::BufReader::with_capacity(65_536, file);
let mut buf = vec![0u8; 65_536];
loop {
let n = reader.read(&mut buf)?;
if n == 0 {
break;
}
for &b in &buf[..n] {
hash = (hash ^ u64::from(b)).wrapping_mul(FNV_PRIME);
}
}
Ok(FileFeatures {
path: path.to_path_buf(),
content_hash: hash.to_le_bytes().to_vec(),
size_bytes,
mtime_epoch,
})
}
impl DedupIndex {
pub fn add_files(&mut self, paths: &[impl AsRef<Path> + Sync]) -> (Vec<u64>, Vec<PathBuf>) {
let results: Vec<(PathBuf, std::io::Result<FileFeatures>)> = paths
.par_iter()
.map(|p| {
let path = p.as_ref().to_path_buf();
let feat = compute_file_features(&path);
(path, feat)
})
.collect();
let mut added_ids = Vec::with_capacity(results.len());
let mut skipped = Vec::new();
for (path, result) in results {
match result {
Ok(feat) => {
let id =
self.add_or_increment(feat.content_hash, feat.size_bytes, feat.mtime_epoch);
added_ids.push(id);
}
Err(_) => {
skipped.push(path);
}
}
}
(added_ids, skipped)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::env::temp_dir;
fn hash(s: &str) -> Vec<u8> {
s.as_bytes().to_vec()
}
fn make_temp_files(n: usize) -> Vec<std::path::PathBuf> {
use std::sync::atomic::{AtomicU64, Ordering};
static COUNTER: AtomicU64 = AtomicU64::new(0);
let base = temp_dir();
let pid = std::process::id();
(0..n)
.map(|i| {
let uid = COUNTER.fetch_add(1, Ordering::Relaxed);
let path = base.join(format!("dedup_idx_{pid}_{uid}_{i}.bin"));
std::fs::write(&path, format!("synthetic content for file {pid}_{uid}_{i}"))
.expect("write temp file");
path
})
.collect()
}
#[test]
fn test_entry_is_duplicate_false_when_once() {
let e = DedupEntry {
id: 1,
content_hash: hash("abc"),
size_bytes: 1024,
first_seen_epoch: 0,
occurrence_count: 1,
};
assert!(!e.is_duplicate());
}
#[test]
fn test_entry_is_duplicate_true_when_multiple() {
let e = DedupEntry {
id: 1,
content_hash: hash("abc"),
size_bytes: 1024,
first_seen_epoch: 0,
occurrence_count: 3,
};
assert!(e.is_duplicate());
}
#[test]
fn test_entry_space_savings_zero_when_once() {
let e = DedupEntry {
id: 1,
content_hash: hash("abc"),
size_bytes: 500,
first_seen_epoch: 0,
occurrence_count: 1,
};
assert_eq!(e.space_savings(), 0);
}
#[test]
fn test_entry_space_savings_correct() {
let e = DedupEntry {
id: 1,
content_hash: hash("abc"),
size_bytes: 1000,
first_seen_epoch: 0,
occurrence_count: 4,
};
assert_eq!(e.space_savings(), 3000);
}
#[test]
fn test_entry_hash_hex() {
let e = DedupEntry {
id: 1,
content_hash: vec![0xDE, 0xAD, 0xBE, 0xEF],
size_bytes: 0,
first_seen_epoch: 0,
occurrence_count: 1,
};
assert_eq!(e.hash_hex(), "deadbeef");
}
#[test]
fn test_index_new_empty() {
let idx = DedupIndex::new();
assert_eq!(idx.unique_count(), 0);
assert_eq!(idx.total_insertions(), 0);
assert_eq!(idx.total_space_savings(), 0);
}
#[test]
fn test_add_or_increment_new_entry() {
let mut idx = DedupIndex::new();
let id = idx.add_or_increment(hash("file_a"), 100, 1000);
assert_eq!(idx.unique_count(), 1);
assert_eq!(id, 1);
}
#[test]
fn test_add_or_increment_existing_entry() {
let mut idx = DedupIndex::new();
let id1 = idx.add_or_increment(hash("file_a"), 100, 1000);
let id2 = idx.add_or_increment(hash("file_a"), 100, 2000);
assert_eq!(id1, id2, "Same hash should return same ID");
assert_eq!(idx.unique_count(), 1, "Should still be one unique entry");
let entry = idx
.find_by_hash(&hash("file_a"))
.expect("operation should succeed");
assert_eq!(entry.occurrence_count, 2);
}
#[test]
fn test_find_by_hash_found() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("alpha"), 200, 0);
let entry = idx.find_by_hash(&hash("alpha"));
assert!(entry.is_some());
assert_eq!(entry.expect("operation should succeed").size_bytes, 200);
}
#[test]
fn test_find_by_hash_not_found() {
let idx = DedupIndex::new();
assert!(idx.find_by_hash(&hash("missing")).is_none());
}
#[test]
fn test_find_by_id() {
let mut idx = DedupIndex::new();
let id = idx.add_or_increment(hash("entry_1"), 512, 100);
let entry = idx.find_by_id(id);
assert!(entry.is_some());
assert_eq!(
entry.expect("operation should succeed").content_hash,
hash("entry_1")
);
}
#[test]
fn test_find_duplicates() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("unique"), 10, 0);
idx.add_or_increment(hash("dup"), 20, 0);
idx.add_or_increment(hash("dup"), 20, 1);
idx.add_or_increment(hash("dup"), 20, 2);
let dups = idx.find_duplicates();
assert_eq!(dups.len(), 1);
assert_eq!(dups[0].content_hash, hash("dup"));
assert_eq!(dups[0].occurrence_count, 3);
}
#[test]
fn test_total_space_savings() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("a"), 100, 0);
idx.add_or_increment(hash("b"), 200, 0);
idx.add_or_increment(hash("b"), 200, 1);
idx.add_or_increment(hash("b"), 200, 2);
assert_eq!(idx.total_space_savings(), 400);
}
#[test]
fn test_remove_by_hash() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("to_remove"), 50, 0);
assert_eq!(idx.unique_count(), 1);
let removed = idx.remove_by_hash(&hash("to_remove"));
assert!(removed);
assert_eq!(idx.unique_count(), 0);
}
#[test]
fn test_remove_nonexistent_returns_false() {
let mut idx = DedupIndex::new();
assert!(!idx.remove_by_hash(&hash("ghost")));
}
#[test]
fn test_clear() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("x"), 10, 0);
idx.add_or_increment(hash("y"), 20, 0);
idx.clear();
assert_eq!(idx.unique_count(), 0);
assert_eq!(idx.next_id, 1);
}
#[test]
fn test_duplicate_rate() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("h"), 100, 0);
idx.add_or_increment(hash("h"), 100, 1);
idx.add_or_increment(hash("h"), 100, 2);
let rate = idx.duplicate_rate();
let expected = 2.0 / 3.0;
assert!((rate - expected).abs() < 1e-9);
}
#[test]
fn test_most_common_ordering() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("rare"), 10, 0);
idx.add_or_increment(hash("common"), 100, 0);
idx.add_or_increment(hash("common"), 100, 1);
idx.add_or_increment(hash("common"), 100, 2);
idx.add_or_increment(hash("mid"), 50, 0);
idx.add_or_increment(hash("mid"), 50, 1);
let mc = idx.most_common();
assert_eq!(mc[0].content_hash, hash("common"));
assert_eq!(mc[1].content_hash, hash("mid"));
assert_eq!(mc[2].content_hash, hash("rare"));
}
#[test]
fn test_total_insertions() {
let mut idx = DedupIndex::new();
idx.add_or_increment(hash("a"), 1, 0);
idx.add_or_increment(hash("a"), 1, 1);
idx.add_or_increment(hash("b"), 1, 0);
assert_eq!(idx.total_insertions(), 3);
}
#[test]
fn test_add_files_parallel_same_result_as_sequential() {
let paths = make_temp_files(10);
let mut par_idx = DedupIndex::new();
let (par_ids, par_skipped) = par_idx.add_files(&paths);
assert!(par_skipped.is_empty(), "no files should be skipped");
assert_eq!(par_ids.len(), 10, "all 10 files should produce an ID");
let mut seq_idx = DedupIndex::new();
let mut seq_ids = Vec::with_capacity(paths.len());
for p in &paths {
let feat = compute_file_features(p.as_ref()).expect("compute features");
let id = seq_idx.add_or_increment(feat.content_hash, feat.size_bytes, feat.mtime_epoch);
seq_ids.push(id);
}
assert_eq!(
par_idx.unique_count(),
seq_idx.unique_count(),
"unique count must match between parallel and sequential"
);
assert_eq!(par_idx.find_duplicates().len(), 0);
assert_eq!(seq_idx.find_duplicates().len(), 0);
for p in &paths {
let _ = std::fs::remove_file(p);
}
}
#[test]
fn test_add_files_skips_missing_files() {
let real = make_temp_files(3);
let mut all_paths = real.clone();
all_paths.push(temp_dir().join("nonexistent_dedup_test_file.bin"));
let mut idx = DedupIndex::new();
let (_ids, skipped) = idx.add_files(&all_paths);
assert_eq!(
skipped.len(),
1,
"exactly one missing file should be skipped"
);
assert_eq!(idx.unique_count(), 3, "three real files should be indexed");
for p in &real {
let _ = std::fs::remove_file(p);
}
}
}