#![allow(dead_code)]
use std::collections::HashMap;
use std::fmt;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ContentFingerprint {
digest: String,
size_bytes: u64,
}
impl ContentFingerprint {
pub fn new(digest: impl Into<String>, size_bytes: u64) -> Self {
Self {
digest: digest.into(),
size_bytes,
}
}
pub fn digest(&self) -> &str {
&self.digest
}
pub fn size_bytes(&self) -> u64 {
self.size_bytes
}
}
impl fmt::Display for ContentFingerprint {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}:{}", self.digest, self.size_bytes)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DedupLevel {
File,
FixedChunk,
VariableChunk,
}
impl fmt::Display for DedupLevel {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
Self::File => "file",
Self::FixedChunk => "fixed-chunk",
Self::VariableChunk => "variable-chunk",
};
write!(f, "{s}")
}
}
#[derive(Debug, Clone)]
pub struct DedupConfig {
pub level: DedupLevel,
pub chunk_size: usize,
pub min_chunk: usize,
pub max_chunk: usize,
pub track_refcount: bool,
}
impl Default for DedupConfig {
fn default() -> Self {
Self {
level: DedupLevel::File,
chunk_size: 4 * 1024 * 1024, min_chunk: 512 * 1024, max_chunk: 16 * 1024 * 1024, track_refcount: true,
}
}
}
#[derive(Debug, Clone)]
pub struct DedupEntry {
pub fingerprint: ContentFingerprint,
pub canonical_path: String,
pub refcount: u32,
}
impl DedupEntry {
pub fn new(fingerprint: ContentFingerprint, canonical_path: impl Into<String>) -> Self {
Self {
fingerprint,
canonical_path: canonical_path.into(),
refcount: 1,
}
}
pub fn add_ref(&mut self) {
self.refcount = self.refcount.saturating_add(1);
}
pub fn release_ref(&mut self) -> u32 {
self.refcount = self.refcount.saturating_sub(1);
self.refcount
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DedupResult {
Stored,
Duplicate,
Skipped,
}
#[allow(clippy::cast_precision_loss)]
pub fn rolling_hash(data: &[u8], window: usize) -> Vec<u64> {
if data.len() < window || window == 0 {
return vec![];
}
let mut hashes = Vec::with_capacity(data.len() - window + 1);
let mut h: u64 = 0;
for &b in &data[..window] {
h = h.wrapping_mul(31).wrapping_add(u64::from(b));
}
hashes.push(h);
let base_pow = 31_u64.wrapping_pow(window as u32);
for i in window..data.len() {
h = h
.wrapping_mul(31)
.wrapping_add(u64::from(data[i]))
.wrapping_sub(base_pow.wrapping_mul(u64::from(data[i - window])));
hashes.push(h);
}
hashes
}
pub fn compute_fingerprint(data: &[u8]) -> ContentFingerprint {
let mut h: u64 = 0xcbf2_9ce4_8422_2325;
for &b in data {
h ^= u64::from(b);
h = h.wrapping_mul(0x0100_0000_01b3);
}
ContentFingerprint::new(format!("{h:016x}"), data.len() as u64)
}
#[derive(Debug)]
pub struct DedupIndex {
config: DedupConfig,
entries: HashMap<String, DedupEntry>,
bytes_saved: u64,
}
impl DedupIndex {
pub fn new(config: DedupConfig) -> Self {
Self {
config,
entries: HashMap::new(),
bytes_saved: 0,
}
}
pub fn with_defaults() -> Self {
Self::new(DedupConfig::default())
}
pub fn config(&self) -> &DedupConfig {
&self.config
}
pub fn unique_count(&self) -> usize {
self.entries.len()
}
pub fn bytes_saved(&self) -> u64 {
self.bytes_saved
}
pub fn ingest(&mut self, data: &[u8], path: &str) -> DedupResult {
let fp = compute_fingerprint(data);
let digest = fp.digest().to_string();
if let Some(entry) = self.entries.get_mut(&digest) {
entry.add_ref();
self.bytes_saved += fp.size_bytes();
DedupResult::Duplicate
} else {
self.entries.insert(digest, DedupEntry::new(fp, path));
DedupResult::Stored
}
}
pub fn contains(&self, data: &[u8]) -> bool {
let fp = compute_fingerprint(data);
self.entries.contains_key(fp.digest())
}
pub fn release(&mut self, data: &[u8]) -> bool {
let fp = compute_fingerprint(data);
let digest = fp.digest().to_string();
if let Some(entry) = self.entries.get_mut(&digest) {
if entry.release_ref() == 0 {
self.entries.remove(&digest);
return true;
}
}
false
}
pub fn stats(&self) -> DedupStats {
let total_refs: u64 = self.entries.values().map(|e| u64::from(e.refcount)).sum();
DedupStats {
unique_entries: self.entries.len(),
total_references: total_refs,
bytes_saved: self.bytes_saved,
}
}
}
#[derive(Debug, Clone)]
pub struct DedupStats {
pub unique_entries: usize,
pub total_references: u64,
pub bytes_saved: u64,
}
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
pub fingerprint_digest: String,
pub size_bytes: u64,
pub canonical_path: String,
pub all_paths: Vec<String>,
pub duplicate_count: u32,
pub wasted_bytes: u64,
}
#[derive(Debug, Clone)]
pub struct DedupReport {
pub unique_count: usize,
pub total_references: u64,
pub duplicate_count: u64,
pub total_bytes_saved: u64,
pub total_logical_size: u64,
pub total_physical_size: u64,
pub dedup_ratio: f64,
pub savings_percentage: f64,
pub duplicate_groups: Vec<DuplicateGroup>,
pub top_wasters: Vec<DuplicateGroup>,
pub refcount_distribution: Vec<(u32, usize)>,
}
impl DedupReport {
#[must_use]
pub fn to_summary_string(&self) -> String {
let mut out = String::new();
out.push_str("=== Deduplication Report ===\n");
out.push_str(&format!("Unique items: {}\n", self.unique_count));
out.push_str(&format!("Total references: {}\n", self.total_references));
out.push_str(&format!("Duplicate refs: {}\n", self.duplicate_count));
out.push_str(&format!(
"Logical size: {} bytes\n",
self.total_logical_size
));
out.push_str(&format!(
"Physical size: {} bytes\n",
self.total_physical_size
));
out.push_str(&format!(
"Space saved: {} bytes ({:.1}%)\n",
self.total_bytes_saved, self.savings_percentage
));
out.push_str(&format!("Dedup ratio: {:.2}x\n", self.dedup_ratio));
if !self.top_wasters.is_empty() {
out.push_str("\nTop duplicate groups by wasted space:\n");
for (i, group) in self.top_wasters.iter().enumerate() {
out.push_str(&format!(
" {}. {} — {} bytes x {} copies = {} bytes wasted\n",
i + 1,
group.canonical_path,
group.size_bytes,
group.duplicate_count,
group.wasted_bytes,
));
}
}
if !self.refcount_distribution.is_empty() {
out.push_str("\nReference count distribution:\n");
for (refcount, count) in &self.refcount_distribution {
out.push_str(&format!(" {refcount} refs: {count} entries\n"));
}
}
out
}
}
impl DedupIndex {
#[must_use]
pub fn generate_report(&self, top_n: usize) -> DedupReport {
let mut duplicate_groups = Vec::new();
let mut total_logical_size: u64 = 0;
let mut total_physical_size: u64 = 0;
let mut refcount_map: HashMap<u32, usize> = HashMap::new();
for entry in self.entries.values() {
let size = entry.fingerprint.size_bytes();
let refs = entry.refcount;
total_physical_size += size;
total_logical_size += size * u64::from(refs);
*refcount_map.entry(refs).or_insert(0) += 1;
if refs > 1 {
let duplicate_count = refs - 1;
let wasted = size * u64::from(duplicate_count);
duplicate_groups.push(DuplicateGroup {
fingerprint_digest: entry.fingerprint.digest().to_string(),
size_bytes: size,
canonical_path: entry.canonical_path.clone(),
all_paths: vec![entry.canonical_path.clone()],
duplicate_count,
wasted_bytes: wasted,
});
}
}
duplicate_groups.sort_by(|a, b| b.wasted_bytes.cmp(&a.wasted_bytes));
let top_wasters: Vec<DuplicateGroup> =
duplicate_groups.iter().take(top_n).cloned().collect();
let total_refs: u64 = self.entries.values().map(|e| u64::from(e.refcount)).sum();
let duplicate_count = total_refs.saturating_sub(self.entries.len() as u64);
let dedup_ratio = if total_physical_size == 0 {
1.0
} else {
total_logical_size as f64 / total_physical_size as f64
};
let savings_percentage = if total_logical_size == 0 {
0.0
} else {
(self.bytes_saved as f64 / total_logical_size as f64) * 100.0
};
let mut refcount_distribution: Vec<(u32, usize)> = refcount_map.into_iter().collect();
refcount_distribution.sort_by_key(|(rc, _)| *rc);
DedupReport {
unique_count: self.entries.len(),
total_references: total_refs,
duplicate_count,
total_bytes_saved: self.bytes_saved,
total_logical_size,
total_physical_size,
dedup_ratio,
savings_percentage,
duplicate_groups,
top_wasters,
refcount_distribution,
}
}
#[must_use]
pub fn list_duplicates(&self) -> Vec<(&str, u32, u64)> {
self.entries
.values()
.filter(|e| e.refcount > 1)
.map(|e| {
(
e.canonical_path.as_str(),
e.refcount,
e.fingerprint.size_bytes(),
)
})
.collect()
}
#[must_use]
pub fn estimated_physical_storage(&self) -> u64 {
self.entries
.values()
.map(|e| e.fingerprint.size_bytes())
.sum()
}
#[must_use]
pub fn estimated_logical_storage(&self) -> u64 {
self.entries
.values()
.map(|e| e.fingerprint.size_bytes() * u64::from(e.refcount))
.sum()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fingerprint_creation() {
let fp = ContentFingerprint::new("abc123", 1024);
assert_eq!(fp.digest(), "abc123");
assert_eq!(fp.size_bytes(), 1024);
}
#[test]
fn test_fingerprint_display() {
let fp = ContentFingerprint::new("abc123", 1024);
assert_eq!(fp.to_string(), "abc123:1024");
}
#[test]
fn test_dedup_level_display() {
assert_eq!(DedupLevel::File.to_string(), "file");
assert_eq!(DedupLevel::FixedChunk.to_string(), "fixed-chunk");
assert_eq!(DedupLevel::VariableChunk.to_string(), "variable-chunk");
}
#[test]
fn test_default_config() {
let cfg = DedupConfig::default();
assert_eq!(cfg.level, DedupLevel::File);
assert_eq!(cfg.chunk_size, 4 * 1024 * 1024);
assert!(cfg.track_refcount);
}
#[test]
fn test_dedup_entry_refcount() {
let fp = ContentFingerprint::new("abc", 100);
let mut entry = DedupEntry::new(fp, "/store/abc");
assert_eq!(entry.refcount, 1);
entry.add_ref();
assert_eq!(entry.refcount, 2);
entry.release_ref();
assert_eq!(entry.refcount, 1);
entry.release_ref();
assert_eq!(entry.refcount, 0);
entry.release_ref();
assert_eq!(entry.refcount, 0);
}
#[test]
fn test_compute_fingerprint_deterministic() {
let data = b"hello world archive";
let fp1 = compute_fingerprint(data);
let fp2 = compute_fingerprint(data);
assert_eq!(fp1, fp2);
}
#[test]
fn test_compute_fingerprint_different_data() {
let fp1 = compute_fingerprint(b"data_a");
let fp2 = compute_fingerprint(b"data_b");
assert_ne!(fp1, fp2);
}
#[test]
fn test_rolling_hash_basic() {
let data = b"abcdefghij";
let hashes = rolling_hash(data, 4);
assert_eq!(hashes.len(), 7); }
#[test]
fn test_rolling_hash_empty() {
let hashes = rolling_hash(b"ab", 5);
assert!(hashes.is_empty());
}
#[test]
fn test_index_ingest_new() {
let mut idx = DedupIndex::with_defaults();
let result = idx.ingest(b"content_one", "/archive/a.mxf");
assert_eq!(result, DedupResult::Stored);
assert_eq!(idx.unique_count(), 1);
}
#[test]
fn test_index_ingest_duplicate() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
let result = idx.ingest(b"content_one", "/archive/b.mxf");
assert_eq!(result, DedupResult::Duplicate);
assert_eq!(idx.unique_count(), 1);
assert!(idx.bytes_saved() > 0);
}
#[test]
fn test_index_contains() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
assert!(idx.contains(b"content_one"));
assert!(!idx.contains(b"content_two"));
}
#[test]
fn test_index_release() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
let removed = idx.release(b"content_one");
assert!(removed);
assert_eq!(idx.unique_count(), 0);
}
#[test]
fn test_index_stats() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
idx.ingest(b"content_one", "/archive/b.mxf");
idx.ingest(b"content_two", "/archive/c.mxf");
let stats = idx.stats();
assert_eq!(stats.unique_entries, 2);
assert_eq!(stats.total_references, 3); }
#[test]
fn test_report_empty_index() {
let idx = DedupIndex::with_defaults();
let report = idx.generate_report(5);
assert_eq!(report.unique_count, 0);
assert_eq!(report.total_references, 0);
assert_eq!(report.duplicate_count, 0);
assert!((report.dedup_ratio - 1.0).abs() < f64::EPSILON);
assert!(report.duplicate_groups.is_empty());
}
#[test]
fn test_report_no_duplicates() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"unique_a", "/a.mxf");
idx.ingest(b"unique_b", "/b.mxf");
idx.ingest(b"unique_c", "/c.mxf");
let report = idx.generate_report(5);
assert_eq!(report.unique_count, 3);
assert_eq!(report.duplicate_count, 0);
assert!(report.duplicate_groups.is_empty());
assert_eq!(report.total_bytes_saved, 0);
}
#[test]
fn test_report_with_duplicates() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_dup", "/archive/a.mxf");
idx.ingest(b"content_dup", "/archive/b.mxf");
idx.ingest(b"content_dup", "/archive/c.mxf");
idx.ingest(b"content_unique", "/archive/d.mxf");
let report = idx.generate_report(5);
assert_eq!(report.unique_count, 2);
assert_eq!(report.total_references, 4);
assert_eq!(report.duplicate_count, 2);
assert!(report.total_bytes_saved > 0);
assert_eq!(report.duplicate_groups.len(), 1);
assert_eq!(report.duplicate_groups[0].duplicate_count, 2);
}
#[test]
fn test_report_top_wasters_limit() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"small_dup", "/s1.mxf");
idx.ingest(b"small_dup", "/s2.mxf");
idx.ingest(b"medium_dup_data!", "/m1.mxf");
idx.ingest(b"medium_dup_data!", "/m2.mxf");
idx.ingest(b"large_duplicate_content_here", "/l1.mxf");
idx.ingest(b"large_duplicate_content_here", "/l2.mxf");
idx.ingest(b"large_duplicate_content_here", "/l3.mxf");
let report = idx.generate_report(2);
assert_eq!(report.top_wasters.len(), 2);
assert!(report.top_wasters[0].wasted_bytes >= report.top_wasters[1].wasted_bytes);
}
#[test]
fn test_report_dedup_ratio() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"aaaaaaaaaa", "/a.bin"); idx.ingest(b"aaaaaaaaaa", "/b.bin");
let report = idx.generate_report(5);
assert!((report.dedup_ratio - 2.0).abs() < f64::EPSILON);
}
#[test]
fn test_report_savings_percentage() {
let mut idx = DedupIndex::with_defaults();
let content = b"test_content";
idx.ingest(content, "/a.bin");
idx.ingest(content, "/b.bin");
let report = idx.generate_report(5);
assert!(report.savings_percentage > 0.0);
assert!(report.savings_percentage <= 100.0);
}
#[test]
fn test_report_refcount_distribution() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"once", "/a.bin");
idx.ingest(b"twice", "/b.bin");
idx.ingest(b"twice", "/c.bin");
idx.ingest(b"thrice", "/d.bin");
idx.ingest(b"thrice", "/e.bin");
idx.ingest(b"thrice", "/f.bin");
let report = idx.generate_report(5);
assert!(!report.refcount_distribution.is_empty());
let rc_map: HashMap<u32, usize> = report.refcount_distribution.into_iter().collect();
assert_eq!(rc_map.get(&1), Some(&1)); assert_eq!(rc_map.get(&2), Some(&1)); assert_eq!(rc_map.get(&3), Some(&1)); }
#[test]
fn test_report_summary_string() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content", "/a.mxf");
idx.ingest(b"content", "/b.mxf");
let report = idx.generate_report(5);
let summary = report.to_summary_string();
assert!(summary.contains("Deduplication Report"));
assert!(summary.contains("Unique items"));
assert!(summary.contains("Space saved"));
assert!(summary.contains("Dedup ratio"));
}
#[test]
fn test_list_duplicates() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"unique", "/a.bin");
idx.ingest(b"dup_content", "/b.bin");
idx.ingest(b"dup_content", "/c.bin");
let dups = idx.list_duplicates();
assert_eq!(dups.len(), 1);
assert_eq!(dups[0].1, 2); }
#[test]
fn test_estimated_storage() {
let mut idx = DedupIndex::with_defaults();
let content_a = b"aaaa"; let content_b = b"bbbb"; idx.ingest(content_a, "/a.bin");
idx.ingest(content_a, "/a2.bin");
idx.ingest(content_b, "/b.bin");
assert_eq!(idx.estimated_physical_storage(), 8);
assert_eq!(idx.estimated_logical_storage(), 12);
}
#[test]
fn test_report_sorted_by_wasted() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"aa", "/s.bin");
idx.ingest(b"aa", "/s2.bin");
idx.ingest(b"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", "/l.bin");
idx.ingest(b"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", "/l2.bin");
let report = idx.generate_report(10);
if report.duplicate_groups.len() >= 2 {
assert!(
report.duplicate_groups[0].wasted_bytes >= report.duplicate_groups[1].wasted_bytes
);
}
}
}