#![allow(dead_code)]
use std::collections::HashMap;
use std::fmt;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ContentFingerprint {
digest: String,
size_bytes: u64,
}
impl ContentFingerprint {
pub fn new(digest: impl Into<String>, size_bytes: u64) -> Self {
Self {
digest: digest.into(),
size_bytes,
}
}
pub fn digest(&self) -> &str {
&self.digest
}
pub fn size_bytes(&self) -> u64 {
self.size_bytes
}
}
impl fmt::Display for ContentFingerprint {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}:{}", self.digest, self.size_bytes)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DedupLevel {
File,
FixedChunk,
VariableChunk,
}
impl fmt::Display for DedupLevel {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
Self::File => "file",
Self::FixedChunk => "fixed-chunk",
Self::VariableChunk => "variable-chunk",
};
write!(f, "{s}")
}
}
#[derive(Debug, Clone)]
pub struct DedupConfig {
pub level: DedupLevel,
pub chunk_size: usize,
pub min_chunk: usize,
pub max_chunk: usize,
pub track_refcount: bool,
}
impl Default for DedupConfig {
fn default() -> Self {
Self {
level: DedupLevel::File,
chunk_size: 4 * 1024 * 1024, min_chunk: 512 * 1024, max_chunk: 16 * 1024 * 1024, track_refcount: true,
}
}
}
#[derive(Debug, Clone)]
pub struct DedupEntry {
pub fingerprint: ContentFingerprint,
pub canonical_path: String,
pub refcount: u32,
}
impl DedupEntry {
pub fn new(fingerprint: ContentFingerprint, canonical_path: impl Into<String>) -> Self {
Self {
fingerprint,
canonical_path: canonical_path.into(),
refcount: 1,
}
}
pub fn add_ref(&mut self) {
self.refcount = self.refcount.saturating_add(1);
}
pub fn release_ref(&mut self) -> u32 {
self.refcount = self.refcount.saturating_sub(1);
self.refcount
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DedupResult {
Stored,
Duplicate,
Skipped,
}
#[allow(clippy::cast_precision_loss)]
pub fn rolling_hash(data: &[u8], window: usize) -> Vec<u64> {
if data.len() < window || window == 0 {
return vec![];
}
let mut hashes = Vec::with_capacity(data.len() - window + 1);
let mut h: u64 = 0;
for &b in &data[..window] {
h = h.wrapping_mul(31).wrapping_add(u64::from(b));
}
hashes.push(h);
let base_pow = 31_u64.wrapping_pow(window as u32);
for i in window..data.len() {
h = h
.wrapping_mul(31)
.wrapping_add(u64::from(data[i]))
.wrapping_sub(base_pow.wrapping_mul(u64::from(data[i - window])));
hashes.push(h);
}
hashes
}
pub fn compute_fingerprint(data: &[u8]) -> ContentFingerprint {
let mut h: u64 = 0xcbf2_9ce4_8422_2325;
for &b in data {
h ^= u64::from(b);
h = h.wrapping_mul(0x0100_0000_01b3);
}
ContentFingerprint::new(format!("{h:016x}"), data.len() as u64)
}
#[derive(Debug)]
pub struct DedupIndex {
config: DedupConfig,
entries: HashMap<String, DedupEntry>,
bytes_saved: u64,
}
impl DedupIndex {
pub fn new(config: DedupConfig) -> Self {
Self {
config,
entries: HashMap::new(),
bytes_saved: 0,
}
}
pub fn with_defaults() -> Self {
Self::new(DedupConfig::default())
}
pub fn config(&self) -> &DedupConfig {
&self.config
}
pub fn unique_count(&self) -> usize {
self.entries.len()
}
pub fn bytes_saved(&self) -> u64 {
self.bytes_saved
}
pub fn ingest(&mut self, data: &[u8], path: &str) -> DedupResult {
let fp = compute_fingerprint(data);
let digest = fp.digest().to_string();
if let Some(entry) = self.entries.get_mut(&digest) {
entry.add_ref();
self.bytes_saved += fp.size_bytes();
DedupResult::Duplicate
} else {
self.entries.insert(digest, DedupEntry::new(fp, path));
DedupResult::Stored
}
}
pub fn contains(&self, data: &[u8]) -> bool {
let fp = compute_fingerprint(data);
self.entries.contains_key(fp.digest())
}
pub fn release(&mut self, data: &[u8]) -> bool {
let fp = compute_fingerprint(data);
let digest = fp.digest().to_string();
if let Some(entry) = self.entries.get_mut(&digest) {
if entry.release_ref() == 0 {
self.entries.remove(&digest);
return true;
}
}
false
}
pub fn stats(&self) -> DedupStats {
let total_refs: u64 = self.entries.values().map(|e| u64::from(e.refcount)).sum();
DedupStats {
unique_entries: self.entries.len(),
total_references: total_refs,
bytes_saved: self.bytes_saved,
}
}
}
#[derive(Debug, Clone)]
pub struct DedupStats {
pub unique_entries: usize,
pub total_references: u64,
pub bytes_saved: u64,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fingerprint_creation() {
let fp = ContentFingerprint::new("abc123", 1024);
assert_eq!(fp.digest(), "abc123");
assert_eq!(fp.size_bytes(), 1024);
}
#[test]
fn test_fingerprint_display() {
let fp = ContentFingerprint::new("abc123", 1024);
assert_eq!(fp.to_string(), "abc123:1024");
}
#[test]
fn test_dedup_level_display() {
assert_eq!(DedupLevel::File.to_string(), "file");
assert_eq!(DedupLevel::FixedChunk.to_string(), "fixed-chunk");
assert_eq!(DedupLevel::VariableChunk.to_string(), "variable-chunk");
}
#[test]
fn test_default_config() {
let cfg = DedupConfig::default();
assert_eq!(cfg.level, DedupLevel::File);
assert_eq!(cfg.chunk_size, 4 * 1024 * 1024);
assert!(cfg.track_refcount);
}
#[test]
fn test_dedup_entry_refcount() {
let fp = ContentFingerprint::new("abc", 100);
let mut entry = DedupEntry::new(fp, "/store/abc");
assert_eq!(entry.refcount, 1);
entry.add_ref();
assert_eq!(entry.refcount, 2);
entry.release_ref();
assert_eq!(entry.refcount, 1);
entry.release_ref();
assert_eq!(entry.refcount, 0);
entry.release_ref();
assert_eq!(entry.refcount, 0);
}
#[test]
fn test_compute_fingerprint_deterministic() {
let data = b"hello world archive";
let fp1 = compute_fingerprint(data);
let fp2 = compute_fingerprint(data);
assert_eq!(fp1, fp2);
}
#[test]
fn test_compute_fingerprint_different_data() {
let fp1 = compute_fingerprint(b"data_a");
let fp2 = compute_fingerprint(b"data_b");
assert_ne!(fp1, fp2);
}
#[test]
fn test_rolling_hash_basic() {
let data = b"abcdefghij";
let hashes = rolling_hash(data, 4);
assert_eq!(hashes.len(), 7); }
#[test]
fn test_rolling_hash_empty() {
let hashes = rolling_hash(b"ab", 5);
assert!(hashes.is_empty());
}
#[test]
fn test_index_ingest_new() {
let mut idx = DedupIndex::with_defaults();
let result = idx.ingest(b"content_one", "/archive/a.mxf");
assert_eq!(result, DedupResult::Stored);
assert_eq!(idx.unique_count(), 1);
}
#[test]
fn test_index_ingest_duplicate() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
let result = idx.ingest(b"content_one", "/archive/b.mxf");
assert_eq!(result, DedupResult::Duplicate);
assert_eq!(idx.unique_count(), 1);
assert!(idx.bytes_saved() > 0);
}
#[test]
fn test_index_contains() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
assert!(idx.contains(b"content_one"));
assert!(!idx.contains(b"content_two"));
}
#[test]
fn test_index_release() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
let removed = idx.release(b"content_one");
assert!(removed);
assert_eq!(idx.unique_count(), 0);
}
#[test]
fn test_index_stats() {
let mut idx = DedupIndex::with_defaults();
idx.ingest(b"content_one", "/archive/a.mxf");
idx.ingest(b"content_one", "/archive/b.mxf");
idx.ingest(b"content_two", "/archive/c.mxf");
let stats = idx.stats();
assert_eq!(stats.unique_entries, 2);
assert_eq!(stats.total_references, 3); }
}