use std::collections::HashSet;
use std::hash::Hasher;
use std::io::Read;
/// Builds the 256-entry lookup table used by the gear rolling hash.
///
/// The table is deterministic: a 64-bit state starts at a fixed constant, is
/// advanced by that same constant (wrapping) once per slot, and each advanced
/// state is scrambled with the SplitMix64 mixing steps to give the slot value.
/// Entry `b` is the value later added to the rolling hash for input byte `b`.
fn gear_table() -> [u64; 256] {
    /// Both the initial state and the per-slot increment.
    const STEP: u64 = 0x9E37_79B9_7F4A_7C15;

    /// Xor-shift / multiply scrambler applied to every state value.
    fn mix(state: u64) -> u64 {
        let a = (state ^ (state >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        let b = (a ^ (a >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        b ^ (b >> 31)
    }

    let mut table = [0u64; 256];
    let mut state = STEP;
    for entry in &mut table {
        // The state is advanced *before* it is mixed, so slot 0 comes from 2 * STEP.
        state = state.wrapping_add(STEP);
        *entry = mix(state);
    }
    table
}
/// Smallest chunk length at which a hash-defined boundary is accepted.
const MIN_CHUNK: usize = 2 * 1024;
/// Number of low bits of the rolling hash that must all be zero at a boundary.
const AVG_MASK_BITS: u32 = 14;
/// Hard upper bound on chunk length; a chunk is cut here if no boundary was found.
const MAX_CHUNK: usize = 64 * 1024;

/// Splits `data` into content-defined chunks and passes each one to `f` in order.
///
/// For every chunk a rolling hash starts at zero and is updated per byte as
/// `(h << 1) + gear[byte]` (wrapping). The chunk ends right after the first
/// byte at which it is at least `MIN_CHUNK` long and the low `AVG_MASK_BITS`
/// bits of the hash are zero; otherwise it ends at `MAX_CHUNK` bytes or at the
/// end of `data`, whichever comes first. The chunks are contiguous and cover
/// `data` exactly; empty input produces no calls to `f`.
fn cdc_chunks(data: &[u8], gear: &[u64; 256], mut f: impl FnMut(&[u8])) {
    let boundary_mask = (1u64 << AVG_MASK_BITS) - 1;
    let mut remaining = data;

    while !remaining.is_empty() {
        // A chunk can never extend past this window.
        let window = &remaining[..remaining.len().min(MAX_CHUNK)];

        // The hash is reset for every chunk. Because of the left shift, only
        // the most recent 64 bytes still influence its value.
        let mut rolling: u64 = 0;
        let cut = window
            .iter()
            .enumerate()
            .find_map(|(idx, &byte)| {
                rolling = (rolling << 1).wrapping_add(gear[usize::from(byte)]);
                let len = idx + 1;
                if len >= MIN_CHUNK && rolling & boundary_mask == 0 {
                    Some(len)
                } else {
                    None
                }
            })
            .unwrap_or(window.len());

        let (chunk, tail) = remaining.split_at(cut);
        f(chunk);
        remaining = tail;
    }
}
/// Returns a 64-bit fingerprint of `chunk` computed with the standard
/// library's `DefaultHasher`.
///
/// Equal byte sequences always produce the same value within one build of the
/// program. NOTE(review): the std docs do not guarantee `DefaultHasher` output
/// across Rust releases, so these values should not be persisted.
fn hash_chunk(chunk: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;

    let mut state = DefaultHasher::new();
    // Feed the raw bytes directly via `Hasher::write`; hashing the slice through
    // `Hash::hash` would also mix in its length and yield different values.
    Hasher::write(&mut state, chunk);
    Hasher::finish(&state)
}
/// Probes a directory of layer blobs for sub-layer redundancy.
///
/// The directory is the first command-line argument, defaulting to
/// `$HOME/.local/supermachine-layer-cache/registry/blobs/sha256`. Every regular
/// file in it that starts with the gzip magic bytes is decompressed fully into
/// memory, split with `cdc_chunks`, and each chunk's `hash_chunk` value is
/// recorded in a set. A summary comparing unique chunk bytes with the
/// uncompressed and on-disk totals is written to stderr.
///
/// The process exits with status 1 if the directory cannot be listed.
/// Directory entries, files, or gzip streams that cannot be read are reported
/// on stderr and skipped. Non-gzip files and layers that decompress to nothing
/// are skipped silently.
fn main() {
    // Build the default path lazily: it is only needed when no argument is given.
    // If HOME is unset the default resolves to a path under `/`.
    let dir = std::env::args().nth(1).unwrap_or_else(|| {
        format!(
            "{}/.local/supermachine-layer-cache/registry/blobs/sha256",
            std::env::var("HOME").unwrap_or_default()
        )
    });
    eprintln!("=== chunk-dedup probe over {dir} ===");

    let gear = gear_table();
    let mut total_compressed_on_disk: u64 = 0;
    let mut total_uncompressed: u64 = 0;
    let mut total_chunks: u64 = 0;
    let mut unique_bytes: u64 = 0;
    let mut layer_files = 0u64;
    // Fingerprints of every chunk seen so far, across all layers.
    let mut seen: HashSet<u64> = HashSet::new();

    let entries = match std::fs::read_dir(&dir) {
        Ok(e) => e,
        Err(e) => {
            eprintln!("cannot read {dir}: {e}");
            std::process::exit(1);
        }
    };

    for ent in entries {
        let path = match ent {
            Ok(ent) => ent.path(),
            Err(e) => {
                eprintln!("skipping unreadable directory entry in {dir}: {e}");
                continue;
            }
        };
        if !path.is_file() {
            continue;
        }
        let raw = match std::fs::read(&path) {
            Ok(bytes) => bytes,
            Err(e) => {
                eprintln!("skipping {}: {e}", path.display());
                continue;
            }
        };
        // Only blobs carrying the gzip magic number are treated as layers.
        if !raw.starts_with(&[0x1f, 0x8b]) {
            continue;
        }
        let mut data = Vec::new();
        if let Err(e) = flate2::read::GzDecoder::new(&raw[..]).read_to_end(&mut data) {
            eprintln!("skipping {}: gzip decode failed: {e}", path.display());
            continue;
        }
        if data.is_empty() {
            continue;
        }

        layer_files += 1;
        // Count on-disk bytes only for blobs that were actually chunked, so the
        // compressed total and the unique-chunk total describe the same layers.
        total_compressed_on_disk += raw.len() as u64;
        total_uncompressed += data.len() as u64;
        cdc_chunks(&data, &gear, |chunk| {
            total_chunks += 1;
            // `insert` returns true only the first time a fingerprint is seen.
            if seen.insert(hash_chunk(chunk)) {
                unique_bytes += chunk.len() as u64;
            }
        });
    }

    let mib = |b: u64| b as f64 / (1024.0 * 1024.0);
    // Percentage of `a` relative to `b`, defined as 0 when `b` is 0.
    let pct = |a: u64, b: u64| {
        if b > 0 {
            100.0 * a as f64 / b as f64
        } else {
            0.0
        }
    };
    let avg_chunk_kib = if total_chunks > 0 {
        total_uncompressed as f64 / total_chunks as f64 / 1024.0
    } else {
        0.0
    };

    eprintln!("layers (gzip blobs) processed : {layer_files}");
    eprintln!(
        "current on-disk (compressed) : {:.1} MiB (exact-layer-deduped already)",
        mib(total_compressed_on_disk)
    );
    eprintln!(
        "uncompressed layer bytes : {:.1} MiB ({} chunks, avg {:.1} KiB)",
        mib(total_uncompressed),
        total_chunks,
        avg_chunk_kib
    );
    eprintln!(
        "unique chunk bytes : {:.1} MiB ({:.1}% of uncompressed)",
        mib(unique_bytes),
        pct(unique_bytes, total_uncompressed)
    );
    eprintln!(
        "==> sub-layer redundancy found by chunking: {:.1}% (100% - unique/uncompressed)",
        100.0 - pct(unique_bytes, total_uncompressed)
    );
    eprintln!(
        " note: chunk store holds UNCOMPRESSED unique bytes ({:.1} MiB) vs current\n \
         compressed on-disk ({:.1} MiB) — chunk-CDN only nets storage if per-chunk\n \
         recompression keeps unique below {:.1} MiB.",
        mib(unique_bytes),
        mib(total_compressed_on_disk),
        mib(total_compressed_on_disk)
    );
}