supermachine 0.7.82

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
//! Chunk-dedup opportunity probe (measurement only — informs whether a
//! content-defined-chunk CDN is worth building).
//!
//! The registry blob cache already stores each *unique layer* once (exact-layer
//! dedup by sha256). Content-defined chunking only pays off if there is real
//! *sub-layer* redundancy ACROSS those already-deduped layers — e.g. two image
//! versions whose layers differ by a few files, or common files duplicated
//! across unrelated layers. This probe quantifies exactly that.
//!
//! Method: walk the blob cache, gunzip each layer (skip non-gzip config/manifest
//! blobs), split the uncompressed bytes with a FastCDC-style gear-hash chunker,
//! and compare total uncompressed bytes vs unique-chunk bytes. The headline
//! number is `unique_chunk_bytes / total_uncompressed` — how much a chunk store
//! would hold vs naive per-layer storage of the same (already exact-deduped)
//! layers. It also prints current on-disk compressed size for context.
//!
//! Usage: `_chunk_dedup_probe [BLOB_CACHE_DIR]`
//! (default: $HOME/.local/supermachine-layer-cache/registry/blobs/sha256)

use std::collections::HashSet;
use std::hash::Hasher;
use std::io::Read;

/// Builds the 256-entry gear table used by the content-defined chunker.
///
/// Each entry is one output of a SplitMix64 generator whose state starts at a
/// fixed constant, so the table — and therefore every chunk boundary derived
/// from it — is identical on every run. The chunker indexes this table by byte
/// value and declares a boundary when the low `AVG_MASK_BITS` bits of its
/// rolling hash are all zero.
fn gear_table() -> [u64; 256] {
    // SplitMix64 increment; also used here as the initial state.
    const GOLDEN_GAMMA: u64 = 0x9E37_79B9_7F4A_7C15;

    // SplitMix64 output finalizer: two xor-shift/multiply rounds and a last xor-shift.
    let finalize = |state: u64| -> u64 {
        let a = (state ^ (state >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        let b = (a ^ (a >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        b ^ (b >> 31)
    };

    let mut table = [0u64; 256];
    let mut state = GOLDEN_GAMMA;
    for entry in &mut table {
        // Advance the state first, then emit — the seed itself is never output.
        state = state.wrapping_add(GOLDEN_GAMMA);
        *entry = finalize(state);
    }
    table
}

/// Smallest chunk the chunker will emit, except for a final short tail.
const MIN_CHUNK: usize = 2 * 1024;
/// Number of low hash bits that must be zero at a boundary (~16 KiB average).
const AVG_MASK_BITS: u32 = 14;
/// Hard upper bound on chunk length; a cut is forced here if no boundary hit.
const MAX_CHUNK: usize = 64 * 1024;

/// Splits `data` into content-defined chunks and passes each one to `f` in order.
///
/// A rolling gear hash is restarted at the beginning of every chunk and updated
/// per byte as `h = (h << 1) + gear[byte]`. A chunk ends at the first position
/// where at least `MIN_CHUNK` bytes have been consumed and the low
/// `AVG_MASK_BITS` bits of the hash are zero, or at `MAX_CHUNK` bytes, or at the
/// end of `data`, whichever comes first. The chunks are contiguous, non-empty,
/// and concatenate back to `data`; empty input produces no calls to `f`.
///
/// Because the hash shifts left by one bit per byte, the masked low bits depend
/// only on the most recent `AVG_MASK_BITS` bytes.
fn cdc_chunks(data: &[u8], gear: &[u64; 256], mut f: impl FnMut(&[u8])) {
    let boundary_mask = (1u64 << AVG_MASK_BITS) - 1;
    let mut rest = data;

    while !rest.is_empty() {
        // Never look further than MAX_CHUNK bytes ahead for a boundary.
        let window = &rest[..rest.len().min(MAX_CHUNK)];

        // Default to the whole window: forced cut at MAX_CHUNK or end of input.
        let mut len = window.len();
        let mut rolling: u64 = 0;
        for (idx, &byte) in window.iter().enumerate() {
            rolling = (rolling << 1).wrapping_add(gear[usize::from(byte)]);
            let consumed = idx + 1;
            if consumed >= MIN_CHUNK && (rolling & boundary_mask) == 0 {
                len = consumed;
                break;
            }
        }

        let (chunk, tail) = rest.split_at(len);
        f(chunk);
        rest = tail;
    }
}

/// Returns the 64-bit identity used to decide whether a chunk was seen before.
///
/// The raw bytes are fed straight into a freshly constructed `DefaultHasher`
/// (no length prefix), so equal byte content always yields the same value
/// within one run. 64 bits keeps the seen-set small; collisions are treated as
/// negligible for the volume of chunks this probe handles.
fn hash_chunk(chunk: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;

    let mut state = DefaultHasher::new();
    Hasher::write(&mut state, chunk);
    Hasher::finish(&state)
}

/// Probe entry point.
///
/// Reads every regular file in the blob-cache directory (first CLI argument,
/// or `$HOME/.local/supermachine-layer-cache/registry/blobs/sha256`), gunzips
/// the ones that start with the gzip magic, chunks the uncompressed bytes with
/// [`cdc_chunks`], and reports total vs unique-chunk bytes on stderr.
///
/// Exits with status 1 if the directory cannot be listed. Blobs that cannot be
/// read or decompressed are skipped with a warning and counted, so a biased
/// measurement is visible rather than silent.
fn main() {
    let default_dir = format!(
        "{}/.local/supermachine-layer-cache/registry/blobs/sha256",
        std::env::var("HOME").unwrap_or_default()
    );
    let dir = std::env::args().nth(1).unwrap_or(default_dir);
    eprintln!("=== chunk-dedup probe over {dir} ===");

    let gear = gear_table();
    let mut total_compressed_on_disk: u64 = 0;
    let mut total_uncompressed: u64 = 0;
    let mut total_chunks: u64 = 0;
    let mut unique_bytes: u64 = 0;
    let mut layer_files = 0u64;
    // Entries/blobs dropped because of I/O or gzip errors (not non-gzip blobs).
    let mut skipped_on_error = 0u64;
    let mut seen: HashSet<u64> = HashSet::new();

    let entries = match std::fs::read_dir(&dir) {
        Ok(e) => e,
        Err(e) => {
            eprintln!("cannot read {dir}: {e}");
            std::process::exit(1);
        }
    };
    for ent in entries {
        let ent = match ent {
            Ok(ent) => ent,
            Err(e) => {
                eprintln!("warning: skipping unreadable directory entry in {dir}: {e}");
                skipped_on_error += 1;
                continue;
            }
        };
        let path = ent.path();
        if !path.is_file() {
            continue;
        }
        let raw = match std::fs::read(&path) {
            Ok(b) => b,
            Err(e) => {
                eprintln!("warning: skipping {} (read failed: {e})", path.display());
                skipped_on_error += 1;
                continue;
            }
        };
        // Counts every readable regular file, including the non-gzip blobs
        // that are skipped just below.
        total_compressed_on_disk += raw.len() as u64;
        // Only layers are gzip (magic 1f 8b); skip config/manifest JSON blobs.
        if !raw.starts_with(&[0x1f, 0x8b]) {
            continue;
        }
        // MultiGzDecoder: a plain GzDecoder stops after the first gzip member,
        // which would silently truncate multi-member layer blobs.
        let mut gz = flate2::read::MultiGzDecoder::new(&raw[..]);
        let mut data = Vec::new();
        if let Err(e) = gz.read_to_end(&mut data) {
            eprintln!(
                "warning: skipping {} (gzip decode failed: {e})",
                path.display()
            );
            skipped_on_error += 1;
            continue;
        }
        if data.is_empty() {
            continue;
        }
        layer_files += 1;
        total_uncompressed += data.len() as u64;
        cdc_chunks(&data, &gear, |chunk| {
            total_chunks += 1;
            // Only the first occurrence of a chunk contributes to unique bytes.
            if seen.insert(hash_chunk(chunk)) {
                unique_bytes += chunk.len() as u64;
            }
        });
    }

    let mib = |b: u64| b as f64 / (1024.0 * 1024.0);
    // Percentage a/b, defined as 0 when nothing was processed.
    let pct = |a: u64, b: u64| {
        if b > 0 {
            100.0 * a as f64 / b as f64
        } else {
            0.0
        }
    };
    eprintln!("layers (gzip blobs) processed : {layer_files}");
    eprintln!(
        "current on-disk (compressed)  : {:.1} MiB  (exact-layer-deduped already)",
        mib(total_compressed_on_disk)
    );
    eprintln!(
        "uncompressed layer bytes      : {:.1} MiB  ({} chunks, avg {:.1} KiB)",
        mib(total_uncompressed),
        total_chunks,
        if total_chunks > 0 {
            total_uncompressed as f64 / total_chunks as f64 / 1024.0
        } else {
            0.0
        }
    );
    eprintln!(
        "unique chunk bytes            : {:.1} MiB  ({:.1}% of uncompressed)",
        mib(unique_bytes),
        pct(unique_bytes, total_uncompressed)
    );
    eprintln!(
        "==> sub-layer redundancy found by chunking: {:.1}%  (100% - unique/uncompressed)",
        100.0 - pct(unique_bytes, total_uncompressed)
    );
    eprintln!(
        "    note: chunk store holds UNCOMPRESSED unique bytes ({:.1} MiB) vs current\n    \
         compressed on-disk ({:.1} MiB) — chunk-CDN only nets storage if per-chunk\n    \
         recompression keeps unique below {:.1} MiB.",
        mib(unique_bytes),
        mib(total_compressed_on_disk),
        mib(total_compressed_on_disk)
    );
    if skipped_on_error > 0 {
        eprintln!(
            "    warning: {skipped_on_error} entr(y/ies) skipped due to read/gzip errors — \
             figures above exclude them."
        );
    }
}