// supermachine 0.7.41
//
// Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
//! Snapshot deduplication via APFS `clonefile`.
//!
//! Used by:
//!   * `supermachine dedup` CLI subcommand — offline batch pass
//!     across the whole snapshots dir.
//!   * The bake completion path (0.7.41+) — automatic dedup of
//!     each fresh snapshot against the best on-disk sibling.
//!
//! Mechanism: `clonefile(canonical, tmp) + pwrite_diff_pages
//! (target_current vs canonical) + atomic_rename(tmp → target)`.
//!
//! Properties:
//!   * Output is byte-identical to the pre-dedup input (checked via
//!     sha256 in the original commit), so correctness is preserved.
//!   * Atomic — interrupted runs leave either old or new file,
//!     never half-written.
//!   * APFS clonefile sharing is invisible to `stat`/`du` — the
//!     real savings show up in `statfs` on the data volume.
//!   * Bake-time auto-dedup adds ~1-3s per bake (page-by-page
//!     compare); paid once, all future restores benefit from the
//!     warm OS page cache.
//!
//! When restoring from a dedup'd snapshot:
//!   1. APFS-shared blocks mean the OS page cache is shared with
//!      every other mmap of those same blocks (e.g. earlier
//!      restores of related snapshots).
//!   2. Page faults on first guest access hit warm cache instead
//!      of disk read → faster cold restore.
//!   3. Host RAM accounting: shared pages cost once, not N times.

#![cfg(target_os = "macos")]

use std::os::unix::fs::{FileExt, MetadataExt};
use std::path::Path;

/// Outcome of one successful `dedup_against` run.
///
/// The two byte counts are derived from `stat` block counts
/// (`blocks() * 512`, saturating), not from the file length.
#[derive(Debug, Clone, Copy, Default)]
pub struct DedupStats {
    /// `blocks() * 512` of the target, sampled before it was replaced.
    pub apparent_before: u64,
    /// `blocks() * 512` of the target, sampled after the rename.
    pub apparent_after: u64,
    /// Number of 4096-byte pages (the final one may be shorter) whose
    /// content differed from the canonical and was rewritten into the clone.
    pub diff_pages_written: u64,
}

/// Failure modes of the dedup pipeline; each I/O variant names the step
/// that failed and carries the underlying `io::Error`.
#[derive(Debug)]
pub enum DedupError {
    /// Two lengths that must agree did not. In `dedup_against` these are
    /// the canonical and target file sizes; in `mmap_readonly`, `canon`
    /// holds the expected length and `target` the length actually found.
    SizeMismatch { canon: u64, target: u64 },
    /// Reading file metadata failed.
    Stat(std::io::Error),
    /// The `clonefile` call returned non-zero.
    Clonefile(std::io::Error),
    /// Opening a file for mapping, or the `mmap` call itself, failed.
    Mmap(std::io::Error),
    /// Opening the temp clone for writing, or a positioned write to it, failed.
    Write(std::io::Error),
    /// `sync_data` on the temp clone failed.
    Sync(std::io::Error),
    /// Renaming the temp clone over the target failed.
    Rename(std::io::Error),
    /// A path contained an interior NUL byte and cannot be passed to C.
    PathNul,
    /// The file to dedup (or map) has length zero.
    EmptyFile,
}

/// Human-readable rendering: I/O variants print as `<step>: <io error>`,
/// the remaining variants as a fixed or size-annotated message.
impl std::fmt::Display for DedupError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Variants without an io::Error payload are written directly; the
        // rest are reduced to a (step label, cause) pair and share one
        // format string below.
        let (step, cause) = match self {
            Self::SizeMismatch { canon, target } => {
                return write!(f, "size mismatch: canon={canon} target={target}");
            }
            Self::PathNul => return f.write_str("path contains NUL byte"),
            Self::EmptyFile => return f.write_str("empty file"),
            Self::Stat(e) => ("stat", e),
            Self::Clonefile(e) => ("clonefile", e),
            Self::Mmap(e) => ("mmap", e),
            Self::Write(e) => ("pwrite", e),
            Self::Sync(e) => ("fsync", e),
            Self::Rename(e) => ("rename", e),
        };
        write!(f, "{step}: {cause}")
    }
}

// `source()` is left at its default: the wrapped `io::Error` is already
// rendered inline by the `Display` impl.
impl std::error::Error for DedupError {}

/// Replace `target` with an APFS clone of `canon` that has been patched,
/// page by page, back to `target`'s exact content.
///
/// Steps:
///   1. `clonefile(canon, tmp)` where `tmp` is `target` with its extension
///      replaced by `snap.dedup.tmp`.
///   2. Compare `target` and `tmp` in 4096-byte pages and rewrite every
///      differing page in `tmp` with `target`'s bytes.
///   3. `sync_data` on `tmp`, then rename `tmp` over `target`.
///
/// On success the on-disk content of `target` is byte-identical to what it
/// was before the call. The returned [`DedupStats`] carries the `stat`
/// block-derived sizes before/after and the number of pages rewritten.
///
/// # Errors
///
/// * [`DedupError::SizeMismatch`] — `canon` and `target` differ in length
///   (callers should pre-check).
/// * [`DedupError::EmptyFile`] — both files are empty.
/// * [`DedupError::PathNul`] — a path contains an interior NUL byte.
/// * [`DedupError::Clonefile`], [`DedupError::Mmap`], [`DedupError::Write`],
///   [`DedupError::Sync`], [`DedupError::Rename`], [`DedupError::Stat`] —
///   the corresponding step failed.
///
/// If any step before the rename fails, `target` is left untouched and the
/// temp clone is removed (best-effort). A `Stat` error from the final
/// metadata read is returned even though the rename has already happened.
pub fn dedup_against(canon: &Path, target: &Path) -> Result<DedupStats, DedupError> {
    let canon_md = std::fs::metadata(canon).map_err(DedupError::Stat)?;
    let target_md = std::fs::metadata(target).map_err(DedupError::Stat)?;
    if canon_md.len() != target_md.len() {
        return Err(DedupError::SizeMismatch {
            canon: canon_md.len(),
            target: target_md.len(),
        });
    }
    let file_len = canon_md.len();
    if file_len == 0 {
        return Err(DedupError::EmptyFile);
    }
    let apparent_before = target_md.blocks().saturating_mul(512);

    let tmp_path = target.with_extension("snap.dedup.tmp");
    // clonefile refuses to overwrite; clear any leftover from an earlier run.
    let _ = std::fs::remove_file(&tmp_path);

    // APFS clonefile — tmp shares blocks with canon. ~ms for any size.
    {
        use std::os::unix::ffi::OsStrExt;
        let canon_c = std::ffi::CString::new(canon.as_os_str().as_bytes())
            .map_err(|_| DedupError::PathNul)?;
        let tmp_c = std::ffi::CString::new(tmp_path.as_os_str().as_bytes())
            .map_err(|_| DedupError::PathNul)?;
        // SAFETY: paths are valid C strings (no interior NUL).
        let ret = unsafe { libc::clonefile(canon_c.as_ptr(), tmp_c.as_ptr(), 0) };
        if ret != 0 {
            return Err(DedupError::Clonefile(std::io::Error::last_os_error()));
        }
    }

    // From here on the temp clone exists on disk. If patching or the rename
    // fails, remove it so failed runs do not accumulate stray
    // `*.snap.dedup.tmp` files next to the snapshot.
    let diff_pages = match patch_clone_and_swap(target, &tmp_path, file_len) {
        Ok(pages) => pages,
        Err(e) => {
            let _ = std::fs::remove_file(&tmp_path);
            return Err(e);
        }
    };

    let new_md = std::fs::metadata(target).map_err(DedupError::Stat)?;
    Ok(DedupStats {
        apparent_before,
        apparent_after: new_md.blocks().saturating_mul(512),
        diff_pages_written: diff_pages,
    })
}

/// Patch the clone at `tmp_path` so its bytes equal `target`'s, flush it,
/// and rename it over `target`. Returns the number of pages rewritten.
fn patch_clone_and_swap(
    target: &Path,
    tmp_path: &Path,
    file_len: u64,
) -> Result<u64, DedupError> {
    const PAGE: usize = 4096;

    // Both mappings are validated against `file_len`, so they have equal
    // length and the zipped page iterators below stay in lock-step.
    let target_data = mmap_readonly(target, file_len)?;
    let tmp_data = mmap_readonly(tmp_path, file_len)?;
    let tmp_file = std::fs::OpenOptions::new()
        .write(true)
        .open(tmp_path)
        .map_err(DedupError::Write)?;

    let mut diff_pages: u64 = 0;
    let mut off: u64 = 0;
    for (wanted, cloned) in target_data.chunks(PAGE).zip(tmp_data.chunks(PAGE)) {
        // Only pages that actually differ are written, so untouched pages
        // keep sharing blocks with the canonical.
        if wanted != cloned {
            tmp_file
                .write_all_at(wanted, off)
                .map_err(DedupError::Write)?;
            diff_pages += 1;
        }
        off += wanted.len() as u64;
    }
    tmp_file.sync_data().map_err(DedupError::Sync)?;

    // Release the mappings and the write handle before swapping files.
    drop(target_data);
    drop(tmp_data);
    drop(tmp_file);

    std::fs::rename(tmp_path, target).map_err(DedupError::Rename)?;
    Ok(diff_pages)
}

/// Pick the newest sibling snapshot that can serve as the dedup canonical
/// for the snapshot stored in `fresh_snap_dir`.
///
/// A sibling is a sub-directory of `snapshots_dir`, other than
/// `fresh_snap_dir` itself, for which all of the following hold:
///   * its `restore.snap` has the same byte length as the fresh one;
///   * its `metadata.json` parses as JSON and the `image`, `memory_mib`
///     and `baked_by_version` fields equal the corresponding arguments.
///
/// Among the matches, the one whose `restore.snap` has the latest mtime
/// (compared in whole seconds; an earlier-listed entry wins ties) is
/// chosen, and the path to that `restore.snap` is returned.
///
/// Returns `Ok(None)` when the fresh `restore.snap` cannot be stat'ed or
/// no sibling qualifies — the caller should keep the fresh snapshot as
/// written. `Err` is returned only when `snapshots_dir` cannot be listed.
pub fn find_best_sibling(
    snapshots_dir: &Path,
    fresh_snap_dir: &Path,
    image: &str,
    memory_mib: u32,
    baked_by_version: &str,
) -> std::io::Result<Option<std::path::PathBuf>> {
    let own_dir = std::fs::canonicalize(fresh_snap_dir).ok();
    let Ok(fresh_md) = std::fs::metadata(fresh_snap_dir.join("restore.snap")) else {
        return Ok(None);
    };
    let wanted_len = fresh_md.len();

    // Judge a single directory entry. `None` means "not a usable sibling";
    // `Some` carries (mtime in seconds, path to its restore.snap).
    let evaluate = |dir: std::path::PathBuf| -> Option<(u64, std::path::PathBuf)> {
        if !dir.is_dir() {
            return None;
        }
        // Skip the fresh snapshot's own directory (compared after resolving
        // both paths, so differently spelled paths still match).
        if let Some(own) = &own_dir {
            if std::fs::canonicalize(&dir).is_ok_and(|resolved| resolved == *own) {
                return None;
            }
        }
        let snap = dir.join("restore.snap");
        let snap_md = std::fs::metadata(&snap).ok()?;
        if snap_md.len() != wanted_len {
            return None;
        }
        let raw = std::fs::read_to_string(dir.join("metadata.json")).ok()?;
        let meta: serde_json::Value = serde_json::from_str(&raw).ok()?;
        let same_image = meta.get("image").and_then(|v| v.as_str()) == Some(image);
        let same_memory =
            meta.get("memory_mib").and_then(|v| v.as_u64()) == Some(memory_mib as u64);
        let same_version =
            meta.get("baked_by_version").and_then(|v| v.as_str()) == Some(baked_by_version);
        if !(same_image && same_memory && same_version) {
            return None;
        }
        // An unreadable or pre-epoch mtime ranks as 0 (oldest).
        let mtime_secs = snap_md
            .modified()
            .ok()
            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
            .map_or(0, |d| d.as_secs());
        Some((mtime_secs, snap))
    };

    let mut newest: Option<(u64, std::path::PathBuf)> = None;
    let listing = std::fs::read_dir(snapshots_dir)?;
    for found in listing.flatten().filter_map(|entry| evaluate(entry.path())) {
        // Strictly-greater comparison: the first entry seen wins ties.
        let replaces = match &newest {
            None => true,
            Some((best_secs, _)) => found.0 > *best_secs,
        };
        if replaces {
            newest = Some(found);
        }
    }
    Ok(newest.map(|(_, snap)| snap))
}

/// Convenience: auto-dedup a freshly-baked snapshot against the
/// best on-disk sibling. Called by the bake driver at completion.
///
/// Best-effort — any failure (no sibling, directory listing error,
/// clonefile EXDEV, permission denied, …) returns `None` and the caller
/// continues with the original fresh snapshot file. The fresh snapshot is
/// only ever replaced via an atomic rename, so on failure the caller sees
/// the original file unchanged.
///
/// Returns `Some(stats)` only when the dedup actually ran and succeeded.
///
/// Opt-out via `SUPERMACHINE_AUTO_DEDUP=0`. A one-line result (success or
/// failure) is printed to stderr when `trace::enabled("dedup")` or
/// `trace::enabled("bake")` is true.
pub fn auto_dedup_on_bake(
    snapshots_dir: &Path,
    fresh_snap_dir: &Path,
    image: &str,
    memory_mib: u32,
    baked_by_version: &str,
) -> Option<DedupStats> {
    if std::env::var("SUPERMACHINE_AUTO_DEDUP").as_deref() == Ok("0") {
        return None;
    }
    // A scan error is deliberately treated the same as "no sibling found".
    let candidate = find_best_sibling(
        snapshots_dir,
        fresh_snap_dir,
        image,
        memory_mib,
        baked_by_version,
    )
    .ok()
    .flatten()?;

    let target = fresh_snap_dir.join("restore.snap");
    let t0 = std::time::Instant::now();
    let result = dedup_against(&candidate, &target);
    let trace = crate::trace::enabled("dedup") || crate::trace::enabled("bake");
    match result {
        Ok(stats) => {
            if trace {
                // Before/after sizes are separated by an arrow; previously
                // they were printed back-to-back with no separator.
                eprintln!(
                    "[auto-dedup] OK in {:?}: {} -> {} (diff_pages={}; canonical={})",
                    t0.elapsed(),
                    fmt_bytes(stats.apparent_before),
                    fmt_bytes(stats.apparent_after),
                    stats.diff_pages_written,
                    candidate.display()
                );
            }
            Some(stats)
        }
        Err(e) => {
            if trace {
                eprintln!(
                    "[auto-dedup] FAIL in {:?}: {e} (canonical={}); fresh snapshot retained",
                    t0.elapsed(),
                    candidate.display()
                );
            }
            None
        }
    }
}

/// Render a byte count with a binary unit: one decimal for GiB, none for
/// MiB/KiB, and the exact integer below 1 KiB.
fn fmt_bytes(n: u64) -> String {
    const KIB: u64 = 1 << 10;
    const MIB: u64 = 1 << 20;
    const GIB: u64 = 1 << 30;

    let in_units = |unit: u64| n as f64 / unit as f64;
    match n {
        b if b >= GIB => format!("{:.1} GiB", in_units(GIB)),
        b if b >= MIB => format!("{:.0} MiB", in_units(MIB)),
        b if b >= KIB => format!("{:.0} KiB", in_units(KIB)),
        b => format!("{b} B"),
    }
}

/// Owning handle for a memory mapping created by `mmap_readonly`.
/// Dereferences to `[u8]` and unmaps itself on drop.
struct Mmap {
    /// Start address returned by `mmap`.
    ptr: *const u8,
    /// Length of the mapping in bytes (non-zero when built by `mmap_readonly`).
    len: usize,
}

// Lets callers index and slice the mapping as an ordinary `&[u8]`.
impl std::ops::Deref for Mmap {
    type Target = [u8];
    fn deref(&self) -> &[u8] {
        // SAFETY: ptr is from mmap with len bytes, valid until Drop.
        // The returned slice borrows `self`, so it cannot outlive the mapping.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }
}

// Unmaps the region when the handle goes out of scope.
impl Drop for Mmap {
    fn drop(&mut self) {
        // SAFETY: we own this mapping; unmap matches the prior mmap.
        // The return value of munmap is ignored — there is no way to
        // report or recover from a failure inside Drop.
        unsafe {
            libc::munmap(self.ptr as *mut libc::c_void, self.len);
        }
    }
}

// SAFETY: read-only mmap of a real file is Send + Sync.
// The raw pointer field suppresses the auto traits, hence the manual impls.
// NOTE(review): within this file `Mmap` is private and only used on a single
// thread inside `dedup_against`; confirm whether these impls are still needed.
unsafe impl Send for Mmap {}
unsafe impl Sync for Mmap {}

fn mmap_readonly(path: &Path, expected_len: u64) -> Result<Mmap, DedupError> {
    use std::os::unix::io::AsRawFd;
    let file = std::fs::File::open(path).map_err(DedupError::Mmap)?;
    let md = file.metadata().map_err(DedupError::Stat)?;
    if md.len() != expected_len {
        return Err(DedupError::SizeMismatch {
            canon: expected_len,
            target: md.len(),
        });
    }
    let len = md.len() as usize;
    if len == 0 {
        return Err(DedupError::EmptyFile);
    }
    // SAFETY: standard mmap usage. len is non-zero, fd is valid for the open file.
    let ptr = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            len,
            libc::PROT_READ,
            libc::MAP_PRIVATE,
            file.as_raw_fd(),
            0,
        )
    };
    if ptr == libc::MAP_FAILED {
        return Err(DedupError::Mmap(std::io::Error::last_os_error()));
    }
    Ok(Mmap {
        ptr: ptr as *const u8,
        len,
    })
}