1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
//! Measure: how does worker RSS / phys_footprint evolve across a
//! pool of N idle VMs over time?
//!
//! What we want to learn:
//! 1. Initial RSS right after restore (before balloon inflates).
//! 2. RSS at `inflate_complete` (balloon driver finished pushing
//! PFNs, host madvise(MADV_FREE)'d them).
//! 3. Steady-state RSS over a 30 s idle window.
//! 4. Whether RSS recovers if we put memory pressure on the host.
//!
//! Output: a CSV-style row per sample so we can plot if useful.
//!
//! Usage:
//! SUPERMACHINE_WORKER_BIN=$(pwd)/target/release/supermachine-worker \
//! ./target/release/examples/_balloon_rss
//!
//! Env knobs:
//! POOL_SIZE=10 default
//! SAMPLE_INTERVAL_MS=500
//! SAMPLE_DURATION_S=30
//! IMAGE=alpine:latest
use std::process::Command;
use std::time::{Duration, Instant};
use supermachine::Image;
/// Resident set size of process `pid`, in kilobytes, via `ps -o rss=`.
///
/// macOS `ps` reports RSS in KB. phys_footprint (see `footprint_mib`)
/// is more accurate but slower to obtain; RSS is good enough for
/// relative trend tracking on the per-tick trace.
///
/// Returns `None` if `ps` can't be spawned, exits nonzero (e.g. the
/// pid doesn't exist), or prints something unparsable.
fn ps_rss_kb(pid: u32) -> Option<u64> {
    let pid_arg = pid.to_string();
    let output = Command::new("ps")
        .args(["-o", "rss=", "-p", &pid_arg])
        .output()
        .ok()?;
    if output.status.success() {
        let text = String::from_utf8(output.stdout).ok()?;
        text.trim().parse().ok()
    } else {
        None
    }
}
/// macOS `footprint -p PID` reports phys_footprint, which is the
/// authoritative "this process is responsible for N bytes of
/// physical memory" number. Unlike RSS, it accounts correctly for
/// MADV_FREE'd pages (they're excluded). Slow (~30 ms per call)
/// because it forks the footprint tool, so we use it only for
/// summary samples, not the per-tick trace.
///
/// Returns `None` if the tool can't be spawned, exits nonzero, or
/// its output doesn't contain a parsable footprint banner.
fn footprint_mib(pid: u32) -> Option<u64> {
    let out = Command::new("footprint")
        .args(["-p", &pid.to_string()])
        .output()
        .ok()?;
    if !out.status.success() {
        return None;
    }
    parse_footprint_mib(&String::from_utf8(out.stdout).ok()?)
}

/// Extract the phys_footprint value (in MiB, truncated) from
/// `footprint` tool output. Split out of `footprint_mib` so the
/// parsing is unit-testable without forking the tool.
///
/// The header banner line looks like:
///   supermachine-w [97248]: 64-bit Footprint: 86.6 MB (16384 bytes per page)
/// OR (smaller processes):
///   zsh [14583]: 64-bit Footprint: 2128 KB (16384 bytes per page)
fn parse_footprint_mib(s: &str) -> Option<u64> {
    for line in s.lines() {
        let lower = line.to_ascii_lowercase();
        if let Some(idx) = lower.find("footprint:") {
            // Byte index into `lower` is valid for `line` too:
            // ASCII lowercasing never changes byte lengths.
            let rest = &line[idx + "footprint:".len()..];
            let mut parts = rest.trim().split_whitespace();
            if let (Some(num), Some(unit)) = (parts.next(), parts.next()) {
                if let Ok(n) = num.parse::<f64>() {
                    let mib = match unit.to_ascii_lowercase().as_str() {
                        "kb" => n / 1024.0,
                        "mb" => n,
                        "gb" => n * 1024.0,
                        _ => continue, // unknown unit — keep scanning lines
                    };
                    return Some(mib as u64);
                }
            }
        }
    }
    None
}
/// PIDs of `supermachine-worker` processes that are direct children
/// of this process.
///
/// `pgrep -P <our pid>` scopes the match to our own children, so
/// unrelated workers elsewhere on the host are excluded. Any failure
/// (pgrep missing, no matches, non-UTF-8 output) yields an empty vec.
fn worker_pids() -> Vec<u32> {
    let parent = std::process::id().to_string();
    let spawned = Command::new("pgrep")
        .args(["-P", &parent, "-f", "supermachine-worker"])
        .output();
    match spawned {
        Ok(out) => match String::from_utf8(out.stdout) {
            Ok(text) => text
                .lines()
                .filter_map(|line| line.trim().parse().ok())
                .collect(),
            Err(_) => Vec::new(),
        },
        Err(_) => Vec::new(),
    }
}
/// Drive the benchmark: bake (or reuse) a snapshot, build a pool of
/// `POOL_SIZE` workers, optionally cycle a workload through each,
/// then trace combined worker RSS per tick and finish with one
/// phys_footprint reading per worker.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Tunables, all via env; defaults match the module docs above.
    let pool_size: usize = std::env::var("POOL_SIZE")
        .ok()
        .and_then(|s| s.parse().ok())
        .unwrap_or(10);
    let sample_interval = Duration::from_millis(
        std::env::var("SAMPLE_INTERVAL_MS")
            .ok()
            .and_then(|s| s.parse().ok())
            .unwrap_or(500),
    );
    let duration = Duration::from_secs(
        std::env::var("SAMPLE_DURATION_S")
            .ok()
            .and_then(|s| s.parse().ok())
            .unwrap_or(30),
    );
    let image_ref = std::env::var("IMAGE").unwrap_or_else(|_| "alpine:latest".to_owned());
    let home = std::env::var("HOME")?;
    let snap_name = "_balloon_rss_bench";
    let snap_dir = format!("{home}/.local/supermachine-snapshots/{snap_name}");
    // Bake the snapshot only once; later runs reuse it from disk.
    if !std::path::Path::new(&format!("{snap_dir}/restore.snap")).exists() {
        eprintln!("[bench] baking fresh snapshot of {image_ref}...");
        Image::builder(&image_ref)
            .with_name(snap_name)
            .with_memory_mib(256)
            .build()?;
        std::thread::sleep(Duration::from_secs(2)); // let bg save complete
    }
    let memory_mib = 256u64; // matches build above
    let total_mib_if_no_balloon = memory_mib * pool_size as u64;
    eprintln!(
        "[bench] pool of {pool_size} workers, {memory_mib} MiB each \
(would be {total_mib_if_no_balloon} MiB total without balloon)"
    );
    // min == max == pool_size: the pool holds exactly pool_size
    // workers (presumably restored eagerly — see the PID check
    // below, which assumes all are spawned by then).
    let img = Image::from_snapshot(&snap_dir)?;
    let pool_t0 = Instant::now();
    let pool = img
        .pool()
        .min(pool_size)
        .max(pool_size)
        .restore_on_release(true)
        .build()?;
    let pool_build_ms = pool_t0.elapsed().as_millis();
    eprintln!("[bench] pool built in {pool_build_ms} ms");
    // Optionally cycle every worker through an acquire/exec/release
    // so we measure post-workload steady-state, not just idle.
    if std::env::var("CYCLE_WORKLOAD").ok().as_deref() == Some("1") {
        eprintln!(
            "[bench] cycling each worker through acquire→exec→release \
to measure post-workload memory..."
        );
        let cycle_t0 = Instant::now();
        for _ in 0..pool_size {
            let vm = pool.acquire()?;
            // Touch ~30 MiB of guest memory by allocating in tmpfs,
            // then deleting. Avoids /dev/urandom (slow entropy on
            // alpine in a microVM) and avoids capturing 50MB of
            // stdout (which the agent's output buffer would have
            // to ferry over vsock). `dd if=/dev/zero` is fast and
            // its stdout goes to /tmp/x, not back to us.
            let _ = vm
                .exec_builder()
                .argv([
                    "sh",
                    "-c",
                    "dd if=/dev/zero of=/tmp/x bs=1M count=30 2>/dev/null; rm /tmp/x",
                ])
                .timeout(Duration::from_secs(15))
                .output();
            // Result ignored deliberately: a failed exec shouldn't
            // abort the benchmark, we just lose one cycle.
            drop(vm); // explicit release back to the pool
        }
        eprintln!(
            "[bench] cycled all {pool_size} workers in {} ms",
            cycle_t0.elapsed().as_millis()
        );
    }
    // Grab worker PIDs. Pool is min-eager so all spawned by now.
    let pids = worker_pids();
    eprintln!("[bench] tracking {} worker PIDs: {:?}", pids.len(), pids);
    if pids.len() != pool_size {
        eprintln!(
            "[bench] WARNING: expected {pool_size} pids, got {} — \
pgrep filter may have missed some",
            pids.len()
        );
    }
    // CSV header, then one row per sample tick (goes to stdout so it
    // can be redirected/plotted; status chatter goes to stderr).
    println!("t_ms,total_rss_mib,avg_rss_mib_per_worker,workers_seen");
    let t0 = Instant::now();
    while t0.elapsed() < duration {
        let mut total_kb = 0u64;
        let mut seen = 0usize;
        for &pid in &pids {
            // Workers that disappear mid-trace simply drop out of
            // `seen` rather than failing the run.
            if let Some(kb) = ps_rss_kb(pid) {
                total_kb += kb;
                seen += 1;
            }
        }
        let total_mib = total_kb / 1024;
        let avg_mib = if seen > 0 { total_kb / seen as u64 / 1024 } else { 0 };
        println!(
            "{},{},{},{}",
            t0.elapsed().as_millis(),
            total_mib,
            avg_mib,
            seen
        );
        std::thread::sleep(sample_interval);
    }
    // After the trace, take one phys_footprint sample per worker —
    // that's the authoritative MADV_FREE-aware number (slow, so it's
    // done once here rather than inside the sampling loop).
    eprintln!("\n[bench] phys_footprint (authoritative per-worker MiB):");
    let mut fp_total = 0u64;
    let mut fp_seen = 0usize;
    for &pid in &pids {
        if let Some(mib) = footprint_mib(pid) {
            eprintln!(" pid={pid:6} phys_footprint={mib:4} MiB");
            fp_total += mib;
            fp_seen += 1;
        }
    }
    if fp_seen > 0 {
        eprintln!(
            "[bench] phys_footprint TOTAL: {} MiB across {} workers (avg {} MiB/worker)",
            fp_total,
            fp_seen,
            fp_total / fp_seen as u64
        );
    }
    Ok(())
}