supermachine 0.7.47

//! Spike 22 command-line harness.
//!
//! The VM implementation lives in the library crate. This binary parses CLI
//! flags, applies process-wide harness hooks, and calls `vmm::runner`.

// 0.7.42+: per-process global allocator swap to mimalloc.
//
// The worker is the hot path for runtime allocation:
//   * vCPU exit handlers allocating per-event tokio futures
//   * Vsock relay packet buffers
//   * Virtio-fs / blk request frames
//   * Snapshot save's page-by-page diff buffers
//
// mimalloc gives ~5-15% on this mix and reduces the per-worker
// phys_footprint by 5-10 MiB versus macOS's system malloc.
// Per-binary opt-in (library consumers of `supermachine` keep
// their own allocator).
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

use supermachine as vmm;

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
/// Returns an owned copy of the argument that follows position `i`.
///
/// `i` is the index of the flag itself, so the value is looked up at
/// `i + 1`. When no such element exists, a `"<flag>: missing value"`
/// diagnostic is written to stderr and the process exits with status 2.
fn arg_value(args: &[String], i: usize, flag: &str) -> String {
    match args.get(i + 1) {
        Some(found) => found.clone(),
        None => {
            eprintln!("{flag}: missing value");
            std::process::exit(2)
        }
    }
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
/// Fetches the value following the flag at index `i` (via `arg_value`)
/// and parses it into `T`.
///
/// A value that fails to parse is reported on stderr as
/// `"<flag>: invalid value <value>: <error>"` and the process exits
/// with status 2.
fn parse_arg<T>(args: &[String], i: usize, flag: &str) -> T
where
    T: std::str::FromStr,
    T::Err: std::fmt::Display,
{
    let value = arg_value(args, i, flag);
    match value.parse::<T>() {
        Ok(parsed) => parsed,
        Err(e) => {
            eprintln!("{flag}: invalid value {value:?}: {e}");
            std::process::exit(2)
        }
    }
}

/// Worker entry point (macOS / aarch64 only).
///
/// In order, this function:
///   1. initialises logging and applies the P-core hint to the main thread;
///   2. answers the `--version` probe and returns;
///   3. starts the optional resume-token pusher threads and the
///      parent-death watchdog;
///   4. parses the CLI flags into local state (any malformed flag prints a
///      diagnostic and exits with status 2);
///   5. performs the optional `--pool-worker` handshake with the supervisor;
///   6. finalises the kernel cmdline (TSI token, IPv6 hints);
///   7. builds `VmResources` and hands control to `runner::run`.
///
/// The function never returns normally after `runner::run`: it ends in
/// `libc::_exit` on both the success and the failure path.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn main() {
    env_logger::init();

    use vmm::vmm::resources::{VmProfile, VmResources, DEFAULT_CMDLINE, DEFAULT_MEMORY_MIB};
    use vmm::vmm::runner::{self, RunOptions};

    /// True when the environment variable `name` is set to `1` or `true`.
    fn env_flag(name: &str) -> bool {
        std::env::var(name)
            .map(|v| v == "1" || v == "true")
            .unwrap_or(false)
    }

    /// Spawns a small named thread that injects `token` followed by `\n`
    /// into the serial RX path until `resumed()` reports true.
    ///
    /// Timing: one initial 50 ms sleep, then up to 30 attempts spaced
    /// 100 ms apart (about 3 s in total). `resumed` is checked before
    /// every push, so no bytes are injected once the resume has been
    /// confirmed. A failure to spawn the thread is logged and otherwise
    /// ignored (best effort).
    fn spawn_resume_pusher(thread_name: &str, token: u8, resumed: fn() -> bool) {
        let spawned = std::thread::Builder::new()
            .name(thread_name.into())
            // 64 KiB is plenty for a sleep + push loop.
            .stack_size(64 * 1024)
            .spawn(move || {
                // Initial sleep — let the runner reach the point
                // where restore + GIC + vCPU launch have completed.
                // 50 ms is empirically enough for the median; the
                // retry loop covers tails.
                std::thread::sleep(std::time::Duration::from_millis(50));
                for _ in 0..30 {
                    if resumed() {
                        break;
                    }
                    supermachine::devices::serial::push_rx_byte(token);
                    supermachine::devices::serial::push_rx_byte(b'\n');
                    std::thread::sleep(std::time::Duration::from_millis(100));
                }
            });
        if let Err(e) = spawned {
            eprintln!("[supermachine-worker] failed to spawn {thread_name} thread: {e}");
        }
    }

    // P-core hint for vCPU 0 (the main thread enters dispatch_vcpu).
    // Secondary vCPU threads call this themselves in run_secondary.
    vmm::vmm::worker::pin_vcpu_thread_to_pcore();

    let args: Vec<String> = std::env::args().collect();

    // Version probe — must run BEFORE any other arg handling. The
    // library spawns this binary with `--version` once on first
    // use to verify protocol compatibility (the worker and the
    // library MUST be the same crate version because the
    // supervisor protocol evolves between releases — e.g. 0.4.6
    // added BAKE_READY / SNAPSHOT_ASYNC, missing in older workers,
    // and a stale `~/.cargo/bin/supermachine-worker` deadlocks
    // pipelined-bake otherwise). Output is parsed by the lib;
    // format is stable: exactly `supermachine-worker <semver>\n`.
    if args.iter().any(|a| a == "--version") {
        println!("supermachine-worker {}", env!("CARGO_PKG_VERSION"));
        return;
    }

    {
        // Captured first so the watchdog below compares against the
        // parent we had at startup.
        //
        // The lib often spawns via `posix_spawn` which doesn't
        // change PPID, so the value we see here matches the
        // library process. If we were started directly from a
        // shell our initial PPID is the shell — and if THAT shell
        // dies, we should also clean up.
        //
        // SAFETY: getppid() takes no arguments and has no
        // preconditions.
        let initial_ppid = unsafe { libc::getppid() };

        // 0.7.43+ kernel-boot cache HIT auto-resume.
        //
        // When `SUPERMACHINE_KCACHE_RESUME=1` is in the env, this
        // worker was spawned with `--kcache-restore-from
        // <kcache.snap>`. The restored guest has init-oci paused
        // in `read(0)` at the kcache pause point. We push 'R\n'
        // to PL011 RX so init-oci's read returns and bake
        // continues.
        //
        // Robustness: we push in a LOOP rather than once, retrying
        // every 100 ms for up to 30 attempts (~3 s). Reasons it
        // might be needed:
        //   * GIC not yet created → gic_set_spi silently no-ops,
        //     bytes queued in RX_QUEUE but no IRQ delivered.
        //   * Restore in progress → kernel state mid-flux, IRQ
        //     might be lost.
        //   * Initial push raced the kernel's UART driver init.
        //
        // The kernel's PL011 driver reads UARTDR in a loop until
        // RXFE=1, so multiple pushes won't deliver duplicate
        // 'R\n' tokens — the FIRST successful delivery satisfies
        // init-oci's read; subsequent pushes leave bytes in
        // RX_QUEUE but init-oci's already past the read by then.
        // At most 60 bytes are pushed (30 attempts × 2 bytes).
        //
        // Stop condition: the retry loop polls the `KCACHE_RESUMED`
        // atomic before every push and exits once it is set. Per
        // the original design notes the PL011 line scanner sets it
        // when init-oci writes the "[SUPERMACHINE-KCACHE] resumed"
        // marker; stopping early avoids piling extra bytes into
        // the TTY for the post-pivot workload.
        if env_flag("SUPERMACHINE_KCACHE_RESUME") {
            spawn_resume_pusher("supermachine-kcache-resume", b'R', || {
                supermachine::devices::serial::KCACHE_RESUMED
                    .load(std::sync::atomic::Ordering::SeqCst)
            });
        }

        // 0.7.44+ pre-exec sync HIT auto-resume.
        //
        // When `SUPERMACHINE_PRE_EXEC_SYNC_RESUME=1` is in the env,
        // this worker was spawned to restore from a snapshot taken
        // with `supermachine.pre_exec_sync=1` — init-oci is paused
        // inside `read(0)` at the pre-exec pause point (post-pivot,
        // immediately before forking the workload). We push 'P\n'
        // so init-oci's read returns and the workload starts.
        //
        // Mirror of the kcache-resume thread above. Same robustness
        // story: GIC race / IRQ loss / mid-restore state flux —
        // retry every 100 ms until the PRE_EXEC_SYNC_RESUMED atomic
        // is set or the attempts run out.
        if env_flag("SUPERMACHINE_PRE_EXEC_SYNC_RESUME") {
            spawn_resume_pusher("supermachine-pre-exec-resume", b'P', || {
                supermachine::devices::serial::PRE_EXEC_SYNC_RESUMED
                    .load(std::sync::atomic::Ordering::SeqCst)
            });
        }

        // Parent-death watchdog. If the library process that spawned
        // us dies (clean exit, SIGKILL, crash, whatever), the OS
        // reparents us to init (PID 1). Poll getppid() every second;
        // on the first observation that PPID has changed away from
        // our initial parent, self-exit.
        //
        // This is the macOS-equivalent of Linux's
        // `prctl(PR_SET_PDEATHSIG)`. Always on. The 0.7.36
        // `SUPERMACHINE_WORKER_NO_WATCHDOG=1` kill-switch was deleted
        // in 0.7.37 — no real use case justified it, and "worker
        // without parent watchdog" is a footgun shape (orphan workers
        // piling up after a crashed lib process).
        let watchdog = std::thread::Builder::new()
            .name("supermachine-watchdog".into())
            // 64 KiB stack is enough for the sleep+getppid loop;
            // default 2 MiB is wasteful for a single-purpose thread.
            .stack_size(64 * 1024)
            .spawn(move || loop {
                std::thread::sleep(std::time::Duration::from_secs(1));
                // SAFETY: getppid() has no preconditions.
                let current = unsafe { libc::getppid() };
                if current != initial_ppid {
                    // Parent gone (reparented, usually to init=1).
                    // Exit cleanly — the lib's worker control socket
                    // already saw us go away.
                    eprintln!(
                        "[supermachine-worker] parent {initial_ppid} exited (now reparented to {current}); self-exiting"
                    );
                    // 0.7.44+ libc::_exit instead of process::exit:
                    // same shutdown-race rationale as in main()'s OK
                    // return path. If vCPU/muxer threads are live
                    // (the common case here — orphaned mid-run), we
                    // can't let C++ static destructors race them.
                    //
                    // SAFETY: _exit() terminates the process
                    // immediately and never returns.
                    unsafe { libc::_exit(0) };
                }
            });
        if let Err(e) = watchdog {
            // Still non-fatal, but an unwatched worker can be orphaned,
            // so make the failure visible instead of swallowing it.
            eprintln!("[supermachine-worker] failed to spawn supermachine-watchdog thread: {e}");
        }
    }

    let mut kernel_path: Option<String> = None;
    let mut initrd_path: Option<String> = None;
    let mut cmdline = String::from(DEFAULT_CMDLINE);
    let mut memory_mib: usize = DEFAULT_MEMORY_MIB;
    let mut blk_paths: Vec<String> = Vec::new();
    // `--volume HOST_FILE:GUEST_PATH` — writable attachments,
    // opened RW and ordered after RO layers in /dev/vd*. The
    // matching guest paths are written to /.supermachine-volumes
    // by the bake pipeline; init-oci consumes them post-pivot.
    let mut volumes: Vec<vmm::vmm::resources::VolumeSpec> = Vec::new();
    let mut mounts: Vec<vmm::vmm::resources::MountSpec> = Vec::new();
    let mut snapshot_after_ms: Option<u64> = None;
    let mut snapshot_at: Option<u64> = None;
    let mut snapshot_on_listener: bool = false;
    let mut snapshot_on_pre_exec: bool = false;
    let mut snapshot_out: Option<String> = None;
    let mut restore_from: Option<String> = None;
    // 0.7.43+ kernel-boot cache HIT path: restore is requested
    // BUT the worker stays in the bake-then-pool flow (i.e.
    // doesn't wait for a `RESTORE` command from supervisor).
    // The supermachine-kcache-resume thread pushes the resume
    // token shortly after spawn so init-oci's paused read() in
    // the cached snapshot unblocks.
    let mut kcache_restore_from: Option<String> = None;
    let mut cow_restore: bool = false;
    let mut quiesce_ms: u64 = 0;
    let mut balloon_target_pages: Option<u32> = None;
    let mut log_sink: Option<String> = None;
    let mut n_vcpus: u32 = 1;
    let mut profile: Option<VmProfile> = None;
    let mut vcpus_explicit = false;
    let mut tls_listen: Option<String> = None;
    let mut tls_vm_port: Option<u32> = None;
    let mut tls_cert: Option<String> = None;
    let mut tls_key: Option<String> = None;
    let mut env_pairs: Vec<(String, String)> = Vec::new();
    let mut env_file: Option<String> = None;
    let mut egress_policy: Option<String> = None;
    let mut vsock_mux: Option<String> = None;
    let mut vsock_mux_handoff: Option<String> = None;
    let mut vsock_exec: Option<String> = None;
    let mut vsock_exec_guest_port: Option<u32> = None;
    let mut http_port: Option<String> = None;
    let mut pool_worker: Option<String> = None;
    // TSI control-channel auth token (64 lowercase hex chars =
    // 32 bytes). When present we (a) append
    // `supermachine.tsi_token=<hex>` to the kernel cmdline so the
    // guest's af_tsi driver prepends the token to every control
    // DGRAM, and (b) hand the decoded 32 bytes to the vsock muxer
    // so it rejects any guest-userspace-forged control message.
    // See `kernel-build/patches/af-tsi/0014-*.patch`.
    let mut tsi_token_hex: Option<String> = None;

    // Flags that take a value consume two slots (`i += 2`); boolean
    // switches consume one. Anything unrecognised is a hard error.
    let mut i = 1;
    while i < args.len() {
        match args[i].as_str() {
            "--kernel" => {
                kernel_path = Some(arg_value(&args, i, "--kernel"));
                i += 2;
            }
            "--initramfs" => {
                initrd_path = Some(arg_value(&args, i, "--initramfs"));
                i += 2;
            }
            "--cmdline" => {
                cmdline = arg_value(&args, i, "--cmdline");
                i += 2;
            }
            "--memory" => {
                memory_mib = parse_arg(&args, i, "--memory");
                i += 2;
            }
            "--virtio-blk" => {
                blk_paths.push(arg_value(&args, i, "--virtio-blk"));
                i += 2;
            }
            "--volume" => {
                let raw = arg_value(&args, i, "--volume");
                // Encoding: HOST:GUEST (legacy) or HOST:GUEST:SIZE_BYTES
                // (0.7.0+, size_bytes optional with default 1 GiB).
                let parts: Vec<&str> = raw.splitn(3, ':').collect();
                if parts.len() < 2 {
                    eprintln!("--volume expects HOST:GUEST[:SIZE_BYTES], got {raw:?}");
                    std::process::exit(2);
                }
                let mut spec = vmm::vmm::resources::VolumeSpec::new(parts[0], parts[1]);
                if let Some(s) = parts.get(2) {
                    match s.parse::<u64>() {
                        Ok(sz) => spec = spec.with_size_bytes(sz),
                        Err(_) => {
                            eprintln!("--volume SIZE_BYTES not a u64: {s:?}");
                            std::process::exit(2);
                        }
                    }
                }
                volumes.push(spec);
                i += 2;
            }
            "--mount" => {
                let raw = arg_value(&args, i, "--mount");
                // Encoding: `HOST:TAG:GUEST_PATH[:POLICY]`. POLICY is
                // optional; when omitted, `SymlinkPolicy::default()`
                // is used. Splits left-to-right on `:` — host paths
                // on macOS never contain `:`.
                let parts: Vec<&str> = raw.splitn(4, ':').collect();
                let (host, tag, guest_path, policy_str): (
                    &str,
                    &str,
                    &str,
                    Option<&str>,
                ) = match parts.len() {
                    3 => (parts[0], parts[1], parts[2], None),
                    4 => (parts[0], parts[1], parts[2], Some(parts[3])),
                    _ => {
                        eprintln!(
                            "--mount expects HOST:TAG:GUEST_PATH[:POLICY], got {raw:?}"
                        );
                        std::process::exit(2);
                    }
                };
                if tag.is_empty() {
                    eprintln!("--mount tag is empty: {raw:?}");
                    std::process::exit(2);
                }
                if tag.len() > 35 {
                    eprintln!(
                        "--mount tag too long (max 35 bytes, got {}): {raw:?}",
                        tag.len()
                    );
                    std::process::exit(2);
                }
                if guest_path.is_empty() {
                    eprintln!("--mount guest_path is empty: {raw:?}");
                    std::process::exit(2);
                }
                if !guest_path.starts_with('/') {
                    eprintln!(
                        "--mount guest_path must be absolute (start with `/`), got {guest_path:?}"
                    );
                    std::process::exit(2);
                }
                let policy = match policy_str {
                    None => vmm::vmm::resources::SymlinkPolicy::default(),
                    Some("deny") => vmm::vmm::resources::SymlinkPolicy::Deny,
                    Some("opaque") => vmm::vmm::resources::SymlinkPolicy::Opaque,
                    Some("follow") => vmm::vmm::resources::SymlinkPolicy::Follow,
                    Some(other) => {
                        eprintln!(
                            "--mount policy must be one of: deny, opaque, follow (got {other:?})"
                        );
                        std::process::exit(2);
                    }
                };
                mounts.push(
                    vmm::vmm::resources::MountSpec::new(host, tag, guest_path)
                        .with_symlinks(policy),
                );
                i += 2;
            }
            "--vcpus" => {
                n_vcpus = parse_arg(&args, i, "--vcpus");
                vcpus_explicit = true;
                i += 2;
            }
            "--profile" => {
                let value = arg_value(&args, i, "--profile");
                profile = VmProfile::parse(&value).or_else(|| {
                    eprintln!("--profile: expected latency or throughput, got {value:?}");
                    std::process::exit(2);
                });
                i += 2;
            }
            "--snapshot-after-ms" => {
                snapshot_after_ms = Some(parse_arg(&args, i, "--snapshot-after-ms"));
                i += 2;
            }
            "--snapshot-at" => {
                snapshot_at = Some(parse_arg(&args, i, "--snapshot-at"));
                i += 2;
            }
            "--snapshot-on-listener" => {
                snapshot_on_listener = true;
                i += 1;
            }
            "--snapshot-on-pre-exec" => {
                snapshot_on_pre_exec = true;
                i += 1;
            }
            "--snapshot-out" => {
                snapshot_out = Some(arg_value(&args, i, "--snapshot-out"));
                i += 2;
            }
            "--balloon-target-pages" => {
                // Parsed directly as u32 so an out-of-range value is
                // rejected with a diagnostic instead of being silently
                // truncated by a `u64 as u32` cast.
                balloon_target_pages = Some(parse_arg(&args, i, "--balloon-target-pages"));
                i += 2;
            }
            "--restore-from" => {
                restore_from = Some(arg_value(&args, i, "--restore-from"));
                i += 2;
            }
            // 0.7.43+ kernel-boot cache HIT — see the comment on the
            // `kcache_restore_from` declaration. Distinct flag from
            // --restore-from so the bake-then-pool detection in the
            // --pool-worker handshake below stays correct.
            "--kcache-restore-from" => {
                kcache_restore_from =
                    Some(arg_value(&args, i, "--kcache-restore-from"));
                i += 2;
            }
            "--cow-restore" => {
                cow_restore = true;
                i += 1;
            }
            "--quiesce-ms" => {
                quiesce_ms = parse_arg(&args, i, "--quiesce-ms");
                i += 2;
            }
            "--log-sink" => {
                log_sink = Some(arg_value(&args, i, "--log-sink"));
                i += 2;
            }
            "--tls-listen" => {
                tls_listen = Some(arg_value(&args, i, "--tls-listen"));
                i += 2;
            }
            "--tls-vm-port" => {
                tls_vm_port = Some(parse_arg(&args, i, "--tls-vm-port"));
                i += 2;
            }
            "--tls-cert" => {
                tls_cert = Some(arg_value(&args, i, "--tls-cert"));
                i += 2;
            }
            "--tls-key" => {
                tls_key = Some(arg_value(&args, i, "--tls-key"));
                i += 2;
            }
            "--env" => {
                let value = arg_value(&args, i, "--env");
                // Split on the FIRST `=` only, so values may contain `=`.
                if let Some((k, v)) = value.split_once('=') {
                    env_pairs.push((k.to_string(), v.to_string()));
                } else {
                    eprintln!("--env: expected K=V, got {value:?}");
                    std::process::exit(2);
                }
                i += 2;
            }
            "--env-file" => {
                env_file = Some(arg_value(&args, i, "--env-file"));
                i += 2;
            }
            "--egress-policy" => {
                egress_policy = Some(arg_value(&args, i, "--egress-policy"));
                i += 2;
            }
            "--vsock-mux" => {
                vsock_mux = Some(arg_value(&args, i, "--vsock-mux"));
                i += 2;
            }
            "--vsock-mux-handoff" => {
                vsock_mux_handoff = Some(arg_value(&args, i, "--vsock-mux-handoff"));
                i += 2;
            }
            "--vsock-exec" => {
                vsock_exec = Some(arg_value(&args, i, "--vsock-exec"));
                i += 2;
            }
            "--vsock-exec-guest-port" => {
                let v = arg_value(&args, i, "--vsock-exec-guest-port");
                vsock_exec_guest_port = Some(v.parse().unwrap_or_else(|e| {
                    eprintln!("--vsock-exec-guest-port: {e}");
                    std::process::exit(2);
                }));
                i += 2;
            }
            "--http-port" => {
                http_port = Some(arg_value(&args, i, "--http-port"));
                i += 2;
            }
            "--pool-worker" => {
                pool_worker = Some(arg_value(&args, i, "--pool-worker"));
                i += 2;
            }
            "--tsi-token" => {
                let v = arg_value(&args, i, "--tsi-token");
                // Exactly 64 ASCII hex digits. Upper-case digits are
                // accepted and normalised to lower-case below.
                if v.len() != 64 || !v.bytes().all(|b| b.is_ascii_hexdigit()) {
                    eprintln!(
                        "--tsi-token: expected 64 lowercase hex chars (32 bytes), got {}",
                        v.len()
                    );
                    std::process::exit(2);
                }
                tsi_token_hex = Some(v.to_ascii_lowercase());
                i += 2;
            }
            _ => {
                eprintln!("unknown arg: {}", args[i]);
                std::process::exit(2);
            }
        }
    }

    // TLS is all-or-nothing: listen address, cert and key must be
    // supplied together; the VM port is optional.
    let tls_cfg = match (tls_listen, tls_cert, tls_key) {
        (Some(l), Some(c), Some(k)) => Some(vmm::vmm::tls::TlsConfig {
            listen_addr: l,
            vm_port: tls_vm_port,
            cert_path: c,
            key_path: k,
        }),
        (None, None, None) => None,
        _ => {
            eprintln!("--tls-listen / --tls-cert / --tls-key must all be set together (--tls-vm-port optional)");
            std::process::exit(2);
        }
    };

    if let Some(p) = log_sink.as_deref() {
        vmm::devices::serial::set_log_sink(p).unwrap_or_else(|e| {
            eprintln!("--log-sink: {e}");
            std::process::exit(2);
        });
    }
    // Enable line-marker detection (heartbeat counter + "parking
    // PID 1") for any snapshot mode. Listener-only bakes use the
    // parked-marker as the early non-service fallback so they
    // don't sit through the full --snapshot-after-ms timeout.
    vmm::devices::serial::set_heartbeat_detection(
        snapshot_at.is_some()
            || snapshot_on_listener
            || snapshot_on_pre_exec
            || snapshot_after_ms.is_some(),
    );

    if let Some(json) = build_env_payload(&env_pairs, env_file.as_deref()) {
        eprintln!(
            "  env JSON: {} bytes (served on AF_VSOCK port 1026)",
            json.len()
        );
        vmm::devices::virtio::vsock::muxer::set_env_json(json);
    }

    if let Some(p) = egress_policy.as_deref() {
        eprintln!("  egress policy: {p}");
        vmm::vmm::egress_policy::set(p);
    }

    let mut pool_restore_path: Option<String> = None;
    let mut pool_sock: Option<std::os::unix::net::UnixStream> = None;
    // Capture bake-then-pool out of the inner block so we can
    // propagate it via RunOptions::bake_then_pool to runner::run.
    let mut bake_then_pool_flag: bool = false;
    if let Some(sock_path) = pool_worker.as_deref() {
        use std::io::{BufRead, Write};
        let mut sock = std::os::unix::net::UnixStream::connect(sock_path).unwrap_or_else(|e| {
            eprintln!("--pool-worker connect {sock_path}: {e}");
            std::process::exit(1);
        });
        // Best effort: if this write fails, the read below (or the
        // runner) will surface the broken socket.
        sock.write_all(b"READY\n").ok();
        eprintln!("  pool-worker connected to {sock_path}");
        // Bake-then-pool decision: we're in bake-then-pool mode if
        // the spawn doesn't carry a regular --restore-from AND no
        // --snapshot-out (which would trigger a snapshot capture
        // and require a destination path). The lib doesn't send an
        // initial RESTORE in this mode — it waits for BAKE_READY
        // (emitted by the runner after init reaches its readiness
        // trigger). Skip the RESTORE handshake; runner.rs detects
        // bake-then-pool and signals BAKE_READY itself.
        //
        // The kcache-HIT path uses --kcache-restore-from (a 0.7.43+
        // additional flag) instead of --restore-from. That flag
        // adds a restore at resources construction time but does
        // NOT flip us out of bake-then-pool mode — see
        // RunOptions::bake_then_pool for the rationale.
        let bake_then_pool = restore_from.is_none() && snapshot_out.is_none();
        bake_then_pool_flag = bake_then_pool;
        if bake_then_pool {
            pool_sock = Some(sock);
        } else {
            // NOTE(review): the BufReader may buffer bytes past the
            // first line; they are dropped with the reader. This
            // presumably relies on the supervisor not sending more
            // before the worker responds — confirm.
            let mut reader = std::io::BufReader::new(sock.try_clone().unwrap_or_else(|e| {
                eprintln!("--pool-worker clone {sock_path}: {e}");
                std::process::exit(1);
            }));
            let mut line = String::new();
            // `read_line` reports end-of-stream as `Ok(0)`, not as an
            // error, so both cases mean the supervisor went away
            // before sending RESTORE.
            match reader.read_line(&mut line) {
                Ok(0) | Err(_) => {
                    eprintln!("  pool-worker: supervisor closed before RESTORE");
                    std::process::exit(0);
                }
                Ok(_) => {}
            }
            // Expected shape: `RESTORE <path> [key=value ...]`.
            let cmd = line.trim();
            let Some(rest) = cmd.strip_prefix("RESTORE ") else {
                eprintln!("  pool-worker: expected RESTORE, got {cmd:?}");
                std::process::exit(1);
            };
            let mut parts = rest.split_ascii_whitespace();
            let base = parts.next().unwrap_or("").to_string();
            // Only `egress_policy=` is recognised; other tokens are
            // ignored.
            for kv in parts {
                if let Some(v) = kv.strip_prefix("egress_policy=") {
                    vmm::vmm::egress_policy::set(v);
                }
            }
            pool_restore_path = Some(base);
            pool_sock = Some(sock);
            cow_restore = true;
        }
    }
    // A path received over the pool socket wins over --restore-from.
    let restore_from = pool_restore_path.or(restore_from);

    // TSI auth token: decode the hex once and append the kernel
    // cmdline arg. The cmdline string is extended unconditionally
    // here; NOTE(review): on restore the cmdline is presumably
    // not consumed, because the kernel's
    // `static u8 tsi_auth_token[32]` is already captured in the
    // snapshot's memory image — confirm. On restore we still set
    // `tsi_token` on VmResources (the muxer enforces on every
    // dispatch). The bake-time worker appended the cmdline;
    // restoring re-uses that same token, which is why the library
    // passes the SAME hex string from metadata.json on every
    // restore.
    let tsi_token_bytes: Option<[u8; 32]> = tsi_token_hex.as_deref().map(|hex| {
        let mut out = [0u8; 32];
        // `hex` was validated as 64 ASCII hex digits during flag
        // parsing, so the two-byte slices below are always in bounds
        // and on char boundaries.
        for (i, byte) in out.iter_mut().enumerate() {
            *byte = u8::from_str_radix(&hex[i * 2..i * 2 + 2], 16).unwrap_or_else(|e| {
                eprintln!("--tsi-token: malformed hex at byte {i}: {e}");
                std::process::exit(2);
            });
        }
        out
    });
    if let Some(hex) = tsi_token_hex.as_deref() {
        if !cmdline.is_empty() && !cmdline.ends_with(' ') {
            cmdline.push(' ');
        }
        cmdline.push_str("supermachine.tsi_token=");
        cmdline.push_str(hex);
    }

    // Guest IPv6 policy: **enabled by default** as of 0.7.36.
    //
    // History. The 2026-05-20 essential-upsell integrator field
    // report described `server.listen(443, '::', cb)` blocking
    // indefinitely inside the guest. The initial 0.7.35 fix
    // disabled v6 entirely (`ipv6.disable=1` on cmdline) so
    // AF_INET6 socket() would fail fast with EAFNOSUPPORT. That
    // turned a silent hang into a visible error, but it cost
    // every legitimate v6 user.
    //
    // Direct repro testing in 0.7.36 prep showed v6 listen
    // ACTUALLY WORKS in our kernel — `bind(::, 23456)` returns in
    // <1 ms, `listen(128)` returns in <1 ms, Node's
    // `http.createServer().listen(443, '::', cb)` fires the cb in
    // ~1.5 s (limited by the wait timer in the test, not by TSI).
    // Our TSI patch series (0009-0014, with our local 0011
    // adding isocket listen for guest-local loopback) implements
    // AF_TSI6 alongside AF_TSI, so v6 binds flow through TSI's
    // dual-stack host-side listener (`bind_dual_stack()` in
    // `tsi_stream.rs`) the same way v4 binds do.
    //
    // The integrator's hang was almost certainly from a binary
    // that predated patch 0011 — the loopback-broken state could
    // make Node's accept-poll wedge. Patch 0011 has shipped since
    // 0.7.x; today's kernel handles v6 cleanly.
    //
    // Kill-switch. If a workload-specific edge case re-introduces
    // a v6 hang, set `SUPERMACHINE_GUEST_IPV6=0` on the host
    // process to add `ipv6.disable=1` back to the cmdline.
    // AF_INET6 socket() returns EAFNOSUPPORT and apps fall back
    // to v4. Document the workload in a bug report so we can
    // chase it; the kill-switch is meant to be transient.
    //
    // 0.7.44+ prefer SUPERMACHINE_HOST_IPV6 (set by the lib's
    // `cached_host_ipv6_route`) over a per-worker probe. Saves
    // ~1-3 ms per spawn_one on the pool's hot acquire-from-disk
    // path. Direct-worker invocations (cargo run, debug) still
    // fall back to the local probe so the worker remains
    // self-contained.
    let host_ipv6 = match std::env::var("SUPERMACHINE_HOST_IPV6").as_deref() {
        Ok("1") | Ok("true") => true,
        Ok("0") | Ok("false") => false,
        _ => probe_host_ipv6_route(),
    };
    let guest_ipv6_disabled =
        std::env::var("SUPERMACHINE_GUEST_IPV6").as_deref() == Ok("0");
    if !cmdline.is_empty() && !cmdline.ends_with(' ') {
        cmdline.push(' ');
    }
    // Always advertise host routing; add the disable switch only
    // when the kill-switch is set.
    cmdline.push_str(if host_ipv6 {
        "supermachine.host_ipv6=1"
    } else {
        "supermachine.host_ipv6=0"
    });
    if guest_ipv6_disabled {
        cmdline.push_str(" ipv6.disable=1");
    }
    eprintln!(
        "  host IPv6 routing: {} (guest IPv6: {})",
        if host_ipv6 { "available" } else { "unavailable" },
        if guest_ipv6_disabled {
            "disabled via SUPERMACHINE_GUEST_IPV6=0 (kill-switch)"
        } else {
            "enabled (TSI handles AF_INET6 via AF_TSI6 patches 0009-0014)"
        }
    );

    // With nothing to boot and nothing to restore, run the HVF
    // proof-of-life and stop. `--kcache-restore-from` counts as a
    // restore request here, so a kcache-HIT spawn without `--kernel`
    // is not mistaken for a bare smoke-test invocation.
    if kernel_path.is_none() && restore_from.is_none() && kcache_restore_from.is_none() {
        runner::run_proof_of_life().unwrap_or_else(|e| {
            eprintln!("HVF proof-of-life failed: {e}");
            std::process::exit(1);
        });
        return;
    }

    let mut resources = match (kernel_path, initrd_path) {
        (Some(kernel), Some(initramfs)) => VmResources::for_kernel(kernel, initramfs),
        (Some(kernel), None) => VmResources::new().with_kernel_path(kernel),
        (None, Some(initramfs)) => VmResources::new().with_initramfs(initramfs),
        (None, None) => VmResources::new(),
    }
    .with_cmdline(cmdline)
    .with_memory_mib(memory_mib)
    .with_vcpus(n_vcpus)
    .with_cow_restore(cow_restore)
    .with_quiesce_ms(quiesce_ms)
    .with_tsi_token(tsi_token_bytes);
    if let Some(path) = restore_from {
        resources = resources.with_restore(path);
    }
    // 0.7.43+ kernel-boot cache HIT: like --restore-from but
    // doesn't flip the bake-then-pool flag, so the bake driver's
    // wait-for-BAKE_READY loop continues to work. Applied after
    // --restore-from, so it is the later `with_restore` call when
    // both are given.
    if let Some(path) = kcache_restore_from {
        resources = resources.with_restore(path);
    }
    for path in blk_paths {
        resources = resources.with_block_device(path);
    }
    for volume in volumes {
        resources = resources.with_volume(volume);
    }
    for mount in mounts {
        resources = resources.with_mount(mount);
    }
    // Snapshot triggers: with --snapshot-out the builder methods
    // record trigger + destination together; without it only the
    // trigger field is set.
    if let Some(after_ms) = snapshot_after_ms {
        if let Some(out_path) = snapshot_out.as_deref() {
            resources = resources.with_snapshot_after_ms(after_ms, out_path);
        } else {
            resources.snapshot.after_ms = Some(after_ms);
        }
    }
    if let Some(at_heartbeat) = snapshot_at {
        if let Some(out_path) = snapshot_out.as_deref() {
            resources = resources.with_snapshot_at_heartbeat(at_heartbeat, out_path);
        } else {
            resources.snapshot.at_heartbeat = Some(at_heartbeat);
        }
    }
    if snapshot_on_listener {
        if let Some(out_path) = snapshot_out.as_deref() {
            resources = resources.with_snapshot_on_listener(out_path);
        } else {
            resources.snapshot.on_listener = true;
        }
    }
    if snapshot_on_pre_exec {
        if let Some(out_path) = snapshot_out.as_deref() {
            resources = resources.with_snapshot_on_pre_exec(out_path);
        } else {
            resources.snapshot.on_pre_exec = true;
        }
    }
    // --snapshot-out given without any trigger flag: still record
    // the destination.
    if resources.snapshot.out_path.is_none() {
        resources.snapshot.out_path = snapshot_out;
    }
    resources.balloon_target_pages = balloon_target_pages;
    if let Some(path) = vsock_mux {
        resources = resources.with_vsock_mux(path);
    }
    if let Some(path) = vsock_mux_handoff {
        resources = resources.with_vsock_mux_handoff(path);
    }
    if let Some(path) = vsock_exec {
        resources = resources.with_vsock_exec(path);
    }
    if let Some(port) = vsock_exec_guest_port {
        resources = resources.with_vsock_exec_guest_port(port);
    }
    if let Some(port) = http_port {
        resources = resources.with_http_port(port);
    }
    // An explicit --vcpus always wins over profile defaults.
    if let Some(profile) = profile {
        if !vcpus_explicit {
            resources.apply_profile_defaults(profile);
        }
    }

    runner::run(
        &resources,
        RunOptions {
            tls: tls_cfg,
            pool_sock,
            pool_worker: None,
            experimental_skip_warm_gic_restore: std::env::var_os("SUPERMACHINE_SKIP_WARM_GIC_RESTORE")
                .is_some(),
            bake_then_pool: bake_then_pool_flag,
            // The standalone worker binary always exits after one
            // VM lifecycle. Tell the runner to libc::_exit before
            // dropping MicroVm — see runner.rs `exit_after_run`
            // doc for the full shutdown-race rationale.
            exit_after_run: true,
        },
    )
    .unwrap_or_else(|e| {
        eprintln!("VM run failed: {e}");
        // SAFETY: _exit() terminates the process immediately and
        // never returns.
        unsafe { libc::_exit(2) };
    });
    // Unreachable under `exit_after_run: true` — runner::run
    // libc::_exits before returning. Kept as a defensive fallback
    // in case the flag is ever flipped off.
    //
    // SAFETY: _exit() terminates the process immediately and never
    // returns.
    unsafe { libc::_exit(0) };
}

/// Host-side check for a usable route to the public IPv6 internet.
///
/// A UDP socket is bound to the v6 wildcard address and then
/// `connect()`ed to Cloudflare's anycast resolver,
/// `[2606:4700:4700::1111]:53`. Connecting a UDP socket transmits
/// nothing: it only records the peer and makes the kernel choose a
/// source address. Without an IPv6 default route the kernel fails
/// the call right away with ENETUNREACH (51 on macOS, 101 on
/// Linux); with one, the call succeeds and the socket's local
/// address becomes the chosen v6 source.
///
/// Returns `true` only when bind and connect both succeed and the
/// chosen local address is an IPv6 address that is neither `::`
/// nor `::1`. Every failure along the way yields `false`.
///
/// Why the caller cares: a guest that sees AAAA records tries v6
/// first, and when the host cannot reach that destination, legacy
/// clients (alpine `apk`, busybox `wget`) wedge because they never
/// fall back to v4 after a failed v6 connect. The result is passed
/// to the guest via the kernel cmdline so init-oci can switch guest
/// IPv6 off and have getaddrinfo return v4 only.
fn probe_host_ipv6_route() -> bool {
    use std::net::{IpAddr, Ipv6Addr, SocketAddr, UdpSocket};

    // Cloudflare 2606:4700:4700::1111 is anycast, present on every
    // IPv6 transit.
    const PROBE_ADDR: Ipv6Addr = Ipv6Addr::new(0x2606, 0x4700, 0x4700, 0, 0, 0, 0, 0x1111);
    const PROBE_PORT: u16 = 53;

    // Ephemeral port on the v6 wildcard address.
    let wildcard = SocketAddr::new(IpAddr::V6(Ipv6Addr::UNSPECIFIED), 0);
    let socket = match UdpSocket::bind(wildcard) {
        Ok(socket) => socket,
        Err(_) => return false,
    };

    // UDP `connect` does not block — it only performs the route
    // lookup and pins the peer.
    let peer = SocketAddr::new(IpAddr::V6(PROBE_ADDR), PROBE_PORT);
    if socket.connect(peer).is_err() {
        return false;
    }

    match socket.local_addr() {
        Ok(local) => match local.ip() {
            // `::` and `::1` both mean "no real v6 source", e.g.
            // the kernel picked no v6 interface.
            IpAddr::V6(source) => !(source.is_unspecified() || source.is_loopback()),
            IpAddr::V4(_) => false,
        },
        Err(_) => false,
    }
}

/// Builds the environment payload string, if one was requested.
///
/// * `pairs` — `(name, value)` entries. They are serialised in slice
///   order as `{"env":{"NAME":"VALUE",...},"secrets":{}}`, with both
///   names and values JSON-escaped.
/// * `file` — optional path to a file whose contents are returned
///   verbatim as the payload. When present it takes precedence and
///   `pairs` is not consulted.
///
/// Returns `None` when neither source is supplied, or when `file` is
/// supplied but cannot be read as UTF-8 text. In the latter case a
/// warning is written to stderr so a mistyped path does not silently
/// produce a payload-less run.
fn build_env_payload(pairs: &[(String, String)], file: Option<&str>) -> Option<String> {
    // A payload file overrides inline pairs entirely.
    if let Some(path) = file {
        return match std::fs::read_to_string(path) {
            Ok(payload) => Some(payload),
            Err(e) => {
                // Best-effort: keep returning `None`, but say why.
                eprintln!("warning: cannot read env file {path:?}: {e}");
                None
            }
        };
    }
    if pairs.is_empty() {
        return None;
    }
    let mut out = String::from(r#"{"env":{"#);
    for (i, (k, v)) in pairs.iter().enumerate() {
        if i > 0 {
            out.push(',');
        }
        out.push('"');
        json_escape_into(&mut out, k);
        out.push_str(r#"":"#);
        out.push('"');
        json_escape_into(&mut out, v);
        out.push('"');
    }
    // The secrets object is always emitted empty by this builder.
    out.push_str(r#"},"secrets":{}}"#);
    Some(out)
}

/// Appends `s` to `out`, escaped for use inside a JSON string literal.
///
/// The surrounding double quotes are not written; the caller adds
/// them. Double quote, backslash, newline, carriage return and tab
/// get their two-character escapes; any other character below U+0020
/// is written as a lowercase `\u00xx` escape; everything else is
/// copied through unchanged.
fn json_escape_into(out: &mut String, s: &str) {
    use std::fmt::Write as _;

    for ch in s.chars() {
        // Characters that have a dedicated short escape.
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };

        if let Some(escaped) = short_escape {
            out.push_str(escaped);
        } else if u32::from(ch) < 0x20 {
            // Remaining C0 controls: generic four-hex-digit form.
            // Writing into a `String` cannot fail, so the result is
            // deliberately discarded.
            let _ = write!(out, "\\u{:04x}", u32::from(ch));
        } else {
            out.push(ch);
        }
    }
}

/// Entry point on every platform other than macOS/aarch64.
///
/// There is no VM backend to run here, so this reports the problem on
/// stderr and terminates with a non-zero status. Returning normally
/// would make the process exit 0, leaving scripts and supervisors
/// unable to tell that nothing was launched. Status 2 matches the
/// failure status used by the argument-parsing helpers in this file.
#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
fn main() {
    eprintln!("supermachine only runs on macOS aarch64");
    std::process::exit(2);
}