supermachine 0.7.25

// init-oci.c — pid 1 for supermachine OCI container boots.
//
// The squashfs (mounted at /, kernel boots root=/dev/vda) contains:
//   - the OCI rootfs (extracted from `docker export`)
//   - /init (this binary)
//   - /.supermachine-cmd (newline-separated argv tokens; first line is the
//     program path or program name resolved against PATH)
//   - /.supermachine-workdir (optional: chdir target)
//
// What we do:
//   1. Mount /proc /sys /dev (tmpfs needed for many distros).
//   2. Pull the env JSON from the VMM's AF_VSOCK port 1026 (the
//      --env / --env-file payload). Set each K=V into the
//      environment so the customer's program inherits them.
//   3. Print the heartbeat marker so the VMM's --snapshot-at can fire
//      at a known clean point (post-env, pre-program-exec).
//   4. Read /.supermachine-cmd → argv[].
//   5. Optionally chdir to /.supermachine-workdir.
//   6. Supervise the customer program as a child. By default the
//      child's stdio is /dev/null so request logs do not go through
//      the emulated serial device; set SUPERMACHINE_GUEST_STDIO=console
//      to inherit the PID-1 console for debugging.
//
// On failure, prints diagnostics + sleeps forever (so the kernel
// keeps the VM alive long enough for the operator to inspect).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <pwd.h>
#include <grp.h>
#include <ctype.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/syscall.h>     /* finit_module via syscall() */
#include <sys/ioctl.h>
#include <linux/vm_sockets.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <linux/if_addr.h>
#include <net/if.h>
#include <netinet/in.h>

// Embedded minimal x86_64 Linux static ELF that just calls SYS_exit(0).
// Built via:
//   zig cc --target=x86_64-linux-musl -nostdlib -static -s \
//     -Wl,--build-id=none -o exit0 exit0.S
// where exit0.S is `mov $60,%eax; xor %edi,%edi; syscall`.
// Used by setup_rosetta_in_vm() as a warm-up: we write it to /tmp,
// exec it via /run/rosetta/rosetta, and wait for it to complete
// BEFORE signaling bake-ready. That makes rosettad's first-connect
// lazy init happen during the bake — the snapshot captures rosettad
// in its post-init state, and post-restore execs hit a warm rosettad
// instead of racing the lazy init. Without this, the first amd64
// exec after VM acquire would sometimes trigger `ld.so` to panic
// during glibc-dynamic startup with `GL(dl_rtld_map).l_libname`
// assertion (the lazy init produces a broken translation for the
// VERY first call).
#include "warmup_amd64.h"

// Forward declaration of the supervisor entry point (step 6 in the
// header comment: run the customer program as a child).
static int supervise(char **argv);

// vsock CID the VMM is reached at when pulling the env payload.
#define HOST_CID 2
// AF_VSOCK port the VMM serves the --env / --env-file JSON on
// (step 2 in the header comment).
#define ENV_PORT 1026
/// Native AF_VSOCK port the in-guest exec agent listens on. Must
/// match `vmm::resources::DEFAULT_EXEC_GUEST_PORT` on the host
/// side. See `docs/design/exec-2026-05-03.md`. Currently a STUB
/// echo loop — real Rust agent crate replaces this in step 2.
#define EXEC_PORT 1028
// Newline-separated argv tokens for the customer program.
#define CMD_FILE "/.supermachine-cmd"
// Optional chdir target applied before the program starts.
#define WD_FILE  "/.supermachine-workdir"
// NOTE(review): presumably the user to run the program as and the
// guest hostname; their consumers are not visible here — confirm.
#define USER_FILE "/.supermachine-user"
#define HOSTNAME_FILE "/.supermachine-hostname"
// NOTE(review): presumably the argv token limit and the size of the
// buffer CMD_FILE is read into; verify against the parser.
#define MAX_ARGS 64
#define ARG_BUF 65536

/// Report a fatal init error and park.
///
/// Prints `msg` together with the text for the current `errno` on
/// stderr, then sleeps for a day before exiting with status 1 so the
/// VM stays around long enough to be inspected.
static void die(const char *msg) {
    const char *reason = strerror(errno);

    fprintf(stderr, "init-oci: %s: %s\n", msg, reason);
    sleep(86400);  // hold the VM open for the operator
    exit(1);
}

/// Bring the loopback interface (`lo`) up. Required for any
/// in-guest TCP/UDP traffic to 127.0.0.1 / ::1. A standard distro
/// gets this from systemd-networkd / NetworkManager / sysvinit;
/// we're minimal init so we do it ourselves with a single
/// SIOCSIFFLAGS ioctl.
///
/// Why it matters: with `tsi_hijack` enabled, all guest AF_INET
/// sockets get converted to AF_TSI. The TSI driver has two
/// internal sockets — an isocket (real AF_INET) and a vsocket.
/// On `bind()` it binds the isocket to the user's requested
/// address; on `connect()` it first tries the isocket
/// (in-kernel path, fast), falling back to vsocket only on
/// failure. The isocket can only complete a loopback connect if
/// `lo` is UP. Without this, in-guest `127.0.0.1` connects hit
/// `ENETUNREACH` before the TSI fallback can even try the
/// host-proxy path.
///
/// Best-effort: failure here doesn't abort init. If the kernel
/// is missing CONFIG_INET (extremely unlikely for our build) the
/// ioctl will fail; userspace will see the same ENETUNREACH it
/// always did, but at least non-network workloads still run.
static void bring_up_loopback(void) {
    int s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s < 0) {
        fprintf(stderr, "init-oci: loopback: socket: %s\n", strerror(errno));
        return;
    }
    struct ifreq ifr;
    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
    if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
        fprintf(stderr, "init-oci: loopback: SIOCGIFFLAGS: %s\n", strerror(errno));
        close(s);
        return;
    }
    ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
    if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
        fprintf(stderr, "init-oci: loopback: SIOCSIFFLAGS: %s\n", strerror(errno));
        close(s);
        return;
    }
    close(s);
    fprintf(stderr, "init-oci: loopback up\n");
}

/// Open up ICMP echo socket (`SOCK_DGRAM`+`IPPROTO_ICMP`) to every
/// guest UID by widening `net.ipv4.ping_group_range`. Linux ships a
/// default `1 0` (== "no GID may open an icmp socket"), which trips
/// up cloudflared's ICMP-proxy feature and any `ping(8)` not setuid-
/// root. Setting `0 4294967295` opens it to every GID — there's no
/// real attack surface inside an ephemeral microVM, and cloudflared
/// in particular logs a noisy WRN about it. Cheap to fix.
static void open_ping_group_range(void) {
    // "<min gid> <max gid>": the full 32-bit range admits every group.
    static const char val[] = "0 4294967295\n";

    int fd = open("/proc/sys/net/ipv4/ping_group_range",
                  O_WRONLY | O_CLOEXEC);
    if (fd >= 0) {
        // Best-effort: a failed write just leaves the kernel default.
        (void)write(fd, val, sizeof(val) - 1);
        close(fd);
    }
}

/// Disable IPv6 in the guest when the host can't route it. Without
/// this, musl + glibc + libfetch (apk) return AAAA records to
/// callers, those callers try v6 first, the v6 connect fails (no
/// route on host), and the legacy clients (alpine apk, busybox
/// wget — both lack v4-fallback-after-v6-failure) give up entirely
/// with "TLS: unspecified error" / "Address family not supported".
/// The supermachine worker probes host IPv6 reachability at boot
/// and signals via the `supermachine.host_ipv6=0|1` kernel cmdline
/// arg. We honor `=0` by writing `disable_ipv6=1` to every sysctl
/// knob that matters (all, default, lo). When `=1`, we leave IPv6
/// enabled — production hosts on dual-stack networks keep their
/// native v6 outbound path working through the muxer's dual-stack
/// UDP socket.
static void apply_ipv6_policy(void) {
    static const char key[] = "supermachine.host_ipv6=";
    // Written in this order: `all`, then `default` (template for new
    // interfaces), then the already-existing `lo`.
    static const char *const knobs[] = {
        "/proc/sys/net/ipv6/conf/all/disable_ipv6",
        "/proc/sys/net/ipv6/conf/default/disable_ipv6",
        "/proc/sys/net/ipv6/conf/lo/disable_ipv6",
    };
    char cmdline[4096];

    int cfd = open("/proc/cmdline", O_RDONLY | O_CLOEXEC);
    if (cfd < 0) return;
    ssize_t len = read(cfd, cmdline, sizeof(cmdline) - 1);
    close(cfd);
    if (len <= 0) return;
    cmdline[len] = 0;

    // Only an explicit `=0` disables IPv6; a missing key or any other
    // value leaves the guest's IPv6 configuration untouched.
    const char *hit = strstr(cmdline, key);
    if (hit == NULL || hit[sizeof(key) - 1] != '0') return;

    for (size_t k = 0; k < sizeof(knobs) / sizeof(knobs[0]); k++) {
        int kfd = open(knobs[k], O_WRONLY | O_CLOEXEC);
        if (kfd >= 0) {
            (void)write(kfd, "1\n", 2);
            close(kfd);
        }
    }
    fprintf(stderr, "init-oci: IPv6 disabled (host has no v6 route)\n");
}

/// Create a non-loopback network interface so glibc's
/// `getaddrinfo(AI_ADDRCONFIG)` sees a route family for IPv4. Without
/// this, the guest has only `lo` and glibc filters out *all* AF_INET
/// answers — `getent ahostsv4 example.com` returns nothing,
/// `getaddrinfo(host, NULL, &{ai_family=AF_INET,ai_flags=AI_ADDRCONFIG}, ...)`
/// returns `EAI_NONAME`, and any app that uses AI_ADDRCONFIG (apt,
/// gnutls, curl with `--resolve` against a synthetic name, libuv's
/// `uv_getaddrinfo` default in some configurations, plenty of
/// enterprise tooling) sees ENOTFOUND for every DNS name regardless
/// of length.
///
/// We don't need real connectivity — the TSI patches route all socket
/// traffic via vsock — we just need a UP, non-IFF_LOOPBACK interface
/// with an assigned IPv4 address that glibc's `getifaddrs()` walk
/// will pick up. We use a `veth` pair (CONFIG_VETH=y in our kernel
/// fragment) because `dummy` requires CONFIG_DUMMY which we don't
/// build. Both veth endpoints stay in the same netns; we only
/// configure one side. The peer stays DOWN with no address; harmless.
///
/// Address: `198.18.0.1/30` from the IANA-reserved benchmarking range
/// (RFC 2544 / 5735). Won't collide with anything real, including
/// docker bridges or k8s pod CIDRs. The whole iface exists only to
/// satisfy AI_ADDRCONFIG's probe; no packet ever leaves it.
///
/// All errors are non-fatal. If netlink setup fails the worst case
/// is the AI_ADDRCONFIG bug stays as it was — every other code path
/// (TSI routing, `lo`, IPv6) is unaffected.
//
// (The comment above describes create_fake_iface(), defined further
// down; the netlink helpers it relies on come first.)
//
/// Send one netlink request on socket `s` and wait for a single reply.
///
/// Returns 0 when the reply is an ACK (NLMSG_ERROR carrying error 0)
/// or any message type other than NLMSG_ERROR. Returns -1 with `errno`
/// set when send/recv fails, when the reply is too short to hold a
/// netlink header (or an error code) — reported as EPROTO — or when
/// the kernel answered with a non-zero error, in which case `errno`
/// is that error.
static int nl_send_recv(int s, struct nlmsghdr *nlh) {
    // The union gives the receive buffer the alignment of a netlink
    // header, so reading header fields out of it is well-defined.
    union {
        struct nlmsghdr hdr;
        char bytes[1024];
    } rbuf;

    if (send(s, nlh, nlh->nlmsg_len, 0) < 0) {
        return -1;
    }
    ssize_t r = recv(s, rbuf.bytes, sizeof(rbuf.bytes), 0);
    if (r < 0) {
        return -1;
    }
    // Never look at header fields the kernel did not actually send.
    if ((size_t)r < sizeof(struct nlmsghdr)) {
        errno = EPROTO;
        return -1;
    }
    if (rbuf.hdr.nlmsg_type == NLMSG_ERROR) {
        const struct nlmsgerr *err = NLMSG_DATA(&rbuf.hdr);
        if ((size_t)r < (size_t)NLMSG_HDRLEN + sizeof(err->error)) {
            errno = EPROTO;
            return -1;
        }
        if (err->error != 0) {
            // The kernel reports errors as negated errno values.
            errno = -err->error;
            return -1;
        }
    }
    return 0;
}

/// Append one rtattr (`type`, payload `data`/`dlen`) to the netlink
/// message `nlh`, which lives in a buffer of `maxlen` bytes.
///
/// The attribute is placed at the aligned end of the message and
/// `nlmsg_len` is advanced past it, padding included. If the attribute
/// would not fit in `maxlen` bytes, or its length cannot be expressed
/// in the 16-bit `rta_len` field, nothing is written, `nlmsg_len` is
/// left unchanged, and a diagnostic goes to stderr.
static void nl_add_attr(struct nlmsghdr *nlh, size_t maxlen,
                        int type, const void *data, size_t dlen) {
    size_t off = NLMSG_ALIGN(nlh->nlmsg_len);

    // rta_len is an unsigned short; refuse payloads it cannot describe.
    if (dlen > 0xFFFFu - RTA_LENGTH(0)) {
        fprintf(stderr, "init-oci: netlink: attr %d too large (%zu bytes); dropped\n",
                type, dlen);
        return;
    }
    size_t add = RTA_LENGTH(dlen);

    // Header, payload and trailing padding must all sit inside the buffer.
    if (off > maxlen || RTA_ALIGN(add) > maxlen - off) {
        fprintf(stderr, "init-oci: netlink: attr %d (%zu bytes) does not fit in "
                "%zu-byte message buffer; dropped\n", type, dlen, maxlen);
        return;
    }

    struct rtattr *rta = (struct rtattr *)((char *)nlh + off);
    rta->rta_type = (unsigned short)type;
    rta->rta_len = (unsigned short)add;
    if (dlen > 0) {
        memcpy(RTA_DATA(rta), data, dlen);
    }
    nlh->nlmsg_len = (unsigned int)(off + RTA_ALIGN(add));
}

/// Open a nested attribute of the given `type` at the aligned end of
/// `nlh` and return a pointer to its header. The header is written
/// with an empty payload length; nl_nest_end() patches in the final
/// length once the children have been appended.
static struct rtattr *nl_nest_start(struct nlmsghdr *nlh, int type) {
    unsigned int start = NLMSG_ALIGN(nlh->nlmsg_len);
    struct rtattr *nest = (struct rtattr *)((char *)nlh + start);

    nest->rta_len = RTA_LENGTH(0);
    nest->rta_type = type;
    nlh->nlmsg_len = start + RTA_ALIGN(nest->rta_len);
    return nest;
}

/// Close a nested attribute opened with nl_nest_start(): its length
/// becomes the distance from the nest header to the current end of
/// the message, so it covers everything appended since it was opened.
static void nl_nest_end(struct nlmsghdr *nlh, struct rtattr *nest) {
    size_t nest_off = (size_t)((char *)nest - (char *)nlh);

    nest->rta_len = (unsigned short)(nlh->nlmsg_len - nest_off);
}

/// Create the `smnet0`/`smnet0p` veth pair, assign 198.18.0.1/30 to
/// `smnet0` and set it UP (rationale: see the long comment above the
/// netlink helpers). Three rtnetlink requests are issued in order:
/// RTM_NEWLINK (create), RTM_NEWADDR (address), RTM_NEWLINK (flags).
///
/// Every failure is logged and non-fatal. Failing to create the pair
/// or to look up its index aborts the remaining steps; an address or
/// link-up failure lets the remaining steps run, but the final status
/// line then reports a partial configuration instead of success.
static void create_fake_iface(void) {
    int s = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
    if (s < 0) {
        fprintf(stderr, "init-oci: fake-iface: netlink socket: %s\n", strerror(errno));
        return;
    }
    struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
    if (bind(s, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        fprintf(stderr, "init-oci: fake-iface: netlink bind: %s\n", strerror(errno));
        close(s);
        return;
    }

    const char *ifname = "smnet0";
    const char *peername = "smnet0p";
    int addr_ok = 1;
    int link_ok = 1;

    // 1) RTM_NEWLINK to create the veth pair.
    {
        char buf[1024] = {0};
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
        nlh->nlmsg_type = RTM_NEWLINK;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL;
        nlh->nlmsg_seq = 1;
        struct ifinfomsg *ifi = NLMSG_DATA(nlh);
        ifi->ifi_family = AF_UNSPEC;
        nl_add_attr(nlh, sizeof(buf), IFLA_IFNAME, ifname, strlen(ifname) + 1);
        // Nesting: IFLA_LINKINFO { IFLA_INFO_KIND, IFLA_INFO_DATA { VETH_INFO_PEER { ... } } }
        struct rtattr *linkinfo = nl_nest_start(nlh, IFLA_LINKINFO);
        nl_add_attr(nlh, sizeof(buf), IFLA_INFO_KIND, "veth", 4);
        struct rtattr *infodata = nl_nest_start(nlh, IFLA_INFO_DATA);
        // VETH_INFO_PEER = 1: nested ifinfomsg + IFLA_IFNAME for the peer.
        struct rtattr *peer = nl_nest_start(nlh, 1 /* VETH_INFO_PEER */);
        // ifinfomsg for the peer (the kernel ignores fields but needs the struct).
        // It is placed by hand because it is a bare struct, not an rtattr.
        struct ifinfomsg *peer_ifi =
            (struct ifinfomsg *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));
        memset(peer_ifi, 0, sizeof(*peer_ifi));
        peer_ifi->ifi_family = AF_UNSPEC;
        nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
                         NLMSG_ALIGN(sizeof(struct ifinfomsg));
        nl_add_attr(nlh, sizeof(buf), IFLA_IFNAME, peername, strlen(peername) + 1);
        // Close innermost first so each outer length covers its children.
        nl_nest_end(nlh, peer);
        nl_nest_end(nlh, infodata);
        nl_nest_end(nlh, linkinfo);
        if (nl_send_recv(s, nlh) < 0) {
            fprintf(stderr, "init-oci: fake-iface: create veth: %s\n", strerror(errno));
            close(s);
            return;
        }
    }

    // 2) RTM_NEWADDR — assign 198.18.0.1/30 to smnet0.
    unsigned int ifindex = if_nametoindex(ifname);
    if (ifindex == 0) {
        fprintf(stderr, "init-oci: fake-iface: if_nametoindex(%s): %s\n",
                ifname, strerror(errno));
        close(s);
        return;
    }
    {
        char buf[256] = {0};
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
        nlh->nlmsg_type = RTM_NEWADDR;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_REPLACE;
        nlh->nlmsg_seq = 2;
        struct ifaddrmsg *ifa = NLMSG_DATA(nlh);
        ifa->ifa_family = AF_INET;
        ifa->ifa_prefixlen = 30;
        ifa->ifa_flags = 0;
        ifa->ifa_scope = RT_SCOPE_LINK;
        ifa->ifa_index = ifindex;
        // 198.18.0.1 — RFC 2544 benchmarking range.
        unsigned char addr[4] = { 198, 18, 0, 1 };
        nl_add_attr(nlh, sizeof(buf), IFA_LOCAL, addr, 4);
        nl_add_attr(nlh, sizeof(buf), IFA_ADDRESS, addr, 4);
        if (nl_send_recv(s, nlh) < 0) {
            fprintf(stderr, "init-oci: fake-iface: add addr: %s\n", strerror(errno));
            // Continue — UP without an address still wouldn't satisfy AI_ADDRCONFIG,
            // but partial state is at least visible for debugging.
            addr_ok = 0;
        }
    }

    // 3) RTM_NEWLINK — set smnet0 UP.
    {
        char buf[128] = {0};
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
        nlh->nlmsg_type = RTM_NEWLINK;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        nlh->nlmsg_seq = 3;
        struct ifinfomsg *ifi = NLMSG_DATA(nlh);
        ifi->ifi_family = AF_UNSPEC;
        ifi->ifi_index = (int)ifindex;
        // ifi_change selects which flag bits ifi_flags is allowed to modify.
        ifi->ifi_flags = IFF_UP;
        ifi->ifi_change = IFF_UP;
        if (nl_send_recv(s, nlh) < 0) {
            fprintf(stderr, "init-oci: fake-iface: link up: %s\n", strerror(errno));
            link_ok = 0;
        }
    }

    close(s);
    // Only claim success when both the address and the UP flag took.
    if (addr_ok && link_ok) {
        fprintf(stderr, "init-oci: fake iface %s (198.18.0.1/30) up — AI_ADDRCONFIG ok\n", ifname);
    } else {
        fprintf(stderr, "init-oci: fake iface %s only partially configured — "
                "AI_ADDRCONFIG may still filter IPv4\n", ifname);
    }
}

/// Mount the kernel pseudo-filesystems (/proc, /sys, /dev, /dev/pts,
/// cgroup2) and the scratch tmpfs instances (/dev/shm, /tmp, /run),
/// then pre-create a few runtime directories that common workloads
/// probe for. Every step is best-effort: failures are logged to
/// stderr and the remaining steps still run.
static void mount_pseudofs(void) {
    mkdir("/proc", 0755);
    mkdir("/sys", 0755);
    mkdir("/dev", 0755);
    // The kernel may already have auto-mounted /proc /sys /dev
    // because of CONFIG_PROC_FS=y / CONFIG_SYSFS=y /
    // CONFIG_DEVTMPFS_MOUNT=y. EBUSY here is harmless and noisy
    // — suppress in the common case (already mounted on the
    // expected fs); fall through and report only the genuinely
    // unexpected errors (path doesn't exist, fs unsupported,
    // etc.) that warrant attention.
    if (mount("proc", "/proc", "proc", 0, NULL) < 0 && errno != EBUSY)
        fprintf(stderr, "init-oci: mount proc: %s\n", strerror(errno));
    if (mount("sysfs", "/sys", "sysfs", 0, NULL) < 0 && errno != EBUSY)
        fprintf(stderr, "init-oci: mount sysfs: %s\n", strerror(errno));
    if (mount("devtmpfs", "/dev", "devtmpfs", 0, NULL) < 0 && errno != EBUSY)
        fprintf(stderr, "init-oci: mount devtmpfs: %s\n", strerror(errno));
    // devpts so AF_VSOCK exec agent's openpty() finds /dev/pts/N.
    // Many OCI images expect this anyway for `script`, `gdb`, etc.
    mkdir("/dev/pts", 0755);
    if (mount("devpts", "/dev/pts", "devpts", 0,
              "newinstance,ptmxmode=0666,mode=0620,gid=5") < 0)
        fprintf(stderr, "init-oci: mount devpts: %s\n", strerror(errno));
    // /dev/shm: a dedicated tmpfs capped at 256 MiB. Chrome /
    // Chromium (and any other workload that uses POSIX
    // shared-memory at scale — postgres, redis, JVM hotspot's
    // perfdata) exhaust a 64 MiB ceiling fast. The size option is
    // a maximum, not a commit, so an idle /dev/shm costs nothing.
    mkdir("/dev/shm", 01777);
    if (mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=256m,mode=1777") < 0)
        fprintf(stderr, "init-oci: mount /dev/shm: %s\n", strerror(errno));
    // tmpfs at /tmp + /run (many programs assume these exist + are writable).
    // Failures are reported like every other mount in this function:
    // without these, later mkdirs under /tmp and /run land on the rootfs.
    mkdir("/tmp", 01777); chmod("/tmp", 01777);
    mkdir("/run", 0755);
    if (mount("tmpfs", "/tmp", "tmpfs", 0, NULL) < 0)
        fprintf(stderr, "init-oci: mount /tmp: %s\n", strerror(errno));
    if (mount("tmpfs", "/run", "tmpfs", 0, NULL) < 0)
        fprintf(stderr, "init-oci: mount /run: %s\n", strerror(errno));

    // cgroup v2 unified hierarchy. Chromium reads cgroup state for
    // its renderer/utility process accounting; without /sys/fs/cgroup
    // mounted, Chrome's process bringup logs errors and some
    // subprocesses hang on cgroup setup syscalls. Modern systemd-
    // based distros mount this by default; we're an init-less
    // squashfs, so we do it ourselves.
    //
    // cgroup2 (not v1) because:
    //   - Kernel 6.x default, simpler hierarchy
    //   - No controller-pinning across cgroup-v1's split hierarchies
    //   - Matches what `docker run --cgroup-parent` expects on modern
    //     Docker installs
    mkdir("/sys/fs/cgroup", 0755);
    if (mount("cgroup2", "/sys/fs/cgroup", "cgroup2", 0, NULL) < 0 && errno != EBUSY)
        fprintf(stderr, "init-oci: mount cgroup2: %s\n", strerror(errno));

    // /tmp/.X11-unix: Xvfb (and any X server) creates `:N` sockets
    // here. Default-mode 1777 (sticky, world-writable) matches
    // standard Linux distros' /tmp/.X11-unix permissions —
    // multiple uids can each own their own X socket, but only the
    // owner can unlink. Without this dir, `xvfb-run` fails fast
    // with "Missing X server or $DISPLAY". The explicit chmod makes
    // the mode independent of the process umask.
    mkdir("/tmp/.X11-unix", 01777);
    chmod("/tmp/.X11-unix", 01777);

    // /run/dbus: many GTK/Qt apps and Chrome probe for the system
    // dbus socket here. We don't run a dbus daemon (would add a
    // ~5 MB rss baseline cost for a service most workloads never
    // touch), but creating the directory means Chrome's probe
    // fails fast with ENOENT on the socket instead of getting
    // stuck retrying directory lookups.
    // NOTE(review): only /run/dbus is created here. The older
    // /var/run/dbus path is not created by this function — confirm
    // whether images are expected to symlink /var/run to /run.
    mkdir("/run/dbus", 0755);
}

/// Create `path` with contents `body` and permissions `mode`, but only
/// when stat() says nothing exists at `path` yet. An existing file is
/// left untouched. Best-effort: open/write failures are ignored.
static void write_file_if_missing(const char *path, const char *body, mode_t mode) {
    struct stat probe;

    if (stat(path, &probe) != 0) {
        int out = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
        if (out >= 0) {
            size_t len = strlen(body);
            (void)write(out, body, len);
            close(out);
        }
    }
}

/// Like `write_file_if_missing`, but also treats files that are empty
/// or lack a given marker substring as missing. Use when the file may
/// exist but be incomplete — e.g. an OCI rootfs that ships an empty
/// `/etc/hosts` (some scratch-style images do) or one that has custom
/// app-specific entries without the canonical `localhost` lines.
/// Without this, `getaddrinfo("localhost")` returns NXDOMAIN inside
/// the VM, which breaks cloudflared / sidecar patterns that bind on
/// the literal name "localhost".
static void write_file_if_lacks(const char *path, const char *marker,
                                const char *body, mode_t mode) {
    struct stat st;
    int has_marker = 0;
    if (stat(path, &st) == 0 && st.st_size > 0) {
        int fd = open(path, O_RDONLY | O_CLOEXEC);
        if (fd >= 0) {
            // Scan the WHOLE file, not just its first block: a long
            // file whose marker sits past the first 4 KiB must not be
            // mistaken for one that lacks it (it would be truncated
            // and replaced below).
            char buf[4096];
            size_t mlen = strlen(marker);
            size_t keep = 0;  // bytes carried over from the previous chunk
            for (;;) {
                ssize_t n = read(fd, buf + keep, sizeof(buf) - 1 - keep);
                if (n < 0 && errno == EINTR) continue;
                if (n <= 0) break;
                size_t len = keep + (size_t)n;
                buf[len] = '\0';
                if (strstr(buf, marker) != NULL) {
                    has_marker = 1;
                    break;
                }
                // Carry the last (mlen - 1) bytes forward so a marker
                // that straddles two reads is still found.
                keep = (mlen > 0) ? mlen - 1 : 0;
                if (keep > len) keep = len;
                memmove(buf, buf + len - keep, keep);
            }
            close(fd);
        }
        if (has_marker) return;
    }
    // Missing, empty, unreadable, or marker-less: (re)write the file.
    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
    if (fd < 0) return;
    (void)write(fd, body, strlen(body));
    close(fd);
}

/// Unconditionally (re)create `path` with contents `body`: the file is
/// created with permissions `mode` if absent and truncated if present.
/// Best-effort: open/write failures are ignored.
static void write_file_replace(const char *path, const char *body, mode_t mode) {
    int out = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);

    if (out >= 0) {
        size_t len = strlen(body);
        (void)write(out, body, len);
        close(out);
    }
}

// ---- Rosetta-in-VM bootstrap ----------------------------------------------
//
// If the host bake mounted a virtio-fs share tagged `rosetta`
// (it does this when the embedder builds with `rosetta: true`), this
// function:
//   1. Creates /run/rosetta as the mount point
//   2. mount -t virtiofs rosetta /run/rosetta
//   3. Mounts binfmt_misc filesystem (one-shot)
//   4. Registers the x86_64 ELF magic → /run/rosetta/rosetta interpreter
//   5. Starts rosettad and runs a one-shot amd64 warm-up exec
//
// All steps are best-effort. If any fails, we log to stderr and continue —
// the workload may still be a pure arm64 binary that doesn't need Rosetta,
// in which case missing Rosetta-setup is a no-op rather than a boot
// failure.
//
// The binfmt_misc magic line matches what Apple's container CLI / VZ
// install for amd64. Format:
//
//     :amd64:M::<magic>:<mask>:<interpreter>:<flags>
//
// magic (20 bytes from byte 0 of an x86_64 ELF):
//   0x7f 'E' 'L' 'F'        — ELF magic
//   0x02                    — EI_CLASS = ELFCLASS64
//   0x01                    — EI_DATA = ELFDATA2LSB
//   0x01                    — EI_VERSION = 1
//   0x00 x 9                — EI_OSABI through EI_PAD (zero-padded)
//   0x02 0x00               — e_type = ET_EXEC=2 (little-endian u16)
//   0x3e 0x00               — e_machine = EM_X86_64=62 (little-endian u16)
// mask (20 bytes): every byte must match exactly, except that the low
// bit of EI_DATA, EI_VERSION and e_type's low byte is ignored (so both
// ET_EXEC=2 and ET_DYN=3 match) and EI_OSABI is ignored entirely.
// flags: OCF
//   O — open-binary (interpreter gets fd, not path)
//   C — credentials preserved (setuid bits respected)
//   F — fix-binary (interpreter path resolved at registration time, not exec)
static void setup_rosetta_in_vm(void) {
    // If the share isn't mounted by host, skip silently.
    struct stat st;
    if (stat("/run", &st) != 0) {
        mkdir("/run", 0755);
    }
    mkdir("/run/rosetta", 0755);
    // Tag is `"rosetta"` — matches Apple's `containerization`
    // convention (see Vminitd+Rosetta.swift) so the same Image bake
    // works on supermachine HVF and Apple's VZ-backed container.
    int mret = mount("rosetta", "/run/rosetta", "virtiofs", 0, NULL);
    if (mret != 0) {
        // No host-side rosetta share — nothing to bootstrap.
        // Don't even log this on every boot; it's the common case for
        // arm64-only workloads.
        rmdir("/run/rosetta");
        return;
    }
    // Confirm the translator binary exists. If the share is mounted but
    // the binary isn't there, that's a host-side misconfiguration; log
    // and bail.
    if (stat("/run/rosetta/rosetta", &st) != 0) {
        fprintf(stderr, "init-oci: rosetta-runtime mount present but "
            "/run/rosetta/rosetta missing; skipping Rosetta setup\n");
        return;
    }
    // Mount binfmt_misc if the kernel supports it. Without
    // CONFIG_BINFMT_MISC the mount() call returns ENODEV — that's a
    // kernel rebuild requirement, not a runtime failure.
    int bret = mount("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc",
                     0, NULL);
    if (bret != 0 && errno != EBUSY) {
        // EBUSY = already mounted (idempotent rerun); anything else is fatal
        // for the binfmt path.
        fprintf(stderr, "init-oci: mount binfmt_misc failed (errno=%d). "
            "Kernel missing CONFIG_BINFMT_MISC?\n", errno);
        return;
    }
    // Register the amd64 ELF binfmt entry.
    static const char binfmt_line[] =
        ":amd64:M::"
        "\\x7fELF\\x02\\x01\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00"
        "\\x02\\x00\\x3e\\x00"
        ":"
        "\\xff\\xff\\xff\\xff\\xff\\xfe\\xfe\\x00\\xff\\xff\\xff\\xff\\xff\\xff"
        "\\xff\\xff\\xfe\\xff\\xff\\xff"
        ":/run/rosetta/rosetta:OCF\n";
    int fd = open("/proc/sys/fs/binfmt_misc/register",
                  O_WRONLY | O_CLOEXEC);
    if (fd < 0) {
        fprintf(stderr, "init-oci: open binfmt_misc/register failed: "
            "errno=%d\n", errno);
        return;
    }
    ssize_t wret = write(fd, binfmt_line, sizeof binfmt_line - 1);
    // Capture errno before close() gets a chance to overwrite it.
    int werr = errno;
    close(fd);
    if (wret < 0 && werr != EEXIST) {
        fprintf(stderr, "init-oci: register amd64 binfmt failed: "
            "errno=%d\n", werr);
        return;
    }
    // EEXIST (17) is the common benign case: the entry is already
    // registered from a prior bake. Fall through without logging to
    // ensure HOME / cache / daemon are also (idempotently) wired
    // for restored guests, where the binfmt entry survives in
    // /proc/sys/fs/binfmt_misc but our setenv / mount / fork
    // state does not.

    // Set HOME=/root so the rosetta interpreter (invoked transparently
    // via binfmt) sees a non-empty HOME. The interpreter parses
    // environ for a `HOME=` prefix and stores the value at an internal
    // offset (see TranslationCacheAot.cpp). When HOME is empty the
    // cache filename resolves to `/.cache/rosetta/<id>.flu` (filesystem
    // root) and the open silently fails — we'd be back to JIT-only.
    // `0` for the overwrite flag respects any HOME the image / user
    // explicitly set via fetch_and_set_env.
    setenv("HOME", "/root", 0);
    // Mount the host-persistent AOT cache share at /var/cache/rosettad
    // if the host bake passed one. Tag `rosettad-cache` is auto-added
    // by api.rs whenever platform=linux/amd64; integrators don't see
    // it. Without this share rosettad still works but caches only to
    // the in-guest rootfs (lost on VM teardown).
    mkdir("/var/cache", 0755);
    mkdir("/var/cache/rosettad", 0755);
    int cret = mount("rosettad-cache", "/var/cache/rosettad",
                     "virtiofs", 0, NULL);
    if (cret != 0 && errno != EBUSY) {
        // No share — caches stay in-guest. Non-fatal.
        fprintf(stderr, "init-oci: rosettad-cache mount unavailable "
            "(errno=%d); AOT cache won't persist across VM restarts\n",
            errno);
    }
    // Pre-create /run/rosettad so rosettad can bind its filesystem
    // socket there. Apple's documented default cache configuration
    // (VZLinuxRosettaUnixSocketCachingOptions.init() in the public
    // Virtualization.framework headers) uses path
    // `/run/rosettad/rosetta.sock` — which requires the parent
    // directory to exist before bind(2). VZ pre-creates this in its
    // managed VMs; we do the same here. Without it, rosettad's bind
    // silently fails and the interpreter's connect() fails — for
    // musl-static amd64 the interpreter falls through to a path that
    // still translates (you just lose the cache), but for glibc-
    // dynamic amd64 the fallback produces broken codegen that crashes
    // `ld.so` at `_dl_start` (`l_libname` assertion). The mkdir
    // is the single line that unblocks glibc-amd64 with AOT enabled.
    mkdir("/run/rosettad", 0755);
    // Start rosettad as a long-lived background daemon. The socket
    // path it binds is determined by the cache_settings ioctl response
    // our FUSE backend serves (see crates/supermachine/src/fuse/posix.rs);
    // by default that's `/run/rosettad/rosetta.sock` (filesystem
    // socket, matching Apple's VZ default). The cache DIRECTORY arg
    // below (`/var/cache/rosettad`) is unrelated to the socket — it's
    // where rosettad writes content-addressed `.aotcache` files.
    // Init reaps SIGCHLD (see reap_zombies) so a daemon crash doesn't
    // leak a zombie.
    pid_t dpid = fork();
    if (dpid == 0) {
        // Child: replace with rosettad. argv[2] is the cache root —
        // rosettad defaults to $HOME/.cache/rosettad when called with
        // no path, but we want the host-mounted share, so be explicit.
        char *argv[] = {
            (char *)"/run/rosetta/rosettad",
            (char *)"daemon",
            (char *)"/var/cache/rosettad",
            NULL,
        };
        execv("/run/rosetta/rosettad", argv);
        // execv only returns on error; log to PID 1's stderr.
        fprintf(stderr, "init-oci: execv rosettad: %s\n", strerror(errno));
        _exit(127);
    } else if (dpid < 0) {
        fprintf(stderr, "init-oci: fork rosettad: %s\n", strerror(errno));
    } else {
        fprintf(stderr, "init-oci: rosettad spawned pid=%d "
            "(cache=/var/cache/rosettad)\n", dpid);
        // Wait briefly for rosettad to reach accept(). Without this
        // the warm-up exec below can race with rosettad's bind+listen
        // and connect-refuse, defeating the warm-up.
        struct timespec brief = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
        nanosleep(&brief, NULL);
    }
    // Warm up rosetta's first-connect lazy init path by exec'ing a
    // tiny embedded amd64 ELF through the binfmt/rosetta pipeline.
    // After this exec returns, rosettad has done whatever per-process
    // lazy init it does on first connection; subsequent execs see a
    // warm path. Crucially, the snapshot taken by the bake AFTER this
    // captures rosettad in post-init state — so restored VMs are also
    // warm, no race window.
    //
    // Without the warm-up, the first amd64 exec in a freshly-acquired
    // VM sometimes panics in glibc's `_dl_start` with
    // `GL(dl_rtld_map).l_libname` assertion (the lazy init path
    // produces a buggy translation for the very first call).
    {
        const char *warmup_path = "/tmp/.supermachine-rosetta-warmup";
        int wfd = open(warmup_path, O_WRONLY | O_CREAT | O_TRUNC, 0755);
        if (wfd >= 0) {
            ssize_t wn = write(wfd, warmup_amd64, warmup_amd64_len);
            close(wfd);
            if (wn != (ssize_t)warmup_amd64_len) {
                fprintf(stderr, "init-oci: warmup binary write short "
                    "(%zd/%u)\n", wn, warmup_amd64_len);
            } else if (chmod(warmup_path, 0755) != 0) {
                // chmod after open(O_CREAT, 0755) is paranoia (umask
                // could strip bits); failure is non-fatal.
                fprintf(stderr, "init-oci: warmup chmod failed: %s\n",
                    strerror(errno));
            } else {
                pid_t wpid = fork();
                if (wpid == 0) {
                    // Child: redirect stdio to /dev/null so the warmup's
                    // potentially-noisy first-translation stderr doesn't
                    // pollute the bake log.
                    int devnull = open("/dev/null", O_RDWR | O_CLOEXEC);
                    if (devnull >= 0) {
                        dup2(devnull, STDIN_FILENO);
                        dup2(devnull, STDOUT_FILENO);
                        dup2(devnull, STDERR_FILENO);
                        if (devnull > 2) close(devnull);
                    }
                    char *argv[] = { (char *)warmup_path, NULL };
                    execv(warmup_path, argv);
                    _exit(127);
                } else if (wpid > 0) {
                    // Wait for the warmup to finish — we MUST not
                    // proceed to bake-ready until rosettad has been
                    // exercised by at least one full amd64 exec cycle.
                    // Bound the wait so a hung warmup can't block boot
                    // indefinitely (5s is generous; warmup typically
                    // completes in <500ms).
                    int status = 0;
                    int reaped = 0;    // waitpid() handed us the child
                    int wait_err = 0;  // errno of a failed waitpid()
                    struct timespec deadline_check =
                        { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000 };
                    int waited = 0;
                    while (waited < 100) {  // 100 * 50ms = 5s
                        pid_t r = waitpid(wpid, &status, WNOHANG);
                        if (r == wpid) {
                            reaped = 1;
                            break;
                        }
                        if (r < 0 && errno != EINTR) {
                            wait_err = errno;
                            break;
                        }
                        nanosleep(&deadline_check, NULL);
                        waited++;
                    }
                    if (reaped) {
                        // Only a clean exit(0) proves the amd64 binary
                        // actually ran through the translator; 127 means
                        // the exec itself failed.
                        if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
                            fprintf(stderr, "init-oci: rosetta warmup done\n");
                        } else {
                            fprintf(stderr, "init-oci: rosetta warmup "
                                "exited abnormally (status=0x%x); first "
                                "user exec may race rosettad's lazy init\n",
                                (unsigned)status);
                        }
                    } else if (wait_err != 0) {
                        // waitpid() itself failed (e.g. the child was
                        // reaped elsewhere); its outcome is unknown.
                        fprintf(stderr, "init-oci: rosetta warmup "
                            "waitpid: %s\n", strerror(wait_err));
                    } else {
                        fprintf(stderr, "init-oci: rosetta warmup "
                            "timed out (5s); first user exec may race "
                            "rosettad's lazy init\n");
                        kill(wpid, SIGKILL);
                        waitpid(wpid, &status, 0);
                    }
                    unlink(warmup_path);
                } else {
                    fprintf(stderr, "init-oci: fork warmup: %s\n",
                        strerror(errno));
                }
            }
        } else {
            fprintf(stderr, "init-oci: open warmup tmp file: %s\n",
                strerror(errno));
        }
    }
    fprintf(stderr, "init-oci: Rosetta-in-VM ready "
        "(amd64 ELFs → /run/rosetta/rosetta, AOT cache via rosettad)\n");
}

// Best-effort creation of the runtime files and device nodes that
// container workloads expect but an exported OCI rootfs may lack or
// ship with values that are wrong inside this VM. Every step ignores
// failure; nothing here is fatal to boot.
static void ensure_runtime_files(void) {
    mkdir("/etc", 0755);

    // /etc/hosts: make sure `localhost` resolves even when the OCI
    // rootfs ships an empty hosts file or one without that entry.
    // Cloudflared, sidecar patterns, and anything that hard-codes
    // "localhost" in its config resolve it via getaddrinfo() before
    // falling back to /etc/resolv.conf; without a hosts entry that
    // lookup is NXDOMAIN, and the public resolver (1.1.1.1) does not
    // know the name either.
    write_file_if_lacks("/etc/hosts", "localhost",
        "127.0.0.1\tlocalhost\n"
        "::1\tlocalhost ip6-localhost ip6-loopback\n",
        0644);

    // /etc/resolv.conf: a Docker-exported rootfs often carries
    // Docker's embedded DNS address (127.0.0.11), which is
    // meaningless inside this VM, so the file is replaced outright.
    //
    // Exactly ONE nameserver, on purpose. The TSI shim in the
    // `af-tsi` kernel patches relays UDP responses back from the
    // host and overwrites `msg_name` (the perceived response
    // source) with the LAST sendto target; there is no per-packet
    // source tracking through the vsock channel yet. With two
    // nameservers musl's `__res_msend` queries both in parallel,
    // roughly half the replies come back tagged with the wrong
    // source, and musl drops them as an off-path attack defense.
    // Symptom: `getent` / `apk add` / `wget` fail with "DNS:
    // transient error" while `nslookup` (own resolver, no source
    // check) works. A single nameserver keeps tsk->sendto_addr
    // pointing at the right server so the check always passes.
    // 1.1.1.1 was picked as the lowest-latency public resolver in
    // most regions, and it supports DNS-over-HTTPS upstream should a
    // guest-side proxy be added later for hardening.
    //
    // Options:
    //   no-aaaa         TSI currently carries AF_INET only; AF_INET6
    //                   is not wired through the muxer. Without this,
    //                   getaddrinfo() queries A+AAAA in parallel and
    //                   (per RFC 6724) prefers the AAAA answer; the
    //                   v6 connect then fails and the fallback to A
    //                   is not reliable (e.g. `apk add curl` on pure
    //                   alpine). Honoured by musl 1.2.4+ and glibc
    //                   2.36+; older libcs ignore unknown options.
    //                   The host-side TSI relay can be extended to v6
    //                   later.
    //   single-request  forces serial A→AAAA instead of parallel;
    //                   covers resolvers that ignore no-aaaa.
    //   timeout:2 attempts:2
    //                   also degrade gracefully on older musl.
    write_file_replace("/etc/resolv.conf",
        "nameserver 1.1.1.1\n"
        "options timeout:2 attempts:2 no-aaaa single-request\n",
        0644);

    // tmpfs-backed /dev/shm, capped at 64 MiB, sticky + world-writable.
    mkdir("/dev/shm", 01777);
    mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=64m,mode=1777");

    // Conventional /dev aliases for the per-process fd table:
    // { target, link path }.
    static const char *const fd_links[][2] = {
        { "/proc/self/fd",   "/dev/fd"     },
        { "/proc/self/fd/0", "/dev/stdin"  },
        { "/proc/self/fd/1", "/dev/stdout" },
        { "/proc/self/fd/2", "/dev/stderr" },
    };
    for (size_t i = 0; i < sizeof fd_links / sizeof fd_links[0]; i++)
        symlink(fd_links[i][0], fd_links[i][1]);

    // Set the unprivileged-port floor to 0 via the
    // net.ipv4.ip_unprivileged_port_start sysctl.
    int port_ctl = open("/proc/sys/net/ipv4/ip_unprivileged_port_start",
                        O_WRONLY | O_CLOEXEC);
    if (port_ctl < 0) return;
    (void)write(port_ctl, "0", 1);
    close(port_ctl);
}

// Seed CLOCK_REALTIME when the guest boots without a plausible wall
// clock.
//
// If the clock already reads later than 1000000000 (September 2001)
// it is left untouched. Otherwise the time comes from the
// `supermachine.host_time=<unix seconds>` kernel command-line
// parameter when it is present and itself later than 1000000000,
// else from a fixed fallback. A clock_settime() failure is logged
// and otherwise ignored.
static void seed_wall_clock(void) {
    struct timespec now = {0};
    if (clock_gettime(CLOCK_REALTIME, &now) == 0 && now.tv_sec > 1000000000L)
        return;

    // 1735689600 == 2025-01-01T00:00:00Z: non-zero fallback for
    // nginx-class images.
    time_t wall_sec = 1735689600;
    int fd = open("/proc/cmdline", O_RDONLY | O_CLOEXEC);
    if (fd >= 0) {
        char buf[4096];
        ssize_t n = read(fd, buf, sizeof(buf) - 1);
        close(fd);
        if (n > 0) {
            buf[n] = 0;
            static const char needle[] = "supermachine.host_time=";
            for (char *p = strstr(buf, needle); p; p = strstr(p + 1, needle)) {
                // Accept only a whole parameter: the key must start
                // the command line or follow whitespace, so a key
                // that merely ends in this text is not mistaken for
                // ours.
                if (p != buf && p[-1] != ' ' && p[-1] != '\t' && p[-1] != '\n')
                    continue;
                long long v = strtoll(p + sizeof(needle) - 1, NULL, 10);
                if (v > 1000000000LL) wall_sec = (time_t)v;
                break;
            }
        }
    }

    struct timespec ts = { .tv_sec = wall_sec, .tv_nsec = 0 };
    if (clock_settime(CLOCK_REALTIME, &ts) != 0)
        fprintf(stderr, "init-oci: clock_settime: %s\n", strerror(errno));
}

// Skip JSON insignificant whitespace starting at `p`.
static char *envjson_skip_ws(char *p) {
    while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
    return p;
}

// Parse exactly four hex digits at `s` into `*out`. Returns 0 on
// success, -1 otherwise (a NUL byte is not a hex digit, so this never
// reads past the end of the buffer).
static int envjson_hex4(const char *s, unsigned *out) {
    unsigned v = 0;
    for (int i = 0; i < 4; i++) {
        unsigned char c = (unsigned char)s[i];
        unsigned d;
        if (c >= '0' && c <= '9') d = (unsigned)(c - '0');
        else if (c >= 'a' && c <= 'f') d = (unsigned)(c - 'a') + 10u;
        else if (c >= 'A' && c <= 'F') d = (unsigned)(c - 'A') + 10u;
        else return -1;
        v = (v << 4) | d;
    }
    *out = v;
    return 0;
}

// Append code point `cp` at `dst` as UTF-8; returns the new write cursor.
static char *envjson_put_utf8(char *dst, unsigned cp) {
    if (cp < 0x80) {
        *dst++ = (char)cp;
    } else if (cp < 0x800) {
        *dst++ = (char)(0xC0 | (cp >> 6));
        *dst++ = (char)(0x80 | (cp & 0x3F));
    } else if (cp < 0x10000) {
        *dst++ = (char)(0xE0 | (cp >> 12));
        *dst++ = (char)(0x80 | ((cp >> 6) & 0x3F));
        *dst++ = (char)(0x80 | (cp & 0x3F));
    } else {
        *dst++ = (char)(0xF0 | (cp >> 18));
        *dst++ = (char)(0x80 | ((cp >> 12) & 0x3F));
        *dst++ = (char)(0x80 | ((cp >> 6) & 0x3F));
        *dst++ = (char)(0x80 | (cp & 0x3F));
    }
    return dst;
}

// Decode, in place, the JSON string whose opening quote is at `*pp`.
// On success returns the decoded NUL-terminated text and advances
// `*pp` past the closing quote. Returns NULL for an unterminated
// string, an unknown escape, or an escaped U+0000 (environment
// strings cannot carry NUL). Decoded text is never longer than its
// escaped form, so the write cursor never overtakes the read cursor.
static char *envjson_string(char **pp) {
    char *src = *pp;
    if (*src != '"') return NULL;
    src++;
    char *start = src;
    char *dst = src;
    while (*src && *src != '"') {
        if (*src != '\\') { *dst++ = *src++; continue; }
        src++;  // step over the backslash
        switch (*src) {
        case '"': case '\\': case '/': *dst++ = *src++; break;
        case 'b': *dst++ = '\b'; src++; break;
        case 'f': *dst++ = '\f'; src++; break;
        case 'n': *dst++ = '\n'; src++; break;
        case 'r': *dst++ = '\r'; src++; break;
        case 't': *dst++ = '\t'; src++; break;
        case 'u': {
            unsigned cp;
            if (envjson_hex4(src + 1, &cp) != 0) return NULL;
            src += 5;
            if (cp >= 0xD800 && cp <= 0xDBFF) {
                // High surrogate: needs a following \uDC00..\uDFFF.
                unsigned lo;
                if (src[0] == '\\' && src[1] == 'u' &&
                    envjson_hex4(src + 2, &lo) == 0 &&
                    lo >= 0xDC00 && lo <= 0xDFFF) {
                    cp = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
                    src += 6;
                } else {
                    cp = 0xFFFD;  // unpaired surrogate
                }
            } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
                cp = 0xFFFD;      // stray low surrogate
            }
            if (cp == 0) return NULL;
            dst = envjson_put_utf8(dst, cp);
            break;
        }
        default:
            return NULL;  // unknown escape, or backslash at end of data
        }
    }
    if (*src != '"') return NULL;
    *dst = 0;
    *pp = src + 1;
    return start;
}

// Find `quoted_key` (the key including its double quotes) used as an
// object member whose value is itself an object. Returns a pointer to
// that value's '{', or NULL. Must run on the raw, not-yet-decoded
// buffer. Hits that are not followed by ':' and '{' (for example a
// string VALUE that happens to equal the key) are skipped.
static char *envjson_find_object(char *from, const char *quoted_key) {
    size_t klen = strlen(quoted_key);
    for (char *hit = strstr(from, quoted_key); hit; hit = strstr(hit + 1, quoted_key)) {
        char *q = envjson_skip_ws(hit + klen);
        if (*q != ':') continue;
        q = envjson_skip_ws(q + 1);
        if (*q == '{') return q;
    }
    return NULL;
}

// Walk the flat JSON object at `p` (pointing at '{') and setenv()
// every member whose value is a string. Scalar non-string values are
// skipped; a nested container or any malformed input ends the walk,
// keeping whatever was already applied.
static void envjson_apply_object(char *p) {
    if (*p != '{') return;
    p++;
    for (;;) {
        p = envjson_skip_ws(p);
        if (*p == ',') { p++; continue; }
        if (*p != '"') return;  // '}', end of data, or malformed
        char *key = envjson_string(&p);
        if (!key) return;
        p = envjson_skip_ws(p);
        if (*p != ':') return;
        p = envjson_skip_ws(p + 1);
        if (*p != '"') {
            if (*p == '{' || *p == '[') return;
            while (*p && *p != ',' && *p != '}') p++;
            continue;
        }
        char *val = envjson_string(&p);
        if (!val) return;
        // setenv() itself rejects an empty name or one containing '='.
        (void)setenv(key, val, 1);
    }
}

// Pull the env JSON via AF_VSOCK and set each K=V into the env.
// JSON shape: {"env":{"K":"V",...},"secrets":{"K":"V",...}}.
// We do a tiny hand-rolled parser — no JSON deps.
//
// String escapes (\" \\ \/ \b \f \n \r \t \uXXXX) are decoded, so
// values that contain quotes or backslashes arrive intact. "secrets"
// is applied after "env", so on a duplicate key the secret wins. All
// failures are non-fatal: they are logged (socket/connect) or simply
// end the parse, and boot continues with whatever was set.
static void fetch_and_set_env(void) {
    int s = socket(AF_VSOCK, SOCK_STREAM, 0);
    if (s < 0) { fprintf(stderr, "init-oci: socket(AF_VSOCK): %s\n", strerror(errno)); return; }
    struct sockaddr_vm a = {0};
    a.svm_family = AF_VSOCK; a.svm_cid = HOST_CID; a.svm_port = ENV_PORT;
    if (connect(s, (struct sockaddr*)&a, sizeof(a)) < 0) {
        fprintf(stderr, "init-oci: env connect: %s\n", strerror(errno));
        close(s); return;
    }

    // Read until EOF or until the buffer is full (one byte is kept
    // back for the terminator).
    char buf[16384];
    size_t total = 0;
    while (total < sizeof(buf) - 1) {
        ssize_t n = read(s, buf + total, sizeof(buf) - 1 - total);
        if (n < 0 && errno == EINTR) continue;
        if (n <= 0) break;
        total += (size_t)n;
    }
    close(s);
    buf[total] = 0;
    if (total == sizeof(buf) - 1)
        fprintf(stderr, "init-oci: env payload filled the %zu-byte buffer; "
                "variables past that point may be missing\n", sizeof(buf));

    // Locate both objects BEFORE decoding either one: decoding
    // rewrites the buffer in place and plants NUL terminators that
    // would stop a later strstr().
    char *env_obj = envjson_find_object(buf, "\"env\"");
    char *secrets_obj = envjson_find_object(buf, "\"secrets\"");
    if (env_obj) envjson_apply_object(env_obj);
    if (secrets_obj) envjson_apply_object(secrets_obj);
}

// Load the workload command line from CMD_FILE into `argv`.
//
// The file holds one argv token per line. `out_buf` (at least ARG_BUF
// bytes) receives the raw file contents and is split in place, so the
// `argv` entries point into it. At most `max_argv - 1` tokens are
// stored, followed by a NULL terminator. Leading blanks before a
// token are skipped, which also drops empty lines.
//
// Returns the number of tokens stored. When the file cannot be opened
// or yields no bytes, falls back to `/bin/sh` and returns 1.
static int read_cmd(char *out_buf, char **argv, int max_argv) {
    ssize_t got = -1;
    int fd = open(CMD_FILE, O_RDONLY);
    if (fd >= 0) {
        got = read(fd, out_buf, ARG_BUF - 1);
        close(fd);
    }
    if (got <= 0) {
        // Fallback: /bin/sh.
        argv[0] = (char *)"/bin/sh";
        argv[1] = NULL;
        return 1;
    }
    out_buf[got] = 0;

    int count = 0;
    char *cursor = out_buf;
    for (;;) {
        if (count >= max_argv - 1) break;
        // Skip blanks and line breaks in front of the next token.
        cursor += strspn(cursor, " \t\n\r");
        if (*cursor == 0) break;
        argv[count++] = cursor;
        // A token runs to the end of its line.
        cursor += strcspn(cursor, "\n\r");
        if (*cursor == 0) break;
        *cursor++ = 0;
    }
    argv[count] = NULL;
    return count;
}

// Returns 1 when `s` is a non-empty string made up solely of decimal
// digits, 0 otherwise (including for NULL and "").
static int all_digits(const char *s) {
    if (s == NULL || s[0] == '\0') return 0;
    size_t i = 0;
    while (s[i] != '\0') {
        // Cast first: isdigit() is undefined for negative char values.
        if (!isdigit((unsigned char)s[i])) return 0;
        i++;
    }
    return 1;
}

// Switch the calling process to the user recorded in USER_FILE.
//
// The file holds a `user[:group]` spec; each part may be a name or a
// decimal id. A missing, unreadable or blank file leaves credentials
// unchanged. An unknown user or group name, or a failing
// setgid()/setuid(), terminates the process with status 126.
//
// A numeric uid without a passwd entry keeps gid 0 unless a group is
// given, and skips initgroups().
static void drop_to_image_user(void) {
    char spec[256];
    ssize_t len;
    {
        int fd = open(USER_FILE, O_RDONLY | O_CLOEXEC);
        if (fd < 0) return;
        len = read(fd, spec, sizeof(spec) - 1);
        close(fd);
    }
    if (len <= 0) return;
    spec[len] = 0;

    // Strip trailing line breaks and blanks.
    for (; len > 0; len--) {
        char c = spec[len - 1];
        if (c != '\n' && c != '\r' && c != ' ' && c != '\t') break;
        spec[len - 1] = 0;
    }
    if (spec[0] == 0) return;

    // Split "user:group" in place; `group` stays NULL without a colon.
    char *group = strchr(spec, ':');
    if (group != NULL) {
        *group = 0;
        group++;
    }

    uid_t uid = 0;
    gid_t gid = 0;
    const char *init_user = NULL;

    if (!all_digits(spec)) {
        struct passwd *entry = getpwnam(spec);
        if (entry == NULL) {
            fprintf(stderr, "init-oci: unknown user %s\n", spec);
            _exit(126);
        }
        uid = entry->pw_uid;
        gid = entry->pw_gid;
        init_user = entry->pw_name;
    } else {
        uid = (uid_t)strtoul(spec, NULL, 10);
        struct passwd *entry = getpwuid(uid);
        if (entry != NULL) {
            gid = entry->pw_gid;
            init_user = entry->pw_name;
        }
    }

    // An explicit group overrides the passwd primary group.
    if (group != NULL && *group != 0) {
        if (!all_digits(group)) {
            struct group *gr = getgrnam(group);
            if (gr == NULL) {
                fprintf(stderr, "init-oci: unknown group %s\n", group);
                _exit(126);
            }
            gid = gr->gr_gid;
        } else {
            gid = (gid_t)strtoul(group, NULL, 10);
        }
    }

    // Order matters: supplementary groups and gid must change while
    // we still hold the privilege to do so, i.e. before setuid().
    if (init_user != NULL) (void)initgroups(init_user, gid);
    if (setgid(gid) != 0) {
        fprintf(stderr, "init-oci: setgid(%lu): %s\n",
                (unsigned long)gid, strerror(errno));
        _exit(126);
    }
    if (setuid(uid) != 0) {
        fprintf(stderr, "init-oci: setuid(%lu): %s\n",
                (unsigned long)uid, strerror(errno));
        _exit(126);
    }
}

// Returns 1 when the workload's stdio should stay on the PID-1
// console, 0 when it should go to /dev/null (the default).
//
// Why /dev/null by default: PL011 is byte-by-byte MMIO-emulated, so
// pushing a request log line through it on every HTTP hit (~600 B ×
// thousands of req/s) saturates the vCPU on MMIO exits and cuts RPS
// by ~8x. An earlier "default to console" attempt was reverted after
// benchmarks showed 37k -> 4.5k rps on nginx. Real workload logs need
// a faster channel (vsock capture); see
// docs/design/lifecycle-v2-2026-05-04.md.
//
// Opt in for debugging with SUPERMACHINE_GUEST_STDIO set to one of:
// 1, true, yes, on, console, inherit (exact, case-sensitive match).
static int guest_stdio_to_console(void) {
    static const char *const opt_in[] = {
        "1", "true", "yes", "on", "console", "inherit",
    };

    const char *v = getenv("SUPERMACHINE_GUEST_STDIO");
    if (v == NULL || v[0] == '\0') return 0;

    for (size_t i = 0; i < sizeof opt_in / sizeof opt_in[0]; i++) {
        if (strcmp(v, opt_in[i]) == 0) return 1;
    }
    return 0;
}

// Point stdin/stdout/stderr at /dev/null unless the console opt-in
// (see guest_stdio_to_console) is active.
//
// Returns 0 on success or when stdio is left on the console, -1 when
// /dev/null cannot be opened or any dup2() fails. All three
// descriptors are attempted even after an earlier one fails.
static int redirect_child_stdio(void) {
    if (guest_stdio_to_console()) return 0;

    // Deliberately no O_CLOEXEC. If a standard descriptor was closed,
    // open() can itself return 0, 1 or 2; dup2(fd, fd) is then a no-op
    // that leaves the descriptor's flags alone, so a close-on-exec
    // flag would make that stdio slot disappear across the upcoming
    // exec. Any descriptor above 2 is closed explicitly below, so
    // nothing leaks into the workload either way.
    int fd = open("/dev/null", O_RDWR);
    if (fd < 0) return -1;
    int rc = 0;
    if (dup2(fd, STDIN_FILENO) < 0) rc = -1;
    if (dup2(fd, STDOUT_FILENO) < 0) rc = -1;
    if (dup2(fd, STDERR_FILENO) < 0) rc = -1;
    if (fd > STDERR_FILENO) close(fd);
    return rc;
}

// Report a failed exec of the workload. `errno` must still hold the
// exec error on entry; it is captured first because open() may
// overwrite it. The message goes to /dev/console when that can be
// opened, otherwise to the current stderr.
static void report_child_exec_failure(const char *argv0) {
    const int exec_errno = errno;
    int console = open("/dev/console", O_WRONLY | O_CLOEXEC);
    int out = (console >= 0) ? console : STDERR_FILENO;
    dprintf(out, "init-oci: child execvp(%s): %s\n",
            argv0, strerror(exec_errno));
    // Only close a descriptor we opened ourselves above the stdio range.
    if (out > STDERR_FILENO) close(out);
}

// Two-phase init.
//   Phase 1 (initramfs): /dev/vd[a-z] are squashfs layers attached
//     base→top. We compose an overlayfs:
//       lowerdir = top squashfs:...:base squashfs (read-only)
//       upperdir + workdir = tmpfs (writable, per-dispatch)
//     so the customer's program can `mkdir /var/log/nginx` etc.
//     without us needing per-image knowledge of which paths are
//     writable. Then switch_root + exec /init from the overlay.
//   Phase 2 (overlay rootfs): /.supermachine-cmd is present. Fetch env,
//     exec the customer program.
// Returns 0 when there is nothing to do (the marker file already
// exists, /dev/vda is absent, or no squashfs layer could be mounted)
// and -1 on a setup failure. On success it does not return: the
// process image is replaced by /init from the overlay via execve().
static int try_pivot_to_overlay(void) {
    struct stat st;
    if (stat("/.supermachine-cmd", &st) == 0) return 0;  // already in overlay
    if (stat("/dev/vda", &st) != 0) return 0;        // no rootfs disk

    // Mount /dev/vda, /dev/vdb, ... as squashfs on /lower-0, /lower-1,
    // ... until a device node is missing or refuses to mount.
    int n_layers = 0;
    for (char letter = 'a'; letter <= 'z'; letter++) {
        int i = letter - 'a';
        char dev_path[16];
        snprintf(dev_path, sizeof dev_path, "/dev/vd%c", letter);
        if (stat(dev_path, &st) != 0) break;

        char lower_path[32];
        snprintf(lower_path, sizeof lower_path, "/lower-%d", i);
        mkdir(lower_path, 0755);
        // Stop on the first non-squashfs device. The bake pipeline
        // orders volumes (--volume HOST:GUEST, ext4) AFTER the
        // image's squashfs layers, so a failed squashfs mount
        // means we've reached the volume range. Volumes get
        // mounted later by `mount_volumes()` post-pivot.
        if (mount(dev_path, lower_path, "squashfs", MS_RDONLY, NULL) != 0) {
            rmdir(lower_path);
            break;
        }
        n_layers++;
    }
    if (n_layers == 0) return 0;

    // One tmpfs backs both the overlay's upperdir and its workdir.
    mkdir("/upper", 0755);
    mkdir("/newroot", 0755);
    if (mount("tmpfs", "/upper", "tmpfs", 0, NULL) != 0) {
        fprintf(stderr, "init-oci: mount tmpfs upper: %s\n", strerror(errno));
        return -1;
    }
    mkdir("/upper/upper", 0755);
    mkdir("/upper/work", 0755);
    // Build "lowerdir=/lower-<n-1>:...:/lower-0,upperdir=...,workdir=...".
    // The highest-numbered layer is listed first, i.e. top → base.
    char opts[1024];
    int off = snprintf(opts, sizeof opts, "lowerdir=");
    for (int i = n_layers - 1; i >= 0; i--) {
        off += snprintf(opts + off, sizeof opts - off,
                        "/lower-%d%s", i, i > 0 ? ":" : "");
    }
    snprintf(opts + off, sizeof opts - off,
             ",upperdir=/upper/upper,workdir=/upper/work");
    if (mount("overlay", "/newroot", "overlay", 0, opts) != 0) {
        fprintf(stderr, "init-oci: mount overlay: %s\n", strerror(errno));
        return -1;
    }
    // Move pseudo-fs into /newroot so child init finds them.
    // (Results are not checked; a failed move is not reported here.)
    mount("/dev", "/newroot/dev", NULL, MS_MOVE, NULL);
    mount("/proc", "/newroot/proc", NULL, MS_MOVE, NULL);
    mount("/sys", "/newroot/sys", NULL, MS_MOVE, NULL);
    // Switch root: enter the new root, move it over "/", then chroot
    // into it. The order of these three steps must not change.
    if (chdir("/newroot") != 0) {
        fprintf(stderr, "init-oci: chdir /newroot: %s\n", strerror(errno));
        return -1;
    }
    if (mount(".", "/", NULL, MS_MOVE, NULL) != 0) {
        fprintf(stderr, "init-oci: mount move /: %s\n", strerror(errno));
        return -1;
    }
    if (chroot(".") != 0) {
        fprintf(stderr, "init-oci: chroot: %s\n", strerror(errno));
        return -1;
    }
    chdir("/");
    // Re-exec /init from the overlay with an empty environment; that
    // instance sees /.supermachine-cmd and takes the early return at
    // the top of this function.
    char *argv[] = { (char *)"/init", NULL };
    char *envp[] = { NULL };
    execve("/init", argv, envp);
    // Only reached when execve() itself failed.
    fprintf(stderr, "init-oci: execve /init (overlay): %s\n", strerror(errno));
    return -1;
}

// `--hostname HOSTNAME` from the bake's CLI lands as a single
// line in `/.supermachine-hostname` (delta squashfs). Read +
// sethostname() before the workload exec so customers see the
// expected name in `uname -n`, prompts, log lines, etc. No-op
// if the file is missing.
// Set the guest hostname from the first bytes of HOSTNAME_FILE,
// with trailing newlines, carriage returns and spaces removed. Does
// nothing when the file is missing, unreadable or blank; a failing
// sethostname() is logged and otherwise ignored.
static void apply_hostname(void) {
    char name[128];

    int fd = open(HOSTNAME_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) return;
    ssize_t len = read(fd, name, sizeof(name) - 1);
    close(fd);
    if (len <= 0) return;
    name[len] = 0;

    // Trim trailing newline / CR / space.
    for (; len > 0; len--) {
        char c = name[len - 1];
        if (c != '\n' && c != '\r' && c != ' ') break;
        name[len - 1] = 0;
    }
    if (len == 0) return;

    if (sethostname(name, len) == 0) return;
    fprintf(stderr, "init-oci: sethostname(%s): %s\n", name, strerror(errno));
}

// Mount writable volumes attached as virtio-blk after the
// read-only image layers. The bake pipeline writes
// /.supermachine-volumes (one absolute guest mount path per line)
// and orders volumes after the layers in /dev/vd* (so layers are
// /dev/vda..vd<n>, volumes are /dev/vd<n+1>..). The host has
// already formatted each volume ext4 — we just mount.
//
// On any error mounting a single volume, we log + continue: the
// workload may not need the missing mount, and a partial-volume
// boot is more debuggable than a hard failure.
// Mount every volume listed in /.supermachine-volumes (one guest
// mount path per line) as ext4, read-write.
//
// Volumes are taken to be the LAST n_paths /dev/vd* devices, in file
// order. That mapping is only right when the whole path list is
// known, so the list holds one slot per possible device letter
// (vda..vdz) and a longer list is refused outright instead of being
// truncated, which would shift every path onto the wrong device.
//
// A missing or empty file is a silent no-op. A failure to mount one
// volume is logged and the remaining volumes are still attempted.
static void mount_volumes(void) {
    enum { volume_slots = 26 };  // vda..vdz

    int fd = open("/.supermachine-volumes", O_RDONLY | O_CLOEXEC);
    if (fd < 0) return;  // no volumes configured
    char buf[4096];
    ssize_t n = read(fd, buf, sizeof(buf) - 1);
    close(fd);
    if (n <= 0) return;
    buf[n] = 0;

    // Collect mount paths (blank lines are skipped).
    char *paths[volume_slots];
    int n_paths = 0;
    char *p = buf;
    while (*p) {
        char *eol = strchr(p, '\n');
        if (eol) *eol = 0;
        if (*p) {
            if (n_paths == volume_slots) {
                fprintf(stderr, "init-oci: more than %d volume mount paths "
                        "listed; not mounting any\n", (int)volume_slots);
                return;
            }
            paths[n_paths++] = p;
        }
        if (!eol) break;
        p = eol + 1;
    }
    if (n_paths == 0) return;

    // Count total /dev/vd* devices (max 26: vda..vdz).
    int total = 0;
    struct stat st;
    for (char letter = 'a'; letter <= 'z'; letter++) {
        char dev[16];
        snprintf(dev, sizeof dev, "/dev/vd%c", letter);
        if (stat(dev, &st) != 0) break;
        total++;
    }
    if (n_paths > total) {
        fprintf(stderr, "init-oci: %d volumes requested but only %d /dev/vd* devices\n",
                n_paths, total);
        return;
    }
    int first_volume_idx = total - n_paths;

    for (int i = 0; i < n_paths; i++) {
        char dev[16];
        snprintf(dev, sizeof dev, "/dev/vd%c", 'a' + first_volume_idx + i);
        const char *mount_point = paths[i];
        // Try the common case first (parent already exists); only on
        // failure walk the path and create each missing parent.
        if (mkdir(mount_point, 0755) < 0 && errno != EEXIST) {
            // Best-effort: try to create parents.
            char parent[4096];
            snprintf(parent, sizeof parent, "%s", mount_point);
            for (char *q = parent + 1; *q; q++) {
                if (*q == '/') {
                    *q = 0;
                    mkdir(parent, 0755);
                    *q = '/';
                }
            }
            mkdir(mount_point, 0755);
        }
        if (mount(dev, mount_point, "ext4", 0, NULL) < 0) {
            fprintf(stderr, "init-oci: mount %s -> %s ext4: %s\n",
                    dev, mount_point, strerror(errno));
            continue;
        }
        fprintf(stderr, "init-oci: mounted %s -> %s (ext4, rw)\n", dev, mount_point);
    }
}

// In-guest exec agent: a Rust binary at `/supermachine-agent`,
// shipped via the bake's delta squashfs (see `bake.rs::
// ensure_supermachine_agent`). It binds AF_VSOCK port EXEC_PORT
// (1028) and serves the framed exec protocol described in
// `docs/design/exec-2026-05-03.md`.
//
// Lifecycle: forked from main() before the heartbeat marker so by
// the time the snapshot fires (on listener-ready) the agent is
// already parked in `accept()` — capture-clean state.
//
// If the agent binary is missing (e.g. an older bake that ran
// before the agent feature landed), skip silently and let the
// workload boot anyway. The host's `<vsock>-exec.sock` will dial
// to no listener and return an immediate EOF; tooling treats that
// as "exec not available on this snapshot."
// Fork and exec /supermachine-agent. Logs and returns without a
// child when the binary is absent or fork() fails; the parent never
// waits for the agent here.
static void spawn_exec_agent(void) {
    static const char agent_path[] = "/supermachine-agent";

    struct stat st;
    if (stat(agent_path, &st) != 0) {
        fprintf(stderr, "exec-agent: /supermachine-agent missing — exec disabled\n");
        return;
    }

    pid_t child = fork();
    if (child > 0) {
        fprintf(stderr, "exec-agent: spawned pid=%d\n", child);
        return;
    }
    if (child < 0) {
        fprintf(stderr, "exec-agent: fork: %s\n", strerror(errno));
        return;
    }

    // Child. The agent inherits pid 1's full environment through the
    // global `environ` (execvp passes it along): the OCI image's ENV
    // directives plus the host's --env overrides, as populated by
    // fetch_and_set_env. Without that, the agent and every workload
    // it spawns would start with an empty env, and images such as
    // rust:1-slim that rely on PATH / RUSTUP_HOME / CARGO_HOME would
    // break out of the box.
    char *argv[] = { (char *)agent_path, NULL };
    execvp(agent_path, argv);
    fprintf(stderr, "exec-agent: execvp: %s\n", strerror(errno));
    _exit(127);
}

/// Wait until the in-guest exec agent is listening on AF_VSOCK
/// port 1028, by polling for the ready sentinel file the agent
/// creates once its listen() has succeeded. Polls at a fixed
/// 500 us interval up to a bounded total of 1 s — typical wait
/// is <5 ms.
///
/// Why this matters: the bake's pre-exec snapshot fires right
/// after this function returns. If we snapshot before the agent
/// has bound + listened + entered accept(), the captured AF_VSOCK
/// kernel state is mid-syscall and restored guests get a half-
/// initialized agent. Symptom: `vm.exec` on a restored worker
/// hangs forever, or the agent closes the exec connection mid-
/// protocol. By blocking here until the sentinel file appears,
/// the snapshot is taken only after the agent has reported
/// itself ready.
///
/// If the agent isn't available (binary missing, fork failed),
/// the sentinel never appears and we'll time out after the
/// budget — that's fine, we proceed without exec support, same
/// as today.
static void wait_for_exec_agent_listening(void) {
    // Poll for the agent's "ready" sentinel. The agent creates
    // /run/supermachine-agent-ready once listen() on
    // AF_VSOCK:EXEC_PORT has returned successfully; /run is tmpfs
    // (mounted in mount_pseudofs).
    //
    // A passive file check is used rather than an active
    // connect+close probe because such a probe would race the
    // agent's per-connection thread cleanup at snapshot time and
    // leave a "ghost thread" in the captured state, breaking the
    // FIRST real exec call on every restored worker (symptom:
    // "agent closed connection before sending EXIT"). Polling a
    // file touches nothing in the agent's address space, so the
    // snapshot captures the agent cleanly in accept() with no
    // leftover state.
    //
    // With no agent binary (older bakes) the file never shows up
    // and we give up after 1 s: the bake proceeds, exec on restore
    // stays broken, but nothing hangs.
    const int budget_ms = 1000;
    const int step_us = 500;
    const struct timespec pause = { .tv_sec = 0, .tv_nsec = step_us * 1000 };
    struct stat st;

    // elapsed_us counts nominal sleep time, not measured wall time.
    for (int elapsed_us = 0; elapsed_us < budget_ms * 1000; elapsed_us += step_us) {
        if (stat("/run/supermachine-agent-ready", &st) == 0) {
            fprintf(stderr,
                    "exec-agent: ready after %d us probe\n",
                    elapsed_us);
            return;
        }
        nanosleep(&pause, NULL);
    }
    fprintf(stderr,
            "exec-agent: ready file absent after %d ms; proceeding\n",
            budget_ms);
}

// PID-1 entry point. Sets up the guest (pseudo-filesystems, overlay
// pivot, runtime files, clock, hostname, environment), starts the
// exec agent, prints the heartbeat marker, mounts volumes, then runs
// the workload under supervise() and returns its result.
int main(void) {
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);
    // Length comes from the array itself; a hand-counted length here
    // previously overshot the literal and read past its end.
    static const char hello[] = "init-oci: hello from pid 1\n";
    write(1, hello, sizeof(hello) - 1);

    mount_pseudofs();
    bring_up_loopback();
    if (try_pivot_to_overlay() < 0) die("pivot");
    // If we get here, we're in the squashfs (or no vda existed).
    ensure_runtime_files();
    apply_ipv6_policy();
    open_ping_group_range();
    create_fake_iface();
    setup_rosetta_in_vm();
    seed_wall_clock();
    apply_hostname();
    fetch_and_set_env();

    // Best-effort: load the snapshot-park kernel module if the
    // bake staged it at /supermachine-smpark.ko. Used by the
    // host's multi-vCPU snapshot path to drive secondaries into
    // a known parked-WFI state before capture. Single-vCPU bakes
    // don't need this; the module no-ops when num_online_cpus()
    // is 1.
    //
    // Failure modes (silently ignored):
    //   * /supermachine-smpark.ko absent — single-vCPU bake or
    //     this kernel ships pre-smpark.
    //   * vermagic mismatch — kernel rebuilt without rebuilding
    //     the module. Snapshot-park unavailable for this run;
    //     multi-vCPU snapshots fall back to the existing
    //     intermittent capture path.
    {
        struct stat smpst;
        if (stat("/supermachine-smpark.ko", &smpst) == 0) {
            int fd = open("/supermachine-smpark.ko", O_RDONLY | O_CLOEXEC);
            if (fd >= 0) {
                /* finit_module is the Linux 3.8+ interface that
                 * loads from an fd directly — no userspace
                 * malloc + read of the .ko bytes. */
                long rc = syscall(__NR_finit_module, fd, "", 0);
                if (rc < 0)
                    fprintf(stderr,
                            "init-oci: load smpark.ko: %s\n",
                            strerror(errno));
                close(fd);
            }
        }
    }

    spawn_exec_agent();

    // Wait until the agent has finished listen() + entered the
    // accept loop. Critical for pre-exec snapshots: see the
    // function comment. ~1-5 ms typical; bounded to 1 s.
    wait_for_exec_agent_listening();

    // Heartbeat marker for the VMM's --snapshot-at trigger. The
    // line-aware detector in PL011 (`devices/serial.rs`) requires
    // the full literal `heartbeat counter=N\n` — the previous
    // hard-coded length truncated the trailing `er=1\n`, which
    // was harmless when nothing relied on the heartbeat firing
    // promptly but breaks `--snapshot-at` for volume mode.
    //
    // CRITICAL ORDERING: volumes are mounted AFTER this marker so
    // the snapshot captures *pre-mount* state. Each restore then
    // mounts the volume fresh, picking up the host file's current
    // contents. Mounting pre-snapshot would freeze the in-guest
    // page cache to the bake-time empty filesystem; subsequent
    // runs that wrote to the host file would see EBADMSG when
    // ext4's superblock cache disagreed with the on-disk state.
    {
        const char *m = "[SUPERMACHINE-INIT] heartbeat counter=1\n";
        write(1, m, strlen(m));
    }
    mount_volumes();

    static char buf[ARG_BUF];
    char *argv[MAX_ARGS];
    int argc = read_cmd(buf, argv, MAX_ARGS);
    if (argc < 1) die("read_cmd");

    // Optional workdir.
    int wfd = open(WD_FILE, O_RDONLY);
    if (wfd >= 0) {
        char wd[4096]; ssize_t n = read(wfd, wd, sizeof(wd) - 1); close(wfd);
        if (n > 0) {
            wd[n] = 0;
            // Trim trailing newline.
            while (n > 0 && (wd[n-1] == '\n' || wd[n-1] == '\r')) wd[--n] = 0;
            if (chdir(wd) < 0)
                fprintf(stderr, "init-oci: chdir(%s): %s\n", wd, strerror(errno));
        }
    }

    fprintf(stderr, "init-oci: exec");
    for (int i = 0; i < argc; i++) fprintf(stderr, " %s", argv[i]);
    fprintf(stderr, "\n");

    // Run customer cmd as a CHILD; init stays PID 1 to play the
    // role a real init system plays — reap zombies, forward signals.
    // Programs not designed to be PID 1 (nginx, postgres, anything
    // that fork()s and expects the parent to wait for SIGCHLD) blow
    // up otherwise.
    return supervise(argv);
}

// PID-1 supervisor. Mirrors the responsibilities tini / dumb-init /
// libkrun-init handle. Without these, daemons like nginx that fork
// workers SIGSEGV the moment they fork because nobody reaps the
// dead worker and the master tries to talk to a process the kernel
// has already cleaned up under it.
//
// g_child_pid: pid of the supervised workload. It is read from the
// signal handlers forward_signal() and reap_zombies(), hence
// `volatile`. forward_signal() only signals when the value is > 0,
// so the initial 0 means "no child yet".
// NOTE(review): presumably assigned by supervise() right after it
// forks the workload — confirm against that function.
static volatile pid_t g_child_pid = 0;

// Signal handler: relay `sig` to the supervised child, if one is
// recorded in g_child_pid. errno is saved and restored around kill()
// (as reap_zombies does around waitpid) so that a failing kill()
// cannot corrupt the errno of whatever code this handler interrupted.
static void forward_signal(int sig) {
    int saved_errno = errno;
    if (g_child_pid > 0) kill(g_child_pid, sig);
    errno = saved_errno;
}

// SIGCHLD handler: collect every child that has already exited,
// without blocking, and leave errno as it was on entry.
//
// NOTE(review): waitpid(-1) does not exclude the main child, so this
// handler can also collect it and discard its exit status. The
// intent is that the supervisor's own waitpid claims that one (it is
// said to block SIGCHLD around that call) — confirm against
// supervise().
static void reap_zombies(int sig) {
    (void)sig;
    const int errno_on_entry = errno;
    pid_t reaped;
    while ((reaped = waitpid(-1, NULL, WNOHANG)) > 0) {
        if (reaped == g_child_pid) {
            // Not expected to happen (see note above); tolerated.
        }
    }
    errno = errno_on_entry;
}

static int supervise(char **argv) {
    // Install signal handlers BEFORE forking so the child inherits
    // SIG_DFL (we'll restore in the child after fork).
    struct sigaction sa = {0};
    sa.sa_handler = reap_zombies;
    sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    sigaction(SIGCHLD, &sa, NULL);

    sa.sa_handler = forward_signal;
    sa.sa_flags = SA_RESTART;
    for (int s = 1; s <= 31; s++) {
        if (s == SIGKILL || s == SIGSTOP || s == SIGCHLD) continue;
        sigaction(s, &sa, NULL);
    }

    // PRE-EXEC SNAPSHOT TRIGGER
    //
    // For the always-pipelined-skip-warm `.build()` path, the host's
    // bake driver fires BAKE_READY on this marker — capturing the
    // guest in a quiesced pre-fork state INSTEAD of waiting out the
    // workload's listener startup or the wall-clock fallback. Saves
    // ~50-150 ms of bake time on slow-startup workloads (rust
    // toolchain, JVM, python with heavy imports), trading it for
    // workload-startup-on-first-restore (which is paid once-per-VM
    // by the warm-handoff worker, transparent to the embedder
    // calling `vm.exec()`).
    //
    // Implementation: print a known marker on stderr (PL011 console),
    // then nanosleep briefly so the host has a stable WFI window to
    // capture in. The host's serial.rs detects the marker and sets
    // PRE_EXEC_READY, which worker.rs's dispatch loop treats as
    // bake-ready (gated on `on_pre_exec`, set by the bake driver
    // for skip_warm_snapshot=true only — preserves listener-ready
    // semantics for `with_warmup` and service-image bakes).
    //
    // The 100 ms sleep wakes up post-capture (real time has advanced)
    // and init-oci continues to fork+exec the workload. Both for
    // bake-time AND for post-restore execution, the workload starts
    // AFTER this point — i.e. the bake snapshot does NOT include any
    // workload state. Each restore re-executes the workload from
    // scratch, which is what agent-runtime users want (fresh state
    // every cycle).
    //
    // For service-image bakes (nginx, redis, etc. where the user
    // wants the listener up at restore-time), the bake driver leaves
    // `on_pre_exec=false`; this marker is printed but ignored, and
    // the existing listener-ready path fires bake-ready. The 100 ms
    // sleep is a one-time bake overhead even in that case (~100 ms
    // out of ~250 ms vmm_bake_ms — small).
    fprintf(stderr, "init-oci: workload-pre-exec\n");
    fflush(stderr);
    {
        // 250 ms is the race window between (a) the host detecting
        // the marker on PL011 and (b) the host's bake driver
        // receiving BAKE_READY and dispatching SNAPSHOT_ASYNC. The
        // capture itself runs while the vCPU is paused inside
        // hv_vcpu_run cancellation, so the workload can't start
        // mid-capture. After capture, dispatch resumes the vCPU,
        // init-oci's nanosleep finishes (CLOCK_REALTIME has advanced
        // — on bake host time advanced naturally; on restore the
        // saved deadline is in the past so wake is immediate), and
        // init-oci forks+execs the workload.
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 250 * 1000 * 1000 };
        // Loop on EINTR so signal handlers (SIGCHLD from the
        // exec-agent) don't cut the window short.
        while (nanosleep(&ts, &ts) == -1 && errno == EINTR) {}
    }

    pid_t pid = fork();
    if (pid < 0) die("fork");
    if (pid == 0) {
        // Child: restore default signal handlers, become its own
        // process group leader so signal forwarding works cleanly.
        for (int s = 1; s <= 31; s++) signal(s, SIG_DFL);
        setpgid(0, 0);
        drop_to_image_user();
        if (redirect_child_stdio() != 0)
            report_child_exec_failure("redirect stdio");
        execvp(argv[0], argv);
        report_child_exec_failure(argv[0]);
        _exit(127);
    }
    g_child_pid = pid;
    setpgid(pid, pid);

    // Publish the workload pid so the in-guest exec agent can find
    // it on a CONTROL "signal" request from the host. /run is
    // tmpfs-mounted in `mount_pseudofs`, so the file vanishes on
    // VM shutdown — no stale pids leaking into a fresh restore.
    {
        int fd = open("/run/supermachine-workload.pid",
                      O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
        if (fd >= 0) {
            char buf[32];
            int n = snprintf(buf, sizeof(buf), "%d\n", (int)pid);
            if (n > 0) {
                (void)write(fd, buf, n);
            }
            close(fd);
        }
    }

    // Wait for THE main child specifically. Other zombies get reaped
    // by the SIGCHLD handler. EINTR loops because forwarded signals
    // wake the syscall.
    int status = 0;
    for (;;) {
        pid_t r = waitpid(pid, &status, 0);
        if (r == pid) break;
        if (r < 0 && errno != EINTR) {
            fprintf(stderr, "init-oci: waitpid: %s\n", strerror(errno));
            break;
        }
    }

    int code;
    if (WIFEXITED(status)) {
        code = WEXITSTATUS(status);
        fprintf(stderr, "init-oci: child exited %d\n", code);
    } else if (WIFSIGNALED(status)) {
        code = 128 + WTERMSIG(status);
        fprintf(stderr, "init-oci: child killed by signal %d\n",
            WTERMSIG(status));
    } else {
        code = 1;
    }

    // Sync filesystems, halt. As PID 1 we mustn't return — kernel
    // panics otherwise. sleep forever so the operator can inspect
    // the VM (or pool-worker can issue another RESTORE).
    sync();
    fprintf(stderr, "init-oci: parking PID 1 (exit=%d)\n", code);
    for (;;) pause();
}