supermachine 0.4.21

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
// init-oci.c — pid 1 for supermachine OCI container boots.
//
// The squashfs (mounted at /, kernel boots root=/dev/vda) contains:
//   - the OCI rootfs (extracted from `docker export`)
//   - /init (this binary)
//   - /.supermachine-cmd (newline-separated argv tokens; first line is the
//     program path or program name resolved against PATH)
//   - /.supermachine-workdir (optional: chdir target)
//
// What we do:
//   1. Mount /proc /sys /dev (tmpfs needed for many distros).
//   2. Pull the env JSON from the VMM's AF_VSOCK port 1026 (the
//      --env / --env-file payload). Set each K=V into the
//      environment so the customer's program inherits them.
//   3. Print the heartbeat marker so the VMM's --snapshot-at can fire
//      at a known clean point (post-env, pre-program-exec).
//   4. Read /.supermachine-cmd → argv[].
//   5. Optionally chdir to /.supermachine-workdir.
//   6. supervise the customer program as a child. By default the
//      child's stdio is /dev/null so request logs do not go through
//      the emulated serial device; set SUPERMACHINE_GUEST_STDIO=console
//      to inherit the PID-1 console for debugging.
//
// On failure, prints diagnostics + sleeps forever (so the kernel
// keeps the VM alive long enough for the operator to inspect).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <pwd.h>
#include <grp.h>
#include <ctype.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/syscall.h>     /* finit_module via syscall() */
#include <linux/vm_sockets.h>

static int supervise(char **argv);

#define HOST_CID 2
#define ENV_PORT 1026
/// Native AF_VSOCK port the in-guest exec agent listens on. Must
/// match `vmm::resources::DEFAULT_EXEC_GUEST_PORT` on the host
/// side. See `docs/design/exec-2026-05-03.md`. The agent is the
/// separate `/supermachine-agent` binary launched by
/// `spawn_exec_agent()` below, which binds this port itself.
#define EXEC_PORT 1028
#define CMD_FILE "/.supermachine-cmd"
#define WD_FILE  "/.supermachine-workdir"
#define USER_FILE "/.supermachine-user"
#define HOSTNAME_FILE "/.supermachine-hostname"
#define MAX_ARGS 64
#define ARG_BUF 65536

// Fatal-error path for pid 1. Prints `msg` plus the text for the
// current errno on stderr, then sleeps for a day so the VM stays up
// for inspection, and finally exits with status 1.
static void die(const char *msg) {
    const char *reason = strerror(errno);

    fprintf(stderr, "init-oci: %s: %s\n", msg, reason);
    sleep(86400);  // hold the VM open for the operator
    exit(1);
}

// Bring up the pseudo-filesystems and scratch mounts the rest of init
// relies on:
//   /proc, /sys, /dev     proc, sysfs, devtmpfs
//   /dev/pts              devpts (new instance)
//   /dev/shm, /tmp, /run  tmpfs
// Everything here is best-effort: failures are logged (or, for the
// tmpfs mounts, ignored) and boot carries on.
static void mount_pseudofs(void) {
    int rc;

    mkdir("/proc", 0755);
    mkdir("/sys", 0755);
    mkdir("/dev", 0755);

    // With CONFIG_PROC_FS=y / CONFIG_SYSFS=y / CONFIG_DEVTMPFS_MOUNT=y
    // the kernel may already have mounted these three. EBUSY in that
    // situation is expected and stays silent; any other errno (missing
    // path, unsupported fs, ...) is worth a log line.
    rc = mount("proc", "/proc", "proc", 0, NULL);
    if (rc < 0 && errno != EBUSY) {
        fprintf(stderr, "init-oci: mount proc: %s\n", strerror(errno));
    }
    rc = mount("sysfs", "/sys", "sysfs", 0, NULL);
    if (rc < 0 && errno != EBUSY) {
        fprintf(stderr, "init-oci: mount sysfs: %s\n", strerror(errno));
    }
    rc = mount("devtmpfs", "/dev", "devtmpfs", 0, NULL);
    if (rc < 0 && errno != EBUSY) {
        fprintf(stderr, "init-oci: mount devtmpfs: %s\n", strerror(errno));
    }

    // devpts gives the exec agent's openpty() its /dev/pts/N nodes;
    // plenty of images want it too (`script`, `gdb`, ...).
    mkdir("/dev/pts", 0755);
    rc = mount("devpts", "/dev/pts", "devpts", 0,
               "newinstance,ptmxmode=0666,mode=0620,gid=5");
    if (rc < 0) {
        fprintf(stderr, "init-oci: mount devpts: %s\n", strerror(errno));
    }

    mkdir("/dev/shm", 01777);
    mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=64m,mode=1777");

    // Writable /tmp and /run: many programs assume both exist.
    mkdir("/tmp", 01777);
    chmod("/tmp", 01777);
    mkdir("/run", 0755);
    mount("tmpfs", "/tmp", "tmpfs", 0, NULL);
    mount("tmpfs", "/run", "tmpfs", 0, NULL);
}

// Create `path` holding `body` with permission bits `mode`, but only
// when stat() finds nothing at `path`. Best-effort: a failed open or
// write is silently ignored.
static void write_file_if_missing(const char *path, const char *body, mode_t mode) {
    struct stat existing;

    if (stat(path, &existing) != 0) {
        int out = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
        if (out >= 0) {
            (void)write(out, body, strlen(body));
            close(out);
        }
    }
}

// Write `body` to `path`, creating the file with permission bits
// `mode` if needed and truncating any previous contents.
// Best-effort: a failed open or write is silently ignored.
static void write_file_replace(const char *path, const char *body, mode_t mode) {
    int out = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);

    if (out >= 0) {
        (void)write(out, body, strlen(body));
        close(out);
    }
}

// Fill in the runtime files and device links a container image
// normally gets from its runtime:
//   - /etc/hosts (only if absent) and /etc/resolv.conf (always replaced)
//   - a tmpfs on /dev/shm
//   - /dev/fd and /dev/std{in,out,err} symlinks into /proc/self/fd
//   - ip_unprivileged_port_start set to 0
// Every step is best-effort; errors are ignored.
static void ensure_runtime_files(void) {
    static const char hosts_body[] =
        "127.0.0.1\tlocalhost\n"
        "::1\tlocalhost ip6-localhost ip6-loopback\n";
    static const char resolv_body[] =
        "nameserver 1.1.1.1\n"
        "nameserver 8.8.8.8\n"
        "options timeout:2 attempts:2\n";

    mkdir("/etc", 0755);
    write_file_if_missing("/etc/hosts", hosts_body, 0644);

    // A Docker-exported rootfs often still points at Docker's embedded
    // DNS (127.0.0.11), which means nothing inside this VM. Overwrite
    // it with public resolver sentinels for the TSI egress path.
    write_file_replace("/etc/resolv.conf", resolv_body, 0644);

    mkdir("/dev/shm", 01777);
    mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=64m,mode=1777");

    symlink("/proc/self/fd",   "/dev/fd");
    symlink("/proc/self/fd/0", "/dev/stdin");
    symlink("/proc/self/fd/1", "/dev/stdout");
    symlink("/proc/self/fd/2", "/dev/stderr");

    // Writing "0" here lowers the first unprivileged port to 0.
    int knob = open("/proc/sys/net/ipv4/ip_unprivileged_port_start",
                    O_WRONLY | O_CLOEXEC);
    if (knob < 0) {
        return;
    }
    (void)write(knob, "0", 1);
    close(knob);
}

// Give the guest a plausible wall-clock time when the kernel booted
// with an obviously unset clock (CLOCK_REALTIME at or below 1e9 s).
//
// Preferred source: `supermachine.host_time=<epoch seconds>` on the
// kernel command line. If that is absent or not above 1e9, fall back
// to a fixed non-zero epoch so nginx-class images do not see time 0.
// Does nothing when the clock already looks sane.
static void seed_wall_clock(void) {
    static const char key[] = "supermachine.host_time=";
    struct timespec current = {0};

    if (clock_gettime(CLOCK_REALTIME, &current) == 0 &&
        current.tv_sec > 1000000000L) {
        return;
    }

    // 1735689600 is 2025-01-01T00:00:00Z.
    // NOTE(review): an earlier comment labelled this value 2026-01-01
    // (which would be 1767225600) — confirm which date was intended.
    time_t seed = 1735689600;

    char cmdline[4096];
    ssize_t len = -1;
    int fd = open("/proc/cmdline", O_RDONLY | O_CLOEXEC);
    if (fd >= 0) {
        len = read(fd, cmdline, sizeof(cmdline) - 1);
        close(fd);
    }
    if (len > 0) {
        cmdline[len] = 0;
        const char *hit = strstr(cmdline, key);
        if (hit != NULL) {
            long long host_sec = strtoll(hit + (sizeof(key) - 1), NULL, 10);
            if (host_sec > 1000000000LL) {
                seed = (time_t)host_sec;
            }
        }
    }

    struct timespec target = {0};
    target.tv_sec = seed;
    target.tv_nsec = 0;
    if (clock_settime(CLOCK_REALTIME, &target) != 0) {
        fprintf(stderr, "init-oci: clock_settime: %s\n", strerror(errno));
    }
}

// Pull the env JSON via AF_VSOCK and set each K=V into the env.
// JSON shape: {"env":{"K":"V",...},"secrets":{"K":"V",...}}.
// We do a tiny hand-rolled parser — no JSON deps. It understands
// exactly that shape: two flat objects whose keys and values are JSON
// strings. String escapes (\" \\ \/ \b \f \n \r \t \uXXXX) are decoded.

// Skip JSON insignificant whitespace.
static char *env_json_skip_ws(char *p) {
    while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
    return p;
}

// Parse exactly four hex digits at `s` into `*out`. Returns 0 on
// success, -1 if any of the four characters is not a hex digit (a NUL
// terminator counts as "not hex", so this never reads past the end).
static int env_json_hex4(const char *s, unsigned *out) {
    unsigned v = 0;
    for (int i = 0; i < 4; i++) {
        unsigned char c = (unsigned char)s[i];
        unsigned d;
        if (c >= '0' && c <= '9') d = (unsigned)(c - '0');
        else if (c >= 'a' && c <= 'f') d = (unsigned)(c - 'a') + 10u;
        else if (c >= 'A' && c <= 'F') d = (unsigned)(c - 'A') + 10u;
        else return -1;
        v = (v << 4) | d;
    }
    *out = v;
    return 0;
}

// Write the UTF-8 encoding of code point `cp` at `dst`; returns the
// number of bytes written (1..4).
static size_t env_json_put_utf8(char *dst, unsigned cp) {
    if (cp < 0x80) {
        dst[0] = (char)cp;
        return 1;
    }
    if (cp < 0x800) {
        dst[0] = (char)(0xC0 | (cp >> 6));
        dst[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp < 0x10000) {
        dst[0] = (char)(0xE0 | (cp >> 12));
        dst[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        dst[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
    dst[0] = (char)(0xF0 | (cp >> 18));
    dst[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
    dst[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
    dst[3] = (char)(0x80 | (cp & 0x3F));
    return 4;
}

// Decode the JSON string literal whose opening quote is at
// `open_quote`. The text is unescaped IN PLACE (decoded output is
// never longer than its escaped form) and NUL-terminated; `*text` is
// set to its first byte. Returns a pointer just past the closing
// quote, or NULL if the literal is unterminated or malformed.
static char *env_json_string(char *open_quote, char **text) {
    char *src = open_quote + 1;
    char *dst = open_quote + 1;
    *text = dst;
    while (*src && *src != '"') {
        if (*src != '\\') {
            *dst++ = *src++;
            continue;
        }
        src++;  // consume the backslash
        switch (*src) {
        case 0:
            return NULL;
        case 'b': *dst++ = '\b'; src++; break;
        case 'f': *dst++ = '\f'; src++; break;
        case 'n': *dst++ = '\n'; src++; break;
        case 'r': *dst++ = '\r'; src++; break;
        case 't': *dst++ = '\t'; src++; break;
        case 'u': {
            unsigned cp;
            if (env_json_hex4(src + 1, &cp) != 0) return NULL;
            src += 5;  // 'u' + four hex digits
            if (cp >= 0xD800 && cp <= 0xDBFF && src[0] == '\\' && src[1] == 'u') {
                // High surrogate: combine with a following low surrogate.
                unsigned lo;
                if (env_json_hex4(src + 2, &lo) == 0 && lo >= 0xDC00 && lo <= 0xDFFF) {
                    cp = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
                    src += 6;
                }
            }
            if (cp >= 0xD800 && cp <= 0xDFFF) cp = 0xFFFD;  // lone surrogate
            // An environment string cannot hold NUL; drop \u0000.
            if (cp != 0) dst += env_json_put_utf8(dst, cp);
            break;
        }
        default:
            // \" \\ \/ — and, leniently, any unknown escape — keep the
            // escaped character itself.
            *dst++ = *src++;
            break;
        }
    }
    if (*src != '"') return NULL;
    *dst = 0;
    return src + 1;
}

// Find `quoted_key` (e.g. "\"env\"") in `json` and return a pointer
// just past the '{' that follows it, or NULL if either is missing.
static char *env_json_find_object(char *json, const char *quoted_key) {
    char *p = strstr(json, quoted_key);
    if (!p) return NULL;
    p = strchr(p, '{');
    return p ? p + 1 : NULL;
}

// Walk the members of a flat JSON object (`p` points just past its
// '{') and setenv() every "K":"V" pair, overwriting existing values.
// Stops at the closing '}', at end of input, or at the first member
// that is not string:string.
static void env_json_apply_object(char *p) {
    for (;;) {
        while (*p == ',' || *p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
        // '}' (including an empty object) or end of buffer ends the
        // walk. Checking this BEFORE looking for a quote is what keeps
        // an empty "env":{} from swallowing the following "secrets" key.
        if (*p != '"') return;

        char *key = NULL;
        char *val = NULL;
        p = env_json_string(p, &key);
        if (!p) return;
        p = env_json_skip_ws(p);
        if (*p != ':') return;
        p = env_json_skip_ws(p + 1);
        if (*p != '"') {
            fprintf(stderr, "init-oci: env JSON: non-string value, stopping\n");
            return;
        }
        p = env_json_string(p, &val);
        if (!p) return;

        // Log the key only — the value may be a secret.
        if (setenv(key, val, 1) != 0)
            fprintf(stderr, "init-oci: setenv(%s): %s\n", key, strerror(errno));
    }
}

// Connect to the host on AF_VSOCK port ENV_PORT, read the env JSON
// until EOF (at most 16 KiB - 1 bytes), and export every pair from the
// "env" object followed by every pair from the "secrets" object, so
// secrets win on a name clash. Connection failures are logged and
// leave the environment untouched.
static void fetch_and_set_env(void) {
    int s = socket(AF_VSOCK, SOCK_STREAM, 0);
    if (s < 0) { fprintf(stderr, "init-oci: socket(AF_VSOCK): %s\n", strerror(errno)); return; }
    struct sockaddr_vm a = {0};
    a.svm_family = AF_VSOCK; a.svm_cid = HOST_CID; a.svm_port = ENV_PORT;
    if (connect(s, (struct sockaddr*)&a, sizeof(a)) < 0) {
        fprintf(stderr, "init-oci: env connect: %s\n", strerror(errno));
        close(s); return;
    }
    char buf[16384];
    size_t total = 0;
    while (total < sizeof(buf) - 1) {
        ssize_t n = read(s, buf + total, sizeof(buf) - 1 - total);
        if (n <= 0) break;
        total += (size_t)n;
    }
    close(s);
    buf[total] = 0;
    if (total == sizeof(buf) - 1)
        fprintf(stderr, "init-oci: env payload filled the %zu-byte buffer; it may be truncated\n",
                sizeof(buf) - 1);

    // Locate both objects before decoding either one: decoding rewrites
    // the buffer in place and plants NUL terminators, which would stop
    // a later strstr() short.
    char *env = env_json_find_object(buf, "\"env\"");
    char *secrets = env_json_find_object(buf, "\"secrets\"");
    if (env) env_json_apply_object(env);
    if (secrets) env_json_apply_object(secrets);
}

// Load the workload command line from CMD_FILE into `argv`.
//
// The file is read (up to ARG_BUF - 1 bytes) into `out_buf`, which is
// then split in place: each token starts at the first non-whitespace
// character and runs to the next '\n' or '\r'. At most `max_argv - 1`
// tokens are stored, followed by a NULL terminator.
//
// If the file cannot be opened or yields no bytes, argv becomes
// { "/bin/sh", NULL } and 1 is returned. Otherwise returns the number
// of tokens stored (0 if the file held only whitespace).
static int read_cmd(char *out_buf, char **argv, int max_argv) {
    ssize_t len = -1;
    int fd = open(CMD_FILE, O_RDONLY);
    if (fd >= 0) {
        len = read(fd, out_buf, ARG_BUF - 1);
        close(fd);
    }
    if (len <= 0) {
        // Missing or empty command file: fall back to a shell.
        argv[0] = (char *)"/bin/sh";
        argv[1] = NULL;
        return 1;
    }
    out_buf[len] = 0;

    int count = 0;
    char *cur = out_buf;
    while (*cur != 0 && count < max_argv - 1) {
        cur += strspn(cur, " \t\n\r");   // leading whitespace
        if (*cur == 0) {
            break;
        }
        argv[count++] = cur;
        cur += strcspn(cur, "\n\r");     // token runs to end of line
        if (*cur != 0) {
            *cur++ = 0;
        }
    }
    argv[count] = NULL;
    return count;
}

// Returns 1 when `s` is a non-empty string made only of decimal
// digits, 0 otherwise (including for NULL and "").
static int all_digits(const char *s) {
    if (s == NULL || s[0] == '\0') {
        return 0;
    }
    size_t i = 0;
    while (s[i] != '\0') {
        // Cast first: isdigit() is undefined for negative char values.
        if (!isdigit((unsigned char)s[i])) {
            return 0;
        }
        i++;
    }
    return 1;
}

// Switch the calling process to the identity recorded in USER_FILE.
//
// The file holds one `user[:group]` spec; either part may be a name
// or a decimal id. No file, an unreadable file, or a blank spec means
// "stay as we are". An unknown user or group name, or a failing
// setgid()/setuid(), terminates the process with _exit(126).
static void drop_to_image_user(void) {
    char spec[256];

    int fd = open(USER_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) {
        return;
    }
    ssize_t len = read(fd, spec, sizeof(spec) - 1);
    close(fd);
    if (len <= 0) {
        return;
    }
    spec[len] = 0;

    // Strip trailing newline / CR / space / tab.
    for (; len > 0; len--) {
        char c = spec[len - 1];
        if (c != '\n' && c != '\r' && c != ' ' && c != '\t') {
            break;
        }
        spec[len - 1] = 0;
    }
    if (spec[0] == 0) {
        return;
    }

    // Split "user:group" in place; `group` stays NULL without a colon.
    char *group = strchr(spec, ':');
    if (group != NULL) {
        *group = 0;
        group++;
    }

    uid_t uid = 0;
    gid_t gid = 0;
    const char *login = NULL;  // passwd name, when one exists

    if (!all_digits(spec)) {
        struct passwd *entry = getpwnam(spec);
        if (entry == NULL) {
            fprintf(stderr, "init-oci: unknown user %s\n", spec);
            _exit(126);
        }
        uid = entry->pw_uid;
        gid = entry->pw_gid;
        login = entry->pw_name;
    } else {
        // Numeric uid: a passwd entry is optional. Without one the
        // primary gid defaults to 0 and no supplementary groups are
        // initialised.
        uid = (uid_t)strtoul(spec, NULL, 10);
        struct passwd *entry = getpwuid(uid);
        if (entry != NULL) {
            gid = entry->pw_gid;
            login = entry->pw_name;
        } else {
            gid = 0;
            login = NULL;
        }
    }

    // An explicit group overrides the user's primary gid.
    if (group != NULL && *group != 0) {
        if (!all_digits(group)) {
            struct group *gentry = getgrnam(group);
            if (gentry == NULL) {
                fprintf(stderr, "init-oci: unknown group %s\n", group);
                _exit(126);
            }
            gid = gentry->gr_gid;
        } else {
            gid = (gid_t)strtoul(group, NULL, 10);
        }
    }

    if (login != NULL) {
        (void)initgroups(login, gid);
    }
    // gid before uid: once the uid is dropped, setgid would no longer
    // be permitted.
    if (setgid(gid) != 0) {
        fprintf(stderr, "init-oci: setgid(%lu): %s\n",
                (unsigned long)gid, strerror(errno));
        _exit(126);
    }
    if (setuid(uid) != 0) {
        fprintf(stderr, "init-oci: setuid(%lu): %s\n",
                (unsigned long)uid, strerror(errno));
        _exit(126);
    }
}

// Decide whether the workload's stdio should stay on the PID-1
// console. Returns 1 when SUPERMACHINE_GUEST_STDIO is exactly one of
// "1", "true", "yes", "on", "console" or "inherit"; returns 0 when it
// is unset, empty, or anything else.
//
// Why /dev/null is the default: PL011 is byte-by-byte MMIO-emulated,
// so a request log line per HTTP hit (~600 B × thousands of req/s)
// saturates the vCPU on MMIO exits and tanks RPS by ~8x. An earlier
// "default to console" attempt was reverted after benching showed
// 37k -> 4.5k rps on nginx. Real workload logs need a faster channel
// (vsock capture); see docs/design/lifecycle-v2-2026-05-04.md.
static int guest_stdio_to_console(void) {
    static const char *const opt_in[] = {
        "1", "true", "yes", "on", "console", "inherit",
    };
    const char *setting = getenv("SUPERMACHINE_GUEST_STDIO");

    if (setting == NULL || setting[0] == '\0') {
        return 0;
    }
    for (size_t i = 0; i < sizeof(opt_in) / sizeof(opt_in[0]); i++) {
        if (strcmp(setting, opt_in[i]) == 0) {
            return 1;
        }
    }
    return 0;
}

// Point stdin, stdout and stderr of the calling process at /dev/null,
// unless guest_stdio_to_console() says to keep the console.
// Returns 0 on success (or when nothing had to change) and -1 if
// /dev/null could not be opened or any dup2() failed.
static int redirect_child_stdio(void) {
    static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };

    if (guest_stdio_to_console()) {
        return 0;
    }

    int null_fd = open("/dev/null", O_RDWR | O_CLOEXEC);
    if (null_fd < 0) {
        return -1;
    }

    int result = 0;
    for (size_t i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
        // Keep going after a failure so the remaining fds still get
        // redirected.
        if (dup2(null_fd, std_fds[i]) < 0) {
            result = -1;
        }
    }
    // Only close the descriptor if it is not itself one of 0/1/2.
    if (null_fd > STDERR_FILENO) {
        close(null_fd);
    }
    return result;
}

// Report a failed exec-related step in the child. The message goes to
// /dev/console when it can be opened and to stderr otherwise; `argv0`
// names what failed, and the errno value on entry is the reason shown.
static void report_child_exec_failure(const char *argv0) {
    const int reason = errno;  // capture before open() can overwrite it
    int console_fd = open("/dev/console", O_WRONLY | O_CLOEXEC);
    int target = (console_fd >= 0) ? console_fd : STDERR_FILENO;

    dprintf(target, "init-oci: child execvp(%s): %s\n",
            argv0, strerror(reason));
    if (target > STDERR_FILENO) {
        close(target);
    }
}

// Two-phase init.
//   Phase 1 (initramfs): /dev/vd[a-z] are squashfs layers attached
//     base→top. We compose an overlayfs:
//       lowerdir = top squashfs:...:base squashfs (read-only)
//       upperdir + workdir = tmpfs (writable, per-dispatch)
//     so the customer's program can `mkdir /var/log/nginx` etc.
//     without us needing per-image knowledge of which paths are
//     writable. Then switch_root + exec /init from the overlay.
//   Phase 2 (overlay rootfs): /.supermachine-cmd is present. Fetch env,
//     exec the customer program.
//
// Returns 0 when no pivot was performed (already in the overlay, no
// /dev/vda, or no squashfs layer could be mounted) and -1 when a step
// of the pivot failed. On success it does not return: the process
// image is replaced by /init from the overlay via execve().
static int try_pivot_to_overlay(void) {
    struct stat st;
    if (stat("/.supermachine-cmd", &st) == 0) return 0;  // already in overlay
    if (stat("/dev/vda", &st) != 0) return 0;        // no rootfs disk

    // Mount each consecutive /dev/vdX as squashfs at /lower-<index>,
    // stopping at the first missing device or failed mount.
    int n_layers = 0;
    for (char letter = 'a'; letter <= 'z'; letter++) {
        int i = letter - 'a';
        char dev_path[16];
        snprintf(dev_path, sizeof dev_path, "/dev/vd%c", letter);
        if (stat(dev_path, &st) != 0) break;

        char lower_path[32];
        snprintf(lower_path, sizeof lower_path, "/lower-%d", i);
        mkdir(lower_path, 0755);
        // Stop on the first non-squashfs device. The bake pipeline
        // orders volumes (--volume HOST:GUEST, ext4) AFTER the
        // image's squashfs layers, so a failed squashfs mount
        // means we've reached the volume range. Volumes get
        // mounted later by `mount_volumes()` post-pivot.
        if (mount(dev_path, lower_path, "squashfs", MS_RDONLY, NULL) != 0) {
            rmdir(lower_path);
            break;
        }
        n_layers++;
    }
    if (n_layers == 0) return 0;

    // One tmpfs backs both the overlay's upperdir and its workdir.
    mkdir("/upper", 0755);
    mkdir("/newroot", 0755);
    if (mount("tmpfs", "/upper", "tmpfs", 0, NULL) != 0) {
        fprintf(stderr, "init-oci: mount tmpfs upper: %s\n", strerror(errno));
        return -1;
    }
    mkdir("/upper/upper", 0755);
    mkdir("/upper/work", 0755);
    // Build "lowerdir=/lower-<n-1>:...:/lower-0,upperdir=...,workdir=...".
    // The highest-numbered layer is listed first. At most 26 layers
    // exist, so the option string stays well inside `opts`.
    char opts[1024];
    int off = snprintf(opts, sizeof opts, "lowerdir=");
    for (int i = n_layers - 1; i >= 0; i--) {
        off += snprintf(opts + off, sizeof opts - off,
                        "/lower-%d%s", i, i > 0 ? ":" : "");
    }
    snprintf(opts + off, sizeof opts - off,
             ",upperdir=/upper/upper,workdir=/upper/work");
    if (mount("overlay", "/newroot", "overlay", 0, opts) != 0) {
        fprintf(stderr, "init-oci: mount overlay: %s\n", strerror(errno));
        return -1;
    }
    // Move pseudo-fs into /newroot so child init finds them.
    // These three moves are best-effort: their results are not checked.
    mount("/dev", "/newroot/dev", NULL, MS_MOVE, NULL);
    mount("/proc", "/newroot/proc", NULL, MS_MOVE, NULL);
    mount("/sys", "/newroot/sys", NULL, MS_MOVE, NULL);
    // Root switch: chdir into the new root, move that mount onto "/",
    // then chroot(".") so "/" resolves inside the overlay.
    if (chdir("/newroot") != 0) {
        fprintf(stderr, "init-oci: chdir /newroot: %s\n", strerror(errno));
        return -1;
    }
    if (mount(".", "/", NULL, MS_MOVE, NULL) != 0) {
        fprintf(stderr, "init-oci: mount move /: %s\n", strerror(errno));
        return -1;
    }
    if (chroot(".") != 0) {
        fprintf(stderr, "init-oci: chroot: %s\n", strerror(errno));
        return -1;
    }
    chdir("/");
    // Re-exec /init from the overlay with an empty environment; that
    // second run finds /.supermachine-cmd and takes the phase-2 path.
    char *argv[] = { (char *)"/init", NULL };
    char *envp[] = { NULL };
    execve("/init", argv, envp);
    // Only reached if execve() failed.
    fprintf(stderr, "init-oci: execve /init (overlay): %s\n", strerror(errno));
    return -1;
}

// Apply the hostname chosen at bake time. `--hostname HOSTNAME` on
// the bake CLI ends up as a single line in HOSTNAME_FILE (delta
// squashfs); it is read here and passed to sethostname() before the
// workload starts, so `uname -n`, prompts and log lines show the
// expected name. Trailing newline, CR and space characters are
// stripped. Does nothing if the file is missing, unreadable or blank.
static void apply_hostname(void) {
    char name[128];

    int fd = open(HOSTNAME_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) {
        return;
    }
    ssize_t len = read(fd, name, sizeof(name) - 1);
    close(fd);
    if (len <= 0) {
        return;
    }
    name[len] = 0;

    for (; len > 0; len--) {
        char c = name[len - 1];
        if (c != '\n' && c != '\r' && c != ' ') {
            break;
        }
        name[len - 1] = 0;
    }
    if (len == 0) {
        return;
    }

    if (sethostname(name, (size_t)len) != 0) {
        fprintf(stderr, "init-oci: sethostname(%s): %s\n", name, strerror(errno));
    }
}

// Mount the writable volumes that are attached as virtio-blk devices
// after the read-only image layers.
//
// /.supermachine-volumes (written by the bake pipeline) lists one
// absolute guest mount path per line. Volumes follow the layers in
// /dev/vd*, so with N paths the LAST N devices are the volumes, in
// file order. The host has already formatted each one as ext4; this
// function only creates the mount point and mounts.
//
// A failure on one volume is logged and the loop moves on: the
// workload may not need that mount, and a partially mounted boot is
// easier to debug than a hard stop. At most 16 paths are honoured.
static void mount_volumes(void) {
    enum { MAX_VOLUMES = 16, MAX_DISKS = 26 };
    char list[4096];
    char *targets[MAX_VOLUMES];
    int n_targets = 0;

    int list_fd = open("/.supermachine-volumes", O_RDONLY | O_CLOEXEC);
    if (list_fd < 0) {
        return;  // no volumes configured
    }
    ssize_t len = read(list_fd, list, sizeof(list) - 1);
    close(list_fd);
    if (len <= 0) {
        return;
    }
    list[len] = 0;

    // Split the file into lines in place, skipping empty ones.
    for (char *line = list; *line && n_targets < MAX_VOLUMES; ) {
        char *nl = strchr(line, '\n');
        if (nl) {
            *nl = 0;
        }
        if (*line) {
            targets[n_targets++] = line;
        }
        if (!nl) {
            break;
        }
        line = nl + 1;
    }
    if (n_targets == 0) {
        return;
    }

    // Count consecutive /dev/vda, /dev/vdb, ... up to /dev/vdz.
    int n_disks = 0;
    while (n_disks < MAX_DISKS) {
        struct stat st;
        char dev[16];
        snprintf(dev, sizeof dev, "/dev/vd%c", 'a' + n_disks);
        if (stat(dev, &st) != 0) {
            break;
        }
        n_disks++;
    }
    if (n_targets > n_disks) {
        fprintf(stderr, "init-oci: %d volumes requested but only %d /dev/vd* devices\n",
                n_targets, n_disks);
        return;
    }
    const int first_volume = n_disks - n_targets;

    for (int v = 0; v < n_targets; v++) {
        const char *target = targets[v];
        char dev[16];
        snprintf(dev, sizeof dev, "/dev/vd%c", 'a' + first_volume + v);

        // Cheap path first: a plain mkdir covers the usual one- or
        // two-component mount point. Only if that fails (other than
        // "already exists") create each parent in turn and retry.
        if (mkdir(target, 0755) < 0 && errno != EEXIST) {
            char prefix[4096];
            snprintf(prefix, sizeof prefix, "%s", target);
            for (char *sep = strchr(prefix + 1, '/'); sep != NULL;
                 sep = strchr(sep + 1, '/')) {
                *sep = 0;
                mkdir(prefix, 0755);
                *sep = '/';
            }
            mkdir(target, 0755);
        }

        if (mount(dev, target, "ext4", 0, NULL) >= 0) {
            fprintf(stderr, "init-oci: mounted %s -> %s (ext4, rw)\n", dev, target);
        } else {
            fprintf(stderr, "init-oci: mount %s -> %s ext4: %s\n",
                    dev, target, strerror(errno));
        }
    }
}

// Start the in-guest exec agent: a Rust binary at
// `/supermachine-agent`, shipped via the bake's delta squashfs (see
// `bake.rs::ensure_supermachine_agent`). The agent binds AF_VSOCK
// port EXEC_PORT (1028) and serves the framed exec protocol described
// in `docs/design/exec-2026-05-03.md`.
//
// Lifecycle: main() forks it before printing the heartbeat marker, so
// that by the time the snapshot fires (on listener-ready) the agent is
// already parked in `accept()` — capture-clean state.
//
// A missing binary (e.g. an older bake from before the agent feature
// landed) is logged and skipped so the workload still boots. The
// host's `<vsock>-exec.sock` then dials a port with no listener and
// gets an immediate EOF, which tooling treats as "exec not available
// on this snapshot."
static void spawn_exec_agent(void) {
    struct stat probe;

    if (stat("/supermachine-agent", &probe) != 0) {
        fprintf(stderr, "exec-agent: /supermachine-agent missing — exec disabled\n");
        return;
    }

    pid_t agent = fork();
    if (agent > 0) {
        // Parent: just announce the child and carry on booting.
        fprintf(stderr, "exec-agent: spawned pid=%d\n", agent);
        return;
    }
    if (agent < 0) {
        fprintf(stderr, "exec-agent: fork: %s\n", strerror(errno));
        return;
    }

    // Child. execvp passes on the global `environ`, i.e. pid 1's full
    // environment as populated by fetch_and_set_env (the OCI image's
    // ENV directives plus the host's --env overrides). Without that
    // the agent and everything it spawns would start with an empty
    // env, and images such as rust:1-slim that rely on PATH /
    // RUSTUP_HOME / CARGO_HOME would break out of the box.
    char *agent_argv[] = { (char *)"/supermachine-agent", NULL };
    execvp("/supermachine-agent", agent_argv);
    fprintf(stderr, "exec-agent: execvp: %s\n", strerror(errno));
    _exit(127);
}

// pid 1 entry point. Boot sequence:
//   1. mount pseudo-filesystems, then pivot into the overlay rootfs if
//      we are still in the initramfs (that path re-execs /init);
//   2. create runtime files, seed the clock, set the hostname, fetch
//      the environment from the host;
//   3. best-effort load of the snapshot-park module, start the exec
//      agent;
//   4. print the heartbeat marker (snapshot point), THEN mount volumes;
//   5. read the command, apply the optional workdir, and hand over to
//      supervise(), which runs the workload as a child.
int main(void) {
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);
    {
        // Length is derived from the literal. A hand-counted length
        // here once overshot the string and wrote bytes past its end
        // to the console.
        static const char hello[] = "init-oci: hello from pid 1\n";
        (void)write(1, hello, sizeof(hello) - 1);
    }

    mount_pseudofs();
    if (try_pivot_to_overlay() < 0) die("pivot");
    // If we get here, we're in the squashfs (or no vda existed).
    ensure_runtime_files();
    seed_wall_clock();
    apply_hostname();
    fetch_and_set_env();

    // Best-effort: load the snapshot-park kernel module if the
    // bake staged it at /supermachine-smpark.ko. Used by the
    // host's multi-vCPU snapshot path to drive secondaries into
    // a known parked-WFI state before capture. Single-vCPU bakes
    // don't need this; the module no-ops when num_online_cpus()
    // is 1.
    //
    // Failure modes (boot continues regardless):
    //   * /supermachine-smpark.ko absent — single-vCPU bake or
    //     this kernel ships pre-smpark. Skipped silently.
    //   * vermagic mismatch — kernel rebuilt without rebuilding
    //     the module. The load error is logged; snapshot-park is
    //     unavailable for this run and multi-vCPU snapshots fall
    //     back to the existing intermittent capture path.
    {
        struct stat smpst;
        if (stat("/supermachine-smpark.ko", &smpst) == 0) {
            int fd = open("/supermachine-smpark.ko", O_RDONLY | O_CLOEXEC);
            if (fd >= 0) {
                /* finit_module is the Linux 3.8+ interface that
                 * loads from an fd directly — no userspace
                 * malloc + read of the .ko bytes. */
                long rc = syscall(__NR_finit_module, fd, "", 0);
                if (rc < 0)
                    fprintf(stderr,
                            "init-oci: load smpark.ko: %s\n",
                            strerror(errno));
                close(fd);
            }
        }
    }

    spawn_exec_agent();

    // Heartbeat marker for the VMM's --snapshot-at trigger. The
    // line-aware detector in PL011 (`devices/serial.rs`) requires
    // the full literal `heartbeat counter=N\n` — the previous
    // hard-coded length truncated the trailing `er=1\n`, which
    // was harmless when nothing relied on the heartbeat firing
    // promptly but breaks `--snapshot-at` for volume mode.
    //
    // CRITICAL ORDERING: volumes are mounted AFTER this marker so
    // the snapshot captures *pre-mount* state. Each restore then
    // mounts the volume fresh, picking up the host file's current
    // contents. Mounting pre-snapshot would freeze the in-guest
    // page cache to the bake-time empty filesystem; subsequent
    // runs that wrote to the host file would see EBADMSG when
    // ext4's superblock cache disagreed with the on-disk state.
    {
        const char *m = "[SUPERMACHINE-INIT] heartbeat counter=1\n";
        (void)write(1, m, strlen(m));
    }
    mount_volumes();

    // `buf` backs every argv[] token, so it must outlive this frame's
    // locals for as long as the workload is being launched; static
    // storage also keeps 64 KiB off pid 1's stack.
    static char buf[ARG_BUF];
    char *argv[MAX_ARGS];
    int argc = read_cmd(buf, argv, MAX_ARGS);
    if (argc < 1) die("read_cmd");

    // Optional workdir.
    int wfd = open(WD_FILE, O_RDONLY);
    if (wfd >= 0) {
        char wd[4096]; ssize_t n = read(wfd, wd, sizeof(wd) - 1); close(wfd);
        if (n > 0) {
            wd[n] = 0;
            // Trim trailing newline.
            while (n > 0 && (wd[n-1] == '\n' || wd[n-1] == '\r')) wd[--n] = 0;
            if (chdir(wd) < 0)
                fprintf(stderr, "init-oci: chdir(%s): %s\n", wd, strerror(errno));
        }
    }

    fprintf(stderr, "init-oci: exec");
    for (int i = 0; i < argc; i++) fprintf(stderr, " %s", argv[i]);
    fprintf(stderr, "\n");

    // Run customer cmd as a CHILD; init stays PID 1 to play the
    // role a real init system plays — reap zombies, forward signals.
    // Programs not designed to be PID 1 (nginx, postgres, anything
    // that fork()s and expects the parent to wait for SIGCHLD) blow
    // up otherwise.
    return supervise(argv);
}

// PID-1 supervisor. Mirrors the responsibilities tini / dumb-init /
// libkrun-init handle. Without these, daemons like nginx that fork
// workers SIGSEGV the moment they fork because nobody reaps the
// dead worker and the master tries to talk to a process the kernel
// has already cleaned up under it.
// pid of the supervised workload; 0 (or negative) means "none".
// Written by supervise(), read from the signal handlers.
static volatile pid_t g_child_pid = 0;

// Signal handler: relay `sig` to the supervised workload, if any.
//
// errno is saved and restored because kill() may set it, and this
// handler can interrupt code that is about to inspect errno after a
// failed call of its own. The pid is read once so the check and the
// kill() act on the same value.
static void forward_signal(int sig) {
    int saved_errno = errno;
    pid_t target = g_child_pid;
    if (target > 0) kill(target, sig);
    errno = saved_errno;
}

// SIGCHLD handler: collect every child that has already exited,
// without blocking, and discard the exit statuses.
//
// waitpid(-1) does not discriminate, so this also collects the main
// workload child if it is waitable while the handler runs. Code that
// needs the workload's exit status must make sure it claims that child
// itself (for instance by keeping SIGCHLD blocked until it has).
// errno is preserved across the handler.
static void reap_zombies(int sig) {
    const int errno_on_entry = errno;
    pid_t reaped;

    (void)sig;
    while ((reaped = waitpid(-1, NULL, WNOHANG)) > 0) {
        if (reaped == g_child_pid) {
            // The main child was collected here; nothing is recorded,
            // so its status is lost to the supervisor. Tolerated.
        }
    }
    errno = errno_on_entry;
}

// PID-1 supervisor: run `argv` as a child, forward signals to it, reap
// every other exited child, and park forever once the workload has
// finished. Never returns (pid 1 must not exit — the kernel panics);
// the `int` return type exists only so main() can `return supervise()`.
//
// SIGCHLD stays BLOCKED from before fork() until the workload's exit
// status has been collected. Otherwise the asynchronous reap_zombies()
// handler, which calls waitpid(-1), can claim the workload first — for
// example when exec fails right after fork — and the wait below would
// get ECHILD and report a bogus "child exited 0". While blocked, other
// children are reaped synchronously by the wait loop; the handler
// takes over once pid 1 is parked.
static int supervise(char **argv) {
    // Handlers are installed before fork(); the child resets them all
    // to SIG_DFL again before exec.
    struct sigaction sa = {0};
    sa.sa_handler = reap_zombies;
    sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    sigaction(SIGCHLD, &sa, NULL);

    sa.sa_handler = forward_signal;
    sa.sa_flags = SA_RESTART;
    for (int s = 1; s <= 31; s++) {
        if (s == SIGKILL || s == SIGSTOP || s == SIGCHLD) continue;
        sigaction(s, &sa, NULL);
    }

    sigset_t chld_mask, prev_mask;
    sigemptyset(&chld_mask);
    sigaddset(&chld_mask, SIGCHLD);
    sigprocmask(SIG_BLOCK, &chld_mask, &prev_mask);

    pid_t pid = fork();
    if (pid < 0) die("fork");
    if (pid == 0) {
        // Child: restore default signal handlers, become its own
        // process group leader so signal forwarding works cleanly.
        for (int s = 1; s <= 31; s++) signal(s, SIG_DFL);
        // The signal mask survives exec, so hand the workload the mask
        // pid 1 had before SIGCHLD was blocked above.
        sigprocmask(SIG_SETMASK, &prev_mask, NULL);
        setpgid(0, 0);
        drop_to_image_user();
        if (redirect_child_stdio() != 0)
            report_child_exec_failure("redirect stdio");
        execvp(argv[0], argv);
        report_child_exec_failure(argv[0]);
        _exit(127);
    }
    g_child_pid = pid;
    // Also set the group from the parent side, so it is in place no
    // matter which of the two processes runs first.
    setpgid(pid, pid);

    // Publish the workload pid so the in-guest exec agent can find
    // it on a CONTROL "signal" request from the host. /run is
    // tmpfs-mounted in `mount_pseudofs`, so the file vanishes on
    // VM shutdown — no stale pids leaking into a fresh restore.
    {
        int fd = open("/run/supermachine-workload.pid",
                      O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
        if (fd >= 0) {
            char buf[32];
            int n = snprintf(buf, sizeof(buf), "%d\n", (int)pid);
            if (n > 0) {
                (void)write(fd, buf, n);
            }
            close(fd);
        }
    }

    // Wait for any child and keep going until it is THE workload.
    // Orphans that exit in the meantime are reaped right here, since
    // the SIGCHLD handler cannot run while the signal is blocked.
    // Forwarded signals use SA_RESTART; EINTR is retried regardless.
    int status = 0;
    int have_status = 0;
    for (;;) {
        int st = 0;
        pid_t r = waitpid(-1, &st, 0);
        if (r == pid) {
            status = st;
            have_status = 1;
            break;
        }
        if (r > 0) continue;               // some other child; reaped
        if (errno == EINTR) continue;
        fprintf(stderr, "init-oci: waitpid: %s\n", strerror(errno));
        break;
    }

    // The workload is gone: stop forwarding signals to its pid, which
    // the kernel may hand to an unrelated process later.
    g_child_pid = 0;
    // From here on the handler reaps whatever else exits (for example
    // processes started by the exec agent).
    sigprocmask(SIG_UNBLOCK, &chld_mask, NULL);

    int code;
    if (have_status && WIFEXITED(status)) {
        code = WEXITSTATUS(status);
        fprintf(stderr, "init-oci: child exited %d\n", code);
    } else if (have_status && WIFSIGNALED(status)) {
        code = 128 + WTERMSIG(status);
        fprintf(stderr, "init-oci: child killed by signal %d\n",
            WTERMSIG(status));
    } else {
        // No usable status (waitpid failed, already logged above).
        code = 1;
    }

    // Sync filesystems, halt. As PID 1 we mustn't return — kernel
    // panics otherwise. sleep forever so the operator can inspect
    // the VM (or pool-worker can issue another RESTORE).
    sync();
    fprintf(stderr, "init-oci: parking PID 1 (exit=%d)\n", code);
    for (;;) pause();
}