#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <pwd.h>
#include <grp.h>
#include <ctype.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <linux/vm_sockets.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <linux/if_addr.h>
#include <net/if.h>
#include <netinet/in.h>
#include "warmup_amd64.h"
static int supervise(char **argv);
#define HOST_CID 2
#define ENV_PORT 1026
#define EXEC_PORT 1028
#define CMD_FILE "/.supermachine-cmd"
#define WD_FILE "/.supermachine-workdir"
#define USER_FILE "/.supermachine-user"
#define HOSTNAME_FILE "/.supermachine-hostname"
#define MAX_ARGS 64
#define ARG_BUF 65536
/*
 * Report a fatal error as "init-oci: <msg>: <strerror(errno)>", sleep for a
 * day, then exit with status 1.  The long sleep presumably keeps the console
 * log readable before PID 1 goes away — confirm with the VMM's expectations.
 */
static void die(const char *msg) {
    const char *reason = strerror(errno);

    fprintf(stderr, "init-oci: %s: %s\n", msg, reason);
    sleep(86400);
    exit(1);
}
static void bring_up_loopback(void) {
int s = socket(AF_INET, SOCK_DGRAM, 0);
if (s < 0) {
fprintf(stderr, "init-oci: loopback: socket: %s\n", strerror(errno));
return;
}
struct ifreq ifr;
memset(&ifr, 0, sizeof(ifr));
strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
fprintf(stderr, "init-oci: loopback: SIOCGIFFLAGS: %s\n", strerror(errno));
close(s);
return;
}
ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
fprintf(stderr, "init-oci: loopback: SIOCSIFFLAGS: %s\n", strerror(errno));
close(s);
return;
}
close(s);
fprintf(stderr, "init-oci: loopback up\n");
}
/*
 * Write "0 4294967295" to net.ipv4.ping_group_range.  Best effort: a missing
 * sysctl file or a failed write is silently ignored.
 */
static void open_ping_group_range(void) {
    static const char range[] = "0 4294967295\n";
    int fd = open("/proc/sys/net/ipv4/ping_group_range",
                  O_WRONLY | O_CLOEXEC);

    if (fd >= 0) {
        (void)write(fd, range, strlen(range));
        close(fd);
    }
}
/*
 * If the kernel command line contains "supermachine.host_ipv6=0", write "1"
 * to the all/default/lo disable_ipv6 sysctls.  Any other value, or a missing
 * key, leaves IPv6 untouched.  Sysctl files that cannot be opened are skipped.
 */
static void apply_ipv6_policy(void) {
    static const char key[] = "supermachine.host_ipv6=";
    static const char *const knobs[] = {
        "/proc/sys/net/ipv6/conf/all/disable_ipv6",
        "/proc/sys/net/ipv6/conf/default/disable_ipv6",
        "/proc/sys/net/ipv6/conf/lo/disable_ipv6",
    };
    char cmdline[4096];

    int fd = open("/proc/cmdline", O_RDONLY | O_CLOEXEC);
    if (fd < 0) return;
    ssize_t got = read(fd, cmdline, sizeof(cmdline) - 1);
    close(fd);
    if (got <= 0) return;
    cmdline[got] = 0;

    /* Only the first character after '=' is inspected. */
    const char *hit = strstr(cmdline, key);
    if (hit == NULL || hit[sizeof(key) - 1] != '0') return;

    for (size_t k = 0; k < sizeof(knobs) / sizeof(knobs[0]); k++) {
        int sf = open(knobs[k], O_WRONLY | O_CLOEXEC);
        if (sf >= 0) {
            (void)write(sf, "1\n", 2);
            close(sf);
        }
    }
    fprintf(stderr, "init-oci: IPv6 disabled (host has no v6 route)\n");
}
/*
 * Send one netlink request on socket `s` and read a single reply datagram.
 *
 * Returns 0 when the reply is an ACK (NLMSG_ERROR with error == 0) or any
 * other well-formed message.  Returns -1 with errno set when send/recv fail,
 * when the reply is too short to hold a complete netlink header or error
 * payload (errno = EBADMSG), or when the kernel reports an error (errno is
 * the positive error code from the reply).
 */
static int nl_send_recv(int s, struct nlmsghdr *nlh) {
    if (send(s, nlh, nlh->nlmsg_len, 0) < 0) {
        return -1;
    }

    /* The union gives the receive buffer the alignment struct nlmsghdr needs. */
    union {
        struct nlmsghdr hdr;
        char bytes[1024];
    } reply;

    ssize_t got = recv(s, reply.bytes, sizeof(reply.bytes), 0);
    if (got < 0) {
        return -1;
    }
    /* Reject empty/short datagrams before touching any header field. */
    if (!NLMSG_OK(&reply.hdr, (int)got)) {
        errno = EBADMSG;
        return -1;
    }

    if (reply.hdr.nlmsg_type == NLMSG_ERROR) {
        if (reply.hdr.nlmsg_len < NLMSG_LENGTH(sizeof(int))) {
            errno = EBADMSG;
            return -1;
        }
        struct nlmsgerr *err = NLMSG_DATA(&reply.hdr);
        if (err->error != 0) {
            errno = -err->error;
            return -1;
        }
    }
    return 0;
}
/*
 * Append one rtattr (type + payload) at the aligned end of the message and
 * grow nlh->nlmsg_len accordingly.
 *
 *   nlh     message under construction; its buffer is `maxlen` bytes long
 *   maxlen  total capacity of the buffer that starts at `nlh`
 *   type    rtattr type
 *   data    payload bytes (may be NULL only when dlen == 0)
 *   dlen    payload length in bytes
 *
 * If the attribute would not fit in `maxlen`, or its length cannot be
 * represented in the 16-bit rta_len field, nothing is written, the message is
 * left unchanged and a diagnostic is printed.
 */
static void nl_add_attr(struct nlmsghdr *nlh, size_t maxlen,
                        int type, const void *data, size_t dlen) {
    size_t start = NLMSG_ALIGN(nlh->nlmsg_len);

    /* Check dlen first so RTA_LENGTH(dlen) below cannot wrap. */
    if (dlen > 0xFFFFu - RTA_LENGTH(0) || start > maxlen ||
        RTA_ALIGN(RTA_LENGTH(dlen)) > maxlen - start) {
        fprintf(stderr, "init-oci: netlink attr %d (%zu bytes) does not fit; dropped\n",
                type, dlen);
        return;
    }

    size_t add = RTA_LENGTH(dlen);
    struct rtattr *rta = (struct rtattr *)((char *)nlh + start);
    rta->rta_type = type;
    rta->rta_len = (unsigned short)add;
    if (dlen > 0) {
        memcpy(RTA_DATA(rta), data, dlen);
    }
    /* Zero the alignment padding so no stale bytes go out on the wire. */
    memset((char *)rta + add, 0, RTA_ALIGN(add) - add);
    nlh->nlmsg_len = (unsigned int)(start + RTA_ALIGN(add));
}
/*
 * Open a nested attribute: write an empty rtattr header of `type` at the
 * aligned end of the message, extend nlmsg_len past it, and return the header
 * so nl_nest_end() can later patch in the final length.  No bounds check is
 * performed; the caller's buffer must have room.
 */
static struct rtattr *nl_nest_start(struct nlmsghdr *nlh, int type) {
    unsigned int start = NLMSG_ALIGN(nlh->nlmsg_len);
    struct rtattr *nest = (struct rtattr *)((char *)nlh + start);

    nest->rta_len = RTA_LENGTH(0);
    nest->rta_type = type;
    nlh->nlmsg_len = start + RTA_ALIGN(nest->rta_len);
    return nest;
}
/*
 * Close a nested attribute opened with nl_nest_start(): its rta_len becomes
 * the distance from the nest header to the current end of the message.
 */
static void nl_nest_end(struct nlmsghdr *nlh, struct rtattr *nest) {
    const char *msg_end = (const char *)nlh + nlh->nlmsg_len;

    nest->rta_len = (unsigned short)(msg_end - (const char *)nest);
}
/*
 * Create a veth pair "smnet0"/"smnet0p" over rtnetlink, assign 198.18.0.1/30
 * to smnet0 and set it IFF_UP.  Judging by the final log line, the purpose is
 * to make AI_ADDRCONFIG-style resolver checks see a configured non-loopback
 * IPv4 address — NOTE(review): confirm intent with the author.
 *
 * Failure to create the link aborts the function; failures while adding the
 * address or raising the link are only logged, and the closing "up" message
 * is printed regardless.
 */
static void create_fake_iface(void) {
int s = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
if (s < 0) {
fprintf(stderr, "init-oci: fake-iface: netlink socket: %s\n", strerror(errno));
return;
}
struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
if (bind(s, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
close(s);
return;
}
const char *ifname = "smnet0";
const char *peername = "smnet0p";
/* Request 1: RTM_NEWLINK creating the veth pair (fails if it already exists). */
{
char buf[1024] = {0};
struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
nlh->nlmsg_type = RTM_NEWLINK;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL;
nlh->nlmsg_seq = 1;
struct ifinfomsg *ifi = NLMSG_DATA(nlh);
ifi->ifi_family = AF_UNSPEC;
nl_add_attr(nlh, sizeof(buf), IFLA_IFNAME, ifname, strlen(ifname) + 1);
/* IFLA_LINKINFO { IFLA_INFO_KIND="veth", IFLA_INFO_DATA { <1> { ifinfomsg, IFLA_IFNAME } } } */
struct rtattr *linkinfo = nl_nest_start(nlh, IFLA_LINKINFO);
/* Kind is sent as 4 bytes, i.e. without the trailing NUL. */
nl_add_attr(nlh, sizeof(buf), IFLA_INFO_KIND, "veth", 4);
struct rtattr *infodata = nl_nest_start(nlh, IFLA_INFO_DATA);
/* Attribute type 1 — presumably VETH_INFO_PEER from <linux/veth.h>; TODO confirm. */
struct rtattr *peer = nl_nest_start(nlh, 1 );
/* The peer nest carries its own ifinfomsg header, appended by hand here. */
struct ifinfomsg *peer_ifi =
(struct ifinfomsg *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));
memset(peer_ifi, 0, sizeof(*peer_ifi));
peer_ifi->ifi_family = AF_UNSPEC;
nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
NLMSG_ALIGN(sizeof(struct ifinfomsg));
nl_add_attr(nlh, sizeof(buf), IFLA_IFNAME, peername, strlen(peername) + 1);
/* Close nests innermost-first so each length covers its children. */
nl_nest_end(nlh, peer);
nl_nest_end(nlh, infodata);
nl_nest_end(nlh, linkinfo);
if (nl_send_recv(s, nlh) < 0) {
fprintf(stderr, "init-oci: fake-iface: create veth: %s\n", strerror(errno));
close(s);
return;
}
}
int ifindex = if_nametoindex(ifname);
if (ifindex == 0) {
close(s);
return;
}
/* Request 2: RTM_NEWADDR assigning 198.18.0.1/30 to smnet0. */
{
char buf[256] = {0};
struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
nlh->nlmsg_type = RTM_NEWADDR;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_REPLACE;
nlh->nlmsg_seq = 2;
struct ifaddrmsg *ifa = NLMSG_DATA(nlh);
ifa->ifa_family = AF_INET;
ifa->ifa_prefixlen = 30;
ifa->ifa_flags = 0;
ifa->ifa_scope = RT_SCOPE_LINK;
ifa->ifa_index = (unsigned int)ifindex;
unsigned char addr[4] = { 198, 18, 0, 1 };
nl_add_attr(nlh, sizeof(buf), IFA_LOCAL, addr, 4);
nl_add_attr(nlh, sizeof(buf), IFA_ADDRESS, addr, 4);
if (nl_send_recv(s, nlh) < 0) {
fprintf(stderr, "init-oci: fake-iface: add addr: %s\n", strerror(errno));
}
}
/* Request 3: RTM_NEWLINK setting IFF_UP on smnet0 (the peer is not touched). */
{
char buf[128] = {0};
struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
nlh->nlmsg_type = RTM_NEWLINK;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
nlh->nlmsg_seq = 3;
struct ifinfomsg *ifi = NLMSG_DATA(nlh);
ifi->ifi_family = AF_UNSPEC;
ifi->ifi_index = ifindex;
ifi->ifi_flags = IFF_UP;
ifi->ifi_change = IFF_UP;
if (nl_send_recv(s, nlh) < 0) {
fprintf(stderr, "init-oci: fake-iface: link up: %s\n", strerror(errno));
}
}
close(s);
/* NOTE(review): printed even when request 2 or 3 failed above. */
fprintf(stderr, "init-oci: fake iface %s (198.18.0.1/30) up — AI_ADDRCONFIG ok\n", ifname);
}
/*
 * Create the standard mount points and mount proc, sysfs, devtmpfs, devpts,
 * tmpfs (on /dev/shm, /tmp, /run) and cgroup2.  mkdir() results are ignored;
 * for proc/sysfs/devtmpfs/cgroup2 an EBUSY mount error is treated as "already
 * mounted" and not logged.  The tmpfs mounts are unchecked.
 */
static void mount_pseudofs(void) {
mkdir("/proc", 0755);
mkdir("/sys", 0755);
mkdir("/dev", 0755);
if (mount("proc", "/proc", "proc", 0, NULL) < 0 && errno != EBUSY)
fprintf(stderr, "init-oci: mount proc: %s\n", strerror(errno));
if (mount("sysfs", "/sys", "sysfs", 0, NULL) < 0 && errno != EBUSY)
fprintf(stderr, "init-oci: mount sysfs: %s\n", strerror(errno));
if (mount("devtmpfs", "/dev", "devtmpfs", 0, NULL) < 0 && errno != EBUSY)
fprintf(stderr, "init-oci: mount devtmpfs: %s\n", strerror(errno));
/* /dev/pts must be created after devtmpfs is mounted on /dev. */
mkdir("/dev/pts", 0755);
if (mount("devpts", "/dev/pts", "devpts", 0,
"newinstance,ptmxmode=0666,mode=0620,gid=5") < 0)
fprintf(stderr, "init-oci: mount devpts: %s\n", strerror(errno));
mkdir("/dev/shm", 01777);
/* NOTE(review): ensure_runtime_files() later mounts another tmpfs (size=64m) on /dev/shm. */
mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=256m,mode=1777");
mkdir("/tmp", 01777); chmod("/tmp", 01777);
mkdir("/run", 0755);
mount("tmpfs", "/tmp", "tmpfs", 0, NULL);
mount("tmpfs", "/run", "tmpfs", 0, NULL);
mkdir("/sys/fs/cgroup", 0755);
if (mount("cgroup2", "/sys/fs/cgroup", "cgroup2", 0, NULL) < 0 && errno != EBUSY)
fprintf(stderr, "init-oci: mount cgroup2: %s\n", strerror(errno));
/* These directories live on the fresh /tmp and /run tmpfs mounted above. */
mkdir("/tmp/.X11-unix", 01777);
chmod("/tmp/.X11-unix", 01777);
mkdir("/run/dbus", 0755);
}
/*
 * Create `path` with contents `body` and permissions `mode`, but only when
 * stat() on it fails (i.e. it does not already exist or is unreachable).
 * Open/write errors are silently ignored.
 */
static void write_file_if_missing(const char *path, const char *body, mode_t mode) {
    struct stat probe;

    if (stat(path, &probe) != 0) {
        int out = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
        if (out >= 0) {
            (void)write(out, body, strlen(body));
            close(out);
        }
    }
}
/*
 * Return 1 if the text `marker` occurs anywhere in the file at `path`,
 * 0 otherwise (including when the file cannot be opened).  The file is
 * scanned in chunks; the last strlen(marker)-1 bytes of each chunk are
 * carried over so a marker straddling a chunk boundary is still found.
 */
static int file_has_marker(const char *path, const char *marker) {
    char buf[4096];
    size_t mlen = strlen(marker);
    size_t keep = 0;
    int found = 0;

    if (mlen == 0) return 1;            /* empty marker matches any content */
    if (mlen >= sizeof(buf)) return 0;  /* cannot match within one window */

    int fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd < 0) return 0;
    for (;;) {
        ssize_t n = read(fd, buf + keep, sizeof(buf) - 1 - keep);
        if (n < 0 && errno == EINTR) continue;
        if (n <= 0) break;
        size_t have = keep + (size_t)n;
        buf[have] = '\0';
        if (strstr(buf, marker) != NULL) {
            found = 1;
            break;
        }
        keep = have < mlen - 1 ? have : mlen - 1;
        memmove(buf, buf + have - keep, keep);
    }
    close(fd);
    return found;
}

/*
 * Ensure `path` contains `marker`.  If the file exists, is non-empty and
 * already contains the marker it is left alone; otherwise it is (re)created
 * with contents `body` and permissions `mode`, replacing whatever was there.
 * The whole file is searched, not just its first 4 KiB, so a large file whose
 * marker appears late is no longer overwritten.  Open/write errors are
 * silently ignored.
 */
static void write_file_if_lacks(const char *path, const char *marker,
                                const char *body, mode_t mode) {
    struct stat st;

    if (stat(path, &st) == 0 && st.st_size > 0 && file_has_marker(path, marker)) {
        return;
    }
    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
    if (fd < 0) return;
    (void)write(fd, body, strlen(body));
    close(fd);
}
/*
 * Unconditionally (re)create `path` with contents `body`; `mode` applies only
 * when the file is newly created.  Open/write errors are silently ignored.
 */
static void write_file_replace(const char *path, const char *body, mode_t mode) {
    int out = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);

    if (out >= 0) {
        (void)write(out, body, strlen(body));
        close(out);
    }
}
/*
 * Optional Rosetta setup.  Steps, in order:
 *   1. mount the "rosetta" virtiofs tag on /run/rosetta (absent tag => return
 *      quietly after removing the directory);
 *   2. mount binfmt_misc and register /run/rosetta/rosetta as interpreter for
 *      the ELF magic/mask below;
 *   3. mount the "rosettad-cache" virtiofs tag on /var/cache/rosettad
 *      (failure is only logged);
 *   4. fork "/run/rosetta/rosettad daemon /var/cache/rosettad";
 *   5. write the embedded warmup_amd64 binary to /tmp, run it once with stdio
 *      on /dev/null, and wait up to 100 x 50 ms for it to finish.
 * Every failure path logs and returns (or continues); nothing here is fatal.
 */
static void setup_rosetta_in_vm(void) {
struct stat st;
if (stat("/run", &st) != 0) {
mkdir("/run", 0755);
}
mkdir("/run/rosetta", 0755);
int mret = mount("rosetta", "/run/rosetta", "virtiofs", 0, NULL);
if (mret != 0) {
/* No rosetta share offered: undo the mkdir and skip silently. */
rmdir("/run/rosetta");
return;
}
if (stat("/run/rosetta/rosetta", &st) != 0) {
fprintf(stderr, "init-oci: rosetta-runtime mount present but "
"/run/rosetta/rosetta missing; skipping Rosetta setup\n");
return;
}
int bret = mount("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc",
0, NULL);
if (bret != 0 && errno != EBUSY) {
fprintf(stderr, "init-oci: mount binfmt_misc failed (errno=%d). "
"Kernel missing CONFIG_BINFMT_MISC?\n", errno);
return;
}
/*
 * binfmt_misc rule ":name:type:offset:magic:mask:interpreter:flags".
 * The magic is an ELF header with class byte 0x02 and machine 0x3e; the
 * mask clears some bits (e.g. the low bit of the type field) so more than
 * one header variant matches.  Backslashes are doubled so the kernel sees
 * literal "\x.." escapes.
 */
static const char binfmt_line[] =
":amd64:M::"
"\\x7fELF\\x02\\x01\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00"
"\\x02\\x00\\x3e\\x00"
":"
"\\xff\\xff\\xff\\xff\\xff\\xfe\\xfe\\x00\\xff\\xff\\xff\\xff\\xff\\xff"
"\\xff\\xff\\xfe\\xff\\xff\\xff"
":/run/rosetta/rosetta:OCF\n";
int fd = open("/proc/sys/fs/binfmt_misc/register",
O_WRONLY | O_CLOEXEC);
if (fd < 0) {
fprintf(stderr, "init-oci: open binfmt_misc/register failed: "
"errno=%d\n", errno);
return;
}
ssize_t wret = write(fd, binfmt_line, sizeof binfmt_line - 1);
/* NOTE(review): errno is inspected after close(); a failing close could overwrite it. */
close(fd);
if (wret < 0) {
/* EEXIST means the rule is already registered; treat as success. */
if (errno != EEXIST) {
fprintf(stderr, "init-oci: register amd64 binfmt failed: "
"errno=%d\n", errno);
return;
}
}
/* Only sets HOME when it is not already present in the environment. */
setenv("HOME", "/root", 0);
mkdir("/var/cache", 0755);
mkdir("/var/cache/rosettad", 0755);
int cret = mount("rosettad-cache", "/var/cache/rosettad",
"virtiofs", 0, NULL);
if (cret != 0 && errno != EBUSY) {
fprintf(stderr, "init-oci: rosettad-cache mount unavailable "
"(errno=%d); AOT cache won't persist across VM restarts\n",
errno);
}
mkdir("/run/rosettad", 0755);
/* Start the rosettad daemon; it is never waited on in this function. */
pid_t dpid = fork();
if (dpid == 0) {
char *argv[] = {
(char *)"/run/rosetta/rosettad",
(char *)"daemon",
(char *)"/var/cache/rosettad",
NULL,
};
execv("/run/rosetta/rosettad", argv);
fprintf(stderr, "init-oci: execv rosettad: %s\n", strerror(errno));
_exit(127);
} else if (dpid < 0) {
fprintf(stderr, "init-oci: fork rosettad: %s\n", strerror(errno));
} else {
fprintf(stderr, "init-oci: rosettad spawned pid=%d "
"(cache=/var/cache/rosettad)\n", dpid);
/* Fixed 100 ms pause before the warmup run below. */
struct timespec brief = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
nanosleep(&brief, NULL);
}
/* Warmup: materialise the embedded binary, execute it once, then delete it. */
{
const char *warmup_path = "/tmp/.supermachine-rosetta-warmup";
int wfd = open(warmup_path, O_WRONLY | O_CREAT | O_TRUNC, 0755);
if (wfd >= 0) {
ssize_t wn = write(wfd, warmup_amd64, warmup_amd64_len);
close(wfd);
if (wn != (ssize_t)warmup_amd64_len) {
fprintf(stderr, "init-oci: warmup binary write short "
"(%zd/%u)\n", wn, warmup_amd64_len);
} else if (chmod(warmup_path, 0755) != 0) {
fprintf(stderr, "init-oci: warmup chmod failed: %s\n",
strerror(errno));
} else {
pid_t wpid = fork();
if (wpid == 0) {
/* Child: silence stdio, then exec the warmup binary. */
int devnull = open("/dev/null", O_RDWR | O_CLOEXEC);
if (devnull >= 0) {
dup2(devnull, STDIN_FILENO);
dup2(devnull, STDOUT_FILENO);
dup2(devnull, STDERR_FILENO);
if (devnull > 2) close(devnull);
}
char *argv[] = { (char *)warmup_path, NULL };
execv(warmup_path, argv);
_exit(127);
} else if (wpid > 0) {
int status = 0;
struct timespec deadline_check =
{ .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000 };
int waited = 0;
/* Poll every 50 ms, at most 100 times (about 5 s). */
while (waited < 100) { pid_t r = waitpid(wpid, &status, WNOHANG);
if (r == wpid) break;
if (r < 0 && errno != EINTR) break;
nanosleep(&deadline_check, NULL);
waited++;
}
if (waited >= 100) {
fprintf(stderr, "init-oci: rosetta warmup "
"timed out (5s); first user exec may race "
"rosettad's lazy init\n");
kill(wpid, SIGKILL);
waitpid(wpid, &status, 0);
} else {
/* NOTE(review): also reached when waitpid failed; exit status is not checked. */
fprintf(stderr, "init-oci: rosetta warmup done\n");
}
unlink(warmup_path);
} else {
fprintf(stderr, "init-oci: fork warmup: %s\n",
strerror(errno));
}
}
} else {
fprintf(stderr, "init-oci: open warmup tmp file: %s\n",
strerror(errno));
}
}
fprintf(stderr, "init-oci: Rosetta-in-VM ready "
"(amd64 ELFs → /run/rosetta/rosetta, AOT cache via rosettad)\n");
}
/*
 * Provide the small set of files/links a typical userland expects:
 * /etc/hosts (rewritten unless it already mentions "localhost"),
 * /etc/resolv.conf (always replaced), a tmpfs on /dev/shm, the /dev/fd and
 * /dev/std{in,out,err} symlinks, and ip_unprivileged_port_start = 0.
 * Every step is best effort; errors are ignored.
 */
static void ensure_runtime_files(void) {
    static const char *const dev_links[][2] = {
        { "/proc/self/fd",   "/dev/fd" },
        { "/proc/self/fd/0", "/dev/stdin" },
        { "/proc/self/fd/1", "/dev/stdout" },
        { "/proc/self/fd/2", "/dev/stderr" },
    };

    mkdir("/etc", 0755);
    write_file_if_lacks("/etc/hosts", "localhost",
                        "127.0.0.1\tlocalhost\n"
                        "::1\tlocalhost ip6-localhost ip6-loopback\n",
                        0644);
    write_file_replace("/etc/resolv.conf",
                       "nameserver 1.1.1.1\n"
                       "options timeout:2 attempts:2 no-aaaa single-request\n",
                       0644);

    mkdir("/dev/shm", 01777);
    mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=64m,mode=1777");

    for (size_t k = 0; k < sizeof(dev_links) / sizeof(dev_links[0]); k++) {
        symlink(dev_links[k][0], dev_links[k][1]);
    }

    int knob = open("/proc/sys/net/ipv4/ip_unprivileged_port_start",
                    O_WRONLY | O_CLOEXEC);
    if (knob < 0) return;
    (void)write(knob, "0", 1);
    close(knob);
}
/*
 * If CLOCK_REALTIME is not already past 1,000,000,000 s, set it.  The value
 * comes from "supermachine.host_time=<seconds>" on the kernel command line
 * when present and greater than 1,000,000,000; otherwise 1735689600 is used.
 * A clock_settime() failure is logged.
 */
static void seed_wall_clock(void) {
    static const char key[] = "supermachine.host_time=";
    struct timespec current = {0};

    if (clock_gettime(CLOCK_REALTIME, &current) == 0 &&
        current.tv_sec > 1000000000L) {
        return;
    }

    time_t target = 1735689600;
    char cmdline[4096];
    ssize_t got = -1;

    int fd = open("/proc/cmdline", O_RDONLY | O_CLOEXEC);
    if (fd >= 0) {
        got = read(fd, cmdline, sizeof(cmdline) - 1);
        close(fd);
    }
    if (got > 0) {
        cmdline[got] = 0;
        const char *hit = strstr(cmdline, key);
        if (hit != NULL) {
            long long host_sec = strtoll(hit + (sizeof(key) - 1), NULL, 10);
            if (host_sec > 1000000000LL) {
                target = (time_t)host_sec;
            }
        }
    }

    struct timespec wanted = { .tv_sec = target, .tv_nsec = 0 };
    if (clock_settime(CLOCK_REALTIME, &wanted) != 0) {
        fprintf(stderr, "init-oci: clock_settime: %s\n", strerror(errno));
    }
}
/* Advance past JSON insignificant whitespace. */
static char *env_skip_ws(char *p) {
    while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
    return p;
}

/*
 * Decode the JSON string whose opening quote is at `open_quote`, in place.
 * On success *out points at the decoded, NUL-terminated text and the return
 * value points just past the closing quote.  Returns NULL if the string is
 * unterminated.  Simple escapes (\" \\ \/ \n \t \r \b \f) are decoded;
 * \uXXXX sequences are kept verbatim.
 */
static char *env_take_string(char *open_quote, char **out) {
    char *src = open_quote + 1;
    char *dst = open_quote + 1;

    *out = dst;
    while (*src && *src != '"') {
        if (*src == '\\' && src[1]) {
            src++;
            switch (*src) {
            case 'n': *dst++ = '\n'; break;
            case 't': *dst++ = '\t'; break;
            case 'r': *dst++ = '\r'; break;
            case 'b': *dst++ = '\b'; break;
            case 'f': *dst++ = '\f'; break;
            case 'u': *dst++ = '\\'; *dst++ = 'u'; break;
            default:  *dst++ = *src; break;   /* \" \\ \/ */
            }
            src++;
        } else {
            *dst++ = *src++;
        }
    }
    if (*src != '"') return NULL;
    *dst = 0;                /* dst <= src, so this never clobbers unread input */
    return src + 1;
}

/*
 * Export every "key": "value" pair of one JSON object via setenv(..., 1).
 * `p` points just past the opening '{'.  Parsing stops at the closing '}',
 * at end of input, or at the first malformed token; the stop position is
 * returned.  Non-string scalar values are skipped rather than exported.
 */
static char *env_apply_object(char *p) {
    for (;;) {
        p = env_skip_ws(p);
        if (*p == ',') { p++; continue; }
        if (*p != '"') break;              /* '}', NUL or garbage */

        char *key = NULL;
        char *val = NULL;
        char *q = env_take_string(p, &key);
        if (q == NULL) break;
        q = env_skip_ws(q);
        if (*q != ':') break;
        q = env_skip_ws(q + 1);
        if (*q == '"') {
            q = env_take_string(q, &val);
            if (q == NULL) break;
            setenv(key, val, 1);
        } else {
            /* number / true / false / null: skip to the next separator */
            while (*q && *q != ',' && *q != '}') q++;
        }
        p = q;
    }
    return p;
}

/*
 * Connect to the host over vsock (HOST_CID:ENV_PORT), read its reply (up to
 * 16 KiB) and export the string pairs found in the "env" object and in any
 * "secrets" object that follows it.  Connection failures are logged and the
 * environment is left untouched.
 *
 * The payload is assumed to be JSON with string values — NOTE(review):
 * confirm the host side emits standard JSON escaping.
 */
static void fetch_and_set_env(void) {
    int s = socket(AF_VSOCK, SOCK_STREAM, 0);
    if (s < 0) { fprintf(stderr, "init-oci: socket(AF_VSOCK): %s\n", strerror(errno)); return; }
    struct sockaddr_vm a = {0};
    a.svm_family = AF_VSOCK; a.svm_cid = HOST_CID; a.svm_port = ENV_PORT;
    if (connect(s, (struct sockaddr*)&a, sizeof(a)) < 0) {
        fprintf(stderr, "init-oci: env connect: %s\n", strerror(errno));
        close(s); return;
    }

    char buf[16384];
    size_t total = 0;
    while (total < sizeof(buf) - 1) {
        ssize_t n = read(s, buf + total, sizeof(buf) - 1 - total);
        if (n < 0 && errno == EINTR) continue;
        if (n <= 0) break;
        total += (size_t)n;
    }
    close(s);
    buf[total] = 0;
    if (total == sizeof(buf) - 1) {
        fprintf(stderr, "init-oci: env payload filled the %zu-byte buffer; "
                "it may be truncated\n", sizeof(buf) - 1);
    }

    /* First the "env" object, then each "secrets" object found after it. */
    char *p = strstr(buf, "\"env\"");
    while (p) {
        p = strchr(p, '{');
        if (!p) break;
        p = env_apply_object(p + 1);
        p = strstr(p, "\"secrets\"");
    }
}
/*
 * Load the workload command from CMD_FILE into `out_buf` (ARG_BUF bytes) and
 * split it into `argv`, one entry per line.  Leading blanks/tabs and empty
 * lines are skipped; at most max_argv - 1 entries are stored and the array is
 * NULL-terminated.  If the file is missing or empty, argv becomes
 * { "/bin/sh", NULL }.  Returns the number of entries (may be 0 when the file
 * holds only whitespace).
 */
static int read_cmd(char *out_buf, char **argv, int max_argv) {
    ssize_t len = -1;
    int fd = open(CMD_FILE, O_RDONLY);

    if (fd >= 0) {
        len = read(fd, out_buf, ARG_BUF - 1);
        close(fd);
    }
    if (len <= 0) {
        argv[0] = (char *)"/bin/sh";
        argv[1] = NULL;
        return 1;
    }
    out_buf[len] = 0;

    int count = 0;
    char *cur = out_buf;
    while (*cur && count < max_argv - 1) {
        cur += strspn(cur, " \t\n\r");      /* skip blank prefix / empty lines */
        if (!*cur) break;
        argv[count++] = cur;
        cur += strcspn(cur, "\n\r");        /* entry runs to end of line */
        if (*cur) *cur++ = 0;
    }
    argv[count] = NULL;
    return count;
}
/*
 * Return 1 when `s` is a non-empty string made only of decimal digits,
 * 0 otherwise (including NULL and "").
 */
static int all_digits(const char *s) {
    if (s == NULL || s[0] == '\0') return 0;

    size_t i = 0;
    while (s[i] != '\0') {
        if (!isdigit((unsigned char)s[i])) return 0;
        i++;
    }
    return 1;
}
/*
 * Switch the calling process to the identity named in USER_FILE, whose
 * content is "user[:group]" where each part is either a name or a decimal id.
 * A missing or empty file leaves the process unchanged.
 *
 *  - Named users/groups are resolved through passwd/group; an unknown name
 *    terminates the process with status 126.
 *  - A numeric uid without a passwd entry gets gid 0 unless a group is given.
 *  - Supplementary groups are set with initgroups() when the user has a
 *    passwd entry; otherwise (or if initgroups fails) the list is cleared
 *    with setgroups(0, NULL) so no inherited groups survive the drop.
 *  - setgid()/setuid() failures terminate the process with status 126.
 */
static void drop_to_image_user(void) {
    int fd = open(USER_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) return;
    char spec[256];
    ssize_t n = read(fd, spec, sizeof(spec) - 1);
    close(fd);
    if (n <= 0) return;
    spec[n] = 0;
    /* Trim trailing whitespace / line endings. */
    while (n > 0 && (spec[n - 1] == '\n' || spec[n - 1] == '\r' ||
                     spec[n - 1] == ' ' || spec[n - 1] == '\t'))
        spec[--n] = 0;
    if (spec[0] == 0) return;

    char *group = strchr(spec, ':');
    if (group) *group++ = 0;

    uid_t uid = 0;
    gid_t gid = 0;
    const char *init_user = NULL;
    struct passwd *pw = NULL;
    if (all_digits(spec)) {
        uid = (uid_t)strtoul(spec, NULL, 10);
        pw = getpwuid(uid);
        gid = pw ? pw->pw_gid : 0;
        init_user = pw ? pw->pw_name : NULL;
    } else {
        pw = getpwnam(spec);
        if (!pw) {
            fprintf(stderr, "init-oci: unknown user %s\n", spec);
            _exit(126);
        }
        uid = pw->pw_uid;
        gid = pw->pw_gid;
        init_user = pw->pw_name;
    }

    if (group && *group) {
        if (all_digits(group)) {
            gid = (gid_t)strtoul(group, NULL, 10);
        } else {
            struct group *gr = getgrnam(group);
            if (!gr) {
                fprintf(stderr, "init-oci: unknown group %s\n", group);
                _exit(126);
            }
            gid = gr->gr_gid;
        }
    }

    /* Supplementary groups must be fixed while still privileged. */
    int groups_set = 0;
    if (init_user) {
        if (initgroups(init_user, gid) == 0) {
            groups_set = 1;
        } else {
            fprintf(stderr, "init-oci: initgroups(%s): %s\n",
                    init_user, strerror(errno));
        }
    }
    if (!groups_set && setgroups(0, NULL) != 0) {
        fprintf(stderr, "init-oci: setgroups: %s\n", strerror(errno));
    }

    /* gid before uid: after setuid() we may no longer be allowed to setgid(). */
    if (setgid(gid) != 0) {
        fprintf(stderr, "init-oci: setgid(%lu): %s\n",
                (unsigned long)gid, strerror(errno));
        _exit(126);
    }
    if (setuid(uid) != 0) {
        fprintf(stderr, "init-oci: setuid(%lu): %s\n",
                (unsigned long)uid, strerror(errno));
        _exit(126);
    }
}
/*
 * Return 1 when SUPERMACHINE_GUEST_STDIO is set to one of the accepted
 * "enabled" spellings (exact, case-sensitive match), 0 otherwise.
 */
static int guest_stdio_to_console(void) {
    static const char *const enabled[] = {
        "1", "true", "yes", "on", "console", "inherit",
    };
    const char *mode = getenv("SUPERMACHINE_GUEST_STDIO");

    if (mode == NULL || mode[0] == '\0') return 0;
    for (size_t k = 0; k < sizeof(enabled) / sizeof(enabled[0]); k++) {
        if (strcmp(mode, enabled[k]) == 0) return 1;
    }
    return 0;
}
/*
 * Point stdin/stdout/stderr at /dev/null unless guest_stdio_to_console()
 * says the workload should keep the console.  Returns 0 on success (or when
 * no redirection is wanted) and -1 if /dev/null cannot be opened or any
 * dup2() fails; all three dup2() calls are attempted regardless.
 */
static int redirect_child_stdio(void) {
    static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };

    if (guest_stdio_to_console()) return 0;

    int sink = open("/dev/null", O_RDWR | O_CLOEXEC);
    if (sink < 0) return -1;

    int result = 0;
    for (size_t k = 0; k < sizeof(std_fds) / sizeof(std_fds[0]); k++) {
        if (dup2(sink, std_fds[k]) < 0) result = -1;
    }
    /* Keep the descriptor if it happens to be one of the standard three. */
    if (sink > STDERR_FILENO) close(sink);
    return result;
}
/*
 * Log "init-oci: child execvp(<argv0>): <error>" using the errno value at
 * entry.  The message goes to /dev/console when it can be opened, otherwise
 * to stderr.
 */
static void report_child_exec_failure(const char *argv0) {
    const int err = errno;   /* capture before open() can change it */
    int console = open("/dev/console", O_WRONLY | O_CLOEXEC);
    int out = (console < 0) ? STDERR_FILENO : console;

    dprintf(out, "init-oci: child execvp(%s): %s\n", argv0, strerror(err));
    if (out > STDERR_FILENO) {
        close(out);
    }
}
/*
 * If the current root has no /.supermachine-cmd but /dev/vda exists, build an
 * overlay root from squashfs block devices and re-exec /init inside it.
 *
 * Returns 0 when no pivot is attempted (command file present, no /dev/vda, or
 * no device mounted as squashfs), -1 on failure after the pivot has started.
 * On success it does not return: execve("/init") replaces the process.
 */
static int try_pivot_to_overlay(void) {
struct stat st;
/* Already in the final root (command file present), or no block device at all. */
if (stat("/.supermachine-cmd", &st) == 0) return 0; if (stat("/dev/vda", &st) != 0) return 0;
int n_layers = 0;
/* Mount /dev/vda, vdb, ... as squashfs on /lower-0, /lower-1, ... until one is missing or fails. */
for (char letter = 'a'; letter <= 'z'; letter++) {
int i = letter - 'a';
char dev_path[16];
snprintf(dev_path, sizeof dev_path, "/dev/vd%c", letter);
if (stat(dev_path, &st) != 0) break;
char lower_path[32];
snprintf(lower_path, sizeof lower_path, "/lower-%d", i);
mkdir(lower_path, 0755);
if (mount(dev_path, lower_path, "squashfs", MS_RDONLY, NULL) != 0) {
rmdir(lower_path);
break;
}
n_layers++;
}
if (n_layers == 0) return 0;
mkdir("/upper", 0755);
mkdir("/newroot", 0755);
/* Writable layer lives on a tmpfs, so changes are not persisted. */
if (mount("tmpfs", "/upper", "tmpfs", 0, NULL) != 0) {
fprintf(stderr, "init-oci: mount tmpfs upper: %s\n", strerror(errno));
return -1;
}
mkdir("/upper/upper", 0755);
mkdir("/upper/work", 0755);
char opts[1024];
int off = snprintf(opts, sizeof opts, "lowerdir=");
/* lowerdir lists the highest-numbered layer first, i.e. the last device is topmost. */
for (int i = n_layers - 1; i >= 0; i--) {
off += snprintf(opts + off, sizeof opts - off,
"/lower-%d%s", i, i > 0 ? ":" : "");
}
snprintf(opts + off, sizeof opts - off,
",upperdir=/upper/upper,workdir=/upper/work");
if (mount("overlay", "/newroot", "overlay", 0, opts) != 0) {
fprintf(stderr, "init-oci: mount overlay: %s\n", strerror(errno));
return -1;
}
/* Carry the already-mounted pseudo filesystems into the new root (results unchecked). */
mount("/dev", "/newroot/dev", NULL, MS_MOVE, NULL);
mount("/proc", "/newroot/proc", NULL, MS_MOVE, NULL);
mount("/sys", "/newroot/sys", NULL, MS_MOVE, NULL);
/* Switch root: chdir into the overlay, move it over "/", then chroot("."). */
if (chdir("/newroot") != 0) {
fprintf(stderr, "init-oci: chdir /newroot: %s\n", strerror(errno));
return -1;
}
if (mount(".", "/", NULL, MS_MOVE, NULL) != 0) {
fprintf(stderr, "init-oci: mount move /: %s\n", strerror(errno));
return -1;
}
if (chroot(".") != 0) {
fprintf(stderr, "init-oci: chroot: %s\n", strerror(errno));
return -1;
}
chdir("/");
/* Re-run /init from the new root with an empty environment. */
char *argv[] = { (char *)"/init", NULL };
char *envp[] = { NULL };
execve("/init", argv, envp);
fprintf(stderr, "init-oci: execve /init (overlay): %s\n", strerror(errno));
return -1;
}
/*
 * Set the system hostname from HOSTNAME_FILE.  Trailing newlines, carriage
 * returns and spaces are stripped; a missing or blank file is a no-op.
 * A sethostname() failure is logged.
 */
static void apply_hostname(void) {
    char name[128];

    int fd = open(HOSTNAME_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) return;
    ssize_t len = read(fd, name, sizeof(name) - 1);
    close(fd);
    if (len <= 0) return;
    name[len] = 0;

    for (; len > 0; len--) {
        char last = name[len - 1];
        if (last != '\n' && last != '\r' && last != ' ') break;
        name[len - 1] = 0;
    }
    if (len == 0) return;

    if (sethostname(name, len) != 0) {
        fprintf(stderr, "init-oci: sethostname(%s): %s\n", name, strerror(errno));
    }
}
/*
 * Mount data volumes listed in /.supermachine-volumes (one mount point per
 * line, at most 16).  The last N /dev/vd* devices, in order, are taken to be
 * the N volumes and are mounted as ext4.  Missing mount-point directories are
 * created, including parents.  If fewer devices exist than requested, nothing
 * is mounted.  Individual mount failures are logged and skipped.
 */
static void mount_volumes(void) {
    char list[4096];

    int fd = open("/.supermachine-volumes", O_RDONLY | O_CLOEXEC);
    if (fd < 0) return;
    ssize_t len = read(fd, list, sizeof(list) - 1);
    close(fd);
    if (len <= 0) return;
    list[len] = 0;

    /* Split into non-empty lines, in place. */
    char *targets[16];
    int wanted = 0;
    for (char *line = list; *line && wanted < 16; ) {
        char *nl = strchr(line, '\n');
        if (nl) *nl = 0;
        if (*line) targets[wanted++] = line;
        if (!nl) break;
        line = nl + 1;
    }
    if (wanted == 0) return;

    /* Count consecutive /dev/vda.. devices. */
    int present = 0;
    while (present < 26) {
        struct stat st;
        char probe[16];
        snprintf(probe, sizeof probe, "/dev/vd%c", 'a' + present);
        if (stat(probe, &st) != 0) break;
        present++;
    }
    if (wanted > present) {
        fprintf(stderr, "init-oci: %d volumes requested but only %d /dev/vd* devices\n",
                wanted, present);
        return;
    }

    int first = present - wanted;
    for (int v = 0; v < wanted; v++) {
        const char *target = targets[v];
        char dev[16];
        snprintf(dev, sizeof dev, "/dev/vd%c", 'a' + first + v);

        if (mkdir(target, 0755) < 0 && errno != EEXIST) {
            /* mkdir -p: create each parent prefix, then retry the leaf. */
            char scratch[4096];
            strncpy(scratch, target, sizeof(scratch) - 1);
            scratch[sizeof(scratch) - 1] = 0;
            for (char *slash = strchr(scratch + 1, '/'); slash != NULL;
                 slash = strchr(slash + 1, '/')) {
                *slash = 0;
                mkdir(scratch, 0755);
                *slash = '/';
            }
            mkdir(target, 0755);
        }

        if (mount(dev, target, "ext4", 0, NULL) < 0) {
            fprintf(stderr, "init-oci: mount %s -> %s ext4: %s\n",
                    dev, target, strerror(errno));
            continue;
        }
        fprintf(stderr, "init-oci: mounted %s -> %s (ext4, rw)\n", dev, target);
    }
}
/*
 * Start /supermachine-agent as a child process if the binary exists.  The
 * child is not waited on here.  A missing binary or fork failure is logged
 * and the function returns.
 */
static void spawn_exec_agent(void) {
    struct stat probe;

    if (stat("/supermachine-agent", &probe) != 0) {
        fprintf(stderr, "exec-agent: /supermachine-agent missing — exec disabled\n");
        return;
    }

    pid_t child = fork();
    if (child == 0) {
        char *args[] = { (char *)"/supermachine-agent", NULL };
        execvp("/supermachine-agent", args);
        fprintf(stderr, "exec-agent: execvp: %s\n", strerror(errno));
        _exit(127);
    }
    if (child < 0) {
        fprintf(stderr, "exec-agent: fork: %s\n", strerror(errno));
        return;
    }
    fprintf(stderr, "exec-agent: spawned pid=%d\n", child);
}
/*
 * Poll for /run/supermachine-agent-ready every 500 us, for at most 1000 ms
 * of accumulated sleep.  Logs how long the probe took, or that the file never
 * appeared; in both cases boot continues.
 */
static void wait_for_exec_agent_listening(void) {
    const int budget_ms = 1000;
    const int step_us = 500;

    for (int waited_us = 0; waited_us < budget_ms * 1000; waited_us += step_us) {
        struct stat probe;
        if (stat("/run/supermachine-agent-ready", &probe) == 0) {
            fprintf(stderr,
                    "exec-agent: ready after %d us probe\n",
                    waited_us);
            return;
        }
        struct timespec nap = { .tv_sec = 0, .tv_nsec = step_us * 1000 };
        nanosleep(&nap, NULL);
    }
    fprintf(stderr,
            "exec-agent: ready file absent after %d ms; proceeding\n",
            budget_ms);
}
/*
 * PID 1 entry point.  Boot sequence:
 *   pseudo filesystems -> loopback -> optional overlay pivot (re-execs /init)
 *   -> runtime files, IPv6 policy, ping range, fake interface, Rosetta,
 *   wall clock, hostname, host-provided environment -> optional smpark.ko
 *   -> exec agent -> data volumes -> workload command, working directory
 *   -> supervise().
 * Only a failed pivot or an empty command is fatal (die()).
 */
int main(void) {
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);
    {
        /* Length derived from the literal so it can never run past its end. */
        static const char hello[] = "init-oci: hello from pid 1\n";
        (void)write(1, hello, sizeof(hello) - 1);
    }
    mount_pseudofs();
    bring_up_loopback();
    if (try_pivot_to_overlay() < 0) die("pivot");
    ensure_runtime_files();
    apply_ipv6_policy();
    open_ping_group_range();
    create_fake_iface();
    setup_rosetta_in_vm();
    seed_wall_clock();
    apply_hostname();
    fetch_and_set_env();
    {
        /* Load the optional kernel module if the image ships it. */
        struct stat smpst;
        if (stat("/supermachine-smpark.ko", &smpst) == 0) {
            int fd = open("/supermachine-smpark.ko", O_RDONLY | O_CLOEXEC);
            if (fd >= 0) {
                long rc = syscall(__NR_finit_module, fd, "", 0);
                if (rc < 0)
                    fprintf(stderr,
                            "init-oci: load smpark.ko: %s\n",
                            strerror(errno));
                close(fd);
            }
        }
    }
    spawn_exec_agent();
    wait_for_exec_agent_listening();
    {
        const char *m = "[SUPERMACHINE-INIT] heartbeat counter=1\n";
        (void)write(1, m, strlen(m));
    }
    mount_volumes();

    /* static: argv entries point into this buffer for the process lifetime. */
    static char buf[ARG_BUF];
    char *argv[MAX_ARGS];
    int argc = read_cmd(buf, argv, MAX_ARGS);
    if (argc < 1) die("read_cmd");

    int wfd = open(WD_FILE, O_RDONLY);
    if (wfd >= 0) {
        char wd[4096]; ssize_t n = read(wfd, wd, sizeof(wd) - 1); close(wfd);
        if (n > 0) {
            wd[n] = 0;
            while (n > 0 && (wd[n-1] == '\n' || wd[n-1] == '\r')) wd[--n] = 0;
            if (chdir(wd) < 0)
                fprintf(stderr, "init-oci: chdir(%s): %s\n", wd, strerror(errno));
        }
    }

    fprintf(stderr, "init-oci: exec");
    for (int i = 0; i < argc; i++) fprintf(stderr, " %s", argv[i]);
    fprintf(stderr, "\n");
    return supervise(argv);
}
static volatile pid_t g_child_pid = 0;
static void forward_signal(int sig) {
if (g_child_pid > 0) kill(g_child_pid, sig);
}
static void reap_zombies(int sig) {
(void)sig;
int saved_errno = errno;
for (;;) {
pid_t r = waitpid(-1, NULL, WNOHANG);
if (r <= 0) break;
if (r == g_child_pid) {
}
}
errno = saved_errno;
}
static int supervise(char **argv) {
struct sigaction sa = {0};
sa.sa_handler = reap_zombies;
sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
sigaction(SIGCHLD, &sa, NULL);
sa.sa_handler = forward_signal;
sa.sa_flags = SA_RESTART;
for (int s = 1; s <= 31; s++) {
if (s == SIGKILL || s == SIGSTOP || s == SIGCHLD) continue;
sigaction(s, &sa, NULL);
}
fprintf(stderr, "init-oci: workload-pre-exec\n");
fflush(stderr);
{
struct timespec ts = { .tv_sec = 0, .tv_nsec = 250 * 1000 * 1000 };
while (nanosleep(&ts, &ts) == -1 && errno == EINTR) {}
}
pid_t pid = fork();
if (pid < 0) die("fork");
if (pid == 0) {
for (int s = 1; s <= 31; s++) signal(s, SIG_DFL);
setpgid(0, 0);
drop_to_image_user();
if (redirect_child_stdio() != 0)
report_child_exec_failure("redirect stdio");
execvp(argv[0], argv);
report_child_exec_failure(argv[0]);
_exit(127);
}
g_child_pid = pid;
setpgid(pid, pid);
{
int fd = open("/run/supermachine-workload.pid",
O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
if (fd >= 0) {
char buf[32];
int n = snprintf(buf, sizeof(buf), "%d\n", (int)pid);
if (n > 0) {
(void)write(fd, buf, n);
}
close(fd);
}
}
int status = 0;
for (;;) {
pid_t r = waitpid(pid, &status, 0);
if (r == pid) break;
if (r < 0 && errno != EINTR) {
fprintf(stderr, "init-oci: waitpid: %s\n", strerror(errno));
break;
}
}
int code;
if (WIFEXITED(status)) {
code = WEXITSTATUS(status);
fprintf(stderr, "init-oci: child exited %d\n", code);
} else if (WIFSIGNALED(status)) {
code = 128 + WTERMSIG(status);
fprintf(stderr, "init-oci: child killed by signal %d\n",
WTERMSIG(status));
} else {
code = 1;
}
sync();
fprintf(stderr, "init-oci: parking PID 1 (exit=%d)\n", code);
for (;;) pause();
}