#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <pwd.h>
#include <grp.h>
#include <ctype.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <linux/vm_sockets.h>
static int supervise(char **argv);
#define HOST_CID 2
#define ENV_PORT 1026
#define EXEC_PORT 1028
#define CMD_FILE "/.supermachine-cmd"
#define WD_FILE "/.supermachine-workdir"
#define USER_FILE "/.supermachine-user"
#define HOSTNAME_FILE "/.supermachine-hostname"
#define MAX_ARGS 64
#define ARG_BUF 65536
/*
 * Report a fatal error and terminate.
 *
 * Prints "init-oci: <msg>: <strerror(errno)>" to stderr, so errno must still
 * hold the failure reason when this is called. The process then sleeps for a
 * day before calling exit(1) — presumably so the message stays visible
 * instead of PID 1 exiting straight away.
 */
static void die(const char *msg) {
    const char *reason = strerror(errno);

    fprintf(stderr, "init-oci: %s: %s\n", msg, reason);
    sleep(86400);
    exit(1);
}
/*
 * Create and mount the kernel pseudo-filesystems and scratch tmpfs mounts.
 *
 * /proc, /sys and /dev tolerate EBUSY (already mounted); any other failure is
 * logged but never fatal. /dev/pts, /dev/shm, /tmp and /run are mounted on a
 * best-effort basis.
 */
static void mount_pseudofs(void) {
    /* The filesystem type doubles as the mount source name. */
    static const struct {
        const char *fstype;
        const char *target;
    } core[] = {
        { "proc",     "/proc" },
        { "sysfs",    "/sys"  },
        { "devtmpfs", "/dev"  },
    };
    const size_t n_core = sizeof core / sizeof core[0];

    /* All mount points are created first, then mounted in the same order. */
    for (size_t i = 0; i < n_core; i++) {
        mkdir(core[i].target, 0755);
    }
    for (size_t i = 0; i < n_core; i++) {
        if (mount(core[i].fstype, core[i].target, core[i].fstype, 0, NULL) == 0) {
            continue;
        }
        if (errno == EBUSY) {
            continue;
        }
        fprintf(stderr, "init-oci: mount %s: %s\n",
                core[i].fstype, strerror(errno));
    }

    /* Pseudo-terminal filesystem. */
    mkdir("/dev/pts", 0755);
    int pts_rc = mount("devpts", "/dev/pts", "devpts", 0,
                       "newinstance,ptmxmode=0666,mode=0620,gid=5");
    if (pts_rc < 0) {
        fprintf(stderr, "init-oci: mount devpts: %s\n", strerror(errno));
    }

    /* POSIX shared memory area, capped at 64 MiB. */
    mkdir("/dev/shm", 01777);
    mount("tmpfs", "/dev/shm", "tmpfs", 0, "size=64m,mode=1777");

    /* Scratch directories backed by tmpfs. */
    mkdir("/tmp", 01777);
    chmod("/tmp", 01777);
    mkdir("/run", 0755);
    mount("tmpfs", "/tmp", "tmpfs", 0, NULL);
    mount("tmpfs", "/run", "tmpfs", 0, NULL);
}
/*
 * Create `path` with contents `body` unless something already exists there.
 *
 * path: file to create; left untouched when stat() succeeds on it.
 * body: NUL-terminated text to write.
 * mode: permission bits passed to open() for the new file.
 *
 * Best effort: failures are silent. The body is written in a loop so a short
 * write or an EINTR does not leave a truncated file behind.
 */
static void write_file_if_missing(const char *path, const char *body, mode_t mode) {
    struct stat st;
    if (stat(path, &st) == 0) {
        return;
    }

    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
    if (fd < 0) {
        return;
    }

    const char *cursor = body;
    size_t remaining = strlen(body);
    while (remaining > 0) {
        ssize_t written = write(fd, cursor, remaining);
        if (written < 0) {
            if (errno == EINTR) {
                continue; /* interrupted before any byte was written: retry */
            }
            break;
        }
        if (written == 0) {
            break; /* no progress; give up rather than spin */
        }
        cursor += written;
        remaining -= (size_t)written;
    }
    close(fd);
}
/*
 * Create or overwrite `path` so that it contains exactly `body`.
 *
 * path: file to (re)write; existing contents are truncated.
 * body: NUL-terminated text to write.
 * mode: permission bits passed to open() when the file has to be created.
 *
 * Best effort: failures are silent. The body is written in a loop so a short
 * write or an EINTR does not leave a truncated file behind.
 */
static void write_file_replace(const char *path, const char *body, mode_t mode) {
    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode);
    if (fd < 0) {
        return;
    }

    const char *cursor = body;
    size_t remaining = strlen(body);
    while (remaining > 0) {
        ssize_t written = write(fd, cursor, remaining);
        if (written < 0) {
            if (errno == EINTR) {
                continue; /* interrupted before any byte was written: retry */
            }
            break;
        }
        if (written == 0) {
            break; /* no progress; give up rather than spin */
        }
        cursor += written;
        remaining -= (size_t)written;
    }
    close(fd);
}
/*
 * Populate the small set of files and device links a workload expects.
 *
 *  - /etc/hosts is created only if missing; /etc/resolv.conf is always
 *    overwritten with fixed public resolvers.
 *  - /dev/fd, /dev/stdin, /dev/stdout and /dev/stderr are symlinked into
 *    /proc/self/fd (failures, e.g. because they already exist, are ignored).
 *  - ip_unprivileged_port_start is set to 0 so a non-root workload can bind
 *    low ports.
 *
 * /dev/shm is deliberately not mounted here: mount_pseudofs() already mounts
 * it, and main() runs that first. Mounting tmpfs again would only stack a
 * second, empty instance on top of the first.
 *
 * Everything is best effort; nothing here is fatal.
 */
static void ensure_runtime_files(void) {
    mkdir("/etc", 0755);
    write_file_if_missing("/etc/hosts",
                          "127.0.0.1\tlocalhost\n"
                          "::1\tlocalhost ip6-localhost ip6-loopback\n",
                          0644);
    write_file_replace("/etc/resolv.conf",
                       "nameserver 1.1.1.1\n"
                       "nameserver 8.8.8.8\n"
                       "options timeout:2 attempts:2\n",
                       0644);

    symlink("/proc/self/fd", "/dev/fd");
    symlink("/proc/self/fd/0", "/dev/stdin");
    symlink("/proc/self/fd/1", "/dev/stdout");
    symlink("/proc/self/fd/2", "/dev/stderr");

    int f = open("/proc/sys/net/ipv4/ip_unprivileged_port_start",
                 O_WRONLY | O_CLOEXEC);
    if (f >= 0) {
        (void)write(f, "0", 1);
        close(f);
    }
}
/*
 * Give the guest a plausible wall-clock time if it booted without one.
 *
 * If CLOCK_REALTIME already reads later than 1000000000 seconds since the
 * epoch, nothing is done. Otherwise the clock is set from the
 * "supermachine.host_time=<epoch seconds>" kernel command-line parameter, or
 * to 1735689600 (2025-01-01T00:00:00Z) when the parameter is absent or not
 * larger than 1000000000. A clock_settime() failure is logged, not fatal.
 */
static void seed_wall_clock(void) {
    struct timespec current = {0};
    int clock_ok = clock_gettime(CLOCK_REALTIME, &current) == 0;
    if (clock_ok && current.tv_sec > 1000000000L) {
        return;
    }

    time_t chosen = 1735689600;

    int cmdline_fd = open("/proc/cmdline", O_RDONLY | O_CLOEXEC);
    if (cmdline_fd >= 0) {
        char cmdline[4096];
        ssize_t len = read(cmdline_fd, cmdline, sizeof(cmdline) - 1);
        close(cmdline_fd);

        if (len > 0) {
            static const char key[] = "supermachine.host_time=";
            cmdline[len] = '\0';

            const char *hit = strstr(cmdline, key);
            if (hit != NULL) {
                long long parsed = strtoll(hit + (sizeof key - 1), NULL, 10);
                if (parsed > 1000000000LL) {
                    chosen = (time_t)parsed;
                }
            }
        }
    }

    struct timespec target = { .tv_sec = chosen, .tv_nsec = 0 };
    if (clock_settime(CLOCK_REALTIME, &target) != 0) {
        fprintf(stderr, "init-oci: clock_settime: %s\n", strerror(errno));
    }
}
/*
 * Fetch the environment payload from the host over vsock and export it.
 *
 * Connects to HOST_CID:ENV_PORT, reads up to 16383 bytes until EOF, then
 * scans the text for the object following "env" and for every later object
 * following "secrets". Each "key":"value" string pair found is exported with
 * setenv(..., overwrite=1).
 *
 * The scanner is intentionally minimal: it only understands string values
 * and does not process backslash escapes. An empty object ("env": {}) is
 * recognised and skipped; previously the scan ran past the closing brace and
 * treated the following "secrets" key as an environment variable.
 *
 * Socket and connect failures are logged and leave the environment untouched.
 */
static void fetch_and_set_env(void) {
    int s = socket(AF_VSOCK, SOCK_STREAM, 0);
    if (s < 0) {
        fprintf(stderr, "init-oci: socket(AF_VSOCK): %s\n", strerror(errno));
        return;
    }

    struct sockaddr_vm a = {0};
    a.svm_family = AF_VSOCK;
    a.svm_cid = HOST_CID;
    a.svm_port = ENV_PORT;
    if (connect(s, (struct sockaddr *)&a, sizeof(a)) < 0) {
        fprintf(stderr, "init-oci: env connect: %s\n", strerror(errno));
        close(s);
        return;
    }

    char buf[16384];
    int total = 0;
    for (;;) {
        ssize_t n = read(s, buf + total, sizeof(buf) - 1 - total);
        if (n < 0 && errno == EINTR) {
            continue; /* interrupted, not end of stream */
        }
        if (n <= 0) {
            break;
        }
        total += n;
        if (total >= (int)sizeof(buf) - 1) {
            break;
        }
    }
    close(s);
    buf[total] = 0;

    char *p = strstr(buf, "\"env\"");
    while (p) {
        p = strchr(p, '{');
        if (!p) {
            break;
        }
        p++;
        for (;;) {
            /* Skip separators, then stop at the end of this object BEFORE
             * looking for the next key, so an empty object is not overrun. */
            while (*p == ',' || *p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') {
                p++;
            }
            if (*p == '}' || *p == 0) {
                break;
            }

            char *kq = strchr(p, '"');
            if (!kq) break;
            char *kqe = strchr(kq + 1, '"');
            if (!kqe) break;
            char *colon = strchr(kqe, ':');
            if (!colon) break;
            char *vq = strchr(colon, '"');
            if (!vq) break;
            char *vqe = strchr(vq + 1, '"');
            if (!vqe) break;

            /* Temporarily terminate key and value in place for setenv(). */
            *kqe = 0;
            *vqe = 0;
            setenv(kq + 1, vq + 1, 1);
            *kqe = '"';
            *vqe = '"';

            p = vqe + 1;
        }
        /* Secrets are exported exactly like ordinary variables. */
        p = strstr(p, "\"secrets\"");
    }
}
/*
 * Load the workload command line from CMD_FILE.
 *
 * out_buf:  caller-provided buffer of at least ARG_BUF bytes; receives the
 *           file contents and is split in place.
 * argv:     receives pointers into out_buf, one per argument, NULL-terminated.
 * max_argv: capacity of argv including the NULL terminator.
 *
 * The file holds one argument per line: leading blanks/tabs/line breaks are
 * skipped and each argument runs to the next CR or LF. If the file is
 * missing, unreadable or empty, argv becomes { "/bin/sh", NULL }.
 *
 * Returns the number of arguments stored (0 if the file held only whitespace).
 */
static int read_cmd(char *out_buf, char **argv, int max_argv) {
    ssize_t len = -1;

    int cmd_fd = open(CMD_FILE, O_RDONLY);
    if (cmd_fd >= 0) {
        len = read(cmd_fd, out_buf, ARG_BUF - 1);
        close(cmd_fd);
    }
    if (len <= 0) {
        /* No usable command file: fall back to an interactive shell. */
        argv[0] = (char *)"/bin/sh";
        argv[1] = NULL;
        return 1;
    }
    out_buf[len] = '\0';

    int count = 0;
    char *cur = out_buf;
    for (;;) {
        if (*cur == '\0' || count >= max_argv - 1) {
            break;
        }
        cur += strspn(cur, " \t\n\r");
        if (*cur == '\0') {
            break;
        }
        argv[count++] = cur;
        cur += strcspn(cur, "\n\r");
        if (*cur != '\0') {
            *cur++ = '\0';
        }
    }
    argv[count] = NULL;
    return count;
}
/*
 * Return 1 if `s` is a non-empty string consisting only of decimal digits,
 * 0 otherwise (including for NULL and "").
 */
static int all_digits(const char *s) {
    if (s == NULL || s[0] == '\0') {
        return 0;
    }
    size_t i = 0;
    while (s[i] != '\0') {
        /* Cast keeps isdigit() defined for bytes above 0x7f. */
        if (!isdigit((unsigned char)s[i])) {
            return 0;
        }
        i++;
    }
    return 1;
}
/*
 * Switch the calling process to the user named in USER_FILE.
 *
 * The file holds "user[:group]" where each part is either a name or a
 * decimal id. If the file is missing or empty the process keeps its current
 * identity.
 *
 *  - Numeric user: used as the uid directly; the primary gid comes from the
 *    passwd entry if one exists, else 0.
 *  - Named user: must exist in the passwd database.
 *  - Optional group overrides the primary gid; a named group must exist.
 *
 * Supplementary groups are set with initgroups() when the user has a passwd
 * entry. When there is no entry, or initgroups() fails, the list is cleared
 * with setgroups(0, NULL) so the workload does not keep the caller's groups.
 *
 * Unknown names and setgid/setuid failures terminate the process with
 * _exit(126). Intended to run in the forked child just before exec.
 */
static void drop_to_image_user(void) {
    int fd = open(USER_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) {
        return;
    }
    char spec[256];
    ssize_t n = read(fd, spec, sizeof(spec) - 1);
    close(fd);
    if (n <= 0) {
        return;
    }
    spec[n] = 0;
    while (n > 0 && (spec[n - 1] == '\n' || spec[n - 1] == '\r' ||
                     spec[n - 1] == ' ' || spec[n - 1] == '\t')) {
        spec[--n] = 0;
    }
    if (spec[0] == 0) {
        return;
    }

    /* Split "user:group" in place. */
    char *group = strchr(spec, ':');
    if (group) {
        *group++ = 0;
    }

    uid_t uid = 0;
    gid_t gid = 0;
    const char *init_user = NULL;
    struct passwd *pw = NULL;
    if (all_digits(spec)) {
        uid = (uid_t)strtoul(spec, NULL, 10);
        pw = getpwuid(uid);
        gid = pw ? pw->pw_gid : 0;
        init_user = pw ? pw->pw_name : NULL;
    } else {
        pw = getpwnam(spec);
        if (!pw) {
            fprintf(stderr, "init-oci: unknown user %s\n", spec);
            _exit(126);
        }
        uid = pw->pw_uid;
        gid = pw->pw_gid;
        init_user = pw->pw_name;
    }

    if (group && *group) {
        if (all_digits(group)) {
            gid = (gid_t)strtoul(group, NULL, 10);
        } else {
            struct group *gr = getgrnam(group);
            if (!gr) {
                fprintf(stderr, "init-oci: unknown group %s\n", group);
                _exit(126);
            }
            gid = gr->gr_gid;
        }
    }

    /* Supplementary groups must be settled while still privileged, i.e.
     * before setgid()/setuid(). */
    int groups_set = 0;
    if (init_user) {
        if (initgroups(init_user, gid) == 0) {
            groups_set = 1;
        } else {
            fprintf(stderr, "init-oci: initgroups(%s): %s\n",
                    init_user, strerror(errno));
        }
    }
    if (!groups_set && setgroups(0, NULL) != 0) {
        fprintf(stderr, "init-oci: setgroups: %s\n", strerror(errno));
    }

    /* Group first: once the uid is dropped, setgid() would no longer be
     * permitted. */
    if (setgid(gid) != 0) {
        fprintf(stderr, "init-oci: setgid(%lu): %s\n",
                (unsigned long)gid, strerror(errno));
        _exit(126);
    }
    if (setuid(uid) != 0) {
        fprintf(stderr, "init-oci: setuid(%lu): %s\n",
                (unsigned long)uid, strerror(errno));
        _exit(126);
    }
}
/*
 * Decide whether the workload should keep init's stdio.
 *
 * Returns 1 when the SUPERMACHINE_GUEST_STDIO environment variable is exactly
 * one of "1", "true", "yes", "on", "console" or "inherit" (case-sensitive);
 * returns 0 when it is unset, empty, or anything else.
 */
static int guest_stdio_to_console(void) {
    static const char *const accepted[] = {
        "1", "true", "yes", "on", "console", "inherit",
    };

    const char *setting = getenv("SUPERMACHINE_GUEST_STDIO");
    if (setting == NULL || setting[0] == '\0') {
        return 0;
    }
    for (size_t i = 0; i < sizeof accepted / sizeof accepted[0]; i++) {
        if (strcmp(setting, accepted[i]) == 0) {
            return 1;
        }
    }
    return 0;
}
/*
 * Point stdin, stdout and stderr at /dev/null unless
 * guest_stdio_to_console() says the workload should keep init's stdio.
 *
 * Returns 0 on success (or when no redirection was requested) and -1 if
 * /dev/null could not be opened or any dup2() failed. All three descriptors
 * are attempted even if an earlier one fails.
 */
static int redirect_child_stdio(void) {
    static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };

    if (guest_stdio_to_console()) {
        return 0;
    }

    int null_fd = open("/dev/null", O_RDWR | O_CLOEXEC);
    if (null_fd < 0) {
        return -1;
    }

    int result = 0;
    for (size_t i = 0; i < sizeof std_fds / sizeof std_fds[0]; i++) {
        if (dup2(null_fd, std_fds[i]) < 0) {
            result = -1;
        }
    }

    /* Only close the scratch descriptor if it is not itself a stdio slot. */
    if (null_fd > STDERR_FILENO) {
        close(null_fd);
    }
    return result;
}
/*
 * Log a failure from the workload child, using the errno in effect on entry.
 *
 * argv0: text placed in the "child execvp(<argv0>)" message.
 *
 * The message goes to /dev/console when it can be opened, otherwise to
 * stderr (which may already have been redirected to /dev/null).
 */
static void report_child_exec_failure(const char *argv0) {
    const int err = errno; /* capture before open() can overwrite it */

    int out = open("/dev/console", O_WRONLY | O_CLOEXEC);
    if (out < 0) {
        out = STDERR_FILENO;
    }

    dprintf(out, "init-oci: child execvp(%s): %s\n", argv0, strerror(err));

    if (out > STDERR_FILENO) {
        close(out);
    }
}
/*
 * If the current root is a bootstrap root with layer block devices attached,
 * assemble an overlay root from them, switch into it and re-exec /init there.
 *
 * Returns 0 when no pivot is attempted: the command marker file already
 * exists in the current root, /dev/vda is absent, or no device could be
 * mounted as squashfs. Returns -1 if building or entering the new root fails.
 * On success this function does not return, because execve() replaces the
 * process image.
 */
static int try_pivot_to_overlay(void) {
struct stat st;
/* Marker present: this root already carries the workload config, so do not
 * pivot. No first virtio disk: nothing to build an overlay from. */
if (stat("/.supermachine-cmd", &st) == 0) return 0; if (stat("/dev/vda", &st) != 0) return 0;
int n_layers = 0;
/* Mount /dev/vda, /dev/vdb, ... read-only as squashfs on /lower-0,
 * /lower-1, ... and stop at the first device that is missing or is not
 * squashfs. */
for (char letter = 'a'; letter <= 'z'; letter++) {
int i = letter - 'a';
char dev_path[16];
snprintf(dev_path, sizeof dev_path, "/dev/vd%c", letter);
if (stat(dev_path, &st) != 0) break;
char lower_path[32];
snprintf(lower_path, sizeof lower_path, "/lower-%d", i);
mkdir(lower_path, 0755);
if (mount(dev_path, lower_path, "squashfs", MS_RDONLY, NULL) != 0) {
rmdir(lower_path);
break;
}
n_layers++;
}
if (n_layers == 0) return 0;
mkdir("/upper", 0755);
mkdir("/newroot", 0755);
/* A single tmpfs holds both the writable upper dir and the overlay work
 * dir, so guest writes live in memory. */
if (mount("tmpfs", "/upper", "tmpfs", 0, NULL) != 0) {
fprintf(stderr, "init-oci: mount tmpfs upper: %s\n", strerror(errno));
return -1;
}
mkdir("/upper/upper", 0755);
mkdir("/upper/work", 0755);
char opts[1024];
int off = snprintf(opts, sizeof opts, "lowerdir=");
/* Highest-numbered layer is listed first; overlayfs treats the leftmost
 * lowerdir as the top-most layer. */
for (int i = n_layers - 1; i >= 0; i--) {
off += snprintf(opts + off, sizeof opts - off,
"/lower-%d%s", i, i > 0 ? ":" : "");
}
snprintf(opts + off, sizeof opts - off,
",upperdir=/upper/upper,workdir=/upper/work");
if (mount("overlay", "/newroot", "overlay", 0, opts) != 0) {
fprintf(stderr, "init-oci: mount overlay: %s\n", strerror(errno));
return -1;
}
/* Carry the already-mounted pseudo-filesystems into the new root. These
 * moves are best effort: their results are not checked. */
mount("/dev", "/newroot/dev", NULL, MS_MOVE, NULL);
mount("/proc", "/newroot/proc", NULL, MS_MOVE, NULL);
mount("/sys", "/newroot/sys", NULL, MS_MOVE, NULL);
/* Enter the new root, move it over "/", then chroot into it. */
if (chdir("/newroot") != 0) {
fprintf(stderr, "init-oci: chdir /newroot: %s\n", strerror(errno));
return -1;
}
if (mount(".", "/", NULL, MS_MOVE, NULL) != 0) {
fprintf(stderr, "init-oci: mount move /: %s\n", strerror(errno));
return -1;
}
if (chroot(".") != 0) {
fprintf(stderr, "init-oci: chroot: %s\n", strerror(errno));
return -1;
}
chdir("/");
/* Re-run /init from the overlay root with an empty environment. */
char *argv[] = { (char *)"/init", NULL };
char *envp[] = { NULL };
execve("/init", argv, envp);
/* Only reached if execve() failed. */
fprintf(stderr, "init-oci: execve /init (overlay): %s\n", strerror(errno));
return -1;
}
/*
 * Set the guest hostname from HOSTNAME_FILE.
 *
 * Trailing LF, CR and space characters are stripped. A missing, unreadable
 * or effectively empty file leaves the hostname unchanged. A sethostname()
 * failure is logged, not fatal.
 */
static void apply_hostname(void) {
    char name[128];
    ssize_t len;

    int fd = open(HOSTNAME_FILE, O_RDONLY | O_CLOEXEC);
    if (fd < 0) {
        return;
    }
    len = read(fd, name, sizeof(name) - 1);
    close(fd);
    if (len <= 0) {
        return;
    }
    name[len] = '\0';

    for (; len > 0; len--) {
        char tail = name[len - 1];
        if (tail != '\n' && tail != '\r' && tail != ' ') {
            break;
        }
        name[len - 1] = '\0';
    }
    if (len == 0) {
        return;
    }

    if (sethostname(name, len) != 0) {
        fprintf(stderr, "init-oci: sethostname(%s): %s\n", name, strerror(errno));
    }
}
/*
 * Mount data volumes listed in /.supermachine-volumes.
 *
 * The file holds one mount point per line (at most 16 are honoured; empty
 * lines are skipped). The volumes are taken to be the LAST n of the
 * consecutive /dev/vda, /dev/vdb, ... devices, in file order. Each is mounted
 * as ext4 with default flags, creating the mount point (and its parents) if
 * needed.
 *
 * If more volumes are requested than devices exist, nothing is mounted.
 * Individual mount failures are logged and the remaining volumes are still
 * attempted.
 */
static void mount_volumes(void) {
    enum { MAX_VOLUMES = 16 };

    char list[4096];
    int list_fd = open("/.supermachine-volumes", O_RDONLY | O_CLOEXEC);
    if (list_fd < 0) {
        return;
    }
    ssize_t got = read(list_fd, list, sizeof(list) - 1);
    close(list_fd);
    if (got <= 0) {
        return;
    }
    list[got] = '\0';

    /* Split the list into lines in place. */
    char *targets[MAX_VOLUMES];
    int n_targets = 0;
    for (char *line = list; *line != '\0' && n_targets < MAX_VOLUMES; ) {
        char *nl = strchr(line, '\n');
        if (nl != NULL) {
            *nl = '\0';
        }
        if (*line != '\0') {
            targets[n_targets++] = line;
        }
        if (nl == NULL) {
            break;
        }
        line = nl + 1;
    }
    if (n_targets == 0) {
        return;
    }

    /* Count consecutive virtio block devices starting at /dev/vda. */
    int n_devices = 0;
    while (n_devices < 26) {
        struct stat probe;
        char node[16];
        snprintf(node, sizeof node, "/dev/vd%c", 'a' + n_devices);
        if (stat(node, &probe) != 0) {
            break;
        }
        n_devices++;
    }

    if (n_targets > n_devices) {
        fprintf(stderr, "init-oci: %d volumes requested but only %d /dev/vd* devices\n",
                n_targets, n_devices);
        return;
    }

    const int first = n_devices - n_targets;
    for (int v = 0; v < n_targets; v++) {
        const char *where = targets[v];
        char node[16];
        snprintf(node, sizeof node, "/dev/vd%c", 'a' + first + v);

        if (mkdir(where, 0755) < 0 && errno != EEXIST) {
            /* Plain mkdir failed: create each missing ancestor, then retry. */
            char parent[4096];
            snprintf(parent, sizeof parent, "%s", where);
            char *slash = strchr(parent + 1, '/');
            while (slash != NULL) {
                *slash = '\0';
                mkdir(parent, 0755);
                *slash = '/';
                slash = strchr(slash + 1, '/');
            }
            mkdir(where, 0755);
        }

        if (mount(node, where, "ext4", 0, NULL) < 0) {
            fprintf(stderr, "init-oci: mount %s -> %s ext4: %s\n",
                    node, where, strerror(errno));
            continue;
        }
        fprintf(stderr, "init-oci: mounted %s -> %s (ext4, rw)\n", node, where);
    }
}
/*
 * Start /supermachine-agent as a background child, if the binary exists.
 *
 * The parent returns immediately after logging the child's pid; it does not
 * wait for the agent. A missing binary or a fork failure is logged and
 * otherwise ignored. If exec fails, the child logs and exits with 127.
 */
static void spawn_exec_agent(void) {
    static const char agent_path[] = "/supermachine-agent";

    struct stat info;
    if (stat(agent_path, &info) != 0) {
        fprintf(stderr, "exec-agent: /supermachine-agent missing — exec disabled\n");
        return;
    }

    pid_t child = fork();
    switch (child) {
    case -1:
        fprintf(stderr, "exec-agent: fork: %s\n", strerror(errno));
        return;
    case 0: {
        char *agent_argv[] = { (char *)"/supermachine-agent", NULL };
        execvp(agent_path, agent_argv);
        fprintf(stderr, "exec-agent: execvp: %s\n", strerror(errno));
        _exit(127);
    }
    default:
        break;
    }

    fprintf(stderr, "exec-agent: spawned pid=%d\n", child);
}
/*
 * Guest init entry point (runs as PID 1).
 *
 * Boot sequence:
 *   1. Unbuffer stdout/stderr and print a banner.
 *   2. Mount pseudo-filesystems; if layer disks are present, pivot into an
 *      overlay root (that path re-executes /init and does not return here).
 *   3. Create runtime files, seed the wall clock, set the hostname, and pull
 *      environment variables from the host.
 *   4. Load /supermachine-smpark.ko if present (failure is only logged).
 *   5. Start the exec agent, emit a heartbeat line, mount data volumes.
 *   6. Read the workload command and working directory, then hand over to
 *      supervise(), which runs the workload and never returns normally.
 */
int main(void) {
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);

    /* Length is derived from the literal; a hard-coded count that exceeds
     * the string would read and emit bytes past its end. */
    static const char hello[] = "init-oci: hello from pid 1\n";
    (void)write(1, hello, sizeof(hello) - 1);

    mount_pseudofs();
    if (try_pivot_to_overlay() < 0) die("pivot");
    ensure_runtime_files();
    seed_wall_clock();
    apply_hostname();
    fetch_and_set_env();

    /* Optional kernel module, loaded straight from the root filesystem. */
    {
        struct stat smpst;
        if (stat("/supermachine-smpark.ko", &smpst) == 0) {
            int fd = open("/supermachine-smpark.ko", O_RDONLY | O_CLOEXEC);
            if (fd >= 0) {
                long rc = syscall(__NR_finit_module, fd, "", 0);
                if (rc < 0)
                    fprintf(stderr,
                            "init-oci: load smpark.ko: %s\n",
                            strerror(errno));
                close(fd);
            }
        }
    }

    spawn_exec_agent();

    /* Marker line on stdout; presumably watched by the host to detect that
     * init got this far — TODO confirm. */
    {
        const char *m = "[SUPERMACHINE-INIT] heartbeat counter=1\n";
        write(1, m, strlen(m));
    }

    mount_volumes();

    /* Static so the 64 KiB command buffer is not placed on the stack. */
    static char buf[ARG_BUF];
    char *argv[MAX_ARGS];
    int argc = read_cmd(buf, argv, MAX_ARGS);
    if (argc < 1) die("read_cmd");

    /* Optional working directory for the workload. */
    int wfd = open(WD_FILE, O_RDONLY);
    if (wfd >= 0) {
        char wd[4096];
        ssize_t n = read(wfd, wd, sizeof(wd) - 1);
        close(wfd);
        if (n > 0) {
            wd[n] = 0;
            while (n > 0 && (wd[n-1] == '\n' || wd[n-1] == '\r')) wd[--n] = 0;
            if (chdir(wd) < 0)
                fprintf(stderr, "init-oci: chdir(%s): %s\n", wd, strerror(errno));
        }
    }

    fprintf(stderr, "init-oci: exec");
    for (int i = 0; i < argc; i++) fprintf(stderr, " %s", argv[i]);
    fprintf(stderr, "\n");

    return supervise(argv);
}
/* PID of the supervised workload; 0 while there is none. Written by
 * supervise() and read from signal handlers, hence volatile. */
static volatile pid_t g_child_pid = 0;

/*
 * Signal handler: relay `sig` to the workload process, if one is recorded.
 *
 * errno is saved and restored because kill() may fail (e.g. the child is
 * already gone) and the interrupted code may be about to inspect errno.
 */
static void forward_signal(int sig) {
    int saved_errno = errno;
    pid_t target = g_child_pid;

    if (target > 0) {
        kill(target, sig);
    }
    errno = saved_errno;
}
/*
 * SIGCHLD handler: collect every child that has already exited.
 *
 * Exit statuses are discarded, so code that needs a particular child's
 * status must collect it before this handler gets a chance to run.
 * errno is preserved across the call.
 */
static void reap_zombies(int sig) {
    (void)sig;
    int saved_errno = errno;

    /* Non-blocking: stop as soon as no exited child is left. */
    while (waitpid(-1, NULL, WNOHANG) > 0) {
        /* keep collecting */
    }
    errno = saved_errno;
}
/*
 * Run the workload as a child of PID 1 and look after it.
 *
 * argv: NULL-terminated command; argv[0] is resolved with execvp().
 *
 * Parent side:
 *   - Forwards signals 1..31 (except SIGKILL, SIGSTOP, SIGCHLD) to the child.
 *   - Writes the child's pid to /run/supermachine-workload.pid.
 *   - Reaps children synchronously until the workload itself has exited, so
 *     its exit status cannot be lost to the SIGCHLD handler. Only afterwards
 *     is reap_zombies() installed to keep collecting orphans.
 *   - Logs the result, sync()s, and parks forever; this function does not
 *     return in practice.
 *
 * Child side: restores default signal dispositions, becomes its own process
 * group leader, drops to the image user, redirects stdio, and execs. An exec
 * failure exits with 127.
 */
static int supervise(char **argv) {
    struct sigaction fwd = {0};
    fwd.sa_handler = forward_signal;
    fwd.sa_flags = SA_RESTART;
    for (int s = 1; s <= 31; s++) {
        if (s == SIGKILL || s == SIGSTOP || s == SIGCHLD) continue;
        sigaction(s, &fwd, NULL);
    }

    pid_t pid = fork();
    if (pid < 0) die("fork");
    if (pid == 0) {
        for (int s = 1; s <= 31; s++) signal(s, SIG_DFL);
        setpgid(0, 0);
        drop_to_image_user();
        if (redirect_child_stdio() != 0)
            report_child_exec_failure("redirect stdio");
        execvp(argv[0], argv);
        report_child_exec_failure(argv[0]);
        _exit(127);
    }
    g_child_pid = pid;
    /* Also set from the parent so the group exists whichever side runs first. */
    setpgid(pid, pid);

    {
        int fd = open("/run/supermachine-workload.pid",
                      O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
        if (fd >= 0) {
            char buf[32];
            int n = snprintf(buf, sizeof(buf), "%d\n", (int)pid);
            if (n > 0) {
                (void)write(fd, buf, n);
            }
            close(fd);
        }
    }

    /* Wait for ANY child: other children (exec agent, re-parented orphans)
     * are reaped as they exit, and the workload's status is captured here
     * rather than being discarded by an asynchronous handler. */
    int status = 0;
    int reaped = 0;
    for (;;) {
        int st = 0;
        pid_t r = waitpid(-1, &st, 0);
        if (r == pid) {
            status = st;
            reaped = 1;
            break;
        }
        if (r > 0) continue;
        if (errno == EINTR) continue;
        fprintf(stderr, "init-oci: waitpid: %s\n", strerror(errno));
        break;
    }
    if (reaped) {
        /* Stop forwarding signals to a pid that no longer belongs to us. */
        g_child_pid = 0;
    }

    /* From here on nobody needs an exit status: reap asynchronously. */
    struct sigaction chld = {0};
    chld.sa_handler = reap_zombies;
    chld.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    sigaction(SIGCHLD, &chld, NULL);
    /* Sweep once for children that exited before the handler was armed. */
    reap_zombies(SIGCHLD);

    int code;
    if (!reaped) {
        code = 1;
    } else if (WIFEXITED(status)) {
        code = WEXITSTATUS(status);
        fprintf(stderr, "init-oci: child exited %d\n", code);
    } else if (WIFSIGNALED(status)) {
        code = 128 + WTERMSIG(status);
        fprintf(stderr, "init-oci: child killed by signal %d\n",
                WTERMSIG(status));
    } else {
        code = 1;
    }

    sync();
    fprintf(stderr, "init-oci: parking PID 1 (exit=%d)\n", code);
    for (;;) pause();
}