Skip to main content

edgeguard/
supervisor.rs

1//! Co-process supervisor.
2//!
3//! When EdgeGuard is run with `--wrap "<command>"`, it launches the user's app as a
4//! child process and acts as a tiny init for the container: it restarts the child on
5//! crash, and on shutdown forwards a termination signal to the child before exiting.
6//! The child is told to listen on `APP_PORT` via the `PORT` env var (the convention
7//! most web frameworks follow), while EdgeGuard itself binds the public `$PORT`.
8//!
9//! The signal/process-group plumbing is Unix-specific; on Windows we fall back to a
10//! plain child kill (no process groups, no POSIX signals).
11
12use std::time::Duration;
13use tokio::process::Command;
14use tokio::sync::watch;
15use tracing::{error, info, warn};
16
17/// Run the supervised child until `shutdown` flips to true.
18pub async fn run(cmd: String, app_port: u16, mut shutdown: watch::Receiver<bool>) {
19    let mut backoff = Duration::from_millis(500);
20
21    loop {
22        if *shutdown.borrow() {
23            break;
24        }
25
26        info!(command = %cmd, app_port, "starting wrapped app");
27        let mut command = build_command(&cmd, app_port);
28        let mut child = match command.spawn() {
29            Ok(c) => c,
30            Err(e) => {
31                error!(error = %e, "failed to spawn wrapped app; retrying");
32                if wait_or_shutdown(&mut shutdown, backoff).await {
33                    break;
34                }
35                backoff = (backoff * 2).min(Duration::from_secs(10));
36                continue;
37            }
38        };
39
40        let pid = child.id();
41
42        tokio::select! {
43            status = child.wait() => {
44                if *shutdown.borrow() {
45                    break;
46                }
47                match status {
48                    Ok(s) => warn!(?s, "wrapped app exited; restarting"),
49                    Err(e) => error!(error = %e, "error waiting on wrapped app; restarting"),
50                }
51                // Reap any stragglers left in the child's process group (Unix only).
52                reap_group(pid);
53                if wait_or_shutdown(&mut shutdown, backoff).await {
54                    break;
55                }
56                backoff = (backoff * 2).min(Duration::from_secs(10));
57            }
58            _ = shutdown.changed() => {
59                info!("shutdown requested; terminating wrapped app");
60                terminate(&mut child, pid).await;
61                break;
62            }
63        }
64    }
65
66    info!("supervisor stopped");
67}
68
69/// Build the child `Command`. On Unix the app runs under `sh -c` in its own session
70/// (so the whole process tree can be signaled); on Windows it runs under `cmd /C`.
71#[cfg(unix)]
72fn build_command(cmd: &str, app_port: u16) -> Command {
73    let mut command = Command::new("sh");
74    command
75        .arg("-c")
76        .arg(cmd)
77        .env("PORT", app_port.to_string())
78        .env("HOST", "127.0.0.1")
79        .kill_on_drop(true);
80    // Put the child in its own process group (leader pid == child pid) so we can
81    // signal the entire tree — `sh -c` may fork the real app as a grandchild.
82    unsafe {
83        command.pre_exec(|| {
84            if libc::setsid() == -1 {
85                return Err(std::io::Error::last_os_error());
86            }
87            Ok(())
88        });
89    }
90    command
91}
92
/// Assemble the `Command` used to launch the wrapped app on Windows.
///
/// The user's command line is handed to `cmd /C`, with `PORT` set to `app_port` and
/// `HOST` set to `127.0.0.1` in the child's environment. The child is killed if its
/// handle is dropped.
#[cfg(windows)]
fn build_command(cmd: &str, app_port: u16) -> Command {
    let port = app_port.to_string();

    let mut shell = Command::new("cmd");
    shell.arg("/C");
    shell.arg(cmd);
    shell.env("PORT", port);
    shell.env("HOST", "127.0.0.1");
    shell.kill_on_drop(true);

    shell
}
104
105/// Sleep for `dur`, returning early (true) if a shutdown is requested meanwhile.
106async fn wait_or_shutdown(shutdown: &mut watch::Receiver<bool>, dur: Duration) -> bool {
107    tokio::select! {
108        _ = tokio::time::sleep(dur) => *shutdown.borrow(),
109        _ = shutdown.changed() => true,
110    }
111}
112
113/// After an unexpected exit, force-kill any stragglers left in the child's process
114/// group. No-op on Windows, where `kill_on_drop` already reaps the direct child.
115#[cfg(unix)]
116fn reap_group(pid: Option<u32>) {
117    if let Some(pid) = pid {
118        unsafe {
119            libc::kill(-(pid as i32), libc::SIGKILL);
120        }
121    }
122}
123
/// Windows stand-in for the Unix process-group reaper: intentionally does nothing.
#[cfg(windows)]
fn reap_group(_pid: Option<u32>) {
    // This build never sets up a process group for the child, so there is nothing
    // to signal here.
}
126
127/// Gracefully stop the child, escalating to a hard kill if it lingers.
128///
129/// On Unix this sends SIGTERM to the child's process group, waits up to 10s, then
130/// SIGKILLs the group. Windows has no POSIX signals, so we issue a single kill and
131/// wait for it to land.
132#[cfg(unix)]
133async fn terminate(child: &mut tokio::process::Child, pid: Option<u32>) {
134    if let Some(pid) = pid {
135        // Negative pid targets the process group (leader == pid), so the real app
136        // running under `sh -c` is signaled too, not just the shell.
137        unsafe {
138            libc::kill(-(pid as i32), libc::SIGTERM);
139        }
140    }
141    match tokio::time::timeout(Duration::from_secs(10), child.wait()).await {
142        Ok(Ok(s)) => info!(?s, "wrapped app exited after SIGTERM"),
143        _ => {
144            warn!("wrapped app did not exit in time; sending SIGKILL");
145            if let Some(pid) = pid {
146                unsafe {
147                    libc::kill(-(pid as i32), libc::SIGKILL);
148                }
149            }
150            let _ = child.start_kill();
151            let _ = child.wait().await;
152        }
153    }
154}
155
/// Stop the child on Windows.
///
/// Issues a single kill through the child handle (any error from that call is
/// ignored), then waits up to 10s for the child to exit and logs the outcome. The
/// `_pid` argument is unused here.
#[cfg(windows)]
async fn terminate(child: &mut tokio::process::Child, _pid: Option<u32>) {
    let _ = child.start_kill();

    let grace = Duration::from_secs(10);
    let outcome = tokio::time::timeout(grace, child.wait()).await;

    if let Ok(Ok(s)) = outcome {
        info!(?s, "wrapped app exited");
    } else {
        warn!("wrapped app did not exit in time");
    }
}