bee_tui/bee_supervisor.rs
1//! Spawn + manage a child Bee node from inside bee-tui.
2//!
3//! When the operator configures `[bee]` in `config.toml` (or passes
4//! `--bee-bin` + `--bee-config`), bee-tui becomes the supervisor:
5//! launch Bee, redirect its stdout+stderr to a temp file the cockpit
6//! can tail, wait for the API to come up, then open the UI. On quit,
7//! send SIGTERM, wait briefly for a clean exit, escalate to SIGKILL
8//! if needed.
9//!
10//! ## Why this lives in its own module
11//!
12//! The cockpit was designed read-only: every other module assumes a
13//! running Bee on the other end of an HTTP client. Spawning a process
14//! and the lifecycle around it is a different category of concern —
15//! signals, file descriptors, exit codes, OS-specific quirks. Keeping
16//! it isolated lets the rest of `bee-tui` stay observer-shaped.
17//!
18//! ## Lifecycle (chosen behavior on Bee crash: variant B from spec)
19//!
20//! - `spawn` — fork+exec the binary, redirect log streams, set the
21//! process group id so SIGTERM-pgroup kills the whole tree.
//! - `wait_for_api` — poll the Bee API until the health probe succeeds,
//!   the child exits, or the timeout expires. The timeout is generous
//!   (default 30s) because Bee's first start can include chain-state
//!   catch-up.
//! - `status` — non-blocking peek at whether the child has exited.
//!   The cockpit calls this each Tick to surface "bee exited (code N)"
//!   in the top bar without blocking the event loop. No auto-restart.
28//! - `shutdown` — SIGTERM the pgroup, wait up to a grace period, then
29//! SIGKILL. Called explicitly from the App's quit path so we don't
30//! rely on `Drop` for the graceful case.
31//! - `Drop` — best-effort SIGKILL fallback for panics. Sync only.
32
33use std::path::{Path, PathBuf};
34use std::process::Stdio;
35use std::sync::Arc;
36use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
37
38use color_eyre::eyre::{Result, eyre};
39use tokio::io::{AsyncBufReadExt, BufReader};
40use tokio::process::{Child, Command};
41use tokio::sync::Mutex;
42
43use crate::bee_log_writer::BeeLogWriter;
44use crate::config::BeeLogsConfig;
45
/// Pause between two consecutive health probes in
/// [`BeeSupervisor::wait_for_api`]. Half a second keeps startup
/// feedback responsive without hammering `/health` while Bee is still
/// binding its sockets.
const HEALTH_POLL_INTERVAL: Duration = Duration::from_millis(500);

/// How long Bee is given to exit on its own after SIGTERM before the
/// supervisor escalates to SIGKILL. Used by
/// [`BeeSupervisor::shutdown_default`]. Kept deliberately unhurried:
/// Bee closes its database during a clean shutdown, and cutting that
/// short can force a recovery pass on the next start.
const DEFAULT_SHUTDOWN_GRACE: Duration = Duration::from_secs(5);
55
/// Point-in-time view of the supervised Bee process.
///
/// Produced by [`BeeSupervisor::status`] (and returned from
/// [`BeeSupervisor::shutdown`]) so the UI can render exit information
/// without holding the supervisor handle itself.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BeeStatus {
    /// The child has not exited yet.
    Running,
    /// The child terminated by itself and reported this exit code
    /// (`0` is the usual result of a normal quit).
    Exited(i32),
    /// The child was terminated by this signal number (Unix only).
    Signaled(i32),
    /// The exit state could not be determined; the payload carries the
    /// OS error (or a debug rendering of the raw status). Shown to the
    /// operator and treated as terminal.
    UnknownExit(String),
}

impl BeeStatus {
    /// `true` only for [`BeeStatus::Running`]; every other variant is
    /// terminal.
    pub fn is_running(&self) -> bool {
        *self == Self::Running
    }

    /// Compact, operator-facing description for the top bar.
    pub fn label(&self) -> String {
        match self {
            Self::Running => String::from("bee running"),
            // Exit code 0 gets its own wording so a normal quit does
            // not look like a failure.
            Self::Exited(0) => String::from("bee exited cleanly"),
            Self::Exited(code) => format!("bee exited (code {code})"),
            Self::Signaled(sig) => format!("bee killed (signal {sig})"),
            Self::UnknownExit(msg) => format!("bee exited: {msg}"),
        }
    }
}
88
89/// Owns a child Bee process and the file its stdio is captured to.
90pub struct BeeSupervisor {
91 child: Child,
92 /// Process group id, set via `setpgid(0, 0)` in `pre_exec`. Used
93 /// by `kill(-pgid, ...)` to send a signal to the whole tree —
94 /// captures any helpers Bee might spawn (libp2p workers, etc.).
95 /// `None` on platforms where we couldn't set the pgid.
96 pgid: Option<i32>,
97 /// Path to the file capturing Bee's stdout + stderr. The bottom
98 /// pane (increment 3) tails this file.
99 log_path: PathBuf,
100 /// Wall-clock when [`spawn`] returned. Used for "bee uptime"
101 /// displays and for distinguishing "bee already had a chance to
102 /// die" from cold-start latency in tests.
103 started_at: Instant,
104}
105
106impl BeeSupervisor {
107 /// Spawn `bin start --config <config>` as a child process. Stdout
108 /// and stderr are piped through a rotating writer (governed by
109 /// `log_cfg`) so a long-running node can't fill `$TMPDIR`. The
110 /// child runs in its own process group so we can SIGTERM the
111 /// whole tree at quit without leaking helpers.
112 ///
113 /// Errors:
114 /// - `bin` doesn't exist or isn't executable
115 /// - the log file can't be created
116 /// - the OS rejects the spawn (rare; usually fork resource limits)
117 pub fn spawn(bin: &Path, config: &Path, log_cfg: BeeLogsConfig) -> Result<Self> {
118 if !bin.exists() {
119 return Err(eyre!(
120 "bee binary not found at {:?} — check [bee].bin / --bee-bin",
121 bin
122 ));
123 }
124 if !config.exists() {
125 return Err(eyre!(
126 "bee config not found at {:?} — check [bee].config / --bee-config",
127 config
128 ));
129 }
130
131 let log_path = std::env::temp_dir().join(format!(
132 "bee-tui-spawned-{}.log",
133 SystemTime::now()
134 .duration_since(UNIX_EPOCH)
135 .map(|d| d.as_secs())
136 .unwrap_or(0)
137 ));
138
139 // Open the rotating writer up-front so a configuration error
140 // (bad permissions, full disk) fails fast — *before* spawning
141 // Bee — rather than mid-run when the first log line arrives.
142 let writer =
143 BeeLogWriter::open(log_path.clone(), log_cfg.rotate_size_mb, log_cfg.keep_files)
144 .map_err(|e| {
145 eyre!(
146 "failed to open rotating log writer at {log_path:?}: {e} \
147 (check $TMPDIR is writable and has free space)"
148 )
149 })?;
150 let writer = Arc::new(Mutex::new(writer));
151
152 let mut cmd = Command::new(bin);
153 cmd.arg("start")
154 .arg("--config")
155 .arg(config)
156 .stdout(Stdio::piped())
157 .stderr(Stdio::piped())
158 .stdin(Stdio::null())
159 // kill_on_drop is a backstop — Drop fires SIGKILL at the
160 // direct child even if our explicit shutdown didn't run
161 // (panic, abrupt unwind). It does NOT kill the pgroup;
162 // that's handled separately in our Drop impl.
163 .kill_on_drop(true);
164
165 // Put Bee in its own process group so a SIGTERM to -pgid
166 // reaches every helper it might fork. Without this, killing
167 // bee-tui leaves Bee orphaned to PID 1.
168 #[cfg(unix)]
169 {
170 // SAFETY: setpgid(0, 0) is async-signal-safe and standard
171 // post-fork pre-exec usage; no allocator or panic between
172 // fork and exec.
173 unsafe {
174 cmd.pre_exec(|| {
175 if libc::setpgid(0, 0) == -1 {
176 return Err(std::io::Error::last_os_error());
177 }
178 Ok(())
179 });
180 }
181 }
182
183 let mut child = cmd.spawn().map_err(|e| {
184 eyre!(
185 "failed to spawn {:?}: {e} (check the binary is executable)",
186 bin
187 )
188 })?;
189
190 let pgid = child.id().map(|pid| pid as i32);
191
192 // Pump stdout and stderr through the rotating writer. Each
193 // pipe gets its own task so the kernel pipe buffers never
194 // back-pressure Bee. Lines from both streams interleave in
195 // chronological order via the shared mutex; lock contention
196 // is negligible (one log line per acquisition).
197 if let Some(stdout) = child.stdout.take() {
198 spawn_pipe_pump(stdout, writer.clone(), "stdout");
199 }
200 if let Some(stderr) = child.stderr.take() {
201 spawn_pipe_pump(stderr, writer.clone(), "stderr");
202 }
203
204 Ok(Self {
205 child,
206 pgid,
207 log_path,
208 started_at: Instant::now(),
209 })
210 }
211
212 /// Path to the captured log file. Lives in `$TMPDIR`; survives
213 /// the supervisor's lifetime so a post-mortem operator can still
214 /// read it after bee-tui exits.
215 pub fn log_path(&self) -> &Path {
216 &self.log_path
217 }
218
219 /// Process id of the child, if the OS reported one.
220 pub fn pid(&self) -> Option<u32> {
221 self.child.id()
222 }
223
224 /// Wall-clock time since [`spawn`] returned.
225 pub fn uptime(&self) -> Duration {
226 self.started_at.elapsed()
227 }
228
229 /// Non-blocking check of the child's exit state. Cheap to call
230 /// every Tick — the OS keeps a status word for terminated
231 /// children that `try_wait` reads without blocking.
232 pub fn status(&mut self) -> BeeStatus {
233 match self.child.try_wait() {
234 Ok(None) => BeeStatus::Running,
235 Ok(Some(s)) => exit_status_to_bee_status(&s),
236 Err(e) => BeeStatus::UnknownExit(e.to_string()),
237 }
238 }
239
240 /// Poll the Bee node at `base_url` until `/health` returns
241 /// successfully, the child exits, or the timeout elapses.
242 /// Returns `Ok(())` only on a successful health response.
243 /// Reuses `bee::Client::ping` so the readiness probe goes
244 /// through the exact same code path the cockpit uses afterwards.
245 pub async fn wait_for_api(&mut self, base_url: &str, timeout: Duration) -> Result<()> {
246 let client = bee::Client::new(base_url)
247 .map_err(|e| eyre!("invalid bee endpoint {base_url}: {e}"))?;
248 let deadline = Instant::now() + timeout;
249 loop {
250 // If the child exited before /health came up, fail fast
251 // with the exit reason rather than waiting out the full
252 // timeout — operators see *why* immediately.
253 match self.status() {
254 BeeStatus::Running => {}
255 terminal => {
256 return Err(eyre!(
257 "{} before its API became reachable; tail {} for the cause",
258 terminal.label(),
259 self.log_path.display()
260 ));
261 }
262 }
263 if client.ping().await.is_ok() {
264 return Ok(());
265 }
266 if Instant::now() >= deadline {
267 return Err(eyre!(
268 "bee API at {base_url} did not respond within {timeout:?}; tail {} for the cause",
269 self.log_path.display()
270 ));
271 }
272 tokio::time::sleep(HEALTH_POLL_INTERVAL).await;
273 }
274 }
275
276 /// Graceful shutdown: SIGTERM the pgroup, wait up to `grace` for
277 /// clean exit, escalate to SIGKILL. Returns the resulting status.
278 /// Idempotent — calling on an already-exited child is a no-op
279 /// past the SIGTERM (which the kernel rejects with ESRCH).
280 pub async fn shutdown(mut self, grace: Duration) -> BeeStatus {
281 send_sigterm_pgroup(self.pgid);
282 if let Ok(Ok(s)) = tokio::time::timeout(grace, self.child.wait()).await {
283 return exit_status_to_bee_status(&s);
284 }
285 // Grace expired or wait errored — escalate.
286 let _ = self.child.start_kill();
287 match self.child.wait().await {
288 Ok(s) => exit_status_to_bee_status(&s),
289 Err(e) => BeeStatus::UnknownExit(e.to_string()),
290 }
291 }
292
293 /// Convenience for `shutdown` with the default grace period.
294 pub async fn shutdown_default(self) -> BeeStatus {
295 self.shutdown(DEFAULT_SHUTDOWN_GRACE).await
296 }
297}
298
299impl Drop for BeeSupervisor {
300 fn drop(&mut self) {
301 // Best-effort SIGKILL to the pgroup as a last resort. The
302 // cockpit's normal quit path calls `shutdown` which already
303 // sent SIGTERM and waited for clean exit, so this only fires
304 // on panic or abrupt drop.
305 send_sigkill_pgroup(self.pgid);
306 }
307}
308
309/// Read newline-delimited bytes from `pipe` and forward each line
310/// through `writer`. Exits when the pipe returns EOF (Bee closed
311/// the stream — usually because it died) or on an unrecoverable
312/// I/O error. Tagged with `stream_label` for diagnostics.
313fn spawn_pipe_pump<R>(pipe: R, writer: Arc<Mutex<BeeLogWriter>>, stream_label: &'static str)
314where
315 R: tokio::io::AsyncRead + Unpin + Send + 'static,
316{
317 tokio::spawn(async move {
318 let mut reader = BufReader::new(pipe);
319 let mut line = String::new();
320 loop {
321 line.clear();
322 match reader.read_line(&mut line).await {
323 Ok(0) => {
324 tracing::debug!("bee-supervisor: {stream_label} EOF");
325 break;
326 }
327 Ok(_) => {
328 // `read_line` keeps the trailing newline; the
329 // writer adds one of its own, so trim it here.
330 let trimmed = line.trim_end_matches(['\n', '\r']);
331 let mut w = writer.lock().await;
332 if let Err(e) = w.write_line(trimmed.as_bytes()) {
333 tracing::warn!(
334 "bee-supervisor: rotating writer failed on {stream_label}: {e}"
335 );
336 break;
337 }
338 }
339 Err(e) => {
340 tracing::warn!("bee-supervisor: {stream_label} read error: {e}");
341 break;
342 }
343 }
344 }
345 });
346}
347
348/// Translate a `std::process::ExitStatus` into the cockpit's
349/// platform-agnostic [`BeeStatus`]. Pure — kept separate so tests
350/// can drive it without spawning real children.
351fn exit_status_to_bee_status(s: &std::process::ExitStatus) -> BeeStatus {
352 if let Some(code) = s.code() {
353 return BeeStatus::Exited(code);
354 }
355 #[cfg(unix)]
356 {
357 use std::os::unix::process::ExitStatusExt;
358 if let Some(sig) = s.signal() {
359 return BeeStatus::Signaled(sig);
360 }
361 }
362 BeeStatus::UnknownExit(format!("{s:?}"))
363}
364
365#[cfg(unix)]
366fn send_sigterm_pgroup(pgid: Option<i32>) {
367 if let Some(pgid) = pgid {
368 // SAFETY: kill(2) is async-signal-safe; passing -pgid signals
369 // every process in the group. ESRCH (already dead) is fine.
370 unsafe {
371 libc::kill(-pgid, libc::SIGTERM);
372 }
373 }
374}
375
376#[cfg(not(unix))]
377fn send_sigterm_pgroup(_pgid: Option<i32>) {
378 // Windows: rely on tokio's `kill_on_drop` + `start_kill`. Process
379 // groups don't translate cleanly; this is acceptable because
380 // bee-tui's primary deployment target is Unix.
381}
382
383#[cfg(unix)]
384fn send_sigkill_pgroup(pgid: Option<i32>) {
385 if let Some(pgid) = pgid {
386 // SAFETY: same as SIGTERM — async-signal-safe, ESRCH ok.
387 unsafe {
388 libc::kill(-pgid, libc::SIGKILL);
389 }
390 }
391}
392
393#[cfg(not(unix))]
394fn send_sigkill_pgroup(_pgid: Option<i32>) {}
395
#[cfg(test)]
mod tests {
    use super::*;
    // The raw wait-status constructors exist only on Unix; gate them
    // so the platform-independent tests still build elsewhere.
    #[cfg(unix)]
    use std::os::unix::process::ExitStatusExt;
    #[cfg(unix)]
    use std::process::ExitStatus;

    #[test]
    fn bee_status_label_running() {
        assert_eq!(BeeStatus::Running.label(), "bee running");
    }

    #[test]
    fn bee_status_label_exited_zero() {
        assert_eq!(BeeStatus::Exited(0).label(), "bee exited cleanly");
    }

    #[test]
    fn bee_status_label_exited_nonzero() {
        // A non-zero exit code is the most operator-relevant case —
        // surface the code verbatim so they can match it against
        // Bee's own exit-code conventions.
        assert_eq!(BeeStatus::Exited(2).label(), "bee exited (code 2)");
    }

    #[test]
    fn bee_status_label_signaled() {
        assert_eq!(BeeStatus::Signaled(15).label(), "bee killed (signal 15)");
    }

    #[test]
    fn bee_status_label_unknown_exit() {
        assert_eq!(
            BeeStatus::UnknownExit("boom".into()).label(),
            "bee exited: boom"
        );
    }

    #[test]
    fn bee_status_is_running_only_for_running() {
        assert!(BeeStatus::Running.is_running());
        assert!(!BeeStatus::Exited(0).is_running());
        assert!(!BeeStatus::Exited(1).is_running());
        assert!(!BeeStatus::Signaled(9).is_running());
        assert!(!BeeStatus::UnknownExit("oops".into()).is_running());
    }

    #[cfg(unix)]
    #[test]
    fn exit_status_clean_exit_maps_to_exited_zero() {
        let s = ExitStatus::from_raw(0);
        assert_eq!(exit_status_to_bee_status(&s), BeeStatus::Exited(0));
    }

    #[cfg(unix)]
    #[test]
    fn exit_status_nonzero_exit_preserves_code() {
        // 0x0200 in Unix wait status = exit(2), so the high byte
        // carries the code. ExitStatus::from_raw uses raw wait
        // status; left-shift 8 to encode an exit code.
        let raw = 2_i32 << 8;
        let s = ExitStatus::from_raw(raw);
        assert_eq!(exit_status_to_bee_status(&s), BeeStatus::Exited(2));
    }

    #[cfg(unix)]
    #[test]
    fn exit_status_signaled_maps_to_signaled() {
        // Wait-status low 7 bits hold the signal; 15 = SIGTERM.
        let s = ExitStatus::from_raw(15);
        assert_eq!(exit_status_to_bee_status(&s), BeeStatus::Signaled(15));
    }

    #[tokio::test]
    async fn spawn_rejects_missing_binary() {
        let bogus = Path::new("/definitely/does/not/exist/bee");
        let cfg = Path::new("/tmp"); // never inspected: the binary check comes first
        let err = BeeSupervisor::spawn(bogus, cfg, BeeLogsConfig::default())
            .err()
            .expect("missing binary must error");
        assert!(
            err.to_string().contains("bee binary not found"),
            "expected friendly error, got: {err}"
        );
    }

    #[tokio::test]
    async fn spawn_rejects_missing_config() {
        // /bin/true exists on most Unix boxes; we just need a real
        // executable here. The config path is the one we expect to
        // be flagged.
        let real = Path::new("/bin/true");
        let bogus_cfg = Path::new("/definitely/does/not/exist/bee.yaml");
        if !real.exists() {
            return; // Skip if /bin/true isn't here (rare).
        }
        let err = BeeSupervisor::spawn(real, bogus_cfg, BeeLogsConfig::default())
            .err()
            .expect("missing config must error");
        assert!(
            err.to_string().contains("bee config not found"),
            "expected friendly error, got: {err}"
        );
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn spawn_then_shutdown_yields_terminal_status() {
        // End-to-end pass over fork+exec, pgid capture and shutdown.
        // `spawn` always appends `start --config <path>`, which
        // /bin/sleep rejects, so the child may exit by itself before
        // our SIGTERM lands. Either outcome is fine; what matters is
        // that spawn succeeds and shutdown reports a real exit.
        let bin = Path::new("/bin/sleep");
        if !bin.exists() {
            return; // Skip where /bin/sleep is unavailable.
        }
        // Any existing path passes the config existence check.
        let cfg = std::env::temp_dir();
        let supervisor = BeeSupervisor::spawn(bin, &cfg, BeeLogsConfig::default())
            .expect("spawn must succeed when both paths exist");
        assert!(
            supervisor.pid().is_some(),
            "a freshly spawned child must report a pid"
        );
        let log_path = supervisor.log_path().to_path_buf();

        let status = supervisor.shutdown(Duration::from_secs(2)).await;
        assert!(
            matches!(status, BeeStatus::Exited(_) | BeeStatus::Signaled(_)),
            "expected a concrete exit status, got: {status:?}"
        );

        // Best-effort cleanup of the capture file this test created.
        let _ = std::fs::remove_file(log_path);
    }
}
508}