Skip to main content

oxios_kernel/
daemon.rs

1//! Daemon lifecycle management — PID file, start/stop, system service install.
2//!
3//! On macOS: launchd (`~/Library/LaunchAgents/com.a7garden.oxios.plist`)
4//! On Linux: systemd (`/etc/systemd/system/oxiosd.service`)
5
6use anyhow::{Context, Result};
7use std::path::{Path, PathBuf};
8
/// Daemon status.
///
/// Derives `PartialEq`/`Eq` so a status can be compared directly (for
/// example in tests or `==` checks) instead of only via `matches!`.
/// The [`std::fmt::Display`] impl renders the short human-readable form:
/// `running (PID n)`, `stale (PID n dead)` or `stopped`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DaemonStatus {
    /// Daemon is running.
    Running {
        /// Process ID.
        pid: u32,
    },
    /// PID file exists but process is dead (stale).
    Stale {
        /// Process ID of the dead process.
        pid: u32,
    },
    /// Daemon is not running.
    Stopped,
}

impl std::fmt::Display for DaemonStatus {
    /// Write the one-line human-readable description of this status.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Running { pid } => write!(f, "running (PID {pid})"),
            Self::Stale { pid } => write!(f, "stale (PID {pid} dead)"),
            Self::Stopped => f.write_str("stopped"),
        }
    }
}
35
/// Manages the oxios background daemon.
///
/// Holds only the two filesystem locations the manager works with; it owns
/// no process handle, so it is cheap to clone and safe to print with `{:?}`.
#[derive(Debug, Clone)]
pub struct DaemonManager {
    /// Path of the file in which the daemon's process ID is recorded.
    pid_file: PathBuf,
    /// Directory that receives the daemon's log files.
    log_dir: PathBuf,
}
41
42impl DaemonManager {
43    /// Create a daemon manager from config paths.
44    pub fn new(pid_file: &str, log_dir: &str) -> Self {
45        Self {
46            pid_file: crate::config::expand_home(pid_file),
47            log_dir: crate::config::expand_home(log_dir),
48        }
49    }
50
51    /// Check daemon status by reading the PID file.
52    pub fn status(&self) -> DaemonStatus {
53        match self.read_pid() {
54            Some(pid) => {
55                if self.is_alive(pid) {
56                    DaemonStatus::Running { pid }
57                } else {
58                    DaemonStatus::Stale { pid }
59                }
60            }
61            None => DaemonStatus::Stopped,
62        }
63    }
64
65    /// Start the daemon in the background and wait for it to begin accepting
66    /// connections on `port` (RFC-024 SP4: verifies the listener came up so
67    /// a port-bind failure is reported immediately instead of masked by a
68    /// `started` message that never resolves).
69    pub fn start(&self, config_path: &Path, port: u16) -> Result<()> {
70        match self.status() {
71            DaemonStatus::Running { pid } => {
72                anyhow::bail!("oxios is already running (PID {pid})");
73            }
74            DaemonStatus::Stale { .. } => {
75                self.cleanup()?;
76            }
77            DaemonStatus::Stopped => {}
78        }
79
80        // Pre-spawn port guard: catches an orphaned oxios process that still
81        // holds the port even though the pidfile is stale or missing (e.g. a
82        // prior `oxios stop` removed the pidfile but the process refused to
83        // die). Without this the spawned daemon's bind fails silently while
84        // the post-spawn readiness probe connects to the *old* listener and
85        // reports success — leaving the broken daemon running undetected.
86        if self.port_in_use(port) {
87            anyhow::bail!(
88                "port {port} is already in use — another oxios instance is \
89                 likely still running. Run `oxios stop`, or find and kill the \
90                 process with `lsof -i :{port}` then retry."
91            );
92        }
93
94        // Ensure log directory exists
95        std::fs::create_dir_all(&self.log_dir).context("failed to create log directory")?;
96
97        let log_file = self.log_dir.join("oxios.log");
98        let exe = std::env::current_exe().context("failed to locate oxios binary")?;
99
100        let child = std::process::Command::new(&exe)
101            .arg("--foreground")
102            .arg("--config")
103            .arg(config_path)
104            .stdout(std::fs::File::create(&log_file)?)
105            .stderr(std::fs::File::create(&log_file)?)
106            .spawn()
107            .context("failed to spawn oxios daemon")?;
108
109        let pid = child.id();
110        self.write_pid(pid)?;
111
112        println!("⬡ oxios started (PID {pid})");
113        println!("  Logs: {}", log_file.display());
114        println!("  Dashboard: http://127.0.0.1:{port}");
115
116        // RFC-024 SP4: verify the daemon is actually accepting connections.
117        // A misconfigured bind (TIME_WAIT, port in use) used to be invisible
118        // here — the user saw `started` but `curl` got connection refused.
119        match self.wait_until_listening(port, std::time::Duration::from_secs(15)) {
120            Ok(()) => println!("  Status:   ready (listening on :{port})"),
121            Err(_) => {
122                // The spawned daemon never accepted a connection — almost
123                // always a fatal startup error (web UI unavailable, config
124                // problem) or a bind failure we failed to anticipate.
125                // Surface the log tail so the user sees *why* instead of a
126                // misleading "started", and fail the start.
127                println!("  Status:   FAILED to start (no listener on :{port} within 15s)");
128                let log_path = self.log_dir.join("oxios.log");
129                if let Ok(content) = std::fs::read_to_string(&log_path) {
130                    let lines: Vec<&str> = content.lines().collect();
131                    let start = lines.len().saturating_sub(30);
132                    if start < lines.len() {
133                        println!("  ── recent log (last {} lines) ──", lines.len() - start);
134                        for line in &lines[start..] {
135                            println!("  {line}");
136                        }
137                    }
138                }
139                println!("  Full log: {}", log_path.display());
140                anyhow::bail!(
141                    "daemon failed to start listening on :{port} \
142                     (see the log above and {})",
143                    log_path.display()
144                );
145            }
146        }
147        Ok(())
148    }
149
150    /// Poll `127.0.0.1:port` until a TCP connect succeeds or `timeout` elapses.
151    fn wait_until_listening(&self, port: u16, timeout: std::time::Duration) -> Result<()> {
152        use std::net::ToSocketAddrs;
153        let addr = format!("127.0.0.1:{port}")
154            .to_socket_addrs()?
155            .next()
156            .ok_or_else(|| anyhow::anyhow!("invalid bind address 127.0.0.1:{port}"))?;
157        let start = std::time::Instant::now();
158        let interval = std::time::Duration::from_millis(200);
159        while start.elapsed() < timeout {
160            if std::net::TcpStream::connect_timeout(&addr, interval).is_ok() {
161                return Ok(());
162            }
163            std::thread::sleep(interval);
164        }
165        anyhow::bail!("daemon did not start listening on :{port} within {timeout:?}")
166    }
167
168    /// Whether anything is currently accepting connections on `127.0.0.1:port`.
169    ///
170    /// Pre-spawn guard used by [`start`](Self::start) to detect an orphaned
171    /// daemon that escaped the pidfile — the pidfile was removed but the
172    /// process kept the port.
173    fn port_in_use(&self, port: u16) -> bool {
174        use std::net::{TcpStream, ToSocketAddrs};
175        let Some(addr) = format!("127.0.0.1:{port}")
176            .to_socket_addrs()
177            .ok()
178            .and_then(|mut a| a.next())
179        else {
180            return false;
181        };
182        TcpStream::connect_timeout(&addr, std::time::Duration::from_millis(200)).is_ok()
183    }
184
185    /// Stop the daemon by sending SIGTERM.
186    pub fn stop(&self) -> Result<()> {
187        match self.status() {
188            DaemonStatus::Running { pid } => {
189                #[cfg(unix)]
190                {
191                    let ret = unsafe { libc::kill(pid as i32, libc::SIGTERM) };
192                    if ret != 0 {
193                        anyhow::bail!("failed to send SIGTERM to PID {pid}");
194                    }
195                }
196                #[cfg(not(unix))]
197                {
198                    // On non-Unix, just kill the process
199                    let _ = std::process::Command::new("taskkill")
200                        .args(["/PID", &pid.to_string(), "/F"])
201                        .output();
202                }
203
204                // Wait briefly for process to die
205                for _ in 0..10 {
206                    std::thread::sleep(std::time::Duration::from_millis(200));
207                    if !self.is_alive(pid) {
208                        break;
209                    }
210                }
211
212                self.cleanup()?;
213                println!("⬡ oxios stopped");
214                Ok(())
215            }
216            DaemonStatus::Stale { .. } => {
217                self.cleanup()?;
218                println!("⬡ cleaned up stale PID file");
219                Ok(())
220            }
221            DaemonStatus::Stopped => {
222                println!("⬡ oxios is not running");
223                Ok(())
224            }
225        }
226    }
227
228    /// Restart the daemon.
229    pub fn restart(&self, config_path: &Path, port: u16) -> Result<()> {
230        if matches!(self.status(), DaemonStatus::Running { .. }) {
231            self.stop()?;
232            std::thread::sleep(std::time::Duration::from_millis(500));
233        }
234        self.start(config_path, port)
235    }
236
237    /// Install as a system service (launchd on macOS, systemd on Linux).
238    pub fn install_service(&self) -> Result<()> {
239        let exe = std::env::current_exe().context("failed to locate oxios binary")?;
240
241        #[cfg(target_os = "macos")]
242        {
243            let plist_dir = dirs::home_dir()
244                .map(|h| h.join("Library/LaunchAgents"))
245                .context("failed to locate LaunchAgents directory")?;
246            std::fs::create_dir_all(&plist_dir)?;
247            let plist_path = plist_dir.join("com.a7garden.oxios.plist");
248
249            let home = dirs::home_dir().context("failed to get HOME")?;
250            let log_path = self.log_dir.join("oxiosd.log");
251
252            let plist = format!(
253                r#"<?xml version="1.0" encoding="UTF-8"?>
254<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
255<plist version="1.0">
256<dict>
257    <key>Label</key>
258    <string>com.a7garden.oxios</string>
259    <key>ProgramArguments</key>
260    <array>
261        <string>{exe}</string>
262        <string>--foreground</string>
263    </array>
264    <key>RunAtLoad</key>
265    <true/>
266    <key>KeepAlive</key>
267    <true/>
268    <key>StandardOutPath</key>
269    <string>{log}</string>
270    <key>StandardErrorPath</key>
271    <string>{log}</string>
272    <key>WorkingDirectory</key>
273    <string>{home}</string>
274</dict>
275</plist>
276"#,
277                exe = escape_xml(&exe.display().to_string()),
278                log = escape_xml(&log_path.display().to_string()),
279                home = escape_xml(&home.display().to_string()),
280            );
281
282            std::fs::write(&plist_path, &plist)?;
283            println!("✓ Installed launchd service");
284            println!("  {}", plist_path.display());
285            println!();
286            println!("  Start with:   launchctl load {}", plist_path.display());
287            println!("  Stop with:    launchctl unload {}", plist_path.display());
288            println!("  Or simply:    oxios start / oxios stop");
289        }
290
291        #[cfg(target_os = "linux")]
292        {
293            let unit_dir = PathBuf::from("/etc/systemd/system");
294            let unit_path = unit_dir.join("oxiosd.service");
295
296            // Validate the binary path before embedding it in ExecStart. systemd
297            // ExecStart parsing has its own quoting rules; rather than implement
298            // full escaping, refuse paths containing shell/systemd metacharacters.
299            let exe_str = exe.display().to_string();
300            if exe_str.chars().any(|c| {
301                matches!(
302                    c,
303                    '"' | '\''
304                        | '\\'
305                        | '$'
306                        | '`'
307                        | ';'
308                        | '&'
309                        | '|'
310                        | '*'
311                        | '?'
312                        | '<'
313                        | '>'
314                        | '('
315                        | ')'
316                )
317            }) {
318                anyhow::bail!(
319                    "Refusing to install systemd unit: binary path '{exe_str}' contains shell/systemd metacharacters"
320                );
321            }
322
323            let unit = format!(
324                r#"[Unit]
325Description=Oxios Agent Operating System
326After=network.target
327
328[Service]
329Type=simple
330ExecStart={exe} --foreground
331Restart=on-failure
332RestartSec=5s
333
334[Install]
335WantedBy=multi-user.target
336"#,
337                exe = exe_str,
338            );
339
340            // Try to write — may fail without sudo
341            if let Err(e) = std::fs::write(&unit_path, &unit) {
342                anyhow::bail!(
343                    "Failed to write {} — run with sudo: {}",
344                    unit_path.display(),
345                    e
346                );
347            }
348
349            println!("✓ Installed systemd service");
350            println!("  {}", unit_path.display());
351            println!();
352            println!("  Reload:  sudo systemctl daemon-reload");
353            println!("  Start:   sudo systemctl start oxiosd");
354            println!("  Enable:  sudo systemctl enable oxiosd");
355        }
356
357        #[cfg(not(any(target_os = "macos", target_os = "linux")))]
358        {
359            anyhow::bail!("daemon install only supported on macOS and Linux");
360        }
361
362        Ok(())
363    }
364
365    /// Uninstall the system service.
366    pub fn uninstall_service(&self) -> Result<()> {
367        #[cfg(target_os = "macos")]
368        {
369            let plist_path = dirs::home_dir()
370                .map(|h| h.join("Library/LaunchAgents/com.a7garden.oxios.plist"))
371                .context("failed to locate plist")?;
372
373            if plist_path.exists() {
374                std::fs::remove_file(&plist_path)?;
375                println!("✓ Removed launchd service");
376            } else {
377                println!("  Service not installed");
378            }
379        }
380
381        #[cfg(target_os = "linux")]
382        {
383            let unit_path = PathBuf::from("/etc/systemd/system/oxiosd.service");
384            if unit_path.exists() {
385                if let Err(e) = std::fs::remove_file(&unit_path) {
386                    anyhow::bail!(
387                        "Failed to remove {} — run with sudo: {}",
388                        unit_path.display(),
389                        e
390                    );
391                }
392                println!("✓ Removed systemd service");
393            } else {
394                println!("  Service not installed");
395            }
396        }
397
398        #[cfg(not(any(target_os = "macos", target_os = "linux")))]
399        {
400            anyhow::bail!("daemon uninstall only supported on macOS and Linux");
401        }
402
403        Ok(())
404    }
405
406    // ── Internal helpers ──
407
408    fn read_pid(&self) -> Option<u32> {
409        let content = std::fs::read_to_string(&self.pid_file).ok()?;
410        content.trim().parse().ok()
411    }
412
413    fn write_pid(&self, pid: u32) -> Result<()> {
414        if let Some(parent) = self.pid_file.parent() {
415            std::fs::create_dir_all(parent)?;
416        }
417        std::fs::write(&self.pid_file, pid.to_string())?;
418        Ok(())
419    }
420
421    fn cleanup(&self) -> Result<()> {
422        if self.pid_file.exists() {
423            std::fs::remove_file(&self.pid_file)?;
424        }
425        Ok(())
426    }
427
428    fn is_alive(&self, pid: u32) -> bool {
429        #[cfg(unix)]
430        {
431            // Signal 0 = check if process exists
432            unsafe { libc::kill(pid as i32, 0) == 0 }
433        }
434        #[cfg(not(unix))]
435        {
436            // On non-Unix, always return false (conservative)
437            let _ = pid;
438            false
439        }
440    }
441}
442
/// Make `s` safe to embed as text inside an XML plist element.
///
/// Each of the five XML-predefined special characters is replaced by its
/// entity (`&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`,
/// `'` → `&apos;`); every other character is copied through unchanged.
/// Without this, a HOME or install path containing such a character would
/// yield malformed XML in the generated launchd plist.
fn escape_xml(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut escaped, ch| {
            let entity = match ch {
                '&' => "&amp;",
                '<' => "&lt;",
                '>' => "&gt;",
                '"' => "&quot;",
                '\'' => "&apos;",
                plain => {
                    escaped.push(plain);
                    return escaped;
                }
            };
            escaped.push_str(entity);
            escaped
        })
}
463
#[cfg(test)]
mod tests {
    use super::*;
    use std::net::TcpListener;

    /// Manager pointed at throwaway paths; the port probe never touches them.
    fn scratch_manager() -> DaemonManager {
        DaemonManager::new("/tmp/oxios-test.pid", "/tmp")
    }

    /// While a listener holds an ephemeral loopback port, the probe must
    /// report that port as taken.
    #[test]
    fn port_in_use_detects_a_live_listener() {
        let held = TcpListener::bind("127.0.0.1:0").unwrap();
        let bound_port = held.local_addr().unwrap().port();
        assert!(
            scratch_manager().port_in_use(bound_port),
            "port should be reported in use while a listener is bound"
        );
    }

    /// A port whose listener has just been dropped must be reported free.
    #[test]
    fn port_in_use_false_for_unused_port() {
        let manager = scratch_manager();
        // Bind to learn a port that is currently free, then release it
        // before probing.
        let held = TcpListener::bind("127.0.0.1:0").unwrap();
        let released_port = held.local_addr().unwrap().port();
        drop(held);
        assert!(
            !manager.port_in_use(released_port),
            "port should be reported free once the listener is dropped"
        );
    }
}