Skip to main content

steamroom_cli/daemon/
lifecycle.rs

//! Daemon lifecycle: PID file, launch (detached spawn of the same binary), stop.
//!
//! The launch sequence is: parse CLI, authenticate in the foreground
//! when auth flags were passed (so Steam Guard works), save the refresh
//! token via the existing `save_token` path, then spawn the same binary
//! with a `--daemon-resume` flag. On Unix the child calls `setsid` via
//! `pre_exec` to escape the controlling terminal; on Windows it is
//! created with `DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP`. The
//! resumed child rebuilds a fresh tokio runtime, re-authenticates using
//! the saved token (fast, no prompts), binds the socket, writes the PID
//! file, and enters the accept loop. The original parent polls the
//! socket until the resumed child answers, prints the info block, and
//! exits 0 (or exits 1 if the socket never comes up).
12
13use crate::daemon::ipc::socket_name_string;
14use crate::daemon::server::DaemonState;
15use crate::errors::CliError;
16use std::path::PathBuf;
17
18/// PID file and log file live in a stable per-user cache dir so they
19/// can be found from any shell. `$TMPDIR` on macOS is session-specific
20/// (`/var/folders/<hash>/T/`), which would cause `daemon info` and
21/// `daemon stop` to miss the daemon from a different shell session.
22fn cache_dir() -> PathBuf {
23    if let Ok(dir) = std::env::var("XDG_CACHE_HOME") {
24        return PathBuf::from(dir).join("steamroom");
25    }
26    // Windows has no HOME, so resolve %LOCALAPPDATA% via dirs_next before
27    // the HOME branch. Checked first on Windows because a stray HOME (set
28    // by Git Bash, MSYS, etc.) would otherwise route the PID file to a
29    // drive-relative `\.cache\steamroom`, which `daemon info` and the
30    // history reload would miss when the CLI is launched from a different
31    // drive.
32    #[cfg(windows)]
33    if let Some(dir) = dirs_next::cache_dir() {
34        return dir.join("steamroom");
35    }
36    if let Some(home) = std::env::var_os("HOME") {
37        return PathBuf::from(home).join(".cache").join("steamroom");
38    }
39    // Last resort: /tmp + uid so we at least match the socket location.
40    PathBuf::from("/tmp").join(format!("steamroom-{}", unix_uid()))
41}
42
43pub fn pid_file_path() -> PathBuf {
44    cache_dir().join("daemon.pid")
45}
46
47#[cfg(unix)]
48fn unix_uid() -> u32 {
49    unsafe { libc::getuid() }
50}
51#[cfg(not(unix))]
52fn unix_uid() -> u32 {
53    0
54}
55
56pub fn write_pid_file(pid: u32) -> Result<(), CliError> {
57    let path = pid_file_path();
58    if let Some(parent) = path.parent() {
59        std::fs::create_dir_all(parent).map_err(CliError::Io)?;
60    }
61    std::fs::write(&path, format!("{pid}\n")).map_err(CliError::Io)
62}
63
64pub fn read_pid_file() -> Result<u32, CliError> {
65    let data = std::fs::read_to_string(pid_file_path()).map_err(CliError::Io)?;
66    data.trim()
67        .parse::<u32>()
68        .map_err(|e| CliError::MalformedFrame(format!("pid file: {e}")))
69}
70
71pub fn remove_pid_file() {
72    let _ = std::fs::remove_file(pid_file_path());
73}
74
75/// Render the `daemon info` block to stdout. Does NOT contact the
76/// daemon. Useful for diagnosing a wedged daemon.
77pub fn render_daemon_info() {
78    let path = pid_file_path();
79    println!("pid file: {}", path.display());
80    match read_pid_file() {
81        Ok(pid) => println!("pid     : {pid}"),
82        Err(_) => println!("pid     : (none; no daemon recorded)"),
83    }
84    println!("socket  : {}", socket_name_string());
85    println!("stop    : steamroom daemon stop");
86}
87
88pub fn log_path() -> PathBuf {
89    cache_dir().join("daemon.log")
90}
91
92/// Path to the JSON file where the daemon persists its `recent` ring on
93/// shutdown and reloads on startup. Co-located with the PID and log.
94pub fn recent_history_path() -> PathBuf {
95    cache_dir().join("recent.json")
96}
97
98/// Load persisted recent-job history, if any, into the given state's
99/// ring. Silent on failure -- a missing or corrupt file just means we
100/// start with an empty history.
101pub async fn load_recent_history(state: &DaemonState) {
102    let path = recent_history_path();
103    let Ok(data) = std::fs::read_to_string(&path) else {
104        return;
105    };
106    let Ok(records) = serde_json::from_str::<Vec<crate::daemon::proto::JobRecord>>(&data) else {
107        tracing::warn!("recent history at {} is corrupt; ignoring", path.display());
108        return;
109    };
110    let mut recent = state.recent.lock().await;
111    for r in records {
112        recent.push(r);
113    }
114}
115
116/// Snapshot the recent ring to disk. Best-effort: any I/O failure is
117/// logged but does not block shutdown.
118pub async fn save_recent_history(state: &DaemonState) {
119    let path = recent_history_path();
120    if let Some(parent) = path.parent() {
121        let _ = std::fs::create_dir_all(parent);
122    }
123    let records: Vec<_> = state.recent.lock().await.iter().cloned().collect();
124    match serde_json::to_string(&records) {
125        Ok(json) => {
126            if let Err(e) = std::fs::write(&path, json) {
127                tracing::warn!("failed to write recent history to {}: {e}", path.display());
128            }
129        }
130        Err(e) => {
131            tracing::warn!("failed to serialize recent history: {e}");
132        }
133    }
134}
135
136/// Spawn the daemon child detached from this process and probe the
137/// socket until it binds.
138///
139/// On Unix we use `pre_exec` to call `setsid` so the child becomes its
140/// own session leader (no controlling terminal). On Windows we set
141/// `DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP` so the child has no
142/// console attached and survives the parent's exit. Both platforms
143/// redirect the child's stdin/stdout/stderr to the daemon log file
144/// before spawn so the user sees nothing from the daemon on this
145/// terminal.
146///
147/// The parent does NOT wait on the spawned child. It instead polls the
148/// daemon socket via `wait_for_socket` to confirm `serve_resumed`'s
149/// `bind_listener` actually succeeded; on timeout it prints a failure
150/// pointing at the log instead of an unfounded success banner.
151pub fn detach_and_exec_resume(username: &str, log_path: &std::path::Path) -> Result<(), CliError> {
152    use std::process::Command;
153    use std::process::Stdio;
154
155    if let Some(parent) = log_path.parent() {
156        std::fs::create_dir_all(parent).map_err(CliError::Io)?;
157    }
158    let log_out = std::fs::OpenOptions::new()
159        .create(true)
160        .append(true)
161        .open(log_path)
162        .map_err(CliError::Io)?;
163    let log_err = log_out.try_clone().map_err(CliError::Io)?;
164
165    let exe = std::env::current_exe().map_err(CliError::Io)?;
166    let mut cmd = Command::new(exe);
167    // `daemon start` is included only to satisfy clap's subcommand
168    // requirement. `main()` checks `cli.daemon_resume` first and routes
169    // to `serve_resumed` before the subcommand handler runs.
170    cmd.arg("--daemon-resume")
171        .arg(username)
172        .arg("daemon")
173        .arg("start");
174    cmd.stdin(Stdio::null());
175    cmd.stdout(Stdio::from(log_out));
176    cmd.stderr(Stdio::from(log_err));
177
178    #[cfg(unix)]
179    unsafe {
180        use std::os::unix::process::CommandExt;
181        // SAFETY: `setsid` is async-signal-safe and has no Rust
182        // invariants to uphold. `pre_exec` runs after fork() and before
183        // execve() so this happens in the child only.
184        cmd.pre_exec(|| {
185            if libc::setsid() == -1 {
186                return Err(std::io::Error::last_os_error());
187            }
188            Ok(())
189        });
190    }
191
192    #[cfg(windows)]
193    {
194        use std::os::windows::process::CommandExt;
195        // From winbase.h:
196        //   DETACHED_PROCESS          = 0x00000008
197        //   CREATE_NEW_PROCESS_GROUP  = 0x00000200
198        // Together: the child has no console attached, is in its own
199        // process group, and is unaffected by Ctrl-C delivered to the
200        // launching console.
201        const DETACHED_PROCESS: u32 = 0x0000_0008;
202        const CREATE_NEW_PROCESS_GROUP: u32 = 0x0000_0200;
203        cmd.creation_flags(DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP);
204    }
205
206    let child = cmd.spawn().map_err(CliError::Io)?;
207    let pid = child.id();
208    // Detach: don't reap the child. On Unix it gets reparented to init
209    // when this process exits; on Windows the new process group keeps
210    // it alive independently.
211    std::mem::forget(child);
212
213    if !wait_for_socket(std::time::Duration::from_secs(5)) {
214        eprintln!("steamroom daemon (pid {pid}) failed to bind socket within 5s");
215        eprintln!("check the log for the failure:");
216        eprintln!("  {}", log_path.display());
217        std::process::exit(1);
218    }
219    #[cfg(windows)]
220    let manual_kill = format!("taskkill /PID {pid} /F");
221    #[cfg(not(windows))]
222    let manual_kill = format!("kill {pid}");
223    println!("steamroom daemon started");
224    println!("  pid    : {pid}");
225    println!("  socket : {}", socket_name_string());
226    println!("  stop   : steamroom daemon stop    (or: {manual_kill})");
227    println!("  logs   : {}", log_path.display());
228    std::process::exit(0);
229}
230
231/// Poll the daemon socket until `Status` round-trips successfully or
232/// `timeout` elapses. Used by the parent of `detach_and_exec_resume`
233/// to verify the grandchild actually finished `serve_resumed`'s
234/// `bind_listener` step before reporting success.
235fn wait_for_socket(timeout: std::time::Duration) -> bool {
236    let rt = match tokio::runtime::Builder::new_current_thread()
237        .enable_all()
238        .build()
239    {
240        Ok(rt) => rt,
241        Err(_) => return false,
242    };
243    rt.block_on(async move {
244        let deadline = std::time::Instant::now() + timeout;
245        let mut delay = std::time::Duration::from_millis(50);
246        while std::time::Instant::now() < deadline {
247            if crate::daemon::ipc::probe_peer().await.is_ok() {
248                return true;
249            }
250            tokio::time::sleep(delay).await;
251            // Backoff: 50ms, 100ms, 200ms, then 200ms thereafter.
252            delay = (delay * 2).min(std::time::Duration::from_millis(200));
253        }
254        false
255    })
256}
257
258use crate::cli::Cli;
259use crate::commands::shared;
260use crate::daemon::ipc;
261use crate::daemon::server::handle_connection;
262use crate::daemon::server::worker_loop;
263use crate::daemon::tracing_layer::JobIdAttachmentInstaller;
264use crate::daemon::tracing_layer::JobScopedLogLayer;
265
266/// Phase 1 of `daemon start`: preflight + foreground authentication.
267///
268/// Eager (`--username` or any other auth flag was passed): authenticate
269/// in the foreground so Steam Guard prompts can be answered on the
270/// launching terminal. Returns `Some(username)`; the resumed child
271/// re-authenticates non-interactively using the saved refresh token.
272///
273/// Lazy (no auth flags): skip launch-time login entirely. The worker
274/// authenticates on the first job that needs a Steam connection,
275/// falling back to auto-detected saved tokens or anonymous. Returns
276/// `None`.
277pub async fn launch_daemon_authenticate(cli: &Cli) -> Result<Option<String>, CliError> {
278    // Fail fast if a daemon is already up.
279    if ipc::probe_peer().await.is_ok() {
280        return Err(CliError::DaemonAlreadyRunning);
281    }
282    // Clear a stale PID file pointing at a dead process.
283    if let Ok(stale_pid) = read_pid_file()
284        && !pid_is_alive(stale_pid)
285    {
286        remove_pid_file();
287    }
288
289    let auth = &cli.auth;
290    let has_explicit_auth = auth.username.is_some()
291        || auth.password.is_some()
292        || auth.qr
293        || auth.use_steam_token
294        || auth.device_name.is_some();
295
296    if !has_explicit_auth {
297        return Ok(None);
298    }
299
300    let client = shared::connect_and_login(auth, None).await?;
301    let username = auth
302        .username
303        .clone()
304        .or_else(|| shared::detect_steam_user().map(|(u, _)| u))
305        .ok_or(CliError::InteractiveAuthRequired)?;
306    drop(client);
307    Ok(Some(username))
308}
309
310/// `kill(pid, 0)` returns 0 if the process exists and we have signal
311/// permission, ESRCH if the pid is unknown, EPERM if it exists but is
312/// owned by another user. Both EPERM and 0 count as "alive" -- a pid
313/// we cannot signal is still occupying the pid namespace.
314#[cfg(unix)]
315fn pid_is_alive(pid: u32) -> bool {
316    // SAFETY: libc::kill is always safe to call with signal 0.
317    let rc = unsafe { libc::kill(pid as libc::pid_t, 0) };
318    if rc == 0 {
319        return true;
320    }
321    // SAFETY: __error()/errno_location depending on platform, but the
322    // io::Error::last_os_error() wrapper handles both.
323    std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM)
324}
325#[cfg(not(unix))]
326fn pid_is_alive(_pid: u32) -> bool {
327    true
328}
329
330/// The actual long-lived daemon process, post-exec. Builds a fresh
331/// tokio runtime above this; this just runs the accept loop.
332///
333/// `username` is empty for daemons launched without auth flags (lazy
334/// mode): the worker authenticates on first job. Non-empty means the
335/// parent did the interactive auth dance and saved a refresh token;
336/// we re-login non-interactively here.
337pub async fn serve_resumed(username: String, _cli: Cli) -> Result<(), CliError> {
338    let (initial_client, preferred_user) = if username.is_empty() {
339        // Lazy mode: no eager login. The worker will authenticate on
340        // the first job, using whatever saved token / autodetect /
341        // anonymous path applies at that time.
342        (None, None)
343    } else {
344        let token = shared::load_saved_token(&username).ok_or(CliError::InteractiveAuthRequired)?;
345        let client = steamroom_client::login::LoginBuilder::new()
346            .device_name("steamroom")
347            .with_refresh_token(&username, &token)
348            .login()
349            .await?;
350        (Some(client), Some(username.clone()))
351    };
352
353    let listener = ipc::bind_listener().await?;
354
355    let pid = std::process::id();
356    write_pid_file(pid)?;
357    let account_label = if username.is_empty() {
358        None
359    } else {
360        Some(username.clone())
361    };
362    let state = DaemonState::new(account_label, pid, unix_now_lifecycle());
363
364    // Seed the recent ring from disk so `daemon status` and `daemon
365    // attach` see jobs from prior daemon lifetimes.
366    load_recent_history(&state).await;
367
368    use tracing_subscriber::filter::LevelFilter;
369    use tracing_subscriber::layer::SubscriberExt;
370    use tracing_subscriber::util::SubscriberInitExt;
371    // Log first-party crates only. Without a filter this logged every
372    // h2/tcp/hyper TRACE line, which grew the daemon log to tens of GB and
373    // filled the disk over a long run. Silencing third-party crates caps
374    // that volume; honor RUST_LOG when the operator wants more.
375    let _ = tracing_subscriber::registry()
376        .with(crate::commands::shared::log_filter(LevelFilter::INFO))
377        .with(tracing_subscriber::fmt::layer())
378        .with(JobIdAttachmentInstaller)
379        .with(JobScopedLogLayer::new(state.events.clone()))
380        .try_init();
381
382    let worker_state = state.clone();
383    let worker_client = initial_client;
384    let worker_user = preferred_user;
385    let mut worker_task = Some(tokio::spawn(async move {
386        worker_loop(worker_state, worker_client, worker_user).await;
387    }));
388
389    // Collector that drains the broadcast channel into per-job replay
390    // rings. Exits cleanly when the broadcast sender drops.
391    crate::daemon::server::spawn_replay_collector(state.clone());
392
393    loop {
394        // worker_task is always Some here; the None arm is unreachable but
395        // avoids an unwrap().
396        let join_arm = match worker_task {
397            Some(ref mut h) => h,
398            None => break,
399        };
400        tokio::select! {
401            _ = state.shutdown.cancelled() => break,
402            res = join_arm => {
403                // Worker ended -- graceful drain, panic, or abort.
404                match res {
405                    Ok(()) => tracing::info!("worker_loop exited"),
406                    Err(ref e) if e.is_panic() => tracing::error!("worker_loop panicked: {e}"),
407                    Err(ref e) => tracing::warn!("worker_loop join error: {e}"),
408                }
409                worker_task = None;
410                state.shutdown.cancel();
411                break;
412            }
413            res = ipc::accept(&listener) => match res {
414                Ok(stream) => {
415                    let st = state.clone();
416                    tokio::spawn(handle_connection(st, stream));
417                }
418                Err(e) => {
419                    tracing::warn!("accept failed: {e}");
420                }
421            }
422        }
423    }
424
425    let _ = state.events.send(crate::daemon::proto::Event::Log {
426        job_id: None,
427        level: crate::daemon::proto::LogLevel::Info,
428        target: "daemon".into(),
429        message: "shutting down".into(),
430    });
431    // If the accept loop exited via shutdown.cancelled() (not via the worker
432    // arm), worker_task is still Some -- abort it.
433    if let Some(h) = worker_task {
434        h.abort();
435        let _ = h.await;
436    }
437    // Persist the recent ring so the next daemon launch shows job
438    // history across restarts.
439    save_recent_history(&state).await;
440    remove_pid_file();
441    Ok(())
442}
443
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns 0 when the system clock reports a time before the epoch.
fn unix_now_lifecycle() -> u64 {
    use std::time::SystemTime;
    use std::time::UNIX_EPOCH;

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}