Skip to main content

sqry_daemon/
entrypoint.rs

1//! Production `sqryd` binary entry point — Task 9 U10.
2//!
3//! # Overview
4//!
5//! This module owns the complete CLI definition and the ordered startup /
6//! shutdown lifecycle for the `sqryd` daemon binary.  Every code path that
7//! could surface an error to the operator maps to a POSIX `sysexits.h` exit
8//! code via [`DaemonError::exit_code`].
9//!
10//! # CLI surface (§C.2)
11//!
12//! ```text
13//! sqryd [OPTIONS] [COMMAND]
14//!
15//! Commands:
16//!   start                   Start the daemon (foreground by default; --detach for detached)
17//!   foreground              Run in the foreground (alias for `start`)
18//!   stop                    Send daemon/stop and wait for socket to become unreachable
19//!   status                  Print daemon status (--json for machine-readable output)
20//!   install-systemd-user    Emit a systemd user-service unit to stdout   [Linux]
21//!   install-systemd-system  Emit a systemd system-service unit to stdout  [Linux]
22//!   install-launchd         Emit a launchd user-agent plist to stdout    [macOS]
23//!   install-windows         Emit sc.exe + Task Scheduler XML to stdout   [Windows]
24//!   print-config            Print the effective daemon configuration as TOML
25//! ```
26//!
27//! Default (no command given): `start` with `detach=false`.
28//!
29//! # Startup ordering (§C.3.1)
30//!
31//! The foreground path follows these 17 ordered steps, each protected by
32//! RAII so every Drop runs on the success *and* error paths:
33//!
34//! 1.  Load [`DaemonConfig`] (honour `--config` / `SQRY_DAEMON_CONFIG`).
35//! 2.  Install tracing subscriber (gate `RollingSizeAppender` on
36//!     `NOTIFY_SOCKET` absence — §G.1 m4).
37//! 3.  Create `runtime_dir()` with mode `0700` on Unix.
38//! 4.  [`acquire_pidfile_lock`] → [`PidfileLock`].  `WouldBlock` →
39//!     [`DaemonError::AlreadyRunning`] → exit 75.
40//! 5.  *(Skip — detach path handled in `run_start_detach`.)*
41//! 6.  Build plugin manager.
42//! 7.  [`WorkspaceManager::new`] (spawns the retention reaper).
43//! 8.  [`RebuildDispatcher::new`].
44//! 9.  [`RealWorkspaceBuilder::new`].
45//! 10. [`QueryExecutor::new`].
46//! 11. [`CancellationToken`].
47//! 12. Install signal handlers → [`SignalGuard`].
48//! 13. Pre-load pinned workspaces (log + continue on failure).
49//! 14. [`IpcServer::bind`].
50//! 15. Signal ready (§C.3.1 step 15 authoritative matrix):
51//!     - `NOTIFY_SOCKET` set → `sd_notify(READY=1)` (authoritative for systemd).
52//!     - `--spawned-by-client` → close `SQRYD_READY_PIPE_FD` (authoritative
53//!       for the parent auto-spawn path).
54//!     - Always: touch `runtime_dir/sqryd.ready` (diagnostic, non-authoritative).
55//! 16. `server.run().await`.
56//! 17. RAII Drop order: `IpcServer` drops (stops accepting; socket file
57//!     remains on disk in configured-path mode), pidfile removed + lock
58//!     released by `PidfileLock::Drop`.
59//!
60//! # Detach path (§C.3.2)
61//!
62//! On `start --detach` (Unix only):
63//!
64//! A. Parent acquires `PidfileLock` (`WriteOwner`).
65//! B. Parent creates a self-pipe via `pipe2(O_CLOEXEC)`.
66//! C. Parent spawns `current_exe()` with `["start", "--detach",
67//!    "--spawned-by-client"]` and environment `SQRYD_READY_PIPE_FD`,
68//!    `SQRYD_LOCK_FD`, `SQRYD_PIDFILE_PATH`, `SQRYD_LOCKFILE_PATH`.  A
69//!    `pre_exec` hook clears `FD_CLOEXEC` on both FDs and calls `setsid()`.
70//! D. Parent closes its write end.
71//! E. Parent polls read end up to `auto_start_ready_timeout_secs`:
72//!    - EOF -> `hand_off_to_adopter()` + exit 0.
73//!    - Timeout -> `child.kill()` (SIGKILL to specific PID via
74//!      `std::process::Child::kill`) + exit 69.
75//! F. Grandchild reads `SQRYD_LOCK_FD`, wraps via
76//!    [`PidfileLock::adopt`], reads `SQRYD_READY_PIPE_FD`, runs steps
77//!    2-14, closes pipe at step 15, runs `server.run()`.
78//!
79//! On Windows, `--detach` is a no-op with a `WARN` log (see §C.5).
80//!
81//! # Design reference
82//!
83//! `docs/reviews/sqryd-daemon/2026-04-19/task-9-design_iter3_request.md`
84//! §C, §D, §E, §G, §I, §J.
85
86use std::{path::PathBuf, process::ExitCode, sync::Arc, time::Duration};
87
88use clap::{Parser, Subcommand};
89use sqry_core::query::executor::QueryExecutor;
90use tokio_util::sync::CancellationToken;
91use tracing::{error, info, warn};
92
93use crate::{
94    DaemonConfig, DaemonError, DaemonResult, IpcServer, RealWorkspaceBuilder, RebuildDispatcher,
95    WorkspaceManager,
96    lifecycle::{
97        log_rotate::install_tracing,
98        notify::{is_under_systemd, notify_ready},
99        pidfile::{PidfileLock, acquire_pidfile_lock},
100        signals::install_signal_handlers,
101        units::InstallOptions,
102    },
103};
104
105// ---------------------------------------------------------------------------
106// Environment variable names for FD-inheritance protocol (§C.3.2)
107// ---------------------------------------------------------------------------
108
/// Name of the environment variable through which the detach parent tells
/// the grandchild which FD is the write end of the readiness self-pipe.
/// Once the grandchild closes that FD, the parent's read end observes EOF,
/// which is the readiness signal.
const ENV_READY_PIPE_FD: &str = "SQRYD_READY_PIPE_FD";

/// Name of the environment variable holding the raw FD of the
/// already-locked `sqryd.lock` file.  The grandchild hands this FD to
/// [`PidfileLock::adopt`] rather than calling [`acquire_pidfile_lock`] a
/// second time.
#[cfg(unix)]
const ENV_LOCK_FD: &str = "SQRYD_LOCK_FD";

/// Name of the environment variable holding the canonical path of
/// `sqryd.lock`, i.e. the lockfile that backs the FD passed via
/// `SQRYD_LOCK_FD`, so the adopted [`PidfileLock`] knows which file it owns.
#[cfg(unix)]
const ENV_LOCKFILE_PATH: &str = "SQRYD_LOCKFILE_PATH";

/// Name of the environment variable holding the canonical path of
/// `sqryd.pid`, so the grandchild's adopted [`PidfileLock`] can unlink the
/// pidfile when it is dropped.
#[cfg(unix)]
const ENV_PIDFILE_PATH: &str = "SQRYD_PIDFILE_PATH";
131
132// ---------------------------------------------------------------------------
133// CLI definition (§C.2)
134// ---------------------------------------------------------------------------
135
136/// Production `sqryd` daemon binary.
137///
138/// Run `sqryd help` or `sqryd <subcommand> --help` for usage.
139#[derive(Debug, Parser)]
140#[command(
141    name = "sqryd",
142    about = "sqry daemon — persistent semantic code-search graph service",
143    version,
144    author
145)]
146pub struct SqrydCli {
147    /// Path to the daemon configuration file.
148    ///
149    /// Defaults to `~/.config/sqry/daemon.toml` (or the platform-specific
150    /// equivalent).  Can also be set via the `SQRY_DAEMON_CONFIG` environment
151    /// variable; the `--config` flag takes precedence over the env var.
152    #[arg(long, value_name = "FILE", env = "SQRY_DAEMON_CONFIG", global = true)]
153    pub config: Option<PathBuf>,
154
155    /// Log verbosity (e.g. `info`, `debug`, `sqry_daemon=trace`).
156    ///
157    /// Accepts the same syntax as `RUST_LOG` / `tracing_subscriber::EnvFilter`.
158    /// Overrides both `SQRY_DAEMON_LOG_LEVEL` and the `log_level` field in
159    /// the config file.
160    #[arg(long, value_name = "LEVEL", global = true)]
161    pub log_level: Option<String>,
162
163    /// Subcommand to run.  Defaults to `start` (foreground mode).
164    #[command(subcommand)]
165    pub command: Option<Command>,
166}
167
168/// Top-level subcommands.
169#[derive(Debug, Subcommand)]
170pub enum Command {
171    /// Start the daemon.
172    ///
173    /// With `--detach` (Unix only): the parent forks a grandchild that binds
174    /// the socket and then the parent exits 0.  The grandchild inherits the
175    /// pidfile lock FD so the lockfile stays authoritative across the
176    /// fork boundary.
177    ///
178    /// Without `--detach` (the default): the daemon runs in the foreground and
179    /// is suitable for direct terminal use, containers, and systemd
180    /// `Type=notify` supervision.
181    Start(Start),
182
183    /// Run in the foreground (alias for `start` without `--detach`).
184    Foreground,
185
186    /// Send `daemon/stop` to the running daemon and wait until its socket
187    /// becomes unreachable.
188    Stop {
189        /// Maximum seconds to wait for the daemon to exit gracefully.
190        #[arg(long, default_value_t = 15)]
191        timeout_secs: u64,
192    },
193
194    /// Query daemon status.
195    Status {
196        /// Emit machine-readable JSON instead of the default human-readable
197        /// summary.
198        #[arg(long)]
199        json: bool,
200    },
201
202    /// Emit a systemd **user** service unit to stdout.
203    ///
204    /// Pipe the output to
205    /// `~/.config/systemd/user/sqryd.service` and then run
206    /// `systemctl --user daemon-reload && systemctl --user enable --now sqryd`.
207    #[cfg(target_os = "linux")]
208    InstallSystemdUser,
209
210    /// Emit a systemd **system** service unit to stdout.
211    ///
212    /// Use `--user NAME` to specify the POSIX account that the templated
213    /// `sqryd@NAME.service` should run as.  Falls back to `$USER` if omitted.
214    ///
215    /// Install with
216    /// `systemctl enable --now sqryd@<username>` after placing the file in
217    /// `/etc/systemd/system/`.
218    #[cfg(target_os = "linux")]
219    InstallSystemdSystem {
220        /// POSIX user account name for the `%i` template instance specifier.
221        #[arg(long)]
222        user: Option<String>,
223    },
224
225    /// Emit a launchd user-agent plist to stdout.
226    ///
227    /// Install with:
228    /// ```bash
229    /// sqryd install-launchd > ~/Library/LaunchAgents/ai.verivus.sqry.sqryd.plist
230    /// launchctl load -w ~/Library/LaunchAgents/ai.verivus.sqry.sqryd.plist
231    /// ```
232    #[cfg(target_os = "macos")]
233    InstallLaunchd,
234
235    /// Emit `sc.exe create` + Task Scheduler XML to stdout.  [Windows only]
236    #[cfg(target_os = "windows")]
237    InstallWindows,
238
239    /// Print the effective daemon configuration as canonical TOML and exit.
240    ///
241    /// Useful to verify which config file was loaded and what the resolved
242    /// defaults look like before starting the daemon.
243    PrintConfig,
244}
245
246/// Arguments for `sqryd start`.
247#[derive(Debug, clap::Args, Default)]
248pub struct Start {
249    /// Fork a grandchild to run the daemon and exit the parent immediately.
250    ///
251    /// Unix only.  On Windows this flag is accepted but has no effect (a WARN
252    /// is logged and the daemon continues in the foreground).
253    #[arg(long)]
254    pub detach: bool,
255
256    /// *(Internal -- hidden from `--help`)*  Marks the grandchild spawned by
257    /// the detach path.  The grandchild adopts the inherited lock FD and
258    /// self-pipe FD instead of re-acquiring them.
259    #[arg(long, hide = true)]
260    pub spawned_by_client: bool,
261}
262
263// ---------------------------------------------------------------------------
264// Top-level dispatcher
265// ---------------------------------------------------------------------------
266
267/// Parse the CLI and dispatch to the appropriate `run_*` function.
268///
269/// This is the only public entry point called from `main`.  On error it
270/// returns the error value; `main` prints it with `{err:#}` and converts
271/// `err.exit_code()` to a [`std::process::ExitCode`].
272pub fn run() -> DaemonResult<()> {
273    let cli = SqrydCli::parse();
274    let rt = tokio::runtime::Builder::new_multi_thread()
275        .enable_all()
276        .build()
277        .map_err(DaemonError::Io)?;
278
279    let log_level_owned = cli.log_level.clone();
280    let log_level = log_level_owned.as_deref();
281    let config_path = cli.config.clone();
282
283    let command = cli.command.unwrap_or(Command::Start(Start::default()));
284
285    match command {
286        Command::Start(start) => rt.block_on(run_start(start, config_path, log_level)),
287        Command::Foreground => rt.block_on(run_start(Start::default(), config_path, log_level)),
288        Command::Stop { timeout_secs } => {
289            rt.block_on(run_stop(config_path, log_level, timeout_secs))
290        }
291        Command::Status { json } => rt.block_on(run_status(config_path, log_level, json)),
292        #[cfg(target_os = "linux")]
293        Command::InstallSystemdUser => run_install_systemd_user(config_path, log_level),
294        #[cfg(target_os = "linux")]
295        Command::InstallSystemdSystem { user } => {
296            run_install_systemd_system(config_path, log_level, user)
297        }
298        #[cfg(target_os = "macos")]
299        Command::InstallLaunchd => run_install_launchd(config_path, log_level),
300        #[cfg(target_os = "windows")]
301        Command::InstallWindows => run_install_windows(config_path, log_level),
302        Command::PrintConfig => run_print_config(config_path, log_level),
303    }
304}
305
306// ---------------------------------------------------------------------------
307// Start -- foreground and detach paths
308// ---------------------------------------------------------------------------
309
310/// Entry point for `sqryd start` and `sqryd foreground`.
311///
312/// Dispatches to:
313/// - [`run_start_spawned_by_client`] when `--spawned-by-client` is set
314///   (grandchild entry point in the double-fork detach path).
315/// - [`run_start_detach`] when `--detach` is set (parent entry point).
316/// - [`run_start_foreground`] otherwise.
317async fn run_start(
318    args: Start,
319    config_path: Option<PathBuf>,
320    log_level: Option<&str>,
321) -> DaemonResult<()> {
322    if args.spawned_by_client {
323        return run_start_spawned_by_client(config_path, log_level).await;
324    }
325    if args.detach {
326        return run_start_detach(config_path, log_level).await;
327    }
328    run_start_foreground(config_path, log_level).await
329}
330
331// ---------------------------------------------------------------------------
332// Foreground path (§C.3.1) -- 17 ordered steps
333// ---------------------------------------------------------------------------
334
335/// Foreground startup path (no detach, no self-pipe).
336///
337/// Acquires the pidfile lock, wires all components, and runs `server.run()`.
338async fn run_start_foreground(
339    config_path: Option<PathBuf>,
340    log_level: Option<&str>,
341) -> DaemonResult<()> {
342    // Step 1 -- Load config.
343    let cfg = load_config(config_path)?;
344    let cfg = Arc::new(cfg);
345
346    // Step 2 -- Install tracing.
347    let _tracing_guard = match install_tracing(&cfg, log_level) {
348        Ok(g) => g,
349        Err(e) => {
350            eprintln!("sqryd: warning: tracing setup: {e:#}");
351            None
352        }
353    };
354
355    info!(
356        version = env!("CARGO_PKG_VERSION"),
357        socket = %cfg.socket_path().display(),
358        pid_file = %cfg.pid_path().display(),
359        "sqryd starting"
360    );
361
362    // Step 3 -- Create runtime_dir with 0700 on Unix.
363    create_runtime_dir(&cfg)?;
364
365    // Step 4 -- Acquire pidfile lock.
366    let pidfile_lock = acquire_pidfile_lock(&cfg)?;
367    info!(pid_file = %cfg.pid_path().display(), "pidfile lock acquired");
368
369    // Steps 6-10: build all components.
370    let (manager, dispatcher, builder, executor) = build_daemon_components(Arc::clone(&cfg));
371
372    // Step 11 -- CancellationToken.
373    let shutdown = CancellationToken::new();
374
375    // Step 12 -- Install signal handlers.
376    let _signal_guard = install_signal_handlers(shutdown.clone())?;
377    info!("signal handlers installed");
378
379    // Step 13 -- Pre-load pinned workspaces (log + continue on failure).
380    preload_pinned_workspaces(&cfg, &manager, &builder).await;
381
382    // Step 14 -- Bind IPC server.
383    let server = IpcServer::bind(
384        Arc::clone(&cfg),
385        Arc::clone(&manager),
386        Arc::clone(&dispatcher),
387        Arc::clone(&builder),
388        Arc::clone(&executor),
389        shutdown.clone(),
390    )
391    .await?;
392    info!(socket = %server.socket_path().display(), "IPC server bound");
393
394    // Step 15 -- Signal ready.
395    signal_ready(&cfg, server.socket_path());
396
397    // Step 16 -- Run.
398    server.run().await?;
399
400    // Step 17 -- RAII Drop: _signal_guard, then pidfile_lock.
401    info!("sqryd shutdown complete");
402    drop(_signal_guard);
403    drop(pidfile_lock);
404
405    Ok(())
406}
407
408// ---------------------------------------------------------------------------
409// Detach path -- parent side (§C.3.2 A-E)
410// ---------------------------------------------------------------------------
411
412/// Parent entry point for `sqryd start --detach`.
413///
414/// On Unix: creates a self-pipe, acquires the pidfile lock, spawns the
415/// grandchild with the lock FD and pipe write-end inherited, then polls the
416/// read end until EOF (ready) or timeout.
417///
418/// On Windows: no-op with WARN log; runs foreground instead.
419async fn run_start_detach(
420    config_path: Option<PathBuf>,
421    log_level: Option<&str>,
422) -> DaemonResult<()> {
423    #[cfg(unix)]
424    {
425        run_start_detach_unix(config_path, log_level).await
426    }
427    #[cfg(not(unix))]
428    {
429        let cfg = load_config(config_path.clone())?;
430        setup_stderr_tracing(log_level, &cfg);
431        drop(cfg);
432        warn!(
433            "--detach is a no-op on Windows; running in the foreground instead. \
434             Use Task Scheduler or sc.exe to run sqryd as a background service."
435        );
436        run_start_foreground(config_path, log_level).await
437    }
438}
439
440#[cfg(unix)]
441async fn run_start_detach_unix(
442    config_path: Option<PathBuf>,
443    log_level: Option<&str>,
444) -> DaemonResult<()> {
445    // Step A -- Load config and set up basic tracing for the parent.
446    let cfg = load_config(config_path.clone())?;
447    let cfg = Arc::new(cfg);
448
449    let _tracing_guard = match install_tracing(&cfg, log_level) {
450        Ok(g) => g,
451        Err(e) => {
452            eprintln!("sqryd: warning: tracing setup (parent): {e:#}");
453            None
454        }
455    };
456
457    create_runtime_dir(&cfg)?;
458
459    // Step A -- Acquire pidfile lock (WriteOwner).
460    let mut pidfile_lock = acquire_pidfile_lock(&cfg)?;
461    info!(pid_file = %cfg.pid_path().display(), "parent: pidfile lock acquired (WriteOwner)");
462
463    // Step B -- Create self-pipe with O_CLOEXEC on both ends.
464    let (read_fd, write_fd) = create_pipe()?;
465
466    // Retrieve the raw FD from the pidfile lock.
467    let lock_fd = pidfile_lock.as_raw_fd();
468    let pidfile_path = cfg.pid_path();
469    let lockfile_path = cfg.lock_path();
470
471    // Step C -- Spawn the grandchild.
472    let exe = std::env::current_exe()
473        .map_err(|e| DaemonError::Io(std::io::Error::other(format!("current_exe: {e}"))))?;
474
475    let mut cmd = std::process::Command::new(&exe);
476    cmd.args(["start", "--detach", "--spawned-by-client"]);
477
478    if let Some(ref cp) = config_path {
479        cmd.arg("--config").arg(cp);
480    }
481    if let Some(ll) = log_level {
482        cmd.arg("--log-level").arg(ll);
483    }
484
485    cmd.env(ENV_READY_PIPE_FD, write_fd.to_string());
486    cmd.env(ENV_LOCK_FD, lock_fd.to_string());
487    cmd.env(ENV_PIDFILE_PATH, pidfile_path.as_os_str());
488    cmd.env(ENV_LOCKFILE_PATH, lockfile_path.as_os_str());
489
490    // Redirect stdio to /dev/null for the detached grandchild.
491    cmd.stdin(std::process::Stdio::null());
492    cmd.stdout(std::process::Stdio::null());
493    cmd.stderr(std::process::Stdio::null());
494
495    // pre_exec: setsid() + clear FD_CLOEXEC on write_fd + lock_fd.
496    // SAFETY: pre_exec runs after fork in the child; only async-signal-safe
497    // syscalls are used (setsid and fcntl are both async-signal-safe per POSIX).
498    let write_fd_copy = write_fd;
499    let lock_fd_copy = lock_fd;
500    unsafe {
501        use std::os::unix::process::CommandExt as _;
502        cmd.pre_exec(move || {
503            // New session: detach from controlling terminal.
504            if libc::setsid() < 0 {
505                return Err(std::io::Error::last_os_error());
506            }
507            // Clear FD_CLOEXEC so write_fd and lock_fd survive the exec.
508            for fd in [write_fd_copy, lock_fd_copy] {
509                let flags = libc::fcntl(fd, libc::F_GETFD);
510                if flags < 0 {
511                    return Err(std::io::Error::last_os_error());
512                }
513                let rc = libc::fcntl(fd, libc::F_SETFD, flags & !libc::FD_CLOEXEC);
514                if rc < 0 {
515                    return Err(std::io::Error::last_os_error());
516                }
517            }
518            Ok(())
519        });
520    }
521
522    let mut child = cmd.spawn().map_err(|e| {
523        DaemonError::Io(std::io::Error::other(format!(
524            "failed to spawn grandchild sqryd process: {e}"
525        )))
526    })?;
527
528    let grandchild_pid = child.id();
529    info!(pid = grandchild_pid, "spawned grandchild");
530
531    // Step D -- Parent closes its write end; only the grandchild holds it now.
532    drop_raw_fd(write_fd);
533
534    // Step E -- Poll the read end until EOF or timeout.
535    let timeout_secs = cfg.auto_start_ready_timeout_secs;
536    let deadline = std::time::Instant::now() + Duration::from_secs(timeout_secs);
537
538    let result = poll_ready_pipe(read_fd, deadline);
539
540    // Close read end regardless of outcome.
541    drop_raw_fd(read_fd);
542
543    match result {
544        Ok(()) => {
545            // EOF on the ready pipe — but EOF can also mean the grandchild
546            // exited early (before step 15) and the OS closed all its write-end
547            // FDs as part of process teardown.  Distinguish the two cases by
548            // calling `try_wait`: if the child has already exited it is a
549            // startup failure, not a readiness signal (M-1 fix).
550            match child.try_wait() {
551                Ok(Some(status)) => {
552                    // Grandchild already exited — pipe EOF was process death.
553                    warn!(
554                        pid = grandchild_pid,
555                        ?status,
556                        "grandchild exited before signalling ready (pipe EOF was process death)"
557                    );
558                    // Drop with WriteOwner: unlinks pidfile + unlocks.
559                    drop(pidfile_lock);
560                    return Err(DaemonError::AutoStartTimeout {
561                        timeout_secs,
562                        socket: cfg.socket_path(),
563                    });
564                }
565                Ok(None) => {
566                    // Child is still running — pipe EOF was step 15 close: genuine readiness.
567                }
568                Err(e) => {
569                    // try_wait failed (unusual). Log and assume alive.
570                    warn!(
571                        pid = grandchild_pid,
572                        err = %e,
573                        "try_wait after pipe EOF failed -- assuming grandchild is alive"
574                    );
575                }
576            }
577
578            // Grandchild signalled ready: hand off pidfile ownership so our
579            // Drop does NOT unlink the pidfile (Handoff state).
580            pidfile_lock.hand_off_to_adopter();
581            info!(
582                pid = grandchild_pid,
583                "grandchild signalled ready -- parent exiting 0 (Handoff)"
584            );
585            // Drop with Handoff: does NOT unlock (M-2 fix applied in pidfile.rs).
586            drop(pidfile_lock);
587            Ok(())
588        }
589        Err(()) => {
590            warn!(
591                pid = grandchild_pid,
592                timeout_secs, "grandchild did not signal ready within timeout -- killing"
593            );
594            // Child::kill sends SIGKILL to the exact PID (n5: targets the
595            // specific PID via libc::kill(pid, SIGKILL), bypassing the
596            // process group despite the grandchild's setsid).
597            if let Err(e) = child.kill() {
598                warn!(pid = grandchild_pid, err = %e, "kill(grandchild) failed");
599            }
600            let _ = child.wait();
601            // Drop with WriteOwner: unlinks pidfile + unlocks.
602            drop(pidfile_lock);
603            Err(DaemonError::AutoStartTimeout {
604                timeout_secs,
605                socket: cfg.socket_path(),
606            })
607        }
608    }
609}
610
611// ---------------------------------------------------------------------------
612// Grandchild path (§C.3.2 F)
613// ---------------------------------------------------------------------------
614
615/// Grandchild entry point -- `sqryd start --detach --spawned-by-client`.
616///
617/// Reads `SQRYD_LOCK_FD` and `SQRYD_READY_PIPE_FD` from the environment,
618/// adopts the inherited pidfile lock, and then runs the foreground startup
619/// steps 2-16 with the ready-pipe write-end FD so step 15 closes it to
620/// signal readiness to the parent.
621async fn run_start_spawned_by_client(
622    config_path: Option<PathBuf>,
623    log_level: Option<&str>,
624) -> DaemonResult<()> {
625    #[cfg(unix)]
626    {
627        run_start_spawned_by_client_unix(config_path, log_level).await
628    }
629    #[cfg(not(unix))]
630    {
631        // Should never happen: --spawned-by-client is only emitted by the
632        // parent on Unix.  Run foreground as a safe fallback.
633        warn!("--spawned-by-client reached on non-Unix -- running foreground");
634        run_start_foreground(config_path, log_level).await
635    }
636}
637
638#[cfg(unix)]
639async fn run_start_spawned_by_client_unix(
640    config_path: Option<PathBuf>,
641    log_level: Option<&str>,
642) -> DaemonResult<()> {
643    use std::os::unix::io::RawFd;
644
645    // Read inherited FDs from the environment.
646    let lock_fd: RawFd = read_env_fd(ENV_LOCK_FD).ok_or_else(|| {
647        DaemonError::Io(std::io::Error::other(
648            "grandchild: SQRYD_LOCK_FD not set (only valid via --detach parent spawn)",
649        ))
650    })?;
651    let ready_pipe_fd: RawFd = read_env_fd(ENV_READY_PIPE_FD).ok_or_else(|| {
652        DaemonError::Io(std::io::Error::other(
653            "grandchild: SQRYD_READY_PIPE_FD not set",
654        ))
655    })?;
656    let pidfile_path: PathBuf = std::env::var_os(ENV_PIDFILE_PATH)
657        .map(PathBuf::from)
658        .ok_or_else(|| {
659            DaemonError::Io(std::io::Error::other(
660                "grandchild: SQRYD_PIDFILE_PATH not set",
661            ))
662        })?;
663    let lockfile_path: PathBuf = std::env::var_os(ENV_LOCKFILE_PATH)
664        .map(PathBuf::from)
665        .ok_or_else(|| {
666            DaemonError::Io(std::io::Error::other(
667                "grandchild: SQRYD_LOCKFILE_PATH not set",
668            ))
669        })?;
670
671    // Step 1 -- Load config.
672    let cfg = load_config(config_path)?;
673    let cfg = Arc::new(cfg);
674
675    // Write grandchild's own PID to pidfile (atomic tmp+rename, overwriting
676    // the parent's PID written by acquire_pidfile_lock).
677    write_pid_file_grandchild(&cfg.pid_path())?;
678
679    // Adopt the inherited lock FD (§C.3.3 m6 canonical signature).
680    // SAFETY: lock_fd is a valid inherited FD carrying an active OFD-level
681    // flock acquired by the parent.  The grandchild is the sole user of this
682    // FD in this process.  adopt() takes ownership; caller must NOT close
683    // lock_fd separately.
684    let _pidfile_lock = unsafe { PidfileLock::adopt(lock_fd, pidfile_path, lockfile_path) };
685
686    info!(
687        version = env!("CARGO_PKG_VERSION"),
688        pid = std::process::id(),
689        "sqryd grandchild: pidfile lock adopted -- beginning foreground startup"
690    );
691
692    // Steps 2-16 -- run the foreground startup path passing the ready-pipe FD.
693    run_start_foreground_inner(cfg, log_level, ready_pipe_fd).await
694}
695
696/// Inner foreground path shared by the grandchild (`--spawned-by-client`)
697/// and future callers that already hold a pre-loaded config + lock.
698///
699/// Steps 2-16 per §C.3.1.  Step 1 (config load) and the pidfile lock
700/// are done by the caller.
701///
702/// `ready_pipe_write_fd`: the self-pipe write-end to close at step 15 to
703/// signal the parent.  Pass -1 (or any negative value) on non-unix to skip.
704async fn run_start_foreground_inner(
705    cfg: Arc<DaemonConfig>,
706    log_level: Option<&str>,
707    #[cfg(unix)] ready_pipe_write_fd: libc::c_int,
708    #[cfg(not(unix))] _ready_pipe_write_fd: i32,
709) -> DaemonResult<()> {
710    // Step 2 -- Install tracing.
711    let _tracing_guard = match install_tracing(&cfg, log_level) {
712        Ok(g) => g,
713        Err(e) => {
714            eprintln!("sqryd: warning: tracing setup: {e:#}");
715            None
716        }
717    };
718    info!(
719        version = env!("CARGO_PKG_VERSION"),
720        socket = %cfg.socket_path().display(),
721        "sqryd grandchild: tracing active"
722    );
723
724    // Step 3 -- runtime_dir (may already exist; idempotent).
725    create_runtime_dir(&cfg)?;
726
727    // Steps 6-10.
728    let (manager, dispatcher, builder, executor) = build_daemon_components(Arc::clone(&cfg));
729
730    // Step 11.
731    let shutdown = CancellationToken::new();
732
733    // Step 12.
734    let _signal_guard = install_signal_handlers(shutdown.clone())?;
735
736    // Step 13.
737    preload_pinned_workspaces(&cfg, &manager, &builder).await;
738
739    // Step 14.
740    let server = IpcServer::bind(
741        Arc::clone(&cfg),
742        Arc::clone(&manager),
743        Arc::clone(&dispatcher),
744        Arc::clone(&builder),
745        Arc::clone(&executor),
746        shutdown.clone(),
747    )
748    .await?;
749    info!(socket = %server.socket_path().display(), "IPC server bound");
750
751    // Step 15 -- Signal ready.
752    signal_ready(&cfg, server.socket_path());
753
754    // Close the self-pipe write end so the parent's read() returns EOF.
755    #[cfg(unix)]
756    if ready_pipe_write_fd >= 0 {
757        close_ready_pipe_fd(ready_pipe_write_fd);
758    }
759
760    // Step 16 -- Run.
761    server.run().await?;
762
763    info!("sqryd shutdown complete");
764    Ok(())
765}
766
767// ---------------------------------------------------------------------------
768// Stop command
769// ---------------------------------------------------------------------------
770
771/// Connect to the running daemon, send `daemon/stop`, and wait until the
772/// socket becomes unreachable (bounded by `timeout_secs`).
773async fn run_stop(
774    config_path: Option<PathBuf>,
775    log_level: Option<&str>,
776    timeout_secs: u64,
777) -> DaemonResult<()> {
778    let cfg = load_config(config_path)?;
779    setup_stderr_tracing(log_level, &cfg);
780    let socket_path = cfg.socket_path();
781
782    info!(socket = %socket_path.display(), "connecting to daemon to send daemon/stop");
783
784    // Send daemon/stop and read the response via the raw framed protocol.
785    let stop_req = serde_json::json!({
786        "jsonrpc": "2.0",
787        "id": 1,
788        "method": "daemon/stop",
789        "params": {}
790    });
791    send_management_request(&socket_path, &stop_req).await?;
792
793    info!(
794        timeout_secs,
795        "waiting for daemon socket to become unreachable"
796    );
797
798    let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs);
799    loop {
800        if !crate::lifecycle::detach::try_connect_path(&socket_path).await {
801            info!("daemon socket gone -- stop complete");
802            return Ok(());
803        }
804        if std::time::Instant::now() >= deadline {
805            return Err(DaemonError::AutoStartTimeout {
806                timeout_secs,
807                socket: socket_path,
808            });
809        }
810        tokio::time::sleep(Duration::from_millis(100)).await;
811    }
812}
813
814// ---------------------------------------------------------------------------
815// Status command (m3 fix: revalidate via socket connect, never pidfile-only)
816// ---------------------------------------------------------------------------
817
818/// Query daemon status, revalidating liveness via socket connect (m3 fix).
819async fn run_status(
820    config_path: Option<PathBuf>,
821    log_level: Option<&str>,
822    json_output: bool,
823) -> DaemonResult<()> {
824    let cfg = load_config(config_path)?;
825    setup_stderr_tracing(log_level, &cfg);
826    let socket_path = cfg.socket_path();
827
828    // m3 fix: revalidate via socket connect -- never pidfile-only.
829    if !crate::lifecycle::detach::try_connect_path(&socket_path).await {
830        eprintln!(
831            "sqryd: daemon is not running (socket not connectable: {})",
832            socket_path.display()
833        );
834        return Err(DaemonError::Io(std::io::Error::other(format!(
835            "daemon socket not reachable: {}",
836            socket_path.display()
837        ))));
838    }
839
840    let status_req = serde_json::json!({
841        "jsonrpc": "2.0",
842        "id": 1,
843        "method": "daemon/status",
844        "params": {}
845    });
846
847    let resp_buf = send_management_request(&socket_path, &status_req).await?;
848
849    if json_output {
850        println!("{}", String::from_utf8_lossy(&resp_buf));
851    } else {
852        // m-4 fix: return Err on malformed JSON so protocol breakage is visible
853        // to operators and scripts instead of silently returning Ok.
854        let v = serde_json::from_slice::<serde_json::Value>(&resp_buf).map_err(|e| {
855            DaemonError::Io(std::io::Error::other(format!(
856                "daemon/status response was not valid JSON: {e} (raw: {})",
857                String::from_utf8_lossy(&resp_buf)
858            )))
859        })?;
860        if let Some(result) = v.get("result") {
861            render_status_human(result);
862        } else if let Some(err_val) = v.get("error") {
863            eprintln!("sqryd status error: {err_val}");
864            return Err(DaemonError::Io(std::io::Error::other(format!(
865                "daemon/status error: {err_val}"
866            ))));
867        } else {
868            println!("{}", serde_json::to_string_pretty(&v).unwrap_or_default());
869        }
870    }
871
872    Ok(())
873}
874
875// ---------------------------------------------------------------------------
876// Raw management-request helper (handshake + one JSON-RPC round trip)
877// ---------------------------------------------------------------------------
878
879/// Send one JSON-RPC management request to the daemon and return the raw
880/// response frame bytes.
881///
882/// The full protocol is:
883/// 1. Connect to the daemon socket.
884/// 2. Send `DaemonHello` frame.
885/// 3. Read `DaemonHelloResponse` frame (verify `compatible`).
886/// 4. Send the JSON-RPC request frame.
887/// 5. Read the JSON-RPC response frame.
888async fn send_management_request(
889    socket_path: &std::path::Path,
890    req: &serde_json::Value,
891) -> DaemonResult<Vec<u8>> {
892    use crate::{DaemonHello, DaemonHelloResponse};
893    use sqry_daemon_protocol::framing::{read_frame, write_frame_json};
894
895    #[cfg(unix)]
896    let mut stream = {
897        tokio::net::UnixStream::connect(socket_path)
898            .await
899            .map_err(|e| {
900                DaemonError::Io(std::io::Error::other(format!(
901                    "connect to daemon socket {}: {e}",
902                    socket_path.display()
903                )))
904            })?
905    };
906
907    #[cfg(windows)]
908    let mut stream = {
909        use tokio::net::windows::named_pipe::ClientOptions;
910        let pipe_path = socket_path.to_string_lossy();
911        ClientOptions::new().open(pipe_path.as_ref()).map_err(|e| {
912            DaemonError::Io(std::io::Error::other(format!(
913                "connect to daemon pipe {}: {e}",
914                pipe_path
915            )))
916        })?
917    };
918
919    // Step 2: send DaemonHello.
920    let hello = DaemonHello {
921        client_version: env!("CARGO_PKG_VERSION").to_owned(),
922        protocol_version: 1,
923        // STEP_6 (workspace-aware-cross-repo): in-process status query
924        // never binds a logical workspace; the daemon `status` path
925        // does not depend on the grouping, so the anonymous hello is
926        // the correct shape.
927        logical_workspace: None,
928    };
929    write_frame_json(&mut stream, &hello)
930        .await
931        .map_err(|e| DaemonError::Io(std::io::Error::other(format!("send hello: {e}"))))?;
932
933    // Step 3: read DaemonHelloResponse.
934    let hello_resp_bytes = read_frame(&mut stream)
935        .await
936        .map_err(|e| DaemonError::Io(std::io::Error::other(format!("read hello response: {e}"))))?
937        .ok_or_else(|| {
938            DaemonError::Io(std::io::Error::other(
939                "daemon closed connection before hello response",
940            ))
941        })?;
942    let hello_resp: DaemonHelloResponse =
943        serde_json::from_slice(&hello_resp_bytes).map_err(|e| {
944            DaemonError::Io(std::io::Error::other(format!("parse hello response: {e}")))
945        })?;
946    if !hello_resp.compatible {
947        return Err(DaemonError::Io(std::io::Error::other(
948            "daemon is not compatible with this client version",
949        )));
950    }
951
952    // Step 4: send the JSON-RPC request.
953    write_frame_json(&mut stream, req)
954        .await
955        .map_err(|e| DaemonError::Io(std::io::Error::other(format!("send request: {e}"))))?;
956
957    // Step 5: read the JSON-RPC response.
958    let resp_bytes = read_frame(&mut stream)
959        .await
960        .map_err(|e| DaemonError::Io(std::io::Error::other(format!("read response: {e}"))))?
961        .ok_or_else(|| {
962            DaemonError::Io(std::io::Error::other(
963                "daemon closed connection before sending response",
964            ))
965        })?;
966
967    Ok(resp_bytes)
968}
969
970/// Render a human-readable daemon status summary from the `result` field of a
971/// `daemon/status` JSON-RPC response envelope.
972fn render_status_human(result: &serde_json::Value) {
973    let payload = result.get("data").unwrap_or(result);
974
975    let version = payload
976        .get("daemon_version")
977        .and_then(|v| v.as_str())
978        .unwrap_or("unknown");
979    let uptime = payload
980        .get("uptime_seconds")
981        .and_then(|v| v.as_u64())
982        .unwrap_or(0);
983
984    println!("sqryd  version: {version}");
985    println!("       uptime:  {uptime}s");
986
987    if let Some(memory) = payload.get("memory") {
988        let limit = memory
989            .get("limit_bytes")
990            .and_then(|v| v.as_u64())
991            .unwrap_or(0);
992        let current = memory
993            .get("current_bytes")
994            .and_then(|v| v.as_u64())
995            .unwrap_or(0);
996        println!(
997            "       memory:  {} MiB used / {} MiB limit",
998            current / (1024 * 1024),
999            limit / (1024 * 1024)
1000        );
1001    }
1002
1003    if let Some(workspaces) = payload.get("workspaces").and_then(|v| v.as_array()) {
1004        println!("       workspaces: {}", workspaces.len());
1005        for ws in workspaces {
1006            let path = ws.get("index_root").and_then(|v| v.as_str()).unwrap_or("?");
1007            let state = ws
1008                .get("state")
1009                .and_then(|v| v.as_str())
1010                .unwrap_or("Unknown");
1011            println!("         {state:10} {path}");
1012        }
1013    }
1014}
1015
1016// ---------------------------------------------------------------------------
1017// Install subcommands
1018// ---------------------------------------------------------------------------
1019
1020#[cfg(target_os = "linux")]
1021fn run_install_systemd_user(
1022    config_path: Option<PathBuf>,
1023    log_level: Option<&str>,
1024) -> DaemonResult<()> {
1025    let cfg = load_config(config_path)?;
1026    setup_stderr_tracing(log_level, &cfg);
1027    let opts = InstallOptions::default();
1028    let unit = crate::lifecycle::units::systemd::generate_user_unit(&cfg, &opts);
1029    println!("{unit}");
1030    Ok(())
1031}
1032
1033#[cfg(target_os = "linux")]
1034fn run_install_systemd_system(
1035    config_path: Option<PathBuf>,
1036    log_level: Option<&str>,
1037    user: Option<String>,
1038) -> DaemonResult<()> {
1039    let cfg = load_config(config_path)?;
1040    setup_stderr_tracing(log_level, &cfg);
1041    let opts = InstallOptions {
1042        user: user.clone(),
1043        ..Default::default()
1044    };
1045    // Validate the user account (n3 fix: exits 78 EX_CONFIG on failure).
1046    let resolved_user =
1047        crate::lifecycle::units::systemd::resolve_system_unit_user(&opts).map_err(|e| {
1048            DaemonError::Config {
1049                path: cfg
1050                    .pid_path()
1051                    .parent()
1052                    .unwrap_or_else(|| std::path::Path::new("."))
1053                    .to_owned(),
1054                source: anyhow::anyhow!("{e}"),
1055            }
1056        })?;
1057    let opts_with_user = InstallOptions {
1058        user: Some(resolved_user),
1059        ..Default::default()
1060    };
1061    let unit = crate::lifecycle::units::systemd::generate_system_unit(&cfg, &opts_with_user);
1062    println!("{unit}");
1063    Ok(())
1064}
1065
/// Print a launchd plist for the effective config to stdout.
///
/// Uses default [`InstallOptions`].  Only config loading can fail.
#[cfg(target_os = "macos")]
fn run_install_launchd(config_path: Option<PathBuf>, log_level: Option<&str>) -> DaemonResult<()> {
    use crate::lifecycle::units::launchd::generate_plist;

    let cfg = load_config(config_path)?;
    setup_stderr_tracing(log_level, &cfg);

    let install_opts = InstallOptions::default();
    println!("{}", generate_plist(&cfg, &install_opts));
    Ok(())
}
1075
/// Print the Windows install artefacts for the effective config to stdout:
/// first the `sc.exe create` command, then the Task Scheduler XML, each
/// under its own banner line.
///
/// Uses default [`InstallOptions`].  Only config loading can fail.
#[cfg(target_os = "windows")]
fn run_install_windows(config_path: Option<PathBuf>, log_level: Option<&str>) -> DaemonResult<()> {
    use crate::lifecycle::units::windows::{generate_sc_create, generate_task_xml};

    let cfg = load_config(config_path)?;
    setup_stderr_tracing(log_level, &cfg);

    let install_opts = InstallOptions::default();
    let sc_command = generate_sc_create(&cfg, &install_opts);
    let task_xml = generate_task_xml(&cfg, &install_opts);

    println!("-- sc.exe create command --");
    println!("{sc_command}");
    println!();
    println!("-- Task Scheduler XML --");
    println!("{task_xml}");
    Ok(())
}
1090
1091fn run_print_config(config_path: Option<PathBuf>, log_level: Option<&str>) -> DaemonResult<()> {
1092    let cfg = load_config(config_path)?;
1093    setup_stderr_tracing(log_level, &cfg);
1094    let toml_str = toml::to_string_pretty(&cfg).map_err(|e| DaemonError::Config {
1095        path: PathBuf::from("<serialise>"),
1096        source: anyhow::anyhow!("toml serialisation failed: {e}"),
1097    })?;
1098    println!("{toml_str}");
1099    Ok(())
1100}
1101
1102// ---------------------------------------------------------------------------
1103// main() bridge -- convert DaemonResult<()> to ExitCode
1104// ---------------------------------------------------------------------------
1105
1106/// Top-level `main` trampoline.  Calls [`run`], prints any error to stderr,
1107/// and converts the error to a POSIX exit code via [`DaemonError::exit_code`].
1108pub fn main_impl() -> ExitCode {
1109    match run() {
1110        Ok(()) => ExitCode::SUCCESS,
1111        Err(err) => {
1112            error!("sqryd: fatal: {err:#}");
1113            eprintln!("sqryd: {err:#}");
1114            ExitCode::from(err.exit_code())
1115        }
1116    }
1117}
1118
1119// ---------------------------------------------------------------------------
1120// Shared helpers
1121// ---------------------------------------------------------------------------
1122
1123/// Load the daemon config, honouring an optional explicit path override.
1124///
1125/// When `config_path` is `Some`, the config is loaded from that path and
1126/// then env overrides are applied.  This avoids mutating the process-global
1127/// environment after the multi-threaded Tokio runtime has been created
1128/// (`std::env::set_var` is UB in the presence of concurrent environment reads,
1129/// and Rust 1.81 made it explicitly unsafe).
1130///
1131/// When `config_path` is `None`, `DaemonConfig::load()` is used which
1132/// respects `SQRY_DAEMON_CONFIG` normally.
1133fn load_config(config_path: Option<PathBuf>) -> DaemonResult<DaemonConfig> {
1134    if let Some(ref p) = config_path {
1135        let mut cfg = DaemonConfig::load_from_path(p)?;
1136        cfg.apply_env_overrides()?;
1137        cfg.validate()?;
1138        Ok(cfg)
1139    } else {
1140        DaemonConfig::load()
1141    }
1142}
1143
1144/// Install a minimal stderr tracing subscriber.
1145///
1146/// Used by short-lived subcommands (`stop`, `status`, `install-*`,
1147/// `print-config`).  Silently ignores double-install errors (e.g. in tests).
1148fn setup_stderr_tracing(log_level: Option<&str>, cfg: &DaemonConfig) {
1149    let level = log_level
1150        .map(ToOwned::to_owned)
1151        .or_else(|| std::env::var("SQRY_DAEMON_LOG_LEVEL").ok())
1152        .unwrap_or_else(|| cfg.log_level.clone());
1153    let filter = tracing_subscriber::EnvFilter::try_new(&level)
1154        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
1155    let _ = tracing_subscriber::fmt()
1156        .compact()
1157        .with_env_filter(filter)
1158        .try_init();
1159}
1160
1161/// Create `runtime_dir()` with mode `0700` on Unix.
1162fn create_runtime_dir(cfg: &DaemonConfig) -> DaemonResult<()> {
1163    let dir = cfg.runtime_dir();
1164    std::fs::create_dir_all(&dir).map_err(DaemonError::Io)?;
1165
1166    #[cfg(unix)]
1167    {
1168        use std::os::unix::fs::PermissionsExt as _;
1169        let perms = std::fs::Permissions::from_mode(0o700);
1170        std::fs::set_permissions(&dir, perms).map_err(DaemonError::Io)?;
1171    }
1172
1173    Ok(())
1174}
1175
1176/// Build the plugin manager, workspace manager, rebuild dispatcher,
1177/// workspace builder, and query executor.
1178///
1179/// Steps 6-10 per §C.3.1.  Factored into a helper so both the
1180/// foreground path and the grandchild inner path share the same code.
1181fn build_daemon_components(
1182    cfg: Arc<DaemonConfig>,
1183) -> (
1184    Arc<WorkspaceManager>,
1185    Arc<RebuildDispatcher>,
1186    Arc<dyn crate::workspace::WorkspaceBuilder>,
1187    Arc<QueryExecutor>,
1188) {
1189    let plugins = Arc::new(sqry_plugin_registry::create_plugin_manager());
1190    let manager = WorkspaceManager::new(Arc::clone(&cfg));
1191    let dispatcher =
1192        RebuildDispatcher::new(Arc::clone(&manager), Arc::clone(&cfg), Arc::clone(&plugins));
1193    let builder: Arc<dyn crate::workspace::WorkspaceBuilder> =
1194        Arc::new(RealWorkspaceBuilder::new(Arc::clone(&plugins)));
1195    let executor = Arc::new(QueryExecutor::new());
1196    (manager, dispatcher, builder, executor)
1197}
1198
1199/// Emit the authoritative ready signals and touch the diagnostic sentinel.
1200///
1201/// - `NOTIFY_SOCKET` set -> `sd_notify(READY=1)` (authoritative for systemd).
1202/// - Always: touch `runtime_dir/sqryd.ready` (diagnostic, non-authoritative).
1203///
1204/// The self-pipe close (grandchild -> parent) is handled inline by the
1205/// calling function because it requires the raw FD.
1206fn signal_ready(cfg: &DaemonConfig, socket_path: &std::path::Path) {
1207    if is_under_systemd() {
1208        if let Err(e) = notify_ready() {
1209            warn!(err = %e, "sd_notify(READY=1) failed -- systemctl may time out");
1210        } else {
1211            info!("sd_notify: READY=1 sent");
1212        }
1213    }
1214
1215    let ready_path = cfg.runtime_dir().join("sqryd.ready");
1216    if let Err(e) = std::fs::write(&ready_path, b"") {
1217        warn!(
1218            path = %ready_path.display(),
1219            err = %e,
1220            "could not touch sqryd.ready sentinel (non-fatal)"
1221        );
1222    }
1223
1224    info!(
1225        socket = %socket_path.display(),
1226        "sqryd ready -- accepting connections"
1227    );
1228}
1229
1230/// Pre-load pinned workspaces declared in the daemon config.
1231///
1232/// Step 13 per §C.3.1: log + continue on failure.
1233async fn preload_pinned_workspaces(
1234    cfg: &DaemonConfig,
1235    manager: &Arc<WorkspaceManager>,
1236    builder: &Arc<dyn crate::workspace::WorkspaceBuilder>,
1237) {
1238    use sqry_core::project::ProjectRootMode;
1239
1240    for ws_cfg in &cfg.workspaces {
1241        if ws_cfg.exclude || !ws_cfg.pinned {
1242            continue;
1243        }
1244
1245        let root = ws_cfg.path.clone();
1246        let key =
1247            crate::workspace::WorkspaceKey::new(root.clone(), ProjectRootMode::WorkspaceFolder, 0);
1248
1249        info!(path = %root.display(), "pre-loading pinned workspace");
1250        let estimate =
1251            crate::workspace::working_set_estimate(crate::workspace::WorkingSetInputs::default());
1252
1253        if let Err(e) = manager.get_or_load(&key, builder.as_ref(), estimate) {
1254            warn!(
1255                path = %root.display(),
1256                err = %e,
1257                "pinned workspace pre-load failed (log + continue per §C.3.1 step 13)"
1258            );
1259        }
1260    }
1261}
1262
1263// ---------------------------------------------------------------------------
1264// Unix-specific low-level helpers
1265// ---------------------------------------------------------------------------
1266
1267/// Create a pipe with `O_CLOEXEC` on both ends. Returns `(read_fd, write_fd)`.
1268#[cfg(all(unix, target_os = "linux"))]
1269fn create_pipe() -> DaemonResult<(libc::c_int, libc::c_int)> {
1270    let mut fds = [0i32; 2];
1271    // SAFETY: pipe2 is a Linux syscall; fds is a valid 2-element array.
1272    let rc = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
1273    if rc < 0 {
1274        return Err(DaemonError::Io(std::io::Error::last_os_error()));
1275    }
1276    Ok((fds[0], fds[1]))
1277}
1278
/// Create a pipe and mark both ends close-on-exec.
///
/// Returns `(read_fd, write_fd)`.  This variant uses plain `pipe` followed
/// by [`set_close_on_exec`] on each end.  If setting the flag fails, both
/// descriptors are closed before the error is returned.
#[cfg(all(unix, not(target_os = "linux")))]
fn create_pipe() -> DaemonResult<(libc::c_int, libc::c_int)> {
    let mut pipe_fds = [0i32; 2];

    // SAFETY: pipe is available on POSIX Unix targets; `pipe_fds` is a valid
    // 2-element array for it to fill in.
    let status = unsafe { libc::pipe(pipe_fds.as_mut_ptr()) };
    if status < 0 {
        return Err(DaemonError::Io(std::io::Error::last_os_error()));
    }
    let [read_end, write_end] = pipe_fds;

    // The write end is only touched when the read end succeeded.
    let flagged = set_close_on_exec(read_end).and_then(|()| set_close_on_exec(write_end));
    match flagged {
        Ok(()) => Ok((read_end, write_end)),
        Err(err) => {
            drop_raw_fd(read_end);
            drop_raw_fd(write_end);
            Err(err)
        }
    }
}
1297
/// Add `FD_CLOEXEC` to the descriptor flags of `fd`, keeping the flags that
/// are already set.
///
/// # Errors
///
/// Returns [`DaemonError::Io`] built from the last OS error when either the
/// `F_GETFD` or the `F_SETFD` call fails.
#[cfg(all(unix, not(target_os = "linux")))]
fn set_close_on_exec(fd: libc::c_int) -> DaemonResult<()> {
    let last_os_failure = || DaemonError::Io(std::io::Error::last_os_error());

    // SAFETY: fcntl only observes and updates descriptor flags for a live fd.
    let current = unsafe { libc::fcntl(fd, libc::F_GETFD) };
    if current < 0 {
        return Err(last_os_failure());
    }

    // SAFETY: F_SETFD sets descriptor flags; FD_CLOEXEC preserves fork/exec hygiene.
    let outcome = unsafe { libc::fcntl(fd, libc::F_SETFD, current | libc::FD_CLOEXEC) };
    if outcome < 0 {
        Err(last_os_failure())
    } else {
        Ok(())
    }
}
1314
1315/// Close a raw FD; ignore errors (only call once per FD).
1316#[cfg(unix)]
1317fn drop_raw_fd(fd: libc::c_int) {
1318    // SAFETY: caller ensures exclusive ownership.
1319    unsafe { libc::close(fd) };
1320}
1321
1322/// Poll the read end of the self-pipe until EOF (ready) or deadline.
1323///
1324/// Returns `Ok(())` on EOF (grandchild closed its write end).
1325/// Returns `Err(())` on timeout.
1326#[cfg(unix)]
1327fn poll_ready_pipe(read_fd: libc::c_int, deadline: std::time::Instant) -> Result<(), ()> {
1328    use std::io::Read as _;
1329    use std::os::unix::io::FromRawFd as _;
1330
1331    // Wrap in a File for safe read().  We use forget() to prevent double-close
1332    // because the caller calls drop_raw_fd(read_fd) unconditionally.
1333    let mut file = unsafe { std::fs::File::from_raw_fd(read_fd) };
1334
1335    // Switch to non-blocking so we can poll without blocking the parent thread.
1336    // SAFETY: fcntl is async-signal-safe and we hold exclusive ownership of read_fd.
1337    unsafe {
1338        let flags = libc::fcntl(read_fd, libc::F_GETFL);
1339        if flags >= 0 {
1340            libc::fcntl(read_fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
1341        }
1342    }
1343
1344    loop {
1345        let mut buf = [0u8; 1];
1346        match file.read(&mut buf) {
1347            Ok(0) => {
1348                // EOF: grandchild closed its write end.
1349                std::mem::forget(file);
1350                return Ok(());
1351            }
1352            Ok(_) => {
1353                // Spurious byte -- ignore and poll again.
1354            }
1355            Err(e)
1356                if e.kind() == std::io::ErrorKind::WouldBlock
1357                    || e.raw_os_error() == Some(libc::EAGAIN) =>
1358            {
1359                // No data yet.
1360            }
1361            Err(_) => {
1362                std::mem::forget(file);
1363                return Err(());
1364            }
1365        }
1366
1367        if std::time::Instant::now() >= deadline {
1368            std::mem::forget(file);
1369            return Err(());
1370        }
1371
1372        std::thread::sleep(Duration::from_millis(50));
1373    }
1374}
1375
1376/// Close the self-pipe write end FD after the grandchild has signalled ready.
1377#[cfg(unix)]
1378fn close_ready_pipe_fd(fd: libc::c_int) {
1379    // SAFETY: caller ensures this is the write end and no other code will use it.
1380    unsafe { libc::close(fd) };
1381}
1382
1383/// Read a raw FD integer from an environment variable.
1384#[cfg(unix)]
1385fn read_env_fd(var: &str) -> Option<libc::c_int> {
1386    std::env::var(var).ok()?.parse::<libc::c_int>().ok()
1387}
1388
1389/// Write the current process's PID to `pidfile_path` atomically.
1390///
1391/// The grandchild calls this to overwrite the parent's PID (written by
1392/// `acquire_pidfile_lock`) with its own PID.
1393#[cfg(unix)]
1394fn write_pid_file_grandchild(pidfile_path: &std::path::Path) -> DaemonResult<()> {
1395    use std::io::Write as _;
1396    use std::os::unix::fs::OpenOptionsExt as _;
1397
1398    let pid = std::process::id();
1399    let pid_str = format!("{pid}\n");
1400
1401    let tmp_path = pidfile_path.with_extension("tmp.gc");
1402    {
1403        let mut f = std::fs::OpenOptions::new()
1404            .write(true)
1405            .create(true)
1406            .truncate(true)
1407            .mode(0o644)
1408            .open(&tmp_path)
1409            .map_err(DaemonError::Io)?;
1410        f.write_all(pid_str.as_bytes()).map_err(DaemonError::Io)?;
1411        f.sync_data().map_err(DaemonError::Io)?;
1412    }
1413    std::fs::rename(&tmp_path, pidfile_path).map_err(DaemonError::Io)?;
1414    Ok(())
1415}
1416
1417// ---------------------------------------------------------------------------
1418// Tests
1419// ---------------------------------------------------------------------------
1420
/// Unit tests for the CLI surface and the small helpers in this module.
///
/// None of these tests start a daemon or open a socket: they cover TOML
/// round-tripping, unit-file generation, clap parsing, human status
/// rendering, and `load_config`.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- print_config -------------------------------------------------------

    /// The effective config must serialise to TOML, and that TOML must
    /// round-trip back into a `DaemonConfig` with the same values for the
    /// fields compared below.
    ///
    /// This exercises the same `toml::to_string_pretty` call that
    /// `run_print_config` uses, without going through stdout.
    #[test]
    fn print_config_emits_canonical_toml() {
        let cfg = DaemonConfig::default();
        let toml_str = toml::to_string_pretty(&cfg)
            .expect("DaemonConfig must serialise to TOML without error");

        assert!(!toml_str.is_empty(), "serialised config must not be empty");

        // Round-trip.
        let reparsed: DaemonConfig =
            toml::from_str(&toml_str).expect("serialised TOML must be parseable back");

        // Spot-check a few fields rather than comparing the whole struct.
        assert_eq!(reparsed.memory_limit_mb, cfg.memory_limit_mb);
        assert_eq!(
            reparsed.auto_start_ready_timeout_secs,
            cfg.auto_start_ready_timeout_secs
        );
        assert_eq!(reparsed.log_keep_rotations, cfg.log_keep_rotations);
    }

    /// `run_print_config` with no config path must succeed (all defaults).
    #[test]
    fn run_print_config_succeeds_with_defaults() {
        // Clear any lingering SQRY_DAEMON_CONFIG.
        // NOTE(review): this mutates the process-global environment, and the
        // default test harness runs tests on multiple threads -- confirm no
        // other test reads the environment concurrently, or serialise these.
        unsafe { std::env::remove_var("SQRY_DAEMON_CONFIG") };

        let result = run_print_config(None, None);
        assert!(
            result.is_ok(),
            "run_print_config with no config file must succeed: {result:?}"
        );
    }

    // ---- install-systemd-user (Linux only) ----------------------------------

    /// On Linux, the generated systemd user unit must be non-empty, contain
    /// the `Type=notify` marker, and reference `sqryd`.
    ///
    /// The generator is called directly, so nothing is printed to stdout.
    #[cfg(target_os = "linux")]
    #[test]
    fn install_systemd_user_prints_to_stdout() {
        use crate::lifecycle::units::systemd::generate_user_unit;
        let cfg = DaemonConfig::default();
        let opts = InstallOptions::default();
        let unit = generate_user_unit(&cfg, &opts);
        assert!(!unit.is_empty(), "systemd user unit must be non-empty");
        assert!(
            unit.contains("Type=notify"),
            "systemd user unit must contain 'Type=notify'"
        );
        assert!(
            unit.contains("sqryd"),
            "systemd user unit must reference sqryd"
        );
    }

    // ---- clap CLI parsing ---------------------------------------------------

    /// `sqryd` with no args must parse without error (command is None or Start).
    #[test]
    fn default_command_is_start_foreground() {
        let cli = SqrydCli::try_parse_from(["sqryd"]).expect("parse must succeed");
        match cli.command {
            // No subcommand given.
            None => {}
            // An explicit foreground `Start` is accepted as equivalent.
            Some(Command::Start(Start {
                detach: false,
                spawned_by_client: false,
            })) => {}
            other => panic!("unexpected command: {other:?}"),
        }
    }

    /// `sqryd start` must parse to `Start { detach: false }`.
    #[test]
    fn start_without_detach_is_foreground() {
        let cli = SqrydCli::try_parse_from(["sqryd", "start"]).expect("parse");
        assert!(matches!(
            cli.command,
            Some(Command::Start(Start {
                detach: false,
                spawned_by_client: false,
            }))
        ));
    }

    /// `sqryd start --detach` must parse to `Start { detach: true }`.
    #[test]
    fn start_with_detach_flag_is_parsed() {
        let cli = SqrydCli::try_parse_from(["sqryd", "start", "--detach"]).expect("parse");
        assert!(matches!(
            cli.command,
            Some(Command::Start(Start {
                detach: true,
                spawned_by_client: false,
            }))
        ));
    }

    /// `sqryd start --detach --spawned-by-client` must parse correctly.
    ///
    /// Only parsing is checked here; that the flag is hidden from help
    /// output is not asserted.
    #[test]
    fn start_spawned_by_client_is_hidden_but_parseable() {
        let cli = SqrydCli::try_parse_from(["sqryd", "start", "--detach", "--spawned-by-client"])
            .expect("parse");
        assert!(matches!(
            cli.command,
            Some(Command::Start(Start {
                detach: true,
                spawned_by_client: true,
            }))
        ));
    }

    /// `sqryd foreground` must parse.
    #[test]
    fn foreground_subcommand_parses() {
        let cli = SqrydCli::try_parse_from(["sqryd", "foreground"]).expect("parse");
        assert!(matches!(cli.command, Some(Command::Foreground)));
    }

    /// `sqryd stop --timeout-secs 30` must parse with the custom timeout.
    #[test]
    fn stop_with_timeout_parses() {
        let cli =
            SqrydCli::try_parse_from(["sqryd", "stop", "--timeout-secs", "30"]).expect("parse");
        assert!(matches!(
            cli.command,
            Some(Command::Stop { timeout_secs: 30 })
        ));
    }

    /// `sqryd status --json` must parse with `json = true`.
    #[test]
    fn status_with_json_flag_parses() {
        let cli = SqrydCli::try_parse_from(["sqryd", "status", "--json"]).expect("parse");
        assert!(matches!(cli.command, Some(Command::Status { json: true })));
    }

    /// `sqryd print-config` must parse.
    #[test]
    fn print_config_subcommand_parses() {
        let cli = SqrydCli::try_parse_from(["sqryd", "print-config"]).expect("parse");
        assert!(matches!(cli.command, Some(Command::PrintConfig)));
    }

    /// `sqryd --config /tmp/test.toml print-config` must capture the global flag.
    ///
    /// The path is only parsed, never opened.
    #[test]
    fn global_config_flag_is_parsed() {
        let cli = SqrydCli::try_parse_from(["sqryd", "--config", "/tmp/test.toml", "print-config"])
            .expect("parse");
        assert_eq!(
            cli.config,
            Some(PathBuf::from("/tmp/test.toml")),
            "--config flag must be captured"
        );
        assert!(matches!(cli.command, Some(Command::PrintConfig)));
    }

    /// `sqryd status` (without --json) must parse with `json = false`.
    #[test]
    fn status_without_json_flag_defaults_to_false() {
        let cli = SqrydCli::try_parse_from(["sqryd", "status"]).expect("parse");
        assert!(matches!(cli.command, Some(Command::Status { json: false })));
    }

    /// `sqryd stop` with no `--timeout-secs` must default to 15.
    #[test]
    fn stop_defaults_to_15_second_timeout() {
        let cli = SqrydCli::try_parse_from(["sqryd", "stop"]).expect("parse");
        assert!(matches!(
            cli.command,
            Some(Command::Stop { timeout_secs: 15 })
        ));
    }

    // ---- status: human-readable rendering -----------------------------------
    //
    // The malformed-JSON branch of `run_status` (m-4 fix) needs a live socket
    // and is not covered here; only the renderer it feeds is.

    /// `render_status_human` must handle a minimal valid `daemon/status` result
    /// without panicking — this exercises the non-JSON output path.
    ///
    /// The payload has no `memory` or `workspaces` members, so both optional
    /// sections are skipped.
    #[test]
    fn render_status_human_handles_minimal_result() {
        let result = serde_json::json!({
            "daemon_version": "8.0.6",
            "uptime_seconds": 42,
        });
        // Must not panic.
        render_status_human(&result);
    }

    // ---- load_config --------------------------------------------------------

    /// `load_config` with an explicit path must not mutate the process
    /// environment (M-3 fix regression test).
    ///
    /// Creates a minimal TOML file on disk and calls `load_config(Some(path))`.
    /// Checks that `SQRY_DAEMON_CONFIG` is NOT set in the environment after the
    /// call (implying `set_var` was not called).
    #[test]
    fn load_config_with_explicit_path_does_not_set_env_var() {
        use std::io::Write as _;
        use tempfile::NamedTempFile;

        // Clear any pre-existing env var so the assertion below is meaningful.
        // NOTE(review): same process-global environment mutation as in
        // `run_print_config_succeeds_with_defaults`; see the note there.
        unsafe { std::env::remove_var("SQRY_DAEMON_CONFIG") };

        // Write a minimal valid daemon TOML to a temp file.
        // The file holds only a comment, so every setting takes its default.
        let mut tmp = NamedTempFile::new().expect("NamedTempFile");
        writeln!(tmp, "# minimal sqryd test config").expect("write");
        let path = tmp.path().to_path_buf();

        let result = load_config(Some(path.clone()));

        assert!(
            result.is_ok(),
            "load_config with valid TOML path must succeed: {result:?}"
        );
        assert!(
            std::env::var_os("SQRY_DAEMON_CONFIG").is_none(),
            "load_config must NOT mutate SQRY_DAEMON_CONFIG (M-3 fix)"
        );
    }
}