Skip to main content

running_process/broker/
doctor.rs

1//! Read-only `broker doctor` environment diagnostics (#354, v1.x-5 from #228).
2//!
3//! `doctor` inspects the local broker environment and reports a flat list of
4//! PASS / WARN / FAIL checks. It never mutates anything: no files are
5//! created, deleted, or rewritten, no processes are spawned, and no daemon
6//! state is changed. Stale artifacts are *reported*, never repaired.
7//!
8//! Check areas:
9//!
10//! 1. Environment-variable sanity for every `RUNNING_PROCESS_*` knob,
11//!    including a loud WARN when test-only seams are set.
12//! 2. Broker endpoint reachability: derive the default per-user shared
13//!    broker endpoint, attempt a connection, and — when something is
14//!    listening — run a deadline-bounded Hello probe to report the daemon
15//!    version, negotiated protocol, and decoded server capability bits.
16//! 3. Service-definition directory health plus per-file `.servicedef`
17//!    parse/validation results (same loader the broker Hello path uses).
18//! 4. Unix socket hygiene: count stale `*.sock` files in the broker runtime
19//!    directory (connect-refused ⇒ stale). Reported, not deleted.
20//! 5. Platform path budget: derived pipe/socket path length against the
21//!    platform limit (`MAX_PATH` on Windows, `sun_path` on Unix).
22//! 6. systemd KillMode (#391): WARN when systemd-managed with
23//!    `KillMode=control-group` (or undeterminable). The only check that may
24//!    spawn a process — a read-only `systemctl show -p KillMode` query on
25//!    Linux, and only when `INVOCATION_ID` indicates systemd management.
26//! 7. Version/build info: crate version, negotiated protocol version, and
27//!    framing version.
28//!
29//! Every check is fault-isolated: a panic inside one check is converted to
30//! a FAIL for that check and the remaining checks still run.
31
32use std::path::{Path, PathBuf};
33use std::sync::mpsc;
34use std::thread;
35use std::time::Duration;
36
37use prost::Message;
38
39use crate::broker::capabilities::CAP_HANDLE_PASSING;
40use crate::broker::client::{
41    broker_disabled_by_env, connect_local_socket, RUNNING_PROCESS_DISABLE_ENV,
42    RUNNING_PROCESS_FAKE_BACKEND_ENV,
43};
44use crate::broker::lifecycle::names::{
45    backend_pipe, shared_broker_pipe, PipePathError, LINUX_SUN_PATH_MAX, MACOS_SUN_PATH_MAX,
46    WINDOWS_MAX_PATH,
47};
48use crate::broker::lifecycle::sid::user_sid_hash;
49use crate::broker::protocol::{
50    hello_reply::Result as HelloReplyResult, read_frame, write_frame, ErrorCode, Frame, FrameKind,
51    Hello, HelloReply, PayloadEncoding, CONTROL_PAYLOAD_PROTOCOL, PROTOCOL_VERSION,
52};
53use crate::broker::server::service_def_loader::{
54    service_definition_dir, ServiceDefinitionLoader, SERVICE_DEF_DIR_ENV, SERVICE_DEF_EXTENSION,
55};
56use crate::broker::{secure_dir, FRAMING_VERSION_V1};
57
/// Daemon-IPC tracking kill switch read by the Python layer and daemon
/// client. Defined here as a literal because the canonical constant lives
/// behind the `daemon` feature and doctor must stay `client`-only.
///
/// Doctor only reports whether the variable is set; it never interprets
/// the value.
const NO_TRACKING_ENV: &str = "RUNNING_PROCESS_NO_TRACKING";
/// CWD-scoped daemon override used for test isolation. Reported as an
/// informational env check only.
const DAEMON_SCOPE_ENV: &str = "RUNNING_PROCESS_DAEMON_SCOPE";
/// Admin-socket override consumed by the `running-process-broker-v1` CLI.
/// Reported as an informational env check only.
const BROKER_SOCKET_ENV: &str = "RUNNING_PROCESS_BROKER_V1_SOCKET";
66
/// Wall-clock bound on the Hello probe so doctor can never hang on a
/// listener that accepts but never replies.
///
/// The bound covers the Hello write/read exchange performed on the helper
/// thread; the preceding connect call is made before the probe starts and
/// is not covered by it.
pub const DOCTOR_PROBE_TIMEOUT: Duration = Duration::from_secs(2);

/// Service name the reachability probe sends in `Hello.service_name`.
///
/// A real broker refuses it with `ERROR_SERVICE_UNKNOWN` (unless an
/// operator actually installed a service with this name), which still
/// proves framing, protocol negotiation, and the daemon protocol range.
/// Both a negotiated and a refused reply are reported as PASS by the
/// endpoint check.
pub const DOCTOR_PROBE_SERVICE: &str = "rp-doctor-probe";
77
/// Severity of a single doctor check result.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DoctorStatus {
    /// The checked item is healthy.
    Pass,
    /// Suspicious or non-default, but not fatal. Never affects the exit code.
    Warn,
    /// Broken. A single FAIL makes the doctor exit code 1.
    Fail,
}

impl DoctorStatus {
    /// Returns the stable uppercase label shared by the text and JSON output.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Pass => "PASS",
            Self::Warn => "WARN",
            Self::Fail => "FAIL",
        }
    }
}
99
100/// One named check with its outcome and a one-line detail.
101#[derive(Clone, Debug)]
102pub struct DoctorCheck {
103    /// Stable check identifier, e.g. `env:RUNNING_PROCESS_DISABLE`.
104    pub name: String,
105    /// PASS / WARN / FAIL.
106    pub status: DoctorStatus,
107    /// Human-readable one-line detail.
108    pub detail: String,
109}
110
111impl DoctorCheck {
112    fn pass(name: impl Into<String>, detail: impl Into<String>) -> Self {
113        Self {
114            name: name.into(),
115            status: DoctorStatus::Pass,
116            detail: detail.into(),
117        }
118    }
119
120    fn warn(name: impl Into<String>, detail: impl Into<String>) -> Self {
121        Self {
122            name: name.into(),
123            status: DoctorStatus::Warn,
124            detail: detail.into(),
125        }
126    }
127
128    fn fail(name: impl Into<String>, detail: impl Into<String>) -> Self {
129        Self {
130            name: name.into(),
131            status: DoctorStatus::Fail,
132            detail: detail.into(),
133        }
134    }
135}
136
137/// Aggregated doctor run.
138#[derive(Clone, Debug, Default)]
139pub struct DoctorReport {
140    /// Every check that ran, in execution order.
141    pub checks: Vec<DoctorCheck>,
142}
143
144impl DoctorReport {
145    /// True when at least one check FAILed. WARNs do not count.
146    pub fn has_failures(&self) -> bool {
147        self.checks
148            .iter()
149            .any(|check| check.status == DoctorStatus::Fail)
150    }
151
152    /// Process exit code contract: 0 when no FAIL, 1 otherwise.
153    pub fn exit_code(&self) -> i32 {
154        if self.has_failures() {
155            1
156        } else {
157            0
158        }
159    }
160
161    /// Stable machine-readable JSON document.
162    ///
163    /// Shape (frozen — only additive changes allowed):
164    /// `{"schema_version":1,"command":"doctor","exit_code":0,
165    ///   "checks":[{"check":"...","status":"PASS","detail":"..."}]}`
166    pub fn to_json(&self) -> String {
167        let checks: Vec<serde_json::Value> = self
168            .checks
169            .iter()
170            .map(|check| {
171                serde_json::json!({
172                    "check": check.name,
173                    "status": check.status.as_str(),
174                    "detail": check.detail,
175                })
176            })
177            .collect();
178        serde_json::json!({
179            "schema_version": 1,
180            "command": "doctor",
181            "exit_code": self.exit_code(),
182            "checks": checks,
183        })
184        .to_string()
185    }
186
187    /// Human-readable table plus a one-line summary.
188    pub fn render_text(&self) -> String {
189        let name_width = self
190            .checks
191            .iter()
192            .map(|check| check.name.len())
193            .max()
194            .unwrap_or(0);
195        let mut out = String::new();
196        for check in &self.checks {
197            out.push_str(&format!(
198                "{:<4}  {:<name_width$}  {}\n",
199                check.status.as_str(),
200                check.name,
201                check.detail,
202            ));
203        }
204        let pass = self.count(DoctorStatus::Pass);
205        let warn = self.count(DoctorStatus::Warn);
206        let fail = self.count(DoctorStatus::Fail);
207        out.push_str(&format!(
208            "doctor: {} checks — {pass} pass, {warn} warn, {fail} fail\n",
209            self.checks.len()
210        ));
211        out
212    }
213
214    fn count(&self, status: DoctorStatus) -> usize {
215        self.checks
216            .iter()
217            .filter(|check| check.status == status)
218            .count()
219    }
220}
221
/// Inputs for [`run_doctor`]. `Default` derives everything from the
/// environment exactly like a broker client would.
///
/// Both fields are optional overrides; `None` means "use the derived
/// default" for that check area.
#[derive(Clone, Debug, Default)]
pub struct DoctorOptions {
    /// Probe this broker endpoint instead of the derived per-user shared
    /// broker endpoint.
    pub broker_endpoint: Option<String>,
    /// Inspect this service-definition directory instead of the resolved
    /// platform default (`paths.service_definition_dir` contract).
    pub service_definition_dir: Option<PathBuf>,
}
233
234/// Run every doctor check and aggregate the report.
235///
236/// Read-only by contract. Each check area is individually fault-isolated:
237/// a panic in one area becomes a FAIL entry and the rest still run.
238pub fn run_doctor(options: &DoctorOptions) -> DoctorReport {
239    let mut checks = Vec::new();
240    checks.extend(isolated("env", env_var_checks));
241    {
242        let endpoint = options.broker_endpoint.clone();
243        checks.extend(isolated("broker:endpoint", move || {
244            vec![broker_endpoint_check(endpoint.as_deref())]
245        }));
246    }
247    {
248        let dir = options
249            .service_definition_dir
250            .clone()
251            .unwrap_or_else(service_definition_dir);
252        checks.extend(isolated("servicedef:dir", move || {
253            service_definition_checks(&dir)
254        }));
255    }
256    checks.extend(isolated("sockets:runtime-dir", || {
257        vec![socket_hygiene_check()]
258    }));
259    checks.extend(isolated("filesystem:inodes", || {
260        vec![inode_pressure_check()]
261    }));
262    checks.extend(isolated("platform:path-budget", || {
263        vec![platform_path_budget_check()]
264    }));
265    checks.extend(isolated("platform:systemd-killmode", || {
266        vec![systemd_killmode_check()]
267    }));
268    checks.extend(isolated("build:version", || vec![version_check()]));
269    DoctorReport { checks }
270}
271
272/// Run one check area, converting a panic into a FAIL for that area.
273fn isolated<F>(area: &str, body: F) -> Vec<DoctorCheck>
274where
275    F: FnOnce() -> Vec<DoctorCheck> + std::panic::UnwindSafe,
276{
277    match std::panic::catch_unwind(body) {
278        Ok(checks) => checks,
279        Err(payload) => vec![DoctorCheck::fail(
280            area,
281            format!("check panicked: {}", panic_message(payload.as_ref())),
282        )],
283    }
284}
285
/// Extracts a printable message from a panic payload.
///
/// `&str` and `String` payloads are returned as-is; any other payload type
/// yields a fixed placeholder.
fn panic_message(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|text| (*text).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "non-string panic payload".to_string())
}
295
296// ---------------------------------------------------------------------------
297// 1. Environment-variable sanity
298// ---------------------------------------------------------------------------
299
300/// Check every running-process environment knob.
301pub fn env_var_checks() -> Vec<DoctorCheck> {
302    let mut checks = vec![disable_env_check(), fake_backend_env_check()];
303    checks.push(informational_env_check(
304        NO_TRACKING_ENV,
305        "unset (daemon IPC tracking enabled)",
306        "daemon IPC tracking disabled",
307    ));
308    checks.push(informational_env_check(
309        DAEMON_SCOPE_ENV,
310        "unset (user-scoped daemon)",
311        "CWD-scoped daemon (test-isolation mode)",
312    ));
313    checks.push(informational_env_check(
314        SERVICE_DEF_DIR_ENV,
315        "unset (platform default service-definition dir)",
316        "service-definition dir overridden",
317    ));
318    checks.push(informational_env_check(
319        BROKER_SOCKET_ENV,
320        "unset (derived broker endpoint)",
321        "broker admin endpoint overridden",
322    ));
323    checks
324}
325
326fn disable_env_check() -> DoctorCheck {
327    let name = format!("env:{RUNNING_PROCESS_DISABLE_ENV}");
328    match broker_disabled_by_env() {
329        Ok(false) => DoctorCheck::pass(name, "unset (broker enabled)"),
330        Ok(true) => DoctorCheck::warn(
331            name,
332            "set to \"1\" — broker disabled; consumers use their direct fallback path",
333        ),
334        Err(err) => DoctorCheck::fail(name, err.to_string()),
335    }
336}
337
338fn fake_backend_env_check() -> DoctorCheck {
339    let name = format!("env:{RUNNING_PROCESS_FAKE_BACKEND_ENV}");
340    match std::env::var_os(RUNNING_PROCESS_FAKE_BACKEND_ENV) {
341        None => DoctorCheck::pass(name, "unset"),
342        Some(value) if value.is_empty() => {
343            DoctorCheck::warn(name, "set but empty (seam ignored) — unset it")
344        }
345        Some(value) => DoctorCheck::warn(
346            name,
347            format!(
348                "TEST-ONLY seam is set to {:?} — broker negotiation is bypassed; \
349                 never set this in production",
350                value.to_string_lossy()
351            ),
352        ),
353    }
354}
355
356fn informational_env_check(env: &str, unset_detail: &str, set_description: &str) -> DoctorCheck {
357    let name = format!("env:{env}");
358    match std::env::var_os(env) {
359        None => DoctorCheck::pass(name, unset_detail),
360        Some(value) => DoctorCheck::warn(
361            name,
362            format!("set to {:?} — {set_description}", value.to_string_lossy()),
363        ),
364    }
365}
366
367// ---------------------------------------------------------------------------
368// 2. Broker endpoint reachability
369// ---------------------------------------------------------------------------
370
371/// Derive the default per-user shared-broker endpoint string.
372pub fn default_broker_endpoint() -> Result<String, String> {
373    let sid_hash = user_sid_hash().map_err(|err| err.to_string())?;
374    let pipe = shared_broker_pipe(&sid_hash).map_err(|err| err.to_string())?;
375    pipe_path_string(pipe.windows, pipe.unix)
376        .ok_or_else(|| "pipe path has no platform form".to_string())
377}
378
/// Picks the platform form of a pipe path: the Windows name when present,
/// otherwise the Unix path rendered lossily as a string.
fn pipe_path_string(windows: Option<String>, unix: Option<PathBuf>) -> Option<String> {
    match windows {
        Some(pipe_name) => Some(pipe_name),
        None => {
            let socket_path = unix?;
            Some(socket_path.to_string_lossy().into_owned())
        }
    }
}
382
383/// Probe `endpoint` (or the derived default) for a listening broker.
384pub fn broker_endpoint_check(endpoint: Option<&str>) -> DoctorCheck {
385    const NAME: &str = "broker:endpoint";
386    let endpoint = match endpoint {
387        Some(endpoint) => endpoint.to_string(),
388        None => match default_broker_endpoint() {
389            Ok(endpoint) => endpoint,
390            Err(err) => {
391                return DoctorCheck::fail(NAME, format!("cannot derive broker endpoint: {err}"));
392            }
393        },
394    };
395    let stream = match connect_local_socket(&endpoint) {
396        Ok(stream) => stream,
397        Err(err) => {
398            return DoctorCheck::warn(NAME, format!("no broker listening at {endpoint} ({err})"));
399        }
400    };
401    match hello_probe(stream) {
402        Ok(ProbeOutcome::Negotiated {
403            daemon_version,
404            negotiated_protocol,
405            server_capabilities,
406        }) => DoctorCheck::pass(
407            NAME,
408            format!(
409                "broker listening at {endpoint}: daemon {daemon_version}, \
410                 protocol v{negotiated_protocol}, capabilities {}",
411                describe_capabilities(server_capabilities)
412            ),
413        ),
414        Ok(ProbeOutcome::Refused {
415            code,
416            daemon_min_protocol,
417            daemon_max_protocol,
418        }) => DoctorCheck::pass(
419            NAME,
420            format!(
421                "broker listening at {endpoint}: protocol v{daemon_min_protocol}..v{daemon_max_protocol}, \
422                 probe refused with {code:?} (expected for the doctor probe service)"
423            ),
424        ),
425        Err(err) => DoctorCheck::warn(
426            NAME,
427            format!("{endpoint} accepted a connection but the v1 Hello probe failed: {err}"),
428        ),
429    }
430}
431
/// Classified result of a Hello probe that received a decodable reply.
enum ProbeOutcome {
    /// The reply carried a `Negotiated` result.
    Negotiated {
        /// Daemon version string copied from the reply.
        daemon_version: String,
        /// Protocol version copied from the reply.
        negotiated_protocol: u32,
        /// Raw server capability bitmap copied from the reply.
        server_capabilities: u64,
    },
    /// The reply carried a `Refused` result.
    Refused {
        /// Refusal code; unrecognized wire values map to `Unspecified`.
        code: ErrorCode,
        /// Lowest protocol version the daemon reported.
        daemon_min_protocol: u32,
        /// Highest protocol version the daemon reported.
        daemon_max_protocol: u32,
    },
}
444
445/// Send one Hello for [`DOCTOR_PROBE_SERVICE`] and classify the reply.
446///
447/// Runs on a helper thread bounded by [`DOCTOR_PROBE_TIMEOUT`] because
448/// local-socket streams have no portable read timeout; on timeout the
449/// abandoned stream stays with the helper thread.
450fn hello_probe(stream: interprocess::local_socket::Stream) -> Result<ProbeOutcome, String> {
451    let (result_tx, result_rx) = mpsc::channel();
452    thread::spawn(move || {
453        let mut stream = stream;
454        let _ = result_tx.send(hello_probe_blocking(&mut stream));
455    });
456    match result_rx.recv_timeout(DOCTOR_PROBE_TIMEOUT) {
457        Ok(outcome) => outcome,
458        Err(_) => Err(format!(
459            "no HelloReply within {DOCTOR_PROBE_TIMEOUT:?} (listener is not a v1 broker?)"
460        )),
461    }
462}
463
464fn hello_probe_blocking(
465    stream: &mut interprocess::local_socket::Stream,
466) -> Result<ProbeOutcome, String> {
467    let hello = Hello {
468        client_min_protocol: PROTOCOL_VERSION,
469        client_max_protocol: PROTOCOL_VERSION,
470        service_name: DOCTOR_PROBE_SERVICE.into(),
471        wanted_version: "0.0.0".into(),
472        client_version: env!("CARGO_PKG_VERSION").into(),
473        client_capabilities: 0,
474        auth_token: Vec::new(),
475        request_id: "doctor-probe".into(),
476        connection_id: 0,
477        peer_pid: std::process::id(),
478        client_lib_name: "running-process-doctor".into(),
479        client_lib_version: env!("CARGO_PKG_VERSION").into(),
480        peer_attestation_nonce: Vec::new(),
481        capability_token: Vec::new(),
482        client_keepalive_secs: 0,
483    };
484    let request_frame = Frame {
485        envelope_version: PROTOCOL_VERSION,
486        kind: FrameKind::Request as i32,
487        payload_protocol: CONTROL_PAYLOAD_PROTOCOL,
488        payload: hello.encode_to_vec(),
489        request_id: 1,
490        payload_encoding: PayloadEncoding::None as i32,
491        deadline_unix_ms: 0,
492        traceparent: String::new(),
493        tracestate: String::new(),
494    };
495    write_frame(stream, &request_frame.encode_to_vec())
496        .map_err(|err| format!("failed to write Hello frame: {err}"))?;
497    let response_bytes =
498        read_frame(stream).map_err(|err| format!("failed to read HelloReply frame: {err}"))?;
499    let response_frame = Frame::decode(response_bytes.as_slice())
500        .map_err(|err| format!("failed to decode response Frame: {err}"))?;
501    let reply = HelloReply::decode(response_frame.payload.as_slice())
502        .map_err(|err| format!("failed to decode HelloReply: {err}"))?;
503    match reply.result.ok_or("HelloReply carried no result")? {
504        HelloReplyResult::Negotiated(negotiated) => Ok(ProbeOutcome::Negotiated {
505            daemon_version: negotiated.daemon_version,
506            negotiated_protocol: negotiated.negotiated_protocol,
507            server_capabilities: negotiated.server_capabilities,
508        }),
509        HelloReplyResult::Refused(refused) => Ok(ProbeOutcome::Refused {
510            code: ErrorCode::try_from(refused.code).unwrap_or(ErrorCode::Unspecified),
511            daemon_min_protocol: refused.daemon_min_protocol,
512            daemon_max_protocol: refused.daemon_max_protocol,
513        }),
514    }
515}
516
517/// Render a capability bitmap with the registry's known bit names.
518pub fn describe_capabilities(bits: u64) -> String {
519    if bits == 0 {
520        return "none".to_string();
521    }
522    let mut names = Vec::new();
523    if bits & CAP_HANDLE_PASSING != 0 {
524        names.push("HANDLE_PASSING".to_string());
525    }
526    let unknown = bits & !CAP_HANDLE_PASSING;
527    if unknown != 0 {
528        names.push(format!("unknown:0x{unknown:x}"));
529    }
530    format!("0x{bits:x} [{}]", names.join(", "))
531}
532
533// ---------------------------------------------------------------------------
534// 3. Service-definition directory + per-file validation
535// ---------------------------------------------------------------------------
536
537/// Check the service-definition directory and every `.servicedef` in it.
538pub fn service_definition_checks(dir: &Path) -> Vec<DoctorCheck> {
539    const DIR_CHECK: &str = "servicedef:dir";
540    let display = dir.display();
541    if !dir.exists() {
542        return vec![DoctorCheck::warn(
543            DIR_CHECK,
544            format!("{display} does not exist (no service definitions installed)"),
545        )];
546    }
547    if !dir.is_dir() {
548        return vec![DoctorCheck::fail(
549            DIR_CHECK,
550            format!("{display} exists but is not a directory"),
551        )];
552    }
553    match secure_dir::private_dir_permissions_are_private(dir) {
554        Ok(true) => {}
555        Ok(false) => {
556            return vec![DoctorCheck::fail(
557                DIR_CHECK,
558                format!(
559                    "{display} has insecure permissions (must be current-user-only); \
560                     the broker refuses to load definitions from it"
561                ),
562            )];
563        }
564        Err(err) => {
565            return vec![DoctorCheck::fail(
566                DIR_CHECK,
567                format!("cannot inspect permissions of {display}: {err}"),
568            )];
569        }
570    }
571
572    let entries = match std::fs::read_dir(dir) {
573        Ok(entries) => entries,
574        Err(err) => {
575            return vec![DoctorCheck::fail(
576                DIR_CHECK,
577                format!("cannot enumerate {display}: {err}"),
578            )];
579        }
580    };
581    let mut files: Vec<PathBuf> = entries
582        .filter_map(|entry| entry.ok().map(|entry| entry.path()))
583        .filter(|path| {
584            path.extension()
585                .map(|ext| ext == SERVICE_DEF_EXTENSION)
586                .unwrap_or(false)
587        })
588        .collect();
589    files.sort();
590
591    let mut checks = vec![DoctorCheck::pass(
592        DIR_CHECK,
593        format!(
594            "{display} (private, {} .{SERVICE_DEF_EXTENSION} file{})",
595            files.len(),
596            if files.len() == 1 { "" } else { "s" }
597        ),
598    )];
599
600    let loader = ServiceDefinitionLoader::new(dir);
601    for path in files {
602        let file_name = path
603            .file_name()
604            .map(|name| name.to_string_lossy().into_owned())
605            .unwrap_or_else(|| path.display().to_string());
606        let check_name = format!("servicedef:{file_name}");
607        let Some(service_name) = path
608            .file_stem()
609            .map(|stem| stem.to_string_lossy().into_owned())
610        else {
611            checks.push(DoctorCheck::fail(check_name, "file has no stem"));
612            continue;
613        };
614        match loader.load(&service_name) {
615            Ok(definition) => checks.push(DoctorCheck::pass(
616                check_name,
617                format!(
618                    "valid (service {:?}, binary {:?})",
619                    definition.service_name, definition.binary_path
620                ),
621            )),
622            Err(err) => checks.push(DoctorCheck::fail(check_name, err.to_string())),
623        }
624    }
625    checks
626}
627
628// ---------------------------------------------------------------------------
629// 4. Socket/pipe hygiene
630// ---------------------------------------------------------------------------
631
632/// Report stale `*.sock` files in the broker runtime directory (Unix).
633///
634/// A socket file counts as stale when connecting to it is refused —
635/// nothing is listening behind it. Doctor only reports the count; it
636/// never deletes anything.
637pub fn socket_hygiene_check() -> DoctorCheck {
638    const NAME: &str = "sockets:runtime-dir";
639    #[cfg(windows)]
640    {
641        DoctorCheck::pass(
642            NAME,
643            "not applicable on Windows (named pipes leave no filesystem residue)",
644        )
645    }
646    #[cfg(unix)]
647    {
648        let Some(dir) = broker_runtime_dir() else {
649            return DoctorCheck::fail(NAME, "cannot derive broker runtime directory");
650        };
651        let display = dir.display();
652        if !dir.exists() {
653            return DoctorCheck::pass(NAME, format!("{display} does not exist (no sockets)"));
654        }
655        let entries = match std::fs::read_dir(&dir) {
656            Ok(entries) => entries,
657            Err(err) => {
658                return DoctorCheck::fail(NAME, format!("cannot enumerate {display}: {err}"));
659            }
660        };
661        let mut total = 0usize;
662        let mut stale = 0usize;
663        for path in entries.filter_map(|entry| entry.ok().map(|entry| entry.path())) {
664            if path.extension().map(|ext| ext == "sock").unwrap_or(false) {
665                total += 1;
666                let endpoint = path.to_string_lossy();
667                if let Err(err) = connect_local_socket(&endpoint) {
668                    if err.kind() == std::io::ErrorKind::ConnectionRefused {
669                        stale += 1;
670                    }
671                }
672            }
673        }
674        if stale == 0 {
675            DoctorCheck::pass(
676                NAME,
677                format!("{display}: {total} socket file(s), none stale"),
678            )
679        } else {
680            DoctorCheck::warn(
681                NAME,
682                format!(
683                    "{display}: {stale} of {total} socket file(s) are stale \
684                     (connect refused) — not deleted, doctor is read-only"
685                ),
686            )
687        }
688    }
689}
690
691/// Parent directory of the per-user broker sockets, derived from the
692/// shared-broker pipe path (Unix only).
693#[cfg(unix)]
694fn broker_runtime_dir() -> Option<PathBuf> {
695    let sid_hash = user_sid_hash().ok()?;
696    let pipe = shared_broker_pipe(&sid_hash).ok()?;
697    pipe.unix
698        .and_then(|path| path.parent().map(Path::to_path_buf))
699}
700
701// ---------------------------------------------------------------------------
702// 4b. Inode pressure on the daemon data dir filesystem (#390)
703// ---------------------------------------------------------------------------
704
/// Free-inode fraction below which the check WARNs (0.05 = 5% of all
/// inodes still free).
const INODE_WARN_FREE_RATIO: f64 = 0.05;
/// Free-inode fraction below which the check FAILs (0.01 = 1% of all
/// inodes still free). Evaluated before the WARN threshold.
const INODE_FAIL_FREE_RATIO: f64 = 0.01;
709
710/// Report inode usage/headroom of the daemon data dir filesystem.
711///
712/// Windows filesystems have no fixed inode table, so the check PASSes as
713/// not-applicable there instead of faking numbers. Same for Unix
714/// filesystems reporting a zero inode total (e.g. btrfs).
715pub fn inode_pressure_check() -> DoctorCheck {
716    const NAME: &str = "filesystem:inodes";
717    let dir = crate::client::paths::data_dir();
718    let display = dir.display();
719    match crate::broker::fs_health::daemon_data_dir_inode_usage() {
720        Ok(Some(usage)) => {
721            let free_ratio = if usage.total == 0 {
722                1.0
723            } else {
724                usage.free as f64 / usage.total as f64
725            };
726            let detail = format!(
727                "{display}: {} of {} inodes free ({:.1}% used)",
728                usage.free,
729                usage.total,
730                usage.used_ratio() * 100.0
731            );
732            if free_ratio < INODE_FAIL_FREE_RATIO {
733                DoctorCheck::fail(
734                    NAME,
735                    format!("{detail} — inode exhaustion imminent; daemon writes will ENOSPC"),
736                )
737            } else if free_ratio < INODE_WARN_FREE_RATIO {
738                DoctorCheck::warn(NAME, format!("{detail} — low inode headroom"))
739            } else {
740                DoctorCheck::pass(NAME, detail)
741            }
742        }
743        Ok(None) => DoctorCheck::pass(
744            NAME,
745            if cfg!(windows) {
746                format!("not applicable on Windows ({display} has no fixed inode table)")
747            } else {
748                format!("{display}: filesystem reports no fixed inode table (not applicable)")
749            },
750        ),
751        Err(err) => DoctorCheck::warn(
752            NAME,
753            format!("cannot probe inode usage of {display}: {err}"),
754        ),
755    }
756}
757
758// ---------------------------------------------------------------------------
759// 5. Platform path budget
760// ---------------------------------------------------------------------------
761
/// Slack (bytes) below the platform path limit that triggers a WARN: the
/// path-budget check warns once the derived path length plus this slack
/// reaches or exceeds the limit.
const PATH_BUDGET_WARN_SLACK: usize = 8;
764
765/// Check the longest standard pipe name (a backend pipe) against the
766/// platform path-length limit. This bit the test suite repeatedly on
767/// macOS, where `sun_path` is only 104 bytes.
768pub fn platform_path_budget_check() -> DoctorCheck {
769    const NAME: &str = "platform:path-budget";
770    let (limit, limit_label) = if cfg!(windows) {
771        (WINDOWS_MAX_PATH, "Windows MAX_PATH")
772    } else if cfg!(target_os = "macos") {
773        (MACOS_SUN_PATH_MAX, "macOS sun_path")
774    } else {
775        (LINUX_SUN_PATH_MAX, "Linux/Unix sun_path")
776    };
777    let sid_hash = match user_sid_hash() {
778        Ok(hash) => hash,
779        Err(err) => {
780            return DoctorCheck::fail(NAME, format!("cannot compute user SID hash: {err}"));
781        }
782    };
783    // Backend pipes carry the longest standard suffix (32 hex chars), so
784    // they exhaust the budget first.
785    match backend_pipe(&sid_hash, &[0u8; 16]) {
786        Ok(pipe) => {
787            let Some(path) = pipe_path_string(pipe.windows, pipe.unix) else {
788                return DoctorCheck::fail(NAME, "derived pipe path has no platform form");
789            };
790            let len = path.len();
791            let detail =
792                format!("backend pipe path is {len} of {limit} bytes ({limit_label}): {path}");
793            if len + PATH_BUDGET_WARN_SLACK >= limit {
794                DoctorCheck::warn(
795                    NAME,
796                    format!("{detail} — within {PATH_BUDGET_WARN_SLACK} bytes of the limit"),
797                )
798            } else {
799                DoctorCheck::pass(NAME, detail)
800            }
801        }
802        Err(err @ PipePathError::PathTooLong { .. }) => DoctorCheck::fail(
803            NAME,
804            format!("derived backend pipe path exceeds the {limit_label} budget: {err}"),
805        ),
806        Err(err) => DoctorCheck::fail(NAME, format!("cannot derive backend pipe path: {err}")),
807    }
808}
809
810// ---------------------------------------------------------------------------
811// 6. systemd KillMode (#391)
812// ---------------------------------------------------------------------------
813
814/// WARN when running under a systemd unit whose KillMode would reap
815/// spawned children on unit stop (`control-group`, systemd's default), or
816/// when systemd-managed but the KillMode cannot be determined.
817pub fn systemd_killmode_check() -> DoctorCheck {
818    const NAME: &str = "platform:systemd-killmode";
819    use crate::systemd_killmode::{probe, KillModeAssessment};
820    let assessment = probe();
821    match assessment.warning() {
822        Some(warning) => DoctorCheck::warn(NAME, warning),
823        None => match assessment {
824            KillModeAssessment::Safe { unit, kill_mode } => DoctorCheck::pass(
825                NAME,
826                format!("systemd unit {unit} uses KillMode={kill_mode} (children survive stop)"),
827            ),
828            _ => DoctorCheck::pass(
829                NAME,
830                if cfg!(target_os = "linux") {
831                    "not running under systemd"
832                } else {
833                    "not applicable on this platform"
834                },
835            ),
836        },
837    }
838}
839
840// ---------------------------------------------------------------------------
841// 7. Version/build info
842// ---------------------------------------------------------------------------
843
844/// Report crate, protocol, and framing versions. Always PASS.
845pub fn version_check() -> DoctorCheck {
846    DoctorCheck::pass(
847        "build:version",
848        format!(
849            "running-process {} — protocol v{PROTOCOL_VERSION}, framing v{FRAMING_VERSION_V1}",
850            env!("CARGO_PKG_VERSION")
851        ),
852    )
853}
854
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a report holding one placeholder check per requested status.
    fn report_with(statuses: &[DoctorStatus]) -> DoctorReport {
        let checks = statuses
            .iter()
            .map(|&status| DoctorCheck {
                name: "test:check".into(),
                status,
                detail: "detail".into(),
            })
            .collect();
        DoctorReport { checks }
    }

    #[test]
    fn exit_code_is_zero_without_failures() {
        // WARN must not influence the exit code.
        let report = report_with(&[DoctorStatus::Pass, DoctorStatus::Warn]);
        assert!(!report.has_failures());
        assert_eq!(report.exit_code(), 0);
    }

    #[test]
    fn exit_code_is_one_with_any_failure() {
        let report = report_with(&[DoctorStatus::Pass, DoctorStatus::Fail]);
        assert!(report.has_failures());
        assert_eq!(report.exit_code(), 1);
    }

    #[test]
    fn isolated_converts_panics_into_fail_checks() {
        let checks = isolated("area:test", || panic!("boom"));
        assert_eq!(checks.len(), 1);
        let only = &checks[0];
        assert_eq!(only.status, DoctorStatus::Fail);
        assert!(only.detail.contains("boom"));
    }

    #[test]
    fn describe_capabilities_names_known_bits() {
        assert_eq!(describe_capabilities(0), "none");
        assert_eq!(describe_capabilities(1), "0x1 [HANDLE_PASSING]");
        // Bit 1 is not in the registry, so it shows up as unknown.
        let mixed = describe_capabilities(0b11);
        for expected in ["HANDLE_PASSING", "unknown:0x2"] {
            assert!(mixed.contains(expected));
        }
    }

    #[test]
    fn render_text_includes_summary_line() {
        let text = report_with(&[DoctorStatus::Pass, DoctorStatus::Warn]).render_text();
        for expected in ["PASS", "WARN", "doctor: 2 checks — 1 pass, 1 warn, 0 fail"] {
            assert!(text.contains(expected));
        }
    }
}