Skip to main content

rch_common/errors/
reliability.rs

1//! Reliability-doctor reason-code catalog (`RCH-Rnnn`).
2//!
3//! This module mirrors the [`super::catalog::ErrorCode`] pattern but for the
4//! `rch doctor --reliability` diagnostics surface. Every diagnostic emitted
5//! by the reliability doctor carries one of these variants. The CLI/JSON
6//! representation is a stable `RCH-Rnnn` string, so agents and dashboards
7//! can branch on the code without parsing free-form snake_case strings.
8//!
9//! # Code Ranges
10//!
11//! | Range      | Category              | Description                            |
12//! |------------|-----------------------|----------------------------------------|
13//! | R001-R099  | Topology              | Worker config + daemon worker capacity |
14//! | R100-R199  | DiskPressure          | Worker disk-pressure tiers + telemetry |
15//! | R200-R299  | ProcessTriage         | Cancellation cleanup + process debt    |
16//! | R300-R399  | RepoConvergence       | Worker repo-state convergence          |
17//! | R400-R499  | HelperCompatibility   | rsync / ssh / cargo / zstd availability |
18//! | R500-R599  | RolloutPosture        | self-healing config flags              |
19//! | R600-R699  | SchemaCompatibility   | Cross-binary schema-version drift      |
20//!
21//! Discipline:
22//! - Variant identifiers are CamelCase.
23//! - Each variant has a fixed `RCH-Rnnn` code returned by [`ReliabilityReasonCode::code`].
24//! - Each variant has a category (per the table above) for analytics grouping.
25//! - Each variant declares whether its remediation requires a daemon restart
26//!   via [`ReliabilityReasonCode::requires_restart`] (consumed by the
27//!   reliability doctor's `requires_restart` field on `RemediationStep`).
28//! - Each variant carries a one-line `remediation_hint` for the diagnostic's
29//!   default suggestion text.
30//!
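//! Adding a new variant requires:
//! 1. Pick the next free code in the right range.
//! 2. Add the variant to the enum below.
//! 3. Add an arm to every `match self` block (Rust's exhaustiveness check
//!    will not let you forget — that's the whole point of the enum).
//! 4. Append the variant to [`ReliabilityReasonCode::ALL`]. The compiler does
//!    NOT check this list, and both deserialization (via
//!    [`ReliabilityReasonCode::from_code_str`]) and the unit tests only see
//!    the variants listed there.
//! 5. Run `cargo test`: the unit tests in this module enforce uniqueness,
//!    format, and range-membership for every entry in `ALL`.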
38
39use serde::{Deserialize, Serialize};
40use std::fmt;
41
42/// Stable category groups for reliability reason codes.
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
44#[serde(rename_all = "snake_case")]
45pub enum ReliabilityCategoryKind {
46    Topology,
47    DiskPressure,
48    ProcessTriage,
49    RepoConvergence,
50    HelperCompatibility,
51    RolloutPosture,
52    SchemaCompatibility,
53}
54
/// Reason code attached to a single reliability diagnostic.
///
/// On the wire a value is its canonical `RCH-Rnnn` string: the hand-written
/// [`Serialize`] impl emits that string and the [`Deserialize`] impl accepts
/// the same form. The code assigned to each variant is listed in its doc line.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReliabilityReasonCode {
    // ======== Topology (R001-R099) ========
    /// `RCH-R001`: the workers configuration could not be loaded.
    WorkersConfigUnreadable,
    /// `RCH-R002`: no workers are configured, so every build runs locally.
    NoWorkersConfigured,
    /// `RCH-R003`: workers are configured (Pass).
    WorkersConfigured,
    /// `RCH-R004`: the daemon status surface is unavailable.
    DaemonStatusUnavailable,
    /// `RCH-R005`: the daemon has no registered workers.
    DaemonHasNoWorkers,
    /// `RCH-R006`: every worker is unhealthy.
    AllWorkersUnhealthy,
    /// `RCH-R007`: some workers are healthy and some are not.
    PartialWorkerCapacity,
    /// `RCH-R008`: all workers are healthy (Pass).
    WorkersHealthy,
    /// `RCH-R009`: a worker's circuit breaker is open.
    WorkerCircuitOpen,
    /// `RCH-R010`: a worker is unreachable / offline / failed.
    WorkerUnreachable,
    /// `RCH-R011`: a worker is degraded (half-open circuit or not-ready).
    WorkerDegraded,
    /// `RCH-R012`: a worker is ready (Pass).
    WorkerReady,
    /// `RCH-R013`: a worker reported an unrecognized `ready_status` value
    /// (defensive parse).
    WorkerStatusUnrecognized,
    /// `RCH-R014`: a worker reported an unrecognized `circuit_state` value
    /// (defensive parse).
    WorkerCircuitStateUnrecognized,

    // ======== DiskPressure (R100-R199) ========
    /// `RCH-R100`: the disk-pressure surface is unavailable.
    DiskPressureUnavailable,
    /// `RCH-R101`: worker disk pressure has reached critical.
    WorkerDiskPressureCritical,
    /// `RCH-R102`: worker disk pressure has reached the warning threshold.
    WorkerDiskPressureWarning,
    /// `RCH-R103`: worker disk pressure is healthy (Pass).
    WorkerDiskPressureHealthy,
    /// `RCH-R104`: a worker is missing fresh disk telemetry.
    WorkerDiskPressureTelemetryGap,
    /// `RCH-R105`: no workers reported disk-pressure telemetry (Info; common
    /// for empty fleets).
    DiskPressureNoWorkers,

    // ======== ProcessTriage (R200-R299) ========
    /// `RCH-R200`: the process-debt surface is unavailable.
    ProcessDebtUnavailable,
    /// `RCH-R201`: cancellation cleanup is healthy (Pass).
    CancellationCleanupHealthy,
    /// `RCH-R202`: cancellation cleanup was skipped (no jobs to triage).
    CancellationCleanupSkipped,
    /// `RCH-R203`: cancellation cleanup is degraded (some warnings).
    CancellationCleanupDegraded,
    /// `RCH-R204`: cancellation cleanup is failing.
    CancellationCleanupFailed,

    // ======== RepoConvergence (R300-R399) ========
    /// `RCH-R300`: the repo-convergence surface is unavailable.
    RepoConvergenceUnavailable,
    /// `RCH-R301`: one or more workers failed repo convergence.
    RepoConvergenceFailed,
    /// `RCH-R302`: workers are drifting / stale on repo convergence.
    RepoConvergenceDrift,
    /// `RCH-R303`: no workers reported repo-convergence records (Info).
    RepoConvergenceNoWorkers,
    /// `RCH-R304`: all workers are repo-converged (Pass).
    RepoConvergenceReady,
    /// `RCH-R305`: a specific worker's repo state is not ready.
    WorkerRepoNotReady,

    // ======== HelperCompatibility (R400-R499) ========
    /// `RCH-R400`: a required helper binary is available (Pass).
    HelperAvailable,
    /// `RCH-R401`: a required helper binary is missing.
    HelperMissing,
    /// `RCH-R402`: the helper compatibility probe itself did not complete.
    HelperProbeUnavailable,

    // ======== RolloutPosture (R500-R599) ========
    /// `RCH-R500`: `self_healing.hook_starts_daemon` is enabled (Pass).
    HookAutoStartEnabled,
    /// `RCH-R501`: `self_healing.hook_starts_daemon` is disabled.
    HookAutoStartDisabled,
    /// `RCH-R502`: `self_healing.daemon_installs_hooks` is enabled (Pass).
    DaemonHookRepairEnabled,
    /// `RCH-R503`: `self_healing.daemon_installs_hooks` is disabled.
    DaemonHookRepairDisabled,
    /// `RCH-R504`: configuration could not be loaded.
    ConfigLoadFailed,
    /// `RCH-R505`: the unified status surface is compiled in (Pass).
    StatusSurfaceAvailable,
    /// `RCH-R506`: the repo-convergence status endpoint is wired into the CLI
    /// (Pass).
    RepoConvergenceSurfaceAvailable,
    /// `RCH-R507`: disk-pressure fields are wired into worker status (Pass).
    DiskPressureSurfaceAvailable,

    // ======== SchemaCompatibility (R600-R699) ========
    /// `RCH-R600`: schema versions are compatible (Pass).
    SchemaCompatible,
    /// `RCH-R601`: schema versions are incompatible.
    SchemaIncompatible,
}
162
163impl ReliabilityReasonCode {
164    /// The CamelCase variant identifier as a static string. Used by
165    /// `rch error explain` to render a human-readable name alongside
166    /// the `RCH-Rnnn` code.
167    #[must_use]
168    pub const fn name(self) -> &'static str {
169        match self {
170            Self::WorkersConfigUnreadable => "WorkersConfigUnreadable",
171            Self::NoWorkersConfigured => "NoWorkersConfigured",
172            Self::WorkersConfigured => "WorkersConfigured",
173            Self::DaemonStatusUnavailable => "DaemonStatusUnavailable",
174            Self::DaemonHasNoWorkers => "DaemonHasNoWorkers",
175            Self::AllWorkersUnhealthy => "AllWorkersUnhealthy",
176            Self::PartialWorkerCapacity => "PartialWorkerCapacity",
177            Self::WorkersHealthy => "WorkersHealthy",
178            Self::WorkerCircuitOpen => "WorkerCircuitOpen",
179            Self::WorkerUnreachable => "WorkerUnreachable",
180            Self::WorkerDegraded => "WorkerDegraded",
181            Self::WorkerReady => "WorkerReady",
182            Self::WorkerStatusUnrecognized => "WorkerStatusUnrecognized",
183            Self::WorkerCircuitStateUnrecognized => "WorkerCircuitStateUnrecognized",
184            Self::DiskPressureUnavailable => "DiskPressureUnavailable",
185            Self::WorkerDiskPressureCritical => "WorkerDiskPressureCritical",
186            Self::WorkerDiskPressureWarning => "WorkerDiskPressureWarning",
187            Self::WorkerDiskPressureHealthy => "WorkerDiskPressureHealthy",
188            Self::WorkerDiskPressureTelemetryGap => "WorkerDiskPressureTelemetryGap",
189            Self::DiskPressureNoWorkers => "DiskPressureNoWorkers",
190            Self::ProcessDebtUnavailable => "ProcessDebtUnavailable",
191            Self::CancellationCleanupHealthy => "CancellationCleanupHealthy",
192            Self::CancellationCleanupSkipped => "CancellationCleanupSkipped",
193            Self::CancellationCleanupDegraded => "CancellationCleanupDegraded",
194            Self::CancellationCleanupFailed => "CancellationCleanupFailed",
195            Self::RepoConvergenceUnavailable => "RepoConvergenceUnavailable",
196            Self::RepoConvergenceFailed => "RepoConvergenceFailed",
197            Self::RepoConvergenceDrift => "RepoConvergenceDrift",
198            Self::RepoConvergenceNoWorkers => "RepoConvergenceNoWorkers",
199            Self::RepoConvergenceReady => "RepoConvergenceReady",
200            Self::WorkerRepoNotReady => "WorkerRepoNotReady",
201            Self::HelperAvailable => "HelperAvailable",
202            Self::HelperMissing => "HelperMissing",
203            Self::HelperProbeUnavailable => "HelperProbeUnavailable",
204            Self::HookAutoStartEnabled => "HookAutoStartEnabled",
205            Self::HookAutoStartDisabled => "HookAutoStartDisabled",
206            Self::DaemonHookRepairEnabled => "DaemonHookRepairEnabled",
207            Self::DaemonHookRepairDisabled => "DaemonHookRepairDisabled",
208            Self::ConfigLoadFailed => "ConfigLoadFailed",
209            Self::StatusSurfaceAvailable => "StatusSurfaceAvailable",
210            Self::RepoConvergenceSurfaceAvailable => "RepoConvergenceSurfaceAvailable",
211            Self::DiskPressureSurfaceAvailable => "DiskPressureSurfaceAvailable",
212            Self::SchemaCompatible => "SchemaCompatible",
213            Self::SchemaIncompatible => "SchemaIncompatible",
214        }
215    }
216
217    /// The canonical `RCH-Rnnn` code string for this variant.
218    #[must_use]
219    pub const fn code(self) -> &'static str {
220        match self {
221            // R001-R099 — Topology
222            Self::WorkersConfigUnreadable => "RCH-R001",
223            Self::NoWorkersConfigured => "RCH-R002",
224            Self::WorkersConfigured => "RCH-R003",
225            Self::DaemonStatusUnavailable => "RCH-R004",
226            Self::DaemonHasNoWorkers => "RCH-R005",
227            Self::AllWorkersUnhealthy => "RCH-R006",
228            Self::PartialWorkerCapacity => "RCH-R007",
229            Self::WorkersHealthy => "RCH-R008",
230            Self::WorkerCircuitOpen => "RCH-R009",
231            Self::WorkerUnreachable => "RCH-R010",
232            Self::WorkerDegraded => "RCH-R011",
233            Self::WorkerReady => "RCH-R012",
234            Self::WorkerStatusUnrecognized => "RCH-R013",
235            Self::WorkerCircuitStateUnrecognized => "RCH-R014",
236
237            // R100-R199 — DiskPressure
238            Self::DiskPressureUnavailable => "RCH-R100",
239            Self::WorkerDiskPressureCritical => "RCH-R101",
240            Self::WorkerDiskPressureWarning => "RCH-R102",
241            Self::WorkerDiskPressureHealthy => "RCH-R103",
242            Self::WorkerDiskPressureTelemetryGap => "RCH-R104",
243            Self::DiskPressureNoWorkers => "RCH-R105",
244
245            // R200-R299 — ProcessTriage
246            Self::ProcessDebtUnavailable => "RCH-R200",
247            Self::CancellationCleanupHealthy => "RCH-R201",
248            Self::CancellationCleanupSkipped => "RCH-R202",
249            Self::CancellationCleanupDegraded => "RCH-R203",
250            Self::CancellationCleanupFailed => "RCH-R204",
251
252            // R300-R399 — RepoConvergence
253            Self::RepoConvergenceUnavailable => "RCH-R300",
254            Self::RepoConvergenceFailed => "RCH-R301",
255            Self::RepoConvergenceDrift => "RCH-R302",
256            Self::RepoConvergenceNoWorkers => "RCH-R303",
257            Self::RepoConvergenceReady => "RCH-R304",
258            Self::WorkerRepoNotReady => "RCH-R305",
259
260            // R400-R499 — HelperCompatibility
261            Self::HelperAvailable => "RCH-R400",
262            Self::HelperMissing => "RCH-R401",
263            Self::HelperProbeUnavailable => "RCH-R402",
264
265            // R500-R599 — RolloutPosture
266            Self::HookAutoStartEnabled => "RCH-R500",
267            Self::HookAutoStartDisabled => "RCH-R501",
268            Self::DaemonHookRepairEnabled => "RCH-R502",
269            Self::DaemonHookRepairDisabled => "RCH-R503",
270            Self::ConfigLoadFailed => "RCH-R504",
271            Self::StatusSurfaceAvailable => "RCH-R505",
272            Self::RepoConvergenceSurfaceAvailable => "RCH-R506",
273            Self::DiskPressureSurfaceAvailable => "RCH-R507",
274
275            // R600-R699 — SchemaCompatibility
276            Self::SchemaCompatible => "RCH-R600",
277            Self::SchemaIncompatible => "RCH-R601",
278        }
279    }
280
281    /// The category this variant belongs to.
282    #[must_use]
283    pub const fn category(self) -> ReliabilityCategoryKind {
284        use ReliabilityCategoryKind as C;
285        match self {
286            Self::WorkersConfigUnreadable
287            | Self::NoWorkersConfigured
288            | Self::WorkersConfigured
289            | Self::DaemonStatusUnavailable
290            | Self::DaemonHasNoWorkers
291            | Self::AllWorkersUnhealthy
292            | Self::PartialWorkerCapacity
293            | Self::WorkersHealthy
294            | Self::WorkerCircuitOpen
295            | Self::WorkerUnreachable
296            | Self::WorkerDegraded
297            | Self::WorkerReady
298            | Self::WorkerStatusUnrecognized
299            | Self::WorkerCircuitStateUnrecognized => C::Topology,
300
301            Self::DiskPressureUnavailable
302            | Self::WorkerDiskPressureCritical
303            | Self::WorkerDiskPressureWarning
304            | Self::WorkerDiskPressureHealthy
305            | Self::WorkerDiskPressureTelemetryGap
306            | Self::DiskPressureNoWorkers => C::DiskPressure,
307
308            Self::ProcessDebtUnavailable
309            | Self::CancellationCleanupHealthy
310            | Self::CancellationCleanupSkipped
311            | Self::CancellationCleanupDegraded
312            | Self::CancellationCleanupFailed => C::ProcessTriage,
313
314            Self::RepoConvergenceUnavailable
315            | Self::RepoConvergenceFailed
316            | Self::RepoConvergenceDrift
317            | Self::RepoConvergenceNoWorkers
318            | Self::RepoConvergenceReady
319            | Self::WorkerRepoNotReady => C::RepoConvergence,
320
321            Self::HelperAvailable | Self::HelperMissing | Self::HelperProbeUnavailable => {
322                C::HelperCompatibility
323            }
324
325            Self::HookAutoStartEnabled
326            | Self::HookAutoStartDisabled
327            | Self::DaemonHookRepairEnabled
328            | Self::DaemonHookRepairDisabled
329            | Self::ConfigLoadFailed
330            | Self::StatusSurfaceAvailable
331            | Self::RepoConvergenceSurfaceAvailable
332            | Self::DiskPressureSurfaceAvailable => C::RolloutPosture,
333
334            Self::SchemaCompatible | Self::SchemaIncompatible => C::SchemaCompatibility,
335        }
336    }
337
338    /// Whether the configured remediation requires a process restart for the
339    /// fix to take effect. Consumed by the reliability doctor when building
340    /// `RemediationStep::requires_restart` (sibling bead `2s99h.15`).
341    ///
342    /// Policy:
343    /// - `true` when the fix changes a flag/state read at daemon startup, OR
344    ///   when the underlying subsystem caches state at process start.
345    /// - `false` when the fix is purely external (e.g., disk space, key
346    ///   permissions) OR when the daemon hot-reloads the relevant state.
347    #[must_use]
348    pub const fn requires_restart(self) -> bool {
349        match self {
350            // Topology — daemon parses workers.toml at startup; no SIGHUP yet.
351            Self::WorkersConfigUnreadable
352            | Self::NoWorkersConfigured
353            | Self::DaemonStatusUnavailable
354            | Self::DaemonHasNoWorkers => true,
355            Self::WorkersConfigured | Self::WorkersHealthy | Self::WorkerReady => false,
356            // Worker-level conditions are upstream; no rch restart fixes them.
357            Self::AllWorkersUnhealthy
358            | Self::PartialWorkerCapacity
359            | Self::WorkerCircuitOpen
360            | Self::WorkerUnreachable
361            | Self::WorkerDegraded
362            | Self::WorkerStatusUnrecognized
363            | Self::WorkerCircuitStateUnrecognized => false,
364
365            // Disk pressure is external — operator cleans up disk; daemon picks
366            // up the new free-space numbers on next probe.
367            Self::DiskPressureUnavailable
368            | Self::WorkerDiskPressureCritical
369            | Self::WorkerDiskPressureWarning
370            | Self::WorkerDiskPressureHealthy
371            | Self::WorkerDiskPressureTelemetryGap
372            | Self::DiskPressureNoWorkers => false,
373
374            // Process-triage stale subprocess cleanup may require daemon restart
375            // to clear pgid handles.
376            Self::ProcessDebtUnavailable | Self::CancellationCleanupFailed => true,
377            Self::CancellationCleanupHealthy
378            | Self::CancellationCleanupSkipped
379            | Self::CancellationCleanupDegraded => false,
380
381            // Repo-convergence checks are read-only; remediation is git-side.
382            Self::RepoConvergenceUnavailable
383            | Self::RepoConvergenceFailed
384            | Self::RepoConvergenceDrift
385            | Self::RepoConvergenceNoWorkers
386            | Self::RepoConvergenceReady
387            | Self::WorkerRepoNotReady => false,
388
389            // Helper install (cargo install / package manager) doesn't require
390            // daemon restart.
391            Self::HelperAvailable | Self::HelperMissing | Self::HelperProbeUnavailable => false,
392
393            // Rollout posture flags are cached at startup.
394            Self::HookAutoStartEnabled => false,
395            Self::HookAutoStartDisabled => true,
396            Self::DaemonHookRepairEnabled => false,
397            Self::DaemonHookRepairDisabled => true,
398            Self::ConfigLoadFailed => false,
399            Self::StatusSurfaceAvailable
400            | Self::RepoConvergenceSurfaceAvailable
401            | Self::DiskPressureSurfaceAvailable => false,
402
403            // Schema versions are pinned at compile/bind time; mismatch
404            // requires a fresh process.
405            Self::SchemaCompatible => false,
406            Self::SchemaIncompatible => true,
407        }
408    }
409
410    /// One-line operator-facing remediation hint. Used as the default
411    /// suggestion text when the diagnostic doesn't override it.
412    #[must_use]
413    pub const fn remediation_hint(self) -> &'static str {
414        match self {
415            Self::WorkersConfigUnreadable => {
416                "Inspect ~/.config/rch/workers.toml for a parse error."
417            }
418            Self::NoWorkersConfigured => "Run `rch workers add <host>` to register a worker.",
419            Self::WorkersConfigured => "No action needed.",
420            Self::DaemonStatusUnavailable => "Start the daemon with `rch daemon start` and retry.",
421            Self::DaemonHasNoWorkers => "Run `rch workers add <host>` to register a worker.",
422            Self::AllWorkersUnhealthy => {
423                "Run `rch workers probe --all` to diagnose worker connectivity."
424            }
425            Self::PartialWorkerCapacity => {
426                "Run `rch workers list --json` to identify the unhealthy worker."
427            }
428            Self::WorkersHealthy => "No action needed.",
429            Self::WorkerCircuitOpen => {
430                "Run `rch workers reset-circuit <worker>` once the underlying issue is fixed."
431            }
432            Self::WorkerUnreachable => "Verify SSH connectivity with `rch workers probe <worker>`.",
433            Self::WorkerDegraded => {
434                "Run `rch workers probe <worker>` to refresh worker health state."
435            }
436            Self::WorkerReady => "No action needed.",
437            Self::WorkerStatusUnrecognized => {
438                "Daemon and rch versions may have drifted; reinstall both binaries."
439            }
440            Self::WorkerCircuitStateUnrecognized => {
441                "Daemon and rch versions may have drifted; reinstall both binaries."
442            }
443            Self::DiskPressureUnavailable => "Start the daemon with `rch daemon start` and retry.",
444            Self::WorkerDiskPressureCritical => {
445                "Run `rch worker disk-cleanup --worker <name>` immediately."
446            }
447            Self::WorkerDiskPressureWarning => {
448                "Plan a `rch worker disk-cleanup --worker <name>` cycle."
449            }
450            Self::WorkerDiskPressureHealthy => "No action needed.",
451            Self::WorkerDiskPressureTelemetryGap => {
452                "Run `rch workers probe <worker>` to refresh telemetry."
453            }
454            Self::DiskPressureNoWorkers => "No action needed.",
455            Self::ProcessDebtUnavailable => "Start the daemon with `rch daemon start` and retry.",
456            Self::CancellationCleanupHealthy => "No action needed.",
457            Self::CancellationCleanupSkipped => "No action needed.",
458            Self::CancellationCleanupDegraded => {
459                "Run `rch status --jobs --json` to inspect process-triage state."
460            }
461            Self::CancellationCleanupFailed => {
462                "Restart the daemon with `rch daemon restart` to reset stale pgid handles."
463            }
464            Self::RepoConvergenceUnavailable => {
465                "Start the daemon with `rch daemon start` and retry."
466            }
467            Self::RepoConvergenceFailed => "Run `rch repo sync --all` to drive convergence.",
468            Self::RepoConvergenceDrift => "Run `rch repo sync --all` to refresh worker state.",
469            Self::RepoConvergenceNoWorkers => "No action needed.",
470            Self::RepoConvergenceReady => "No action needed.",
471            Self::WorkerRepoNotReady => "Run `rch repo sync --worker <name>` to converge.",
472            Self::HelperAvailable => "No action needed.",
473            Self::HelperMissing => "Install the missing helper via the system package manager.",
474            Self::HelperProbeUnavailable => {
475                "Rerun the helper probe after checking for stuck local helper subprocesses."
476            }
477            Self::HookAutoStartEnabled => "No action needed.",
478            Self::HookAutoStartDisabled => {
479                "Run `rch config set self_healing.hook_starts_daemon true`."
480            }
481            Self::DaemonHookRepairEnabled => "No action needed.",
482            Self::DaemonHookRepairDisabled => {
483                "Run `rch config set self_healing.daemon_installs_hooks true`."
484            }
485            Self::ConfigLoadFailed => "Run `rch config doctor --json` to diagnose.",
486            Self::StatusSurfaceAvailable
487            | Self::RepoConvergenceSurfaceAvailable
488            | Self::DiskPressureSurfaceAvailable => "No action needed.",
489            Self::SchemaCompatible => "No action needed.",
490            Self::SchemaIncompatible => {
491                "Upgrade rch / rchd / rch-wkr binaries to the same release."
492            }
493        }
494    }
495
496    /// Every variant of this enum, useful for exhaustive iteration in tests.
497    pub const ALL: &'static [ReliabilityReasonCode] = &[
498        Self::WorkersConfigUnreadable,
499        Self::NoWorkersConfigured,
500        Self::WorkersConfigured,
501        Self::DaemonStatusUnavailable,
502        Self::DaemonHasNoWorkers,
503        Self::AllWorkersUnhealthy,
504        Self::PartialWorkerCapacity,
505        Self::WorkersHealthy,
506        Self::WorkerCircuitOpen,
507        Self::WorkerUnreachable,
508        Self::WorkerDegraded,
509        Self::WorkerReady,
510        Self::WorkerStatusUnrecognized,
511        Self::WorkerCircuitStateUnrecognized,
512        Self::DiskPressureUnavailable,
513        Self::WorkerDiskPressureCritical,
514        Self::WorkerDiskPressureWarning,
515        Self::WorkerDiskPressureHealthy,
516        Self::WorkerDiskPressureTelemetryGap,
517        Self::DiskPressureNoWorkers,
518        Self::ProcessDebtUnavailable,
519        Self::CancellationCleanupHealthy,
520        Self::CancellationCleanupSkipped,
521        Self::CancellationCleanupDegraded,
522        Self::CancellationCleanupFailed,
523        Self::RepoConvergenceUnavailable,
524        Self::RepoConvergenceFailed,
525        Self::RepoConvergenceDrift,
526        Self::RepoConvergenceNoWorkers,
527        Self::RepoConvergenceReady,
528        Self::WorkerRepoNotReady,
529        Self::HelperAvailable,
530        Self::HelperMissing,
531        Self::HelperProbeUnavailable,
532        Self::HookAutoStartEnabled,
533        Self::HookAutoStartDisabled,
534        Self::DaemonHookRepairEnabled,
535        Self::DaemonHookRepairDisabled,
536        Self::ConfigLoadFailed,
537        Self::StatusSurfaceAvailable,
538        Self::RepoConvergenceSurfaceAvailable,
539        Self::DiskPressureSurfaceAvailable,
540        Self::SchemaCompatible,
541        Self::SchemaIncompatible,
542    ];
543}
544
545impl fmt::Display for ReliabilityReasonCode {
546    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
547        f.write_str(self.code())
548    }
549}
550
551impl Serialize for ReliabilityReasonCode {
552    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
553        s.serialize_str(self.code())
554    }
555}
556
557impl<'de> Deserialize<'de> for ReliabilityReasonCode {
558    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
559        let raw = String::deserialize(d)?;
560        Self::from_code_str(&raw).ok_or_else(|| {
561            serde::de::Error::custom(format!("unknown reliability reason code {raw:?}"))
562        })
563    }
564}
565
566impl ReliabilityReasonCode {
567    /// Reverse-lookup helper for deserialization.
568    #[must_use]
569    pub fn from_code_str(code: &str) -> Option<Self> {
570        Self::ALL.iter().copied().find(|c| c.code() == code)
571    }
572}
573
574#[cfg(test)]
575mod tests {
576    use super::*;
577    use std::collections::HashSet;
578
579    #[test]
580    fn test_reliability_reason_codes_unique() {
581        let mut seen = HashSet::new();
582        for &c in ReliabilityReasonCode::ALL {
583            assert!(
584                seen.insert(c.code()),
585                "duplicate code {} for variant {:?}",
586                c.code(),
587                c
588            );
589        }
590        assert_eq!(seen.len(), ReliabilityReasonCode::ALL.len());
591    }
592
593    #[test]
594    fn test_reliability_reason_codes_format() {
595        for &c in ReliabilityReasonCode::ALL {
596            let code = c.code();
597            assert!(
598                code.starts_with("RCH-R")
599                    && code[5..].len() == 3
600                    && code[5..].chars().all(|ch| ch.is_ascii_digit()),
601                "invalid code format {code} for {c:?}"
602            );
603        }
604    }
605
606    #[test]
607    fn test_reliability_reason_codes_in_documented_ranges() {
608        for &c in ReliabilityReasonCode::ALL {
609            let n: u32 = c.code()[5..].parse().expect("3-digit numeric");
610            let cat = c.category();
611            let expected_range = match cat {
612                ReliabilityCategoryKind::Topology => 1..=99,
613                ReliabilityCategoryKind::DiskPressure => 100..=199,
614                ReliabilityCategoryKind::ProcessTriage => 200..=299,
615                ReliabilityCategoryKind::RepoConvergence => 300..=399,
616                ReliabilityCategoryKind::HelperCompatibility => 400..=499,
617                ReliabilityCategoryKind::RolloutPosture => 500..=599,
618                ReliabilityCategoryKind::SchemaCompatibility => 600..=699,
619            };
620            assert!(
621                expected_range.contains(&n),
622                "{c:?} code {} (n={n}) not in expected range {:?} for category {:?}",
623                c.code(),
624                expected_range,
625                cat
626            );
627        }
628    }
629
630    #[test]
631    fn test_reliability_reason_codes_serde_roundtrip() {
632        for &c in ReliabilityReasonCode::ALL {
633            let s = serde_json::to_string(&c).unwrap();
634            let d: ReliabilityReasonCode = serde_json::from_str(&s).unwrap();
635            assert_eq!(c, d, "round-trip mismatch for {c:?}");
636            // Also confirm the on-the-wire form is the RCH-Rnnn string.
637            let expected = format!("\"{}\"", c.code());
638            assert_eq!(s, expected);
639        }
640    }
641
642    #[test]
643    fn test_reliability_reason_codes_remediation_non_empty() {
644        for &c in ReliabilityReasonCode::ALL {
645            let hint = c.remediation_hint();
646            assert!(!hint.is_empty(), "empty remediation hint for {c:?}");
647        }
648    }
649
650    #[test]
651    fn test_unknown_code_deserialize_fails_clearly() {
652        let r: Result<ReliabilityReasonCode, _> = serde_json::from_str("\"RCH-R999\"");
653        let err = r.expect_err("RCH-R999 should not deserialize");
654        let msg = err.to_string();
655        assert!(
656            msg.contains("RCH-R999"),
657            "error should name the unknown code, got: {msg}"
658        );
659    }
660
661    #[test]
662    fn test_display_matches_code() {
663        for &c in ReliabilityReasonCode::ALL {
664            assert_eq!(format!("{c}"), c.code().to_string());
665        }
666    }
667
668    #[test]
669    fn test_requires_restart_explicit_for_every_reason() {
670        // Rust's exhaustive match enforces this; the test exists to catch a
671        // future "_ => false" wildcard from regressing the discipline.
672        for &c in ReliabilityReasonCode::ALL {
673            // Just call it; if there's a panic-on-miss, this catches it.
674            let _ = c.requires_restart();
675        }
676    }
677
678    /// Pinned policy table mirroring the [bead body's table for
679    /// `2s99h.15`](https://example.invalid). The implementation
680    /// [`ReliabilityReasonCode::requires_restart`] is the canonical authority;
681    /// this table is duplicate state used to:
682    /// 1. Detect implementation drift (reviewer sees both edits in the diff).
683    /// 2. Provide a paste-ready reference table for documentation.
684    ///
685    /// Adding a new variant requires updating BOTH the impl AND this table —
686    /// surfacing the rationale in code review. The
687    /// [`test_requires_restart_table_matches_impl`] test enforces the match.
688    const REQUIRES_RESTART_TABLE: &[(ReliabilityReasonCode, bool)] = &[
689        // Topology
690        (ReliabilityReasonCode::WorkersConfigUnreadable, true),
691        (ReliabilityReasonCode::NoWorkersConfigured, true),
692        (ReliabilityReasonCode::WorkersConfigured, false),
693        (ReliabilityReasonCode::DaemonStatusUnavailable, true),
694        (ReliabilityReasonCode::DaemonHasNoWorkers, true),
695        (ReliabilityReasonCode::AllWorkersUnhealthy, false),
696        (ReliabilityReasonCode::PartialWorkerCapacity, false),
697        (ReliabilityReasonCode::WorkersHealthy, false),
698        (ReliabilityReasonCode::WorkerCircuitOpen, false),
699        (ReliabilityReasonCode::WorkerUnreachable, false),
700        (ReliabilityReasonCode::WorkerDegraded, false),
701        (ReliabilityReasonCode::WorkerReady, false),
702        (ReliabilityReasonCode::WorkerStatusUnrecognized, false),
703        (ReliabilityReasonCode::WorkerCircuitStateUnrecognized, false),
704        // DiskPressure
705        (ReliabilityReasonCode::DiskPressureUnavailable, false),
706        (ReliabilityReasonCode::WorkerDiskPressureCritical, false),
707        (ReliabilityReasonCode::WorkerDiskPressureWarning, false),
708        (ReliabilityReasonCode::WorkerDiskPressureHealthy, false),
709        (ReliabilityReasonCode::WorkerDiskPressureTelemetryGap, false),
710        (ReliabilityReasonCode::DiskPressureNoWorkers, false),
711        // ProcessTriage
712        (ReliabilityReasonCode::ProcessDebtUnavailable, true),
713        (ReliabilityReasonCode::CancellationCleanupHealthy, false),
714        (ReliabilityReasonCode::CancellationCleanupSkipped, false),
715        (ReliabilityReasonCode::CancellationCleanupDegraded, false),
716        (ReliabilityReasonCode::CancellationCleanupFailed, true),
717        // RepoConvergence
718        (ReliabilityReasonCode::RepoConvergenceUnavailable, false),
719        (ReliabilityReasonCode::RepoConvergenceFailed, false),
720        (ReliabilityReasonCode::RepoConvergenceDrift, false),
721        (ReliabilityReasonCode::RepoConvergenceNoWorkers, false),
722        (ReliabilityReasonCode::RepoConvergenceReady, false),
723        (ReliabilityReasonCode::WorkerRepoNotReady, false),
724        // HelperCompatibility
725        (ReliabilityReasonCode::HelperAvailable, false),
726        (ReliabilityReasonCode::HelperMissing, false),
727        (ReliabilityReasonCode::HelperProbeUnavailable, false),
728        // RolloutPosture
729        (ReliabilityReasonCode::HookAutoStartEnabled, false),
730        (ReliabilityReasonCode::HookAutoStartDisabled, true),
731        (ReliabilityReasonCode::DaemonHookRepairEnabled, false),
732        (ReliabilityReasonCode::DaemonHookRepairDisabled, true),
733        (ReliabilityReasonCode::ConfigLoadFailed, false),
734        (ReliabilityReasonCode::StatusSurfaceAvailable, false),
735        (
736            ReliabilityReasonCode::RepoConvergenceSurfaceAvailable,
737            false,
738        ),
739        (ReliabilityReasonCode::DiskPressureSurfaceAvailable, false),
740        // SchemaCompatibility
741        (ReliabilityReasonCode::SchemaCompatible, false),
742        (ReliabilityReasonCode::SchemaIncompatible, true),
743    ];
744
#[test]
fn test_requires_restart_table_matches_impl() {
    // Two-way check between the pinned table and the implementation: the
    // table must match `ReliabilityReasonCode::ALL` entry for entry (same
    // length, same order), and each pinned bool must equal what
    // `requires_restart()` returns. Editing one side without the other
    // fails here with a message naming the offending entry.
    let table_len = REQUIRES_RESTART_TABLE.len();
    let variant_count = ReliabilityReasonCode::ALL.len();
    assert_eq!(
        table_len, variant_count,
        "REQUIRES_RESTART_TABLE has {} entries but {} variants exist. \
         Add or remove the corresponding entry when changing the variant set.",
        table_len, variant_count
    );

    // Lengths are equal at this point, so `zip` pairs up every entry.
    let paired = REQUIRES_RESTART_TABLE
        .iter()
        .zip(ReliabilityReasonCode::ALL.iter());
    for (i, (&(variant, expected), &impl_variant)) in paired.enumerate() {
        // Order check first: a misordered table would otherwise produce a
        // confusing policy mismatch below.
        assert_eq!(
            variant, impl_variant,
            "Position {i}: REQUIRES_RESTART_TABLE has {variant:?} but ALL has {impl_variant:?}. \
             Tables must be in identical order — easier diff review.",
        );
        let actual = variant.requires_restart();
        assert_eq!(
            expected, actual,
            "Policy mismatch for {variant:?}: table says {expected}, impl says {actual}. \
             Update BOTH or NEITHER.",
        );
    }
}
778
#[test]
fn test_requires_restart_consistency_with_remediation_hint() {
    // Heuristic drift check between the operator-facing hint and the policy
    // bool: any variant whose remediation hint contains "restart"
    // (compared after lowercasing) must report `requires_restart() == true`.
    //
    // Only that direction is enforced. A variant with `requires_restart()`
    // true and no "restart" in its hint is accepted, since the hint may use
    // a different idiom (e.g., "reinstall both binaries").
    //
    // NOTE(review): the substring match is unconditional, so a hint that
    // mentions a restart command for a non-restart variant WILL fail here.
    let restart_hinted = ReliabilityReasonCode::ALL
        .iter()
        .copied()
        .filter(|reason| reason.remediation_hint().to_lowercase().contains("restart"));

    for c in restart_hinted {
        assert!(
            c.requires_restart(),
            "Variant {c:?} hint mentions 'restart' but requires_restart() returns false. \
             Either update the hint to NOT say 'restart' or set requires_restart=true."
        );
    }
}
800}