Skip to main content

rch_common/errors/
explain.rs

1//! Operator-facing code-lookup surface (`rch error explain`).
2//!
3//! Bridges the two parallel code namespaces in this workspace —
4//! [`super::ErrorCode`] (`RCH-E001..E599`, the primary error catalog)
5//! and [`super::ReliabilityReasonCode`] (`RCH-R001..R699`, the doctor
6//! reliability surface) — into a single uniform lookup so an operator
7//! who pastes any code from a log line into `rch error explain` gets a
8//! useful answer regardless of which namespace it came from.
9//!
10//! # Surface
11//!
12//! ```text
13//! rch error explain <CODE>      # human form
14//! rch error explain <CODE> --json
15//! rch error list                # all codes
16//! rch error list --category=worker
17//! rch error list --json
18//! ```
19//!
20//! # Wire shape (JSON)
21//!
22//! ```json
23//! {
24//!   "code": "RCH-R104",
25//!   "namespace": "reliability",
26//!   "name": "WorkerDiskPressureTelemetryGap",
27//!   "category": "disk_pressure",
28//!   "description": "Worker is missing fresh disk telemetry.",
29//!   "remediation": ["Run `rch workers probe <worker>` to refresh telemetry."],
30//!   "requires_restart": false
31//! }
32//! ```
33
34use super::{ErrorCategory, ErrorCode, ReliabilityCategoryKind, ReliabilityReasonCode};
35use serde::{Deserialize, Serialize};
36
37/// Resolved lookup result for either namespace. Produced by [`lookup`].
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct CodeExplanation {
40    pub code: String,
41    pub namespace: CodeNamespace,
42    pub name: String,
43    pub category: String,
44    pub description: String,
45    pub remediation: Vec<String>,
46    /// Only meaningful for reliability codes. `None` for error codes
47    /// where the concept doesn't apply.
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub requires_restart: Option<bool>,
50    /// Optional documentation link. Currently only populated for some
51    /// `ErrorCode` entries that supply a `doc_url`.
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub doc_url: Option<String>,
54}
55
56/// Which catalog the code came from.
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
58#[serde(rename_all = "snake_case")]
59pub enum CodeNamespace {
60    /// `RCH-Ennn` codes from [`ErrorCode`].
61    Error,
62    /// `RCH-Rnnn` codes from [`ReliabilityReasonCode`].
63    Reliability,
64}
65
66/// Look up a code string. Accepts whitespace-padded input.
67///
68/// Returns `None` if the code is unknown or malformed.
69#[must_use]
70pub fn lookup(raw: &str) -> Option<CodeExplanation> {
71    let normalized = raw.trim().to_ascii_uppercase();
72    if let Some(c) = lookup_reliability(&normalized) {
73        return Some(c);
74    }
75    if let Some(c) = lookup_error(&normalized) {
76        return Some(c);
77    }
78    None
79}
80
81/// Whether a string parses as either a known reliability or error code.
82#[must_use]
83pub fn is_known(raw: &str) -> bool {
84    lookup(raw).is_some()
85}
86
87fn lookup_reliability(code: &str) -> Option<CodeExplanation> {
88    let v = ReliabilityReasonCode::from_code_str(code)?;
89    Some(CodeExplanation {
90        code: v.code().to_string(),
91        namespace: CodeNamespace::Reliability,
92        name: v.name().to_string(),
93        category: reliability_category_str(v.category()).to_string(),
94        description: v.remediation_hint().to_string(),
95        remediation: vec![v.remediation_hint().to_string()],
96        requires_restart: Some(v.requires_restart()),
97        doc_url: None,
98    })
99}
100
101fn lookup_error(code: &str) -> Option<CodeExplanation> {
102    for c in error_code_all() {
103        if c.code_string() == code {
104            let entry = c.entry();
105            return Some(CodeExplanation {
106                code: entry.code,
107                namespace: CodeNamespace::Error,
108                name: format!("{c:?}"),
109                category: error_category_str(entry.category).to_string(),
110                description: entry.message,
111                remediation: entry.remediation,
112                requires_restart: None,
113                doc_url: entry.doc_url,
114            });
115        }
116    }
117    None
118}
119
120/// Snake-case string name for a reliability category.
121const fn reliability_category_str(c: ReliabilityCategoryKind) -> &'static str {
122    match c {
123        ReliabilityCategoryKind::Topology => "topology",
124        ReliabilityCategoryKind::DiskPressure => "disk_pressure",
125        ReliabilityCategoryKind::ProcessTriage => "process_triage",
126        ReliabilityCategoryKind::RepoConvergence => "repo_convergence",
127        ReliabilityCategoryKind::HelperCompatibility => "helper_compatibility",
128        ReliabilityCategoryKind::RolloutPosture => "rollout_posture",
129        ReliabilityCategoryKind::SchemaCompatibility => "schema_compatibility",
130    }
131}
132
133/// Snake-case string name for an error category.
134const fn error_category_str(c: ErrorCategory) -> &'static str {
135    match c {
136        ErrorCategory::Config => "config",
137        ErrorCategory::Network => "network",
138        ErrorCategory::Worker => "worker",
139        ErrorCategory::Build => "build",
140        ErrorCategory::Transfer => "transfer",
141        ErrorCategory::Internal => "internal",
142    }
143}
144
145/// All known codes across both namespaces. Used by `rch error list`.
146#[must_use]
147pub fn list_all() -> Vec<CodeExplanation> {
148    let mut out: Vec<CodeExplanation> = Vec::new();
149    for v in ReliabilityReasonCode::ALL {
150        if let Some(e) = lookup_reliability(v.code()) {
151            out.push(e);
152        }
153    }
154    for c in error_code_all() {
155        let s = c.code_string();
156        if let Some(e) = lookup_error(&s) {
157            out.push(e);
158        }
159    }
160    out.sort_by(|a, b| a.code.cmp(&b.code));
161    out
162}
163
164/// Subset of [`list_all`] filtered to one category (matches the
165/// snake_case `category` field on [`CodeExplanation`]). Empty result
166/// indicates an unknown category — caller can detect that.
167#[must_use]
168pub fn list_by_category(category: &str) -> Vec<CodeExplanation> {
169    let cat = category.trim().to_ascii_lowercase();
170    list_all()
171        .into_iter()
172        .filter(|e| e.category == cat)
173        .collect()
174}
175
176/// Known category names across both code namespaces, sorted for stable CLI
177/// help and JSON error payloads.
178#[must_use]
179pub fn known_categories() -> Vec<String> {
180    let mut categories: Vec<String> = list_all().into_iter().map(|e| e.category).collect();
181    categories.sort();
182    categories.dedup();
183    categories
184}
185
186/// Whether a category name matches at least one known code category.
187#[must_use]
188pub fn is_known_category(category: &str) -> bool {
189    let cat = category.trim().to_ascii_lowercase();
190    !cat.is_empty() && known_categories().iter().any(|known| known == &cat)
191}
192
193/// All known [`ErrorCode`] variants. Hand-maintained because `ErrorCode`
194/// doesn't expose an iteration API; this list is the authoritative
195/// snapshot. A unit test asserts every variant has a unique `RCH-Ennn`
196/// `code_string()`, which catches drift if the enum gains/renames variants.
197fn error_code_all() -> &'static [ErrorCode] {
198    use ErrorCode::*;
199    &[
200        // Config (E001-E099)
201        ConfigNotFound,
202        ConfigReadError,
203        ConfigParseError,
204        ConfigValidationError,
205        ConfigEnvError,
206        ConfigProfileNotFound,
207        ConfigNoWorkers,
208        ConfigInvalidWorker,
209        ConfigSshKeyError,
210        ConfigSocketPathError,
211        // Path-Dependency (within Config E013-E018)
212        PathDepManifestParseFailed,
213        PathDepMissing,
214        PathDepCyclic,
215        PathDepPolicyViolation,
216        PathDepMetadataFailed,
217        PathDepMetadataParseFailed,
218        // Closure planner (within Config E019-E024)
219        ClosureFailOpen,
220        ClosureFingerprintMismatch,
221        ClosureHighRisk,
222        ClosureMissingData,
223        ClosureNonDeterministic,
224        ClosurePlanFailed,
225        // Network (E100-E199)
226        SshConnectionFailed,
227        SshAuthFailed,
228        SshHostKeyError,
229        SshKeyError,
230        SshTimeout,
231        NetworkTimeout,
232        NetworkConnectionRefused,
233        NetworkDnsError,
234        NetworkUnreachable,
235        SshSessionDropped,
236        // Worker (E200-E299)
237        WorkerAllUnhealthy,
238        WorkerAtCapacity,
239        WorkerCircuitOpen,
240        WorkerHealthCheckFailed,
241        WorkerLoadQueryFailed,
242        WorkerMissingToolchain,
243        WorkerNoneAvailable,
244        WorkerSelectionFailed,
245        WorkerSelfTestFailed,
246        WorkerStateError,
247        // Worker/Storage (E210-E219)
248        WorkerDiskPressureWarning,
249        WorkerDiskPressureCritical,
250        WorkerDiskHeadroomInsufficient,
251        WorkerDiskIoHigh,
252        WorkerMemoryPressureHigh,
253        WorkerTelemetryGap,
254        WorkerReclaimFailed,
255        WorkerReclaimProtected,
256        // Build (E300-E399)
257        BuildCompilationFailed,
258        BuildTimeout,
259        BuildArtifactMissing,
260        BuildOutputError,
261        BuildKilledBySignal,
262        BuildToolchainError,
263        BuildIncrementalError,
264        BuildEnvError,
265        BuildWorkdirError,
266        BuildUnknownCommand,
267        // Build/Triage (E310-E319)
268        ProcessTriageAdapterUnavailable,
269        ProcessTriageDetectorUncertain,
270        ProcessTriageExecutorError,
271        ProcessTriageInvalidRequest,
272        ProcessTriagePartialResult,
273        ProcessTriagePolicyViolation,
274        ProcessTriageTimeout,
275        ProcessTriageTransportError,
276        // Build/Cancellation (E320-E325)
277        CancelGracefulSent,
278        CancelTimeoutExceeded,
279        CancelEscalatedKill,
280        CancelRemoteKillFailed,
281        CancelCleanupFailed,
282        CancelSlotLeak,
283        // Transfer (E400-E499)
284        TransferRsyncFailed,
285        TransferTimeout,
286        TransferSourceMissing,
287        TransferDestError,
288        TransferDiskFull,
289        TransferPermissionDenied,
290        TransferChecksumError,
291        TransferBinaryFailed,
292        TransferIncomplete,
293        TransferProtocolError,
294        // Internal (E500-E599)
295        InternalDaemonSocket,
296        InternalDaemonProtocol,
297        InternalDaemonNotRunning,
298        InternalIpcError,
299        InternalStateError,
300        InternalSerdeError,
301        InternalHookError,
302        InternalMetricsError,
303        InternalLoggingError,
304        InternalUpdateError,
305    ]
306}
307
308/// Render a [`CodeExplanation`] in a paste-ready human form. Used by
309/// the CLI when `--json` is not set.
310#[must_use]
311pub fn render_human(e: &CodeExplanation) -> String {
312    let mut out = String::with_capacity(512);
313    out.push_str(&format!("{}  {}\n", e.code, e.name));
314    out.push_str(&format!(
315        "Category:     {:<30}  Namespace: {}\n",
316        e.category,
317        match e.namespace {
318            CodeNamespace::Error => "error (RCH-Ennn)",
319            CodeNamespace::Reliability => "reliability (RCH-Rnnn)",
320        }
321    ));
322    if let Some(rr) = e.requires_restart {
323        out.push_str(&format!("Requires restart: {rr}\n"));
324    }
325    out.push_str("\nDescription:\n");
326    out.push_str(&format!("  {}\n", e.description));
327    if !e.remediation.is_empty() {
328        out.push_str("\nRemediation:\n");
329        for step in &e.remediation {
330            out.push_str(&format!("  - {step}\n"));
331        }
332    }
333    if let Some(url) = &e.doc_url {
334        out.push_str(&format!("\nReference: {url}\n"));
335    }
336    out
337}
338
#[cfg(test)]
mod tests {
    //! Unit tests for the lookup, listing and rendering surface. They run
    //! against the real catalogs, so they also act as coarse drift checks.

    use super::*;
    use std::collections::HashSet;

    // ---- lookup -----------------------------------------------------------

    #[test]
    fn test_lookup_reliability_code() {
        let e = lookup("RCH-R104").expect("R104 known");
        assert_eq!(e.code, "RCH-R104");
        assert_eq!(e.namespace, CodeNamespace::Reliability);
        assert_eq!(e.category, "disk_pressure");
        assert!(!e.description.is_empty());
        assert!(!e.remediation.is_empty());
        assert!(e.requires_restart.is_some());
    }

    #[test]
    fn test_lookup_error_code() {
        let e = lookup("RCH-E001").expect("E001 known");
        assert_eq!(e.code, "RCH-E001");
        assert_eq!(e.namespace, CodeNamespace::Error);
        assert_eq!(e.category, "config");
        assert!(!e.description.is_empty());
        assert!(e.requires_restart.is_none());
    }

    #[test]
    fn test_lookup_unknown_returns_none() {
        assert!(lookup("RCH-R999").is_none());
        assert!(lookup("RCH-E999").is_none());
        assert!(lookup("not-a-code").is_none());
        assert!(lookup("").is_none());
    }

    #[test]
    fn test_lookup_trims_whitespace() {
        let e = lookup("  RCH-R001  ").expect("trimmed lookup hits");
        assert_eq!(e.code, "RCH-R001");
    }

    #[test]
    fn test_lookup_is_case_insensitive() {
        let e = lookup("rch-e001").expect("lowercase error-code lookup hits");
        assert_eq!(e.code, "RCH-E001");

        let e = lookup("rch-r104").expect("lowercase reliability-code lookup hits");
        assert_eq!(e.code, "RCH-R104");
    }

    #[test]
    fn test_is_known() {
        assert!(is_known("RCH-R001"));
        assert!(is_known("RCH-E001"));
        assert!(!is_known("RCH-X001"));
    }

    // ---- listing ----------------------------------------------------------

    #[test]
    fn test_list_all_includes_both_namespaces() {
        let all = list_all();
        let reliability_count = all
            .iter()
            .filter(|e| e.namespace == CodeNamespace::Reliability)
            .count();
        let error_count = all
            .iter()
            .filter(|e| e.namespace == CodeNamespace::Error)
            .count();
        assert!(reliability_count >= 40, "expected ≥40 reliability codes");
        assert!(error_count >= 50, "expected ≥50 error codes");
    }

    #[test]
    fn test_list_all_sorted_by_code() {
        let all = list_all();
        for w in all.windows(2) {
            assert!(w[0].code <= w[1].code, "list_all not sorted by code");
        }
    }

    #[test]
    fn test_list_all_no_duplicates() {
        let all = list_all();
        let codes: HashSet<&str> = all.iter().map(|e| e.code.as_str()).collect();
        assert_eq!(codes.len(), all.len(), "duplicate code in list_all");
    }

    #[test]
    fn test_list_by_category_filters() {
        let dp = list_by_category("disk_pressure");
        assert!(!dp.is_empty());
        for e in &dp {
            assert_eq!(e.category, "disk_pressure");
        }
    }

    #[test]
    fn test_list_by_category_unknown_returns_empty() {
        assert!(list_by_category("nonexistent_category").is_empty());
    }

    #[test]
    fn test_known_categories_are_sorted_unique_and_complete() {
        let categories = known_categories();
        assert!(!categories.is_empty());
        for w in categories.windows(2) {
            // Strict ordering implies both "sorted" and "no duplicates".
            assert!(w[0] < w[1], "known_categories must be sorted and unique");
        }
        assert!(categories.iter().any(|c| c == "disk_pressure"));
        assert!(categories.iter().any(|c| c == "worker"));
        assert!(categories.iter().any(|c| c == "topology"));
    }

    #[test]
    fn test_is_known_category_trims_and_normalizes_case() {
        assert!(is_known_category(" disk_pressure "));
        assert!(is_known_category("WORKER"));
        assert!(is_known_category("Topology"));
        assert!(!is_known_category(""));
        assert!(!is_known_category("nonexistent_category"));
    }

    #[test]
    fn test_list_by_category_case_insensitive() {
        let lower = list_by_category("topology");
        let upper = list_by_category("TOPOLOGY");
        let mixed = list_by_category("Topology");
        assert_eq!(lower.len(), upper.len());
        assert_eq!(lower.len(), mixed.len());
        assert!(!lower.is_empty());
    }

    // ---- rendering / serialization ---------------------------------------

    #[test]
    fn test_render_human_includes_code_and_name() {
        let e = lookup("RCH-R104").unwrap();
        let rendered = render_human(&e);
        assert!(rendered.contains("RCH-R104"));
        assert!(rendered.contains(&e.name));
        assert!(rendered.contains("Description:"));
        assert!(rendered.contains("Remediation:"));
    }

    #[test]
    fn test_render_human_omits_requires_restart_for_error_codes() {
        let e = lookup("RCH-E001").unwrap();
        let rendered = render_human(&e);
        assert!(!rendered.contains("Requires restart"));
    }

    #[test]
    fn test_serde_roundtrip() {
        let e = lookup("RCH-R001").unwrap();
        let json = serde_json::to_string(&e).unwrap();
        let back: CodeExplanation = serde_json::from_str(&json).unwrap();
        // Compared field by field so the test does not depend on
        // `CodeExplanation` implementing `PartialEq`.
        assert_eq!(e.code, back.code);
        assert_eq!(e.namespace, back.namespace);
        assert_eq!(e.name, back.name);
        assert_eq!(e.category, back.category);
        assert_eq!(e.description, back.description);
        assert_eq!(e.remediation, back.remediation);
        assert_eq!(e.requires_restart, back.requires_restart);
        assert_eq!(e.doc_url, back.doc_url);
    }

    // ---- catalog drift ----------------------------------------------------

    #[test]
    fn test_error_code_all_consistent_with_code_string() {
        // Every variant in the hand-maintained list must map to a unique,
        // well-formed `RCH-E<digits>` code string. This cannot detect a
        // variant that exists on the enum but is missing from the list.
        let listed = error_code_all();
        let mut seen: HashSet<String> = HashSet::with_capacity(listed.len());
        for c in listed {
            let code = c.code_string();
            let digits = code
                .strip_prefix("RCH-E")
                .unwrap_or_else(|| panic!("{c:?} has code {code:?}, expected an RCH-E prefix"));
            assert!(
                !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()),
                "{c:?} has code {code:?}, expected the form RCH-Ennn"
            );
            assert!(
                seen.insert(code.clone()),
                "duplicate code string {code} (seen again at {c:?})"
            );
        }
        assert_eq!(seen.len(), listed.len());
    }
}