Skip to main content

ralph/runner/
error.rs

1//! Runner error surface and contextual constructors.
2//!
3//! Responsibilities:
4//! - Define `RunnerError`, the matchable error type for runner orchestration.
5//! - Provide helpers to construct contextual `RunnerError::Other` values.
6//! - Classify failures as retryable vs non-retryable vs requires-user-input.
7//!
8//! Does not handle:
9//! - Runner/model validation (see `runner/model.rs`).
10//! - Command assembly and process execution (see `runner/execution/*`).
11//!
12//! Assumptions/invariants:
13//! - Any user-visible stdout/stderr stored in errors must be redacted via `RedactedString`
14//!   (or redacted at display time by downstream formatting).
15
16use std::fmt;
17
18use anyhow::anyhow;
19
20use crate::contracts::Runner;
21use crate::redaction::RedactedString;
22
23/// Classification of runner failures for retry decisions.
24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub(crate) enum RunnerFailureClass {
26    /// Transient failure; safe to automatically retry.
27    Retryable(RetryableReason),
28    /// User action required; should not be retried.
29    RequiresUserInput(UserInputReason),
30    /// Deterministic failure; do not retry.
31    NonRetryable(NonRetryableReason),
32}
33
/// Why a failure was judged safe to retry.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum RetryableReason {
    /// A rate limit or quota was hit (for example HTTP 429).
    RateLimited,
    /// The service was temporarily unavailable (for example HTTP 503).
    TemporaryUnavailable,
    /// A transient I/O problem occurred (connection reset, timeout, ...).
    TransientIo,
}
44
/// Why a failure cannot be resolved without the user stepping in.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum UserInputReason {
    /// Authentication is required (API key, login, ...).
    Auth,
    /// A binary that is required is missing.
    MissingBinary,
}
53
/// Why a failure was judged pointless to retry.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum NonRetryableReason {
    /// The invocation itself was invalid (bad arguments and the like).
    InvalidInvocation,
    /// The run ended fatally; retrying would not help.
    FatalExit,
}
62
63#[derive(Debug, thiserror::Error)]
64pub enum RunnerError {
65    #[error("runner binary not found: {bin}")]
66    BinaryMissing {
67        bin: String,
68        #[source]
69        source: std::io::Error,
70    },
71
72    #[error("runner failed to spawn: {bin}")]
73    SpawnFailed {
74        bin: String,
75        #[source]
76        source: std::io::Error,
77    },
78
79    #[error("runner exited non-zero (code={code})\nstdout: {stdout}\nstderr: {stderr}")]
80    NonZeroExit {
81        code: i32,
82        stdout: RedactedString,
83        stderr: RedactedString,
84        session_id: Option<String>,
85    },
86
87    #[error("runner terminated by signal (signal={signal:?})\nstdout: {stdout}\nstderr: {stderr}")]
88    TerminatedBySignal {
89        signal: Option<i32>,
90        stdout: RedactedString,
91        stderr: RedactedString,
92        session_id: Option<String>,
93    },
94
95    #[error("runner interrupted")]
96    Interrupted,
97
98    #[error("runner timed out")]
99    Timeout,
100
101    #[error("io error: {0}")]
102    Io(#[from] std::io::Error),
103
104    #[error("other error: {0}")]
105    Other(#[from] anyhow::Error),
106}
107
108fn runner_label(runner: &Runner) -> String {
109    match runner {
110        Runner::Codex => "codex".to_string(),
111        Runner::Opencode => "opencode".to_string(),
112        Runner::Gemini => "gemini".to_string(),
113        Runner::Cursor => "cursor".to_string(),
114        Runner::Claude => "claude".to_string(),
115        Runner::Kimi => "kimi".to_string(),
116        Runner::Pi => "pi".to_string(),
117        Runner::Plugin(id) => format!("plugin:{}", id),
118    }
119}
120
/// Heuristically decide whether `text` describes a rate-limit or quota failure.
///
/// Matching is case-insensitive. It succeeds when `text` contains the HTTP
/// status code `429`, or one of the phrases "rate limit", "too many requests",
/// "quota exceeded" or "throttled".
///
/// The status code only counts when it is not flanked by another ASCII digit,
/// so unrelated numbers such as `14290` or a pid like `4291` are not mistaken
/// for an HTTP 429 and cannot turn a deterministic failure into a retryable one.
fn looks_like_rate_limit(text: &str) -> bool {
    /// Returns `true` when `code` occurs in `haystack` with no ASCII digit
    /// directly before or after it.
    fn has_status_code(haystack: &str, code: &str) -> bool {
        haystack.match_indices(code).any(|(start, hit)| {
            let end = start + hit.len();
            let digit_before = haystack[..start].ends_with(|c: char| c.is_ascii_digit());
            let digit_after = haystack[end..].starts_with(|c: char| c.is_ascii_digit());
            !(digit_before || digit_after)
        })
    }

    const PHRASES: [&str; 4] = [
        "rate limit",
        "too many requests",
        "quota exceeded",
        "throttled",
    ];

    let lower = text.to_lowercase();
    has_status_code(&lower, "429") || PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
130
/// Heuristically decide whether `text` describes a temporary service outage.
///
/// Matching is case-insensitive. It succeeds when `text` contains one of the
/// HTTP status codes `502`, `503` or `504`, or one of the phrases
/// "service unavailable", "temporarily unavailable" or "gateway timeout".
///
/// A status code only counts when it is not flanked by another ASCII digit,
/// so a number such as `15030` (a duration, a byte count, ...) is not read as
/// an HTTP 503 and does not make a deterministic failure look retryable.
fn looks_like_temporary_unavailable(text: &str) -> bool {
    /// Returns `true` when `code` occurs in `haystack` with no ASCII digit
    /// directly before or after it.
    fn has_status_code(haystack: &str, code: &str) -> bool {
        haystack.match_indices(code).any(|(start, hit)| {
            let end = start + hit.len();
            let digit_before = haystack[..start].ends_with(|c: char| c.is_ascii_digit());
            let digit_after = haystack[end..].starts_with(|c: char| c.is_ascii_digit());
            !(digit_before || digit_after)
        })
    }

    const STATUS_CODES: [&str; 3] = ["503", "502", "504"];
    const PHRASES: [&str; 3] = [
        "service unavailable",
        "temporarily unavailable",
        "gateway timeout",
    ];

    let lower = text.to_lowercase();
    STATUS_CODES.iter().any(|code| has_status_code(&lower, code))
        || PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
141
142/// Check if text looks like an auth error.
143fn looks_like_auth_required(_runner: &Runner, text: &str) -> bool {
144    let lower = text.to_lowercase();
145    lower.contains("401")
146        || lower.contains("unauthorized")
147        || lower.contains("invalid api key")
148        || lower.contains("not logged in")
149        || lower.contains("authentication failed")
150        || lower.contains("access denied")
151}
152
153/// Classify textual failure based on exit code and output.
154fn classify_textual_failure(
155    runner: &Runner,
156    _code: i32,
157    stdout: &str,
158    stderr: &str,
159) -> RunnerFailureClass {
160    let combined = format!("{} {}", stdout, stderr);
161    let text = combined.to_lowercase();
162
163    if looks_like_rate_limit(&text) {
164        return RunnerFailureClass::Retryable(RetryableReason::RateLimited);
165    }
166    if looks_like_temporary_unavailable(&text) {
167        return RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable);
168    }
169    if looks_like_auth_required(runner, &text) {
170        return RunnerFailureClass::RequiresUserInput(UserInputReason::Auth);
171    }
172
173    RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
174}
175
176impl RunnerError {
177    /// Classify this error for retry decisions.
178    ///
179    /// Conservative policy: only clearly transient cases are classified as retryable.
180    pub(crate) fn classify(&self, runner: &Runner) -> RunnerFailureClass {
181        match self {
182            RunnerError::BinaryMissing { .. } => {
183                RunnerFailureClass::RequiresUserInput(UserInputReason::MissingBinary)
184            }
185            RunnerError::SpawnFailed { .. } => {
186                // Usually deterministic; keep non-retryable for now.
187                RunnerFailureClass::NonRetryable(NonRetryableReason::InvalidInvocation)
188            }
189            RunnerError::Interrupted => {
190                RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
191            }
192            RunnerError::Timeout => {
193                // Conservative: treat as retryable only if caller opts in via config.
194                RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
195            }
196            RunnerError::Io(e) => {
197                use std::io::ErrorKind;
198                match e.kind() {
199                    ErrorKind::TimedOut
200                    | ErrorKind::ConnectionReset
201                    | ErrorKind::ConnectionAborted
202                    | ErrorKind::ConnectionRefused
203                    | ErrorKind::NotConnected
204                    | ErrorKind::UnexpectedEof
205                    | ErrorKind::WouldBlock => {
206                        RunnerFailureClass::Retryable(RetryableReason::TransientIo)
207                    }
208                    _ => RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit),
209                }
210            }
211            RunnerError::NonZeroExit {
212                code,
213                stdout,
214                stderr,
215                ..
216            } => classify_textual_failure(runner, *code, &stdout.to_string(), &stderr.to_string()),
217            RunnerError::TerminatedBySignal { .. } => {
218                // Usually not safe to retry automatically.
219                RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
220            }
221            RunnerError::Other(err) => {
222                let msg = format!("{:#}", err).to_lowercase();
223                if looks_like_rate_limit(&msg) {
224                    RunnerFailureClass::Retryable(RetryableReason::RateLimited)
225                } else if looks_like_temporary_unavailable(&msg) {
226                    RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
227                } else if looks_like_auth_required(runner, &msg) {
228                    RunnerFailureClass::RequiresUserInput(UserInputReason::Auth)
229                } else {
230                    RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
231                }
232            }
233        }
234    }
235}
236
237pub(crate) fn runner_execution_error(runner: &Runner, bin: &str, step: &str) -> RunnerError {
238    RunnerError::Other(anyhow!(
239        "Runner execution failed (runner={}, bin={}): {}.",
240        runner_label(runner),
241        bin,
242        step
243    ))
244}
245
246pub(crate) fn runner_execution_error_with_source(
247    runner: &Runner,
248    bin: &str,
249    step: &str,
250    source: impl fmt::Display,
251) -> RunnerError {
252    RunnerError::Other(anyhow!(
253        "Runner execution failed (runner={}, bin={}): {}: {}.",
254        runner_label(runner),
255        bin,
256        step,
257        source
258    ))
259}
260
261#[cfg(test)]
262mod tests;