Skip to main content

ralph/runner/
error.rs

//! Runner error surface and contextual constructors.
//!
//! Responsibilities:
//! - Define `RunnerError`, the matchable error type for runner orchestration.
//! - Provide helpers to construct contextual `RunnerError::Other` values.
//! - Classify failures as retryable vs non-retryable vs requires-user-input.
//!
//! Does not handle:
//! - Runner/model validation (see `runner/model.rs`).
//! - Command assembly and process execution (see `runner/execution/*`).
//!
//! Assumptions/invariants:
//! - Any user-visible stdout/stderr stored in errors must be redacted via `RedactedString`
//!   (or redacted at display time by downstream formatting).
15
16use std::fmt;
17
18use anyhow::anyhow;
19
20use crate::contracts::Runner;
21use crate::redaction::RedactedString;
22
23/// Classification of runner failures for retry decisions.
24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub(crate) enum RunnerFailureClass {
26    /// Transient failure; safe to automatically retry.
27    Retryable(RetryableReason),
28    /// User action required; should not be retried.
29    RequiresUserInput(UserInputReason),
30    /// Deterministic failure; do not retry.
31    NonRetryable(NonRetryableReason),
32}
33
/// Why a failure was judged transient and therefore retryable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum RetryableReason {
    /// A rate limit or quota was hit (HTTP 429 and similar).
    RateLimited,
    /// The service was temporarily unavailable (HTTP 503 and similar).
    TemporaryUnavailable,
    /// A transient I/O problem occurred (connection reset, timeout, ...).
    TransientIo,
}
44
/// Why a failure cannot be resolved without the user stepping in.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum UserInputReason {
    /// Authentication is required (API key, login, ...).
    Auth,
    /// A required binary is missing.
    MissingBinary,
}
53
/// Why a failure was judged deterministic and therefore not worth retrying.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum NonRetryableReason {
    /// The invocation itself was invalid (bad arguments, ...).
    InvalidInvocation,
    /// The runner exited fatally; retrying would not help.
    FatalExit,
}
62
63#[derive(Debug, thiserror::Error)]
64pub enum RunnerError {
65    #[error("runner binary not found: {bin}")]
66    BinaryMissing {
67        bin: String,
68        #[source]
69        source: std::io::Error,
70    },
71
72    #[error("runner failed to spawn: {bin}")]
73    SpawnFailed {
74        bin: String,
75        #[source]
76        source: std::io::Error,
77    },
78
79    #[error("runner exited non-zero (code={code})\nstdout: {stdout}\nstderr: {stderr}")]
80    NonZeroExit {
81        code: i32,
82        stdout: RedactedString,
83        stderr: RedactedString,
84        session_id: Option<String>,
85    },
86
87    #[error("runner terminated by signal (signal={signal:?})\nstdout: {stdout}\nstderr: {stderr}")]
88    TerminatedBySignal {
89        signal: Option<i32>,
90        stdout: RedactedString,
91        stderr: RedactedString,
92        session_id: Option<String>,
93    },
94
95    #[error("runner interrupted")]
96    Interrupted,
97
98    #[error("runner timed out")]
99    Timeout,
100
101    #[error("io error: {0}")]
102    Io(#[from] std::io::Error),
103
104    #[error("other error: {0}")]
105    Other(#[from] anyhow::Error),
106}
107
108fn runner_label(runner: &Runner) -> String {
109    match runner {
110        Runner::Codex => "codex".to_string(),
111        Runner::Opencode => "opencode".to_string(),
112        Runner::Gemini => "gemini".to_string(),
113        Runner::Cursor => "cursor".to_string(),
114        Runner::Claude => "claude".to_string(),
115        Runner::Kimi => "kimi".to_string(),
116        Runner::Pi => "pi".to_string(),
117        Runner::Plugin(id) => format!("plugin:{}", id),
118    }
119}
120
121/// Check if text looks like a rate limit error.
122fn looks_like_rate_limit(text: &str) -> bool {
123    let lower = text.to_lowercase();
124    lower.contains("429")
125        || lower.contains("rate limit")
126        || lower.contains("too many requests")
127        || lower.contains("quota exceeded")
128        || lower.contains("throttled")
129}
130
131/// Check if text looks like a temporary unavailability error.
132fn looks_like_temporary_unavailable(text: &str) -> bool {
133    let lower = text.to_lowercase();
134    lower.contains("503")
135        || lower.contains("service unavailable")
136        || lower.contains("temporarily unavailable")
137        || lower.contains("gateway timeout")
138        || lower.contains("502")
139        || lower.contains("504")
140}
141
142/// Check if text looks like an auth error.
143fn looks_like_auth_required(_runner: &Runner, text: &str) -> bool {
144    let lower = text.to_lowercase();
145    lower.contains("401")
146        || lower.contains("unauthorized")
147        || lower.contains("invalid api key")
148        || lower.contains("not logged in")
149        || lower.contains("authentication failed")
150        || lower.contains("access denied")
151}
152
153/// Classify textual failure based on exit code and output.
154fn classify_textual_failure(
155    runner: &Runner,
156    _code: i32,
157    stdout: &str,
158    stderr: &str,
159) -> RunnerFailureClass {
160    let combined = format!("{} {}", stdout, stderr);
161    let text = combined.to_lowercase();
162
163    if looks_like_rate_limit(&text) {
164        return RunnerFailureClass::Retryable(RetryableReason::RateLimited);
165    }
166    if looks_like_temporary_unavailable(&text) {
167        return RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable);
168    }
169    if looks_like_auth_required(runner, &text) {
170        return RunnerFailureClass::RequiresUserInput(UserInputReason::Auth);
171    }
172
173    RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
174}
175
176impl RunnerError {
177    /// Classify this error for retry decisions.
178    ///
179    /// Conservative policy: only clearly transient cases are classified as retryable.
180    pub(crate) fn classify(&self, runner: &Runner) -> RunnerFailureClass {
181        match self {
182            RunnerError::BinaryMissing { .. } => {
183                RunnerFailureClass::RequiresUserInput(UserInputReason::MissingBinary)
184            }
185            RunnerError::SpawnFailed { .. } => {
186                // Usually deterministic; keep non-retryable for now.
187                RunnerFailureClass::NonRetryable(NonRetryableReason::InvalidInvocation)
188            }
189            RunnerError::Interrupted => {
190                RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
191            }
192            RunnerError::Timeout => {
193                // Conservative: treat as retryable only if caller opts in via config.
194                RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
195            }
196            RunnerError::Io(e) => {
197                use std::io::ErrorKind;
198                match e.kind() {
199                    ErrorKind::TimedOut
200                    | ErrorKind::ConnectionReset
201                    | ErrorKind::ConnectionAborted
202                    | ErrorKind::ConnectionRefused
203                    | ErrorKind::NotConnected
204                    | ErrorKind::UnexpectedEof
205                    | ErrorKind::WouldBlock => {
206                        RunnerFailureClass::Retryable(RetryableReason::TransientIo)
207                    }
208                    _ => RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit),
209                }
210            }
211            RunnerError::NonZeroExit {
212                code,
213                stdout,
214                stderr,
215                ..
216            } => classify_textual_failure(runner, *code, &stdout.to_string(), &stderr.to_string()),
217            RunnerError::TerminatedBySignal { .. } => {
218                // Usually not safe to retry automatically.
219                RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
220            }
221            RunnerError::Other(err) => {
222                let msg = format!("{:#}", err).to_lowercase();
223                if looks_like_rate_limit(&msg) {
224                    RunnerFailureClass::Retryable(RetryableReason::RateLimited)
225                } else if looks_like_temporary_unavailable(&msg) {
226                    RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
227                } else if looks_like_auth_required(runner, &msg) {
228                    RunnerFailureClass::RequiresUserInput(UserInputReason::Auth)
229                } else {
230                    RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit)
231                }
232            }
233        }
234    }
235}
236
237pub(crate) fn runner_execution_error(runner: &Runner, bin: &str, step: &str) -> RunnerError {
238    RunnerError::Other(anyhow!(
239        "Runner execution failed (runner={}, bin={}): {}.",
240        runner_label(runner),
241        bin,
242        step
243    ))
244}
245
246pub(crate) fn runner_execution_error_with_source(
247    runner: &Runner,
248    bin: &str,
249    step: &str,
250    source: impl fmt::Display,
251) -> RunnerError {
252    RunnerError::Other(anyhow!(
253        "Runner execution failed (runner={}, bin={}): {}: {}.",
254        runner_label(runner),
255        bin,
256        step,
257        source
258    ))
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies a `NonZeroExit` (exit code 1) carrying the given output for the Gemini
    /// runner.
    fn classify_exit(stdout: &'static str, stderr: &'static str) -> RunnerFailureClass {
        RunnerError::NonZeroExit {
            code: 1,
            stdout: stdout.into(),
            stderr: stderr.into(),
            session_id: None,
        }
        .classify(&Runner::Gemini)
    }

    /// Shorthand for the most common expected classification.
    const FATAL_EXIT: RunnerFailureClass =
        RunnerFailureClass::NonRetryable(NonRetryableReason::FatalExit);

    #[test]
    fn runner_error_nonzero_exit_redacts_output() {
        let err = RunnerError::NonZeroExit {
            code: 1,
            stdout: "out: API_KEY=secret123".into(),
            stderr: "err: bearer abc123def456".into(),
            session_id: None,
        };
        let msg = format!("{err}");
        assert!(msg.contains("API_KEY=[REDACTED]"));
        assert!(msg.contains("bearer [REDACTED]"));
        assert!(!msg.contains("secret123"));
        assert!(!msg.contains("abc123def456"));
    }

    #[test]
    fn runner_execution_error_includes_context() {
        let err = runner_execution_error(&Runner::Gemini, "gemini", "capture child stdout");
        let msg = format!("{err}");
        assert!(msg.contains("runner=gemini"));
        assert!(msg.contains("bin=gemini"));
        assert!(msg.contains("capture child stdout"));
    }

    #[test]
    fn runner_execution_error_with_source_includes_source() {
        let err = runner_execution_error_with_source(
            &Runner::Codex,
            "codex-bin",
            "wait for child",
            "broken pipe",
        );
        let msg = format!("{err}");
        assert!(msg.contains("runner=codex"));
        assert!(msg.contains("bin=codex-bin"));
        assert!(msg.contains("wait for child: broken pipe."));
    }

    // Tests for looks_like_rate_limit
    #[test]
    fn looks_like_rate_limit_detects_429() {
        assert!(looks_like_rate_limit("Error 429"));
        assert!(looks_like_rate_limit("HTTP 429"));
        assert!(!looks_like_rate_limit("Error 500"));
    }

    #[test]
    fn looks_like_rate_limit_detects_variations() {
        for text in [
            "rate limit exceeded",
            "Rate Limit Exceeded",
            "too many requests",
            "Too Many Requests",
            "quota exceeded",
            "API throttled",
        ] {
            assert!(looks_like_rate_limit(text), "should match: {text:?}");
        }
    }

    #[test]
    fn looks_like_rate_limit_negative_cases() {
        for text in ["success", "internal server error", ""] {
            assert!(!looks_like_rate_limit(text), "should not match: {text:?}");
        }
    }

    // Tests for looks_like_temporary_unavailable
    #[test]
    fn looks_like_temporary_unavailable_detects_503() {
        assert!(looks_like_temporary_unavailable("Error 503"));
        assert!(looks_like_temporary_unavailable("HTTP 503"));
    }

    #[test]
    fn looks_like_temporary_unavailable_detects_gateway_errors() {
        assert!(looks_like_temporary_unavailable("502 Bad Gateway"));
        assert!(looks_like_temporary_unavailable("504 Gateway Timeout"));
    }

    #[test]
    fn looks_like_temporary_unavailable_detects_variations() {
        for text in [
            "service unavailable",
            "Service Unavailable",
            "temporarily unavailable",
            "gateway timeout",
        ] {
            assert!(
                looks_like_temporary_unavailable(text),
                "should match: {text:?}"
            );
        }
    }

    #[test]
    fn looks_like_temporary_unavailable_negative_cases() {
        for text in ["success", "Error 404", ""] {
            assert!(
                !looks_like_temporary_unavailable(text),
                "should not match: {text:?}"
            );
        }
    }

    // Tests for looks_like_auth_required
    #[test]
    fn looks_like_auth_required_detects_401() {
        let runner = Runner::Gemini;
        assert!(looks_like_auth_required(&runner, "Error 401"));
        assert!(looks_like_auth_required(&runner, "HTTP 401"));
    }

    #[test]
    fn looks_like_auth_required_detects_variations() {
        let runner = Runner::Gemini;
        for text in [
            "unauthorized",
            "Unauthorized",
            "invalid api key",
            "not logged in",
            "authentication failed",
            "access denied",
        ] {
            assert!(
                looks_like_auth_required(&runner, text),
                "should match: {text:?}"
            );
        }
    }

    #[test]
    fn looks_like_auth_required_negative_cases() {
        let runner = Runner::Gemini;
        for text in ["success", "Error 500", ""] {
            assert!(
                !looks_like_auth_required(&runner, text),
                "should not match: {text:?}"
            );
        }
    }

    // Tests for classify() method - NonZeroExit
    #[test]
    fn classify_returns_retryable_for_rate_limit() {
        assert_eq!(
            classify_exit("rate limit exceeded", ""),
            RunnerFailureClass::Retryable(RetryableReason::RateLimited)
        );
    }

    #[test]
    fn classify_returns_retryable_for_503() {
        assert_eq!(
            classify_exit("", "HTTP 503 Service Unavailable"),
            RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
        );
    }

    #[test]
    fn classify_returns_requires_user_input_for_auth() {
        assert_eq!(
            classify_exit("401 Unauthorized", ""),
            RunnerFailureClass::RequiresUserInput(UserInputReason::Auth)
        );
    }

    #[test]
    fn classify_returns_non_retryable_for_fatal_exit() {
        assert_eq!(
            classify_exit("some random error", "no matching pattern"),
            FATAL_EXIT
        );
    }

    #[test]
    fn classify_prefers_rate_limit_over_auth() {
        // Rate limiting is checked before authentication, whichever stream carries it.
        assert_eq!(
            classify_exit("401 Unauthorized", "429 too many requests"),
            RunnerFailureClass::Retryable(RetryableReason::RateLimited)
        );
    }

    // Tests for classify() method - Other Error Variants
    #[test]
    fn classify_binary_missing_requires_user_input() {
        let err = RunnerError::BinaryMissing {
            bin: "test".to_string(),
            source: std::io::Error::new(std::io::ErrorKind::NotFound, "not found"),
        };
        assert_eq!(
            err.classify(&Runner::Gemini),
            RunnerFailureClass::RequiresUserInput(UserInputReason::MissingBinary)
        );
    }

    #[test]
    fn classify_spawn_failed_is_non_retryable() {
        let err = RunnerError::SpawnFailed {
            bin: "test".to_string(),
            source: std::io::Error::new(std::io::ErrorKind::PermissionDenied, "denied"),
        };
        assert_eq!(
            err.classify(&Runner::Gemini),
            RunnerFailureClass::NonRetryable(NonRetryableReason::InvalidInvocation)
        );
    }

    #[test]
    fn classify_terminated_by_signal_is_non_retryable() {
        // Output that would otherwise look retryable must not change the verdict.
        let err = RunnerError::TerminatedBySignal {
            signal: Some(9),
            stdout: "".into(),
            stderr: "rate limit exceeded".into(),
            session_id: None,
        };
        assert_eq!(err.classify(&Runner::Gemini), FATAL_EXIT);
    }

    #[test]
    fn classify_timeout_is_retryable() {
        assert_eq!(
            RunnerError::Timeout.classify(&Runner::Gemini),
            RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
        );
    }

    #[test]
    fn classify_interrupted_is_non_retryable() {
        assert_eq!(
            RunnerError::Interrupted.classify(&Runner::Gemini),
            FATAL_EXIT
        );
    }

    #[test]
    fn classify_io_transient_errors_are_retryable() {
        use std::io::ErrorKind;

        for kind in [
            ErrorKind::TimedOut,
            ErrorKind::ConnectionReset,
            ErrorKind::ConnectionAborted,
            ErrorKind::ConnectionRefused,
            ErrorKind::NotConnected,
            ErrorKind::UnexpectedEof,
            ErrorKind::WouldBlock,
        ] {
            let err = RunnerError::Io(std::io::Error::new(kind, "transient error"));
            assert_eq!(
                err.classify(&Runner::Gemini),
                RunnerFailureClass::Retryable(RetryableReason::TransientIo),
                "unexpected classification for {kind:?}"
            );
        }
    }

    #[test]
    fn classify_io_other_errors_are_non_retryable() {
        let io_err = std::io::Error::new(std::io::ErrorKind::PermissionDenied, "permission denied");
        assert_eq!(
            RunnerError::Io(io_err).classify(&Runner::Gemini),
            FATAL_EXIT
        );
    }

    #[test]
    fn classify_other_error_with_rate_limit_pattern() {
        let err = RunnerError::Other(anyhow!("429 rate limit exceeded"));
        assert_eq!(
            err.classify(&Runner::Gemini),
            RunnerFailureClass::Retryable(RetryableReason::RateLimited)
        );
    }

    #[test]
    fn classify_other_error_with_unavailable_pattern() {
        let err = RunnerError::Other(anyhow!("503 Service Unavailable"));
        assert_eq!(
            err.classify(&Runner::Gemini),
            RunnerFailureClass::Retryable(RetryableReason::TemporaryUnavailable)
        );
    }

    #[test]
    fn classify_other_error_with_auth_pattern() {
        let err = RunnerError::Other(anyhow!("401 invalid api key"));
        assert_eq!(
            err.classify(&Runner::Gemini),
            RunnerFailureClass::RequiresUserInput(UserInputReason::Auth)
        );
    }

    #[test]
    fn classify_other_error_without_pattern_is_non_retryable() {
        let err = RunnerError::Other(anyhow!("some generic error"));
        assert_eq!(err.classify(&Runner::Gemini), FATAL_EXIT);
    }
}