Skip to main content

ralph_workflow/agents/error/
kind.rs

1use super::glm_detection::is_glm_like_agent;
2
/// Error classification for agent failures.
///
/// Used to determine the appropriate recovery strategy when an agent fails:
/// - `should_retry()` - try the same agent again after a delay
/// - `should_immediate_agent_fallback()` - switch agents right away, without retrying
/// - `should_fallback()` - switch to the next agent in the chain
/// - `is_unrecoverable()` - abort the pipeline
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AgentErrorKind {
    /// API rate limit exceeded - switch agents immediately instead of retrying.
    RateLimited,
    /// Token/context limit exceeded - fall back to a different agent.
    TokenExhausted,
    /// API temporarily unavailable (server-side issue) - retry.
    ApiUnavailable,
    /// Network connectivity issue (client-side) - retry.
    NetworkError,
    /// Authentication failure - switch agent.
    AuthFailure,
    /// Command not found - switch agent.
    CommandNotFound,
    /// Disk space exhausted - cannot continue.
    DiskFull,
    /// Process killed (OOM, signal) - fall back; a smaller context may help.
    ProcessKilled,
    /// Invalid JSON response from agent - retry.
    InvalidResponse,
    /// Request/response timeout - retry.
    Timeout,
    /// Tool execution failed (e.g., file write issues) - fall back.
    ToolExecutionFailed,
    /// Known agent-specific behavioral quirk - fall back with specific advice.
    AgentSpecificQuirk,
    /// Agent-specific issue that may be transient - retry before falling back.
    RetryableAgentQuirk,
    /// Other transient error - retry.
    Transient,
    /// Permanent failure - do not retry.
    Permanent,
}
42
43impl AgentErrorKind {
44    /// Determine if this error should trigger a retry.
45    ///
46    /// Note: `RateLimited` is intentionally excluded - it triggers immediate agent fallback
47    /// via `should_immediate_agent_fallback()` instead of retrying with the same agent.
48    #[must_use]
49    pub const fn should_retry(self) -> bool {
50        matches!(
51            self,
52            Self::ApiUnavailable
53                | Self::NetworkError
54                | Self::Timeout
55                | Self::InvalidResponse
56                | Self::RetryableAgentQuirk
57                | Self::Transient
58        )
59    }
60
61    /// Determine if this error requires immediate agent fallback (without retry).
62    ///
63    /// Rate limit (429) errors indicate the current provider is temporarily exhausted.
64    /// Rather than waiting and retrying the same agent (which wastes time), we should
65    /// immediately switch to the next agent in the fallback chain to continue work.
66    #[must_use]
67    pub const fn should_immediate_agent_fallback(self) -> bool {
68        matches!(self, Self::RateLimited)
69    }
70
71    /// Determine if this error should trigger a fallback to another agent.
72    #[must_use]
73    pub const fn should_fallback(self) -> bool {
74        matches!(
75            self,
76            Self::TokenExhausted
77                | Self::AuthFailure
78                | Self::CommandNotFound
79                | Self::ProcessKilled
80                | Self::ToolExecutionFailed
81                | Self::AgentSpecificQuirk
82        )
83    }
84
85    /// Determine if this error is unrecoverable (should abort).
86    #[must_use]
87    pub const fn is_unrecoverable(self) -> bool {
88        matches!(self, Self::DiskFull | Self::Permanent)
89    }
90
91    /// Check if this is a command not found error.
92    #[must_use]
93    pub const fn is_command_not_found(self) -> bool {
94        matches!(self, Self::CommandNotFound)
95    }
96
97    /// Check if this is a network-related error.
98    #[must_use]
99    pub const fn is_network_error(self) -> bool {
100        matches!(self, Self::NetworkError | Self::Timeout)
101    }
102
103    /// Check if this error might be resolved by reducing context size.
104    #[must_use]
105    pub const fn suggests_smaller_context(self) -> bool {
106        matches!(self, Self::TokenExhausted | Self::ProcessKilled)
107    }
108
109    /// Get suggested wait time in milliseconds before retry.
110    #[must_use]
111    pub const fn suggested_wait_ms(self) -> u64 {
112        match self {
113            Self::ApiUnavailable => 3000, // Server issue: wait 3 seconds
114            Self::NetworkError => 2000,   // Network: wait 2 seconds
115            Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, // Timeout/Transient: short wait
116            Self::InvalidResponse => 500, // Bad response: quick retry
117            // No wait for non-retryable errors
118            Self::RateLimited
119            | Self::TokenExhausted
120            | Self::AuthFailure
121            | Self::CommandNotFound
122            | Self::DiskFull
123            | Self::ProcessKilled
124            | Self::ToolExecutionFailed
125            | Self::AgentSpecificQuirk
126            | Self::Permanent => 0,
127        }
128    }
129
130    /// Get a user-friendly description of this error type.
131    #[must_use]
132    pub const fn description(self) -> &'static str {
133        match self {
134            Self::RateLimited => "API rate limit exceeded",
135            Self::TokenExhausted => "Token/context limit exceeded",
136            Self::ApiUnavailable => "API service temporarily unavailable",
137            Self::NetworkError => "Network connectivity issue",
138            Self::AuthFailure => "Authentication failure",
139            Self::CommandNotFound => "Command not found",
140            Self::DiskFull => "Disk space exhausted",
141            Self::ProcessKilled => "Process terminated (possibly OOM)",
142            Self::InvalidResponse => "Invalid response from agent",
143            Self::Timeout => "Request timed out",
144            Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
145            Self::AgentSpecificQuirk => "Known agent-specific issue",
146            Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
147            Self::Transient => "Transient error",
148            Self::Permanent => "Permanent error",
149        }
150    }
151
152    /// Get recovery advice for this error type.
153    #[must_use]
154    pub const fn recovery_advice(self) -> &'static str {
155        match self {
156            Self::RateLimited => {
157                "Switching to next agent immediately. Rate limit indicates provider exhaustion."
158            }
159            Self::TokenExhausted => {
160                "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
161            }
162            Self::ApiUnavailable => {
163                "API server issue. Will retry automatically. Tip: Check status page or try different provider."
164            }
165            Self::NetworkError => {
166                "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
167            }
168            Self::AuthFailure => {
169                "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
170            }
171            Self::CommandNotFound => {
172                "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
173            }
174            Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
175            Self::ProcessKilled => {
176                "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
177            }
178            Self::InvalidResponse => {
179                "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
180            }
181            Self::Timeout => {
182                "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
183            }
184            Self::ToolExecutionFailed => {
185                "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
186            }
187            Self::AgentSpecificQuirk => {
188                "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
189            }
190            Self::RetryableAgentQuirk => {
191                "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
192            }
193            Self::Transient => "Temporary issue. Will retry automatically.",
194            Self::Permanent => {
195                "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
196            }
197        }
198    }
199
200    /// Classify an error from exit code, output, and agent name.
201    ///
202    /// This variant takes the agent name into account for better classification.
203    /// Some agents have known failure patterns that should trigger fallback
204    /// instead of retry, even when the stderr output is generic.
205    pub fn classify_with_agent(
206        exit_code: i32,
207        stderr: &str,
208        agent_name: Option<&str>,
209        model_flag: Option<&str>,
210    ) -> Self {
211        let stderr_lower = stderr.to_lowercase();
212
213        // Check for specific error patterns FIRST, before applying agent-specific heuristics.
214        // This ensures that token exhaustion is detected even for GLM-like agents.
215        if let Some(err) = Self::check_api_errors(&stderr_lower) {
216            return err;
217        }
218
219        if let Some(err) = Self::check_network_errors(&stderr_lower) {
220            return err;
221        }
222
223        if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
224            return err;
225        }
226
227        if let Some(err) = Self::check_tool_failures(&stderr_lower) {
228            return err;
229        }
230
231        // If we know this is a GLM-like agent and it failed with exit code 1
232        // (and we haven't matched a specific error pattern above),
233        // classify based on stderr content.
234        let is_problematic_agent =
235            agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
236
237        if is_problematic_agent && exit_code == 1 {
238            // Check if stderr has known problematic patterns that indicate unrecoverable issues
239            let has_known_problematic_pattern = stderr_lower.contains("permission")
240                || stderr_lower.contains("denied")
241                || stderr_lower.contains("unauthorized")
242                || stderr_lower.contains("auth")
243                || stderr_lower.contains("token")
244                || stderr_lower.contains("limit")
245                || stderr_lower.contains("quota")
246                || stderr_lower.contains("disk")
247                || stderr_lower.contains("space")
248                // Agent-specific known patterns (from check_agent_specific_quirks)
249                || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
250                || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"));
251
252            if has_known_problematic_pattern {
253                // Known issue - should fallback
254                return Self::AgentSpecificQuirk;
255            }
256
257            // Unknown error - may be transient, should retry
258            return Self::RetryableAgentQuirk;
259        }
260
261        if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
262            return err;
263        }
264
265        if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
266            return err;
267        }
268
269        // Transient errors (exit codes that might succeed on retry)
270        if exit_code == 1 && stderr_lower.contains("error") {
271            return Self::Transient;
272        }
273
274        Self::Permanent
275    }
276
277    fn check_api_errors(stderr_lower: &str) -> Option<Self> {
278        // Rate limiting indicators (API-side)
279        if stderr_lower.contains("rate limit")
280            || stderr_lower.contains("too many requests")
281            || stderr_lower.contains("429")
282            || stderr_lower.contains("quota exceeded")
283        {
284            return Some(Self::RateLimited);
285        }
286
287        // Auth failures
288        // Check BEFORE token/context exhaustion so strings like "invalid token" are
289        // treated as authentication failures (not context exhaustion).
290        if stderr_lower.contains("unauthorized")
291            || stderr_lower.contains("authentication")
292            || stderr_lower.contains("401")
293            || stderr_lower.contains("api key")
294            || stderr_lower.contains("invalid token")
295            || stderr_lower.contains("forbidden")
296            || stderr_lower.contains("403")
297            || stderr_lower.contains("access denied")
298            || stderr_lower.contains("credential")
299        {
300            return Some(Self::AuthFailure);
301        }
302
303        // Token/context exhaustion (API-side)
304        // Check this BEFORE GLM agent-specific fallback to ensure TokenExhausted is detected
305        // Note: "too long" is specifically for API token limits, not OS argument limits
306        // We exclude "argument list too long" which is an E2BIG OS error
307        if stderr_lower.contains("context length")
308            || stderr_lower.contains("maximum context")
309            || stderr_lower.contains("max context")
310            || stderr_lower.contains("context window")
311            || stderr_lower.contains("maximum tokens")
312            || stderr_lower.contains("max tokens")
313            || stderr_lower.contains("too many tokens")
314            || stderr_lower.contains("token limit")
315            || stderr_lower.contains("context_length_exceeded")
316            || stderr_lower.contains("input too large")
317            || stderr_lower.contains("prompt is too long")
318            || (stderr_lower.contains("too long")
319                && !stderr_lower.contains("argument list too long"))
320        {
321            return Some(Self::TokenExhausted);
322        }
323
324        None
325    }
326
327    fn check_network_errors(stderr_lower: &str) -> Option<Self> {
328        // Network errors (client-side connectivity issues)
329        if stderr_lower.contains("connection refused")
330            || stderr_lower.contains("network unreachable")
331            || stderr_lower.contains("dns resolution")
332            || stderr_lower.contains("name resolution")
333            || stderr_lower.contains("no route to host")
334            || stderr_lower.contains("network is down")
335            || stderr_lower.contains("host unreachable")
336            || stderr_lower.contains("connection reset")
337            || stderr_lower.contains("broken pipe")
338            || stderr_lower.contains("econnrefused")
339            || stderr_lower.contains("enetunreach")
340        {
341            return Some(Self::NetworkError);
342        }
343
344        // API unavailable (server-side issues)
345        if stderr_lower.contains("service unavailable")
346            || stderr_lower.contains("503")
347            || stderr_lower.contains("502")
348            || stderr_lower.contains("504")
349            || stderr_lower.contains("500")
350            || stderr_lower.contains("internal server error")
351            || stderr_lower.contains("bad gateway")
352            || stderr_lower.contains("gateway timeout")
353            || stderr_lower.contains("overloaded")
354            || stderr_lower.contains("maintenance")
355        {
356            return Some(Self::ApiUnavailable);
357        }
358
359        // Request timeout
360        if stderr_lower.contains("timeout")
361            || stderr_lower.contains("timed out")
362            || stderr_lower.contains("request timeout")
363            || stderr_lower.contains("deadline exceeded")
364        {
365            return Some(Self::Timeout);
366        }
367
368        None
369    }
370
371    fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
372        // Disk space exhaustion
373        if stderr_lower.contains("no space left")
374            || stderr_lower.contains("disk full")
375            || stderr_lower.contains("enospc")
376            || stderr_lower.contains("out of disk")
377            || stderr_lower.contains("insufficient storage")
378        {
379            return Some(Self::DiskFull);
380        }
381
382        // Argument list too long (E2BIG) - prompt exceeds OS limit
383        // Exit code 7 is the E2BIG errno value used by spawn_agent_process
384        if exit_code == 7
385            || stderr_lower.contains("argument list too long")
386            || stderr_lower.contains("e2big")
387        {
388            return Some(Self::ToolExecutionFailed);
389        }
390
391        // Process killed (OOM or signals)
392        // Exit code 137 = 128 + 9 (SIGKILL), 139 = 128 + 11 (SIGSEGV)
393        if exit_code == 137
394            || exit_code == 139
395            || exit_code == -9
396            || stderr_lower.contains("killed")
397            || stderr_lower.contains("oom")
398            || stderr_lower.contains("out of memory")
399            || stderr_lower.contains("memory exhausted")
400            || stderr_lower.contains("cannot allocate")
401            || stderr_lower.contains("segmentation fault")
402            || stderr_lower.contains("sigsegv")
403            || stderr_lower.contains("sigkill")
404        {
405            return Some(Self::ProcessKilled);
406        }
407
408        None
409    }
410
411    fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
412        // Invalid JSON response
413        if stderr_lower.contains("invalid json")
414            || stderr_lower.contains("json parse")
415            || stderr_lower.contains("unexpected token")
416            || stderr_lower.contains("malformed")
417            || stderr_lower.contains("truncated response")
418            || stderr_lower.contains("incomplete response")
419        {
420            return Some(Self::InvalidResponse);
421        }
422
423        // Tool execution failures (file writes, tool calls, etc.)
424        if stderr_lower.contains("write error")
425            || stderr_lower.contains("cannot write")
426            || stderr_lower.contains("failed to write")
427            || stderr_lower.contains("unable to create file")
428            || stderr_lower.contains("file creation failed")
429            || stderr_lower.contains("i/o error")
430            || stderr_lower.contains("io error")
431            || stderr_lower.contains("tool failed")
432            || stderr_lower.contains("tool execution failed")
433            || stderr_lower.contains("tool call failed")
434        {
435            return Some(Self::ToolExecutionFailed);
436        }
437
438        // Permission denied errors (specific patterns that should fallback)
439        if stderr_lower.contains("permission denied")
440            || stderr_lower.contains("operation not permitted")
441            || stderr_lower.contains("insufficient permissions")
442            || stderr_lower.contains("eacces")
443            || stderr_lower.contains("eperm")
444        {
445            return Some(Self::ToolExecutionFailed);
446        }
447
448        None
449    }
450
451    fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
452        // GLM/CCS-specific known issues
453        if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
454            // CCS/GLM with exit code 1 is likely a permission/tool issue
455            if exit_code == 1 {
456                return Some(Self::AgentSpecificQuirk);
457            }
458            // CCS-specific error patterns
459            if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
460                return Some(Self::AgentSpecificQuirk);
461            }
462            // GLM-specific permission errors
463            if stderr_lower.contains("glm")
464                && (stderr_lower.contains("permission")
465                    || stderr_lower.contains("denied")
466                    || stderr_lower.contains("unauthorized"))
467            {
468                return Some(Self::AgentSpecificQuirk);
469            }
470        }
471
472        // Fallback for GLM with any error and exit code 1
473        if stderr_lower.contains("glm") && exit_code == 1 {
474            return Some(Self::AgentSpecificQuirk);
475        }
476
477        None
478    }
479
480    fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
481        // Command not found (keep this after permission checks since permission
482        // errors also contain "permission denied")
483        if exit_code == 127
484            || exit_code == 126
485            || stderr_lower.contains("command not found")
486            || stderr_lower.contains("not found")
487            || stderr_lower.contains("no such file")
488        {
489            return Some(Self::CommandNotFound);
490        }
491
492        None
493    }
494}