Skip to main content

ralph_workflow/agents/error/
kind.rs

1use super::glm_detection::is_glm_like_agent;
2
/// Error classification for agent failures.
///
/// Used to determine the recovery strategy when an agent fails. Each kind is
/// routed by the predicates on this type:
/// - `should_retry()` - Try same agent again after delay
/// - `should_immediate_agent_fallback()` - Switch to next agent without retrying
/// - `should_fallback()` - Switch to next agent in the chain
/// - `is_unrecoverable()` - Abort the pipeline
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AgentErrorKind {
    /// API rate limit exceeded - immediate fallback to the next agent (no retry, no wait).
    RateLimited,
    /// Token/context limit exceeded - switch agent; a smaller context may help.
    TokenExhausted,
    /// API temporarily unavailable (server-side issue) - retry.
    ApiUnavailable,
    /// Network connectivity issue (client-side) - retry.
    NetworkError,
    /// Authentication failure - switch agent.
    AuthFailure,
    /// Command not found - switch agent.
    CommandNotFound,
    /// Disk space exhausted - cannot continue.
    DiskFull,
    /// Process killed (OOM, signal) - switch agent; a smaller context may help.
    ProcessKilled,
    /// Invalid JSON response from agent - retry.
    InvalidResponse,
    /// Request/response timeout - retry.
    Timeout,
    /// Tool execution failed - should fallback (e.g., file write issues).
    ToolExecutionFailed,
    /// Known agent-specific behavioral quirk - should fallback with specific advice.
    AgentSpecificQuirk,
    /// Agent-specific issue that may be transient - should retry before falling back.
    RetryableAgentQuirk,
    /// Other transient error - retry.
    Transient,
    /// Permanent failure - do not retry.
    Permanent,
}
42
43impl AgentErrorKind {
44    /// Determine if this error should trigger a retry.
45    ///
46    /// Note: `RateLimited` is intentionally excluded - it triggers immediate agent fallback
47    /// via `should_immediate_agent_fallback()` instead of retrying with the same agent.
48    pub const fn should_retry(self) -> bool {
49        matches!(
50            self,
51            Self::ApiUnavailable
52                | Self::NetworkError
53                | Self::Timeout
54                | Self::InvalidResponse
55                | Self::RetryableAgentQuirk
56                | Self::Transient
57        )
58    }
59
60    /// Determine if this error requires immediate agent fallback (without retry).
61    ///
62    /// Rate limit (429) errors indicate the current provider is temporarily exhausted.
63    /// Rather than waiting and retrying the same agent (which wastes time), we should
64    /// immediately switch to the next agent in the fallback chain to continue work.
65    pub const fn should_immediate_agent_fallback(self) -> bool {
66        matches!(self, Self::RateLimited)
67    }
68
69    /// Determine if this error should trigger a fallback to another agent.
70    pub const fn should_fallback(self) -> bool {
71        matches!(
72            self,
73            Self::TokenExhausted
74                | Self::AuthFailure
75                | Self::CommandNotFound
76                | Self::ProcessKilled
77                | Self::ToolExecutionFailed
78                | Self::AgentSpecificQuirk
79        )
80    }
81
82    /// Determine if this error is unrecoverable (should abort).
83    pub const fn is_unrecoverable(self) -> bool {
84        matches!(self, Self::DiskFull | Self::Permanent)
85    }
86
87    /// Check if this is a command not found error.
88    pub const fn is_command_not_found(self) -> bool {
89        matches!(self, Self::CommandNotFound)
90    }
91
92    /// Check if this is a network-related error.
93    pub const fn is_network_error(self) -> bool {
94        matches!(self, Self::NetworkError | Self::Timeout)
95    }
96
97    /// Check if this error might be resolved by reducing context size.
98    pub const fn suggests_smaller_context(self) -> bool {
99        matches!(self, Self::TokenExhausted | Self::ProcessKilled)
100    }
101
102    /// Get suggested wait time in milliseconds before retry.
103    pub const fn suggested_wait_ms(self) -> u64 {
104        match self {
105            // RateLimited: no wait - we immediately fallback to next agent
106            Self::RateLimited => 0,
107            Self::ApiUnavailable => 3000, // Server issue: wait 3 seconds
108            Self::NetworkError => 2000,   // Network: wait 2 seconds
109            Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, // Timeout/Transient: short wait
110            Self::InvalidResponse => 500, // Bad response: quick retry
111            _ => 0,                       // No wait for non-retryable errors
112        }
113    }
114
115    /// Get a user-friendly description of this error type.
116    pub const fn description(self) -> &'static str {
117        match self {
118            Self::RateLimited => "API rate limit exceeded",
119            Self::TokenExhausted => "Token/context limit exceeded",
120            Self::ApiUnavailable => "API service temporarily unavailable",
121            Self::NetworkError => "Network connectivity issue",
122            Self::AuthFailure => "Authentication failure",
123            Self::CommandNotFound => "Command not found",
124            Self::DiskFull => "Disk space exhausted",
125            Self::ProcessKilled => "Process terminated (possibly OOM)",
126            Self::InvalidResponse => "Invalid response from agent",
127            Self::Timeout => "Request timed out",
128            Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
129            Self::AgentSpecificQuirk => "Known agent-specific issue",
130            Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
131            Self::Transient => "Transient error",
132            Self::Permanent => "Permanent error",
133        }
134    }
135
136    /// Get recovery advice for this error type.
137    pub const fn recovery_advice(self) -> &'static str {
138        match self {
139            Self::RateLimited => {
140                "Switching to next agent immediately. Rate limit indicates provider exhaustion."
141            }
142            Self::TokenExhausted => {
143                "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
144            }
145            Self::ApiUnavailable => {
146                "API server issue. Will retry automatically. Tip: Check status page or try different provider."
147            }
148            Self::NetworkError => {
149                "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
150            }
151            Self::AuthFailure => {
152                "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
153            }
154            Self::CommandNotFound => {
155                "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
156            }
157            Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
158            Self::ProcessKilled => {
159                "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
160            }
161            Self::InvalidResponse => {
162                "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
163            }
164            Self::Timeout => {
165                "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
166            }
167            Self::ToolExecutionFailed => {
168                "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
169            }
170            Self::AgentSpecificQuirk => {
171                "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
172            }
173            Self::RetryableAgentQuirk => {
174                "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
175            }
176            Self::Transient => "Temporary issue. Will retry automatically.",
177            Self::Permanent => {
178                "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
179            }
180        }
181    }
182
183    /// Classify an error from exit code, output, and agent name.
184    ///
185    /// This variant takes the agent name into account for better classification.
186    /// Some agents have known failure patterns that should trigger fallback
187    /// instead of retry, even when the stderr output is generic.
188    pub fn classify_with_agent(
189        exit_code: i32,
190        stderr: &str,
191        agent_name: Option<&str>,
192        model_flag: Option<&str>,
193    ) -> Self {
194        let stderr_lower = stderr.to_lowercase();
195
196        // Check for specific error patterns FIRST, before applying agent-specific heuristics.
197        // This ensures that token exhaustion is detected even for GLM-like agents.
198        if let Some(err) = Self::check_api_errors(&stderr_lower) {
199            return err;
200        }
201
202        if let Some(err) = Self::check_network_errors(&stderr_lower) {
203            return err;
204        }
205
206        if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
207            return err;
208        }
209
210        if let Some(err) = Self::check_tool_failures(&stderr_lower) {
211            return err;
212        }
213
214        // If we know this is a GLM-like agent and it failed with exit code 1
215        // (and we haven't matched a specific error pattern above),
216        // classify based on stderr content.
217        let is_problematic_agent =
218            agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
219
220        if is_problematic_agent && exit_code == 1 {
221            // Check if stderr has known problematic patterns that indicate unrecoverable issues
222            let has_known_problematic_pattern = stderr_lower.contains("permission")
223                || stderr_lower.contains("denied")
224                || stderr_lower.contains("unauthorized")
225                || stderr_lower.contains("auth")
226                || stderr_lower.contains("token")
227                || stderr_lower.contains("limit")
228                || stderr_lower.contains("quota")
229                || stderr_lower.contains("disk")
230                || stderr_lower.contains("space")
231                // Agent-specific known patterns (from check_agent_specific_quirks)
232                || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
233                || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"));
234
235            if has_known_problematic_pattern {
236                // Known issue - should fallback
237                return Self::AgentSpecificQuirk;
238            }
239
240            // Unknown error - may be transient, should retry
241            return Self::RetryableAgentQuirk;
242        }
243
244        if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
245            return err;
246        }
247
248        if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
249            return err;
250        }
251
252        // Transient errors (exit codes that might succeed on retry)
253        if exit_code == 1 && stderr_lower.contains("error") {
254            return Self::Transient;
255        }
256
257        Self::Permanent
258    }
259
260    fn check_api_errors(stderr_lower: &str) -> Option<Self> {
261        // Rate limiting indicators (API-side)
262        if stderr_lower.contains("rate limit")
263            || stderr_lower.contains("too many requests")
264            || stderr_lower.contains("429")
265            || stderr_lower.contains("quota exceeded")
266        {
267            return Some(Self::RateLimited);
268        }
269
270        // Auth failures
271        // Check BEFORE token/context exhaustion so strings like "invalid token" are
272        // treated as authentication failures (not context exhaustion).
273        if stderr_lower.contains("unauthorized")
274            || stderr_lower.contains("authentication")
275            || stderr_lower.contains("401")
276            || stderr_lower.contains("api key")
277            || stderr_lower.contains("invalid token")
278            || stderr_lower.contains("forbidden")
279            || stderr_lower.contains("403")
280            || stderr_lower.contains("access denied")
281            || stderr_lower.contains("credential")
282        {
283            return Some(Self::AuthFailure);
284        }
285
286        // Token/context exhaustion (API-side)
287        // Check this BEFORE GLM agent-specific fallback to ensure TokenExhausted is detected
288        // Note: "too long" is specifically for API token limits, not OS argument limits
289        // We exclude "argument list too long" which is an E2BIG OS error
290        if stderr_lower.contains("context length")
291            || stderr_lower.contains("maximum context")
292            || stderr_lower.contains("max context")
293            || stderr_lower.contains("context window")
294            || stderr_lower.contains("maximum tokens")
295            || stderr_lower.contains("max tokens")
296            || stderr_lower.contains("too many tokens")
297            || stderr_lower.contains("token limit")
298            || stderr_lower.contains("context_length_exceeded")
299            || stderr_lower.contains("input too large")
300            || stderr_lower.contains("prompt is too long")
301            || (stderr_lower.contains("too long")
302                && !stderr_lower.contains("argument list too long"))
303        {
304            return Some(Self::TokenExhausted);
305        }
306
307        None
308    }
309
310    fn check_network_errors(stderr_lower: &str) -> Option<Self> {
311        // Network errors (client-side connectivity issues)
312        if stderr_lower.contains("connection refused")
313            || stderr_lower.contains("network unreachable")
314            || stderr_lower.contains("dns resolution")
315            || stderr_lower.contains("name resolution")
316            || stderr_lower.contains("no route to host")
317            || stderr_lower.contains("network is down")
318            || stderr_lower.contains("host unreachable")
319            || stderr_lower.contains("connection reset")
320            || stderr_lower.contains("broken pipe")
321            || stderr_lower.contains("econnrefused")
322            || stderr_lower.contains("enetunreach")
323        {
324            return Some(Self::NetworkError);
325        }
326
327        // API unavailable (server-side issues)
328        if stderr_lower.contains("service unavailable")
329            || stderr_lower.contains("503")
330            || stderr_lower.contains("502")
331            || stderr_lower.contains("504")
332            || stderr_lower.contains("500")
333            || stderr_lower.contains("internal server error")
334            || stderr_lower.contains("bad gateway")
335            || stderr_lower.contains("gateway timeout")
336            || stderr_lower.contains("overloaded")
337            || stderr_lower.contains("maintenance")
338        {
339            return Some(Self::ApiUnavailable);
340        }
341
342        // Request timeout
343        if stderr_lower.contains("timeout")
344            || stderr_lower.contains("timed out")
345            || stderr_lower.contains("request timeout")
346            || stderr_lower.contains("deadline exceeded")
347        {
348            return Some(Self::Timeout);
349        }
350
351        None
352    }
353
354    fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
355        // Disk space exhaustion
356        if stderr_lower.contains("no space left")
357            || stderr_lower.contains("disk full")
358            || stderr_lower.contains("enospc")
359            || stderr_lower.contains("out of disk")
360            || stderr_lower.contains("insufficient storage")
361        {
362            return Some(Self::DiskFull);
363        }
364
365        // Argument list too long (E2BIG) - prompt exceeds OS limit
366        // Exit code 7 is the E2BIG errno value used by spawn_agent_process
367        if exit_code == 7
368            || stderr_lower.contains("argument list too long")
369            || stderr_lower.contains("e2big")
370        {
371            return Some(Self::ToolExecutionFailed);
372        }
373
374        // Process killed (OOM or signals)
375        // Exit code 137 = 128 + 9 (SIGKILL), 139 = 128 + 11 (SIGSEGV)
376        if exit_code == 137
377            || exit_code == 139
378            || exit_code == -9
379            || stderr_lower.contains("killed")
380            || stderr_lower.contains("oom")
381            || stderr_lower.contains("out of memory")
382            || stderr_lower.contains("memory exhausted")
383            || stderr_lower.contains("cannot allocate")
384            || stderr_lower.contains("segmentation fault")
385            || stderr_lower.contains("sigsegv")
386            || stderr_lower.contains("sigkill")
387        {
388            return Some(Self::ProcessKilled);
389        }
390
391        None
392    }
393
394    fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
395        // Invalid JSON response
396        if stderr_lower.contains("invalid json")
397            || stderr_lower.contains("json parse")
398            || stderr_lower.contains("unexpected token")
399            || stderr_lower.contains("malformed")
400            || stderr_lower.contains("truncated response")
401            || stderr_lower.contains("incomplete response")
402        {
403            return Some(Self::InvalidResponse);
404        }
405
406        // Tool execution failures (file writes, tool calls, etc.)
407        if stderr_lower.contains("write error")
408            || stderr_lower.contains("cannot write")
409            || stderr_lower.contains("failed to write")
410            || stderr_lower.contains("unable to create file")
411            || stderr_lower.contains("file creation failed")
412            || stderr_lower.contains("i/o error")
413            || stderr_lower.contains("io error")
414            || stderr_lower.contains("tool failed")
415            || stderr_lower.contains("tool execution failed")
416            || stderr_lower.contains("tool call failed")
417        {
418            return Some(Self::ToolExecutionFailed);
419        }
420
421        // Permission denied errors (specific patterns that should fallback)
422        if stderr_lower.contains("permission denied")
423            || stderr_lower.contains("operation not permitted")
424            || stderr_lower.contains("insufficient permissions")
425            || stderr_lower.contains("eacces")
426            || stderr_lower.contains("eperm")
427        {
428            return Some(Self::ToolExecutionFailed);
429        }
430
431        None
432    }
433
434    fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
435        // GLM/CCS-specific known issues
436        if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
437            // CCS/GLM with exit code 1 is likely a permission/tool issue
438            if exit_code == 1 {
439                return Some(Self::AgentSpecificQuirk);
440            }
441            // CCS-specific error patterns
442            if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
443                return Some(Self::AgentSpecificQuirk);
444            }
445            // GLM-specific permission errors
446            if stderr_lower.contains("glm")
447                && (stderr_lower.contains("permission")
448                    || stderr_lower.contains("denied")
449                    || stderr_lower.contains("unauthorized"))
450            {
451                return Some(Self::AgentSpecificQuirk);
452            }
453        }
454
455        // Fallback for GLM with any error and exit code 1
456        if stderr_lower.contains("glm") && exit_code == 1 {
457            return Some(Self::AgentSpecificQuirk);
458        }
459
460        None
461    }
462
463    fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
464        // Command not found (keep this after permission checks since permission
465        // errors also contain "permission denied")
466        if exit_code == 127
467            || exit_code == 126
468            || stderr_lower.contains("command not found")
469            || stderr_lower.contains("not found")
470            || stderr_lower.contains("no such file")
471        {
472            return Some(Self::CommandNotFound);
473        }
474
475        None
476    }
477}