ralph_workflow/agents/
error.rs

1//! Error classification for agent failures.
2//!
3//! This module provides error classification logic to determine appropriate
4//! recovery strategies when agents fail. Different error types warrant
5//! different responses: retry, fallback to another agent, or abort.
6
/// Report whether an agent name or command string refers to a GLM-like agent.
///
/// The GLM-like family covers GLM, `ZhipuAI`, ZAI, Qwen, and `DeepSeek`.
/// Agents in this family have known compatibility issues with review tasks,
/// so callers may want special handling or fallback logic for them.
///
/// # Arguments
///
/// * `s` - The agent name or command string to inspect
///
/// # Returns
///
/// `true` when `s` contains any GLM-like marker (case-insensitively),
/// `false` otherwise
pub fn is_glm_like_agent(s: &str) -> bool {
    // Substrings that identify the GLM-like family; compared against the
    // lowercased input so the match is case-insensitive.
    const MARKERS: [&str; 5] = ["glm", "zhipuai", "zai", "qwen", "deepseek"];

    let normalized = s.to_lowercase();
    MARKERS.iter().any(|marker| normalized.contains(*marker))
}
28
/// Category assigned to an agent failure.
///
/// The category drives the recovery strategy chosen after a failure:
/// - `should_retry()` - run the same agent again after a delay
/// - `should_fallback()` - move on to the next agent in the chain
/// - `is_unrecoverable()` - abort the pipeline
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum AgentErrorKind {
    /// The API rate limit was exceeded; retry after a delay.
    RateLimited,
    /// The token/context limit was exceeded; a different agent may be needed.
    TokenExhausted,
    /// The API is temporarily unavailable (server-side problem); retry.
    ApiUnavailable,
    /// Network connectivity problem (client-side); retry.
    NetworkError,
    /// Authentication failed; switch agent.
    AuthFailure,
    /// The agent command could not be found; switch agent.
    CommandNotFound,
    /// Disk space is exhausted; the pipeline cannot continue.
    DiskFull,
    /// The process was killed (OOM, signal); may succeed with a smaller context.
    ProcessKilled,
    /// The agent produced an invalid JSON response; may retry.
    InvalidResponse,
    /// The request or response timed out; retry.
    Timeout,
    /// A tool execution failed (e.g., file write issues); should fall back.
    ToolExecutionFailed,
    /// A known agent-specific behavioral quirk; should fall back with specific advice.
    AgentSpecificQuirk,
    /// Some other transient error; retry.
    Transient,
    /// A permanent failure; do not retry.
    Permanent,
}
66
67impl AgentErrorKind {
68    /// Determine if this error should trigger a retry.
69    pub const fn should_retry(self) -> bool {
70        matches!(
71            self,
72            Self::RateLimited
73                | Self::ApiUnavailable
74                | Self::NetworkError
75                | Self::Timeout
76                | Self::InvalidResponse
77                | Self::Transient
78        )
79    }
80
81    /// Determine if this error should trigger a fallback to another agent.
82    pub const fn should_fallback(self) -> bool {
83        matches!(
84            self,
85            Self::TokenExhausted
86                | Self::AuthFailure
87                | Self::CommandNotFound
88                | Self::ProcessKilled
89                | Self::ToolExecutionFailed
90                | Self::AgentSpecificQuirk
91        )
92    }
93
94    /// Determine if this error is unrecoverable (should abort).
95    pub const fn is_unrecoverable(self) -> bool {
96        matches!(self, Self::DiskFull | Self::Permanent)
97    }
98
99    /// Check if this is a command not found error.
100    pub const fn is_command_not_found(self) -> bool {
101        matches!(self, Self::CommandNotFound)
102    }
103
104    /// Check if this is a network-related error.
105    pub const fn is_network_error(self) -> bool {
106        matches!(self, Self::NetworkError | Self::Timeout)
107    }
108
109    /// Check if this error might be resolved by reducing context size.
110    pub const fn suggests_smaller_context(self) -> bool {
111        matches!(self, Self::TokenExhausted | Self::ProcessKilled)
112    }
113
114    /// Get suggested wait time in milliseconds before retry.
115    pub const fn suggested_wait_ms(self) -> u64 {
116        match self {
117            Self::RateLimited => 5000,               // Rate limit: wait 5 seconds
118            Self::ApiUnavailable => 3000,            // Server issue: wait 3 seconds
119            Self::NetworkError => 2000,              // Network: wait 2 seconds
120            Self::Timeout | Self::Transient => 1000, // Timeout/Transient: short wait
121            Self::InvalidResponse => 500,            // Bad response: quick retry
122            _ => 0,                                  // No wait for non-retryable errors
123        }
124    }
125
126    /// Get a user-friendly description of this error type.
127    pub const fn description(self) -> &'static str {
128        match self {
129            Self::RateLimited => "API rate limit exceeded",
130            Self::TokenExhausted => "Token/context limit exceeded",
131            Self::ApiUnavailable => "API service temporarily unavailable",
132            Self::NetworkError => "Network connectivity issue",
133            Self::AuthFailure => "Authentication failure",
134            Self::CommandNotFound => "Command not found",
135            Self::DiskFull => "Disk space exhausted",
136            Self::ProcessKilled => "Process terminated (possibly OOM)",
137            Self::InvalidResponse => "Invalid response from agent",
138            Self::Timeout => "Request timed out",
139            Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
140            Self::AgentSpecificQuirk => "Known agent-specific issue",
141            Self::Transient => "Transient error",
142            Self::Permanent => "Permanent error",
143        }
144    }
145
146    /// Get recovery advice for this error type.
147    pub const fn recovery_advice(self) -> &'static str {
148        match self {
149            Self::RateLimited => {
150                "Will retry after delay. Tip: Consider reducing request frequency or using a different provider."
151            }
152            Self::TokenExhausted => {
153                "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
154            }
155            Self::ApiUnavailable => {
156                "API server issue. Will retry automatically. Tip: Check status page or try different provider."
157            }
158            Self::NetworkError => {
159                "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
160            }
161            Self::AuthFailure => {
162                "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
163            }
164            Self::CommandNotFound => {
165                "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
166            }
167            Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
168            Self::ProcessKilled => {
169                "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
170            }
171            Self::InvalidResponse => {
172                "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
173            }
174            Self::Timeout => {
175                "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
176            }
177            Self::ToolExecutionFailed => {
178                "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
179            }
180            Self::AgentSpecificQuirk => {
181                "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
182            }
183            Self::Transient => "Temporary issue. Will retry automatically.",
184            Self::Permanent => {
185                "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
186            }
187        }
188    }
189
190    /// Classify an error from exit code, output, and agent name.
191    ///
192    /// This variant takes the agent name into account for better classification.
193    /// Some agents have known failure patterns that should trigger fallback
194    /// instead of retry, even when the stderr output is generic.
195    ///
196    /// # Arguments
197    ///
198    /// * `exit_code` - The process exit code
199    /// * `stderr` - The standard error output from the agent
200    /// * `agent_name` - Optional agent name for context-aware classification
201    pub fn classify_with_agent(
202        exit_code: i32,
203        stderr: &str,
204        agent_name: Option<&str>,
205        model_flag: Option<&str>,
206    ) -> Self {
207        let stderr_lower = stderr.to_lowercase();
208
209        // Check for specific error patterns FIRST, before applying agent-specific heuristics.
210        // This ensures that token exhaustion is detected even for GLM-like agents.
211        if let Some(err) = Self::check_api_errors(&stderr_lower) {
212            return err;
213        }
214
215        if let Some(err) = Self::check_network_errors(&stderr_lower) {
216            return err;
217        }
218
219        if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
220            return err;
221        }
222
223        if let Some(err) = Self::check_tool_failures(&stderr_lower) {
224            return err;
225        }
226
227        // If we know this is a GLM-like agent and it failed with exit code 1
228        // (and we haven't matched a specific error pattern above),
229        // classify it as AgentSpecificQuirk to trigger fallback instead of retry.
230        let is_problematic_agent =
231            agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
232
233        if is_problematic_agent && exit_code == 1 {
234            // GLM and similar agents often exit with code 1 for various issues.
235            // Treating as AgentSpecificQuirk ensures faster fallback.
236            return Self::AgentSpecificQuirk;
237        }
238
239        if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
240            return err;
241        }
242
243        if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
244            return err;
245        }
246
247        // Transient errors (exit codes that might succeed on retry)
248        // This is now a more specific catch-all for actual transient issues
249        if exit_code == 1 && stderr_lower.contains("error") {
250            // But only if it's not a known permanent issue pattern
251            // (permission, tool failures, GLM issues are already handled above)
252            return Self::Transient;
253        }
254
255        Self::Permanent
256    }
257
258    /// Check for API-level errors (rate limiting, auth, server issues).
259    fn check_api_errors(stderr_lower: &str) -> Option<Self> {
260        // Rate limiting indicators (API-side)
261        if stderr_lower.contains("rate limit")
262            || stderr_lower.contains("too many requests")
263            || stderr_lower.contains("429")
264            || stderr_lower.contains("quota exceeded")
265        {
266            return Some(Self::RateLimited);
267        }
268
269        // Token/context exhaustion (API-side)
270        // Check this BEFORE GLM agent-specific fallback to ensure TokenExhausted is detected
271        if stderr_lower.contains("token")
272            || stderr_lower.contains("context length")
273            || stderr_lower.contains("maximum context")
274            || stderr_lower.contains("too long")
275            || stderr_lower.contains("input too large")
276        {
277            return Some(Self::TokenExhausted);
278        }
279
280        // Auth failures
281        if stderr_lower.contains("unauthorized")
282            || stderr_lower.contains("authentication")
283            || stderr_lower.contains("401")
284            || stderr_lower.contains("api key")
285            || stderr_lower.contains("invalid token")
286            || stderr_lower.contains("forbidden")
287            || stderr_lower.contains("403")
288            || stderr_lower.contains("access denied")
289        {
290            return Some(Self::AuthFailure);
291        }
292
293        None
294    }
295
296    /// Check for network and server-side errors.
297    fn check_network_errors(stderr_lower: &str) -> Option<Self> {
298        // Network errors (client-side connectivity issues)
299        if stderr_lower.contains("connection refused")
300            || stderr_lower.contains("network unreachable")
301            || stderr_lower.contains("dns resolution")
302            || stderr_lower.contains("name resolution")
303            || stderr_lower.contains("no route to host")
304            || stderr_lower.contains("network is down")
305            || stderr_lower.contains("host unreachable")
306            || stderr_lower.contains("connection reset")
307            || stderr_lower.contains("broken pipe")
308            || stderr_lower.contains("econnrefused")
309            || stderr_lower.contains("enetunreach")
310        {
311            return Some(Self::NetworkError);
312        }
313
314        // API unavailable (server-side issues)
315        if stderr_lower.contains("service unavailable")
316            || stderr_lower.contains("503")
317            || stderr_lower.contains("502")
318            || stderr_lower.contains("504")
319            || stderr_lower.contains("500")
320            || stderr_lower.contains("internal server error")
321            || stderr_lower.contains("bad gateway")
322            || stderr_lower.contains("gateway timeout")
323            || stderr_lower.contains("overloaded")
324            || stderr_lower.contains("maintenance")
325        {
326            return Some(Self::ApiUnavailable);
327        }
328
329        // Request timeout
330        if stderr_lower.contains("timeout")
331            || stderr_lower.contains("timed out")
332            || stderr_lower.contains("request timeout")
333            || stderr_lower.contains("deadline exceeded")
334        {
335            return Some(Self::Timeout);
336        }
337
338        None
339    }
340
341    /// Check for resource exhaustion errors (disk, memory, process).
342    fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
343        // Disk space exhaustion
344        if stderr_lower.contains("no space left")
345            || stderr_lower.contains("disk full")
346            || stderr_lower.contains("enospc")
347            || stderr_lower.contains("out of disk")
348            || stderr_lower.contains("insufficient storage")
349        {
350            return Some(Self::DiskFull);
351        }
352
353        // Process killed (OOM or signals)
354        // Exit code 137 = 128 + 9 (SIGKILL), 139 = 128 + 11 (SIGSEGV)
355        if exit_code == 137
356            || exit_code == 139
357            || exit_code == -9
358            || stderr_lower.contains("killed")
359            || stderr_lower.contains("oom")
360            || stderr_lower.contains("out of memory")
361            || stderr_lower.contains("memory exhausted")
362            || stderr_lower.contains("cannot allocate")
363            || stderr_lower.contains("segmentation fault")
364            || stderr_lower.contains("sigsegv")
365            || stderr_lower.contains("sigkill")
366        {
367            return Some(Self::ProcessKilled);
368        }
369
370        None
371    }
372
373    /// Check for tool and file operation failures.
374    fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
375        // Invalid JSON response
376        if stderr_lower.contains("invalid json")
377            || stderr_lower.contains("json parse")
378            || stderr_lower.contains("unexpected token")
379            || stderr_lower.contains("malformed")
380            || stderr_lower.contains("truncated response")
381            || stderr_lower.contains("incomplete response")
382        {
383            return Some(Self::InvalidResponse);
384        }
385
386        // Tool execution failures (file writes, tool calls, etc.)
387        // These should trigger fallback, not retry
388        if stderr_lower.contains("write error")
389            || stderr_lower.contains("cannot write")
390            || stderr_lower.contains("failed to write")
391            || stderr_lower.contains("unable to create file")
392            || stderr_lower.contains("file creation failed")
393            || stderr_lower.contains("i/o error")
394            || stderr_lower.contains("io error")
395            || stderr_lower.contains("tool failed")
396            || stderr_lower.contains("tool execution failed")
397            || stderr_lower.contains("tool call failed")
398        {
399            return Some(Self::ToolExecutionFailed);
400        }
401
402        // Permission denied errors (specific patterns that should fallback)
403        // These need to be checked BEFORE the generic "error" catch-all
404        // Note: "access denied" is already caught by AuthFailure above (for HTTP 403)
405        // This catches file-system permission errors specifically
406        if stderr_lower.contains("permission denied")
407            || stderr_lower.contains("operation not permitted")
408            || stderr_lower.contains("insufficient permissions")
409            || stderr_lower.contains("eacces")
410            || stderr_lower.contains("eperm")
411        {
412            return Some(Self::ToolExecutionFailed);
413        }
414
415        None
416    }
417
418    /// Check for agent-specific quirks that should trigger fallback.
419    fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
420        // GLM/CCS-specific known issues
421        // These are known quirks that should trigger fallback
422        // Check for CCS-specific error patterns
423        if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
424            // CCS/GLM with exit code 1 is likely a permission/tool issue
425            if exit_code == 1 {
426                return Some(Self::AgentSpecificQuirk);
427            }
428            // CCS-specific error patterns
429            if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
430                return Some(Self::AgentSpecificQuirk);
431            }
432            // GLM-specific permission errors
433            if stderr_lower.contains("glm")
434                && (stderr_lower.contains("permission")
435                    || stderr_lower.contains("denied")
436                    || stderr_lower.contains("unauthorized"))
437            {
438                return Some(Self::AgentSpecificQuirk);
439            }
440        }
441
442        // Fallback for GLM with any error and exit code 1
443        if stderr_lower.contains("glm") && exit_code == 1 {
444            return Some(Self::AgentSpecificQuirk);
445        }
446
447        None
448    }
449
450    /// Check for command not found errors.
451    fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
452        // Command not found (keep this after permission checks since permission
453        // errors also contain "permission denied")
454        if exit_code == 127
455            || exit_code == 126
456            || stderr_lower.contains("command not found")
457            || stderr_lower.contains("not found")
458            || stderr_lower.contains("no such file")
459        {
460            return Some(Self::CommandNotFound);
461        }
462
463        None
464    }
465}
466
#[cfg(test)]
mod tests {
    use super::*;

    /// Every variant of `AgentErrorKind`, for tests that must hold for all kinds.
    const ALL_KINDS: [AgentErrorKind; 14] = [
        AgentErrorKind::RateLimited,
        AgentErrorKind::TokenExhausted,
        AgentErrorKind::ApiUnavailable,
        AgentErrorKind::NetworkError,
        AgentErrorKind::AuthFailure,
        AgentErrorKind::CommandNotFound,
        AgentErrorKind::DiskFull,
        AgentErrorKind::ProcessKilled,
        AgentErrorKind::InvalidResponse,
        AgentErrorKind::Timeout,
        AgentErrorKind::ToolExecutionFailed,
        AgentErrorKind::AgentSpecificQuirk,
        AgentErrorKind::Transient,
        AgentErrorKind::Permanent,
    ];

    /// Classify without any agent name or model flag context.
    fn classify(exit_code: i32, stderr: &str) -> AgentErrorKind {
        AgentErrorKind::classify_with_agent(exit_code, stderr, None, None)
    }

    #[test]
    fn test_is_glm_like_agent() {
        assert!(is_glm_like_agent("GLM-4"));
        assert!(is_glm_like_agent("ccs/glm"));
        assert!(is_glm_like_agent("DeepSeek-R1"));
        assert!(is_glm_like_agent("qwen"));
        assert!(!is_glm_like_agent("claude"));
        assert!(!is_glm_like_agent(""));
    }

    #[test]
    fn test_agent_error_kind_should_retry() {
        assert!(AgentErrorKind::RateLimited.should_retry());
        assert!(AgentErrorKind::ApiUnavailable.should_retry());
        assert!(AgentErrorKind::NetworkError.should_retry());
        assert!(AgentErrorKind::Timeout.should_retry());
        assert!(AgentErrorKind::InvalidResponse.should_retry());
        assert!(AgentErrorKind::Transient.should_retry());

        assert!(!AgentErrorKind::AuthFailure.should_retry());
        assert!(!AgentErrorKind::CommandNotFound.should_retry());
        assert!(!AgentErrorKind::Permanent.should_retry());
    }

    #[test]
    fn test_agent_error_kind_should_fallback() {
        assert!(AgentErrorKind::TokenExhausted.should_fallback());
        assert!(AgentErrorKind::AuthFailure.should_fallback());
        assert!(AgentErrorKind::CommandNotFound.should_fallback());
        assert!(AgentErrorKind::ProcessKilled.should_fallback());
        assert!(AgentErrorKind::ToolExecutionFailed.should_fallback());
        assert!(AgentErrorKind::AgentSpecificQuirk.should_fallback());

        assert!(!AgentErrorKind::RateLimited.should_fallback());
        assert!(!AgentErrorKind::Permanent.should_fallback());
    }

    #[test]
    fn test_agent_error_kind_is_unrecoverable() {
        assert!(AgentErrorKind::DiskFull.is_unrecoverable());
        assert!(AgentErrorKind::Permanent.is_unrecoverable());

        assert!(!AgentErrorKind::RateLimited.is_unrecoverable());
        assert!(!AgentErrorKind::AuthFailure.is_unrecoverable());
    }

    #[test]
    fn test_agent_error_kind_has_exactly_one_recovery_category() {
        // Retry, fallback, and abort must partition the variants; otherwise
        // the recovery strategy for a kind would be ambiguous or missing.
        for kind in ALL_KINDS {
            let categories = [
                kind.should_retry(),
                kind.should_fallback(),
                kind.is_unrecoverable(),
            ];
            let matched = categories.iter().filter(|&&flag| flag).count();
            assert_eq!(
                matched, 1,
                "{kind:?} must belong to exactly one recovery category"
            );
        }
    }

    #[test]
    fn test_agent_error_kind_predicates() {
        assert!(AgentErrorKind::CommandNotFound.is_command_not_found());
        assert!(!AgentErrorKind::AuthFailure.is_command_not_found());

        assert!(AgentErrorKind::NetworkError.is_network_error());
        assert!(AgentErrorKind::Timeout.is_network_error());
        assert!(!AgentErrorKind::ApiUnavailable.is_network_error());
    }

    #[test]
    fn test_agent_error_kind_classify() {
        // Rate limiting
        assert_eq!(
            classify(1, "rate limit exceeded"),
            AgentErrorKind::RateLimited
        );
        assert_eq!(classify(1, "error 429"), AgentErrorKind::RateLimited);

        // Auth failure
        assert_eq!(classify(1, "unauthorized"), AgentErrorKind::AuthFailure);
        assert_eq!(classify(1, "error 401"), AgentErrorKind::AuthFailure);

        // Command not found
        assert_eq!(classify(127, ""), AgentErrorKind::CommandNotFound);
        assert_eq!(
            classify(1, "command not found"),
            AgentErrorKind::CommandNotFound
        );

        // Process killed
        assert_eq!(classify(137, ""), AgentErrorKind::ProcessKilled);
        assert_eq!(classify(1, "out of memory"), AgentErrorKind::ProcessKilled);

        // Tool execution failures
        assert_eq!(
            classify(1, "write error"),
            AgentErrorKind::ToolExecutionFailed
        );
        assert_eq!(
            classify(1, "tool failed"),
            AgentErrorKind::ToolExecutionFailed
        );
        assert_eq!(
            classify(1, "failed to write"),
            AgentErrorKind::ToolExecutionFailed
        );

        // Permission denied errors (should fallback, not retry)
        assert_eq!(
            classify(1, "permission denied"),
            AgentErrorKind::ToolExecutionFailed
        );
        assert_eq!(
            classify(1, "operation not permitted"),
            AgentErrorKind::ToolExecutionFailed
        );
        assert_eq!(
            classify(1, "insufficient permissions"),
            AgentErrorKind::ToolExecutionFailed
        );

        // "access denied" is caught by AuthFailure earlier (HTTP 403)
        assert_eq!(classify(1, "access denied"), AgentErrorKind::AuthFailure);

        // GLM-specific known issues
        assert_eq!(classify(1, "glm error"), AgentErrorKind::AgentSpecificQuirk);
        assert_eq!(
            classify(1, "ccs glm failed"),
            AgentErrorKind::AgentSpecificQuirk
        );

        // Generic exit code 1 with "error" only matches when nothing more
        // specific above applies.
        assert_eq!(classify(1, "some random error"), AgentErrorKind::Transient);

        assert_eq!(
            AgentErrorKind::classify_with_agent(1, "some random error", Some("ccs/glm"), None),
            AgentErrorKind::AgentSpecificQuirk
        );
    }

    #[test]
    fn test_agent_error_kind_classify_uses_model_flag() {
        // A GLM-like model flag alone is enough to trigger the quirk path.
        assert_eq!(
            AgentErrorKind::classify_with_agent(1, "some random error", None, Some("glm-4.6")),
            AgentErrorKind::AgentSpecificQuirk
        );
        // A non-GLM model flag leaves the generic classification in place.
        assert_eq!(
            AgentErrorKind::classify_with_agent(1, "some random error", None, Some("gpt-5")),
            AgentErrorKind::Transient
        );
    }

    #[test]
    fn test_agent_error_kind_classify_specific_errors_beat_glm_heuristic() {
        // Concrete stderr patterns are checked before the GLM exit-code-1 rule.
        assert_eq!(
            AgentErrorKind::classify_with_agent(1, "context length exceeded", Some("ccs/glm"), None),
            AgentErrorKind::TokenExhausted
        );
        assert_eq!(
            AgentErrorKind::classify_with_agent(1, "rate limit exceeded", Some("ccs/glm"), None),
            AgentErrorKind::RateLimited
        );
    }

    #[test]
    fn test_agent_error_kind_classify_permanent_fallthrough() {
        // Unrecognized stderr with a non-1 exit code is not retried.
        assert_eq!(classify(2, "some random error"), AgentErrorKind::Permanent);
        // Exit code 1 without any "error" text is not treated as transient.
        assert_eq!(classify(1, ""), AgentErrorKind::Permanent);
    }

    #[test]
    fn test_agent_error_kind_description_and_advice() {
        for kind in ALL_KINDS {
            assert!(!kind.description().is_empty(), "{kind:?} has no description");
            assert!(
                !kind.recovery_advice().is_empty(),
                "{kind:?} has no recovery advice"
            );
        }
    }

    #[test]
    fn test_agent_error_kind_suggested_wait_ms() {
        assert_eq!(AgentErrorKind::RateLimited.suggested_wait_ms(), 5000);
        assert_eq!(AgentErrorKind::Permanent.suggested_wait_ms(), 0);

        // Only retryable kinds carry a wait time.
        for kind in ALL_KINDS {
            assert_eq!(
                kind.suggested_wait_ms() > 0,
                kind.should_retry(),
                "{kind:?} wait time is inconsistent with should_retry()"
            );
        }
    }

    #[test]
    fn test_agent_error_kind_suggests_smaller_context() {
        assert!(AgentErrorKind::TokenExhausted.suggests_smaller_context());
        assert!(AgentErrorKind::ProcessKilled.suggests_smaller_context());
        assert!(!AgentErrorKind::RateLimited.suggests_smaller_context());
    }
}