ralph_workflow/agents/
error.rs

1//! Error classification for agent failures.
2//!
3//! This module provides error classification logic to determine appropriate
4//! recovery strategies when agents fail. Different error types warrant
5//! different responses: retry, fallback to another agent, or abort.
6
/// Report whether `s` mentions a GLM-like model name, ignoring case.
///
/// The GLM-like family recognised here is GLM, ZhipuAI, ZAI, Qwen, and DeepSeek.
/// Matching is a plain substring search on the lowercased input, so a marker
/// may appear anywhere in `s`.
///
/// Use this for detecting GLM models in any context (e.g., prompt selection).
/// For detecting CCS/Claude-based GLM agents specifically (error handling),
/// use `is_glm_like_agent` instead.
pub fn contains_glm_model(s: &str) -> bool {
    // Lowercase substrings that identify a GLM-like model.
    const GLM_LIKE_MARKERS: &[&str] = &["glm", "zhipuai", "zai", "qwen", "deepseek"];

    let haystack = s.to_lowercase();
    GLM_LIKE_MARKERS
        .iter()
        .any(|marker| haystack.contains(*marker))
}
21
22/// Check if an agent is a CCS/Claude-based agent using a GLM-like model.
23///
24/// These agents have known compatibility issues because they use Claude CLI
25/// with GLM models via CCS (Claude Code Switch). They require:
26/// - The `-p` flag for non-interactive mode
27/// - Special error handling for GLM-specific quirks
28///
29/// This does NOT match OpenCode agents using GLM models, as OpenCode has
30/// its own mechanism (`--auto-approve`) and JSON format.
31///
32/// # Arguments
33///
34/// * `s` - The agent name or command string to check
35///
36/// # Returns
37///
38/// `true` if this is a CCS/Claude agent using a GLM-like model, `false` otherwise
39pub fn is_glm_like_agent(s: &str) -> bool {
40    let s_lower = s.to_lowercase();
41
42    // Must contain a GLM-like model name
43    if !contains_glm_model(&s_lower) {
44        return false;
45    }
46
47    // Exclude OpenCode agents - they have their own mechanism
48    if s_lower.starts_with("opencode") {
49        return false;
50    }
51
52    // Match CCS agents (ccs/glm, ccs/zai, etc.) or claude-based commands
53    s_lower.starts_with("ccs") || s_lower.contains("claude")
54}
55
/// Error classification for agent failures.
///
/// Each kind maps to a recovery strategy through the predicate methods:
/// - `should_retry()` - Try same agent again after delay
/// - `should_immediate_agent_fallback()` - Switch agents right away, without retrying
/// - `should_fallback()` - Switch to next agent in the chain
/// - `is_unrecoverable()` - Abort the pipeline
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AgentErrorKind {
    /// API rate limit exceeded - immediate fallback to the next agent (not retried).
    RateLimited,
    /// Token/context limit exceeded - switch agent; a smaller context may help.
    TokenExhausted,
    /// API temporarily unavailable (server-side issue) - retry.
    ApiUnavailable,
    /// Network connectivity issue (client-side) - retry.
    NetworkError,
    /// Authentication failure - switch agent.
    AuthFailure,
    /// Command not found - switch agent.
    CommandNotFound,
    /// Disk space exhausted - cannot continue.
    DiskFull,
    /// Process killed (OOM, signal) - switch agent; a smaller context may help.
    ProcessKilled,
    /// Invalid JSON response from agent - retry.
    InvalidResponse,
    /// Request/response timeout - retry.
    Timeout,
    /// Tool execution failed - should fallback (e.g., file write issues).
    ToolExecutionFailed,
    /// Known agent-specific behavioral quirk - should fallback with specific advice.
    AgentSpecificQuirk,
    /// Agent-specific issue that may be transient - should retry before falling back.
    RetryableAgentQuirk,
    /// Other transient error - retry.
    Transient,
    /// Permanent failure - do not retry.
    Permanent,
}
95
96impl AgentErrorKind {
97    /// Determine if this error should trigger a retry.
98    ///
99    /// Note: `RateLimited` is intentionally excluded - it triggers immediate agent fallback
100    /// via `should_immediate_agent_fallback()` instead of retrying with the same agent.
101    pub const fn should_retry(self) -> bool {
102        matches!(
103            self,
104            Self::ApiUnavailable
105                | Self::NetworkError
106                | Self::Timeout
107                | Self::InvalidResponse
108                | Self::RetryableAgentQuirk
109                | Self::Transient
110        )
111    }
112
113    /// Determine if this error requires immediate agent fallback (without retry).
114    ///
115    /// Rate limit (429) errors indicate the current provider is temporarily exhausted.
116    /// Rather than waiting and retrying the same agent (which wastes time), we should
117    /// immediately switch to the next agent in the fallback chain to continue work.
118    pub const fn should_immediate_agent_fallback(self) -> bool {
119        matches!(self, Self::RateLimited)
120    }
121
122    /// Determine if this error should trigger a fallback to another agent.
123    pub const fn should_fallback(self) -> bool {
124        matches!(
125            self,
126            Self::TokenExhausted
127                | Self::AuthFailure
128                | Self::CommandNotFound
129                | Self::ProcessKilled
130                | Self::ToolExecutionFailed
131                | Self::AgentSpecificQuirk
132        )
133    }
134
135    /// Determine if this error is unrecoverable (should abort).
136    pub const fn is_unrecoverable(self) -> bool {
137        matches!(self, Self::DiskFull | Self::Permanent)
138    }
139
140    /// Check if this is a command not found error.
141    pub const fn is_command_not_found(self) -> bool {
142        matches!(self, Self::CommandNotFound)
143    }
144
145    /// Check if this is a network-related error.
146    pub const fn is_network_error(self) -> bool {
147        matches!(self, Self::NetworkError | Self::Timeout)
148    }
149
150    /// Check if this error might be resolved by reducing context size.
151    pub const fn suggests_smaller_context(self) -> bool {
152        matches!(self, Self::TokenExhausted | Self::ProcessKilled)
153    }
154
155    /// Get suggested wait time in milliseconds before retry.
156    pub const fn suggested_wait_ms(self) -> u64 {
157        match self {
158            // RateLimited: no wait - we immediately fallback to next agent
159            Self::RateLimited => 0,
160            Self::ApiUnavailable => 3000, // Server issue: wait 3 seconds
161            Self::NetworkError => 2000,   // Network: wait 2 seconds
162            Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, // Timeout/Transient: short wait
163            Self::InvalidResponse => 500, // Bad response: quick retry
164            _ => 0,                       // No wait for non-retryable errors
165        }
166    }
167
168    /// Get a user-friendly description of this error type.
169    pub const fn description(self) -> &'static str {
170        match self {
171            Self::RateLimited => "API rate limit exceeded",
172            Self::TokenExhausted => "Token/context limit exceeded",
173            Self::ApiUnavailable => "API service temporarily unavailable",
174            Self::NetworkError => "Network connectivity issue",
175            Self::AuthFailure => "Authentication failure",
176            Self::CommandNotFound => "Command not found",
177            Self::DiskFull => "Disk space exhausted",
178            Self::ProcessKilled => "Process terminated (possibly OOM)",
179            Self::InvalidResponse => "Invalid response from agent",
180            Self::Timeout => "Request timed out",
181            Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
182            Self::AgentSpecificQuirk => "Known agent-specific issue",
183            Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
184            Self::Transient => "Transient error",
185            Self::Permanent => "Permanent error",
186        }
187    }
188
189    /// Get recovery advice for this error type.
190    pub const fn recovery_advice(self) -> &'static str {
191        match self {
192            Self::RateLimited => {
193                "Switching to next agent immediately. Rate limit indicates provider exhaustion."
194            }
195            Self::TokenExhausted => {
196                "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
197            }
198            Self::ApiUnavailable => {
199                "API server issue. Will retry automatically. Tip: Check status page or try different provider."
200            }
201            Self::NetworkError => {
202                "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
203            }
204            Self::AuthFailure => {
205                "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
206            }
207            Self::CommandNotFound => {
208                "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
209            }
210            Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
211            Self::ProcessKilled => {
212                "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
213            }
214            Self::InvalidResponse => {
215                "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
216            }
217            Self::Timeout => {
218                "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
219            }
220            Self::ToolExecutionFailed => {
221                "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
222            }
223            Self::AgentSpecificQuirk => {
224                "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
225            }
226            Self::RetryableAgentQuirk => {
227                "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
228            }
229            Self::Transient => "Temporary issue. Will retry automatically.",
230            Self::Permanent => {
231                "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
232            }
233        }
234    }
235
236    /// Classify an error from exit code, output, and agent name.
237    ///
238    /// This variant takes the agent name into account for better classification.
239    /// Some agents have known failure patterns that should trigger fallback
240    /// instead of retry, even when the stderr output is generic.
241    ///
242    /// # Arguments
243    ///
244    /// * `exit_code` - The process exit code
245    /// * `stderr` - The standard error output from the agent
246    /// * `agent_name` - Optional agent name for context-aware classification
247    pub fn classify_with_agent(
248        exit_code: i32,
249        stderr: &str,
250        agent_name: Option<&str>,
251        model_flag: Option<&str>,
252    ) -> Self {
253        let stderr_lower = stderr.to_lowercase();
254
255        // Check for specific error patterns FIRST, before applying agent-specific heuristics.
256        // This ensures that token exhaustion is detected even for GLM-like agents.
257        if let Some(err) = Self::check_api_errors(&stderr_lower) {
258            return err;
259        }
260
261        if let Some(err) = Self::check_network_errors(&stderr_lower) {
262            return err;
263        }
264
265        if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
266            return err;
267        }
268
269        if let Some(err) = Self::check_tool_failures(&stderr_lower) {
270            return err;
271        }
272
273        // If we know this is a GLM-like agent and it failed with exit code 1
274        // (and we haven't matched a specific error pattern above),
275        // classify based on stderr content:
276        // - If stderr is empty or contains only generic messages, treat as RetryableAgentQuirk
277        // - If stderr contains specific error patterns, it will be caught by check_agent_specific_quirks below
278        let is_problematic_agent =
279            agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
280
281        if is_problematic_agent && exit_code == 1 {
282            // Check if stderr has known problematic patterns that indicate unrecoverable issues
283            let has_known_problematic_pattern = stderr_lower.contains("permission")
284                || stderr_lower.contains("denied")
285                || stderr_lower.contains("unauthorized")
286                || stderr_lower.contains("auth")
287                || stderr_lower.contains("token")
288                || stderr_lower.contains("limit")
289                || stderr_lower.contains("quota")
290                || stderr_lower.contains("disk")
291                || stderr_lower.contains("space")
292                // Agent-specific known patterns (from check_agent_specific_quirks)
293                || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
294                || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"));
295
296            if has_known_problematic_pattern {
297                // Known issue - should fallback
298                return Self::AgentSpecificQuirk;
299            }
300
301            // Unknown error - may be transient, should retry
302            return Self::RetryableAgentQuirk;
303        }
304
305        if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
306            return err;
307        }
308
309        if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
310            return err;
311        }
312
313        // Transient errors (exit codes that might succeed on retry)
314        // This is now a more specific catch-all for actual transient issues
315        if exit_code == 1 && stderr_lower.contains("error") {
316            // But only if it's not a known permanent issue pattern
317            // (permission, tool failures, GLM issues are already handled above)
318            return Self::Transient;
319        }
320
321        Self::Permanent
322    }
323
324    /// Check for API-level errors (rate limiting, auth, server issues).
325    fn check_api_errors(stderr_lower: &str) -> Option<Self> {
326        // Rate limiting indicators (API-side)
327        if stderr_lower.contains("rate limit")
328            || stderr_lower.contains("too many requests")
329            || stderr_lower.contains("429")
330            || stderr_lower.contains("quota exceeded")
331        {
332            return Some(Self::RateLimited);
333        }
334
335        // Auth failures
336        // Check BEFORE token/context exhaustion so strings like "invalid token" are
337        // treated as authentication failures (not context exhaustion).
338        if stderr_lower.contains("unauthorized")
339            || stderr_lower.contains("authentication")
340            || stderr_lower.contains("401")
341            || stderr_lower.contains("api key")
342            || stderr_lower.contains("invalid token")
343            || stderr_lower.contains("forbidden")
344            || stderr_lower.contains("403")
345            || stderr_lower.contains("access denied")
346        {
347            return Some(Self::AuthFailure);
348        }
349
350        // Token/context exhaustion (API-side)
351        // Check this BEFORE GLM agent-specific fallback to ensure TokenExhausted is detected
352        // Note: "too long" is specifically for API token limits, not OS argument limits
353        // We exclude "argument list too long" which is an E2BIG OS error
354        if stderr_lower.contains("context length")
355            || stderr_lower.contains("maximum context")
356            || stderr_lower.contains("max context")
357            || stderr_lower.contains("context window")
358            || stderr_lower.contains("maximum tokens")
359            || stderr_lower.contains("max tokens")
360            || stderr_lower.contains("too many tokens")
361            || stderr_lower.contains("token limit")
362            || stderr_lower.contains("context_length_exceeded")
363            || stderr_lower.contains("input too large")
364            || stderr_lower.contains("prompt is too long")
365            || (stderr_lower.contains("too long")
366                && !stderr_lower.contains("argument list too long"))
367        {
368            return Some(Self::TokenExhausted);
369        }
370
371        None
372    }
373
374    /// Check for network and server-side errors.
375    fn check_network_errors(stderr_lower: &str) -> Option<Self> {
376        // Network errors (client-side connectivity issues)
377        if stderr_lower.contains("connection refused")
378            || stderr_lower.contains("network unreachable")
379            || stderr_lower.contains("dns resolution")
380            || stderr_lower.contains("name resolution")
381            || stderr_lower.contains("no route to host")
382            || stderr_lower.contains("network is down")
383            || stderr_lower.contains("host unreachable")
384            || stderr_lower.contains("connection reset")
385            || stderr_lower.contains("broken pipe")
386            || stderr_lower.contains("econnrefused")
387            || stderr_lower.contains("enetunreach")
388        {
389            return Some(Self::NetworkError);
390        }
391
392        // API unavailable (server-side issues)
393        if stderr_lower.contains("service unavailable")
394            || stderr_lower.contains("503")
395            || stderr_lower.contains("502")
396            || stderr_lower.contains("504")
397            || stderr_lower.contains("500")
398            || stderr_lower.contains("internal server error")
399            || stderr_lower.contains("bad gateway")
400            || stderr_lower.contains("gateway timeout")
401            || stderr_lower.contains("overloaded")
402            || stderr_lower.contains("maintenance")
403        {
404            return Some(Self::ApiUnavailable);
405        }
406
407        // Request timeout
408        if stderr_lower.contains("timeout")
409            || stderr_lower.contains("timed out")
410            || stderr_lower.contains("request timeout")
411            || stderr_lower.contains("deadline exceeded")
412        {
413            return Some(Self::Timeout);
414        }
415
416        None
417    }
418
419    /// Check for resource exhaustion errors (disk, memory, process, arg list).
420    fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
421        // Disk space exhaustion
422        if stderr_lower.contains("no space left")
423            || stderr_lower.contains("disk full")
424            || stderr_lower.contains("enospc")
425            || stderr_lower.contains("out of disk")
426            || stderr_lower.contains("insufficient storage")
427        {
428            return Some(Self::DiskFull);
429        }
430
431        // Argument list too long (E2BIG) - prompt exceeds OS limit
432        // Exit code 7 is the E2BIG errno value used by spawn_agent_process
433        // This should trigger fallback to another agent (the prompt size issue
434        // may be transient due to XSD retry context accumulation)
435        if exit_code == 7
436            || stderr_lower.contains("argument list too long")
437            || stderr_lower.contains("e2big")
438        {
439            return Some(Self::ToolExecutionFailed);
440        }
441
442        // Process killed (OOM or signals)
443        // Exit code 137 = 128 + 9 (SIGKILL), 139 = 128 + 11 (SIGSEGV)
444        if exit_code == 137
445            || exit_code == 139
446            || exit_code == -9
447            || stderr_lower.contains("killed")
448            || stderr_lower.contains("oom")
449            || stderr_lower.contains("out of memory")
450            || stderr_lower.contains("memory exhausted")
451            || stderr_lower.contains("cannot allocate")
452            || stderr_lower.contains("segmentation fault")
453            || stderr_lower.contains("sigsegv")
454            || stderr_lower.contains("sigkill")
455        {
456            return Some(Self::ProcessKilled);
457        }
458
459        None
460    }
461
462    /// Check for tool and file operation failures.
463    fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
464        // Invalid JSON response
465        if stderr_lower.contains("invalid json")
466            || stderr_lower.contains("json parse")
467            || stderr_lower.contains("unexpected token")
468            || stderr_lower.contains("malformed")
469            || stderr_lower.contains("truncated response")
470            || stderr_lower.contains("incomplete response")
471        {
472            return Some(Self::InvalidResponse);
473        }
474
475        // Tool execution failures (file writes, tool calls, etc.)
476        // These should trigger fallback, not retry
477        if stderr_lower.contains("write error")
478            || stderr_lower.contains("cannot write")
479            || stderr_lower.contains("failed to write")
480            || stderr_lower.contains("unable to create file")
481            || stderr_lower.contains("file creation failed")
482            || stderr_lower.contains("i/o error")
483            || stderr_lower.contains("io error")
484            || stderr_lower.contains("tool failed")
485            || stderr_lower.contains("tool execution failed")
486            || stderr_lower.contains("tool call failed")
487        {
488            return Some(Self::ToolExecutionFailed);
489        }
490
491        // Permission denied errors (specific patterns that should fallback)
492        // These need to be checked BEFORE the generic "error" catch-all
493        // Note: "access denied" is already caught by AuthFailure above (for HTTP 403)
494        // This catches file-system permission errors specifically
495        if stderr_lower.contains("permission denied")
496            || stderr_lower.contains("operation not permitted")
497            || stderr_lower.contains("insufficient permissions")
498            || stderr_lower.contains("eacces")
499            || stderr_lower.contains("eperm")
500        {
501            return Some(Self::ToolExecutionFailed);
502        }
503
504        None
505    }
506
507    /// Check for agent-specific quirks that should trigger fallback.
508    fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
509        // GLM/CCS-specific known issues
510        // These are known quirks that should trigger fallback
511        // Check for CCS-specific error patterns
512        if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
513            // CCS/GLM with exit code 1 is likely a permission/tool issue
514            if exit_code == 1 {
515                return Some(Self::AgentSpecificQuirk);
516            }
517            // CCS-specific error patterns
518            if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
519                return Some(Self::AgentSpecificQuirk);
520            }
521            // GLM-specific permission errors
522            if stderr_lower.contains("glm")
523                && (stderr_lower.contains("permission")
524                    || stderr_lower.contains("denied")
525                    || stderr_lower.contains("unauthorized"))
526            {
527                return Some(Self::AgentSpecificQuirk);
528            }
529        }
530
531        // Fallback for GLM with any error and exit code 1
532        if stderr_lower.contains("glm") && exit_code == 1 {
533            return Some(Self::AgentSpecificQuirk);
534        }
535
536        None
537    }
538
539    /// Check for command not found errors.
540    fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
541        // Command not found (keep this after permission checks since permission
542        // errors also contain "permission denied")
543        if exit_code == 127
544            || exit_code == 126
545            || stderr_lower.contains("command not found")
546            || stderr_lower.contains("not found")
547            || stderr_lower.contains("no such file")
548        {
549            return Some(Self::CommandNotFound);
550        }
551
552        None
553    }
554}
555
556#[cfg(test)]
557mod tests {
558    use super::*;
559
560    fn classify(exit_code: i32, stderr: &str) -> AgentErrorKind {
561        AgentErrorKind::classify_with_agent(exit_code, stderr, None, None)
562    }
563
564    #[test]
565    fn test_is_glm_like_agent() {
566        // CCS GLM agents - should match
567        assert!(is_glm_like_agent("ccs/glm"));
568        assert!(is_glm_like_agent("ccs/zai"));
569        assert!(is_glm_like_agent("ccs/zhipuai"));
570        assert!(is_glm_like_agent("ccs/qwen"));
571        assert!(is_glm_like_agent("ccs/deepseek"));
572        assert!(is_glm_like_agent("CCS/GLM")); // case insensitive
573
574        // Claude with GLM model flag - should match
575        assert!(is_glm_like_agent("claude -m glm-4"));
576
577        // OpenCode agents with GLM - should NOT match (OpenCode has own mechanism)
578        assert!(!is_glm_like_agent("opencode/opencode/glm-4.7-free"));
579        assert!(!is_glm_like_agent("opencode/zai/glm-4.7"));
580        assert!(!is_glm_like_agent("opencode run -m glm"));
581
582        // Non-GLM agents - should NOT match
583        assert!(!is_glm_like_agent("claude"));
584        assert!(!is_glm_like_agent("codex"));
585        assert!(!is_glm_like_agent("ccs/work"));
586        assert!(!is_glm_like_agent("ccs/personal"));
587
588        // Model name alone without ccs/claude - should NOT match
589        assert!(!is_glm_like_agent("glm-4.7-free"));
590        assert!(!is_glm_like_agent("zai/glm-4.7"));
591    }
592
593    #[test]
594    fn test_agent_error_kind_should_retry() {
595        // RateLimited should NOT retry - it triggers immediate agent fallback
596        assert!(!AgentErrorKind::RateLimited.should_retry());
597        assert!(AgentErrorKind::ApiUnavailable.should_retry());
598        assert!(AgentErrorKind::NetworkError.should_retry());
599        assert!(AgentErrorKind::Timeout.should_retry());
600        assert!(AgentErrorKind::InvalidResponse.should_retry());
601        assert!(AgentErrorKind::Transient.should_retry());
602        assert!(AgentErrorKind::RetryableAgentQuirk.should_retry());
603
604        assert!(!AgentErrorKind::AuthFailure.should_retry());
605        assert!(!AgentErrorKind::CommandNotFound.should_retry());
606        assert!(!AgentErrorKind::Permanent.should_retry());
607    }
608
609    #[test]
610    fn test_agent_error_kind_should_immediate_agent_fallback() {
611        // Only RateLimited should trigger immediate agent fallback
612        assert!(AgentErrorKind::RateLimited.should_immediate_agent_fallback());
613
614        // All other errors should not trigger immediate agent fallback
615        assert!(!AgentErrorKind::ApiUnavailable.should_immediate_agent_fallback());
616        assert!(!AgentErrorKind::NetworkError.should_immediate_agent_fallback());
617        assert!(!AgentErrorKind::Timeout.should_immediate_agent_fallback());
618        assert!(!AgentErrorKind::AuthFailure.should_immediate_agent_fallback());
619        assert!(!AgentErrorKind::TokenExhausted.should_immediate_agent_fallback());
620        assert!(!AgentErrorKind::CommandNotFound.should_immediate_agent_fallback());
621        assert!(!AgentErrorKind::Permanent.should_immediate_agent_fallback());
622        assert!(!AgentErrorKind::Transient.should_immediate_agent_fallback());
623    }
624
625    #[test]
626    fn test_agent_error_kind_should_fallback() {
627        assert!(AgentErrorKind::TokenExhausted.should_fallback());
628        assert!(AgentErrorKind::AuthFailure.should_fallback());
629        assert!(AgentErrorKind::CommandNotFound.should_fallback());
630        assert!(AgentErrorKind::ProcessKilled.should_fallback());
631        assert!(AgentErrorKind::ToolExecutionFailed.should_fallback());
632        assert!(AgentErrorKind::AgentSpecificQuirk.should_fallback());
633
634        assert!(!AgentErrorKind::RateLimited.should_fallback());
635        assert!(!AgentErrorKind::Permanent.should_fallback());
636    }
637
638    #[test]
639    fn test_agent_error_kind_is_unrecoverable() {
640        assert!(AgentErrorKind::DiskFull.is_unrecoverable());
641        assert!(AgentErrorKind::Permanent.is_unrecoverable());
642
643        assert!(!AgentErrorKind::RateLimited.is_unrecoverable());
644        assert!(!AgentErrorKind::AuthFailure.is_unrecoverable());
645    }
646
647    #[test]
648    fn test_agent_error_kind_classify() {
649        // Rate limiting
650        assert_eq!(
651            classify(1, "rate limit exceeded"),
652            AgentErrorKind::RateLimited
653        );
654        assert_eq!(classify(1, "error 429"), AgentErrorKind::RateLimited);
655
656        // Auth failure
657        assert_eq!(classify(1, "unauthorized"), AgentErrorKind::AuthFailure);
658        assert_eq!(classify(1, "error 401"), AgentErrorKind::AuthFailure);
659        // "invalid token" is an auth failure, not token exhaustion
660        assert_eq!(classify(1, "invalid token"), AgentErrorKind::AuthFailure);
661
662        // Command not found
663        assert_eq!(classify(127, ""), AgentErrorKind::CommandNotFound);
664        assert_eq!(
665            classify(1, "command not found"),
666            AgentErrorKind::CommandNotFound
667        );
668
669        // Process killed
670        assert_eq!(classify(137, ""), AgentErrorKind::ProcessKilled);
671        assert_eq!(classify(1, "out of memory"), AgentErrorKind::ProcessKilled);
672
673        // Tool execution failures (NEW)
674        assert_eq!(
675            classify(1, "write error"),
676            AgentErrorKind::ToolExecutionFailed
677        );
678        assert_eq!(
679            classify(1, "tool failed"),
680            AgentErrorKind::ToolExecutionFailed
681        );
682        assert_eq!(
683            classify(1, "failed to write"),
684            AgentErrorKind::ToolExecutionFailed
685        );
686
687        // Permission denied errors (should fallback, not retry)
688        assert_eq!(
689            classify(1, "permission denied"),
690            AgentErrorKind::ToolExecutionFailed
691        );
692        assert_eq!(
693            classify(1, "operation not permitted"),
694            AgentErrorKind::ToolExecutionFailed
695        );
696        assert_eq!(
697            classify(1, "insufficient permissions"),
698            AgentErrorKind::ToolExecutionFailed
699        );
700
701        // Argument list too long (E2BIG) - should trigger fallback
702        assert_eq!(
703            classify(7, "argument list too long"),
704            AgentErrorKind::ToolExecutionFailed
705        );
706        assert_eq!(
707            classify(
708                7,
709                "opencode: Argument list too long (prompt exceeds OS limit)"
710            ),
711            AgentErrorKind::ToolExecutionFailed
712        );
713
714        // "access denied" is caught by AuthFailure earlier (HTTP 403)
715        assert_eq!(classify(1, "access denied"), AgentErrorKind::AuthFailure);
716
717        // GLM-specific known issues (NEW)
718        assert_eq!(classify(1, "glm error"), AgentErrorKind::AgentSpecificQuirk);
719        assert_eq!(
720            classify(1, "ccs glm failed"),
721            AgentErrorKind::AgentSpecificQuirk
722        );
723
724        // Generic exit code 1 with "error" is now more selective
725        // It should NOT match patterns that are handled above
726        assert_eq!(classify(1, "some random error"), AgentErrorKind::Transient);
727
728        // GLM with unknown error (no specific pattern) should be RetryableAgentQuirk
729        assert_eq!(
730            AgentErrorKind::classify_with_agent(1, "some random error", Some("ccs/glm"), None),
731            AgentErrorKind::RetryableAgentQuirk
732        );
733
734        // GLM with known problematic patterns - permission denied is caught by check_tool_failures first
735        assert_eq!(
736            AgentErrorKind::classify_with_agent(1, "permission denied", Some("ccs/glm"), None),
737            AgentErrorKind::ToolExecutionFailed // Caught by earlier check
738        );
739        assert_eq!(
740            AgentErrorKind::classify_with_agent(1, "token limit exceeded", Some("ccs/glm"), None),
741            AgentErrorKind::TokenExhausted // Caught by earlier check
742        );
743        assert_eq!(
744            AgentErrorKind::classify_with_agent(1, "disk full", Some("ccs/glm"), None),
745            AgentErrorKind::DiskFull // Caught by earlier check (disk pattern)
746        );
747        // GLM mentioned in stderr with "failed" - AgentSpecificQuirk
748        assert_eq!(
749            AgentErrorKind::classify_with_agent(1, "glm failed", Some("ccs/glm"), None),
750            AgentErrorKind::AgentSpecificQuirk
751        );
752    }
753
/// OpenCode agents running GLM-family models must go through the ordinary
/// classification path; none of these cases may come back as
/// `RetryableAgentQuirk`.
#[test]
fn test_opencode_error_classification_not_treated_as_glm() {
    // Every case uses exit code 1 and no model flag.
    // Columns: stderr text, agent name, expected classification.
    let cases = [
        // Generic message containing "error" -> Transient.
        (
            "some error occurred",
            "opencode/opencode/glm-4.7-free",
            AgentErrorKind::Transient,
        ),
        // Message without the word "error" -> Permanent.
        (
            "something happened",
            "opencode/opencode/glm-4.7-free",
            AgentErrorKind::Permanent,
        ),
        // Rate-limit message -> RateLimited.
        (
            "rate limit exceeded",
            "opencode/zai/glm-4.7",
            AgentErrorKind::RateLimited,
        ),
    ];

    for (stderr, agent, expected) in cases {
        let actual = AgentErrorKind::classify_with_agent(1, stderr, Some(agent), None);
        // The loop hides which literal assertion failed, so name the case.
        assert_eq!(actual, expected, "stderr={:?}, agent={:?}", stderr, agent);
    }
}
792
/// A rate-limited error must expose both a non-empty description and
/// non-empty recovery advice.
#[test]
fn test_agent_error_kind_description_and_advice() {
    let kind = AgentErrorKind::RateLimited;

    let description = kind.description();
    assert!(!description.is_empty());

    let advice = kind.recovery_advice();
    assert!(!advice.is_empty());
}
799
/// Checks the suggested back-off for a sample of error kinds.
#[test]
fn test_agent_error_kind_suggested_wait_ms() {
    // No wait at all: RateLimited falls back to another agent immediately
    // (no retry), and Permanent also reports a zero wait.
    for kind in [AgentErrorKind::RateLimited, AgentErrorKind::Permanent] {
        assert_eq!(kind.suggested_wait_ms(), 0);
    }

    // Retriable kinds must suggest a strictly positive wait.
    for kind in [AgentErrorKind::ApiUnavailable, AgentErrorKind::NetworkError] {
        assert!(kind.suggested_wait_ms() > 0);
    }
}
809
/// Token exhaustion and a killed process should recommend a smaller
/// context; rate limiting should not.
#[test]
fn test_agent_error_kind_suggests_smaller_context() {
    // Kinds that recommend shrinking the context.
    for kind in [AgentErrorKind::TokenExhausted, AgentErrorKind::ProcessKilled] {
        assert!(kind.suggests_smaller_context());
    }

    // Rate limiting does not recommend a smaller context.
    let rate_limited = AgentErrorKind::RateLimited;
    assert!(!rate_limited.suggests_smaller_context());
}
816}
ralph_workflow/agents/error.rs

ralph_workflow/agents/
error.rs