ralph_workflow/agents/
error.rs

//! Error classification for agent failures.
//!
//! This module provides error classification logic to determine the appropriate
//! recovery strategy when an agent fails. Different error types warrant
//! different responses: retry, fall back to another agent, or abort.
6
/// Report whether `s` mentions a GLM-like model, ignoring letter case.
///
/// The recognised family markers are `glm`, `zhipuai`, `zai`, `qwen`, and
/// `deepseek`; a marker matches anywhere inside the string.
///
/// This is the general-purpose check (e.g., for prompt selection). To detect
/// only CCS/Claude-based agents running a GLM-like model (error handling),
/// use `is_glm_like_agent` instead.
pub fn contains_glm_model(s: &str) -> bool {
    const GLM_FAMILY_MARKERS: [&str; 5] = ["glm", "zhipuai", "zai", "qwen", "deepseek"];

    let lowered = s.to_lowercase();
    GLM_FAMILY_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker))
}
21
22/// Check if an agent is a CCS/Claude-based agent using a GLM-like model.
23///
24/// These agents have known compatibility issues because they use Claude CLI
25/// with GLM models via CCS (Claude Code Switch). They require:
26/// - The `-p` flag for non-interactive mode
27/// - Special error handling for GLM-specific quirks
28///
29/// This does NOT match OpenCode agents using GLM models, as OpenCode has
30/// its own mechanism (`--auto-approve`) and JSON format.
31///
32/// # Arguments
33///
34/// * `s` - The agent name or command string to check
35///
36/// # Returns
37///
38/// `true` if this is a CCS/Claude agent using a GLM-like model, `false` otherwise
39pub fn is_glm_like_agent(s: &str) -> bool {
40    let s_lower = s.to_lowercase();
41
42    // Must contain a GLM-like model name
43    if !contains_glm_model(&s_lower) {
44        return false;
45    }
46
47    // Exclude OpenCode agents - they have their own mechanism
48    if s_lower.starts_with("opencode") {
49        return false;
50    }
51
52    // Match CCS agents (ccs/glm, ccs/zai, etc.) or claude-based commands
53    s_lower.starts_with("ccs") || s_lower.contains("claude")
54}
55
/// The category assigned to a failed agent run.
///
/// The category drives the recovery strategy chosen after a failure:
/// - `should_retry()` - run the same agent again after a delay
/// - `should_fallback()` - move on to the next agent in the chain
/// - `is_unrecoverable()` - abort the pipeline
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AgentErrorKind {
    /// The API rejected the request because a rate limit was hit; retry after a delay.
    RateLimited,
    /// The token/context limit was exceeded; a different agent may be needed.
    TokenExhausted,
    /// The API is temporarily unavailable (server-side problem); retry.
    ApiUnavailable,
    /// Client-side network connectivity problem; retry.
    NetworkError,
    /// Authentication failed; switch agent.
    AuthFailure,
    /// The agent command could not be found; switch agent.
    CommandNotFound,
    /// The disk is out of space; the pipeline cannot continue.
    DiskFull,
    /// The process was killed (OOM, signal); a smaller context may help.
    ProcessKilled,
    /// The agent returned invalid JSON; may retry.
    InvalidResponse,
    /// The request or response timed out; retry.
    Timeout,
    /// A tool invocation failed (e.g., file write issues); fall back.
    ToolExecutionFailed,
    /// A known behavioral quirk of a specific agent; fall back with specific advice.
    AgentSpecificQuirk,
    /// An agent-specific issue that may be transient; retry before falling back.
    RetryableAgentQuirk,
    /// Some other transient error; retry.
    Transient,
    /// A permanent failure; do not retry.
    Permanent,
}
95
96impl AgentErrorKind {
97    /// Determine if this error should trigger a retry.
98    pub const fn should_retry(self) -> bool {
99        matches!(
100            self,
101            Self::RateLimited
102                | Self::ApiUnavailable
103                | Self::NetworkError
104                | Self::Timeout
105                | Self::InvalidResponse
106                | Self::RetryableAgentQuirk
107                | Self::Transient
108        )
109    }
110
111    /// Determine if this error should trigger a fallback to another agent.
112    pub const fn should_fallback(self) -> bool {
113        matches!(
114            self,
115            Self::TokenExhausted
116                | Self::AuthFailure
117                | Self::CommandNotFound
118                | Self::ProcessKilled
119                | Self::ToolExecutionFailed
120                | Self::AgentSpecificQuirk
121        )
122    }
123
124    /// Determine if this error is unrecoverable (should abort).
125    pub const fn is_unrecoverable(self) -> bool {
126        matches!(self, Self::DiskFull | Self::Permanent)
127    }
128
129    /// Check if this is a command not found error.
130    pub const fn is_command_not_found(self) -> bool {
131        matches!(self, Self::CommandNotFound)
132    }
133
134    /// Check if this is a network-related error.
135    pub const fn is_network_error(self) -> bool {
136        matches!(self, Self::NetworkError | Self::Timeout)
137    }
138
139    /// Check if this error might be resolved by reducing context size.
140    pub const fn suggests_smaller_context(self) -> bool {
141        matches!(self, Self::TokenExhausted | Self::ProcessKilled)
142    }
143
144    /// Get suggested wait time in milliseconds before retry.
145    pub const fn suggested_wait_ms(self) -> u64 {
146        match self {
147            Self::RateLimited => 5000,    // Rate limit: wait 5 seconds
148            Self::ApiUnavailable => 3000, // Server issue: wait 3 seconds
149            Self::NetworkError => 2000,   // Network: wait 2 seconds
150            Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, // Timeout/Transient: short wait
151            Self::InvalidResponse => 500, // Bad response: quick retry
152            _ => 0,                       // No wait for non-retryable errors
153        }
154    }
155
156    /// Get a user-friendly description of this error type.
157    pub const fn description(self) -> &'static str {
158        match self {
159            Self::RateLimited => "API rate limit exceeded",
160            Self::TokenExhausted => "Token/context limit exceeded",
161            Self::ApiUnavailable => "API service temporarily unavailable",
162            Self::NetworkError => "Network connectivity issue",
163            Self::AuthFailure => "Authentication failure",
164            Self::CommandNotFound => "Command not found",
165            Self::DiskFull => "Disk space exhausted",
166            Self::ProcessKilled => "Process terminated (possibly OOM)",
167            Self::InvalidResponse => "Invalid response from agent",
168            Self::Timeout => "Request timed out",
169            Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
170            Self::AgentSpecificQuirk => "Known agent-specific issue",
171            Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
172            Self::Transient => "Transient error",
173            Self::Permanent => "Permanent error",
174        }
175    }
176
177    /// Get recovery advice for this error type.
178    pub const fn recovery_advice(self) -> &'static str {
179        match self {
180            Self::RateLimited => {
181                "Will retry after delay. Tip: Consider reducing request frequency or using a different provider."
182            }
183            Self::TokenExhausted => {
184                "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
185            }
186            Self::ApiUnavailable => {
187                "API server issue. Will retry automatically. Tip: Check status page or try different provider."
188            }
189            Self::NetworkError => {
190                "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
191            }
192            Self::AuthFailure => {
193                "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
194            }
195            Self::CommandNotFound => {
196                "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
197            }
198            Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
199            Self::ProcessKilled => {
200                "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
201            }
202            Self::InvalidResponse => {
203                "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
204            }
205            Self::Timeout => {
206                "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
207            }
208            Self::ToolExecutionFailed => {
209                "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
210            }
211            Self::AgentSpecificQuirk => {
212                "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
213            }
214            Self::RetryableAgentQuirk => {
215                "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
216            }
217            Self::Transient => "Temporary issue. Will retry automatically.",
218            Self::Permanent => {
219                "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
220            }
221        }
222    }
223
224    /// Classify an error from exit code, output, and agent name.
225    ///
226    /// This variant takes the agent name into account for better classification.
227    /// Some agents have known failure patterns that should trigger fallback
228    /// instead of retry, even when the stderr output is generic.
229    ///
230    /// # Arguments
231    ///
232    /// * `exit_code` - The process exit code
233    /// * `stderr` - The standard error output from the agent
234    /// * `agent_name` - Optional agent name for context-aware classification
235    pub fn classify_with_agent(
236        exit_code: i32,
237        stderr: &str,
238        agent_name: Option<&str>,
239        model_flag: Option<&str>,
240    ) -> Self {
241        let stderr_lower = stderr.to_lowercase();
242
243        // Check for specific error patterns FIRST, before applying agent-specific heuristics.
244        // This ensures that token exhaustion is detected even for GLM-like agents.
245        if let Some(err) = Self::check_api_errors(&stderr_lower) {
246            return err;
247        }
248
249        if let Some(err) = Self::check_network_errors(&stderr_lower) {
250            return err;
251        }
252
253        if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
254            return err;
255        }
256
257        if let Some(err) = Self::check_tool_failures(&stderr_lower) {
258            return err;
259        }
260
261        // If we know this is a GLM-like agent and it failed with exit code 1
262        // (and we haven't matched a specific error pattern above),
263        // classify based on stderr content:
264        // - If stderr is empty or contains only generic messages, treat as RetryableAgentQuirk
265        // - If stderr contains specific error patterns, it will be caught by check_agent_specific_quirks below
266        let is_problematic_agent =
267            agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
268
269        if is_problematic_agent && exit_code == 1 {
270            // Check if stderr has known problematic patterns that indicate unrecoverable issues
271            let has_known_problematic_pattern = stderr_lower.contains("permission")
272                || stderr_lower.contains("denied")
273                || stderr_lower.contains("unauthorized")
274                || stderr_lower.contains("auth")
275                || stderr_lower.contains("token")
276                || stderr_lower.contains("limit")
277                || stderr_lower.contains("quota")
278                || stderr_lower.contains("disk")
279                || stderr_lower.contains("space")
280                // Agent-specific known patterns (from check_agent_specific_quirks)
281                || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
282                || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"));
283
284            if has_known_problematic_pattern {
285                // Known issue - should fallback
286                return Self::AgentSpecificQuirk;
287            }
288
289            // Unknown error - may be transient, should retry
290            return Self::RetryableAgentQuirk;
291        }
292
293        if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
294            return err;
295        }
296
297        if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
298            return err;
299        }
300
301        // Transient errors (exit codes that might succeed on retry)
302        // This is now a more specific catch-all for actual transient issues
303        if exit_code == 1 && stderr_lower.contains("error") {
304            // But only if it's not a known permanent issue pattern
305            // (permission, tool failures, GLM issues are already handled above)
306            return Self::Transient;
307        }
308
309        Self::Permanent
310    }
311
312    /// Check for API-level errors (rate limiting, auth, server issues).
313    fn check_api_errors(stderr_lower: &str) -> Option<Self> {
314        // Rate limiting indicators (API-side)
315        if stderr_lower.contains("rate limit")
316            || stderr_lower.contains("too many requests")
317            || stderr_lower.contains("429")
318            || stderr_lower.contains("quota exceeded")
319        {
320            return Some(Self::RateLimited);
321        }
322
323        // Token/context exhaustion (API-side)
324        // Check this BEFORE GLM agent-specific fallback to ensure TokenExhausted is detected
325        // Note: "too long" is specifically for API token limits, not OS argument limits
326        // We exclude "argument list too long" which is an E2BIG OS error
327        if stderr_lower.contains("token")
328            || stderr_lower.contains("context length")
329            || stderr_lower.contains("maximum context")
330            || stderr_lower.contains("input too large")
331            || (stderr_lower.contains("too long")
332                && !stderr_lower.contains("argument list too long"))
333        {
334            return Some(Self::TokenExhausted);
335        }
336
337        // Auth failures
338        if stderr_lower.contains("unauthorized")
339            || stderr_lower.contains("authentication")
340            || stderr_lower.contains("401")
341            || stderr_lower.contains("api key")
342            || stderr_lower.contains("invalid token")
343            || stderr_lower.contains("forbidden")
344            || stderr_lower.contains("403")
345            || stderr_lower.contains("access denied")
346        {
347            return Some(Self::AuthFailure);
348        }
349
350        None
351    }
352
353    /// Check for network and server-side errors.
354    fn check_network_errors(stderr_lower: &str) -> Option<Self> {
355        // Network errors (client-side connectivity issues)
356        if stderr_lower.contains("connection refused")
357            || stderr_lower.contains("network unreachable")
358            || stderr_lower.contains("dns resolution")
359            || stderr_lower.contains("name resolution")
360            || stderr_lower.contains("no route to host")
361            || stderr_lower.contains("network is down")
362            || stderr_lower.contains("host unreachable")
363            || stderr_lower.contains("connection reset")
364            || stderr_lower.contains("broken pipe")
365            || stderr_lower.contains("econnrefused")
366            || stderr_lower.contains("enetunreach")
367        {
368            return Some(Self::NetworkError);
369        }
370
371        // API unavailable (server-side issues)
372        if stderr_lower.contains("service unavailable")
373            || stderr_lower.contains("503")
374            || stderr_lower.contains("502")
375            || stderr_lower.contains("504")
376            || stderr_lower.contains("500")
377            || stderr_lower.contains("internal server error")
378            || stderr_lower.contains("bad gateway")
379            || stderr_lower.contains("gateway timeout")
380            || stderr_lower.contains("overloaded")
381            || stderr_lower.contains("maintenance")
382        {
383            return Some(Self::ApiUnavailable);
384        }
385
386        // Request timeout
387        if stderr_lower.contains("timeout")
388            || stderr_lower.contains("timed out")
389            || stderr_lower.contains("request timeout")
390            || stderr_lower.contains("deadline exceeded")
391        {
392            return Some(Self::Timeout);
393        }
394
395        None
396    }
397
398    /// Check for resource exhaustion errors (disk, memory, process, arg list).
399    fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
400        // Disk space exhaustion
401        if stderr_lower.contains("no space left")
402            || stderr_lower.contains("disk full")
403            || stderr_lower.contains("enospc")
404            || stderr_lower.contains("out of disk")
405            || stderr_lower.contains("insufficient storage")
406        {
407            return Some(Self::DiskFull);
408        }
409
410        // Argument list too long (E2BIG) - prompt exceeds OS limit
411        // Exit code 7 is the E2BIG errno value used by spawn_agent_process
412        // This should trigger fallback to another agent (the prompt size issue
413        // may be transient due to XSD retry context accumulation)
414        if exit_code == 7
415            || stderr_lower.contains("argument list too long")
416            || stderr_lower.contains("e2big")
417        {
418            return Some(Self::ToolExecutionFailed);
419        }
420
421        // Process killed (OOM or signals)
422        // Exit code 137 = 128 + 9 (SIGKILL), 139 = 128 + 11 (SIGSEGV)
423        if exit_code == 137
424            || exit_code == 139
425            || exit_code == -9
426            || stderr_lower.contains("killed")
427            || stderr_lower.contains("oom")
428            || stderr_lower.contains("out of memory")
429            || stderr_lower.contains("memory exhausted")
430            || stderr_lower.contains("cannot allocate")
431            || stderr_lower.contains("segmentation fault")
432            || stderr_lower.contains("sigsegv")
433            || stderr_lower.contains("sigkill")
434        {
435            return Some(Self::ProcessKilled);
436        }
437
438        None
439    }
440
441    /// Check for tool and file operation failures.
442    fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
443        // Invalid JSON response
444        if stderr_lower.contains("invalid json")
445            || stderr_lower.contains("json parse")
446            || stderr_lower.contains("unexpected token")
447            || stderr_lower.contains("malformed")
448            || stderr_lower.contains("truncated response")
449            || stderr_lower.contains("incomplete response")
450        {
451            return Some(Self::InvalidResponse);
452        }
453
454        // Tool execution failures (file writes, tool calls, etc.)
455        // These should trigger fallback, not retry
456        if stderr_lower.contains("write error")
457            || stderr_lower.contains("cannot write")
458            || stderr_lower.contains("failed to write")
459            || stderr_lower.contains("unable to create file")
460            || stderr_lower.contains("file creation failed")
461            || stderr_lower.contains("i/o error")
462            || stderr_lower.contains("io error")
463            || stderr_lower.contains("tool failed")
464            || stderr_lower.contains("tool execution failed")
465            || stderr_lower.contains("tool call failed")
466        {
467            return Some(Self::ToolExecutionFailed);
468        }
469
470        // Permission denied errors (specific patterns that should fallback)
471        // These need to be checked BEFORE the generic "error" catch-all
472        // Note: "access denied" is already caught by AuthFailure above (for HTTP 403)
473        // This catches file-system permission errors specifically
474        if stderr_lower.contains("permission denied")
475            || stderr_lower.contains("operation not permitted")
476            || stderr_lower.contains("insufficient permissions")
477            || stderr_lower.contains("eacces")
478            || stderr_lower.contains("eperm")
479        {
480            return Some(Self::ToolExecutionFailed);
481        }
482
483        None
484    }
485
486    /// Check for agent-specific quirks that should trigger fallback.
487    fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
488        // GLM/CCS-specific known issues
489        // These are known quirks that should trigger fallback
490        // Check for CCS-specific error patterns
491        if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
492            // CCS/GLM with exit code 1 is likely a permission/tool issue
493            if exit_code == 1 {
494                return Some(Self::AgentSpecificQuirk);
495            }
496            // CCS-specific error patterns
497            if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
498                return Some(Self::AgentSpecificQuirk);
499            }
500            // GLM-specific permission errors
501            if stderr_lower.contains("glm")
502                && (stderr_lower.contains("permission")
503                    || stderr_lower.contains("denied")
504                    || stderr_lower.contains("unauthorized"))
505            {
506                return Some(Self::AgentSpecificQuirk);
507            }
508        }
509
510        // Fallback for GLM with any error and exit code 1
511        if stderr_lower.contains("glm") && exit_code == 1 {
512            return Some(Self::AgentSpecificQuirk);
513        }
514
515        None
516    }
517
518    /// Check for command not found errors.
519    fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
520        // Command not found (keep this after permission checks since permission
521        // errors also contain "permission denied")
522        if exit_code == 127
523            || exit_code == 126
524            || stderr_lower.contains("command not found")
525            || stderr_lower.contains("not found")
526            || stderr_lower.contains("no such file")
527        {
528            return Some(Self::CommandNotFound);
529        }
530
531        None
532    }
533}
534
#[cfg(test)]
mod tests {
    use super::*;

    /// Classify a failure without any agent name or model flag.
    fn classify(exit_code: i32, stderr: &str) -> AgentErrorKind {
        AgentErrorKind::classify_with_agent(exit_code, stderr, None, None)
    }

    #[test]
    fn test_is_glm_like_agent() {
        // CCS GLM agents (case-insensitive) and Claude with a GLM model flag.
        let glm_like = [
            "ccs/glm",
            "ccs/zai",
            "ccs/zhipuai",
            "ccs/qwen",
            "ccs/deepseek",
            "CCS/GLM",
            "claude -m glm-4",
        ];
        for &agent in &glm_like {
            assert!(is_glm_like_agent(agent), "expected a match for {agent:?}");
        }

        let not_glm_like = [
            // OpenCode agents with GLM (OpenCode has its own mechanism)
            "opencode/opencode/glm-4.7-free",
            "opencode/zai/glm-4.7",
            "opencode run -m glm",
            // Non-GLM agents
            "claude",
            "codex",
            "ccs/work",
            "ccs/personal",
            // Model name alone, without ccs/claude
            "glm-4.7-free",
            "zai/glm-4.7",
        ];
        for &agent in &not_glm_like {
            assert!(!is_glm_like_agent(agent), "expected no match for {agent:?}");
        }
    }

    #[test]
    fn test_agent_error_kind_should_retry() {
        let retryable = [
            AgentErrorKind::RateLimited,
            AgentErrorKind::ApiUnavailable,
            AgentErrorKind::NetworkError,
            AgentErrorKind::Timeout,
            AgentErrorKind::InvalidResponse,
            AgentErrorKind::Transient,
            AgentErrorKind::RetryableAgentQuirk,
        ];
        for &kind in &retryable {
            assert!(kind.should_retry(), "{kind:?} should retry");
        }

        let not_retryable = [
            AgentErrorKind::AuthFailure,
            AgentErrorKind::CommandNotFound,
            AgentErrorKind::Permanent,
        ];
        for &kind in &not_retryable {
            assert!(!kind.should_retry(), "{kind:?} should not retry");
        }
    }

    #[test]
    fn test_agent_error_kind_should_fallback() {
        let fallback = [
            AgentErrorKind::TokenExhausted,
            AgentErrorKind::AuthFailure,
            AgentErrorKind::CommandNotFound,
            AgentErrorKind::ProcessKilled,
            AgentErrorKind::ToolExecutionFailed,
            AgentErrorKind::AgentSpecificQuirk,
        ];
        for &kind in &fallback {
            assert!(kind.should_fallback(), "{kind:?} should fall back");
        }

        let no_fallback = [AgentErrorKind::RateLimited, AgentErrorKind::Permanent];
        for &kind in &no_fallback {
            assert!(!kind.should_fallback(), "{kind:?} should not fall back");
        }
    }

    #[test]
    fn test_agent_error_kind_is_unrecoverable() {
        for &kind in &[AgentErrorKind::DiskFull, AgentErrorKind::Permanent] {
            assert!(kind.is_unrecoverable(), "{kind:?} should be unrecoverable");
        }
        for &kind in &[AgentErrorKind::RateLimited, AgentErrorKind::AuthFailure] {
            assert!(!kind.is_unrecoverable(), "{kind:?} should be recoverable");
        }
    }

    #[test]
    fn test_agent_error_kind_classify() {
        // (exit code, stderr, expected kind) without agent context.
        let without_agent = [
            // Rate limiting
            (1, "rate limit exceeded", AgentErrorKind::RateLimited),
            (1, "error 429", AgentErrorKind::RateLimited),
            // Auth failure
            (1, "unauthorized", AgentErrorKind::AuthFailure),
            (1, "error 401", AgentErrorKind::AuthFailure),
            // Command not found
            (127, "", AgentErrorKind::CommandNotFound),
            (1, "command not found", AgentErrorKind::CommandNotFound),
            // Process killed
            (137, "", AgentErrorKind::ProcessKilled),
            (1, "out of memory", AgentErrorKind::ProcessKilled),
            // Tool execution failures
            (1, "write error", AgentErrorKind::ToolExecutionFailed),
            (1, "tool failed", AgentErrorKind::ToolExecutionFailed),
            (1, "failed to write", AgentErrorKind::ToolExecutionFailed),
            // Permission denied errors (should fallback, not retry)
            (1, "permission denied", AgentErrorKind::ToolExecutionFailed),
            (1, "operation not permitted", AgentErrorKind::ToolExecutionFailed),
            (1, "insufficient permissions", AgentErrorKind::ToolExecutionFailed),
            // Argument list too long (E2BIG) - should trigger fallback
            (7, "argument list too long", AgentErrorKind::ToolExecutionFailed),
            (
                7,
                "opencode: Argument list too long (prompt exceeds OS limit)",
                AgentErrorKind::ToolExecutionFailed,
            ),
            // "access denied" is caught by AuthFailure earlier (HTTP 403)
            (1, "access denied", AgentErrorKind::AuthFailure),
            // GLM-specific known issues
            (1, "glm error", AgentErrorKind::AgentSpecificQuirk),
            (1, "ccs glm failed", AgentErrorKind::AgentSpecificQuirk),
            // Generic exit code 1 with "error" that no earlier pattern handles
            (1, "some random error", AgentErrorKind::Transient),
        ];
        for &(exit_code, stderr, expected) in &without_agent {
            assert_eq!(
                classify(exit_code, stderr),
                expected,
                "exit code {exit_code}, stderr {stderr:?}"
            );
        }

        // (stderr, expected kind) for the GLM-like agent "ccs/glm" with exit code 1.
        let with_glm_agent = [
            // Unknown error (no specific pattern) is retried
            ("some random error", AgentErrorKind::RetryableAgentQuirk),
            // Caught by the earlier checks before the GLM heuristic runs
            ("permission denied", AgentErrorKind::ToolExecutionFailed),
            ("token limit exceeded", AgentErrorKind::TokenExhausted),
            ("disk full", AgentErrorKind::DiskFull),
            // GLM mentioned in stderr together with "failed"
            ("glm failed", AgentErrorKind::AgentSpecificQuirk),
        ];
        for &(stderr, expected) in &with_glm_agent {
            assert_eq!(
                AgentErrorKind::classify_with_agent(1, stderr, Some("ccs/glm"), None),
                expected,
                "ccs/glm, stderr {stderr:?}"
            );
        }
    }

    #[test]
    fn test_opencode_error_classification_not_treated_as_glm() {
        // OpenCode agents must receive the normal classification rather than
        // RetryableAgentQuirk, even when they run a GLM model.
        // (stderr, agent name, expected kind), all with exit code 1.
        let cases = [
            // Generic error: Transient, not RetryableAgentQuirk
            (
                "some error occurred",
                "opencode/opencode/glm-4.7-free",
                AgentErrorKind::Transient,
            ),
            // No "error" in stderr: Permanent
            (
                "something happened",
                "opencode/opencode/glm-4.7-free",
                AgentErrorKind::Permanent,
            ),
            // Rate limit: RateLimited
            (
                "rate limit exceeded",
                "opencode/zai/glm-4.7",
                AgentErrorKind::RateLimited,
            ),
        ];
        for &(stderr, agent, expected) in &cases {
            assert_eq!(
                AgentErrorKind::classify_with_agent(1, stderr, Some(agent), None),
                expected,
                "agent {agent:?}, stderr {stderr:?}"
            );
        }
    }

    #[test]
    fn test_agent_error_kind_description_and_advice() {
        let kind = AgentErrorKind::RateLimited;
        assert!(!kind.description().is_empty());
        assert!(!kind.recovery_advice().is_empty());
    }

    #[test]
    fn test_agent_error_kind_suggested_wait_ms() {
        assert_eq!(AgentErrorKind::RateLimited.suggested_wait_ms(), 5000);
        assert_eq!(AgentErrorKind::Permanent.suggested_wait_ms(), 0);
    }

    #[test]
    fn test_agent_error_kind_suggests_smaller_context() {
        assert!(AgentErrorKind::TokenExhausted.suggests_smaller_context());
        assert!(AgentErrorKind::ProcessKilled.suggests_smaller_context());
        assert!(!AgentErrorKind::RateLimited.suggests_smaller_context());
    }
}
ralph_workflow/agents/error.rs

ralph_workflow/agents/
error.rs