/// Returns `true` when `s` names a GLM-like agent or model.
///
/// The comparison is case-insensitive: `s` is lowercased and then searched for
/// any of the substrings `glm`, `zhipuai`, `zai`, `qwen` or `deepseek`.
pub fn is_glm_like_agent(s: &str) -> bool {
    // Substrings that mark an agent/model identifier as GLM-like.
    const GLM_LIKE_MARKERS: [&str; 5] = ["glm", "zhipuai", "zai", "qwen", "deepseek"];

    let normalized = s.to_lowercase();
    GLM_LIKE_MARKERS
        .iter()
        .any(|&marker| normalized.contains(marker))
}
28
/// Category assigned to a failed agent invocation.
///
/// Values are produced by `AgentErrorKind::classify_with_agent` from the
/// process exit code and stderr text, and drive the retry / fallback / abort
/// predicates defined on this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AgentErrorKind {
    /// API rate limit exceeded.
    RateLimited,
    /// Token/context limit exceeded.
    TokenExhausted,
    /// API service temporarily unavailable.
    ApiUnavailable,
    /// Network connectivity issue.
    NetworkError,
    /// Authentication failure.
    AuthFailure,
    /// Command not found.
    CommandNotFound,
    /// Disk space exhausted.
    DiskFull,
    /// Process terminated (possibly OOM).
    ProcessKilled,
    /// Invalid response from agent.
    InvalidResponse,
    /// Request timed out.
    Timeout,
    /// Tool execution failed (e.g., file write).
    ToolExecutionFailed,
    /// Known agent-specific issue.
    AgentSpecificQuirk,
    /// Agent-specific issue that may be transient.
    RetryableAgentQuirk,
    /// Transient error.
    Transient,
    /// Permanent error.
    Permanent,
}
68
69impl AgentErrorKind {
70 pub const fn should_retry(self) -> bool {
72 matches!(
73 self,
74 Self::RateLimited
75 | Self::ApiUnavailable
76 | Self::NetworkError
77 | Self::Timeout
78 | Self::InvalidResponse
79 | Self::RetryableAgentQuirk
80 | Self::Transient
81 )
82 }
83
84 pub const fn should_fallback(self) -> bool {
86 matches!(
87 self,
88 Self::TokenExhausted
89 | Self::AuthFailure
90 | Self::CommandNotFound
91 | Self::ProcessKilled
92 | Self::ToolExecutionFailed
93 | Self::AgentSpecificQuirk
94 )
95 }
96
97 pub const fn is_unrecoverable(self) -> bool {
99 matches!(self, Self::DiskFull | Self::Permanent)
100 }
101
102 pub const fn is_command_not_found(self) -> bool {
104 matches!(self, Self::CommandNotFound)
105 }
106
107 pub const fn is_network_error(self) -> bool {
109 matches!(self, Self::NetworkError | Self::Timeout)
110 }
111
112 pub const fn suggests_smaller_context(self) -> bool {
114 matches!(self, Self::TokenExhausted | Self::ProcessKilled)
115 }
116
117 pub const fn suggested_wait_ms(self) -> u64 {
119 match self {
120 Self::RateLimited => 5000, Self::ApiUnavailable => 3000, Self::NetworkError => 2000, Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, Self::InvalidResponse => 500, _ => 0, }
127 }
128
129 pub const fn description(self) -> &'static str {
131 match self {
132 Self::RateLimited => "API rate limit exceeded",
133 Self::TokenExhausted => "Token/context limit exceeded",
134 Self::ApiUnavailable => "API service temporarily unavailable",
135 Self::NetworkError => "Network connectivity issue",
136 Self::AuthFailure => "Authentication failure",
137 Self::CommandNotFound => "Command not found",
138 Self::DiskFull => "Disk space exhausted",
139 Self::ProcessKilled => "Process terminated (possibly OOM)",
140 Self::InvalidResponse => "Invalid response from agent",
141 Self::Timeout => "Request timed out",
142 Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
143 Self::AgentSpecificQuirk => "Known agent-specific issue",
144 Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
145 Self::Transient => "Transient error",
146 Self::Permanent => "Permanent error",
147 }
148 }
149
150 pub const fn recovery_advice(self) -> &'static str {
152 match self {
153 Self::RateLimited => {
154 "Will retry after delay. Tip: Consider reducing request frequency or using a different provider."
155 }
156 Self::TokenExhausted => {
157 "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
158 }
159 Self::ApiUnavailable => {
160 "API server issue. Will retry automatically. Tip: Check status page or try different provider."
161 }
162 Self::NetworkError => {
163 "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
164 }
165 Self::AuthFailure => {
166 "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
167 }
168 Self::CommandNotFound => {
169 "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
170 }
171 Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
172 Self::ProcessKilled => {
173 "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
174 }
175 Self::InvalidResponse => {
176 "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
177 }
178 Self::Timeout => {
179 "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
180 }
181 Self::ToolExecutionFailed => {
182 "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
183 }
184 Self::AgentSpecificQuirk => {
185 "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
186 }
187 Self::RetryableAgentQuirk => {
188 "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
189 }
190 Self::Transient => "Temporary issue. Will retry automatically.",
191 Self::Permanent => {
192 "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
193 }
194 }
195 }
196
197 pub fn classify_with_agent(
209 exit_code: i32,
210 stderr: &str,
211 agent_name: Option<&str>,
212 model_flag: Option<&str>,
213 ) -> Self {
214 let stderr_lower = stderr.to_lowercase();
215
216 if let Some(err) = Self::check_api_errors(&stderr_lower) {
219 return err;
220 }
221
222 if let Some(err) = Self::check_network_errors(&stderr_lower) {
223 return err;
224 }
225
226 if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
227 return err;
228 }
229
230 if let Some(err) = Self::check_tool_failures(&stderr_lower) {
231 return err;
232 }
233
234 let is_problematic_agent =
240 agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
241
242 if is_problematic_agent && exit_code == 1 {
243 let has_known_problematic_pattern = stderr_lower.contains("permission")
245 || stderr_lower.contains("denied")
246 || stderr_lower.contains("unauthorized")
247 || stderr_lower.contains("auth")
248 || stderr_lower.contains("token")
249 || stderr_lower.contains("limit")
250 || stderr_lower.contains("quota")
251 || stderr_lower.contains("disk")
252 || stderr_lower.contains("space")
253 || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
255 || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"))
256 || (stderr_lower.contains("glm")
257 && (stderr_lower.contains("permission")
258 || stderr_lower.contains("denied")
259 || stderr_lower.contains("unauthorized")));
260
261 if has_known_problematic_pattern {
262 return Self::AgentSpecificQuirk;
264 }
265
266 return Self::RetryableAgentQuirk;
268 }
269
270 if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
271 return err;
272 }
273
274 if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
275 return err;
276 }
277
278 if exit_code == 1 && stderr_lower.contains("error") {
281 return Self::Transient;
284 }
285
286 Self::Permanent
287 }
288
289 fn check_api_errors(stderr_lower: &str) -> Option<Self> {
291 if stderr_lower.contains("rate limit")
293 || stderr_lower.contains("too many requests")
294 || stderr_lower.contains("429")
295 || stderr_lower.contains("quota exceeded")
296 {
297 return Some(Self::RateLimited);
298 }
299
300 if stderr_lower.contains("token")
303 || stderr_lower.contains("context length")
304 || stderr_lower.contains("maximum context")
305 || stderr_lower.contains("too long")
306 || stderr_lower.contains("input too large")
307 {
308 return Some(Self::TokenExhausted);
309 }
310
311 if stderr_lower.contains("unauthorized")
313 || stderr_lower.contains("authentication")
314 || stderr_lower.contains("401")
315 || stderr_lower.contains("api key")
316 || stderr_lower.contains("invalid token")
317 || stderr_lower.contains("forbidden")
318 || stderr_lower.contains("403")
319 || stderr_lower.contains("access denied")
320 {
321 return Some(Self::AuthFailure);
322 }
323
324 None
325 }
326
327 fn check_network_errors(stderr_lower: &str) -> Option<Self> {
329 if stderr_lower.contains("connection refused")
331 || stderr_lower.contains("network unreachable")
332 || stderr_lower.contains("dns resolution")
333 || stderr_lower.contains("name resolution")
334 || stderr_lower.contains("no route to host")
335 || stderr_lower.contains("network is down")
336 || stderr_lower.contains("host unreachable")
337 || stderr_lower.contains("connection reset")
338 || stderr_lower.contains("broken pipe")
339 || stderr_lower.contains("econnrefused")
340 || stderr_lower.contains("enetunreach")
341 {
342 return Some(Self::NetworkError);
343 }
344
345 if stderr_lower.contains("service unavailable")
347 || stderr_lower.contains("503")
348 || stderr_lower.contains("502")
349 || stderr_lower.contains("504")
350 || stderr_lower.contains("500")
351 || stderr_lower.contains("internal server error")
352 || stderr_lower.contains("bad gateway")
353 || stderr_lower.contains("gateway timeout")
354 || stderr_lower.contains("overloaded")
355 || stderr_lower.contains("maintenance")
356 {
357 return Some(Self::ApiUnavailable);
358 }
359
360 if stderr_lower.contains("timeout")
362 || stderr_lower.contains("timed out")
363 || stderr_lower.contains("request timeout")
364 || stderr_lower.contains("deadline exceeded")
365 {
366 return Some(Self::Timeout);
367 }
368
369 None
370 }
371
372 fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
374 if stderr_lower.contains("no space left")
376 || stderr_lower.contains("disk full")
377 || stderr_lower.contains("enospc")
378 || stderr_lower.contains("out of disk")
379 || stderr_lower.contains("insufficient storage")
380 {
381 return Some(Self::DiskFull);
382 }
383
384 if exit_code == 137
387 || exit_code == 139
388 || exit_code == -9
389 || stderr_lower.contains("killed")
390 || stderr_lower.contains("oom")
391 || stderr_lower.contains("out of memory")
392 || stderr_lower.contains("memory exhausted")
393 || stderr_lower.contains("cannot allocate")
394 || stderr_lower.contains("segmentation fault")
395 || stderr_lower.contains("sigsegv")
396 || stderr_lower.contains("sigkill")
397 {
398 return Some(Self::ProcessKilled);
399 }
400
401 None
402 }
403
404 fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
406 if stderr_lower.contains("invalid json")
408 || stderr_lower.contains("json parse")
409 || stderr_lower.contains("unexpected token")
410 || stderr_lower.contains("malformed")
411 || stderr_lower.contains("truncated response")
412 || stderr_lower.contains("incomplete response")
413 {
414 return Some(Self::InvalidResponse);
415 }
416
417 if stderr_lower.contains("write error")
420 || stderr_lower.contains("cannot write")
421 || stderr_lower.contains("failed to write")
422 || stderr_lower.contains("unable to create file")
423 || stderr_lower.contains("file creation failed")
424 || stderr_lower.contains("i/o error")
425 || stderr_lower.contains("io error")
426 || stderr_lower.contains("tool failed")
427 || stderr_lower.contains("tool execution failed")
428 || stderr_lower.contains("tool call failed")
429 {
430 return Some(Self::ToolExecutionFailed);
431 }
432
433 if stderr_lower.contains("permission denied")
438 || stderr_lower.contains("operation not permitted")
439 || stderr_lower.contains("insufficient permissions")
440 || stderr_lower.contains("eacces")
441 || stderr_lower.contains("eperm")
442 {
443 return Some(Self::ToolExecutionFailed);
444 }
445
446 None
447 }
448
449 fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
451 if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
455 if exit_code == 1 {
457 return Some(Self::AgentSpecificQuirk);
458 }
459 if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
461 return Some(Self::AgentSpecificQuirk);
462 }
463 if stderr_lower.contains("glm")
465 && (stderr_lower.contains("permission")
466 || stderr_lower.contains("denied")
467 || stderr_lower.contains("unauthorized"))
468 {
469 return Some(Self::AgentSpecificQuirk);
470 }
471 }
472
473 if stderr_lower.contains("glm") && exit_code == 1 {
475 return Some(Self::AgentSpecificQuirk);
476 }
477
478 None
479 }
480
481 fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
483 if exit_code == 127
486 || exit_code == 126
487 || stderr_lower.contains("command not found")
488 || stderr_lower.contains("not found")
489 || stderr_lower.contains("no such file")
490 {
491 return Some(Self::CommandNotFound);
492 }
493
494 None
495 }
496}
497
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies with no agent name and no model flag.
    fn classify(exit_code: i32, stderr: &str) -> AgentErrorKind {
        AgentErrorKind::classify_with_agent(exit_code, stderr, None, None)
    }

    #[test]
    fn test_agent_error_kind_should_retry() {
        let retryable = [
            AgentErrorKind::RateLimited,
            AgentErrorKind::ApiUnavailable,
            AgentErrorKind::NetworkError,
            AgentErrorKind::Timeout,
            AgentErrorKind::InvalidResponse,
            AgentErrorKind::Transient,
            AgentErrorKind::RetryableAgentQuirk,
        ];
        for kind in retryable {
            assert!(kind.should_retry(), "{kind:?} should be retryable");
        }

        let not_retryable = [
            AgentErrorKind::AuthFailure,
            AgentErrorKind::CommandNotFound,
            AgentErrorKind::Permanent,
        ];
        for kind in not_retryable {
            assert!(!kind.should_retry(), "{kind:?} should not be retryable");
        }
    }

    #[test]
    fn test_agent_error_kind_should_fallback() {
        let fallback = [
            AgentErrorKind::TokenExhausted,
            AgentErrorKind::AuthFailure,
            AgentErrorKind::CommandNotFound,
            AgentErrorKind::ProcessKilled,
            AgentErrorKind::ToolExecutionFailed,
            AgentErrorKind::AgentSpecificQuirk,
        ];
        for kind in fallback {
            assert!(kind.should_fallback(), "{kind:?} should trigger fallback");
        }

        let no_fallback = [AgentErrorKind::RateLimited, AgentErrorKind::Permanent];
        for kind in no_fallback {
            assert!(!kind.should_fallback(), "{kind:?} should not trigger fallback");
        }
    }

    #[test]
    fn test_agent_error_kind_is_unrecoverable() {
        let unrecoverable = [AgentErrorKind::DiskFull, AgentErrorKind::Permanent];
        for kind in unrecoverable {
            assert!(kind.is_unrecoverable(), "{kind:?} should be unrecoverable");
        }

        let recoverable = [AgentErrorKind::RateLimited, AgentErrorKind::AuthFailure];
        for kind in recoverable {
            assert!(!kind.is_unrecoverable(), "{kind:?} should be recoverable");
        }
    }

    #[test]
    fn test_agent_error_kind_classify() {
        // (exit code, stderr, expected kind) with no agent or model hint.
        let plain_cases = [
            // Rate limiting
            (1, "rate limit exceeded", AgentErrorKind::RateLimited),
            (1, "error 429", AgentErrorKind::RateLimited),
            // Authentication
            (1, "unauthorized", AgentErrorKind::AuthFailure),
            (1, "error 401", AgentErrorKind::AuthFailure),
            // Missing command
            (127, "", AgentErrorKind::CommandNotFound),
            (1, "command not found", AgentErrorKind::CommandNotFound),
            // Killed process
            (137, "", AgentErrorKind::ProcessKilled),
            (1, "out of memory", AgentErrorKind::ProcessKilled),
            // Tool / write failures
            (1, "write error", AgentErrorKind::ToolExecutionFailed),
            (1, "tool failed", AgentErrorKind::ToolExecutionFailed),
            (1, "failed to write", AgentErrorKind::ToolExecutionFailed),
            // Permission problems are tool failures
            (1, "permission denied", AgentErrorKind::ToolExecutionFailed),
            (1, "operation not permitted", AgentErrorKind::ToolExecutionFailed),
            (1, "insufficient permissions", AgentErrorKind::ToolExecutionFailed),
            // "access denied" is an authentication failure
            (1, "access denied", AgentErrorKind::AuthFailure),
            // Agent mentions in stderr
            (1, "glm error", AgentErrorKind::AgentSpecificQuirk),
            (1, "ccs glm failed", AgentErrorKind::AgentSpecificQuirk),
            // Generic exit code 1 with "error"
            (1, "some random error", AgentErrorKind::Transient),
        ];
        for (exit_code, stderr, expected) in plain_cases {
            assert_eq!(
                classify(exit_code, stderr),
                expected,
                "exit code {exit_code}, stderr {stderr:?}"
            );
        }

        // (stderr, expected kind) for exit code 1 with the agent "ccs/glm".
        let glm_cases = [
            ("some random error", AgentErrorKind::RetryableAgentQuirk),
            ("permission denied", AgentErrorKind::ToolExecutionFailed),
            ("token limit exceeded", AgentErrorKind::TokenExhausted),
            ("disk full", AgentErrorKind::DiskFull),
            ("glm failed", AgentErrorKind::AgentSpecificQuirk),
        ];
        for (stderr, expected) in glm_cases {
            assert_eq!(
                AgentErrorKind::classify_with_agent(1, stderr, Some("ccs/glm"), None),
                expected,
                "agent ccs/glm, stderr {stderr:?}"
            );
        }
    }

    #[test]
    fn test_agent_error_kind_description_and_advice() {
        let kind = AgentErrorKind::RateLimited;
        assert!(!kind.description().is_empty());
        assert!(!kind.recovery_advice().is_empty());
    }

    #[test]
    fn test_agent_error_kind_suggested_wait_ms() {
        let expected_waits = [
            (AgentErrorKind::RateLimited, 5000),
            (AgentErrorKind::Permanent, 0),
        ];
        for (kind, wait_ms) in expected_waits {
            assert_eq!(kind.suggested_wait_ms(), wait_ms, "{kind:?}");
        }
    }

    #[test]
    fn test_agent_error_kind_suggests_smaller_context() {
        let expectations = [
            (AgentErrorKind::TokenExhausted, true),
            (AgentErrorKind::ProcessKilled, true),
            (AgentErrorKind::RateLimited, false),
        ];
        for (kind, expected) in expectations {
            assert_eq!(kind.suggests_smaller_context(), expected, "{kind:?}");
        }
    }
}