/// Returns `true` when `s` contains, case-insensitively, any of the
/// substrings `glm`, `zhipuai`, `zai`, `qwen`, or `deepseek`.
///
/// The match is a plain substring test on the lowercased input, so the
/// marker may appear anywhere in the string (e.g. `"ccs/glm"`).
pub fn is_glm_like_agent(s: &str) -> bool {
    const MARKERS: [&str; 5] = ["glm", "zhipuai", "zai", "qwen", "deepseek"];

    let normalized = s.to_lowercase();
    MARKERS.iter().any(|marker| normalized.contains(*marker))
}
28
/// Category assigned to a failed agent invocation.
///
/// The variants are plain, field-less tags; the type is `Copy` and can be
/// compared for equality.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AgentErrorKind {
    /// API rate limit exceeded.
    RateLimited,
    /// Token/context limit exceeded.
    TokenExhausted,
    /// API service temporarily unavailable.
    ApiUnavailable,
    /// Network connectivity issue.
    NetworkError,
    /// Authentication failure.
    AuthFailure,
    /// Command not found.
    CommandNotFound,
    /// Disk space exhausted.
    DiskFull,
    /// Process terminated (possibly OOM).
    ProcessKilled,
    /// Invalid response from agent.
    InvalidResponse,
    /// Request timed out.
    Timeout,
    /// Tool execution failed (e.g., file write).
    ToolExecutionFailed,
    /// Known agent-specific issue.
    AgentSpecificQuirk,
    /// Transient error.
    Transient,
    /// Permanent error.
    Permanent,
}
66
67impl AgentErrorKind {
68 pub const fn should_retry(self) -> bool {
70 matches!(
71 self,
72 Self::RateLimited
73 | Self::ApiUnavailable
74 | Self::NetworkError
75 | Self::Timeout
76 | Self::InvalidResponse
77 | Self::Transient
78 )
79 }
80
81 pub const fn should_fallback(self) -> bool {
83 matches!(
84 self,
85 Self::TokenExhausted
86 | Self::AuthFailure
87 | Self::CommandNotFound
88 | Self::ProcessKilled
89 | Self::ToolExecutionFailed
90 | Self::AgentSpecificQuirk
91 )
92 }
93
94 pub const fn is_unrecoverable(self) -> bool {
96 matches!(self, Self::DiskFull | Self::Permanent)
97 }
98
99 pub const fn is_command_not_found(self) -> bool {
101 matches!(self, Self::CommandNotFound)
102 }
103
104 pub const fn is_network_error(self) -> bool {
106 matches!(self, Self::NetworkError | Self::Timeout)
107 }
108
109 pub const fn suggests_smaller_context(self) -> bool {
111 matches!(self, Self::TokenExhausted | Self::ProcessKilled)
112 }
113
114 pub const fn suggested_wait_ms(self) -> u64 {
116 match self {
117 Self::RateLimited => 5000, Self::ApiUnavailable => 3000, Self::NetworkError => 2000, Self::Timeout | Self::Transient => 1000, Self::InvalidResponse => 500, _ => 0, }
124 }
125
126 pub const fn description(self) -> &'static str {
128 match self {
129 Self::RateLimited => "API rate limit exceeded",
130 Self::TokenExhausted => "Token/context limit exceeded",
131 Self::ApiUnavailable => "API service temporarily unavailable",
132 Self::NetworkError => "Network connectivity issue",
133 Self::AuthFailure => "Authentication failure",
134 Self::CommandNotFound => "Command not found",
135 Self::DiskFull => "Disk space exhausted",
136 Self::ProcessKilled => "Process terminated (possibly OOM)",
137 Self::InvalidResponse => "Invalid response from agent",
138 Self::Timeout => "Request timed out",
139 Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
140 Self::AgentSpecificQuirk => "Known agent-specific issue",
141 Self::Transient => "Transient error",
142 Self::Permanent => "Permanent error",
143 }
144 }
145
146 pub const fn recovery_advice(self) -> &'static str {
148 match self {
149 Self::RateLimited => {
150 "Will retry after delay. Tip: Consider reducing request frequency or using a different provider."
151 }
152 Self::TokenExhausted => {
153 "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
154 }
155 Self::ApiUnavailable => {
156 "API server issue. Will retry automatically. Tip: Check status page or try different provider."
157 }
158 Self::NetworkError => {
159 "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
160 }
161 Self::AuthFailure => {
162 "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
163 }
164 Self::CommandNotFound => {
165 "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
166 }
167 Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
168 Self::ProcessKilled => {
169 "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
170 }
171 Self::InvalidResponse => {
172 "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
173 }
174 Self::Timeout => {
175 "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
176 }
177 Self::ToolExecutionFailed => {
178 "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
179 }
180 Self::AgentSpecificQuirk => {
181 "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
182 }
183 Self::Transient => "Temporary issue. Will retry automatically.",
184 Self::Permanent => {
185 "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
186 }
187 }
188 }
189
190 pub fn classify_with_agent(
202 exit_code: i32,
203 stderr: &str,
204 agent_name: Option<&str>,
205 model_flag: Option<&str>,
206 ) -> Self {
207 let stderr_lower = stderr.to_lowercase();
208
209 if let Some(err) = Self::check_api_errors(&stderr_lower) {
212 return err;
213 }
214
215 if let Some(err) = Self::check_network_errors(&stderr_lower) {
216 return err;
217 }
218
219 if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
220 return err;
221 }
222
223 if let Some(err) = Self::check_tool_failures(&stderr_lower) {
224 return err;
225 }
226
227 let is_problematic_agent =
231 agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
232
233 if is_problematic_agent && exit_code == 1 {
234 return Self::AgentSpecificQuirk;
237 }
238
239 if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
240 return err;
241 }
242
243 if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
244 return err;
245 }
246
247 if exit_code == 1 && stderr_lower.contains("error") {
250 return Self::Transient;
253 }
254
255 Self::Permanent
256 }
257
258 fn check_api_errors(stderr_lower: &str) -> Option<Self> {
260 if stderr_lower.contains("rate limit")
262 || stderr_lower.contains("too many requests")
263 || stderr_lower.contains("429")
264 || stderr_lower.contains("quota exceeded")
265 {
266 return Some(Self::RateLimited);
267 }
268
269 if stderr_lower.contains("token")
272 || stderr_lower.contains("context length")
273 || stderr_lower.contains("maximum context")
274 || stderr_lower.contains("too long")
275 || stderr_lower.contains("input too large")
276 {
277 return Some(Self::TokenExhausted);
278 }
279
280 if stderr_lower.contains("unauthorized")
282 || stderr_lower.contains("authentication")
283 || stderr_lower.contains("401")
284 || stderr_lower.contains("api key")
285 || stderr_lower.contains("invalid token")
286 || stderr_lower.contains("forbidden")
287 || stderr_lower.contains("403")
288 || stderr_lower.contains("access denied")
289 {
290 return Some(Self::AuthFailure);
291 }
292
293 None
294 }
295
296 fn check_network_errors(stderr_lower: &str) -> Option<Self> {
298 if stderr_lower.contains("connection refused")
300 || stderr_lower.contains("network unreachable")
301 || stderr_lower.contains("dns resolution")
302 || stderr_lower.contains("name resolution")
303 || stderr_lower.contains("no route to host")
304 || stderr_lower.contains("network is down")
305 || stderr_lower.contains("host unreachable")
306 || stderr_lower.contains("connection reset")
307 || stderr_lower.contains("broken pipe")
308 || stderr_lower.contains("econnrefused")
309 || stderr_lower.contains("enetunreach")
310 {
311 return Some(Self::NetworkError);
312 }
313
314 if stderr_lower.contains("service unavailable")
316 || stderr_lower.contains("503")
317 || stderr_lower.contains("502")
318 || stderr_lower.contains("504")
319 || stderr_lower.contains("500")
320 || stderr_lower.contains("internal server error")
321 || stderr_lower.contains("bad gateway")
322 || stderr_lower.contains("gateway timeout")
323 || stderr_lower.contains("overloaded")
324 || stderr_lower.contains("maintenance")
325 {
326 return Some(Self::ApiUnavailable);
327 }
328
329 if stderr_lower.contains("timeout")
331 || stderr_lower.contains("timed out")
332 || stderr_lower.contains("request timeout")
333 || stderr_lower.contains("deadline exceeded")
334 {
335 return Some(Self::Timeout);
336 }
337
338 None
339 }
340
341 fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
343 if stderr_lower.contains("no space left")
345 || stderr_lower.contains("disk full")
346 || stderr_lower.contains("enospc")
347 || stderr_lower.contains("out of disk")
348 || stderr_lower.contains("insufficient storage")
349 {
350 return Some(Self::DiskFull);
351 }
352
353 if exit_code == 137
356 || exit_code == 139
357 || exit_code == -9
358 || stderr_lower.contains("killed")
359 || stderr_lower.contains("oom")
360 || stderr_lower.contains("out of memory")
361 || stderr_lower.contains("memory exhausted")
362 || stderr_lower.contains("cannot allocate")
363 || stderr_lower.contains("segmentation fault")
364 || stderr_lower.contains("sigsegv")
365 || stderr_lower.contains("sigkill")
366 {
367 return Some(Self::ProcessKilled);
368 }
369
370 None
371 }
372
373 fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
375 if stderr_lower.contains("invalid json")
377 || stderr_lower.contains("json parse")
378 || stderr_lower.contains("unexpected token")
379 || stderr_lower.contains("malformed")
380 || stderr_lower.contains("truncated response")
381 || stderr_lower.contains("incomplete response")
382 {
383 return Some(Self::InvalidResponse);
384 }
385
386 if stderr_lower.contains("write error")
389 || stderr_lower.contains("cannot write")
390 || stderr_lower.contains("failed to write")
391 || stderr_lower.contains("unable to create file")
392 || stderr_lower.contains("file creation failed")
393 || stderr_lower.contains("i/o error")
394 || stderr_lower.contains("io error")
395 || stderr_lower.contains("tool failed")
396 || stderr_lower.contains("tool execution failed")
397 || stderr_lower.contains("tool call failed")
398 {
399 return Some(Self::ToolExecutionFailed);
400 }
401
402 if stderr_lower.contains("permission denied")
407 || stderr_lower.contains("operation not permitted")
408 || stderr_lower.contains("insufficient permissions")
409 || stderr_lower.contains("eacces")
410 || stderr_lower.contains("eperm")
411 {
412 return Some(Self::ToolExecutionFailed);
413 }
414
415 None
416 }
417
418 fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
420 if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
424 if exit_code == 1 {
426 return Some(Self::AgentSpecificQuirk);
427 }
428 if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
430 return Some(Self::AgentSpecificQuirk);
431 }
432 if stderr_lower.contains("glm")
434 && (stderr_lower.contains("permission")
435 || stderr_lower.contains("denied")
436 || stderr_lower.contains("unauthorized"))
437 {
438 return Some(Self::AgentSpecificQuirk);
439 }
440 }
441
442 if stderr_lower.contains("glm") && exit_code == 1 {
444 return Some(Self::AgentSpecificQuirk);
445 }
446
447 None
448 }
449
450 fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
452 if exit_code == 127
455 || exit_code == 126
456 || stderr_lower.contains("command not found")
457 || stderr_lower.contains("not found")
458 || stderr_lower.contains("no such file")
459 {
460 return Some(Self::CommandNotFound);
461 }
462
463 None
464 }
465}
466
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies an exit code / stderr pair with no agent or model hint.
    fn classify(exit_code: i32, stderr: &str) -> AgentErrorKind {
        AgentErrorKind::classify_with_agent(exit_code, stderr, None, None)
    }

    #[test]
    fn test_agent_error_kind_should_retry() {
        let retryable = [
            AgentErrorKind::RateLimited,
            AgentErrorKind::ApiUnavailable,
            AgentErrorKind::NetworkError,
            AgentErrorKind::Timeout,
            AgentErrorKind::InvalidResponse,
            AgentErrorKind::Transient,
        ];
        for kind in retryable {
            assert!(kind.should_retry(), "{:?} should be retryable", kind);
        }

        let not_retryable = [
            AgentErrorKind::AuthFailure,
            AgentErrorKind::CommandNotFound,
            AgentErrorKind::Permanent,
        ];
        for kind in not_retryable {
            assert!(!kind.should_retry(), "{:?} should not be retryable", kind);
        }
    }

    #[test]
    fn test_agent_error_kind_should_fallback() {
        let fallback = [
            AgentErrorKind::TokenExhausted,
            AgentErrorKind::AuthFailure,
            AgentErrorKind::CommandNotFound,
            AgentErrorKind::ProcessKilled,
            AgentErrorKind::ToolExecutionFailed,
            AgentErrorKind::AgentSpecificQuirk,
        ];
        for kind in fallback {
            assert!(kind.should_fallback(), "{:?} should fall back", kind);
        }

        let no_fallback = [AgentErrorKind::RateLimited, AgentErrorKind::Permanent];
        for kind in no_fallback {
            assert!(!kind.should_fallback(), "{:?} should not fall back", kind);
        }
    }

    #[test]
    fn test_agent_error_kind_is_unrecoverable() {
        for kind in [AgentErrorKind::DiskFull, AgentErrorKind::Permanent] {
            assert!(kind.is_unrecoverable(), "{:?} should be unrecoverable", kind);
        }

        for kind in [AgentErrorKind::RateLimited, AgentErrorKind::AuthFailure] {
            assert!(!kind.is_unrecoverable(), "{:?} should be recoverable", kind);
        }
    }

    #[test]
    fn test_agent_error_kind_classify() {
        // (exit code, stderr, expected kind) with no agent/model hint.
        let cases = [
            // Rate limiting
            (1, "rate limit exceeded", AgentErrorKind::RateLimited),
            (1, "error 429", AgentErrorKind::RateLimited),
            // Authentication
            (1, "unauthorized", AgentErrorKind::AuthFailure),
            (1, "error 401", AgentErrorKind::AuthFailure),
            // Command not found
            (127, "", AgentErrorKind::CommandNotFound),
            (1, "command not found", AgentErrorKind::CommandNotFound),
            // Killed process
            (137, "", AgentErrorKind::ProcessKilled),
            (1, "out of memory", AgentErrorKind::ProcessKilled),
            // Tool / write failures
            (1, "write error", AgentErrorKind::ToolExecutionFailed),
            (1, "tool failed", AgentErrorKind::ToolExecutionFailed),
            (1, "failed to write", AgentErrorKind::ToolExecutionFailed),
            // Permission failures
            (1, "permission denied", AgentErrorKind::ToolExecutionFailed),
            (1, "operation not permitted", AgentErrorKind::ToolExecutionFailed),
            (1, "insufficient permissions", AgentErrorKind::ToolExecutionFailed),
            // "access denied" is treated as an auth failure
            (1, "access denied", AgentErrorKind::AuthFailure),
            // Agent-specific markers in stderr
            (1, "glm error", AgentErrorKind::AgentSpecificQuirk),
            (1, "ccs glm failed", AgentErrorKind::AgentSpecificQuirk),
            // Generic error with exit code 1
            (1, "some random error", AgentErrorKind::Transient),
        ];

        for (exit_code, stderr, expected) in cases {
            assert_eq!(
                classify(exit_code, stderr),
                expected,
                "exit_code={}, stderr={:?}",
                exit_code,
                stderr
            );
        }

        // The same generic error becomes a quirk once the agent name matches.
        let with_agent =
            AgentErrorKind::classify_with_agent(1, "some random error", Some("ccs/glm"), None);
        assert_eq!(with_agent, AgentErrorKind::AgentSpecificQuirk);
    }

    #[test]
    fn test_agent_error_kind_description_and_advice() {
        let kind = AgentErrorKind::RateLimited;
        assert!(!kind.description().is_empty());
        assert!(!kind.recovery_advice().is_empty());
    }

    #[test]
    fn test_agent_error_kind_suggested_wait_ms() {
        let expectations = [
            (AgentErrorKind::RateLimited, 5000),
            (AgentErrorKind::Permanent, 0),
        ];
        for (kind, wait_ms) in expectations {
            assert_eq!(kind.suggested_wait_ms(), wait_ms);
        }
    }

    #[test]
    fn test_agent_error_kind_suggests_smaller_context() {
        for kind in [AgentErrorKind::TokenExhausted, AgentErrorKind::ProcessKilled] {
            assert!(kind.suggests_smaller_context());
        }
        assert!(!AgentErrorKind::RateLimited.suggests_smaller_context());
    }
}