// ralph_workflow/agents/error/kind.rs
use super::glm_detection::is_glm_like_agent;

/// Category assigned to a failed agent invocation.
///
/// Each variant is a plain tag with no payload. The inherent methods on this type
/// map a tag to a handling policy (retry, switch agent, abort) and to user-facing
/// text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AgentErrorKind {
    /// The API rate limit was exceeded.
    RateLimited,
    /// The token/context limit was exceeded.
    TokenExhausted,
    /// The API service is temporarily unavailable.
    ApiUnavailable,
    /// A network connectivity problem occurred.
    NetworkError,
    /// Authentication with the provider failed.
    AuthFailure,
    /// The agent command could not be found.
    CommandNotFound,
    /// Disk space is exhausted.
    DiskFull,
    /// The process was terminated (possibly by the OOM killer).
    ProcessKilled,
    /// The agent produced an invalid response.
    InvalidResponse,
    /// The request timed out.
    Timeout,
    /// A tool execution failed (e.g., a file write).
    ToolExecutionFailed,
    /// A known agent-specific issue.
    AgentSpecificQuirk,
    /// An agent-specific issue that may be transient.
    RetryableAgentQuirk,
    /// A transient error.
    Transient,
    /// A permanent error.
    Permanent,
}
42
43impl AgentErrorKind {
44 #[must_use]
49 pub const fn should_retry(self) -> bool {
50 matches!(
51 self,
52 Self::ApiUnavailable
53 | Self::NetworkError
54 | Self::Timeout
55 | Self::InvalidResponse
56 | Self::RetryableAgentQuirk
57 | Self::Transient
58 )
59 }
60
61 #[must_use]
67 pub const fn should_immediate_agent_fallback(self) -> bool {
68 matches!(self, Self::RateLimited)
69 }
70
71 #[must_use]
73 pub const fn should_fallback(self) -> bool {
74 matches!(
75 self,
76 Self::TokenExhausted
77 | Self::AuthFailure
78 | Self::CommandNotFound
79 | Self::ProcessKilled
80 | Self::ToolExecutionFailed
81 | Self::AgentSpecificQuirk
82 )
83 }
84
85 #[must_use]
87 pub const fn is_unrecoverable(self) -> bool {
88 matches!(self, Self::DiskFull | Self::Permanent)
89 }
90
91 #[must_use]
93 pub const fn is_command_not_found(self) -> bool {
94 matches!(self, Self::CommandNotFound)
95 }
96
97 #[must_use]
99 pub const fn is_network_error(self) -> bool {
100 matches!(self, Self::NetworkError | Self::Timeout)
101 }
102
103 #[must_use]
105 pub const fn suggests_smaller_context(self) -> bool {
106 matches!(self, Self::TokenExhausted | Self::ProcessKilled)
107 }
108
109 #[must_use]
111 pub const fn suggested_wait_ms(self) -> u64 {
112 match self {
113 Self::ApiUnavailable => 3000, Self::NetworkError => 2000, Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, Self::InvalidResponse => 500, Self::RateLimited
119 | Self::TokenExhausted
120 | Self::AuthFailure
121 | Self::CommandNotFound
122 | Self::DiskFull
123 | Self::ProcessKilled
124 | Self::ToolExecutionFailed
125 | Self::AgentSpecificQuirk
126 | Self::Permanent => 0,
127 }
128 }
129
130 #[must_use]
132 pub const fn description(self) -> &'static str {
133 match self {
134 Self::RateLimited => "API rate limit exceeded",
135 Self::TokenExhausted => "Token/context limit exceeded",
136 Self::ApiUnavailable => "API service temporarily unavailable",
137 Self::NetworkError => "Network connectivity issue",
138 Self::AuthFailure => "Authentication failure",
139 Self::CommandNotFound => "Command not found",
140 Self::DiskFull => "Disk space exhausted",
141 Self::ProcessKilled => "Process terminated (possibly OOM)",
142 Self::InvalidResponse => "Invalid response from agent",
143 Self::Timeout => "Request timed out",
144 Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
145 Self::AgentSpecificQuirk => "Known agent-specific issue",
146 Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
147 Self::Transient => "Transient error",
148 Self::Permanent => "Permanent error",
149 }
150 }
151
152 #[must_use]
154 pub const fn recovery_advice(self) -> &'static str {
155 match self {
156 Self::RateLimited => {
157 "Switching to next agent immediately. Rate limit indicates provider exhaustion."
158 }
159 Self::TokenExhausted => {
160 "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
161 }
162 Self::ApiUnavailable => {
163 "API server issue. Will retry automatically. Tip: Check status page or try different provider."
164 }
165 Self::NetworkError => {
166 "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
167 }
168 Self::AuthFailure => {
169 "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
170 }
171 Self::CommandNotFound => {
172 "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
173 }
174 Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
175 Self::ProcessKilled => {
176 "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
177 }
178 Self::InvalidResponse => {
179 "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
180 }
181 Self::Timeout => {
182 "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
183 }
184 Self::ToolExecutionFailed => {
185 "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
186 }
187 Self::AgentSpecificQuirk => {
188 "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
189 }
190 Self::RetryableAgentQuirk => {
191 "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
192 }
193 Self::Transient => "Temporary issue. Will retry automatically.",
194 Self::Permanent => {
195 "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
196 }
197 }
198 }
199
200 pub fn classify_with_agent(
206 exit_code: i32,
207 stderr: &str,
208 agent_name: Option<&str>,
209 model_flag: Option<&str>,
210 ) -> Self {
211 let stderr_lower = stderr.to_lowercase();
212
213 if let Some(err) = Self::check_api_errors(&stderr_lower) {
216 return err;
217 }
218
219 if let Some(err) = Self::check_network_errors(&stderr_lower) {
220 return err;
221 }
222
223 if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
224 return err;
225 }
226
227 if let Some(err) = Self::check_tool_failures(&stderr_lower) {
228 return err;
229 }
230
231 let is_problematic_agent =
235 agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
236
237 if is_problematic_agent && exit_code == 1 {
238 let has_known_problematic_pattern = stderr_lower.contains("permission")
240 || stderr_lower.contains("denied")
241 || stderr_lower.contains("unauthorized")
242 || stderr_lower.contains("auth")
243 || stderr_lower.contains("token")
244 || stderr_lower.contains("limit")
245 || stderr_lower.contains("quota")
246 || stderr_lower.contains("disk")
247 || stderr_lower.contains("space")
248 || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
250 || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"));
251
252 if has_known_problematic_pattern {
253 return Self::AgentSpecificQuirk;
255 }
256
257 return Self::RetryableAgentQuirk;
259 }
260
261 if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
262 return err;
263 }
264
265 if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
266 return err;
267 }
268
269 if exit_code == 1 && stderr_lower.contains("error") {
271 return Self::Transient;
272 }
273
274 Self::Permanent
275 }
276
277 fn check_api_errors(stderr_lower: &str) -> Option<Self> {
278 if stderr_lower.contains("rate limit")
280 || stderr_lower.contains("too many requests")
281 || stderr_lower.contains("429")
282 || stderr_lower.contains("quota exceeded")
283 {
284 return Some(Self::RateLimited);
285 }
286
287 if stderr_lower.contains("unauthorized")
291 || stderr_lower.contains("authentication")
292 || stderr_lower.contains("401")
293 || stderr_lower.contains("api key")
294 || stderr_lower.contains("invalid token")
295 || stderr_lower.contains("forbidden")
296 || stderr_lower.contains("403")
297 || stderr_lower.contains("access denied")
298 || stderr_lower.contains("credential")
299 {
300 return Some(Self::AuthFailure);
301 }
302
303 if stderr_lower.contains("context length")
308 || stderr_lower.contains("maximum context")
309 || stderr_lower.contains("max context")
310 || stderr_lower.contains("context window")
311 || stderr_lower.contains("maximum tokens")
312 || stderr_lower.contains("max tokens")
313 || stderr_lower.contains("too many tokens")
314 || stderr_lower.contains("token limit")
315 || stderr_lower.contains("context_length_exceeded")
316 || stderr_lower.contains("input too large")
317 || stderr_lower.contains("prompt is too long")
318 || (stderr_lower.contains("too long")
319 && !stderr_lower.contains("argument list too long"))
320 {
321 return Some(Self::TokenExhausted);
322 }
323
324 None
325 }
326
327 fn check_network_errors(stderr_lower: &str) -> Option<Self> {
328 if stderr_lower.contains("connection refused")
330 || stderr_lower.contains("network unreachable")
331 || stderr_lower.contains("dns resolution")
332 || stderr_lower.contains("name resolution")
333 || stderr_lower.contains("no route to host")
334 || stderr_lower.contains("network is down")
335 || stderr_lower.contains("host unreachable")
336 || stderr_lower.contains("connection reset")
337 || stderr_lower.contains("broken pipe")
338 || stderr_lower.contains("econnrefused")
339 || stderr_lower.contains("enetunreach")
340 {
341 return Some(Self::NetworkError);
342 }
343
344 if stderr_lower.contains("service unavailable")
346 || stderr_lower.contains("503")
347 || stderr_lower.contains("502")
348 || stderr_lower.contains("504")
349 || stderr_lower.contains("500")
350 || stderr_lower.contains("internal server error")
351 || stderr_lower.contains("bad gateway")
352 || stderr_lower.contains("gateway timeout")
353 || stderr_lower.contains("overloaded")
354 || stderr_lower.contains("maintenance")
355 {
356 return Some(Self::ApiUnavailable);
357 }
358
359 if stderr_lower.contains("timeout")
361 || stderr_lower.contains("timed out")
362 || stderr_lower.contains("request timeout")
363 || stderr_lower.contains("deadline exceeded")
364 {
365 return Some(Self::Timeout);
366 }
367
368 None
369 }
370
371 fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
372 if stderr_lower.contains("no space left")
374 || stderr_lower.contains("disk full")
375 || stderr_lower.contains("enospc")
376 || stderr_lower.contains("out of disk")
377 || stderr_lower.contains("insufficient storage")
378 {
379 return Some(Self::DiskFull);
380 }
381
382 if exit_code == 7
385 || stderr_lower.contains("argument list too long")
386 || stderr_lower.contains("e2big")
387 {
388 return Some(Self::ToolExecutionFailed);
389 }
390
391 if exit_code == 137
394 || exit_code == 139
395 || exit_code == -9
396 || stderr_lower.contains("killed")
397 || stderr_lower.contains("oom")
398 || stderr_lower.contains("out of memory")
399 || stderr_lower.contains("memory exhausted")
400 || stderr_lower.contains("cannot allocate")
401 || stderr_lower.contains("segmentation fault")
402 || stderr_lower.contains("sigsegv")
403 || stderr_lower.contains("sigkill")
404 {
405 return Some(Self::ProcessKilled);
406 }
407
408 None
409 }
410
411 fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
412 if stderr_lower.contains("invalid json")
414 || stderr_lower.contains("json parse")
415 || stderr_lower.contains("unexpected token")
416 || stderr_lower.contains("malformed")
417 || stderr_lower.contains("truncated response")
418 || stderr_lower.contains("incomplete response")
419 {
420 return Some(Self::InvalidResponse);
421 }
422
423 if stderr_lower.contains("write error")
425 || stderr_lower.contains("cannot write")
426 || stderr_lower.contains("failed to write")
427 || stderr_lower.contains("unable to create file")
428 || stderr_lower.contains("file creation failed")
429 || stderr_lower.contains("i/o error")
430 || stderr_lower.contains("io error")
431 || stderr_lower.contains("tool failed")
432 || stderr_lower.contains("tool execution failed")
433 || stderr_lower.contains("tool call failed")
434 {
435 return Some(Self::ToolExecutionFailed);
436 }
437
438 if stderr_lower.contains("permission denied")
440 || stderr_lower.contains("operation not permitted")
441 || stderr_lower.contains("insufficient permissions")
442 || stderr_lower.contains("eacces")
443 || stderr_lower.contains("eperm")
444 {
445 return Some(Self::ToolExecutionFailed);
446 }
447
448 None
449 }
450
451 fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
452 if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
454 if exit_code == 1 {
456 return Some(Self::AgentSpecificQuirk);
457 }
458 if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
460 return Some(Self::AgentSpecificQuirk);
461 }
462 if stderr_lower.contains("glm")
464 && (stderr_lower.contains("permission")
465 || stderr_lower.contains("denied")
466 || stderr_lower.contains("unauthorized"))
467 {
468 return Some(Self::AgentSpecificQuirk);
469 }
470 }
471
472 if stderr_lower.contains("glm") && exit_code == 1 {
474 return Some(Self::AgentSpecificQuirk);
475 }
476
477 None
478 }
479
480 fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
481 if exit_code == 127
484 || exit_code == 126
485 || stderr_lower.contains("command not found")
486 || stderr_lower.contains("not found")
487 || stderr_lower.contains("no such file")
488 {
489 return Some(Self::CommandNotFound);
490 }
491
492 None
493 }
494}