ralph_workflow/agents/error/
kind.rs1use super::glm_detection::is_glm_like_agent;
2
/// Category assigned to a failed agent invocation.
///
/// The category drives the recovery policy exposed by the inherent methods on
/// this type (`should_retry`, `should_fallback`, `is_unrecoverable`, ...).
/// The enum is a plain fieldless value type: cheap to copy, comparable, and
/// hashable so it can be used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AgentErrorKind {
    /// API rate limit exceeded.
    RateLimited,
    /// Token/context limit exceeded.
    TokenExhausted,
    /// API service temporarily unavailable.
    ApiUnavailable,
    /// Network connectivity issue.
    NetworkError,
    /// Authentication failure.
    AuthFailure,
    /// The agent command could not be found.
    CommandNotFound,
    /// Disk space exhausted.
    DiskFull,
    /// Process terminated (possibly by the OOM killer).
    ProcessKilled,
    /// The agent produced an invalid response.
    InvalidResponse,
    /// The request timed out.
    Timeout,
    /// A tool execution failed (e.g., a file write).
    ToolExecutionFailed,
    /// Known agent-specific issue; handled by switching agents.
    AgentSpecificQuirk,
    /// Agent-specific issue that may be transient; handled by retrying.
    RetryableAgentQuirk,
    /// Transient error.
    Transient,
    /// Permanent error.
    Permanent,
}
42
43impl AgentErrorKind {
44 pub const fn should_retry(self) -> bool {
49 matches!(
50 self,
51 Self::ApiUnavailable
52 | Self::NetworkError
53 | Self::Timeout
54 | Self::InvalidResponse
55 | Self::RetryableAgentQuirk
56 | Self::Transient
57 )
58 }
59
60 pub const fn should_immediate_agent_fallback(self) -> bool {
66 matches!(self, Self::RateLimited)
67 }
68
69 pub const fn should_fallback(self) -> bool {
71 matches!(
72 self,
73 Self::TokenExhausted
74 | Self::AuthFailure
75 | Self::CommandNotFound
76 | Self::ProcessKilled
77 | Self::ToolExecutionFailed
78 | Self::AgentSpecificQuirk
79 )
80 }
81
82 pub const fn is_unrecoverable(self) -> bool {
84 matches!(self, Self::DiskFull | Self::Permanent)
85 }
86
87 pub const fn is_command_not_found(self) -> bool {
89 matches!(self, Self::CommandNotFound)
90 }
91
92 pub const fn is_network_error(self) -> bool {
94 matches!(self, Self::NetworkError | Self::Timeout)
95 }
96
97 pub const fn suggests_smaller_context(self) -> bool {
99 matches!(self, Self::TokenExhausted | Self::ProcessKilled)
100 }
101
102 pub const fn suggested_wait_ms(self) -> u64 {
104 match self {
105 Self::RateLimited => 0,
107 Self::ApiUnavailable => 3000, Self::NetworkError => 2000, Self::Timeout | Self::Transient | Self::RetryableAgentQuirk => 1000, Self::InvalidResponse => 500, _ => 0, }
113 }
114
115 pub const fn description(self) -> &'static str {
117 match self {
118 Self::RateLimited => "API rate limit exceeded",
119 Self::TokenExhausted => "Token/context limit exceeded",
120 Self::ApiUnavailable => "API service temporarily unavailable",
121 Self::NetworkError => "Network connectivity issue",
122 Self::AuthFailure => "Authentication failure",
123 Self::CommandNotFound => "Command not found",
124 Self::DiskFull => "Disk space exhausted",
125 Self::ProcessKilled => "Process terminated (possibly OOM)",
126 Self::InvalidResponse => "Invalid response from agent",
127 Self::Timeout => "Request timed out",
128 Self::ToolExecutionFailed => "Tool execution failed (e.g., file write)",
129 Self::AgentSpecificQuirk => "Known agent-specific issue",
130 Self::RetryableAgentQuirk => "Agent-specific issue (may be transient)",
131 Self::Transient => "Transient error",
132 Self::Permanent => "Permanent error",
133 }
134 }
135
136 pub const fn recovery_advice(self) -> &'static str {
138 match self {
139 Self::RateLimited => {
140 "Switching to next agent immediately. Rate limit indicates provider exhaustion."
141 }
142 Self::TokenExhausted => {
143 "Switching to alternative agent. Tip: Try RALPH_DEVELOPER_CONTEXT=0 or RALPH_REVIEWER_CONTEXT=0"
144 }
145 Self::ApiUnavailable => {
146 "API server issue. Will retry automatically. Tip: Check status page or try different provider."
147 }
148 Self::NetworkError => {
149 "Check your internet connection. Will retry automatically. Tip: Check firewall/VPN settings."
150 }
151 Self::AuthFailure => {
152 "Check API key or run 'agent auth' to authenticate. Tip: Verify credentials for this provider."
153 }
154 Self::CommandNotFound => {
155 "Agent binary not installed. See installation guidance below. Tip: Run 'ralph --list-available-agents'"
156 }
157 Self::DiskFull => "Free up disk space and try again. Tip: Check .agent directory size.",
158 Self::ProcessKilled => {
159 "Process was killed (possible OOM). Trying with smaller context. Tip: Reduce context with RALPH_*_CONTEXT=0"
160 }
161 Self::InvalidResponse => {
162 "Received malformed response. Retrying... Tip: May indicate parser mismatch with this agent."
163 }
164 Self::Timeout => {
165 "Request timed out. Will retry with longer timeout. Tip: Try reducing prompt size or context."
166 }
167 Self::ToolExecutionFailed => {
168 "Tool execution failed (file write/permissions). Switching agent. Tip: Check directory write permissions."
169 }
170 Self::AgentSpecificQuirk => {
171 "Known agent-specific issue. Switching to alternative agent. Tip: See docs/agent-compatibility.md"
172 }
173 Self::RetryableAgentQuirk => {
174 "Agent-specific issue that may be transient. Retrying... Tip: See docs/agent-compatibility.md"
175 }
176 Self::Transient => "Temporary issue. Will retry automatically.",
177 Self::Permanent => {
178 "Unrecoverable error. Check agent logs (.agent/logs/) and see docs/agent-compatibility.md for help."
179 }
180 }
181 }
182
183 pub fn classify_with_agent(
189 exit_code: i32,
190 stderr: &str,
191 agent_name: Option<&str>,
192 model_flag: Option<&str>,
193 ) -> Self {
194 let stderr_lower = stderr.to_lowercase();
195
196 if let Some(err) = Self::check_api_errors(&stderr_lower) {
199 return err;
200 }
201
202 if let Some(err) = Self::check_network_errors(&stderr_lower) {
203 return err;
204 }
205
206 if let Some(err) = Self::check_resource_errors(exit_code, &stderr_lower) {
207 return err;
208 }
209
210 if let Some(err) = Self::check_tool_failures(&stderr_lower) {
211 return err;
212 }
213
214 let is_problematic_agent =
218 agent_name.is_some_and(is_glm_like_agent) || model_flag.is_some_and(is_glm_like_agent);
219
220 if is_problematic_agent && exit_code == 1 {
221 let has_known_problematic_pattern = stderr_lower.contains("permission")
223 || stderr_lower.contains("denied")
224 || stderr_lower.contains("unauthorized")
225 || stderr_lower.contains("auth")
226 || stderr_lower.contains("token")
227 || stderr_lower.contains("limit")
228 || stderr_lower.contains("quota")
229 || stderr_lower.contains("disk")
230 || stderr_lower.contains("space")
231 || (stderr_lower.contains("glm") && stderr_lower.contains("failed"))
233 || (stderr_lower.contains("ccs") && stderr_lower.contains("failed"));
234
235 if has_known_problematic_pattern {
236 return Self::AgentSpecificQuirk;
238 }
239
240 return Self::RetryableAgentQuirk;
242 }
243
244 if let Some(err) = Self::check_agent_specific_quirks(&stderr_lower, exit_code) {
245 return err;
246 }
247
248 if let Some(err) = Self::check_command_not_found(exit_code, &stderr_lower) {
249 return err;
250 }
251
252 if exit_code == 1 && stderr_lower.contains("error") {
254 return Self::Transient;
255 }
256
257 Self::Permanent
258 }
259
260 fn check_api_errors(stderr_lower: &str) -> Option<Self> {
261 if stderr_lower.contains("rate limit")
263 || stderr_lower.contains("too many requests")
264 || stderr_lower.contains("429")
265 || stderr_lower.contains("quota exceeded")
266 {
267 return Some(Self::RateLimited);
268 }
269
270 if stderr_lower.contains("unauthorized")
274 || stderr_lower.contains("authentication")
275 || stderr_lower.contains("401")
276 || stderr_lower.contains("api key")
277 || stderr_lower.contains("invalid token")
278 || stderr_lower.contains("forbidden")
279 || stderr_lower.contains("403")
280 || stderr_lower.contains("access denied")
281 || stderr_lower.contains("credential")
282 {
283 return Some(Self::AuthFailure);
284 }
285
286 if stderr_lower.contains("context length")
291 || stderr_lower.contains("maximum context")
292 || stderr_lower.contains("max context")
293 || stderr_lower.contains("context window")
294 || stderr_lower.contains("maximum tokens")
295 || stderr_lower.contains("max tokens")
296 || stderr_lower.contains("too many tokens")
297 || stderr_lower.contains("token limit")
298 || stderr_lower.contains("context_length_exceeded")
299 || stderr_lower.contains("input too large")
300 || stderr_lower.contains("prompt is too long")
301 || (stderr_lower.contains("too long")
302 && !stderr_lower.contains("argument list too long"))
303 {
304 return Some(Self::TokenExhausted);
305 }
306
307 None
308 }
309
310 fn check_network_errors(stderr_lower: &str) -> Option<Self> {
311 if stderr_lower.contains("connection refused")
313 || stderr_lower.contains("network unreachable")
314 || stderr_lower.contains("dns resolution")
315 || stderr_lower.contains("name resolution")
316 || stderr_lower.contains("no route to host")
317 || stderr_lower.contains("network is down")
318 || stderr_lower.contains("host unreachable")
319 || stderr_lower.contains("connection reset")
320 || stderr_lower.contains("broken pipe")
321 || stderr_lower.contains("econnrefused")
322 || stderr_lower.contains("enetunreach")
323 {
324 return Some(Self::NetworkError);
325 }
326
327 if stderr_lower.contains("service unavailable")
329 || stderr_lower.contains("503")
330 || stderr_lower.contains("502")
331 || stderr_lower.contains("504")
332 || stderr_lower.contains("500")
333 || stderr_lower.contains("internal server error")
334 || stderr_lower.contains("bad gateway")
335 || stderr_lower.contains("gateway timeout")
336 || stderr_lower.contains("overloaded")
337 || stderr_lower.contains("maintenance")
338 {
339 return Some(Self::ApiUnavailable);
340 }
341
342 if stderr_lower.contains("timeout")
344 || stderr_lower.contains("timed out")
345 || stderr_lower.contains("request timeout")
346 || stderr_lower.contains("deadline exceeded")
347 {
348 return Some(Self::Timeout);
349 }
350
351 None
352 }
353
354 fn check_resource_errors(exit_code: i32, stderr_lower: &str) -> Option<Self> {
355 if stderr_lower.contains("no space left")
357 || stderr_lower.contains("disk full")
358 || stderr_lower.contains("enospc")
359 || stderr_lower.contains("out of disk")
360 || stderr_lower.contains("insufficient storage")
361 {
362 return Some(Self::DiskFull);
363 }
364
365 if exit_code == 7
368 || stderr_lower.contains("argument list too long")
369 || stderr_lower.contains("e2big")
370 {
371 return Some(Self::ToolExecutionFailed);
372 }
373
374 if exit_code == 137
377 || exit_code == 139
378 || exit_code == -9
379 || stderr_lower.contains("killed")
380 || stderr_lower.contains("oom")
381 || stderr_lower.contains("out of memory")
382 || stderr_lower.contains("memory exhausted")
383 || stderr_lower.contains("cannot allocate")
384 || stderr_lower.contains("segmentation fault")
385 || stderr_lower.contains("sigsegv")
386 || stderr_lower.contains("sigkill")
387 {
388 return Some(Self::ProcessKilled);
389 }
390
391 None
392 }
393
394 fn check_tool_failures(stderr_lower: &str) -> Option<Self> {
395 if stderr_lower.contains("invalid json")
397 || stderr_lower.contains("json parse")
398 || stderr_lower.contains("unexpected token")
399 || stderr_lower.contains("malformed")
400 || stderr_lower.contains("truncated response")
401 || stderr_lower.contains("incomplete response")
402 {
403 return Some(Self::InvalidResponse);
404 }
405
406 if stderr_lower.contains("write error")
408 || stderr_lower.contains("cannot write")
409 || stderr_lower.contains("failed to write")
410 || stderr_lower.contains("unable to create file")
411 || stderr_lower.contains("file creation failed")
412 || stderr_lower.contains("i/o error")
413 || stderr_lower.contains("io error")
414 || stderr_lower.contains("tool failed")
415 || stderr_lower.contains("tool execution failed")
416 || stderr_lower.contains("tool call failed")
417 {
418 return Some(Self::ToolExecutionFailed);
419 }
420
421 if stderr_lower.contains("permission denied")
423 || stderr_lower.contains("operation not permitted")
424 || stderr_lower.contains("insufficient permissions")
425 || stderr_lower.contains("eacces")
426 || stderr_lower.contains("eperm")
427 {
428 return Some(Self::ToolExecutionFailed);
429 }
430
431 None
432 }
433
434 fn check_agent_specific_quirks(stderr_lower: &str, exit_code: i32) -> Option<Self> {
435 if stderr_lower.contains("ccs") || stderr_lower.contains("glm") {
437 if exit_code == 1 {
439 return Some(Self::AgentSpecificQuirk);
440 }
441 if stderr_lower.contains("ccs") && stderr_lower.contains("failed") {
443 return Some(Self::AgentSpecificQuirk);
444 }
445 if stderr_lower.contains("glm")
447 && (stderr_lower.contains("permission")
448 || stderr_lower.contains("denied")
449 || stderr_lower.contains("unauthorized"))
450 {
451 return Some(Self::AgentSpecificQuirk);
452 }
453 }
454
455 if stderr_lower.contains("glm") && exit_code == 1 {
457 return Some(Self::AgentSpecificQuirk);
458 }
459
460 None
461 }
462
463 fn check_command_not_found(exit_code: i32, stderr_lower: &str) -> Option<Self> {
464 if exit_code == 127
467 || exit_code == 126
468 || stderr_lower.contains("command not found")
469 || stderr_lower.contains("not found")
470 || stderr_lower.contains("no such file")
471 {
472 return Some(Self::CommandNotFound);
473 }
474
475 None
476 }
477}