1use crate::hooks::{HookId, HookPoint, HookReasonCode};
4use crate::tool_catalog::ToolUnavailableReason;
5use crate::types::SessionId;
6use serde::{Deserialize, Serialize};
7
/// Typed reason attached to a failed LLM call (carried by `AgentError::Llm`).
///
/// `AgentError::is_recoverable` treats `RateLimited`, `NetworkTimeout` and
/// `CallTimeout` as recoverable, defers to the embedded [`LlmProviderError`]
/// for `ProviderError`, and reports every other variant as non-recoverable.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum LlmFailureReason {
    /// Rate-limit failure; always classified as recoverable.
    RateLimited {
        /// Optional wait hint, surfaced through `AgentError::retry_after_hint`.
        retry_after: Option<std::time::Duration>,
    },
    /// Context-size failure; classified as non-recoverable.
    // NOTE(review): the unit of `max` / `requested` is not visible in this file
    // (presumably tokens) — confirm against the producers of this variant.
    ContextExceeded {
        /// Limit that was exceeded.
        max: u32,
        /// Amount that was requested.
        requested: u32,
    },
    /// Authentication failure; classified as non-recoverable.
    AuthError,
    /// Model-selection failure; classified as non-recoverable.
    // NOTE(review): the `String` presumably names the rejected model — confirm.
    InvalidModel(String),
    /// Provider-reported failure; recoverability comes from the payload's
    /// typed `retryability` field.
    ProviderError(LlmProviderError),
    /// Timeout variant kept distinct from `CallTimeout`; classified as recoverable.
    // NOTE(review): the exact boundary between "network" and "call" timeouts is
    // decided by the code that constructs these variants, not by this file.
    NetworkTimeout {
        /// Duration associated with the timeout, in milliseconds.
        duration_ms: u64,
    },
    /// Timeout variant kept distinct from `NetworkTimeout`; classified as recoverable.
    CallTimeout {
        /// Duration associated with the timeout, in milliseconds.
        duration_ms: u64,
    },
}
30
/// Coarse category of a provider-side LLM error.
///
/// Serialized with serde's `snake_case` renaming (for example
/// `ServerOverloaded` is written as `"server_overloaded"`). A JSON schema is
/// derived when the `schema` feature is enabled.
///
/// The kind does not by itself decide retryability; that is carried separately
/// in `LlmProviderError::retryability`.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlmProviderErrorKind {
    /// Serialized as `"invalid_request"`.
    InvalidRequest,
    /// Serialized as `"content_filtered"`.
    ContentFiltered,
    /// Serialized as `"server_error"`.
    ServerError,
    /// Serialized as `"server_overloaded"`.
    ServerOverloaded,
    /// Serialized as `"connection_reset"`.
    ConnectionReset,
    /// Serialized as `"unknown"`.
    Unknown,
    /// Serialized as `"stream_parse_error"`.
    StreamParseError,
    /// Serialized as `"incomplete_response"`. Used by
    /// `AgentError::llm_empty_response` for completions that carried no
    /// user-visible text, images, or tool calls.
    IncompleteResponse,
}
44
/// Explicit retry classification attached to an [`LlmProviderError`].
///
/// Serialized with serde's `snake_case` renaming (`"retryable"` /
/// `"non_retryable"`). A JSON schema is derived when the `schema` feature is
/// enabled.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlmProviderErrorRetryability {
    /// The failure may be retried; `is_retryable` returns `true`.
    Retryable,
    /// The failure must not be retried; `is_retryable` returns `false`.
    NonRetryable,
}
52
53impl LlmProviderErrorRetryability {
54 pub fn is_retryable(self) -> bool {
55 matches!(self, Self::Retryable)
56 }
57}
58
/// Provider-side LLM error with an explicit, typed retry classification.
///
/// Retry decisions read only `retryability` (see
/// `LlmProviderError::is_retryable`); nothing inside `details` is consulted
/// for that decision.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LlmProviderError {
    /// Category of the provider error.
    pub kind: LlmProviderErrorKind,
    /// Whether the failure may be retried.
    pub retryability: LlmProviderErrorRetryability,
    /// Free-form JSON payload. Deserializes to `null` when the field is
    /// absent and is omitted from serialized output when it is `null`.
    #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
    pub details: serde_json::Value,
}
67
68impl LlmProviderError {
69 pub fn new(
70 kind: LlmProviderErrorKind,
71 retryability: LlmProviderErrorRetryability,
72 details: serde_json::Value,
73 ) -> Self {
74 Self {
75 kind,
76 retryability,
77 details,
78 }
79 }
80
81 pub fn retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
82 Self::new(kind, LlmProviderErrorRetryability::Retryable, details)
83 }
84
85 pub fn non_retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
86 Self::new(kind, LlmProviderErrorRetryability::NonRetryable, details)
87 }
88
89 pub fn is_retryable(&self) -> bool {
90 self.retryability.is_retryable()
91 }
92}
93
/// Error describing a tool call that names an unknown tool or carries
/// invalid arguments.
///
/// The display strings are identical to those of `ToolError::NotFound` and
/// `ToolError::InvalidArguments`.
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
pub enum ToolValidationError {
    /// No tool with the given `name` exists.
    #[error("Tool not found: {name}")]
    NotFound { name: String },
    /// The arguments for tool `name` were rejected; `reason` explains why.
    #[error("Invalid arguments for tool '{name}': {reason}")]
    InvalidArguments { name: String, reason: String },
}
104
105impl ToolValidationError {
106 pub fn not_found(name: impl Into<String>) -> Self {
107 Self::NotFound { name: name.into() }
108 }
109 pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
110 Self::InvalidArguments {
111 name: name.into(),
112 reason: reason.into(),
113 }
114 }
115}
116
/// Error produced while resolving or running a tool.
///
/// Every variant maps to a stable string code via `ToolError::error_code` and
/// can be rendered as a JSON payload via `ToolError::to_error_payload`.
#[derive(Debug, Clone, thiserror::Error)]
pub enum ToolError {
    /// No tool with the given `name` exists (code `"tool_not_found"`).
    #[error("Tool not found: {name}")]
    NotFound { name: String },

    /// The tool exists but cannot be used right now (code `"tool_unavailable"`).
    #[error("Tool '{name}' is currently unavailable: {reason}")]
    Unavailable {
        name: String,
        reason: ToolUnavailableReason,
    },

    /// The arguments for tool `name` were rejected (code `"invalid_arguments"`).
    #[error("Invalid arguments for tool '{name}': {reason}")]
    InvalidArguments { name: String, reason: String },

    /// The tool ran and failed (code `"execution_failed"`).
    #[error("Tool execution failed: {message}")]
    ExecutionFailed { message: String },

    /// Same display text and code as `ExecutionFailed`, plus structured
    /// `data` that `to_error_payload` emits under the `"data"` key.
    #[error("Tool execution failed: {message}")]
    ExecutionFailedWithData {
        message: String,
        data: serde_json::Value,
    },

    /// The tool did not finish within `timeout_ms` milliseconds (code `"timeout"`).
    #[error("Tool '{name}' timed out after {timeout_ms}ms")]
    Timeout { name: String, timeout_ms: u64 },

    /// Policy forbids calling the tool (code `"access_denied"`).
    #[error("Tool '{name}' is not allowed by policy")]
    AccessDenied { name: String },

    /// Free-form error; also the target of the `From<String>` and
    /// `From<&str>` conversions (code `"tool_error"`).
    #[error("{0}")]
    Other(String),

    /// A callback is pending for `tool_name`, carrying the call's `args`
    /// (code `"callback_pending"`).
    // NOTE(review): this looks like a control-flow signal rather than a real
    // failure — confirm against the callers that match on it.
    #[error("Callback pending for tool '{tool_name}'")]
    CallbackPending {
        tool_name: String,
        args: serde_json::Value,
    },
}
169
170impl ToolError {
171 pub fn error_code(&self) -> &'static str {
172 match self {
173 Self::NotFound { .. } => "tool_not_found",
174 Self::Unavailable { .. } => "tool_unavailable",
175 Self::InvalidArguments { .. } => "invalid_arguments",
176 Self::ExecutionFailed { .. } | Self::ExecutionFailedWithData { .. } => {
177 "execution_failed"
178 }
179 Self::Timeout { .. } => "timeout",
180 Self::AccessDenied { .. } => "access_denied",
181 Self::Other(_) => "tool_error",
182 Self::CallbackPending { .. } => "callback_pending",
183 }
184 }
185
186 pub fn to_error_payload(&self) -> serde_json::Value {
187 let mut payload = serde_json::json!({
188 "error": self.error_code(),
189 "message": self.to_string(),
190 });
191 if let Some(data) = self.structured_data() {
192 payload["data"] = data;
193 }
194 payload
195 }
196
197 pub fn not_found(name: impl Into<String>) -> Self {
198 Self::NotFound { name: name.into() }
199 }
200 pub fn unavailable(name: impl Into<String>, reason: ToolUnavailableReason) -> Self {
201 Self::Unavailable {
202 name: name.into(),
203 reason,
204 }
205 }
206 pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
207 Self::InvalidArguments {
208 name: name.into(),
209 reason: reason.into(),
210 }
211 }
212 pub fn execution_failed(message: impl Into<String>) -> Self {
213 Self::ExecutionFailed {
214 message: message.into(),
215 }
216 }
217 pub fn execution_failed_with_data(message: impl Into<String>, data: serde_json::Value) -> Self {
218 Self::ExecutionFailedWithData {
219 message: message.into(),
220 data,
221 }
222 }
223 pub fn structured_data(&self) -> Option<serde_json::Value> {
224 match self {
225 Self::ExecutionFailedWithData { data, .. } => Some(data.clone()),
226 _ => None,
227 }
228 }
229 pub fn timeout(name: impl Into<String>, timeout_ms: u64) -> Self {
230 Self::Timeout {
231 name: name.into(),
232 timeout_ms,
233 }
234 }
235 pub fn access_denied(name: impl Into<String>) -> Self {
236 Self::AccessDenied { name: name.into() }
237 }
238 pub fn other(message: impl Into<String>) -> Self {
239 Self::Other(message.into())
240 }
241
242 pub fn callback_pending(tool_name: impl Into<String>, args: serde_json::Value) -> Self {
244 Self::CallbackPending {
245 tool_name: tool_name.into(),
246 args,
247 }
248 }
249
250 pub fn is_callback_pending(&self) -> bool {
252 matches!(self, Self::CallbackPending { .. })
253 }
254
255 pub fn as_callback_pending(&self) -> Option<(&str, &serde_json::Value)> {
257 match self {
258 Self::CallbackPending { tool_name, args } => Some((tool_name, args)),
259 _ => None,
260 }
261 }
262}
263
264impl From<String> for ToolError {
265 fn from(s: String) -> Self {
266 Self::Other(s)
267 }
268}
269impl From<&str> for ToolError {
270 fn from(s: &str) -> Self {
271 Self::Other(s.to_string())
272 }
273}
274
/// Top-level error type for agent runs.
///
/// `AgentError::is_graceful` singles out the budget and turn-limit variants,
/// and `AgentError::is_recoverable` singles out retryable `Llm` failures;
/// all other variants answer `false` to both predicates.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum AgentError {
    /// An LLM call failed. `reason` is the typed classification used by the
    /// recovery helpers; `message` is the display text.
    #[error("LLM error ({provider}): {message}")]
    Llm {
        provider: &'static str,
        reason: LlmFailureReason,
        message: String,
    },
    /// Storage failure; also produced by the `store_error` and
    /// `invalid_session_id` helper functions.
    #[error("Storage error: {0}")]
    StoreError(String),
    /// Tool failure, carried as display text.
    #[error("Tool error: {0}")]
    ToolError(String),
    /// MCP failure, carried as display text.
    #[error("MCP error: {0}")]
    McpError(String),
    /// No session exists for the given id.
    #[error("Session not found: {0}")]
    SessionNotFound(SessionId),
    /// Token budget exhausted; counted as graceful by `is_graceful`.
    #[error("Token budget exceeded: used {used}, limit {limit}")]
    TokenBudgetExceeded { used: u64, limit: u64 },
    /// Time budget exhausted (values in seconds); counted as graceful by `is_graceful`.
    #[error("Time budget exceeded: {elapsed_secs}s > {limit_secs}s")]
    TimeBudgetExceeded { elapsed_secs: u64, limit_secs: u64 },
    /// Tool-call budget exhausted; counted as graceful by `is_graceful`.
    #[error("Tool call budget exceeded: {count} calls > {limit} limit")]
    ToolCallBudgetExceeded { count: usize, limit: usize },
    /// Output hit the token ceiling on `turn`. The display text embeds the
    /// whole `partial` output.
    #[error("Max tokens reached on turn {turn}, partial output: {partial}")]
    MaxTokensReached { turn: u32, partial: String },
    /// Content was filtered on `turn`.
    #[error("Content filtered on turn {turn}")]
    ContentFiltered { turn: u32 },
    /// Turn limit reached; counted as graceful by `is_graceful`.
    #[error("Max turns reached: {turns}")]
    MaxTurnsReached { turns: u32 },
    /// The run was cancelled.
    #[error("Run was cancelled")]
    Cancelled,
    /// A state transition from `from` to `to` was rejected.
    #[error("Invalid state transition: {from} -> {to}")]
    InvalidStateTransition { from: String, to: String },
    /// No operation matches the given identifier.
    #[error("Operation not found: {0}")]
    OperationNotFound(String),
    /// A depth of `depth` exceeded the allowed `max`.
    #[error("Depth limit exceeded: {depth} > {max}")]
    DepthLimitExceeded { depth: u32, max: u32 },
    /// The concurrency limit was exceeded.
    #[error("Concurrency limit exceeded")]
    ConcurrencyLimitExceeded,
    /// Configuration problem, carried as display text.
    #[error("Configuration error: {0}")]
    ConfigError(String),
    /// An access policy references the invalid tool `tool`.
    #[error("Invalid tool in access policy: {tool}")]
    InvalidToolAccess { tool: String },
    /// Internal failure, carried as display text.
    #[error("Internal error: {0}")]
    InternalError(String),

    /// Build-time failure, carried as display text. Neither graceful nor
    /// recoverable.
    #[error("Build error: {0}")]
    BuildError(String),

    /// The connection identified by `binding_key` needs re-authentication.
    #[error("Connection `{binding_key}` requires re-authentication: {message}")]
    AuthReauthRequired {
        binding_key: String,
        message: String,
    },

    /// A callback is pending for `tool_name`; carries the same fields as
    /// `ToolError::CallbackPending`.
    #[error("Callback pending for tool '{tool_name}'")]
    CallbackPending {
        tool_name: String,
        args: serde_json::Value,
    },

    /// Structured output still failed validation after `attempts` attempts.
    /// `last_output` is kept on the value but is not part of the display text.
    #[error("Structured output validation failed after {attempts} attempts: {reason}")]
    StructuredOutputValidationFailed {
        attempts: u32,
        reason: String,
        last_output: String,
    },

    /// The supplied output schema was rejected.
    #[error("Invalid output schema: {0}")]
    InvalidOutputSchema(String),

    /// A hook denied the operation at `point`. `payload` is optional extra
    /// data that is not included in the display text.
    #[error("Hook '{hook_id}' denied at {point:?}: {reason_code:?} - {message}")]
    HookDenied {
        hook_id: HookId,
        point: HookPoint,
        reason_code: HookReasonCode,
        message: String,
        payload: Option<serde_json::Value>,
    },

    /// A hook did not finish within `timeout_ms` milliseconds.
    #[error("Hook '{hook_id}' timed out after {timeout_ms}ms")]
    HookTimeout { hook_id: HookId, timeout_ms: u64 },

    /// A hook failed while executing.
    #[error("Hook execution failed for '{hook_id}': {reason}")]
    HookExecutionFailed { hook_id: HookId, reason: String },

    /// The hook configuration was rejected.
    #[error("Hook configuration invalid: {reason}")]
    HookConfigInvalid { reason: String },

    /// Terminal turn failure carrying the typed outcome and cause kind; both
    /// are rendered with their `Debug` form in the display text.
    #[error("Terminal failure: {outcome:?} ({cause_kind:?}): {message}")]
    TerminalFailure {
        outcome: crate::turn_execution_authority::TurnTerminalOutcome,
        cause_kind: crate::turn_execution_authority::TurnTerminalCauseKind,
        message: String,
    },

    /// A resume was requested but there is no pending boundary to resume from.
    #[error("no pending boundary for resume")]
    NoPendingBoundary,
}
391
392impl AgentError {
393 pub fn llm(
394 provider: &'static str,
395 reason: LlmFailureReason,
396 message: impl Into<String>,
397 ) -> Self {
398 Self::Llm {
399 provider,
400 reason,
401 message: message.into(),
402 }
403 }
404
405 pub fn llm_empty_response(provider: &'static str) -> Self {
406 Self::llm(
407 provider,
408 LlmFailureReason::ProviderError(LlmProviderError::retryable(
409 LlmProviderErrorKind::IncompleteResponse,
410 serde_json::json!({
411 "reason": "provider completed without user-visible text, images, or tool calls"
412 }),
413 )),
414 "LLM completed without user-visible text, images, or tool calls",
415 )
416 }
417
418 pub fn is_graceful(&self) -> bool {
419 matches!(
420 self,
421 Self::TokenBudgetExceeded { .. }
422 | Self::TimeBudgetExceeded { .. }
423 | Self::ToolCallBudgetExceeded { .. }
424 | Self::MaxTurnsReached { .. }
425 )
426 }
427 pub fn is_rate_limited(&self) -> bool {
428 matches!(
429 self,
430 Self::Llm {
431 reason: LlmFailureReason::RateLimited { .. },
432 ..
433 }
434 )
435 }
436
437 pub fn retry_after_hint(&self) -> Option<std::time::Duration> {
438 match self {
439 Self::Llm {
440 reason: LlmFailureReason::RateLimited { retry_after },
441 ..
442 } => *retry_after,
443 _ => None,
444 }
445 }
446
447 pub fn is_recoverable(&self) -> bool {
448 match self {
449 Self::Llm { reason, .. } => match reason {
450 LlmFailureReason::RateLimited { .. } => true,
451 LlmFailureReason::NetworkTimeout { .. } => true,
452 LlmFailureReason::CallTimeout { .. } => true,
453 LlmFailureReason::ProviderError(provider_error) => provider_error.is_retryable(),
454 _ => false,
455 },
456 _ => false,
457 }
458 }
459}
460
461pub fn store_error(err: impl std::fmt::Display) -> AgentError {
462 AgentError::StoreError(store_error_message(err))
463}
464pub fn invalid_session_id(err: impl std::fmt::Display) -> AgentError {
465 AgentError::StoreError(invalid_session_id_message(err))
466}
/// Returns the display text of `err` unchanged.
pub fn store_error_message(err: impl std::fmt::Display) -> String {
    ToString::to_string(&err)
}
/// Returns the display text of `err` prefixed with `"Invalid session ID: "`.
pub fn invalid_session_id_message(err: impl std::fmt::Display) -> String {
    let mut message = String::from("Invalid session ID: ");
    message.push_str(&err.to_string());
    message
}
473
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

    /// Provider label used by every LLM error fixture in this module.
    const PROVIDER: &str = "anthropic";

    /// Builds an `AgentError::Llm` fixture for `PROVIDER`.
    fn llm_error(reason: LlmFailureReason, message: &str) -> AgentError {
        AgentError::llm(PROVIDER, reason, message)
    }

    /// Builds a rate-limit reason whose hint is `secs` seconds.
    fn rate_limited(secs: u64) -> LlmFailureReason {
        LlmFailureReason::RateLimited {
            retry_after: Some(std::time::Duration::from_secs(secs)),
        }
    }

    // --- recoverability of LLM failures ---

    #[test]
    fn test_network_timeout_is_recoverable() {
        let reason = LlmFailureReason::NetworkTimeout { duration_ms: 30000 };
        assert!(llm_error(reason, "network timeout after 30s").is_recoverable());
    }

    #[test]
    fn test_call_timeout_is_recoverable() {
        let reason = LlmFailureReason::CallTimeout { duration_ms: 45000 };
        assert!(llm_error(reason, "call timeout after 45s").is_recoverable());
    }

    #[test]
    fn test_network_timeout_typed_mapping() {
        let reason = LlmFailureReason::NetworkTimeout { duration_ms: 5000 };
        if let LlmFailureReason::NetworkTimeout { duration_ms } = reason {
            assert_eq!(duration_ms, 5000);
        } else {
            panic!("expected NetworkTimeout");
        }
    }

    #[test]
    fn test_call_timeout_typed_mapping() {
        let reason = LlmFailureReason::CallTimeout { duration_ms: 60000 };
        if let LlmFailureReason::CallTimeout { duration_ms } = reason {
            assert_eq!(duration_ms, 60000);
        } else {
            panic!("expected CallTimeout");
        }
    }

    #[test]
    fn test_timeout_variants_are_distinct() {
        // Same payload, different variant: the two must not compare equal.
        let network = LlmFailureReason::NetworkTimeout { duration_ms: 1000 };
        let call = LlmFailureReason::CallTimeout { duration_ms: 1000 };
        assert_ne!(network, call);
    }

    #[test]
    fn test_auth_error_not_recoverable() {
        let err = llm_error(LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_recoverable());
    }

    #[test]
    fn provider_error_uses_typed_retryability_for_recovery() {
        let details = serde_json::json!({
            "message": "provider overloaded"
        });
        let provider_error =
            LlmProviderError::retryable(LlmProviderErrorKind::ServerOverloaded, details);
        let err = llm_error(
            LlmFailureReason::ProviderError(provider_error),
            "provider overloaded",
        );

        assert!(err.is_recoverable());
    }

    #[test]
    fn provider_error_fails_closed_when_json_claims_retryable() {
        // The JSON details contradict the typed classification on purpose;
        // only the typed field may drive the decision.
        let misleading_details = serde_json::json!({
            "kind": "server_overloaded",
            "retryable": true,
            "message": "json payload must not control retryability"
        });
        let provider_error = LlmProviderError::non_retryable(
            LlmProviderErrorKind::InvalidRequest,
            misleading_details,
        );
        let err = llm_error(
            LlmFailureReason::ProviderError(provider_error),
            "invalid request",
        );

        assert!(!err.is_recoverable());
    }

    // --- rate-limit helpers ---

    #[test]
    fn test_is_rate_limited_true_for_rate_limit_error() {
        let err = llm_error(rate_limited(30), "rate limited");
        assert!(err.is_rate_limited());
    }

    #[test]
    fn test_is_rate_limited_false_for_other_errors() {
        let cases = [
            (
                LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
                "timeout",
            ),
            (LlmFailureReason::AuthError, "bad key"),
        ];
        for (reason, message) in cases {
            assert!(!llm_error(reason, message).is_rate_limited());
        }
    }

    #[test]
    fn test_retry_after_hint_returns_duration_for_rate_limit() {
        let hint = llm_error(rate_limited(60), "rate limited").retry_after_hint();
        assert_eq!(hint, Some(std::time::Duration::from_secs(60)));
    }

    #[test]
    fn test_retry_after_hint_returns_none_for_non_rate_limit() {
        let reason = LlmFailureReason::NetworkTimeout { duration_ms: 5000 };
        let hint = llm_error(reason, "timeout").retry_after_hint();
        assert_eq!(hint, None);
    }

    #[test]
    fn test_timeout_variants_not_graceful() {
        let reasons = [
            LlmFailureReason::NetworkTimeout { duration_ms: 1000 },
            LlmFailureReason::CallTimeout { duration_ms: 1000 },
        ];
        for reason in reasons {
            assert!(!llm_error(reason, "timeout").is_graceful());
        }
    }

    // --- BuildError ---

    #[test]
    fn test_build_error_variant_exists_and_carries_message() {
        let err = AgentError::BuildError(String::from("Missing API key for provider 'anthropic'"));
        if let AgentError::BuildError(msg) = &err {
            assert!(
                msg.contains("API key"),
                "message should contain source text"
            );
        } else {
            panic!("expected BuildError, got: {err}");
        }
    }

    #[test]
    fn test_build_error_is_not_recoverable() {
        let err = AgentError::BuildError(String::from("Unknown provider for model 'llama-3'"));
        assert!(!err.is_recoverable(), "build errors are not recoverable");
    }

    #[test]
    fn test_build_error_is_not_graceful() {
        let err = AgentError::BuildError(String::from("Missing API key"));
        assert!(!err.is_graceful(), "build errors are not graceful");
    }

    #[test]
    fn test_build_error_display() {
        let err = AgentError::BuildError(String::from("Missing API key for provider 'anthropic'"));
        let display = err.to_string();
        // Any one of these fragments is accepted.
        let accepted = ["Build error", "build error", "Missing API key"];
        assert!(
            accepted.iter().any(|needle| display.contains(*needle)),
            "display should mention the build error: {display}"
        );
    }

    // --- TerminalFailure ---

    #[test]
    fn test_terminal_failure_carries_typed_outcome() {
        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::Failed,
            cause_kind: TurnTerminalCauseKind::LlmFailure,
            message: String::from("llm failed"),
        };
        if let AgentError::TerminalFailure {
            outcome,
            cause_kind,
            ..
        } = &err
        {
            assert_eq!(outcome, &TurnTerminalOutcome::Failed);
            assert_eq!(cause_kind, &TurnTerminalCauseKind::LlmFailure);
        } else {
            panic!("expected TerminalFailure, got: {err}");
        }
    }

    #[test]
    fn test_terminal_failure_display_includes_outcome() {
        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::TimeBudgetExceeded,
            cause_kind: TurnTerminalCauseKind::TimeBudgetExceeded,
            message: String::from("deadline reached"),
        };
        let display = err.to_string();
        assert!(
            display.contains("TimeBudgetExceeded"),
            "display should include the outcome variant name: {display}"
        );
        let required = ["TimeBudgetExceeded", "deadline reached"];
        assert!(
            required.iter().all(|needle| display.contains(*needle)),
            "display should include cause and display message: {display}"
        );
    }

    #[test]
    fn test_terminal_failure_all_hard_failure_outcomes() {
        let hard_failures = [
            TurnTerminalOutcome::Failed,
            TurnTerminalOutcome::TimeBudgetExceeded,
        ];
        for outcome in hard_failures {
            let err = AgentError::TerminalFailure {
                outcome,
                cause_kind: TurnTerminalCauseKind::FatalFailure,
                message: String::from("terminal"),
            };
            assert!(
                !err.is_graceful(),
                "TerminalFailure({outcome:?}) should not be graceful"
            );
        }
    }
}
743}