1use crate::hooks::{HookId, HookPoint, HookReasonCode};
4use crate::tool_catalog::ToolUnavailableReason;
5use crate::types::SessionId;
6use serde::{Deserialize, Serialize};
7
/// Typed classification of why an LLM call failed.
///
/// Stored in the `reason` field of [`AgentError::Llm`]. The helpers
/// `AgentError::is_recoverable`, `AgentError::is_rate_limited` and
/// `AgentError::retry_after_hint` match on this enum.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum LlmFailureReason {
    /// Rate-limit failure. Treated as recoverable by `AgentError::is_recoverable`.
    RateLimited {
        /// Optional wait hint; returned verbatim by `AgentError::retry_after_hint`.
        retry_after: Option<std::time::Duration>,
    },
    /// Not treated as recoverable by `AgentError::is_recoverable`.
    // NOTE(review): presumably `requested` tokens exceeded the model's `max`
    // context size — confirm units against the code that builds this variant.
    ContextExceeded {
        max: u32,
        requested: u32,
    },
    /// Authentication failure. Not recoverable per `AgentError::is_recoverable`.
    AuthError,
    /// Not recoverable per `AgentError::is_recoverable`.
    // NOTE(review): the payload is presumably the rejected model name — confirm.
    InvalidModel(String),
    /// Provider-reported error. Recoverability is taken from the typed
    /// `retryability` field of the wrapped [`LlmProviderError`].
    ProviderError(LlmProviderError),
    /// Timeout variant, distinct from [`LlmFailureReason::CallTimeout`].
    /// Treated as recoverable by `AgentError::is_recoverable`.
    // NOTE(review): whether `duration_ms` is the elapsed time or the configured
    // limit is not visible here — confirm.
    NetworkTimeout {
        duration_ms: u64,
    },
    /// Timeout variant, distinct from [`LlmFailureReason::NetworkTimeout`].
    /// Treated as recoverable by `AgentError::is_recoverable`.
    // NOTE(review): whether `duration_ms` is the elapsed time or the configured
    // limit is not visible here — confirm.
    CallTimeout {
        duration_ms: u64,
    },
}
30
// Category of a provider-side LLM error, stored in `LlmProviderError::kind`.
//
// Serialized in snake_case via `rename_all` (e.g. `ServerOverloaded` becomes
// `"server_overloaded"`). The kind alone does not decide retry behaviour:
// `LlmProviderError` carries a separate `retryability` field for that.
//
// NOTE(review): plain `//` comments are used on purpose — doc comments on a
// schema-derived type may surface in the generated schema; confirm before
// converting these to `///`.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlmProviderErrorKind {
    InvalidRequest,
    ContentFiltered,
    ServerError,
    ServerOverloaded,
    ConnectionReset,
    Unknown,
    StreamParseError,
    // Used by `AgentError::llm_empty_response` when the provider completed
    // without user-visible text, images, or tool calls.
    IncompleteResponse,
}
44
// Typed retry decision attached to an `LlmProviderError`.
//
// Serialized in snake_case (`"retryable"` / `"non_retryable"`).
// `AgentError::is_recoverable` consults this value for provider errors rather
// than anything inside the free-form `details` JSON.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlmProviderErrorRetryability {
    Retryable,
    NonRetryable,
}
52
53impl LlmProviderErrorRetryability {
54 pub fn is_retryable(self) -> bool {
55 matches!(self, Self::Retryable)
56 }
57}
58
// Provider-reported LLM error: a category, a typed retry decision, and
// free-form JSON details.
//
// `is_retryable` reads only `retryability`; the contents of `details` never
// influence the retry decision.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LlmProviderError {
    pub kind: LlmProviderErrorKind,
    pub retryability: LlmProviderErrorRetryability,
    // Omitted from serialized output when it is JSON `null`, and filled with
    // the `Default` value when the field is absent on deserialization.
    #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
    pub details: serde_json::Value,
}
67
68impl LlmProviderError {
69 pub fn new(
70 kind: LlmProviderErrorKind,
71 retryability: LlmProviderErrorRetryability,
72 details: serde_json::Value,
73 ) -> Self {
74 Self {
75 kind,
76 retryability,
77 details,
78 }
79 }
80
81 pub fn retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
82 Self::new(kind, LlmProviderErrorRetryability::Retryable, details)
83 }
84
85 pub fn non_retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
86 Self::new(kind, LlmProviderErrorRetryability::NonRetryable, details)
87 }
88
89 pub fn is_retryable(&self) -> bool {
90 self.retryability.is_retryable()
91 }
92}
93
/// Validation failure for a tool call: either the tool name is unknown or the
/// supplied arguments were rejected.
///
/// The display strings are identical to those of `ToolError::NotFound` and
/// `ToolError::InvalidArguments`.
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
pub enum ToolValidationError {
    /// No tool with the given `name`.
    #[error("Tool not found: {name}")]
    NotFound { name: String },
    /// The arguments for tool `name` were rejected; `reason` is included in the display text.
    #[error("Invalid arguments for tool '{name}': {reason}")]
    InvalidArguments { name: String, reason: String },
}
104
105impl ToolValidationError {
106 pub fn not_found(name: impl Into<String>) -> Self {
107 Self::NotFound { name: name.into() }
108 }
109 pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
110 Self::InvalidArguments {
111 name: name.into(),
112 reason: reason.into(),
113 }
114 }
115}
116
/// Failure produced while resolving or running a tool.
///
/// Each variant maps to a stable string code through `ToolError::error_code`,
/// and `ToolError::to_error_payload` renders the error as a JSON object.
#[derive(Debug, Clone, PartialEq, thiserror::Error)]
pub enum ToolError {
    /// Error code `"tool_not_found"`.
    #[error("Tool not found: {name}")]
    NotFound { name: String },

    /// Error code `"tool_unavailable"`; `reason` is rendered into the display text.
    #[error("Tool '{name}' is currently unavailable: {reason}")]
    Unavailable {
        name: String,
        reason: ToolUnavailableReason,
    },

    /// Error code `"invalid_arguments"`.
    #[error("Invalid arguments for tool '{name}': {reason}")]
    InvalidArguments { name: String, reason: String },

    /// Error code `"execution_failed"`.
    #[error("Tool execution failed: {message}")]
    ExecutionFailed { message: String },

    /// Same code and display text as [`ToolError::ExecutionFailed`], plus a JSON
    /// `data` value that `structured_data` returns and `to_error_payload`
    /// places under the `"data"` key.
    #[error("Tool execution failed: {message}")]
    ExecutionFailedWithData {
        message: String,
        data: serde_json::Value,
    },

    /// Error code `"timeout"`.
    #[error("Tool '{name}' timed out after {timeout_ms}ms")]
    Timeout { name: String, timeout_ms: u64 },

    /// Error code `"access_denied"`.
    #[error("Tool '{name}' is not allowed by policy")]
    AccessDenied { name: String },

    /// Free-form error; error code `"tool_error"`. This is also the target of
    /// the `From<String>` and `From<&str>` conversions.
    #[error("{0}")]
    Other(String),

    /// Error code `"callback_pending"`. `args` is not part of the display text.
    // NOTE(review): presumably a control-flow signal that the tool call must be
    // completed by an external callback rather than a genuine failure — confirm.
    #[error("Callback pending for tool '{tool_name}'")]
    CallbackPending {
        tool_name: String,
        args: serde_json::Value,
    },
}
169
170impl ToolError {
171 pub fn error_code(&self) -> &'static str {
172 match self {
173 Self::NotFound { .. } => "tool_not_found",
174 Self::Unavailable { .. } => "tool_unavailable",
175 Self::InvalidArguments { .. } => "invalid_arguments",
176 Self::ExecutionFailed { .. } | Self::ExecutionFailedWithData { .. } => {
177 "execution_failed"
178 }
179 Self::Timeout { .. } => "timeout",
180 Self::AccessDenied { .. } => "access_denied",
181 Self::Other(_) => "tool_error",
182 Self::CallbackPending { .. } => "callback_pending",
183 }
184 }
185
186 pub fn to_error_payload(&self) -> serde_json::Value {
187 let mut payload = serde_json::json!({
188 "error": self.error_code(),
189 "message": self.to_string(),
190 });
191 if let Some(data) = self.structured_data() {
192 payload["data"] = data;
193 }
194 payload
195 }
196
197 #[must_use]
205 pub fn to_transcript_content(&self) -> String {
206 self.to_error_payload().to_string()
207 }
208
209 pub fn not_found(name: impl Into<String>) -> Self {
210 Self::NotFound { name: name.into() }
211 }
212 pub fn unavailable(name: impl Into<String>, reason: ToolUnavailableReason) -> Self {
213 Self::Unavailable {
214 name: name.into(),
215 reason,
216 }
217 }
218 pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
219 Self::InvalidArguments {
220 name: name.into(),
221 reason: reason.into(),
222 }
223 }
224 pub fn execution_failed(message: impl Into<String>) -> Self {
225 Self::ExecutionFailed {
226 message: message.into(),
227 }
228 }
229 pub fn execution_failed_with_data(message: impl Into<String>, data: serde_json::Value) -> Self {
230 Self::ExecutionFailedWithData {
231 message: message.into(),
232 data,
233 }
234 }
235 pub fn structured_data(&self) -> Option<serde_json::Value> {
236 match self {
237 Self::ExecutionFailedWithData { data, .. } => Some(data.clone()),
238 _ => None,
239 }
240 }
241 pub fn timeout(name: impl Into<String>, timeout_ms: u64) -> Self {
242 Self::Timeout {
243 name: name.into(),
244 timeout_ms,
245 }
246 }
247 pub fn access_denied(name: impl Into<String>) -> Self {
248 Self::AccessDenied { name: name.into() }
249 }
250 pub fn other(message: impl Into<String>) -> Self {
251 Self::Other(message.into())
252 }
253
254 pub fn callback_pending(tool_name: impl Into<String>, args: serde_json::Value) -> Self {
256 Self::CallbackPending {
257 tool_name: tool_name.into(),
258 args,
259 }
260 }
261
262 pub fn is_callback_pending(&self) -> bool {
264 matches!(self, Self::CallbackPending { .. })
265 }
266
267 pub fn as_callback_pending(&self) -> Option<(&str, &serde_json::Value)> {
269 match self {
270 Self::CallbackPending { tool_name, args } => Some((tool_name, args)),
271 _ => None,
272 }
273 }
274}
275
276impl From<String> for ToolError {
277 fn from(s: String) -> Self {
278 Self::Other(s)
279 }
280}
281impl From<&str> for ToolError {
282 fn from(s: &str) -> Self {
283 Self::Other(s.to_string())
284 }
285}
286
/// Top-level error type for agent runs.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
/// Classification helpers (`is_graceful`, `is_recoverable`, `is_rate_limited`,
/// `retry_after_hint`, `tool_error_code`) are defined in the inherent `impl`.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum AgentError {
    /// An LLM call failed. `reason` is the typed classification inspected by
    /// `is_recoverable`, `is_rate_limited` and `retry_after_hint`; it is not
    /// part of the display text.
    #[error("LLM error ({provider}): {message}")]
    Llm {
        provider: &'static str,
        reason: LlmFailureReason,
        message: String,
    },
    /// Storage failure carried as a message. The `store_error` and
    /// `invalid_session_id` helpers both produce this variant.
    #[error("Storage error: {0}")]
    StoreError(String),
    /// Typed tool failure. The wrapped [`ToolError`] is kept intact so that
    /// `tool_error_code` can report its code.
    #[error("Tool error: {error}")]
    Tool { error: ToolError },
    #[error("MCP error: {0}")]
    McpError(String),
    #[error("Session not found: {0}")]
    SessionNotFound(SessionId),
    /// Counted as graceful by `is_graceful`.
    #[error("Token budget exceeded: used {used}, limit {limit}")]
    TokenBudgetExceeded { used: u64, limit: u64 },
    /// Counted as graceful by `is_graceful`.
    #[error("Time budget exceeded: {elapsed_secs}s > {limit_secs}s")]
    TimeBudgetExceeded { elapsed_secs: u64, limit_secs: u64 },
    /// Counted as graceful by `is_graceful`.
    #[error("Tool call budget exceeded: {count} calls > {limit} limit")]
    ToolCallBudgetExceeded { count: usize, limit: usize },
    /// The `partial` output is included in the display text.
    #[error("Max tokens reached on turn {turn}, partial output: {partial}")]
    MaxTokensReached { turn: u32, partial: String },
    #[error("Content filtered on turn {turn}")]
    ContentFiltered { turn: u32 },
    /// Counted as graceful by `is_graceful`.
    #[error("Max turns reached: {turns}")]
    MaxTurnsReached { turns: u32 },
    #[error("Run was cancelled")]
    Cancelled,
    #[error("Invalid state transition: {from} -> {to}")]
    InvalidStateTransition { from: String, to: String },
    #[error("Operation not found: {0}")]
    OperationNotFound(String),
    #[error("Depth limit exceeded: {depth} > {max}")]
    DepthLimitExceeded { depth: u32, max: u32 },
    #[error("Concurrency limit exceeded")]
    ConcurrencyLimitExceeded,
    #[error("Configuration error: {0}")]
    ConfigError(String),
    #[error("Invalid tool in access policy: {tool}")]
    InvalidToolAccess { tool: String },
    /// `skill_key` is rendered with its `Debug` form, so a missing key shows as `None`.
    // NOTE(review): `reason` is boxed, presumably to keep `AgentError` small — confirm.
    #[error("Skill resolution failed for {skill_key:?}: {reason}")]
    SkillResolutionFailed {
        skill_key: Option<crate::skills::SkillKey>,
        reason: Box<crate::event::SkillResolutionFailureReason>,
    },
    #[error("Internal error: {0}")]
    InternalError(String),

    /// Neither recoverable nor graceful according to `is_recoverable` / `is_graceful`.
    // NOTE(review): presumably raised while constructing an agent (e.g. a
    // missing API key) — confirm against the builder code.
    #[error("Build error: {0}")]
    BuildError(String),

    // NOTE(review): `binding_key` presumably identifies the connection whose
    // credentials must be refreshed — confirm.
    #[error("Connection `{binding_key}` requires re-authentication: {message}")]
    AuthReauthRequired {
        binding_key: String,
        message: String,
    },

    /// Same fields and display text as `ToolError::CallbackPending`; `args` is
    /// not part of the display text.
    #[error("Callback pending for tool '{tool_name}'")]
    CallbackPending {
        tool_name: String,
        args: serde_json::Value,
    },

    /// `last_output` is carried for callers but is not part of the display text.
    #[error("Structured output validation failed after {attempts} attempts: {reason}")]
    StructuredOutputValidationFailed {
        attempts: u32,
        reason: String,
        last_output: String,
    },

    #[error("Invalid output schema: {0}")]
    InvalidOutputSchema(String),

    /// `point` and `reason_code` are rendered with `Debug`; `payload` is not
    /// part of the display text.
    #[error("Hook '{hook_id}' denied at {point:?}: {reason_code:?} - {message}")]
    HookDenied {
        hook_id: HookId,
        point: HookPoint,
        reason_code: HookReasonCode,
        message: String,
        payload: Option<serde_json::Value>,
    },

    #[error("Hook '{hook_id}' timed out after {timeout_ms}ms")]
    HookTimeout { hook_id: HookId, timeout_ms: u64 },

    #[error("Hook execution failed for '{hook_id}': {reason}")]
    HookExecutionFailed { hook_id: HookId, reason: String },

    #[error("Hook configuration invalid: {reason}")]
    HookConfigInvalid { reason: String },

    /// Terminal failure carrying a typed outcome and cause kind, both rendered
    /// with `Debug`. Not matched by `is_graceful`, whatever the `outcome` is.
    #[error("Terminal failure: {outcome:?} ({cause_kind:?}): {message}")]
    TerminalFailure {
        outcome: crate::turn_execution_authority::TurnTerminalOutcome,
        cause_kind: crate::turn_execution_authority::TurnTerminalCauseKind,
        message: String,
    },

    #[error("no pending boundary for resume")]
    NoPendingBoundary,

    #[error("durable session snapshot synchronization is not supported by this session agent")]
    DurableSnapshotSyncUnsupported,
}
419
420impl AgentError {
421 pub fn tool(error: ToolError) -> Self {
424 Self::Tool { error }
425 }
426
427 pub fn tool_error_code(&self) -> Option<&'static str> {
434 match self {
435 Self::Tool { error } => Some(error.error_code()),
436 _ => None,
437 }
438 }
439
440 pub fn llm(
441 provider: &'static str,
442 reason: LlmFailureReason,
443 message: impl Into<String>,
444 ) -> Self {
445 Self::Llm {
446 provider,
447 reason,
448 message: message.into(),
449 }
450 }
451
452 pub fn llm_empty_response(provider: &'static str) -> Self {
453 Self::llm(
454 provider,
455 LlmFailureReason::ProviderError(LlmProviderError::retryable(
456 LlmProviderErrorKind::IncompleteResponse,
457 serde_json::json!({
458 "reason": "provider completed without user-visible text, images, or tool calls"
459 }),
460 )),
461 "LLM completed without user-visible text, images, or tool calls",
462 )
463 }
464
465 pub fn is_graceful(&self) -> bool {
466 matches!(
467 self,
468 Self::TokenBudgetExceeded { .. }
469 | Self::TimeBudgetExceeded { .. }
470 | Self::ToolCallBudgetExceeded { .. }
471 | Self::MaxTurnsReached { .. }
472 )
473 }
474 pub fn is_rate_limited(&self) -> bool {
475 matches!(
476 self,
477 Self::Llm {
478 reason: LlmFailureReason::RateLimited { .. },
479 ..
480 }
481 )
482 }
483
484 pub fn retry_after_hint(&self) -> Option<std::time::Duration> {
485 match self {
486 Self::Llm {
487 reason: LlmFailureReason::RateLimited { retry_after },
488 ..
489 } => *retry_after,
490 _ => None,
491 }
492 }
493
494 pub fn is_recoverable(&self) -> bool {
495 match self {
496 Self::Llm { reason, .. } => match reason {
497 LlmFailureReason::RateLimited { .. } => true,
498 LlmFailureReason::NetworkTimeout { .. } => true,
499 LlmFailureReason::CallTimeout { .. } => true,
500 LlmFailureReason::ProviderError(provider_error) => provider_error.is_retryable(),
501 _ => false,
502 },
503 _ => false,
504 }
505 }
506}
507
508pub fn store_error(err: impl std::fmt::Display) -> AgentError {
509 AgentError::StoreError(store_error_message(err))
510}
511pub fn invalid_session_id(err: impl std::fmt::Display) -> AgentError {
512 AgentError::StoreError(invalid_session_id_message(err))
513}
/// Renders a storage error as its plain `Display` text, with no prefix added.
pub fn store_error_message(err: impl std::fmt::Display) -> String {
    ToString::to_string(&err)
}
/// Renders an invalid-session-id failure as `"Invalid session ID: "` followed
/// by the error's `Display` text.
pub fn invalid_session_id_message(err: impl std::fmt::Display) -> String {
    let mut message = String::from("Invalid session ID: ");
    message.push_str(&err.to_string());
    message
}
520
// Unit tests for the error types in this file. Panicking helpers are expected
// in test code, so the corresponding clippy lints are switched off here.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
    use super::*;

    // ---- LLM timeout / recoverability ----------------------------------

    #[test]
    fn test_network_timeout_is_recoverable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 30000 },
            "network timeout after 30s",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_call_timeout_is_recoverable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::CallTimeout { duration_ms: 45000 },
            "call timeout after 45s",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_network_timeout_typed_mapping() {
        let reason = LlmFailureReason::NetworkTimeout { duration_ms: 5000 };
        match reason {
            LlmFailureReason::NetworkTimeout { duration_ms } => {
                assert_eq!(duration_ms, 5000);
            }
            _ => panic!("expected NetworkTimeout"),
        }
    }

    #[test]
    fn test_call_timeout_typed_mapping() {
        let reason = LlmFailureReason::CallTimeout { duration_ms: 60000 };
        match reason {
            LlmFailureReason::CallTimeout { duration_ms } => {
                assert_eq!(duration_ms, 60000);
            }
            _ => panic!("expected CallTimeout"),
        }
    }

    #[test]
    fn test_timeout_variants_are_distinct() {
        let net = LlmFailureReason::NetworkTimeout { duration_ms: 1000 };
        let call = LlmFailureReason::CallTimeout { duration_ms: 1000 };
        assert_ne!(net, call);
    }

    #[test]
    fn test_auth_error_not_recoverable() {
        let err = AgentError::llm("anthropic", LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_recoverable());
    }

    #[test]
    fn provider_error_uses_typed_retryability_for_recovery() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::ProviderError(LlmProviderError::retryable(
                LlmProviderErrorKind::ServerOverloaded,
                serde_json::json!({
                    "message": "provider overloaded"
                }),
            )),
            "provider overloaded",
        );

        assert!(err.is_recoverable());
    }

    // The `details` JSON claims to be retryable, but only the typed
    // `retryability` field may decide: the error must stay non-recoverable.
    #[test]
    fn provider_error_fails_closed_when_json_claims_retryable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::ProviderError(LlmProviderError::non_retryable(
                LlmProviderErrorKind::InvalidRequest,
                serde_json::json!({
                    "kind": "server_overloaded",
                    "retryable": true,
                    "message": "json payload must not control retryability"
                }),
            )),
            "invalid request",
        );

        assert!(!err.is_recoverable());
    }

    // ---- Rate limiting -------------------------------------------------

    #[test]
    fn test_is_rate_limited_true_for_rate_limit_error() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited {
                retry_after: Some(std::time::Duration::from_secs(30)),
            },
            "rate limited",
        );
        assert!(err.is_rate_limited());
    }

    #[test]
    fn test_is_rate_limited_false_for_other_errors() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
            "timeout",
        );
        assert!(!err.is_rate_limited());

        let err = AgentError::llm("anthropic", LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_rate_limited());
    }

    #[test]
    fn test_retry_after_hint_returns_duration_for_rate_limit() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited {
                retry_after: Some(std::time::Duration::from_secs(60)),
            },
            "rate limited",
        );
        assert_eq!(
            err.retry_after_hint(),
            Some(std::time::Duration::from_secs(60))
        );
    }

    #[test]
    fn test_retry_after_hint_returns_none_for_non_rate_limit() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
            "timeout",
        );
        assert_eq!(err.retry_after_hint(), None);
    }

    #[test]
    fn test_timeout_variants_not_graceful() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 1000 },
            "timeout",
        );
        assert!(!err.is_graceful());

        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::CallTimeout { duration_ms: 1000 },
            "timeout",
        );
        assert!(!err.is_graceful());
    }

    // ---- BuildError ----------------------------------------------------

    #[test]
    fn test_build_error_variant_exists_and_carries_message() {
        let err = AgentError::BuildError("Missing API key for provider 'anthropic'".to_string());
        match &err {
            AgentError::BuildError(msg) => {
                assert!(
                    msg.contains("API key"),
                    "message should contain source text"
                );
            }
            other => panic!("expected BuildError, got: {other}"),
        }
    }

    #[test]
    fn test_build_error_is_not_recoverable() {
        let err = AgentError::BuildError("Unknown provider for model 'llama-3'".to_string());
        assert!(!err.is_recoverable(), "build errors are not recoverable");
    }

    #[test]
    fn test_build_error_is_not_graceful() {
        let err = AgentError::BuildError("Missing API key".to_string());
        assert!(!err.is_graceful(), "build errors are not graceful");
    }

    // Pins the exact display text. The earlier three-way `||` check passed as
    // long as any one fragment was present, so it could not catch a dropped
    // "Build error: " prefix or a dropped source message.
    #[test]
    fn test_build_error_display() {
        let err = AgentError::BuildError("Missing API key for provider 'anthropic'".to_string());
        let display = err.to_string();
        assert_eq!(
            display, "Build error: Missing API key for provider 'anthropic'",
            "display should be the 'Build error: ' prefix followed by the source message"
        );
    }

    // ---- TerminalFailure -----------------------------------------------

    #[test]
    fn test_terminal_failure_carries_typed_outcome() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::Failed,
            cause_kind: TurnTerminalCauseKind::LlmFailure,
            message: "llm failed".to_string(),
        };
        match &err {
            AgentError::TerminalFailure {
                outcome,
                cause_kind,
                ..
            } => {
                assert_eq!(*outcome, TurnTerminalOutcome::Failed);
                assert_eq!(*cause_kind, TurnTerminalCauseKind::LlmFailure);
            }
            other => panic!("expected TerminalFailure, got: {other}"),
        }
    }

    #[test]
    fn test_terminal_failure_display_includes_outcome() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::TimeBudgetExceeded,
            cause_kind: TurnTerminalCauseKind::TimeBudgetExceeded,
            message: "deadline reached".to_string(),
        };
        let display = err.to_string();
        // Outcome and cause kind share a variant name here, so one substring
        // check covers both.
        assert!(
            display.contains("TimeBudgetExceeded"),
            "display should include the outcome variant name: {display}"
        );
        assert!(
            display.contains("deadline reached"),
            "display should include the display message: {display}"
        );
    }

    // ---- Tool variant keeps the typed error code -------------------------

    #[test]
    fn tool_variant_preserves_access_denied_error_code() {
        let err = AgentError::tool(ToolError::access_denied("secret_tool"));
        match &err {
            AgentError::Tool { error } => {
                assert_eq!(error.error_code(), "access_denied");
            }
            other => panic!("expected AgentError::Tool, got: {other}"),
        }
        assert_eq!(err.tool_error_code(), Some("access_denied"));
    }

    #[test]
    fn tool_variant_preserves_not_found_error_code() {
        let err = AgentError::tool(ToolError::not_found("missing_tool"));
        assert_eq!(err.tool_error_code(), Some("tool_not_found"));
        assert_ne!(
            err.tool_error_code(),
            AgentError::tool(ToolError::access_denied("missing_tool")).tool_error_code(),
            "not_found must stay distinct from access_denied"
        );
    }

    #[test]
    fn tool_variant_preserves_invalid_arguments_error_code() {
        let err = AgentError::tool(ToolError::invalid_arguments(
            "search",
            "tool call arguments projection failed: bad json",
        ));
        match &err {
            AgentError::Tool { error } => {
                assert_eq!(error.error_code(), "invalid_arguments");
            }
            other => panic!("expected AgentError::Tool, got: {other}"),
        }
        assert_eq!(err.tool_error_code(), Some("invalid_arguments"));
    }

    // A `TerminalFailure` is never graceful, even when its outcome shares a
    // name with one of the graceful budget variants.
    #[test]
    fn test_terminal_failure_all_hard_failure_outcomes() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        for outcome in [
            TurnTerminalOutcome::Failed,
            TurnTerminalOutcome::TimeBudgetExceeded,
        ] {
            let err = AgentError::TerminalFailure {
                outcome,
                cause_kind: TurnTerminalCauseKind::FatalFailure,
                message: "terminal".to_string(),
            };
            assert!(
                !err.is_graceful(),
                "TerminalFailure({outcome:?}) should not be graceful"
            );
        }
    }
}