1use crate::hooks::{HookId, HookPoint, HookReasonCode};
4use crate::tool_catalog::ToolUnavailableReason;
5use crate::types::SessionId;
6use serde::{Deserialize, Serialize};
7
/// Typed reason carried by [`AgentError::Llm`] for a failed LLM call.
///
/// [`AgentError::is_recoverable`] reports `RateLimited`, `NetworkTimeout`, and
/// `CallTimeout` as recoverable, defers to [`LlmProviderError::is_retryable`]
/// for `ProviderError`, and reports every other variant as not recoverable.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum LlmFailureReason {
    /// Rate-limit failure; the variant [`AgentError::is_rate_limited`] matches.
    RateLimited {
        /// Optional wait hint, returned unchanged by [`AgentError::retry_after_hint`].
        retry_after: Option<std::time::Duration>,
    },
    /// Context-size failure.
    ContextExceeded {
        /// NOTE(review): presumably the model's context limit in tokens — confirm.
        max: u32,
        /// NOTE(review): presumably the size the request needed, in the same unit
        /// as `max` — confirm.
        requested: u32,
    },
    /// Authentication failure.
    AuthError,
    /// Invalid-model failure.
    /// NOTE(review): the string is presumably the rejected model identifier — confirm.
    InvalidModel(String),
    /// Provider-reported error carrying its own typed kind and retryability.
    ProviderError(LlmProviderError),
    /// Timeout variant that is distinct from [`Self::CallTimeout`]: the two compare
    /// unequal even when `duration_ms` matches.
    /// NOTE(review): presumably a transport-level timeout — confirm.
    NetworkTimeout {
        /// Duration in milliseconds.
        /// NOTE(review): elapsed time vs. configured limit is not shown here — confirm.
        duration_ms: u64,
    },
    /// Timeout variant that is distinct from [`Self::NetworkTimeout`].
    /// NOTE(review): presumably a deadline on the whole LLM call — confirm.
    CallTimeout {
        /// Duration in milliseconds.
        /// NOTE(review): elapsed time vs. configured limit is not shown here — confirm.
        duration_ms: u64,
    },
}
30
/// Classification stored in [`LlmProviderError::kind`].
///
/// Serde renames every variant to snake_case, and a `schemars::JsonSchema` derive
/// is added when the `schema` feature is enabled.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlmProviderErrorKind {
    /// Serialized as `invalid_request`.
    InvalidRequest,
    /// Serialized as `content_filtered`.
    ContentFiltered,
    /// Serialized as `server_error`.
    ServerError,
    /// Serialized as `server_overloaded`.
    ServerOverloaded,
    /// Serialized as `connection_reset`.
    ConnectionReset,
    /// Serialized as `unknown`.
    Unknown,
    /// Serialized as `stream_parse_error`.
    StreamParseError,
    /// Serialized as `incomplete_response`.
    IncompleteResponse,
}
44
/// Typed retryability flag stored in [`LlmProviderError::retryability`].
///
/// Serde renames the variants to snake_case, and a `schemars::JsonSchema` derive
/// is added when the `schema` feature is enabled.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlmProviderErrorRetryability {
    /// Serialized as `retryable`; `is_retryable` returns `true`.
    Retryable,
    /// Serialized as `non_retryable`; `is_retryable` returns `false`.
    NonRetryable,
}
52
53impl LlmProviderErrorRetryability {
54 pub fn is_retryable(self) -> bool {
55 matches!(self, Self::Retryable)
56 }
57}
58
/// Provider-reported LLM error: a typed kind, a typed retryability flag, and
/// free-form JSON details.
///
/// A `schemars::JsonSchema` derive is added when the `schema` feature is enabled.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LlmProviderError {
    /// Classification of the error.
    pub kind: LlmProviderErrorKind,
    /// Typed retryability. `is_retryable` consults this field only; nothing inside
    /// `details` influences the answer.
    pub retryability: LlmProviderErrorRetryability,
    /// Free-form JSON payload. Defaults to `Value::Null` when the field is absent
    /// during deserialization and is omitted from serialized output when null.
    #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
    pub details: serde_json::Value,
}
67
68impl LlmProviderError {
69 pub fn new(
70 kind: LlmProviderErrorKind,
71 retryability: LlmProviderErrorRetryability,
72 details: serde_json::Value,
73 ) -> Self {
74 Self {
75 kind,
76 retryability,
77 details,
78 }
79 }
80
81 pub fn retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
82 Self::new(kind, LlmProviderErrorRetryability::Retryable, details)
83 }
84
85 pub fn non_retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
86 Self::new(kind, LlmProviderErrorRetryability::NonRetryable, details)
87 }
88
89 pub fn is_retryable(&self) -> bool {
90 self.retryability.is_retryable()
91 }
92}
93
/// Validation failure for a tool call: either the tool is unknown or its
/// arguments were rejected.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
pub enum ToolValidationError {
    /// No tool with the given `name`; displays as `Tool not found: {name}`.
    #[error("Tool not found: {name}")]
    NotFound { name: String },
    /// Arguments for tool `name` were rejected for `reason`; displays as
    /// `Invalid arguments for tool '{name}': {reason}`.
    #[error("Invalid arguments for tool '{name}': {reason}")]
    InvalidArguments { name: String, reason: String },
}
104
105impl ToolValidationError {
106 pub fn not_found(name: impl Into<String>) -> Self {
107 Self::NotFound { name: name.into() }
108 }
109 pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
110 Self::InvalidArguments {
111 name: name.into(),
112 reason: reason.into(),
113 }
114 }
115}
116
/// Error returned for a tool call.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute;
/// `ToolError::error_code` maps each variant to a machine-readable code.
#[derive(Debug, Clone, thiserror::Error)]
pub enum ToolError {
    /// No tool with the given `name`. Error code: `tool_not_found`.
    #[error("Tool not found: {name}")]
    NotFound { name: String },

    /// Tool `name` cannot be used right now; `reason` is rendered into the message.
    /// Error code: `tool_unavailable`.
    #[error("Tool '{name}' is currently unavailable: {reason}")]
    Unavailable {
        name: String,
        reason: ToolUnavailableReason,
    },

    /// Arguments for tool `name` were rejected for `reason`.
    /// Error code: `invalid_arguments`.
    #[error("Invalid arguments for tool '{name}': {reason}")]
    InvalidArguments { name: String, reason: String },

    /// Execution failed with a message only. Error code: `execution_failed`.
    #[error("Tool execution failed: {message}")]
    ExecutionFailed { message: String },

    /// Execution failed with a message plus structured `data`.
    ///
    /// Displays exactly like [`Self::ExecutionFailed`] and shares its
    /// `execution_failed` code; `data` is exposed through
    /// `ToolError::structured_data` and the `data` key of the error payload.
    #[error("Tool execution failed: {message}")]
    ExecutionFailedWithData {
        message: String,
        data: serde_json::Value,
    },

    /// Tool `name` timed out after `timeout_ms` milliseconds. Error code: `timeout`.
    #[error("Tool '{name}' timed out after {timeout_ms}ms")]
    Timeout { name: String, timeout_ms: u64 },

    /// Tool `name` is not allowed by policy. Error code: `access_denied`.
    #[error("Tool '{name}' is not allowed by policy")]
    AccessDenied { name: String },

    /// Free-form error whose message is displayed verbatim; also produced by the
    /// `From<String>` and `From<&str>` conversions. Error code: `tool_error`.
    #[error("{0}")]
    Other(String),

    /// A callback is pending for `tool_name`. Error code: `callback_pending`.
    ///
    /// `args` is carried alongside the name but is not part of the message.
    /// NOTE(review): presumably the arguments of the pending tool call — confirm.
    #[error("Callback pending for tool '{tool_name}'")]
    CallbackPending {
        tool_name: String,
        args: serde_json::Value,
    },
}
169
170impl ToolError {
171 pub fn error_code(&self) -> &'static str {
172 match self {
173 Self::NotFound { .. } => "tool_not_found",
174 Self::Unavailable { .. } => "tool_unavailable",
175 Self::InvalidArguments { .. } => "invalid_arguments",
176 Self::ExecutionFailed { .. } | Self::ExecutionFailedWithData { .. } => {
177 "execution_failed"
178 }
179 Self::Timeout { .. } => "timeout",
180 Self::AccessDenied { .. } => "access_denied",
181 Self::Other(_) => "tool_error",
182 Self::CallbackPending { .. } => "callback_pending",
183 }
184 }
185
186 pub fn to_error_payload(&self) -> serde_json::Value {
187 let mut payload = serde_json::json!({
188 "error": self.error_code(),
189 "message": self.to_string(),
190 });
191 if let Some(data) = self.structured_data() {
192 payload["data"] = data;
193 }
194 payload
195 }
196
197 pub fn not_found(name: impl Into<String>) -> Self {
198 Self::NotFound { name: name.into() }
199 }
200 pub fn unavailable(name: impl Into<String>, reason: ToolUnavailableReason) -> Self {
201 Self::Unavailable {
202 name: name.into(),
203 reason,
204 }
205 }
206 pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
207 Self::InvalidArguments {
208 name: name.into(),
209 reason: reason.into(),
210 }
211 }
212 pub fn execution_failed(message: impl Into<String>) -> Self {
213 Self::ExecutionFailed {
214 message: message.into(),
215 }
216 }
217 pub fn execution_failed_with_data(message: impl Into<String>, data: serde_json::Value) -> Self {
218 Self::ExecutionFailedWithData {
219 message: message.into(),
220 data,
221 }
222 }
223 pub fn structured_data(&self) -> Option<serde_json::Value> {
224 match self {
225 Self::ExecutionFailedWithData { data, .. } => Some(data.clone()),
226 _ => None,
227 }
228 }
229 pub fn timeout(name: impl Into<String>, timeout_ms: u64) -> Self {
230 Self::Timeout {
231 name: name.into(),
232 timeout_ms,
233 }
234 }
235 pub fn access_denied(name: impl Into<String>) -> Self {
236 Self::AccessDenied { name: name.into() }
237 }
238 pub fn other(message: impl Into<String>) -> Self {
239 Self::Other(message.into())
240 }
241
242 pub fn callback_pending(tool_name: impl Into<String>, args: serde_json::Value) -> Self {
244 Self::CallbackPending {
245 tool_name: tool_name.into(),
246 args,
247 }
248 }
249
250 pub fn is_callback_pending(&self) -> bool {
252 matches!(self, Self::CallbackPending { .. })
253 }
254
255 pub fn as_callback_pending(&self) -> Option<(&str, &serde_json::Value)> {
257 match self {
258 Self::CallbackPending { tool_name, args } => Some((tool_name, args)),
259 _ => None,
260 }
261 }
262}
263
264impl From<String> for ToolError {
265 fn from(s: String) -> Self {
266 Self::Other(s)
267 }
268}
269impl From<&str> for ToolError {
270 fn from(s: &str) -> Self {
271 Self::Other(s.to_string())
272 }
273}
274
/// Error type covering LLM, storage, tool, budget/limit, hook, configuration,
/// and terminal-outcome failures.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
/// `AgentError::is_graceful` and `AgentError::is_recoverable` classify variants.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum AgentError {
    /// LLM call failure.
    ///
    /// `provider` and `message` appear in the `Display` text; `reason` does not,
    /// but it drives `is_rate_limited`, `retry_after_hint`, and `is_recoverable`.
    #[error("LLM error ({provider}): {message}")]
    Llm {
        provider: &'static str,
        reason: LlmFailureReason,
        message: String,
    },
    /// Storage failure carrying a message; built by `store_error` and
    /// `invalid_session_id`.
    #[error("Storage error: {0}")]
    StoreError(String),
    /// Tool failure carried as a message string.
    #[error("Tool error: {0}")]
    ToolError(String),
    /// MCP failure carried as a message string.
    #[error("MCP error: {0}")]
    McpError(String),
    /// No session exists for the given id.
    #[error("Session not found: {0}")]
    SessionNotFound(SessionId),
    /// Token budget exceeded; counted as graceful by `is_graceful`.
    #[error("Token budget exceeded: used {used}, limit {limit}")]
    TokenBudgetExceeded { used: u64, limit: u64 },
    /// Time budget exceeded, in seconds; counted as graceful by `is_graceful`.
    #[error("Time budget exceeded: {elapsed_secs}s > {limit_secs}s")]
    TimeBudgetExceeded { elapsed_secs: u64, limit_secs: u64 },
    /// Tool-call budget exceeded; counted as graceful by `is_graceful`.
    #[error("Tool call budget exceeded: {count} calls > {limit} limit")]
    ToolCallBudgetExceeded { count: usize, limit: usize },
    /// Max tokens reached on `turn`; the `partial` output is included verbatim in
    /// the `Display` text.
    #[error("Max tokens reached on turn {turn}, partial output: {partial}")]
    MaxTokensReached { turn: u32, partial: String },
    /// Content was filtered on `turn`.
    #[error("Content filtered on turn {turn}")]
    ContentFiltered { turn: u32 },
    /// Turn limit reached; counted as graceful by `is_graceful`.
    #[error("Max turns reached: {turns}")]
    MaxTurnsReached { turns: u32 },
    /// The run was cancelled.
    #[error("Run was cancelled")]
    Cancelled,
    /// A state transition from `from` to `to` was rejected.
    #[error("Invalid state transition: {from} -> {to}")]
    InvalidStateTransition { from: String, to: String },
    /// No operation matches the given identifier string.
    #[error("Operation not found: {0}")]
    OperationNotFound(String),
    /// `depth` exceeded the allowed `max`.
    #[error("Depth limit exceeded: {depth} > {max}")]
    DepthLimitExceeded { depth: u32, max: u32 },
    /// Concurrency limit exceeded.
    #[error("Concurrency limit exceeded")]
    ConcurrencyLimitExceeded,
    /// Configuration problem carried as a message string.
    #[error("Configuration error: {0}")]
    ConfigError(String),
    /// The access policy names an invalid `tool`.
    #[error("Invalid tool in access policy: {tool}")]
    InvalidToolAccess { tool: String },
    /// Internal failure carried as a message string.
    #[error("Internal error: {0}")]
    InternalError(String),

    /// Build failure carried as a message string; neither graceful nor recoverable.
    #[error("Build error: {0}")]
    BuildError(String),

    /// The connection identified by `binding_key` requires re-authentication.
    #[error("Connection `{binding_key}` requires re-authentication: {message}")]
    AuthReauthRequired {
        binding_key: String,
        message: String,
    },

    /// A callback is pending for `tool_name`; same fields and message as
    /// [`ToolError::CallbackPending`]. `args` is not part of the message.
    #[error("Callback pending for tool '{tool_name}'")]
    CallbackPending {
        tool_name: String,
        args: serde_json::Value,
    },

    /// Structured output failed validation after `attempts` attempts.
    ///
    /// `last_output` is carried for callers but is not part of the `Display` text.
    #[error("Structured output validation failed after {attempts} attempts: {reason}")]
    StructuredOutputValidationFailed {
        attempts: u32,
        reason: String,
        last_output: String,
    },

    /// The output schema itself is invalid.
    #[error("Invalid output schema: {0}")]
    InvalidOutputSchema(String),

    /// A hook denied the operation.
    ///
    /// `point` and `reason_code` are rendered with `Debug`; `payload` is carried
    /// for callers but is not part of the `Display` text.
    #[error("Hook '{hook_id}' denied at {point:?}: {reason_code:?} - {message}")]
    HookDenied {
        hook_id: HookId,
        point: HookPoint,
        reason_code: HookReasonCode,
        message: String,
        payload: Option<serde_json::Value>,
    },

    /// Hook `hook_id` timed out after `timeout_ms` milliseconds.
    #[error("Hook '{hook_id}' timed out after {timeout_ms}ms")]
    HookTimeout { hook_id: HookId, timeout_ms: u64 },

    /// Hook `hook_id` failed to execute for `reason`.
    #[error("Hook execution failed for '{hook_id}': {reason}")]
    HookExecutionFailed { hook_id: HookId, reason: String },

    /// Hook configuration is invalid for `reason`.
    #[error("Hook configuration invalid: {reason}")]
    HookConfigInvalid { reason: String },

    /// Terminal failure carrying a typed outcome and cause kind, both rendered with
    /// `Debug`, plus a message. Not counted as graceful by `is_graceful`.
    #[error("Terminal failure: {outcome:?} ({cause_kind:?}): {message}")]
    TerminalFailure {
        outcome: crate::turn_execution_authority::TurnTerminalOutcome,
        cause_kind: crate::turn_execution_authority::TurnTerminalCauseKind,
        message: String,
    },

    /// A resume was requested but no boundary is pending.
    #[error("no pending boundary for resume")]
    NoPendingBoundary,
}
391
392impl AgentError {
393 pub fn llm(
394 provider: &'static str,
395 reason: LlmFailureReason,
396 message: impl Into<String>,
397 ) -> Self {
398 Self::Llm {
399 provider,
400 reason,
401 message: message.into(),
402 }
403 }
404 pub fn is_graceful(&self) -> bool {
405 matches!(
406 self,
407 Self::TokenBudgetExceeded { .. }
408 | Self::TimeBudgetExceeded { .. }
409 | Self::ToolCallBudgetExceeded { .. }
410 | Self::MaxTurnsReached { .. }
411 )
412 }
413 pub fn is_rate_limited(&self) -> bool {
414 matches!(
415 self,
416 Self::Llm {
417 reason: LlmFailureReason::RateLimited { .. },
418 ..
419 }
420 )
421 }
422
423 pub fn retry_after_hint(&self) -> Option<std::time::Duration> {
424 match self {
425 Self::Llm {
426 reason: LlmFailureReason::RateLimited { retry_after },
427 ..
428 } => *retry_after,
429 _ => None,
430 }
431 }
432
433 pub fn is_recoverable(&self) -> bool {
434 match self {
435 Self::Llm { reason, .. } => match reason {
436 LlmFailureReason::RateLimited { .. } => true,
437 LlmFailureReason::NetworkTimeout { .. } => true,
438 LlmFailureReason::CallTimeout { .. } => true,
439 LlmFailureReason::ProviderError(provider_error) => provider_error.is_retryable(),
440 _ => false,
441 },
442 _ => false,
443 }
444 }
445}
446
447pub fn store_error(err: impl std::fmt::Display) -> AgentError {
448 AgentError::StoreError(store_error_message(err))
449}
450pub fn invalid_session_id(err: impl std::fmt::Display) -> AgentError {
451 AgentError::StoreError(invalid_session_id_message(err))
452}
/// Returns the `Display` rendering of `err`, unchanged.
pub fn store_error_message(err: impl std::fmt::Display) -> String {
    ToString::to_string(&err)
}
/// Returns `Invalid session ID: ` followed by the `Display` rendering of `err`.
pub fn invalid_session_id_message(err: impl std::fmt::Display) -> String {
    let detail = err.to_string();
    format!("Invalid session ID: {detail}")
}
459
// Unit tests for the error classification helpers and `Display` renderings.
// Panicking helpers are allowed here because a panic is how a test reports failure.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
    use super::*;

    #[test]
    fn test_network_timeout_is_recoverable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 30000 },
            "network timeout after 30s",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_call_timeout_is_recoverable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::CallTimeout { duration_ms: 45000 },
            "call timeout after 45s",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_network_timeout_typed_mapping() {
        let reason = LlmFailureReason::NetworkTimeout { duration_ms: 5000 };
        match reason {
            LlmFailureReason::NetworkTimeout { duration_ms } => {
                assert_eq!(duration_ms, 5000);
            }
            _ => panic!("expected NetworkTimeout"),
        }
    }

    #[test]
    fn test_call_timeout_typed_mapping() {
        let reason = LlmFailureReason::CallTimeout { duration_ms: 60000 };
        match reason {
            LlmFailureReason::CallTimeout { duration_ms } => {
                assert_eq!(duration_ms, 60000);
            }
            _ => panic!("expected CallTimeout"),
        }
    }

    #[test]
    fn test_timeout_variants_are_distinct() {
        let net = LlmFailureReason::NetworkTimeout { duration_ms: 1000 };
        let call = LlmFailureReason::CallTimeout { duration_ms: 1000 };
        assert_ne!(net, call);
    }

    #[test]
    fn test_auth_error_not_recoverable() {
        let err = AgentError::llm("anthropic", LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_recoverable());
    }

    #[test]
    fn provider_error_uses_typed_retryability_for_recovery() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::ProviderError(LlmProviderError::retryable(
                LlmProviderErrorKind::ServerOverloaded,
                serde_json::json!({
                    "message": "provider overloaded"
                }),
            )),
            "provider overloaded",
        );

        assert!(err.is_recoverable());
    }

    // The JSON details claim the error is retryable, but only the typed
    // retryability field may decide.
    #[test]
    fn provider_error_fails_closed_when_json_claims_retryable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::ProviderError(LlmProviderError::non_retryable(
                LlmProviderErrorKind::InvalidRequest,
                serde_json::json!({
                    "kind": "server_overloaded",
                    "retryable": true,
                    "message": "json payload must not control retryability"
                }),
            )),
            "invalid request",
        );

        assert!(!err.is_recoverable());
    }

    #[test]
    fn test_is_rate_limited_true_for_rate_limit_error() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited {
                retry_after: Some(std::time::Duration::from_secs(30)),
            },
            "rate limited",
        );
        assert!(err.is_rate_limited());
    }

    #[test]
    fn test_is_rate_limited_false_for_other_errors() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
            "timeout",
        );
        assert!(!err.is_rate_limited());

        let err = AgentError::llm("anthropic", LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_rate_limited());
    }

    #[test]
    fn test_retry_after_hint_returns_duration_for_rate_limit() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited {
                retry_after: Some(std::time::Duration::from_secs(60)),
            },
            "rate limited",
        );
        assert_eq!(
            err.retry_after_hint(),
            Some(std::time::Duration::from_secs(60))
        );
    }

    #[test]
    fn test_retry_after_hint_returns_none_for_non_rate_limit() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
            "timeout",
        );
        assert_eq!(err.retry_after_hint(), None);
    }

    #[test]
    fn test_timeout_variants_not_graceful() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 1000 },
            "timeout",
        );
        assert!(!err.is_graceful());

        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::CallTimeout { duration_ms: 1000 },
            "timeout",
        );
        assert!(!err.is_graceful());
    }

    // Positive counterpart to the "not graceful" tests: without it, an
    // `is_graceful` that always returned `false` would pass the whole suite.
    #[test]
    fn test_budget_and_turn_limits_are_graceful() {
        let graceful = [
            AgentError::TokenBudgetExceeded { used: 10, limit: 5 },
            AgentError::TimeBudgetExceeded {
                elapsed_secs: 10,
                limit_secs: 5,
            },
            AgentError::ToolCallBudgetExceeded { count: 10, limit: 5 },
            AgentError::MaxTurnsReached { turns: 3 },
        ];
        for err in &graceful {
            assert!(err.is_graceful(), "{err} should be graceful");
            assert!(!err.is_recoverable(), "{err} should not be recoverable");
        }
    }

    #[test]
    fn test_build_error_variant_exists_and_carries_message() {
        let err = AgentError::BuildError("Missing API key for provider 'anthropic'".to_string());
        match &err {
            AgentError::BuildError(msg) => {
                assert!(
                    msg.contains("API key"),
                    "message should contain source text"
                );
            }
            other => panic!("expected BuildError, got: {other}"),
        }
    }

    #[test]
    fn test_build_error_is_not_recoverable() {
        let err = AgentError::BuildError("Unknown provider for model 'llama-3'".to_string());
        assert!(!err.is_recoverable(), "build errors are not recoverable");
    }

    #[test]
    fn test_build_error_is_not_graceful() {
        let err = AgentError::BuildError("Missing API key".to_string());
        assert!(!err.is_graceful(), "build errors are not graceful");
    }

    // Pins the exact rendering. The previous `||` chain passed whenever the
    // message text was present, so it could not detect a lost or changed prefix.
    #[test]
    fn test_build_error_display() {
        let err = AgentError::BuildError("Missing API key for provider 'anthropic'".to_string());
        assert_eq!(
            err.to_string(),
            "Build error: Missing API key for provider 'anthropic'"
        );
    }

    #[test]
    fn test_terminal_failure_carries_typed_outcome() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::Failed,
            cause_kind: TurnTerminalCauseKind::LlmFailure,
            message: "llm failed".to_string(),
        };
        match &err {
            AgentError::TerminalFailure {
                outcome,
                cause_kind,
                ..
            } => {
                assert_eq!(*outcome, TurnTerminalOutcome::Failed);
                assert_eq!(*cause_kind, TurnTerminalCauseKind::LlmFailure);
            }
            other => panic!("expected TerminalFailure, got: {other}"),
        }
    }

    #[test]
    fn test_terminal_failure_display_includes_outcome() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::TimeBudgetExceeded,
            cause_kind: TurnTerminalCauseKind::TimeBudgetExceeded,
            message: "deadline reached".to_string(),
        };
        let display = err.to_string();
        assert!(
            display.contains("TimeBudgetExceeded"),
            "display should include the outcome variant name: {display}"
        );
        // The outcome and the cause share a variant name here, so `contains` cannot
        // tell them apart; compare against the full rendering instead.
        let expected = format!(
            "Terminal failure: {:?} ({:?}): deadline reached",
            TurnTerminalOutcome::TimeBudgetExceeded,
            TurnTerminalCauseKind::TimeBudgetExceeded,
        );
        assert_eq!(
            display, expected,
            "display should include outcome, cause, and message"
        );
    }

    #[test]
    fn test_terminal_failure_all_hard_failure_outcomes() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        for outcome in [
            TurnTerminalOutcome::Failed,
            TurnTerminalOutcome::TimeBudgetExceeded,
        ] {
            let err = AgentError::TerminalFailure {
                outcome,
                cause_kind: TurnTerminalCauseKind::FatalFailure,
                message: "terminal".to_string(),
            };
            assert!(
                !err.is_graceful(),
                "TerminalFailure({outcome:?}) should not be graceful"
            );
        }
    }
}