Skip to main content

meerkat_core/
error.rs

1//! Core error types for Meerkat
2
3use crate::hooks::{HookId, HookPoint, HookReasonCode};
4use crate::tool_catalog::ToolUnavailableReason;
5use crate::types::SessionId;
6use serde::{Deserialize, Serialize};
7
8#[derive(Debug, Clone, PartialEq)]
9#[non_exhaustive]
10pub enum LlmFailureReason {
11    RateLimited {
12        retry_after: Option<std::time::Duration>,
13    },
14    ContextExceeded {
15        max: u32,
16        requested: u32,
17    },
18    AuthError,
19    InvalidModel(String),
20    ProviderError(LlmProviderError),
21    /// Provider/client-native network timeout (owned by client layer)
22    NetworkTimeout {
23        duration_ms: u64,
24    },
25    /// Agent-loop hard call timeout (owned by agent loop policy)
26    CallTimeout {
27        duration_ms: u64,
28    },
29}
30
31#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
32#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
33#[serde(rename_all = "snake_case")]
34pub enum LlmProviderErrorKind {
35    InvalidRequest,
36    ContentFiltered,
37    ServerError,
38    ServerOverloaded,
39    ConnectionReset,
40    Unknown,
41    StreamParseError,
42    IncompleteResponse,
43}
44
45#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
46#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
47#[serde(rename_all = "snake_case")]
48pub enum LlmProviderErrorRetryability {
49    Retryable,
50    NonRetryable,
51}
52
53impl LlmProviderErrorRetryability {
54    pub fn is_retryable(self) -> bool {
55        matches!(self, Self::Retryable)
56    }
57}
58
59#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
60#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
61pub struct LlmProviderError {
62    pub kind: LlmProviderErrorKind,
63    pub retryability: LlmProviderErrorRetryability,
64    #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
65    pub details: serde_json::Value,
66}
67
68impl LlmProviderError {
69    pub fn new(
70        kind: LlmProviderErrorKind,
71        retryability: LlmProviderErrorRetryability,
72        details: serde_json::Value,
73    ) -> Self {
74        Self {
75            kind,
76            retryability,
77            details,
78        }
79    }
80
81    pub fn retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
82        Self::new(kind, LlmProviderErrorRetryability::Retryable, details)
83    }
84
85    pub fn non_retryable(kind: LlmProviderErrorKind, details: serde_json::Value) -> Self {
86        Self::new(kind, LlmProviderErrorRetryability::NonRetryable, details)
87    }
88
89    pub fn is_retryable(&self) -> bool {
90        self.retryability.is_retryable()
91    }
92}
93
94/// Errors that can occur during tool validation
95#[derive(Debug, Clone, thiserror::Error, PartialEq)]
96pub enum ToolValidationError {
97    /// The requested tool was not found
98    #[error("Tool not found: {name}")]
99    NotFound { name: String },
100    /// The tool arguments failed validation
101    #[error("Invalid arguments for tool '{name}': {reason}")]
102    InvalidArguments { name: String, reason: String },
103}
104
105impl ToolValidationError {
106    pub fn not_found(name: impl Into<String>) -> Self {
107        Self::NotFound { name: name.into() }
108    }
109    pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
110        Self::InvalidArguments {
111            name: name.into(),
112            reason: reason.into(),
113        }
114    }
115}
116
117/// Error returned by tool dispatch operations.
118#[derive(Debug, Clone, thiserror::Error)]
119pub enum ToolError {
120    /// The requested tool was not found
121    #[error("Tool not found: {name}")]
122    NotFound { name: String },
123
124    /// The tool exists but is currently unavailable
125    #[error("Tool '{name}' is currently unavailable: {reason}")]
126    Unavailable {
127        name: String,
128        reason: ToolUnavailableReason,
129    },
130
131    /// The tool arguments failed validation
132    #[error("Invalid arguments for tool '{name}': {reason}")]
133    InvalidArguments { name: String, reason: String },
134
135    /// The tool execution failed
136    #[error("Tool execution failed: {message}")]
137    ExecutionFailed { message: String },
138
139    /// The tool execution failed with structured error data for protocol surfaces.
140    #[error("Tool execution failed: {message}")]
141    ExecutionFailedWithData {
142        message: String,
143        data: serde_json::Value,
144    },
145
146    /// The tool execution timed out
147    #[error("Tool '{name}' timed out after {timeout_ms}ms")]
148    Timeout { name: String, timeout_ms: u64 },
149
150    /// Tool access was denied by policy
151    #[error("Tool '{name}' is not allowed by policy")]
152    AccessDenied { name: String },
153
154    /// A generic tool error with a message
155    #[error("{0}")]
156    Other(String),
157
158    /// Tool call must be routed externally (callback pending)
159    ///
160    /// This variant signals that a tool call cannot be handled internally
161    /// and must be routed to an external handler. The payload contains
162    /// serialized information about the pending tool call.
163    #[error("Callback pending for tool '{tool_name}'")]
164    CallbackPending {
165        tool_name: String,
166        args: serde_json::Value,
167    },
168}
169
170impl ToolError {
171    pub fn error_code(&self) -> &'static str {
172        match self {
173            Self::NotFound { .. } => "tool_not_found",
174            Self::Unavailable { .. } => "tool_unavailable",
175            Self::InvalidArguments { .. } => "invalid_arguments",
176            Self::ExecutionFailed { .. } | Self::ExecutionFailedWithData { .. } => {
177                "execution_failed"
178            }
179            Self::Timeout { .. } => "timeout",
180            Self::AccessDenied { .. } => "access_denied",
181            Self::Other(_) => "tool_error",
182            Self::CallbackPending { .. } => "callback_pending",
183        }
184    }
185
186    pub fn to_error_payload(&self) -> serde_json::Value {
187        let mut payload = serde_json::json!({
188            "error": self.error_code(),
189            "message": self.to_string(),
190        });
191        if let Some(data) = self.structured_data() {
192            payload["data"] = data;
193        }
194        payload
195    }
196
197    pub fn not_found(name: impl Into<String>) -> Self {
198        Self::NotFound { name: name.into() }
199    }
200    pub fn unavailable(name: impl Into<String>, reason: ToolUnavailableReason) -> Self {
201        Self::Unavailable {
202            name: name.into(),
203            reason,
204        }
205    }
206    pub fn invalid_arguments(name: impl Into<String>, reason: impl Into<String>) -> Self {
207        Self::InvalidArguments {
208            name: name.into(),
209            reason: reason.into(),
210        }
211    }
212    pub fn execution_failed(message: impl Into<String>) -> Self {
213        Self::ExecutionFailed {
214            message: message.into(),
215        }
216    }
217    pub fn execution_failed_with_data(message: impl Into<String>, data: serde_json::Value) -> Self {
218        Self::ExecutionFailedWithData {
219            message: message.into(),
220            data,
221        }
222    }
223    pub fn structured_data(&self) -> Option<serde_json::Value> {
224        match self {
225            Self::ExecutionFailedWithData { data, .. } => Some(data.clone()),
226            _ => None,
227        }
228    }
229    pub fn timeout(name: impl Into<String>, timeout_ms: u64) -> Self {
230        Self::Timeout {
231            name: name.into(),
232            timeout_ms,
233        }
234    }
235    pub fn access_denied(name: impl Into<String>) -> Self {
236        Self::AccessDenied { name: name.into() }
237    }
238    pub fn other(message: impl Into<String>) -> Self {
239        Self::Other(message.into())
240    }
241
242    /// Create a callback pending error for external tool routing
243    pub fn callback_pending(tool_name: impl Into<String>, args: serde_json::Value) -> Self {
244        Self::CallbackPending {
245            tool_name: tool_name.into(),
246            args,
247        }
248    }
249
250    /// Check if this is a callback pending error
251    pub fn is_callback_pending(&self) -> bool {
252        matches!(self, Self::CallbackPending { .. })
253    }
254
255    /// Extract callback pending info if this is a CallbackPending error
256    pub fn as_callback_pending(&self) -> Option<(&str, &serde_json::Value)> {
257        match self {
258            Self::CallbackPending { tool_name, args } => Some((tool_name, args)),
259            _ => None,
260        }
261    }
262}
263
264impl From<String> for ToolError {
265    fn from(s: String) -> Self {
266        Self::Other(s)
267    }
268}
269impl From<&str> for ToolError {
270    fn from(s: &str) -> Self {
271        Self::Other(s.to_string())
272    }
273}
274
275/// Errors that can occur during agent execution
276#[derive(Debug, thiserror::Error)]
277#[non_exhaustive]
278pub enum AgentError {
279    #[error("LLM error ({provider}): {message}")]
280    Llm {
281        provider: &'static str,
282        reason: LlmFailureReason,
283        message: String,
284    },
285    #[error("Storage error: {0}")]
286    StoreError(String),
287    #[error("Tool error: {0}")]
288    ToolError(String),
289    #[error("MCP error: {0}")]
290    McpError(String),
291    #[error("Session not found: {0}")]
292    SessionNotFound(SessionId),
293    #[error("Token budget exceeded: used {used}, limit {limit}")]
294    TokenBudgetExceeded { used: u64, limit: u64 },
295    #[error("Time budget exceeded: {elapsed_secs}s > {limit_secs}s")]
296    TimeBudgetExceeded { elapsed_secs: u64, limit_secs: u64 },
297    #[error("Tool call budget exceeded: {count} calls > {limit} limit")]
298    ToolCallBudgetExceeded { count: usize, limit: usize },
299    #[error("Max tokens reached on turn {turn}, partial output: {partial}")]
300    MaxTokensReached { turn: u32, partial: String },
301    #[error("Content filtered on turn {turn}")]
302    ContentFiltered { turn: u32 },
303    #[error("Max turns reached: {turns}")]
304    MaxTurnsReached { turns: u32 },
305    #[error("Run was cancelled")]
306    Cancelled,
307    #[error("Invalid state transition: {from} -> {to}")]
308    InvalidStateTransition { from: String, to: String },
309    #[error("Operation not found: {0}")]
310    OperationNotFound(String),
311    #[error("Depth limit exceeded: {depth} > {max}")]
312    DepthLimitExceeded { depth: u32, max: u32 },
313    #[error("Concurrency limit exceeded")]
314    ConcurrencyLimitExceeded,
315    #[error("Configuration error: {0}")]
316    ConfigError(String),
317    #[error("Invalid tool in access policy: {tool}")]
318    InvalidToolAccess { tool: String },
319    #[error("Internal error: {0}")]
320    InternalError(String),
321
322    /// Agent construction failed (e.g. missing API key, unknown provider).
323    #[error("Build error: {0}")]
324    BuildError(String),
325
326    /// MeerkatMachine DSL observed an auth lease in `reauth_required`
327    /// state at a CallingLlm boundary; the lease cannot proceed
328    /// until the user re-authenticates (`rkat auth login`). This is a
329    /// machine-owned terminal class (Phase 1.5-rev), distinct from
330    /// [`AgentError::InternalError`] which is for genuinely
331    /// unexpected failures.
332    #[error("Connection `{binding_key}` requires re-authentication: {message}")]
333    AuthReauthRequired {
334        binding_key: String,
335        message: String,
336    },
337
338    /// A tool call must be routed externally (callback pending)
339    #[error("Callback pending for tool '{tool_name}'")]
340    CallbackPending {
341        tool_name: String,
342        args: serde_json::Value,
343    },
344
345    /// Structured output validation failed after retries
346    #[error("Structured output validation failed after {attempts} attempts: {reason}")]
347    StructuredOutputValidationFailed {
348        attempts: u32,
349        reason: String,
350        last_output: String,
351    },
352
353    /// Invalid output schema provided
354    #[error("Invalid output schema: {0}")]
355    InvalidOutputSchema(String),
356
357    #[error("Hook '{hook_id}' denied at {point:?}: {reason_code:?} - {message}")]
358    HookDenied {
359        hook_id: HookId,
360        point: HookPoint,
361        reason_code: HookReasonCode,
362        message: String,
363        payload: Option<serde_json::Value>,
364    },
365
366    #[error("Hook '{hook_id}' timed out after {timeout_ms}ms")]
367    HookTimeout { hook_id: HookId, timeout_ms: u64 },
368
369    #[error("Hook execution failed for '{hook_id}': {reason}")]
370    HookExecutionFailed { hook_id: HookId, reason: String },
371
372    #[error("Hook configuration invalid: {reason}")]
373    HookConfigInvalid { reason: String },
374
375    /// Turn execution reached a terminal outcome classified as HardFailure.
376    #[error("Terminal failure: {outcome:?} ({cause_kind:?}): {message}")]
377    TerminalFailure {
378        outcome: crate::turn_execution_authority::TurnTerminalOutcome,
379        cause_kind: crate::turn_execution_authority::TurnTerminalCauseKind,
380        message: String,
381    },
382
383    /// The session has no pending user/tool-results boundary for `run_pending`.
384    ///
385    /// Returned when `RuntimeExecutionKind::ResumePending` is requested but the
386    /// session's last message is not `User` or `ToolResults`. The caller should
387    /// treat this as a successful no-op (no turn ran, no output produced).
388    #[error("no pending boundary for resume")]
389    NoPendingBoundary,
390}
391
392impl AgentError {
393    pub fn llm(
394        provider: &'static str,
395        reason: LlmFailureReason,
396        message: impl Into<String>,
397    ) -> Self {
398        Self::Llm {
399            provider,
400            reason,
401            message: message.into(),
402        }
403    }
404    pub fn is_graceful(&self) -> bool {
405        matches!(
406            self,
407            Self::TokenBudgetExceeded { .. }
408                | Self::TimeBudgetExceeded { .. }
409                | Self::ToolCallBudgetExceeded { .. }
410                | Self::MaxTurnsReached { .. }
411        )
412    }
413    pub fn is_rate_limited(&self) -> bool {
414        matches!(
415            self,
416            Self::Llm {
417                reason: LlmFailureReason::RateLimited { .. },
418                ..
419            }
420        )
421    }
422
423    pub fn retry_after_hint(&self) -> Option<std::time::Duration> {
424        match self {
425            Self::Llm {
426                reason: LlmFailureReason::RateLimited { retry_after },
427                ..
428            } => *retry_after,
429            _ => None,
430        }
431    }
432
433    pub fn is_recoverable(&self) -> bool {
434        match self {
435            Self::Llm { reason, .. } => match reason {
436                LlmFailureReason::RateLimited { .. } => true,
437                LlmFailureReason::NetworkTimeout { .. } => true,
438                LlmFailureReason::CallTimeout { .. } => true,
439                LlmFailureReason::ProviderError(provider_error) => provider_error.is_retryable(),
440                _ => false,
441            },
442            _ => false,
443        }
444    }
445}
446
447pub fn store_error(err: impl std::fmt::Display) -> AgentError {
448    AgentError::StoreError(store_error_message(err))
449}
450pub fn invalid_session_id(err: impl std::fmt::Display) -> AgentError {
451    AgentError::StoreError(invalid_session_id_message(err))
452}
/// Renders a storage error as its plain `Display` text, with no prefix added.
pub fn store_error_message(err: impl std::fmt::Display) -> String {
    ToString::to_string(&err)
}
/// Renders an invalid-session-id failure as `"Invalid session ID: <err>"`.
pub fn invalid_session_id_message(err: impl std::fmt::Display) -> String {
    let mut message = String::from("Invalid session ID: ");
    message.push_str(&err.to_string());
    message
}
459
/// Unit tests for the error classification helpers and `Display` output.
#[cfg(test)]
// Tests are allowed to panic/unwrap: a failure here is the test signal itself.
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
    use super::*;

    // -- Recoverability of LLM failures --

    #[test]
    fn test_network_timeout_is_recoverable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 30000 },
            "network timeout after 30s",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_call_timeout_is_recoverable() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::CallTimeout { duration_ms: 45000 },
            "call timeout after 45s",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_rate_limited_is_recoverable() {
        // Recoverable regardless of whether a retry hint is present.
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited { retry_after: None },
            "rate limited",
        );
        assert!(err.is_recoverable());
    }

    #[test]
    fn test_network_timeout_typed_mapping() {
        let reason = LlmFailureReason::NetworkTimeout { duration_ms: 5000 };
        match reason {
            LlmFailureReason::NetworkTimeout { duration_ms } => {
                assert_eq!(duration_ms, 5000);
            }
            _ => panic!("expected NetworkTimeout"),
        }
    }

    #[test]
    fn test_call_timeout_typed_mapping() {
        let reason = LlmFailureReason::CallTimeout { duration_ms: 60000 };
        match reason {
            LlmFailureReason::CallTimeout { duration_ms } => {
                assert_eq!(duration_ms, 60000);
            }
            _ => panic!("expected CallTimeout"),
        }
    }

    #[test]
    fn test_timeout_variants_are_distinct() {
        let net = LlmFailureReason::NetworkTimeout { duration_ms: 1000 };
        let call = LlmFailureReason::CallTimeout { duration_ms: 1000 };
        assert_ne!(net, call);
    }

    #[test]
    fn test_auth_error_not_recoverable() {
        let err = AgentError::llm("anthropic", LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_recoverable());
    }

    #[test]
    fn provider_error_uses_typed_retryability_for_recovery() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::ProviderError(LlmProviderError::retryable(
                LlmProviderErrorKind::ServerOverloaded,
                serde_json::json!({
                    "message": "provider overloaded"
                }),
            )),
            "provider overloaded",
        );

        assert!(err.is_recoverable());
    }

    #[test]
    fn provider_error_fails_closed_when_json_claims_retryable() {
        // The free-form details claim retryability; the typed field must win.
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::ProviderError(LlmProviderError::non_retryable(
                LlmProviderErrorKind::InvalidRequest,
                serde_json::json!({
                    "kind": "server_overloaded",
                    "retryable": true,
                    "message": "json payload must not control retryability"
                }),
            )),
            "invalid request",
        );

        assert!(!err.is_recoverable());
    }

    // -- Rate-limit helper tests (PR #156 port) --

    #[test]
    fn test_is_rate_limited_true_for_rate_limit_error() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited {
                retry_after: Some(std::time::Duration::from_secs(30)),
            },
            "rate limited",
        );
        assert!(err.is_rate_limited());
    }

    #[test]
    fn test_is_rate_limited_false_for_other_errors() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
            "timeout",
        );
        assert!(!err.is_rate_limited());

        let err = AgentError::llm("anthropic", LlmFailureReason::AuthError, "bad key");
        assert!(!err.is_rate_limited());
    }

    #[test]
    fn test_retry_after_hint_returns_duration_for_rate_limit() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::RateLimited {
                retry_after: Some(std::time::Duration::from_secs(60)),
            },
            "rate limited",
        );
        assert_eq!(
            err.retry_after_hint(),
            Some(std::time::Duration::from_secs(60))
        );
    }

    #[test]
    fn test_retry_after_hint_returns_none_for_non_rate_limit() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 5000 },
            "timeout",
        );
        assert_eq!(err.retry_after_hint(), None);
    }

    #[test]
    fn test_timeout_variants_not_graceful() {
        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::NetworkTimeout { duration_ms: 1000 },
            "timeout",
        );
        assert!(!err.is_graceful());

        let err = AgentError::llm(
            "anthropic",
            LlmFailureReason::CallTimeout { duration_ms: 1000 },
            "timeout",
        );
        assert!(!err.is_graceful());
    }

    // -- P2-6: Typed BuildError variant --

    #[test]
    fn test_build_error_variant_exists_and_carries_message() {
        let err = AgentError::BuildError("Missing API key for provider 'anthropic'".to_string());
        match &err {
            AgentError::BuildError(msg) => {
                assert!(
                    msg.contains("API key"),
                    "message should contain source text"
                );
            }
            other => panic!("expected BuildError, got: {other}"),
        }
    }

    #[test]
    fn test_build_error_is_not_recoverable() {
        let err = AgentError::BuildError("Unknown provider for model 'llama-3'".to_string());
        assert!(!err.is_recoverable(), "build errors are not recoverable");
    }

    #[test]
    fn test_build_error_is_not_graceful() {
        let err = AgentError::BuildError("Missing API key".to_string());
        assert!(!err.is_graceful(), "build errors are not graceful");
    }

    #[test]
    fn test_build_error_display() {
        let err = AgentError::BuildError("Missing API key for provider 'anthropic'".to_string());
        // Pin the exact text: the prefix and the source message must both appear.
        assert_eq!(
            err.to_string(),
            "Build error: Missing API key for provider 'anthropic'"
        );
    }

    // -- P2-7: Typed TerminalFailure outcome --

    #[test]
    fn test_terminal_failure_carries_typed_outcome() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        // TerminalFailure must carry typed enums, not Debug-formatted strings.
        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::Failed,
            cause_kind: TurnTerminalCauseKind::LlmFailure,
            message: "llm failed".to_string(),
        };
        match &err {
            AgentError::TerminalFailure {
                outcome,
                cause_kind,
                ..
            } => {
                // If this compiles, outcome/cause_kind are typed enums, not Strings.
                assert_eq!(*outcome, TurnTerminalOutcome::Failed);
                assert_eq!(*cause_kind, TurnTerminalCauseKind::LlmFailure);
            }
            other => panic!("expected TerminalFailure, got: {other}"),
        }
    }

    #[test]
    fn test_terminal_failure_display_includes_outcome() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        // Outcome and cause deliberately use differently named variants so
        // each substring check can only be satisfied by its own field.
        let err = AgentError::TerminalFailure {
            outcome: TurnTerminalOutcome::TimeBudgetExceeded,
            cause_kind: TurnTerminalCauseKind::FatalFailure,
            message: "deadline reached".to_string(),
        };
        let display = err.to_string();
        assert!(
            display.contains("TimeBudgetExceeded"),
            "display should include the outcome variant name: {display}"
        );
        assert!(
            display.contains("FatalFailure"),
            "display should include the cause kind variant name: {display}"
        );
        assert!(
            display.contains("deadline reached"),
            "display should include the message: {display}"
        );
    }

    #[test]
    fn test_terminal_failure_all_hard_failure_outcomes() {
        use crate::turn_execution_authority::{TurnTerminalCauseKind, TurnTerminalOutcome};

        // Both hard-failure outcomes should be representable.
        for outcome in [
            TurnTerminalOutcome::Failed,
            TurnTerminalOutcome::TimeBudgetExceeded,
        ] {
            let err = AgentError::TerminalFailure {
                outcome,
                cause_kind: TurnTerminalCauseKind::FatalFailure,
                message: "terminal".to_string(),
            };
            assert!(
                !err.is_graceful(),
                "TerminalFailure({outcome:?}) should not be graceful"
            );
        }
    }

    // -- ToolError payload rendering --

    #[test]
    fn tool_error_payload_includes_structured_data() {
        let err = ToolError::execution_failed_with_data("boom", serde_json::json!({ "code": 7 }));
        let payload = err.to_error_payload();
        assert_eq!(payload["error"], "execution_failed");
        assert_eq!(payload["message"], "Tool execution failed: boom");
        assert_eq!(payload["data"], serde_json::json!({ "code": 7 }));
    }

    #[test]
    fn tool_error_payload_omits_data_when_absent() {
        let payload = ToolError::timeout("search", 250).to_error_payload();
        assert_eq!(payload["error"], "timeout");
        assert_eq!(payload["message"], "Tool 'search' timed out after 250ms");
        assert!(payload.get("data").is_none());
    }
}
725                "TerminalFailure({outcome:?}) should not be graceful"
726            );
727        }
728    }
729}