Skip to main content

zeph_common/
error_taxonomy.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Shared error classification enums for tool invocation failures.
5//!
//! Only the pure data types ([`ToolErrorCategory`], [`ErrorDomain`], and
//! [`ToolInvocationPhase`]) live here.
7//! The `classify_*` helper functions and executor-specific types remain in `zeph-tools`,
8//! which may depend on `std::io::Error` and HTTP status codes.
9
/// High-level error domain for recovery strategy dispatch.
///
/// Groups the [`ToolErrorCategory`] variants into 4 domains that map to distinct
/// recovery strategies in the agent loop. Does NOT replace `ToolErrorCategory` — it
/// is a companion abstraction for coarse dispatch.
///
/// The mapping from category to domain is implemented by `ToolErrorCategory::domain`;
/// the "Categories" lists on each variant below mirror that mapping.
///
/// Variant names are renamed to `snake_case` for serde (de)serialization
/// (see the `rename_all` attribute on this enum).
///
/// # Examples
///
/// ```rust
/// use zeph_common::error_taxonomy::ErrorDomain;
///
/// assert!(ErrorDomain::System.is_auto_retryable());
/// assert!(!ErrorDomain::Planning.is_auto_retryable());
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ErrorDomain {
    /// The agent selected the wrong tool or misunderstood the task.
    ///
    /// Recovery: re-plan, pick a different tool or approach.
    ///
    /// Categories: `ToolNotFound`
    Planning,

    /// The agent's output (parameters, types) was malformed.
    ///
    /// Recovery: reformat parameters using tool schema, retry once.
    ///
    /// Categories: `InvalidParameters`, `TypeMismatch`
    Reflection,

    /// External action failed due to policy or resource constraints.
    ///
    /// Recovery: inform user, suggest alternative, or skip.
    ///
    /// Categories: `PolicyBlocked`, `ConfirmationRequired`, `PermanentFailure`, `Cancelled`
    Action,

    /// Transient infrastructure failure.
    ///
    /// Recovery: automatic retry with backoff. This is the only domain for which
    /// `is_auto_retryable` returns `true`.
    ///
    /// Categories: `RateLimited`, `ServerError`, `NetworkError`, `Timeout`
    System,
}
47
48impl ErrorDomain {
49    /// Whether errors in this domain should trigger automatic retry.
50    #[must_use]
51    pub const fn is_auto_retryable(self) -> bool {
52        matches!(self, Self::System)
53    }
54
55    /// Whether the LLM should be asked to fix its output.
56    #[must_use]
57    pub const fn needs_llm_correction(self) -> bool {
58        matches!(self, Self::Reflection | Self::Planning)
59    }
60
61    /// Human-readable label for audit logs.
62    #[must_use]
63    pub const fn label(self) -> &'static str {
64        match self {
65            Self::Planning => "planning",
66            Self::Reflection => "reflection",
67            Self::Action => "action",
68            Self::System => "system",
69        }
70    }
71}
72
/// Invocation phase in which a tool failure occurred, per arXiv:2601.16280.
///
/// Maps Zeph's `ToolErrorCategory` variants to the 4-phase diagnostic framework:
/// Setup → `ParamHandling` → Execution → `ResultInterpretation`.
///
/// The category-to-phase mapping itself is implemented by `ToolErrorCategory::phase`.
///
/// Variant names are renamed to `snake_case` for serde (de)serialization
/// (see the `rename_all` attribute on this enum).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolInvocationPhase {
    /// Tool lookup/registration phase: was the tool name valid?
    Setup,
    /// Parameter validation phase: were the provided arguments well-formed?
    ParamHandling,
    /// Runtime execution phase: did the tool run successfully?
    Execution,
    /// Output parsing/interpretation phase: was the result usable?
    ///
    /// Reserved for future use — no current `ToolErrorCategory` maps here.
    ResultInterpretation,
}
90
91impl ToolInvocationPhase {
92    /// Human-readable label for audit logs.
93    #[must_use]
94    pub const fn label(self) -> &'static str {
95        match self {
96            Self::Setup => "setup",
97            Self::ParamHandling => "param_handling",
98            Self::Execution => "execution",
99            Self::ResultInterpretation => "result_interpretation",
100        }
101    }
102}
103
/// Fine-grained classification of tool invocation errors (11 categories at present).
///
/// Each category determines retry eligibility, LLM parameter reformat path,
/// quality attribution for reputation scoring, and structured feedback content.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must include a
/// wildcard arm when matching on it.
///
/// # Examples
///
/// ```rust
/// use zeph_common::error_taxonomy::ToolErrorCategory;
///
/// assert!(ToolErrorCategory::RateLimited.is_retryable());
/// assert!(!ToolErrorCategory::InvalidParameters.is_retryable());
/// ```
// NOTE(review): unlike `ErrorDomain` and `ToolInvocationPhase`, this enum carries no
// `#[serde(rename_all = "snake_case")]`, so serde uses the variant identifiers as
// written (e.g. `ToolNotFound`) while `label()` returns snake_case. Confirm this is
// intentional before relying on either form as a wire format.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub enum ToolErrorCategory {
    // ── Initialization failures ──────────────────────────────────────────
    /// Tool name not found in the registry (LLM requested a non-existent tool).
    ToolNotFound,

    // ── Parameter failures ───────────────────────────────────────────────
    /// LLM provided invalid or missing parameters for the tool.
    InvalidParameters,
    /// Parameter type mismatch (e.g., string where integer expected).
    TypeMismatch,

    // ── Permission / policy failures ─────────────────────────────────────
    /// Blocked by security policy (blocklist, sandbox, trust gate).
    PolicyBlocked,
    /// Requires user confirmation before execution.
    ConfirmationRequired,

    // ── Execution failures (permanent) ───────────────────────────────────
    /// HTTP 403/404 or equivalent permanent resource rejection.
    PermanentFailure,
    /// Operation cancelled by the user.
    Cancelled,

    // ── Execution failures (transient) ───────────────────────────────────
    /// HTTP 429 (rate limit) or resource exhaustion.
    RateLimited,
    /// HTTP 5xx or equivalent server-side error.
    ServerError,
    /// Network connectivity failure (DNS, connection refused, reset).
    NetworkError,
    /// Operation timed out.
    Timeout,
}
152
153impl ToolErrorCategory {
154    /// Whether this error category is eligible for automatic retry with backoff.
155    #[must_use]
156    pub const fn is_retryable(self) -> bool {
157        matches!(
158            self,
159            Self::RateLimited | Self::ServerError | Self::NetworkError | Self::Timeout
160        )
161    }
162
163    /// Whether the LLM should be asked to reformat parameters and retry.
164    ///
165    /// Only `InvalidParameters` and `TypeMismatch` trigger the reformat path.
166    #[must_use]
167    pub const fn needs_parameter_reformat(self) -> bool {
168        matches!(self, Self::InvalidParameters | Self::TypeMismatch)
169    }
170
171    /// Whether this error is attributable to LLM output quality.
172    ///
173    /// Infrastructure errors (network, timeout, server, rate limit) are NOT
174    /// the model's fault and must never trigger self-reflection.
175    #[must_use]
176    pub const fn is_quality_failure(self) -> bool {
177        matches!(
178            self,
179            Self::InvalidParameters | Self::TypeMismatch | Self::ToolNotFound
180        )
181    }
182
183    /// Map to the high-level error domain for recovery dispatch.
184    #[must_use]
185    pub const fn domain(self) -> ErrorDomain {
186        match self {
187            Self::ToolNotFound => ErrorDomain::Planning,
188            Self::InvalidParameters | Self::TypeMismatch => ErrorDomain::Reflection,
189            Self::PolicyBlocked
190            | Self::ConfirmationRequired
191            | Self::PermanentFailure
192            | Self::Cancelled => ErrorDomain::Action,
193            Self::RateLimited | Self::ServerError | Self::NetworkError | Self::Timeout => {
194                ErrorDomain::System
195            }
196        }
197    }
198
199    /// Human-readable label for audit logs, TUI status indicators, and structured feedback.
200    #[must_use]
201    pub const fn label(self) -> &'static str {
202        match self {
203            Self::ToolNotFound => "tool_not_found",
204            Self::InvalidParameters => "invalid_parameters",
205            Self::TypeMismatch => "type_mismatch",
206            Self::PolicyBlocked => "policy_blocked",
207            Self::ConfirmationRequired => "confirmation_required",
208            Self::PermanentFailure => "permanent_failure",
209            Self::Cancelled => "cancelled",
210            Self::RateLimited => "rate_limited",
211            Self::ServerError => "server_error",
212            Self::NetworkError => "network_error",
213            Self::Timeout => "timeout",
214        }
215    }
216
217    /// Map to the diagnostic invocation phase per arXiv:2601.16280.
218    #[must_use]
219    pub const fn phase(self) -> ToolInvocationPhase {
220        match self {
221            Self::ToolNotFound => ToolInvocationPhase::Setup,
222            Self::InvalidParameters | Self::TypeMismatch => ToolInvocationPhase::ParamHandling,
223            Self::PolicyBlocked
224            | Self::ConfirmationRequired
225            | Self::PermanentFailure
226            | Self::Cancelled
227            | Self::RateLimited
228            | Self::ServerError
229            | Self::NetworkError
230            | Self::Timeout => ToolInvocationPhase::Execution,
231        }
232    }
233
234    /// Recovery suggestion for the LLM based on error category.
235    #[must_use]
236    pub const fn suggestion(self) -> &'static str {
237        match self {
238            Self::ToolNotFound => {
239                "Check the tool name. Use tool_definitions to see available tools."
240            }
241            Self::InvalidParameters => "Review the tool schema and provide correct parameters.",
242            Self::TypeMismatch => "Check parameter types against the tool schema.",
243            Self::PolicyBlocked => {
244                "This operation is blocked by security policy. Try an alternative approach."
245            }
246            Self::ConfirmationRequired => "This operation requires user confirmation.",
247            Self::PermanentFailure => {
248                "This resource is not available. Try an alternative approach."
249            }
250            Self::Cancelled => "Operation was cancelled by the user.",
251            Self::RateLimited => "Rate limit exceeded. The system will retry if possible.",
252            Self::ServerError => "Server error. The system will retry if possible.",
253            Self::NetworkError => "Network error. The system will retry if possible.",
254            Self::Timeout => "Operation timed out. The system will retry if possible.",
255        }
256    }
257}