zeph_common/error_taxonomy.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
//! Shared error classification enums for tool invocation failures.
//!
//! Only the pure data types ([`ToolErrorCategory`], [`ErrorDomain`], and
//! [`ToolInvocationPhase`]) live here.
//! The `classify_*` helper functions and executor-specific types remain in `zeph-tools`,
//! which may depend on `std::io::Error` and HTTP status codes.
9
10/// High-level error domain for recovery strategy dispatch.
11///
12/// Groups the `ToolErrorCategory` variants into 4 domains that map to distinct
13/// recovery strategies in the agent loop. Does NOT replace `ToolErrorCategory` — it
14/// is a companion abstraction for coarse dispatch.
15///
16/// # Examples
17///
18/// ```rust
19/// use zeph_common::error_taxonomy::ErrorDomain;
20///
21/// assert!(ErrorDomain::System.is_auto_retryable());
22/// assert!(!ErrorDomain::Planning.is_auto_retryable());
23/// ```
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum ErrorDomain {
27 /// The agent selected the wrong tool or misunderstood the task.
28 /// Recovery: re-plan, pick a different tool or approach.
29 /// Categories: `ToolNotFound`
30 Planning,
31
32 /// The agent's output (parameters, types) was malformed.
33 /// Recovery: reformat parameters using tool schema, retry once.
34 /// Categories: `InvalidParameters`, `TypeMismatch`
35 Reflection,
36
37 /// External action failed due to policy or resource constraints.
38 /// Recovery: inform user, suggest alternative, or skip.
39 /// Categories: `PolicyBlocked`, `ConfirmationRequired`, `PermanentFailure`, `Cancelled`
40 Action,
41
42 /// Transient infrastructure failure.
43 /// Recovery: automatic retry with backoff.
44 /// Categories: `RateLimited`, `ServerError`, `NetworkError`, `Timeout`
45 System,
46}
47
48impl ErrorDomain {
49 /// Whether errors in this domain should trigger automatic retry.
50 #[must_use]
51 pub fn is_auto_retryable(self) -> bool {
52 matches!(self, Self::System)
53 }
54
55 /// Whether the LLM should be asked to fix its output.
56 #[must_use]
57 pub fn needs_llm_correction(self) -> bool {
58 matches!(self, Self::Reflection | Self::Planning)
59 }
60
61 /// Human-readable label for audit logs.
62 #[must_use]
63 pub fn label(self) -> &'static str {
64 match self {
65 Self::Planning => "planning",
66 Self::Reflection => "reflection",
67 Self::Action => "action",
68 Self::System => "system",
69 }
70 }
71}
72
73/// Invocation phase in which a tool failure occurred, per arXiv:2601.16280.
74///
75/// Maps Zeph's `ToolErrorCategory` variants to the 4-phase diagnostic framework:
76/// Setup → `ParamHandling` → Execution → `ResultInterpretation`.
77#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
78#[serde(rename_all = "snake_case")]
79pub enum ToolInvocationPhase {
80 /// Tool lookup/registration phase: was the tool name valid?
81 Setup,
82 /// Parameter validation phase: were the provided arguments well-formed?
83 ParamHandling,
84 /// Runtime execution phase: did the tool run successfully?
85 Execution,
86 /// Output parsing/interpretation phase: was the result usable?
87 /// Reserved for future use — no current `ToolErrorCategory` maps here.
88 ResultInterpretation,
89}
90
91impl ToolInvocationPhase {
92 /// Human-readable label for audit logs.
93 #[must_use]
94 pub fn label(self) -> &'static str {
95 match self {
96 Self::Setup => "setup",
97 Self::ParamHandling => "param_handling",
98 Self::Execution => "execution",
99 Self::ResultInterpretation => "result_interpretation",
100 }
101 }
102}
103
104/// Fine-grained 12-category classification of tool invocation errors.
105///
106/// Each category determines retry eligibility, LLM parameter reformat path,
107/// quality attribution for reputation scoring, and structured feedback content.
108///
109/// # Examples
110///
111/// ```rust
112/// use zeph_common::error_taxonomy::ToolErrorCategory;
113///
114/// assert!(ToolErrorCategory::RateLimited.is_retryable());
115/// assert!(!ToolErrorCategory::InvalidParameters.is_retryable());
116/// ```
117#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
118pub enum ToolErrorCategory {
119 // ── Initialization failures ──────────────────────────────────────────
120 /// Tool name not found in the registry (LLM requested a non-existent tool).
121 ToolNotFound,
122
123 // ── Parameter failures ───────────────────────────────────────────────
124 /// LLM provided invalid or missing parameters for the tool.
125 InvalidParameters,
126 /// Parameter type mismatch (e.g., string where integer expected).
127 TypeMismatch,
128
129 // ── Permission / policy failures ─────────────────────────────────────
130 /// Blocked by security policy (blocklist, sandbox, trust gate).
131 PolicyBlocked,
132 /// Requires user confirmation before execution.
133 ConfirmationRequired,
134
135 // ── Execution failures (permanent) ───────────────────────────────────
136 /// HTTP 403/404 or equivalent permanent resource rejection.
137 PermanentFailure,
138 /// Operation cancelled by the user.
139 Cancelled,
140
141 // ── Execution failures (transient) ───────────────────────────────────
142 /// HTTP 429 (rate limit) or resource exhaustion.
143 RateLimited,
144 /// HTTP 5xx or equivalent server-side error.
145 ServerError,
146 /// Network connectivity failure (DNS, connection refused, reset).
147 NetworkError,
148 /// Operation timed out.
149 Timeout,
150}
151
152impl ToolErrorCategory {
153 /// Whether this error category is eligible for automatic retry with backoff.
154 #[must_use]
155 pub fn is_retryable(self) -> bool {
156 matches!(
157 self,
158 Self::RateLimited | Self::ServerError | Self::NetworkError | Self::Timeout
159 )
160 }
161
162 /// Whether the LLM should be asked to reformat parameters and retry.
163 ///
164 /// Only `InvalidParameters` and `TypeMismatch` trigger the reformat path.
165 #[must_use]
166 pub fn needs_parameter_reformat(self) -> bool {
167 matches!(self, Self::InvalidParameters | Self::TypeMismatch)
168 }
169
170 /// Whether this error is attributable to LLM output quality.
171 ///
172 /// Infrastructure errors (network, timeout, server, rate limit) are NOT
173 /// the model's fault and must never trigger self-reflection.
174 #[must_use]
175 pub fn is_quality_failure(self) -> bool {
176 matches!(
177 self,
178 Self::InvalidParameters | Self::TypeMismatch | Self::ToolNotFound
179 )
180 }
181
182 /// Map to the high-level error domain for recovery dispatch.
183 #[must_use]
184 pub fn domain(self) -> ErrorDomain {
185 match self {
186 Self::ToolNotFound => ErrorDomain::Planning,
187 Self::InvalidParameters | Self::TypeMismatch => ErrorDomain::Reflection,
188 Self::PolicyBlocked
189 | Self::ConfirmationRequired
190 | Self::PermanentFailure
191 | Self::Cancelled => ErrorDomain::Action,
192 Self::RateLimited | Self::ServerError | Self::NetworkError | Self::Timeout => {
193 ErrorDomain::System
194 }
195 }
196 }
197
198 /// Human-readable label for audit logs, TUI status indicators, and structured feedback.
199 #[must_use]
200 pub fn label(self) -> &'static str {
201 match self {
202 Self::ToolNotFound => "tool_not_found",
203 Self::InvalidParameters => "invalid_parameters",
204 Self::TypeMismatch => "type_mismatch",
205 Self::PolicyBlocked => "policy_blocked",
206 Self::ConfirmationRequired => "confirmation_required",
207 Self::PermanentFailure => "permanent_failure",
208 Self::Cancelled => "cancelled",
209 Self::RateLimited => "rate_limited",
210 Self::ServerError => "server_error",
211 Self::NetworkError => "network_error",
212 Self::Timeout => "timeout",
213 }
214 }
215
216 /// Map to the diagnostic invocation phase per arXiv:2601.16280.
217 #[must_use]
218 pub fn phase(self) -> ToolInvocationPhase {
219 match self {
220 Self::ToolNotFound => ToolInvocationPhase::Setup,
221 Self::InvalidParameters | Self::TypeMismatch => ToolInvocationPhase::ParamHandling,
222 Self::PolicyBlocked
223 | Self::ConfirmationRequired
224 | Self::PermanentFailure
225 | Self::Cancelled
226 | Self::RateLimited
227 | Self::ServerError
228 | Self::NetworkError
229 | Self::Timeout => ToolInvocationPhase::Execution,
230 }
231 }
232
233 /// Recovery suggestion for the LLM based on error category.
234 #[must_use]
235 pub fn suggestion(self) -> &'static str {
236 match self {
237 Self::ToolNotFound => {
238 "Check the tool name. Use tool_definitions to see available tools."
239 }
240 Self::InvalidParameters => "Review the tool schema and provide correct parameters.",
241 Self::TypeMismatch => "Check parameter types against the tool schema.",
242 Self::PolicyBlocked => {
243 "This operation is blocked by security policy. Try an alternative approach."
244 }
245 Self::ConfirmationRequired => "This operation requires user confirmation.",
246 Self::PermanentFailure => {
247 "This resource is not available. Try an alternative approach."
248 }
249 Self::Cancelled => "Operation was cancelled by the user.",
250 Self::RateLimited => "Rate limit exceeded. The system will retry if possible.",
251 Self::ServerError => "Server error. The system will retry if possible.",
252 Self::NetworkError => "Network error. The system will retry if possible.",
253 Self::Timeout => "Operation timed out. The system will retry if possible.",
254 }
255 }
256}