Skip to main content

zeph_common/
error_taxonomy.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
//! Shared error classification enums for tool invocation failures.
//!
//! Only the pure data types ([`ErrorDomain`], [`ToolInvocationPhase`], and
//! [`ToolErrorCategory`]) and their `const` accessor methods live here.
//! The `classify_*` helper functions and executor-specific types remain in `zeph-tools`,
//! which may depend on `std::io::Error` and HTTP status codes.
9
/// High-level error domain for recovery strategy dispatch.
///
/// Groups the `ToolErrorCategory` variants into 4 domains that map to distinct
/// recovery strategies in the agent loop. Does NOT replace `ToolErrorCategory` — it
/// is a companion abstraction for coarse dispatch; `ToolErrorCategory::domain`
/// performs the category-to-domain mapping.
///
/// Serialized through serde in `snake_case` (for example `System` as `"system"`),
/// which matches the strings returned by [`ErrorDomain::label`]. The enum is
/// `#[non_exhaustive]`, so code outside this crate must keep a wildcard arm when
/// matching on it.
///
/// # Examples
///
/// ```rust
/// use zeph_common::error_taxonomy::ErrorDomain;
///
/// assert!(ErrorDomain::System.is_auto_retryable());
/// assert!(!ErrorDomain::Planning.is_auto_retryable());
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ErrorDomain {
    /// The agent selected the wrong tool or misunderstood the task.
    ///
    /// Recovery: re-plan, pick a different tool or approach.
    /// Categories: `ToolNotFound`
    Planning,

    /// The agent's output (parameters, types) was malformed.
    ///
    /// Recovery: reformat parameters using tool schema, retry once.
    /// Categories: `InvalidParameters`, `TypeMismatch`
    Reflection,

    /// External action failed due to policy or resource constraints.
    ///
    /// Recovery: inform user, suggest alternative, or skip.
    /// Categories: `PolicyBlocked`, `ConfirmationRequired`, `PermanentFailure`, `Cancelled`
    Action,

    /// Transient infrastructure failure.
    ///
    /// Recovery: automatic retry with backoff. This is the only domain for which
    /// [`ErrorDomain::is_auto_retryable`] returns `true`.
    /// Categories: `RateLimited`, `ServerError`, `NetworkError`, `Timeout`
    System,
}
48
49impl ErrorDomain {
50    /// Whether errors in this domain should trigger automatic retry.
51    #[must_use]
52    pub const fn is_auto_retryable(self) -> bool {
53        matches!(self, Self::System)
54    }
55
56    /// Whether the LLM should be asked to fix its output.
57    #[must_use]
58    pub const fn needs_llm_correction(self) -> bool {
59        matches!(self, Self::Reflection | Self::Planning)
60    }
61
62    /// Human-readable label for audit logs.
63    #[must_use]
64    pub const fn label(self) -> &'static str {
65        match self {
66            Self::Planning => "planning",
67            Self::Reflection => "reflection",
68            Self::Action => "action",
69            Self::System => "system",
70        }
71    }
72}
73
/// Invocation phase in which a tool failure occurred, per arXiv:2601.16280.
///
/// Maps Zeph's `ToolErrorCategory` variants to the 4-phase diagnostic framework:
/// Setup → `ParamHandling` → Execution → `ResultInterpretation`.
/// `ToolErrorCategory::phase` performs that mapping.
///
/// Serialized through serde in `snake_case` (for example `ParamHandling` as
/// `"param_handling"`), which matches the strings returned by
/// [`ToolInvocationPhase::label`]. The enum is `#[non_exhaustive]`, so code outside
/// this crate must keep a wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ToolInvocationPhase {
    /// Tool lookup/registration phase: was the tool name valid?
    /// Categories: `ToolNotFound`
    Setup,
    /// Parameter validation phase: were the provided arguments well-formed?
    /// Categories: `InvalidParameters`, `TypeMismatch`
    ParamHandling,
    /// Runtime execution phase: did the tool run successfully?
    /// Categories: every remaining `ToolErrorCategory` variant (policy, confirmation,
    /// permanent, cancellation, and all transient failures).
    Execution,
    /// Output parsing/interpretation phase: was the result usable?
    /// Reserved for future use — no current `ToolErrorCategory` maps here.
    ResultInterpretation,
}
92
93impl ToolInvocationPhase {
94    /// Human-readable label for audit logs.
95    #[must_use]
96    pub const fn label(self) -> &'static str {
97        match self {
98            Self::Setup => "setup",
99            Self::ParamHandling => "param_handling",
100            Self::Execution => "execution",
101            Self::ResultInterpretation => "result_interpretation",
102        }
103    }
104}
105
/// Fine-grained classification of tool invocation errors (currently 11 categories).
///
/// Each category determines retry eligibility, LLM parameter reformat path,
/// quality attribution for reputation scoring, and structured feedback content.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a wildcard
/// arm when matching on it.
///
/// NOTE(review): unlike `ErrorDomain` and `ToolInvocationPhase`, this enum carries no
/// `#[serde(rename_all = "snake_case")]`, so serde uses the variant names as written
/// (e.g. `"ToolNotFound"`) while `label()` returns `snake_case` strings — confirm the
/// difference is intentional before treating either form as a stable identifier.
///
/// # Examples
///
/// ```rust
/// use zeph_common::error_taxonomy::ToolErrorCategory;
///
/// assert!(ToolErrorCategory::RateLimited.is_retryable());
/// assert!(!ToolErrorCategory::InvalidParameters.is_retryable());
/// ```
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub enum ToolErrorCategory {
    // ── Initialization failures ──────────────────────────────────────────
    /// Tool name not found in the registry (LLM requested a non-existent tool).
    ToolNotFound,

    // ── Parameter failures ───────────────────────────────────────────────
    /// LLM provided invalid or missing parameters for the tool.
    InvalidParameters,
    /// Parameter type mismatch (e.g., string where integer expected).
    TypeMismatch,

    // ── Permission / policy failures ─────────────────────────────────────
    /// Blocked by security policy (blocklist, sandbox, trust gate).
    PolicyBlocked,
    /// Requires user confirmation before execution.
    ConfirmationRequired,

    // ── Execution failures (permanent) ───────────────────────────────────
    /// HTTP 403/404 or equivalent permanent resource rejection.
    PermanentFailure,
    /// Operation cancelled by the user.
    Cancelled,

    // ── Execution failures (transient) ───────────────────────────────────
    // These four are the only categories for which `is_retryable()` returns `true`.
    /// HTTP 429 (rate limit) or resource exhaustion.
    RateLimited,
    /// HTTP 5xx or equivalent server-side error.
    ServerError,
    /// Network connectivity failure (DNS, connection refused, reset).
    NetworkError,
    /// Operation timed out.
    Timeout,
}
154
155impl ToolErrorCategory {
156    /// Whether this error category is eligible for automatic retry with backoff.
157    #[must_use]
158    pub const fn is_retryable(self) -> bool {
159        matches!(
160            self,
161            Self::RateLimited | Self::ServerError | Self::NetworkError | Self::Timeout
162        )
163    }
164
165    /// Whether the LLM should be asked to reformat parameters and retry.
166    ///
167    /// Only `InvalidParameters` and `TypeMismatch` trigger the reformat path.
168    #[must_use]
169    pub const fn needs_parameter_reformat(self) -> bool {
170        matches!(self, Self::InvalidParameters | Self::TypeMismatch)
171    }
172
173    /// Whether this error is attributable to LLM output quality.
174    ///
175    /// Infrastructure errors (network, timeout, server, rate limit) are NOT
176    /// the model's fault and must never trigger self-reflection.
177    #[must_use]
178    pub const fn is_quality_failure(self) -> bool {
179        matches!(
180            self,
181            Self::InvalidParameters | Self::TypeMismatch | Self::ToolNotFound
182        )
183    }
184
185    /// Map to the high-level error domain for recovery dispatch.
186    #[must_use]
187    pub const fn domain(self) -> ErrorDomain {
188        match self {
189            Self::ToolNotFound => ErrorDomain::Planning,
190            Self::InvalidParameters | Self::TypeMismatch => ErrorDomain::Reflection,
191            Self::PolicyBlocked
192            | Self::ConfirmationRequired
193            | Self::PermanentFailure
194            | Self::Cancelled => ErrorDomain::Action,
195            Self::RateLimited | Self::ServerError | Self::NetworkError | Self::Timeout => {
196                ErrorDomain::System
197            }
198        }
199    }
200
201    /// Human-readable label for audit logs, TUI status indicators, and structured feedback.
202    #[must_use]
203    pub const fn label(self) -> &'static str {
204        match self {
205            Self::ToolNotFound => "tool_not_found",
206            Self::InvalidParameters => "invalid_parameters",
207            Self::TypeMismatch => "type_mismatch",
208            Self::PolicyBlocked => "policy_blocked",
209            Self::ConfirmationRequired => "confirmation_required",
210            Self::PermanentFailure => "permanent_failure",
211            Self::Cancelled => "cancelled",
212            Self::RateLimited => "rate_limited",
213            Self::ServerError => "server_error",
214            Self::NetworkError => "network_error",
215            Self::Timeout => "timeout",
216        }
217    }
218
219    /// Map to the diagnostic invocation phase per arXiv:2601.16280.
220    #[must_use]
221    pub const fn phase(self) -> ToolInvocationPhase {
222        match self {
223            Self::ToolNotFound => ToolInvocationPhase::Setup,
224            Self::InvalidParameters | Self::TypeMismatch => ToolInvocationPhase::ParamHandling,
225            Self::PolicyBlocked
226            | Self::ConfirmationRequired
227            | Self::PermanentFailure
228            | Self::Cancelled
229            | Self::RateLimited
230            | Self::ServerError
231            | Self::NetworkError
232            | Self::Timeout => ToolInvocationPhase::Execution,
233        }
234    }
235
236    /// Recovery suggestion for the LLM based on error category.
237    #[must_use]
238    pub const fn suggestion(self) -> &'static str {
239        match self {
240            Self::ToolNotFound => {
241                "Check the tool name. Use tool_definitions to see available tools."
242            }
243            Self::InvalidParameters => "Review the tool schema and provide correct parameters.",
244            Self::TypeMismatch => "Check parameter types against the tool schema.",
245            Self::PolicyBlocked => {
246                "This operation is blocked by security policy. Try an alternative approach."
247            }
248            Self::ConfirmationRequired => "This operation requires user confirmation.",
249            Self::PermanentFailure => {
250                "This resource is not available. Try an alternative approach."
251            }
252            Self::Cancelled => "Operation was cancelled by the user.",
253            Self::RateLimited => "Rate limit exceeded. The system will retry if possible.",
254            Self::ServerError => "Server error. The system will retry if possible.",
255            Self::NetworkError => "Network error. The system will retry if possible.",
256            Self::Timeout => "Operation timed out. The system will retry if possible.",
257        }
258    }
259}