Skip to main content

entelix_core/tools/
error_kind.rs

1//! `ToolErrorKind` — tool-dispatch failure category derived from
2//! [`crate::Error`] for observability and retry classification.
3//!
4//! Tool authors return `Result<Value, Error>` from `Tool::execute`;
5//! the runtime classifies the error variant into one of these
6//! seven categories so observability sinks (`AgentEvent::ToolError`),
7//! retry middleware (`RetryToolLayer`), and recovery sinks all
8//! reach the same cross-tool taxonomy.
9//!
10//! Mirrors [`crate::ProviderErrorKind`] in shape (typed enum
11//! categorising failures) but operates at a higher level — provider
12//! kinds describe transport mechanisms, tool kinds describe the
13//! semantic outcome the operator (or the model) actually cares about.
14
15use crate::error::Error;
16
17/// Cross-tool failure category.
18///
19/// Derive from [`Error`] via [`Self::classify`]. Used for retry
20/// middleware (`RetryToolLayer` retries [`Self::Transient`] /
21/// [`Self::RateLimit`]), observability sinks (operators surface the
22/// category in dashboards), and downstream recovery routing
23/// (different categories trigger different operator responses —
24/// page on `Auth`, alert on `Quota`, ignore `Validation` noise).
25#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Serialize, serde::Deserialize)]
26#[serde(rename_all = "snake_case")]
27#[non_exhaustive]
28pub enum ToolErrorKind {
29    /// Network blip, transient 5xx, generic transport failure —
30    /// safe to retry.
31    Transient,
32    /// Vendor signalled rate limiting (429 with `Retry-After` hint).
33    /// Retryable after the configured cooldown.
34    RateLimit,
35    /// Vendor signalled exhausted quota / billing cap. Retry will
36    /// not succeed until the quota resets or operator intervenes —
37    /// surface to ops, do not retry automatically.
38    Quota,
39    /// Credential rejected (401 / 403 / [`Error::Auth`]). Retry will
40    /// not succeed until credentials are rotated.
41    Auth,
42    /// Permanent vendor failure (4xx other than auth/rate/quota,
43    /// 405, 410, 422 …). The same call will fail again.
44    Permanent,
45    /// Caller-side input rejected ([`Error::InvalidRequest`],
46    /// [`Error::Serde`]) — the operator's payload does not match
47    /// the tool contract. Retry is meaningless without changing the
48    /// payload.
49    Validation,
50    /// Tool-internal bug or misconfiguration ([`Error::Config`], or
51    /// any unclassified shape). Surface to ops; retry is not
52    /// meaningful.
53    Internal,
54}
55
56impl ToolErrorKind {
57    /// Derive the category from an [`Error`].
58    ///
59    /// The mapping is intentionally exhaustive over the variants
60    /// [`Error`] surfaces today — the `_` catch-all routes to
61    /// [`Self::Internal`] so future variants stay observable until
62    /// classified explicitly. Operational variants
63    /// ([`Error::Cancelled`], [`Error::DeadlineExceeded`],
64    /// [`Error::Interrupted`], [`Error::ModelRetry`]) flow through
65    /// `Internal` because they are agent-runtime control signals,
66    /// not tool failures — call sites that observe them should not
67    /// reach this classifier in the first place.
68    #[must_use]
69    pub fn classify(error: &Error) -> Self {
70        use crate::error::ProviderErrorKind;
71        match error {
72            // Already-classified terminal failures (raised by
73            // `ToolErrorPolicyLayer` or propagated from a sub-agent)
74            // carry their classification on the variant itself.
75            // Surface the leaf kind so observability (`AgentEvent::
76            // ToolError::kind`, OTel `entelix.tool.error_kind`,
77            // `DefaultRetryClassifier` introspection) sees the
78            // operator-actionable category regardless of how many
79            // layers re-encountered the same wrapped error. The
80            // `ToolErrorPolicyLayer` itself passes the already-
81            // wrapped variant through unchanged via its double-wrap
82            // guard, so the parent's policy does not re-wrap.
83            Error::ToolErrorTerminal { kind, .. } => *kind,
84            Error::Provider {
85                kind: ProviderErrorKind::Network | ProviderErrorKind::Tls | ProviderErrorKind::Dns,
86                ..
87            } => Self::Transient,
88            Error::Provider {
89                kind: ProviderErrorKind::Http(429),
90                retry_after,
91                ..
92            } => {
93                // Vendor distinguishes 429-with-Retry-After (transient
94                // back-pressure) from 429-without (often quota
95                // exhaustion). The hint presence is the cue.
96                if retry_after.is_some() {
97                    Self::RateLimit
98                } else {
99                    Self::Quota
100                }
101            }
102            Error::Provider {
103                kind: ProviderErrorKind::Http(status),
104                ..
105            } => {
106                if *status == 401 || *status == 403 {
107                    Self::Auth
108                } else if (500..600).contains(status) || *status == 408 || *status == 425 {
109                    Self::Transient
110                } else {
111                    Self::Permanent
112                }
113            }
114            Error::Auth(_) => Self::Auth,
115            Error::UsageLimitExceeded(_) => Self::Quota,
116            Error::InvalidRequest(_) | Error::Serde(_) => Self::Validation,
117            // Operational variants (Cancelled, DeadlineExceeded,
118            // Interrupted, ModelRetry) and any future shape route
119            // here together with Config — none of them are tool
120            // failures the operator can act on per-category.
121            _ => Self::Internal,
122        }
123    }
124
125    /// Whether the runtime should attempt the tool call again.
126    ///
127    /// `Transient` and `RateLimit` are retryable; everything else
128    /// is a surface-and-stop signal. `RetryToolLayer` consults this
129    /// via the underlying `RetryClassifier` (which can be
130    /// overridden per deployment) — operators that want different
131    /// retry policy install a custom classifier rather than mutating
132    /// this method.
133    #[must_use]
134    pub const fn is_retryable(self) -> bool {
135        matches!(self, Self::Transient | Self::RateLimit)
136    }
137
138    /// Stable snake-case identifier surfaced through OTel
139    /// (`entelix.tool.error_kind`), structured logs, and audit
140    /// `GraphEvent` serialisation. Patch-version stable — renaming
141    /// a value is a breaking change for downstream consumers
142    /// keying off the string.
143    #[must_use]
144    pub const fn wire_id(self) -> &'static str {
145        match self {
146            Self::Transient => "transient",
147            Self::RateLimit => "rate_limit",
148            Self::Quota => "quota",
149            Self::Auth => "auth",
150            Self::Permanent => "permanent",
151            Self::Validation => "validation",
152            Self::Internal => "internal",
153        }
154    }
155
156    /// Stable bit position used by [`ToolErrorKindSet`] for the
157    /// bitset representation. Adding a new variant requires updating
158    /// this match — the same-crate exhaustiveness check forces it
159    /// (`#[non_exhaustive]` is for external matchers only). The
160    /// `bit_indices_are_unique_and_fit_in_set_width` regression test
161    /// then asserts the new bit position is unique and fits in
162    /// [`ToolErrorKindSet`]'s backing integer; widening the integer
163    /// in lockstep with the taxonomy is the only correct response if
164    /// the test ever fails.
165    #[must_use]
166    pub(crate) const fn bit_index(self) -> u32 {
167        match self {
168            Self::Transient => 0,
169            Self::RateLimit => 1,
170            Self::Quota => 2,
171            Self::Auth => 3,
172            Self::Permanent => 4,
173            Self::Validation => 5,
174            Self::Internal => 6,
175        }
176    }
177}
178
179impl std::fmt::Display for ToolErrorKind {
180    /// Lowercase snake_case form matching [`Self::wire_id`] — the
181    /// stable operator-channel rendering. Operator dashboards, log
182    /// lines, and the `Error::ToolErrorTerminal` Display all read
183    /// this format so the same kind reads identically across every
184    /// surface.
185    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
186        f.write_str(self.wire_id())
187    }
188}
189
/// A small set of [`ToolErrorKind`] values stored as a fixed-width
/// integer bitset. Each variant owns exactly one bit, at a stable
/// position fixed by the enum taxonomy.
///
/// The set is defined beside [`ToolErrorKind`] — not in a separate
/// `set` module — so that the enum and its bit layout are maintained
/// together. The `bit_indices_are_unique_and_fit_in_set_width`
/// regression test in this file relies on a same-crate exhaustive
/// match to push every new variant through a uniqueness and width
/// check: a variant whose `bit_index` does not fit the set width
/// fails the test, and the fix is to widen the backing integer in
/// lockstep with the taxonomy.
///
/// All constructors are `const`, so operator defaults and unit tests
/// can assemble a set at compile time:
///
/// ```
/// use entelix_core::tools::{ToolErrorKind, ToolErrorKindSet};
/// const SAFE: ToolErrorKindSet = ToolErrorKindSet::empty()
///     .with(ToolErrorKind::Auth)
///     .with(ToolErrorKind::Quota)
///     .with(ToolErrorKind::Permanent);
/// assert!(SAFE.contains(ToolErrorKind::Auth));
/// assert!(!SAFE.contains(ToolErrorKind::Transient));
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub struct ToolErrorKindSet(u16);
217
218impl ToolErrorKindSet {
219    /// Width of the underlying integer in bits — the maximum number
220    /// of [`ToolErrorKind`] variants the set can hold. Used by the
221    /// regression test that pins variant count against bitset
222    /// capacity.
223    #[cfg(test)]
224    const CAPACITY_BITS: u32 = u16::BITS;
225
226    /// Empty set — no kinds.
227    #[must_use]
228    pub const fn empty() -> Self {
229        Self(0)
230    }
231
232    /// Single-kind set — useful at literal call sites.
233    #[must_use]
234    pub const fn singleton(kind: ToolErrorKind) -> Self {
235        Self(1u16 << kind.bit_index())
236    }
237
238    /// Insert `kind` into the set, returning the updated set.
239    #[must_use]
240    pub const fn with(self, kind: ToolErrorKind) -> Self {
241        Self(self.0 | (1u16 << kind.bit_index()))
242    }
243
244    /// Remove `kind` from the set, returning the updated set.
245    #[must_use]
246    pub const fn without(self, kind: ToolErrorKind) -> Self {
247        Self(self.0 & !(1u16 << kind.bit_index()))
248    }
249
250    /// Union with another set, returning the updated set.
251    #[must_use]
252    pub const fn union(self, other: Self) -> Self {
253        Self(self.0 | other.0)
254    }
255
256    /// Whether `kind` is in the set.
257    #[must_use]
258    pub const fn contains(self, kind: ToolErrorKind) -> bool {
259        (self.0 >> kind.bit_index()) & 1 == 1
260    }
261
262    /// Whether the set has no kinds.
263    #[must_use]
264    pub const fn is_empty(self) -> bool {
265        self.0 == 0
266    }
267}
268
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Shorthand for the classifier call every test makes.
    fn kind_of(err: &Error) -> ToolErrorKind {
        ToolErrorKind::classify(err)
    }

    #[test]
    fn provider_network_classifies_as_transient() {
        let refused = Error::provider_network("connect refused");
        assert_eq!(kind_of(&refused), ToolErrorKind::Transient);
        assert!(kind_of(&refused).is_retryable());
    }

    #[test]
    fn provider_dns_classifies_as_transient() {
        let unresolved = Error::provider_dns("no such host");
        assert_eq!(kind_of(&unresolved), ToolErrorKind::Transient);
    }

    #[test]
    fn provider_5xx_classifies_as_transient() {
        let unavailable = Error::provider_http(503, "down");
        assert_eq!(kind_of(&unavailable), ToolErrorKind::Transient);

        let bad_gateway = Error::provider_http(502, "bad gateway");
        assert_eq!(kind_of(&bad_gateway), ToolErrorKind::Transient);
    }

    #[test]
    fn http_408_and_425_classify_as_transient() {
        // Request Timeout (408) and Too Early (425) are both
        // retryable per spec semantics.
        let timed_out = Error::provider_http(408, "timeout");
        assert_eq!(kind_of(&timed_out), ToolErrorKind::Transient);

        let too_early = Error::provider_http(425, "too early");
        assert_eq!(kind_of(&too_early), ToolErrorKind::Transient);
    }

    #[test]
    fn http_429_with_retry_after_classifies_as_rate_limit() {
        let throttled =
            Error::provider_http(429, "slow down").with_retry_after(Duration::from_secs(5));
        assert_eq!(kind_of(&throttled), ToolErrorKind::RateLimit);
        assert!(kind_of(&throttled).is_retryable());
    }

    #[test]
    fn http_429_without_retry_after_classifies_as_quota() {
        // A vendor reporting quota exhaustion typically leaves out
        // `Retry-After`: the cooldown is a billing cycle rather than
        // a request window.
        let capped = Error::provider_http(429, "monthly cap reached");
        assert_eq!(kind_of(&capped), ToolErrorKind::Quota);
        assert!(!kind_of(&capped).is_retryable());
    }

    #[test]
    fn http_401_403_classify_as_auth() {
        let unauthorized = Error::provider_http(401, "unauthorized");
        assert_eq!(kind_of(&unauthorized), ToolErrorKind::Auth);

        let forbidden = Error::provider_http(403, "forbidden");
        assert_eq!(kind_of(&forbidden), ToolErrorKind::Auth);
        assert!(!kind_of(&forbidden).is_retryable());
    }

    #[test]
    fn http_4xx_other_classifies_as_permanent() {
        let missing = Error::provider_http(404, "not found");
        assert_eq!(kind_of(&missing), ToolErrorKind::Permanent);

        let unprocessable = Error::provider_http(422, "unprocessable");
        assert_eq!(kind_of(&unprocessable), ToolErrorKind::Permanent);
        assert!(!kind_of(&unprocessable).is_retryable());
    }

    #[test]
    fn invalid_request_and_serde_classify_as_validation() {
        let rejected = Error::invalid_request("bad input");
        assert_eq!(kind_of(&rejected), ToolErrorKind::Validation);

        let parse_failure: serde_json::Error =
            serde_json::from_str::<i32>("not-a-number").unwrap_err();
        let converted: Error = parse_failure.into();
        assert_eq!(kind_of(&converted), ToolErrorKind::Validation);
    }

    #[test]
    fn config_classifies_as_internal() {
        let misconfigured = Error::config("misconfigured");
        assert_eq!(kind_of(&misconfigured), ToolErrorKind::Internal);
    }

    #[test]
    fn tool_error_terminal_unwraps_to_inner_kind() {
        // `ToolErrorTerminal` wraps a routing decision; it is not a
        // category of its own. Classification reads the kind stored
        // on the wrapper, so a parent layer (the sub-agent
        // propagation case) sees the original kind and can evaluate
        // it against its own policy.
        let leaf = Error::provider_http(401, "unauthorized");
        let once = Error::tool_error_terminal(ToolErrorKind::Auth, "my_tool", leaf);
        assert_eq!(kind_of(&once), ToolErrorKind::Auth);

        // Wrapped a second time (a parent escalating a sub-agent's
        // already-terminal failure): the leaf kind still comes out.
        let twice = Error::tool_error_terminal(ToolErrorKind::Auth, "parent_tool", once);
        assert_eq!(kind_of(&twice), ToolErrorKind::Auth);
    }

    #[test]
    fn bit_indices_are_unique_and_fit_in_set_width() {
        // Drift guard for the bitset layout.
        //
        // `dispatch_bit_index` matches exhaustively from inside the
        // crate, so adding a variant to `ToolErrorKind` without
        // extending its arm is a compile error, and every listed
        // variant is shown to have a `bit_index`.
        //
        // The loop then asserts, for each entry of `every_variant`,
        // that the index fits the set width and has not been seen
        // before. Note that the array is maintained by hand: extend
        // it together with the match arm so the assertions keep
        // covering the whole taxonomy.
        fn dispatch_bit_index(k: ToolErrorKind) -> u32 {
            match k {
                ToolErrorKind::Transient
                | ToolErrorKind::RateLimit
                | ToolErrorKind::Quota
                | ToolErrorKind::Auth
                | ToolErrorKind::Permanent
                | ToolErrorKind::Validation
                | ToolErrorKind::Internal => k.bit_index(),
            }
        }

        let every_variant = [
            ToolErrorKind::Transient,
            ToolErrorKind::RateLimit,
            ToolErrorKind::Quota,
            ToolErrorKind::Auth,
            ToolErrorKind::Permanent,
            ToolErrorKind::Validation,
            ToolErrorKind::Internal,
        ];

        let mut seen = std::collections::HashSet::with_capacity(every_variant.len());
        for k in every_variant {
            let bi = dispatch_bit_index(k);
            assert!(
                bi < ToolErrorKindSet::CAPACITY_BITS,
                "{k:?}.bit_index() = {bi} exceeds ToolErrorKindSet capacity \
                 ({cap} bits) — widen the backing integer in lockstep",
                cap = ToolErrorKindSet::CAPACITY_BITS,
            );
            let first_sighting = seen.insert(bi);
            assert!(first_sighting, "duplicate bit_index {bi} for {k:?}");
        }
    }

    #[test]
    fn tool_error_kind_set_const_construction() {
        const SET: ToolErrorKindSet = ToolErrorKindSet::empty()
            .with(ToolErrorKind::Auth)
            .with(ToolErrorKind::Quota)
            .with(ToolErrorKind::Permanent);

        // Members that were inserted.
        assert!(SET.contains(ToolErrorKind::Auth));
        assert!(SET.contains(ToolErrorKind::Quota));
        assert!(SET.contains(ToolErrorKind::Permanent));

        // Members that were never inserted.
        assert!(!SET.contains(ToolErrorKind::Transient));
        assert!(!SET.contains(ToolErrorKind::Internal));

        assert!(!SET.is_empty());
        assert!(ToolErrorKindSet::empty().is_empty());
    }

    #[test]
    fn tool_error_kind_set_without_and_union() {
        let auth_only = ToolErrorKindSet::singleton(ToolErrorKind::Auth);
        let quota_only = ToolErrorKindSet::singleton(ToolErrorKind::Quota);

        let merged = auth_only.union(quota_only);
        assert!(merged.contains(ToolErrorKind::Auth));
        assert!(merged.contains(ToolErrorKind::Quota));

        let trimmed = merged.without(ToolErrorKind::Auth);
        assert!(!trimmed.contains(ToolErrorKind::Auth));
        assert!(trimmed.contains(ToolErrorKind::Quota));
    }

    #[test]
    fn usage_limit_exceeded_classifies_as_quota() {
        use crate::run_budget::UsageLimitBreach;

        let breach = UsageLimitBreach::Requests {
            limit: 10,
            observed: 11,
        };
        let over_budget = Error::UsageLimitExceeded(breach);
        assert_eq!(kind_of(&over_budget), ToolErrorKind::Quota);
    }
}