entelix_core/tools/error_kind.rs
1//! `ToolErrorKind` — tool-dispatch failure category derived from
2//! [`crate::Error`] for observability and retry classification.
3//!
4//! Tool authors return `Result<Value, Error>` from `Tool::execute`;
5//! the runtime classifies the error variant into one of these
6//! seven categories so observability sinks (`AgentEvent::ToolError`),
7//! retry middleware (`RetryToolLayer`), and recovery sinks all
8//! reach the same cross-tool taxonomy.
9//!
10//! Mirrors [`crate::ProviderErrorKind`] in shape (typed enum
11//! categorising failures) but operates at a higher level — provider
12//! kinds describe transport mechanisms, tool kinds describe the
13//! semantic outcome the operator (or the model) actually cares about.
14
15use crate::error::Error;
16
/// Cross-tool failure category.
///
/// Obtain one from an [`Error`] via [`Self::classify`]. Used for retry
/// middleware (`RetryToolLayer` retries [`Self::Transient`] /
/// [`Self::RateLimit`]), observability sinks (operators surface the
/// category in dashboards), and downstream recovery routing
/// (different categories trigger different operator responses —
/// page on `Auth`, alert on `Quota`, ignore `Validation` noise).
///
/// Serialised in `snake_case` (see the `serde` attribute below), which
/// matches the strings returned by [`Self::wire_id`]. The enum is
/// `#[non_exhaustive]`, so matchers outside this crate need a
/// wildcard arm.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ToolErrorKind {
    /// Network blip, 5xx, generic transport failure — safe to retry.
    ///
    /// [`Self::classify`] yields this for provider network / TLS /
    /// DNS failures and for HTTP 5xx, 408 and 425.
    Transient,
    /// Vendor signalled rate limiting (429 with `Retry-After` hint).
    /// Retryable after the configured cooldown.
    ///
    /// [`Self::classify`] yields this only for an HTTP 429 whose
    /// `retry_after` hint is present.
    RateLimit,
    /// Vendor signalled exhausted quota / billing cap. Retry will
    /// not succeed until the quota resets or operator intervenes —
    /// surface to ops, do not retry automatically.
    ///
    /// [`Self::classify`] yields this for an HTTP 429 without a
    /// `retry_after` hint and for [`Error::UsageLimitExceeded`].
    Quota,
    /// Credential rejected (401 / 403 / [`Error::Auth`]). Retry will
    /// not succeed until credentials are rotated.
    Auth,
    /// Permanent vendor failure — any provider HTTP status that is
    /// not classified as auth, rate limit, quota or transient
    /// (404, 405, 410, 422 …). The same call will fail again.
    Permanent,
    /// Caller-side input rejected ([`Error::InvalidRequest`],
    /// [`Error::Serde`]) — the operator's payload does not match
    /// the tool contract. Retry is meaningless without changing the
    /// payload.
    Validation,
    /// Tool-internal bug or misconfiguration ([`Error::Config`], or
    /// any unclassified shape). Surface to ops; retry is not
    /// meaningful.
    ///
    /// This is also the catch-all of [`Self::classify`]: every error
    /// shape without an explicit arm lands here.
    Internal,
}
55
56impl ToolErrorKind {
57 /// Derive the category from an [`Error`].
58 ///
59 /// The mapping is intentionally exhaustive over the variants
60 /// [`Error`] surfaces today — the `_` catch-all routes to
61 /// [`Self::Internal`] so future variants stay observable until
62 /// classified explicitly. Operational variants
63 /// ([`Error::Cancelled`], [`Error::DeadlineExceeded`],
64 /// [`Error::Interrupted`], [`Error::ModelRetry`]) flow through
65 /// `Internal` because they are agent-runtime control signals,
66 /// not tool failures — call sites that observe them should not
67 /// reach this classifier in the first place.
68 #[must_use]
69 pub fn classify(error: &Error) -> Self {
70 use crate::error::ProviderErrorKind;
71 match error {
72 // Already-classified terminal failures (raised by
73 // `ToolErrorPolicyLayer` or propagated from a sub-agent)
74 // carry their classification on the variant itself.
75 // Surface the leaf kind so observability (`AgentEvent::
76 // ToolError::kind`, OTel `entelix.tool.error_kind`,
77 // `DefaultRetryClassifier` introspection) sees the
78 // operator-actionable category regardless of how many
79 // layers re-encountered the same wrapped error. The
80 // `ToolErrorPolicyLayer` itself passes the already-
81 // wrapped variant through unchanged via its double-wrap
82 // guard, so the parent's policy does not re-wrap.
83 Error::ToolErrorTerminal { kind, .. } => *kind,
84 Error::Provider {
85 kind: ProviderErrorKind::Network | ProviderErrorKind::Tls | ProviderErrorKind::Dns,
86 ..
87 } => Self::Transient,
88 Error::Provider {
89 kind: ProviderErrorKind::Http(429),
90 retry_after,
91 ..
92 } => {
93 // Vendor distinguishes 429-with-Retry-After (transient
94 // back-pressure) from 429-without (often quota
95 // exhaustion). The hint presence is the cue.
96 if retry_after.is_some() {
97 Self::RateLimit
98 } else {
99 Self::Quota
100 }
101 }
102 Error::Provider {
103 kind: ProviderErrorKind::Http(status),
104 ..
105 } => {
106 if *status == 401 || *status == 403 {
107 Self::Auth
108 } else if (500..600).contains(status) || *status == 408 || *status == 425 {
109 Self::Transient
110 } else {
111 Self::Permanent
112 }
113 }
114 Error::Auth(_) => Self::Auth,
115 Error::UsageLimitExceeded(_) => Self::Quota,
116 Error::InvalidRequest(_) | Error::Serde(_) => Self::Validation,
117 // Operational variants (Cancelled, DeadlineExceeded,
118 // Interrupted, ModelRetry) and any future shape route
119 // here together with Config — none of them are tool
120 // failures the operator can act on per-category.
121 _ => Self::Internal,
122 }
123 }
124
125 /// Whether the runtime should attempt the tool call again.
126 ///
127 /// `Transient` and `RateLimit` are retryable; everything else
128 /// is a surface-and-stop signal. `RetryToolLayer` consults this
129 /// via the underlying `RetryClassifier` (which can be
130 /// overridden per deployment) — operators that want different
131 /// retry policy install a custom classifier rather than mutating
132 /// this method.
133 #[must_use]
134 pub const fn is_retryable(self) -> bool {
135 matches!(self, Self::Transient | Self::RateLimit)
136 }
137
138 /// Stable snake-case identifier surfaced through OTel
139 /// (`entelix.tool.error_kind`), structured logs, and audit
140 /// `GraphEvent` serialisation. Patch-version stable — renaming
141 /// a value is a breaking change for downstream consumers
142 /// keying off the string.
143 #[must_use]
144 pub const fn wire_id(self) -> &'static str {
145 match self {
146 Self::Transient => "transient",
147 Self::RateLimit => "rate_limit",
148 Self::Quota => "quota",
149 Self::Auth => "auth",
150 Self::Permanent => "permanent",
151 Self::Validation => "validation",
152 Self::Internal => "internal",
153 }
154 }
155
156 /// Stable bit position used by [`ToolErrorKindSet`] for the
157 /// bitset representation. Adding a new variant requires updating
158 /// this match — the same-crate exhaustiveness check forces it
159 /// (`#[non_exhaustive]` is for external matchers only). The
160 /// `bit_indices_are_unique_and_fit_in_set_width` regression test
161 /// then asserts the new bit position is unique and fits in
162 /// [`ToolErrorKindSet`]'s backing integer; widening the integer
163 /// in lockstep with the taxonomy is the only correct response if
164 /// the test ever fails.
165 #[must_use]
166 pub(crate) const fn bit_index(self) -> u32 {
167 match self {
168 Self::Transient => 0,
169 Self::RateLimit => 1,
170 Self::Quota => 2,
171 Self::Auth => 3,
172 Self::Permanent => 4,
173 Self::Validation => 5,
174 Self::Internal => 6,
175 }
176 }
177}
178
179impl std::fmt::Display for ToolErrorKind {
180 /// Lowercase snake_case form matching [`Self::wire_id`] — the
181 /// stable operator-channel rendering. Operator dashboards, log
182 /// lines, and the `Error::ToolErrorTerminal` Display all read
183 /// this format so the same kind reads identically across every
184 /// surface.
185 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
186 f.write_str(self.wire_id())
187 }
188}
189
/// Compact set of [`ToolErrorKind`] values, stored as a bitset in a
/// single `u16`. Each variant owns the bit at the position returned
/// by `ToolErrorKind::bit_index`.
///
/// The type lives next to [`ToolErrorKind`] rather than in a
/// dedicated `set` module so the bit layout and the enum stay in
/// one place. The `bit_indices_are_unique_and_fit_in_set_width`
/// regression test in this file checks that every listed variant
/// has a unique bit position that fits the backing integer — if a
/// new variant's `bit_index` exceeds the set width the test fails,
/// and the backing integer must be widened in lockstep with the
/// taxonomy.
///
/// The derived `Default` is the empty set (all bits zero).
///
/// Construction is `const`-friendly — operator defaults and unit
/// tests build sets at compile time:
///
/// ```
/// use entelix_core::tools::{ToolErrorKind, ToolErrorKindSet};
/// const SAFE: ToolErrorKindSet = ToolErrorKindSet::empty()
///     .with(ToolErrorKind::Auth)
///     .with(ToolErrorKind::Quota)
///     .with(ToolErrorKind::Permanent);
/// assert!(SAFE.contains(ToolErrorKind::Auth));
/// assert!(!SAFE.contains(ToolErrorKind::Transient));
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub struct ToolErrorKindSet(u16);
217
218impl ToolErrorKindSet {
219 /// Width of the underlying integer in bits — the maximum number
220 /// of [`ToolErrorKind`] variants the set can hold. Used by the
221 /// regression test that pins variant count against bitset
222 /// capacity.
223 #[cfg(test)]
224 const CAPACITY_BITS: u32 = u16::BITS;
225
226 /// Empty set — no kinds.
227 #[must_use]
228 pub const fn empty() -> Self {
229 Self(0)
230 }
231
232 /// Single-kind set — useful at literal call sites.
233 #[must_use]
234 pub const fn singleton(kind: ToolErrorKind) -> Self {
235 Self(1u16 << kind.bit_index())
236 }
237
238 /// Insert `kind` into the set, returning the updated set.
239 #[must_use]
240 pub const fn with(self, kind: ToolErrorKind) -> Self {
241 Self(self.0 | (1u16 << kind.bit_index()))
242 }
243
244 /// Remove `kind` from the set, returning the updated set.
245 #[must_use]
246 pub const fn without(self, kind: ToolErrorKind) -> Self {
247 Self(self.0 & !(1u16 << kind.bit_index()))
248 }
249
250 /// Union with another set, returning the updated set.
251 #[must_use]
252 pub const fn union(self, other: Self) -> Self {
253 Self(self.0 | other.0)
254 }
255
256 /// Whether `kind` is in the set.
257 #[must_use]
258 pub const fn contains(self, kind: ToolErrorKind) -> bool {
259 (self.0 >> kind.bit_index()) & 1 == 1
260 }
261
262 /// Whether the set has no kinds.
263 #[must_use]
264 pub const fn is_empty(self) -> bool {
265 self.0 == 0
266 }
267}
268
// Unit tests for `ToolErrorKind::classify`, `is_retryable`,
// `bit_index` and the `ToolErrorKindSet` operations.
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use std::time::Duration;

    // Provider network failure → Transient, and Transient is retryable.
    #[test]
    fn provider_network_classifies_as_transient() {
        let err = Error::provider_network("connect refused");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
        assert!(ToolErrorKind::classify(&err).is_retryable());
    }

    // Provider DNS failure → Transient.
    #[test]
    fn provider_dns_classifies_as_transient() {
        let err = Error::provider_dns("no such host");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
    }

    // HTTP 5xx statuses → Transient.
    #[test]
    fn provider_5xx_classifies_as_transient() {
        let err = Error::provider_http(503, "down");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
        let err = Error::provider_http(502, "bad gateway");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
    }

    #[test]
    fn http_408_and_425_classify_as_transient() {
        // 408 Request Timeout, 425 Too Early — both retryable per
        // spec semantics.
        let err = Error::provider_http(408, "timeout");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
        let err = Error::provider_http(425, "too early");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
    }

    // 429 carrying a retry hint → RateLimit, which is retryable.
    #[test]
    fn http_429_with_retry_after_classifies_as_rate_limit() {
        let err = Error::provider_http(429, "slow down").with_retry_after(Duration::from_secs(5));
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::RateLimit);
        assert!(ToolErrorKind::classify(&err).is_retryable());
    }

    #[test]
    fn http_429_without_retry_after_classifies_as_quota() {
        // Vendor signalling quota exhaustion typically omits
        // `Retry-After` because the cooldown is a billing cycle,
        // not a request window.
        let err = Error::provider_http(429, "monthly cap reached");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Quota);
        assert!(!ToolErrorKind::classify(&err).is_retryable());
    }

    // 401 and 403 → Auth, which is not retryable.
    #[test]
    fn http_401_403_classify_as_auth() {
        let err = Error::provider_http(401, "unauthorized");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Auth);
        let err = Error::provider_http(403, "forbidden");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Auth);
        assert!(!ToolErrorKind::classify(&err).is_retryable());
    }

    // 4xx statuses without a dedicated arm → Permanent, not retryable.
    #[test]
    fn http_4xx_other_classifies_as_permanent() {
        let err = Error::provider_http(404, "not found");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Permanent);
        let err = Error::provider_http(422, "unprocessable");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Permanent);
        assert!(!ToolErrorKind::classify(&err).is_retryable());
    }

    // Caller-side input problems → Validation. The serde case goes
    // through the `From<serde_json::Error>` conversion into `Error`.
    #[test]
    fn invalid_request_and_serde_classify_as_validation() {
        let err = Error::invalid_request("bad input");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Validation);
        let serde_err: serde_json::Error = serde_json::from_str::<i32>("not-a-number").unwrap_err();
        let err: Error = serde_err.into();
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Validation);
    }

    // Config errors have no explicit arm and land in the Internal
    // catch-all.
    #[test]
    fn config_classifies_as_internal() {
        let err = Error::config("misconfigured");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Internal);
    }

    #[test]
    fn tool_error_terminal_unwraps_to_inner_kind() {
        // `ToolErrorTerminal` is a routing-decision wrapper, not a
        // distinct category. `classify` returns the kind stored on
        // the wrapper, so a parent layer (sub-agent propagation
        // case) sees the original kind and can evaluate it against
        // its own policy.
        let inner = Error::provider_http(401, "unauthorized");
        let wrapped = Error::tool_error_terminal(ToolErrorKind::Auth, "my_tool", inner);
        assert_eq!(ToolErrorKind::classify(&wrapped), ToolErrorKind::Auth);
        // Doubly-wrapped (parent escalates a sub-agent's already-
        // terminal failure): the outer wrapper was built with the
        // same kind, so classification still reports `Auth`.
        let twice = Error::tool_error_terminal(ToolErrorKind::Auth, "parent_tool", wrapped);
        assert_eq!(ToolErrorKind::classify(&twice), ToolErrorKind::Auth);
    }

    #[test]
    fn bit_indices_are_unique_and_fit_in_set_width() {
        // Drift guard for the bitset, in two parts:
        //
        // - `dispatch_bit_index` is a same-crate exhaustive match
        //   with no wildcard arm, so adding a variant to
        //   `ToolErrorKind` without extending it is a compile
        //   error. That error is the prompt to revisit this test.
        // - `every_variant` lists the variants the runtime loop
        //   checks for uniqueness and width.
        //
        // NOTE: the compiler does NOT force `every_variant` to stay
        // complete — only the match is checked. When the match
        // stops compiling, add the new variant to BOTH the match
        // and the array, otherwise the new bit position goes
        // unchecked.
        fn dispatch_bit_index(k: ToolErrorKind) -> u32 {
            match k {
                ToolErrorKind::Transient
                | ToolErrorKind::RateLimit
                | ToolErrorKind::Quota
                | ToolErrorKind::Auth
                | ToolErrorKind::Permanent
                | ToolErrorKind::Validation
                | ToolErrorKind::Internal => k.bit_index(),
            }
        }
        let every_variant = [
            ToolErrorKind::Transient,
            ToolErrorKind::RateLimit,
            ToolErrorKind::Quota,
            ToolErrorKind::Auth,
            ToolErrorKind::Permanent,
            ToolErrorKind::Validation,
            ToolErrorKind::Internal,
        ];
        // Bit positions seen so far; `insert` returns false on a repeat.
        let mut seen = std::collections::HashSet::new();
        for k in every_variant {
            let bi = dispatch_bit_index(k);
            assert!(
                bi < ToolErrorKindSet::CAPACITY_BITS,
                "{k:?}.bit_index() = {bi} exceeds ToolErrorKindSet capacity \
                 ({cap} bits) — widen the backing integer in lockstep",
                cap = ToolErrorKindSet::CAPACITY_BITS,
            );
            assert!(seen.insert(bi), "duplicate bit_index {bi} for {k:?}");
        }
    }

    // The builder chain must be usable in a `const` item, and
    // membership must reflect exactly the kinds inserted.
    #[test]
    fn tool_error_kind_set_const_construction() {
        const SET: ToolErrorKindSet = ToolErrorKindSet::empty()
            .with(ToolErrorKind::Auth)
            .with(ToolErrorKind::Quota)
            .with(ToolErrorKind::Permanent);
        assert!(SET.contains(ToolErrorKind::Auth));
        assert!(SET.contains(ToolErrorKind::Quota));
        assert!(SET.contains(ToolErrorKind::Permanent));
        assert!(!SET.contains(ToolErrorKind::Transient));
        assert!(!SET.contains(ToolErrorKind::Internal));
        assert!(!SET.is_empty());
        assert!(ToolErrorKindSet::empty().is_empty());
    }

    // `union` merges members; `without` removes only the named kind.
    #[test]
    fn tool_error_kind_set_without_and_union() {
        let a = ToolErrorKindSet::singleton(ToolErrorKind::Auth);
        let b = ToolErrorKindSet::singleton(ToolErrorKind::Quota);
        let both = a.union(b);
        assert!(both.contains(ToolErrorKind::Auth));
        assert!(both.contains(ToolErrorKind::Quota));
        let removed = both.without(ToolErrorKind::Auth);
        assert!(!removed.contains(ToolErrorKind::Auth));
        assert!(removed.contains(ToolErrorKind::Quota));
    }

    // A run-budget breach is reported as Quota.
    #[test]
    fn usage_limit_exceeded_classifies_as_quota() {
        use crate::run_budget::UsageLimitBreach;
        let err = Error::UsageLimitExceeded(UsageLimitBreach::Requests {
            limit: 10,
            observed: 11,
        });
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Quota);
    }
}