entelix-core 0.6.0

entelix DAG root — IR, codecs, transports, Tool trait + ToolRegistry, auth, ExecutionContext, ModelInvocation/ToolInvocation Service spine, StreamAggregator
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
//! `ToolErrorKind` — tool-dispatch failure category derived from
//! [`crate::Error`] for observability and retry classification.
//!
//! Tool authors return `Result<Value, Error>` from `Tool::execute`;
//! the runtime classifies the error variant into one of these
//! seven categories so that observability sinks
//! (`AgentEvent::ToolError`), retry middleware (`RetryToolLayer`),
//! and recovery sinks all share the same cross-tool taxonomy.
//!
//! Mirrors [`crate::ProviderErrorKind`] in shape (typed enum
//! categorising failures) but operates at a higher level — provider
//! kinds describe transport mechanisms, tool kinds describe the
//! semantic outcome the operator (or the model) actually cares about.

use crate::error::Error;

/// Cross-tool failure category.
///
/// Derive from [`Error`] via [`Self::classify`]. Used for retry
/// middleware (`RetryToolLayer` retries [`Self::Transient`] /
/// [`Self::RateLimit`]), observability sinks (operators surface the
/// category in dashboards), and downstream recovery routing
/// (different categories trigger different operator responses —
/// page on `Auth`, alert on `Quota`, ignore `Validation` noise).
///
/// Serde renders each variant under its snake_case name
/// (`rate_limit`, `quota`, …), the same spelling
/// [`Self::wire_id`] returns. The enum is `#[non_exhaustive]`, so
/// matches written outside this crate need a wildcard arm.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ToolErrorKind {
    /// Network blip, transient 5xx, generic transport failure —
    /// safe to retry. [`Self::classify`] routes provider network /
    /// TLS / DNS failures and HTTP 5xx, 408, and 425 here.
    Transient,
    /// Vendor signalled rate limiting (429 with `Retry-After` hint).
    /// Retryable after the configured cooldown.
    RateLimit,
    /// Vendor signalled exhausted quota / billing cap. Retry will
    /// not succeed until the quota resets or operator intervenes —
    /// surface to ops, do not retry automatically.
    /// [`Self::classify`] also routes a 429 that carries no retry
    /// hint and [`Error::UsageLimitExceeded`] here.
    Quota,
    /// Credential rejected (401 / 403 / [`Error::Auth`]). Retry will
    /// not succeed until credentials are rotated.
    Auth,
    /// Permanent vendor failure: a 4xx other than the auth, rate
    /// limit, and quota cases (404, 405, 410, 422 …). The same call
    /// will fail again. [`Self::classify`] uses this for every
    /// provider HTTP status no other category claims.
    Permanent,
    /// Caller-side input rejected ([`Error::InvalidRequest`],
    /// [`Error::Serde`]) — the operator's payload does not match
    /// the tool contract. Retry is meaningless without changing the
    /// payload.
    Validation,
    /// Tool-internal bug or misconfiguration ([`Error::Config`], or
    /// any unclassified shape). Surface to ops; retry is not
    /// meaningful. This is also the catch-all target of
    /// [`Self::classify`].
    Internal,
}

impl ToolErrorKind {
    /// Map an [`Error`] onto its cross-tool failure category.
    ///
    /// Resolution rules:
    ///
    /// - [`Error::ToolErrorTerminal`] already carries a category and
    ///   reports it verbatim.
    /// - Provider network / TLS / DNS failures are
    ///   [`Self::Transient`].
    /// - Provider HTTP 429 is [`Self::RateLimit`] when a retry hint
    ///   is attached and [`Self::Quota`] when it is not.
    /// - Provider HTTP 401 / 403 are [`Self::Auth`]; 408, 425, and
    ///   5xx are [`Self::Transient`]; every other status is
    ///   [`Self::Permanent`].
    /// - [`Error::Auth`] is [`Self::Auth`],
    ///   [`Error::UsageLimitExceeded`] is [`Self::Quota`], and
    ///   [`Error::InvalidRequest`] / [`Error::Serde`] are
    ///   [`Self::Validation`].
    /// - Everything else falls through to [`Self::Internal`], which
    ///   keeps future [`Error`] variants observable until someone
    ///   classifies them explicitly.
    ///
    /// The operational variants ([`Error::Cancelled`],
    /// [`Error::DeadlineExceeded`], [`Error::Interrupted`],
    /// [`Error::ModelRetry`]) take the fall-through as well: they
    /// are agent-runtime control signals rather than tool failures,
    /// and call sites that observe them are not expected to consult
    /// this classifier at all.
    #[must_use]
    pub fn classify(error: &Error) -> Self {
        use crate::error::ProviderErrorKind;
        match error {
            // A terminal wrapper (produced by `ToolErrorPolicyLayer`
            // or bubbling up from a sub-agent) was classified when it
            // was created. Report the kind stored on the variant so
            // `AgentEvent::ToolError::kind`, the OTel attribute
            // `entelix.tool.error_kind`, and `DefaultRetryClassifier`
            // all see the operator-actionable category however many
            // layers have handled the error. `ToolErrorPolicyLayer`
            // forwards an already-wrapped error untouched (its
            // double-wrap guard), so a parent policy never re-wraps.
            Error::ToolErrorTerminal { kind, .. } => *kind,

            // Variants with a fixed one-to-one category.
            Error::Auth(_) => Self::Auth,
            Error::UsageLimitExceeded(_) => Self::Quota,
            Error::InvalidRequest(_) | Error::Serde(_) => Self::Validation,

            // Transport-level provider failures.
            Error::Provider {
                kind: ProviderErrorKind::Network | ProviderErrorKind::Tls | ProviderErrorKind::Dns,
                ..
            } => Self::Transient,

            // Provider failures that carry an HTTP status.
            Error::Provider {
                kind: ProviderErrorKind::Http(status),
                retry_after,
                ..
            } => match *status {
                // Vendors separate 429-with-Retry-After (transient
                // back-pressure) from 429-without (frequently quota
                // exhaustion); the presence of the hint is the cue.
                429 if retry_after.is_some() => Self::RateLimit,
                429 => Self::Quota,
                401 | 403 => Self::Auth,
                408 | 425 | 500..=599 => Self::Transient,
                _ => Self::Permanent,
            },

            // Config, the operational variants (Cancelled,
            // DeadlineExceeded, Interrupted, ModelRetry), and any
            // shape added later share this arm — none is a tool
            // failure an operator can act on per category.
            _ => Self::Internal,
        }
    }

    /// Whether the runtime should attempt the tool call again.
    ///
    /// Only [`Self::Transient`] and [`Self::RateLimit`] answer
    /// `true`; every other category is a surface-and-stop signal.
    /// `RetryToolLayer` reaches this through its `RetryClassifier`,
    /// which a deployment can replace — operators wanting a
    /// different retry policy install a custom classifier instead of
    /// editing this method.
    #[must_use]
    pub const fn is_retryable(self) -> bool {
        match self {
            Self::Transient | Self::RateLimit => true,
            Self::Quota | Self::Auth | Self::Permanent | Self::Validation | Self::Internal => false,
        }
    }

    /// Stable snake-case identifier emitted through OTel
    /// (`entelix.tool.error_kind`), structured logs, and audit
    /// `GraphEvent` serialisation. The strings are patch-version
    /// stable: renaming one is a breaking change for any downstream
    /// consumer that keys off it.
    #[must_use]
    pub const fn wire_id(self) -> &'static str {
        match self {
            // Retryable categories.
            Self::Transient => "transient",
            Self::RateLimit => "rate_limit",
            // Surface-and-stop categories.
            Self::Quota => "quota",
            Self::Auth => "auth",
            Self::Permanent => "permanent",
            Self::Validation => "validation",
            Self::Internal => "internal",
        }
    }

    /// Bit position this kind occupies inside [`ToolErrorKindSet`].
    ///
    /// The positions are a stable part of the bitset layout. A new
    /// variant cannot be added without extending this match: inside
    /// the defining crate the match must be exhaustive
    /// (`#[non_exhaustive]` only relaxes matches written elsewhere).
    /// The `bit_indices_are_unique_and_fit_in_set_width` regression
    /// test then checks that the chosen position is unique and fits
    /// the backing integer of [`ToolErrorKindSet`]; if that test
    /// fails, the correct response is to widen the integer together
    /// with the taxonomy.
    #[must_use]
    pub(crate) const fn bit_index(self) -> u32 {
        match self {
            Self::Transient => 0,
            Self::RateLimit => 1,
            Self::Quota => 2,
            Self::Auth => 3,
            Self::Permanent => 4,
            Self::Validation => 5,
            Self::Internal => 6,
        }
    }
}

impl std::fmt::Display for ToolErrorKind {
    /// Renders the kind exactly as [`Self::wire_id`] spells it
    /// (lowercase snake_case). Dashboards, log lines, and the
    /// `Error::ToolErrorTerminal` Display all go through this
    /// rendering, so a given kind looks the same on every operator
    /// surface.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label: &'static str = self.wire_id();
        f.write_str(label)
    }
}

/// Compact set of [`ToolErrorKind`] values. Backed by a fixed-width
/// integer bitset — every variant occupies one bit at the position
/// returned by [`ToolErrorKind::bit_index`].
///
/// The type is `Copy`, and its operations take `self` by value and
/// return a new set rather than mutating in place. The derived
/// `Default` is the all-zero bitset, i.e. the same value as
/// [`ToolErrorKindSet::empty`].
///
/// The type lives next to [`ToolErrorKind`] rather than in a
/// dedicated `set` module so the bit layout and the enum stay in
/// one place. The `bit_indices_are_unique_and_fit_in_set_width`
/// regression test below uses a same-crate exhaustive match to
/// force every new variant through a uniqueness + width check —
/// landing a variant whose `bit_index` exceeds the set width fails
/// the test, and the developer widens the backing integer in
/// lockstep with the taxonomy.
///
/// Construction is `const`-friendly — operator defaults and unit
/// tests build sets at compile time:
///
/// ```
/// use entelix_core::tools::{ToolErrorKind, ToolErrorKindSet};
/// const SAFE: ToolErrorKindSet = ToolErrorKindSet::empty()
///     .with(ToolErrorKind::Auth)
///     .with(ToolErrorKind::Quota)
///     .with(ToolErrorKind::Permanent);
/// assert!(SAFE.contains(ToolErrorKind::Auth));
/// assert!(!SAFE.contains(ToolErrorKind::Transient));
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub struct ToolErrorKindSet(u16);

impl ToolErrorKindSet {
    /// Width of the underlying integer in bits — the maximum number
    /// of [`ToolErrorKind`] variants the set can hold. Used by the
    /// regression test that pins variant count against bitset
    /// capacity.
    #[cfg(test)]
    const CAPACITY_BITS: u32 = u16::BITS;

    /// Empty set — no kinds.
    #[must_use]
    pub const fn empty() -> Self {
        Self(0)
    }

    /// Single-kind set — useful at literal call sites.
    #[must_use]
    pub const fn singleton(kind: ToolErrorKind) -> Self {
        Self(1u16 << kind.bit_index())
    }

    /// Insert `kind` into the set, returning the updated set.
    #[must_use]
    pub const fn with(self, kind: ToolErrorKind) -> Self {
        Self(self.0 | (1u16 << kind.bit_index()))
    }

    /// Remove `kind` from the set, returning the updated set.
    #[must_use]
    pub const fn without(self, kind: ToolErrorKind) -> Self {
        Self(self.0 & !(1u16 << kind.bit_index()))
    }

    /// Union with another set, returning the updated set.
    #[must_use]
    pub const fn union(self, other: Self) -> Self {
        Self(self.0 | other.0)
    }

    /// Whether `kind` is in the set.
    #[must_use]
    pub const fn contains(self, kind: ToolErrorKind) -> bool {
        (self.0 >> kind.bit_index()) & 1 == 1
    }

    /// Whether the set has no kinds.
    #[must_use]
    pub const fn is_empty(self) -> bool {
        self.0 == 0
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Every [`ToolErrorKind`] variant, for tests that sweep the
    /// whole taxonomy. The compiler does not check this list for
    /// completeness; the exhaustive match inside
    /// `bit_indices_are_unique_and_fit_in_set_width` is the tripwire
    /// that reminds whoever adds a variant to extend it.
    const EVERY_KIND: [ToolErrorKind; 7] = [
        ToolErrorKind::Transient,
        ToolErrorKind::RateLimit,
        ToolErrorKind::Quota,
        ToolErrorKind::Auth,
        ToolErrorKind::Permanent,
        ToolErrorKind::Validation,
        ToolErrorKind::Internal,
    ];

    #[test]
    fn provider_network_classifies_as_transient() {
        let err = Error::provider_network("connect refused");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
        assert!(ToolErrorKind::classify(&err).is_retryable());
    }

    #[test]
    fn provider_dns_classifies_as_transient() {
        let err = Error::provider_dns("no such host");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
    }

    #[test]
    fn provider_5xx_classifies_as_transient() {
        let err = Error::provider_http(503, "down");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
        let err = Error::provider_http(502, "bad gateway");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
    }

    #[test]
    fn http_408_and_425_classify_as_transient() {
        // 408 Request Timeout, 425 Too Early — both retryable per
        // spec semantics.
        let err = Error::provider_http(408, "timeout");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
        let err = Error::provider_http(425, "too early");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Transient);
    }

    #[test]
    fn http_429_with_retry_after_classifies_as_rate_limit() {
        let err = Error::provider_http(429, "slow down").with_retry_after(Duration::from_secs(5));
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::RateLimit);
        assert!(ToolErrorKind::classify(&err).is_retryable());
    }

    #[test]
    fn http_429_without_retry_after_classifies_as_quota() {
        // Vendor signalling quota exhaustion typically omits
        // `Retry-After` because the cooldown is a billing cycle,
        // not a request window.
        let err = Error::provider_http(429, "monthly cap reached");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Quota);
        assert!(!ToolErrorKind::classify(&err).is_retryable());
    }

    #[test]
    fn http_401_403_classify_as_auth() {
        let err = Error::provider_http(401, "unauthorized");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Auth);
        let err = Error::provider_http(403, "forbidden");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Auth);
        assert!(!ToolErrorKind::classify(&err).is_retryable());
    }

    #[test]
    fn http_4xx_other_classifies_as_permanent() {
        let err = Error::provider_http(404, "not found");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Permanent);
        let err = Error::provider_http(422, "unprocessable");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Permanent);
        assert!(!ToolErrorKind::classify(&err).is_retryable());
    }

    #[test]
    fn invalid_request_and_serde_classify_as_validation() {
        let err = Error::invalid_request("bad input");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Validation);
        let serde_err: serde_json::Error = serde_json::from_str::<i32>("not-a-number").unwrap_err();
        let err: Error = serde_err.into();
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Validation);
    }

    #[test]
    fn config_classifies_as_internal() {
        let err = Error::config("misconfigured");
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Internal);
    }

    #[test]
    fn usage_limit_exceeded_classifies_as_quota() {
        use crate::run_budget::UsageLimitBreach;
        let err = Error::UsageLimitExceeded(UsageLimitBreach::Requests {
            limit: 10,
            observed: 11,
        });
        assert_eq!(ToolErrorKind::classify(&err), ToolErrorKind::Quota);
    }

    #[test]
    fn tool_error_terminal_unwraps_to_inner_kind() {
        // `ToolErrorTerminal` is a routing-decision wrapper, not a
        // distinct category. `classify` reports the kind recorded on
        // the wrapper, so a parent layer (sub-agent propagation
        // case) sees the operator-actionable category and can
        // re-evaluate it against its own policy.
        let inner = Error::provider_http(401, "unauthorized");
        let wrapped = Error::tool_error_terminal(ToolErrorKind::Auth, "my_tool", inner);
        assert_eq!(ToolErrorKind::classify(&wrapped), ToolErrorKind::Auth);
        // Wrapping an already-terminal failure again (parent
        // escalates a sub-agent's error) must still report the same
        // kind.
        let twice = Error::tool_error_terminal(ToolErrorKind::Auth, "parent_tool", wrapped);
        assert_eq!(ToolErrorKind::classify(&twice), ToolErrorKind::Auth);
    }

    #[test]
    fn only_transient_and_rate_limit_are_retryable() {
        for kind in EVERY_KIND {
            let expected = matches!(kind, ToolErrorKind::Transient | ToolErrorKind::RateLimit);
            assert_eq!(kind.is_retryable(), expected, "unexpected retry verdict for {kind:?}");
        }
    }

    #[test]
    fn wire_id_display_and_serde_agree() {
        // `wire_id`, `Display`, and the serde `snake_case` rename
        // are maintained separately; downstream consumers key off
        // the string, so the three must never drift apart.
        for kind in EVERY_KIND {
            let wire = kind.wire_id();
            assert_eq!(kind.to_string(), wire);
            let json = serde_json::to_string(&kind).unwrap();
            assert_eq!(json, format!("\"{wire}\""));
            let back: ToolErrorKind = serde_json::from_str(&json).unwrap();
            assert_eq!(back, kind);
        }
    }

    #[test]
    fn bit_indices_are_unique_and_fit_in_set_width() {
        // Same-crate exhaustive match — adding a new variant to
        // `ToolErrorKind` without extending the arm below is a
        // compile error. That compile error is the drift guard: it
        // sends whoever adds the variant to this test, where
        // `EVERY_KIND` has to be extended in the same change.
        //
        // The list itself is NOT compiler-checked. The uniqueness
        // and width assertions below cover exactly the kinds listed
        // in `EVERY_KIND`, so a variant added to the match but left
        // out of the list would go unverified.
        fn dispatch_bit_index(k: ToolErrorKind) -> u32 {
            match k {
                ToolErrorKind::Transient
                | ToolErrorKind::RateLimit
                | ToolErrorKind::Quota
                | ToolErrorKind::Auth
                | ToolErrorKind::Permanent
                | ToolErrorKind::Validation
                | ToolErrorKind::Internal => k.bit_index(),
            }
        }
        let mut seen = std::collections::HashSet::new();
        for k in EVERY_KIND {
            let bi = dispatch_bit_index(k);
            assert!(
                bi < ToolErrorKindSet::CAPACITY_BITS,
                "{k:?}.bit_index() = {bi} exceeds ToolErrorKindSet capacity \
                 ({cap} bits) — widen the backing integer in lockstep",
                cap = ToolErrorKindSet::CAPACITY_BITS,
            );
            assert!(seen.insert(bi), "duplicate bit_index {bi} for {k:?}");
        }
    }

    #[test]
    fn tool_error_kind_set_const_construction() {
        const SET: ToolErrorKindSet = ToolErrorKindSet::empty()
            .with(ToolErrorKind::Auth)
            .with(ToolErrorKind::Quota)
            .with(ToolErrorKind::Permanent);
        assert!(SET.contains(ToolErrorKind::Auth));
        assert!(SET.contains(ToolErrorKind::Quota));
        assert!(SET.contains(ToolErrorKind::Permanent));
        assert!(!SET.contains(ToolErrorKind::Transient));
        assert!(!SET.contains(ToolErrorKind::Internal));
        assert!(!SET.is_empty());
        assert!(ToolErrorKindSet::empty().is_empty());
    }

    #[test]
    fn tool_error_kind_set_without_and_union() {
        let a = ToolErrorKindSet::singleton(ToolErrorKind::Auth);
        let b = ToolErrorKindSet::singleton(ToolErrorKind::Quota);
        let both = a.union(b);
        assert!(both.contains(ToolErrorKind::Auth));
        assert!(both.contains(ToolErrorKind::Quota));
        let removed = both.without(ToolErrorKind::Auth);
        assert!(!removed.contains(ToolErrorKind::Auth));
        assert!(removed.contains(ToolErrorKind::Quota));
    }

    #[test]
    fn tool_error_kind_set_default_and_singleton() {
        // The derived `Default` must be the empty set, and a
        // singleton must hold its own kind and nothing else.
        assert_eq!(ToolErrorKindSet::default(), ToolErrorKindSet::empty());
        for kind in EVERY_KIND {
            let only = ToolErrorKindSet::singleton(kind);
            for other in EVERY_KIND {
                assert_eq!(only.contains(other), other == kind);
            }
            assert!(only.without(kind).is_empty());
        }
    }
}