Skip to main content

entelix_core/tools/
error_policy.rs

1//! `ToolErrorPolicy` — operator-tunable gate that classifies
2//! tool-dispatch failures into "model-recoverable" vs "terminal" for
3//! the reasoning loop, plus the `tower::Layer` that applies it.
4//!
5//! ## Why a typed gate rather than per-recipe branching
6//!
7//! `ToolErrorKind::classify` already buckets every `Error` into one
8//! of seven categories. Without this policy the categories are
9//! observability metadata: dashboards split errors but the reasoning
10//! loop treats `Auth` (no LLM-actionable recovery), `Quota` (same),
11//! and `Validation` (the model authored the bad input) identically —
12//! feed the rendered error back to the model and let it retry until
13//! `recursion_limit` fires. The result is wasted tokens, latency,
14//! and a generic `recursion_limit_exceeded` event that hides what
15//! actually went wrong.
16//!
17//! `ToolErrorPolicy` closes that loop: operators declare which kinds
18//! have no LLM-actionable recovery; [`ToolErrorPolicyLayer`] wraps
19//! matching failures into [`crate::Error::ToolErrorTerminal`]; every
20//! reasoning recipe propagates the variant the same way it propagates
21//! [`crate::Error::Interrupted`] — return to the caller without
22//! re-prompting the model.
23//!
24//! ## Composition
25//!
26//! The layer wraps the tool-side `Service<ToolInvocation>` spine.
27//! Outer-policy reads intuitively (retry exhausts first, then the
28//! settled failure category reaches the gate):
29//!
30//! ```ignore
31//! use entelix_core::ToolRegistry;
32//! use entelix_core::tools::{RetryToolLayer, ToolErrorPolicy, ToolErrorPolicyLayer};
33//!
34//! let registry = ToolRegistry::new()
35//!     .register(my_tool)?
36//!     .layer(RetryToolLayer::new())                                         // innermost
37//!     .layer(ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe())); // outermost
38//! # Ok::<(), entelix_core::Error>(())
39//! ```
40//!
41//! Reversing the order (policy inside retry) is also safe — the
42//! shipped [`crate::transports::DefaultRetryClassifier`] unwraps
43//! [`Error::ToolErrorTerminal`] and re-classifies against the inner
44//! source, so retry decisions stay identical regardless of layer
45//! ordering. Operators wiring a custom `RetryClassifier` should
46//! mirror that unwrap if they want the same property.
47//!
48//! ## Control-signal pass-through
49//!
50//! `Error::Cancelled`, `Error::DeadlineExceeded`,
51//! `Error::Interrupted`, and `Error::ModelRetry` are reasoning-loop
52//! control signals, not tool failures. The layer short-circuits them
53//! before classification so a policy that includes
54//! `ToolErrorKind::Internal` (the `_` catch-all bucket) never
55//! accidentally re-routes a cancellation, HITL pause, or
56//! validator-retry hint through the terminal channel.
57//!
58//! ## Sub-agent propagation
59//!
60//! When a sub-agent (registered on the parent as a `Tool`)
61//! terminates, the parent's `ToolErrorPolicyLayer` sees the inner
62//! `Error::ToolErrorTerminal` returned from `Subagent::execute`. The
63//! double-wrap guard passes it through unchanged — *the sub-agent
64//! already decided to terminate, and the parent cannot un-terminate*.
65//! `ToolErrorKind::classify` unwraps to the leaf's classification, so
66//! the parent's recipe loop catches the termination on the same
67//! `Err(Error::ToolErrorTerminal { .. })` arm regardless of whether
68//! the parent's policy is identical to the sub-agent's. The
69//! propagated variant preserves the **leaf tool's name** — operators
70//! see "BigQuery auth rotated" rather than "sub_agent failed", which
71//! is the actionable diagnostic. Sub-agent correlation rides on the
72//! `AuditSink::record_sub_agent_invoked` event already emitted at
73//! dispatch time.
74
75use std::task::{Context, Poll};
76
77use futures::future::BoxFuture;
78use serde_json::Value;
79use tower::{Layer, Service};
80
81use crate::error::{Error, Result};
82use crate::service::ToolInvocation;
83use crate::tools::{ToolErrorKind, ToolErrorKindSet};
84
85/// Operator-tunable classification gate consumed by
86/// [`ToolErrorPolicyLayer`].
87///
88/// The struct is intentionally small: one [`ToolErrorKindSet`]
89/// describing which kinds escalate to terminal. Operators that want
90/// finer-grained behaviour (per-tool overrides, per-tenant
91/// hot-swap) compose a higher layer that produces tenant-scoped
92/// `ToolErrorPolicy` values; the leaf layer here stays simple by
93/// design.
94#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
95pub struct ToolErrorPolicy {
96    /// `ToolErrorKind`s that cause the layer to wrap the inner
97    /// failure into [`crate::Error::ToolErrorTerminal`], short-
98    /// circuiting the reasoning loop. The empty default is the
99    /// no-op — every failure flows through unchanged and reaches the
100    /// model as a tool-result message.
101    pub terminate_on: ToolErrorKindSet,
102}
103
104impl ToolErrorPolicy {
105    /// Empty policy — every failure flows back to the model. The
106    /// `Default` shape, suitable for dev / debugging deployments.
107    #[must_use]
108    pub const fn new() -> Self {
109        Self {
110            terminate_on: ToolErrorKindSet::empty(),
111        }
112    }
113
114    /// Recommended production set: `Auth | Quota | Permanent`. Three
115    /// kinds the model cannot influence on retry — surfacing them to
116    /// the LLM only burns tokens before recursion-limit fires.
117    ///
118    /// `Internal` is deliberately excluded. `ToolErrorKind::classify`
119    /// routes the `_` catch-all (unclassified `Error` variants) to
120    /// `Internal`, so including it in the default would silently
121    /// terminate runs on every future `Error` variant that ships
122    /// without an explicit classifier arm. Operators that *want* the
123    /// catch-all behaviour add it explicitly with
124    /// `.add_terminal_kind(ToolErrorKind::Internal)`.
125    ///
126    /// `Transient` / `RateLimit` are excluded because
127    /// [`crate::tools::RetryToolLayer`] handles them transport-side;
128    /// what reaches this policy in those buckets is past the retry
129    /// budget and surfacing to the model gives it a chance to pick a
130    /// different tool or restructure the call. `Validation` is
131    /// excluded because the model authored the bad input — retrying
132    /// with a corrected payload is exactly the failure mode the
133    /// LLM-recoverable channel was designed for.
134    #[must_use]
135    pub const fn operator_safe() -> Self {
136        Self {
137            terminate_on: ToolErrorKindSet::empty()
138                .with(ToolErrorKind::Auth)
139                .with(ToolErrorKind::Quota)
140                .with(ToolErrorKind::Permanent),
141        }
142    }
143
144    /// Add `kind` to the terminate-on set. Fluent builder shape for
145    /// recipe wiring sites.
146    #[must_use]
147    pub const fn add_terminal_kind(mut self, kind: ToolErrorKind) -> Self {
148        self.terminate_on = self.terminate_on.with(kind);
149        self
150    }
151
152    /// Whether `kind` causes the layer to escalate to terminal.
153    #[must_use]
154    pub const fn classifies_terminal(self, kind: ToolErrorKind) -> bool {
155        self.terminate_on.contains(kind)
156    }
157}
158
159/// `tower::Layer` applying [`ToolErrorPolicy`] to every
160/// `Service<ToolInvocation>` dispatch. Cloning is a single-byte
161/// copy of the inner [`ToolErrorPolicy`] (no heap allocation).
162#[derive(Clone, Copy, Debug)]
163pub struct ToolErrorPolicyLayer {
164    policy: ToolErrorPolicy,
165}
166
167impl ToolErrorPolicyLayer {
168    /// Patch-version-stable identifier surfaced through
169    /// [`crate::ToolRegistry::layer_names`]. Renaming this constant
170    /// is a breaking change for dashboards keyed off the value.
171    pub const NAME: &'static str = "tool_error_policy";
172
173    /// Build the layer with the supplied policy.
174    #[must_use]
175    pub const fn new(policy: ToolErrorPolicy) -> Self {
176        Self { policy }
177    }
178
179    /// Borrow the wrapped policy — useful for diagnostic dumps and
180    /// `layer_names` correlation.
181    #[must_use]
182    pub const fn policy(&self) -> &ToolErrorPolicy {
183        &self.policy
184    }
185}
186
187impl crate::NamedLayer for ToolErrorPolicyLayer {
188    fn layer_name(&self) -> &'static str {
189        Self::NAME
190    }
191}
192
193impl<S> Layer<S> for ToolErrorPolicyLayer
194where
195    S: Service<ToolInvocation, Response = Value, Error = Error> + Clone + Send + 'static,
196    S::Future: Send + 'static,
197{
198    type Service = ToolErrorPolicyService<S>;
199
200    fn layer(&self, inner: S) -> Self::Service {
201        ToolErrorPolicyService {
202            inner,
203            policy: self.policy,
204        }
205    }
206}
207
208/// `Service<ToolInvocation>` produced by [`ToolErrorPolicyLayer`].
209#[derive(Clone, Debug)]
210pub struct ToolErrorPolicyService<Inner> {
211    inner: Inner,
212    policy: ToolErrorPolicy,
213}
214
215impl<Inner> Service<ToolInvocation> for ToolErrorPolicyService<Inner>
216where
217    Inner: Service<ToolInvocation, Response = Value, Error = Error> + Clone + Send + 'static,
218    Inner::Future: Send + 'static,
219{
220    type Response = Value;
221    type Error = Error;
222    type Future = BoxFuture<'static, Result<Value>>;
223
224    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
225        self.inner.poll_ready(cx)
226    }
227
228    fn call(&mut self, invocation: ToolInvocation) -> Self::Future {
229        let mut inner = self.inner.clone();
230        let policy = self.policy;
231        let tool_name = invocation.metadata.name.clone();
232        // Audit handle only — tenant + thread scope ride on the
233        // adapter's own `ThreadKey`. Pulling the `Arc` out before the
234        // invocation moves into `inner.call` saves the full
235        // `ExecutionContext::clone` for the (cheap) `Option<Arc>` it
236        // actually consumes.
237        let audit = invocation.ctx.audit_sink();
238
239        Box::pin(async move {
240            let result = inner.call(invocation).await;
241            let Err(err) = result else {
242                return result;
243            };
244
245            // Reasoning-loop control signals are not tool failures —
246            // never reroute them through the terminal channel even
247            // if the operator's policy includes `Internal` (the
248            // `_` catch-all classifier would otherwise scoop them
249            // in). The set covers every typed `Error` variant that
250            // is *not* an outcome the operator policy is meant to
251            // classify:
252            //
253            // - `Cancelled` / `DeadlineExceeded` — `ExecutionContext`
254            //   primitives raised by callers.
255            // - `Interrupted` — HITL pause-and-resume.
256            // - `ModelRetry` — the unified validator / schema-retry
257            //   channel (invariant 20). A tool body raising this
258            //   asks the model to retry the *current* turn; classifying
259            //   it as `Internal` and terminating the loop would
260            //   silently break the validator-retry budget.
261            if matches!(
262                err,
263                Error::Cancelled
264                    | Error::DeadlineExceeded
265                    | Error::Interrupted { .. }
266                    | Error::ModelRetry { .. }
267            ) {
268                return Err(err);
269            }
270
271            // Classify with the unwrap arm built into
272            // `ToolErrorKind::classify` — `Error::ToolErrorTerminal`
273            // returns its inner `kind`, so an already-escalated
274            // failure (sub-agent propagation, or domain-side
275            // self-wrap at a `ToolFailure` boundary) classifies
276            // consistently with how this layer would have treated
277            // it had no prior wrap occurred.
278            let kind = ToolErrorKind::classify(&err);
279            if !policy.classifies_terminal(kind) {
280                return Err(err);
281            }
282
283            // Audit-sink emission — fire-and-forget per invariant
284            // 18. Failures inside the sink stay there. Emits on
285            // every terminal escalation that flows through the
286            // layer, regardless of where the original wrap
287            // originated — operators get one audit record per
288            // terminal run end without depending on the wrap
289            // call-site reaching the sink itself.
290            //
291            // For an already-wrapped failure, prefer the inner
292            // `tool_name` (the leaf identity) over this layer's
293            // `invocation.metadata.name` (the parent's view).
294            let recorded_tool: &str = match &err {
295                Error::ToolErrorTerminal {
296                    tool_name: inner, ..
297                } => inner.as_str(),
298                _ => tool_name.as_str(),
299            };
300            if let Some(handle) = &audit {
301                handle
302                    .as_sink()
303                    .record_tool_error_terminal(kind, recorded_tool);
304            }
305
306            // Pass-through guard: an already-terminal error
307            // (sub-agent propagation or domain self-wrap) keeps
308            // its original `tool_name` and `source` — re-wrapping
309            // would obscure the leaf identity without adding
310            // signal.
311            if matches!(err, Error::ToolErrorTerminal { .. }) {
312                return Err(err);
313            }
314
315            Err(Error::tool_error_terminal(kind, tool_name, err))
316        })
317    }
318}
319
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};

    use serde_json::{Value, json};
    use tower::{Layer, Service, ServiceExt};

    use crate::LlmRenderable;
    use crate::context::ExecutionContext;
    use crate::error::Error;
    use crate::service::ToolInvocation;
    use crate::tools::{ToolErrorKind, ToolMetadata};

    use super::*;

    /// Minimal `Service<ToolInvocation>` returning a preset error /
    /// success. Counts calls so tests assert the inner service was
    /// invoked exactly once per dispatch (no double-attempt
    /// regression). Clones share the counter through the `Arc`.
    #[derive(Clone)]
    struct StubTool {
        calls: Arc<AtomicUsize>,
        outcome: Arc<dyn Fn() -> std::result::Result<Value, Error> + Send + Sync>,
    }

    impl StubTool {
        /// Builds a stub whose every call yields `outcome()`.
        fn new(
            outcome: impl Fn() -> std::result::Result<Value, Error> + Send + Sync + 'static,
        ) -> Self {
            Self {
                calls: Arc::new(AtomicUsize::new(0)),
                outcome: Arc::new(outcome),
            }
        }
    }

    impl Service<ToolInvocation> for StubTool {
        type Response = Value;
        type Error = Error;
        type Future = futures::future::BoxFuture<'static, std::result::Result<Value, Error>>;

        fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<std::result::Result<(), Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&mut self, _req: ToolInvocation) -> Self::Future {
            self.calls.fetch_add(1, Ordering::SeqCst);
            let outcome = Arc::clone(&self.outcome);
            Box::pin(async move { outcome() })
        }
    }

    /// Invocation of a tool named `"stub"` with empty arguments and a
    /// fresh `ExecutionContext`.
    fn invocation() -> ToolInvocation {
        let metadata = Arc::new(ToolMetadata::function(
            "stub",
            "stub tool for tests",
            json!({"type": "object"}),
        ));
        ToolInvocation::new("tu-1".into(), metadata, json!({}), ExecutionContext::new())
    }

    /// Runs one dispatch through a layer built from `policy` over a
    /// stub that fails with (a reconstruction of) `outcome`. Returns
    /// the layer's result and how many times the stub was called.
    async fn dispatch(
        policy: ToolErrorPolicy,
        outcome: Error,
    ) -> (std::result::Result<Value, Error>, usize) {
        let stub = StubTool::new(move || Err(clone_error(&outcome)));
        let layer = ToolErrorPolicyLayer::new(policy);
        let mut svc = layer.layer(stub.clone());
        let result = svc.ready().await.unwrap().call(invocation()).await;
        (result, stub.calls.load(Ordering::SeqCst))
    }

    /// Rebuilds an equivalent `Error` for the variants the tests
    /// feed through `dispatch`. Anything else degrades to an
    /// `InvalidRequest` carrying the rendered message.
    fn clone_error(err: &Error) -> Error {
        match err {
            Error::Provider { kind, message, .. } => match kind {
                crate::error::ProviderErrorKind::Http(s) => {
                    Error::provider_http(*s, message.clone())
                }
                crate::error::ProviderErrorKind::Network => {
                    Error::provider_network(message.clone())
                }
                crate::error::ProviderErrorKind::Tls => Error::provider_tls(message.clone()),
                crate::error::ProviderErrorKind::Dns => Error::provider_dns(message.clone()),
            },
            Error::Cancelled => Error::Cancelled,
            Error::DeadlineExceeded => Error::DeadlineExceeded,
            Error::InvalidRequest(s) => Error::invalid_request(s.clone()),
            Error::Config(s) => Error::config(s.clone()),
            Error::ModelRetry { hint, attempt } => Error::model_retry(hint.clone(), *attempt),
            other => Error::invalid_request(format!("unrepeatable in test: {other}")),
        }
    }

    #[tokio::test]
    async fn auth_failure_escalates_under_operator_safe_default() {
        let (result, calls) = dispatch(
            ToolErrorPolicy::operator_safe(),
            Error::provider_http(401, "unauthorized"),
        )
        .await;
        let err = result.unwrap_err();
        assert!(
            matches!(&err, Error::ToolErrorTerminal { kind, .. } if *kind == ToolErrorKind::Auth),
            "expected ToolErrorTerminal{{Auth}}, got {err:?}"
        );
        assert_eq!(calls, 1, "inner service must be invoked exactly once");
    }

    #[tokio::test]
    async fn validation_failure_passes_through_under_operator_safe() {
        // Validation stays loop-bound — the model authored the bad
        // input, retry is the whole point.
        let (result, calls) = dispatch(
            ToolErrorPolicy::operator_safe(),
            Error::invalid_request("bad input"),
        )
        .await;
        let err = result.unwrap_err();
        assert!(
            matches!(err, Error::InvalidRequest(_)),
            "Validation must NOT wrap under operator_safe"
        );
        assert_eq!(calls, 1);
    }

    #[tokio::test]
    async fn empty_policy_is_full_passthrough() {
        // Default `terminate_on = empty` preserves the no-shim
        // semantics — every error reaches the model. The layer is
        // installable everywhere with zero behaviour change until
        // operators opt in.
        let (result, calls) = dispatch(
            ToolErrorPolicy::new(),
            Error::provider_http(401, "unauthorized"),
        )
        .await;
        assert!(matches!(result.unwrap_err(), Error::Provider { .. }));
        assert_eq!(calls, 1, "inner service must be invoked exactly once");
    }

    #[tokio::test]
    async fn control_signals_short_circuit_classification() {
        // A policy that includes `Internal` (the `_` catch-all)
        // would scoop `Cancelled` / `DeadlineExceeded` /
        // `Interrupted` / `ModelRetry` if classification were
        // unconditional. The layer's pre-classification guard
        // prevents that — these are reasoning-loop control signals,
        // not tool-failure outcomes the operator policy is meant
        // to classify.
        let policy = ToolErrorPolicy::new().add_terminal_kind(ToolErrorKind::Internal);
        let (result, _) = dispatch(policy, Error::Cancelled).await;
        assert!(
            matches!(result.unwrap_err(), Error::Cancelled),
            "Cancelled must pass through unchanged"
        );
        let (result, _) = dispatch(policy, Error::DeadlineExceeded).await;
        assert!(matches!(result.unwrap_err(), Error::DeadlineExceeded));

        // `ModelRetry` is the validator / schema-retry channel
        // (invariant 20). It must reach the chat-model retry loop
        // unwrapped — wrapping it as `ToolErrorTerminal` would
        // silently break the unified retry budget.
        let model_retry = Error::model_retry("re-check the shape".to_owned().for_llm(), 0);
        let (result, _) = dispatch(policy, model_retry).await;
        assert!(matches!(result.unwrap_err(), Error::ModelRetry { .. }));
    }

    #[tokio::test]
    async fn already_terminal_passes_through_unchanged() {
        // Parent's layer must not re-wrap a sub-agent's
        // already-terminal failure into a nested
        // `ToolErrorTerminal { source: ToolErrorTerminal { .. } }`
        // tower of boxes, and must keep the leaf tool's name rather
        // than substituting the parent invocation's `"stub"`.
        let stub = StubTool::new(|| {
            Err(Error::tool_error_terminal(
                ToolErrorKind::Auth,
                "sub_tool",
                Error::provider_http(401, "no auth"),
            ))
        });
        let layer = ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe());
        let mut svc = layer.layer(stub);
        let err = svc
            .ready()
            .await
            .unwrap()
            .call(invocation())
            .await
            .unwrap_err();
        match err {
            Error::ToolErrorTerminal {
                tool_name, source, ..
            } => {
                assert_eq!(
                    tool_name.as_str(),
                    "sub_tool",
                    "leaf tool name must survive propagation"
                );
                match *source {
                    // Source must remain the original Provider error,
                    // not a nested ToolErrorTerminal.
                    Error::Provider { .. } => {}
                    other => panic!("expected leaf Provider source, got {other:?}"),
                }
            }
            other => panic!("expected ToolErrorTerminal, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn success_is_uninstrumented() {
        let stub = StubTool::new(|| Ok(json!({"ok": true})));
        let layer = ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe());
        let mut svc = layer.layer(stub);
        let out = svc.ready().await.unwrap().call(invocation()).await.unwrap();
        assert_eq!(out, json!({"ok": true}));
    }
}