entelix_core/tools/error_policy.rs
1//! `ToolErrorPolicy` — operator-tunable gate that classifies
2//! tool-dispatch failures into "model-recoverable" vs "terminal" for
3//! the reasoning loop, plus the `tower::Layer` that applies it.
4//!
5//! ## Why a typed gate rather than per-recipe branching
6//!
7//! `ToolErrorKind::classify` already buckets every `Error` into one
8//! of seven categories. Without this policy the categories are
9//! observability metadata: dashboards split errors but the reasoning
10//! loop treats `Auth` (no LLM-actionable recovery), `Quota` (same),
11//! and `Validation` (the model authored the bad input) identically —
12//! feed the rendered error back to the model and let it retry until
13//! `recursion_limit` fires. The result is wasted tokens, latency,
14//! and a generic `recursion_limit_exceeded` event that hides what
15//! actually went wrong.
16//!
17//! `ToolErrorPolicy` closes that loop: operators declare which kinds
18//! have no LLM-actionable recovery; [`ToolErrorPolicyLayer`] wraps
19//! matching failures into [`crate::Error::ToolErrorTerminal`]; every
20//! reasoning recipe propagates the variant the same way it propagates
21//! [`crate::Error::Interrupted`] — return to the caller without
22//! re-prompting the model.
23//!
24//! ## Composition
25//!
26//! The layer wraps the tool-side `Service<ToolInvocation>` spine.
27//! Outer-policy reads intuitively (retry exhausts first, then the
28//! settled failure category reaches the gate):
29//!
30//! ```ignore
31//! use entelix_core::ToolRegistry;
32//! use entelix_core::tools::{RetryToolLayer, ToolErrorPolicy, ToolErrorPolicyLayer};
33//!
34//! let registry = ToolRegistry::new()
35//! .register(my_tool)?
36//! .layer(RetryToolLayer::new()) // innermost
37//! .layer(ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe())); // outermost
38//! # Ok::<(), entelix_core::Error>(())
39//! ```
40//!
41//! Reversing the order (policy inside retry) is also safe — the
42//! shipped [`crate::transports::DefaultRetryClassifier`] unwraps
43//! [`Error::ToolErrorTerminal`] and re-classifies against the inner
44//! source, so retry decisions stay identical regardless of layer
45//! ordering. Operators wiring a custom `RetryClassifier` should
46//! mirror that unwrap if they want the same property.
47//!
48//! ## Control-signal pass-through
49//!
50//! `Error::Cancelled`, `Error::DeadlineExceeded`,
51//! `Error::Interrupted`, and `Error::ModelRetry` are reasoning-loop
52//! control signals, not tool failures. The layer short-circuits them
53//! before classification so a policy that includes
54//! `ToolErrorKind::Internal` (the `_` catch-all bucket) never
55//! accidentally re-routes a cancellation, HITL pause, or
56//! validator-retry hint through the terminal channel.
57//!
58//! ## Sub-agent propagation
59//!
60//! When a sub-agent (registered on the parent as a `Tool`)
61//! terminates, the parent's `ToolErrorPolicyLayer` sees the inner
62//! `Error::ToolErrorTerminal` returned from `Subagent::execute`. The
63//! double-wrap guard passes it through unchanged — *the sub-agent
64//! already decided to terminate, and the parent cannot un-terminate*.
65//! `ToolErrorKind::classify` unwraps to the leaf's classification, so
66//! the parent's recipe loop catches the termination on the same
67//! `Err(Error::ToolErrorTerminal { .. })` arm regardless of whether
68//! the parent's policy is identical to the sub-agent's. The
69//! propagated variant preserves the **leaf tool's name** — operators
70//! see "BigQuery auth rotated" rather than "sub_agent failed", which
71//! is the actionable diagnostic. Sub-agent correlation rides on the
72//! `AuditSink::record_sub_agent_invoked` event already emitted at
73//! dispatch time.
74
75use std::task::{Context, Poll};
76
77use futures::future::BoxFuture;
78use serde_json::Value;
79use tower::{Layer, Service};
80
81use crate::error::{Error, Result};
82use crate::service::ToolInvocation;
83use crate::tools::{ToolErrorKind, ToolErrorKindSet};
84
85/// Operator-tunable classification gate consumed by
86/// [`ToolErrorPolicyLayer`].
87///
88/// The struct is intentionally small: one [`ToolErrorKindSet`]
89/// describing which kinds escalate to terminal. Operators that want
90/// finer-grained behaviour (per-tool overrides, per-tenant
91/// hot-swap) compose a higher layer that produces tenant-scoped
92/// `ToolErrorPolicy` values; the leaf layer here stays simple by
93/// design.
94#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
95pub struct ToolErrorPolicy {
96 /// `ToolErrorKind`s that cause the layer to wrap the inner
97 /// failure into [`crate::Error::ToolErrorTerminal`], short-
98 /// circuiting the reasoning loop. The empty default is the
99 /// no-op — every failure flows through unchanged and reaches the
100 /// model as a tool-result message.
101 pub terminate_on: ToolErrorKindSet,
102}
103
104impl ToolErrorPolicy {
105 /// Empty policy — every failure flows back to the model. The
106 /// `Default` shape, suitable for dev / debugging deployments.
107 #[must_use]
108 pub const fn new() -> Self {
109 Self {
110 terminate_on: ToolErrorKindSet::empty(),
111 }
112 }
113
114 /// Recommended production set: `Auth | Quota | Permanent`. Three
115 /// kinds the model cannot influence on retry — surfacing them to
116 /// the LLM only burns tokens before recursion-limit fires.
117 ///
118 /// `Internal` is deliberately excluded. `ToolErrorKind::classify`
119 /// routes the `_` catch-all (unclassified `Error` variants) to
120 /// `Internal`, so including it in the default would silently
121 /// terminate runs on every future `Error` variant that ships
122 /// without an explicit classifier arm. Operators that *want* the
123 /// catch-all behaviour add it explicitly with
124 /// `.add_terminal_kind(ToolErrorKind::Internal)`.
125 ///
126 /// `Transient` / `RateLimit` are excluded because
127 /// [`crate::tools::RetryToolLayer`] handles them transport-side;
128 /// what reaches this policy in those buckets is past the retry
129 /// budget and surfacing to the model gives it a chance to pick a
130 /// different tool or restructure the call. `Validation` is
131 /// excluded because the model authored the bad input — retrying
132 /// with a corrected payload is exactly the failure mode the
133 /// LLM-recoverable channel was designed for.
134 #[must_use]
135 pub const fn operator_safe() -> Self {
136 Self {
137 terminate_on: ToolErrorKindSet::empty()
138 .with(ToolErrorKind::Auth)
139 .with(ToolErrorKind::Quota)
140 .with(ToolErrorKind::Permanent),
141 }
142 }
143
144 /// Add `kind` to the terminate-on set. Fluent builder shape for
145 /// recipe wiring sites.
146 #[must_use]
147 pub const fn add_terminal_kind(mut self, kind: ToolErrorKind) -> Self {
148 self.terminate_on = self.terminate_on.with(kind);
149 self
150 }
151
152 /// Whether `kind` causes the layer to escalate to terminal.
153 #[must_use]
154 pub const fn classifies_terminal(self, kind: ToolErrorKind) -> bool {
155 self.terminate_on.contains(kind)
156 }
157}
158
159/// `tower::Layer` applying [`ToolErrorPolicy`] to every
160/// `Service<ToolInvocation>` dispatch. Cloning is a single-byte
161/// copy of the inner [`ToolErrorPolicy`] (no heap allocation).
162#[derive(Clone, Copy, Debug)]
163pub struct ToolErrorPolicyLayer {
164 policy: ToolErrorPolicy,
165}
166
167impl ToolErrorPolicyLayer {
168 /// Patch-version-stable identifier surfaced through
169 /// [`crate::ToolRegistry::layer_names`]. Renaming this constant
170 /// is a breaking change for dashboards keyed off the value.
171 pub const NAME: &'static str = "tool_error_policy";
172
173 /// Build the layer with the supplied policy.
174 #[must_use]
175 pub const fn new(policy: ToolErrorPolicy) -> Self {
176 Self { policy }
177 }
178
179 /// Borrow the wrapped policy — useful for diagnostic dumps and
180 /// `layer_names` correlation.
181 #[must_use]
182 pub const fn policy(&self) -> &ToolErrorPolicy {
183 &self.policy
184 }
185}
186
187impl crate::NamedLayer for ToolErrorPolicyLayer {
188 fn layer_name(&self) -> &'static str {
189 Self::NAME
190 }
191}
192
193impl<S> Layer<S> for ToolErrorPolicyLayer
194where
195 S: Service<ToolInvocation, Response = Value, Error = Error> + Clone + Send + 'static,
196 S::Future: Send + 'static,
197{
198 type Service = ToolErrorPolicyService<S>;
199
200 fn layer(&self, inner: S) -> Self::Service {
201 ToolErrorPolicyService {
202 inner,
203 policy: self.policy,
204 }
205 }
206}
207
208/// `Service<ToolInvocation>` produced by [`ToolErrorPolicyLayer`].
209#[derive(Clone, Debug)]
210pub struct ToolErrorPolicyService<Inner> {
211 inner: Inner,
212 policy: ToolErrorPolicy,
213}
214
215impl<Inner> Service<ToolInvocation> for ToolErrorPolicyService<Inner>
216where
217 Inner: Service<ToolInvocation, Response = Value, Error = Error> + Clone + Send + 'static,
218 Inner::Future: Send + 'static,
219{
220 type Response = Value;
221 type Error = Error;
222 type Future = BoxFuture<'static, Result<Value>>;
223
224 fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
225 self.inner.poll_ready(cx)
226 }
227
228 fn call(&mut self, invocation: ToolInvocation) -> Self::Future {
229 let mut inner = self.inner.clone();
230 let policy = self.policy;
231 let tool_name = invocation.metadata.name.clone();
232 // Audit handle only — tenant + thread scope ride on the
233 // adapter's own `ThreadKey`. Pulling the `Arc` out before the
234 // invocation moves into `inner.call` saves the full
235 // `ExecutionContext::clone` for the (cheap) `Option<Arc>` it
236 // actually consumes.
237 let audit = invocation.ctx.audit_sink();
238
239 Box::pin(async move {
240 let result = inner.call(invocation).await;
241 let Err(err) = result else {
242 return result;
243 };
244
245 // Reasoning-loop control signals are not tool failures —
246 // never reroute them through the terminal channel even
247 // if the operator's policy includes `Internal` (the
248 // `_` catch-all classifier would otherwise scoop them
249 // in). The set covers every typed `Error` variant that
250 // is *not* an outcome the operator policy is meant to
251 // classify:
252 //
253 // - `Cancelled` / `DeadlineExceeded` — `ExecutionContext`
254 // primitives raised by callers.
255 // - `Interrupted` — HITL pause-and-resume.
256 // - `ModelRetry` — the unified validator / schema-retry
257 // channel (invariant 20). A tool body raising this
258 // asks the model to retry the *current* turn; classifying
259 // it as `Internal` and terminating the loop would
260 // silently break the validator-retry budget.
261 if matches!(
262 err,
263 Error::Cancelled
264 | Error::DeadlineExceeded
265 | Error::Interrupted { .. }
266 | Error::ModelRetry { .. }
267 ) {
268 return Err(err);
269 }
270
271 // Classify with the unwrap arm built into
272 // `ToolErrorKind::classify` — `Error::ToolErrorTerminal`
273 // returns its inner `kind`, so an already-escalated
274 // failure (sub-agent propagation, or domain-side
275 // self-wrap at a `ToolFailure` boundary) classifies
276 // consistently with how this layer would have treated
277 // it had no prior wrap occurred.
278 let kind = ToolErrorKind::classify(&err);
279 if !policy.classifies_terminal(kind) {
280 return Err(err);
281 }
282
283 // Audit-sink emission — fire-and-forget per invariant
284 // 18. Failures inside the sink stay there. Emits on
285 // every terminal escalation that flows through the
286 // layer, regardless of where the original wrap
287 // originated — operators get one audit record per
288 // terminal run end without depending on the wrap
289 // call-site reaching the sink itself.
290 //
291 // For an already-wrapped failure, prefer the inner
292 // `tool_name` (the leaf identity) over this layer's
293 // `invocation.metadata.name` (the parent's view).
294 let recorded_tool: &str = match &err {
295 Error::ToolErrorTerminal {
296 tool_name: inner, ..
297 } => inner.as_str(),
298 _ => tool_name.as_str(),
299 };
300 if let Some(handle) = &audit {
301 handle
302 .as_sink()
303 .record_tool_error_terminal(kind, recorded_tool);
304 }
305
306 // Pass-through guard: an already-terminal error
307 // (sub-agent propagation or domain self-wrap) keeps
308 // its original `tool_name` and `source` — re-wrapping
309 // would obscure the leaf identity without adding
310 // signal.
311 if matches!(err, Error::ToolErrorTerminal { .. }) {
312 return Err(err);
313 }
314
315 Err(Error::tool_error_terminal(kind, tool_name, err))
316 })
317 }
318}
319
#[cfg(test)]
// Tests unwrap freely: a failed unwrap is itself the test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};

    use serde_json::{Value, json};
    use tower::{Layer, Service, ServiceExt};

    use crate::LlmRenderable;
    use crate::context::ExecutionContext;
    use crate::error::Error;
    use crate::service::ToolInvocation;
    use crate::tools::{ToolErrorKind, ToolMetadata};

    use super::*;

    /// Minimal `Service<ToolInvocation>` returning a preset error /
    /// success. Counts calls so tests can assert the inner service
    /// was invoked exactly once per dispatch (no double-attempt
    /// regression).
    #[derive(Clone)]
    struct StubTool {
        /// Number of `call` invocations, shared across clones.
        calls: Arc<AtomicUsize>,
        /// Factory producing the outcome returned by each call.
        outcome: Arc<dyn Fn() -> std::result::Result<Value, Error> + Send + Sync>,
    }

    impl StubTool {
        /// Builds a stub whose every call yields `outcome()`.
        fn new(
            outcome: impl Fn() -> std::result::Result<Value, Error> + Send + Sync + 'static,
        ) -> Self {
            Self {
                calls: Arc::new(AtomicUsize::new(0)),
                outcome: Arc::new(outcome),
            }
        }
    }

    impl Service<ToolInvocation> for StubTool {
        type Response = Value;
        type Error = Error;
        type Future = futures::future::BoxFuture<'static, std::result::Result<Value, Error>>;

        fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<std::result::Result<(), Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&mut self, _req: ToolInvocation) -> Self::Future {
            self.calls.fetch_add(1, Ordering::SeqCst);
            let outcome = Arc::clone(&self.outcome);
            Box::pin(async move { outcome() })
        }
    }

    /// Builds a throwaway invocation for a tool named `"stub"`.
    fn invocation() -> ToolInvocation {
        let metadata = Arc::new(ToolMetadata::function(
            "stub",
            "stub tool for tests",
            json!({"type": "object"}),
        ));
        ToolInvocation::new("tu-1".into(), metadata, json!({}), ExecutionContext::new())
    }

    /// Runs one dispatch through a layer built from `policy` over a
    /// stub failing with `outcome`; returns the result and the number
    /// of times the stub was called.
    async fn dispatch(
        policy: ToolErrorPolicy,
        outcome: Error,
    ) -> (std::result::Result<Value, Error>, usize) {
        let stub = StubTool::new(move || Err(clone_error(&outcome)));
        let layer = ToolErrorPolicyLayer::new(policy);
        let mut svc = layer.layer(stub.clone());
        let result = svc.ready().await.unwrap().call(invocation()).await;
        (result, stub.calls.load(Ordering::SeqCst))
    }

    /// Rebuilds an equivalent `Error` for the variants these tests
    /// use; any other variant degrades to an `invalid_request`
    /// carrying the original's rendered message.
    fn clone_error(err: &Error) -> Error {
        match err {
            Error::Provider { kind, message, .. } => match kind {
                crate::error::ProviderErrorKind::Http(s) => {
                    Error::provider_http(*s, message.clone())
                }
                crate::error::ProviderErrorKind::Network => {
                    Error::provider_network(message.clone())
                }
                crate::error::ProviderErrorKind::Tls => Error::provider_tls(message.clone()),
                crate::error::ProviderErrorKind::Dns => Error::provider_dns(message.clone()),
            },
            Error::Cancelled => Error::Cancelled,
            Error::DeadlineExceeded => Error::DeadlineExceeded,
            Error::InvalidRequest(s) => Error::invalid_request(s.clone()),
            Error::Config(s) => Error::config(s.clone()),
            Error::ModelRetry { hint, attempt } => Error::model_retry(hint.clone(), *attempt),
            other => Error::invalid_request(format!("unrepeatable in test: {other}")),
        }
    }

    #[tokio::test]
    async fn auth_failure_escalates_under_operator_safe_default() {
        let (result, calls) = dispatch(
            ToolErrorPolicy::operator_safe(),
            Error::provider_http(401, "unauthorized"),
        )
        .await;
        let err = result.unwrap_err();
        assert!(
            matches!(&err, Error::ToolErrorTerminal { kind, .. } if *kind == ToolErrorKind::Auth),
            "expected ToolErrorTerminal{{Auth}}, got {err:?}"
        );
        assert_eq!(calls, 1, "inner service must be invoked exactly once");
    }

    #[tokio::test]
    async fn validation_failure_passes_through_under_operator_safe() {
        // Validation stays loop-bound — the model authored the bad
        // input, retry is the whole point.
        let (result, calls) = dispatch(
            ToolErrorPolicy::operator_safe(),
            Error::invalid_request("bad input"),
        )
        .await;
        let err = result.unwrap_err();
        assert!(
            matches!(err, Error::InvalidRequest(_)),
            "Validation must NOT wrap under operator_safe"
        );
        assert_eq!(calls, 1);
    }

    #[tokio::test]
    async fn empty_policy_is_full_passthrough() {
        // Default `terminate_on = empty` preserves the no-shim
        // semantics — every error reaches the model. The layer is
        // installable everywhere with zero behaviour change until
        // operators opt in.
        let (result, _) = dispatch(
            ToolErrorPolicy::new(),
            Error::provider_http(401, "unauthorized"),
        )
        .await;
        assert!(matches!(result.unwrap_err(), Error::Provider { .. }));
    }

    #[tokio::test]
    async fn control_signals_short_circuit_classification() {
        // A policy that includes `Internal` (the `_` catch-all)
        // would scoop `Cancelled` / `DeadlineExceeded` /
        // `Interrupted` / `ModelRetry` if classification were
        // unconditional. The layer's pre-classification guard
        // prevents that — these are reasoning-loop control signals,
        // not tool-failure outcomes the operator policy is meant
        // to classify.
        let policy = ToolErrorPolicy::new().add_terminal_kind(ToolErrorKind::Internal);
        let (result, _) = dispatch(policy, Error::Cancelled).await;
        assert!(
            matches!(result.unwrap_err(), Error::Cancelled),
            "Cancelled must pass through unchanged"
        );
        let (result, _) = dispatch(policy, Error::DeadlineExceeded).await;
        assert!(matches!(result.unwrap_err(), Error::DeadlineExceeded));

        // `ModelRetry` is the validator / schema-retry channel
        // (invariant 20). It must reach the chat-model retry loop
        // unwrapped — wrapping it as `ToolErrorTerminal` would
        // silently break the unified retry budget.
        let model_retry = Error::model_retry("re-check the shape".to_owned().for_llm(), 0);
        let (result, _) = dispatch(policy, model_retry).await;
        assert!(matches!(result.unwrap_err(), Error::ModelRetry { .. }));
    }

    #[tokio::test]
    async fn already_terminal_passes_through_unchanged() {
        // Parent's layer must not re-wrap a sub-agent's
        // already-terminal failure into a nested
        // `ToolErrorTerminal { source: ToolErrorTerminal { .. } }`
        // tower of boxes, and must keep the leaf tool's name rather
        // than substituting the parent invocation's (`"stub"`).
        let stub = StubTool::new(|| {
            Err(Error::tool_error_terminal(
                ToolErrorKind::Auth,
                "sub_tool",
                Error::provider_http(401, "no auth"),
            ))
        });
        let layer = ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe());
        let mut svc = layer.layer(stub);
        let err = svc
            .ready()
            .await
            .unwrap()
            .call(invocation())
            .await
            .unwrap_err();
        match err {
            Error::ToolErrorTerminal {
                tool_name, source, ..
            } => {
                assert_eq!(
                    tool_name.as_str(),
                    "sub_tool",
                    "leaf tool name must survive the parent's layer"
                );
                match *source {
                    // Source must remain the original Provider error,
                    // not a nested ToolErrorTerminal.
                    Error::Provider { .. } => {}
                    other => panic!("expected leaf Provider source, got {other:?}"),
                }
            }
            other => panic!("expected ToolErrorTerminal, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn success_is_uninstrumented() {
        let stub = StubTool::new(|| Ok(json!({"ok": true})));
        let layer = ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe());
        let mut svc = layer.layer(stub);
        let out = svc.ready().await.unwrap().call(invocation()).await.unwrap();
        assert_eq!(out, json!({"ok": true}));
    }
}
535}