// entelix-core 0.6.1

//! `ToolErrorPolicy` — operator-tunable gate that classifies
//! tool-dispatch failures into "model-recoverable" vs "terminal" for
//! the reasoning loop, plus the `tower::Layer` that applies it.
//!
//! ## Why a typed gate rather than per-recipe branching
//!
//! `ToolErrorKind::classify` already buckets every `Error` into one
//! of seven categories. Without this policy the categories are
//! observability metadata: dashboards split errors but the reasoning
//! loop treats `Auth` (no LLM-actionable recovery), `Quota` (same),
//! and `Validation` (the model authored the bad input) identically —
//! feed the rendered error back to the model and let it retry until
//! `recursion_limit` fires. The result is wasted tokens, latency,
//! and a generic `recursion_limit_exceeded` event that hides what
//! actually went wrong.
//!
//! `ToolErrorPolicy` closes that loop: operators declare which kinds
//! have no LLM-actionable recovery; [`ToolErrorPolicyLayer`] wraps
//! matching failures into [`crate::Error::ToolErrorTerminal`]; every
//! reasoning recipe propagates the variant the same way it propagates
//! [`crate::Error::Interrupted`] — return to the caller without
//! re-prompting the model.
//!
//! ## Composition
//!
//! The layer wraps the tool-side `Service<ToolInvocation>` spine.
//! Outer-policy reads intuitively (retry exhausts first, then the
//! settled failure category reaches the gate):
//!
//! ```ignore
//! use entelix_core::ToolRegistry;
//! use entelix_core::tools::{RetryToolLayer, ToolErrorPolicy, ToolErrorPolicyLayer};
//!
//! let registry = ToolRegistry::new()
//!     .register(my_tool)?
//!     .layer(RetryToolLayer::new())                                         // innermost
//!     .layer(ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe())); // outermost
//! # Ok::<(), entelix_core::Error>(())
//! ```
//!
//! Reversing the order (policy inside retry) is also safe — the
//! shipped [`crate::transports::DefaultRetryClassifier`] unwraps
//! [`Error::ToolErrorTerminal`] and re-classifies against the inner
//! source, so retry decisions stay identical regardless of layer
//! ordering. Operators wiring a custom `RetryClassifier` should
//! mirror that unwrap if they want the same property.
//!
//! ## Control-signal pass-through
//!
//! `Error::Cancelled`, `Error::DeadlineExceeded`,
//! `Error::Interrupted`, and `Error::ModelRetry` are reasoning-loop
//! control signals, not tool failures. The layer short-circuits them
//! before classification so a policy that includes
//! `ToolErrorKind::Internal` (the `_` catch-all bucket) never
//! accidentally re-routes a cancellation, HITL pause, or
//! validator-retry hint through the terminal channel.
//!
//! ## Sub-agent propagation
//!
//! When a sub-agent (registered on the parent as a `Tool`)
//! terminates, the parent's `ToolErrorPolicyLayer` sees the inner
//! `Error::ToolErrorTerminal` returned from `Subagent::execute`. The
//! double-wrap guard passes it through unchanged — *the sub-agent
//! already decided to terminate, and the parent cannot un-terminate*.
//! `ToolErrorKind::classify` unwraps to the leaf's classification, so
//! the parent's recipe loop catches the termination on the same
//! `Err(Error::ToolErrorTerminal { .. })` arm regardless of whether
//! the parent's policy is identical to the sub-agent's. The
//! propagated variant preserves the **leaf tool's name** — operators
//! see "BigQuery auth rotated" rather than "sub_agent failed", which
//! is the actionable diagnostic. Sub-agent correlation rides on the
//! `AuditSink::record_sub_agent_invoked` event already emitted at
//! dispatch time.

use std::task::{Context, Poll};

use futures::future::BoxFuture;
use serde_json::Value;
use tower::{Layer, Service};

use crate::error::{Error, Result};
use crate::service::ToolInvocation;
use crate::tools::{ToolErrorKind, ToolErrorKindSet};

/// Operator-tunable classification gate consumed by
/// [`ToolErrorPolicyLayer`].
///
/// The struct is intentionally small: a single [`ToolErrorKindSet`]
/// naming the kinds that escalate to terminal. It derives `Copy`, so
/// it is passed and stored by value. Operators that want
/// finer-grained behaviour (per-tool overrides, per-tenant
/// hot-swap) compose a higher layer that produces tenant-scoped
/// `ToolErrorPolicy` values; the leaf layer here stays simple by
/// design.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct ToolErrorPolicy {
    /// `ToolErrorKind`s that cause the layer to wrap the inner
    /// failure into [`crate::Error::ToolErrorTerminal`], short-
    /// circuiting the reasoning loop. A kind that is not in the set
    /// is returned to the caller unchanged, so an empty set makes
    /// the layer a no-op — every failure flows through as-is and
    /// reaches the model as a tool-result message.
    pub terminate_on: ToolErrorKindSet,
}

impl ToolErrorPolicy {
    /// Policy with an empty terminate-on set: no failure kind is
    /// escalated, so every error flows back to the model. Suitable
    /// for dev / debugging deployments.
    #[must_use]
    pub const fn new() -> Self {
        let terminate_on = ToolErrorKindSet::empty();
        Self { terminate_on }
    }

    /// Recommended production set: `Auth | Quota | Permanent` — the
    /// three kinds the model cannot influence by retrying, where
    /// surfacing the error to the LLM only burns tokens until the
    /// recursion limit fires.
    ///
    /// What is left out, and why:
    ///
    /// - `Internal` — `ToolErrorKind::classify` routes its `_`
    ///   catch-all (unclassified `Error` variants) here, so a default
    ///   that included it would silently terminate runs on every
    ///   future `Error` variant shipped without an explicit
    ///   classifier arm. Operators that *want* that behaviour opt in
    ///   with `.add_terminal_kind(ToolErrorKind::Internal)`.
    /// - `Transient` / `RateLimit` — [`crate::tools::RetryToolLayer`]
    ///   handles these transport-side; whatever still reaches this
    ///   policy is past the retry budget, and surfacing it lets the
    ///   model pick a different tool or restructure the call.
    /// - `Validation` — the model authored the bad input, and
    ///   retrying with a corrected payload is exactly what the
    ///   LLM-recoverable channel exists for.
    #[must_use]
    pub const fn operator_safe() -> Self {
        Self::new()
            .add_terminal_kind(ToolErrorKind::Auth)
            .add_terminal_kind(ToolErrorKind::Quota)
            .add_terminal_kind(ToolErrorKind::Permanent)
    }

    /// Return a copy of this policy with `kind` added to the
    /// terminate-on set. Consumes and returns `Self`, so calls chain
    /// fluently at recipe wiring sites.
    #[must_use]
    pub const fn add_terminal_kind(self, kind: ToolErrorKind) -> Self {
        Self {
            terminate_on: self.terminate_on.with(kind),
        }
    }

    /// `true` when `kind` is in the terminate-on set, i.e. when the
    /// layer escalates failures of that kind to terminal.
    #[must_use]
    pub const fn classifies_terminal(self, kind: ToolErrorKind) -> bool {
        let Self { terminate_on } = self;
        terminate_on.contains(kind)
    }
}

/// `tower::Layer` applying [`ToolErrorPolicy`] to every
/// `Service<ToolInvocation>` dispatch.
///
/// The layer stores nothing but the policy and both types derive
/// `Copy`, so cloning is a plain value copy (no heap allocation).
#[derive(Clone, Copy, Debug)]
pub struct ToolErrorPolicyLayer {
    /// Policy copied into every [`ToolErrorPolicyService`] this
    /// layer produces.
    policy: ToolErrorPolicy,
}

impl ToolErrorPolicyLayer {
    /// Patch-version-stable identifier surfaced through
    /// [`crate::ToolRegistry::layer_names`]. Changing the string
    /// value is a breaking change for dashboards keyed off it.
    pub const NAME: &'static str = "tool_error_policy";

    /// Build the layer with the supplied policy. The policy is
    /// stored by value and copied into each service the layer wraps.
    #[must_use]
    pub const fn new(policy: ToolErrorPolicy) -> Self {
        Self { policy }
    }

    /// Borrow the wrapped policy — useful for diagnostic dumps and
    /// `layer_names` correlation.
    #[must_use]
    pub const fn policy(&self) -> &ToolErrorPolicy {
        &self.policy
    }
}

/// Names the layer with the stable [`ToolErrorPolicyLayer::NAME`]
/// identifier.
impl crate::NamedLayer for ToolErrorPolicyLayer {
    /// Always returns [`ToolErrorPolicyLayer::NAME`]; the name does
    /// not depend on the configured policy.
    fn layer_name(&self) -> &'static str {
        Self::NAME
    }
}

impl<S> Layer<S> for ToolErrorPolicyLayer
where
    S: Service<ToolInvocation, Response = Value, Error = Error> + Clone + Send + 'static,
    S::Future: Send + 'static,
{
    type Service = ToolErrorPolicyService<S>;

    /// Wrap `inner` in a [`ToolErrorPolicyService`] carrying a copy
    /// of this layer's policy. The layer itself is left untouched and
    /// can wrap any number of services.
    fn layer(&self, inner: S) -> Self::Service {
        let Self { policy } = *self;
        ToolErrorPolicyService { inner, policy }
    }
}

/// `Service<ToolInvocation>` produced by [`ToolErrorPolicyLayer`].
///
/// Forwards every invocation to `inner`. Successful results are
/// returned untouched; failures are classified with
/// `ToolErrorKind::classify` and, when the policy marks the kind as
/// terminal, surfaced as [`crate::Error::ToolErrorTerminal`].
#[derive(Clone, Debug)]
pub struct ToolErrorPolicyService<Inner> {
    /// The wrapped tool-side service that performs the dispatch.
    inner: Inner,
    /// Policy copied from the layer when this service was built.
    policy: ToolErrorPolicy,
}

impl<Inner> Service<ToolInvocation> for ToolErrorPolicyService<Inner>
where
    Inner: Service<ToolInvocation, Response = Value, Error = Error> + Clone + Send + 'static,
    Inner::Future: Send + 'static,
{
    type Response = Value;
    type Error = Error;
    type Future = BoxFuture<'static, Result<Value>>;

    /// Readiness is delegated to the wrapped service — the policy
    /// gate holds no state that could apply backpressure itself.
    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
        self.inner.poll_ready(cx)
    }

    /// Dispatch `invocation` to the wrapped service and apply the
    /// policy to a failure.
    ///
    /// Outcomes, in evaluation order:
    ///
    /// 1. `Ok(_)` — returned untouched.
    /// 2. Reasoning-loop control signals (`Cancelled`,
    ///    `DeadlineExceeded`, `Interrupted`, `ModelRetry`) — returned
    ///    untouched, never classified.
    /// 3. A failure whose kind is not in the policy — returned
    ///    untouched.
    /// 4. A failure whose kind is in the policy — recorded on the
    ///    audit sink (when one is attached) and returned as
    ///    `Error::ToolErrorTerminal`. An error that already is
    ///    `ToolErrorTerminal` is returned as-is rather than wrapped
    ///    a second time.
    fn call(&mut self, invocation: ToolInvocation) -> Self::Future {
        // `poll_ready` drove `self.inner` to readiness; a fresh clone
        // carries no such guarantee. Move the ready instance into the
        // future and leave the clone behind for the next
        // `poll_ready` / `call` cycle, instead of calling the clone
        // and stranding the instance that was actually polled.
        let replacement = self.inner.clone();
        let mut inner = std::mem::replace(&mut self.inner, replacement);
        let policy = self.policy;
        let tool_name = invocation.metadata.name.clone();
        // Audit handle only — tenant + thread scope ride on the
        // adapter's own `ThreadKey`. Pulling the handle out before
        // the invocation moves into `inner.call` avoids cloning the
        // whole `ExecutionContext` just to reach the sink later.
        let audit = invocation.ctx.audit_sink();

        Box::pin(async move {
            let result = inner.call(invocation).await;
            let Err(err) = result else {
                return result;
            };

            // Reasoning-loop control signals are not tool failures —
            // never reroute them through the terminal channel even
            // if the operator's policy includes `Internal` (the
            // `_` catch-all classifier would otherwise scoop them
            // in). The set covers every typed `Error` variant that
            // is *not* an outcome the operator policy is meant to
            // classify:
            //
            // - `Cancelled` / `DeadlineExceeded` — `ExecutionContext`
            //   primitives raised by callers.
            // - `Interrupted` — HITL pause-and-resume.
            // - `ModelRetry` — the unified validator / schema-retry
            //   channel (invariant 20). A tool body raising this
            //   asks the model to retry the *current* turn; classifying
            //   it as `Internal` and terminating the loop would
            //   silently break the validator-retry budget.
            if matches!(
                err,
                Error::Cancelled
                    | Error::DeadlineExceeded
                    | Error::Interrupted { .. }
                    | Error::ModelRetry { .. }
            ) {
                return Err(err);
            }

            // Classify with the unwrap arm built into
            // `ToolErrorKind::classify` — `Error::ToolErrorTerminal`
            // returns its inner `kind`, so an already-escalated
            // failure (sub-agent propagation, or domain-side
            // self-wrap at a `ToolFailure` boundary) classifies
            // consistently with how this layer would have treated
            // it had no prior wrap occurred.
            let kind = ToolErrorKind::classify(&err);
            if !policy.classifies_terminal(kind) {
                // Not terminal under *this* layer's policy: hand the
                // error back unchanged. An already-wrapped
                // `ToolErrorTerminal` also leaves through this exit
                // (still terminal for the caller, but without an
                // audit record from this layer).
                return Err(err);
            }

            // Audit-sink emission — fire-and-forget per invariant
            // 18. Failures inside the sink stay there. Emits for
            // every failure this layer's policy marks terminal,
            // regardless of where the original wrap originated, so
            // the record does not depend on the wrap call-site
            // reaching the sink itself.
            //
            // For an already-wrapped failure, prefer the inner
            // `tool_name` (the leaf identity) over this layer's
            // `invocation.metadata.name` (the parent's view).
            let recorded_tool: &str = match &err {
                Error::ToolErrorTerminal {
                    tool_name: inner, ..
                } => inner.as_str(),
                _ => tool_name.as_str(),
            };
            if let Some(handle) = &audit {
                handle
                    .as_sink()
                    .record_tool_error_terminal(kind, recorded_tool);
            }

            // Pass-through guard: an already-terminal error
            // (sub-agent propagation or domain self-wrap) keeps
            // its original `tool_name` and `source` — re-wrapping
            // would obscure the leaf identity without adding
            // signal.
            if matches!(err, Error::ToolErrorTerminal { .. }) {
                return Err(err);
            }

            Err(Error::tool_error_terminal(kind, tool_name, err))
        })
    }
}

#[cfg(test)]
// Tests unwrap on purpose: a failed unwrap is the test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};

    use serde_json::{Value, json};
    use tower::{Layer, Service, ServiceExt};

    use crate::LlmRenderable;
    use crate::context::ExecutionContext;
    use crate::error::Error;
    use crate::service::ToolInvocation;
    use crate::tools::{ToolErrorKind, ToolMetadata};

    use super::*;

    /// Minimal `Service<ToolInvocation>` returning a preset error /
    /// success. Counts calls so tests assert the inner service was
    /// invoked exactly once per dispatch (no double-attempt
    /// regression). The counter is shared across clones, so a clone
    /// handed to the layer still reports into the original.
    #[derive(Clone)]
    struct StubTool {
        calls: Arc<AtomicUsize>,
        outcome: Arc<dyn Fn() -> std::result::Result<Value, Error> + Send + Sync>,
    }

    impl StubTool {
        /// Build a stub whose every call yields a fresh `outcome()`.
        fn new(
            outcome: impl Fn() -> std::result::Result<Value, Error> + Send + Sync + 'static,
        ) -> Self {
            Self {
                calls: Arc::new(AtomicUsize::new(0)),
                outcome: Arc::new(outcome),
            }
        }
    }

    impl Service<ToolInvocation> for StubTool {
        type Response = Value;
        type Error = Error;
        type Future = futures::future::BoxFuture<'static, std::result::Result<Value, Error>>;

        fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<std::result::Result<(), Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&mut self, _req: ToolInvocation) -> Self::Future {
            self.calls.fetch_add(1, Ordering::SeqCst);
            let outcome = Arc::clone(&self.outcome);
            Box::pin(async move { outcome() })
        }
    }

    /// Invocation of a tool named `"stub"` with empty input and a
    /// fresh `ExecutionContext`.
    fn invocation() -> ToolInvocation {
        let metadata = Arc::new(ToolMetadata::function(
            "stub",
            "stub tool for tests",
            json!({"type": "object"}),
        ));
        ToolInvocation::new("tu-1".into(), metadata, json!({}), ExecutionContext::new())
    }

    /// Run one dispatch through the layer with a stub that fails
    /// with (a reconstruction of) `outcome`. Returns the layer's
    /// result and how many times the inner stub was called.
    async fn dispatch(
        policy: ToolErrorPolicy,
        outcome: Error,
    ) -> (std::result::Result<Value, Error>, usize) {
        let stub = StubTool::new(move || Err(clone_error(&outcome)));
        let layer = ToolErrorPolicyLayer::new(policy);
        let mut svc = layer.layer(stub.clone());
        let result = svc.ready().await.unwrap().call(invocation()).await;
        (result, stub.calls.load(Ordering::SeqCst))
    }

    /// Rebuild an equivalent `Error` for the variants these tests
    /// use; the stub's outcome closure is `Fn`, so it needs a fresh
    /// value on every call. Anything else degrades to an
    /// `InvalidRequest` that names the unsupported variant.
    fn clone_error(err: &Error) -> Error {
        match err {
            Error::Provider { kind, message, .. } => match kind {
                crate::error::ProviderErrorKind::Http(s) => {
                    Error::provider_http(*s, message.clone())
                }
                crate::error::ProviderErrorKind::Network => {
                    Error::provider_network(message.clone())
                }
                crate::error::ProviderErrorKind::Tls => Error::provider_tls(message.clone()),
                crate::error::ProviderErrorKind::Dns => Error::provider_dns(message.clone()),
            },
            Error::Cancelled => Error::Cancelled,
            Error::DeadlineExceeded => Error::DeadlineExceeded,
            Error::InvalidRequest(s) => Error::invalid_request(s.clone()),
            Error::Config(s) => Error::config(s.clone()),
            Error::ModelRetry { hint, attempt } => Error::model_retry(hint.clone(), *attempt),
            other => Error::invalid_request(format!("unrepeatable in test: {other}")),
        }
    }

    #[tokio::test]
    async fn auth_failure_escalates_under_operator_safe_default() {
        let (result, calls) = dispatch(
            ToolErrorPolicy::operator_safe(),
            Error::provider_http(401, "unauthorized"),
        )
        .await;
        let err = result.unwrap_err();
        assert!(
            matches!(&err, Error::ToolErrorTerminal { kind, .. } if *kind == ToolErrorKind::Auth),
            "expected ToolErrorTerminal{{Auth}}, got {err:?}"
        );
        assert_eq!(calls, 1, "inner service must be invoked exactly once");
    }

    #[tokio::test]
    async fn validation_failure_passes_through_under_operator_safe() {
        // Validation stays loop-bound — the model authored the bad
        // input, retry is the whole point.
        let (result, calls) = dispatch(
            ToolErrorPolicy::operator_safe(),
            Error::invalid_request("bad input"),
        )
        .await;
        let err = result.unwrap_err();
        assert!(
            matches!(err, Error::InvalidRequest(_)),
            "Validation must NOT wrap under operator_safe"
        );
        assert_eq!(calls, 1);
    }

    #[tokio::test]
    async fn empty_policy_is_full_passthrough() {
        // Default `terminate_on = empty` preserves the no-shim
        // semantics — every error reaches the model. The layer is
        // installable everywhere with zero behaviour change until
        // operators opt in.
        let (result, _) = dispatch(
            ToolErrorPolicy::new(),
            Error::provider_http(401, "unauthorized"),
        )
        .await;
        assert!(matches!(result.unwrap_err(), Error::Provider { .. }));
    }

    #[tokio::test]
    async fn control_signals_short_circuit_classification() {
        // A policy that includes `Internal` (the `_` catch-all)
        // would scoop `Cancelled` / `DeadlineExceeded` /
        // `Interrupted` / `ModelRetry` if classification were
        // unconditional. The layer's pre-classification guard
        // prevents that — these are reasoning-loop control signals,
        // not tool-failure outcomes the operator policy is meant
        // to classify.
        let policy = ToolErrorPolicy::new().add_terminal_kind(ToolErrorKind::Internal);
        let (result, _) = dispatch(policy, Error::Cancelled).await;
        assert!(
            matches!(result.unwrap_err(), Error::Cancelled),
            "Cancelled must pass through unchanged"
        );
        let (result, _) = dispatch(policy, Error::DeadlineExceeded).await;
        assert!(matches!(result.unwrap_err(), Error::DeadlineExceeded));

        // `ModelRetry` is the validator / schema-retry channel
        // (invariant 20). It must reach the chat-model retry loop
        // unwrapped — wrapping it as `ToolErrorTerminal` would
        // silently break the unified retry budget.
        let model_retry = Error::model_retry("re-check the shape".to_owned().for_llm(), 0);
        let (result, _) = dispatch(policy, model_retry).await;
        assert!(matches!(result.unwrap_err(), Error::ModelRetry { .. }));
    }

    #[tokio::test]
    async fn already_terminal_passes_through_unchanged() {
        // Parent's layer must not re-wrap a sub-agent's
        // already-terminal failure into a nested
        // `ToolErrorTerminal { source: ToolErrorTerminal { .. } }`
        // tower of boxes.
        let stub = StubTool::new(move || {
            Err(Error::tool_error_terminal(
                ToolErrorKind::Auth,
                "sub_tool",
                Error::provider_http(401, "no auth"),
            ))
        });
        let layer = ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe());
        let mut svc = layer.layer(stub);
        let err = svc
            .ready()
            .await
            .unwrap()
            .call(invocation())
            .await
            .unwrap_err();
        match err {
            Error::ToolErrorTerminal {
                source, tool_name, ..
            } => {
                // The leaf identity must survive: the parent-side
                // invocation is named "stub", the failing leaf is
                // "sub_tool".
                assert_eq!(
                    tool_name.as_str(),
                    "sub_tool",
                    "leaf tool name must be preserved"
                );
                match *source {
                    // Source must remain the original Provider error,
                    // not a nested ToolErrorTerminal.
                    Error::Provider { .. } => {}
                    other => panic!("expected leaf Provider source, got {other:?}"),
                }
            }
            other => panic!("expected ToolErrorTerminal, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn success_is_uninstrumented() {
        let stub = StubTool::new(|| Ok(json!({"ok": true})));
        let layer = ToolErrorPolicyLayer::new(ToolErrorPolicy::operator_safe());
        let mut svc = layer.layer(stub);
        let out = svc.ready().await.unwrap().call(invocation()).await.unwrap();
        assert_eq!(out, json!({"ok": true}));
    }
}