entelix-core 0.6.0

//! `ToolErrorKind` — tool-dispatch failure category derived from
//! [`crate::Error`] for observability and retry classification.
//!
//! Tool authors return `Result<Value, Error>` from `Tool::execute`;
//! the runtime classifies the error variant into one of these
//! seven categories so observability sinks (`AgentEvent::ToolError`),
//! retry middleware (`RetryToolLayer`), and recovery sinks all
//! reach the same cross-tool taxonomy.
//!
//! Mirrors [`crate::ProviderErrorKind`] in shape (typed enum
//! categorising failures) but operates at a higher level — provider
//! kinds describe transport mechanisms, tool kinds describe the
//! semantic outcome the operator (or the model) actually cares about.

use crate::error::Error;

/// Cross-tool failure category.
///
/// Derive from [`Error`] via [`Self::classify`]. Used for retry
/// middleware (`RetryToolLayer` retries [`Self::Transient`] /
/// [`Self::RateLimit`]), observability sinks (operators surface the
/// category in dashboards), and downstream recovery routing
/// (different categories trigger different operator responses —
/// page on `Auth`, alert on `Quota`, ignore `Validation` noise).
///
/// Serialises in snake_case (`rate_limit`, `quota`, …) — the same
/// spelling [`Self::wire_id`] and the `Display` impl produce. The
/// enum is `#[non_exhaustive]`, so matchers outside this crate need
/// a wildcard arm.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ToolErrorKind {
    /// Network / TLS / DNS transport failure, HTTP 5xx, or HTTP
    /// 408 / 425 — safe to retry.
    Transient,
    /// Vendor signalled rate limiting (429 with `Retry-After` hint).
    /// Retryable after the configured cooldown.
    RateLimit,
    /// Vendor signalled exhausted quota / billing cap — a 429 that
    /// carries no retry hint, or [`Error::UsageLimitExceeded`].
    /// Retry will not succeed until the quota resets or operator
    /// intervenes — surface to ops, do not retry automatically.
    Quota,
    /// Credential rejected (401 / 403 / [`Error::Auth`]). Retry will
    /// not succeed until credentials are rotated.
    Auth,
    /// Permanent vendor failure — any HTTP status not claimed by
    /// the categories above (404, 405, 410, 422 …). The same call
    /// will fail again.
    Permanent,
    /// Caller-side input rejected ([`Error::InvalidRequest`],
    /// [`Error::Serde`]) — the operator's payload does not match
    /// the tool contract. Retry is meaningless without changing the
    /// payload.
    Validation,
    /// Tool-internal bug or misconfiguration ([`Error::Config`]),
    /// and the fallback for every error shape [`Self::classify`]
    /// does not map explicitly. Surface to ops; retry is not
    /// meaningful.
    Internal,
}

impl ToolErrorKind {
    /// Map an [`Error`] onto its cross-tool category.
    ///
    /// Every variant [`Error`] surfaces today is either matched
    /// explicitly or deliberately left to the trailing `_` arm,
    /// which yields [`Self::Internal`] so that future variants stay
    /// observable until someone classifies them. The operational
    /// variants ([`Error::Cancelled`], [`Error::DeadlineExceeded`],
    /// [`Error::Interrupted`], [`Error::ModelRetry`]) also land in
    /// `Internal`: they are agent-runtime control signals rather
    /// than tool failures, and call sites that observe them are not
    /// expected to reach this classifier at all.
    #[must_use]
    pub fn classify(error: &Error) -> Self {
        use crate::error::ProviderErrorKind;
        match error {
            // A terminal wrapper (raised by `ToolErrorPolicyLayer`
            // or propagated from a sub-agent) already carries its
            // classification. Returning the stored kind lets every
            // observer — `AgentEvent::ToolError::kind`, the OTel
            // `entelix.tool.error_kind` attribute, and
            // `DefaultRetryClassifier` introspection — see the
            // operator-actionable category no matter how many
            // layers re-encountered the wrapped error. The policy
            // layer's double-wrap guard passes an already-wrapped
            // variant through unchanged, so a parent policy does
            // not re-wrap it.
            Error::ToolErrorTerminal { kind, .. } => *kind,

            // Transport-level provider failures: retry-safe.
            Error::Provider {
                kind: ProviderErrorKind::Network | ProviderErrorKind::Tls | ProviderErrorKind::Dns,
                ..
            } => Self::Transient,

            // 429 splits on the retry hint: with one it is ordinary
            // back-pressure, without one it is usually quota
            // exhaustion.
            Error::Provider {
                kind: ProviderErrorKind::Http(429),
                retry_after,
                ..
            } if retry_after.is_some() => Self::RateLimit,
            Error::Provider {
                kind: ProviderErrorKind::Http(429),
                ..
            } => Self::Quota,

            // Remaining HTTP statuses, most specific first.
            Error::Provider {
                kind: ProviderErrorKind::Http(401 | 403),
                ..
            } => Self::Auth,
            Error::Provider {
                kind: ProviderErrorKind::Http(408 | 425 | 500..=599),
                ..
            } => Self::Transient,
            Error::Provider {
                kind: ProviderErrorKind::Http(_),
                ..
            } => Self::Permanent,

            Error::Auth(_) => Self::Auth,
            Error::UsageLimitExceeded(_) => Self::Quota,
            Error::InvalidRequest(_) | Error::Serde(_) => Self::Validation,

            // Config, the operational variants (Cancelled,
            // DeadlineExceeded, Interrupted, ModelRetry) and any
            // shape added later all end up here — none of them is a
            // tool failure the operator can act on per category.
            _ => Self::Internal,
        }
    }

    /// Whether the runtime should attempt the tool call again.
    ///
    /// Only `Transient` and `RateLimit` answer `true`; every other
    /// category is a surface-and-stop signal. `RetryToolLayer`
    /// reaches this through its `RetryClassifier`, which can be
    /// overridden per deployment — operators wanting a different
    /// retry policy install a custom classifier instead of changing
    /// this method.
    #[must_use]
    pub const fn is_retryable(self) -> bool {
        match self {
            Self::Transient | Self::RateLimit => true,
            Self::Quota | Self::Auth | Self::Permanent | Self::Validation | Self::Internal => false,
        }
    }

    /// Stable snake-case identifier emitted through OTel
    /// (`entelix.tool.error_kind`), structured logs, and audit
    /// `GraphEvent` serialisation. Patch-version stable: downstream
    /// consumers key off the string, so renaming a value is a
    /// breaking change.
    #[must_use]
    pub const fn wire_id(self) -> &'static str {
        match self {
            Self::Transient => "transient",
            Self::RateLimit => "rate_limit",
            Self::Quota => "quota",
            Self::Auth => "auth",
            Self::Permanent => "permanent",
            Self::Validation => "validation",
            Self::Internal => "internal",
        }
    }

    /// Stable bit position this kind occupies inside
    /// [`ToolErrorKindSet`].
    ///
    /// A new variant cannot be added without extending this match:
    /// inside the defining crate the match must be exhaustive
    /// (`#[non_exhaustive]` only affects external matchers). The
    /// `bit_indices_are_unique_and_fit_in_set_width` regression test
    /// then checks that the new position is unique and fits in the
    /// set's backing integer. Should that test fail, widen the
    /// integer in lockstep with the taxonomy — that is the only
    /// correct response.
    #[must_use]
    pub(crate) const fn bit_index(self) -> u32 {
        match self {
            Self::Transient => 0,
            Self::RateLimit => 1,
            Self::Quota => 2,
            Self::Auth => 3,
            Self::Permanent => 4,
            Self::Validation => 5,
            Self::Internal => 6,
        }
    }
}

impl std::fmt::Display for ToolErrorKind {
    /// Writes the snake_case identifier returned by
    /// [`Self::wire_id`], so dashboards, log lines, and the
    /// `Error::ToolErrorTerminal` Display all show a given kind with
    /// exactly the same spelling.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label: &'static str = self.wire_id();
        f.write_str(label)
    }
}

/// Compact set of [`ToolErrorKind`] values. Backed by a fixed-width
/// integer bitset — every variant occupies one bit at the position
/// returned by [`ToolErrorKind::bit_index`]. The backing integer is
/// currently a `u16`, of which the seven existing variants use bits
/// 0 through 6.
///
/// The derived `Default` is the empty set (all bits clear), the
/// same value [`Self::empty`] returns.
///
/// The type lives next to [`ToolErrorKind`] rather than in a
/// dedicated `set` module so the bit layout and the enum stay in
/// one place. The `bit_indices_are_unique_and_fit_in_set_width`
/// regression test below uses a same-crate exhaustive match to
/// force every new variant through a uniqueness + width check —
/// landing a variant whose `bit_index` exceeds the set width fails
/// the test, and the developer widens the backing integer in
/// lockstep with the taxonomy.
///
/// Construction is `const`-friendly — operator defaults and unit
/// tests build sets at compile time:
///
/// ```
/// use entelix_core::tools::{ToolErrorKind, ToolErrorKindSet};
/// const SAFE: ToolErrorKindSet = ToolErrorKindSet::empty()
///     .with(ToolErrorKind::Auth)
///     .with(ToolErrorKind::Quota)
///     .with(ToolErrorKind::Permanent);
/// assert!(SAFE.contains(ToolErrorKind::Auth));
/// assert!(!SAFE.contains(ToolErrorKind::Transient));
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub struct ToolErrorKindSet(u16);

impl ToolErrorKindSet {
    /// Width of the underlying integer in bits — the maximum number
    /// of [`ToolErrorKind`] variants the set can hold. Used by the
    /// regression test that pins variant count against bitset
    /// capacity.
    #[cfg(test)]
    const CAPACITY_BITS: u32 = u16::BITS;

    /// Empty set — no kinds.
    #[must_use]
    pub const fn empty() -> Self {
        Self(0)
    }

    /// Single-kind set — useful at literal call sites.
    #[must_use]
    pub const fn singleton(kind: ToolErrorKind) -> Self {
        Self(1u16 << kind.bit_index())
    }

    /// Insert `kind` into the set, returning the updated set.
    #[must_use]
    pub const fn with(self, kind: ToolErrorKind) -> Self {
        Self(self.0 | (1u16 << kind.bit_index()))
    }

    /// Remove `kind` from the set, returning the updated set.
    #[must_use]
    pub const fn without(self, kind: ToolErrorKind) -> Self {
        Self(self.0 & !(1u16 << kind.bit_index()))
    }

    /// Union with another set, returning the updated set.
    #[must_use]
    pub const fn union(self, other: Self) -> Self {
        Self(self.0 | other.0)
    }

    /// Whether `kind` is in the set.
    #[must_use]
    pub const fn contains(self, kind: ToolErrorKind) -> bool {
        (self.0 >> kind.bit_index()) & 1 == 1
    }

    /// Whether the set has no kinds.
    #[must_use]
    pub const fn is_empty(self) -> bool {
        self.0 == 0
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Shorthand for running the classifier under test.
    fn kind_of(err: &Error) -> ToolErrorKind {
        ToolErrorKind::classify(err)
    }

    #[test]
    fn provider_network_classifies_as_transient() {
        let kind = kind_of(&Error::provider_network("connect refused"));
        assert_eq!(kind, ToolErrorKind::Transient);
        assert!(kind.is_retryable());
    }

    #[test]
    fn provider_dns_classifies_as_transient() {
        let kind = kind_of(&Error::provider_dns("no such host"));
        assert_eq!(kind, ToolErrorKind::Transient);
    }

    #[test]
    fn provider_5xx_classifies_as_transient() {
        let unavailable = Error::provider_http(503, "down");
        let bad_gateway = Error::provider_http(502, "bad gateway");
        assert_eq!(kind_of(&unavailable), ToolErrorKind::Transient);
        assert_eq!(kind_of(&bad_gateway), ToolErrorKind::Transient);
    }

    #[test]
    fn http_408_and_425_classify_as_transient() {
        // Request Timeout (408) and Too Early (425) are both
        // retryable per spec semantics.
        let request_timeout = Error::provider_http(408, "timeout");
        let too_early = Error::provider_http(425, "too early");
        assert_eq!(kind_of(&request_timeout), ToolErrorKind::Transient);
        assert_eq!(kind_of(&too_early), ToolErrorKind::Transient);
    }

    #[test]
    fn http_429_with_retry_after_classifies_as_rate_limit() {
        let cooldown = Duration::from_secs(5);
        let err = Error::provider_http(429, "slow down").with_retry_after(cooldown);
        let kind = kind_of(&err);
        assert_eq!(kind, ToolErrorKind::RateLimit);
        assert!(kind.is_retryable());
    }

    #[test]
    fn http_429_without_retry_after_classifies_as_quota() {
        // A vendor reporting quota exhaustion typically leaves out
        // `Retry-After`: the cooldown is a billing cycle rather
        // than a request window.
        let kind = kind_of(&Error::provider_http(429, "monthly cap reached"));
        assert_eq!(kind, ToolErrorKind::Quota);
        assert!(!kind.is_retryable());
    }

    #[test]
    fn http_401_403_classify_as_auth() {
        let unauthorized = kind_of(&Error::provider_http(401, "unauthorized"));
        let forbidden = kind_of(&Error::provider_http(403, "forbidden"));
        assert_eq!(unauthorized, ToolErrorKind::Auth);
        assert_eq!(forbidden, ToolErrorKind::Auth);
        assert!(!forbidden.is_retryable());
    }

    #[test]
    fn http_4xx_other_classifies_as_permanent() {
        let not_found = kind_of(&Error::provider_http(404, "not found"));
        let unprocessable = kind_of(&Error::provider_http(422, "unprocessable"));
        assert_eq!(not_found, ToolErrorKind::Permanent);
        assert_eq!(unprocessable, ToolErrorKind::Permanent);
        assert!(!unprocessable.is_retryable());
    }

    #[test]
    fn invalid_request_and_serde_classify_as_validation() {
        let rejected = Error::invalid_request("bad input");
        assert_eq!(kind_of(&rejected), ToolErrorKind::Validation);

        let parse_failure: serde_json::Error =
            serde_json::from_str::<i32>("not-a-number").unwrap_err();
        let converted = Error::from(parse_failure);
        assert_eq!(kind_of(&converted), ToolErrorKind::Validation);
    }

    #[test]
    fn config_classifies_as_internal() {
        let kind = kind_of(&Error::config("misconfigured"));
        assert_eq!(kind, ToolErrorKind::Internal);
    }

    #[test]
    fn tool_error_terminal_unwraps_to_inner_kind() {
        // `ToolErrorTerminal` records a routing decision; it is not
        // a category of its own. Classification reports the kind it
        // carries, so a parent layer (sub-agent propagation) sees
        // the original category and applies its own policy to it.
        let leaf = Error::provider_http(401, "unauthorized");
        let once = Error::tool_error_terminal(ToolErrorKind::Auth, "my_tool", leaf);
        assert_eq!(kind_of(&once), ToolErrorKind::Auth);

        // A parent escalating a sub-agent's already-terminal
        // failure wraps a second time; the reported kind is still
        // the same.
        let twice = Error::tool_error_terminal(ToolErrorKind::Auth, "parent_tool", once);
        assert_eq!(kind_of(&twice), ToolErrorKind::Auth);
    }

    #[test]
    fn bit_indices_are_unique_and_fit_in_set_width() {
        // Drift guard for the bitset layout.
        //
        // `dispatch_bit_index` matches exhaustively from inside the
        // crate, so adding a `ToolErrorKind` variant without
        // touching it is a compile error. Whoever fixes that error
        // must extend `every_variant` in the same change — the
        // array is maintained by hand, and the loop only covers
        // what is listed there.
        //
        // For each listed variant the loop asserts that its
        // `bit_index` fits inside the backing integer and that no
        // other variant already claimed the same bit.
        fn dispatch_bit_index(k: ToolErrorKind) -> u32 {
            match k {
                ToolErrorKind::Transient
                | ToolErrorKind::RateLimit
                | ToolErrorKind::Quota
                | ToolErrorKind::Auth
                | ToolErrorKind::Permanent
                | ToolErrorKind::Validation
                | ToolErrorKind::Internal => k.bit_index(),
            }
        }

        let every_variant = [
            ToolErrorKind::Transient,
            ToolErrorKind::RateLimit,
            ToolErrorKind::Quota,
            ToolErrorKind::Auth,
            ToolErrorKind::Permanent,
            ToolErrorKind::Validation,
            ToolErrorKind::Internal,
        ];
        let cap = ToolErrorKindSet::CAPACITY_BITS;
        let mut claimed = std::collections::HashSet::new();
        for k in every_variant {
            let bi = dispatch_bit_index(k);
            assert!(
                bi < cap,
                "{k:?}.bit_index() = {bi} exceeds ToolErrorKindSet capacity \
                 ({cap} bits) — widen the backing integer in lockstep",
            );
            let newly_claimed = claimed.insert(bi);
            assert!(newly_claimed, "duplicate bit_index {bi} for {k:?}");
        }
    }

    #[test]
    fn tool_error_kind_set_const_construction() {
        // Building the set in a `const` proves the builder methods
        // are usable at compile time.
        const SET: ToolErrorKindSet = ToolErrorKindSet::empty()
            .with(ToolErrorKind::Auth)
            .with(ToolErrorKind::Quota)
            .with(ToolErrorKind::Permanent);

        // Members.
        assert!(SET.contains(ToolErrorKind::Auth));
        assert!(SET.contains(ToolErrorKind::Quota));
        assert!(SET.contains(ToolErrorKind::Permanent));

        // Non-members.
        assert!(!SET.contains(ToolErrorKind::Transient));
        assert!(!SET.contains(ToolErrorKind::Internal));

        // Emptiness.
        assert!(!SET.is_empty());
        assert!(ToolErrorKindSet::empty().is_empty());
    }

    #[test]
    fn tool_error_kind_set_without_and_union() {
        let auth_only = ToolErrorKindSet::singleton(ToolErrorKind::Auth);
        let quota_only = ToolErrorKindSet::singleton(ToolErrorKind::Quota);

        let merged = auth_only.union(quota_only);
        assert!(merged.contains(ToolErrorKind::Auth));
        assert!(merged.contains(ToolErrorKind::Quota));

        let auth_dropped = merged.without(ToolErrorKind::Auth);
        assert!(!auth_dropped.contains(ToolErrorKind::Auth));
        assert!(auth_dropped.contains(ToolErrorKind::Quota));
    }

    #[test]
    fn usage_limit_exceeded_classifies_as_quota() {
        use crate::run_budget::UsageLimitBreach;
        let breach = UsageLimitBreach::Requests {
            limit: 10,
            observed: 11,
        };
        let kind = kind_of(&Error::UsageLimitExceeded(breach));
        assert_eq!(kind, ToolErrorKind::Quota);
    }
}