Skip to main content

entelix_core/
error.rs

1//! Top-level error type for `entelix-core` and the public API surface of the
2//! facade crate.
3//!
4//! Conventions (see CLAUDE.md §"Error conventions"):
5//! - Public crate APIs surface `entelix_core::Error`. Module-internal errors
6//!   are typed enums (e.g. `CodecError`) that bubble up via `From` chains.
7//! - Provider failures carry a typed `kind: ProviderErrorKind` (Network /
8//!   Tls / Dns / Http(status)) — retry classifiers branch on the typed
9//!   signal, not on parsed strings.
10//! - `Result<T> = std::result::Result<T, Error>`.
11
12use std::borrow::Cow;
13use std::time::Duration;
14
15use crate::auth::AuthError;
16
17/// Convenience alias used across `entelix-core` and re-exported by the facade.
18pub type Result<T> = core::result::Result<T, Error>;
19
20/// Aggregate error returned from public entelix-core APIs.
21#[derive(Debug, thiserror::Error)]
22#[non_exhaustive]
23pub enum Error {
24    /// Caller supplied an invalid request before any provider was contacted —
25    /// e.g. empty message list, missing required field, schema mismatch.
26    #[error("invalid request: {0}")]
27    InvalidRequest(Cow<'static, str>),
28
29    /// Configuration error detected at construction time (builders, factories,
30    /// crate-init code).
31    #[error("config error: {0}")]
32    Config(Cow<'static, str>),
33
34    /// Provider failure. `kind` distinguishes transport-class
35    /// failures (network / TLS / DNS) from HTTP-class failures
36    /// (4xx / 5xx) so retry classifiers can act on the typed signal
37    /// rather than parsing strings or reading a `status: 0`
38    /// sentinel. `retry_after` carries the vendor's `Retry-After`
39    /// hint when present — the retry layer honours it ahead of its
40    /// own backoff (invariant #17 — vendor authoritative signal
41    /// beats self-jitter).
42    #[error("provider {kind}: {message}")]
43    Provider {
44        /// Failure category — `Network`, `Tls`, `Dns`, or
45        /// `Http(status)`.
46        kind: ProviderErrorKind,
47        /// Provider-supplied message, normalized to a string.
48        message: String,
49        /// Vendor `Retry-After` hint when present. Capped at the
50        /// transport's parsing limit so a malicious vendor cannot
51        /// pin a retry loop forever.
52        #[allow(dead_code)]
53        retry_after: Option<Duration>,
54        /// Underlying error (transport / parser / signer) preserved
55        /// for operator diagnostics. Walk it via [`std::error::Error::source`]
56        /// or `{:?}`; the LLM-facing channel never sees it
57        /// (invariant 16 — `LlmRenderable::render_for_llm` strips
58        /// source chains).
59        #[source]
60        source: Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
61    },
62
63    /// The operation was cancelled via the `ExecutionContext` cancellation token.
64    #[error("operation cancelled")]
65    Cancelled,
66
67    /// The operation hit the deadline carried by `ExecutionContext`.
68    #[error("deadline exceeded")]
69    DeadlineExceeded,
70
71    /// A dispatch (tool body, graph node, or middleware layer) requested
72    /// human-in-the-loop intervention. The runtime catches this,
73    /// persists a checkpoint at the pre-dispatch state, and returns
74    /// `kind` + `payload` to the caller. Resume with
75    /// `entelix_graph::CompiledGraph::resume_with`.
76    ///
77    /// See [`crate::interruption::InterruptionKind`] for the typed
78    /// reason taxonomy and [`crate::interrupt`] /
79    /// [`crate::interrupt_with`] for the canonical raise sites.
80    #[error("dispatch interrupted for human review")]
81    Interrupted {
82        /// Typed reason — `Custom` for operator-defined pauses,
83        /// `ApprovalPending { tool_use_id }` for tool-approval
84        /// pauses raised by `ApprovalLayer`, or any future SDK
85        /// variant. Operator match sites should carry a fall-through
86        /// `_` arm.
87        kind: crate::interruption::InterruptionKind,
88        /// Operator free-form data describing what the resumer needs
89        /// to know. For typed kinds this is often `Value::Null`; for
90        /// `Custom` it carries whatever `interrupt(payload)` passed.
91        payload: serde_json::Value,
92    },
93
94    /// A validator (typed-output `OutputValidator`, tool body, hook)
95    /// requested the model retry the current turn with corrective
96    /// feedback. Distinct from [`Self::Provider`] (transport
97    /// retries — wire-level failure) and [`Self::InvalidRequest`]
98    /// (operator misuse) so retry classifiers, OTel dashboards, and
99    /// budget meters all branch on a typed signal.
100    ///
101    /// Catch-and-resume semantics: the surrounding agent or
102    /// `complete_typed<O>` loop catches this variant, appends a
103    /// `RetryPromptPart` to the conversation carrying `hint`, and
104    /// re-invokes the model — counting one increment against
105    /// `ChatModelConfig::validation_retries`. Operators that want to
106    /// raise this variant build it via [`Error::model_retry`] so
107    /// the `RenderedForLlm` funnel (invariant 16) cannot be
108    /// bypassed.
109    #[error("model retry requested (attempt {attempt})")]
110    ModelRetry {
111        /// Corrective text the loop will surface to the model on the
112        /// retried turn. The `RenderedForLlm` carrier ensures the
113        /// payload was filtered through the operator-controlled
114        /// rendering funnel rather than copied raw from a
115        /// vendor-side error string.
116        hint: crate::llm_facing::RenderedForLlm<String>,
117        /// Per-call attempt counter. The retry loop stamps this on
118        /// emit so the variant is self-describing without callers
119        /// tracking attempt state externally. The first retry sees
120        /// `attempt = 1`.
121        attempt: u32,
122    },
123
124    /// JSON serialization or deserialization failed at an entelix-managed
125    /// boundary (codec, tool I/O, persistence write/read).
126    #[error(transparent)]
127    Serde(#[from] serde_json::Error),
128
129    /// Credential resolution or use failed. Distinct from
130    /// [`Self::Provider`] so retry policies and dashboards can
131    /// distinguish "the model is down" from "our key is bad" without
132    /// pattern-matching on error messages.
133    #[error(transparent)]
134    Auth(AuthError),
135
136    /// A `RunBudget` axis was exceeded — request count, token
137    /// totals, or tool calls hit the configured limit. The
138    /// `axis` field identifies which axis fired; `limit` is the
139    /// configured cap; `observed` is the value that breached it.
140    /// Distinct from [`Self::Provider`] so retry classifiers can
141    /// short-circuit (a budget breach does not retry) and from
142    /// [`Self::InvalidRequest`] so dashboards see the budget
143    /// signal as a first-class category.
144    /// A [`crate::RunBudget`] axis was exceeded. The typed
145    /// [`crate::run_budget::UsageLimitBreach`] enum carries both
146    /// the breaching axis and its magnitude in one variant — axis
147    /// and magnitude are paired by construction so consumers
148    /// pattern-match a single value rather than checking the axis
149    /// to know which numeric type to read.
150    #[error("{0}")]
151    UsageLimitExceeded(crate::run_budget::UsageLimitBreach),
152}
153
154impl Error {
155    /// Build an `InvalidRequest` from a static or owned string.
156    pub fn invalid_request(msg: impl Into<Cow<'static, str>>) -> Self {
157        Self::InvalidRequest(msg.into())
158    }
159
160    /// Build a `Config` error from a static or owned string.
161    pub fn config(msg: impl Into<Cow<'static, str>>) -> Self {
162        Self::Config(msg.into())
163    }
164
165    /// Build a [`Self::ModelRetry`] from an LLM-rendered hint. The
166    /// `attempt` counter starts at zero and is incremented by the
167    /// surrounding retry loop on each emit; validators / tools
168    /// raising this variant from a fresh call site pass `0` and
169    /// trust the loop to stamp the running counter.
170    ///
171    /// Construction goes through [`crate::llm_facing::RenderedForLlm`] so the
172    /// hint is not a free-form `String` — the typed carrier ensures
173    /// the message has been routed through the operator's rendering
174    /// funnel (invariant 16). Consumers raising this variant from a
175    /// validator typically obtain the rendered hint via
176    /// `LlmRenderable::for_llm`.
177    pub const fn model_retry(
178        hint: crate::llm_facing::RenderedForLlm<String>,
179        attempt: u32,
180    ) -> Self {
181        Self::ModelRetry { hint, attempt }
182    }
183
184    /// Build an HTTP-class provider error. Use the `_network` /
185    /// `_tls` / `_dns` variants for transport-class failures so
186    /// retry classifiers see the typed signal rather than a
187    /// stringly-typed status code.
188    ///
189    /// Status `0` / 1xx / 2xx / 3xx / ≥600 do **not** represent a
190    /// terminal vendor response. The constructor coerces them to
191    /// [`ProviderErrorKind::Network`] so retry classifiers, wire
192    /// codes, and dashboards see "we never received a terminal
193    /// response" rather than a plausible-looking `upstream_error`
194    /// (invariant 15).
195    ///
196    /// Synthetic-message form: use when the message is composed
197    /// from vendor body fields (no source error). For
198    /// [`std::error::Error`]-bearing failures, prefer
199    /// [`Self::provider_http_from`] which preserves the source
200    /// chain.
201    pub fn provider_http(status: u16, message: impl Into<String>) -> Self {
202        Self::Provider {
203            kind: http_or_network(status),
204            message: message.into(),
205            retry_after: None,
206            source: None,
207        }
208    }
209
210    /// Build an HTTP-class provider error from any
211    /// [`std::error::Error`]. Message is `err.to_string()`; the
212    /// original error is preserved as `#[source]`. Status coercion
213    /// follows [`Self::provider_http`] — non-4xx/5xx statuses
214    /// surface as [`ProviderErrorKind::Network`].
215    pub fn provider_http_from<E>(status: u16, err: E) -> Self
216    where
217        E: std::error::Error + Send + Sync + 'static,
218    {
219        Self::Provider {
220            kind: http_or_network(status),
221            message: err.to_string(),
222            retry_after: None,
223            source: Some(Box::new(err)),
224        }
225    }
226
227    /// Build a network-class provider error (connect refused, read
228    /// reset, peer hangup before HTTP framing). Distinguishes
229    /// "vendor returned a 5xx" from "we never spoke to vendor".
230    ///
231    /// Synthetic-message form: use when no source error exists
232    /// (e.g. vendor wire-format prose lifted from a JSON body).
233    /// Source-bearing form: [`Self::provider_network_from`] derives
234    /// the message from the source's `Display` and stores the source
235    /// for `{:?}` walks (preferred for `map_err` chains).
236    pub fn provider_network(message: impl Into<String>) -> Self {
237        Self::Provider {
238            kind: ProviderErrorKind::Network,
239            message: message.into(),
240            retry_after: None,
241            source: None,
242        }
243    }
244
245    /// Build a network-class provider error from any
246    /// [`std::error::Error`]. Message is `err.to_string()`; the
247    /// original error is preserved as `#[source]` so operator
248    /// diagnostics walk the full chain. Pairs with `.map_err`:
249    ///
250    /// ```ignore
251    /// http_req.send().await.map_err(Error::provider_network_from)?;
252    /// ```
253    pub fn provider_network_from<E>(err: E) -> Self
254    where
255        E: std::error::Error + Send + Sync + 'static,
256    {
257        Self::Provider {
258            kind: ProviderErrorKind::Network,
259            message: err.to_string(),
260            retry_after: None,
261            source: Some(Box::new(err)),
262        }
263    }
264
265    /// Build a TLS-class provider error (handshake failure,
266    /// certificate validation, protocol mismatch).
267    pub fn provider_tls(message: impl Into<String>) -> Self {
268        Self::Provider {
269            kind: ProviderErrorKind::Tls,
270            message: message.into(),
271            retry_after: None,
272            source: None,
273        }
274    }
275
276    /// Build a TLS-class provider error from any
277    /// [`std::error::Error`]. Message is `err.to_string()`; the
278    /// original error is preserved as `#[source]`.
279    pub fn provider_tls_from<E>(err: E) -> Self
280    where
281        E: std::error::Error + Send + Sync + 'static,
282    {
283        Self::Provider {
284            kind: ProviderErrorKind::Tls,
285            message: err.to_string(),
286            retry_after: None,
287            source: Some(Box::new(err)),
288        }
289    }
290
291    /// Build a DNS-class provider error (name resolution failure,
292    /// SSRF allowlist rejection at the resolver).
293    pub fn provider_dns(message: impl Into<String>) -> Self {
294        Self::Provider {
295            kind: ProviderErrorKind::Dns,
296            message: message.into(),
297            retry_after: None,
298            source: None,
299        }
300    }
301
302    /// Build a DNS-class provider error from any
303    /// [`std::error::Error`]. Message is `err.to_string()`; the
304    /// original error is preserved as `#[source]`.
305    pub fn provider_dns_from<E>(err: E) -> Self
306    where
307        E: std::error::Error + Send + Sync + 'static,
308    {
309        Self::Provider {
310            kind: ProviderErrorKind::Dns,
311            message: err.to_string(),
312            retry_after: None,
313            source: Some(Box::new(err)),
314        }
315    }
316
317    /// Attach a `Retry-After` duration to a provider error. The
318    /// duration arrives from the vendor's `Retry-After` response
319    /// header (or equivalent body field). Returns `self` unchanged
320    /// for non-`Provider` variants — callers know the variant they
321    /// constructed, so this is `Self -> Self` rather than a typed
322    /// projection.
323    #[must_use]
324    pub fn with_retry_after(mut self, duration: Duration) -> Self {
325        if let Self::Provider {
326            ref mut retry_after,
327            ..
328        } = self
329        {
330            *retry_after = Some(duration);
331        }
332        self
333    }
334
335    /// Attach the underlying error as the `Provider` variant's source
336    /// chain, preserving root-cause context for operator diagnostics
337    /// (`{:?}` / [`std::error::Error::source`] walk). Returns `self`
338    /// unchanged for non-`Provider` variants.
339    ///
340    /// Channel-separation guarantee (invariant 16): the source chain
341    /// is operator-only. [`crate::LlmRenderable::render_for_llm`]
342    /// strips it for LLM-facing renderings; sinks / OTel / logs keep
343    /// the full diagnostic.
344    #[must_use]
345    pub fn with_source<E>(mut self, err: E) -> Self
346    where
347        E: std::error::Error + Send + Sync + 'static,
348    {
349        if let Self::Provider { ref mut source, .. } = self {
350            *source = Some(Box::new(err));
351        }
352        self
353    }
354
355    /// Typed wire shape for this error — the **single canonical
356    /// inspector** integrators read at sink / SSE / audit boundaries
357    /// instead of parsing `Display` output. `ErrorEnvelope` is `Copy`,
358    /// so call sites cache or pass-by-value without ceremony.
359    ///
360    /// Guarantees (patch-version stable, mirrored on `ErrorEnvelope`'s
361    /// own doc-comment):
362    /// - `wire_code` is a snake-case ASCII `&'static str` suitable as
363    ///   an i18n key, metric label, or typed-wire-envelope key. Adding
364    ///   a new [`Error`] variant adds a new code; existing codes are
365    ///   forever-stable.
366    /// - `wire_class` is the coarse responsibility split (`Client` for
367    ///   caller-actionable failures, `Server` for SDK/vendor-side
368    ///   failures). Orthogonal to retryability.
369    /// - `retry_after_secs` carries the vendor's `Retry-After` hint
370    ///   converted to whole seconds when the originating
371    ///   [`Self::Provider`] error captured one; `None` for every other
372    ///   variant or Provider error without a hint.
373    /// - `provider_status` carries the raw HTTP status when the error
374    ///   is `Provider` with [`ProviderErrorKind::Http`]; `None`
375    ///   otherwise. Lets sinks/audit retain `429 vs 503` granularity
376    ///   even though `wire_code` collapses them onto coarse buckets.
377    ///
378    /// HTTP provider failures bucket on the status family for
379    /// `wire_code` so vendor drift (a new 4xx) absorbs into the right
380    /// class without an SDK release; the raw status is still observable
381    /// through `provider_status` for operators that want the exact
382    /// signal.
383    pub fn envelope(&self) -> ErrorEnvelope {
384        let (wire_code, wire_class) = self.wire_signal();
385        let (retry_after_secs, provider_status) = match self {
386            Self::Provider {
387                kind, retry_after, ..
388            } => (
389                retry_after.map(|d| d.as_secs()),
390                match kind {
391                    ProviderErrorKind::Http(status) => Some(*status),
392                    _ => None,
393                },
394            ),
395            _ => (None, None),
396        };
397        ErrorEnvelope {
398            wire_code,
399            wire_class,
400            retry_after_secs,
401            provider_status,
402        }
403    }
404
405    /// Internal matcher producing the `(wire_code, wire_class)` pair.
406    /// Single match arm per [`Error`] variant keeps the two signals
407    /// from drifting apart on future variant additions — they're
408    /// chosen together, not from two parallel `match` expressions.
409    fn wire_signal(&self) -> (&'static str, ErrorClass) {
410        match self {
411            Self::InvalidRequest(_) => ("invalid_request", ErrorClass::Client),
412            Self::Config(_) => ("config_error", ErrorClass::Server),
413            Self::Provider { kind, .. } => match kind {
414                ProviderErrorKind::Network => ("transport_failure", ErrorClass::Server),
415                ProviderErrorKind::Tls => ("tls_failure", ErrorClass::Server),
416                ProviderErrorKind::Dns => ("dns_failure", ErrorClass::Server),
417                ProviderErrorKind::Http(status) => match *status {
418                    429 => ("rate_limited", ErrorClass::Client),
419                    401 | 403 => ("upstream_unauthorized", ErrorClass::Client),
420                    s if (400..500).contains(&s) => ("upstream_invalid", ErrorClass::Client),
421                    s if (500..600).contains(&s) => ("upstream_unavailable", ErrorClass::Server),
422                    _ => ("upstream_error", ErrorClass::Server),
423                },
424            },
425            Self::Auth(_) => ("auth_failed", ErrorClass::Client),
426            Self::Cancelled => ("cancelled", ErrorClass::Client),
427            Self::DeadlineExceeded => ("deadline_exceeded", ErrorClass::Server),
428            Self::Interrupted { .. } => ("interrupted", ErrorClass::Client),
429            Self::ModelRetry { .. } => ("model_retry_exhausted", ErrorClass::Client),
430            Self::Serde(_) => ("serde", ErrorClass::Server),
431            Self::UsageLimitExceeded(_) => ("quota_exceeded", ErrorClass::Client),
432        }
433    }
434}
435
/// Typed wire shape of an [`Error`] — the canonical inspector at
/// sink / SSE / audit boundaries. Built by [`Error::envelope`]; never
/// constructed externally so the field set evolves under the same
/// patch-version-stability guarantee as `wire_code` itself.
///
/// `Copy` is intentional: every field is 16 bytes or smaller. Carry
/// the envelope by value through sink fan-out, OTel attribute
/// stamping, and SSE serialisation without `.clone()` ceremony.
///
/// ## Field semantics
///
/// - `wire_code` — patch-version-stable snake-case `&'static str`
///   bucketing the error onto an i18n / metric / typed-wire key. HTTP
///   provider failures bucket on the status family so vendor drift
///   does not require an SDK release.
/// - `wire_class` — coarse responsibility split. `Client` for
///   caller-actionable failures, `Server` for SDK/vendor-side
///   failures. Orthogonal to retry semantics.
/// - `retry_after_secs` — vendor `Retry-After` hint converted to
///   whole seconds when the originating [`Error::Provider`] captured
///   one. `None` for every other variant or for Provider errors that
///   arrived without a hint. Sinks key SSE rate-limit timers /
///   FE retry indicators off this field.
/// - `provider_status` — raw HTTP status when the error is
///   `Provider` with [`ProviderErrorKind::Http`]; `None` otherwise.
///   Lets audit / replay retain `429 vs 503` granularity even though
///   `wire_code` collapses them onto coarse buckets.
///
/// ## Serialisation
///
/// The two optional fields are skipped entirely when `None`
/// (`skip_serializing_if`), so the serialised form always contains
/// `wire_code` and `wire_class` and only includes `retry_after_secs` /
/// `provider_status` when a value is present.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, serde::Serialize)]
#[non_exhaustive]
pub struct ErrorEnvelope {
    /// Patch-version-stable wire code. See type-level doc.
    pub wire_code: &'static str,
    /// Responsibility class. See type-level doc.
    pub wire_class: ErrorClass,
    /// Vendor `Retry-After` hint in seconds; omitted from serialised
    /// output when `None`. See type-level doc.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub retry_after_secs: Option<u64>,
    /// Raw HTTP status for Provider/Http failures; omitted from
    /// serialised output when `None`. See type-level doc.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub provider_status: Option<u16>,
}
477
/// Coarse responsibility class for an [`Error`]. Two values by design —
/// "transient" / "permanent" is a retry-policy axis, orthogonal to
/// responsibility, and surfaced via [`Error::Provider`]'s
/// `retry_after` field plus the `RetryClassifier` policy surface.
///
/// Maps onto the standard HTTP family split: `Client` ≈ 4xx-equivalent
/// (caller / integrator can act to fix), `Server` ≈ 5xx-equivalent
/// (vendor or deployment must act).
///
/// JSON serialisation produces `"client"` / `"server"` to match the
/// [`std::fmt::Display`] form — wire dashboards keying off the lower-
/// case bucket stay consistent across the OTel / SSE / audit surfaces.
///
/// The enum is `#[non_exhaustive]`, so `match` sites outside this
/// crate still need a fall-through `_` arm.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ErrorClass {
    /// The caller — request shape, credentials, quota, cancellation
    /// choice — is the actor that can resolve the failure.
    /// Displays and serialises as `client`.
    Client,
    /// The SDK, vendor, or deployment environment is the actor that
    /// can resolve the failure. Displays and serialises as `server`.
    Server,
}
501
502impl std::fmt::Display for ErrorClass {
503    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
504        match self {
505            Self::Client => f.write_str("client"),
506            Self::Server => f.write_str("server"),
507        }
508    }
509}
510
511/// Coerce a raw `u16` HTTP status into a typed
512/// [`ProviderErrorKind`]. 4xx / 5xx surface as
513/// [`ProviderErrorKind::Http`]; every other value collapses to
514/// [`ProviderErrorKind::Network`] because the SDK never received a
515/// terminal vendor response (invariant 15 — no silent fallback to
516/// a plausible-looking `upstream_error`).
517const fn http_or_network(status: u16) -> ProviderErrorKind {
518    if status >= 400 && status < 600 {
519        ProviderErrorKind::Http(status)
520    } else {
521        ProviderErrorKind::Network
522    }
523}
524
/// Provider failure category — distinguishes transport-class
/// failures (the SDK never received a complete HTTP framing) from
/// HTTP-class failures (the vendor responded with a status). Retry
/// classifiers use this to make typed decisions rather than
/// pattern-matching on `status: 0` sentinels (invariant #17).
///
/// The enum is `#[non_exhaustive]`, so `match` sites outside this
/// crate need a fall-through `_` arm.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
#[non_exhaustive]
pub enum ProviderErrorKind {
    /// Connect refused, read reset, peer hangup before HTTP framing
    /// completed.
    Network,
    /// TLS handshake failure, certificate validation failure,
    /// protocol mismatch.
    Tls,
    /// DNS resolution failure or SSRF allowlist rejection at the
    /// resolver.
    Dns,
    /// Vendor responded with an HTTP status. Carries the actual
    /// numeric code so classifiers can branch on `408|425|429|5xx`.
    ///
    /// The `Error::provider_http*` constructors only produce this
    /// variant for 4xx / 5xx statuses (other values are coerced to
    /// `Network`); the variant itself accepts any `u16` when built
    /// directly.
    Http(u16),
}
546
547impl std::fmt::Display for ProviderErrorKind {
548    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
549        match self {
550            Self::Network => f.write_str("network"),
551            Self::Tls => f.write_str("tls"),
552            Self::Dns => f.write_str("dns"),
553            Self::Http(status) => write!(f, "returned {status}"),
554        }
555    }
556}