entelix_core/error.rs
1//! Top-level error type for `entelix-core` and the public API surface of the
2//! facade crate.
3//!
4//! Conventions (see CLAUDE.md §"Error conventions"):
5//! - Public crate APIs surface `entelix_core::Error`. Module-internal errors
6//! are typed enums (e.g. `CodecError`) that bubble up via `From` chains.
7//! - Provider failures carry a typed `kind: ProviderErrorKind` (Network /
8//! Tls / Dns / Http(status)) — retry classifiers branch on the typed
9//! signal, not on parsed strings.
10//! - `Result<T> = std::result::Result<T, Error>`.
11
12use std::borrow::Cow;
13use std::time::Duration;
14
15use crate::auth::AuthError;
16
17/// Convenience alias used across `entelix-core` and re-exported by the facade.
18pub type Result<T> = core::result::Result<T, Error>;
19
20/// Aggregate error returned from public entelix-core APIs.
21#[derive(Debug, thiserror::Error)]
22#[non_exhaustive]
23pub enum Error {
24 /// Caller supplied an invalid request before any provider was contacted —
25 /// e.g. empty message list, missing required field, schema mismatch.
26 #[error("invalid request: {0}")]
27 InvalidRequest(Cow<'static, str>),
28
29 /// Configuration error detected at construction time (builders, factories,
30 /// crate-init code).
31 #[error("config error: {0}")]
32 Config(Cow<'static, str>),
33
34 /// Provider failure. `kind` distinguishes transport-class
35 /// failures (network / TLS / DNS) from HTTP-class failures
36 /// (4xx / 5xx) so retry classifiers can act on the typed signal
37 /// rather than parsing strings or reading a `status: 0`
38 /// sentinel. `retry_after` carries the vendor's `Retry-After`
39 /// hint when present — the retry layer honours it ahead of its
40 /// own backoff (invariant #17 — vendor authoritative signal
41 /// beats self-jitter).
42 #[error("provider {kind}: {message}")]
43 Provider {
44 /// Failure category — `Network`, `Tls`, `Dns`, or
45 /// `Http(status)`.
46 kind: ProviderErrorKind,
47 /// Provider-supplied message, normalized to a string.
48 message: String,
49 /// Vendor `Retry-After` hint when present. Capped at the
50 /// transport's parsing limit so a malicious vendor cannot
51 /// pin a retry loop forever.
52 #[allow(dead_code)]
53 retry_after: Option<Duration>,
54 /// Underlying error (transport / parser / signer) preserved
55 /// for operator diagnostics. Walk it via [`std::error::Error::source`]
56 /// or `{:?}`; the LLM-facing channel never sees it
57 /// (invariant 16 — `LlmRenderable::render_for_llm` strips
58 /// source chains).
59 #[source]
60 source: Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
61 },
62
63 /// The operation was cancelled via the `ExecutionContext` cancellation token.
64 #[error("operation cancelled")]
65 Cancelled,
66
67 /// The operation hit the deadline carried by `ExecutionContext`.
68 #[error("deadline exceeded")]
69 DeadlineExceeded,
70
71 /// A dispatch (tool body, graph node, or middleware layer) requested
72 /// human-in-the-loop intervention. The runtime catches this,
73 /// persists a checkpoint at the pre-dispatch state, and returns
74 /// `kind` + `payload` to the caller. Resume with
75 /// `entelix_graph::CompiledGraph::resume_with`.
76 ///
77 /// See [`crate::interruption::InterruptionKind`] for the typed
78 /// reason taxonomy and [`crate::interrupt`] /
79 /// [`crate::interrupt_with`] for the canonical raise sites.
80 #[error("dispatch interrupted for human review")]
81 Interrupted {
82 /// Typed reason — `Custom` for operator-defined pauses,
83 /// `ApprovalPending { tool_use_id }` for tool-approval
84 /// pauses raised by `ApprovalLayer`, or any future SDK
85 /// variant. Operator match sites should carry a fall-through
86 /// `_` arm.
87 kind: crate::interruption::InterruptionKind,
88 /// Operator free-form data describing what the resumer needs
89 /// to know. For typed kinds this is often `Value::Null`; for
90 /// `Custom` it carries whatever `interrupt(payload)` passed.
91 payload: serde_json::Value,
92 },
93
94 /// A validator (typed-output `OutputValidator`, tool body, hook)
95 /// requested the model retry the current turn with corrective
96 /// feedback. Distinct from [`Self::Provider`] (transport
97 /// retries — wire-level failure) and [`Self::InvalidRequest`]
98 /// (operator misuse) so retry classifiers, OTel dashboards, and
99 /// budget meters all branch on a typed signal.
100 ///
101 /// Catch-and-resume semantics: the surrounding agent or
102 /// `complete_typed<O>` loop catches this variant, appends a
103 /// `RetryPromptPart` to the conversation carrying `hint`, and
104 /// re-invokes the model — counting one increment against
105 /// `ChatModelConfig::validation_retries`. Operators that want to
106 /// raise this variant build it via [`Error::model_retry`] so
107 /// the `RenderedForLlm` funnel (invariant 16) cannot be
108 /// bypassed.
109 #[error("model retry requested (attempt {attempt})")]
110 ModelRetry {
111 /// Corrective text the loop will surface to the model on the
112 /// retried turn. The `RenderedForLlm` carrier ensures the
113 /// payload was filtered through the operator-controlled
114 /// rendering funnel rather than copied raw from a
115 /// vendor-side error string.
116 hint: crate::llm_facing::RenderedForLlm<String>,
117 /// Per-call attempt counter. The retry loop stamps this on
118 /// emit so the variant is self-describing without callers
119 /// tracking attempt state externally. The first retry sees
120 /// `attempt = 1`.
121 attempt: u32,
122 },
123
124 /// JSON serialization or deserialization failed at an entelix-managed
125 /// boundary (codec, tool I/O, persistence write/read).
126 #[error(transparent)]
127 Serde(#[from] serde_json::Error),
128
129 /// Credential resolution or use failed. Distinct from
130 /// [`Self::Provider`] so retry policies and dashboards can
131 /// distinguish "the model is down" from "our key is bad" without
132 /// pattern-matching on error messages.
133 #[error(transparent)]
134 Auth(AuthError),
135
136 /// A `RunBudget` axis was exceeded — request count, token
137 /// totals, or tool calls hit the configured limit. The
138 /// `axis` field identifies which axis fired; `limit` is the
139 /// configured cap; `observed` is the value that breached it.
140 /// Distinct from [`Self::Provider`] so retry classifiers can
141 /// short-circuit (a budget breach does not retry) and from
142 /// [`Self::InvalidRequest`] so dashboards see the budget
143 /// signal as a first-class category.
144 /// A [`crate::RunBudget`] axis was exceeded. The typed
145 /// [`crate::run_budget::UsageLimitBreach`] enum carries both
146 /// the breaching axis and its magnitude in one variant — axis
147 /// and magnitude are paired by construction so consumers
148 /// pattern-match a single value rather than checking the axis
149 /// to know which numeric type to read.
150 #[error("{0}")]
151 UsageLimitExceeded(crate::run_budget::UsageLimitBreach),
152}
153
154impl Error {
155 /// Build an `InvalidRequest` from a static or owned string.
156 pub fn invalid_request(msg: impl Into<Cow<'static, str>>) -> Self {
157 Self::InvalidRequest(msg.into())
158 }
159
160 /// Build a `Config` error from a static or owned string.
161 pub fn config(msg: impl Into<Cow<'static, str>>) -> Self {
162 Self::Config(msg.into())
163 }
164
165 /// Build a [`Self::ModelRetry`] from an LLM-rendered hint. The
166 /// `attempt` counter starts at zero and is incremented by the
167 /// surrounding retry loop on each emit; validators / tools
168 /// raising this variant from a fresh call site pass `0` and
169 /// trust the loop to stamp the running counter.
170 ///
171 /// Construction goes through [`crate::llm_facing::RenderedForLlm`] so the
172 /// hint is not a free-form `String` — the typed carrier ensures
173 /// the message has been routed through the operator's rendering
174 /// funnel (invariant 16). Consumers raising this variant from a
175 /// validator typically obtain the rendered hint via
176 /// `LlmRenderable::for_llm`.
177 pub const fn model_retry(
178 hint: crate::llm_facing::RenderedForLlm<String>,
179 attempt: u32,
180 ) -> Self {
181 Self::ModelRetry { hint, attempt }
182 }
183
184 /// Build an HTTP-class provider error. Use the `_network` /
185 /// `_tls` / `_dns` variants for transport-class failures so
186 /// retry classifiers see the typed signal rather than a
187 /// stringly-typed status code.
188 ///
189 /// Status `0` / 1xx / 2xx / 3xx / ≥600 do **not** represent a
190 /// terminal vendor response. The constructor coerces them to
191 /// [`ProviderErrorKind::Network`] so retry classifiers, wire
192 /// codes, and dashboards see "we never received a terminal
193 /// response" rather than a plausible-looking `upstream_error`
194 /// (invariant 15).
195 ///
196 /// Synthetic-message form: use when the message is composed
197 /// from vendor body fields (no source error). For
198 /// [`std::error::Error`]-bearing failures, prefer
199 /// [`Self::provider_http_from`] which preserves the source
200 /// chain.
201 pub fn provider_http(status: u16, message: impl Into<String>) -> Self {
202 Self::Provider {
203 kind: http_or_network(status),
204 message: message.into(),
205 retry_after: None,
206 source: None,
207 }
208 }
209
210 /// Build an HTTP-class provider error from any
211 /// [`std::error::Error`]. Message is `err.to_string()`; the
212 /// original error is preserved as `#[source]`. Status coercion
213 /// follows [`Self::provider_http`] — non-4xx/5xx statuses
214 /// surface as [`ProviderErrorKind::Network`].
215 pub fn provider_http_from<E>(status: u16, err: E) -> Self
216 where
217 E: std::error::Error + Send + Sync + 'static,
218 {
219 Self::Provider {
220 kind: http_or_network(status),
221 message: err.to_string(),
222 retry_after: None,
223 source: Some(Box::new(err)),
224 }
225 }
226
227 /// Build a network-class provider error (connect refused, read
228 /// reset, peer hangup before HTTP framing). Distinguishes
229 /// "vendor returned a 5xx" from "we never spoke to vendor".
230 ///
231 /// Synthetic-message form: use when no source error exists
232 /// (e.g. vendor wire-format prose lifted from a JSON body).
233 /// Source-bearing form: [`Self::provider_network_from`] derives
234 /// the message from the source's `Display` and stores the source
235 /// for `{:?}` walks (preferred for `map_err` chains).
236 pub fn provider_network(message: impl Into<String>) -> Self {
237 Self::Provider {
238 kind: ProviderErrorKind::Network,
239 message: message.into(),
240 retry_after: None,
241 source: None,
242 }
243 }
244
245 /// Build a network-class provider error from any
246 /// [`std::error::Error`]. Message is `err.to_string()`; the
247 /// original error is preserved as `#[source]` so operator
248 /// diagnostics walk the full chain. Pairs with `.map_err`:
249 ///
250 /// ```ignore
251 /// http_req.send().await.map_err(Error::provider_network_from)?;
252 /// ```
253 pub fn provider_network_from<E>(err: E) -> Self
254 where
255 E: std::error::Error + Send + Sync + 'static,
256 {
257 Self::Provider {
258 kind: ProviderErrorKind::Network,
259 message: err.to_string(),
260 retry_after: None,
261 source: Some(Box::new(err)),
262 }
263 }
264
265 /// Build a TLS-class provider error (handshake failure,
266 /// certificate validation, protocol mismatch).
267 pub fn provider_tls(message: impl Into<String>) -> Self {
268 Self::Provider {
269 kind: ProviderErrorKind::Tls,
270 message: message.into(),
271 retry_after: None,
272 source: None,
273 }
274 }
275
276 /// Build a TLS-class provider error from any
277 /// [`std::error::Error`]. Message is `err.to_string()`; the
278 /// original error is preserved as `#[source]`.
279 pub fn provider_tls_from<E>(err: E) -> Self
280 where
281 E: std::error::Error + Send + Sync + 'static,
282 {
283 Self::Provider {
284 kind: ProviderErrorKind::Tls,
285 message: err.to_string(),
286 retry_after: None,
287 source: Some(Box::new(err)),
288 }
289 }
290
291 /// Build a DNS-class provider error (name resolution failure,
292 /// SSRF allowlist rejection at the resolver).
293 pub fn provider_dns(message: impl Into<String>) -> Self {
294 Self::Provider {
295 kind: ProviderErrorKind::Dns,
296 message: message.into(),
297 retry_after: None,
298 source: None,
299 }
300 }
301
302 /// Build a DNS-class provider error from any
303 /// [`std::error::Error`]. Message is `err.to_string()`; the
304 /// original error is preserved as `#[source]`.
305 pub fn provider_dns_from<E>(err: E) -> Self
306 where
307 E: std::error::Error + Send + Sync + 'static,
308 {
309 Self::Provider {
310 kind: ProviderErrorKind::Dns,
311 message: err.to_string(),
312 retry_after: None,
313 source: Some(Box::new(err)),
314 }
315 }
316
317 /// Attach a `Retry-After` duration to a provider error. The
318 /// duration arrives from the vendor's `Retry-After` response
319 /// header (or equivalent body field). Returns `self` unchanged
320 /// for non-`Provider` variants — callers know the variant they
321 /// constructed, so this is `Self -> Self` rather than a typed
322 /// projection.
323 #[must_use]
324 pub fn with_retry_after(mut self, duration: Duration) -> Self {
325 if let Self::Provider {
326 ref mut retry_after,
327 ..
328 } = self
329 {
330 *retry_after = Some(duration);
331 }
332 self
333 }
334
335 /// Attach the underlying error as the `Provider` variant's source
336 /// chain, preserving root-cause context for operator diagnostics
337 /// (`{:?}` / [`std::error::Error::source`] walk). Returns `self`
338 /// unchanged for non-`Provider` variants.
339 ///
340 /// Channel-separation guarantee (invariant 16): the source chain
341 /// is operator-only. [`crate::LlmRenderable::render_for_llm`]
342 /// strips it for LLM-facing renderings; sinks / OTel / logs keep
343 /// the full diagnostic.
344 #[must_use]
345 pub fn with_source<E>(mut self, err: E) -> Self
346 where
347 E: std::error::Error + Send + Sync + 'static,
348 {
349 if let Self::Provider { ref mut source, .. } = self {
350 *source = Some(Box::new(err));
351 }
352 self
353 }
354
355 /// Typed wire shape for this error — the **single canonical
356 /// inspector** integrators read at sink / SSE / audit boundaries
357 /// instead of parsing `Display` output. `ErrorEnvelope` is `Copy`,
358 /// so call sites cache or pass-by-value without ceremony.
359 ///
360 /// Guarantees (patch-version stable, mirrored on `ErrorEnvelope`'s
361 /// own doc-comment):
362 /// - `wire_code` is a snake-case ASCII `&'static str` suitable as
363 /// an i18n key, metric label, or typed-wire-envelope key. Adding
364 /// a new [`Error`] variant adds a new code; existing codes are
365 /// forever-stable.
366 /// - `wire_class` is the coarse responsibility split (`Client` for
367 /// caller-actionable failures, `Server` for SDK/vendor-side
368 /// failures). Orthogonal to retryability.
369 /// - `retry_after_secs` carries the vendor's `Retry-After` hint
370 /// converted to whole seconds when the originating
371 /// [`Self::Provider`] error captured one; `None` for every other
372 /// variant or Provider error without a hint.
373 /// - `provider_status` carries the raw HTTP status when the error
374 /// is `Provider` with [`ProviderErrorKind::Http`]; `None`
375 /// otherwise. Lets sinks/audit retain `429 vs 503` granularity
376 /// even though `wire_code` collapses them onto coarse buckets.
377 ///
378 /// HTTP provider failures bucket on the status family for
379 /// `wire_code` so vendor drift (a new 4xx) absorbs into the right
380 /// class without an SDK release; the raw status is still observable
381 /// through `provider_status` for operators that want the exact
382 /// signal.
383 pub fn envelope(&self) -> ErrorEnvelope {
384 let (wire_code, wire_class) = self.wire_signal();
385 let (retry_after_secs, provider_status) = match self {
386 Self::Provider {
387 kind, retry_after, ..
388 } => (
389 retry_after.map(|d| d.as_secs()),
390 match kind {
391 ProviderErrorKind::Http(status) => Some(*status),
392 _ => None,
393 },
394 ),
395 _ => (None, None),
396 };
397 ErrorEnvelope {
398 wire_code,
399 wire_class,
400 retry_after_secs,
401 provider_status,
402 }
403 }
404
405 /// Internal matcher producing the `(wire_code, wire_class)` pair.
406 /// Single match arm per [`Error`] variant keeps the two signals
407 /// from drifting apart on future variant additions — they're
408 /// chosen together, not from two parallel `match` expressions.
409 fn wire_signal(&self) -> (&'static str, ErrorClass) {
410 match self {
411 Self::InvalidRequest(_) => ("invalid_request", ErrorClass::Client),
412 Self::Config(_) => ("config_error", ErrorClass::Server),
413 Self::Provider { kind, .. } => match kind {
414 ProviderErrorKind::Network => ("transport_failure", ErrorClass::Server),
415 ProviderErrorKind::Tls => ("tls_failure", ErrorClass::Server),
416 ProviderErrorKind::Dns => ("dns_failure", ErrorClass::Server),
417 ProviderErrorKind::Http(status) => match *status {
418 429 => ("rate_limited", ErrorClass::Client),
419 401 | 403 => ("upstream_unauthorized", ErrorClass::Client),
420 s if (400..500).contains(&s) => ("upstream_invalid", ErrorClass::Client),
421 s if (500..600).contains(&s) => ("upstream_unavailable", ErrorClass::Server),
422 _ => ("upstream_error", ErrorClass::Server),
423 },
424 },
425 Self::Auth(_) => ("auth_failed", ErrorClass::Client),
426 Self::Cancelled => ("cancelled", ErrorClass::Client),
427 Self::DeadlineExceeded => ("deadline_exceeded", ErrorClass::Server),
428 Self::Interrupted { .. } => ("interrupted", ErrorClass::Client),
429 Self::ModelRetry { .. } => ("model_retry_exhausted", ErrorClass::Client),
430 Self::Serde(_) => ("serde", ErrorClass::Server),
431 Self::UsageLimitExceeded(_) => ("quota_exceeded", ErrorClass::Client),
432 }
433 }
434}
435
436/// Typed wire shape of an [`Error`] — the canonical inspector at
437/// sink / SSE / audit boundaries. Built by [`Error::envelope`]; never
438/// constructed externally so the field set evolves under the same
439/// patch-version-stability guarantee as `wire_code` itself.
440///
441/// `Copy` is intentional: every field is 16 bytes or smaller. Carry
442/// the envelope by value through sink fan-out, OTel attribute
443/// stamping, and SSE serialisation without `.clone()` ceremony.
444///
445/// ## Field semantics
446///
447/// - `wire_code` — patch-version-stable snake-case `&'static str`
448/// bucketing the error onto an i18n / metric / typed-wire key. HTTP
449/// provider failures bucket on the status family so vendor drift
450/// does not require an SDK release.
451/// - `wire_class` — coarse responsibility split. `Client` for
452/// caller-actionable failures, `Server` for SDK/vendor-side
453/// failures. Orthogonal to retry semantics.
454/// - `retry_after_secs` — vendor `Retry-After` hint converted to
455/// whole seconds when the originating [`Error::Provider`] captured
456/// one. `None` for every other variant or for Provider errors that
457/// arrived without a hint. Sinks key SSE rate-limit timers /
458/// FE retry indicators off this field.
459/// - `provider_status` — raw HTTP status when the error is
460/// `Provider` with [`ProviderErrorKind::Http`]; `None` otherwise.
461/// Lets audit / replay retain `429 vs 503` granularity even though
462/// `wire_code` collapses them onto coarse buckets.
463#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, serde::Serialize)]
464#[non_exhaustive]
465pub struct ErrorEnvelope {
466 /// Patch-version-stable wire code. See type-level doc.
467 pub wire_code: &'static str,
468 /// Responsibility class. See type-level doc.
469 pub wire_class: ErrorClass,
470 /// Vendor `Retry-After` hint in seconds. See type-level doc.
471 #[serde(skip_serializing_if = "Option::is_none")]
472 pub retry_after_secs: Option<u64>,
473 /// Raw HTTP status for Provider/Http failures. See type-level doc.
474 #[serde(skip_serializing_if = "Option::is_none")]
475 pub provider_status: Option<u16>,
476}
477
478/// Coarse responsibility class for an [`Error`]. Two values by design —
479/// "transient" / "permanent" is a retry-policy axis, orthogonal to
480/// responsibility, and surfaced via [`Error::Provider`]'s
481/// `retry_after` field plus the `RetryClassifier` policy surface.
482///
483/// Maps onto the standard HTTP family split: `Client` ≈ 4xx-equivalent
484/// (caller / integrator can act to fix), `Server` ≈ 5xx-equivalent
485/// (vendor or deployment must act).
486///
487/// JSON serialisation produces `"client"` / `"server"` to match the
488/// [`std::fmt::Display`] form — wire dashboards keying off the lower-
489/// case bucket stay consistent across the OTel / SSE / audit surfaces.
490#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, serde::Serialize)]
491#[serde(rename_all = "lowercase")]
492#[non_exhaustive]
493pub enum ErrorClass {
494 /// The caller — request shape, credentials, quota, cancellation
495 /// choice — is the actor that can resolve the failure.
496 Client,
497 /// The SDK, vendor, or deployment environment is the actor that
498 /// can resolve the failure.
499 Server,
500}
501
502impl std::fmt::Display for ErrorClass {
503 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
504 match self {
505 Self::Client => f.write_str("client"),
506 Self::Server => f.write_str("server"),
507 }
508 }
509}
510
511/// Coerce a raw `u16` HTTP status into a typed
512/// [`ProviderErrorKind`]. 4xx / 5xx surface as
513/// [`ProviderErrorKind::Http`]; every other value collapses to
514/// [`ProviderErrorKind::Network`] because the SDK never received a
515/// terminal vendor response (invariant 15 — no silent fallback to
516/// a plausible-looking `upstream_error`).
517const fn http_or_network(status: u16) -> ProviderErrorKind {
518 if status >= 400 && status < 600 {
519 ProviderErrorKind::Http(status)
520 } else {
521 ProviderErrorKind::Network
522 }
523}
524
/// Category of a provider failure. Separates transport-class failures
/// (no complete HTTP framing was received) from HTTP-class failures (the
/// vendor answered with a status), so retry classifiers can branch on a
/// typed value instead of a `status: 0` sentinel.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum ProviderErrorKind {
    /// Transport broke before HTTP framing completed — connect refused,
    /// read reset, peer hangup.
    Network,
    /// TLS problem — handshake failure, certificate validation failure,
    /// protocol mismatch.
    Tls,
    /// Name resolution failed, or the resolver rejected the host under
    /// the SSRF allowlist.
    Dns,
    /// The vendor answered with an HTTP status. The numeric code is kept
    /// so classifiers can branch on e.g. `408|425|429|5xx`.
    Http(u16),
}
546
547impl std::fmt::Display for ProviderErrorKind {
548 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
549 match self {
550 Self::Network => f.write_str("network"),
551 Self::Tls => f.write_str("tls"),
552 Self::Dns => f.write_str("dns"),
553 Self::Http(status) => write!(f, "returned {status}"),
554 }
555 }
556}