Skip to main content

akribes_types/
error.rs

1//! Structured error envelope shared by core and the SDK.
2//!
3//! This is the wire-level slice of the `akribes-core::error` module: the
4//! [`ErrorKind`] / [`ErrorCode`] enums plus their pure-data impls
5//! (`as_wire`, `from_wire`, `kind`, `default_user_message`,
6//! `suggested_action`, `is_transient`, `is_server_error`,
7//! `is_user_actionable`, `base_backoff_ms`), the [`ErrorSource`] /
8//! [`ErrorDetail`] envelopes, the [`SuggestedAction`] tag, and the
9//! [`ErrorCode::parse_retry_after_ms`] retry-after hint parser.
10//!
11//! Functions that bring in heavier deps (regex-backed `sanitize_error` and
12//! `ErrorKind::classify`, the tokio-backed `CancelTracker` / `CancelReason`,
13//! the regex-backed `ErrorCode::classify_provider_error`) stay in
14//! `akribes_core::error` so the types crate keeps its dependency surface
15//! to `serde`, `serde_json`, `thiserror`, `httpdate`, and `tracing`.
16
17use serde::{Deserialize, Serialize};
18
/// Coarse error category. Use [`ErrorCode`] for the finer-grained, stable
/// identifier that consumers should branch on; `ErrorKind` is the rollup
/// every code belongs to (so a UI can show one bucket, an SDK can decide
/// "is this retryable" without enumerating every code).
///
/// The variant names are also the strings returned by
/// [`ErrorKind::as_wire`], and the derived `Serialize` / `Deserialize`
/// impls carry no rename attribute, so renaming a variant changes the
/// wire format.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ErrorKind {
    /// Provider-side rate limiting — the rollup for
    /// [`ErrorCode::ProviderRateLimit`]. Transient; base backoff 2000 ms.
    RateLimit,
    /// Authentication or configuration failure — the rollup for
    /// [`ErrorCode::ProviderAuth`] and [`ErrorCode::ConfigMissing`].
    /// Suggested action is [`SuggestedAction::FixConfig`].
    AuthError,
    /// Token / context-window limit exceeded — the rollup for
    /// [`ErrorCode::ProviderTokenLimit`]. Suggested action is
    /// [`SuggestedAction::FixInput`].
    TokenLimit,
    /// Upstream HTTP 500 — generic provider-side failure. Maybe-transient;
    /// retry with a short exponential backoff (issue #1296 split). Replaces
    /// the legacy umbrella `ServerError` for the 500 case specifically so
    /// retry policies and metrics can distinguish "internal server error"
    /// from "bad gateway" / "service unavailable" / "gateway timeout".
    ServerError500,
    /// Upstream HTTP 502 — bad gateway, the provider's edge fronted a
    /// failing origin. Retry with a short backoff (issue #1296 split).
    BadGateway502,
    /// Upstream HTTP 503 — service unavailable, rate-limit-adjacent.
    /// Honour `Retry-After` aggressively when the provider sent one
    /// (issue #1296 split). Default backoff matches `RateLimit` since the
    /// remediation pattern (wait for capacity) is the same.
    ServiceUnavailable503,
    /// Upstream HTTP 504 — gateway timeout. The provider's gateway didn't
    /// get an answer from the origin in time. Use a longer base backoff
    /// since the slow side is unlikely to recover faster than the request
    /// shape itself (issue #1296 split).
    GatewayTimeout504,
    /// Network-level failure reaching the provider — the rollup for
    /// [`ErrorCode::ProviderNetwork`]. Transient; base backoff 1000 ms.
    NetworkError,
    /// A provider response did not parse as expected — the rollup for
    /// [`ErrorCode::ProviderParse`]. Not transient; suggested action is
    /// [`SuggestedAction::FixScript`].
    ParseError,
    /// Explicit user/client cancellation — the rollup for
    /// [`ErrorCode::UserCancelled`]. No remediation is suggested
    /// ([`SuggestedAction::None`]).
    Cancelled,
    /// Server-side execution-budget timeout (`AKRIBES_EXECUTION_TIMEOUT`),
    /// or a checkpoint that elapsed its declared `on_timeout` window.
    /// Distinct from `Cancelled` (explicit user/client cancel) so consumers
    /// can tell "the workflow was stopped on purpose" from "the workflow
    /// ran past its budget" — the latter is a service-level error, not a
    /// user action. Distinct from `NetworkError`'s "timeout" classification,
    /// which covers per-provider network timeouts inside a still-running
    /// execution.
    Timeout,
    /// Workflow-author-facing failure. Besides [`ErrorCode::ScriptError`]
    /// itself, this is the rollup for the tool, loop, context/compaction
    /// and `std.*` codes as well as the catch-all [`ErrorCode::Other`]
    /// (see [`ErrorCode::kind`]). Suggested action is
    /// [`SuggestedAction::FixScript`].
    ScriptError,
    /// Workflow-author-initiated failure — the LLM returned a non-success
    /// variant (Unable / a custom failure arm) and the author mapped it to
    /// `fail` (explicit `on <V> fail` or implicit no-trailer default).
    /// Distinguished from `ScriptError` so the workflow runner can retry
    /// the failing task up to `workflow_retries` times before surfacing
    /// the failure to the caller (issue #312). Retry exhaustion converts
    /// this to a `ScriptError` to preserve existing handler behavior.
    AuthorRaise,
    /// Cross-script `call(...)` chain exceeded the engine's `SUBSCRIPT_MAX_DEPTH`
    /// (issue #429, `AKRIBES-E-SCRIPT-DEPTH`).
    ScriptDepthExceeded,
    /// A spawned tokio task in the engine panicked (typically `unwrap()`
    /// on `None`, divide-by-zero in stdlib, or an `expect()` blowing).
    /// Distinct from `ScriptError` because the workflow author didn't
    /// cause it — it indicates an engine bug that should be filed.
    /// Surfaces as `AKRIBES-E-INTERNAL-PANIC`.
    Panic,
    /// An invariant inside the engine/server was violated — a `oneshot`
    /// sender was dropped without sending, a deadlock was detected, an
    /// MCP protocol violation, etc. Always indicates a bug in Akribes
    /// itself, not in user code or in a third-party provider.
    Internal,
}
83
/// What the client/user/runner should do in response. Derived from
/// [`ErrorKind`] (see [`ErrorKind::suggested_action`]) so consumers don't
/// have to maintain their own switch statement.
///
/// Serialized in kebab-case (`"retry"`, `"fix-config"`, `"fix-script"`,
/// `"fix-input"`, `"handle-author-failure"`, `"none"`, `"report"`) via the
/// `rename_all` attribute below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum SuggestedAction {
    /// Retry the operation as-is (no input change required). Pair with
    /// [`ErrorDetail::retry_after_ms`] when known. Returned for exactly the
    /// kinds for which [`ErrorKind::is_transient`] is true.
    Retry,
    /// The error is the operator's responsibility — fix configuration
    /// (API keys, model setup, env vars). Returned for
    /// [`ErrorKind::AuthError`].
    FixConfig,
    /// The error is the workflow author's responsibility — the script,
    /// prompts, or types need editing. Returned for
    /// [`ErrorKind::ScriptError`], [`ErrorKind::ScriptDepthExceeded`] and
    /// [`ErrorKind::ParseError`].
    FixScript,
    /// The input was too large or wrong-shape for the current run. The
    /// caller should reduce or correct it before retrying. Returned for
    /// [`ErrorKind::TokenLimit`] and also for [`ErrorKind::Timeout`].
    FixInput,
    /// The workflow's `on <variant> fail` (or default failure handling)
    /// fired — the caller should treat the failed result as authored
    /// flow rather than bug. Returned for [`ErrorKind::AuthorRaise`].
    HandleAuthorFailure,
    /// User cancelled — no remediation needed. Returned for
    /// [`ErrorKind::Cancelled`]. Always write this as
    /// `SuggestedAction::None`; the bare name `None` resolves to
    /// `Option::None` unless this variant is explicitly imported.
    None,
    /// Looks like an Akribes bug. The caller should report (with the error
    /// code + execution id) rather than retry blindly. Returned for
    /// [`ErrorKind::Panic`] and [`ErrorKind::Internal`].
    Report,
}
112
113impl ErrorKind {
114    /// Whether the underlying condition is expected to clear on its own —
115    /// i.e. the same request retried later may succeed without any input
116    /// change. Pairs with [`SuggestedAction::Retry`].
117    pub fn is_transient(&self) -> bool {
118        matches!(
119            self,
120            ErrorKind::RateLimit
121                | ErrorKind::ServerError500
122                | ErrorKind::BadGateway502
123                | ErrorKind::ServiceUnavailable503
124                | ErrorKind::GatewayTimeout504
125                | ErrorKind::NetworkError
126        )
127    }
128
129    /// True for any of the four upstream 5xx variants (#1296). Use this in
130    /// places that need the umbrella "the provider returned a 5xx" check
131    /// without enumerating every status. Pair with [`is_transient`] when
132    /// the rate-limit / network-error siblings should also count.
133    pub fn is_server_error(&self) -> bool {
134        matches!(
135            self,
136            ErrorKind::ServerError500
137                | ErrorKind::BadGateway502
138                | ErrorKind::ServiceUnavailable503
139                | ErrorKind::GatewayTimeout504
140        )
141    }
142
143    /// Base backoff for the per-error retry loop in milliseconds. Drives
144    /// the per-variant retry semantics introduced by issue #1296:
145    ///
146    /// | Kind                         | Base | Rationale                                            |
147    /// |------------------------------|------|------------------------------------------------------|
148    /// | `RateLimit`                  | 2000 | Honour `Retry-After`; otherwise a 2s start.          |
149    /// | `ServerError500`             | 1000 | Maybe-transient origin failure — short doubling.     |
150    /// | `BadGateway502`              | 1000 | Edge fronted a failing origin — short doubling.      |
151    /// | `ServiceUnavailable503`      | 2000 | Capacity-adjacent — start at the rate-limit cadence. |
152    /// | `GatewayTimeout504`          | 4000 | Slow upstream — longer base before retrying.         |
153    /// | `NetworkError`               | 1000 | Connection-level recoverable.                        |
154    ///
155    /// All other variants return `None` (non-transient).
156    pub fn base_backoff_ms(&self) -> Option<u64> {
157        Some(match self {
158            ErrorKind::RateLimit => 2_000,
159            ErrorKind::ServerError500 => 1_000,
160            ErrorKind::BadGateway502 => 1_000,
161            ErrorKind::ServiceUnavailable503 => 2_000,
162            ErrorKind::GatewayTimeout504 => 4_000,
163            ErrorKind::NetworkError => 1_000,
164            _ => return None,
165        })
166    }
167
168    /// Whether the user (operator or workflow author) can fix this by
169    /// changing something — config, script, or input. Used to gate
170    /// "show actionable diagnostic UI" vs "just report it."
171    pub fn is_user_actionable(&self) -> bool {
172        matches!(
173            self,
174            ErrorKind::AuthError
175                | ErrorKind::TokenLimit
176                | ErrorKind::Timeout
177                | ErrorKind::ScriptError
178                | ErrorKind::ScriptDepthExceeded
179                | ErrorKind::AuthorRaise
180        )
181    }
182
183    /// Stable, machine-parseable identifier for the kind. Use this for
184    /// wire payloads, log fields, and the `error_kind` DB column.
185    /// Distinct from [`std::fmt::Display`] (which returns a human-readable
186    /// phrase like `"rate limit"`) and from `Debug` (which is intentional
187    /// here but not load-bearing).
188    pub fn as_wire(&self) -> &'static str {
189        match self {
190            ErrorKind::RateLimit => "RateLimit",
191            ErrorKind::AuthError => "AuthError",
192            ErrorKind::TokenLimit => "TokenLimit",
193            ErrorKind::ServerError500 => "ServerError500",
194            ErrorKind::BadGateway502 => "BadGateway502",
195            ErrorKind::ServiceUnavailable503 => "ServiceUnavailable503",
196            ErrorKind::GatewayTimeout504 => "GatewayTimeout504",
197            ErrorKind::NetworkError => "NetworkError",
198            ErrorKind::ParseError => "ParseError",
199            ErrorKind::Cancelled => "Cancelled",
200            ErrorKind::Timeout => "Timeout",
201            ErrorKind::ScriptError => "ScriptError",
202            ErrorKind::AuthorRaise => "AuthorRaise",
203            ErrorKind::ScriptDepthExceeded => "ScriptDepthExceeded",
204            ErrorKind::Panic => "Panic",
205            ErrorKind::Internal => "Internal",
206        }
207    }
208
209    /// What the caller should do — see [`SuggestedAction`].
210    pub fn suggested_action(&self) -> SuggestedAction {
211        match self {
212            ErrorKind::RateLimit
213            | ErrorKind::ServerError500
214            | ErrorKind::BadGateway502
215            | ErrorKind::ServiceUnavailable503
216            | ErrorKind::GatewayTimeout504
217            | ErrorKind::NetworkError => {
218                SuggestedAction::Retry
219            }
220            ErrorKind::AuthError => SuggestedAction::FixConfig,
221            ErrorKind::TokenLimit => SuggestedAction::FixInput,
222            ErrorKind::Timeout => SuggestedAction::FixInput,
223            ErrorKind::ScriptError | ErrorKind::ScriptDepthExceeded | ErrorKind::ParseError => {
224                SuggestedAction::FixScript
225            }
226            ErrorKind::AuthorRaise => SuggestedAction::HandleAuthorFailure,
227            ErrorKind::Cancelled => SuggestedAction::None,
228            ErrorKind::Panic | ErrorKind::Internal => SuggestedAction::Report,
229        }
230    }
231}
232
/// Stable, fine-grained error identifier. Each code maps to exactly one
/// [`ErrorKind`] and carries a default user-facing message. Wire form:
/// `AKRIBES-E-<UPPER-KEBAB>` (e.g. `AKRIBES-E-PROVIDER-RATE-LIMIT`).
///
/// Codes are intentionally durable: once published, the wire string and
/// `kind()` mapping should not change. Add new variants for new
/// conditions rather than repurposing old ones; SDKs match on these
/// strings to drive retry/UI/triage logic.
///
/// NOTE(review): the derive below carries no serde rename attribute, so
/// the derived `Serialize` / `Deserialize` form is the Rust variant name
/// (e.g. `"ProviderRateLimit"`); the `AKRIBES-E-…` strings come from
/// [`ErrorCode::as_wire`] / [`ErrorCode::from_wire`]. Confirm that
/// producers which promise the `AKRIBES-E-…` form go through `as_wire`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ErrorCode {
    /// Explicit user/client cancellation via `POST /executions/:id/cancel`.
    UserCancelled,
    /// Server-side execution-budget timeout (`AKRIBES_EXECUTION_TIMEOUT`).
    ExecutionTimeout,
    /// `on_timeout` window on a checkpoint elapsed without a resume.
    CheckpointTimeout,
    /// Provider returned a 429 / rate-limit / quota-exhausted response.
    ProviderRateLimit,
    /// Provider returned 401/403, or an API key was missing / not configured.
    ProviderAuth,
    /// Provider's reported context window or `max_tokens` was exceeded.
    ProviderTokenLimit,
    /// Legacy umbrella for any 5xx — kept for wire backward-compat after
    /// the issue #1296 split. New code should construct one of
    /// [`ErrorCode::ProviderServer500`], [`ErrorCode::ProviderBadGateway502`],
    /// [`ErrorCode::ProviderServiceUnavailable503`], or
    /// [`ErrorCode::ProviderGatewayTimeout504`].
    /// Decoding the old `AKRIBES-E-PROVIDER-SERVER` wire string yields this
    /// variant so SDKs that match on it stay green. Rolls up to
    /// [`ErrorKind::ServerError500`].
    ProviderServer,
    /// Provider returned HTTP 500. Maybe-transient; short retry-with-backoff.
    ProviderServer500,
    /// Provider returned HTTP 502 (bad gateway). Retry with short backoff.
    ProviderBadGateway502,
    /// Provider returned HTTP 503 (service unavailable). Rate-limit-adjacent
    /// — honour `Retry-After` aggressively.
    ProviderServiceUnavailable503,
    /// Provider returned HTTP 504 (gateway timeout). Longer base backoff
    /// since the slow side is unlikely to recover faster than the request.
    ProviderGatewayTimeout504,
    /// Network-level failure reaching the provider (DNS, TLS, reset,
    /// per-provider request timeout).
    ProviderNetwork,
    /// Provider response did not parse as the expected schema.
    ProviderParse,
    /// Generic provider/runtime failure that didn't fit a more specific bucket.
    /// Rolls up to [`ErrorKind::ServerError500`], so it is treated as
    /// transient by the kind-level helpers.
    ProviderOther,
    /// A spawned engine task panicked (host-side bug).
    InternalPanic,
    /// A `oneshot::Receiver` returned `Err` because its sender was dropped
    /// before sending — covers breakpoint resume, checkpoint resume,
    /// tool-approval resume. Indicates a server-side cleanup race or a
    /// host bug, never a user action.
    InternalDroppedChannel,
    /// Engine reached a state with pending nodes but none ready to run —
    /// dependency cycle or compiler bug.
    InternalDeadlock,
    /// JoinError that wasn't a panic and wasn't a recognized cancel —
    /// tokio runtime aborted the task externally. Treated as a host
    /// invariant violation.
    InternalTaskAborted,
    /// Generic "this should not happen" host failure that doesn't fit
    /// the more specific Internal* codes.
    InternalOther,
    /// Generic workflow-author error not categorised more specifically.
    ScriptError,
    /// Cross-script `call(...)` chain exceeded the depth cap.
    ScriptDepthExceeded,
    /// Validation retries exhausted with `allow_partial: true` (issue #202)
    /// — partial-retry sentinel routed to `on unable` / handler.
    PartialRetryExhausted,
    /// Workflow author's `fail` arm fired — the LLM returned an Unable
    /// or other non-success variant the author mapped to failure.
    AuthorRaise,
    /// Per-agent tool budget (`tool_budget`) exceeded.
    ToolBudgetExceeded,
    /// Tool approval resume returned a payload that wasn't the expected
    /// `{ approve: bool, args?: Value }` shape — host protocol violation.
    ToolApprovalProtocol,
    /// Tool call attempted but no MCP registry was attached (script-only
    /// engine, missing host wiring).
    ToolNoRegistry,
    /// MCP tool call returned a tool-side failure (the registry exists
    /// and dispatched, but the tool itself errored).
    ToolError,
    /// Agent dispatched a second `tool_use` after the engine had already
    /// folded one round-trip's tool results back into the conversation.
    /// Agents are single-round-trip by design — multi-turn agentic
    /// behaviour belongs on a `loop` block. Surfaces when an LLM ignores
    /// the synthesized "produce your final answer now" follow-up turn
    /// and tries to invoke tools again. The fix is either to use a
    /// `loop` block or to tighten the agent's system prompt.
    AgentToolsDoubleDispatch,
    /// Required configuration missing (API key, env var, etc.).
    /// Rolls up to [`ErrorKind::AuthError`].
    ConfigMissing,
    /// Loop block exceeded its `max_total_output_tokens` budget. The
    /// loop driver accumulates each turn's `output_tokens` from the
    /// provider and stamps this code on the resulting `LoopEnd { value:
    /// FatalError }` once the running total exceeds the per-loop or
    /// project-default cap.
    LoopOutputBudgetExceeded,
    /// A second checkpoint fired in the same `loop` turn. The supported
    /// envelope is at most one checkpoint per turn — the driver tracks a
    /// per-turn counter and fails fast when the increment goes past 1.
    /// Surfacing this as a distinct code (rather than `Other`) lets SDKs
    /// and the Studio render a targeted explanation: split the
    /// checkpoints across turns, or move one onto a non-loop sibling.
    LoopMultiCheckpoint,
    /// Mode 1 (`compaction: none` / omitted) only — the assembled
    /// request would exceed the model's context window. Pre-call
    /// diagnostic emitted by `Engine::run_compaction_chain`; replaces
    /// the cryptic provider 400 from upstream. Carries the conversation
    /// length, the model cap, and the agent in the message body.
    ContextOverflow,
    /// `compaction: native()` (or `at <T>: native()`) used with a model
    /// whose `ModelEntry::native_compaction_capable` is `false`. The
    /// related-info span points at the model declaration.
    ContextNativeUnsupported,
    /// All custom-chain steps ran and the conversation still exceeds
    /// the configured cap. Surfaces with the chain of attempted
    /// strategies in the message body. Emitted instead of a provider
    /// 400 — fail-fast at the akribes seam.
    ContextCompactionExhausted,
    /// `compaction: at <invalid>` — value <= 0, percent > 100, or
    /// duplicate threshold in a custom chain. Compile-time only.
    CompactionThresholdInvalid,
    /// User-defined compactor task referenced from a compaction step
    /// doesn't match one of the four supported signatures
    /// (`str|list[message] -> str|list[message]`). Compile-time only.
    CompactorSignature,
    /// `compact_to_state(field=...)` used outside a loop's `compaction:`
    /// block. The primitive is loop-only because it writes into the
    /// loop's state record. Compile-time only.
    CompactionLoopOnly,
    /// `std.format` placeholder `{name}` not present in args.
    /// Stable string form: `AKRIBES-E-STD-FORMAT-MISS-001` (#1224).
    StdFormatMissing,
    /// `std.format` malformed template (unclosed `{`, empty `{}`,
    /// stray `}`). Stable string form: `AKRIBES-E-STD-FORMAT-SYNTAX-001`.
    StdFormatSyntax,
    /// `std.json_parse` could not parse the input string as JSON.
    /// Stable string form: `AKRIBES-E-STD-JSON-PARSE-001`.
    StdJsonParse,
    /// `std.json_stringify` could not serialise the input value (e.g.
    /// `FatalError` payload; non-serializable). Stable string form:
    /// `AKRIBES-E-STD-JSON-STRINGIFY-001`.
    StdJsonStringify,
    /// `std.regex_extract` was given an invalid regex pattern.
    /// Stable string form: `AKRIBES-E-STD-REGEX-001`.
    StdRegexInvalid,
    /// Catch-all for sites that haven't been migrated to a richer code.
    /// Prefer adding a specific variant — this is for transition only.
    /// Rolls up to [`ErrorKind::ScriptError`].
    Other,
}
386
387impl ErrorCode {
388    /// Extract a `retry_after_ms` hint from a provider error message
389    /// when the wire response carried one (provider implementations
390    /// usually echo it as `retry-after: <secs>` or similar). None when
391    /// no such hint is present.
392    ///
393    /// Honours both [RFC 9110 §10.2.3] forms:
394    ///
395    /// 1. **delta-seconds** — `Retry-After: 30` (returned as `30_000`).
396    /// 2. **HTTP-date** — `Retry-After: Wed, 21 Oct 2026 07:28:00 GMT`
397    ///    parsed via `httpdate::parse_http_date` and returned as the
398    ///    delta from `SystemTime::now()`, clamped to `>= 0`.
399    ///
400    /// [RFC 9110 §10.2.3]: https://www.rfc-editor.org/rfc/rfc9110#section-10.2.3
401    pub fn parse_retry_after_ms(msg: &str) -> Option<u64> {
402        // `retry-after: 30` (seconds) — common HTTP convention.
403        // Match decimals; we only ever emit milliseconds.
404        let needle = "retry-after";
405        // ASCII-case-insensitive search directly in `msg` (issue #1058
406        // — using `to_lowercase().find()` shifts indices on
407        // length-changing chars like `İ`).
408        let bytes = msg.as_bytes();
409        let n_len = needle.len();
410        let start = if bytes.len() < n_len {
411            return None;
412        } else {
413            (0..=bytes.len() - n_len)
414                .find(|&i| bytes[i..i + n_len].eq_ignore_ascii_case(needle.as_bytes()))?
415        };
416        let after = &msg[start + needle.len()..];
417        // Walk past separators (`:`, `=`, whitespace).
418        let after = after.trim_start_matches(|c: char| c == ':' || c == '=' || c.is_whitespace());
419        let end = after
420            .find(|c: char| !c.is_ascii_digit() && c != '.')
421            .unwrap_or(after.len());
422        let head = &after[..end];
423        if !head.is_empty() {
424            if let Ok(secs) = head.parse::<u64>() {
425                return Some(secs.saturating_mul(1000));
426            }
427            if let Ok(secs_f) = head.parse::<f64>() {
428                if secs_f.is_finite() && secs_f >= 0.0 {
429                    return Some((secs_f * 1000.0) as u64);
430                }
431            }
432        }
433        // HTTP-date branch (#1058).
434        let date_slice = after
435            .split(|c: char| c == '\n' || c == '\r')
436            .next()
437            .unwrap_or(after)
438            .trim()
439            .trim_end_matches(|c: char| matches!(c, ',' | ';' | '.'));
440        if date_slice.is_empty() {
441            return None;
442        }
443        if let Ok(then) = httpdate::parse_http_date(date_slice) {
444            let now = std::time::SystemTime::now();
445            match then.duration_since(now) {
446                Ok(d) => return Some(d.as_millis().min(u64::MAX as u128) as u64),
447                Err(_) => return Some(0),
448            }
449        }
450        None
451    }
452}
453
454impl ErrorCode {
455    /// The [`ErrorKind`] bucket this code belongs to. Computed once,
456    /// statically — used by consumers that want the rollup behaviour
457    /// (`is_transient`, `suggested_action`) without hand-mapping codes.
458    pub fn kind(&self) -> ErrorKind {
459        match self {
460            ErrorCode::UserCancelled => ErrorKind::Cancelled,
461            ErrorCode::ExecutionTimeout | ErrorCode::CheckpointTimeout => ErrorKind::Timeout,
462            ErrorCode::ProviderRateLimit => ErrorKind::RateLimit,
463            ErrorCode::ProviderAuth | ErrorCode::ConfigMissing => ErrorKind::AuthError,
464            ErrorCode::ProviderTokenLimit => ErrorKind::TokenLimit,
465            ErrorCode::ProviderServer => ErrorKind::ServerError500,
466            ErrorCode::ProviderServer500 => ErrorKind::ServerError500,
467            ErrorCode::ProviderBadGateway502 => ErrorKind::BadGateway502,
468            ErrorCode::ProviderServiceUnavailable503 => ErrorKind::ServiceUnavailable503,
469            ErrorCode::ProviderGatewayTimeout504 => ErrorKind::GatewayTimeout504,
470            ErrorCode::ProviderNetwork => ErrorKind::NetworkError,
471            ErrorCode::ProviderParse => ErrorKind::ParseError,
472            ErrorCode::ProviderOther => ErrorKind::ServerError500,
473            ErrorCode::InternalPanic => ErrorKind::Panic,
474            ErrorCode::InternalDroppedChannel
475            | ErrorCode::InternalDeadlock
476            | ErrorCode::InternalTaskAborted
477            | ErrorCode::InternalOther => ErrorKind::Internal,
478            ErrorCode::ScriptError
479            | ErrorCode::ToolBudgetExceeded
480            | ErrorCode::ToolApprovalProtocol
481            | ErrorCode::ToolNoRegistry
482            | ErrorCode::ToolError
483            | ErrorCode::AgentToolsDoubleDispatch
484            | ErrorCode::LoopOutputBudgetExceeded
485            | ErrorCode::LoopMultiCheckpoint
486            | ErrorCode::ContextOverflow
487            | ErrorCode::ContextNativeUnsupported
488            | ErrorCode::ContextCompactionExhausted
489            | ErrorCode::CompactionThresholdInvalid
490            | ErrorCode::CompactorSignature
491            | ErrorCode::CompactionLoopOnly
492            | ErrorCode::PartialRetryExhausted
493            | ErrorCode::StdFormatMissing
494            | ErrorCode::StdFormatSyntax
495            | ErrorCode::StdJsonParse
496            | ErrorCode::StdJsonStringify
497            | ErrorCode::StdRegexInvalid
498            | ErrorCode::Other => ErrorKind::ScriptError,
499            ErrorCode::ScriptDepthExceeded => ErrorKind::ScriptDepthExceeded,
500            ErrorCode::AuthorRaise => ErrorKind::AuthorRaise,
501        }
502    }
503
504    /// Stable wire identifier (`AKRIBES-E-<UPPER-KEBAB>`). This is the
505    /// string consumers should match on for retry/UI logic.
506    pub fn as_wire(&self) -> &'static str {
507        match self {
508            ErrorCode::UserCancelled => "AKRIBES-E-USER-CANCELLED",
509            ErrorCode::ExecutionTimeout => "AKRIBES-E-EXECUTION-TIMEOUT",
510            ErrorCode::CheckpointTimeout => "AKRIBES-E-CHECKPOINT-TIMEOUT",
511            ErrorCode::ProviderRateLimit => "AKRIBES-E-PROVIDER-RATE-LIMIT",
512            ErrorCode::ProviderAuth => "AKRIBES-E-PROVIDER-AUTH",
513            ErrorCode::ProviderTokenLimit => "AKRIBES-E-PROVIDER-TOKEN-LIMIT",
514            ErrorCode::ProviderServer => "AKRIBES-E-PROVIDER-SERVER",
515            ErrorCode::ProviderServer500 => "AKRIBES-E-PROVIDER-SERVER-500",
516            ErrorCode::ProviderBadGateway502 => "AKRIBES-E-PROVIDER-BAD-GATEWAY-502",
517            ErrorCode::ProviderServiceUnavailable503 => "AKRIBES-E-PROVIDER-SERVICE-UNAVAILABLE-503",
518            ErrorCode::ProviderGatewayTimeout504 => "AKRIBES-E-PROVIDER-GATEWAY-TIMEOUT-504",
519            ErrorCode::ProviderNetwork => "AKRIBES-E-PROVIDER-NETWORK",
520            ErrorCode::ProviderParse => "AKRIBES-E-PROVIDER-PARSE",
521            ErrorCode::ProviderOther => "AKRIBES-E-PROVIDER-OTHER",
522            ErrorCode::InternalPanic => "AKRIBES-E-INTERNAL-PANIC",
523            ErrorCode::InternalDroppedChannel => "AKRIBES-E-INTERNAL-DROPPED-CHANNEL",
524            ErrorCode::InternalDeadlock => "AKRIBES-E-INTERNAL-DEADLOCK",
525            ErrorCode::InternalTaskAborted => "AKRIBES-E-INTERNAL-TASK-ABORTED",
526            ErrorCode::InternalOther => "AKRIBES-E-INTERNAL-OTHER",
527            ErrorCode::ScriptError => "AKRIBES-E-SCRIPT-ERROR",
528            ErrorCode::ScriptDepthExceeded => "AKRIBES-E-SCRIPT-DEPTH",
529            ErrorCode::PartialRetryExhausted => "AKRIBES-E-RETRY-PARTIAL-EXHAUSTED",
530            ErrorCode::AuthorRaise => "AKRIBES-E-AUTHOR-RAISE",
531            ErrorCode::ToolBudgetExceeded => "AKRIBES-E-TOOL-BUDGET",
532            ErrorCode::ToolApprovalProtocol => "AKRIBES-E-TOOL-APPROVAL-PROTOCOL",
533            ErrorCode::ToolNoRegistry => "AKRIBES-E-TOOL-NO-REGISTRY",
534            ErrorCode::ToolError => "AKRIBES-E-TOOL-ERROR",
535            ErrorCode::AgentToolsDoubleDispatch => "AKRIBES-E-AGENT-TOOLS-DOUBLE-DISPATCH",
536            ErrorCode::ConfigMissing => "AKRIBES-E-CONFIG-MISSING",
537            ErrorCode::LoopOutputBudgetExceeded => "AKRIBES-E-LOOP-OUTPUT-BUDGET-EXCEEDED",
538            ErrorCode::LoopMultiCheckpoint => "AKRIBES-E-LOOP-MULTI-CHECKPOINT",
539            ErrorCode::ContextOverflow => "AKRIBES-E-CONTEXT-OVERFLOW",
540            ErrorCode::ContextNativeUnsupported => "AKRIBES-E-CONTEXT-NATIVE-UNSUPPORTED",
541            ErrorCode::ContextCompactionExhausted => "AKRIBES-E-CONTEXT-COMPACTION-EXHAUSTED",
542            ErrorCode::CompactionThresholdInvalid => "AKRIBES-E-COMPACTION-THRESHOLD-INVALID",
543            ErrorCode::CompactorSignature => "AKRIBES-E-COMPACTOR-SIGNATURE",
544            ErrorCode::CompactionLoopOnly => "AKRIBES-E-COMPACTION-LOOP-ONLY",
545            ErrorCode::StdFormatMissing => "AKRIBES-E-STD-FORMAT-MISS-001",
546            ErrorCode::StdFormatSyntax => "AKRIBES-E-STD-FORMAT-SYNTAX-001",
547            ErrorCode::StdJsonParse => "AKRIBES-E-STD-JSON-PARSE-001",
548            ErrorCode::StdJsonStringify => "AKRIBES-E-STD-JSON-STRINGIFY-001",
549            ErrorCode::StdRegexInvalid => "AKRIBES-E-STD-REGEX-001",
550            ErrorCode::Other => "AKRIBES-E-OTHER",
551        }
552    }
553
554    /// Parse the canonical wire form (`AKRIBES-E-<UPPER-KEBAB>`) back to a
555    /// code. Returns `None` for any string we don't recognise so the
556    /// caller can decide whether to fall back to [`ErrorCode::Other`]
557    /// or surface the unknown code as-is. Used by the legacy
558    /// `Value::fatal_with_code` shim and SDK normalisers.
559    pub fn from_wire(s: &str) -> Option<Self> {
560        let code = match s {
561            "AKRIBES-E-USER-CANCELLED" => ErrorCode::UserCancelled,
562            "AKRIBES-E-EXECUTION-TIMEOUT" => ErrorCode::ExecutionTimeout,
563            "AKRIBES-E-CHECKPOINT-TIMEOUT" => ErrorCode::CheckpointTimeout,
564            "AKRIBES-E-PROVIDER-RATE-LIMIT" => ErrorCode::ProviderRateLimit,
565            "AKRIBES-E-PROVIDER-AUTH" => ErrorCode::ProviderAuth,
566            "AKRIBES-E-PROVIDER-TOKEN-LIMIT" => ErrorCode::ProviderTokenLimit,
567            "AKRIBES-E-PROVIDER-SERVER" => ErrorCode::ProviderServer,
568            "AKRIBES-E-PROVIDER-SERVER-500" => ErrorCode::ProviderServer500,
569            "AKRIBES-E-PROVIDER-BAD-GATEWAY-502" => ErrorCode::ProviderBadGateway502,
570            "AKRIBES-E-PROVIDER-SERVICE-UNAVAILABLE-503" => ErrorCode::ProviderServiceUnavailable503,
571            "AKRIBES-E-PROVIDER-GATEWAY-TIMEOUT-504" => ErrorCode::ProviderGatewayTimeout504,
572            "AKRIBES-E-PROVIDER-NETWORK" => ErrorCode::ProviderNetwork,
573            "AKRIBES-E-PROVIDER-PARSE" => ErrorCode::ProviderParse,
574            "AKRIBES-E-PROVIDER-OTHER" => ErrorCode::ProviderOther,
575            "AKRIBES-E-INTERNAL-PANIC" => ErrorCode::InternalPanic,
576            "AKRIBES-E-INTERNAL-DROPPED-CHANNEL" => ErrorCode::InternalDroppedChannel,
577            "AKRIBES-E-INTERNAL-DEADLOCK" => ErrorCode::InternalDeadlock,
578            "AKRIBES-E-INTERNAL-TASK-ABORTED" => ErrorCode::InternalTaskAborted,
579            "AKRIBES-E-INTERNAL-OTHER" => ErrorCode::InternalOther,
580            "AKRIBES-E-SCRIPT-ERROR" => ErrorCode::ScriptError,
581            "AKRIBES-E-SCRIPT-DEPTH" => ErrorCode::ScriptDepthExceeded,
582            "AKRIBES-E-RETRY-PARTIAL-EXHAUSTED" => ErrorCode::PartialRetryExhausted,
583            "AKRIBES-E-AUTHOR-RAISE" => ErrorCode::AuthorRaise,
584            "AKRIBES-E-TOOL-BUDGET" => ErrorCode::ToolBudgetExceeded,
585            "AKRIBES-E-TOOL-APPROVAL-PROTOCOL" => ErrorCode::ToolApprovalProtocol,
586            "AKRIBES-E-TOOL-NO-REGISTRY" => ErrorCode::ToolNoRegistry,
587            "AKRIBES-E-TOOL-ERROR" => ErrorCode::ToolError,
588            "AKRIBES-E-AGENT-TOOLS-DOUBLE-DISPATCH" => ErrorCode::AgentToolsDoubleDispatch,
589            "AKRIBES-E-CONFIG-MISSING" => ErrorCode::ConfigMissing,
590            "AKRIBES-E-LOOP-OUTPUT-BUDGET-EXCEEDED" => ErrorCode::LoopOutputBudgetExceeded,
591            "AKRIBES-E-LOOP-MULTI-CHECKPOINT" => ErrorCode::LoopMultiCheckpoint,
592            "AKRIBES-E-CONTEXT-OVERFLOW" => ErrorCode::ContextOverflow,
593            "AKRIBES-E-CONTEXT-NATIVE-UNSUPPORTED" => ErrorCode::ContextNativeUnsupported,
594            "AKRIBES-E-CONTEXT-COMPACTION-EXHAUSTED" => ErrorCode::ContextCompactionExhausted,
595            "AKRIBES-E-COMPACTION-THRESHOLD-INVALID" => ErrorCode::CompactionThresholdInvalid,
596            "AKRIBES-E-COMPACTOR-SIGNATURE" => ErrorCode::CompactorSignature,
597            "AKRIBES-E-COMPACTION-LOOP-ONLY" => ErrorCode::CompactionLoopOnly,
598            "AKRIBES-E-STD-FORMAT-MISS-001" => ErrorCode::StdFormatMissing,
599            "AKRIBES-E-STD-FORMAT-SYNTAX-001" => ErrorCode::StdFormatSyntax,
600            "AKRIBES-E-STD-JSON-PARSE-001" => ErrorCode::StdJsonParse,
601            "AKRIBES-E-STD-JSON-STRINGIFY-001" => ErrorCode::StdJsonStringify,
602            "AKRIBES-E-STD-REGEX-001" => ErrorCode::StdRegexInvalid,
603            "AKRIBES-E-OTHER" => {
604                // Issue #1039: `AKRIBES-E-OTHER` is the explicit
605                // unclassified-fallback bucket. Decoding a wire payload
606                // tagged with it is legal, but every occurrence is a hint
607                // that an upstream producer skipped a more specific code.
608                // Surface that via a warn-log so prod can attribute the
609                // drift to the producing component (server / SDK / runner)
610                // rather than silently flattening to `ScriptError`.
611                tracing::warn!(
612                    target: "akribes_types::error",
613                    wire_code = "AKRIBES-E-OTHER",
614                    "decoded fallback ErrorCode::Other from wire payload —                      the producing component skipped a more specific AKRIBES-E-* code"
615                );
616                ErrorCode::Other
617            }
618            _ => return None,
619        };
620        Some(code)
621    }
622
623    /// Default user-facing message for this code. Constructors should
624    /// override only when there is meaningfully more to say to the user
625    /// (e.g. embedding the offending value), not just to restate the
626    /// developer message.
627    pub fn default_user_message(&self) -> &'static str {
628        match self {
629            ErrorCode::UserCancelled => {
630                "The execution was cancelled."
631            }
632            ErrorCode::ExecutionTimeout => {
633                "The workflow ran past its time budget. Try a smaller input, simplify the workflow, or raise AKRIBES_EXECUTION_TIMEOUT."
634            }
635            ErrorCode::CheckpointTimeout => {
636                "A checkpoint waited longer than its on_timeout window without a resume."
637            }
638            ErrorCode::ProviderRateLimit => {
639                "The model provider rate-limited the request. Wait a moment and retry; consider lowering concurrency."
640            }
641            ErrorCode::ProviderAuth => {
642                "The model provider rejected our credentials. Check the provider's API key and that the configured model is enabled."
643            }
644            ErrorCode::ProviderTokenLimit => {
645                "The prompt exceeds the model's context window. Reduce input length, use a larger-context model, or split the work."
646            }
647            ErrorCode::ProviderServer => {
648                "The model provider returned a server-side error. Retrying is usually appropriate."
649            }
650            ErrorCode::ProviderServer500 => {
651                "The model provider returned HTTP 500. The origin reported an internal error; a retry with a short backoff is usually appropriate."
652            }
653            ErrorCode::ProviderBadGateway502 => {
654                "The model provider returned HTTP 502 (bad gateway). The edge fronted a failing origin; retry with a short backoff."
655            }
656            ErrorCode::ProviderServiceUnavailable503 => {
657                "The model provider returned HTTP 503 (service unavailable). This is rate-limit-adjacent — honour Retry-After if the provider sent one, otherwise back off."
658            }
659            ErrorCode::ProviderGatewayTimeout504 => {
660                "The model provider returned HTTP 504 (gateway timeout). The upstream is slow or stuck; retry with a longer backoff before alerting."
661            }
662            ErrorCode::ProviderNetwork => {
663                "Could not reach the model provider (network/DNS/TLS/timeout). Retry; check connectivity if it persists."
664            }
665            ErrorCode::ProviderParse => {
666                "The model produced output that didn't fit the declared schema. Check the prompt and the type definition."
667            }
668            ErrorCode::ProviderOther => {
669                "The model provider failed with an unclassified error."
670            }
671            ErrorCode::InternalPanic => {
672                "An internal Akribes task crashed (AKRIBES-E-INTERNAL-PANIC). \
673                 This is a bug. Report with the execution id at \
674                 https://github.com/PodestaAI/akribes-sdks/issues."
675            }
676            ErrorCode::InternalDroppedChannel => {
677                "An internal Akribes channel was closed unexpectedly (AKRIBES-E-INTERNAL-DROPPED-CHANNEL). \
678                 This is usually a bug. Report with the execution id at \
679                 https://github.com/PodestaAI/akribes-sdks/issues."
680            }
681            ErrorCode::InternalDeadlock => {
682                "Akribes detected a stuck workflow graph (AKRIBES-E-INTERNAL-DEADLOCK). \
683                 This is a compiler/engine bug. Report at \
684                 https://github.com/PodestaAI/akribes-sdks/issues."
685            }
686            ErrorCode::InternalTaskAborted => {
687                "An internal task was aborted unexpectedly (AKRIBES-E-INTERNAL-TASK-ABORTED). \
688                 This is usually a bug. Report at \
689                 https://github.com/PodestaAI/akribes-sdks/issues."
690            }
691            ErrorCode::InternalOther => {
692                "An unspecified internal error occurred (AKRIBES-E-INTERNAL-OTHER). \
693                 Report with the execution id at \
694                 https://github.com/PodestaAI/akribes-sdks/issues."
695            }
696            ErrorCode::ScriptError => {
697                "The workflow encountered a runtime error. Check task logic, types, and inputs."
698            }
699            ErrorCode::ScriptDepthExceeded => {
700                "Workflow call(...) chain exceeded the recursion cap. Refactor to reduce nesting."
701            }
702            ErrorCode::PartialRetryExhausted => {
703                "All validation retries on a partial-retry task were exhausted."
704            }
705            ErrorCode::AuthorRaise => {
706                "The workflow's failure path fired (the LLM returned an Unable or non-success variant the script mapped to fail)."
707            }
708            ErrorCode::ToolBudgetExceeded => {
709                "An agent exceeded its tool_budget cap. Increase the cap or reduce tool use."
710            }
711            ErrorCode::ToolApprovalProtocol => {
712                "Tool approval received an unexpected payload. This is a host-integration bug."
713            }
714            ErrorCode::ToolNoRegistry => {
715                "A tool call was attempted but no MCP registry is attached. Configure mcp_server / mcp_registry, or run via a host that wires the registry."
716            }
717            ErrorCode::ToolError => {
718                "An MCP tool returned an error. Check tool configuration and the upstream service."
719            }
720            ErrorCode::AgentToolsDoubleDispatch => {
721                "An agent invoked tools more than once in a single dispatch. Agents are single-round-trip — use a `loop` block for multi-turn tool use."
722            }
723            ErrorCode::ConfigMissing => {
724                "Required configuration is missing (API key, env var, or provider setup)."
725            }
726            ErrorCode::LoopOutputBudgetExceeded => {
727                "A `loop` block exceeded its `max_total_output_tokens` cap. Raise the cap or shorten per-turn output."
728            }
729            ErrorCode::LoopMultiCheckpoint => {
730                "A loop turn fired more than one checkpoint. One checkpoint per turn is the supported envelope — split them across turns or move one outside the loop."
731            }
732            ErrorCode::ContextOverflow => {
733                "The conversation exceeds the model's context window. Configure `compaction:` on the agent (e.g. `compaction: at 80%`) or pick a model with a larger window."
734            }
735            ErrorCode::ContextNativeUnsupported => {
736                "This model doesn't support server-side native compaction. Pick a capable model (opus_4_7, opus_4_6, sonnet_4_6, gpt_5_3_codex, gpt_5_5) or switch to a custom compaction chain."
737            }
738            ErrorCode::ContextCompactionExhausted => {
739                "The compaction chain ran every configured step and the conversation still exceeds the configured cap. Add a terminal step (truncate or native) or raise the cap."
740            }
741            ErrorCode::CompactionThresholdInvalid => {
742                "A compaction threshold is invalid. Use 1..=100 with `%`, or a positive absolute token count."
743            }
744            ErrorCode::CompactorSignature => {
745                "User-defined compactor must have signature `(history: str | list[message]) -> str | list[message]`."
746            }
747            ErrorCode::CompactionLoopOnly => {
748                "`compact_to_state(...)` may only appear inside a loop's `compaction:` block — move it under the loop, or use a different primitive on the agent."
749            }
750            ErrorCode::StdFormatMissing => {
751                "`std.format` is missing a placeholder key. Pass every `{name}` in the template via the `args` map."
752            }
753            ErrorCode::StdFormatSyntax => {
754                "`std.format` template has malformed brace syntax. Use `{name}` for placeholders, `{{` / `}}` for literal braces."
755            }
756            ErrorCode::StdJsonParse => {
757                "`std.json_parse` could not parse the input as JSON."
758            }
759            ErrorCode::StdJsonStringify => {
760                "`std.json_stringify` could not serialise the value. Check for control-plane values (FatalError) and non-JSON shapes."
761            }
762            ErrorCode::StdRegexInvalid => {
763                "`std.regex_extract` was given an invalid regex pattern. Check the syntax against the Rust `regex` crate's rules."
764            }
765            ErrorCode::Other => {
766                "An error occurred. See the developer message for detail."
767            }
768        }
769    }
770}
771
772/// Where in the workflow an error originated. Every field is optional —
773/// fill what you know, leave the rest. SDKs render whichever fields are
774/// present; downstream tools (logs, OTel) read them as structured
775/// attributes for filtering/aggregation.
776#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
777pub struct ErrorSource {
778    /// Workflow-author-declared task name (matches `task <name>` in source).
779    #[serde(skip_serializing_if = "Option::is_none")]
780    pub task: Option<String>,
781    /// Agent name from the matching `agent <name>` declaration.
782    #[serde(skip_serializing_if = "Option::is_none")]
783    pub agent: Option<String>,
784    /// Provider id when the error came from an LLM/provider call
785    /// (`anthropic`, `google`, `openai`, …).
786    #[serde(skip_serializing_if = "Option::is_none")]
787    pub provider: Option<String>,
788    /// Model alias (`opus_4_7`, `gpt_4o_mini`, …).
789    #[serde(skip_serializing_if = "Option::is_none")]
790    pub model: Option<String>,
791    /// MCP `<alias>.<tool>` reference when the error came from a tool call.
792    #[serde(skip_serializing_if = "Option::is_none")]
793    pub tool_ref: Option<String>,
794    /// Script name when the error came from a sub-script (`call(...)`).
795    #[serde(skip_serializing_if = "Option::is_none")]
796    pub script: Option<String>,
797    /// 1-indexed source line in the originating `.akr` file.
798    #[serde(skip_serializing_if = "Option::is_none")]
799    pub line: Option<u32>,
800}
801
802impl ErrorSource {
803    pub fn empty() -> Self {
804        Self::default()
805    }
806
807    pub fn is_empty(&self) -> bool {
808        self == &Self::default()
809    }
810
811    /// Builder helpers — chainable, infallible.
812    pub fn with_task(mut self, task: impl Into<String>) -> Self {
813        self.task = Some(task.into());
814        self
815    }
816    pub fn with_agent(mut self, agent: impl Into<String>) -> Self {
817        self.agent = Some(agent.into());
818        self
819    }
820    pub fn with_provider(mut self, provider: impl Into<String>) -> Self {
821        self.provider = Some(provider.into());
822        self
823    }
824    pub fn with_model(mut self, model: impl Into<String>) -> Self {
825        self.model = Some(model.into());
826        self
827    }
828    pub fn with_tool_ref(mut self, tool_ref: impl Into<String>) -> Self {
829        self.tool_ref = Some(tool_ref.into());
830        self
831    }
832    pub fn with_script(mut self, script: impl Into<String>) -> Self {
833        self.script = Some(script.into());
834        self
835    }
836    pub fn with_line(mut self, line: u32) -> Self {
837        self.line = Some(line);
838        self
839    }
840}
841
842/// Structured failure detail attached to a [`crate::value::Value::FatalError`] and to
843/// every `EngineEvent::Error`. Replaces the previous `(message, kind)`
844/// shape with a richer envelope so SDKs can decide what to do, users get
845/// actionable text, and developers get structured fields for OTel/logs.
846///
847/// Construction patterns:
848///
849/// * Simple: `ErrorDetail::from_kind(ErrorKind::ScriptError, "div by zero")`
850///   — pulls a generic code (`AKRIBES-E-SCRIPT-ERROR`) and the kind's default
851///   user message.
852/// * Specific: `ErrorDetail::new(ErrorCode::ProviderRateLimit, "...")`
853///   — code drives kind + default user_message via [`ErrorCode::kind`].
854/// * With retry hint: `.with_retry_after_ms(30_000)`.
855/// * With source: `.with_source(ErrorSource::default().with_task("foo"))`.
856#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
857pub struct ErrorDetail {
858    pub kind: ErrorKind,
859    pub code: ErrorCode,
860    /// Developer-facing message — full detail, may include sanitized
861    /// stack/protocol fragments. Always non-empty.
862    pub message: String,
863    /// User-facing single-paragraph summary + suggested action. Always
864    /// non-empty (defaults to [`ErrorCode::default_user_message`]).
865    pub user_message: String,
866    /// When the provider supplied a `Retry-After` (or equivalent), the
867    /// suggested wait in milliseconds. None when not known.
868    #[serde(skip_serializing_if = "Option::is_none")]
869    pub retry_after_ms: Option<u64>,
870    /// Where the error originated. Empty (`is_empty()`) when no
871    /// attribution is available.
872    #[serde(skip_serializing_if = "ErrorSource::is_empty", default)]
873    pub source: ErrorSource,
874}
875
876impl ErrorDetail {
877    /// Construct from a code + developer message. Kind and user_message
878    /// are derived from the code.
879    pub fn new(code: ErrorCode, message: impl Into<String>) -> Self {
880        Self {
881            kind: code.kind(),
882            code,
883            message: message.into(),
884            user_message: code.default_user_message().to_string(),
885            retry_after_ms: None,
886            source: ErrorSource::default(),
887        }
888    }
889
890    /// Construct from an existing kind when no specific code is yet
891    /// available. Picks the closest "Other" code for that kind, and for
892    /// rate-limit/server messages also extracts a `retry_after_ms` hint
893    /// when the upstream response embedded one.
894    pub fn from_kind(kind: ErrorKind, message: impl Into<String>) -> Self {
895        let message = message.into();
896        let code = match kind {
897            ErrorKind::RateLimit => ErrorCode::ProviderRateLimit,
898            ErrorKind::AuthError => ErrorCode::ProviderAuth,
899            ErrorKind::TokenLimit => ErrorCode::ProviderTokenLimit,
900            ErrorKind::ServerError500 => ErrorCode::ProviderServer500,
901            ErrorKind::BadGateway502 => ErrorCode::ProviderBadGateway502,
902            ErrorKind::ServiceUnavailable503 => ErrorCode::ProviderServiceUnavailable503,
903            ErrorKind::GatewayTimeout504 => ErrorCode::ProviderGatewayTimeout504,
904            ErrorKind::NetworkError => ErrorCode::ProviderNetwork,
905            ErrorKind::ParseError => ErrorCode::ProviderParse,
906            ErrorKind::Cancelled => ErrorCode::UserCancelled,
907            ErrorKind::Timeout => ErrorCode::ExecutionTimeout,
908            ErrorKind::ScriptError => ErrorCode::ScriptError,
909            ErrorKind::AuthorRaise => ErrorCode::AuthorRaise,
910            ErrorKind::ScriptDepthExceeded => ErrorCode::ScriptDepthExceeded,
911            ErrorKind::Panic => ErrorCode::InternalPanic,
912            ErrorKind::Internal => ErrorCode::InternalOther,
913        };
914        // Best-effort retry hint extraction for transient kinds. Cheap
915        // (single substring scan) and only relevant for kinds that
916        // would benefit from the hint.
917        let retry_after_ms = if matches!(
918            kind,
919            ErrorKind::RateLimit
920                | ErrorKind::ServerError500
921                | ErrorKind::BadGateway502
922                | ErrorKind::ServiceUnavailable503
923                | ErrorKind::GatewayTimeout504
924                | ErrorKind::NetworkError
925        ) {
926            ErrorCode::parse_retry_after_ms(&message)
927        } else {
928            None
929        };
930        Self {
931            kind,
932            code,
933            message,
934            user_message: code.default_user_message().to_string(),
935            retry_after_ms,
936            source: ErrorSource::default(),
937        }
938    }
939
940    /// Override the user-facing message. Use when the default for the
941    /// code isn't specific enough (e.g. embedding the offending value).
942    pub fn with_user_message(mut self, msg: impl Into<String>) -> Self {
943        self.user_message = msg.into();
944        self
945    }
946
947    pub fn with_retry_after_ms(mut self, ms: u64) -> Self {
948        self.retry_after_ms = Some(ms);
949        self
950    }
951
952    pub fn with_source(mut self, source: ErrorSource) -> Self {
953        self.source = source;
954        self
955    }
956
957    /// Convenience: builder-style task attribution.
958    pub fn with_task(mut self, task: impl Into<String>) -> Self {
959        self.source.task = Some(task.into());
960        self
961    }
962
963    /// Whether retrying as-is may succeed. Pulls from the kind plus
964    /// `retry_after_ms` (an explicit hint always implies retryable).
965    pub fn is_retryable(&self) -> bool {
966        self.retry_after_ms.is_some() || self.kind.is_transient()
967    }
968
969    pub fn suggested_action(&self) -> SuggestedAction {
970        self.kind.suggested_action()
971    }
972}
973
974impl std::fmt::Display for ErrorDetail {
975    /// Renders as `<wire-code>: <message>` so log lines and the legacy
976    /// "string error" shape stay readable. Use the structured fields
977    /// directly when constructing JSON wire payloads.
978    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
979        write!(f, "{}: {}", self.code.as_wire(), self.message)
980    }
981}
982
983impl std::fmt::Display for ErrorKind {
984    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
985        match self {
986            ErrorKind::RateLimit => write!(f, "rate limit"),
987            ErrorKind::AuthError => write!(f, "authentication error"),
988            ErrorKind::TokenLimit => write!(f, "token limit"),
989            ErrorKind::ServerError500 => write!(f, "server error (HTTP 500)"),
990            ErrorKind::BadGateway502 => write!(f, "bad gateway (HTTP 502)"),
991            ErrorKind::ServiceUnavailable503 => write!(f, "service unavailable (HTTP 503)"),
992            ErrorKind::GatewayTimeout504 => write!(f, "gateway timeout (HTTP 504)"),
993            ErrorKind::NetworkError => write!(f, "network error"),
994            ErrorKind::ParseError => write!(f, "parse error"),
995            ErrorKind::Cancelled => write!(f, "cancelled"),
996            ErrorKind::Timeout => write!(f, "timeout"),
997            ErrorKind::ScriptError => write!(f, "script error"),
998            ErrorKind::AuthorRaise => write!(f, "author raise"),
999            ErrorKind::ScriptDepthExceeded => write!(f, "script depth exceeded"),
1000            ErrorKind::Panic => write!(f, "panic"),
1001            ErrorKind::Internal => write!(f, "internal error"),
1002        }
1003    }
1004}