Skip to main content

akribes_types/
error.rs

1//! Structured error envelope shared by core and the SDK.
2//!
3//! This is the wire-level slice of the `akribes-core::error` module: the
4//! [`ErrorKind`] / [`ErrorCode`] enums plus their pure-data impls
5//! (`as_wire`, `from_wire`, `kind`, `default_user_message`,
6//! `suggested_action`, `is_transient`, `is_server_error`,
7//! `is_user_actionable`, `base_backoff_ms`), the [`ErrorSource`] /
8//! [`ErrorDetail`] envelopes, the [`SuggestedAction`] tag, and the
9//! [`ErrorCode::parse_retry_after_ms`] retry-after hint parser.
10//!
11//! Functions that bring in heavier deps (regex-backed `sanitize_error` and
12//! `ErrorKind::classify`, the tokio-backed `CancelTracker` / `CancelReason`,
13//! the regex-backed `ErrorCode::classify_provider_error`) stay in
14//! `akribes_core::error` so the types crate keeps its dependency surface
15//! to `serde`, `serde_json`, `thiserror`, `httpdate`, and `tracing`.
16
17use serde::{Deserialize, Serialize};
18
19/// Coarse error category. Use [`ErrorCode`] for the finer-grained, stable
20/// identifier that consumers should branch on; `ErrorKind` is the rollup
21/// every code belongs to (so a UI can show one bucket, an SDK can decide
22/// "is this retryable" without enumerating every code).
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
24pub enum ErrorKind {
25    RateLimit,
26    AuthError,
27    TokenLimit,
28    /// Upstream HTTP 500 — generic provider-side failure. Maybe-transient;
29    /// retry with a short exponential backoff (issue #1296 split). Replaces
30    /// the legacy umbrella `ServerError` for the 500 case specifically so
31    /// retry policies and metrics can distinguish "internal server error"
32    /// from "bad gateway" / "service unavailable" / "gateway timeout".
33    ServerError500,
34    /// Upstream HTTP 502 — bad gateway, the provider's edge fronted a
35    /// failing origin. Retry with a short backoff (issue #1296 split).
36    BadGateway502,
37    /// Upstream HTTP 503 — service unavailable, rate-limit-adjacent.
38    /// Honour `Retry-After` aggressively when the provider sent one
39    /// (issue #1296 split). Default backoff matches `RateLimit` since the
40    /// remediation pattern (wait for capacity) is the same.
41    ServiceUnavailable503,
42    /// Upstream HTTP 504 — gateway timeout. The provider's gateway didn't
43    /// get an answer from the origin in time. Use a longer base backoff
44    /// since the slow side is unlikely to recover faster than the request
45    /// shape itself (issue #1296 split).
46    GatewayTimeout504,
47    NetworkError,
48    ParseError,
49    Cancelled,
50    /// Server-side execution-budget timeout (`AKRIBES_EXECUTION_TIMEOUT`),
51    /// or a checkpoint that elapsed its declared `on_timeout` window.
52    /// Distinct from `Cancelled` (explicit user/client cancel) so consumers
53    /// can tell "the workflow was stopped on purpose" from "the workflow
54    /// ran past its budget" — the latter is a service-level error, not a
55    /// user action. Distinct from `NetworkError`'s "timeout" classification,
56    /// which covers per-provider network timeouts inside a still-running
57    /// execution.
58    Timeout,
59    ScriptError,
60    /// Workflow-author-initiated failure — the LLM returned a non-success
61    /// variant (Unable / a custom failure arm) and the author mapped it to
62    /// `fail` (explicit `on <V> fail` or implicit no-trailer default).
63    /// Distinguished from `ScriptError` so the workflow runner can retry
64    /// the failing task up to `workflow_retries` times before surfacing
65    /// the failure to the caller (issue #312). Retry exhaustion converts
66    /// this to a `ScriptError` to preserve existing handler behavior.
67    AuthorRaise,
68    /// Cross-script `call(...)` chain exceeded the engine's `SUBSCRIPT_MAX_DEPTH`
69    /// (issue #429, `AKRIBES-E-SCRIPT-DEPTH`).
70    ScriptDepthExceeded,
71    /// A spawned tokio task in the engine panicked (typically `unwrap()`
72    /// on `None`, divide-by-zero in stdlib, or an `expect()` blowing).
73    /// Distinct from `ScriptError` because the workflow author didn't
74    /// cause it — it indicates an engine bug that should be filed.
75    /// Surfaces as `AKRIBES-E-INTERNAL-PANIC`.
76    Panic,
77    /// An invariant inside the engine/server was violated — a `oneshot`
78    /// sender was dropped without sending, a deadlock was detected, an
79    /// MCP protocol violation, etc. Always indicates a bug in Akribes
80    /// itself, not in user code or in a third-party provider.
81    Internal,
82}
83
84/// What the client/user/runner should do in response. Derived from
85/// [`ErrorKind`] (see [`ErrorKind::suggested_action`]) so consumers don't
86/// have to maintain their own switch statement.
87#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
88#[serde(rename_all = "kebab-case")]
89pub enum SuggestedAction {
90    /// Retry the operation as-is (no input change required). Pair with
91    /// [`ErrorDetail::retry_after_ms`] when known.
92    Retry,
93    /// The error is the operator's responsibility — fix configuration
94    /// (API keys, model setup, env vars).
95    FixConfig,
96    /// The error is the workflow author's responsibility — the script,
97    /// prompts, or types need editing.
98    FixScript,
99    /// The input was too large or wrong-shape for the current run. The
100    /// caller should reduce or correct it before retrying.
101    FixInput,
102    /// The workflow's `on <variant> fail` (or default failure handling)
103    /// fired — the caller should treat the failed result as authored
104    /// flow rather than bug.
105    HandleAuthorFailure,
106    /// User cancelled — no remediation needed.
107    None,
108    /// Looks like an Akribes bug. The caller should report (with the error
109    /// code + execution id) rather than retry blindly.
110    Report,
111}
112
113impl ErrorKind {
114    /// Whether the underlying condition is expected to clear on its own —
115    /// i.e. the same request retried later may succeed without any input
116    /// change. Pairs with [`SuggestedAction::Retry`].
117    pub fn is_transient(&self) -> bool {
118        matches!(
119            self,
120            ErrorKind::RateLimit
121                | ErrorKind::ServerError500
122                | ErrorKind::BadGateway502
123                | ErrorKind::ServiceUnavailable503
124                | ErrorKind::GatewayTimeout504
125                | ErrorKind::NetworkError
126        )
127    }
128
129    /// True for any of the four upstream 5xx variants (#1296). Use this in
130    /// places that need the umbrella "the provider returned a 5xx" check
131    /// without enumerating every status. Pair with [`is_transient`] when
132    /// the rate-limit / network-error siblings should also count.
133    pub fn is_server_error(&self) -> bool {
134        matches!(
135            self,
136            ErrorKind::ServerError500
137                | ErrorKind::BadGateway502
138                | ErrorKind::ServiceUnavailable503
139                | ErrorKind::GatewayTimeout504
140        )
141    }
142
143    /// Base backoff for the per-error retry loop in milliseconds. Drives
144    /// the per-variant retry semantics introduced by issue #1296:
145    ///
146    /// | Kind                         | Base | Rationale                                            |
147    /// |------------------------------|------|------------------------------------------------------|
148    /// | `RateLimit`                  | 2000 | Honour `Retry-After`; otherwise a 2s start.          |
149    /// | `ServerError500`             | 1000 | Maybe-transient origin failure — short doubling.     |
150    /// | `BadGateway502`              | 1000 | Edge fronted a failing origin — short doubling.      |
151    /// | `ServiceUnavailable503`      | 2000 | Capacity-adjacent — start at the rate-limit cadence. |
152    /// | `GatewayTimeout504`          | 4000 | Slow upstream — longer base before retrying.         |
153    /// | `NetworkError`               | 1000 | Connection-level recoverable.                        |
154    ///
155    /// All other variants return `None` (non-transient).
156    pub fn base_backoff_ms(&self) -> Option<u64> {
157        Some(match self {
158            ErrorKind::RateLimit => 2_000,
159            ErrorKind::ServerError500 => 1_000,
160            ErrorKind::BadGateway502 => 1_000,
161            ErrorKind::ServiceUnavailable503 => 2_000,
162            ErrorKind::GatewayTimeout504 => 4_000,
163            ErrorKind::NetworkError => 1_000,
164            _ => return None,
165        })
166    }
167
168    /// Whether the user (operator or workflow author) can fix this by
169    /// changing something — config, script, or input. Used to gate
170    /// "show actionable diagnostic UI" vs "just report it."
171    pub fn is_user_actionable(&self) -> bool {
172        matches!(
173            self,
174            ErrorKind::AuthError
175                | ErrorKind::TokenLimit
176                | ErrorKind::Timeout
177                | ErrorKind::ScriptError
178                | ErrorKind::ScriptDepthExceeded
179                | ErrorKind::AuthorRaise
180        )
181    }
182
183    /// Stable, machine-parseable identifier for the kind. Use this for
184    /// wire payloads, log fields, and the `error_kind` DB column.
185    /// Distinct from [`std::fmt::Display`] (which returns a human-readable
186    /// phrase like `"rate limit"`) and from `Debug` (which is intentional
187    /// here but not load-bearing).
188    pub fn as_wire(&self) -> &'static str {
189        match self {
190            ErrorKind::RateLimit => "RateLimit",
191            ErrorKind::AuthError => "AuthError",
192            ErrorKind::TokenLimit => "TokenLimit",
193            ErrorKind::ServerError500 => "ServerError500",
194            ErrorKind::BadGateway502 => "BadGateway502",
195            ErrorKind::ServiceUnavailable503 => "ServiceUnavailable503",
196            ErrorKind::GatewayTimeout504 => "GatewayTimeout504",
197            ErrorKind::NetworkError => "NetworkError",
198            ErrorKind::ParseError => "ParseError",
199            ErrorKind::Cancelled => "Cancelled",
200            ErrorKind::Timeout => "Timeout",
201            ErrorKind::ScriptError => "ScriptError",
202            ErrorKind::AuthorRaise => "AuthorRaise",
203            ErrorKind::ScriptDepthExceeded => "ScriptDepthExceeded",
204            ErrorKind::Panic => "Panic",
205            ErrorKind::Internal => "Internal",
206        }
207    }
208
209    /// What the caller should do — see [`SuggestedAction`].
210    pub fn suggested_action(&self) -> SuggestedAction {
211        match self {
212            ErrorKind::RateLimit
213            | ErrorKind::ServerError500
214            | ErrorKind::BadGateway502
215            | ErrorKind::ServiceUnavailable503
216            | ErrorKind::GatewayTimeout504
217            | ErrorKind::NetworkError => SuggestedAction::Retry,
218            ErrorKind::AuthError => SuggestedAction::FixConfig,
219            ErrorKind::TokenLimit => SuggestedAction::FixInput,
220            ErrorKind::Timeout => SuggestedAction::FixInput,
221            ErrorKind::ScriptError | ErrorKind::ScriptDepthExceeded | ErrorKind::ParseError => {
222                SuggestedAction::FixScript
223            }
224            ErrorKind::AuthorRaise => SuggestedAction::HandleAuthorFailure,
225            ErrorKind::Cancelled => SuggestedAction::None,
226            ErrorKind::Panic | ErrorKind::Internal => SuggestedAction::Report,
227        }
228    }
229}
230
231/// Stable, fine-grained error identifier. Each code maps to exactly one
232/// [`ErrorKind`] and carries a default user-facing message. Wire form:
233/// `AKRIBES-E-<UPPER-KEBAB>` (e.g. `AKRIBES-E-PROVIDER-RATE-LIMIT`).
234///
235/// Codes are intentionally durable: once published, the wire string and
236/// `kind()` mapping should not change. Add new variants for new
237/// conditions rather than repurposing old ones; SDKs match on these
238/// strings to drive retry/UI/triage logic.
239#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
240pub enum ErrorCode {
241    /// Explicit user/client cancellation via `POST /executions/:id/cancel`.
242    UserCancelled,
243    /// Server-side execution-budget timeout (`AKRIBES_EXECUTION_TIMEOUT`).
244    ExecutionTimeout,
245    /// `on_timeout` window on a checkpoint elapsed without a resume.
246    CheckpointTimeout,
247    /// Provider returned a 429 / rate-limit / quota-exhausted response.
248    ProviderRateLimit,
249    /// Provider returned 401/403, or an API key was missing / not configured.
250    ProviderAuth,
251    /// Provider's reported context window or `max_tokens` was exceeded.
252    ProviderTokenLimit,
253    /// Legacy umbrella for any 5xx — kept for wire backward-compat after
254    /// the issue #1296 split. New code should construct one of
255    /// [`ProviderServer500`], [`ProviderBadGateway502`],
256    /// [`ProviderServiceUnavailable503`], or [`ProviderGatewayTimeout504`].
257    /// Decoding the old `AKRIBES-E-PROVIDER-SERVER` wire string yields this
258    /// variant so SDKs that match on it stay green.
259    ProviderServer,
260    /// Provider returned HTTP 500. Maybe-transient; short retry-with-backoff.
261    ProviderServer500,
262    /// Provider returned HTTP 502 (bad gateway). Retry with short backoff.
263    ProviderBadGateway502,
264    /// Provider returned HTTP 503 (service unavailable). Rate-limit-adjacent
265    /// — honour `Retry-After` aggressively.
266    ProviderServiceUnavailable503,
267    /// Provider returned HTTP 504 (gateway timeout). Longer base backoff
268    /// since the slow side is unlikely to recover faster than the request.
269    ProviderGatewayTimeout504,
270    /// Network-level failure reaching the provider (DNS, TLS, reset,
271    /// per-provider request timeout).
272    ProviderNetwork,
273    /// Provider response did not parse as the expected schema.
274    ProviderParse,
275    /// Generic provider/runtime failure that didn't fit a more specific bucket.
276    ProviderOther,
277    /// A spawned engine task panicked (host-side bug).
278    InternalPanic,
279    /// A `oneshot::Receiver` returned `Err` because its sender was dropped
280    /// before sending — covers breakpoint resume, checkpoint resume,
281    /// tool-approval resume. Indicates a server-side cleanup race or a
282    /// host bug, never a user action.
283    InternalDroppedChannel,
284    /// Engine reached a state with pending nodes but none ready to run —
285    /// dependency cycle or compiler bug.
286    InternalDeadlock,
287    /// JoinError that wasn't a panic and wasn't a recognized cancel —
288    /// tokio runtime aborted the task externally. Treated as a host
289    /// invariant violation.
290    InternalTaskAborted,
291    /// Generic "this should not happen" host failure that doesn't fit
292    /// the more specific Internal* codes.
293    InternalOther,
294    /// Generic workflow-author error not categorised more specifically.
295    ScriptError,
296    /// Cross-script `call(...)` chain exceeded the depth cap.
297    ScriptDepthExceeded,
298    /// Validation retries exhausted with `allow_partial: true` (issue #202)
299    /// — partial-retry sentinel routed to `on unable` / handler.
300    PartialRetryExhausted,
301    /// Workflow author's `fail` arm fired — the LLM returned an Unable
302    /// or other non-success variant the author mapped to failure.
303    AuthorRaise,
304    /// Per-agent tool budget (`tool_budget`) exceeded.
305    ToolBudgetExceeded,
306    /// Tool approval resume returned a payload that wasn't the expected
307    /// `{ approve: bool, args?: Value }` shape — host protocol violation.
308    ToolApprovalProtocol,
309    /// Tool call attempted but no MCP registry was attached (script-only
310    /// engine, missing host wiring).
311    ToolNoRegistry,
312    /// MCP tool call returned a tool-side failure (the registry exists
313    /// and dispatched, but the tool itself errored).
314    ToolError,
315    /// Agent dispatched a second `tool_use` after the engine had already
316    /// folded one round-trip's tool results back into the conversation.
317    /// Agents are single-round-trip by design — multi-turn agentic
318    /// behaviour belongs on a `loop` block. Surfaces when an LLM ignores
319    /// the synthesized "produce your final answer now" follow-up turn
320    /// and tries to invoke tools again. The fix is either to use a
321    /// `loop` block or to tighten the agent's system prompt.
322    AgentToolsDoubleDispatch,
323    /// Required configuration missing (API key, env var, etc.).
324    ConfigMissing,
325    /// Loop block exceeded its `max_total_output_tokens` budget. The
326    /// loop driver accumulates each turn's `output_tokens` from the
327    /// provider and stamps this code on the resulting `LoopEnd { value:
328    /// FatalError }` once the running total exceeds the per-loop or
329    /// project-default cap.
330    LoopOutputBudgetExceeded,
331    /// A second checkpoint fired in the same `loop` turn. The supported
332    /// envelope is at most one checkpoint per turn — the driver tracks a
333    /// per-turn counter and fails fast when the increment goes past 1.
334    /// Surfacing this as a distinct code (rather than `Other`) lets SDKs
335    /// and the Studio render a targeted explanation: split the
336    /// checkpoints across turns, or move one onto a non-loop sibling.
337    LoopMultiCheckpoint,
338    /// Mode 1 (`compaction: none` / omitted) only — the assembled
339    /// request would exceed the model's context window. Pre-call
340    /// diagnostic emitted by `Engine::run_compaction_chain`; replaces
341    /// the cryptic provider 400 from upstream. Carries the conversation
342    /// length, the model cap, and the agent in the message body.
343    ContextOverflow,
344    /// `compaction: native()` (or `at <T>: native()`) used with a model
345    /// whose `ModelEntry::native_compaction_capable` is `false`. The
346    /// related-info span points at the model declaration.
347    ContextNativeUnsupported,
348    /// All custom-chain steps ran and the conversation still exceeds
349    /// the configured cap. Surfaces with the chain of attempted
350    /// strategies in the message body. Emitted instead of a provider
351    /// 400 — fail-fast at the akribes seam.
352    ContextCompactionExhausted,
353    /// `compaction: at <invalid>` — value <= 0, percent > 100, or
354    /// duplicate threshold in a custom chain. Compile-time only.
355    CompactionThresholdInvalid,
356    /// User-defined compactor task referenced from a compaction step
357    /// doesn't match one of the four supported signatures
358    /// (`str|list[message] -> str|list[message]`). Compile-time only.
359    CompactorSignature,
360    /// `compact_to_state(field=...)` used outside a loop's `compaction:`
361    /// block. The primitive is loop-only because it writes into the
362    /// loop's state record. Compile-time only.
363    CompactionLoopOnly,
364    /// `std.format` placeholder `{name}` not present in args.
365    /// Stable string form: `AKRIBES-E-STD-FORMAT-MISS-001` (#1224).
366    StdFormatMissing,
367    /// `std.format` malformed template (unclosed `{`, empty `{}`,
368    /// stray `}`). Stable string form: `AKRIBES-E-STD-FORMAT-SYNTAX-001`.
369    StdFormatSyntax,
370    /// `std.json_parse` could not parse the input string as JSON.
371    /// Stable string form: `AKRIBES-E-STD-JSON-PARSE-001`.
372    StdJsonParse,
373    /// `std.json_stringify` could not serialise the input value (e.g.
374    /// `FatalError` payload; non-serializable). Stable string form:
375    /// `AKRIBES-E-STD-JSON-STRINGIFY-001`.
376    StdJsonStringify,
377    /// `std.regex_extract` was given an invalid regex pattern.
378    /// Stable string form: `AKRIBES-E-STD-REGEX-001`.
379    StdRegexInvalid,
380    /// Catch-all for sites that haven't been migrated to a richer code.
381    /// Prefer adding a specific variant — this is for transition only.
382    Other,
383}
384
385impl ErrorCode {
386    /// Extract a `retry_after_ms` hint from a provider error message
387    /// when the wire response carried one (provider implementations
388    /// usually echo it as `retry-after: <secs>` or similar). None when
389    /// no such hint is present.
390    ///
391    /// Honours both [RFC 9110 §10.2.3] forms:
392    ///
393    /// 1. **delta-seconds** — `Retry-After: 30` (returned as `30_000`).
394    /// 2. **HTTP-date** — `Retry-After: Wed, 21 Oct 2026 07:28:00 GMT`
395    ///    parsed via `httpdate::parse_http_date` and returned as the
396    ///    delta from `SystemTime::now()`, clamped to `>= 0`.
397    ///
398    /// [RFC 9110 §10.2.3]: https://www.rfc-editor.org/rfc/rfc9110#section-10.2.3
399    pub fn parse_retry_after_ms(msg: &str) -> Option<u64> {
400        // `retry-after: 30` (seconds) — common HTTP convention.
401        // Match decimals; we only ever emit milliseconds.
402        let needle = "retry-after";
403        // ASCII-case-insensitive search directly in `msg` (issue #1058
404        // — using `to_lowercase().find()` shifts indices on
405        // length-changing chars like `İ`).
406        let bytes = msg.as_bytes();
407        let n_len = needle.len();
408        let start = if bytes.len() < n_len {
409            return None;
410        } else {
411            (0..=bytes.len() - n_len)
412                .find(|&i| bytes[i..i + n_len].eq_ignore_ascii_case(needle.as_bytes()))?
413        };
414        let after = &msg[start + needle.len()..];
415        // Walk past separators (`:`, `=`, whitespace).
416        let after = after.trim_start_matches(|c: char| c == ':' || c == '=' || c.is_whitespace());
417        let end = after
418            .find(|c: char| !c.is_ascii_digit() && c != '.')
419            .unwrap_or(after.len());
420        let head = &after[..end];
421        if !head.is_empty() {
422            if let Ok(secs) = head.parse::<u64>() {
423                return Some(secs.saturating_mul(1000));
424            }
425            if let Ok(secs_f) = head.parse::<f64>() {
426                if secs_f.is_finite() && secs_f >= 0.0 {
427                    return Some((secs_f * 1000.0) as u64);
428                }
429            }
430        }
431        // HTTP-date branch (#1058).
432        let date_slice = after
433            .split(['\n', '\r'])
434            .next()
435            .unwrap_or(after)
436            .trim()
437            .trim_end_matches([',', ';', '.']);
438        if date_slice.is_empty() {
439            return None;
440        }
441        if let Ok(then) = httpdate::parse_http_date(date_slice) {
442            let now = std::time::SystemTime::now();
443            match then.duration_since(now) {
444                Ok(d) => return Some(d.as_millis().min(u64::MAX as u128) as u64),
445                Err(_) => return Some(0),
446            }
447        }
448        None
449    }
450}
451
452impl ErrorCode {
453    /// The [`ErrorKind`] bucket this code belongs to. Computed once,
454    /// statically — used by consumers that want the rollup behaviour
455    /// (`is_transient`, `suggested_action`) without hand-mapping codes.
456    pub fn kind(&self) -> ErrorKind {
457        match self {
458            ErrorCode::UserCancelled => ErrorKind::Cancelled,
459            ErrorCode::ExecutionTimeout | ErrorCode::CheckpointTimeout => ErrorKind::Timeout,
460            ErrorCode::ProviderRateLimit => ErrorKind::RateLimit,
461            ErrorCode::ProviderAuth | ErrorCode::ConfigMissing => ErrorKind::AuthError,
462            ErrorCode::ProviderTokenLimit => ErrorKind::TokenLimit,
463            ErrorCode::ProviderServer => ErrorKind::ServerError500,
464            ErrorCode::ProviderServer500 => ErrorKind::ServerError500,
465            ErrorCode::ProviderBadGateway502 => ErrorKind::BadGateway502,
466            ErrorCode::ProviderServiceUnavailable503 => ErrorKind::ServiceUnavailable503,
467            ErrorCode::ProviderGatewayTimeout504 => ErrorKind::GatewayTimeout504,
468            ErrorCode::ProviderNetwork => ErrorKind::NetworkError,
469            ErrorCode::ProviderParse => ErrorKind::ParseError,
470            ErrorCode::ProviderOther => ErrorKind::ServerError500,
471            ErrorCode::InternalPanic => ErrorKind::Panic,
472            ErrorCode::InternalDroppedChannel
473            | ErrorCode::InternalDeadlock
474            | ErrorCode::InternalTaskAborted
475            | ErrorCode::InternalOther => ErrorKind::Internal,
476            ErrorCode::ScriptError
477            | ErrorCode::ToolBudgetExceeded
478            | ErrorCode::ToolApprovalProtocol
479            | ErrorCode::ToolNoRegistry
480            | ErrorCode::ToolError
481            | ErrorCode::AgentToolsDoubleDispatch
482            | ErrorCode::LoopOutputBudgetExceeded
483            | ErrorCode::LoopMultiCheckpoint
484            | ErrorCode::ContextOverflow
485            | ErrorCode::ContextNativeUnsupported
486            | ErrorCode::ContextCompactionExhausted
487            | ErrorCode::CompactionThresholdInvalid
488            | ErrorCode::CompactorSignature
489            | ErrorCode::CompactionLoopOnly
490            | ErrorCode::PartialRetryExhausted
491            | ErrorCode::StdFormatMissing
492            | ErrorCode::StdFormatSyntax
493            | ErrorCode::StdJsonParse
494            | ErrorCode::StdJsonStringify
495            | ErrorCode::StdRegexInvalid
496            | ErrorCode::Other => ErrorKind::ScriptError,
497            ErrorCode::ScriptDepthExceeded => ErrorKind::ScriptDepthExceeded,
498            ErrorCode::AuthorRaise => ErrorKind::AuthorRaise,
499        }
500    }
501
502    /// Stable wire identifier (`AKRIBES-E-<UPPER-KEBAB>`). This is the
503    /// string consumers should match on for retry/UI logic.
504    pub fn as_wire(&self) -> &'static str {
505        match self {
506            ErrorCode::UserCancelled => "AKRIBES-E-USER-CANCELLED",
507            ErrorCode::ExecutionTimeout => "AKRIBES-E-EXECUTION-TIMEOUT",
508            ErrorCode::CheckpointTimeout => "AKRIBES-E-CHECKPOINT-TIMEOUT",
509            ErrorCode::ProviderRateLimit => "AKRIBES-E-PROVIDER-RATE-LIMIT",
510            ErrorCode::ProviderAuth => "AKRIBES-E-PROVIDER-AUTH",
511            ErrorCode::ProviderTokenLimit => "AKRIBES-E-PROVIDER-TOKEN-LIMIT",
512            ErrorCode::ProviderServer => "AKRIBES-E-PROVIDER-SERVER",
513            ErrorCode::ProviderServer500 => "AKRIBES-E-PROVIDER-SERVER-500",
514            ErrorCode::ProviderBadGateway502 => "AKRIBES-E-PROVIDER-BAD-GATEWAY-502",
515            ErrorCode::ProviderServiceUnavailable503 => {
516                "AKRIBES-E-PROVIDER-SERVICE-UNAVAILABLE-503"
517            }
518            ErrorCode::ProviderGatewayTimeout504 => "AKRIBES-E-PROVIDER-GATEWAY-TIMEOUT-504",
519            ErrorCode::ProviderNetwork => "AKRIBES-E-PROVIDER-NETWORK",
520            ErrorCode::ProviderParse => "AKRIBES-E-PROVIDER-PARSE",
521            ErrorCode::ProviderOther => "AKRIBES-E-PROVIDER-OTHER",
522            ErrorCode::InternalPanic => "AKRIBES-E-INTERNAL-PANIC",
523            ErrorCode::InternalDroppedChannel => "AKRIBES-E-INTERNAL-DROPPED-CHANNEL",
524            ErrorCode::InternalDeadlock => "AKRIBES-E-INTERNAL-DEADLOCK",
525            ErrorCode::InternalTaskAborted => "AKRIBES-E-INTERNAL-TASK-ABORTED",
526            ErrorCode::InternalOther => "AKRIBES-E-INTERNAL-OTHER",
527            ErrorCode::ScriptError => "AKRIBES-E-SCRIPT-ERROR",
528            ErrorCode::ScriptDepthExceeded => "AKRIBES-E-SCRIPT-DEPTH",
529            ErrorCode::PartialRetryExhausted => "AKRIBES-E-RETRY-PARTIAL-EXHAUSTED",
530            ErrorCode::AuthorRaise => "AKRIBES-E-AUTHOR-RAISE",
531            ErrorCode::ToolBudgetExceeded => "AKRIBES-E-TOOL-BUDGET",
532            ErrorCode::ToolApprovalProtocol => "AKRIBES-E-TOOL-APPROVAL-PROTOCOL",
533            ErrorCode::ToolNoRegistry => "AKRIBES-E-TOOL-NO-REGISTRY",
534            ErrorCode::ToolError => "AKRIBES-E-TOOL-ERROR",
535            ErrorCode::AgentToolsDoubleDispatch => "AKRIBES-E-AGENT-TOOLS-DOUBLE-DISPATCH",
536            ErrorCode::ConfigMissing => "AKRIBES-E-CONFIG-MISSING",
537            ErrorCode::LoopOutputBudgetExceeded => "AKRIBES-E-LOOP-OUTPUT-BUDGET-EXCEEDED",
538            ErrorCode::LoopMultiCheckpoint => "AKRIBES-E-LOOP-MULTI-CHECKPOINT",
539            ErrorCode::ContextOverflow => "AKRIBES-E-CONTEXT-OVERFLOW",
540            ErrorCode::ContextNativeUnsupported => "AKRIBES-E-CONTEXT-NATIVE-UNSUPPORTED",
541            ErrorCode::ContextCompactionExhausted => "AKRIBES-E-CONTEXT-COMPACTION-EXHAUSTED",
542            ErrorCode::CompactionThresholdInvalid => "AKRIBES-E-COMPACTION-THRESHOLD-INVALID",
543            ErrorCode::CompactorSignature => "AKRIBES-E-COMPACTOR-SIGNATURE",
544            ErrorCode::CompactionLoopOnly => "AKRIBES-E-COMPACTION-LOOP-ONLY",
545            ErrorCode::StdFormatMissing => "AKRIBES-E-STD-FORMAT-MISS-001",
546            ErrorCode::StdFormatSyntax => "AKRIBES-E-STD-FORMAT-SYNTAX-001",
547            ErrorCode::StdJsonParse => "AKRIBES-E-STD-JSON-PARSE-001",
548            ErrorCode::StdJsonStringify => "AKRIBES-E-STD-JSON-STRINGIFY-001",
549            ErrorCode::StdRegexInvalid => "AKRIBES-E-STD-REGEX-001",
550            ErrorCode::Other => "AKRIBES-E-OTHER",
551        }
552    }
553
554    /// Parse the canonical wire form (`AKRIBES-E-<UPPER-KEBAB>`) back to a
555    /// code. Returns `None` for any string we don't recognise so the
556    /// caller can decide whether to fall back to [`ErrorCode::Other`]
557    /// or surface the unknown code as-is. Used by the legacy
558    /// `Value::fatal_with_code` shim and SDK normalisers.
559    pub fn from_wire(s: &str) -> Option<Self> {
560        let code = match s {
561            "AKRIBES-E-USER-CANCELLED" => ErrorCode::UserCancelled,
562            "AKRIBES-E-EXECUTION-TIMEOUT" => ErrorCode::ExecutionTimeout,
563            "AKRIBES-E-CHECKPOINT-TIMEOUT" => ErrorCode::CheckpointTimeout,
564            "AKRIBES-E-PROVIDER-RATE-LIMIT" => ErrorCode::ProviderRateLimit,
565            "AKRIBES-E-PROVIDER-AUTH" => ErrorCode::ProviderAuth,
566            "AKRIBES-E-PROVIDER-TOKEN-LIMIT" => ErrorCode::ProviderTokenLimit,
567            "AKRIBES-E-PROVIDER-SERVER" => ErrorCode::ProviderServer,
568            "AKRIBES-E-PROVIDER-SERVER-500" => ErrorCode::ProviderServer500,
569            "AKRIBES-E-PROVIDER-BAD-GATEWAY-502" => ErrorCode::ProviderBadGateway502,
570            "AKRIBES-E-PROVIDER-SERVICE-UNAVAILABLE-503" => {
571                ErrorCode::ProviderServiceUnavailable503
572            }
573            "AKRIBES-E-PROVIDER-GATEWAY-TIMEOUT-504" => ErrorCode::ProviderGatewayTimeout504,
574            "AKRIBES-E-PROVIDER-NETWORK" => ErrorCode::ProviderNetwork,
575            "AKRIBES-E-PROVIDER-PARSE" => ErrorCode::ProviderParse,
576            "AKRIBES-E-PROVIDER-OTHER" => ErrorCode::ProviderOther,
577            "AKRIBES-E-INTERNAL-PANIC" => ErrorCode::InternalPanic,
578            "AKRIBES-E-INTERNAL-DROPPED-CHANNEL" => ErrorCode::InternalDroppedChannel,
579            "AKRIBES-E-INTERNAL-DEADLOCK" => ErrorCode::InternalDeadlock,
580            "AKRIBES-E-INTERNAL-TASK-ABORTED" => ErrorCode::InternalTaskAborted,
581            "AKRIBES-E-INTERNAL-OTHER" => ErrorCode::InternalOther,
582            "AKRIBES-E-SCRIPT-ERROR" => ErrorCode::ScriptError,
583            "AKRIBES-E-SCRIPT-DEPTH" => ErrorCode::ScriptDepthExceeded,
584            "AKRIBES-E-RETRY-PARTIAL-EXHAUSTED" => ErrorCode::PartialRetryExhausted,
585            "AKRIBES-E-AUTHOR-RAISE" => ErrorCode::AuthorRaise,
586            "AKRIBES-E-TOOL-BUDGET" => ErrorCode::ToolBudgetExceeded,
587            "AKRIBES-E-TOOL-APPROVAL-PROTOCOL" => ErrorCode::ToolApprovalProtocol,
588            "AKRIBES-E-TOOL-NO-REGISTRY" => ErrorCode::ToolNoRegistry,
589            "AKRIBES-E-TOOL-ERROR" => ErrorCode::ToolError,
590            "AKRIBES-E-AGENT-TOOLS-DOUBLE-DISPATCH" => ErrorCode::AgentToolsDoubleDispatch,
591            "AKRIBES-E-CONFIG-MISSING" => ErrorCode::ConfigMissing,
592            "AKRIBES-E-LOOP-OUTPUT-BUDGET-EXCEEDED" => ErrorCode::LoopOutputBudgetExceeded,
593            "AKRIBES-E-LOOP-MULTI-CHECKPOINT" => ErrorCode::LoopMultiCheckpoint,
594            "AKRIBES-E-CONTEXT-OVERFLOW" => ErrorCode::ContextOverflow,
595            "AKRIBES-E-CONTEXT-NATIVE-UNSUPPORTED" => ErrorCode::ContextNativeUnsupported,
596            "AKRIBES-E-CONTEXT-COMPACTION-EXHAUSTED" => ErrorCode::ContextCompactionExhausted,
597            "AKRIBES-E-COMPACTION-THRESHOLD-INVALID" => ErrorCode::CompactionThresholdInvalid,
598            "AKRIBES-E-COMPACTOR-SIGNATURE" => ErrorCode::CompactorSignature,
599            "AKRIBES-E-COMPACTION-LOOP-ONLY" => ErrorCode::CompactionLoopOnly,
600            "AKRIBES-E-STD-FORMAT-MISS-001" => ErrorCode::StdFormatMissing,
601            "AKRIBES-E-STD-FORMAT-SYNTAX-001" => ErrorCode::StdFormatSyntax,
602            "AKRIBES-E-STD-JSON-PARSE-001" => ErrorCode::StdJsonParse,
603            "AKRIBES-E-STD-JSON-STRINGIFY-001" => ErrorCode::StdJsonStringify,
604            "AKRIBES-E-STD-REGEX-001" => ErrorCode::StdRegexInvalid,
605            "AKRIBES-E-OTHER" => {
606                // Issue #1039: `AKRIBES-E-OTHER` is the explicit
607                // unclassified-fallback bucket. Decoding a wire payload
608                // tagged with it is legal, but every occurrence is a hint
609                // that an upstream producer skipped a more specific code.
610                // Surface that via a warn-log so prod can attribute the
611                // drift to the producing component (server / SDK / runner)
612                // rather than silently flattening to `ScriptError`.
613                tracing::warn!(
614                    target: "akribes_types::error",
615                    wire_code = "AKRIBES-E-OTHER",
616                    "decoded fallback ErrorCode::Other from wire payload —                      the producing component skipped a more specific AKRIBES-E-* code"
617                );
618                ErrorCode::Other
619            }
620            _ => return None,
621        };
622        Some(code)
623    }
624
625    /// Default user-facing message for this code. Constructors should
626    /// override only when there is meaningfully more to say to the user
627    /// (e.g. embedding the offending value), not just to restate the
628    /// developer message.
629    pub fn default_user_message(&self) -> &'static str {
630        match self {
631            ErrorCode::UserCancelled => "The execution was cancelled.",
632            ErrorCode::ExecutionTimeout => {
633                "The workflow ran past its time budget. Try a smaller input, simplify the workflow, or raise AKRIBES_EXECUTION_TIMEOUT."
634            }
635            ErrorCode::CheckpointTimeout => {
636                "A checkpoint waited longer than its on_timeout window without a resume."
637            }
638            ErrorCode::ProviderRateLimit => {
639                "The model provider rate-limited the request. Wait a moment and retry; consider lowering concurrency."
640            }
641            ErrorCode::ProviderAuth => {
642                "The model provider rejected our credentials. Check the provider's API key and that the configured model is enabled."
643            }
644            ErrorCode::ProviderTokenLimit => {
645                "The prompt exceeds the model's context window. Reduce input length, use a larger-context model, or split the work."
646            }
647            ErrorCode::ProviderServer => {
648                "The model provider returned a server-side error. Retrying is usually appropriate."
649            }
650            ErrorCode::ProviderServer500 => {
651                "The model provider returned HTTP 500. The origin reported an internal error; a retry with a short backoff is usually appropriate."
652            }
653            ErrorCode::ProviderBadGateway502 => {
654                "The model provider returned HTTP 502 (bad gateway). The edge fronted a failing origin; retry with a short backoff."
655            }
656            ErrorCode::ProviderServiceUnavailable503 => {
657                "The model provider returned HTTP 503 (service unavailable). This is rate-limit-adjacent — honour Retry-After if the provider sent one, otherwise back off."
658            }
659            ErrorCode::ProviderGatewayTimeout504 => {
660                "The model provider returned HTTP 504 (gateway timeout). The upstream is slow or stuck; retry with a longer backoff before alerting."
661            }
662            ErrorCode::ProviderNetwork => {
663                "Could not reach the model provider (network/DNS/TLS/timeout). Retry; check connectivity if it persists."
664            }
665            ErrorCode::ProviderParse => {
666                "The model produced output that didn't fit the declared schema. Check the prompt and the type definition."
667            }
668            ErrorCode::ProviderOther => "The model provider failed with an unclassified error.",
669            ErrorCode::InternalPanic => {
670                "An internal Akribes task crashed (AKRIBES-E-INTERNAL-PANIC). \
671                 This is a bug. Report with the execution id at \
672                 https://github.com/PodestaAI/akribes-sdks/issues."
673            }
674            ErrorCode::InternalDroppedChannel => {
675                "An internal Akribes channel was closed unexpectedly (AKRIBES-E-INTERNAL-DROPPED-CHANNEL). \
676                 This is usually a bug. Report with the execution id at \
677                 https://github.com/PodestaAI/akribes-sdks/issues."
678            }
679            ErrorCode::InternalDeadlock => {
680                "Akribes detected a stuck workflow graph (AKRIBES-E-INTERNAL-DEADLOCK). \
681                 This is a compiler/engine bug. Report at \
682                 https://github.com/PodestaAI/akribes-sdks/issues."
683            }
684            ErrorCode::InternalTaskAborted => {
685                "An internal task was aborted unexpectedly (AKRIBES-E-INTERNAL-TASK-ABORTED). \
686                 This is usually a bug. Report at \
687                 https://github.com/PodestaAI/akribes-sdks/issues."
688            }
689            ErrorCode::InternalOther => {
690                "An unspecified internal error occurred (AKRIBES-E-INTERNAL-OTHER). \
691                 Report with the execution id at \
692                 https://github.com/PodestaAI/akribes-sdks/issues."
693            }
694            ErrorCode::ScriptError => {
695                "The workflow encountered a runtime error. Check task logic, types, and inputs."
696            }
697            ErrorCode::ScriptDepthExceeded => {
698                "Workflow call(...) chain exceeded the recursion cap. Refactor to reduce nesting."
699            }
700            ErrorCode::PartialRetryExhausted => {
701                "All validation retries on a partial-retry task were exhausted."
702            }
703            ErrorCode::AuthorRaise => {
704                "The workflow's failure path fired (the LLM returned an Unable or non-success variant the script mapped to fail)."
705            }
706            ErrorCode::ToolBudgetExceeded => {
707                "An agent exceeded its tool_budget cap. Increase the cap or reduce tool use."
708            }
709            ErrorCode::ToolApprovalProtocol => {
710                "Tool approval received an unexpected payload. This is a host-integration bug."
711            }
712            ErrorCode::ToolNoRegistry => {
713                "A tool call was attempted but no MCP registry is attached. Configure mcp_server / mcp_registry, or run via a host that wires the registry."
714            }
715            ErrorCode::ToolError => {
716                "An MCP tool returned an error. Check tool configuration and the upstream service."
717            }
718            ErrorCode::AgentToolsDoubleDispatch => {
719                "An agent invoked tools more than once in a single dispatch. Agents are single-round-trip — use a `loop` block for multi-turn tool use."
720            }
721            ErrorCode::ConfigMissing => {
722                "Required configuration is missing (API key, env var, or provider setup)."
723            }
724            ErrorCode::LoopOutputBudgetExceeded => {
725                "A `loop` block exceeded its `max_total_output_tokens` cap. Raise the cap or shorten per-turn output."
726            }
727            ErrorCode::LoopMultiCheckpoint => {
728                "A loop turn fired more than one checkpoint. One checkpoint per turn is the supported envelope — split them across turns or move one outside the loop."
729            }
730            ErrorCode::ContextOverflow => {
731                "The conversation exceeds the model's context window. Configure `compaction:` on the agent (e.g. `compaction: at 80%`) or pick a model with a larger window."
732            }
733            ErrorCode::ContextNativeUnsupported => {
734                "This model doesn't support server-side native compaction. Pick a capable model (opus_4_7, opus_4_6, sonnet_4_6, gpt_5_3_codex, gpt_5_5) or switch to a custom compaction chain."
735            }
736            ErrorCode::ContextCompactionExhausted => {
737                "The compaction chain ran every configured step and the conversation still exceeds the configured cap. Add a terminal step (truncate or native) or raise the cap."
738            }
739            ErrorCode::CompactionThresholdInvalid => {
740                "A compaction threshold is invalid. Use 1..=100 with `%`, or a positive absolute token count."
741            }
742            ErrorCode::CompactorSignature => {
743                "User-defined compactor must have signature `(history: str | list[message]) -> str | list[message]`."
744            }
745            ErrorCode::CompactionLoopOnly => {
746                "`compact_to_state(...)` may only appear inside a loop's `compaction:` block — move it under the loop, or use a different primitive on the agent."
747            }
748            ErrorCode::StdFormatMissing => {
749                "`std.format` is missing a placeholder key. Pass every `{name}` in the template via the `args` map."
750            }
751            ErrorCode::StdFormatSyntax => {
752                "`std.format` template has malformed brace syntax. Use `{name}` for placeholders, `{{` / `}}` for literal braces."
753            }
754            ErrorCode::StdJsonParse => "`std.json_parse` could not parse the input as JSON.",
755            ErrorCode::StdJsonStringify => {
756                "`std.json_stringify` could not serialise the value. Check for control-plane values (FatalError) and non-JSON shapes."
757            }
758            ErrorCode::StdRegexInvalid => {
759                "`std.regex_extract` was given an invalid regex pattern. Check the syntax against the Rust `regex` crate's rules."
760            }
761            ErrorCode::Other => "An error occurred. See the developer message for detail.",
762        }
763    }
764}
765
766/// Where in the workflow an error originated. Every field is optional —
767/// fill what you know, leave the rest. SDKs render whichever fields are
768/// present; downstream tools (logs, OTel) read them as structured
769/// attributes for filtering/aggregation.
770#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
771pub struct ErrorSource {
772    /// Workflow-author-declared task name (matches `task <name>` in source).
773    #[serde(skip_serializing_if = "Option::is_none")]
774    pub task: Option<String>,
775    /// Agent name from the matching `agent <name>` declaration.
776    #[serde(skip_serializing_if = "Option::is_none")]
777    pub agent: Option<String>,
778    /// Provider id when the error came from an LLM/provider call
779    /// (`anthropic`, `google`, `openai`, …).
780    #[serde(skip_serializing_if = "Option::is_none")]
781    pub provider: Option<String>,
782    /// Model alias (`opus_4_7`, `gpt_4o_mini`, …).
783    #[serde(skip_serializing_if = "Option::is_none")]
784    pub model: Option<String>,
785    /// MCP `<alias>.<tool>` reference when the error came from a tool call.
786    #[serde(skip_serializing_if = "Option::is_none")]
787    pub tool_ref: Option<String>,
788    /// Script name when the error came from a sub-script (`call(...)`).
789    #[serde(skip_serializing_if = "Option::is_none")]
790    pub script: Option<String>,
791    /// 1-indexed source line in the originating `.akr` file.
792    #[serde(skip_serializing_if = "Option::is_none")]
793    pub line: Option<u32>,
794}
795
796impl ErrorSource {
797    pub fn empty() -> Self {
798        Self::default()
799    }
800
801    pub fn is_empty(&self) -> bool {
802        self == &Self::default()
803    }
804
805    /// Builder helpers — chainable, infallible.
806    pub fn with_task(mut self, task: impl Into<String>) -> Self {
807        self.task = Some(task.into());
808        self
809    }
810    pub fn with_agent(mut self, agent: impl Into<String>) -> Self {
811        self.agent = Some(agent.into());
812        self
813    }
814    pub fn with_provider(mut self, provider: impl Into<String>) -> Self {
815        self.provider = Some(provider.into());
816        self
817    }
818    pub fn with_model(mut self, model: impl Into<String>) -> Self {
819        self.model = Some(model.into());
820        self
821    }
822    pub fn with_tool_ref(mut self, tool_ref: impl Into<String>) -> Self {
823        self.tool_ref = Some(tool_ref.into());
824        self
825    }
826    pub fn with_script(mut self, script: impl Into<String>) -> Self {
827        self.script = Some(script.into());
828        self
829    }
830    pub fn with_line(mut self, line: u32) -> Self {
831        self.line = Some(line);
832        self
833    }
834}
835
836/// Structured failure detail attached to a [`crate::value::Value::FatalError`] and to
837/// every `EngineEvent::Error`. Replaces the previous `(message, kind)`
838/// shape with a richer envelope so SDKs can decide what to do, users get
839/// actionable text, and developers get structured fields for OTel/logs.
840///
841/// Construction patterns:
842///
843/// * Simple: `ErrorDetail::from_kind(ErrorKind::ScriptError, "div by zero")`
844///   — pulls a generic code (`AKRIBES-E-SCRIPT-ERROR`) and the kind's default
845///   user message.
846/// * Specific: `ErrorDetail::new(ErrorCode::ProviderRateLimit, "...")`
847///   — code drives kind + default user_message via [`ErrorCode::kind`].
848/// * With retry hint: `.with_retry_after_ms(30_000)`.
849/// * With source: `.with_source(ErrorSource::default().with_task("foo"))`.
850#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
851pub struct ErrorDetail {
852    pub kind: ErrorKind,
853    pub code: ErrorCode,
854    /// Developer-facing message — full detail, may include sanitized
855    /// stack/protocol fragments. Always non-empty.
856    pub message: String,
857    /// User-facing single-paragraph summary + suggested action. Always
858    /// non-empty (defaults to [`ErrorCode::default_user_message`]).
859    pub user_message: String,
860    /// When the provider supplied a `Retry-After` (or equivalent), the
861    /// suggested wait in milliseconds. None when not known.
862    #[serde(skip_serializing_if = "Option::is_none")]
863    pub retry_after_ms: Option<u64>,
864    /// Where the error originated. Empty (`is_empty()`) when no
865    /// attribution is available.
866    #[serde(skip_serializing_if = "ErrorSource::is_empty", default)]
867    pub source: ErrorSource,
868}
869
870impl ErrorDetail {
871    /// Construct from a code + developer message. Kind and user_message
872    /// are derived from the code.
873    pub fn new(code: ErrorCode, message: impl Into<String>) -> Self {
874        Self {
875            kind: code.kind(),
876            code,
877            message: message.into(),
878            user_message: code.default_user_message().to_string(),
879            retry_after_ms: None,
880            source: ErrorSource::default(),
881        }
882    }
883
884    /// Construct from an existing kind when no specific code is yet
885    /// available. Picks the closest "Other" code for that kind, and for
886    /// rate-limit/server messages also extracts a `retry_after_ms` hint
887    /// when the upstream response embedded one.
888    pub fn from_kind(kind: ErrorKind, message: impl Into<String>) -> Self {
889        let message = message.into();
890        let code = match kind {
891            ErrorKind::RateLimit => ErrorCode::ProviderRateLimit,
892            ErrorKind::AuthError => ErrorCode::ProviderAuth,
893            ErrorKind::TokenLimit => ErrorCode::ProviderTokenLimit,
894            ErrorKind::ServerError500 => ErrorCode::ProviderServer500,
895            ErrorKind::BadGateway502 => ErrorCode::ProviderBadGateway502,
896            ErrorKind::ServiceUnavailable503 => ErrorCode::ProviderServiceUnavailable503,
897            ErrorKind::GatewayTimeout504 => ErrorCode::ProviderGatewayTimeout504,
898            ErrorKind::NetworkError => ErrorCode::ProviderNetwork,
899            ErrorKind::ParseError => ErrorCode::ProviderParse,
900            ErrorKind::Cancelled => ErrorCode::UserCancelled,
901            ErrorKind::Timeout => ErrorCode::ExecutionTimeout,
902            ErrorKind::ScriptError => ErrorCode::ScriptError,
903            ErrorKind::AuthorRaise => ErrorCode::AuthorRaise,
904            ErrorKind::ScriptDepthExceeded => ErrorCode::ScriptDepthExceeded,
905            ErrorKind::Panic => ErrorCode::InternalPanic,
906            ErrorKind::Internal => ErrorCode::InternalOther,
907        };
908        // Best-effort retry hint extraction for transient kinds. Cheap
909        // (single substring scan) and only relevant for kinds that
910        // would benefit from the hint.
911        let retry_after_ms = if matches!(
912            kind,
913            ErrorKind::RateLimit
914                | ErrorKind::ServerError500
915                | ErrorKind::BadGateway502
916                | ErrorKind::ServiceUnavailable503
917                | ErrorKind::GatewayTimeout504
918                | ErrorKind::NetworkError
919        ) {
920            ErrorCode::parse_retry_after_ms(&message)
921        } else {
922            None
923        };
924        Self {
925            kind,
926            code,
927            message,
928            user_message: code.default_user_message().to_string(),
929            retry_after_ms,
930            source: ErrorSource::default(),
931        }
932    }
933
934    /// Override the user-facing message. Use when the default for the
935    /// code isn't specific enough (e.g. embedding the offending value).
936    pub fn with_user_message(mut self, msg: impl Into<String>) -> Self {
937        self.user_message = msg.into();
938        self
939    }
940
941    pub fn with_retry_after_ms(mut self, ms: u64) -> Self {
942        self.retry_after_ms = Some(ms);
943        self
944    }
945
946    pub fn with_source(mut self, source: ErrorSource) -> Self {
947        self.source = source;
948        self
949    }
950
951    /// Convenience: builder-style task attribution.
952    pub fn with_task(mut self, task: impl Into<String>) -> Self {
953        self.source.task = Some(task.into());
954        self
955    }
956
957    /// Whether retrying as-is may succeed. Pulls from the kind plus
958    /// `retry_after_ms` (an explicit hint always implies retryable).
959    pub fn is_retryable(&self) -> bool {
960        self.retry_after_ms.is_some() || self.kind.is_transient()
961    }
962
963    pub fn suggested_action(&self) -> SuggestedAction {
964        self.kind.suggested_action()
965    }
966}
967
968impl std::fmt::Display for ErrorDetail {
969    /// Renders as `<wire-code>: <message>` so log lines and the legacy
970    /// "string error" shape stay readable. Use the structured fields
971    /// directly when constructing JSON wire payloads.
972    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
973        write!(f, "{}: {}", self.code.as_wire(), self.message)
974    }
975}
976
977impl std::fmt::Display for ErrorKind {
978    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
979        match self {
980            ErrorKind::RateLimit => write!(f, "rate limit"),
981            ErrorKind::AuthError => write!(f, "authentication error"),
982            ErrorKind::TokenLimit => write!(f, "token limit"),
983            ErrorKind::ServerError500 => write!(f, "server error (HTTP 500)"),
984            ErrorKind::BadGateway502 => write!(f, "bad gateway (HTTP 502)"),
985            ErrorKind::ServiceUnavailable503 => write!(f, "service unavailable (HTTP 503)"),
986            ErrorKind::GatewayTimeout504 => write!(f, "gateway timeout (HTTP 504)"),
987            ErrorKind::NetworkError => write!(f, "network error"),
988            ErrorKind::ParseError => write!(f, "parse error"),
989            ErrorKind::Cancelled => write!(f, "cancelled"),
990            ErrorKind::Timeout => write!(f, "timeout"),
991            ErrorKind::ScriptError => write!(f, "script error"),
992            ErrorKind::AuthorRaise => write!(f, "author raise"),
993            ErrorKind::ScriptDepthExceeded => write!(f, "script depth exceeded"),
994            ErrorKind::Panic => write!(f, "panic"),
995            ErrorKind::Internal => write!(f, "internal error"),
996        }
997    }
998}