Skip to main content

ff_core/
engine_error.rs

1//! Typed engine-error surface (issue #58.6).
2//!
3//! **RFC-012 Stage 1a:** moved from `ff-sdk::engine_error` to
4//! `ff-core::engine_error` so it becomes nameable by the
5//! `EngineBackend` trait (which lives in `ff-core::engine_backend`) without
6//! forcing a public-surface dependency from ff-core on ff-script. The
7//! [`ScriptError`]-aware helpers (`From<ScriptError>`, `valkey_kind`,
8//! `transport_script`, `transport_script_ref`) live in ff-script as
9//! free functions (see `ff_script::engine_error_ext`) — ff-core owns
10//! the enum shapes; ff-script owns the transport-downcast plumbing.
11//!
12//! # Mapping shape
13//!
14//! `ScriptError` lives in the `ff-script` crate (transport-adjacent).
15//! `EngineError` lives here in `ff-core` and is what public SDK calls
16//! return via `ff_sdk::SdkError::Engine`. The bidirectional mapping:
17//!
18//! * `From<ScriptError> for EngineError` — every `ScriptError` variant
19//!   is classified into `NotFound` / `Validation` / `Contention` /
20//!   `Conflict` / `State` / `Bug` / `Transport`. `Parse` + `Valkey`
21//!   flow through `Transport { source: Box<ScriptError> }` so the
22//!   underlying `ferriskey::ErrorKind` / parse detail is preserved.
23//! * `DependencyAlreadyExists` is special: per the #58.6 design the
24//!   variant carries the pre-existing [`EdgeSnapshot`] inline.
25//!   Populating that field requires an extra round-trip (the Lua
26//!   script only knows the edge_id), so plain `From<ScriptError>`
27//!   returns a `Transport` fallback for that code — callers in the
28//!   `stage_dependency` path use `ff_sdk::engine_error::enrich_dependency_conflict`
29//!   to perform the follow-up `describe_edge` and upgrade the error
30//!   before returning.
31//!
32//! # Exhaustiveness
33//!
34//! The top-level [`EngineError`] and every sub-kind are
35//! `#[non_exhaustive]`. FF can add new Lua error codes in minors
36//! without a breaking change to this surface — consumers that
37//! `match` on a sub-kind must include a `_` arm.
38
39use crate::error::ErrorClass;
40
41/// Typed engine-error surface. See module docs.
42#[derive(Debug, thiserror::Error)]
43#[non_exhaustive]
44pub enum EngineError {
45    /// A uniquely-identified resource did not exist. `entity` is a
46    /// stable label (e.g. `"execution"`, `"flow"`, `"attempt"`) that
47    /// consumers can match without re-parsing a message.
48    #[error("not found: {entity}")]
49    NotFound { entity: &'static str },
50
51    /// Caller supplied a malformed, out-of-range, or otherwise
52    /// rejected input. `detail` carries the Lua-side payload (field
53    /// name, offending value, or CSV of missing tokens, depending on
54    /// `kind`).
55    #[error("validation: {kind:?}: {detail}")]
56    Validation {
57        kind: ValidationKind,
58        detail: String,
59    },
60
61    /// Transient conflict with another worker or with the current
62    /// state of the execution/flow. Caller should retry per
63    /// RFC-010 §10.7.
64    #[error("contention: {0:?}")]
65    Contention(ContentionKind),
66
67    /// Permanent conflict — the requested mutation conflicts with
68    /// an existing record (e.g. duplicate edge, cycle, already-in-flow).
69    /// Caller must not blindly retry.
70    #[error("conflict: {0:?}")]
71    Conflict(ConflictKind),
72
73    /// Legal but surprising state — lease expired, already-suspended,
74    /// duplicate-signal, budget-exceeded, etc. Per-variant semantics
75    /// documented on [`StateKind`].
76    #[error("state: {0:?}")]
77    State(StateKind),
78
79    /// FF-internal invariant violation that should not be reachable
80    /// in a correctly-behaving deployment. Consumers typically log
81    /// and surface as a 5xx.
82    #[error("bug: {0:?}")]
83    Bug(BugKind),
84
85    /// Backend transport fault or response-parse failure (RFC-012 §4.2
86    /// round-4 shape). Broadened in Stage 0 to carry `Box<dyn Error>`
87    /// so non-Valkey backends (Postgres, future) can route their
88    /// native transport errors through this variant without going via
89    /// `ScriptError`.
90    ///
91    /// * `backend` — static diagnostic label (`"valkey"`, `"postgres"`,
92    ///   etc.). Kept `&'static str` to avoid heap alloc on construction.
93    /// * `source` — boxed error. For the Valkey backend this is
94    ///   `ff_script::error::ScriptError`; downcast with
95    ///   `source.downcast_ref::<ScriptError>()` to recover
96    ///   `ferriskey::ErrorKind` / parse detail. Helper lives in
97    ///   `ff_script::engine_error_ext::transport_script_ref`.
98    #[error("transport ({backend}): {source}")]
99    Transport {
100        backend: &'static str,
101        #[source]
102        source: Box<dyn std::error::Error + Send + Sync + 'static>,
103    },
104
105    /// Backend method not wired up yet (RFC-012 §4.2 K#7 holdover).
106    /// Returned by staged backend impls for methods that are known
107    /// types in the trait but not yet implemented. Graceful degradation
108    /// in place of `unimplemented!()` panics. Additive; does not
109    /// participate in the `From<ScriptError>` mapping.
110    #[error("unavailable: {op}")]
111    Unavailable { op: &'static str },
112
113    /// Backend-owned concurrency pool reached its ceiling (RFC-017 §6).
114    /// `pool` is a stable label (`"stream_ops"`, `"admin_rotate"`, …);
115    /// `max` is the pool ceiling; `retry_after_ms`, when set, is an
116    /// advisory retry hint the backend computed from its own back-
117    /// pressure signal. Maps to HTTP 429 at the `ff-server` boundary.
118    #[error("resource exhausted: pool={pool} max={max}")]
119    ResourceExhausted {
120        pool: &'static str,
121        max: u32,
122        retry_after_ms: Option<u32>,
123    },
124
125    /// An operation ran past its deadline (RFC-017 §5.4
126    /// `shutdown_prepare`). `op` is a stable label, `elapsed` is how
127    /// long the operation ran before the caller aborted. Additive;
128    /// call sites that previously did not emit this variant keep
129    /// emitting whatever they emitted before.
130    #[error("timeout: op={op} elapsed={elapsed:?}")]
131    Timeout {
132        op: &'static str,
133        elapsed: std::time::Duration,
134    },
135
136    /// RFC-019 Stage A — a subscription stream returned by
137    /// [`crate::engine_backend::EngineBackend::subscribe_lease_history`]
138    /// (or its siblings) observed a backend disconnect. The cursor is
139    /// the last event position the stream successfully yielded (or
140    /// [`crate::stream_subscribe::StreamCursor::empty`] if none was
141    /// observed). Consumers reconnect by re-calling the same
142    /// `subscribe_*` method with this cursor — that is the
143    /// owner-adjudicated disconnect contract (RFC-019 §Open
144    /// Questions #2).
145    ///
146    /// Terminal from the stream's perspective: the subscription ends
147    /// after yielding this error.
148    #[error("stream disconnected; reconnect with returned cursor")]
149    StreamDisconnected {
150        cursor: crate::stream_subscribe::StreamCursor,
151    },
152
153    /// RFC-019 Stage A — a subscription stream fell behind its bounded
154    /// queue and dropped events rather than blocking the producer.
155    /// Reserved for backends that explicitly surface lag (Stage B
156    /// lands the first call-site via `subscribe_instance_tags`); no
157    /// Stage A backend emits this today but consumers match on it to
158    /// future-proof reconciliation paths.
159    ///
160    /// Non-terminal: the stream continues after this error; callers
161    /// treat it as a "refresh from authoritative state" signal.
162    #[error("stream backpressure; events dropped")]
163    StreamBackpressure,
164
165    /// An inner [`EngineError`] wrapped with a call-site label so
166    /// operators triaging logs can see which op the error came from
167    /// without inferring from surrounding spans. Constructed via
168    /// [`backend_context`]; carries a lightweight string context
169    /// (e.g. `"renew: FCALL ff_renew_lease"`).
170    ///
171    /// Classification helpers (`ErrorClass`, `BackendErrorKind`,
172    /// etc.) transparently descend into `source` so a consumer that
173    /// matches on the wrapper arm keeps the same retry/terminal
174    /// semantics as the unwrapped inner error.
175    #[error("{context}: {source}")]
176    Contextual {
177        #[source]
178        source: Box<EngineError>,
179        context: String,
180    },
181}
182
183/// Wrap an [`EngineError`] with a call-site label when the error is
184/// a transport-family fault — `Transport` or `Unavailable`. Typed
185/// classifications (`NotFound`, `Validation`, `Contention`,
186/// `Conflict`, `State`, `Bug`) form the public contract boundary
187/// for consumers that `match` on the variant, so we return them
188/// unchanged. Repeated wraps on an already-`Contextual` error
189/// nest an additional layer; callers should wrap once per op
190/// boundary.
191///
192/// Promoted to ff-core so `ff-backend-valkey` can annotate its
193/// `EngineBackend` impls with the same context shape ff-sdk's
194/// snapshot helpers use (issue #154).
195pub fn backend_context(err: EngineError, context: impl Into<String>) -> EngineError {
196    match err {
197        EngineError::Transport { .. }
198        | EngineError::Unavailable { .. }
199        | EngineError::ResourceExhausted { .. }
200        | EngineError::Timeout { .. }
201        | EngineError::Contextual { .. } => EngineError::Contextual {
202            source: Box::new(err),
203            context: context.into(),
204        },
205        // Typed classifications are part of the public contract;
206        // wrapping them would break `match` call sites that inspect
207        // the inner variant (e.g. tests asserting
208        // `EngineError::Validation { kind: Corruption, .. }`).
209        other => other,
210    }
211}
212
/// Sub-kinds carried by [`EngineError::Validation`]; they correspond
/// 1:1 to the Lua validation codes.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ValidationKind {
    /// Generic rejection of caller-supplied input; the detail names
    /// the field.
    InvalidInput,
    /// The worker's capabilities do not cover the execution's
    /// `required_capabilities`. `detail` is the sorted CSV of the
    /// missing tokens.
    CapabilityMismatch,
    /// The capability list is malformed or oversized.
    InvalidCapabilities,
    /// `policy_json` is not valid JSON or has the wrong structure.
    InvalidPolicyJson,
    /// The signal payload is larger than 64KB.
    PayloadTooLarge,
    /// The per-execution signal maximum has been reached.
    SignalLimitExceeded,
    /// MAC verification of the `waitpoint_key` failed.
    InvalidWaitpointKey,
    /// HMAC verification of a bearer `waitpoint_token` failed on the
    /// signal delivery path. Kept separate so the REST layer can pass
    /// the Lua code `invalid_token` through verbatim.
    InvalidToken,
    /// The pending waitpoint carries no HMAC token field.
    WaitpointNotTokenBound,
    /// The frame is larger than 64KB.
    RetentionLimitExceeded,
    /// The lease/attempt binding did not match on suspend.
    InvalidLeaseForSuspend,
    /// The dependency edge was not found, or the dependency
    /// reference is invalid.
    InvalidDependency,
    /// The waitpoint/execution binding did not match.
    InvalidWaitpointForExecution,
    /// The blocking reason is not recognized.
    InvalidBlockingReason,
    /// The stream ID offset is invalid.
    InvalidOffset,
    /// Authentication failed.
    Unauthorized,
    /// The budget scope is malformed.
    InvalidBudgetScope,
    /// The operation requires operator privileges.
    BudgetOverrideNotAllowed,
    /// The quota definition is malformed.
    InvalidQuotaSpec,
    /// A rotation kid has to be non-empty and contain no dots.
    InvalidKid,
    /// A rotation secret has to be non-empty, even-length hex.
    InvalidSecretHex,
    /// A rotation `grace_ms` has to be a non-negative integer.
    InvalidGraceMs,
    /// The tag key breaks the reserved-namespace rule.
    InvalidTagKey,
    /// The stream frame type is not recognized.
    InvalidFrameType,
    /// On-disk corruption or protocol drift: an engine-owned hash or
    /// key came back in a shape the decoder could not parse (a
    /// required field missing, a malformed timestamp, an unknown
    /// extra field, a cross-field identity mismatch, …). `detail` is
    /// the decoder's diagnostic, formatted as
    /// `"<context>: <field?>: <message>"`, which gives operators the
    /// field name and/or offending value needed to find the bad key.
    ///
    /// Classified as `Terminal`: re-reading returns the same bytes,
    /// so surface it to the operator rather than looping.
    Corruption,
    /// The [`crate::backend::Handle`] handed to a backend op was
    /// minted by a different backend, e.g. a Valkey-tagged handle
    /// given to the Postgres backend. RFC-v0.7 Wave 1c: cross-backend
    /// migration tooling produces Handles from one backend that must
    /// not decode as the other, so each backend checks at op entry
    /// and returns this variant on a mismatch.
    ///
    /// `detail` holds `"expected=<tag> actual=<tag>"` for operator
    /// diagnostics.
    HandleFromOtherBackend,
}
291
/// Sub-kinds carried by [`EngineError::Contention`]. All of them are
/// retryable per RFC-010 §10.7: the caller re-dispatches, or re-reads
/// and tries again.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ContentionKind {
    /// Dispatch to `claim_resumed_execution` instead.
    UseClaimResumedExecution,
    /// Dispatch to `claim_execution` instead.
    NotAResumedExecution,
    /// The state moved on after the grant was issued; ask for a new
    /// grant.
    ExecutionNotLeaseable,
    /// A different worker holds the lease; ask for another
    /// execution.
    LeaseConflict,
    /// The grant is missing or does not match; ask for a new grant.
    InvalidClaimGrant,
    /// The grant's TTL ran out; ask for a new grant.
    ClaimGrantExpired,
    /// Nothing is available to execute right now.
    NoEligibleExecution,
    /// The waitpoint may not have been created yet; retry with
    /// backoff.
    WaitpointNotFound,
    /// Route through `buffer_signal_for_pending_waitpoint`.
    WaitpointPendingUseBufferScript,
    /// The graph revision moved; re-read adjacency and retry.
    StaleGraphRevision,
    /// The execution is not in the `active` state (its lease was
    /// superseded, etc.). The fields are the Lua-side detail payload,
    /// kept for replay reconciliation.
    ExecutionNotActive {
        terminal_outcome: String,
        lease_epoch: String,
        lifecycle_phase: String,
        attempt_id: String,
    },
    /// The state changed; the scheduler skips this execution.
    ExecutionNotEligible,
    /// Another scheduler already removed it.
    ExecutionNotInEligibleSet,
    /// Already reclaimed or cancelled; skip.
    ExecutionNotReclaimable,
    /// The target holds no active lease (already revoked, expired,
    /// or unowned).
    NoActiveLease,
    /// The window is full; the caller should back off for
    /// `retry_after_ms`.
    RateLimitExceeded,
    /// The concurrency cap was hit.
    ConcurrencyLimitExceeded,
    /// A SERIALIZABLE Postgres transaction (`cancel_flow`,
    /// `deliver_signal`, `suspend`) failed 3 attempts in a row. The
    /// caller falls back to the matching reconciler.
    ///
    /// The blanket `Contention(_)` arm classifies this as
    /// `Retryable`, so consumer retry loops do not treat it as
    /// terminal; repeated exhaustion is caught by the reconciler
    /// backstop.
    RetryExhausted,
}
346
347/// Permanent conflict sub-kinds. Caller must reconcile rather than
348/// retry.
349#[derive(Debug, Clone, PartialEq, Eq)]
350#[non_exhaustive]
351pub enum ConflictKind {
352    /// Dependency edge already exists. Carries the pre-existing
353    /// [`EdgeSnapshot`] so callers implementing "409 on re-declare
354    /// with different kind/ref" don't need a follow-up read.
355    ///
356    /// Note: the plain `From<ScriptError> for EngineError` impl
357    /// cannot populate `existing` (that requires an async
358    /// `describe_edge` round trip), so it falls through to
359    /// `EngineError::Transport`. Callers on the `stage_dependency`
360    /// path use `ff_sdk::engine_error::enrich_dependency_conflict`
361    /// to perform the follow-up read and promote the error.
362    ///
363    /// [`EdgeSnapshot`]: crate::contracts::EdgeSnapshot
364    DependencyAlreadyExists {
365        existing: crate::contracts::EdgeSnapshot,
366    },
367    /// Edge would create a cycle.
368    CycleDetected,
369    /// Self-referencing edge (upstream == downstream).
370    SelfReferencingEdge,
371    /// Execution is already a member of another flow.
372    ExecutionAlreadyInFlow,
373    /// Waitpoint already exists (pending or active).
374    WaitpointAlreadyExists,
375    /// Budget already attached or conflicts.
376    BudgetAttachConflict,
377    /// Quota policy already attached.
378    QuotaAttachConflict,
379    /// Rotation: same kid already installed with a different secret.
380    /// String is the conflicting kid.
381    RotationConflict(String),
382    /// Invariant violation: active attempt already exists where one
383    /// was expected absent.
384    ActiveAttemptExists,
385}
386
/// Sub-kinds carried by [`EngineError::State`]: legal but surprising
/// states. Semantics differ per variant — some are harmless no-ops,
/// others are terminal — so check the classification table in
/// RFC-010 §10.7.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum StateKind {
    /// A reclaim superseded the lease.
    StaleLease,
    /// The lease's TTL ran out.
    LeaseExpired,
    /// An operator revoked the lease.
    LeaseRevoked,
    /// The execution was already resumed or cancelled. No-op.
    ExecutionNotSuspended,
    /// An open suspension is already active. No-op.
    AlreadySuspended,
    /// The signal arrived too late; the waitpoint is already closed.
    WaitpointClosed,
    /// The execution is not suspended, so there is no valid signal
    /// target.
    TargetNotSignalable,
    /// The signal was delivered before (dedup).
    DuplicateSignal,
    /// The resume conditions are not satisfied.
    ResumeConditionNotMet,
    /// The waitpoint is not in the pending state.
    WaitpointNotPending,
    /// The pending waitpoint aged out before the suspension
    /// committed.
    PendingWaitpointExpired,
    /// The waitpoint is not in an open state.
    WaitpointNotOpen,
    /// A non-terminal execution cannot be replayed.
    ExecutionNotTerminal,
    /// The replay limit has been reached.
    MaxReplaysExhausted,
    /// The attempt is terminal; appends are refused.
    StreamClosed,
    /// The lease did not match on stream append.
    StaleOwnerCannotAppend,
    /// A grant has already been issued. Skip.
    GrantAlreadyExists,
    /// The execution is not part of the specified flow.
    ExecutionNotInFlow,
    /// The flow is already in a terminal state.
    FlowAlreadyTerminal,
    /// The dependencies are not satisfied yet.
    DepsNotSatisfied,
    /// The execution is not blocked by dependencies.
    NotBlockedByDeps,
    /// The execution is not runnable.
    NotRunnable,
    /// The execution is already terminal.
    Terminal,
    /// The hard budget limit has been reached.
    BudgetExceeded,
    /// The soft budget limit has been reached (a warning; continue).
    BudgetSoftExceeded,
    /// This usage seq was processed before. No-op.
    OkAlreadyApplied,
    /// The attempt is not in the started state.
    AttemptNotStarted,
    /// The attempt has already ended. No-op.
    AttemptAlreadyTerminal,
    /// The execution is in the wrong state for a new attempt.
    ExecutionNotEligibleForAttempt,
    /// The execution is not terminal, or the replay limit has been
    /// reached.
    ReplayNotAllowed,
    /// The retry limit has been reached.
    MaxRetriesExhausted,
    /// The stream is already closed. No-op.
    StreamAlreadyClosed,
    /// RFC-013 Stage 1d — the strict `suspend` path rejects the
    /// early-satisfied branch. The backend itself reports
    /// [`crate::contracts::SuspendOutcome::AlreadySatisfied`]; it is
    /// only the SDK's strict `ClaimedTask::suspend` wrapper that
    /// turns that outcome into this error, whereas
    /// `ClaimedTask::try_suspend` hands the outcome back as-is.
    AlreadySatisfied,
}
464
/// Sub-kinds carried by [`EngineError::Bug`]: violations of
/// FF-internal invariants, not expected to be reachable in a
/// correctly-behaving deployment.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum BugKind {
    /// Lua code `attempt_not_in_created_state`: an internal
    /// sequencing error.
    AttemptNotInCreatedState,
}
473
474/// Backend-agnostic transport error carried across public
475/// ff-sdk / ff-server error surfaces (#88).
476///
477/// The `Valkey` variant is the only one populated today; additional
478/// variants (e.g. `Postgres`) will be added additively as other
479/// backends land. The enum is `#[non_exhaustive]` so consumers must
480/// include a wildcard arm.
481///
482/// Construction from the Valkey-native `ferriskey::Error` lives in
483/// `ff_backend_valkey::backend_error_from_ferriskey` — keeping that
484/// conversion outside ff-core preserves ff-core's ferriskey-free
485/// public surface.
486#[derive(Debug, Clone, thiserror::Error)]
487#[non_exhaustive]
488pub enum BackendError {
489    /// Valkey-backend transport failure. Carries a backend-agnostic
490    /// classification plus the backend-rendered message so downstream
491    /// consumers can inspect without depending on ferriskey.
492    #[error("valkey backend: {kind:?}: {message}")]
493    Valkey {
494        kind: BackendErrorKind,
495        message: String,
496    },
497}
498
499impl BackendError {
500    /// Returns the classified backend kind if this error is a Valkey
501    /// transport fault. Forward-compatible with future backends:
502    /// non-Valkey variants return `None` on a call that names only the
503    /// Valkey kind; code that wants a backend-specific view should
504    /// match directly on [`BackendError`].
505    pub fn kind(&self) -> BackendErrorKind {
506        match self {
507            Self::Valkey { kind, .. } => *kind,
508        }
509    }
510
511    /// Return the backend-rendered message payload.
512    pub fn message(&self) -> &str {
513        match self {
514            Self::Valkey { message, .. } => message.as_str(),
515        }
516    }
517}
518
/// Classified backend transport errors, kept backend-agnostic on
/// purpose (#88). Each variant maps a family of native backend error
/// kinds into a stable, consumer-matchable shape.
///
/// Consumers requiring the exact native kind for a Valkey backend
/// must go through `ff_backend_valkey` explicitly; ff-sdk/ff-server's
/// public surface will only ever hand out [`BackendErrorKind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum BackendErrorKind {
    /// Network / I/O failure: the request may or may not have been
    /// processed. Typically retryable with backoff.
    Transport,
    /// Backend rejected the request on protocol / parse grounds. Not
    /// retryable without a fix.
    Protocol,
    /// Backend timed out responding to the request. Retryable.
    Timeout,
    /// Authentication / authorization failure. Not retryable.
    Auth,
    /// Cluster topology churn (MOVED, ASK, CLUSTERDOWN, MasterDown,
    /// CrossSlot, ConnectionNotFoundForRoute, AllConnectionsUnavailable).
    /// Retryable after topology settles.
    Cluster,
    /// Backend is temporarily busy loading state (e.g. Valkey
    /// `LOADING`). Retryable.
    BusyLoading,
    /// Backend indicates the referenced script/function does not
    /// exist. Typically handled by the caller via re-load; not
    /// reported as retryable by [`BackendErrorKind::is_retryable`].
    ScriptNotLoaded,
    /// Any other classified error from the backend. Fallback bucket
    /// for native kinds outside the curated set above.
    Other,
}

impl BackendErrorKind {
    /// Stable, lowercase snake_case label suitable for log fields and
    /// HTTP `kind` body slots (multi-word kinds use underscores, e.g.
    /// `"busy_loading"`, `"script_not_loaded"`). Guaranteed not to
    /// change across releases for the existing variants.
    pub fn as_stable_str(&self) -> &'static str {
        match self {
            Self::Transport => "transport",
            Self::Protocol => "protocol",
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::Cluster => "cluster",
            Self::BusyLoading => "busy_loading",
            Self::ScriptNotLoaded => "script_not_loaded",
            Self::Other => "other",
        }
    }

    /// Whether a caller should consider this kind retryable with
    /// backoff.
    ///
    /// Returns `true` for `Transport`, `Timeout`, `Cluster` and
    /// `BusyLoading`. Conservative for everything else: `Protocol`,
    /// `Auth`, `ScriptNotLoaded` and `Other` all return `false`.
    pub fn is_retryable(&self) -> bool {
        matches!(
            self,
            Self::Transport | Self::Timeout | Self::Cluster | Self::BusyLoading
        )
    }
}
580
581impl EngineError {
582    /// Classify an [`EngineError`] using the underlying
583    /// [`ErrorClass`] table.
584    ///
585    /// **Transport classification in ff-core:** the inner source is
586    /// `Box<dyn std::error::Error>` which ff-core cannot downcast
587    /// without naming `ScriptError`. ff-core returns `Terminal` for
588    /// every `Transport` variant by default. Callers needing the
589    /// Retryable-on-transient-Valkey-error classification use
590    /// `ff_script::engine_error_ext::class` which downcasts to
591    /// `ScriptError` and delegates to `ScriptError::class`. ff-sdk's
592    /// public `SdkError::is_retryable` / `backend_kind` methods wire
593    /// the ff-script helper in so consumers retain the Phase-1
594    /// behavior transparently. (`backend_kind` was renamed from
595    /// `valkey_kind` in #88.)
596    pub fn class(&self) -> ErrorClass {
597        match self {
598            Self::NotFound { .. } => ErrorClass::Terminal,
599            Self::Validation { .. } => ErrorClass::Terminal,
600            Self::Contention(_) => ErrorClass::Retryable,
601            Self::Conflict(_) => ErrorClass::Terminal,
602            Self::State(StateKind::BudgetExceeded) => ErrorClass::Cooperative,
603            Self::State(
604                StateKind::ExecutionNotSuspended
605                | StateKind::AlreadySuspended
606                | StateKind::AlreadySatisfied
607                | StateKind::WaitpointClosed
608                | StateKind::DuplicateSignal
609                | StateKind::GrantAlreadyExists
610                | StateKind::OkAlreadyApplied
611                | StateKind::AttemptAlreadyTerminal
612                | StateKind::StreamAlreadyClosed
613                | StateKind::BudgetSoftExceeded
614                | StateKind::WaitpointNotOpen
615                | StateKind::WaitpointNotPending
616                | StateKind::PendingWaitpointExpired
617                | StateKind::NotBlockedByDeps
618                | StateKind::DepsNotSatisfied,
619            ) => ErrorClass::Informational,
620            Self::State(_) => ErrorClass::Terminal,
621            Self::Bug(_) => ErrorClass::Bug,
622            // ff-core cannot name ScriptError. Safe default: Terminal.
623            // ff-script's engine_error_ext::class upgrades to
624            // ScriptError::class when the inner source is a
625            // ScriptError.
626            Self::Transport { .. } => ErrorClass::Terminal,
627            // Unavailable is terminal at the call site — the method is
628            // not implemented; the caller must either fall back to a
629            // different code path or surface to the user.
630            Self::Unavailable { .. } => ErrorClass::Terminal,
631            // Resource exhaustion is retryable — the ceiling is a
632            // transient server-side gate; callers back off and try
633            // again. Mirrors RFC-017 §6 and ServerError::is_retryable
634            // for the pre-migration `ConcurrencyLimitExceeded` arm.
635            Self::ResourceExhausted { .. } => ErrorClass::Retryable,
636            // Timeouts surface as terminal from the caller's POV —
637            // the specific op exceeded its budget; a retry is the
638            // caller's decision, not the error's classification.
639            Self::Timeout { .. } => ErrorClass::Terminal,
640            // RFC-019 Stage A: StreamDisconnected is terminal for the
641            // current stream — consumer reconnects with the cursor,
642            // which is a caller decision, not a retry at this layer.
643            Self::StreamDisconnected { .. } => ErrorClass::Terminal,
644            // StreamBackpressure is informational: events were dropped
645            // but the stream continues. Caller reconciles via
646            // authoritative state.
647            Self::StreamBackpressure => ErrorClass::Informational,
648            // Descend into the wrapped error — context is diagnostic;
649            // classification follows the inner cause.
650            Self::Contextual { source, .. } => source.class(),
651        }
652    }
653}
654
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Transport` error around a synthetic I/O failure,
    /// labelled with `backend`.
    fn simulated_transport(backend: &'static str) -> EngineError {
        EngineError::Transport {
            backend,
            source: Box::new(std::io::Error::other("simulated transport error")),
        }
    }

    #[test]
    fn class_contention_is_retryable() {
        let class = EngineError::Contention(ContentionKind::LeaseConflict).class();
        assert_eq!(class, ErrorClass::Retryable);
    }

    #[test]
    fn class_budget_exceeded_is_cooperative() {
        let class = EngineError::State(StateKind::BudgetExceeded).class();
        assert_eq!(class, ErrorClass::Cooperative);
    }

    #[test]
    fn class_duplicate_signal_is_informational() {
        let class = EngineError::State(StateKind::DuplicateSignal).class();
        assert_eq!(class, ErrorClass::Informational);
    }

    #[test]
    fn class_bug_variant() {
        let class = EngineError::Bug(BugKind::AttemptNotInCreatedState).class();
        assert_eq!(class, ErrorClass::Bug);
    }

    #[test]
    fn class_transport_defaults_terminal() {
        // Without the ScriptError downcast that lives in ff-script
        // (engine_error_ext::class), ff-core reports Transport as
        // Terminal.
        assert_eq!(simulated_transport("test").class(), ErrorClass::Terminal);
    }

    #[test]
    fn unavailable_is_terminal() {
        let err = EngineError::Unavailable { op: "foo" };
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    #[test]
    fn backend_context_wraps_transport_and_preserves_typed() {
        // Transport picks up the call-site label (issue #154).
        let labelled = backend_context(simulated_transport("valkey"), "renew: FCALL ff_renew_lease");
        let rendered = labelled.to_string();
        assert!(
            rendered.starts_with("renew: FCALL ff_renew_lease: transport (valkey): "),
            "expected context prefix, got: {rendered}"
        );

        // Unavailable is wrapped as well, so callers can still
        // filter on the op.
        let labelled = backend_context(EngineError::Unavailable { op: "x" }, "ctx");
        assert!(matches!(labelled, EngineError::Contextual { .. }));

        // Typed classifications come back untouched, which keeps
        // existing `match` call sites working.
        let passthrough = backend_context(
            EngineError::Validation {
                kind: ValidationKind::Corruption,
                detail: "bad".into(),
            },
            "describe_edge: HGETALL edge",
        );
        match &passthrough {
            EngineError::Validation { kind, .. } => {
                assert_eq!(*kind, ValidationKind::Corruption);
            }
            other => panic!("expected Validation, got {other:?}"),
        }

        let contention = backend_context(
            EngineError::Contention(ContentionKind::LeaseConflict),
            "renew: FCALL ff_renew_lease",
        );
        assert_eq!(contention.class(), ErrorClass::Retryable);
    }

    #[test]
    fn backend_error_kind_round_trip() {
        let kind = BackendErrorKind::Transport;
        let be = BackendError::Valkey {
            kind,
            message: String::from("connection reset"),
        };
        assert_eq!(be.kind(), kind);
        assert_eq!(be.message(), "connection reset");
    }

    #[test]
    fn backend_kind_stable_strings_fixed() {
        // Stability fence: these labels are public contract (log
        // field values, HTTP body `kind` slots). New variants are
        // additive; changing a string that already exists is a
        // breaking change.
        let expected = [
            (BackendErrorKind::Transport, "transport"),
            (BackendErrorKind::Protocol, "protocol"),
            (BackendErrorKind::Timeout, "timeout"),
            (BackendErrorKind::Auth, "auth"),
            (BackendErrorKind::Cluster, "cluster"),
            (BackendErrorKind::BusyLoading, "busy_loading"),
            (BackendErrorKind::ScriptNotLoaded, "script_not_loaded"),
            (BackendErrorKind::Other, "other"),
        ];
        for (kind, label) in expected {
            assert_eq!(kind.as_stable_str(), label);
        }
    }

    #[test]
    fn backend_kind_retryability() {
        let retryable = [
            BackendErrorKind::Transport,
            BackendErrorKind::Timeout,
            BackendErrorKind::Cluster,
            BackendErrorKind::BusyLoading,
        ];
        let terminal = [
            BackendErrorKind::Protocol,
            BackendErrorKind::Auth,
            BackendErrorKind::ScriptNotLoaded,
            BackendErrorKind::Other,
        ];

        for k in retryable {
            assert!(k.is_retryable(), "{k:?} should be retryable");
        }
        for k in terminal {
            assert!(!k.is_retryable(), "{k:?} should NOT be retryable");
        }
    }
}