ff_core/engine_error.rs
//! Typed engine-error surface (issue #58.6).
//!
//! **RFC-012 Stage 1a:** moved from `ff-sdk::engine_error` to
//! `ff-core::engine_error` so it becomes nameable by the
//! `EngineBackend` trait (which lives in `ff-core::engine_backend`) without
//! forcing a public-surface dependency from ff-core on ff-script. The
//! `ScriptError`-aware helpers (`From<ScriptError>`, `valkey_kind`,
//! `transport_script`, `transport_script_ref`) live in ff-script as
//! free functions (see `ff_script::engine_error_ext`) — ff-core owns
//! the enum shapes; ff-script owns the transport-downcast plumbing.
//!
//! # Mapping shape
//!
//! `ScriptError` lives in the `ff-script` crate (transport-adjacent).
//! `EngineError` lives here in `ff-core` and is what public SDK calls
//! return via `ff_sdk::SdkError::Engine`. The bidirectional mapping:
//!
//! * `From<ScriptError> for EngineError` — every `ScriptError` variant
//!   is classified into `NotFound` / `Validation` / `Contention` /
//!   `Conflict` / `State` / `Bug` / `Transport`. `Parse` + `Valkey`
//!   flow through `Transport { source: Box<ScriptError> }` so the
//!   underlying `ferriskey::ErrorKind` / parse detail is preserved.
//! * `DependencyAlreadyExists` is special: per the #58.6 design the
//!   variant carries the pre-existing [`EdgeSnapshot`] inline.
//!   Populating that field requires an extra round-trip (the Lua
//!   script only knows the edge_id), so plain `From<ScriptError>`
//!   returns a `Transport` fallback for that code — callers in the
//!   `stage_dependency` path use `ff_sdk::engine_error::enrich_dependency_conflict`
//!   to perform the follow-up `describe_edge` and upgrade the error
//!   before returning.
//!
//! # Exhaustiveness
//!
//! The top-level [`EngineError`] and every sub-kind are
//! `#[non_exhaustive]`. FF can add new Lua error codes in minors
//! without a breaking change to this surface — consumers that
//! `match` on a sub-kind must include a `_` arm.
38
39use crate::error::ErrorClass;
40
41/// Typed engine-error surface. See module docs.
42#[derive(Debug, thiserror::Error)]
43#[non_exhaustive]
44pub enum EngineError {
45 /// A uniquely-identified resource did not exist. `entity` is a
46 /// stable label (e.g. `"execution"`, `"flow"`, `"attempt"`) that
47 /// consumers can match without re-parsing a message.
48 #[error("not found: {entity}")]
49 NotFound { entity: &'static str },
50
51 /// Caller supplied a malformed, out-of-range, or otherwise
52 /// rejected input. `detail` carries the Lua-side payload (field
53 /// name, offending value, or CSV of missing tokens, depending on
54 /// `kind`).
55 #[error("validation: {kind:?}: {detail}")]
56 Validation {
57 kind: ValidationKind,
58 detail: String,
59 },
60
61 /// Transient conflict with another worker or with the current
62 /// state of the execution/flow. Caller should retry per
63 /// RFC-010 §10.7.
64 #[error("contention: {0:?}")]
65 Contention(ContentionKind),
66
67 /// Permanent conflict — the requested mutation conflicts with
68 /// an existing record (e.g. duplicate edge, cycle, already-in-flow).
69 /// Caller must not blindly retry.
70 #[error("conflict: {0:?}")]
71 Conflict(ConflictKind),
72
73 /// Legal but surprising state — lease expired, already-suspended,
74 /// duplicate-signal, budget-exceeded, etc. Per-variant semantics
75 /// documented on [`StateKind`].
76 #[error("state: {0:?}")]
77 State(StateKind),
78
79 /// FF-internal invariant violation that should not be reachable
80 /// in a correctly-behaving deployment. Consumers typically log
81 /// and surface as a 5xx.
82 #[error("bug: {0:?}")]
83 Bug(BugKind),
84
85 /// Backend transport fault or response-parse failure (RFC-012 §4.2
86 /// round-4 shape). Broadened in Stage 0 to carry `Box<dyn Error>`
87 /// so non-Valkey backends (Postgres, future) can route their
88 /// native transport errors through this variant without going via
89 /// `ScriptError`.
90 ///
91 /// * `backend` — static diagnostic label (`"valkey"`, `"postgres"`,
92 /// etc.). Kept `&'static str` to avoid heap alloc on construction.
93 /// * `source` — boxed error. For the Valkey backend this is
94 /// `ff_script::error::ScriptError`; downcast with
95 /// `source.downcast_ref::<ScriptError>()` to recover
96 /// `ferriskey::ErrorKind` / parse detail. Helper lives in
97 /// `ff_script::engine_error_ext::transport_script_ref`.
98 #[error("transport ({backend}): {source}")]
99 Transport {
100 backend: &'static str,
101 #[source]
102 source: Box<dyn std::error::Error + Send + Sync + 'static>,
103 },
104
105 /// Backend method not wired up yet (RFC-012 §4.2 K#7 holdover).
106 /// Returned by staged backend impls for methods that are known
107 /// types in the trait but not yet implemented. Graceful degradation
108 /// in place of `unimplemented!()` panics. Additive; does not
109 /// participate in the `From<ScriptError>` mapping.
110 #[error("unavailable: {op}")]
111 Unavailable { op: &'static str },
112
113 /// Backend-owned concurrency pool reached its ceiling (RFC-017 §6).
114 /// `pool` is a stable label (`"stream_ops"`, `"admin_rotate"`, …);
115 /// `max` is the pool ceiling; `retry_after_ms`, when set, is an
116 /// advisory retry hint the backend computed from its own back-
117 /// pressure signal. Maps to HTTP 429 at the `ff-server` boundary.
118 #[error("resource exhausted: pool={pool} max={max}")]
119 ResourceExhausted {
120 pool: &'static str,
121 max: u32,
122 retry_after_ms: Option<u32>,
123 },
124
125 /// An operation ran past its deadline (RFC-017 §5.4
126 /// `shutdown_prepare`). `op` is a stable label, `elapsed` is how
127 /// long the operation ran before the caller aborted. Additive;
128 /// call sites that previously did not emit this variant keep
129 /// emitting whatever they emitted before.
130 #[error("timeout: op={op} elapsed={elapsed:?}")]
131 Timeout {
132 op: &'static str,
133 elapsed: std::time::Duration,
134 },
135
136 /// RFC-019 Stage A — a subscription stream returned by
137 /// [`crate::engine_backend::EngineBackend::subscribe_lease_history`]
138 /// (or its siblings) observed a backend disconnect. The cursor is
139 /// the last event position the stream successfully yielded (or
140 /// [`crate::stream_subscribe::StreamCursor::empty`] if none was
141 /// observed). Consumers reconnect by re-calling the same
142 /// `subscribe_*` method with this cursor — that is the
143 /// owner-adjudicated disconnect contract (RFC-019 §Open
144 /// Questions #2).
145 ///
146 /// Terminal from the stream's perspective: the subscription ends
147 /// after yielding this error.
148 #[error("stream disconnected; reconnect with returned cursor")]
149 StreamDisconnected {
150 cursor: crate::stream_subscribe::StreamCursor,
151 },
152
153 /// RFC-019 Stage A — a subscription stream fell behind its bounded
154 /// queue and dropped events rather than blocking the producer.
155 /// Reserved for backends that explicitly surface lag (Stage B
156 /// lands the first call-site via `subscribe_instance_tags`); no
157 /// Stage A backend emits this today but consumers match on it to
158 /// future-proof reconciliation paths.
159 ///
160 /// Non-terminal: the stream continues after this error; callers
161 /// treat it as a "refresh from authoritative state" signal.
162 #[error("stream backpressure; events dropped")]
163 StreamBackpressure,
164
165 /// An inner [`EngineError`] wrapped with a call-site label so
166 /// operators triaging logs can see which op the error came from
167 /// without inferring from surrounding spans. Constructed via
168 /// [`backend_context`]; carries a lightweight string context
169 /// (e.g. `"renew: FCALL ff_renew_lease"`).
170 ///
171 /// Classification helpers (`ErrorClass`, `BackendErrorKind`,
172 /// etc.) transparently descend into `source` so a consumer that
173 /// matches on the wrapper arm keeps the same retry/terminal
174 /// semantics as the unwrapped inner error.
175 #[error("{context}: {source}")]
176 Contextual {
177 #[source]
178 source: Box<EngineError>,
179 context: String,
180 },
181}
182
183/// Wrap an [`EngineError`] with a call-site label when the error is
184/// a transport-family fault — `Transport` or `Unavailable`. Typed
185/// classifications (`NotFound`, `Validation`, `Contention`,
186/// `Conflict`, `State`, `Bug`) form the public contract boundary
187/// for consumers that `match` on the variant, so we return them
188/// unchanged. Repeated wraps on an already-`Contextual` error
189/// nest an additional layer; callers should wrap once per op
190/// boundary.
191///
192/// Promoted to ff-core so `ff-backend-valkey` can annotate its
193/// `EngineBackend` impls with the same context shape ff-sdk's
194/// snapshot helpers use (issue #154).
195pub fn backend_context(err: EngineError, context: impl Into<String>) -> EngineError {
196 match err {
197 EngineError::Transport { .. }
198 | EngineError::Unavailable { .. }
199 | EngineError::ResourceExhausted { .. }
200 | EngineError::Timeout { .. }
201 | EngineError::Contextual { .. } => EngineError::Contextual {
202 source: Box::new(err),
203 context: context.into(),
204 },
205 // Typed classifications are part of the public contract;
206 // wrapping them would break `match` call sites that inspect
207 // the inner variant (e.g. tests asserting
208 // `EngineError::Validation { kind: Corruption, .. }`).
209 other => other,
210 }
211}
212
/// Sub-kinds of a validation failure, 1:1 with the Lua validation
/// codes.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidationKind {
    /// Generic rejection of caller-supplied input; the detail names
    /// the field.
    InvalidInput,
    /// The worker's caps do not satisfy the execution's
    /// required_capabilities. `detail` is the sorted CSV of the
    /// missing tokens.
    CapabilityMismatch,
    /// The capability list is malformed or oversized.
    InvalidCapabilities,
    /// `policy_json` is not valid JSON, or is structurally wrong.
    InvalidPolicyJson,
    /// Signal payload > 64KB.
    PayloadTooLarge,
    /// The maximum number of signals per execution has been reached.
    SignalLimitExceeded,
    /// MAC verification of the waitpoint_key failed.
    InvalidWaitpointKey,
    /// HMAC verification of a bearer waitpoint_token failed (signal
    /// delivery path). Kept as its own variant so the REST layer can
    /// surface the Lua code `invalid_token` verbatim.
    InvalidToken,
    /// The pending waitpoint has no HMAC token field.
    WaitpointNotTokenBound,
    /// Frame > 64KB.
    RetentionLimitExceeded,
    /// Lease/attempt binding mismatch on suspend.
    InvalidLeaseForSuspend,
    /// The dependency edge was not found, or the dependency ref is
    /// invalid.
    InvalidDependency,
    /// Waitpoint/execution binding mismatch.
    InvalidWaitpointForExecution,
    /// The blocking reason is not recognized.
    InvalidBlockingReason,
    /// The stream ID offset is invalid.
    InvalidOffset,
    /// Authentication failed.
    Unauthorized,
    /// The budget scope is malformed.
    InvalidBudgetScope,
    /// Operator privileges are required.
    BudgetOverrideNotAllowed,
    /// The quota definition is malformed.
    InvalidQuotaSpec,
    /// A rotation kid must be non-empty and dot-free.
    InvalidKid,
    /// A rotation secret must be non-empty, even-length hex.
    InvalidSecretHex,
    /// Rotation grace_ms must be a non-negative integer.
    InvalidGraceMs,
    /// The tag key violates the reserved-namespace rule.
    InvalidTagKey,
    /// The stream frame type is not recognized.
    InvalidFrameType,
    /// On-disk corruption or protocol drift: an engine-owned hash /
    /// key came back with a field shape the decoder could not parse
    /// (a required field missing, a malformed timestamp, an unknown
    /// extra field, a cross-field identity mismatch, etc.). `detail`
    /// holds the decoder's diagnostic string — the specific field
    /// name and/or offending value — formatted as
    /// `"<context>: <field?>: <message>"`, so operators can locate
    /// the bad key without reparsing.
    ///
    /// Classified as `Terminal`: retrying the read yields the same
    /// bytes. Surface it to the operator; do not loop.
    Corruption,
    /// The [`crate::backend::Handle`] handed to a backend op was
    /// minted by a different backend (e.g. a Valkey-tagged handle
    /// passed to the Postgres backend). RFC-v0.7 Wave 1c:
    /// cross-backend migration tooling emits Handles from one backend
    /// that must not decode as the other, so backends detect the
    /// mismatch at op entry and return this variant.
    ///
    /// `detail` holds `"expected=<tag> actual=<tag>"` for operator
    /// diagnostics.
    HandleFromOtherBackend,
}
291
/// Sub-kinds of contention, all retryable per RFC-010 §10.7: the
/// caller should re-dispatch, or re-read and retry.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ContentionKind {
    /// Re-dispatch to `claim_resumed_execution`.
    UseClaimResumedExecution,
    /// Re-dispatch to `claim_execution`.
    NotAResumedExecution,
    /// The state changed since the grant; request a new grant.
    ExecutionNotLeaseable,
    /// Another worker holds the lease; request a different execution.
    LeaseConflict,
    /// The grant is missing or mismatched; request a new grant.
    InvalidClaimGrant,
    /// The grant's TTL elapsed; request a new grant.
    ClaimGrantExpired,
    /// No execution is available right now.
    NoEligibleExecution,
    /// The waitpoint may not exist yet; retry with backoff.
    WaitpointNotFound,
    /// Route to buffer_signal_for_pending_waitpoint.
    WaitpointPendingUseBufferScript,
    /// The graph revision changed; re-read adjacency and retry.
    StaleGraphRevision,
    /// The execution is not in the `active` state (lease superseded,
    /// etc.). Carries the Lua-side detail payload for replay
    /// reconciliation.
    ExecutionNotActive {
        terminal_outcome: String,
        lease_epoch: String,
        lifecycle_phase: String,
        attempt_id: String,
    },
    /// The state changed; the scheduler skips.
    ExecutionNotEligible,
    /// Removed by another scheduler.
    ExecutionNotInEligibleSet,
    /// Already reclaimed/cancelled; skip.
    ExecutionNotReclaimable,
    /// The target has no active lease (already
    /// revoked/expired/unowned).
    NoActiveLease,
    /// The window is full; the caller should backoff
    /// `retry_after_ms`.
    RateLimitExceeded,
    /// The concurrency cap was hit.
    ConcurrencyLimitExceeded,
    /// Returned after 3 attempts of a SERIALIZABLE transaction in
    /// Postgres (`cancel_flow`, `deliver_signal`, `suspend`). The
    /// caller falls back to the appropriate reconciler.
    ///
    /// Classified `Retryable` through the blanket `Contention(_)` arm
    /// so consumer retry-loops do not treat it as terminal; the
    /// reconciler backstop catches repeat exhaustion.
    RetryExhausted,
}
346
347/// Permanent conflict sub-kinds. Caller must reconcile rather than
348/// retry.
349#[derive(Debug, Clone, PartialEq, Eq)]
350#[non_exhaustive]
351pub enum ConflictKind {
352 /// Dependency edge already exists. Carries the pre-existing
353 /// [`EdgeSnapshot`] so callers implementing "409 on re-declare
354 /// with different kind/ref" don't need a follow-up read.
355 ///
356 /// Note: the plain `From<ScriptError> for EngineError` impl
357 /// cannot populate `existing` (that requires an async
358 /// `describe_edge` round trip), so it falls through to
359 /// `EngineError::Transport`. Callers on the `stage_dependency`
360 /// path use `ff_sdk::engine_error::enrich_dependency_conflict`
361 /// to perform the follow-up read and promote the error.
362 ///
363 /// [`EdgeSnapshot`]: crate::contracts::EdgeSnapshot
364 DependencyAlreadyExists {
365 existing: crate::contracts::EdgeSnapshot,
366 },
367 /// Edge would create a cycle.
368 CycleDetected,
369 /// Self-referencing edge (upstream == downstream).
370 SelfReferencingEdge,
371 /// Execution is already a member of another flow.
372 ExecutionAlreadyInFlow,
373 /// Waitpoint already exists (pending or active).
374 WaitpointAlreadyExists,
375 /// Budget already attached or conflicts.
376 BudgetAttachConflict,
377 /// Quota policy already attached.
378 QuotaAttachConflict,
379 /// Rotation: same kid already installed with a different secret.
380 /// String is the conflicting kid.
381 RotationConflict(String),
382 /// Invariant violation: active attempt already exists where one
383 /// was expected absent.
384 ActiveAttemptExists,
385}
386
/// Sub-kinds of a legal-but-surprising state. Semantics differ per
/// variant (some are benign no-ops, others are terminal); see the
/// RFC-010 §10.7 classification table.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StateKind {
    /// The lease was superseded by a reclaim.
    StaleLease,
    /// The lease TTL elapsed.
    LeaseExpired,
    /// An operator revoked the lease.
    LeaseRevoked,
    /// Already resumed/cancelled. No-op.
    ExecutionNotSuspended,
    /// An open suspension is already active. No-op.
    AlreadySuspended,
    /// The signal came too late — the waitpoint is already closed.
    WaitpointClosed,
    /// The execution is not suspended; there is no valid signal
    /// target.
    TargetNotSignalable,
    /// The signal was already delivered (dedup).
    DuplicateSignal,
    /// The resume conditions are not satisfied.
    ResumeConditionNotMet,
    /// The waitpoint is not in the pending state.
    WaitpointNotPending,
    /// The pending waitpoint aged out before the suspension
    /// committed.
    PendingWaitpointExpired,
    /// The waitpoint is not in an open state.
    WaitpointNotOpen,
    /// A non-terminal execution cannot be replayed.
    ExecutionNotTerminal,
    /// The replay limit was reached.
    MaxReplaysExhausted,
    /// The attempt is terminal; no appends.
    StreamClosed,
    /// Lease mismatch on stream append.
    StaleOwnerCannotAppend,
    /// A grant was already issued. Skip.
    GrantAlreadyExists,
    /// The execution is not in the specified flow.
    ExecutionNotInFlow,
    /// The flow is already in a terminal state.
    FlowAlreadyTerminal,
    /// The dependencies are not yet satisfied.
    DepsNotSatisfied,
    /// Not blocked by dependencies.
    NotBlockedByDeps,
    /// The execution is not runnable.
    NotRunnable,
    /// The execution is already terminal.
    Terminal,
    /// The hard budget limit was reached.
    BudgetExceeded,
    /// The soft budget limit was reached (warning; continue).
    BudgetSoftExceeded,
    /// The usage seq was already processed. No-op.
    OkAlreadyApplied,
    /// The attempt is not in the started state.
    AttemptNotStarted,
    /// The attempt already ended. No-op.
    AttemptAlreadyTerminal,
    /// Wrong state for a new attempt.
    ExecutionNotEligibleForAttempt,
    /// The execution is not terminal, or the replay limit was
    /// reached.
    ReplayNotAllowed,
    /// The retry limit was reached.
    MaxRetriesExhausted,
    /// Already closed. No-op.
    StreamAlreadyClosed,
    /// RFC-013 Stage 1d — the strict `suspend` path refuses the
    /// early-satisfied branch. The underlying backend outcome is
    /// [`crate::contracts::SuspendOutcome::AlreadySatisfied`]; only
    /// the SDK's strict `ClaimedTask::suspend` wrapper turns it into
    /// this error, while `ClaimedTask::try_suspend` returns the
    /// outcome directly.
    AlreadySatisfied,
}
464
/// Sub-kinds of an FF-internal invariant violation. None of these
/// should be reachable in a correctly-behaving deployment.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum BugKind {
    /// `attempt_not_in_created_state`: an internal sequencing error.
    AttemptNotInCreatedState,
}
473
474/// Backend-agnostic transport error carried across public
475/// ff-sdk / ff-server error surfaces (#88).
476///
477/// The `Valkey` variant is the only one populated today; additional
478/// variants (e.g. `Postgres`) will be added additively as other
479/// backends land. The enum is `#[non_exhaustive]` so consumers must
480/// include a wildcard arm.
481///
482/// Construction from the Valkey-native `ferriskey::Error` lives in
483/// `ff_backend_valkey::backend_error_from_ferriskey` — keeping that
484/// conversion outside ff-core preserves ff-core's ferriskey-free
485/// public surface.
486#[derive(Debug, Clone, thiserror::Error)]
487#[non_exhaustive]
488pub enum BackendError {
489 /// Valkey-backend transport failure. Carries a backend-agnostic
490 /// classification plus the backend-rendered message so downstream
491 /// consumers can inspect without depending on ferriskey.
492 #[error("valkey backend: {kind:?}: {message}")]
493 Valkey {
494 kind: BackendErrorKind,
495 message: String,
496 },
497
498 /// RFC-023 §4.5: the SQLite dev-only backend refused to construct
499 /// because `FF_DEV_MODE=1` was not set. Exact message text mirrors
500 /// §3.3 HTTP-path text so embedded and server paths give the same
501 /// actionable signal.
502 #[error(
503 "SqliteBackend requires FF_DEV_MODE=1 to activate. SQLite is \
504 dev-only; see https://github.com/avifenesh/FlowFabric/blob/main/docs/dev-harness.md \
505 for details."
506 )]
507 RequiresDevMode,
508}
509
510impl BackendError {
511 /// Returns the classified backend kind.
512 ///
513 /// Every variant maps to a [`BackendErrorKind`] — transport
514 /// variants return their carried `kind`, configuration/guard
515 /// variants return the closest-fitting classification (e.g.
516 /// [`BackendError::RequiresDevMode`] → [`BackendErrorKind::Protocol`]
517 /// because it is a configuration refusal, not a retryable
518 /// transport fault). Consumers needing to distinguish the
519 /// underlying variant should match directly on [`BackendError`];
520 /// `kind()` is the stable, classifier-only view.
521 pub fn kind(&self) -> BackendErrorKind {
522 match self {
523 Self::Valkey { kind, .. } => *kind,
524 // RFC-023: dev-mode guard refusal is a configuration/protocol
525 // fault, not a retryable transport condition.
526 Self::RequiresDevMode => BackendErrorKind::Protocol,
527 }
528 }
529
530 /// Return the backend-rendered message payload.
531 pub fn message(&self) -> &str {
532 match self {
533 Self::Valkey { message, .. } => message.as_str(),
534 Self::RequiresDevMode => {
535 "SqliteBackend requires FF_DEV_MODE=1 to activate. SQLite is \
536 dev-only; see https://github.com/avifenesh/FlowFabric/blob/main/docs/dev-harness.md \
537 for details."
538 }
539 }
540 }
541}
542
/// Classified backend transport errors, kept backend-agnostic on
/// purpose (#88). Each variant maps a family of native backend error
/// kinds into a stable, consumer-matchable shape.
///
/// Consumers requiring the exact native kind for a Valkey backend
/// must go through `ff_backend_valkey` explicitly; ff-sdk/ff-server's
/// public surface will only ever hand out [`BackendErrorKind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum BackendErrorKind {
    /// Network / I/O failure: the request may or may not have been
    /// processed. Typically retryable with backoff.
    Transport,
    /// Backend rejected the request on protocol / parse grounds. Not
    /// retryable without a fix.
    Protocol,
    /// Backend timed out responding to the request. Retryable.
    Timeout,
    /// Authentication / authorization failure. Not retryable.
    Auth,
    /// Cluster topology churn (MOVED, ASK, CLUSTERDOWN, MasterDown,
    /// CrossSlot, ConnectionNotFoundForRoute, AllConnectionsUnavailable).
    /// Retryable after topology settles.
    Cluster,
    /// Backend is temporarily busy loading state (e.g. Valkey
    /// `LOADING`). Retryable.
    BusyLoading,
    /// Backend indicates the referenced script/function does not
    /// exist. Typically handled by the caller via re-load; not
    /// reported as retryable by [`BackendErrorKind::is_retryable`].
    ScriptNotLoaded,
    /// Any other classified error from the backend. Fallback bucket
    /// for native kinds outside the curated set above.
    Other,
}

impl BackendErrorKind {
    /// Stable, lowercase snake_case label suitable for log fields /
    /// HTTP `kind` body slots (e.g. `"busy_loading"`,
    /// `"script_not_loaded"`). Guaranteed not to change across
    /// releases for the existing variants.
    pub fn as_stable_str(&self) -> &'static str {
        match self {
            Self::Transport => "transport",
            Self::Protocol => "protocol",
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::Cluster => "cluster",
            Self::BusyLoading => "busy_loading",
            Self::ScriptNotLoaded => "script_not_loaded",
            Self::Other => "other",
        }
    }

    /// Whether a caller should consider this kind retryable with
    /// backoff. Conservative — `Transport`, `Timeout`, `Cluster` and
    /// `BusyLoading` are retryable; `Protocol`, `Auth`,
    /// `ScriptNotLoaded` and `Other` are terminal.
    pub fn is_retryable(&self) -> bool {
        // Exhaustive on purpose: a newly added kind must be classified
        // here explicitly rather than defaulting to "not retryable".
        match self {
            Self::Transport | Self::Timeout | Self::Cluster | Self::BusyLoading => true,
            Self::Protocol | Self::Auth | Self::ScriptNotLoaded | Self::Other => false,
        }
    }
}
604
605impl EngineError {
606 /// Classify an [`EngineError`] using the underlying
607 /// [`ErrorClass`] table.
608 ///
609 /// **Transport classification in ff-core:** the inner source is
610 /// `Box<dyn std::error::Error>` which ff-core cannot downcast
611 /// without naming `ScriptError`. ff-core returns `Terminal` for
612 /// every `Transport` variant by default. Callers needing the
613 /// Retryable-on-transient-Valkey-error classification use
614 /// `ff_script::engine_error_ext::class` which downcasts to
615 /// `ScriptError` and delegates to `ScriptError::class`. ff-sdk's
616 /// public `SdkError::is_retryable` / `backend_kind` methods wire
617 /// the ff-script helper in so consumers retain the Phase-1
618 /// behavior transparently. (`backend_kind` was renamed from
619 /// `valkey_kind` in #88.)
620 pub fn class(&self) -> ErrorClass {
621 match self {
622 Self::NotFound { .. } => ErrorClass::Terminal,
623 Self::Validation { .. } => ErrorClass::Terminal,
624 Self::Contention(_) => ErrorClass::Retryable,
625 Self::Conflict(_) => ErrorClass::Terminal,
626 Self::State(StateKind::BudgetExceeded) => ErrorClass::Cooperative,
627 Self::State(
628 StateKind::ExecutionNotSuspended
629 | StateKind::AlreadySuspended
630 | StateKind::AlreadySatisfied
631 | StateKind::WaitpointClosed
632 | StateKind::DuplicateSignal
633 | StateKind::GrantAlreadyExists
634 | StateKind::OkAlreadyApplied
635 | StateKind::AttemptAlreadyTerminal
636 | StateKind::StreamAlreadyClosed
637 | StateKind::BudgetSoftExceeded
638 | StateKind::WaitpointNotOpen
639 | StateKind::WaitpointNotPending
640 | StateKind::PendingWaitpointExpired
641 | StateKind::NotBlockedByDeps
642 | StateKind::DepsNotSatisfied,
643 ) => ErrorClass::Informational,
644 Self::State(_) => ErrorClass::Terminal,
645 Self::Bug(_) => ErrorClass::Bug,
646 // ff-core cannot name ScriptError. Safe default: Terminal.
647 // ff-script's engine_error_ext::class upgrades to
648 // ScriptError::class when the inner source is a
649 // ScriptError.
650 Self::Transport { .. } => ErrorClass::Terminal,
651 // Unavailable is terminal at the call site — the method is
652 // not implemented; the caller must either fall back to a
653 // different code path or surface to the user.
654 Self::Unavailable { .. } => ErrorClass::Terminal,
655 // Resource exhaustion is retryable — the ceiling is a
656 // transient server-side gate; callers back off and try
657 // again. Mirrors RFC-017 §6 and ServerError::is_retryable
658 // for the pre-migration `ConcurrencyLimitExceeded` arm.
659 Self::ResourceExhausted { .. } => ErrorClass::Retryable,
660 // Timeouts surface as terminal from the caller's POV —
661 // the specific op exceeded its budget; a retry is the
662 // caller's decision, not the error's classification.
663 Self::Timeout { .. } => ErrorClass::Terminal,
664 // RFC-019 Stage A: StreamDisconnected is terminal for the
665 // current stream — consumer reconnects with the cursor,
666 // which is a caller decision, not a retry at this layer.
667 Self::StreamDisconnected { .. } => ErrorClass::Terminal,
668 // StreamBackpressure is informational: events were dropped
669 // but the stream continues. Caller reconciles via
670 // authoritative state.
671 Self::StreamBackpressure => ErrorClass::Informational,
672 // Descend into the wrapped error — context is diagnostic;
673 // classification follows the inner cause.
674 Self::Contextual { source, .. } => source.class(),
675 }
676 }
677}
678
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Transport` error around a synthetic I/O failure,
    /// tagged with the given backend label.
    fn simulated_transport(backend: &'static str) -> EngineError {
        let io_failure = std::io::Error::other("simulated transport error");
        EngineError::Transport {
            backend,
            source: Box::new(io_failure),
        }
    }

    #[test]
    fn class_contention_is_retryable() {
        let class = EngineError::Contention(ContentionKind::LeaseConflict).class();
        assert_eq!(class, ErrorClass::Retryable);
    }

    #[test]
    fn class_budget_exceeded_is_cooperative() {
        let class = EngineError::State(StateKind::BudgetExceeded).class();
        assert_eq!(class, ErrorClass::Cooperative);
    }

    #[test]
    fn class_duplicate_signal_is_informational() {
        let class = EngineError::State(StateKind::DuplicateSignal).class();
        assert_eq!(class, ErrorClass::Informational);
    }

    #[test]
    fn class_bug_variant() {
        let class = EngineError::Bug(BugKind::AttemptNotInCreatedState).class();
        assert_eq!(class, ErrorClass::Bug);
    }

    #[test]
    fn class_transport_defaults_terminal() {
        // Without a ScriptError downcast in ff-core, Transport stays
        // Terminal unless ff-script's engine_error_ext::class is used.
        assert_eq!(simulated_transport("test").class(), ErrorClass::Terminal);
    }

    #[test]
    fn unavailable_is_terminal() {
        let unavailable = EngineError::Unavailable { op: "foo" };
        assert_eq!(unavailable.class(), ErrorClass::Terminal);
    }

    #[test]
    fn backend_context_wraps_transport_and_preserves_typed() {
        let label = "renew: FCALL ff_renew_lease";

        // A Transport error gains the call-site label (issue #154).
        let rendered = backend_context(simulated_transport("valkey"), label).to_string();
        assert!(
            rendered.starts_with("renew: FCALL ff_renew_lease: transport (valkey): "),
            "expected context prefix, got: {rendered}"
        );

        // Unavailable is wrapped too, so callers can still filter on
        // the op.
        let unavailable = backend_context(EngineError::Unavailable { op: "x" }, "ctx");
        assert!(matches!(unavailable, EngineError::Contextual { .. }));

        // Typed classifications come back untouched, keeping existing
        // `match` call sites working.
        let corruption = EngineError::Validation {
            kind: ValidationKind::Corruption,
            detail: "bad".into(),
        };
        match backend_context(corruption, "describe_edge: HGETALL edge") {
            EngineError::Validation { kind, .. } => {
                assert_eq!(kind, ValidationKind::Corruption);
            }
            other => panic!("expected Validation, got {other:?}"),
        }

        let contention = EngineError::Contention(ContentionKind::LeaseConflict);
        assert_eq!(
            backend_context(contention, label).class(),
            ErrorClass::Retryable
        );
    }

    #[test]
    fn backend_error_kind_round_trip() {
        let backend_error = BackendError::Valkey {
            kind: BackendErrorKind::Transport,
            message: "connection reset".into(),
        };
        assert_eq!(backend_error.kind(), BackendErrorKind::Transport);
        assert_eq!(backend_error.message(), "connection reset");
    }

    #[test]
    fn backend_kind_stable_strings_fixed() {
        // Stability fence: these strings belong to the public contract
        // (log field values, HTTP body `kind` slots). A new variant is
        // additive; altering an existing string is a break.
        let pinned = [
            (BackendErrorKind::Transport, "transport"),
            (BackendErrorKind::Protocol, "protocol"),
            (BackendErrorKind::Timeout, "timeout"),
            (BackendErrorKind::Auth, "auth"),
            (BackendErrorKind::Cluster, "cluster"),
            (BackendErrorKind::BusyLoading, "busy_loading"),
            (BackendErrorKind::ScriptNotLoaded, "script_not_loaded"),
            (BackendErrorKind::Other, "other"),
        ];
        for (kind, label) in pinned {
            assert_eq!(kind.as_stable_str(), label);
        }
    }

    #[test]
    fn backend_kind_retryability() {
        let retryable = [
            BackendErrorKind::Transport,
            BackendErrorKind::Timeout,
            BackendErrorKind::Cluster,
            BackendErrorKind::BusyLoading,
        ];
        for k in retryable {
            assert!(k.is_retryable(), "{k:?} should be retryable");
        }

        let terminal = [
            BackendErrorKind::Protocol,
            BackendErrorKind::Auth,
            BackendErrorKind::ScriptNotLoaded,
            BackendErrorKind::Other,
        ];
        for k in terminal {
            assert!(!k.is_retryable(), "{k:?} should NOT be retryable");
        }
    }
}