ff_core/engine_error.rs
1//! Typed engine-error surface (issue #58.6).
2//!
3//! **RFC-012 Stage 1a:** moved from `ff-sdk::engine_error` to
4//! `ff-core::engine_error` so it becomes nameable by the
5//! `EngineBackend` trait (which lives in `ff-core::engine_backend`) without
6//! forcing a public-surface dependency from ff-core on ff-script. The
//! `ScriptError`-aware helpers (`From<ScriptError>`, `valkey_kind`,
8//! `transport_script`, `transport_script_ref`) live in ff-script as
9//! free functions (see `ff_script::engine_error_ext`) — ff-core owns
10//! the enum shapes; ff-script owns the transport-downcast plumbing.
11//!
12//! # Mapping shape
13//!
14//! `ScriptError` lives in the `ff-script` crate (transport-adjacent).
15//! `EngineError` lives here in `ff-core` and is what public SDK calls
16//! return via `ff_sdk::SdkError::Engine`. The bidirectional mapping:
17//!
18//! * `From<ScriptError> for EngineError` — every `ScriptError` variant
19//! is classified into `NotFound` / `Validation` / `Contention` /
20//! `Conflict` / `State` / `Bug` / `Transport`. `Parse` + `Valkey`
//!   flow through `Transport { backend, source }` — with the
//!   `ScriptError` boxed as `source` — so the underlying
//!   `ferriskey::ErrorKind` / parse detail is preserved.
23//! * `DependencyAlreadyExists` is special: per the #58.6 design the
//!   variant carries the pre-existing
//!   [`EdgeSnapshot`](crate::contracts::EdgeSnapshot) inline.
25//! Populating that field requires an extra round-trip (the Lua
26//! script only knows the edge_id), so plain `From<ScriptError>`
27//! returns a `Transport` fallback for that code — callers in the
28//! `stage_dependency` path use `ff_sdk::engine_error::enrich_dependency_conflict`
29//! to perform the follow-up `describe_edge` and upgrade the error
30//! before returning.
31//!
32//! # Exhaustiveness
33//!
34//! The top-level [`EngineError`] and every sub-kind are
35//! `#[non_exhaustive]`. FF can add new Lua error codes in minors
36//! without a breaking change to this surface — consumers that
37//! `match` on a sub-kind must include a `_` arm.
38
39use crate::error::ErrorClass;
40
/// Typed engine-error surface. See module docs.
///
/// Every variant maps onto an [`ErrorClass`] through
/// [`EngineError::class`]; the per-variant docs below state the class
/// that method returns. Sub-kind payloads are rendered with their
/// `Debug` representation in the `Display` output (`{0:?}` /
/// `{kind:?}`).
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum EngineError {
    /// A uniquely-identified resource did not exist. `entity` is a
    /// stable label (e.g. `"execution"`, `"flow"`, `"attempt"`) that
    /// consumers can match without re-parsing a message.
    ///
    /// Classified as `ErrorClass::Terminal`.
    #[error("not found: {entity}")]
    NotFound { entity: &'static str },

    /// Caller supplied a malformed, out-of-range, or otherwise
    /// rejected input. `detail` carries the Lua-side payload (field
    /// name, offending value, or CSV of missing tokens, depending on
    /// `kind`).
    ///
    /// Classified as `ErrorClass::Terminal` regardless of `kind`.
    #[error("validation: {kind:?}: {detail}")]
    Validation {
        kind: ValidationKind,
        detail: String,
    },

    /// Transient conflict with another worker or with the current
    /// state of the execution/flow. Caller should retry per
    /// RFC-010 §10.7.
    ///
    /// Classified as `ErrorClass::Retryable` for every
    /// [`ContentionKind`].
    #[error("contention: {0:?}")]
    Contention(ContentionKind),

    /// Permanent conflict — the requested mutation conflicts with
    /// an existing record (e.g. duplicate edge, cycle, already-in-flow).
    /// Caller must not blindly retry.
    ///
    /// Classified as `ErrorClass::Terminal` for every [`ConflictKind`].
    #[error("conflict: {0:?}")]
    Conflict(ConflictKind),

    /// Legal but surprising state — lease expired, already-suspended,
    /// duplicate-signal, budget-exceeded, etc. Per-variant semantics
    /// documented on [`StateKind`].
    ///
    /// The class depends on the inner kind: `Cooperative` for
    /// `BudgetExceeded`, `Informational` for the benign/no-op kinds
    /// enumerated in [`EngineError::class`], `Terminal` otherwise.
    #[error("state: {0:?}")]
    State(StateKind),

    /// FF-internal invariant violation that should not be reachable
    /// in a correctly-behaving deployment. Consumers typically log
    /// and surface as a 5xx.
    ///
    /// Classified as `ErrorClass::Bug`.
    #[error("bug: {0:?}")]
    Bug(BugKind),

    /// Backend transport fault or response-parse failure (RFC-012 §4.2
    /// round-4 shape). Broadened in Stage 0 to carry `Box<dyn Error>`
    /// so non-Valkey backends (Postgres, future) can route their
    /// native transport errors through this variant without going via
    /// `ScriptError`.
    ///
    /// * `backend` — static diagnostic label (`"valkey"`, `"postgres"`,
    ///   etc.). Kept `&'static str` to avoid heap alloc on construction.
    /// * `source` — boxed error. For the Valkey backend this is
    ///   `ff_script::error::ScriptError`; downcast with
    ///   `source.downcast_ref::<ScriptError>()` to recover
    ///   `ferriskey::ErrorKind` / parse detail. Helper lives in
    ///   `ff_script::engine_error_ext::transport_script_ref`.
    ///
    /// ff-core classifies every `Transport` as `ErrorClass::Terminal`
    /// because it cannot inspect `source`; see [`EngineError::class`]
    /// for the ff-script helper that refines this.
    #[error("transport ({backend}): {source}")]
    Transport {
        backend: &'static str,
        // `#[source]` exposes the boxed error through
        // `std::error::Error::source` so callers can walk the chain.
        #[source]
        source: Box<dyn std::error::Error + Send + Sync + 'static>,
    },

    /// Backend method not wired up yet (RFC-012 §4.2 K#7 holdover).
    /// Returned by staged backend impls for methods that are known
    /// types in the trait but not yet implemented. Graceful degradation
    /// in place of `unimplemented!()` panics. Additive; does not
    /// participate in the `From<ScriptError>` mapping.
    ///
    /// `op` is a static label naming the unimplemented operation.
    /// Classified as `ErrorClass::Terminal`.
    #[error("unavailable: {op}")]
    Unavailable { op: &'static str },
}
113
/// Validation sub-kinds. 1:1 with the Lua validation codes.
///
/// Carried by [`EngineError::Validation`] next to a free-form `detail`
/// string; [`EngineError::class`] classifies every validation error as
/// `ErrorClass::Terminal`, whatever the kind.
///
/// NOTE(review): `Corruption` is documented below as decoder-originated
/// rather than a Lua code, so the "1:1" statement may not cover it —
/// confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidationKind {
    /// Generic caller-supplied input rejected (field-name detail).
    InvalidInput,
    /// Worker caps do not satisfy execution's required_capabilities.
    /// `detail` is the sorted-CSV of missing tokens.
    CapabilityMismatch,
    /// Malformed/oversized capability list.
    InvalidCapabilities,
    /// `policy_json` not valid JSON or structurally wrong.
    InvalidPolicyJson,
    /// Signal payload > 64KB.
    PayloadTooLarge,
    /// Max signals per execution reached.
    SignalLimitExceeded,
    /// MAC verification failed on waitpoint_key.
    InvalidWaitpointKey,
    /// Pending waitpoint has no HMAC token field.
    WaitpointNotTokenBound,
    /// Frame > 64KB.
    ///
    /// NOTE(review): the variant name suggests a retention limit while
    /// the description is a frame-size cap — confirm which is intended.
    RetentionLimitExceeded,
    /// Lease/attempt binding mismatch on suspend.
    InvalidLeaseForSuspend,
    /// Dependency edge not found / invalid dependency ref.
    InvalidDependency,
    /// Waitpoint/execution binding mismatch.
    InvalidWaitpointForExecution,
    /// Unrecognized blocking reason.
    InvalidBlockingReason,
    /// Invalid stream ID offset.
    InvalidOffset,
    /// Auth failed.
    Unauthorized,
    /// Budget scope malformed.
    InvalidBudgetScope,
    /// Operator privileges required.
    BudgetOverrideNotAllowed,
    /// Malformed quota definition.
    InvalidQuotaSpec,
    /// Rotation kid must be non-empty and dot-free.
    InvalidKid,
    /// Rotation secret must be non-empty even-length hex.
    InvalidSecretHex,
    /// Rotation grace_ms must be a non-negative integer.
    InvalidGraceMs,
    /// Tag key violates reserved-namespace rule.
    InvalidTagKey,
    /// Unrecognized stream frame type.
    InvalidFrameType,
    /// On-disk corruption or protocol drift: an engine-owned hash /
    /// key returned a field shape the decoder could not parse (missing
    /// required field, malformed timestamp, unknown extra field,
    /// cross-field identity mismatch, etc.). `detail` carries the
    /// decoder's diagnostic string — the specific field name and/or
    /// offending value — in the form
    /// `"<context>: <field?>: <message>"` so operators can locate the
    /// bad key without reparsing.
    ///
    /// Classified as `Terminal`: a consumer retrying the read will
    /// see the same bytes. Surface to the operator; do not loop.
    Corruption,
}
178
/// Contention sub-kinds (retryable per RFC-010 §10.7). Caller should
/// re-dispatch or re-read and retry.
///
/// Carried by [`EngineError::Contention`]; [`EngineError::class`]
/// returns `ErrorClass::Retryable` for every variant here. The
/// per-variant docs describe the recovery action the caller is
/// expected to take.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ContentionKind {
    /// Re-dispatch to `claim_resumed_execution`.
    UseClaimResumedExecution,
    /// Re-dispatch to `claim_execution`.
    NotAResumedExecution,
    /// State changed since grant. Request new grant.
    ExecutionNotLeaseable,
    /// Another worker holds lease. Request a different execution.
    LeaseConflict,
    /// Grant missing/mismatched. Request new grant.
    InvalidClaimGrant,
    /// Grant TTL elapsed. Request new grant.
    ClaimGrantExpired,
    /// No execution currently available.
    NoEligibleExecution,
    /// Waitpoint may not exist yet. Retry with backoff.
    WaitpointNotFound,
    /// Route to buffer_signal_for_pending_waitpoint.
    WaitpointPendingUseBufferScript,
    /// Graph revision changed. Re-read adjacency, retry.
    StaleGraphRevision,
    /// Execution is not in `active` state (lease superseded, etc.).
    /// Carries the Lua-side detail payload for replay reconciliation.
    ///
    /// All four fields are kept as raw `String`s rather than parsed
    /// into typed values; interpreting them is left to the caller.
    ExecutionNotActive {
        terminal_outcome: String,
        lease_epoch: String,
        lifecycle_phase: String,
        attempt_id: String,
    },
    /// State changed. Scheduler skips.
    ExecutionNotEligible,
    /// Removed by another scheduler.
    ExecutionNotInEligibleSet,
    /// Already reclaimed/cancelled. Skip.
    ExecutionNotReclaimable,
    /// Target has no active lease (already revoked/expired/unowned).
    NoActiveLease,
    /// Window full; caller should backoff `retry_after_ms`.
    ///
    /// NOTE(review): this variant carries no payload, so the
    /// `retry_after_ms` value is not available from the error itself —
    /// confirm where callers are expected to obtain it.
    RateLimitExceeded,
    /// Concurrency cap hit.
    ConcurrencyLimitExceeded,
}
225
/// Permanent conflict sub-kinds. Caller must reconcile rather than
/// retry.
///
/// Carried by [`EngineError::Conflict`]; [`EngineError::class`]
/// returns `ErrorClass::Terminal` for every variant here.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictKind {
    /// Dependency edge already exists. Carries the pre-existing
    /// [`EdgeSnapshot`] so callers implementing "409 on re-declare
    /// with different kind/ref" don't need a follow-up read.
    ///
    /// Note: the plain `From<ScriptError> for EngineError` impl
    /// cannot populate `existing` (that requires an async
    /// `describe_edge` round trip), so it falls through to
    /// `EngineError::Transport`. Callers on the `stage_dependency`
    /// path use `ff_sdk::engine_error::enrich_dependency_conflict`
    /// to perform the follow-up read and promote the error.
    ///
    /// [`EdgeSnapshot`]: crate::contracts::EdgeSnapshot
    DependencyAlreadyExists {
        // Snapshot of the edge that was already present.
        existing: crate::contracts::EdgeSnapshot,
    },
    /// Edge would create a cycle.
    CycleDetected,
    /// Self-referencing edge (upstream == downstream).
    SelfReferencingEdge,
    /// Execution is already a member of another flow.
    ExecutionAlreadyInFlow,
    /// Waitpoint already exists (pending or active).
    WaitpointAlreadyExists,
    /// Budget already attached or conflicts.
    BudgetAttachConflict,
    /// Quota policy already attached.
    QuotaAttachConflict,
    /// Rotation: same kid already installed with a different secret.
    /// String is the conflicting kid.
    RotationConflict(String),
    /// Invariant violation: active attempt already exists where one
    /// was expected absent.
    ///
    /// Surfaced as a conflict (classified `ErrorClass::Terminal`), not
    /// through [`BugKind`], despite being described as an invariant
    /// violation.
    ActiveAttemptExists,
}
265
/// Legal-but-surprising state sub-kinds. Per-variant semantics vary
/// (some are benign no-ops, some are terminal). Consult the RFC-010
/// §10.7 classification table.
///
/// # Classification
///
/// [`EngineError::class`] maps the variants as follows:
///
/// * `BudgetExceeded` — `ErrorClass::Cooperative`.
/// * `ExecutionNotSuspended`, `AlreadySuspended`, `WaitpointClosed`,
///   `DuplicateSignal`, `GrantAlreadyExists`, `OkAlreadyApplied`,
///   `AttemptAlreadyTerminal`, `StreamAlreadyClosed`,
///   `BudgetSoftExceeded`, `WaitpointNotOpen`, `WaitpointNotPending`,
///   `PendingWaitpointExpired`, `NotBlockedByDeps`, `DepsNotSatisfied`
///   — `ErrorClass::Informational`.
/// * Every other variant — `ErrorClass::Terminal`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StateKind {
    /// Lease superseded by reclaim.
    StaleLease,
    /// Lease TTL elapsed.
    LeaseExpired,
    /// Operator revoked lease.
    LeaseRevoked,
    /// Already resumed/cancelled. No-op.
    ExecutionNotSuspended,
    /// Open suspension already active. No-op.
    AlreadySuspended,
    /// Signal too late — waitpoint already closed.
    WaitpointClosed,
    /// Execution not suspended; no valid signal target.
    TargetNotSignalable,
    /// Signal already delivered (dedup).
    DuplicateSignal,
    /// Resume conditions not satisfied.
    ResumeConditionNotMet,
    /// Waitpoint not in pending state.
    WaitpointNotPending,
    /// Pending waitpoint aged out before suspension committed.
    PendingWaitpointExpired,
    /// Waitpoint is not in an open state.
    WaitpointNotOpen,
    /// Cannot replay non-terminal execution.
    ExecutionNotTerminal,
    /// Replay limit reached.
    MaxReplaysExhausted,
    /// Attempt terminal; no appends.
    StreamClosed,
    /// Lease mismatch on stream append.
    StaleOwnerCannotAppend,
    /// Grant already issued. Skip.
    GrantAlreadyExists,
    /// Execution not in specified flow.
    ExecutionNotInFlow,
    /// Flow already in terminal state.
    FlowAlreadyTerminal,
    /// Dependencies not yet satisfied.
    DepsNotSatisfied,
    /// Not blocked by dependencies.
    NotBlockedByDeps,
    /// Execution not runnable.
    NotRunnable,
    /// Execution already terminal.
    ///
    /// Not to be confused with `ErrorClass::Terminal`, which is the
    /// class this variant happens to map to.
    Terminal,
    /// Hard budget limit reached.
    BudgetExceeded,
    /// Soft budget limit reached (warning; continue).
    BudgetSoftExceeded,
    /// Usage seq already processed. No-op.
    OkAlreadyApplied,
    /// Attempt not in started state.
    AttemptNotStarted,
    /// Attempt already ended. No-op.
    AttemptAlreadyTerminal,
    /// Wrong state for new attempt.
    ExecutionNotEligibleForAttempt,
    /// Execution not terminal or replay limit reached.
    ReplayNotAllowed,
    /// Retry limit reached.
    MaxRetriesExhausted,
    /// Already closed. No-op.
    StreamAlreadyClosed,
}
337
/// FF-internal invariant-violation sub-kinds. Should not be reachable
/// in a correctly-behaving deployment.
///
/// Carried by [`EngineError::Bug`]; [`EngineError::class`] returns
/// `ErrorClass::Bug` for every variant here.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum BugKind {
    /// `attempt_not_in_created_state`: internal sequencing error.
    AttemptNotInCreatedState,
}
346
/// Backend-agnostic transport error carried across public
/// ff-sdk / ff-server error surfaces (#88).
///
/// The `Valkey` variant is the only one populated today; additional
/// variants (e.g. `Postgres`) will be added additively as other
/// backends land. The enum is `#[non_exhaustive]` so consumers must
/// include a wildcard arm.
///
/// Construction from the Valkey-native `ferriskey::Error` lives in
/// `ff_backend_valkey::backend_error_from_ferriskey` — keeping that
/// conversion outside ff-core preserves ff-core's ferriskey-free
/// public surface.
///
/// Unlike [`EngineError`], this type derives `Clone`: it stores only a
/// `Copy` kind and an owned `String`, not a boxed source error.
#[derive(Debug, Clone, thiserror::Error)]
#[non_exhaustive]
pub enum BackendError {
    /// Valkey-backend transport failure. Carries a backend-agnostic
    /// classification plus the backend-rendered message so downstream
    /// consumers can inspect without depending on ferriskey.
    ///
    /// `Display` renders `kind` with its `Debug` form followed by
    /// `message`.
    #[error("valkey backend: {kind:?}: {message}")]
    Valkey {
        kind: BackendErrorKind,
        message: String,
    },
}
371
372impl BackendError {
373 /// Returns the classified backend kind if this error is a Valkey
374 /// transport fault. Forward-compatible with future backends:
375 /// non-Valkey variants return `None` on a call that names only the
376 /// Valkey kind; code that wants a backend-specific view should
377 /// match directly on [`BackendError`].
378 pub fn kind(&self) -> BackendErrorKind {
379 match self {
380 Self::Valkey { kind, .. } => *kind,
381 }
382 }
383
384 /// Return the backend-rendered message payload.
385 pub fn message(&self) -> &str {
386 match self {
387 Self::Valkey { message, .. } => message.as_str(),
388 }
389 }
390}
391
/// Classified backend transport errors, kept backend-agnostic on
/// purpose (#88). Each variant maps a family of native backend error
/// kinds into a stable, consumer-matchable shape.
///
/// Consumers requiring the exact native kind for a Valkey backend
/// must go through `ff_backend_valkey` explicitly; ff-sdk/ff-server's
/// public surface will only ever hand out [`BackendErrorKind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum BackendErrorKind {
    /// Network / I/O failure: the request may or may not have been
    /// processed. Typically retryable with backoff.
    Transport,
    /// Backend rejected the request on protocol / parse grounds. Not
    /// retryable without a fix.
    Protocol,
    /// Backend timed out responding to the request. Retryable.
    Timeout,
    /// Authentication / authorization failure. Not retryable.
    Auth,
    /// Cluster topology churn (MOVED, ASK, CLUSTERDOWN, MasterDown,
    /// CrossSlot, ConnectionNotFoundForRoute, AllConnectionsUnavailable).
    /// Retryable after topology settles.
    Cluster,
    /// Backend is temporarily busy loading state (e.g. Valkey
    /// `LOADING`). Retryable.
    BusyLoading,
    /// Backend indicates the referenced script/function does not
    /// exist. Typically handled by the caller via re-load; not
    /// reported as retryable by [`BackendErrorKind::is_retryable`].
    ScriptNotLoaded,
    /// Any other classified error from the backend. Fallback bucket
    /// for native kinds outside the curated set above.
    Other,
}

impl BackendErrorKind {
    /// Stable, lowercase `snake_case` label suitable for log fields /
    /// HTTP `kind` body slots. Guaranteed not to change across
    /// releases for the existing variants.
    #[must_use]
    pub fn as_stable_str(&self) -> &'static str {
        match self {
            Self::Transport => "transport",
            Self::Protocol => "protocol",
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::Cluster => "cluster",
            Self::BusyLoading => "busy_loading",
            Self::ScriptNotLoaded => "script_not_loaded",
            Self::Other => "other",
        }
    }

    /// Whether a caller should consider this kind retryable with
    /// backoff. Conservative — `Protocol`, `Auth`, `ScriptNotLoaded`
    /// and `Other` are reported as not retryable.
    #[must_use]
    pub fn is_retryable(&self) -> bool {
        // Exhaustive on purpose: a newly added variant must be placed
        // in one of the two groups explicitly instead of silently
        // defaulting to "not retryable".
        match self {
            Self::Transport | Self::Timeout | Self::Cluster | Self::BusyLoading => true,
            Self::Protocol | Self::Auth | Self::ScriptNotLoaded | Self::Other => false,
        }
    }
}
453
454impl EngineError {
455 /// Classify an [`EngineError`] using the underlying
456 /// [`ErrorClass`] table.
457 ///
458 /// **Transport classification in ff-core:** the inner source is
459 /// `Box<dyn std::error::Error>` which ff-core cannot downcast
460 /// without naming `ScriptError`. ff-core returns `Terminal` for
461 /// every `Transport` variant by default. Callers needing the
462 /// Retryable-on-transient-Valkey-error classification use
463 /// `ff_script::engine_error_ext::class` which downcasts to
464 /// `ScriptError` and delegates to `ScriptError::class`. ff-sdk's
465 /// public `SdkError::is_retryable` / `backend_kind` methods wire
466 /// the ff-script helper in so consumers retain the Phase-1
467 /// behavior transparently. (`backend_kind` was renamed from
468 /// `valkey_kind` in #88.)
469 pub fn class(&self) -> ErrorClass {
470 match self {
471 Self::NotFound { .. } => ErrorClass::Terminal,
472 Self::Validation { .. } => ErrorClass::Terminal,
473 Self::Contention(_) => ErrorClass::Retryable,
474 Self::Conflict(_) => ErrorClass::Terminal,
475 Self::State(StateKind::BudgetExceeded) => ErrorClass::Cooperative,
476 Self::State(
477 StateKind::ExecutionNotSuspended
478 | StateKind::AlreadySuspended
479 | StateKind::WaitpointClosed
480 | StateKind::DuplicateSignal
481 | StateKind::GrantAlreadyExists
482 | StateKind::OkAlreadyApplied
483 | StateKind::AttemptAlreadyTerminal
484 | StateKind::StreamAlreadyClosed
485 | StateKind::BudgetSoftExceeded
486 | StateKind::WaitpointNotOpen
487 | StateKind::WaitpointNotPending
488 | StateKind::PendingWaitpointExpired
489 | StateKind::NotBlockedByDeps
490 | StateKind::DepsNotSatisfied,
491 ) => ErrorClass::Informational,
492 Self::State(_) => ErrorClass::Terminal,
493 Self::Bug(_) => ErrorClass::Bug,
494 // ff-core cannot name ScriptError. Safe default: Terminal.
495 // ff-script's engine_error_ext::class upgrades to
496 // ScriptError::class when the inner source is a
497 // ScriptError.
498 Self::Transport { .. } => ErrorClass::Terminal,
499 // Unavailable is terminal at the call site — the method is
500 // not implemented; the caller must either fall back to a
501 // different code path or surface to the user.
502 Self::Unavailable { .. } => ErrorClass::Terminal,
503 }
504 }
505}
506
#[cfg(test)]
mod tests {
    use super::*;

    // ---- EngineError::class -------------------------------------------

    /// Contention errors are reported as retryable.
    #[test]
    fn class_contention_is_retryable() {
        let class = EngineError::Contention(ContentionKind::LeaseConflict).class();
        assert_eq!(class, ErrorClass::Retryable);
    }

    /// A hard budget stop is the cooperative case.
    #[test]
    fn class_budget_exceeded_is_cooperative() {
        let class = EngineError::State(StateKind::BudgetExceeded).class();
        assert_eq!(class, ErrorClass::Cooperative);
    }

    /// A deduplicated signal is a benign, informational outcome.
    #[test]
    fn class_duplicate_signal_is_informational() {
        let class = EngineError::State(StateKind::DuplicateSignal).class();
        assert_eq!(class, ErrorClass::Informational);
    }

    /// Bug kinds map onto the dedicated bug class.
    #[test]
    fn class_bug_variant() {
        let class = EngineError::Bug(BugKind::AttemptNotInCreatedState).class();
        assert_eq!(class, ErrorClass::Bug);
    }

    /// ff-core has no ScriptError downcast, so Transport stays Terminal
    /// until ff-script's engine_error_ext::class is called.
    #[test]
    fn class_transport_defaults_terminal() {
        let source: Box<dyn std::error::Error + Send + Sync + 'static> =
            Box::new(std::io::Error::other("simulated transport error"));
        let err = EngineError::Transport {
            backend: "test",
            source,
        };
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    /// An unimplemented backend method is terminal at the call site.
    #[test]
    fn unavailable_is_terminal() {
        let err = EngineError::Unavailable { op: "foo" };
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    // ---- BackendError / BackendErrorKind ------------------------------

    /// The accessors hand back exactly what the variant was built with.
    #[test]
    fn backend_error_kind_round_trip() {
        let be = BackendError::Valkey {
            kind: BackendErrorKind::Transport,
            message: String::from("connection reset"),
        };
        assert_eq!(be.kind(), BackendErrorKind::Transport);
        assert_eq!(be.message(), "connection reset");
    }

    /// Stability fence: these strings are part of the public contract
    /// (log field values, HTTP body `kind` slots). Adding a variant is
    /// additive; changing an existing string is a break.
    #[test]
    fn backend_kind_stable_strings_fixed() {
        let fenced = [
            (BackendErrorKind::Transport, "transport"),
            (BackendErrorKind::Protocol, "protocol"),
            (BackendErrorKind::Timeout, "timeout"),
            (BackendErrorKind::Auth, "auth"),
            (BackendErrorKind::Cluster, "cluster"),
            (BackendErrorKind::BusyLoading, "busy_loading"),
            (BackendErrorKind::ScriptNotLoaded, "script_not_loaded"),
            (BackendErrorKind::Other, "other"),
        ];
        for (kind, label) in fenced {
            assert_eq!(kind.as_stable_str(), label);
        }
    }

    /// Transport, timeout, cluster and busy-loading kinds are
    /// retryable; the remaining kinds are not.
    #[test]
    fn backend_kind_retryability() {
        let retryable = [
            BackendErrorKind::Transport,
            BackendErrorKind::Timeout,
            BackendErrorKind::Cluster,
            BackendErrorKind::BusyLoading,
        ];
        let not_retryable = [
            BackendErrorKind::Protocol,
            BackendErrorKind::Auth,
            BackendErrorKind::ScriptNotLoaded,
            BackendErrorKind::Other,
        ];
        for k in retryable {
            assert!(k.is_retryable(), "{k:?} should be retryable");
        }
        for k in not_retryable {
            assert!(!k.is_retryable(), "{k:?} should NOT be retryable");
        }
    }
}