// ff_core/engine_error.rs

//! Typed engine-error surface (issue #58.6).
//!
//! **RFC-012 Stage 1a:** moved from `ff-sdk::engine_error` to
//! `ff-core::engine_error` so it becomes nameable by the
//! `EngineBackend` trait (which lives in `ff-core::engine_backend`) without
//! forcing a public-surface dependency from ff-core on ff-script. The
//! `ScriptError`-aware helpers (`From<ScriptError>`, `valkey_kind`,
//! `transport_script`, `transport_script_ref`) live in ff-script as
//! free functions (see `ff_script::engine_error_ext`) — ff-core owns
//! the enum shapes; ff-script owns the transport-downcast plumbing.
//!
//! # Mapping shape
//!
//! `ScriptError` lives in the `ff-script` crate (transport-adjacent).
//! `EngineError` lives here in `ff-core` and is what public SDK calls
//! return via `ff_sdk::SdkError::Engine`. The bidirectional mapping:
//!
//! * `From<ScriptError> for EngineError` — every `ScriptError` variant
//!   is classified into `NotFound` / `Validation` / `Contention` /
//!   `Conflict` / `State` / `Bug` / `Transport`. `Parse` + `Valkey`
//!   flow through `Transport { source: Box<ScriptError> }` so the
//!   underlying `ferriskey::ErrorKind` / parse detail is preserved.
//! * `DependencyAlreadyExists` is special: per the #58.6 design the
//!   variant carries the pre-existing [`EdgeSnapshot`] inline.
//!   Populating that field requires an extra round-trip (the Lua
//!   script only knows the edge_id), so plain `From<ScriptError>`
//!   returns a `Transport` fallback for that code — callers in the
//!   `stage_dependency` path use `ff_sdk::engine_error::enrich_dependency_conflict`
//!   to perform the follow-up `describe_edge` and upgrade the error
//!   before returning.
//!
//! [`EdgeSnapshot`]: crate::contracts::EdgeSnapshot
//!
//! # Exhaustiveness
//!
//! The top-level [`EngineError`] and every sub-kind are
//! `#[non_exhaustive]`. FF can add new Lua error codes in minors
//! without a breaking change to this surface — consumers that
//! `match` on a sub-kind must include a `_` arm.
38
39use crate::error::ErrorClass;
40
41/// Typed engine-error surface. See module docs.
42#[derive(Debug, thiserror::Error)]
43#[non_exhaustive]
44pub enum EngineError {
45    /// A uniquely-identified resource did not exist. `entity` is a
46    /// stable label (e.g. `"execution"`, `"flow"`, `"attempt"`) that
47    /// consumers can match without re-parsing a message.
48    #[error("not found: {entity}")]
49    NotFound { entity: &'static str },
50
51    /// Caller supplied a malformed, out-of-range, or otherwise
52    /// rejected input. `detail` carries the Lua-side payload (field
53    /// name, offending value, or CSV of missing tokens, depending on
54    /// `kind`).
55    #[error("validation: {kind:?}: {detail}")]
56    Validation {
57        kind: ValidationKind,
58        detail: String,
59    },
60
61    /// Transient conflict with another worker or with the current
62    /// state of the execution/flow. Caller should retry per
63    /// RFC-010 §10.7.
64    #[error("contention: {0:?}")]
65    Contention(ContentionKind),
66
67    /// Permanent conflict — the requested mutation conflicts with
68    /// an existing record (e.g. duplicate edge, cycle, already-in-flow).
69    /// Caller must not blindly retry.
70    #[error("conflict: {0:?}")]
71    Conflict(ConflictKind),
72
73    /// Legal but surprising state — lease expired, already-suspended,
74    /// duplicate-signal, budget-exceeded, etc. Per-variant semantics
75    /// documented on [`StateKind`].
76    #[error("state: {0:?}")]
77    State(StateKind),
78
79    /// FF-internal invariant violation that should not be reachable
80    /// in a correctly-behaving deployment. Consumers typically log
81    /// and surface as a 5xx.
82    #[error("bug: {0:?}")]
83    Bug(BugKind),
84
85    /// Backend transport fault or response-parse failure (RFC-012 §4.2
86    /// round-4 shape). Broadened in Stage 0 to carry `Box<dyn Error>`
87    /// so non-Valkey backends (Postgres, future) can route their
88    /// native transport errors through this variant without going via
89    /// `ScriptError`.
90    ///
91    /// * `backend` — static diagnostic label (`"valkey"`, `"postgres"`,
92    ///   etc.). Kept `&'static str` to avoid heap alloc on construction.
93    /// * `source` — boxed error. For the Valkey backend this is
94    ///   `ff_script::error::ScriptError`; downcast with
95    ///   `source.downcast_ref::<ScriptError>()` to recover
96    ///   `ferriskey::ErrorKind` / parse detail. Helper lives in
97    ///   `ff_script::engine_error_ext::transport_script_ref`.
98    #[error("transport ({backend}): {source}")]
99    Transport {
100        backend: &'static str,
101        #[source]
102        source: Box<dyn std::error::Error + Send + Sync + 'static>,
103    },
104
105    /// Backend method not wired up yet (RFC-012 §4.2 K#7 holdover).
106    /// Returned by staged backend impls for methods that are known
107    /// types in the trait but not yet implemented. Graceful degradation
108    /// in place of `unimplemented!()` panics. Additive; does not
109    /// participate in the `From<ScriptError>` mapping.
110    #[error("unavailable: {op}")]
111    Unavailable { op: &'static str },
112}
113
/// Validation sub-kinds. 1:1 with the Lua validation codes.
///
/// Every variant is a unit variant; the accompanying free-form text
/// travels in the `detail` field of `EngineError::Validation`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidationKind {
    /// Generic caller-supplied input rejected (field-name detail).
    InvalidInput,
    /// Worker caps do not satisfy execution's required_capabilities.
    /// `detail` is the sorted-CSV of missing tokens.
    CapabilityMismatch,
    /// Malformed/oversized capability list.
    InvalidCapabilities,
    /// `policy_json` not valid JSON or structurally wrong.
    InvalidPolicyJson,
    /// Signal payload > 64KB.
    PayloadTooLarge,
    /// Max signals per execution reached.
    SignalLimitExceeded,
    /// MAC verification failed on waitpoint_key.
    InvalidWaitpointKey,
    /// Pending waitpoint has no HMAC token field.
    WaitpointNotTokenBound,
    /// Frame > 64KB.
    RetentionLimitExceeded,
    /// Lease/attempt binding mismatch on suspend.
    InvalidLeaseForSuspend,
    /// Dependency edge not found / invalid dependency ref.
    InvalidDependency,
    /// Waitpoint/execution binding mismatch.
    InvalidWaitpointForExecution,
    /// Unrecognized blocking reason.
    InvalidBlockingReason,
    /// Invalid stream ID offset.
    InvalidOffset,
    /// Auth failed.
    Unauthorized,
    /// Budget scope malformed.
    InvalidBudgetScope,
    /// Operator privileges required.
    BudgetOverrideNotAllowed,
    /// Malformed quota definition.
    InvalidQuotaSpec,
    /// Rotation kid must be non-empty and dot-free.
    InvalidKid,
    /// Rotation secret must be non-empty even-length hex.
    InvalidSecretHex,
    /// Rotation grace_ms must be a non-negative integer.
    InvalidGraceMs,
    /// Tag key violates reserved-namespace rule.
    InvalidTagKey,
    /// Unrecognized stream frame type.
    InvalidFrameType,
}
166
/// Contention sub-kinds (retryable per RFC-010 §10.7). Caller should
/// re-dispatch or re-read and retry.
///
/// All variants are unit variants except
/// [`ContentionKind::ExecutionNotActive`], which carries four string
/// fields.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ContentionKind {
    /// Re-dispatch to `claim_resumed_execution`.
    UseClaimResumedExecution,
    /// Re-dispatch to `claim_execution`.
    NotAResumedExecution,
    /// State changed since grant. Request new grant.
    ExecutionNotLeaseable,
    /// Another worker holds lease. Request a different execution.
    LeaseConflict,
    /// Grant missing/mismatched. Request new grant.
    InvalidClaimGrant,
    /// Grant TTL elapsed. Request new grant.
    ClaimGrantExpired,
    /// No execution currently available.
    NoEligibleExecution,
    /// Waitpoint may not exist yet. Retry with backoff.
    WaitpointNotFound,
    /// Route to buffer_signal_for_pending_waitpoint.
    WaitpointPendingUseBufferScript,
    /// Graph revision changed. Re-read adjacency, retry.
    StaleGraphRevision,
    /// Execution is not in `active` state (lease superseded, etc.)
    /// Carries the Lua-side detail payload for replay reconciliation.
    ExecutionNotActive {
        terminal_outcome: String,
        lease_epoch: String,
        lifecycle_phase: String,
        attempt_id: String,
    },
    /// State changed. Scheduler skips.
    ExecutionNotEligible,
    /// Removed by another scheduler.
    ExecutionNotInEligibleSet,
    /// Already reclaimed/cancelled. Skip.
    ExecutionNotReclaimable,
    /// Target has no active lease (already revoked/expired/unowned).
    NoActiveLease,
    /// Window full; caller should backoff `retry_after_ms`.
    RateLimitExceeded,
    /// Concurrency cap hit.
    ConcurrencyLimitExceeded,
}
213
214/// Permanent conflict sub-kinds. Caller must reconcile rather than
215/// retry.
216#[derive(Debug, Clone, PartialEq, Eq)]
217#[non_exhaustive]
218pub enum ConflictKind {
219    /// Dependency edge already exists. Carries the pre-existing
220    /// [`EdgeSnapshot`] so callers implementing "409 on re-declare
221    /// with different kind/ref" don't need a follow-up read.
222    ///
223    /// Note: the plain `From<ScriptError> for EngineError` impl
224    /// cannot populate `existing` (that requires an async
225    /// `describe_edge` round trip), so it falls through to
226    /// `EngineError::Transport`. Callers on the `stage_dependency`
227    /// path use `ff_sdk::engine_error::enrich_dependency_conflict`
228    /// to perform the follow-up read and promote the error.
229    ///
230    /// [`EdgeSnapshot`]: crate::contracts::EdgeSnapshot
231    DependencyAlreadyExists { existing: crate::contracts::EdgeSnapshot },
232    /// Edge would create a cycle.
233    CycleDetected,
234    /// Self-referencing edge (upstream == downstream).
235    SelfReferencingEdge,
236    /// Execution is already a member of another flow.
237    ExecutionAlreadyInFlow,
238    /// Waitpoint already exists (pending or active).
239    WaitpointAlreadyExists,
240    /// Budget already attached or conflicts.
241    BudgetAttachConflict,
242    /// Quota policy already attached.
243    QuotaAttachConflict,
244    /// Rotation: same kid already installed with a different secret.
245    /// String is the conflicting kid.
246    RotationConflict(String),
247    /// Invariant violation: active attempt already exists where one
248    /// was expected absent.
249    ActiveAttemptExists,
250}
251
/// Legal-but-surprising state sub-kinds. Per-variant semantics vary
/// (some are benign no-ops, some are terminal). Consult the RFC-010
/// §10.7 classification table.
///
/// Every variant is a unit variant.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StateKind {
    /// Lease superseded by reclaim.
    StaleLease,
    /// Lease TTL elapsed.
    LeaseExpired,
    /// Operator revoked lease.
    LeaseRevoked,
    /// Already resumed/cancelled. No-op.
    ExecutionNotSuspended,
    /// Open suspension already active. No-op.
    AlreadySuspended,
    /// Signal too late — waitpoint already closed.
    WaitpointClosed,
    /// Execution not suspended; no valid signal target.
    TargetNotSignalable,
    /// Signal already delivered (dedup).
    DuplicateSignal,
    /// Resume conditions not satisfied.
    ResumeConditionNotMet,
    /// Waitpoint not in pending state.
    WaitpointNotPending,
    /// Pending waitpoint aged out before suspension committed.
    PendingWaitpointExpired,
    /// Waitpoint is not in an open state.
    WaitpointNotOpen,
    /// Cannot replay non-terminal execution.
    ExecutionNotTerminal,
    /// Replay limit reached.
    MaxReplaysExhausted,
    /// Attempt terminal; no appends.
    StreamClosed,
    /// Lease mismatch on stream append.
    StaleOwnerCannotAppend,
    /// Grant already issued. Skip.
    GrantAlreadyExists,
    /// Execution not in specified flow.
    ExecutionNotInFlow,
    /// Flow already in terminal state.
    FlowAlreadyTerminal,
    /// Dependencies not yet satisfied.
    DepsNotSatisfied,
    /// Not blocked by dependencies.
    NotBlockedByDeps,
    /// Execution not runnable.
    NotRunnable,
    /// Execution already terminal.
    Terminal,
    /// Hard budget limit reached.
    BudgetExceeded,
    /// Soft budget limit reached (warning; continue).
    BudgetSoftExceeded,
    /// Usage seq already processed. No-op.
    OkAlreadyApplied,
    /// Attempt not in started state.
    AttemptNotStarted,
    /// Attempt already ended. No-op.
    AttemptAlreadyTerminal,
    /// Wrong state for new attempt.
    ExecutionNotEligibleForAttempt,
    /// Execution not terminal or replay limit reached.
    ReplayNotAllowed,
    /// Retry limit reached.
    MaxRetriesExhausted,
    /// Already closed. No-op.
    StreamAlreadyClosed,
}
323
/// FF-internal invariant-violation sub-kinds. Should not be reachable
/// in a correctly-behaving deployment.
///
/// Payload of `EngineError::Bug`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum BugKind {
    /// `attempt_not_in_created_state`: internal sequencing error.
    AttemptNotInCreatedState,
}
332
333impl EngineError {
334    /// Classify an [`EngineError`] using the underlying
335    /// [`ErrorClass`] table.
336    ///
337    /// **Transport classification in ff-core:** the inner source is
338    /// `Box<dyn std::error::Error>` which ff-core cannot downcast
339    /// without naming `ScriptError`. ff-core returns `Terminal` for
340    /// every `Transport` variant by default. Callers needing the
341    /// Retryable-on-transient-Valkey-error classification use
342    /// `ff_script::engine_error_ext::class` which downcasts to
343    /// `ScriptError` and delegates to `ScriptError::class`. ff-sdk's
344    /// public `SdkError::is_retryable` / `valkey_kind` methods wire
345    /// the ff-script helper in so consumers retain the Phase-1
346    /// behavior transparently.
347    pub fn class(&self) -> ErrorClass {
348        match self {
349            Self::NotFound { .. } => ErrorClass::Terminal,
350            Self::Validation { .. } => ErrorClass::Terminal,
351            Self::Contention(_) => ErrorClass::Retryable,
352            Self::Conflict(_) => ErrorClass::Terminal,
353            Self::State(StateKind::BudgetExceeded) => ErrorClass::Cooperative,
354            Self::State(
355                StateKind::ExecutionNotSuspended
356                | StateKind::AlreadySuspended
357                | StateKind::WaitpointClosed
358                | StateKind::DuplicateSignal
359                | StateKind::GrantAlreadyExists
360                | StateKind::OkAlreadyApplied
361                | StateKind::AttemptAlreadyTerminal
362                | StateKind::StreamAlreadyClosed
363                | StateKind::BudgetSoftExceeded
364                | StateKind::WaitpointNotOpen
365                | StateKind::WaitpointNotPending
366                | StateKind::PendingWaitpointExpired
367                | StateKind::NotBlockedByDeps
368                | StateKind::DepsNotSatisfied,
369            ) => ErrorClass::Informational,
370            Self::State(_) => ErrorClass::Terminal,
371            Self::Bug(_) => ErrorClass::Bug,
372            // ff-core cannot name ScriptError. Safe default: Terminal.
373            // ff-script's engine_error_ext::class upgrades to
374            // ScriptError::class when the inner source is a
375            // ScriptError.
376            Self::Transport { .. } => ErrorClass::Terminal,
377            // Unavailable is terminal at the call site — the method is
378            // not implemented; the caller must either fall back to a
379            // different code path or surface to the user.
380            Self::Unavailable { .. } => ErrorClass::Terminal,
381        }
382    }
383}
384
/// Unit tests pinning the `EngineError::class` classification table.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn class_contention_is_retryable() {
        let err = EngineError::Contention(ContentionKind::LeaseConflict);
        assert_eq!(err.class(), ErrorClass::Retryable);
    }

    #[test]
    fn class_budget_exceeded_is_cooperative() {
        let err = EngineError::State(StateKind::BudgetExceeded);
        assert_eq!(err.class(), ErrorClass::Cooperative);
    }

    #[test]
    fn class_duplicate_signal_is_informational() {
        let err = EngineError::State(StateKind::DuplicateSignal);
        assert_eq!(err.class(), ErrorClass::Informational);
    }

    #[test]
    fn class_unlisted_state_defaults_terminal() {
        // LeaseExpired is neither Cooperative nor Informational, so it
        // falls through to the catch-all State arm.
        let err = EngineError::State(StateKind::LeaseExpired);
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    #[test]
    fn class_bug_variant() {
        let err = EngineError::Bug(BugKind::AttemptNotInCreatedState);
        assert_eq!(err.class(), ErrorClass::Bug);
    }

    #[test]
    fn class_conflict_is_terminal() {
        let err = EngineError::Conflict(ConflictKind::CycleDetected);
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    #[test]
    fn class_not_found_is_terminal() {
        let err = EngineError::NotFound { entity: "execution" };
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    #[test]
    fn class_validation_is_terminal() {
        let err = EngineError::Validation {
            kind: ValidationKind::InvalidInput,
            detail: String::from("field"),
        };
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    #[test]
    fn class_transport_defaults_terminal() {
        // ff-core has no ScriptError downcast; Transport is Terminal
        // until ff-script's engine_error_ext::class is called.
        let raw = std::io::Error::other("simulated transport error");
        let err = EngineError::Transport {
            backend: "test",
            source: Box::new(raw),
        };
        assert_eq!(err.class(), ErrorClass::Terminal);
    }

    #[test]
    fn unavailable_is_terminal() {
        assert_eq!(
            EngineError::Unavailable { op: "foo" }.class(),
            ErrorClass::Terminal
        );
    }
}