cano 0.14.0

High-performance orchestration engine for building resilient, self-healing systems in Rust. Uses Finite State Machines (FSM) for strict, type-safe transitions.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
//! # Recovery — Append-Only Checkpoint Storage
//!
//! This module defines the [`CheckpointStore`] trait: a pluggable, append-only
//! log of FSM transitions that lets a workflow be resumed after a crash. Each
//! transition is recorded as one [`CheckpointRow`] — a monotonically increasing
//! sequence number, the state that was entered, the task that produced it, an
//! optional opaque blob (a compensatable task's serialized output or a
//! `SteppedTask` cursor), a [`RowKind`] discriminant that identifies why the row
//! was written, and the user-declared workflow version.
//!
//! ## Design
//!
//! - **Append-only.** A run is a sequence of rows; the store never mutates a row
//!   in place. Reconstructing a run is "load every row for this workflow id, in
//!   sequence order".
//! - **Pluggable.** Implement [`CheckpointStore`] over any backend (an embedded
//!   KV store, Postgres, an HTTP service). The trait is intentionally tiny:
//!   [`append`](CheckpointStore::append), [`load_run`](CheckpointStore::load_run),
//!   [`clear`](CheckpointStore::clear).
//! - **Optional default.** With the `recovery` feature enabled, [`RedbCheckpointStore`]
//!   provides an embedded, ACID, daemon-free implementation backed by
//!   [`redb`](https://docs.rs/redb). The default build pulls in no extra dependencies.
//! - **Versioned.** Each [`CheckpointRow`] carries a user-stamped
//!   [`workflow_version`](CheckpointRow::workflow_version) tag (default `0`); set it via
//!   [`Workflow::with_workflow_version`](crate::workflow::Workflow::with_workflow_version)
//!   and [`resume_from`](crate::workflow::Workflow::resume_from) will reject any row
//!   whose stored version disagrees. Rows written by builds that predate this field
//!   decode as `workflow_version = 0`, so existing checkpoint databases keep replaying
//!   under a workflow that hasn't opted in to a non-zero version.
//!
//! ## Example
//!
//! ```rust
//! use cano::recovery::{CheckpointRow, CheckpointStore};
//! use cano::CanoError;
//! use std::collections::HashMap;
//! use std::sync::Mutex;
//!
//! /// A trivial in-memory store, useful for tests.
//! #[derive(Default)]
//! struct InMemoryStore(Mutex<HashMap<String, Vec<CheckpointRow>>>);
//!
//! // `#[cano::checkpoint_store]` on an inherent `impl` builds the
//! // `impl CheckpointStore for InMemoryStore` header for you. (Or write that header
//! // yourself: `#[cano::checkpoint_store] impl CheckpointStore for InMemoryStore { … }`.)
//! #[cano::checkpoint_store]
//! impl InMemoryStore {
//!     async fn append(&self, workflow_id: &str, row: CheckpointRow) -> Result<(), CanoError> {
//!         let mut runs = self.0.lock().unwrap();
//!         let rows = runs.entry(workflow_id.to_string()).or_default();
//!         if rows.iter().any(|r| r.sequence == row.sequence) {
//!             return Err(CanoError::checkpoint_store(format!(
//!                 "checkpoint conflict: {workflow_id:?} already has sequence {}", row.sequence
//!             )));
//!         }
//!         rows.push(row);
//!         Ok(())
//!     }
//!     async fn load_run(&self, workflow_id: &str) -> Result<Vec<CheckpointRow>, CanoError> {
//!         let mut rows = self.0.lock().unwrap().get(workflow_id).cloned().unwrap_or_default();
//!         rows.sort_by_key(|r| r.sequence); // contract: ascending by `sequence`
//!         Ok(rows)
//!     }
//!     async fn clear(&self, workflow_id: &str) -> Result<(), CanoError> {
//!         self.0.lock().unwrap().remove(workflow_id);
//!         Ok(())
//!     }
//! }
//!
//! # #[tokio::main]
//! # async fn main() -> Result<(), CanoError> {
//! let checkpoint_store = InMemoryStore::default();
//! checkpoint_store.append("run-1", CheckpointRow::new(0, "Start", "fetch")).await?;
//! checkpoint_store.append("run-1", CheckpointRow::new(1, "Done", "process")).await?;
//! assert_eq!(checkpoint_store.load_run("run-1").await?.len(), 2);
//! # Ok(())
//! # }
//! ```

use crate::error::CanoError;
use cano_macros::checkpoint_store;

#[cfg(feature = "recovery")]
mod redb;

#[cfg(feature = "recovery")]
pub use redb::RedbCheckpointStore;

/// Why a [`CheckpointRow`] was written — distinguishes ordinary state-entry rows
/// from saga compensation-completion rows and `SteppedTask` cursor rows so
/// `resume_from` can route each correctly.
///
/// The [`CheckpointRow`] builders set the kind alongside the blob:
/// [`CheckpointRow::new`] produces [`StateEntry`](Self::StateEntry),
/// [`CheckpointRow::with_output`] switches to
/// [`CompensationCompletion`](Self::CompensationCompletion), and
/// [`CheckpointRow::with_cursor`] switches to [`StepCursor`](Self::StepCursor).
///
/// The enum is `#[non_exhaustive]`, so code outside this crate needs a wildcard arm
/// when matching on it.
// NOTE(review): this enum is persisted through serde. Index-based formats (postcard is
// exercised by this module's tests) presumably encode the variant by position, so new
// variants should be appended rather than inserted or reordered — confirm.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
pub enum RowKind {
    /// An ordinary "the FSM entered this state" row (the common case).
    ///
    /// This is the `Default` variant and the kind assigned by [`CheckpointRow::new`].
    #[default]
    StateEntry,
    /// A saga compensatable task's completion row, carrying its serialized output in `output_blob`.
    ///
    /// Assigned by [`CheckpointRow::with_output`].
    CompensationCompletion,
    /// A `SteppedTask` iteration row, carrying the serialized cursor in `output_blob`.
    ///
    /// Assigned by [`CheckpointRow::with_cursor`].
    StepCursor,
}

/// One recorded FSM transition.
///
/// Rows are append-only and ordered within a run by [`sequence`](Self::sequence).
/// `output_blob` carries opaque bytes whose purpose is discriminated by [`kind`](Self::kind):
/// `None` for plain state-entry rows, `Some` for saga completion rows
/// ([`RowKind::CompensationCompletion`]) and `SteppedTask` cursor rows ([`RowKind::StepCursor`]).
///
/// Build rows with [`CheckpointRow::new`] and the `with_*` builders; the blob builders
/// set `output_blob` and `kind` together so the two stay consistent.
// NOTE(review): the module docs say rows written before `workflow_version` existed decode
// as version `0`, but no field here carries `#[serde(default)]`. Presumably the storage
// backend handles legacy rows itself — confirm before relying on that guarantee.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CheckpointRow {
    /// Monotonically increasing position of this transition within its run.
    ///
    /// A [`CheckpointStore`] must reject a second row at the same
    /// `(workflow_id, sequence)`, and `load_run` returns rows ordered by this value.
    pub sequence: u64,
    /// The state that was entered at this transition.
    pub state: String,
    /// Identifier of the task that produced this transition (see `Task::name`).
    pub task_id: String,
    /// Optional opaque bytes payload; see [`kind`](Self::kind) for semantics.
    ///
    /// `None` after [`CheckpointRow::new`]; `Some` (possibly empty) after
    /// [`with_output`](Self::with_output) or [`with_cursor`](Self::with_cursor).
    pub output_blob: Option<Vec<u8>>,
    /// Why this row was written; drives how [`resume_from`](crate::workflow::Workflow::resume_from)
    /// routes the row during replay.
    pub kind: RowKind,
    /// User-declared workflow version stamped at append time. Default `0` for rows
    /// written by a workflow that did not call `Workflow::with_workflow_version`.
    pub workflow_version: u32,
}

impl CheckpointRow {
    /// Build a plain state-entry row.
    ///
    /// The row starts with `kind` set to [`RowKind::StateEntry`], `output_blob` set to
    /// `None`, and `workflow_version` set to `0`. Chain the `with_*` builders to change
    /// any of those.
    ///
    /// `state` and `task_id` accept anything convertible into a `String`, so both
    /// `&str` literals and owned strings work.
    #[must_use]
    pub fn new(sequence: u64, state: impl Into<String>, task_id: impl Into<String>) -> Self {
        Self {
            sequence,
            state: state.into(),
            task_id: task_id.into(),
            output_blob: None,
            kind: RowKind::StateEntry,
            workflow_version: 0,
        }
    }

    /// Attach a saga compensation output blob and mark the row as
    /// [`RowKind::CompensationCompletion`].
    ///
    /// Used for compensatable tasks whose output must be retained for rollback so that
    /// [`resume_from`](crate::workflow::Workflow::resume_from) can rehydrate the
    /// compensation stack. Builder-style: `CheckpointRow::new(..).with_output(bytes)`.
    ///
    /// Overwrites any blob and kind set by an earlier `with_output` / `with_cursor`
    /// call. An empty `Vec` is stored as `Some(vec![])`, not `None`.
    #[must_use = "builders consume the row and return the updated one"]
    pub fn with_output(mut self, output_blob: Vec<u8>) -> Self {
        self.output_blob = Some(output_blob);
        self.kind = RowKind::CompensationCompletion;
        self
    }

    /// Attach a `SteppedTask` cursor blob and mark the row as [`RowKind::StepCursor`].
    ///
    /// `cursor_blob` holds the serialized cursor bytes. Builder-style:
    /// `CheckpointRow::new(..).with_cursor(bytes)`.
    ///
    /// Overwrites any blob and kind set by an earlier `with_output` / `with_cursor`
    /// call.
    #[must_use = "builders consume the row and return the updated one"]
    pub fn with_cursor(mut self, cursor_blob: Vec<u8>) -> Self {
        self.output_blob = Some(cursor_blob);
        self.kind = RowKind::StepCursor;
        self
    }

    /// Stamp this row with the workflow's user-declared version. Chained with `new`
    /// (and optionally `with_output` / `with_cursor`) at append time so each row carries
    /// the version of the workflow definition that produced it.
    ///
    /// Only `workflow_version` changes, so this can be chained before or after the
    /// blob builders with the same result.
    #[must_use = "builders consume the row and return the updated one"]
    pub fn with_workflow_version(mut self, version: u32) -> Self {
        self.workflow_version = version;
        self
    }
}

/// Append-only checkpoint log keyed by workflow id.
///
/// Implementations record one [`CheckpointRow`] per FSM transition and can
/// replay them in sequence order to resume a crashed run. The contract:
///
/// - [`append`](Self::append) durably persists `row` for `workflow_id`. It **must
///   reject a duplicate `(workflow_id, row.sequence)`** with an `Err` rather than
///   overwriting the existing row — the engine assigns sequences densely from `0`, so a
///   collision means two runs are sharing a `workflow_id` (a misuse: resume the existing
///   run, or [`clear`](Self::clear) it first). A legitimate [`resume_from`] only ever
///   appends sequences past the last persisted one, so it never collides.
/// - [`load_run`](Self::load_run) returns every row ever appended for
///   `workflow_id`, **sorted ascending by `sequence`**, or an empty `Vec` if the
///   id is unknown.
/// - [`clear`](Self::clear) removes all rows for `workflow_id` and must not
///   affect any other id. Clearing an unknown id is a no-op (`Ok`).
///
/// Backends must be `Send + Sync + 'static` so a single store can be shared
/// (typically as `Arc<dyn CheckpointStore>`) across concurrent workflows; `append`,
/// `load_run` and `clear` may be called concurrently for the same or different ids.
///
/// [`resume_from`]: crate::workflow::Workflow::resume_from
// NOTE(review): `#[checkpoint_store]` presumably rewrites the `async fn`s below into a
// dyn-compatible form (this module's tests build an `Arc<dyn CheckpointStore>`) — confirm
// against the macro's definition in `cano_macros`.
#[checkpoint_store]
pub trait CheckpointStore: Send + Sync + 'static {
    /// Durably append `row` to the log for `workflow_id`. Returns an error if a row
    /// already exists at `(workflow_id, row.sequence)` (see the trait-level contract).
    ///
    /// # Errors
    ///
    /// Returns `Err` when `(workflow_id, row.sequence)` is already present; the
    /// existing row must be left as it was.
    async fn append(&self, workflow_id: &str, row: CheckpointRow) -> Result<(), CanoError>;

    /// Load every row for `workflow_id`, sorted ascending by `sequence`.
    ///
    /// An unknown `workflow_id` yields `Ok` with an empty `Vec`, not an error.
    async fn load_run(&self, workflow_id: &str) -> Result<Vec<CheckpointRow>, CanoError>;

    /// Remove all rows for `workflow_id`. No-op if the id is unknown.
    ///
    /// Rows stored under any other id must be left untouched.
    async fn clear(&self, workflow_id: &str) -> Result<(), CanoError>;
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    /// In-memory `CheckpointStore` test double: one `Vec` of rows per workflow id,
    /// guarded by a mutex.
    #[derive(Default)]
    struct InMemoryStore(Mutex<HashMap<String, Vec<CheckpointRow>>>);

    #[checkpoint_store]
    impl CheckpointStore for InMemoryStore {
        async fn append(&self, workflow_id: &str, row: CheckpointRow) -> Result<(), CanoError> {
            let mut logs = self.0.lock().unwrap();
            let log = logs.entry(workflow_id.to_string()).or_default();
            // Contract: a second row at the same sequence is an error, never an overwrite.
            let sequence_taken = log.iter().any(|existing| existing.sequence == row.sequence);
            if sequence_taken {
                return Err(CanoError::checkpoint_store(format!(
                    "checkpoint conflict: {workflow_id:?} already has sequence {}",
                    row.sequence
                )));
            }
            log.push(row);
            Ok(())
        }

        async fn load_run(&self, workflow_id: &str) -> Result<Vec<CheckpointRow>, CanoError> {
            let logs = self.0.lock().unwrap();
            let mut snapshot = match logs.get(workflow_id) {
                Some(log) => log.clone(),
                None => Vec::new(),
            };
            // Contract: rows come back ascending by `sequence` regardless of append order.
            snapshot.sort_by(|left, right| left.sequence.cmp(&right.sequence));
            Ok(snapshot)
        }

        async fn clear(&self, workflow_id: &str) -> Result<(), CanoError> {
            let mut logs = self.0.lock().unwrap();
            logs.remove(workflow_id);
            Ok(())
        }
    }

    /// Project a slice of rows down to their sequence numbers, in slice order.
    fn sequences(rows: &[CheckpointRow]) -> Vec<u64> {
        rows.iter().map(|row| row.sequence).collect()
    }

    #[test]
    fn checkpoint_store_is_dyn_compatible() {
        // Sharing one store as `Arc<dyn CheckpointStore>` across concurrent workflows only
        // works while the trait stays object-safe; this line fails to compile otherwise.
        let _erased: Arc<dyn CheckpointStore> = Arc::new(InMemoryStore::default());
    }

    #[test]
    fn checkpoint_row_constructors() {
        let plain = CheckpointRow::new(3, "Process", "worker");
        assert_eq!(
            (plain.sequence, plain.state.as_str(), plain.task_id.as_str()),
            (3, "Process", "worker")
        );
        assert_eq!(plain.output_blob, None);
        assert_eq!(plain.kind, RowKind::StateEntry);

        let with_blob = CheckpointRow::new(4, "Done", "worker").with_output(vec![1, 2, 3]);
        assert_eq!(with_blob.sequence, 4);
        assert_eq!(with_blob.output_blob, Some(vec![1, 2, 3]));
        assert_eq!(with_blob.kind, RowKind::CompensationCompletion);

        let with_cursor = CheckpointRow::new(5, "Step", "stepper").with_cursor(vec![9, 8, 7]);
        assert_eq!(with_cursor.sequence, 5);
        assert_eq!(with_cursor.output_blob, Some(vec![9, 8, 7]));
        assert_eq!(with_cursor.kind, RowKind::StepCursor);
    }

    #[tokio::test]
    async fn trait_roundtrip_append_load_clear() {
        let store = InMemoryStore::default();

        let seeded = [
            CheckpointRow::new(0, "A", "t0"),
            CheckpointRow::new(1, "B", "t1"),
            CheckpointRow::new(2, "C", "t2").with_output(vec![9]),
        ];
        for row in seeded {
            store.append("run", row).await.unwrap();
        }

        let loaded = store.load_run("run").await.unwrap();
        assert_eq!(loaded.len(), 3);
        assert_eq!(sequences(&loaded), vec![0, 1, 2]);
        assert_eq!(loaded[2].output_blob, Some(vec![9]));

        store.clear("run").await.unwrap();
        let after_clear = store.load_run("run").await.unwrap();
        assert!(after_clear.is_empty());

        // An id that was never written clears without error.
        store.clear("never-existed").await.unwrap();
    }

    #[tokio::test]
    async fn load_run_unknown_id_is_empty() {
        let store = InMemoryStore::default();
        let loaded = store.load_run("nope").await.unwrap();
        assert!(loaded.is_empty());
    }

    #[test]
    fn checkpoint_row_default_workflow_version_is_zero() {
        let unversioned = CheckpointRow::new(0, "Start", "task");
        assert_eq!(unversioned.workflow_version, 0);
    }

    #[test]
    fn checkpoint_row_with_workflow_version_sets_field() {
        let versioned = CheckpointRow::new(0, "Start", "task").with_workflow_version(42);
        assert_eq!(versioned.workflow_version, 42);
    }

    #[tokio::test]
    async fn append_rejects_duplicate_sequence() {
        let store = InMemoryStore::default();
        let first = CheckpointRow::new(0, "A", "t0");
        store.append("run", first).await.unwrap();

        // Re-using `(workflow_id, sequence)` has to fail instead of replacing the row.
        let duplicate = CheckpointRow::new(0, "A-again", "t0");
        let err = store
            .append("run", duplicate)
            .await
            .expect_err("duplicate sequence must be rejected");
        assert_eq!(err.category(), "checkpoint_store");

        // The first row survives as written, and the next sequence is still accepted.
        let next = CheckpointRow::new(1, "B", "t1");
        store.append("run", next).await.unwrap();
        let loaded = store.load_run("run").await.unwrap();
        let persisted: Vec<(u64, &str)> = loaded
            .iter()
            .map(|row| (row.sequence, row.state.as_str()))
            .collect();
        assert_eq!(persisted, vec![(0, "A"), (1, "B")]);

        // The same sequence under another workflow id is not a conflict.
        let elsewhere = CheckpointRow::new(0, "A", "t0");
        store.append("other", elsewhere).await.unwrap();
    }

    // ----- edge cases: serialization round-trip (rows persist and reload across processes) -----

    #[test]
    fn checkpoint_row_json_roundtrip_preserves_all_fields() {
        // Rows are written out and read back (possibly by a different process) during crash
        // recovery, so a faulty serde derive would corrupt resume without any visible error.
        // The samples span every RowKind plus the blob / version variations.
        let samples = [
            // StateEntry, no blob, version 0
            CheckpointRow::new(0, "Start", "t0"),
            // CompensationCompletion
            CheckpointRow::new(1, "Pay", "charge").with_output(vec![1, 2, 3]),
            // StepCursor, empty blob
            CheckpointRow::new(2, "Step", "stepper").with_cursor(vec![]),
            // versioned StateEntry
            CheckpointRow::new(3, "Start", "t0").with_workflow_version(99),
        ];
        for original in &samples {
            let encoded = serde_json::to_vec(original).expect("serialize");
            let decoded: CheckpointRow = serde_json::from_slice(&encoded).expect("deserialize");
            assert_eq!(
                &decoded, original,
                "JSON round-trip must preserve every field"
            );
        }
    }

    #[cfg(feature = "recovery")]
    #[test]
    fn checkpoint_row_postcard_roundtrip_preserves_all_fields() {
        // `RedbCheckpointStore` stores rows as postcard, a binary format with stricter
        // expectations than JSON, so the row types are checked against it directly.
        let samples = [
            CheckpointRow::new(0, "Start", "t0"),
            CheckpointRow::new(1, "Pay", "charge").with_output(vec![1, 2, 3]),
            CheckpointRow::new(2, "Step", "stepper").with_cursor(vec![9, 8]),
            CheckpointRow::new(3, "Start", "t0").with_workflow_version(99),
        ];
        for original in &samples {
            let encoded = postcard::to_stdvec(original).expect("serialize");
            let decoded: CheckpointRow = postcard::from_bytes(&encoded).expect("deserialize");
            assert_eq!(
                &decoded, original,
                "postcard round-trip must preserve every field"
            );
        }
    }

    // ----- edge cases: RowKind / CheckpointRow builder semantics -----

    #[test]
    fn rowkind_default_is_state_entry() {
        let default_kind: RowKind = Default::default();
        assert_eq!(default_kind, RowKind::StateEntry);
    }

    #[test]
    fn with_output_then_with_workflow_version_is_order_independent() {
        let blob_first = CheckpointRow::new(0, "S", "t")
            .with_output(vec![7])
            .with_workflow_version(5);
        let version_first = CheckpointRow::new(0, "S", "t")
            .with_workflow_version(5)
            .with_output(vec![7]);

        assert_eq!(blob_first, version_first);
        assert_eq!(blob_first.kind, RowKind::CompensationCompletion);
        assert_eq!(blob_first.workflow_version, 5);
        assert_eq!(blob_first.output_blob, Some(vec![7]));
    }

    #[test]
    fn last_blob_builder_wins() {
        // `with_output` and `with_cursor` each replace both the blob and the kind, so the
        // call made last decides what the row ends up holding.
        let ends_with_cursor = CheckpointRow::new(0, "S", "t")
            .with_output(vec![1])
            .with_cursor(vec![2]);
        assert_eq!(ends_with_cursor.kind, RowKind::StepCursor);
        assert_eq!(ends_with_cursor.output_blob, Some(vec![2]));

        let ends_with_output = CheckpointRow::new(0, "S", "t")
            .with_cursor(vec![1])
            .with_output(vec![2]);
        assert_eq!(ends_with_output.kind, RowKind::CompensationCompletion);
        assert_eq!(ends_with_output.output_blob, Some(vec![2]));
    }

    #[test]
    fn with_output_empty_blob_is_some_not_none() {
        // A zero-length payload still counts as a blob: it is `Some(vec![])`, unlike the
        // `None` of a plain state-entry row, and the kind becomes CompensationCompletion.
        let empty_output = CheckpointRow::new(0, "S", "t").with_output(Vec::new());
        assert_eq!(empty_output.output_blob, Some(Vec::new()));
        assert_eq!(empty_output.kind, RowKind::CompensationCompletion);
    }

    // ----- edge cases: CheckpointStore contract -----

    #[tokio::test]
    async fn load_run_returns_rows_sorted_even_when_appended_out_of_order() {
        let store = InMemoryStore::default();
        let shuffled: [u64; 3] = [2, 0, 1];
        for seq in shuffled {
            let row = CheckpointRow::new(seq, "S", "t");
            store.append("run", row).await.unwrap();
        }

        let loaded = store.load_run("run").await.unwrap();
        assert_eq!(
            sequences(&loaded),
            vec![0, 1, 2],
            "load_run must return rows sorted ascending by sequence"
        );
    }

    #[tokio::test]
    async fn clear_isolates_by_workflow_id() {
        let store = InMemoryStore::default();
        for id in ["a", "b"] {
            store
                .append(id, CheckpointRow::new(0, "S", "t"))
                .await
                .unwrap();
        }

        store.clear("a").await.unwrap();

        let cleared = store.load_run("a").await.unwrap();
        assert!(cleared.is_empty());
        let untouched = store.load_run("b").await.unwrap();
        assert_eq!(
            untouched.len(),
            1,
            "clearing one id must not affect another"
        );
    }

    #[tokio::test]
    async fn shared_store_accepts_appends_from_many_tasks() {
        // One store behind `Arc<dyn CheckpointStore>` is handed to many spawned tasks. Each
        // distinct sequence must be stored, and `load_run` must still return them in order.
        let store: Arc<dyn CheckpointStore> = Arc::new(InMemoryStore::default());

        let handles: Vec<_> = (0..20u64)
            .map(|seq| {
                let shared = Arc::clone(&store);
                tokio::spawn(async move {
                    shared
                        .append("run", CheckpointRow::new(seq, "S", "t"))
                        .await
                        .unwrap();
                })
            })
            .collect();
        for handle in handles {
            handle.await.unwrap();
        }

        let loaded = store.load_run("run").await.unwrap();
        let expected: Vec<u64> = (0..20).collect();
        assert_eq!(sequences(&loaded), expected);
    }
}