handoff/drainable.rs
1//! The `Drainable` trait — the consumer contract for opaque handoff-side
2//! lifecycle hooks. The library calls these in a defined order over a handoff;
3//! the consumer (the primitive being handed off) implements them in terms of
4//! its own writer state, accept loop, and shard layout.
5
6use std::time::Instant;
7
8use crate::error::Result;
9
10/// Lifecycle hooks the primitive must implement.
11///
12/// All methods are sync. Consumers that run on an async runtime bridge to it
13/// via channels — see `ARCHITECTURE.md` for the recommended pattern.
14///
15/// # Long-running hooks are fine
16///
17/// While `drain` and `seal` are executing, the incumbent runs a background
18/// thread that emits `Heartbeat` frames every ~2s on the control socket.
19/// The supervisor's per-recv liveness timeout (10s) is reset by each
20/// heartbeat, so a hook that takes 30s, 5 minutes, or longer will *not*
21/// trip a peer-dead timeout — only an unresponsive peer (no frames for
22/// over 10 seconds) will. The overall handoff is still bounded by
23/// `SpawnSpec::deadline` (5 minutes by default; size it above the p99 of
24/// `drain` + `seal` for your workload).
25pub trait Drainable: Send + Sync {
26 /// Stop accepting new connections, cancel background tasks, drain in-flight
27 /// requests, reject new writes. Reads on already-accepted connections may
28 /// continue. Must `fsync` before returning so no acked write is lost.
29 ///
30 /// Bounded by `deadline` (passed in) and by `SpawnSpec::drain_grace`
31 /// (wall-clock cap on the supervisor side). Slow-but-progressing drains
32 /// are kept alive by the library's heartbeat thread — there's no need
33 /// to artificially shorten the work to fit a tight timeout.
34 fn drain(&self, deadline: Instant) -> Result<DrainReport>;
35
36 /// Per shard: flush, write footer, fsync, close the active file. Release
37 /// the data-dir flock immediately on success (the library does this for
38 /// you by dropping its `DataDirLock` — your `seal` need only flush state).
39 ///
40 /// May take as long as the consumer needs. While `seal` is running, the
41 /// library emits heartbeats on the control socket so the supervisor's
42 /// liveness clock stays fresh; only the overall `SpawnSpec::deadline`
43 /// (default 5 minutes) caps the wall-clock duration.
44 fn seal(&self) -> Result<SealReport>;
45
46 /// Restart the accept loop after an aborted handoff. Called by the library
47 /// in every case where `drain` ran but `seal` either failed or never
48 /// committed: post-seal `Abort`/`ResumeAfterAbort`, post-`SealFailed`,
49 /// and supervisor-disconnect-while-drained. The implementation must be
50 /// idempotent and must restart accepting in all cases. If the
51 /// pre-handoff state included an open writer that `seal` closed, this is
52 /// also where it gets re-opened.
53 fn resume_after_abort(&self) -> Result<()>;
54
55 /// Best-effort introspection for diagnostics.
56 fn snapshot_state(&self) -> StateSnapshot;
57}
58
/// Summary handed back by a successful `Drainable::drain`.
///
/// Derives `PartialEq`/`Eq` so reports can be compared directly (for example
/// with `assert_eq!` in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DrainReport {
    /// NOTE(review): presumably the number of connections still open when
    /// `drain` returned — confirm against implementors.
    pub open_conns_remaining: u32,
    /// NOTE(review): presumably whether the accept loop was shut down by
    /// `drain` — confirm against implementors.
    pub accept_closed: bool,
}
64
/// Summary handed back by a successful `Drainable::seal`.
///
/// Derives `PartialEq`/`Eq` so reports can be compared directly (for example
/// with `assert_eq!` in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SealReport {
    /// NOTE(review): presumably one entry per shard holding the last revision
    /// sealed for that shard — confirm ordering/indexing against implementors.
    pub last_revision_per_shard: Vec<u64>,
    /// 32-byte fingerprint of the data directory. How it is computed is not
    /// visible in this file.
    pub data_dir_fingerprint: [u8; 32],
}
70
/// Best-effort diagnostic view returned by `Drainable::snapshot_state`.
///
/// Derives `PartialEq`/`Eq` so snapshots can be compared directly (for example
/// with `assert_eq!` in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct StateSnapshot {
    /// NOTE(review): presumably the number of shards the primitive manages —
    /// confirm against implementors.
    pub shard_count: u32,
    /// NOTE(review): presumably the number of currently open connections —
    /// confirm against implementors.
    pub open_conns: u32,
    /// NOTE(review): presumably one entry per shard holding that shard's last
    /// revision — confirm ordering/indexing against implementors.
    pub last_revision_per_shard: Vec<u64>,
}
77
/// Readiness information: listening addresses, health-check status, and
/// per-shard advertised revisions.
///
/// NOTE(review): this type is not referenced by the `Drainable` trait; its
/// producer and consumer are not visible in this file.
///
/// Derives `PartialEq`/`Eq` so snapshots can be compared directly (for example
/// with `assert_eq!` in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReadinessSnapshot {
    /// NOTE(review): presumably the addresses currently being listened on, in
    /// string form — confirm the exact format against the producer.
    pub listening_on: Vec<String>,
    /// NOTE(review): presumably whether the health check passed — confirm.
    pub healthz_ok: bool,
    /// NOTE(review): presumably one advertised revision per shard — confirm
    /// ordering/indexing against the producer.
    pub advertised_revision_per_shard: Vec<u64>,
}