Skip to main content

handoff/
drainable.rs

//! The `Drainable` trait — the consumer contract for opaque handoff-side
//! lifecycle hooks. The library calls these hooks in a defined order during a
//! handoff; the consumer (the primitive being handed off) implements them in
//! terms of its own writer state, accept loop, and shard layout.
5
6use std::time::Instant;
7
8use crate::error::Result;
9
10/// Lifecycle hooks the primitive must implement.
11///
12/// All methods are sync. Consumers that run on an async runtime bridge to it
13/// via channels — see `ARCHITECTURE.md` for the recommended pattern.
14///
15/// # Long-running hooks are fine
16///
17/// While `drain` and `seal` are executing, the incumbent runs a background
18/// thread that emits `Heartbeat` frames every ~2s on the control socket.
19/// The supervisor's per-recv liveness timeout (10s) is reset by each
20/// heartbeat, so a hook that takes 30s, 5 minutes, or longer will *not*
21/// trip a peer-dead timeout — only an unresponsive peer (no frames for
22/// over 10 seconds) will. The overall handoff is still bounded by
23/// `SpawnSpec::deadline` (5 minutes by default; size it above the p99 of
24/// `drain` + `seal` for your workload).
25pub trait Drainable: Send + Sync {
26    /// Stop accepting new connections, cancel background tasks, drain in-flight
27    /// requests, reject new writes. Reads on already-accepted connections may
28    /// continue. Must `fsync` before returning so no acked write is lost.
29    ///
30    /// Bounded by `deadline` (passed in) and by `SpawnSpec::drain_grace`
31    /// (wall-clock cap on the supervisor side). Slow-but-progressing drains
32    /// are kept alive by the library's heartbeat thread — there's no need
33    /// to artificially shorten the work to fit a tight timeout.
34    fn drain(&self, deadline: Instant) -> Result<DrainReport>;
35
36    /// Per shard: flush, write footer, fsync, close the active file. Release
37    /// the data-dir flock immediately on success (the library does this for
38    /// you by dropping its `DataDirLock` — your `seal` need only flush state).
39    ///
40    /// May take as long as the consumer needs. While `seal` is running, the
41    /// library emits heartbeats on the control socket so the supervisor's
42    /// liveness clock stays fresh; only the overall `SpawnSpec::deadline`
43    /// (default 5 minutes) caps the wall-clock duration.
44    fn seal(&self) -> Result<SealReport>;
45
46    /// Restart the accept loop after an aborted handoff. Called by the library
47    /// in every case where `drain` ran but `seal` either failed or never
48    /// committed: post-seal `Abort`/`ResumeAfterAbort`, post-`SealFailed`,
49    /// and supervisor-disconnect-while-drained. The implementation must be
50    /// idempotent and must restart accepting in all cases. If the
51    /// pre-handoff state included an open writer that `seal` closed, this is
52    /// also where it gets re-opened.
53    fn resume_after_abort(&self) -> Result<()>;
54
55    /// Best-effort introspection for diagnostics.
56    fn snapshot_state(&self) -> StateSnapshot;
57}
58
/// Outcome reported by a successful `Drainable::drain` call.
///
/// `Default` produces a report with zero remaining connections and
/// `accept_closed == false`. `PartialEq`/`Eq` are derived so reports can be
/// compared directly (for example with `assert_eq!` in consumer tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DrainReport {
    /// Presumably the number of connections still open when `drain`
    /// returned — confirm against the implementing primitive.
    pub open_conns_remaining: u32,
    /// Presumably `true` once the accept loop has been shut down — confirm
    /// against the implementing primitive.
    pub accept_closed: bool,
}
64
/// Outcome reported by a successful `Drainable::seal` call.
///
/// `Default` produces an empty revision list and an all-zero fingerprint.
/// `PartialEq`/`Eq` are derived so reports can be compared directly (for
/// example with `assert_eq!` in consumer tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SealReport {
    /// Presumably one entry per shard holding that shard's last revision at
    /// seal time — ordering/indexing is not defined in this file; confirm
    /// with the implementing primitive.
    pub last_revision_per_shard: Vec<u64>,
    /// 32-byte fingerprint of the data directory. How it is computed is not
    /// specified in this file.
    pub data_dir_fingerprint: [u8; 32],
}
70
/// Best-effort view of the primitive's state, as returned by
/// `Drainable::snapshot_state` for diagnostics.
///
/// `Default` produces zero counts and an empty revision list.
/// `PartialEq`/`Eq` are derived so snapshots can be compared directly (for
/// example with `assert_eq!` in consumer tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct StateSnapshot {
    /// Number of shards reported by the primitive.
    pub shard_count: u32,
    /// Presumably the number of currently open connections — confirm with
    /// the implementing primitive.
    pub open_conns: u32,
    /// Presumably one entry per shard holding that shard's latest revision —
    /// confirm ordering/indexing with the implementing primitive.
    pub last_revision_per_shard: Vec<u64>,
}
77
/// Readiness information about a process taking part in a handoff.
///
/// NOTE(review): nothing in this file produces or consumes this type;
/// presumably it is filled in by a readiness probe elsewhere in the crate —
/// confirm against the caller.
///
/// `Default` produces empty lists and `healthz_ok == false`.
/// `PartialEq`/`Eq` are derived so snapshots can be compared directly (for
/// example with `assert_eq!` in consumer tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReadinessSnapshot {
    /// Presumably the addresses the process is listening on, as strings —
    /// the exact format is not defined in this file.
    pub listening_on: Vec<String>,
    /// Presumably whether the health check passed — confirm against the
    /// producer of this snapshot.
    pub healthz_ok: bool,
    /// Presumably one entry per shard holding the revision the process
    /// advertises — confirm ordering/indexing with the producer.
    pub advertised_revision_per_shard: Vec<u64>,
}