Skip to main content

reddb_server/cluster/
control_plane.rs

1//! The Cluster Supervisor control-plane consensus boundary (issue #996,
2//! PRD #987, ADR 0052).
3//!
4//! This module pins *the decision*, in types, for how the Cluster Supervisor
5//! coordinates: a **Raft-equivalent control-plane consensus layer** carries
6//! Supervisor membership, leader election, durable vote/log state, and global
7//! ownership-catalog transitions — and **nothing else**. It is the small
8//! internal abstraction the HITL decision asked for, so follow-up slices build
9//! against a fixed boundary instead of re-opening the protocol choice or
10//! picking a consensus library (ADR 0052).
11//!
12//! ## The one boundary this module exists to enforce
13//!
14//! **User-data writes never enter the control-plane log.** That is not a
15//! convention here — it is structural. [`ControlPlaneEntry`] is the *only*
16//! thing that may be appended through [`ControlPlaneConsensus`], and it has no
17//! variant that can carry a row, a document, a queue message, or any other
18//! user payload. A user write is unrepresentable in this log by construction,
19//! so no future slice can accidentally route durable user data through
20//! Supervisor consensus and inherit its latency/availability coupling
21//! (PRD #987 user story 11; ADR 0030 "without running data payloads through a
22//! Raft log").
23//!
24//! ## What the control-plane log *does* carry
25//!
26//! Exactly the global control-plane facts that need one safe writer and a
27//! totally-ordered, durable, replicated history:
28//!
//! * [`ControlPlaneEntry::MembershipChange`] — a member admitted to the
//!   authorized set, or removed from it once it has finished draining
//!   ([`super::membership`]).
31//! * [`ControlPlaneEntry::OwnershipTransition`] — a fenced, versioned change to
32//!   the shard/range ownership catalog (ADR 0037). The Supervisor leader is the
33//!   *normal* writer for these (PRD #987 user story 12, the glossary's "Shard
34//!   ownership catalog").
35//! * [`ControlPlaneEntry::LeaderConfiguration`] — a record that a Supervisor
36//!   term elected a leader, so the elected-term history is itself durable.
37//!
38//! ## Relationship to the existing election primitives
39//!
40//! The Raft-equivalent *vote-safety* mechanics already exist for primary
41//! election in [`crate::replication::election`] — term bump, durable last-vote,
42//! majority-quorum, "no two leaders in a term". The Cluster Supervisor reuses
43//! those same mechanics for its own leader election (ADR 0030's decoupled
44//! supervisor). What is *new* here, and what this module names, is the
45//! **control-plane log**: the replicated, ordered entry stream the elected
46//! leader appends to. The trait keeps the concrete engine — single-node now, a
47//! full replicated log later — behind one seam.
48//!
49//! Everything in this module is a pure data model plus one trait, with no I/O,
50//! so the boundary is exercised deterministically. A minimal in-memory engine
51//! ([`SingleNodeControlPlane`]) implements the trait so the leader-only-append
52//! and no-user-data invariants are tested, not merely asserted in prose.
53
54use super::identity::ClusterVoterIdentity;
55
/// A Supervisor election term: a strictly increasing generation number used to
/// fence a stale leader. Each election round mints a new term, exactly as in
/// [`crate::replication::election`]. The design intent is that control-plane
/// log entries are attributable to the term that produced them, so entries
/// written by a deposed leader are detectable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct ControlPlaneTerm(pub u64);

impl ControlPlaneTerm {
    /// The genesis term, in force before any election has run.
    pub const GENESIS: ControlPlaneTerm = ControlPlaneTerm(0);

    /// Returns the term a candidate stands for next: this term plus one.
    ///
    /// # Panics
    ///
    /// Panics if this term is already `u64::MAX`. A plain `+ 1` would wrap to
    /// [`ControlPlaneTerm::GENESIS`] in builds without overflow checks, making
    /// the "new" term compare lower than every term before it and silently
    /// defeating the monotonic fence. Failing loudly is the only safe outcome.
    #[must_use]
    pub fn next(self) -> ControlPlaneTerm {
        let bumped = self
            .0
            .checked_add(1)
            .expect("control-plane term counter exhausted; refusing to wrap to a lower term");
        ControlPlaneTerm(bumped)
    }
}
72
/// A slot in the control-plane log.
///
/// Because the log is append-only and totally ordered, one index identifies
/// exactly one committed control-plane fact. Indices compare by their wrapped
/// `u64`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ControlPlaneIndex(pub u64);
77
78/// The durable per-node vote state every voting member must persist *before*
79/// acknowledging a vote, so a member that crashes and restarts mid-term cannot
80/// double-vote and split a term (ADR 0030: "the supervisor needs durable
81/// per-node vote state (last-vote) to prevent double-voting across restarts").
82///
83/// This is the control-plane analogue of
84/// [`crate::replication::election::LastVote`]; it is named here as a *required*
85/// part of the consensus boundary so a follow-up slice cannot ship a leader
86/// election without it.
87#[derive(Debug, Clone, PartialEq, Eq)]
88pub struct DurableVoteState {
89    /// The highest term this member has voted in.
90    pub term: ControlPlaneTerm,
91    /// Who this member granted its vote to in `term`, if anyone. `None` means
92    /// the member has seen `term` but not yet voted in it.
93    pub voted_for: Option<ClusterVoterIdentity>,
94}
95
96impl DurableVoteState {
97    /// The vote state of a member that has never voted.
98    pub fn initial() -> Self {
99        Self {
100            term: ControlPlaneTerm::GENESIS,
101            voted_for: None,
102        }
103    }
104}
105
106/// The shard/range ownership-catalog change recorded by a single control-plane
107/// log entry. This is deliberately the *fact* of a fenced transition, not the
108/// full catalog schema (that lives with ADR 0037 / ADR 0045 follow-ups): the
109/// boundary this module fixes is only that ownership transitions are
110/// control-plane log entries written by the leader, with an ownership epoch
111/// that fences the old owner.
112#[derive(Debug, Clone, PartialEq, Eq)]
113pub struct OwnershipTransition {
114    /// The range whose ownership is changing, named opaquely here so this
115    /// boundary type does not pin the range-id encoding the catalog slice owns.
116    pub range: String,
117    /// The new owner after the transition.
118    pub new_owner: ClusterVoterIdentity,
119    /// The ownership epoch the transition bumps to. A monotonic epoch is what
120    /// fences a stale owner that reappears (ADR 0037 "Fencing is enforced below
121    /// routing"); recording it in the consensus log makes the fence durable.
122    pub ownership_epoch: u64,
123}
124
125/// A change to the authorized-member set, recorded durably so membership is
126/// agreed control-plane state rather than each node's local guess
127/// ([`super::membership`]).
128#[derive(Debug, Clone, PartialEq, Eq)]
129pub enum MembershipChange {
130    /// A member was admitted to the cluster (after the [`super::join`]
131    /// handshake authorized it).
132    Admit(ClusterVoterIdentity),
133    /// A member finished draining and was removed from the authorized set.
134    Remove(ClusterVoterIdentity),
135}
136
137/// The complete, closed set of things that may be appended to the Cluster
138/// Supervisor control-plane log.
139///
140/// There is **no user-data variant on purpose**. The absence is the enforcement
141/// mechanism for the central decision (ADR 0052): durable user writes cannot be
142/// expressed as a control-plane entry, so they cannot be routed through, gated
143/// by, or made to wait on Supervisor consensus. Adding a user-payload variant
144/// here would be a decision reversal and must go through a new ADR, not a code
145/// change.
146#[derive(Debug, Clone, PartialEq, Eq)]
147pub enum ControlPlaneEntry {
148    /// A member admitted to or removed from the cluster.
149    MembershipChange(MembershipChange),
150    /// A fenced, versioned shard/range ownership-catalog transition — the
151    /// Supervisor leader's normal write.
152    OwnershipTransition(OwnershipTransition),
153    /// A record that `term` elected `leader`, keeping the elected-term history
154    /// durable alongside the entries that term produced.
155    LeaderConfiguration {
156        /// The Supervisor term that elected the leader.
157        term: ControlPlaneTerm,
158        /// The member elected leader for `term`.
159        leader: ClusterVoterIdentity,
160    },
161}
162
163/// Why an attempt to append to the control-plane log was refused.
164#[derive(Debug, Clone, PartialEq, Eq)]
165pub enum ControlPlaneError {
166    /// The caller is not the current Supervisor leader. Only the elected leader
167    /// is the normal writer for control-plane entries (PRD #987 user story 12);
168    /// a follower that tries to append is told who the leader is (if known) so
169    /// it can forward.
170    NotLeader {
171        /// The current leader, if this node knows one.
172        leader: Option<ClusterVoterIdentity>,
173    },
174}
175
176impl std::fmt::Display for ControlPlaneError {
177    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
178        match self {
179            Self::NotLeader { leader: Some(l) } => {
180                write!(f, "not the control-plane leader; current leader is {l}")
181            }
182            Self::NotLeader { leader: None } => {
183                write!(f, "not the control-plane leader; no leader currently known")
184            }
185        }
186    }
187}
188
189impl std::error::Error for ControlPlaneError {}
190
191/// The small internal abstraction over the Cluster Supervisor control-plane
192/// consensus engine.
193///
194/// This trait is the seam the HITL decision asked for: follow-up slices depend
195/// on *this*, not on a concrete Raft library or hand-rolled protocol. The first
196/// implementation may be a degenerate single-node engine
197/// ([`SingleNodeControlPlane`]); a later slice may back it with a fully
198/// replicated, persisted log. Neither change reaches the callers.
199///
200/// The trait deliberately exposes only what the boundary needs:
201///
202/// * read the current [`ControlPlaneTerm`] and the elected [`leader`];
203/// * append a [`ControlPlaneEntry`] — **leader-only**, which is how "the
204///   Supervisor leader is the normal writer for ownership transitions" is
205///   enforced rather than documented; and
206/// * read the durable [`DurableVoteState`], so the election safety obligation
207///   is part of the contract.
208///
209/// [`leader`]: ControlPlaneConsensus::leader
210pub trait ControlPlaneConsensus {
211    /// The current Supervisor election term.
212    fn current_term(&self) -> ControlPlaneTerm;
213
214    /// The current Supervisor leader, if one is elected for [`current_term`].
215    ///
216    /// [`current_term`]: ControlPlaneConsensus::current_term
217    fn leader(&self) -> Option<ClusterVoterIdentity>;
218
219    /// Is *this* node the current Supervisor leader?
220    fn is_leader(&self) -> bool;
221
222    /// The durable per-node vote state, persisted before acknowledging a vote.
223    fn durable_vote(&self) -> DurableVoteState;
224
225    /// The index of the highest committed entry, or `None` if the log is empty.
226    fn commit_index(&self) -> Option<ControlPlaneIndex>;
227
228    /// Append a control-plane entry as the leader.
229    ///
230    /// Returns the committed [`ControlPlaneIndex`] on success, or
231    /// [`ControlPlaneError::NotLeader`] if this node is not the leader — a
232    /// follower must forward the request to the leader rather than write
233    /// locally. By construction, the `entry` cannot carry user data.
234    fn append(&mut self, entry: ControlPlaneEntry) -> Result<ControlPlaneIndex, ControlPlaneError>;
235}
236
237/// A minimal single-node control-plane engine: this node is the sole voter and
238/// therefore the leader of term 1. It exists to (a) give the first cut a usable
239/// implementation of the boundary, and (b) make the leader-only-append and
240/// no-user-data invariants executable tests rather than prose.
241///
242/// It is intentionally *not* the replicated engine — that is a later slice.
243/// What matters is that the later engine swaps in behind [`ControlPlaneConsensus`]
244/// without touching callers.
245#[derive(Debug)]
246pub struct SingleNodeControlPlane {
247    identity: ClusterVoterIdentity,
248    term: ControlPlaneTerm,
249    is_leader: bool,
250    vote: DurableVoteState,
251    log: Vec<ControlPlaneEntry>,
252}
253
254impl SingleNodeControlPlane {
255    /// A single-node control plane that has elected `identity` as the leader of
256    /// term 1, voting for itself. This is the trivial-but-correct quorum: a
257    /// one-member majority.
258    pub fn bootstrap_leader(identity: ClusterVoterIdentity) -> Self {
259        let term = ControlPlaneTerm::GENESIS.next();
260        Self {
261            term,
262            is_leader: true,
263            vote: DurableVoteState {
264                term,
265                voted_for: Some(identity.clone()),
266            },
267            identity,
268            log: Vec::new(),
269        }
270    }
271
272    /// The entries committed so far, in log order.
273    pub fn entries(&self) -> &[ControlPlaneEntry] {
274        &self.log
275    }
276}
277
278impl ControlPlaneConsensus for SingleNodeControlPlane {
279    fn current_term(&self) -> ControlPlaneTerm {
280        self.term
281    }
282
283    fn leader(&self) -> Option<ClusterVoterIdentity> {
284        self.is_leader.then(|| self.identity.clone())
285    }
286
287    fn is_leader(&self) -> bool {
288        self.is_leader
289    }
290
291    fn durable_vote(&self) -> DurableVoteState {
292        self.vote.clone()
293    }
294
295    fn commit_index(&self) -> Option<ControlPlaneIndex> {
296        self.log
297            .len()
298            .checked_sub(1)
299            .map(|i| ControlPlaneIndex(i as u64))
300    }
301
302    fn append(&mut self, entry: ControlPlaneEntry) -> Result<ControlPlaneIndex, ControlPlaneError> {
303        if !self.is_leader {
304            return Err(ControlPlaneError::NotLeader {
305                leader: self.leader(),
306            });
307        }
308        self.log.push(entry);
309        Ok(ControlPlaneIndex(self.log.len() as u64 - 1))
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a voter identity from a certificate subject, unwrapping the
    /// result.
    fn voter(cn: &str) -> ClusterVoterIdentity {
        ClusterVoterIdentity::from_certificate_subject(cn).unwrap()
    }

    /// A freshly bootstrapped engine whose leader is `CN=node-a`.
    fn node_a_plane() -> SingleNodeControlPlane {
        SingleNodeControlPlane::bootstrap_leader(voter("CN=node-a"))
    }

    #[test]
    fn bootstrap_leader_elects_itself_in_term_one() {
        let node_a = voter("CN=node-a");
        let plane = SingleNodeControlPlane::bootstrap_leader(node_a.clone());

        // The durable vote must be for itself, in the term it won.
        let expected_vote = DurableVoteState {
            term: ControlPlaneTerm(1),
            voted_for: Some(node_a.clone()),
        };

        assert_eq!(plane.current_term(), ControlPlaneTerm(1));
        assert!(plane.is_leader());
        assert_eq!(plane.leader(), Some(node_a));
        assert_eq!(plane.durable_vote(), expected_vote);
        // Nothing has been appended, so there is no commit index yet.
        assert_eq!(plane.commit_index(), None);
    }

    #[test]
    fn leader_appends_ownership_transition_and_commit_index_advances() {
        let mut plane = node_a_plane();
        let transition = OwnershipTransition {
            range: "users:[0,1000)".to_string(),
            new_owner: voter("CN=node-b"),
            ownership_epoch: 7,
        };

        let committed = plane
            .append(ControlPlaneEntry::OwnershipTransition(transition))
            .expect("leader may append");

        assert_eq!(committed, ControlPlaneIndex(0));
        assert_eq!(plane.commit_index(), Some(ControlPlaneIndex(0)));
        assert_eq!(plane.entries().len(), 1);
    }

    #[test]
    fn membership_and_leader_config_entries_are_ordered() {
        let mut plane = node_a_plane();
        let leader_record = ControlPlaneEntry::LeaderConfiguration {
            term: ControlPlaneTerm(1),
            leader: voter("CN=node-a"),
        };
        let admit_b = ControlPlaneEntry::MembershipChange(MembershipChange::Admit(voter(
            "CN=node-b",
        )));

        let earlier = plane.append(leader_record).unwrap();
        let later = plane.append(admit_b).unwrap();

        assert!(later > earlier);
        assert_eq!(plane.commit_index(), Some(later));
    }

    #[test]
    fn a_follower_may_not_append() {
        // Force the non-leader path. A node that is not the leader must refuse
        // to write the control-plane log. This single-node engine knows of no
        // leader other than itself, so the refusal carries `leader: None`.
        let mut plane = node_a_plane();
        plane.is_leader = false;

        let remove_b = ControlPlaneEntry::MembershipChange(MembershipChange::Remove(voter(
            "CN=node-b",
        )));
        let refusal = plane
            .append(remove_b)
            .expect_err("a follower must not append");

        assert_eq!(refusal, ControlPlaneError::NotLeader { leader: None });
    }
}
395}