reddb_server/cluster/control_plane.rs
1//! The Cluster Supervisor control-plane consensus boundary (issue #996,
2//! PRD #987, ADR 0052).
3//!
4//! This module pins *the decision*, in types, for how the Cluster Supervisor
5//! coordinates: a **Raft-equivalent control-plane consensus layer** carries
6//! Supervisor membership, leader election, durable vote/log state, and global
7//! ownership-catalog transitions — and **nothing else**. It is the small
8//! internal abstraction the HITL decision asked for, so follow-up slices build
9//! against a fixed boundary instead of re-opening the protocol choice or
10//! picking a consensus library (ADR 0052).
11//!
12//! ## The one boundary this module exists to enforce
13//!
14//! **User-data writes never enter the control-plane log.** That is not a
15//! convention here — it is structural. [`ControlPlaneEntry`] is the *only*
16//! thing that may be appended through [`ControlPlaneConsensus`], and it has no
17//! variant that can carry a row, a document, a queue message, or any other
18//! user payload. A user write is unrepresentable in this log by construction,
19//! so no future slice can accidentally route durable user data through
20//! Supervisor consensus and inherit its latency/availability coupling
21//! (PRD #987 user story 11; ADR 0030 "without running data payloads through a
22//! Raft log").
23//!
24//! ## What the control-plane log *does* carry
25//!
26//! Exactly the global control-plane facts that need one safe writer and a
27//! totally-ordered, durable, replicated history:
28//!
//! * [`ControlPlaneEntry::MembershipChange`] — a member admitted to the
//!   authorized set, or removed from it once it has finished draining
//!   ([`super::membership`]).
31//! * [`ControlPlaneEntry::OwnershipTransition`] — a fenced, versioned change to
32//! the shard/range ownership catalog (ADR 0037). The Supervisor leader is the
33//! *normal* writer for these (PRD #987 user story 12, the glossary's "Shard
34//! ownership catalog").
35//! * [`ControlPlaneEntry::LeaderConfiguration`] — a record that a Supervisor
36//! term elected a leader, so the elected-term history is itself durable.
37//!
38//! ## Relationship to the existing election primitives
39//!
40//! The Raft-equivalent *vote-safety* mechanics already exist for primary
41//! election in [`crate::replication::election`] — term bump, durable last-vote,
42//! majority-quorum, "no two leaders in a term". The Cluster Supervisor reuses
43//! those same mechanics for its own leader election (ADR 0030's decoupled
44//! supervisor). What is *new* here, and what this module names, is the
45//! **control-plane log**: the replicated, ordered entry stream the elected
46//! leader appends to. The trait keeps the concrete engine — single-node now, a
47//! full replicated log later — behind one seam.
48//!
49//! Everything in this module is a pure data model plus one trait, with no I/O,
50//! so the boundary is exercised deterministically. A minimal in-memory engine
51//! ([`SingleNodeControlPlane`]) implements the trait so the leader-only-append
52//! and no-user-data invariants are tested, not merely asserted in prose.
53
54use super::identity::ClusterVoterIdentity;
55
/// A Supervisor election term: a strictly monotonic generation number that
/// fences a stale leader.
///
/// Each election round mints a new term, exactly as in
/// [`crate::replication::election`]. The intent is that control-plane entries
/// can be attributed to the term that produced them, so a deposed leader's
/// entries are detectable.
///
/// The derived ordering compares the wrapped `u64`, and the derived `Default`
/// is term `0`, i.e. [`ControlPlaneTerm::GENESIS`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct ControlPlaneTerm(pub u64);

impl ControlPlaneTerm {
    /// The genesis term, before any election has run.
    pub const GENESIS: ControlPlaneTerm = ControlPlaneTerm(0);

    /// The next term a candidate stands for: this term plus one.
    ///
    /// # Panics
    ///
    /// Panics if this term is already `u64::MAX`. A plain `+ 1` would wrap
    /// back to [`ControlPlaneTerm::GENESIS`] in builds without overflow
    /// checks, silently breaking the monotonicity that fencing relies on, so
    /// exhaustion is made a loud failure in every build profile.
    pub fn next(self) -> ControlPlaneTerm {
        let bumped = self
            .0
            .checked_add(1)
            .expect("control-plane term overflow: u64 term space exhausted");
        ControlPlaneTerm(bumped)
    }
}
72
/// Names one position in the control-plane log.
///
/// Because the log is append-only and totally ordered, a single index
/// identifies exactly one committed control-plane fact. Indices compare by
/// their wrapped `u64`.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ControlPlaneIndex(pub u64);
77
78/// The durable per-node vote state every voting member must persist *before*
79/// acknowledging a vote, so a member that crashes and restarts mid-term cannot
80/// double-vote and split a term (ADR 0030: "the supervisor needs durable
81/// per-node vote state (last-vote) to prevent double-voting across restarts").
82///
83/// This is the control-plane analogue of
84/// [`crate::replication::election::LastVote`]; it is named here as a *required*
85/// part of the consensus boundary so a follow-up slice cannot ship a leader
86/// election without it.
87#[derive(Debug, Clone, PartialEq, Eq)]
88pub struct DurableVoteState {
89 /// The highest term this member has voted in.
90 pub term: ControlPlaneTerm,
91 /// Who this member granted its vote to in `term`, if anyone. `None` means
92 /// the member has seen `term` but not yet voted in it.
93 pub voted_for: Option<ClusterVoterIdentity>,
94}
95
96impl DurableVoteState {
97 /// The vote state of a member that has never voted.
98 pub fn initial() -> Self {
99 Self {
100 term: ControlPlaneTerm::GENESIS,
101 voted_for: None,
102 }
103 }
104}
105
106/// The shard/range ownership-catalog change recorded by a single control-plane
107/// log entry. This is deliberately the *fact* of a fenced transition, not the
108/// full catalog schema (that lives with ADR 0037 / ADR 0045 follow-ups): the
109/// boundary this module fixes is only that ownership transitions are
110/// control-plane log entries written by the leader, with an ownership epoch
111/// that fences the old owner.
112#[derive(Debug, Clone, PartialEq, Eq)]
113pub struct OwnershipTransition {
114 /// The range whose ownership is changing, named opaquely here so this
115 /// boundary type does not pin the range-id encoding the catalog slice owns.
116 pub range: String,
117 /// The new owner after the transition.
118 pub new_owner: ClusterVoterIdentity,
119 /// The ownership epoch the transition bumps to. A monotonic epoch is what
120 /// fences a stale owner that reappears (ADR 0037 "Fencing is enforced below
121 /// routing"); recording it in the consensus log makes the fence durable.
122 pub ownership_epoch: u64,
123}
124
125/// A change to the authorized-member set, recorded durably so membership is
126/// agreed control-plane state rather than each node's local guess
127/// ([`super::membership`]).
128#[derive(Debug, Clone, PartialEq, Eq)]
129pub enum MembershipChange {
130 /// A member was admitted to the cluster (after the [`super::join`]
131 /// handshake authorized it).
132 Admit(ClusterVoterIdentity),
133 /// A member finished draining and was removed from the authorized set.
134 Remove(ClusterVoterIdentity),
135}
136
137/// The complete, closed set of things that may be appended to the Cluster
138/// Supervisor control-plane log.
139///
140/// There is **no user-data variant on purpose**. The absence is the enforcement
141/// mechanism for the central decision (ADR 0052): durable user writes cannot be
142/// expressed as a control-plane entry, so they cannot be routed through, gated
143/// by, or made to wait on Supervisor consensus. Adding a user-payload variant
144/// here would be a decision reversal and must go through a new ADR, not a code
145/// change.
146#[derive(Debug, Clone, PartialEq, Eq)]
147pub enum ControlPlaneEntry {
148 /// A member admitted to or removed from the cluster.
149 MembershipChange(MembershipChange),
150 /// A fenced, versioned shard/range ownership-catalog transition — the
151 /// Supervisor leader's normal write.
152 OwnershipTransition(OwnershipTransition),
153 /// A record that `term` elected `leader`, keeping the elected-term history
154 /// durable alongside the entries that term produced.
155 LeaderConfiguration {
156 /// The Supervisor term that elected the leader.
157 term: ControlPlaneTerm,
158 /// The member elected leader for `term`.
159 leader: ClusterVoterIdentity,
160 },
161}
162
163/// Why an attempt to append to the control-plane log was refused.
164#[derive(Debug, Clone, PartialEq, Eq)]
165pub enum ControlPlaneError {
166 /// The caller is not the current Supervisor leader. Only the elected leader
167 /// is the normal writer for control-plane entries (PRD #987 user story 12);
168 /// a follower that tries to append is told who the leader is (if known) so
169 /// it can forward.
170 NotLeader {
171 /// The current leader, if this node knows one.
172 leader: Option<ClusterVoterIdentity>,
173 },
174}
175
176impl std::fmt::Display for ControlPlaneError {
177 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
178 match self {
179 Self::NotLeader { leader: Some(l) } => {
180 write!(f, "not the control-plane leader; current leader is {l}")
181 }
182 Self::NotLeader { leader: None } => {
183 write!(f, "not the control-plane leader; no leader currently known")
184 }
185 }
186 }
187}
188
189impl std::error::Error for ControlPlaneError {}
190
191/// The small internal abstraction over the Cluster Supervisor control-plane
192/// consensus engine.
193///
194/// This trait is the seam the HITL decision asked for: follow-up slices depend
195/// on *this*, not on a concrete Raft library or hand-rolled protocol. The first
196/// implementation may be a degenerate single-node engine
197/// ([`SingleNodeControlPlane`]); a later slice may back it with a fully
198/// replicated, persisted log. Neither change reaches the callers.
199///
200/// The trait deliberately exposes only what the boundary needs:
201///
202/// * read the current [`ControlPlaneTerm`] and the elected [`leader`];
203/// * append a [`ControlPlaneEntry`] — **leader-only**, which is how "the
204/// Supervisor leader is the normal writer for ownership transitions" is
205/// enforced rather than documented; and
206/// * read the durable [`DurableVoteState`], so the election safety obligation
207/// is part of the contract.
208///
209/// [`leader`]: ControlPlaneConsensus::leader
210pub trait ControlPlaneConsensus {
211 /// The current Supervisor election term.
212 fn current_term(&self) -> ControlPlaneTerm;
213
214 /// The current Supervisor leader, if one is elected for [`current_term`].
215 ///
216 /// [`current_term`]: ControlPlaneConsensus::current_term
217 fn leader(&self) -> Option<ClusterVoterIdentity>;
218
219 /// Is *this* node the current Supervisor leader?
220 fn is_leader(&self) -> bool;
221
222 /// The durable per-node vote state, persisted before acknowledging a vote.
223 fn durable_vote(&self) -> DurableVoteState;
224
225 /// The index of the highest committed entry, or `None` if the log is empty.
226 fn commit_index(&self) -> Option<ControlPlaneIndex>;
227
228 /// Append a control-plane entry as the leader.
229 ///
230 /// Returns the committed [`ControlPlaneIndex`] on success, or
231 /// [`ControlPlaneError::NotLeader`] if this node is not the leader — a
232 /// follower must forward the request to the leader rather than write
233 /// locally. By construction, the `entry` cannot carry user data.
234 fn append(&mut self, entry: ControlPlaneEntry) -> Result<ControlPlaneIndex, ControlPlaneError>;
235}
236
237/// A minimal single-node control-plane engine: this node is the sole voter and
238/// therefore the leader of term 1. It exists to (a) give the first cut a usable
239/// implementation of the boundary, and (b) make the leader-only-append and
240/// no-user-data invariants executable tests rather than prose.
241///
242/// It is intentionally *not* the replicated engine — that is a later slice.
243/// What matters is that the later engine swaps in behind [`ControlPlaneConsensus`]
244/// without touching callers.
245#[derive(Debug)]
246pub struct SingleNodeControlPlane {
247 identity: ClusterVoterIdentity,
248 term: ControlPlaneTerm,
249 is_leader: bool,
250 vote: DurableVoteState,
251 log: Vec<ControlPlaneEntry>,
252}
253
254impl SingleNodeControlPlane {
255 /// A single-node control plane that has elected `identity` as the leader of
256 /// term 1, voting for itself. This is the trivial-but-correct quorum: a
257 /// one-member majority.
258 pub fn bootstrap_leader(identity: ClusterVoterIdentity) -> Self {
259 let term = ControlPlaneTerm::GENESIS.next();
260 Self {
261 term,
262 is_leader: true,
263 vote: DurableVoteState {
264 term,
265 voted_for: Some(identity.clone()),
266 },
267 identity,
268 log: Vec::new(),
269 }
270 }
271
272 /// The entries committed so far, in log order.
273 pub fn entries(&self) -> &[ControlPlaneEntry] {
274 &self.log
275 }
276}
277
278impl ControlPlaneConsensus for SingleNodeControlPlane {
279 fn current_term(&self) -> ControlPlaneTerm {
280 self.term
281 }
282
283 fn leader(&self) -> Option<ClusterVoterIdentity> {
284 self.is_leader.then(|| self.identity.clone())
285 }
286
287 fn is_leader(&self) -> bool {
288 self.is_leader
289 }
290
291 fn durable_vote(&self) -> DurableVoteState {
292 self.vote.clone()
293 }
294
295 fn commit_index(&self) -> Option<ControlPlaneIndex> {
296 self.log
297 .len()
298 .checked_sub(1)
299 .map(|i| ControlPlaneIndex(i as u64))
300 }
301
302 fn append(&mut self, entry: ControlPlaneEntry) -> Result<ControlPlaneIndex, ControlPlaneError> {
303 if !self.is_leader {
304 return Err(ControlPlaneError::NotLeader {
305 leader: self.leader(),
306 });
307 }
308 self.log.push(entry);
309 Ok(ControlPlaneIndex(self.log.len() as u64 - 1))
310 }
311}
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a voter identity from a certificate subject, panicking if
    /// construction fails.
    fn voter(cn: &str) -> ClusterVoterIdentity {
        ClusterVoterIdentity::from_certificate_subject(cn).unwrap()
    }

    #[test]
    fn bootstrap_leader_elects_itself_in_term_one() {
        let node_a = voter("CN=node-a");
        let plane = SingleNodeControlPlane::bootstrap_leader(node_a.clone());

        assert_eq!(plane.current_term(), ControlPlaneTerm(1));
        assert!(plane.is_leader());
        assert_eq!(plane.leader(), Some(node_a.clone()));

        // The durable vote names the elected term and the node itself.
        let expected_vote = DurableVoteState {
            term: ControlPlaneTerm(1),
            voted_for: Some(node_a),
        };
        assert_eq!(plane.durable_vote(), expected_vote);

        // Nothing has been appended, so nothing is committed yet.
        assert_eq!(plane.commit_index(), None);
    }

    #[test]
    fn leader_appends_ownership_transition_and_commit_index_advances() {
        let mut plane = SingleNodeControlPlane::bootstrap_leader(voter("CN=node-a"));
        let transition = OwnershipTransition {
            range: "users:[0,1000)".to_string(),
            new_owner: voter("CN=node-b"),
            ownership_epoch: 7,
        };

        let committed = plane
            .append(ControlPlaneEntry::OwnershipTransition(transition))
            .expect("leader may append");

        assert_eq!(committed, ControlPlaneIndex(0));
        assert_eq!(plane.commit_index(), Some(ControlPlaneIndex(0)));
        assert_eq!(plane.entries().len(), 1);
    }

    #[test]
    fn membership_and_leader_config_entries_are_ordered() {
        let mut plane = SingleNodeControlPlane::bootstrap_leader(voter("CN=node-a"));

        let leader_config = ControlPlaneEntry::LeaderConfiguration {
            term: ControlPlaneTerm(1),
            leader: voter("CN=node-a"),
        };
        let earlier = plane.append(leader_config).unwrap();

        let admit_b = ControlPlaneEntry::MembershipChange(MembershipChange::Admit(voter(
            "CN=node-b",
        )));
        let later = plane.append(admit_b).unwrap();

        // Later appends get strictly larger indices, and the commit index
        // tracks the most recent one.
        assert!(later > earlier);
        assert_eq!(plane.commit_index(), Some(later));
    }

    #[test]
    fn a_follower_may_not_append() {
        // Demote the node by hand to reach the non-leader path: a follower
        // must refuse to write the log and report the leader it knows of.
        let mut plane = SingleNodeControlPlane::bootstrap_leader(voter("CN=node-a"));
        plane.is_leader = false;

        let removal = ControlPlaneEntry::MembershipChange(MembershipChange::Remove(voter(
            "CN=node-b",
        )));
        let refusal = plane
            .append(removal)
            .expect_err("a follower must not append");

        assert_eq!(refusal, ControlPlaneError::NotLeader { leader: None });
    }
}