nodedb_cluster/metadata_group/entry.rs
// SPDX-License-Identifier: BUSL-1.1

//! The canonical wire-type for every entry proposed to the metadata Raft group.

use serde::{Deserialize, Serialize};

use nodedb_types::Hlc;

use crate::metadata_group::compensation::Compensation;
use crate::metadata_group::descriptors::{DescriptorId, DescriptorLease};
use crate::metadata_group::migration_state::{MigrationCheckpointPayload, MigrationPhaseTag};

/// An entry in the replicated metadata log.
///
/// Every mutation to cluster-wide state — DDL, topology, routing,
/// descriptor leases, cluster version bumps — is encoded as one of
/// these variants, proposed against the metadata Raft group, and
/// applied on every node by a
/// [`crate::metadata_group::applier::MetadataApplier`].
///
/// Every DDL mutation shares a single wire shape: the `payload` of a
/// `CatalogDdl` (or `CatalogDdlAudited`) entry is an opaque,
/// host-serialized `nodedb::control::catalog_entry::CatalogEntry`
/// value. The `nodedb-cluster` crate is deliberately ignorant of the
/// host's per-DDL-object struct shapes, which keeps the crate's
/// layering clean and makes adding new DDL object types on the host
/// side a zero-wire-change operation.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum MetadataEntry {
    /// Single generic DDL entry carrying an opaque host-side payload,
    /// proposed via
    /// `nodedb::control::metadata_proposer::propose_catalog_entry`.
    /// Used by proposers that have no audit context to attach; pgwire
    /// DDL handlers with an authenticated identity and SQL text in
    /// hand propose `CatalogDdlAudited` instead.
    CatalogDdl {
        payload: Vec<u8>,
    },

    /// DDL entry with attached audit context. Produced by pgwire DDL
    /// handlers that have the authenticated identity + raw statement
    /// text bound at the call site (every `CREATE`, `ALTER`, `DROP`,
    /// `GRANT`, `REVOKE` path). Applied identically to `CatalogDdl`
    /// on every node; additionally, the production applier
    /// fsync-appends an audit record to the audit segment WAL with
    /// the authenticated user, the HLC at commit, the descriptor
    /// versions before and after, and the raw SQL — exactly what J.4
    /// requires.
    ///
    /// This is a separate variant carrying its own payload (rather
    /// than audit fields bolted onto `CatalogDdl`) so that legacy
    /// proposers, such as the internal lease and descriptor-drain
    /// flows that have no SQL text, can keep using the plain variant
    /// without synthesizing fake audit context.
    CatalogDdlAudited {
        payload: Vec<u8>,
        /// Authenticated user id at propose time.
        auth_user_id: String,
        /// Authenticated username at propose time.
        auth_user_name: String,
        /// Raw SQL statement as the client sent it. Not parsed here —
        /// the cluster crate is opaque to SQL syntax. Persisted on
        /// every replica so post-hoc audit queries don't depend on
        /// the proposing node still being alive.
        sql_text: String,
    },

    /// Atomic batch of metadata entries proposed by a transactional
    /// DDL session (`BEGIN; CREATE ...; CREATE ...; COMMIT;`). The
    /// applier unpacks and applies each sub-entry in order at a
    /// single Raft log index, so either all commit or none do. A
    /// construction sketch lives in the `batch_shape_example` test
    /// below.
    Batch {
        entries: Vec<MetadataEntry>,
    },

    // ── Topology / routing ─────────────────────────────────────────────
    TopologyChange(TopologyChange),
    RoutingChange(RoutingChange),

    // ── Cluster version ────────────────────────────────────────────────
    ClusterVersionBump {
        from: u16,
        to: u16,
    },

    // ── Descriptor leases ──────────────────────────────────────────────
    DescriptorLeaseGrant(DescriptorLease),
    DescriptorLeaseRelease {
        node_id: u64,
        descriptor_ids: Vec<DescriptorId>,
    },

    // ── Descriptor lease drain ────────────────────────────────────────
    /// Begin draining leases on a descriptor. While a drain entry
    /// is active, any `acquire_descriptor_lease` at
    /// `version <= up_to_version` must be rejected cluster-wide so
    /// the in-flight DDL that bumps the version can make progress.
    ///
    /// `expires_at` is the HLC at which this drain entry is
    /// considered stale and ignored by `is_draining` checks on
    /// read. Acts as a TTL that prevents a crashed proposer from
    /// leaving an orphaned drain that blocks the cluster forever.
    DescriptorDrainStart {
        descriptor_id: DescriptorId,
        up_to_version: u64,
        expires_at: Hlc,
    },
    /// End draining on a descriptor. Emitted explicitly on drain
    /// timeout so the cluster can make progress. On the happy
    /// path (successful `Put*` apply), the host-side applier
    /// clears drain implicitly — this variant is the escape
    /// hatch for the failure path.
    DescriptorDrainEnd {
        descriptor_id: DescriptorId,
    },

    /// Cluster-wide CA trust mutation (L.4). Proposed by
    /// `nodedb rotate-ca --stage` (to add a new CA) and
    /// `nodedb rotate-ca --finalize --remove <fp>` (to drop an old
    /// CA). Applied on every node by `MetadataCommitApplier`: writes
    /// or deletes `data_dir/tls/ca.d/<fp_hex>.crt` and triggers a
    /// live rebuild of the rustls server + client configs so the
    /// new trust set takes effect without restart.
    ///
    /// `add_ca_cert` and `remove_ca_fingerprint` are independent:
    /// the `--stage` form sets `add_ca_cert = Some(new_ca_der)` and
    /// leaves `remove_ca_fingerprint = None`; the `--finalize
    /// --remove` form does the opposite. A single entry carrying
    /// both performs the add-and-remove cutover atomically once the
    /// operator has confirmed every node has reissued. The
    /// `ca_trust_change_shapes` test below sketches all three
    /// shapes.
    CaTrustChange {
        /// DER-encoded CA certificate to add to the trust set. `None`
        /// when this entry only removes.
        add_ca_cert: Option<Vec<u8>>,
        /// SHA-256 fingerprint of the CA to remove from the trust set.
        /// `None` when this entry only adds.
        remove_ca_fingerprint: Option<[u8; 32]>,
    },

    // ── Surrogate identity ────────────────────────────────────────────
    /// Advance the cluster-wide surrogate high-watermark to `hwm`.
    ///
    /// Proposed by the metadata-group leader whenever the local
    /// `SurrogateRegistry` flush threshold trips (every 1024
    /// allocations or 200 ms, whichever comes first). Applied on
    /// every node by the host-side `MetadataCommitApplier`, which
    /// calls `SurrogateRegistry::restore_hwm(hwm)` — idempotent and
    /// monotonic, so out-of-order replay and duplicate delivery are
    /// both safe (see the `surrogate_hwm_replay_example` test
    /// below). Followers must never allocate surrogates locally;
    /// they only advance their in-memory HWM via these log entries.
    SurrogateAlloc {
        hwm: u32,
    },

    /// Join-token lifecycle transition (L.4). Proposed by the
    /// bootstrap-listener handler on every state change so that all
    /// Raft peers can enforce single-use token semantics even after a
    /// crash-restart cycle.
    ///
    /// `token_hash` is the SHA-256 of the token hex string — the raw
    /// token is never stored in the log. `transition` encodes the
    /// direction: `Register` for first issuance, `BeginInFlight` when
    /// a joiner presents the token, `MarkConsumed` when the bundle is
    /// delivered, `RevertInFlight` when the dead-man timer fires, and
    /// `MarkExpired` / `MarkAborted` for the terminal states.
    JoinTokenTransition {
        token_hash: [u8; 32],
        transition: JoinTokenTransitionKind,
        /// Unix-ms timestamp at the time of the proposal.
        ts_ms: u64,
    },

    /// Crash-safe migration phase checkpoint. Persisted on every phase
    /// transition; on coordinator restart, recovery scans the
    /// `MigrationStateTable` and resumes from the latest committed
    /// checkpoint. Apply is idempotent on `(migration_id, phase, attempt)`
    /// — duplicate delivery is a no-op. CRC32C mismatch is fatal.
    MigrationCheckpoint {
        /// Hyphenated UUID string (zerompk does not serialize uuid::Uuid).
        migration_id: String,
        phase: MigrationPhaseTag,
        attempt: u32,
        payload: MigrationCheckpointPayload,
        crc32c: u32,
        ts_ms: u64,
    },

    /// Replicated migration abort with ordered compensations. Each
    /// compensation is applied in order; any failure is fatal (no
    /// warn-and-continue). On success, the migration's row in
    /// `MigrationStateTable` is deleted.
    MigrationAbort {
        migration_id: String,
        reason: String,
        compensations: Vec<Compensation>,
    },
}
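
// Illustrative only: a minimal construction sketch for the transactional-DDL
// `Batch` shape described above. The payload bytes stand in for the
// host-serialized `CatalogEntry` values a real proposer would supply, and the
// user ids / SQL strings are invented placeholders, not fixtures from the
// production handlers.
#[cfg(test)]
mod batch_shape_example {
    use super::MetadataEntry;

    #[test]
    fn batch_wraps_audited_ddl_entries_in_order() {
        let create_table = MetadataEntry::CatalogDdlAudited {
            payload: b"<host-serialized CatalogEntry>".to_vec(),
            auth_user_id: "u-42".to_string(),
            auth_user_name: "alice".to_string(),
            sql_text: "CREATE TABLE t (id INT)".to_string(),
        };
        let create_index = MetadataEntry::CatalogDdlAudited {
            payload: b"<host-serialized CatalogEntry>".to_vec(),
            auth_user_id: "u-42".to_string(),
            auth_user_name: "alice".to_string(),
            sql_text: "CREATE INDEX t_id ON t (id)".to_string(),
        };

        // One `Batch` is proposed at a single Raft log index; the applier
        // unpacks `entries` in order, so both sub-entries commit or neither
        // does.
        let batch = MetadataEntry::Batch {
            entries: vec![create_table, create_index],
        };

        match batch {
            MetadataEntry::Batch { entries } => assert_eq!(entries.len(), 2),
            _ => unreachable!(),
        }
    }
}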
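
// Illustrative only: the three `CaTrustChange` shapes described above, namely
// stage (add only), finalize (remove only), and the atomic add + remove
// cutover. The DER bytes and the fingerprint are placeholders, not real
// certificate material.
#[cfg(test)]
mod ca_trust_change_shapes {
    use super::MetadataEntry;

    #[test]
    fn stage_finalize_and_atomic_cutover() {
        // `nodedb rotate-ca --stage`: add a new CA, remove nothing.
        let staged = MetadataEntry::CaTrustChange {
            add_ca_cert: Some(b"<new CA DER>".to_vec()),
            remove_ca_fingerprint: None,
        };
        // `nodedb rotate-ca --finalize --remove <fp>`: drop the old CA only.
        let finalized = MetadataEntry::CaTrustChange {
            add_ca_cert: None,
            remove_ca_fingerprint: Some([0u8; 32]),
        };
        // A single entry carrying both fields performs the cutover atomically
        // at one log index.
        let atomic_cutover = MetadataEntry::CaTrustChange {
            add_ca_cert: Some(b"<new CA DER>".to_vec()),
            remove_ca_fingerprint: Some([0u8; 32]),
        };

        assert_ne!(staged, finalized);
        assert_ne!(staged, atomic_cutover);
    }
}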
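
// Minimal sketch of why `SurrogateAlloc` replay is safe. The real host-side
// applier calls `SurrogateRegistry::restore_hwm`, which is not visible from
// this crate; the `max` fold below merely stands in for that documented
// monotonic behaviour to show that duplicate or out-of-order delivery of HWM
// entries can never move the watermark backwards.
#[cfg(test)]
mod surrogate_hwm_replay_example {
    use super::MetadataEntry;

    /// Stand-in for the monotonic restore the host-side applier performs.
    fn restore_hwm(current: u32, entry: &MetadataEntry) -> u32 {
        match entry {
            MetadataEntry::SurrogateAlloc { hwm } => current.max(*hwm),
            _ => current,
        }
    }

    #[test]
    fn duplicate_and_out_of_order_delivery_is_harmless() {
        let log = [
            MetadataEntry::SurrogateAlloc { hwm: 1024 },
            MetadataEntry::SurrogateAlloc { hwm: 2048 },
            MetadataEntry::SurrogateAlloc { hwm: 2048 }, // duplicate delivery
            MetadataEntry::SurrogateAlloc { hwm: 1024 }, // stale replay
        ];
        let final_hwm = log.iter().fold(0u32, |acc, entry| restore_hwm(acc, entry));
        assert_eq!(final_hwm, 2048);
    }
}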

/// The direction of a join-token lifecycle transition.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum JoinTokenTransitionKind {
    /// New token registered (Issued state). Carries expiry so all nodes
    /// can enforce the TTL independently.
    Register { expires_at_ms: u64 },
    /// Joiner presented the token; transitioning Issued → InFlight.
    BeginInFlight { node_addr: String },
    /// Bundle delivered; transitioning InFlight → Consumed.
    MarkConsumed { node_addr: String },
    /// Dead-man timer fired; transitioning InFlight → Issued.
    RevertInFlight,
    /// Token TTL elapsed without consumption.
    MarkExpired,
    /// Explicitly invalidated by an operator.
    MarkAborted,
}
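
// Illustrative only: the happy-path lifecycle of one join token, expressed as
// the ordered `JoinTokenTransition` entries described above. The token hash,
// address, and timestamps are placeholders; a real proposer stores
// SHA-256(token hex) and never logs the raw token.
#[cfg(test)]
mod join_token_lifecycle_example {
    use super::{JoinTokenTransitionKind, MetadataEntry};

    #[test]
    fn issued_then_in_flight_then_consumed() {
        let token_hash = [0u8; 32]; // placeholder for the real SHA-256 digest
        let joiner = "10.0.0.7:7000".to_string();

        let steps = vec![
            JoinTokenTransitionKind::Register { expires_at_ms: 1_700_000_600_000 },
            JoinTokenTransitionKind::BeginInFlight { node_addr: joiner.clone() },
            JoinTokenTransitionKind::MarkConsumed { node_addr: joiner },
        ];

        // Each state change is proposed as its own entry so every Raft peer
        // can enforce single-use semantics even across a crash-restart cycle.
        let entries: Vec<MetadataEntry> = steps
            .into_iter()
            .map(|transition| MetadataEntry::JoinTokenTransition {
                token_hash,
                transition,
                ts_ms: 1_700_000_000_000,
            })
            .collect();

        assert_eq!(entries.len(), 3);
    }
}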

/// Topology mutations proposed through the metadata group.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum TopologyChange {
    Join { node_id: u64, addr: String },
    Leave { node_id: u64 },
    PromoteToVoter { node_id: u64 },
    StartDecommission { node_id: u64 },
    FinishDecommission { node_id: u64 },
}

/// Routing-table mutations proposed through the metadata group.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum RoutingChange {
    /// Move a vShard to a new Raft group and leaseholder.
    ReassignVShard {
        vshard_id: u32,
        new_group_id: u64,
        new_leaseholder_node_id: u64,
    },
    /// Record a leadership transfer within an existing group.
    LeadershipTransfer {
        group_id: u64,
        new_leader_node_id: u64,
    },
    /// Remove a node from a Raft group's member and learner sets.
    ///
    /// Used by the decommission flow to strip a draining node out of
    /// every group it belongs to. Proposing this is only safe once
    /// `safety::check_can_decommission` has confirmed the group will
    /// still satisfy the configured replication factor. The
    /// `decommission_flow_example` test at the end of this file
    /// sketches the full entry sequence.
    RemoveMember { group_id: u64, node_id: u64 },
}
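
// Illustrative only: one plausible entry sequence for the decommission flow
// that the `TopologyChange` and `RoutingChange::RemoveMember` docs describe.
// The exact ordering and the group/node ids are assumptions for this sketch,
// not a transcript of the production flow, and each `RemoveMember` is only
// safe to propose after `safety::check_can_decommission` has passed for that
// group.
#[cfg(test)]
mod decommission_flow_example {
    use super::{MetadataEntry, RoutingChange, TopologyChange};

    #[test]
    fn decommission_entry_sequence() {
        let node_id = 3;
        let groups = [10u64, 11, 12]; // groups the draining node belongs to

        // Mark the node as draining first.
        let mut entries = vec![MetadataEntry::TopologyChange(
            TopologyChange::StartDecommission { node_id },
        )];
        // Strip it from every group it is a member or learner of.
        entries.extend(groups.iter().map(|&group_id| {
            MetadataEntry::RoutingChange(RoutingChange::RemoveMember { group_id, node_id })
        }));
        // Finally record that decommissioning finished.
        entries.push(MetadataEntry::TopologyChange(
            TopologyChange::FinishDecommission { node_id },
        ));

        assert_eq!(entries.len(), 2 + groups.len());
    }
}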