nodedb_cluster/metadata_group/entry.rs

// SPDX-License-Identifier: BUSL-1.1

//! The canonical wire-type for every entry proposed to the metadata Raft group.

use serde::{Deserialize, Serialize};

use nodedb_types::Hlc;

use crate::metadata_group::compensation::Compensation;
use crate::metadata_group::descriptors::{DescriptorId, DescriptorLease};
use crate::metadata_group::migration_state::{MigrationCheckpointPayload, MigrationPhaseTag};

/// An entry in the replicated metadata log.
///
/// Every mutation to cluster-wide state — DDL, topology, routing,
/// descriptor leases, cluster version bumps — is encoded as one of
/// these variants, proposed against the metadata Raft group, and
/// applied on every node by a
/// [`crate::metadata_group::applier::MetadataApplier`].
///
/// The `CatalogDdl` variant is the single wire shape for every DDL
/// mutation. Its `payload` is an opaque, host-serialized
/// `nodedb::control::catalog_entry::CatalogEntry` value — the
/// `nodedb-cluster` crate is deliberately ignorant of the host's
/// per-DDL-object struct shapes. This keeps the cluster crate
/// layering-clean and makes adding new DDL object types on the
/// host side a zero-wire-change operation.
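///
/// A minimal construction sketch; `catalog_bytes` is an illustrative
/// placeholder for a host-serialized `CatalogEntry`:
///
/// ```ignore
/// let entry = MetadataEntry::CatalogDdl { payload: catalog_bytes };
/// ```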
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum MetadataEntry {
    /// Single generic DDL entry carrying an opaque host-side payload.
    /// Produced by every pgwire DDL handler via
    /// `nodedb::control::metadata_proposer::propose_catalog_entry`.
    CatalogDdl {
        payload: Vec<u8>,
    },

    /// DDL entry with attached audit context. Produced by pgwire DDL
    /// handlers that have the authenticated identity + raw statement
    /// text bound at the call site (every `CREATE`, `ALTER`, `DROP`,
    /// `GRANT`, `REVOKE` path). Applied identically to `CatalogDdl`
    /// on every node; additionally, the production applier fsync-
    /// appends an audit record to the audit segment WAL with the
    /// authenticated user, HLC at commit, descriptor versions before
    /// + after, and the raw SQL — exactly what J.4 requires.
    ///
    /// Carries its own payload so legacy proposers (internal lease
    /// and descriptor-drain flows that have no SQL text) can keep
    /// using the plain `CatalogDdl` variant without synthesizing
    /// fake audit context.
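    ///
    /// A hedged construction sketch (all field values here are illustrative):
    ///
    /// ```ignore
    /// let entry = MetadataEntry::CatalogDdlAudited {
    ///     payload: catalog_bytes,
    ///     auth_user_id: "u_42".to_string(),
    ///     auth_user_name: "alice".to_string(),
    ///     sql_text: "CREATE TABLE t (id INT PRIMARY KEY)".to_string(),
    /// };
    /// ```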
    CatalogDdlAudited {
        payload: Vec<u8>,
        /// Authenticated user id at propose time.
        auth_user_id: String,
        /// Authenticated username at propose time.
        auth_user_name: String,
        /// Raw SQL statement as the client sent it. Not parsed here —
        /// the cluster crate is opaque to SQL syntax. Persisted on
        /// every replica so post-hoc audit queries don't depend on
        /// the proposing node still being alive.
        sql_text: String,
    },

    /// Atomic batch of metadata entries proposed by a transactional
    /// DDL session (`BEGIN; CREATE ...; CREATE ...; COMMIT;`). The
    /// applier unpacks and applies each sub-entry in order at a
    /// single raft log index, so either all commit or none do.
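    ///
    /// A sketch of a two-statement transactional batch (the payload bytes
    /// are illustrative placeholders):
    ///
    /// ```ignore
    /// let entry = MetadataEntry::Batch {
    ///     entries: vec![
    ///         MetadataEntry::CatalogDdl { payload: create_table_bytes },
    ///         MetadataEntry::CatalogDdl { payload: create_index_bytes },
    ///     ],
    /// };
    /// ```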
    Batch {
        entries: Vec<MetadataEntry>,
    },

    // ── Topology / routing ─────────────────────────────────────────────
    TopologyChange(TopologyChange),
    RoutingChange(RoutingChange),

    // ── Cluster version ────────────────────────────────────────────────
    ClusterVersionBump {
        from: u16,
        to: u16,
    },

    // ── Descriptor leases ──────────────────────────────────────────────
    DescriptorLeaseGrant(DescriptorLease),
    DescriptorLeaseRelease {
        node_id: u64,
        descriptor_ids: Vec<DescriptorId>,
    },

    // ── Descriptor lease drain ────────────────────────────────────────
    /// Begin draining leases on a descriptor. While a drain entry
    /// is active, any `acquire_descriptor_lease` at
    /// `version <= up_to_version` must be rejected cluster-wide so
    /// the in-flight DDL that bumps the version can make progress.
    ///
    /// `expires_at` is the HLC at which this drain entry is
    /// considered stale and ignored by `is_draining` checks on
    /// read. Acts as a TTL that prevents a crashed proposer from
    /// leaving an orphaned drain that blocks the cluster forever.
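    ///
    /// A sketch of the read-side rejection rule (names are illustrative,
    /// not the actual lease-manager API):
    ///
    /// ```ignore
    /// let drain_active = now_hlc < drain.expires_at;
    /// let must_reject = drain_active && requested_version <= drain.up_to_version;
    /// ```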
    DescriptorDrainStart {
        descriptor_id: DescriptorId,
        up_to_version: u64,
        expires_at: Hlc,
    },
    /// End draining on a descriptor. Emitted explicitly on drain
    /// timeout so the cluster can make progress. On the happy
    /// path (successful `Put*` apply), the host-side applier
    /// clears drain implicitly — this variant is the escape
    /// hatch for the failure path.
    DescriptorDrainEnd {
        descriptor_id: DescriptorId,
    },

    /// Cluster-wide CA trust mutation (L.4). Proposed by
    /// `nodedb rotate-ca --stage` (to add a new CA) and
    /// `nodedb rotate-ca --finalize --remove <fp>` (to drop an old
    /// CA). Applied on every node by `MetadataCommitApplier`: writes
    /// or deletes `data_dir/tls/ca.d/<fp_hex>.crt` and triggers a
    /// live rebuild of the rustls server + client configs so the
    /// new trust set takes effect without restart.
    ///
    /// `add_ca_cert` and `remove_ca_fingerprint` are independent:
    /// the `--stage` form sets `add_ca_cert = Some(new_ca_der)` +
    /// `remove_ca_fingerprint = None`; `--finalize` flips both. A
    /// single entry carrying both performs the cutover atomically
    /// once the operator has confirmed every node has reissued.
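    ///
    /// The two CLI forms sketched as entries (`new_ca_der` and `old_fp`
    /// are illustrative placeholders):
    ///
    /// ```ignore
    /// // `nodedb rotate-ca --stage`
    /// let stage = MetadataEntry::CaTrustChange {
    ///     add_ca_cert: Some(new_ca_der),
    ///     remove_ca_fingerprint: None,
    /// };
    /// // `nodedb rotate-ca --finalize --remove <fp>`
    /// let finalize = MetadataEntry::CaTrustChange {
    ///     add_ca_cert: None,
    ///     remove_ca_fingerprint: Some(old_fp),
    /// };
    /// ```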
    CaTrustChange {
        /// DER-encoded CA certificate to add to the trust set. `None`
        /// when this entry only removes.
        add_ca_cert: Option<Vec<u8>>,
        /// SHA-256 fingerprint of the CA to remove from the trust set.
        /// `None` when this entry only adds.
        remove_ca_fingerprint: Option<[u8; 32]>,
    },

    // ── Surrogate identity ────────────────────────────────────────────
    /// Advance the cluster-wide surrogate high-watermark to `hwm`.
    ///
    /// Proposed by the metadata-group leader whenever the local
    /// `SurrogateRegistry` flush threshold trips (every 1024
    /// allocations or 200 ms, whichever comes first). Applied on
    /// every node by the host-side `MetadataCommitApplier` which
    /// calls `SurrogateRegistry::restore_hwm(hwm)` — idempotent and
    /// monotonic, so out-of-order replay or duplicate delivery are
    /// both safe. Followers must never allocate surrogates locally;
    /// they only advance their in-memory HWM via these log entries.
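    ///
    /// A sketch of the monotonic apply rule (not the registry's actual
    /// signature):
    ///
    /// ```ignore
    /// local_hwm = local_hwm.max(entry_hwm); // replay and duplicates are no-ops
    /// ```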
    SurrogateAlloc {
        hwm: u32,
    },

    /// Join-token lifecycle transition (L.4). Proposed by the
    /// bootstrap-listener handler on every state change so that all
    /// Raft peers can enforce single-use token semantics even after a
    /// crash-restart cycle.
    ///
    /// `token_hash` is the SHA-256 of the token hex string — the raw
    /// token is never stored in the log. `transition` encodes the
    /// direction: `Register` for first issuance, `BeginInFlight` when
    /// a joiner presents the token, `MarkConsumed` when the bundle is
    /// delivered, `RevertInFlight` when the dead-man timer fires, and
    /// `MarkExpired` / `MarkAborted` for the terminal states.
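    ///
    /// A hedged sketch of the first-issuance entry (`sha256_of` is an
    /// illustrative stand-in for whatever hashing helper the proposer uses):
    ///
    /// ```ignore
    /// let entry = MetadataEntry::JoinTokenTransition {
    ///     token_hash: sha256_of(&token_hex),
    ///     transition: JoinTokenTransitionKind::Register { expires_at_ms },
    ///     ts_ms: now_unix_ms,
    /// };
    /// ```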
    JoinTokenTransition {
        token_hash: [u8; 32],
        transition: JoinTokenTransitionKind,
        /// Unix-ms timestamp at the time of the proposal.
        ts_ms: u64,
    },

    /// Crash-safe migration phase checkpoint. Persisted on every phase
    /// transition; on coordinator restart, recovery scans the
    /// `MigrationStateTable` and resumes from the latest committed
    /// checkpoint. Apply is idempotent on `(migration_id, phase, attempt)`
    /// — duplicate delivery is a no-op. CRC32C mismatch is fatal.
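    ///
    /// A sketch of the applier-side dedupe key (illustrative only, not the
    /// actual apply code):
    ///
    /// ```ignore
    /// let key = (ckpt.migration_id.clone(), ckpt.phase.clone(), ckpt.attempt);
    /// if already_applied.contains(&key) {
    ///     return Ok(()); // duplicate delivery is a no-op
    /// }
    /// ```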
    MigrationCheckpoint {
        /// Hyphenated UUID string (zerompk does not serialize uuid::Uuid).
        migration_id: String,
        phase: MigrationPhaseTag,
        attempt: u32,
        payload: MigrationCheckpointPayload,
        crc32c: u32,
        ts_ms: u64,
    },

    /// Replicated migration abort with ordered compensations. Each
    /// compensation is applied in order; any failure is fatal (no
    /// warn-and-continue). On success, the migration's row in
    /// `MigrationStateTable` is deleted.
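    ///
    /// A sketch of the ordered, fail-fast apply loop (names are illustrative):
    ///
    /// ```ignore
    /// for comp in &abort.compensations {
    ///     apply_compensation(comp)?; // first failure aborts the whole apply
    /// }
    /// delete_migration_row(&abort.migration_id)?;
    /// ```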
    MigrationAbort {
        migration_id: String,
        reason: String,
        compensations: Vec<Compensation>,
    },
}

/// The direction of a join-token lifecycle transition.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum JoinTokenTransitionKind {
    /// New token registered (Issued state). Carries expiry so all nodes
    /// can enforce the TTL independently.
    Register { expires_at_ms: u64 },
    /// Joiner presented the token; transitioning Issued → InFlight.
    BeginInFlight { node_addr: String },
    /// Bundle delivered; transitioning InFlight → Consumed.
    MarkConsumed { node_addr: String },
    /// Dead-man timer fired; transitioning InFlight → Issued.
    RevertInFlight,
    /// Token TTL elapsed without consumption.
    MarkExpired,
    /// Explicitly invalidated by an operator.
    MarkAborted,
}

/// Topology mutations proposed through the metadata group.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum TopologyChange {
    Join { node_id: u64, addr: String },
    Leave { node_id: u64 },
    PromoteToVoter { node_id: u64 },
    StartDecommission { node_id: u64 },
    FinishDecommission { node_id: u64 },
}

/// Routing-table mutations proposed through the metadata group.
#[derive(
    Debug,
    Clone,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    zerompk::ToMessagePack,
    zerompk::FromMessagePack,
)]
pub enum RoutingChange {
    /// Move a vShard to a new raft group leaseholder.
    ReassignVShard {
        vshard_id: u32,
        new_group_id: u64,
        new_leaseholder_node_id: u64,
    },
    /// Record a leadership transfer within an existing group.
    LeadershipTransfer {
        group_id: u64,
        new_leader_node_id: u64,
    },
    /// Remove a node from a Raft group's member and learner sets.
    ///
    /// Used by the decommission flow to strip a draining node out of
    /// every group it belongs to. Proposing this is only safe once
    /// `safety::check_can_decommission` has confirmed the group will
    /// still satisfy the configured replication factor.
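    ///
    /// A construction sketch (ids are illustrative; the
    /// `check_can_decommission` gate is assumed to have already passed):
    ///
    /// ```ignore
    /// let change = RoutingChange::RemoveMember { group_id: 7, node_id: 3 };
    /// ```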
    RemoveMember { group_id: u64, node_id: u64 },
}