reddb_server/cluster/bootstrap_authority.rs
1//! Cluster bootstrap authority — fail-closed seam (ADR 0058).
2//!
3//! Cluster first boot needs a single authority for global auth, vault,
4//! config, and policy. ADR 0058 makes that authority the reserved global
5//! system range owner, fenced by lease/term and ownership epoch. No
6//! concrete owner model is implemented yet, so this module is only the
7//! runtime *seam*: it decides whether a cluster-shaped boot is allowed to
8//! perform auth bootstrap, and fails closed whenever no concrete owner can
9//! be proven.
10//!
11//! Three outcomes are deliberately preserved while the owner model is absent:
12//!
13//! * A boot that observes a durable, already-published bootstrap completion
14//! marker through the authority path returns
15//! [`BootstrapDisposition::AlreadyComplete`] *before* any other check.
16//! First boot is over: restarts and duplicate attempts are idempotent and
17//! must not recreate admins, reissue the vault, or reapply mutable config
18//! over operator changes (issue #1230). This holds for every shape,
19//! including a cluster, so a once-bootstrapped cluster observes completion
20//! instead of failing closed.
21//! * Explicit `--no-auth` / `--dev` cluster-shaped boots remain allowed as
22//! a development carveout and skip every auth/bootstrap path. They must
23//! create no admin, vault, or bootstrap-complete state.
//! * Every other cluster-shaped boot is rejected — whether it would create
//!   auth/bootstrap state from preset env, credentials, or a manifest, or
//!   would only write a per-node bootstrap-complete marker — because a
//!   symmetric member cannot prove that it — and not a peer — is the single
//!   writer of global auth state.
28//!
29//! Both the cluster *topology* default and an explicit cluster *storage
30//! preset* resolve to [`DeployProfile::Cluster`], so the deploy profile is
31//! the single signal this seam reads for "cluster-shaped".
32
33use crate::storage::DeployProfile;
34
/// Classification of the auth bootstrap a boot is asking for, derived from
/// the CLI/env contract.
///
/// The variant only selects the wording of the denial message; the
/// fail-closed decision itself is identical for every variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AuthBootstrapInput {
    /// Nothing was requested (for example the `simple` preset with neither
    /// credentials nor a manifest). A cluster-shaped boot still fails closed
    /// here, because a non-owner must not write a per-node
    /// bootstrap-complete marker.
    None,
    /// Requested through the environment/preset surface: one of the
    /// `production`/`cloud`/`regulated` presets, or the
    /// `REDDB_USERNAME` + `REDDB_PASSWORD` pair.
    Env,
    /// Requested through `REDDB_BOOTSTRAP_MANIFEST`.
    Manifest,
}
52
53impl AuthBootstrapInput {
54 const fn describe(self) -> &'static str {
55 match self {
56 Self::None => "no explicit auth bootstrap input",
57 Self::Env => "auth bootstrap env/preset input",
58 Self::Manifest => "auth bootstrap manifest input",
59 }
60 }
61}
62
/// The action the boot path must take for auth bootstrap after the authority
/// seam has let the boot through.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BootstrapDisposition {
    /// Run the ordinary local single-owner bootstrap. The local node is the
    /// only authority for its own auth state (standalone, serverless, or
    /// primary-replica).
    ProceedLocal,
    /// Bypass every auth/bootstrap path. Only an explicit `--no-auth` /
    /// `--dev` boot gets here; on a cluster shape this is the documented
    /// development carveout.
    SkipDevBypass,
    /// First boot is already done: a durable bootstrap completion marker is
    /// visible through the authority path. Callers must be idempotent —
    /// read-only state may be rehydrated, but no user is recreated, no vault
    /// certificate is reissued, and no mutable config is reapplied over
    /// operator changes (issue #1230). Restarts and repeated bootstrap
    /// attempts after completion end up here, cluster shapes included.
    AlreadyComplete,
}
83
84/// `true` when this boot is cluster-shaped for the purpose of auth
85/// bootstrap authority. Both the cluster topology default and an explicit
86/// cluster storage preset land on [`DeployProfile::Cluster`].
87pub const fn is_cluster_shaped(deploy_profile: DeployProfile) -> bool {
88 matches!(deploy_profile, DeployProfile::Cluster)
89}
90
91/// Decide whether a boot may perform auth bootstrap.
92///
93/// Returns [`BootstrapDisposition`] when the boot is allowed to continue,
94/// or an operator-facing error string when the cluster bootstrap authority
95/// fails closed.
96///
97/// * When `already_completed` is `true` a durable bootstrap completion
98/// marker is visible through the authority path, so this returns
99/// [`BootstrapDisposition::AlreadyComplete`] before any other check. First
100/// boot is over: the caller must be idempotent and recreate no global auth
101/// state (issue #1230). This wins even for a cluster shape, so a restart of
102/// a once-bootstrapped cluster observes completion instead of failing
103/// closed.
104/// * Any `--no-auth` / `--dev` boot returns [`BootstrapDisposition::SkipDevBypass`]:
105/// the caller skips all auth/bootstrap state. For cluster shapes this is
106/// the explicit development carveout from ADR 0058.
107/// * A non-cluster boot returns [`BootstrapDisposition::ProceedLocal`]: the
108/// local node is the only authority for its own auth state.
109/// * A cluster-shaped, non-`--no-auth` boot with no completion marker is
110/// rejected. There is no reserved global system range owner model yet, so
111/// no member can prove it is the single writer of global
112/// auth/vault/config/policy state.
113pub fn authorize(
114 deploy_profile: DeployProfile,
115 no_auth: bool,
116 input: AuthBootstrapInput,
117 already_completed: bool,
118) -> Result<BootstrapDisposition, String> {
119 if already_completed {
120 // The durable completion marker is the authority path's record that
121 // first boot already produced global auth state. Observing it must
122 // never recreate users, reissue the vault, or reapply mutable config
123 // over operator changes — and it must short-circuit the fail-closed
124 // gate, which only guards the *first* write of that state. Duplicate
125 // bootstrap attempts after completion therefore report the existing
126 // completed state idempotently (issue #1230).
127 return Ok(BootstrapDisposition::AlreadyComplete);
128 }
129
130 if no_auth {
131 // `--no-auth` / `--dev` is the last word on auth for this boot
132 // (issue #663). The caller skips every preset/credential path, so
133 // no admin, vault, or bootstrap marker is created — exactly the
134 // cluster development carveout ADR 0058 keeps open.
135 return Ok(BootstrapDisposition::SkipDevBypass);
136 }
137
138 if !is_cluster_shaped(deploy_profile) {
139 return Ok(BootstrapDisposition::ProceedLocal);
140 }
141
142 // Cluster-shaped, credentialled boot. ADR 0058 requires the reserved
143 // global system range owner — fenced by lease/term + ownership epoch —
144 // before any member may create admins, initialize vault material,
145 // install policy, apply a manifest, or publish the bootstrap-complete
146 // marker. No owner model is implemented, so no member can prove
147 // ownership: fail closed instead of letting a symmetric member create
148 // divergent global auth state.
149 Err(format!(
150 "cluster bootstrap authority: refusing to run auth bootstrap on a \
151 cluster-shaped boot ({}) — no concrete authority owner is available. \
152 The reserved global system range owner (ADR 0058) is not yet \
153 implemented, so no member can prove it is the single writer of \
154 global auth/vault/config/policy state. Use --no-auth / --dev for a \
155 development cluster, or run auth bootstrap on a non-cluster topology.",
156 input.describe(),
157 ))
158}
159
/// Target store for cluster vault first boot, plus whether the owner path is
/// allowed to consume env/`_FILE` secret inputs.
///
/// Issue #1231 routes vault first boot through the bootstrap authority so
/// that the vault, its key material, and the emitted certificate all belong
/// to the *real* cluster-global auth store selected by the authority model.
/// A scratch or per-member-only database is never used; PRD #1227 forbids it
/// explicitly ("do not mint a certificate from an emptyDir/scratch database
/// and apply it to a different real store").
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VaultBootstrapPlan {
    /// Create or open the vault against the cluster-global auth store. The
    /// boot is the proven authority owner (or the only local authority), so
    /// vault pages, key material, and certificate live in the real store, and
    /// the certificate unseals that same store on restart.
    ///
    /// `consume_secret_inputs` is `true` only for the first write
    /// ([`BootstrapDisposition::ProceedLocal`]): the owner path reads the
    /// env/`_FILE` secret inputs, mints the certificate, and seals the real
    /// store. It is `false` for a restart that observes the durable
    /// completion marker ([`BootstrapDisposition::AlreadyComplete`]): the
    /// existing vault is opened and unsealed without consuming any secret
    /// input, so first boot is never re-run and the vault is never rotated.
    /// A non-owner cluster boot fails closed in [`authorize`] *before* a plan
    /// exists, so a non-owner never consumes secret inputs.
    OpenClusterGlobalStore { consume_secret_inputs: bool },
    /// Bypass every vault/auth path — the explicit `--no-auth` / `--dev`
    /// development carveout. No vault is created or opened, and no
    /// certificate is minted.
    SkipNoVault,
}
191
192/// Map an authorized [`BootstrapDisposition`] to its cluster vault first-boot
193/// plan. This is the single place that decides the vault target store and
194/// whether secret inputs feed the owner path; callers must not re-derive it.
195pub const fn plan_vault_bootstrap(disposition: BootstrapDisposition) -> VaultBootstrapPlan {
196 match disposition {
197 // First write of global auth state: the owner consumes secret inputs
198 // and seals the real cluster-global store.
199 BootstrapDisposition::ProceedLocal => VaultBootstrapPlan::OpenClusterGlobalStore {
200 consume_secret_inputs: true,
201 },
202 // Restart after completion: open and unseal the existing real store,
203 // but consume no secret input — never re-mint or rotate the vault.
204 BootstrapDisposition::AlreadyComplete => VaultBootstrapPlan::OpenClusterGlobalStore {
205 consume_secret_inputs: false,
206 },
207 // `--no-auth` / `--dev`: skip the vault entirely.
208 BootstrapDisposition::SkipDevBypass => VaultBootstrapPlan::SkipNoVault,
209 }
210}
211
212/// Authorize cluster vault first boot end to end: run the bootstrap authority
213/// gate, then map the authorized disposition to its [`VaultBootstrapPlan`].
214///
215/// A cluster-shaped first boot with no proven owner fails closed in
216/// [`authorize`] *before* any plan is produced, so no scratch or per-member
217/// vault is ever minted and no secret input is consumed by a non-owner
218/// (issue #1231). The owner path ([`VaultBootstrapPlan::OpenClusterGlobalStore`])
219/// is the only outcome that creates or opens vault material, and it always
220/// targets the real cluster-global auth store.
221pub fn authorize_vault_bootstrap(
222 deploy_profile: DeployProfile,
223 no_auth: bool,
224 input: AuthBootstrapInput,
225 already_completed: bool,
226) -> Result<VaultBootstrapPlan, String> {
227 authorize(deploy_profile, no_auth, input, already_completed).map(plan_vault_bootstrap)
228}
229
#[cfg(test)]
mod tests {
    use super::*;

    /// The non-cluster deploy profiles these tests exercise.
    const NON_CLUSTER_PROFILES: [DeployProfile; 3] = [
        DeployProfile::Embedded,
        DeployProfile::Serverless,
        DeployProfile::PrimaryReplica,
    ];

    /// Every auth bootstrap input classification, including "none requested".
    const ALL_INPUTS: [AuthBootstrapInput; 3] = [
        AuthBootstrapInput::None,
        AuthBootstrapInput::Env,
        AuthBootstrapInput::Manifest,
    ];

    #[test]
    fn non_cluster_profiles_proceed_locally() {
        for profile in NON_CLUSTER_PROFILES {
            assert!(!is_cluster_shaped(profile), "{profile:?} is not cluster");
            assert_eq!(
                authorize(profile, false, AuthBootstrapInput::Env, false).unwrap(),
                BootstrapDisposition::ProceedLocal,
                "{profile:?} should proceed with local bootstrap"
            );
        }
    }

    #[test]
    fn cluster_no_auth_is_the_dev_bypass_carveout() {
        let disposition = authorize(
            DeployProfile::Cluster,
            true,
            AuthBootstrapInput::None,
            false,
        )
        .unwrap();
        assert_eq!(disposition, BootstrapDisposition::SkipDevBypass);
    }

    #[test]
    fn non_cluster_no_auth_also_skips() {
        let disposition = authorize(
            DeployProfile::Embedded,
            true,
            AuthBootstrapInput::Env,
            false,
        )
        .unwrap();
        assert_eq!(disposition, BootstrapDisposition::SkipDevBypass);
    }

    #[test]
    fn cluster_env_bootstrap_fails_closed() {
        let err = authorize(
            DeployProfile::Cluster,
            false,
            AuthBootstrapInput::Env,
            false,
        )
        .unwrap_err();
        assert!(err.contains("no concrete authority owner"), "got: {err}");
        assert!(err.contains("env/preset"), "got: {err}");
    }

    #[test]
    fn cluster_manifest_bootstrap_fails_closed() {
        let err = authorize(
            DeployProfile::Cluster,
            false,
            AuthBootstrapInput::Manifest,
            false,
        )
        .unwrap_err();
        assert!(err.contains("no concrete authority owner"), "got: {err}");
        assert!(err.contains("manifest"), "got: {err}");
    }

    #[test]
    fn cluster_without_explicit_input_still_fails_closed() {
        // A `simple`-preset cluster boot writes only a per-node
        // bootstrap-complete marker, which ADR 0058 forbids without a
        // proven owner. Fail closed so no divergent marker is written.
        let err = authorize(
            DeployProfile::Cluster,
            false,
            AuthBootstrapInput::None,
            false,
        )
        .unwrap_err();
        assert!(err.contains("no concrete authority owner"), "got: {err}");
    }

    #[test]
    fn completion_marker_makes_local_restart_idempotent() {
        // Acceptance #2: restart after a successful non-cluster bootstrap
        // observes the durable completion marker and must not recreate
        // global auth state.
        for profile in NON_CLUSTER_PROFILES {
            assert_eq!(
                authorize(profile, false, AuthBootstrapInput::Env, true).unwrap(),
                BootstrapDisposition::AlreadyComplete,
                "{profile:?} restart should be idempotent once completed"
            );
        }
    }

    #[test]
    fn completion_marker_short_circuits_cluster_fail_closed() {
        // Acceptance #1/#3: once first boot has completed, a cluster restart
        // observes completion through the authority path instead of failing
        // closed, even though no concrete owner model exists yet. This is
        // the only path that lets a credentialled cluster boot succeed.
        let disposition = authorize(
            DeployProfile::Cluster,
            false,
            AuthBootstrapInput::Manifest,
            true,
        )
        .unwrap();
        assert_eq!(disposition, BootstrapDisposition::AlreadyComplete);
    }

    #[test]
    fn duplicate_bootstrap_after_completion_is_idempotent_for_every_input() {
        // Acceptance #3: a duplicate bootstrap attempt after completion
        // reports the existing completed state regardless of which auth
        // bootstrap input the operator re-supplies.
        for input in ALL_INPUTS {
            assert_eq!(
                authorize(DeployProfile::Cluster, false, input, true).unwrap(),
                BootstrapDisposition::AlreadyComplete,
                "{input:?} duplicate after completion should be idempotent"
            );
        }
    }

    #[test]
    fn completion_marker_wins_over_dev_bypass() {
        // The durable completion marker is checked before the `--no-auth`
        // carveout, so a once-bootstrapped node never silently downgrades
        // into the anonymous dev path on restart.
        let disposition =
            authorize(DeployProfile::Cluster, true, AuthBootstrapInput::None, true).unwrap();
        assert_eq!(disposition, BootstrapDisposition::AlreadyComplete);
    }

    // ----- Issue #1231: cluster vault first boot wired to the real store ----

    #[test]
    fn owner_first_boot_plan_opens_real_store_and_consumes_secrets() {
        // ProceedLocal is the proven-owner first write: the vault is opened
        // against the real cluster-global store and the env/`_FILE` secret
        // inputs feed the certificate-minting owner path.
        assert_eq!(
            plan_vault_bootstrap(BootstrapDisposition::ProceedLocal),
            VaultBootstrapPlan::OpenClusterGlobalStore {
                consume_secret_inputs: true,
            }
        );
    }

    #[test]
    fn restart_after_completion_unseals_real_store_without_consuming_secrets() {
        // AlreadyComplete reopens the same real store and unseals it with the
        // existing certificate, but consumes no secret input, so the vault is
        // never re-minted or rotated on restart.
        assert_eq!(
            plan_vault_bootstrap(BootstrapDisposition::AlreadyComplete),
            VaultBootstrapPlan::OpenClusterGlobalStore {
                consume_secret_inputs: false,
            }
        );
    }

    #[test]
    fn dev_bypass_plan_skips_the_vault() {
        assert_eq!(
            plan_vault_bootstrap(BootstrapDisposition::SkipDevBypass),
            VaultBootstrapPlan::SkipNoVault
        );
    }

    #[test]
    fn cluster_first_boot_mints_no_vault_and_consumes_no_secret() {
        // Acceptance: a cluster-shaped first boot with no proven owner fails
        // closed before any plan is produced, so neither a scratch nor a
        // per-member vault is minted and no secret input is consumed by a
        // non-owner. This holds for every input classification, including
        // `AuthBootstrapInput::None`.
        for input in ALL_INPUTS {
            let err =
                authorize_vault_bootstrap(DeployProfile::Cluster, false, input, false).unwrap_err();
            assert!(err.contains("no concrete authority owner"), "got: {err}");
        }
    }

    #[test]
    fn non_cluster_owner_gets_real_store_vault_plan() {
        for profile in NON_CLUSTER_PROFILES {
            assert_eq!(
                authorize_vault_bootstrap(profile, false, AuthBootstrapInput::Env, false).unwrap(),
                VaultBootstrapPlan::OpenClusterGlobalStore {
                    consume_secret_inputs: true,
                },
                "{profile:?} owner should open the real store and consume secrets"
            );
        }
    }

    #[test]
    fn completed_cluster_restart_gets_unseal_only_plan() {
        // Once a cluster has bootstrapped, a restart is authorized through the
        // completion marker and lands on the unseal-only plan against the real
        // store — the only way a credentialled cluster boot opens a vault.
        assert_eq!(
            authorize_vault_bootstrap(
                DeployProfile::Cluster,
                false,
                AuthBootstrapInput::Manifest,
                true,
            )
            .unwrap(),
            VaultBootstrapPlan::OpenClusterGlobalStore {
                consume_secret_inputs: false,
            }
        );
    }

    // The next two tests prove the runtime consequence of the owner plan
    // against a real pager-backed store: a freshly minted certificate seals
    // the real store and unseals that same store on restart, while a
    // scratch/per-member certificate cannot unseal it. The vault here is the
    // real cluster-global auth store's pager, never an emptyDir scratch DB.

    /// Pager-backed store living in a unique temp directory.
    ///
    /// The directory is removed when the fixture is dropped, which also
    /// happens while a failed assertion unwinds the test, so a failing run
    /// does not leave stale directories behind.
    struct VaultTestStore {
        // `Option` so `Drop` can close the pager *before* deleting the
        // directory that holds its file.
        pager: Option<crate::storage::engine::pager::Pager>,
        dir: std::path::PathBuf,
    }

    impl VaultTestStore {
        /// Borrow the open pager.
        fn pager(&self) -> &crate::storage::engine::pager::Pager {
            self.pager
                .as_ref()
                .expect("pager is set by vault_test_pager and only taken in Drop")
        }
    }

    impl Drop for VaultTestStore {
        fn drop(&mut self) {
            // Close the pager first, then clean up best-effort: a cleanup
            // failure must not mask the test's own result.
            drop(self.pager.take());
            let _ = std::fs::remove_dir_all(&self.dir);
        }
    }

    /// Open a fresh pager under a per-process, per-call unique temp directory.
    fn vault_test_pager() -> VaultTestStore {
        use crate::storage::engine::pager::{Pager, PagerConfig};
        use std::sync::atomic::{AtomicU64, Ordering};
        static COUNTER: AtomicU64 = AtomicU64::new(0);
        let id = COUNTER.fetch_add(1, Ordering::Relaxed);
        let dir =
            std::env::temp_dir().join(format!("reddb_cluster_vault_{}_{}", std::process::id(), id));
        // Build the guard before touching the filesystem so the directory is
        // removed even if opening the pager panics.
        let mut store = VaultTestStore { pager: None, dir };
        std::fs::create_dir_all(&store.dir).unwrap();
        store.pager =
            Some(Pager::open(&store.dir.join("cluster.rdb"), PagerConfig::default()).unwrap());
        store
    }

    #[test]
    fn owner_certificate_unseals_the_same_store_on_restart() {
        use crate::auth::vault::{KeyPair, Vault, VaultState};

        // Only run when the plan authorizes the owner path.
        let plan = authorize_vault_bootstrap(
            DeployProfile::PrimaryReplica,
            false,
            AuthBootstrapInput::Env,
            false,
        )
        .unwrap();
        assert!(matches!(
            plan,
            VaultBootstrapPlan::OpenClusterGlobalStore {
                consume_secret_inputs: true
            }
        ));

        let store = vault_test_pager();
        let pager = store.pager();

        // First boot: mint the certificate and seal the real store.
        let kp = KeyPair::generate();
        let vault = Vault::with_certificate_bytes(pager, &kp.certificate).unwrap();
        let state = VaultState {
            users: vec![],
            api_keys: vec![],
            bootstrapped: true,
            master_secret: Some(kp.master_secret.clone()),
            kv: std::collections::HashMap::new(),
        };
        vault.save(pager, &state).unwrap();

        // Restart (unseal-only plan): the emitted certificate unseals the same
        // store without consuming any secret input.
        let restart_plan = plan_vault_bootstrap(BootstrapDisposition::AlreadyComplete);
        assert_eq!(
            restart_plan,
            VaultBootstrapPlan::OpenClusterGlobalStore {
                consume_secret_inputs: false
            }
        );
        let reopened = Vault::with_certificate(pager, &kp.certificate_hex()).unwrap();
        let loaded = reopened.load(pager).unwrap().unwrap();
        assert!(loaded.bootstrapped);
        assert_eq!(loaded.master_secret, Some(kp.master_secret));
        // `store` is dropped here: pager closed, temp directory removed.
    }

    #[test]
    fn scratch_certificate_cannot_unseal_the_real_store() {
        use crate::auth::vault::{KeyPair, Vault, VaultState};

        let store = vault_test_pager();
        let pager = store.pager();

        // Real cluster-global store sealed by the owner's certificate.
        let owner = KeyPair::generate();
        let vault = Vault::with_certificate_bytes(pager, &owner.certificate).unwrap();
        vault
            .save(
                pager,
                &VaultState {
                    users: vec![],
                    api_keys: vec![],
                    bootstrapped: true,
                    master_secret: Some(owner.master_secret.clone()),
                    kv: std::collections::HashMap::new(),
                },
            )
            .unwrap();

        // A per-member scratch certificate (minted against a different DB)
        // must not unseal the real store — the anti-goal PRD #1227 forbids.
        let scratch = KeyPair::generate();
        let scratch_vault = Vault::with_certificate_bytes(pager, &scratch.certificate).unwrap();
        assert!(scratch_vault.load(pager).is_err());
        // `store` is dropped here: pager closed, temp directory removed.
    }
}