// nodedb_cluster/bootstrap/config.rs
1// SPDX-License-Identifier: BUSL-1.1
2
3//! Cluster configuration and post-start state.
4
5use std::net::SocketAddr;
6use std::time::Duration;
7
8use std::sync::{Arc, Mutex, RwLock};
9
10use crate::multi_raft::MultiRaft;
11use crate::routing::RoutingTable;
12use crate::topology::ClusterTopology;
13
/// Tunable retry policy for the join loop.
///
/// The schedule is computed by halving down from the configured ceiling:
/// for `max_attempts = 8` and `max_backoff_secs = 32`, the per-attempt
/// delays are `0.25 s, 0.5 s, 1 s, 2 s, 4 s, 8 s, 16 s, 32 s` — i.e.
/// each delay is `max_backoff_secs >> (max_attempts - attempt)`. This
/// keeps the formula obvious from a single number while preserving
/// exponential growth.
///
/// Defaults match the production schedule. Tests construct their own
/// policy with a much smaller `max_backoff_secs` so the integration
/// suite doesn't pay a ~minute backoff on every join failure path.
#[derive(Debug, Clone, Copy)]
pub struct JoinRetryPolicy {
    /// Number of join attempts before the loop gives up.
    pub max_attempts: u32,
    /// Cap on the per-attempt backoff delay, in seconds. The schedule
    /// is derived from this ceiling — see the struct doc comment.
    pub max_backoff_secs: u64,
}
34
35impl Default for JoinRetryPolicy {
36 fn default() -> Self {
37 Self {
38 max_attempts: 8,
39 max_backoff_secs: 32,
40 }
41 }
42}
43
44impl JoinRetryPolicy {
45 /// Backoff delay before `attempt` (1-indexed). Attempt 0 is the
46 /// initial try and never sleeps. Returns `Duration::ZERO` for
47 /// out-of-range attempts.
48 pub fn backoff_for(&self, attempt: u32) -> Duration {
49 if attempt == 0 || attempt > self.max_attempts {
50 return Duration::ZERO;
51 }
52 // Schedule grows exponentially toward `max_backoff_secs`. We
53 // compute in millis so small `max_backoff_secs` values (test
54 // configs) still produce non-zero delays for the early
55 // attempts instead of being floored to zero seconds.
56 let exp = self.max_attempts - attempt;
57 let max_ms = self.max_backoff_secs.saturating_mul(1_000);
58 let ms = max_ms >> exp;
59 Duration::from_millis(ms.max(1))
60 }
61}
62
/// Configuration for cluster formation.
#[derive(Debug, Clone)]
pub struct ClusterConfig {
    /// This node's unique ID.
    pub node_id: u64,
    /// Address to listen on for Raft RPCs.
    pub listen_addr: SocketAddr,
    /// Seed node addresses for bootstrap/join.
    pub seed_nodes: Vec<SocketAddr>,
    /// Number of Raft groups to create on bootstrap.
    pub num_groups: u64,
    /// Replication factor (number of replicas per group).
    pub replication_factor: usize,
    /// Data directory for persistent Raft log storage.
    pub data_dir: std::path::PathBuf,
    /// Operator escape hatch: bypass the probe phase and bootstrap this
    /// node unconditionally even if it is not the lexicographically
    /// smallest seed.
    ///
    /// Set this only on disaster recovery when the designated
    /// bootstrapper is permanently unreachable. Requires `listen_addr`
    /// to be present in `seed_nodes` (enforced at the caller's config
    /// validation layer).
    pub force_bootstrap: bool,
    /// Retry policy for the join loop. Defaults to production values
    /// (`8` attempts, `32 s` ceiling). Tests override this with a
    /// faster policy.
    pub join_retry: JoinRetryPolicy,
    /// Optional UDP bind address for the SWIM failure detector. `None`
    /// disables SWIM entirely — cluster startup then relies solely on
    /// the existing raft transport for membership observations. When
    /// `Some`, the operator is expected to spawn SWIM separately via
    /// [`crate::spawn_swim`] after the cluster is up and feed the
    /// seed list from `seed_nodes`.
    pub swim_udp_addr: Option<SocketAddr>,
    /// Lower bound of the Raft election timeout range. Controls how
    /// long a follower waits before starting an election after losing
    /// contact with the leader.
    pub election_timeout_min: Duration,
    /// Upper bound of the Raft election timeout range. Must be at
    /// least `election_timeout_min` — presumably the actual timeout is
    /// randomized within `[min, max]`; confirm at the site that reads
    /// these fields.
    pub election_timeout_max: Duration,
    /// Maximum byte size of each `InstallSnapshot` RPC chunk.
    ///
    /// Defaults to 4 MiB. Larger values reduce round-trip count at the cost
    /// of higher per-RPC memory pressure. Smaller values improve retry
    /// granularity for flaky links.
    pub install_snapshot_chunk_bytes: u64,
    /// Age in seconds beyond which a `.partial` snapshot file is considered
    /// orphaned and can be removed by the GC sweep.
    ///
    /// Defaults to 300 s (5 min). A partial file is orphaned when the
    /// leader that was sending it has since lost leadership or crashed.
    pub orphan_partial_max_age_secs: u64,
}
115
/// Result of cluster startup — everything needed to run the Raft loop.
///
/// All mutable fields are wrapped in `Arc<RwLock<T>>` or `Arc<Mutex<T>>`
/// so subsystems started during `start_cluster` can hold live references
/// to the same shared state without copying it out of the initial
/// bootstrap result. The `RunningCluster` produced by the subsystem
/// registry holds `Arc` clones that keep the data alive alongside the
/// caller's `ClusterHandle`.
pub struct ClusterState {
    /// Shared view of cluster membership; `RwLock` suggests reads
    /// dominate writes here.
    pub topology: Arc<RwLock<ClusterTopology>>,
    /// Shared routing table mapping requests to Raft groups.
    pub routing: Arc<RwLock<RoutingTable>>,
    /// The multi-group Raft driver; `Mutex` (not `RwLock`) implies
    /// exclusive access for every operation.
    pub multi_raft: Arc<Mutex<MultiRaft>>,
}