// SPDX-License-Identifier: BUSL-1.1
// nodedb_cluster/bootstrap/config.rs

//! Cluster configuration and post-start state.
use std::net::SocketAddr;
use std::sync::{Arc, Mutex, RwLock};
use std::time::Duration;

use crate::multi_raft::MultiRaft;
use crate::routing::RoutingTable;
use crate::topology::ClusterTopology;
/// Tunable retry policy for the join loop.
///
/// The schedule is computed by halving from the configured ceiling:
/// for `max_attempts = 8` and `max_backoff_secs = 32`, the per-attempt
/// delays are `0.25 s, 0.5 s, 1 s, 2 s, 4 s, 8 s, 16 s, 32 s` — i.e.
/// each delay is `max_backoff_secs >> (max_attempts - attempt)`. This
/// keeps the formula obvious from a single number while preserving
/// exponential growth.
///
/// Defaults match the production schedule. Tests construct their own
/// policy with a much smaller `max_backoff_secs` so the integration
/// suite doesn't pay a ~minute backoff on every join failure path.
#[derive(Debug, Clone, Copy)]
pub struct JoinRetryPolicy {
    /// Number of join attempts before the loop gives up.
    pub max_attempts: u32,
    /// Cap on the per-attempt backoff delay, in seconds. The schedule
    /// is derived from this ceiling — see the struct doc comment.
    pub max_backoff_secs: u64,
}

impl Default for JoinRetryPolicy {
    fn default() -> Self {
        Self {
            max_attempts: 8,
            max_backoff_secs: 32,
        }
    }
}

impl JoinRetryPolicy {
    /// Backoff delay before `attempt` (1-indexed). Attempt 0 is the
    /// initial try and never sleeps. Returns `Duration::ZERO` for
    /// out-of-range attempts.
    pub fn backoff_for(&self, attempt: u32) -> Duration {
        if attempt == 0 || attempt > self.max_attempts {
            return Duration::ZERO;
        }
        // Schedule grows exponentially toward `max_backoff_secs`. We
        // compute in millis so small `max_backoff_secs` values (test
        // configs) still produce non-zero delays for the early
        // attempts instead of being floored to zero seconds.
        let exp = self.max_attempts - attempt;
        let max_ms = self.max_backoff_secs.saturating_mul(1_000);
        // `checked_shr` guards against `exp >= 64`, which is reachable
        // whenever a policy is configured with `max_attempts > 64`: a
        // plain `>>` would panic in debug builds and silently shift by
        // `exp % 64` (producing a far-too-large delay) in release
        // builds. An over-shift means "effectively zero"; the `.max(1)`
        // floor below then yields the minimum 1 ms delay.
        let ms = max_ms.checked_shr(exp).unwrap_or(0);
        Duration::from_millis(ms.max(1))
    }
}
63/// Configuration for cluster formation.
64#[derive(Debug, Clone)]
65pub struct ClusterConfig {
66    /// This node's unique ID.
67    pub node_id: u64,
68    /// Address to listen on for Raft RPCs.
69    pub listen_addr: SocketAddr,
70    /// Seed node addresses for bootstrap/join.
71    pub seed_nodes: Vec<SocketAddr>,
72    /// Number of Raft groups to create on bootstrap.
73    pub num_groups: u64,
74    /// Replication factor (number of replicas per group).
75    pub replication_factor: usize,
76    /// Data directory for persistent Raft log storage.
77    pub data_dir: std::path::PathBuf,
78    /// Operator escape hatch: bypass the probe phase and bootstrap this
79    /// node unconditionally even if it is not the lexicographically
80    /// smallest seed.
81    ///
82    /// Set this only on disaster recovery when the designated
83    /// bootstrapper is permanently unreachable. Requires `listen_addr`
84    /// to be present in `seed_nodes` (enforced at the caller's config
85    /// validation layer).
86    pub force_bootstrap: bool,
87    /// Retry policy for the join loop. Defaults to production values
88    /// (`8` attempts, `32 s` ceiling). Tests override this with a
89    /// faster policy.
90    pub join_retry: JoinRetryPolicy,
91    /// Optional UDP bind address for the SWIM failure detector. `None`
92    /// disables SWIM entirely — cluster startup then relies solely on
93    /// the existing raft transport for membership observations. When
94    /// `Some`, the operator is expected to spawn SWIM separately via
95    /// [`crate::spawn_swim`] after the cluster is up and feed the
96    /// seed list from `seed_nodes`.
97    pub swim_udp_addr: Option<SocketAddr>,
98    /// Raft election timeout range. Controls how long a follower waits
99    /// before starting an election after losing contact with the leader.
100    pub election_timeout_min: Duration,
101    pub election_timeout_max: Duration,
102    /// Maximum byte size of each `InstallSnapshot` RPC chunk.
103    ///
104    /// Defaults to 4 MiB. Larger values reduce round-trip count at the cost
105    /// of higher per-RPC memory pressure. Smaller values improve retry
106    /// granularity for flaky links.
107    pub install_snapshot_chunk_bytes: u64,
108    /// Age in seconds beyond which a `.partial` snapshot file is considered
109    /// orphaned and can be removed by the GC sweep.
110    ///
111    /// Defaults to 300 s (5 min). A partial file is orphaned when the
112    /// leader that was sending it has since lost leadership or crashed.
113    pub orphan_partial_max_age_secs: u64,
114}
115
116/// Result of cluster startup — everything needed to run the Raft loop.
117///
118/// All mutable fields are wrapped in `Arc<RwLock<T>>` or `Arc<Mutex<T>>`
119/// so subsystems started during `start_cluster` can hold live references
120/// to the same shared state without copying it out of the initial
121/// bootstrap result. The `RunningCluster` produced by the subsystem
122/// registry holds `Arc` clones that keep the data alive alongside the
123/// caller's `ClusterHandle`.
124pub struct ClusterState {
125    pub topology: Arc<RwLock<ClusterTopology>>,
126    pub routing: Arc<RwLock<RoutingTable>>,
127    pub multi_raft: Arc<Mutex<MultiRaft>>,
128}