dynomite/cluster/coverage.rs
1//! Token-ring coverage validation.
2//!
3//! Operators sometimes ship configs where one or more peers'
4//! token ranges leave a rack effectively empty, or where two
5//! peers in the same rack claim the same token point. Both
6//! shapes are silent footguns: the dispatcher will either skip
7//! the affected rack (empty continuum) or non-deterministically
8//! prefer one of the colliding peers (duplicate tokens), and
9//! the resulting routing surprises only show up under load.
10//! With the `make_error` fix shipped, the affected requests now
11//! surface as real `NoTargets` errors to clients (previously
12//! they were request-timeout hangs); this validator catches the
13//! offending configs at config-load time before they ever reach
14//! the dispatcher.
15//!
16//! # Ring semantics and why `Gap` is structurally impossible
17//!
18//! [`crate::cluster::vnode::dispatch`] uses wrap-around vnode
19//! semantics: for any non-empty continuum, the first peer owns
20//! `(last.token, u32::MAX]` and `[0, first.token]`, and each
21//! subsequent peer at index `i` owns `(prev.token, current.token]`.
22//! As a consequence, every `u32` token resolves to *some* peer
23//! whenever the rack has at least one continuum entry. There is
24//! no token configuration that leaves a hole in `[0, u32::MAX]`
25//! short of an entirely empty rack. The two failure modes this
26//! validator catches are therefore:
27//!
28//! * [`TokenCoverageError::EmptyRack`] - a `(dc, rack)` pair has
29//! peers but none contribute tokens, or has no peers at all.
30//! * [`TokenCoverageError::Overlap`] - two peers in the same rack
31//! claim the exact same token value, leaving the choice of
32//! primary peer for that point dependent on iteration order.
33//!
34//! A `Gap` variant is intentionally *not* included: gaps are not
35//! reachable under the current Dynomite vnode model, and keeping
36//! the enum to the variants we actually emit avoids dead code.
37//!
38//! # Examples
39//!
40//! ```
41//! use dynomite::cluster::coverage::validate_token_coverage;
42//! use dynomite::cluster::peer::{Peer, PeerEndpoint};
43//! use dynomite::hashkit::DynToken;
44//!
45//! let local = Peer::new(
46//! 0,
47//! PeerEndpoint::tcp("h0".into(), 8101),
48//! "r1".into(),
49//! "dc1".into(),
50//! vec![DynToken::from_u32(0)],
51//! true,
52//! true,
53//! false,
54//! );
55//! assert!(validate_token_coverage(&[local]).is_ok());
56//! ```
57
58use std::collections::BTreeMap;
59
60use thiserror::Error;
61
62use crate::cluster::peer::Peer;
63
64/// Error returned by [`validate_token_coverage`].
65#[derive(Debug, Error, PartialEq, Eq)]
66pub enum TokenCoverageError {
67 /// A `(dc, rack)` pair contributes zero tokens to the ring.
68 /// Either the rack has no peers, or every peer in the rack
69 /// has an empty token list.
70 #[error("rack {dc}/{rack} has no tokens")]
71 EmptyRack {
72 /// Datacenter name.
73 dc: String,
74 /// Rack name.
75 rack: String,
76 },
77 /// Two peers in the same rack claim the same token value.
78 /// The dispatcher's binary search will pick whichever peer
79 /// happened to sort first, which is non-deterministic across
80 /// reloads if peer indices change.
81 #[error(
82 "rack {dc}/{rack} has overlapping token {token} claimed by peer {peer_a} and peer {peer_b}"
83 )]
84 Overlap {
85 /// Datacenter name.
86 dc: String,
87 /// Rack name.
88 rack: String,
89 /// The token value claimed by both peers.
90 token: u32,
91 /// Index of the first claimant peer.
92 peer_a: u32,
93 /// Index of the second claimant peer.
94 peer_b: u32,
95 },
96}
97
98/// Validate that every `(dc, rack)` in `peers` produces a usable
99/// ring under [`crate::cluster::vnode::dispatch`] semantics.
100///
101/// The check is per-rack: a fault in DC2 still rejects even if
102/// DC1 is fully populated, because the dispatcher walks each
103/// rack independently when planning ([`collect_routable`]
104/// pushes one entry per `(dc, rack)`).
105///
106/// Returns the *first* fault encountered. The traversal order is
107/// the natural `BTreeMap` order over `(dc, rack)` strings, which
108/// is stable across runs given identical input.
109///
110/// # Errors
111///
112/// * [`TokenCoverageError::EmptyRack`] if any `(dc, rack)`
113/// contributes zero tokens.
114/// * [`TokenCoverageError::Overlap`] if two peers in the same
115/// rack claim the same token value.
116///
117/// # Examples
118///
119/// ```
120/// use dynomite::cluster::coverage::{validate_token_coverage, TokenCoverageError};
121/// use dynomite::cluster::peer::{Peer, PeerEndpoint};
122/// use dynomite::hashkit::DynToken;
123///
124/// let a = Peer::new(
125/// 0, PeerEndpoint::tcp("a".into(), 8101),
126/// "r".into(), "d".into(),
127/// vec![DynToken::from_u32(7)], true, true, false,
128/// );
129/// let b = Peer::new(
130/// 1, PeerEndpoint::tcp("b".into(), 8101),
131/// "r".into(), "d".into(),
132/// vec![DynToken::from_u32(7)], false, true, false,
133/// );
134/// assert!(matches!(
135/// validate_token_coverage(&[a, b]),
136/// Err(TokenCoverageError::Overlap { .. })
137/// ));
138/// ```
139pub fn validate_token_coverage(peers: &[Peer]) -> Result<(), TokenCoverageError> {
140 let mut by_rack: BTreeMap<(&str, &str), Vec<&Peer>> = BTreeMap::new();
141 for p in peers {
142 by_rack.entry((p.dc(), p.rack())).or_default().push(p);
143 }
144 for ((dc, rack), rack_peers) in &by_rack {
145 let mut tokens: Vec<(u32, u32)> = Vec::new();
146 for p in rack_peers {
147 for t in p.tokens() {
148 tokens.push((t.get_int(), p.idx()));
149 }
150 }
151 if tokens.is_empty() {
152 return Err(TokenCoverageError::EmptyRack {
153 dc: (*dc).to_string(),
154 rack: (*rack).to_string(),
155 });
156 }
157 tokens.sort_by_key(|&(t, _)| t);
158 for w in tokens.windows(2) {
159 if w[0].0 == w[1].0 {
160 return Err(TokenCoverageError::Overlap {
161 dc: (*dc).to_string(),
162 rack: (*rack).to_string(),
163 token: w[0].0,
164 peer_a: w[0].1,
165 peer_b: w[1].1,
166 });
167 }
168 }
169 }
170 Ok(())
171}
172
173#[cfg(test)]
174mod tests {
175 use super::*;
176 use crate::cluster::peer::PeerEndpoint;
177 use crate::hashkit::DynToken;
178
179 fn mk(idx: u32, dc: &str, rack: &str, tokens: &[u32]) -> Peer {
180 Peer::new(
181 idx,
182 PeerEndpoint::tcp("127.0.0.1".into(), 8101 + u16::try_from(idx).unwrap_or(0)),
183 rack.into(),
184 dc.into(),
185 tokens.iter().copied().map(DynToken::from_u32).collect(),
186 idx == 0,
187 true,
188 false,
189 )
190 }
191
192 #[test]
193 fn empty_input_is_ok() {
194 assert!(validate_token_coverage(&[]).is_ok());
195 }
196
197 #[test]
198 fn empty_rack_rejected() {
199 // Single peer with no tokens at all: the rack
200 // contributes nothing to the continuum, so the
201 // dispatcher would silently produce zero candidates for
202 // every key hashed against this rack.
203 let p = mk(0, "dc1", "r1", &[]);
204 let err = validate_token_coverage(&[p]).unwrap_err();
205 assert_eq!(
206 err,
207 TokenCoverageError::EmptyRack {
208 dc: "dc1".into(),
209 rack: "r1".into(),
210 }
211 );
212 }
213
214 #[test]
215 fn duplicate_token_rejected() {
216 // Two peers in the same rack with identical tokens:
217 // dispatch would non-deterministically pick whichever
218 // sorted first, producing reload-dependent routing.
219 let a = mk(0, "dc1", "r1", &[1_234_567]);
220 let b = mk(1, "dc1", "r1", &[1_234_567]);
221 let err = validate_token_coverage(&[a, b]).unwrap_err();
222 match err {
223 TokenCoverageError::Overlap {
224 dc,
225 rack,
226 token,
227 peer_a,
228 peer_b,
229 } => {
230 assert_eq!(dc, "dc1");
231 assert_eq!(rack, "r1");
232 assert_eq!(token, 1_234_567);
233 let mut got = [peer_a, peer_b];
234 got.sort_unstable();
235 assert_eq!(got, [0, 1]);
236 }
237 TokenCoverageError::EmptyRack { .. } => panic!("expected Overlap, got EmptyRack"),
238 }
239 }
240
241 #[test]
242 fn duplicate_across_racks_is_fine() {
243 // Same token in different racks is the standard vnode
244 // pattern (each rack is an independent replica of the
245 // ring) and must not be rejected.
246 let a = mk(0, "dc1", "r1", &[42]);
247 let b = mk(1, "dc1", "r2", &[42]);
248 assert!(validate_token_coverage(&[a, b]).is_ok());
249 }
250
251 #[test]
252 fn duplicate_across_dcs_is_fine() {
253 // Same token in different DCs is also normal: each DC
254 // walks its own continuum.
255 let a = mk(0, "dc1", "r1", &[42]);
256 let b = mk(1, "dc2", "r1", &[42]);
257 assert!(validate_token_coverage(&[a, b]).is_ok());
258 }
259
260 #[test]
261 fn valid_3_peer_ring() {
262 // Even token spacing for a 3-way split.
263 let a = mk(0, "dc1", "r1", &[0]);
264 let b = mk(1, "dc1", "r1", &[1_431_655_765]);
265 let c = mk(2, "dc1", "r1", &[2_863_311_530]);
266 assert!(validate_token_coverage(&[a, b, c]).is_ok());
267 }
268
269 #[test]
270 fn valid_4_peer_ring() {
271 // Even token spacing for a 4-way split (the shape pass-3
272 // intended).
273 let a = mk(0, "dc1", "r1", &[0]);
274 let b = mk(1, "dc1", "r1", &[1_073_741_824]);
275 let c = mk(2, "dc1", "r1", &[2_147_483_648]);
276 let d = mk(3, "dc1", "r1", &[3_221_225_472]);
277 assert!(validate_token_coverage(&[a, b, c, d]).is_ok());
278 }
279
280 #[test]
281 fn pass3_3_peer_subset_is_valid_config() {
282 // Pass-3 ran with only the first three of the four
283 // intended peers: floki/0, arnold/1G, nuc/2G. By the
284 // vnode wrap-around semantics this is a fully-covered
285 // ring (peer 0 owns the wrap segment from 2G+1 through
286 // u32::MAX plus the singleton at 0), so the validator
287 // must accept it. The 14% workload error rate observed
288 // in the chaos run was therefore *not* a token-coverage
289 // gap; see docs/journal/2026-05-25-token-coverage-validation.md
290 // for the alternative root-cause hypotheses.
291 let a = mk(0, "dc1", "r1", &[0]);
292 let b = mk(1, "dc1", "r1", &[1_073_741_824]);
293 let c = mk(2, "dc1", "r1", &[2_147_483_648]);
294 assert!(validate_token_coverage(&[a, b, c]).is_ok());
295 }
296
297 #[test]
298 fn multi_token_per_peer_is_fine() {
299 // A single peer holding several tokens (vnode "virtual
300 // node" pattern) is a valid configuration.
301 let a = mk(0, "dc1", "r1", &[0, 1_073_741_824]);
302 let b = mk(1, "dc1", "r1", &[2_147_483_648, 3_221_225_472]);
303 assert!(validate_token_coverage(&[a, b]).is_ok());
304 }
305
306 #[test]
307 fn duplicate_within_single_peer_token_list_rejected() {
308 // A peer whose own token list contains the same token
309 // twice is also a fault: the rack-level continuum will
310 // contain two entries with the same key and the same
311 // peer_idx, breaking the strictly-increasing precondition
312 // the dispatcher relies on.
313 let a = mk(0, "dc1", "r1", &[5, 5]);
314 let err = validate_token_coverage(&[a]).unwrap_err();
315 assert!(matches!(err, TokenCoverageError::Overlap { token: 5, .. }));
316 }
317
318 #[test]
319 fn fault_in_second_dc_still_rejects() {
320 // First DC is fine; second DC has a duplicate. The
321 // validator must still reject because the dispatcher
322 // walks each rack independently.
323 let a = mk(0, "dc1", "r1", &[10]);
324 let b = mk(1, "dc1", "r1", &[20]);
325 let c = mk(2, "dc2", "r1", &[100]);
326 let d = mk(3, "dc2", "r1", &[100]);
327 let err = validate_token_coverage(&[a, b, c, d]).unwrap_err();
328 match err {
329 TokenCoverageError::Overlap { dc, .. } => assert_eq!(dc, "dc2"),
330 TokenCoverageError::EmptyRack { .. } => {
331 panic!("expected Overlap in dc2, got EmptyRack")
332 }
333 }
334 }
335
336 #[test]
337 fn empty_rack_in_second_dc_still_rejects() {
338 // A peer present in dc2/r1 but with no tokens leaves the
339 // second DC empty even though the first is healthy.
340 let a = mk(0, "dc1", "r1", &[10]);
341 let b = mk(1, "dc2", "r1", &[]);
342 let err = validate_token_coverage(&[a, b]).unwrap_err();
343 assert_eq!(
344 err,
345 TokenCoverageError::EmptyRack {
346 dc: "dc2".into(),
347 rack: "r1".into(),
348 }
349 );
350 }
351}