dynomite/cluster/
coverage.rs

1//! Token-ring coverage validation.
2//!
3//! Operators sometimes ship configs where one or more peers'
4//! token ranges leave a rack effectively empty, or where two
5//! peers in the same rack claim the same token point. Both
6//! shapes are silent footguns: the dispatcher will either skip
7//! the affected rack (empty continuum) or non-deterministically
8//! prefer one of the colliding peers (duplicate tokens), and
9//! the resulting routing surprises only show up under load.
10//! With the `make_error` fix shipped, the affected requests now
11//! surface as real `NoTargets` errors to clients (previously
12//! they were request-timeout hangs); this validator catches the
13//! offending configs at config-load time before they ever reach
14//! the dispatcher.
15//!
16//! # Ring semantics and why `Gap` is structurally impossible
17//!
18//! [`crate::cluster::vnode::dispatch`] uses wrap-around vnode
19//! semantics: for any non-empty continuum, the first peer owns
20//! `(last.token, u32::MAX]` and `[0, first.token]`, and each
21//! subsequent peer at index `i` owns `(prev.token, current.token]`.
22//! As a consequence, every `u32` token resolves to *some* peer
23//! whenever the rack has at least one continuum entry. There is
24//! no token configuration that leaves a hole in `[0, u32::MAX]`
25//! short of an entirely empty rack. The two failure modes this
26//! validator catches are therefore:
27//!
28//! * [`TokenCoverageError::EmptyRack`] - a `(dc, rack)` pair has
29//!   peers but none contribute tokens, or has no peers at all.
30//! * [`TokenCoverageError::Overlap`] - two peers in the same rack
31//!   claim the exact same token value, leaving the choice of
32//!   primary peer for that point dependent on iteration order.
33//!
34//! A `Gap` variant is intentionally *not* included: gaps are not
35//! reachable under the current Dynomite vnode model, and keeping
36//! the enum to the variants we actually emit avoids dead code.
37//!
38//! # Examples
39//!
40//! ```
41//! use dynomite::cluster::coverage::validate_token_coverage;
42//! use dynomite::cluster::peer::{Peer, PeerEndpoint};
43//! use dynomite::hashkit::DynToken;
44//!
45//! let local = Peer::new(
46//!     0,
47//!     PeerEndpoint::tcp("h0".into(), 8101),
48//!     "r1".into(),
49//!     "dc1".into(),
50//!     vec![DynToken::from_u32(0)],
51//!     true,
52//!     true,
53//!     false,
54//! );
55//! assert!(validate_token_coverage(&[local]).is_ok());
56//! ```
57
58use std::collections::BTreeMap;
59
60use thiserror::Error;
61
62use crate::cluster::peer::Peer;
63
64/// Error returned by [`validate_token_coverage`].
65#[derive(Debug, Error, PartialEq, Eq)]
66pub enum TokenCoverageError {
67    /// A `(dc, rack)` pair contributes zero tokens to the ring.
68    /// Either the rack has no peers, or every peer in the rack
69    /// has an empty token list.
70    #[error("rack {dc}/{rack} has no tokens")]
71    EmptyRack {
72        /// Datacenter name.
73        dc: String,
74        /// Rack name.
75        rack: String,
76    },
77    /// Two peers in the same rack claim the same token value.
78    /// The dispatcher's binary search will pick whichever peer
79    /// happened to sort first, which is non-deterministic across
80    /// reloads if peer indices change.
81    #[error(
82        "rack {dc}/{rack} has overlapping token {token} claimed by peer {peer_a} and peer {peer_b}"
83    )]
84    Overlap {
85        /// Datacenter name.
86        dc: String,
87        /// Rack name.
88        rack: String,
89        /// The token value claimed by both peers.
90        token: u32,
91        /// Index of the first claimant peer.
92        peer_a: u32,
93        /// Index of the second claimant peer.
94        peer_b: u32,
95    },
96}
97
98/// Validate that every `(dc, rack)` in `peers` produces a usable
99/// ring under [`crate::cluster::vnode::dispatch`] semantics.
100///
101/// The check is per-rack: a fault in DC2 still rejects even if
102/// DC1 is fully populated, because the dispatcher walks each
103/// rack independently when planning ([`collect_routable`]
104/// pushes one entry per `(dc, rack)`).
105///
106/// Returns the *first* fault encountered. The traversal order is
107/// the natural `BTreeMap` order over `(dc, rack)` strings, which
108/// is stable across runs given identical input.
109///
110/// # Errors
111///
112/// * [`TokenCoverageError::EmptyRack`] if any `(dc, rack)`
113///   contributes zero tokens.
114/// * [`TokenCoverageError::Overlap`] if two peers in the same
115///   rack claim the same token value.
116///
117/// # Examples
118///
119/// ```
120/// use dynomite::cluster::coverage::{validate_token_coverage, TokenCoverageError};
121/// use dynomite::cluster::peer::{Peer, PeerEndpoint};
122/// use dynomite::hashkit::DynToken;
123///
124/// let a = Peer::new(
125///     0, PeerEndpoint::tcp("a".into(), 8101),
126///     "r".into(), "d".into(),
127///     vec![DynToken::from_u32(7)], true, true, false,
128/// );
129/// let b = Peer::new(
130///     1, PeerEndpoint::tcp("b".into(), 8101),
131///     "r".into(), "d".into(),
132///     vec![DynToken::from_u32(7)], false, true, false,
133/// );
134/// assert!(matches!(
135///     validate_token_coverage(&[a, b]),
136///     Err(TokenCoverageError::Overlap { .. })
137/// ));
138/// ```
139pub fn validate_token_coverage(peers: &[Peer]) -> Result<(), TokenCoverageError> {
140    let mut by_rack: BTreeMap<(&str, &str), Vec<&Peer>> = BTreeMap::new();
141    for p in peers {
142        by_rack.entry((p.dc(), p.rack())).or_default().push(p);
143    }
144    for ((dc, rack), rack_peers) in &by_rack {
145        let mut tokens: Vec<(u32, u32)> = Vec::new();
146        for p in rack_peers {
147            for t in p.tokens() {
148                tokens.push((t.get_int(), p.idx()));
149            }
150        }
151        if tokens.is_empty() {
152            return Err(TokenCoverageError::EmptyRack {
153                dc: (*dc).to_string(),
154                rack: (*rack).to_string(),
155            });
156        }
157        tokens.sort_by_key(|&(t, _)| t);
158        for w in tokens.windows(2) {
159            if w[0].0 == w[1].0 {
160                return Err(TokenCoverageError::Overlap {
161                    dc: (*dc).to_string(),
162                    rack: (*rack).to_string(),
163                    token: w[0].0,
164                    peer_a: w[0].1,
165                    peer_b: w[1].1,
166                });
167            }
168        }
169    }
170    Ok(())
171}
172
173#[cfg(test)]
174mod tests {
175    use super::*;
176    use crate::cluster::peer::PeerEndpoint;
177    use crate::hashkit::DynToken;
178
179    fn mk(idx: u32, dc: &str, rack: &str, tokens: &[u32]) -> Peer {
180        Peer::new(
181            idx,
182            PeerEndpoint::tcp("127.0.0.1".into(), 8101 + u16::try_from(idx).unwrap_or(0)),
183            rack.into(),
184            dc.into(),
185            tokens.iter().copied().map(DynToken::from_u32).collect(),
186            idx == 0,
187            true,
188            false,
189        )
190    }
191
192    #[test]
193    fn empty_input_is_ok() {
194        assert!(validate_token_coverage(&[]).is_ok());
195    }
196
197    #[test]
198    fn empty_rack_rejected() {
199        // Single peer with no tokens at all: the rack
200        // contributes nothing to the continuum, so the
201        // dispatcher would silently produce zero candidates for
202        // every key hashed against this rack.
203        let p = mk(0, "dc1", "r1", &[]);
204        let err = validate_token_coverage(&[p]).unwrap_err();
205        assert_eq!(
206            err,
207            TokenCoverageError::EmptyRack {
208                dc: "dc1".into(),
209                rack: "r1".into(),
210            }
211        );
212    }
213
214    #[test]
215    fn duplicate_token_rejected() {
216        // Two peers in the same rack with identical tokens:
217        // dispatch would non-deterministically pick whichever
218        // sorted first, producing reload-dependent routing.
219        let a = mk(0, "dc1", "r1", &[1_234_567]);
220        let b = mk(1, "dc1", "r1", &[1_234_567]);
221        let err = validate_token_coverage(&[a, b]).unwrap_err();
222        match err {
223            TokenCoverageError::Overlap {
224                dc,
225                rack,
226                token,
227                peer_a,
228                peer_b,
229            } => {
230                assert_eq!(dc, "dc1");
231                assert_eq!(rack, "r1");
232                assert_eq!(token, 1_234_567);
233                let mut got = [peer_a, peer_b];
234                got.sort_unstable();
235                assert_eq!(got, [0, 1]);
236            }
237            TokenCoverageError::EmptyRack { .. } => panic!("expected Overlap, got EmptyRack"),
238        }
239    }
240
241    #[test]
242    fn duplicate_across_racks_is_fine() {
243        // Same token in different racks is the standard vnode
244        // pattern (each rack is an independent replica of the
245        // ring) and must not be rejected.
246        let a = mk(0, "dc1", "r1", &[42]);
247        let b = mk(1, "dc1", "r2", &[42]);
248        assert!(validate_token_coverage(&[a, b]).is_ok());
249    }
250
251    #[test]
252    fn duplicate_across_dcs_is_fine() {
253        // Same token in different DCs is also normal: each DC
254        // walks its own continuum.
255        let a = mk(0, "dc1", "r1", &[42]);
256        let b = mk(1, "dc2", "r1", &[42]);
257        assert!(validate_token_coverage(&[a, b]).is_ok());
258    }
259
260    #[test]
261    fn valid_3_peer_ring() {
262        // Even token spacing for a 3-way split.
263        let a = mk(0, "dc1", "r1", &[0]);
264        let b = mk(1, "dc1", "r1", &[1_431_655_765]);
265        let c = mk(2, "dc1", "r1", &[2_863_311_530]);
266        assert!(validate_token_coverage(&[a, b, c]).is_ok());
267    }
268
269    #[test]
270    fn valid_4_peer_ring() {
271        // Even token spacing for a 4-way split (the shape pass-3
272        // intended).
273        let a = mk(0, "dc1", "r1", &[0]);
274        let b = mk(1, "dc1", "r1", &[1_073_741_824]);
275        let c = mk(2, "dc1", "r1", &[2_147_483_648]);
276        let d = mk(3, "dc1", "r1", &[3_221_225_472]);
277        assert!(validate_token_coverage(&[a, b, c, d]).is_ok());
278    }
279
280    #[test]
281    fn pass3_3_peer_subset_is_valid_config() {
282        // Pass-3 ran with only the first three of the four
283        // intended peers: floki/0, arnold/1G, nuc/2G. By the
284        // vnode wrap-around semantics this is a fully-covered
285        // ring (peer 0 owns the wrap segment from 2G+1 through
286        // u32::MAX plus the singleton at 0), so the validator
287        // must accept it. The 14% workload error rate observed
288        // in the chaos run was therefore *not* a token-coverage
289        // gap; see docs/journal/2026-05-25-token-coverage-validation.md
290        // for the alternative root-cause hypotheses.
291        let a = mk(0, "dc1", "r1", &[0]);
292        let b = mk(1, "dc1", "r1", &[1_073_741_824]);
293        let c = mk(2, "dc1", "r1", &[2_147_483_648]);
294        assert!(validate_token_coverage(&[a, b, c]).is_ok());
295    }
296
297    #[test]
298    fn multi_token_per_peer_is_fine() {
299        // A single peer holding several tokens (vnode "virtual
300        // node" pattern) is a valid configuration.
301        let a = mk(0, "dc1", "r1", &[0, 1_073_741_824]);
302        let b = mk(1, "dc1", "r1", &[2_147_483_648, 3_221_225_472]);
303        assert!(validate_token_coverage(&[a, b]).is_ok());
304    }
305
306    #[test]
307    fn duplicate_within_single_peer_token_list_rejected() {
308        // A peer whose own token list contains the same token
309        // twice is also a fault: the rack-level continuum will
310        // contain two entries with the same key and the same
311        // peer_idx, breaking the strictly-increasing precondition
312        // the dispatcher relies on.
313        let a = mk(0, "dc1", "r1", &[5, 5]);
314        let err = validate_token_coverage(&[a]).unwrap_err();
315        assert!(matches!(err, TokenCoverageError::Overlap { token: 5, .. }));
316    }
317
318    #[test]
319    fn fault_in_second_dc_still_rejects() {
320        // First DC is fine; second DC has a duplicate. The
321        // validator must still reject because the dispatcher
322        // walks each rack independently.
323        let a = mk(0, "dc1", "r1", &[10]);
324        let b = mk(1, "dc1", "r1", &[20]);
325        let c = mk(2, "dc2", "r1", &[100]);
326        let d = mk(3, "dc2", "r1", &[100]);
327        let err = validate_token_coverage(&[a, b, c, d]).unwrap_err();
328        match err {
329            TokenCoverageError::Overlap { dc, .. } => assert_eq!(dc, "dc2"),
330            TokenCoverageError::EmptyRack { .. } => {
331                panic!("expected Overlap in dc2, got EmptyRack")
332            }
333        }
334    }
335
336    #[test]
337    fn empty_rack_in_second_dc_still_rejects() {
338        // A peer present in dc2/r1 but with no tokens leaves the
339        // second DC empty even though the first is healthy.
340        let a = mk(0, "dc1", "r1", &[10]);
341        let b = mk(1, "dc2", "r1", &[]);
342        let err = validate_token_coverage(&[a, b]).unwrap_err();
343        assert_eq!(
344            err,
345            TokenCoverageError::EmptyRack {
346                dc: "dc2".into(),
347                rack: "r1".into(),
348            }
349        );
350    }
351}
dynomite/cluster/coverage.rs

dynomite/cluster/
coverage.rs