nodedb-cluster 0.1.1

Distributed coordination layer for NodeDB — vShards, QUIC transport, and replication
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
// SPDX-License-Identifier: BUSL-1.1

//! Pre-validation pass for a candidate epoch batch.
//!
//! The validator detects intra-batch write-set conflicts and produces a
//! deterministically-ordered list of admitted and rejected transactions. No
//! cross-epoch read-set checks are performed here; cross-epoch read-after-write
//! conflict detection is the scheduler's responsibility.
//!
//! # Ordering
//!
//! Transactions are ordered by `(inbox_seq, tenant_id, hash(plans))`.
//! `inbox_seq` is assigned by the sequencer leader at admit time and is the
//! primary sort key; `tenant_id` and `hash(plans)` only break ties. `xxh3_64`
//! (from `xxhash-rust`) is used for `hash(plans)` because it is byte-stable
//! across processes and architectures.
//!
//! # Conflict detection
//!
//! A flat list of `(engine_discriminant, collection, key_bytes)` is built from
//! each transaction's write set. The list is sorted lexicographically. Adjacent
//! entries with identical `(discriminant, collection, key_bytes)` from different
//! txn indices indicate a conflict; the later txn (higher sort key, i.e. higher
//! `inbox_seq`) is rejected.

use std::cmp::Ordering;

use xxhash_rust::xxh3::xxh3_64;

use crate::calvin::sequencer::error::SequencerError;
use crate::calvin::sequencer::inbox::{AdmittedTx, RejectedTx};
use crate::calvin::sequencer::metrics::ConflictKey;
use crate::calvin::types::{EngineKeySet, SequencedTxn};

/// One written key, flattened out of a transaction's write set and tagged with
/// the transaction that produced it. Used only during conflict detection.
#[derive(Debug)]
struct WriteEntry {
    /// Numeric tag of the engine variant (Document=0, Vector=1, Kv=2, Edge=3).
    discriminant: u8,
    /// Static engine name, used when building conflict context keys.
    engine_name: &'static str,
    /// Name of the collection the key belongs to.
    collection: String,
    /// Serialized bytes of the written key.
    key_bytes: Vec<u8>,
    /// Index of the owning transaction in the sorted-batch `Vec<AdmittedTx>`.
    txn_index: usize,
}

impl WriteEntry {
    /// Borrowed `(discriminant, collection, key_bytes)` triple identifying the
    /// written key. `engine_name` and `txn_index` are left out, so two entries
    /// from different transactions compare equal when they target the same key.
    fn sort_key(&self) -> (&u8, &str, &[u8]) {
        let Self {
            discriminant,
            collection,
            key_bytes,
            ..
        } = self;
        (discriminant, collection.as_str(), key_bytes.as_slice())
    }
}

/// Expand every key in `tx`'s write set into a [`WriteEntry`] tagged with
/// `txn_index`.
///
/// Key encoding per engine variant:
/// - `Document` / `Vector`: the surrogate's little-endian bytes.
/// - `Kv`: a copy of the key as stored in the key set.
/// - `Edge`: little-endian bytes of `src` followed by those of `dst`.
fn flatten_write_set(tx: &AdmittedTx, txn_index: usize) -> Vec<WriteEntry> {
    // Shared constructor so each match arm only spells out what differs.
    let entry = |discriminant: u8,
                 engine_name: &'static str,
                 collection: String,
                 key_bytes: Vec<u8>| WriteEntry {
        discriminant,
        engine_name,
        collection,
        key_bytes,
        txn_index,
    };

    let mut entries = Vec::new();
    for key_set in &tx.tx_class.write_set.0 {
        match key_set {
            EngineKeySet::Document {
                collection,
                surrogates,
            } => entries.extend(
                surrogates
                    .as_slice()
                    .iter()
                    .map(|&id| entry(0, "document", collection.clone(), id.to_le_bytes().to_vec())),
            ),
            EngineKeySet::Vector {
                collection,
                surrogates,
            } => entries.extend(
                surrogates
                    .as_slice()
                    .iter()
                    .map(|&id| entry(1, "vector", collection.clone(), id.to_le_bytes().to_vec())),
            ),
            EngineKeySet::Kv { collection, keys } => entries.extend(
                keys.as_slice()
                    .iter()
                    .map(|key| entry(2, "kv", collection.clone(), key.clone())),
            ),
            EngineKeySet::Edge { collection, edges } => {
                entries.extend(edges.as_slice().iter().map(|&(src, dst)| {
                    // Endpoint pair is encoded as src bytes immediately followed by dst bytes.
                    let mut pair_bytes = src.to_le_bytes().to_vec();
                    pair_bytes.extend_from_slice(&dst.to_le_bytes());
                    entry(3, "edge", collection.clone(), pair_bytes)
                }))
            }
        }
    }
    entries
}

/// Ordering key for an admitted transaction:
/// `(inbox_seq, tenant_id, xxh3_64(plans))`.
fn admitted_sort_key(tx: &AdmittedTx) -> (u64, u64, u64) {
    let class = &tx.tx_class;
    (
        tx.inbox_seq,
        class.tenant_id.as_u64(),
        xxh3_64(&class.plans),
    )
}

/// Validate a candidate batch of admitted transactions, keeping each admitted
/// transaction's `inbox_seq` alongside it.
///
/// Candidates are sorted by `(inbox_seq, tenant_id, hash(plans))`. Whenever two
/// or more transactions write the same `(engine, collection, key)`, the one
/// earliest in that order keeps the key and every other writer is rejected.
///
/// Returns `(Vec<(u64, SequencedTxn)>, Vec<RejectedTx>)`:
/// - each admitted element is `(inbox_seq, txn)`, in sorted order.
///   `SequencedTxn.position` is 0-based among the admitted transactions only,
///   and `epoch_system_ms` is left at `0` as a placeholder.
/// - `RejectedTx.reason` is `SequencerError::Conflict { position_admitted }`,
///   and `RejectedTx.conflict_context` is whatever
///   `find_winner_position_and_context` reports for that transaction.
///
/// The function is pure — no I/O, no global state, deterministic.
pub fn validate_batch_with_assignments(
    epoch: u64,
    mut candidates: Vec<AdmittedTx>,
) -> (Vec<(u64, SequencedTxn)>, Vec<RejectedTx>) {
    if candidates.is_empty() {
        return (vec![], vec![]);
    }

    // Step 1: sort by (inbox_seq, tenant_id, hash(plans)). The cached variant
    // hashes each transaction's plans once instead of once per comparison; it
    // is a stable sort, so the resulting order matches `sort_by_key`.
    candidates.sort_by_cached_key(admitted_sort_key);

    // Step 2: build flat write-entry list, tagging entries with the index of
    // their transaction in the sorted batch.
    let mut flat: Vec<WriteEntry> = Vec::new();
    for (i, tx) in candidates.iter().enumerate() {
        flat.extend(flatten_write_set(tx, i));
    }

    // Step 3: sort flat list lexicographically by
    // (discriminant, collection, key_bytes) so equal keys become adjacent.
    flat.sort_by(|a, b| match a.discriminant.cmp(&b.discriminant) {
        Ordering::Equal => match a.collection.cmp(&b.collection) {
            Ordering::Equal => a.key_bytes.cmp(&b.key_bytes),
            other => other,
        },
        other => other,
    });

    // Step 4: detect conflicts — runs of identical keys touched by more than
    // one transaction.
    //
    // NOTE(review): a transaction that loses a key is rejected even when the
    // transaction it lost to is itself rejected on some other key. Confirm
    // this conservative behaviour is intended.
    let n = candidates.len();
    let mut rejected = vec![false; n];

    let mut run_start = 0;
    while run_start < flat.len() {
        let key = flat[run_start].sort_key();
        // Always >= 1 because flat[run_start] matches its own key, so the
        // loop is guaranteed to advance.
        let run_len = flat[run_start..]
            .iter()
            .take_while(|e| e.sort_key() == key)
            .count();
        let run = &flat[run_start..run_start + run_len];

        // The lowest transaction index in the run keeps the key; every other
        // writer of the same key is rejected.
        if let Some(keeper) = run.iter().map(|e| e.txn_index).min() {
            for entry in run {
                if entry.txn_index != keeper {
                    rejected[entry.txn_index] = true;
                }
            }
        }
        run_start += run_len;
    }

    // Step 5: build output. First pass: assign dense positions to admitted
    // txns. Slots of rejected txns keep the default 0.
    let mut position_map = vec![0u32; n];
    let mut next_position: u32 = 0;
    for (idx, is_rejected) in rejected.iter().enumerate() {
        if !*is_rejected {
            position_map[idx] = next_position;
            next_position += 1;
        }
    }

    // Second pass: split the sorted candidates into admitted/rejected output.
    let admitted_count = next_position as usize;
    let mut admitted_out: Vec<(u64, SequencedTxn)> = Vec::with_capacity(admitted_count);
    let mut rejected_out: Vec<RejectedTx> = Vec::with_capacity(n - admitted_count);

    for (idx, tx) in candidates.into_iter().enumerate() {
        if rejected[idx] {
            // Look up which transaction holds the conflicting key, and the
            // context (tenant / engine / collection) of that conflict.
            let (winner_position, conflict_context) =
                find_winner_position_and_context(&flat, idx, &position_map, &tx);
            rejected_out.push(RejectedTx {
                admitted: tx,
                reason: SequencerError::Conflict {
                    position_admitted: winner_position,
                },
                conflict_context,
            });
        } else {
            let inbox_seq = tx.inbox_seq;
            admitted_out.push((
                inbox_seq,
                SequencedTxn {
                    epoch,
                    position: position_map[idx],
                    tx_class: tx.tx_class,
                    // epoch_system_ms is filled in by the service tick() when the
                    // EpochBatch is constructed; 0 is a safe placeholder here.
                    epoch_system_ms: 0,
                },
            ));
        }
    }

    (admitted_out, rejected_out)
}

/// Validate a candidate batch and return only the sequenced transactions.
///
/// Thin wrapper over [`validate_batch_with_assignments`] that drops the
/// `inbox_seq` paired with each admitted transaction; the rejected list is
/// passed through unchanged.
pub fn validate_batch(
    epoch: u64,
    candidates: Vec<AdmittedTx>,
) -> (Vec<SequencedTxn>, Vec<RejectedTx>) {
    let (with_seq, rejected) = validate_batch_with_assignments(epoch, candidates);
    let mut admitted = Vec::with_capacity(with_seq.len());
    for (_inbox_seq, txn) in with_seq {
        admitted.push(txn);
    }
    (admitted, rejected)
}

/// Find the position of the winning txn that conflicts with `loser_idx`, and
/// build the [`ConflictKey`] for the first conflicting write entry found.
///
/// The loser's entries are visited in `flat` order. For each one, the winner is
/// the lowest `txn_index` *below* `loser_idx` that writes the same
/// `(discriminant, collection, key_bytes)`. Transactions with a higher index
/// are ignored: they can never have beaten the loser on a key, and considering
/// them could report an unrelated key and position as the conflict.
///
/// Returns `(position_map[winner], Some(ConflictKey))`. The `ConflictKey` uses
/// the tenant from `loser_tx`, the engine from the `WriteEntry`, and the
/// collection from that same entry — matching what the metrics layer needs for
/// per-context hotness tracking. Returns `(0, None)` when no earlier
/// transaction shares a key with the loser.
///
/// NOTE(review): the reported position is whatever `position_map` holds for
/// the winner. If that transaction was itself rejected, the value is not a
/// real admitted position — confirm whether callers rely on it.
fn find_winner_position_and_context(
    flat: &[WriteEntry],
    loser_idx: usize,
    position_map: &[u32],
    loser_tx: &AdmittedTx,
) -> (u32, Option<ConflictKey>) {
    for entry in flat.iter().filter(|e| e.txn_index == loser_idx) {
        let key = entry.sort_key();
        // Only earlier transactions can hold a key against the loser.
        let min_idx = flat
            .iter()
            .filter(|e| e.txn_index < loser_idx && e.sort_key() == key)
            .map(|e| e.txn_index)
            .min();
        if let Some(winner) = min_idx {
            let ctx = ConflictKey {
                tenant: loser_tx.tx_class.tenant_id.as_u64(),
                engine: entry.engine_name,
                collection: entry.collection.clone(),
            };
            return (position_map[winner], Some(ctx));
        }
    }
    (0, None)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::calvin::sequencer::inbox::AdmittedTx;
    use crate::calvin::types::{EngineKeySet, ReadWriteSet, SortedVec, TxClass};
    use nodedb_types::{TenantId, id::VShardId};

    /// Return two generated collection names (`col_{i}`) whose
    /// `VShardId::from_collection(..)` values differ.
    ///
    /// Panics if no such pair is found among the first 512 candidates.
    fn find_two_distinct_collections() -> (String, String) {
        let mut first: Option<(String, u32)> = None;
        for i in 0u32..512 {
            let name = format!("col_{i}");
            let vshard = VShardId::from_collection(&name).as_u32();
            if let Some((ref fname, fv)) = first {
                if fv != vshard {
                    return (fname.clone(), name);
                }
            } else {
                first = Some((name, vshard));
            }
        }
        panic!("could not find two distinct-vshard collections in 512 tries");
    }

    /// Build an `AdmittedTx` for tenant 1 with an empty read set and a write
    /// set of two `Document` key sets (`col_a` / `col_b`). The plans payload is
    /// the single byte `inbox_seq as u8`, so distinct small sequence numbers
    /// also give distinct plan bytes.
    fn make_tx(
        inbox_seq: u64,
        col_a: &str,
        surrogates_a: Vec<u32>,
        col_b: &str,
        surrogates_b: Vec<u32>,
    ) -> AdmittedTx {
        let write_set = ReadWriteSet::new(vec![
            EngineKeySet::Document {
                collection: col_a.to_owned(),
                surrogates: SortedVec::new(surrogates_a),
            },
            EngineKeySet::Document {
                collection: col_b.to_owned(),
                surrogates: SortedVec::new(surrogates_b),
            },
        ]);
        let tx_class = TxClass::new(
            ReadWriteSet::new(vec![]),
            write_set,
            vec![inbox_seq as u8],
            TenantId::new(1),
            None,
        )
        .expect("valid TxClass");
        AdmittedTx {
            inbox_seq,
            tx_class,
        }
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let (admitted, rejected) = validate_batch(1, vec![]);
        assert!(admitted.is_empty());
        assert!(rejected.is_empty());
    }

    #[test]
    fn single_txn_admitted_at_position_zero() {
        let (col_a, col_b) = find_two_distinct_collections();
        let tx = make_tx(0, &col_a, vec![1], &col_b, vec![2]);
        let (admitted, rejected) = validate_batch(1, vec![tx]);
        assert_eq!(admitted.len(), 1);
        assert!(rejected.is_empty());
        assert_eq!(admitted[0].position, 0);
        assert_eq!(admitted[0].epoch, 1);
    }

    #[test]
    fn two_non_conflicting_txns_both_admitted_in_inbox_seq_order() {
        let (col_a, col_b) = find_two_distinct_collections();
        let tx0 = make_tx(0, &col_a, vec![1], &col_b, vec![10]);
        let tx1 = make_tx(1, &col_a, vec![2], &col_b, vec![20]);
        // Submit out of order so the assertions actually exercise the sort,
        // and keep the inbox_seq pairing so the order itself can be checked.
        let (admitted, rejected) = validate_batch_with_assignments(2, vec![tx1, tx0]);
        assert_eq!(admitted.len(), 2);
        assert!(rejected.is_empty());
        // Position 0 should have the lower inbox_seq.
        assert_eq!(admitted[0].0, 0);
        assert_eq!(admitted[0].1.position, 0);
        assert_eq!(admitted[1].0, 1);
        assert_eq!(admitted[1].1.position, 1);
    }

    #[test]
    fn two_conflicting_txns_first_admitted_second_rejected() {
        let (col_a, col_b) = find_two_distinct_collections();
        // Both txns write to surrogate 42 in col_a.
        let tx0 = make_tx(0, &col_a, vec![42], &col_b, vec![1]);
        let tx1 = make_tx(1, &col_a, vec![42], &col_b, vec![2]);
        let (admitted, rejected) = validate_batch(3, vec![tx0, tx1]);
        assert_eq!(admitted.len(), 1);
        assert_eq!(rejected.len(), 1);
        assert_eq!(admitted[0].position, 0);
        assert!(matches!(
            rejected[0].reason,
            SequencerError::Conflict { .. }
        ));
    }

    #[test]
    fn deterministic_ordering_across_repeated_runs() {
        let (col_a, col_b) = find_two_distinct_collections();
        let tx0 = make_tx(0, &col_a, vec![1], &col_b, vec![10]);
        let tx1 = make_tx(1, &col_a, vec![1], &col_b, vec![20]);

        let (admitted1, rejected1) = validate_batch(1, vec![tx0.clone(), tx1.clone()]);
        let (admitted2, rejected2) = validate_batch(1, vec![tx0, tx1]);

        assert_eq!(admitted1.len(), admitted2.len());
        assert_eq!(rejected1.len(), rejected2.len());
        for (a, b) in admitted1.iter().zip(admitted2.iter()) {
            assert_eq!(a.position, b.position);
        }
    }

    #[test]
    fn rejected_txn_carries_conflict_error_with_winner_position() {
        let (col_a, col_b) = find_two_distinct_collections();
        let tx0 = make_tx(0, &col_a, vec![99], &col_b, vec![1]);
        let tx1 = make_tx(1, &col_a, vec![99], &col_b, vec![2]);
        let (_admitted, rejected) = validate_batch(5, vec![tx0, tx1]);
        assert_eq!(rejected.len(), 1);
        // Winner is at position 0.
        assert_eq!(
            rejected[0].reason,
            SequencerError::Conflict {
                position_admitted: 0
            }
        );
    }

    #[test]
    fn conflict_fairness_metric_keyed() {
        // Verify that RejectedTx.conflict_context is populated with the
        // correct tenant, engine name, and collection when a conflict occurs.
        let (col_a, col_b) = find_two_distinct_collections();
        // Both transactions write to surrogate 7 in col_a (Document engine).
        let tx0 = make_tx(0, &col_a, vec![7], &col_b, vec![1]);
        let tx1 = make_tx(1, &col_a, vec![7], &col_b, vec![2]);

        // tx1 uses tenant_id 1 (from make_tx helper).
        let (_admitted, rejected) = validate_batch(10, vec![tx0, tx1]);
        assert_eq!(rejected.len(), 1);

        let ctx = rejected[0]
            .conflict_context
            .as_ref()
            .expect("conflict_context must be Some for a Conflict rejection");

        assert_eq!(ctx.tenant, 1, "tenant should match tx tenant_id");
        assert_eq!(
            ctx.engine, "document",
            "engine should be 'document' for Document key set"
        );
        assert_eq!(
            ctx.collection, col_a,
            "collection should be the conflicting collection"
        );
    }
}