net-mesh 0.24.0

//! Deterministic-simulation tests for RedEX replication — Phase F
//! of [`docs/plans/REDEX_DISTRIBUTED_PLAN.md`].
//!
//! These tests run the production state machine, election, catch-up,
//! and heartbeat-tracker logic in a single-threaded deterministic
//! harness. No tokio runtime; no real mesh wire; explicit clock +
//! message queue. The point is to exercise scenarios the real-wire
//! e2e tests can't reach deterministically:
//!
//! - Multi-node partition (one or more nodes isolated)
//! - Leader crash mid-flight with replicas catching up to the new
//!   leader's tail
//! - Restart-during-sync (kill + revive a replica)
//! - Asymmetric-RTT (transient divergence)
//! - Partition-heal
//!
//! The harness is single-threaded so loom-style atomic interleaving
//! exploration is out of scope (see `tests/loom_models.rs` for
//! that). What IS in scope: every documented Phase F invariant from
//! plan §F:
//!
//! - **No-two-leaders-within-a-partition**: at every step, at most
//!   one node in any single connected partition believes itself
//!   Leader. The pure `elect()` allows dual self-winners in
//!   symmetric-RTT scenarios (per the design — see
//!   `replication_election.rs` symmetric_failover_yields_dual_self_winners_as_expected),
//!   but the broader system + capability-tag layer is expected to
//!   collapse those windows. This harness pins the timing claim:
//!   the dual-leader window closes within K heartbeats.
//! - **Convergence**: post-failure, every surviving replica's tail
//!   advances to match the (new) leader's tail within bounded steps.
//! - **Divergence-freedom**: no two replicas hold different bytes
//!   for the same seq.
//!
//! Run via the regular `cargo test --features redex --test
//! redex_replication_dst` — no special flags.
//!
//! Cross-partition split-brain is documented as out-of-scope per
//! [`REDEX_DISTRIBUTED_PLAN.md` locked decision 3] and is not
//! asserted here. Tests that involve partitions assert convergence
//! within the connected component, not globally.

#![cfg(feature = "redex")]

use std::collections::{BTreeMap, HashSet, VecDeque};
use std::time::{Duration, Instant};

use net::adapter::net::behavior::placement::NodeId;
use net::adapter::net::channel::ChannelName;
use net::adapter::net::redex::{
    apply_sync_response, election_outcome, handle_sync_request, tick, ApplyError, ChannelId,
    HeartbeatTracker, Inbound, OutboundMessage, Redex, RedexFile, RedexFileConfig, ReplicaRole,
    StateTransition, SyncNack, SyncRequestOutcome, TickInputs, TransitionSignal,
};

// ────────────────────────────────────────────────────────────────
// Harness primitives
// ────────────────────────────────────────────────────────────────

/// Bytes per SyncRequest chunk_max in this harness. Sized so the
/// catch-up loop drains a typical scenario in 1–2 chunks rather
/// than churning many small requests.
const DST_CHUNK_MAX_BYTES: u32 = 256 * 1024;
/// Default heartbeat cadence for DST scenarios. Smaller than the
/// production default so a single test scenario can exercise
/// multiple heartbeat cycles without running the harness for too
/// many steps.
const DST_HEARTBEAT_MS: u64 = 50;
/// Wall-clock granularity per harness step. One step = one heartbeat
/// tick's worth of time, so every step every node emits / observes
/// at least one heartbeat.
const STEP_DURATION_MS: u64 = DST_HEARTBEAT_MS;
/// Hard upper bound on harness steps per scenario — prevents a
/// scenario that fails to converge from running forever. Sized so
/// the worst-case convergence (5 heartbeat cycles for election +
/// 10 for catch-up) finishes well below the cap.
const STEP_BUDGET: usize = 200;

/// A single node's per-channel state in the simulated cluster.
struct VirtualNode {
    /// State-machine role. Owns the cell directly rather than
    /// going through `ReplicationCoordinator::transition_to`
    /// (which is async); we validate transitions via
    /// `StateTransition::apply` and apply manually.
    role: ReplicaRole,
    /// Per-peer last-seen / role / tail tracker. Same type the
    /// production runtime uses; the DST exercises it via the same
    /// `record_heartbeat` / `is_leader_silent` / `healthy_peers`
    /// surface.
    tracker: HeartbeatTracker,
    /// Live `RedexFile` for this channel. Heap-only — the
    /// harness doesn't exercise the persistent tier. Same file
    /// type the production catch-up helpers read / append against.
    file: RedexFile,
    /// Pending inbound events. Mesh-side dispatch in production
    /// pushes into the runtime's mpsc inbox; the DST pushes into
    /// this VecDeque on every tick after partition-filtering.
    inbox: VecDeque<Inbound>,
    /// Has this node been killed (crashed) for this scenario?
    /// Killed nodes neither tick nor accept inbound events.
    killed: bool,
    /// Mirrors the production coordinator's `election_thrash_total`
    /// gauge: counts every transition driven by `MissedHeartbeats`.
    /// The DST harness drives transitions through `force_transition`
    /// rather than `ReplicationCoordinator::transition_to`, so the
    /// real Prometheus counter is never bumped; tracking the same
    /// count locally lets election-storm scenarios pin "the bump
    /// fires once per round" without rewiring the harness around
    /// the async coordinator.
    election_thrash_count: u64,
}

impl VirtualNode {
    fn new(channel_name: &ChannelName) -> Self {
        // Each node's file is independent (different `RedexFile`
        // instance); the harness manages replication between them.
        let redex = Redex::new();
        let file = redex
            .open_file(channel_name, RedexFileConfig::default())
            .expect("open file");
        Self {
            role: ReplicaRole::Idle,
            tracker: HeartbeatTracker::new(DST_HEARTBEAT_MS),
            file,
            inbox: VecDeque::new(),
            killed: false,
            election_thrash_count: 0,
        }
    }

    fn tail_seq(&self) -> u64 {
        self.file.next_seq()
    }

    /// Drive the state machine through a transition + apply.
    /// Asserts the transition is valid per the state-machine
    /// matrix; panics on invalid transitions (which would
    /// indicate a harness bug, not a production bug).
    fn force_transition(&mut self, target: ReplicaRole, signal: TransitionSignal) {
        let _ = StateTransition::apply(self.role, target, signal)
            .expect("DST harness drove an invalid state-machine transition");
        self.role = target;
        if matches!(signal, TransitionSignal::MissedHeartbeats) {
            self.election_thrash_count += 1;
        }
    }
}

/// Deterministic-simulation cluster of replicated nodes.
///
/// Single channel; every node holds an independent `RedexFile`.
/// Time advances by `STEP_DURATION_MS` per `step()` call.
struct VirtualCluster {
    nodes: BTreeMap<NodeId, VirtualNode>,
    channel_id: ChannelId,
    replica_set: Vec<NodeId>,
    /// Sorted pairs of partitioned `(min_id, max_id)`. Bidirectional —
    /// if `(A, B)` is in the set, neither direction delivers.
    partitions: HashSet<(NodeId, NodeId)>,
    /// Wall-clock for the tracker's silence-detection logic.
    now: Instant,
    /// R-9: `self.now` at cluster construction — subtracted from
    /// the current `self.now` to derive a deterministic
    /// `wall_clock_ms` for outbound heartbeats. Without this the
    /// harness was leaking real wall-clock into the test (via
    /// `Instant::now().duration_since(self.now)`), breaking the
    /// "explicit clock" claim and making any future logic that
    /// consumed `wall_clock_ms` non-deterministic.
    initial_now: Instant,
    /// Outbound messages staged for delivery on the next step.
    /// `(src, dst, payload)` — the payload is converted from the
    /// production `OutboundMessage` shape into an `Inbound` event
    /// at delivery time.
    pending: VecDeque<(NodeId, NodeId, Inbound)>,
    /// RTT lookup table for the election function. Symmetric +
    /// fixed for the lifetime of the scenario unless the scenario
    /// explicitly mutates it.
    rtt: BTreeMap<(NodeId, NodeId), Duration>,
    /// v0.3 Phase D2: bandwidth-class stamped on every emitted
    /// SyncRequest. Defaults to `Foreground` (matches
    /// `RuntimeInputs::default_bandwidth_class`'s default), but
    /// scenarios that want to exercise the class plumbing on the
    /// wire override via [`Self::with_default_bandwidth_class`].
    default_bandwidth_class: net::adapter::net::redex::BandwidthClass,
}

impl VirtualCluster {
    fn new(ids: &[NodeId], channel_name: &str) -> Self {
        let channel_name = ChannelName::new(channel_name).unwrap();
        let channel_id = ChannelId::from_name(&channel_name);
        let mut nodes = BTreeMap::new();
        for &id in ids {
            nodes.insert(id, VirtualNode::new(&channel_name));
        }
        // Default RTT matrix: every pair at 5ms (symmetric).
        let mut rtt = BTreeMap::new();
        for i in 0..ids.len() {
            for j in (i + 1)..ids.len() {
                let a = ids[i].min(ids[j]);
                let b = ids[i].max(ids[j]);
                rtt.insert((a, b), Duration::from_millis(5));
            }
        }
        let now = Instant::now();
        Self {
            nodes,
            channel_id,
            replica_set: ids.to_vec(),
            partitions: HashSet::new(),
            now,
            initial_now: now,
            pending: VecDeque::new(),
            rtt,
            default_bandwidth_class: net::adapter::net::redex::BandwidthClass::Foreground,
        }
    }

    /// Override the bandwidth class stamped on every emitted
    /// SyncRequest. Scenarios that want to exercise Background
    /// / Realtime class plumbing on the wire flip this before
    /// stepping.
    #[allow(dead_code)]
    fn with_default_bandwidth_class(
        mut self,
        class: net::adapter::net::redex::BandwidthClass,
    ) -> Self {
        self.default_bandwidth_class = class;
        self
    }

    /// Sorted-pair key for partitions + rtt lookups. Avoids
    /// duplicate edges in both directions.
    fn pair(a: NodeId, b: NodeId) -> (NodeId, NodeId) {
        (a.min(b), a.max(b))
    }

    /// Cut the connection between `a` and `b` (bidirectional).
    /// Subsequent steps don't deliver messages between them until
    /// `heal_partition` is called.
    fn partition(&mut self, a: NodeId, b: NodeId) {
        self.partitions.insert(Self::pair(a, b));
    }

    /// Restore the connection between `a` and `b`.
    #[allow(dead_code)]
    fn heal_partition(&mut self, a: NodeId, b: NodeId) {
        self.partitions.remove(&Self::pair(a, b));
    }

    /// True if messages between `a` and `b` are currently dropped.
    /// Also true if either endpoint is killed.
    fn is_partitioned(&self, a: NodeId, b: NodeId) -> bool {
        if self.nodes.get(&a).map(|n| n.killed).unwrap_or(true) {
            return true;
        }
        if self.nodes.get(&b).map(|n| n.killed).unwrap_or(true) {
            return true;
        }
        self.partitions.contains(&Self::pair(a, b))
    }

    /// Crash `id`. Its inbox is cleared; it stops ticking; every
    /// pending message to or from it is dropped on subsequent
    /// delivery rounds. Plan §F leader-crash + replica-crash
    /// scenarios use this.
    fn kill(&mut self, id: NodeId) {
        if let Some(n) = self.nodes.get_mut(&id) {
            n.killed = true;
            n.inbox.clear();
        }
    }

    /// Drive `id` through `Idle → Replica → Candidate → Leader`.
    /// Used by scenarios that need a deterministic initial leader.
    fn force_leader(&mut self, id: NodeId) {
        let n = self.nodes.get_mut(&id).expect("force_leader: unknown id");
        n.force_transition(ReplicaRole::Replica, TransitionSignal::CapabilitySelected);
        n.force_transition(ReplicaRole::Candidate, TransitionSignal::MissedHeartbeats);
        n.force_transition(ReplicaRole::Leader, TransitionSignal::ElectionWon);
    }

    /// Drive `id` to `Replica`.
    fn force_replica(&mut self, id: NodeId) {
        let n = self.nodes.get_mut(&id).expect("force_replica: unknown id");
        n.force_transition(ReplicaRole::Replica, TransitionSignal::CapabilitySelected);
    }

    /// One harness step. Three phases per step: advance the clock,
    /// then for each live node call `tick()` to produce outbound +
    /// transitions and stage outbound messages; then deliver staged
    /// messages to non-partitioned destinations; then drain each
    /// node's inbox via the inbound handler.
    fn step(&mut self) {
        self.now += Duration::from_millis(STEP_DURATION_MS);

        // Phase 1: tick every live node.
        let node_ids: Vec<NodeId> = self.nodes.keys().copied().collect();
        for id in &node_ids {
            let killed = self.nodes.get(id).map(|n| n.killed).unwrap_or(true);
            if killed {
                continue;
            }
            self.tick_node(*id);
        }

        // Phase 2: deliver pending messages. Take ownership of
        // the queue so we can re-stage on partition rejection
        // (we just drop on partition; no retry from the harness).
        let mut staged: VecDeque<(NodeId, NodeId, Inbound)> = std::mem::take(&mut self.pending);
        while let Some((src, dst, msg)) = staged.pop_front() {
            if self.is_partitioned(src, dst) {
                continue;
            }
            if let Some(n) = self.nodes.get_mut(&dst) {
                if !n.killed {
                    n.inbox.push_back(msg);
                }
            }
        }

        // Phase 3: drain each node's inbox.
        for id in &node_ids {
            let killed = self.nodes.get(id).map(|n| n.killed).unwrap_or(true);
            if killed {
                continue;
            }
            self.drain_inbox(*id);
        }
    }

    fn tick_node(&mut self, id: NodeId) {
        // Snapshot inputs (avoiding overlapping borrows on
        // self.nodes).
        let (current_role, tail_seq) = {
            let n = self.nodes.get(&id).expect("tick: unknown id");
            (n.role, n.tail_seq())
        };

        // `tick()` borrows the tracker by reference. Scope the
        // borrow so we can mutate the node after the outcome
        // is captured.
        let outcome = {
            let n = self.nodes.get(&id).expect("tick: unknown id");
            tick(TickInputs {
                self_node_id: id,
                current_role,
                channel_id: self.channel_id,
                tail_seq,
                replica_set: &self.replica_set,
                tracker: &n.tracker,
                // R-9: derive wall_clock_ms from the step counter
                // (via `self.now - self.initial_now`), not from real
                // wall-clock — so the harness stays deterministic.
                wall_clock_ms: self.now.duration_since(self.initial_now).as_millis() as u64,
                chunk_max_bytes: DST_CHUNK_MAX_BYTES,
                now: self.now,
                default_bandwidth_class: self.default_bandwidth_class,
            })
        };

        // Apply outbound messages — stage each for delivery on
        // the next phase.
        for msg in outcome.outbound {
            match msg {
                OutboundMessage::Heartbeat { target, msg } => {
                    self.pending
                        .push_back((id, target, Inbound::Heartbeat { from: id, msg }));
                }
                OutboundMessage::SyncRequest { target, msg } => {
                    self.pending
                        .push_back((id, target, Inbound::SyncRequest { from: id, msg }));
                }
            }
        }

        // Apply transition synchronously. The production path goes
        // through `coordinator.transition_to` which is async (the
        // sink call is async); DST takes the shortcut of running
        // the state-machine validator directly + flipping the
        // cell.
        if let Some(pt) = outcome.transition {
            let n = self.nodes.get_mut(&id).expect("tick: unknown id");
            let _ = StateTransition::apply(n.role, pt.target, pt.signal)
                .expect("tick produced an invalid transition");
            n.role = pt.target;
            // Mirror the production coordinator's election_thrash
            // counter so storm scenarios can observe the same gauge.
            if matches!(pt.signal, TransitionSignal::MissedHeartbeats) {
                n.election_thrash_count += 1;
            }

            // On Candidate entry, run the election in the same
            // step (mirrors `on_tick` in the runtime). The
            // election uses the healthy-peers set + RTT lookup.
            if pt.target == ReplicaRole::Candidate {
                let healthy = n.tracker.healthy_peers(self.now);
                let rtt_lookup = {
                    let cluster_rtt = self.rtt.clone();
                    let partitions = self.partitions.clone();
                    let killed_set: HashSet<NodeId> = self
                        .nodes
                        .iter()
                        .filter(|(_, n)| n.killed)
                        .map(|(&id, _)| id)
                        .collect();
                    move |peer: NodeId| -> Option<Duration> {
                        if peer == id {
                            return Some(Duration::ZERO);
                        }
                        if killed_set.contains(&peer) {
                            return None;
                        }
                        let p = (id.min(peer), id.max(peer));
                        if partitions.contains(&p) {
                            return None;
                        }
                        cluster_rtt.get(&p).copied()
                    }
                };
                let elect = election_outcome(id, &self.replica_set, rtt_lookup, |peer| {
                    peer == id || healthy.contains(&peer)
                });
                if let Some(pt) = elect {
                    let n = self.nodes.get_mut(&id).expect("tick: unknown id");
                    let _ = StateTransition::apply(n.role, pt.target, pt.signal)
                        .expect("election produced an invalid transition");
                    n.role = pt.target;
                    // Mirror the runtime: clear the believed
                    // leader so the next round starts clean.
                    n.tracker.clear_believed_leader();
                }
            }
        }
    }

    fn drain_inbox(&mut self, id: NodeId) {
        // Drain all events from this node's inbox. Per-event
        // handling mirrors `on_inbound` in the runtime, but sync
        // (no async sink calls because the DST has no real mesh
        // or chain-tag layer).
        loop {
            let event = {
                let n = self.nodes.get_mut(&id).expect("drain: unknown id");
                n.inbox.pop_front()
            };
            let Some(event) = event else {
                break;
            };
            self.handle_inbound(id, event);
        }
    }

    fn handle_inbound(&mut self, id: NodeId, event: Inbound) {
        match event {
            Inbound::Heartbeat { from, msg } => {
                if msg.channel_id != self.channel_id {
                    return;
                }
                let n = self.nodes.get_mut(&id).expect("handle: unknown id");
                n.tracker
                    .record_heartbeat(from, msg.role, msg.tail_seq, self.now);
            }
            Inbound::SyncRequest { from, msg } => {
                let role = self.nodes.get(&id).expect("handle: unknown id").role;
                if role != ReplicaRole::Leader {
                    // Non-leader can't satisfy a sync request.
                    // Production sends NotLeader NACK; harness
                    // omits (we don't model NACK retry policy
                    // here — covered by unit tests).
                    return;
                }
                let outcome = {
                    let n = self.nodes.get(&id).expect("handle: unknown id");
                    handle_sync_request(&n.file, &msg, self.channel_id)
                };
                match outcome {
                    SyncRequestOutcome::Response(resp) => {
                        self.pending.push_back((
                            id,
                            from,
                            Inbound::SyncResponse {
                                from: id,
                                msg: resp,
                            },
                        ));
                    }
                    SyncRequestOutcome::Nack {
                        error_code,
                        leader_first_retained_seq,
                        detail,
                    } => {
                        let nack = SyncNack {
                            channel_id: self.channel_id,
                            since_seq: msg.since_seq,
                            error_code,
                            leader_first_retained_seq,
                            detail,
                            request_id: msg.request_id,
                        };
                        self.pending.push_back((
                            id,
                            from,
                            Inbound::SyncNack {
                                from: id,
                                msg: nack,
                            },
                        ));
                    }
                }
            }
            Inbound::SyncResponse { from: _, msg } => {
                let role = self.nodes.get(&id).expect("handle: unknown id").role;
                if role != ReplicaRole::Replica {
                    return;
                }
                let result = {
                    let n = self.nodes.get(&id).expect("handle: unknown id");
                    apply_sync_response(&n.file, &msg, self.channel_id)
                };
                if let Err(ApplyError::GapBeforeChunk { first_seq, .. }) = result {
                    // Skip-ahead path from Phase D — drop local
                    // tail to the leader's first available and
                    // retry the apply.
                    let n = self.nodes.get_mut(&id).expect("handle: unknown id");
                    if n.file.skip_to(first_seq).is_ok() {
                        let _ = apply_sync_response(&n.file, &msg, self.channel_id);
                    }
                }
                // Other ApplyError variants surface in production
                // as log+drop; harness mirrors.
            }
            Inbound::SyncNack { .. } => {
                // Replica-side NACK retry policy lives in the
                // production runtime's on_inbound. Harness doesn't
                // exercise it (covered by unit tests).
            }
            Inbound::Shutdown => unreachable!("DST harness doesn't queue Shutdown"),
        }
    }

    /// Run `step()` repeatedly until either the assertion holds,
    /// or `STEP_BUDGET` runs out. Returns the number of steps it
    /// took to reach the assertion (panics on budget exhaustion
    /// with a descriptive message including the most-recent state
    /// snapshot).
    fn run_until<F: Fn(&Self) -> bool>(&mut self, predicate: F, scenario: &str) -> usize {
        for step in 1..=STEP_BUDGET {
            self.step();
            if predicate(self) {
                return step;
            }
        }
        panic!(
            "DST scenario {scenario} did not reach predicate within {STEP_BUDGET} steps. \
             Final state: {}",
            self.snapshot()
        );
    }

    /// Compact textual snapshot for panic messages — every node's
    /// `(id, role, tail_seq, killed)` plus current partitions.
    fn snapshot(&self) -> String {
        let mut s = String::new();
        for (&id, n) in &self.nodes {
            use std::fmt::Write;
            let _ = write!(
                &mut s,
                "{:#x}@{:?}(t={},k={}) ",
                id,
                n.role,
                n.tail_seq(),
                n.killed
            );
        }
        if !self.partitions.is_empty() {
            s.push_str("| parts: ");
            for (a, b) in &self.partitions {
                use std::fmt::Write;
                let _ = write!(&mut s, "({a:#x}↔{b:#x}) ");
            }
        }
        s
    }

    /// Count how many live nodes currently believe themselves
    /// Leader. The "no two leaders within a partition" invariant
    /// uses this against partitioned subsets.
    fn live_leader_count(&self) -> usize {
        self.nodes
            .values()
            .filter(|n| !n.killed && n.role == ReplicaRole::Leader)
            .count()
    }
}

// ────────────────────────────────────────────────────────────────
// Append helpers
// ────────────────────────────────────────────────────────────────

/// Append `count` events on the leader's file. Used by scenarios
/// that need data to flow through the catch-up path.
fn append_on_leader(cluster: &VirtualCluster, leader_id: NodeId, count: u64, prefix: &str) {
    let leader = cluster
        .nodes
        .get(&leader_id)
        .expect("append: unknown leader");
    assert_eq!(leader.role, ReplicaRole::Leader, "append: not leader");
    for i in 0..count {
        leader
            .file
            .append(format!("{prefix}-{i}").as_bytes())
            .expect("append failed");
    }
}

// ────────────────────────────────────────────────────────────────
// Scenarios
// ────────────────────────────────────────────────────────────────

#[test]
fn three_node_happy_path_replicas_catch_up_to_leader() {
    // 3 nodes; A is leader, B + C replicas. Append 16 events on
    // A; both B and C must catch up to tail=16. Sets the baseline
    // — if this fails, all the failure scenarios are also broken.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/happy");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);
    append_on_leader(&cluster, a, 16, "evt");

    let steps = cluster.run_until(
        |cl| {
            cl.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 16
                && cl.nodes.get(&c).map(|n| n.tail_seq()).unwrap_or(0) == 16
        },
        "three_node_happy_path",
    );
    // Convergence should take a few heartbeat cycles (~5-10).
    // Hard cap at STEP_BUDGET (200) catches infinite-loop bugs.
    assert!(
        steps < STEP_BUDGET,
        "happy-path convergence in {steps} steps (budget {STEP_BUDGET})"
    );
}

#[test]
fn no_two_leaders_during_steady_state() {
    // Invariant: with one node manually driven to Leader and the
    // rest in Replica, no spontaneous extra Leader appears across
    // any number of steady-state ticks.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/steady");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);

    for step in 1..=50 {
        cluster.step();
        assert_eq!(
            cluster.live_leader_count(),
            1,
            "steady-state must keep exactly one Leader at step {step}; got {} ({})",
            cluster.live_leader_count(),
            cluster.snapshot(),
        );
    }
}

#[test]
fn leader_crash_two_node_failover_converges_on_lone_survivor() {
    // Two-node failover: leader crashes; the single surviving
    // replica is the only candidate; it wins the election and
    // becomes Leader. This is the convergence case the
    // production runtime's `leader_close_triggers_replica_election_and_promotion`
    // e2e test exercises — it works because there's only one
    // survivor.
    let a = 0x10u64;
    let b = 0x20u64;
    let mut cluster = VirtualCluster::new(&[a, b], "dst/leader_crash_2node");
    cluster.force_leader(a);
    cluster.force_replica(b);

    // Steady-state ticks so B's tracker has A as believed_leader.
    for _ in 0..5 {
        cluster.step();
    }
    assert_eq!(cluster.live_leader_count(), 1);

    // Crash A. Only B survives.
    cluster.kill(a);

    // B detects A silent → enters Candidate → elects (A is
    // unhealthy, only B in candidate set) → becomes Leader.
    let steps = cluster.run_until(
        |c| c.nodes.get(&b).map(|n| n.role).unwrap_or(ReplicaRole::Idle) == ReplicaRole::Leader,
        "leader_crash_failover_2node",
    );
    assert!(steps < STEP_BUDGET, "failover converged in {steps} steps");
    assert_eq!(cluster.live_leader_count(), 1);
}

#[test]
fn symmetric_failover_with_three_survivors_produces_dual_leaders() {
    // **Documented dual-leader window** — plan §4 locked decision 3.
    //
    // With 3+ survivors at symmetric RTT (every peer at same
    // distance), each survivor's `elect()` produces `SelfWins`
    // because self-RTT is hardcoded to zero (lower than peer RTT).
    // The state machine has no `Leader → Replica` transition,
    // so once N candidates all transition to Leader, they STAY
    // dual-leader.
    //
    // This test pins that behavior as expected. Plan §4 documents
    // that the broader system (capability tag layer + operator
    // intervention) is responsible for collapsing these windows;
    // the pure replication-runtime logic does NOT enforce
    // "exactly one Leader per partition" in the symmetric-RTT
    // case.
    //
    // See also `replication_election::tests::
    // symmetric_failover_yields_dual_self_winners_as_expected`
    // for the same property pinned at the pure-function layer.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/dual_leader_doc");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);

    // Steady-state to seat tracker beliefs.
    for _ in 0..5 {
        cluster.step();
    }
    assert_eq!(cluster.live_leader_count(), 1);

    cluster.kill(a);

    // Wait for the failover to fire — both survivors must
    // detect leader silence + run election + transition AWAY
    // from Replica. After kill, ~3 heartbeats of clock time
    // need to elapse for `is_leader_silent` to trip.
    cluster.run_until(
        |cl| {
            cl.nodes
                .values()
                .filter(|n| !n.killed)
                .all(|n| n.role != ReplicaRole::Replica && n.role != ReplicaRole::Candidate)
        },
        "survivors transition past Replica/Candidate",
    );

    // BOTH survivors are now Leader. This is the documented
    // dual-leader window. If a future change adds a Leader → Replica
    // demotion mechanism, this test breaks loudly.
    let leaders: Vec<NodeId> = cluster
        .nodes
        .iter()
        .filter(|(_, n)| !n.killed && n.role == ReplicaRole::Leader)
        .map(|(&id, _)| id)
        .collect();
    assert_eq!(
        leaders.len(),
        2,
        "symmetric-RTT failover with 3 survivors produces dual-leader window per plan §4; \
         got {} ({})",
        leaders.len(),
        cluster.snapshot(),
    );
    assert!(leaders.contains(&b) && leaders.contains(&c));
}

#[test]
fn asymmetric_rtt_failover_converges_when_one_peer_clearly_central() {
    // The convergence case from plan §4 election. When the RTT
    // matrix has a clear "central" peer (one node observed at
    // zero RTT from every other survivor's view), every
    // candidate's `elect()` picks that central peer. The pure
    // function converges; no dual-leader window.
    //
    // This is the asymmetric counterpoint to
    // `symmetric_failover_with_three_survivors_produces_dual_leaders`
    // — it pins that the election DOES converge when the RTT
    // matrix has the structure plan §4 expects in production
    // (where the proximity graph's measurements rank one node
    // unambiguously closest).
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/central_failover");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);

    // Override the default 5ms symmetric matrix. Make B the
    // "central" surviving peer: C observes B at 0 RTT (clearly
    // closer than any other survivor); B observes C at non-zero.
    cluster.rtt.insert((b.min(c), b.max(c)), Duration::ZERO);

    for _ in 0..5 {
        cluster.step();
    }

    cluster.kill(a);

    // Both B and C run elect() once they detect silence:
    // - B: self-RTT=0, C@0ms (peer at zero too) → lex tie-break
    //   picks B (lower NodeId).
    // - C: self-RTT=0, B@0ms (peer at zero too) → lex tie-break
    //   picks B (lower NodeId). So C transitions to Replica,
    //   not Leader.
    cluster.run_until(
        |cl| {
            cl.live_leader_count() == 1
                && cl
                    .nodes
                    .values()
                    .filter(|n| !n.killed)
                    .all(|n| n.role != ReplicaRole::Candidate)
        },
        "asymmetric: survivors settle to exactly one Leader",
    );

    // Exactly one Leader (B); one Replica (C).
    assert_eq!(
        cluster.live_leader_count(),
        1,
        "central-peer convergence: exactly one Leader; got {}",
        cluster.snapshot(),
    );
    assert_eq!(cluster.nodes.get(&b).unwrap().role, ReplicaRole::Leader);
    assert_eq!(cluster.nodes.get(&c).unwrap().role, ReplicaRole::Replica);
}

#[test]
fn isolated_replica_does_not_advance_tail() {
    // Plan §F failure injection: one replica isolated mid-flight.
    // The isolated replica cannot observe the leader's heartbeats
    // or catch up; its local tail stays at 0. The connected
    // replicas catch up normally.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/isolated");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);

    // Partition C from both A and B before appends happen.
    cluster.partition(a, c);
    cluster.partition(b, c);

    append_on_leader(&cluster, a, 8, "iso");

    // B should catch up to 8; C should stay at 0.
    let steps = cluster.run_until(
        |c| c.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 8,
        "isolated_replica_b_catches_up",
    );
    assert!(steps < STEP_BUDGET);
    let c_tail = cluster.nodes.get(&c).map(|n| n.tail_seq()).unwrap_or(99);
    assert_eq!(
        c_tail, 0,
        "isolated replica's tail must NOT advance; got {c_tail}"
    );
}

#[test]
fn partition_heal_lets_isolated_replica_catch_up() {
    // Plan §F partition-heal: isolated replica's tail stayed at 0
    // during the partition; after heal, it catches up to the
    // current leader's tail via the standard catch-up cycle.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/heal");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);

    // Isolate C, append on A, let B catch up, then heal.
    cluster.partition(a, c);
    cluster.partition(b, c);
    append_on_leader(&cluster, a, 12, "preheal");
    cluster.run_until(
        |c| c.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 12,
        "pre-heal: B catches up",
    );

    // Heal C — restore both connections.
    cluster.heal_partition(a, c);
    cluster.heal_partition(b, c);

    // C should now catch up to the leader's tail.
    let steps = cluster.run_until(
        |c2| c2.nodes.get(&c).map(|n| n.tail_seq()).unwrap_or(0) == 12,
        "post-heal: C catches up",
    );
    assert!(steps < STEP_BUDGET);
}

#[test]
fn divergence_freedom_no_two_replicas_hold_different_payload_at_same_seq() {
    // Plan §F divergence-freedom: after catch-up, every replica's
    // file at any given seq holds the same bytes the leader
    // wrote. The replication protocol's monotonic-seq + chunk-
    // first_seq pinning prevents two replicas from observing
    // different events at the same seq.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/divergence");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);
    append_on_leader(&cluster, a, 24, "div");

    cluster.run_until(
        |cl| {
            cl.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 24
                && cl.nodes.get(&c).map(|n| n.tail_seq()).unwrap_or(0) == 24
        },
        "divergence-freedom convergence",
    );

    // Cross-check: read every event on every replica and assert
    // payloads match the leader's.
    let leader = cluster.nodes.get(&a).unwrap();
    let leader_events = leader.file.read_range(0, 24);
    assert_eq!(leader_events.len(), 24);

    for replica_id in [b, c] {
        let replica = cluster.nodes.get(&replica_id).unwrap();
        let events = replica.file.read_range(0, 24);
        assert_eq!(
            events.len(),
            24,
            "replica {replica_id:#x} has {} events; leader has 24",
            events.len(),
        );
        for (i, (leader_ev, replica_ev)) in leader_events.iter().zip(events.iter()).enumerate() {
            assert_eq!(
                leader_ev.entry.seq, replica_ev.entry.seq,
                "replica {replica_id:#x} event {i}: seq mismatch"
            );
            assert_eq!(
                leader_ev.payload.as_ref(),
                replica_ev.payload.as_ref(),
                "replica {replica_id:#x} event {i}: payload bytes differ"
            );
        }
    }
}

#[test]
fn restart_during_sync_replica_resumes_from_local_tail() {
    // Plan §F restart-during-sync: a replica is killed mid-
    // flight, then revived. Its local tail is whatever it
    // advanced to before the kill; on revival, the catch-up
    // cycle picks up from there.
    //
    // Harness modeling: a "revival" is `killed = false` plus
    // clearing the believed-leader so the next round of
    // heartbeats reseats it (mirrors what `RedexFile` reopen
    // would observe + the heartbeat tracker starting fresh).
    let a = 0x10u64;
    let b = 0x20u64;
    let mut cluster = VirtualCluster::new(&[a, b], "dst/restart");
    cluster.force_leader(a);
    cluster.force_replica(b);
    append_on_leader(&cluster, a, 5, "pre");

    // Let B catch up partially, then kill it.
    cluster.run_until(
        |c| c.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 5,
        "B catches up pre-kill",
    );
    let b_tail_before_kill = cluster.nodes.get(&b).unwrap().tail_seq();
    cluster.kill(b);

    // Append more events on A while B is dead.
    append_on_leader(&cluster, a, 10, "post-kill");

    // Revive B: drop the killed flag and clear its tracker so
    // the next heartbeat reseats leader.
    {
        let n = cluster.nodes.get_mut(&b).unwrap();
        n.killed = false;
        n.tracker = HeartbeatTracker::new(DST_HEARTBEAT_MS);
    }

    // B should catch up from its pre-kill tail (5) to the new
    // leader tail (15).
    let steps = cluster.run_until(
        |c| c.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == b_tail_before_kill + 10,
        "B resumes catch-up post-revival",
    );
    assert!(steps < STEP_BUDGET);
    assert_eq!(cluster.nodes.get(&b).unwrap().tail_seq(), 15);
}

#[test]
fn no_two_leaders_within_a_partition_post_failover_with_central_peer() {
    // The headline Phase F invariant under the convergence case:
    // when the RTT matrix has a clear central peer, the failover
    // election produces exactly one new Leader and the state
    // stays stable across subsequent ticks.
    //
    // Note: this invariant DOESN'T hold for symmetric-RTT
    // scenarios — see
    // `symmetric_failover_with_three_survivors_produces_dual_leaders`
    // for the documented dual-leader window.
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let d = 0x40u64;
    let mut cluster = VirtualCluster::new(&[a, b, c, d], "dst/no_two_leaders_central");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);
    cluster.force_replica(d);

    // Override RTT so B is the clearly-central peer among
    // survivors: every non-B survivor sees B at 0 RTT, and B
    // sees them at non-zero. This is the production-realistic
    // case where the proximity graph's measurements rank one
    // node unambiguously closest.
    cluster.rtt.insert((b.min(c), b.max(c)), Duration::ZERO);
    cluster.rtt.insert((b.min(d), b.max(d)), Duration::ZERO);

    for _ in 0..5 {
        cluster.step();
    }

    // Kill A — start of the failover window.
    cluster.kill(a);

    // Run until exactly one survivor is Leader.
    cluster.run_until(
        |cl| cl.live_leader_count() == 1,
        "exactly one leader post-failover (central peer)",
    );

    // Keep running a few more steps to confirm the state is
    // stable — no spontaneous extra Leader appears.
    for step in 1..=30 {
        cluster.step();
        assert_eq!(
            cluster.live_leader_count(),
            1,
            "no_two_leaders: state must stay at exactly one Leader at step +{step}: {}",
            cluster.snapshot(),
        );
    }
    // The new Leader is B (the central peer).
    assert_eq!(cluster.nodes.get(&b).unwrap().role, ReplicaRole::Leader);
}

/// R-9 regression: `wall_clock_ms` emitted in outbound heartbeats
/// must be a deterministic function of the step counter, not real
/// wall-clock. The previous implementation used
/// `self.now.elapsed()` which leaked `Instant::now()` into the
/// harness. Now we use `self.now.duration_since(self.initial_now)`
/// so the value is exactly `step_index * STEP_DURATION_MS`.
///
/// We call `tick()` directly on the harness's internal state
/// (post-`step()`) and inspect the emitted heartbeats' wall-clock
/// values. A regression to real-time leakage would make the
/// observed values depend on `Instant::now()`.
#[test]
fn wall_clock_ms_is_deterministic_function_of_step_counter() {
    fn run_and_capture(channel: &str) -> Vec<u64> {
        let a = 0x10u64;
        let b = 0x20u64;
        let mut cluster = VirtualCluster::new(&[a, b], channel);
        cluster.force_leader(a);
        cluster.force_replica(b);

        let mut emitted_wall_clocks = Vec::new();
        // Run several steps; at each step BEFORE delivery, peek
        // at the outbound heartbeats by re-running tick() on the
        // leader and inspecting the synthesized output. The
        // re-run is non-destructive because `tick()` is a pure
        // function — same inputs produce the same outputs.
        for step_idx in 1..=6u64 {
            cluster.step();
            // Re-run tick() against the leader's current state
            // to observe the wall_clock_ms it would emit. The
            // wall_clock_ms is derived from `self.now -
            // self.initial_now` in the harness, so this
            // reproduces the value emitted during the just-
            // completed step.
            let leader = cluster.nodes.get(&a).expect("leader present");
            let outcome = tick(TickInputs {
                self_node_id: a,
                current_role: leader.role,
                channel_id: cluster.channel_id,
                tail_seq: leader.tail_seq(),
                replica_set: &cluster.replica_set,
                tracker: &leader.tracker,
                wall_clock_ms: cluster.now.duration_since(cluster.initial_now).as_millis() as u64,
                chunk_max_bytes: DST_CHUNK_MAX_BYTES,
                now: cluster.now,
                default_bandwidth_class: cluster.default_bandwidth_class,
            });
            for msg in outcome.outbound {
                if let OutboundMessage::Heartbeat { msg, .. } = msg {
                    emitted_wall_clocks.push((step_idx, msg.wall_clock_ms));
                }
            }
        }
        // Pair (step, wall_clock_ms). Two runs should match.
        emitted_wall_clocks.into_iter().map(|(_, ms)| ms).collect()
    }

    // Run scenario A. Introduce a non-trivial real-time gap.
    let trace_a = run_and_capture("dst/wall_clock_a");
    std::thread::sleep(std::time::Duration::from_millis(75));
    // Run scenario B with identical setup.
    let trace_b = run_and_capture("dst/wall_clock_b");

    // Both traces must be byte-identical — wall_clock_ms is now
    // a pure function of the step counter, not real time. A
    // regression to `Instant::elapsed()` would make trace_a
    // 75ms earlier than trace_b (or differ by jitter).
    assert_eq!(
        trace_a, trace_b,
        "wall_clock_ms sequence must be deterministic across real-time gaps"
    );

    // Every value must be a multiple of STEP_DURATION_MS, and
    // bounded by the step count we ran.
    for &ms in &trace_a {
        assert!(
            ms % STEP_DURATION_MS == 0,
            "wall_clock_ms {ms} is not a multiple of STEP_DURATION_MS ({STEP_DURATION_MS})"
        );
        assert!(
            ms <= 6 * STEP_DURATION_MS,
            "wall_clock_ms {ms} exceeds 6×STEP_DURATION_MS ({})",
            6 * STEP_DURATION_MS,
        );
    }

    // We expect at least one emitted heartbeat (the leader's
    // tick fires immediately on step 1).
    assert!(!trace_a.is_empty(), "no heartbeats emitted in 6 steps");
}

/// C-1 — election-storms scenario: kill the leader, let the
/// election complete, kill the new leader, repeat. Pin that each
/// individual election still converges within the hysteresis
/// budget (3 missed heartbeats × heartbeat_ms).
///
/// Per plan §F:
/// > Election storms — sustained leader-loss detection for various
/// > reasons (heartbeat loss, proximity-graph false positives);
/// > assert election thrash is bounded by hysteresis.
///
/// The harness's `run_until` panics if any round exceeds
/// `STEP_BUDGET`, so reaching the end of the test pins the
/// "bounded by hysteresis" guarantee. Each round drives the
/// state machine through a complete Replica → Candidate →
/// Leader transition for the new winner, exercising both
/// validation (`StateTransition::apply` rejects invalid pairs)
/// and convergence.
#[test]
fn election_storm_two_rounds_each_converges_within_hysteresis() {
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let d = 0x40u64;
    let mut cluster = VirtualCluster::new(&[a, b, c, d], "dst/election_storm");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);
    cluster.force_replica(d);

    // Asymmetric RTT — B is unambiguously the central peer among
    // {B, C, D} from C's and D's view. This makes the
    // post-A-kill election deterministically pick B.
    cluster.rtt.insert((b, c), Duration::ZERO);
    cluster.rtt.insert((b, d), Duration::ZERO);

    // Let the initial leader (A) stabilize.
    for _ in 0..5 {
        cluster.step();
    }

    // Storm round 1: kill A. Survivors {B, C, D} elect B.
    cluster.kill(a);
    cluster.run_until(
        |cl| cl.live_leader_count() >= 1,
        "storm r1: at least one new leader",
    );
    assert_eq!(
        cluster.nodes.get(&b).unwrap().role,
        ReplicaRole::Leader,
        "storm r1: B (central peer) must win"
    );

    // Stabilization steps so B's Leader heartbeats reach C and D
    // before we kill B (so their trackers update believed_leader
    // = B; otherwise R-2's post-election clear leaves them with
    // believed_leader = None, masking silence detection).
    for _ in 0..5 {
        cluster.step();
    }

    // Storm round 2: kill B. Survivors {C, D}. With both peers
    // at symmetric RTT and self-RTT=0, each surviving node sees
    // itself as the closest — the dual-leader window per plan §4.
    // The plan documents this; our pinned invariant here is that
    // both surviving nodes EVENTUALLY hold valid Leader role
    // (not stuck in Candidate), proving each round bounded by
    // hysteresis.
    cluster.kill(b);
    cluster.run_until(
        |cl| cl.live_leader_count() >= 1,
        "storm r2: at least one new leader",
    );

    // Final invariant — both C and D have completed their
    // transitions (no stuck Candidate). At least one is Leader.
    let c_role = cluster.nodes.get(&c).unwrap().role;
    let d_role = cluster.nodes.get(&d).unwrap().role;
    assert!(
        c_role != ReplicaRole::Candidate && d_role != ReplicaRole::Candidate,
        "storm r2: no node stuck in Candidate (c={c_role:?}, d={d_role:?})"
    );
    assert!(
        c_role == ReplicaRole::Leader || d_role == ReplicaRole::Leader,
        "storm r2: at least one survivor is Leader"
    );

    // The production coordinator increments `election_thrash_total`
    // on every MissedHeartbeats-driven transition. Each storm round
    // takes at least one survivor through Replica → Candidate, so
    // the cluster-wide thrash count must be ≥2 after two rounds.
    // Higher counts are fine — the dual-leader window in round 2
    // can drive both C and D through Candidate.
    let total_thrash: u64 = cluster
        .nodes
        .values()
        .map(|n| n.election_thrash_count)
        .sum();
    assert!(
        total_thrash >= 2,
        "election_thrash counter must bump on each MissedHeartbeats transition; got {total_thrash}"
    );
}

/// C-2 — divergence-freedom after partition heal. The original
/// `divergence_freedom_no_two_replicas_hold_different_payload_at_same_seq`
/// test runs the byte-for-byte equality check on the happy path
/// only. This scenario partitions a replica before any
/// heartbeats land (so its tracker stays empty and it doesn't
/// self-elect), lets the leader advance + a non-partitioned
/// replica converge, heals the partition, lets catch-up
/// complete, and then runs the same byte-equality check.
#[test]
fn divergence_freedom_after_partition_heal() {
    let a = 0x10u64;
    let b = 0x20u64;
    let c = 0x30u64;
    let mut cluster = VirtualCluster::new(&[a, b, c], "dst/divergence_after_heal");
    cluster.force_leader(a);
    cluster.force_replica(b);
    cluster.force_replica(c);

    // Partition C BEFORE any heartbeats land. With no observed
    // leader, C's silence-detection never trips, so it stays
    // Replica and never self-elects. Mirrors the shape of
    // `partition_heal_lets_isolated_replica_catch_up`.
    cluster.partition(a, c);
    cluster.partition(b, c);

    append_on_leader(&cluster, a, 12, "during-partition");
    // B catches up to 12 while C remains isolated.
    cluster.run_until(
        |cl| cl.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 12,
        "B catches up while C is partitioned",
    );
    // C remains at 0 — partition prevents catch-up.
    assert_eq!(cluster.nodes.get(&c).unwrap().tail_seq(), 0);

    // Heal the partition.
    cluster.heal_partition(a, c);
    cluster.heal_partition(b, c);

    // C catches up to 12.
    cluster.run_until(
        |cl| cl.nodes.get(&c).map(|n| n.tail_seq()).unwrap_or(0) == 12,
        "C catches up post-heal",
    );

    // C-2 invariant: every event at every seq on every replica
    // matches the leader's bytes — under the partition-heal
    // recovery path, not just happy-path.
    let leader = cluster.nodes.get(&a).unwrap();
    let leader_events = leader.file.read_range(0, 12);
    assert_eq!(leader_events.len(), 12);

    for replica_id in [b, c] {
        let replica = cluster.nodes.get(&replica_id).unwrap();
        let events = replica.file.read_range(0, 12);
        assert_eq!(
            events.len(),
            12,
            "post-heal: replica {replica_id:#x} has {} events; leader has 12",
            events.len(),
        );
        for (i, (le, re)) in leader_events.iter().zip(events.iter()).enumerate() {
            assert_eq!(
                le.entry.seq, re.entry.seq,
                "post-heal: replica {replica_id:#x} event {i} seq mismatch"
            );
            assert_eq!(
                le.payload.as_ref(),
                re.payload.as_ref(),
                "post-heal: replica {replica_id:#x} event {i} payload bytes differ"
            );
        }
    }
}

/// C-2 (companion): divergence-freedom after kill+revive. Same
/// byte-equality check as the happy-path test, but exercising
/// the restart-during-sync recovery path.
#[test]
fn divergence_freedom_after_replica_revival() {
    let a = 0x10u64;
    let b = 0x20u64;
    let mut cluster = VirtualCluster::new(&[a, b], "dst/divergence_after_revival");
    cluster.force_leader(a);
    cluster.force_replica(b);

    append_on_leader(&cluster, a, 5, "pre-kill");
    cluster.run_until(
        |cl| cl.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 5,
        "pre-kill convergence",
    );
    cluster.kill(b);
    append_on_leader(&cluster, a, 7, "during-kill");

    // Revive B.
    {
        let n = cluster.nodes.get_mut(&b).unwrap();
        n.killed = false;
        n.tracker = HeartbeatTracker::new(DST_HEARTBEAT_MS);
    }
    cluster.run_until(
        |cl| cl.nodes.get(&b).map(|n| n.tail_seq()).unwrap_or(0) == 12,
        "B catches up post-revival",
    );

    // Byte-equality check on the revived replica.
    let leader = cluster.nodes.get(&a).unwrap();
    let leader_events = leader.file.read_range(0, 12);
    let replica = cluster.nodes.get(&b).unwrap();
    let replica_events = replica.file.read_range(0, 12);
    assert_eq!(leader_events.len(), 12);
    assert_eq!(replica_events.len(), 12);
    for (i, (le, re)) in leader_events.iter().zip(replica_events.iter()).enumerate() {
        assert_eq!(
            le.entry.seq, re.entry.seq,
            "revival: event {i} seq mismatch"
        );
        assert_eq!(
            le.payload.as_ref(),
            re.payload.as_ref(),
            "revival: event {i} payload bytes differ"
        );
    }
}

/// v0.3 Phase D2 plumbing: when the cluster's
/// `default_bandwidth_class` is set to non-default (e.g.
/// Background), every emitted SyncRequest on the wire must
/// carry that class. Pre-fix the DST harness hard-coded
/// `Default::default()` (Foreground), so any future DST
/// scenario asserting class behavior would have looked like it
/// worked while testing nothing.
#[test]
fn dst_emitted_sync_request_carries_configured_bandwidth_class() {
    let a: NodeId = 0xA0;
    let b: NodeId = 0xB0;
    let mut cluster = VirtualCluster::new(&[a, b], "dst-class-plumbing")
        .with_default_bandwidth_class(net::adapter::net::redex::BandwidthClass::Background);

    // Promote a to Leader, b to Replica. b's tail is 0; a has
    // 5 events. b's next tick should emit a SyncRequest stamped
    // with Background.
    let node_a = cluster.nodes.get_mut(&a).unwrap();
    node_a.force_transition(ReplicaRole::Replica, TransitionSignal::CapabilitySelected);
    node_a.force_transition(ReplicaRole::Candidate, TransitionSignal::MissedHeartbeats);
    node_a.force_transition(ReplicaRole::Leader, TransitionSignal::ElectionWon);
    for i in 0..5 {
        node_a.file.append(format!("evt-{i}").as_bytes()).unwrap();
    }
    let node_b = cluster.nodes.get_mut(&b).unwrap();
    node_b.force_transition(ReplicaRole::Replica, TransitionSignal::CapabilitySelected);
    // Seed b's tracker with a believed-leader heartbeat so the
    // catchup-emit path fires.
    node_b
        .tracker
        .record_heartbeat(a, ReplicaRole::Leader, 5, cluster.now);

    // Tick b — should emit a SyncRequest in pending.
    cluster.tick_node(b);

    // Pull the staged SyncRequest out of `pending`. With Background
    // wired through tick, the request's class field is Background.
    let mut saw_request = false;
    while let Some((src, dst, inbound)) = cluster.pending.pop_front() {
        if src == b && dst == a {
            if let net::adapter::net::redex::Inbound::SyncRequest { msg, .. } = &inbound {
                assert_eq!(
                    msg.class,
                    net::adapter::net::redex::BandwidthClass::Background,
                    "DST: emitted SyncRequest must carry the cluster's configured class",
                );
                saw_request = true;
            }
        }
    }
    assert!(
        saw_request,
        "expected at least one SyncRequest to be staged for delivery"
    );
}