//! Epoch-boundary orchestration.
//!
//! Traces to: [SPEC §10](../docs/resources/SPEC.md).
//!
//! # Role
//!
//! [`run_epoch_boundary`] is the single public entry point an
//! embedder calls once per epoch-boundary to drive every
//! per-epoch piece of slashing state forward in a FIXED,
//! spec-mandated order. Each downstream step depends on the
//! state produced by earlier steps; reordering is a protocol
//! error and pinned by DSL-127's order tests.
//!
//! Spec-mandated step order:
//!
//!   1. Compute flag deltas over `participation`'s previous-epoch
//!      flags.
//!   2. Update inactivity scores over the same previous-epoch
//!      flags.
//!   3. Compute inactivity-leak penalties for the ending epoch.
//!   4. Finalise expired slashes (correlation penalty + reporter-
//!      bond release + exit lock).
//!   5. Rotate `ParticipationTracker` to `current_epoch_ending + 1`.
//!   6. Advance `SlashingManager` epoch.
//!   7. Resize trackers if `validator_count` changed.
//!   8. Prune old processed evidence + correlation-window
//!      entries.
//!
//! # Why this order
//!
//! - **1 before 2** — `update_for_epoch` reads the same
//!   previous-epoch flags the flag-delta computation reads.
//!   Running the update first would rotate the tracker before
//!   the delta pass, losing the previous-epoch data permanently.
//! - **3 before 4** — finalise uses correlation data that must
//!   reflect the most recent inactivity update; if penalties
//!   were computed after finalise, the cohort would use stale
//!   scores.
//! - **4 before 5** — `finalise_expired_slashes` reads
//!   `correlation_window` entries keyed by the CURRENT epoch;
//!   rotating the participation tracker first would confuse
//!   other consumers into believing the new epoch is active
//!   while the manager is still mid-finalise.
//! - **8 last** — pruning drops evidence and correlation rows
//!   that would otherwise be needed by earlier steps.

use std::collections::BTreeMap;

use dig_epoch::CORRELATION_WINDOW_EPOCHS;

use dig_protocol::Bytes32;
use serde::{Deserialize, Serialize};

use crate::bonds::BondEscrow;
use crate::error::SlashingError;
use crate::inactivity::{InactivityScoreTracker, in_finality_stall};
use crate::manager::{FinalisationResult, SlashingManager};
use crate::participation::{FlagDelta, ParticipationTracker, compute_flag_deltas};
use crate::protection::SlashingProtection;
use crate::traits::{CollateralSlasher, EffectiveBalanceView, RewardPayout, ValidatorView};
62/// Per-epoch finality view. Returns the epoch of the most
63/// recently FINALIZED Casper-FFG checkpoint. DSL-127 consults
64/// this to derive [`in_finality_stall`]; the orchestrator does
65/// not require a full Casper view, only the finalized-epoch
66/// height.
67///
68/// Implemented by the embedder's consensus integration (DSL-143
69/// full surface). Shipped here early because DSL-127 is the
70/// first caller.
71pub trait JustificationView {
72    /// Epoch of the most recent finalized checkpoint. `0` at
73    /// genesis before any checkpoint has finalized. DSL-127
74    /// derives `in_finality_stall` from this.
75    fn latest_finalized_epoch(&self) -> u64;
76
77    /// Most recently justified checkpoint in the current epoch.
78    /// DSL-075 source-justified appeal check consumer. Default:
79    /// zero checkpoint so DSL-127 fixtures that only care about
80    /// the stall flag don't have to implement the full surface.
81    fn current_justified_checkpoint(&self) -> crate::evidence::Checkpoint {
82        crate::evidence::Checkpoint {
83            epoch: 0,
84            root: dig_protocol::Bytes32::new([0u8; 32]),
85        }
86    }
87
88    /// Checkpoint justified in the previous epoch. DSL-075
89    /// consumer. Default: zero checkpoint.
90    fn previous_justified_checkpoint(&self) -> crate::evidence::Checkpoint {
91        crate::evidence::Checkpoint {
92            epoch: 0,
93            root: dig_protocol::Bytes32::new([0u8; 32]),
94        }
95    }
96
97    /// Most recently finalized checkpoint. Default: zero-root
98    /// at [`latest_finalized_epoch`] so the epoch leg matches
99    /// DSL-127's minimum contract even when the root is
100    /// uninitialised. DSL-076 consumer.
101    fn finalized_checkpoint(&self) -> crate::evidence::Checkpoint {
102        crate::evidence::Checkpoint {
103            epoch: self.latest_finalized_epoch(),
104            root: dig_protocol::Bytes32::new([0u8; 32]),
105        }
106    }
107
108    /// Canonical block root at `slot`, or `None` for
109    /// uncommitted / future slots. DSL-076/077 head check
110    /// consumer. Default: always `None`.
111    fn canonical_block_root_at_slot(&self, _slot: u64) -> Option<dig_protocol::Bytes32> {
112        None
113    }
114
115    /// Canonical target root for `epoch` (start-of-epoch
116    /// block root), or `None` past chain tip. DSL-076 target-root
117    /// consumer. Default: `None`.
118    fn canonical_target_root_for_epoch(&self, _epoch: u64) -> Option<dig_protocol::Bytes32> {
119        None
120    }
121}
122
123/// Summary produced by [`run_epoch_boundary`]. Carries every
124/// side-effect the caller needs to route downstream (logging,
125/// reward payouts, state snapshots).
126///
127/// The struct intentionally contains vectors rather than
128/// callback channels — the orchestrator is infallible by
129/// construction and produces a complete report in one pass.
130#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
131pub struct EpochBoundaryReport {
132    /// Per-validator reward/penalty deltas from DSL-082/083.
133    pub flag_deltas: Vec<FlagDelta>,
134    /// Per-validator inactivity-leak penalties from DSL-091/092.
135    /// Empty outside a finality stall.
136    pub inactivity_penalties: Vec<(u32, u64)>,
137    /// Slashes finalised this epoch (DSL-029..033).
138    pub finalisations: Vec<FinalisationResult>,
139    /// Whether a finality stall was in effect at the start of
140    /// the epoch boundary. Drives inactivity-leak branches.
141    pub in_finality_stall: bool,
142    /// Number of stale processed-evidence entries pruned
143    /// (step 8). Observability only.
144    pub pruned_entries: usize,
145}
146
147/// Drive one epoch-boundary pass. See module docs for order.
148///
149/// # Signatures
150///
151/// Every trait argument is `&mut dyn` / `&dyn` so the embedder
152/// can inject concrete state views without committing to
153/// generics on the slashing crate. The `usize` +`u64` scalars
154/// are snapshot values measured at block N-1 (the block that
155/// closes the epoch).
156///
157/// # Invariants
158///
159/// - After the call:
160///   - `participation.current_epoch_number() == current_epoch_ending + 1`
161///   - `manager.current_epoch() == current_epoch_ending + 1`
162///   - `inactivity.validator_count() == validator_count`
163#[allow(clippy::too_many_arguments)]
164pub fn run_epoch_boundary(
165    manager: &mut SlashingManager,
166    participation: &mut ParticipationTracker,
167    inactivity: &mut InactivityScoreTracker,
168    validator_set: &mut dyn ValidatorView,
169    effective_balances: &dyn EffectiveBalanceView,
170    bond_escrow: &mut dyn BondEscrow,
171    reward_payout: &mut dyn RewardPayout,
172    justification: &dyn JustificationView,
173    current_epoch_ending: u64,
174    validator_count: usize,
175    total_active_balance: u64,
176) -> EpochBoundaryReport {
177    // Derive finality-stall state ONCE up front. Both the
178    // inactivity-score update (step 2) and the penalty
179    // computation (step 3) branch on it; deriving here keeps
180    // them consistent even if `justification` is a racing
181    // reference (should not happen under the chain lock, but
182    // defensive).
183    let finalized_epoch = justification.latest_finalized_epoch();
184    let stall = in_finality_stall(current_epoch_ending, finalized_epoch);
185
186    // ── Step 1: flag deltas over previous-epoch flags ─────
187    let flag_deltas = compute_flag_deltas(
188        participation,
189        effective_balances,
190        total_active_balance,
191        stall,
192    );
193
194    // ── Step 1b (DSL-169): route flag-delta rewards through
195    // RewardPayout. For every delta with reward > 0, call
196    // `reward_payout.pay(puzzle_hash, reward)`. Zero-reward
197    // deltas are filtered so no-op payments do not spam the
198    // embedder's accumulator. Validators missing from the view
199    // are silently skipped (defensive tolerance — the view may
200    // drift between DSL-082 computation and here if a parallel
201    // mutation is in flight, though under the chain lock this
202    // should not happen).
203    for fd in &flag_deltas {
204        if fd.reward == 0 {
205            continue;
206        }
207        if let Some(entry) = validator_set.get(fd.validator_index) {
208            reward_payout.pay(entry.puzzle_hash(), fd.reward);
209        }
210    }
211
212    // ── Step 2: inactivity-score update (reads same flags) ─
213    inactivity.update_for_epoch(participation, stall);
214
215    // ── Step 3: inactivity-leak penalties for ending epoch ─
216    let inactivity_penalties = inactivity.epoch_penalties(effective_balances, stall);
217
218    // ── Step 3b (DSL-169): apply inactivity-leak penalties to
219    // validator stakes via `ValidatorEntry::slash_absolute`.
220    // DSL-091/092 computes the per-validator penalty_mojos; the
221    // wiring here actually debits the stake. Missing validators
222    // skipped (same rationale as step 1b).
223    for &(idx, penalty_mojos) in &inactivity_penalties {
224        if let Some(entry) = validator_set.get_mut(idx) {
225            entry.slash_absolute(penalty_mojos, current_epoch_ending);
226        }
227    }
228
229    // ── Step 4: finalise expired slashes ─────────────────
230    let finalisations = manager.finalise_expired_slashes(
231        validator_set,
232        effective_balances,
233        bond_escrow,
234        total_active_balance,
235    );
236
237    // ── Step 5: rotate participation tracker ──────────────
238    participation.rotate_epoch(current_epoch_ending + 1, validator_count);
239
240    // ── Step 6: advance SlashingManager epoch ─────────────
241    manager.set_epoch(current_epoch_ending + 1);
242
243    // ── Step 7: resize trackers if validator count changed ─
244    if inactivity.validator_count() != validator_count {
245        inactivity.resize_for(validator_count);
246    }
247
248    // ── Step 8: prune old processed evidence + corr-window ─
249    // Cutoff = current_epoch_ending.saturating_sub(CORRELATION_WINDOW_EPOCHS).
250    // Keeps everything within the correlation window reachable
251    // by future DSL-030 cohort-sum computations.
252    let cutoff = current_epoch_ending.saturating_sub(u64::from(CORRELATION_WINDOW_EPOCHS));
253    let pruned_entries = manager.prune_processed_older_than(cutoff);
254
255    EpochBoundaryReport {
256        flag_deltas,
257        inactivity_penalties,
258        finalisations,
259        in_finality_stall: stall,
260        pruned_entries,
261    }
262}
263
// `BTreeMap` imported above for Visualiser-friendly diff when
// the module grows; currently not used directly. Suppress with
// a no-op to avoid unused-import churn.
#[allow(dead_code)]
type _KeepBTreeMap<K, V> = BTreeMap<K, V>;

270/// Summary produced by [`rewind_all_on_reorg`]. Carries per-
271/// subsystem rewind outcomes so the caller (a chain-shell
272/// orchestrator) can log or emit metrics without re-deriving
273/// the rewind scope from internal tracker state.
274#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
275pub struct ReorgReport {
276    /// Evidence hashes rewound by
277    /// [`SlashingManager::rewind_on_reorg`] (DSL-129).
278    pub rewound_pending_slashes: Vec<Bytes32>,
279    /// Epochs dropped from the participation tracker (= reorg
280    /// depth at the moment the tracker was rewound).
281    pub participation_epochs_dropped: u64,
282    /// Epochs dropped from the inactivity tracker (same depth —
283    /// the inactivity tracker does not carry an epoch counter,
284    /// so the caller's computed depth is carried through for
285    /// uniform metric reporting).
286    pub inactivity_epochs_dropped: u64,
287    /// Whether `SlashingProtection::reconcile_with_chain_tip`
288    /// was called. `true` in every successful rewind; exposed
289    /// as a field for symmetry / future branching.
290    pub protection_rewound: bool,
291}
292
293/// Global reorg orchestrator. Rewinds every slashing-state
294/// subsystem in a fixed order.
295///
296/// Implements [DSL-130](../docs/requirements/domains/orchestration/specs/DSL-130.md).
297/// Traces to SPEC §13.
298///
299/// # Step order
300///
301///   1. [`SlashingManager::rewind_on_reorg`] (DSL-129) — must
302///      run FIRST because it reads validator-set state that the
303///      other rewinds do not touch; running it after a
304///      participation rewind would confuse the `is_slashed`
305///      check inside `credit_stake` / `restore_status`.
306///   2. [`ParticipationTracker::rewind_on_reorg`] — zero-fills
307///      both flag vectors and anchors current_epoch at the
308///      new tip.
309///   3. [`InactivityScoreTracker::rewind_on_reorg`] — zero-
310///      fills every score.
311///   4. [`SlashingProtection::reconcile_with_chain_tip`]
312///      (DSL-099) — caps proposal + attestation watermarks at
313///      the new tip and clears the attested-block hash binding.
314///
315/// After success, `manager.current_epoch()` is reset to
316/// `new_tip_epoch` so the orchestration state carries the
317/// post-reorg epoch forward.
318///
319/// # Depth limit
320///
321/// `current - new_tip_epoch > CORRELATION_WINDOW_EPOCHS` ⇒
322/// `SlashingError::ReorgTooDeep`. The correlation window is
323/// the deepest state we can reconstruct — older `slashed_in_window`
324/// rows have been pruned (DSL-127 step 8) and no subsystem
325/// retains snapshots further back.
326///
327/// # Errors
328///
329/// - [`SlashingError::ReorgTooDeep`] — reorg depth exceeds
330///   retention. No state is mutated; caller must recover via a
331///   longer-range reconciliation path (checkpoint restore /
332///   full resync).
333#[allow(clippy::too_many_arguments)]
334pub fn rewind_all_on_reorg(
335    manager: &mut SlashingManager,
336    participation: &mut ParticipationTracker,
337    inactivity: &mut InactivityScoreTracker,
338    protection: &mut SlashingProtection,
339    validator_set: &mut dyn ValidatorView,
340    collateral: Option<&mut dyn CollateralSlasher>,
341    bond_escrow: &mut dyn BondEscrow,
342    new_tip_epoch: u64,
343    new_tip_slot: u64,
344    validator_count: usize,
345) -> Result<ReorgReport, SlashingError> {
346    let current_epoch = manager.current_epoch();
347    let depth = current_epoch.saturating_sub(new_tip_epoch);
348    let limit = u64::from(CORRELATION_WINDOW_EPOCHS);
349    if depth > limit {
350        return Err(SlashingError::ReorgTooDeep { depth, limit });
351    }
352
353    // ── Step 1: manager rewind ────────────────────────────
354    let rewound_pending_slashes =
355        manager.rewind_on_reorg(new_tip_epoch, validator_set, collateral, bond_escrow);
356
357    // ── Step 2: participation rewind ──────────────────────
358    let participation_epochs_dropped =
359        participation.rewind_on_reorg(new_tip_epoch, validator_count);
360
361    // ── Step 3: inactivity rewind ─────────────────────────
362    let inactivity_epochs_dropped = inactivity.rewind_on_reorg(depth);
363
364    // ── Step 4: protection reconcile ──────────────────────
365    protection.reconcile_with_chain_tip(new_tip_slot, new_tip_epoch);
366
367    // Anchor the manager's epoch at the new tip so future
368    // epoch-boundary passes compute correctly.
369    manager.set_epoch(new_tip_epoch);
370
371    Ok(ReorgReport {
372        rewound_pending_slashes,
373        participation_epochs_dropped,
374        inactivity_epochs_dropped,
375        protection_rewound: true,
376    })
377}