Skip to main content

bb_runtime/framework/
backoff_table.rs

//! `BackoffTable` - per-peer exponential backoff state used by wire
//! syscalls + transport adapters per ENGINE.md §10.2.
//!
//! Each peer tracks consecutive failures, the timestamp of the most
//! recent attempt, and the computed next-retry time. The backoff
//! schedule is exponential with a configurable base and cap, where
//! `n >= 1` is the number of consecutive failures:
//!
//! ```text
//! delay(n) = min(BASE_NS * 2^(n-1), MAX_DELAY_NS)
//! ```
//!
//! On success, `record_success(peer)` resets state. On failure,
//! `record_failure(peer, now_ns)` bumps the attempt counter and
//! schedules the next retry. `should_retry(peer, now_ns)` reports
//! whether the cooldown has elapsed (peers never seen before are
//! always allowed to retry).
17
18use std::collections::HashMap;
19
20use crate::ids::PeerId;
21
/// Delay applied after the first failure when no custom schedule is
/// given: 10 ms, expressed in nanoseconds.
pub const DEFAULT_BASE_NS: u64 = 10 * 1_000_000;

/// Upper bound on any scheduled delay when no custom schedule is
/// given: 60 s, expressed in nanoseconds. Keeps the doubling from
/// growing without limit during a long outage.
pub const DEFAULT_MAX_DELAY_NS: u64 = 60 * 1_000_000_000;
28
/// Backoff bookkeeping kept for a single peer.
///
/// A plain `Copy` value so it can be handed out by `BackoffTable::state`
/// and round-tripped through `BackoffTable::restore_state`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BackoffState {
    /// Count of consecutive failures (or remote advisories) recorded
    /// since the last success.
    pub attempts: u32,
    /// The `now_ns` passed to the most recent `record_failure` or
    /// `record_remote_advisory` call for this peer.
    pub last_attempt_ns: u64,
    /// First `now_ns` value at which a retry is allowed again.
    pub next_retry_ns: u64,
}
39
40/// Per-peer exponential backoff table.
41pub struct BackoffTable {
42    states: HashMap<PeerId, BackoffState>,
43    base_ns: u64,
44    max_delay_ns: u64,
45}
46
47impl Default for BackoffTable {
48    fn default() -> Self {
49        Self::new()
50    }
51}
52
53impl BackoffTable {
54    /// Construct a fresh table using the spec's default schedule
55    /// (10 ms base, 60 s cap).
56    pub fn new() -> Self {
57        Self::with_schedule(DEFAULT_BASE_NS, DEFAULT_MAX_DELAY_NS)
58    }
59
60    /// Construct a table with a custom exponential schedule.
61    pub fn with_schedule(base_ns: u64, max_delay_ns: u64) -> Self {
62        Self {
63            states: HashMap::new(),
64            base_ns: base_ns.max(1),
65            max_delay_ns: max_delay_ns.max(1),
66        }
67    }
68
69    /// Record a failure for `peer` observed at `now_ns`. Increments
70    /// the attempt counter + schedules the next retry.
71    pub fn record_failure(&mut self, peer: PeerId, now_ns: u64) {
72        let attempts = self
73            .states
74            .get(&peer)
75            .map(|s| s.attempts.saturating_add(1))
76            .unwrap_or(1);
77        let next_retry_ns = now_ns.saturating_add(self.delay_for(attempts));
78        self.states.insert(
79            peer,
80            BackoffState {
81                attempts,
82                last_attempt_ns: now_ns,
83                next_retry_ns,
84            },
85        );
86    }
87
88    /// Record a remotely-advised back-off for `peer` per the
89    /// backpressure protocol at
90    /// `docs/internal/superpowers/specs/2026-06-23-backpressure-runtime.md`
91    /// §5.2. Unlike [`Self::record_failure`], the advisory sets
92    /// `next_retry_ns = now_ns + min_backoff_ns` directly instead of
93    /// applying the local exponential schedule - the receiver knows
94    /// best how long the sender should pause.
95    ///
96    /// Bumps the attempt counter so a subsequent local
97    /// `record_failure` resumes the exponential schedule at the
98    /// next step. Caps `min_backoff_ns` at `max_delay_ns` so a
99    /// pathological remote advisory cannot pin the peer indefinitely.
100    pub fn record_remote_advisory(&mut self, peer: PeerId, now_ns: u64, min_backoff_ns: u64) {
101        let attempts = self
102            .states
103            .get(&peer)
104            .map(|s| s.attempts.saturating_add(1))
105            .unwrap_or(1);
106        let capped = min_backoff_ns.min(self.max_delay_ns);
107        let next_retry_ns = now_ns.saturating_add(capped);
108        self.states.insert(
109            peer,
110            BackoffState {
111                attempts,
112                last_attempt_ns: now_ns,
113                next_retry_ns,
114            },
115        );
116    }
117
118    /// Record a success for `peer`; clears any tracked failure
119    /// state. Subsequent `should_retry` returns `true` immediately.
120    pub fn record_success(&mut self, peer: PeerId) {
121        self.states.remove(&peer);
122    }
123
124    /// Whether a retry to `peer` is permitted at `now_ns`. Peers
125    /// with no recorded failures retry immediately.
126    pub fn should_retry(&self, peer: PeerId, now_ns: u64) -> bool {
127        match self.states.get(&peer) {
128            None => true,
129            Some(state) => now_ns >= state.next_retry_ns,
130        }
131    }
132
133    /// Inspect the recorded state for `peer`. Returns `None` when no
134    /// failures have been recorded.
135    pub fn state(&self, peer: PeerId) -> Option<BackoffState> {
136        self.states.get(&peer).copied()
137    }
138
139    /// Number of peers currently being tracked.
140    pub fn len(&self) -> usize {
141        self.states.len()
142    }
143
144    /// Whether any peer is currently backing off.
145    pub fn is_empty(&self) -> bool {
146        self.states.is_empty()
147    }
148
149    /// Iterate `(PeerId, BackoffState)` for snapshot capture.
150    /// .
151    pub fn iter(&self) -> impl Iterator<Item = (PeerId, BackoffState)> + '_ {
152        self.states.iter().map(|(p, s)| (*p, *s))
153    }
154
155    /// Restore one peer's backoff state directly. Used by
156    /// `Node::restore` to re-seed peers from a
157    /// `FrameworkSnapshot::backoff_table` entry without going
158    /// through the failure-counting path.
159    pub fn restore_state(&mut self, peer: PeerId, state: BackoffState) {
160        self.states.insert(peer, state);
161    }
162
163    fn delay_for(&self, attempts: u32) -> u64 {
164        // 2^(n-1) backoff (attempts >= 1 after first failure). Clamp
165        // shift so we never overflow before applying the max cap.
166        let shift = attempts.saturating_sub(1).min(63);
167        let factor = 1u64.checked_shl(shift).unwrap_or(u64::MAX);
168        self.base_ns.saturating_mul(factor).min(self.max_delay_ns)
169    }
170}
171