bb_runtime/framework/backoff_table.rs
1//! `BackoffTable` - per-peer exponential backoff state used by wire
2//! syscalls + transport adapters per ENGINE.md §10.2.
3//!
//! Each peer tracks consecutive failures, the timestamp of the most
//! recent attempt, and the computed next-retry time. The backoff
//! schedule is exponential with a configurable base and cap, where
//! `n >= 1` is the number of consecutive failures:
//!
//! ```text
//! delay(n) = min(BASE_NS * 2^(n - 1), MAX_DELAY_NS)
//! ```
11//!
//! On success, `record_success(peer)` clears the peer's state. On
//! failure, `record_failure(peer, now_ns)` bumps the attempt counter
//! and schedules the next retry. `should_retry(peer, now_ns)` reports
//! whether the cooldown has elapsed (peers never seen before are
//! always allowed to retry).
17
18use std::collections::HashMap;
19
20use crate::ids::PeerId;
21
/// Base delay of the default schedule: 10 ms, expressed in nanoseconds.
/// This is the wait applied after the first recorded failure.
pub const DEFAULT_BASE_NS: u64 = 10 * 1_000_000;
24
/// Upper bound of the default schedule: 60 s, expressed in nanoseconds.
/// Keeps the doubling from growing without limit during long outages.
pub const DEFAULT_MAX_DELAY_NS: u64 = 60 * 1_000_000_000;
28
/// Backoff bookkeeping kept for a single peer.
///
/// All timestamps are caller-supplied `now_ns` values; the type itself
/// never reads a clock.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BackoffState {
    /// Count of consecutive failures recorded for the peer.
    pub attempts: u32,
    /// The `now_ns` passed to the most recent `record_failure` or
    /// `record_remote_advisory` call.
    pub last_attempt_ns: u64,
    /// Earliest `now_ns` at which a retry is allowed again.
    pub next_retry_ns: u64,
}
39
40/// Per-peer exponential backoff table.
41pub struct BackoffTable {
42 states: HashMap<PeerId, BackoffState>,
43 base_ns: u64,
44 max_delay_ns: u64,
45}
46
47impl Default for BackoffTable {
48 fn default() -> Self {
49 Self::new()
50 }
51}
52
53impl BackoffTable {
54 /// Construct a fresh table using the spec's default schedule
55 /// (10 ms base, 60 s cap).
56 pub fn new() -> Self {
57 Self::with_schedule(DEFAULT_BASE_NS, DEFAULT_MAX_DELAY_NS)
58 }
59
60 /// Construct a table with a custom exponential schedule.
61 pub fn with_schedule(base_ns: u64, max_delay_ns: u64) -> Self {
62 Self {
63 states: HashMap::new(),
64 base_ns: base_ns.max(1),
65 max_delay_ns: max_delay_ns.max(1),
66 }
67 }
68
69 /// Record a failure for `peer` observed at `now_ns`. Increments
70 /// the attempt counter + schedules the next retry.
71 pub fn record_failure(&mut self, peer: PeerId, now_ns: u64) {
72 let attempts = self
73 .states
74 .get(&peer)
75 .map(|s| s.attempts.saturating_add(1))
76 .unwrap_or(1);
77 let next_retry_ns = now_ns.saturating_add(self.delay_for(attempts));
78 self.states.insert(
79 peer,
80 BackoffState {
81 attempts,
82 last_attempt_ns: now_ns,
83 next_retry_ns,
84 },
85 );
86 }
87
88 /// Record a remotely-advised back-off for `peer` per the
89 /// backpressure protocol at
90 /// `docs/internal/superpowers/specs/2026-06-23-backpressure-runtime.md`
91 /// §5.2. Unlike [`Self::record_failure`], the advisory sets
92 /// `next_retry_ns = now_ns + min_backoff_ns` directly instead of
93 /// applying the local exponential schedule - the receiver knows
94 /// best how long the sender should pause.
95 ///
96 /// Bumps the attempt counter so a subsequent local
97 /// `record_failure` resumes the exponential schedule at the
98 /// next step. Caps `min_backoff_ns` at `max_delay_ns` so a
99 /// pathological remote advisory cannot pin the peer indefinitely.
100 pub fn record_remote_advisory(&mut self, peer: PeerId, now_ns: u64, min_backoff_ns: u64) {
101 let attempts = self
102 .states
103 .get(&peer)
104 .map(|s| s.attempts.saturating_add(1))
105 .unwrap_or(1);
106 let capped = min_backoff_ns.min(self.max_delay_ns);
107 let next_retry_ns = now_ns.saturating_add(capped);
108 self.states.insert(
109 peer,
110 BackoffState {
111 attempts,
112 last_attempt_ns: now_ns,
113 next_retry_ns,
114 },
115 );
116 }
117
118 /// Record a success for `peer`; clears any tracked failure
119 /// state. Subsequent `should_retry` returns `true` immediately.
120 pub fn record_success(&mut self, peer: PeerId) {
121 self.states.remove(&peer);
122 }
123
124 /// Whether a retry to `peer` is permitted at `now_ns`. Peers
125 /// with no recorded failures retry immediately.
126 pub fn should_retry(&self, peer: PeerId, now_ns: u64) -> bool {
127 match self.states.get(&peer) {
128 None => true,
129 Some(state) => now_ns >= state.next_retry_ns,
130 }
131 }
132
133 /// Inspect the recorded state for `peer`. Returns `None` when no
134 /// failures have been recorded.
135 pub fn state(&self, peer: PeerId) -> Option<BackoffState> {
136 self.states.get(&peer).copied()
137 }
138
139 /// Number of peers currently being tracked.
140 pub fn len(&self) -> usize {
141 self.states.len()
142 }
143
144 /// Whether any peer is currently backing off.
145 pub fn is_empty(&self) -> bool {
146 self.states.is_empty()
147 }
148
149 /// Iterate `(PeerId, BackoffState)` for snapshot capture.
150 /// .
151 pub fn iter(&self) -> impl Iterator<Item = (PeerId, BackoffState)> + '_ {
152 self.states.iter().map(|(p, s)| (*p, *s))
153 }
154
155 /// Restore one peer's backoff state directly. Used by
156 /// `Node::restore` to re-seed peers from a
157 /// `FrameworkSnapshot::backoff_table` entry without going
158 /// through the failure-counting path.
159 pub fn restore_state(&mut self, peer: PeerId, state: BackoffState) {
160 self.states.insert(peer, state);
161 }
162
163 fn delay_for(&self, attempts: u32) -> u64 {
164 // 2^(n-1) backoff (attempts >= 1 after first failure). Clamp
165 // shift so we never overflow before applying the max cap.
166 let shift = attempts.saturating_sub(1).min(63);
167 let factor = 1u64.checked_shl(shift).unwrap_or(u64::MAX);
168 self.base_ns.saturating_mul(factor).min(self.max_delay_ns)
169 }
170}
171