1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
//! Exponential backoff for failed peer connection attempts.
//!
//! This module provides per-peer backoff tracking by socket address to prevent
//! rapid repeated connection attempts to the same peer. Unlike `ConnectionBackoff`
//! which uses location buckets, this tracks individual peers precisely.
//!
//! See issue #2484 for motivation: telemetry showed peers attempting connections
//! every 4 seconds to the same target, with 58% of attempts within 5 seconds of
//! the previous attempt.
use crate::util::backoff::{ExponentialBackoff, TrackedBackoff};
use std::net::SocketAddr;
use std::time::Duration;
/// Per-peer tracker of connection-attempt backoff, keyed by the peer's socket address.
///
/// The delay schedule is exponential: the first failure waits `base_interval`, and each
/// further consecutive failure doubles the wait (`base_interval * 2^(consecutive_failures-1)`)
/// until the delay reaches `max_backoff`, where it stays.
#[derive(Debug)]
pub struct PeerConnectionBackoff {
    /// Underlying backoff bookkeeping, one entry per `SocketAddr`.
    inner: TrackedBackoff<SocketAddr>,
}
impl Default for PeerConnectionBackoff {
fn default() -> Self {
Self::new()
}
}
impl PeerConnectionBackoff {
    /// Default base backoff interval (30 seconds).
    ///
    /// This is set high enough that even the first failure creates meaningful backoff.
    /// Connect requests arrive approximately every 60 seconds (operation timeout interval),
    /// so a 30-second base ensures the first failure already blocks half of subsequent attempts.
    /// See issue #2595 for context.
    const DEFAULT_BASE_INTERVAL: Duration = Duration::from_secs(30);

    /// Default maximum backoff interval (90 seconds).
    ///
    /// With 30s base and exponential growth (30s → 60s → 120s clamped to the
    /// 90s cap), persistent failures cap quickly at 90s by the third failure.
    /// The previous 600s cap was appropriate for random ring
    /// peers but far too aggressive for configured gateways: a single gateway in a
    /// 10-minute backoff means the node cannot bootstrap at all. NAT traversal
    /// failures are transient (network change, temporary congestion) so a 90s cap
    /// gives the network time to stabilize without long-term isolation.
    ///
    /// `PeerConnectionBackoff` is currently used only for the `gateway_backoff`
    /// tracker. If it is ever reused for ring peers, per-peer-class caps should
    /// be introduced via `with_config()` rather than raising this default.
    /// See issues #2595 and #3304.
    const DEFAULT_MAX_BACKOFF: Duration = Duration::from_secs(90);

    /// Default maximum number of tracked entries.
    const DEFAULT_MAX_ENTRIES: usize = 1024;

    /// Create a new backoff tracker with default settings.
    ///
    /// Respects the `FREENET_BACKOFF_BASE_SECS` environment variable to override the
    /// base interval (useful for CI/integration tests where 30s is too aggressive).
    /// The value is a whole number of seconds; surrounding whitespace is ignored.
    /// An unset or empty variable selects the default silently, while a value that
    /// is not a valid `u64` is reported with a warning before falling back to the
    /// default.
    pub fn new() -> Self {
        let base = Self::base_interval_from_env().unwrap_or(Self::DEFAULT_BASE_INTERVAL);
        let config = ExponentialBackoff::new(base, Self::DEFAULT_MAX_BACKOFF);
        Self {
            inner: TrackedBackoff::new(config, Self::DEFAULT_MAX_ENTRIES),
        }
    }

    /// Read the base-interval override from `FREENET_BACKOFF_BASE_SECS`.
    ///
    /// Returns `None` when the variable is unset, not valid Unicode, empty, or
    /// not parseable as `u64` seconds. Only the unparsable case is logged, so a
    /// mistyped override does not go unnoticed.
    fn base_interval_from_env() -> Option<Duration> {
        let raw = std::env::var("FREENET_BACKOFF_BASE_SECS").ok()?;
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            // An empty value is a common way of "unsetting" a variable; treat it as absent.
            return None;
        }
        match trimmed.parse::<u64>() {
            Ok(secs) => Some(Duration::from_secs(secs)),
            Err(error) => {
                tracing::warn!(
                    value = %raw,
                    %error,
                    "Ignoring invalid FREENET_BACKOFF_BASE_SECS; using default base interval"
                );
                None
            }
        }
    }

    /// Create a new backoff tracker with custom settings.
    #[cfg(test)]
    pub fn with_config(base_interval: Duration, max_backoff: Duration, max_entries: usize) -> Self {
        let config = ExponentialBackoff::new(base_interval, max_backoff);
        Self {
            inner: TrackedBackoff::new(config, max_entries),
        }
    }

    /// Check if a target peer is currently in backoff.
    ///
    /// Returns `true` if we should skip this target, `false` if we can attempt connection.
    pub fn is_in_backoff(&self, peer_addr: SocketAddr) -> bool {
        self.inner.is_in_backoff(&peer_addr)
    }

    /// Get the remaining backoff duration for a peer, if any.
    ///
    /// Returns `Some(duration)` if peer is in backoff, `None` otherwise.
    pub fn remaining_backoff(&self, peer_addr: SocketAddr) -> Option<Duration> {
        self.inner.remaining_backoff(&peer_addr)
    }

    /// Record a connection failure for a target peer.
    ///
    /// Delegates the bookkeeping to the inner tracker and emits a debug log with
    /// the resulting consecutive-failure count and the delay reported by
    /// `delay_for_failures` for that count.
    pub fn record_failure(&mut self, peer_addr: SocketAddr) {
        // Count *after* this failure, computed once. `saturating_add` keeps the
        // arithmetic from overflowing should the stored count ever sit at `u32::MAX`.
        let failures = self.inner.failure_count(&peer_addr).saturating_add(1);
        self.inner.record_failure(peer_addr);
        let backoff = self.inner.config().delay_for_failures(failures);
        tracing::debug!(
            peer = %peer_addr,
            failures = failures,
            backoff_secs = backoff.as_secs(),
            "Peer connection in backoff"
        );
    }

    /// Record a successful connection to a target peer.
    ///
    /// Clears the backoff state for that peer. A debug line is logged only when
    /// the peer actually had recorded failures.
    pub fn record_success(&mut self, peer_addr: SocketAddr) {
        if self.inner.failure_count(&peer_addr) > 0 {
            tracing::debug!(peer = %peer_addr, "Peer connection backoff cleared");
        }
        self.inner.record_success(&peer_addr);
    }

    /// Clean up expired backoff entries (those past their retry time and stale).
    ///
    /// Removes entries that are both past their retry_after time AND have been
    /// in backoff for longer than max_backoff (i.e., stale entries that haven't
    /// had recent failures). Called periodically to prevent unbounded growth.
    pub fn cleanup_expired(&mut self) {
        self.inner.cleanup_expired();
    }

    /// Clear all backoff state. Used during isolation recovery when all
    /// previous backoff timers are stale.
    pub fn clear(&mut self) {
        self.inner.clear();
    }

    /// Get the consecutive failure count for a peer (for testing).
    #[cfg(test)]
    fn failure_count(&self, peer_addr: SocketAddr) -> u32 {
        self.inner.failure_count(&peer_addr)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Loopback peer address on `port`; the same value as parsing `"127.0.0.1:<port>"`.
    fn loopback(port: u16) -> SocketAddr {
        SocketAddr::from(([127, 0, 0, 1], port))
    }

    /// Regression test for issue #3304: gateway backoff must not exceed 90s.
    ///
    /// Before #3304, `DEFAULT_MAX_BACKOFF` was 600s, so a node with a single
    /// configured gateway could be isolated for up to 10 minutes after repeated
    /// NAT traversal failures. This goes through the production constructor
    /// (`new()`) so that path is validated too.
    #[test]
    fn test_default_max_backoff_is_90s_for_gateway_recovery() {
        let mut tracker = PeerConnectionBackoff::new();
        let gateway = loopback(8080);

        // Enough consecutive failures to be pinned at the cap.
        (0..10).for_each(|_| tracker.record_failure(gateway));

        let remaining = tracker.remaining_backoff(gateway).unwrap();
        // TrackedBackoff applies ±20% jitter, so the worst case is 90s * 1.2 = 108s.
        let ceiling = Duration::from_secs(108);
        assert!(
            remaining <= ceiling,
            "Gateway backoff exceeded 90s cap + jitter: {remaining:?} — issue #3304"
        );
    }

    /// Regression guard for issue #3329 / #3304: the gateway backoff
    /// progression must escalate at most to the 90s cap and NEVER reach the
    /// old 600s (10-minute) blackout, however many consecutive failures accrue.
    ///
    /// The production incident was a single configured gateway whose repeated
    /// NAT-traversal failures pushed the per-address backoff to 600s, isolating
    /// the node for ten minutes. The test drives the *deterministic* delay
    /// calculator (`delay_for_failures`, which applies no jitter) off the
    /// production defaults, so it pins the exact cap rather than a jittered
    /// upper bound.
    ///
    /// The tracker is built with `with_config` from the production `DEFAULT_*`
    /// constants — NOT `new()`, which reads `FREENET_BACKOFF_BASE_SECS` and would
    /// pick up a CI-overridden base (CI sets it to 5s on the workspace test
    /// step). Expected values are derived from those same constants so the test
    /// follows the source of truth instead of drifting literals.
    #[test]
    fn test_gateway_backoff_progression_caps_at_90s_never_600s() {
        let base = PeerConnectionBackoff::DEFAULT_BASE_INTERVAL;
        let cap = PeerConnectionBackoff::DEFAULT_MAX_BACKOFF;
        let tracker = PeerConnectionBackoff::with_config(
            base,
            cap,
            PeerConnectionBackoff::DEFAULT_MAX_ENTRIES,
        );
        let schedule = tracker.inner.config();

        // With the 30s base and 90s cap the deterministic progression is
        // base → base*2 → base*2^2 clamped to the cap: 30s → 60s → 90s.
        assert_eq!(schedule.max(), cap);
        assert_eq!(schedule.delay_for_failures(1), base);
        assert_eq!(schedule.delay_for_failures(2), base * 2);
        assert_eq!(schedule.delay_for_failures(3), cap);

        // From the 3rd failure on, the delay stays at the cap — even across a
        // pathological 50-failure streak. The explicit `< 600s` check is the
        // named #3329/#3304 tripwire: 600 is the intentional historical literal
        // the cap must never reach again.
        let old_blackout = Duration::from_secs(600);
        for failures in 3..=50 {
            let delay = schedule.delay_for_failures(failures);
            assert_eq!(
                delay, cap,
                "backoff escalated past the {cap:?} cap at {failures} failures — issue #3329"
            );
            assert!(
                delay < old_blackout,
                "backoff reached the old 600s blackout at {failures} failures — issue #3329"
            );
        }
    }

    /// A fresh tracker reports no backoff for an address it has never seen.
    #[test]
    fn test_not_in_backoff_initially() {
        let tracker = PeerConnectionBackoff::new();
        assert!(!tracker.is_in_backoff(loopback(8080)));
    }

    /// One recorded failure is enough to put the peer in backoff.
    #[test]
    fn test_in_backoff_after_failure() {
        let mut tracker = PeerConnectionBackoff::new();
        let peer = loopback(8080);

        tracker.record_failure(peer);

        assert!(tracker.is_in_backoff(peer));
    }

    /// A success after a failure lifts the backoff for that peer.
    #[test]
    fn test_backoff_cleared_on_success() {
        let mut tracker = PeerConnectionBackoff::new();
        let peer = loopback(8080);

        tracker.record_failure(peer);
        assert!(tracker.is_in_backoff(peer));

        tracker.record_success(peer);
        assert!(!tracker.is_in_backoff(peer));
    }

    /// `delay_for_failures` follows `base * 2^(n-1)` while below the cap.
    #[test]
    fn test_exponential_backoff_calculation() {
        let schedule = ExponentialBackoff::new(Duration::from_secs(1), Duration::from_secs(300));

        // (consecutive failures, expected delay in seconds)
        for (failures, expected_secs) in [(1, 1), (2, 2), (3, 4), (4, 8)] {
            assert_eq!(
                schedule.delay_for_failures(failures),
                Duration::from_secs(expected_secs)
            );
        }
    }

    /// Long failure streaks are clamped to the configured maximum.
    #[test]
    fn test_backoff_capped_at_max() {
        let cap = Duration::from_secs(60);
        let schedule = ExponentialBackoff::new(Duration::from_secs(10), cap);

        for failures in [10, 20] {
            assert_eq!(schedule.delay_for_failures(failures), cap);
        }
    }

    /// A failure on one address must not affect a different address.
    #[test]
    fn test_different_peers_tracked_separately() {
        let mut tracker = PeerConnectionBackoff::new();
        let (failed, untouched) = (loopback(8080), loopback(8081));

        tracker.record_failure(failed);

        assert!(tracker.is_in_backoff(failed));
        assert!(!tracker.is_in_backoff(untouched));
    }

    /// Recording more peers than `max_entries` keeps the table within its limit.
    #[test]
    fn test_eviction_when_max_entries_exceeded() {
        // Deliberately tiny capacity so that 20 distinct peers overflow it.
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(5),
            Duration::from_secs(300),
            10,
        );

        for offset in 0..20u16 {
            tracker.record_failure(loopback(8080 + offset));
        }

        assert!(tracker.inner.len() <= 10);
    }

    /// Each failure bumps the consecutive-failure counter by one.
    #[test]
    fn test_consecutive_failures_increase_backoff() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(1),
            Duration::from_secs(300),
            1024,
        );
        let peer = loopback(8080);

        for expected in [1, 2] {
            tracker.record_failure(peer);
            assert_eq!(tracker.failure_count(peer), expected);
        }
    }

    /// `remaining_backoff` is `None` before any failure and near the base interval after one.
    #[test]
    fn test_remaining_backoff() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(10),
            Duration::from_secs(300),
            1024,
        );
        let peer = loopback(8080);

        // Nothing recorded yet, so nothing remains.
        assert!(tracker.remaining_backoff(peer).is_none());

        tracker.record_failure(peer);
        let remaining = tracker.remaining_backoff(peer);
        assert!(remaining.is_some());

        // ±20% jitter on a 10s base gives a delay in [8s, 12s]. The lower bound
        // checked here is 7s — presumably slack for time elapsed since the
        // failure was recorded.
        let remaining = remaining.unwrap();
        assert!(remaining <= Duration::from_secs(12));
        assert!(remaining >= Duration::from_secs(7));
    }

    /// `clear` drops the backoff state of every tracked peer at once.
    #[test]
    fn test_clear_removes_all_backoff_state() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(1),
            Duration::from_secs(300),
            1024,
        );
        let peers = [loopback(8080), loopback(8081)];

        for peer in peers {
            tracker.record_failure(peer);
        }
        for peer in peers {
            assert!(tracker.is_in_backoff(peer));
        }

        tracker.clear();

        for peer in peers {
            assert!(!tracker.is_in_backoff(peer));
        }
    }
}