1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
//! Exponential backoff for failed peer connection attempts.
//!
//! This module provides per-peer backoff tracking by socket address to prevent
//! rapid repeated connection attempts to the same peer. Unlike `ConnectionBackoff`
//! which uses location buckets, this tracks individual peers precisely.
//!
//! See issue #2484 for motivation: telemetry showed peers attempting connections
//! every 4 seconds to the same target, with 58% of attempts within 5 seconds of
//! the previous attempt.
use crate::util::backoff::{ExponentialBackoff, TrackedBackoff};
use std::net::SocketAddr;
use std::time::Duration;
/// Tracks backoff state for failed connection attempts to specific peers.
///
/// Uses exponential backoff: `base_interval * 2^(consecutive_failures-1)` capped at `max_backoff`.
/// First failure = base_interval, second = 2x, third = 4x, etc.
///
/// This is a thin wrapper around [`TrackedBackoff`] keyed by `SocketAddr`,
/// so each peer's failure streak and retry window are tracked independently.
#[derive(Debug)]
pub struct PeerConnectionBackoff {
    // Per-address backoff state; capacity-bounded (see DEFAULT_MAX_ENTRIES)
    // to prevent unbounded memory growth from churned peers.
    inner: TrackedBackoff<SocketAddr>,
}
impl Default for PeerConnectionBackoff {
fn default() -> Self {
Self::new()
}
}
impl PeerConnectionBackoff {
    /// Default base backoff interval (30 seconds).
    ///
    /// This is set high enough that even the first failure creates meaningful backoff.
    /// Connect requests arrive approximately every 60 seconds (operation timeout interval),
    /// so a 30-second base ensures the first failure already blocks half of subsequent attempts.
    /// See issue #2595 for context.
    const DEFAULT_BASE_INTERVAL: Duration = Duration::from_secs(30);

    /// Default maximum backoff interval (90 seconds).
    ///
    /// With 30s base and exponential growth (30s → 60s → 90s), persistent failures
    /// cap quickly at 90s. The previous 600s cap was appropriate for random ring
    /// peers but far too aggressive for configured gateways: a single gateway in a
    /// 10-minute backoff means the node cannot bootstrap at all. NAT traversal
    /// failures are transient (network change, temporary congestion) so a 90s cap
    /// gives the network time to stabilize without long-term isolation.
    ///
    /// `PeerConnectionBackoff` is currently used only for the `gateway_backoff`
    /// tracker. If it is ever reused for ring peers, per-peer-class caps should
    /// be introduced via `with_config()` rather than raising this default.
    /// See issues #2595 and #3304.
    const DEFAULT_MAX_BACKOFF: Duration = Duration::from_secs(90);

    /// Default maximum number of tracked entries.
    const DEFAULT_MAX_ENTRIES: usize = 1024;

    /// Create a new backoff tracker with default settings.
    ///
    /// Respects `FREENET_BACKOFF_BASE_SECS` environment variable to override the
    /// base interval (useful for CI/integration tests where 30s is too aggressive).
    pub fn new() -> Self {
        // Only a parseable u64 overrides the default; anything else
        // (unset, non-numeric) silently falls back to the compiled-in base.
        let base = match std::env::var("FREENET_BACKOFF_BASE_SECS") {
            Ok(raw) => raw
                .parse::<u64>()
                .map(Duration::from_secs)
                .unwrap_or(Self::DEFAULT_BASE_INTERVAL),
            Err(_) => Self::DEFAULT_BASE_INTERVAL,
        };
        Self {
            inner: TrackedBackoff::new(
                ExponentialBackoff::new(base, Self::DEFAULT_MAX_BACKOFF),
                Self::DEFAULT_MAX_ENTRIES,
            ),
        }
    }

    /// Create a new backoff tracker with custom settings.
    #[cfg(test)]
    pub fn with_config(base_interval: Duration, max_backoff: Duration, max_entries: usize) -> Self {
        Self {
            inner: TrackedBackoff::new(
                ExponentialBackoff::new(base_interval, max_backoff),
                max_entries,
            ),
        }
    }

    /// Check if a target peer is currently in backoff.
    ///
    /// Returns `true` if we should skip this target, `false` if we can attempt connection.
    pub fn is_in_backoff(&self, peer_addr: SocketAddr) -> bool {
        self.inner.is_in_backoff(&peer_addr)
    }

    /// Get the remaining backoff duration for a peer, if any.
    ///
    /// Returns `Some(duration)` if peer is in backoff, `None` otherwise.
    pub fn remaining_backoff(&self, peer_addr: SocketAddr) -> Option<Duration> {
        self.inner.remaining_backoff(&peer_addr)
    }

    /// Record a connection failure for a target peer.
    ///
    /// Increments the failure count and calculates the next retry time.
    pub fn record_failure(&mut self, peer_addr: SocketAddr) {
        // Snapshot the count before recording so the log reflects the
        // post-increment streak without relying on the tracker's internals.
        let prior = self.inner.failure_count(&peer_addr);
        self.inner.record_failure(peer_addr);
        let streak = prior + 1;
        let delay = self.inner.config().delay_for_failures(streak);
        tracing::debug!(
            peer = %peer_addr,
            failures = streak,
            backoff_secs = delay.as_secs(),
            "Peer connection in backoff"
        );
    }

    /// Record a successful connection to a target peer.
    ///
    /// Clears the backoff state for that peer.
    pub fn record_success(&mut self, peer_addr: SocketAddr) {
        let had_failures = self.inner.failure_count(&peer_addr) > 0;
        if had_failures {
            tracing::debug!(peer = %peer_addr, "Peer connection backoff cleared");
        }
        self.inner.record_success(&peer_addr);
    }

    /// Clean up expired backoff entries (those past their retry time and stale).
    ///
    /// Removes entries that are both past their retry_after time AND have been
    /// in backoff for longer than max_backoff (i.e., stale entries that haven't
    /// had recent failures). Called periodically to prevent unbounded growth.
    pub fn cleanup_expired(&mut self) {
        self.inner.cleanup_expired();
    }

    /// Clear all backoff state. Used during isolation recovery when all
    /// previous backoff timers are stale.
    pub fn clear(&mut self) {
        self.inner.clear();
    }

    /// Get the consecutive failure count for a peer (for testing).
    #[cfg(test)]
    fn failure_count(&self, peer_addr: SocketAddr) -> u32 {
        self.inner.failure_count(&peer_addr)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a loopback address on the given port.
    fn addr(port: u16) -> SocketAddr {
        format!("127.0.0.1:{port}").parse().unwrap()
    }

    /// Regression test for issue #3304: gateway backoff must not exceed 90s.
    ///
    /// Before #3304, `DEFAULT_MAX_BACKOFF` was 600s. With a single configured
    /// gateway this meant the node could be isolated for up to 10 minutes after
    /// repeated NAT traversal failures. Verify that the production constructor
    /// (`new()`) enforces the 90s cap.
    #[test]
    fn test_default_max_backoff_is_90s_for_gateway_recovery() {
        // Use new() — the production constructor — to also validate that path.
        let mut tracker = PeerConnectionBackoff::new();
        let target = addr(8080);
        // Drive through many failures until the cap is hit.
        (0..10).for_each(|_| tracker.record_failure(target));
        let remaining = tracker.remaining_backoff(target).unwrap();
        // Must not exceed the 90s cap + 20% jitter (TrackedBackoff applies ±20% jitter).
        // Max possible: 90s * 1.2 = 108s.
        assert!(
            remaining <= Duration::from_secs(108),
            "Gateway backoff exceeded 90s cap + jitter: {remaining:?} — issue #3304"
        );
    }

    #[test]
    fn test_not_in_backoff_initially() {
        let tracker = PeerConnectionBackoff::new();
        assert!(!tracker.is_in_backoff(addr(8080)));
    }

    #[test]
    fn test_in_backoff_after_failure() {
        let mut tracker = PeerConnectionBackoff::new();
        let target = addr(8080);
        tracker.record_failure(target);
        assert!(tracker.is_in_backoff(target));
    }

    #[test]
    fn test_backoff_cleared_on_success() {
        let mut tracker = PeerConnectionBackoff::new();
        let target = addr(8080);
        tracker.record_failure(target);
        assert!(tracker.is_in_backoff(target));
        tracker.record_success(target);
        assert!(!tracker.is_in_backoff(target));
    }

    #[test]
    fn test_exponential_backoff_calculation() {
        let config = ExponentialBackoff::new(Duration::from_secs(1), Duration::from_secs(300));
        // Formula: base * 2^(n-1) via delay_for_failures
        let expected = [(1, 1), (2, 2), (3, 4), (4, 8)];
        for (failures, secs) in expected {
            assert_eq!(config.delay_for_failures(failures), Duration::from_secs(secs));
        }
    }

    #[test]
    fn test_backoff_capped_at_max() {
        let config = ExponentialBackoff::new(Duration::from_secs(10), Duration::from_secs(60));
        // After many failures, should be capped at 60s
        for failures in [10, 20] {
            assert_eq!(config.delay_for_failures(failures), Duration::from_secs(60));
        }
    }

    #[test]
    fn test_different_peers_tracked_separately() {
        let mut tracker = PeerConnectionBackoff::new();
        let (first, second) = (addr(8080), addr(8081));
        tracker.record_failure(first);
        // first should be in backoff, second should not
        assert!(tracker.is_in_backoff(first));
        assert!(!tracker.is_in_backoff(second));
    }

    #[test]
    fn test_eviction_when_max_entries_exceeded() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(5),
            Duration::from_secs(300),
            10, // Very low max for testing
        );
        // Add more than max entries
        for port in 8080..8100 {
            tracker.record_failure(addr(port));
        }
        // Should have at most max_entries
        assert!(tracker.inner.len() <= 10);
    }

    #[test]
    fn test_consecutive_failures_increase_backoff() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(1),
            Duration::from_secs(300),
            1024,
        );
        let target = addr(8080);
        // Each failure bumps the consecutive-failure count by one.
        for expected in 1..=2 {
            tracker.record_failure(target);
            assert_eq!(tracker.failure_count(target), expected);
        }
    }

    #[test]
    fn test_remaining_backoff() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(10),
            Duration::from_secs(300),
            1024,
        );
        let target = addr(8080);
        // No backoff initially
        assert!(tracker.remaining_backoff(target).is_none());
        // After failure, should have remaining backoff (with ±20% jitter)
        tracker.record_failure(target);
        let remaining = tracker.remaining_backoff(target).expect("backoff should be set");
        // With ±20% jitter, backoff should be in [8s, 12s]
        // (lower bound relaxed to 7s to absorb elapsed wall-clock time).
        assert!(remaining <= Duration::from_secs(12));
        assert!(remaining >= Duration::from_secs(7));
    }

    #[test]
    fn test_clear_removes_all_backoff_state() {
        let mut tracker = PeerConnectionBackoff::with_config(
            Duration::from_secs(1),
            Duration::from_secs(300),
            1024,
        );
        let peers = [addr(8080), addr(8081)];
        for &peer in &peers {
            tracker.record_failure(peer);
            assert!(tracker.is_in_backoff(peer));
        }
        tracker.clear();
        for &peer in &peers {
            assert!(!tracker.is_in_backoff(peer));
        }
    }
}