// freenet 0.2.78 - Docs.rs

//! Exponential backoff for failed peer connection attempts.
//!
//! This module provides per-peer backoff tracking by socket address to prevent
//! rapid repeated connection attempts to the same peer. Unlike `ConnectionBackoff`
//! which uses location buckets, this tracks individual peers precisely.
//!
//! See issue #2484 for motivation: telemetry showed peers attempting connections
//! every 4 seconds to the same target, with 58% of attempts within 5 seconds of
//! the previous attempt.

use crate::util::backoff::{ExponentialBackoff, TrackedBackoff};
use std::net::SocketAddr;
use std::time::Duration;

/// Tracks backoff state for failed connection attempts to specific peers.
///
/// Peers are keyed by their [`SocketAddr`]; every operation on this type is
/// forwarded to the wrapped [`TrackedBackoff`].
///
/// Uses exponential backoff: `base_interval * 2^(consecutive_failures-1)` capped at `max_backoff`.
/// First failure = base_interval, second = 2x, third = 4x, etc.
///
/// NOTE(review): the tests in this file state that `TrackedBackoff` applies ±20%
/// jitter to the scheduled retry time, so the observed remaining backoff can
/// differ from the formula above by that margin — confirm against `TrackedBackoff`.
#[derive(Debug)]
pub struct PeerConnectionBackoff {
    /// Per-address tracker that all methods of this type delegate to.
    inner: TrackedBackoff<SocketAddr>,
}

impl Default for PeerConnectionBackoff {
    fn default() -> Self {
        Self::new()
    }
}

impl PeerConnectionBackoff {
    /// Default base backoff interval (30 seconds).
    ///
    /// This is set high enough that even the first failure creates meaningful backoff.
    /// Connect requests arrive approximately every 60 seconds (operation timeout interval),
    /// so a 30-second base ensures the first failure already blocks half of subsequent attempts.
    /// See issue #2595 for context.
    const DEFAULT_BASE_INTERVAL: Duration = Duration::from_secs(30);

    /// Default maximum backoff interval (90 seconds).
    ///
    /// With 30s base and exponential growth (30s → 60s → 120s clamped to the
    /// 90s cap), persistent failures cap quickly at 90s by the third failure.
    /// The previous 600s cap was appropriate for random ring
    /// peers but far too aggressive for configured gateways: a single gateway in a
    /// 10-minute backoff means the node cannot bootstrap at all.  NAT traversal
    /// failures are transient (network change, temporary congestion) so a 90s cap
    /// gives the network time to stabilize without long-term isolation.
    ///
    /// `PeerConnectionBackoff` is currently used only for the `gateway_backoff`
    /// tracker.  If it is ever reused for ring peers, per-peer-class caps should
    /// be introduced via `with_config()` rather than raising this default.
    /// See issues #2595 and #3304.
    const DEFAULT_MAX_BACKOFF: Duration = Duration::from_secs(90);

    /// Default maximum number of tracked entries.
    const DEFAULT_MAX_ENTRIES: usize = 1024;

    /// Environment variable that overrides the base interval used by [`Self::new`].
    const BASE_INTERVAL_ENV_VAR: &str = "FREENET_BACKOFF_BASE_SECS";

    /// Create a new backoff tracker with default settings.
    ///
    /// Respects the `FREENET_BACKOFF_BASE_SECS` environment variable to override the
    /// base interval (useful for CI/integration tests where 30s is too aggressive).
    /// The value must be a whole number of seconds; surrounding whitespace is
    /// ignored. If the variable is set but cannot be parsed, a warning is logged
    /// and [`Self::DEFAULT_BASE_INTERVAL`] is used, so a typo in the override is
    /// visible instead of silently falling back to the default.
    ///
    /// The maximum backoff and entry limit always come from the `DEFAULT_*`
    /// constants; only the base interval can be overridden.
    pub fn new() -> Self {
        let base = match std::env::var(Self::BASE_INTERVAL_ENV_VAR) {
            Ok(raw) => match raw.trim().parse::<u64>() {
                Ok(secs) => Duration::from_secs(secs),
                Err(error) => {
                    tracing::warn!(
                        env_var = Self::BASE_INTERVAL_ENV_VAR,
                        value = %raw,
                        %error,
                        default_secs = Self::DEFAULT_BASE_INTERVAL.as_secs(),
                        "Ignoring unparseable backoff base interval override; using default"
                    );
                    Self::DEFAULT_BASE_INTERVAL
                }
            },
            // Variable not set (or not valid Unicode): no usable override.
            Err(_) => Self::DEFAULT_BASE_INTERVAL,
        };
        let config = ExponentialBackoff::new(base, Self::DEFAULT_MAX_BACKOFF);
        Self {
            inner: TrackedBackoff::new(config, Self::DEFAULT_MAX_ENTRIES),
        }
    }

    /// Create a new backoff tracker with custom settings.
    ///
    /// Unlike [`Self::new`], this never consults the environment, which makes it
    /// suitable for tests that need deterministic parameters.
    ///
    /// * `base_interval` - delay applied after the first failure.
    /// * `max_backoff` - upper bound the exponential delay is clamped to.
    /// * `max_entries` - maximum number of peers tracked at once.
    #[cfg(test)]
    pub fn with_config(base_interval: Duration, max_backoff: Duration, max_entries: usize) -> Self {
        let config = ExponentialBackoff::new(base_interval, max_backoff);
        Self {
            inner: TrackedBackoff::new(config, max_entries),
        }
    }

    /// Check if a target peer is currently in backoff.
    ///
    /// Returns `true` if we should skip this target, `false` if we can attempt connection.
    pub fn is_in_backoff(&self, peer_addr: SocketAddr) -> bool {
        self.inner.is_in_backoff(&peer_addr)
    }

    /// Get the remaining backoff duration for a peer, if any.
    ///
    /// Returns `Some(duration)` if peer is in backoff, `None` otherwise.
    pub fn remaining_backoff(&self, peer_addr: SocketAddr) -> Option<Duration> {
        self.inner.remaining_backoff(&peer_addr)
    }

    /// Record a connection failure for a target peer.
    ///
    /// Increments the failure count and calculates the next retry time.
    ///
    /// The `backoff_secs` value in the debug log is the deterministic delay from
    /// `delay_for_failures`; the tests in this file indicate the tracker adds
    /// jitter on top, so the actual retry time may differ slightly.
    pub fn record_failure(&mut self, peer_addr: SocketAddr) {
        // Derive the new failure number once, before recording. Saturating keeps
        // the arithmetic well-defined even for an absurdly long failure streak.
        let failure_number = self.inner.failure_count(&peer_addr).saturating_add(1);
        self.inner.record_failure(peer_addr);

        let backoff = self.inner.config().delay_for_failures(failure_number);
        tracing::debug!(
            peer = %peer_addr,
            failures = failure_number,
            backoff_secs = backoff.as_secs(),
            "Peer connection in backoff"
        );
    }

    /// Record a successful connection to a target peer.
    ///
    /// Clears the backoff state for that peer. A debug line is logged only when
    /// the peer actually had recorded failures, to avoid noise on every success.
    pub fn record_success(&mut self, peer_addr: SocketAddr) {
        if self.inner.failure_count(&peer_addr) > 0 {
            tracing::debug!(peer = %peer_addr, "Peer connection backoff cleared");
        }
        self.inner.record_success(&peer_addr);
    }

    /// Clean up expired backoff entries (those past their retry time and stale).
    ///
    /// Removes entries that are both past their retry_after time AND have been
    /// in backoff for longer than max_backoff (i.e., stale entries that haven't
    /// had recent failures). Called periodically to prevent unbounded growth.
    pub fn cleanup_expired(&mut self) {
        self.inner.cleanup_expired();
    }

    /// Clear all backoff state. Used during isolation recovery when all
    /// previous backoff timers are stale.
    pub fn clear(&mut self) {
        self.inner.clear();
    }

    /// Get the consecutive failure count for a peer (for testing).
    #[cfg(test)]
    fn failure_count(&self, peer_addr: SocketAddr) -> u32 {
        self.inner.failure_count(&peer_addr)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Loopback (`127.0.0.1`) socket address on `port`, standing in for a peer.
    fn peer(port: u16) -> SocketAddr {
        SocketAddr::from(([127, 0, 0, 1], port))
    }

    /// Regression test for issue #3304: the gateway backoff is bounded by 90s.
    ///
    /// `DEFAULT_MAX_BACKOFF` used to be 600s, so a node with a single configured
    /// gateway could stay isolated for up to 10 minutes after repeated NAT
    /// traversal failures.  This goes through `new()` — the production
    /// constructor — to check that the 90s cap is what production code gets.
    #[test]
    fn test_default_max_backoff_is_90s_for_gateway_recovery() {
        let addr = peer(8080);
        // Built via new() on purpose so the production path is also validated.
        let mut backoff = PeerConnectionBackoff::new();

        // Ten consecutive failures are more than enough to reach the cap.
        (0..10).for_each(|_| backoff.record_failure(addr));

        // TrackedBackoff applies ±20% jitter, so the largest legal value is
        // 90s * 1.2 = 108s.
        let ceiling = Duration::from_secs(108);
        let remaining = backoff.remaining_backoff(addr).unwrap();
        assert!(
            remaining <= ceiling,
            "Gateway backoff exceeded 90s cap + jitter: {remaining:?} — issue #3304"
        );
    }

    /// Regression guard for issue #3329 / #3304: however many consecutive
    /// failures accumulate, the gateway backoff must stop at the 90s cap and
    /// NEVER get to the old 600s (10-minute) blackout.
    ///
    /// In the production incident, a single configured gateway kept failing
    /// NAT traversal until its per-address backoff hit 600s, cutting the node
    /// off for ten minutes.  The check here uses the *deterministic* delay
    /// calculator (`delay_for_failures`, which applies no jitter) so the exact
    /// cap is pinned instead of a jittered upper bound.
    ///
    /// The tracker is created with `with_config` from the production
    /// `DEFAULT_*` constants rather than with `new()`: `new()` reads
    /// `FREENET_BACKOFF_BASE_SECS`, and CI sets that to 5s on the workspace
    /// test step, which would change the base.  Expected values are computed
    /// from the same constants so they follow the source of truth instead of
    /// drifting literals.
    #[test]
    fn test_gateway_backoff_progression_caps_at_90s_never_600s() {
        let base = PeerConnectionBackoff::DEFAULT_BASE_INTERVAL;
        let cap = PeerConnectionBackoff::DEFAULT_MAX_BACKOFF;
        let backoff = PeerConnectionBackoff::with_config(
            base,
            cap,
            PeerConnectionBackoff::DEFAULT_MAX_ENTRIES,
        );
        let config = backoff.inner.config();

        // With the production defaults (30s base, 90s cap) the deterministic
        // sequence is base → base*2 → base*2^2 clamped to the cap, i.e.
        // 30s → 60s → 120s-clamped-to-90s.
        assert_eq!(config.max(), cap);
        assert_eq!(config.delay_for_failures(1), base);
        assert_eq!(config.delay_for_failures(2), base * 2);
        assert_eq!(config.delay_for_failures(3), cap);

        // Starting with the 3rd failure the delay sits at the cap for good —
        // even a pathological 50-failure streak does not move it.  The
        // separate `< 600s` assertion is the named #3329/#3304 tripwire: 600
        // is the intentional historical literal that must never be reached.
        let old_blackout = Duration::from_secs(600);
        for failures in 3..=50 {
            let delay = config.delay_for_failures(failures);
            assert_eq!(
                delay, cap,
                "backoff escalated past the {cap:?} cap at {failures} failures — issue #3329"
            );
            assert!(
                delay < old_blackout,
                "backoff reached the old 600s blackout at {failures} failures — issue #3329"
            );
        }
    }

    /// A freshly created tracker reports no peer as being in backoff.
    #[test]
    fn test_not_in_backoff_initially() {
        let tracker = PeerConnectionBackoff::new();
        assert!(!tracker.is_in_backoff(peer(8080)));
    }

    /// One recorded failure is enough to put the peer into backoff.
    #[test]
    fn test_in_backoff_after_failure() {
        let addr = peer(8080);
        let mut tracker = PeerConnectionBackoff::new();

        tracker.record_failure(addr);

        assert!(tracker.is_in_backoff(addr));
    }

    /// A success after a failure removes the peer from backoff.
    #[test]
    fn test_backoff_cleared_on_success() {
        let addr = peer(8080);
        let mut tracker = PeerConnectionBackoff::new();

        tracker.record_failure(addr);
        assert!(tracker.is_in_backoff(addr));

        tracker.record_success(addr);
        assert!(!tracker.is_in_backoff(addr));
    }

    /// `delay_for_failures` follows `base * 2^(n-1)` while below the cap.
    #[test]
    fn test_exponential_backoff_calculation() {
        let config = ExponentialBackoff::new(Duration::from_secs(1), Duration::from_secs(300));

        // (failure count, expected delay in seconds) for a 1s base.
        for (failures, secs) in [(1, 1), (2, 2), (3, 4), (4, 8)] {
            assert_eq!(config.delay_for_failures(failures), Duration::from_secs(secs));
        }
    }

    /// Once the exponential value exceeds the maximum, the delay stays at the maximum.
    #[test]
    fn test_backoff_capped_at_max() {
        let max = Duration::from_secs(60);
        let config = ExponentialBackoff::new(Duration::from_secs(10), max);

        // Both counts are far beyond the point where 10s * 2^(n-1) passes 60s.
        for failures in [10, 20] {
            assert_eq!(config.delay_for_failures(failures), max);
        }
    }

    /// A failure for one address leaves other addresses unaffected.
    #[test]
    fn test_different_peers_tracked_separately() {
        let failing = peer(8080);
        let untouched = peer(8081);
        let mut tracker = PeerConnectionBackoff::new();

        tracker.record_failure(failing);

        assert!(tracker.is_in_backoff(failing));
        assert!(!tracker.is_in_backoff(untouched));
    }

    /// The tracker never holds more entries than its configured maximum.
    #[test]
    fn test_eviction_when_max_entries_exceeded() {
        // A deliberately tiny limit (10) so that 20 distinct peers overflow it.
        let mut tracker =
            PeerConnectionBackoff::with_config(Duration::from_secs(5), Duration::from_secs(300), 10);

        for port in 8080..8100 {
            tracker.record_failure(peer(port));
        }

        assert!(tracker.inner.len() <= 10);
    }

    /// Each further failure for the same peer bumps its failure count by one.
    #[test]
    fn test_consecutive_failures_increase_backoff() {
        let addr = peer(8080);
        let mut tracker =
            PeerConnectionBackoff::with_config(Duration::from_secs(1), Duration::from_secs(300), 1024);

        // After the first failure the count is 1, after the second it is 2.
        for expected in 1..=2 {
            tracker.record_failure(addr);
            assert_eq!(tracker.failure_count(addr), expected);
        }
    }

    /// `remaining_backoff` is `None` before any failure and close to the base
    /// interval right after the first one.
    #[test]
    fn test_remaining_backoff() {
        let addr = peer(8080);
        let mut tracker =
            PeerConnectionBackoff::with_config(Duration::from_secs(10), Duration::from_secs(300), 1024);

        // Nothing recorded yet, so there is nothing to wait for.
        assert!(tracker.remaining_backoff(addr).is_none());

        tracker.record_failure(addr);
        let remaining = tracker.remaining_backoff(addr);
        assert!(remaining.is_some());

        // A 10s base with ±20% jitter schedules the retry within [8s, 12s].
        // The lower bound checked is 7s — presumably slack for the time that
        // passes between recording the failure and reading the remainder.
        let remaining = remaining.unwrap();
        assert!(remaining <= Duration::from_secs(12));
        assert!(remaining >= Duration::from_secs(7));
    }

    /// `clear` drops the backoff state of every tracked peer at once.
    #[test]
    fn test_clear_removes_all_backoff_state() {
        let peers = [peer(8080), peer(8081)];
        let mut tracker =
            PeerConnectionBackoff::with_config(Duration::from_secs(1), Duration::from_secs(300), 1024);

        for addr in peers {
            tracker.record_failure(addr);
        }
        for addr in peers {
            assert!(tracker.is_in_backoff(addr));
        }

        tracker.clear();

        for addr in peers {
            assert!(!tracker.is_in_backoff(addr));
        }
    }
}