// synapse_pingora/crawler/config.rs
//! Configuration for crawler detection.
//!
//! ## Security
//! - `dns_failure_policy` controls fail-secure behavior during DNS outages
//! - `max_concurrent_dns_lookups` prevents resource exhaustion at scale
//! - `max_stats_entries` bounds memory usage from novel bot names
use serde::{Deserialize, Serialize};
10/// Policy for handling DNS verification failures.
11///
12/// This determines what happens when DNS lookup fails (timeout, server error, etc.)
13/// for a request claiming to be a legitimate crawler like Googlebot.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
15#[serde(rename_all = "snake_case")]
16pub enum DnsFailurePolicy {
17    /// Allow request through (fail-open) - NOT RECOMMENDED for production
18    /// Use only for debugging or low-security environments.
19    Allow,
20
21    /// Apply risk penalty and continue (default, fail-cautious)
22    /// Request proceeds but with elevated risk score for downstream decisions.
23    #[default]
24    ApplyRiskPenalty,
25
26    /// Block request entirely (fail-secure)
27    /// Most restrictive - may cause false positives during DNS outages.
28    Block,
29}
30
31/// Configuration for crawler detection and verification.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct CrawlerConfig {
34    /// Enable crawler detection
35    pub enabled: bool,
36
37    /// DNS cache TTL in seconds (default: 300 = 5 min)
38    pub dns_cache_ttl_secs: u64,
39
40    /// Verification result cache TTL in seconds (default: 3600 = 1 hour)
41    pub verification_cache_ttl_secs: u64,
42
43    /// Maximum cache entries (default: 50000 for high-traffic deployments)
44    pub max_cache_entries: u64,
45
46    /// DNS lookup timeout in milliseconds (default: 2000 - reduced from 5000)
47    pub dns_timeout_ms: u64,
48
49    /// Maximum concurrent DNS lookups to prevent resource exhaustion (default: 100)
50    pub max_concurrent_dns_lookups: usize,
51
52    /// Verify legitimate crawlers via DNS
53    pub verify_legitimate_crawlers: bool,
54
55    /// Block detected bad bots
56    pub block_bad_bots: bool,
57
58    /// Policy when DNS verification fails (timeout, server error, etc.)
59    #[serde(default)]
60    pub dns_failure_policy: DnsFailurePolicy,
61
62    /// Risk penalty to apply when DNS verification fails (only used with ApplyRiskPenalty policy)
63    pub dns_failure_risk_penalty: u32,
64
65    /// Maximum entries in per-crawler/per-bot stats maps (prevents unbounded growth)
66    pub max_stats_entries: usize,
67}
68
69impl Default for CrawlerConfig {
70    fn default() -> Self {
71        Self {
72            enabled: true,
73            dns_cache_ttl_secs: 300,
74            verification_cache_ttl_secs: 3600,
75            // Increased from 10,000 for high-traffic deployments
76            max_cache_entries: 50_000,
77            // Reduced from 5000ms to prevent bottlenecks
78            dns_timeout_ms: 2_000,
79            // Limit concurrent DNS lookups to prevent resource exhaustion
80            max_concurrent_dns_lookups: 100,
81            verify_legitimate_crawlers: true,
82            block_bad_bots: true,
83            // Default to fail-cautious: apply risk penalty but don't block
84            dns_failure_policy: DnsFailurePolicy::ApplyRiskPenalty,
85            dns_failure_risk_penalty: 50,
86            // Limit stats map sizes to prevent unbounded growth
87            max_stats_entries: 1000,
88        }
89    }
90}
91
92impl CrawlerConfig {
93    /// Validate configuration values
94    pub fn validate(&self) -> Result<(), String> {
95        if self.dns_timeout_ms == 0 {
96            return Err("dns_timeout_ms must be greater than 0".to_string());
97        }
98        if self.dns_timeout_ms > 30_000 {
99            return Err("dns_timeout_ms should not exceed 30 seconds".to_string());
100        }
101        if self.max_concurrent_dns_lookups == 0 {
102            return Err("max_concurrent_dns_lookups must be greater than 0".to_string());
103        }
104        if self.dns_failure_risk_penalty > 100 {
105            return Err("dns_failure_risk_penalty should not exceed 100".to_string());
106        }
107        Ok(())
108    }
109}