// synapse_pingora/crawler/config.rs
//! Configuration for crawler detection.
//!
//! ## Security
//! - `dns_failure_policy` controls fail-secure behavior during DNS outages
//! - `max_concurrent_dns_lookups` prevents resource exhaustion at scale
//! - `max_stats_entries` bounds memory usage from novel bot names

use serde::{Deserialize, Serialize};
/// Policy for handling DNS verification failures.
///
/// Determines what happens when a DNS lookup fails (timeout, server error,
/// etc.) for a request claiming to be a legitimate crawler like Googlebot.
///
/// Serialized in `snake_case` (e.g. `apply_risk_penalty`); defaults to
/// [`DnsFailurePolicy::ApplyRiskPenalty`] when omitted from config.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum DnsFailurePolicy {
    /// Allow the request through (fail-open) - NOT RECOMMENDED for production.
    /// Use only for debugging or low-security environments.
    Allow,

    /// Apply a risk penalty and continue (default, fail-cautious).
    /// The request proceeds but with an elevated risk score for downstream decisions.
    #[default]
    ApplyRiskPenalty,

    /// Block the request entirely (fail-secure).
    /// Most restrictive - may cause false positives during DNS outages.
    Block,
}
30
/// Configuration for crawler detection and verification.
///
/// Obtain production-oriented defaults via [`Default`]; check user-supplied
/// values with `validate()` before use.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerConfig {
    /// Enable crawler detection.
    pub enabled: bool,

    /// DNS cache TTL in seconds (default: 300 = 5 min).
    pub dns_cache_ttl_secs: u64,

    /// Verification result cache TTL in seconds (default: 3600 = 1 hour).
    pub verification_cache_ttl_secs: u64,

    /// Maximum cache entries (default: 50000, sized for high-traffic deployments).
    pub max_cache_entries: u64,

    /// DNS lookup timeout in milliseconds (default: 2000 - reduced from 5000).
    pub dns_timeout_ms: u64,

    /// Maximum concurrent DNS lookups, to prevent resource exhaustion (default: 100).
    pub max_concurrent_dns_lookups: usize,

    /// Verify legitimate crawlers via DNS.
    pub verify_legitimate_crawlers: bool,

    /// Block detected bad bots.
    pub block_bad_bots: bool,

    /// Policy when DNS verification fails (timeout, server error, etc.).
    /// `#[serde(default)]` keeps older config files (without this key) loadable.
    #[serde(default)]
    pub dns_failure_policy: DnsFailurePolicy,

    /// Risk penalty to apply when DNS verification fails
    /// (only used with the `ApplyRiskPenalty` policy; `validate()` caps it at 100).
    pub dns_failure_risk_penalty: u32,

    /// Maximum entries in per-crawler/per-bot stats maps (prevents unbounded
    /// memory growth from novel bot names).
    pub max_stats_entries: usize,
}
68
69impl Default for CrawlerConfig {
70 fn default() -> Self {
71 Self {
72 enabled: true,
73 dns_cache_ttl_secs: 300,
74 verification_cache_ttl_secs: 3600,
75 // Increased from 10,000 for high-traffic deployments
76 max_cache_entries: 50_000,
77 // Reduced from 5000ms to prevent bottlenecks
78 dns_timeout_ms: 2_000,
79 // Limit concurrent DNS lookups to prevent resource exhaustion
80 max_concurrent_dns_lookups: 100,
81 verify_legitimate_crawlers: true,
82 block_bad_bots: true,
83 // Default to fail-cautious: apply risk penalty but don't block
84 dns_failure_policy: DnsFailurePolicy::ApplyRiskPenalty,
85 dns_failure_risk_penalty: 50,
86 // Limit stats map sizes to prevent unbounded growth
87 max_stats_entries: 1000,
88 }
89 }
90}
91
92impl CrawlerConfig {
93 /// Validate configuration values
94 pub fn validate(&self) -> Result<(), String> {
95 if self.dns_timeout_ms == 0 {
96 return Err("dns_timeout_ms must be greater than 0".to_string());
97 }
98 if self.dns_timeout_ms > 30_000 {
99 return Err("dns_timeout_ms should not exceed 30 seconds".to_string());
100 }
101 if self.max_concurrent_dns_lookups == 0 {
102 return Err("max_concurrent_dns_lookups must be greater than 0".to_string());
103 }
104 if self.dns_failure_risk_penalty > 100 {
105 return Err("dns_failure_risk_penalty should not exceed 100".to_string());
106 }
107 Ok(())
108 }
109}