1use dashmap::DashMap;
9use parking_lot::RwLock;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::borrow::Cow;
13use std::net::IpAddr;
14use std::sync::atomic::{AtomicU64, Ordering};
15use std::time::{Duration, Instant};
16
17pub use super::bad_bots::{BadBotSeverity, BadBotSignature, BAD_BOT_SIGNATURES};
18use super::cache::VerificationCache;
19use super::config::{CrawlerConfig, DnsFailurePolicy};
20use super::dns_resolver::{DnsError, DnsResolver};
21use super::known_crawlers::{CrawlerDefinition, KNOWN_CRAWLERS};
22
/// User-Agent strings longer than this are rejected outright by `verify`
/// and flagged as suspicious (`input_rejected = true`).
pub const MAX_USER_AGENT_LENGTH: usize = 512;
25
/// Abstraction over crawler detection so callers can use either the real
/// [`CrawlerDetector`] or a test double (see `MockCrawlerDetector`).
#[async_trait::async_trait]
pub trait CrawlerDetection: Send + Sync {
    /// Classifies `user_agent` and `client_ip`, verifying crawler identity
    /// claims where the implementation supports it.
    async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult;

    /// Whether detection is enabled at all.
    fn is_enabled(&self) -> bool;

    /// Whether callers should block user agents that match bad-bot signatures.
    fn should_block_bad_bots(&self) -> bool;

    /// Point-in-time snapshot of the implementation's counters.
    fn stats(&self) -> CrawlerStatsSnapshot;
}
45
/// Exclusion tokens for generic bad-bot signatures.
///
/// If the lowercased user agent contains any returned token, the signature is
/// vetoed — this keeps broad patterns (e.g. generic "bot"/"crawler" matches)
/// from flagging legitimate, well-known crawlers. Unknown signature names get
/// an empty list (no exclusions).
fn get_exclusions(signature_name: &str) -> &'static [&'static str] {
    // Major search engines exempted from the generic crawler/spider patterns.
    const SEARCH_ENGINES: &[&str] =
        &["googlebot", "bingbot", "yandexbot", "baiduspider", "slurp"];

    // Wider allow-list for the catch-all "GenericBot" signature: search
    // engines plus social/preview bots.
    const WELL_KNOWN_BOTS: &[&str] = &[
        "googlebot",
        "bingbot",
        "yandexbot",
        "baiduspider",
        "facebookexternalhit",
        "twitterbot",
        "linkedinbot",
        "applebot",
        "pinterestbot",
        "slackbot",
        "discordbot",
    ];

    match signature_name {
        "GenericBot" => WELL_KNOWN_BOTS,
        "GenericCrawler" | "GenericSpider" => SEARCH_ENGINES,
        "PythonUrllib" => &["googlebot"],
        _ => &[],
    }
}
70
/// How a crawler's identity claim was (or was not) confirmed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VerificationMethod {
    /// Confirmed via a reverse-DNS lookup whose hostname matched the
    /// crawler's expected DNS pattern.
    Dns,
    /// Confirmed because the client IP fell inside the crawler's published
    /// IP ranges.
    IpRange,
    /// No confirmation was performed or possible.
    Unverified,
}
82
/// Outcome of a single user-agent / client-IP verification pass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerVerificationResult {
    /// True when the UA matched a known-crawler pattern.
    pub is_crawler: bool,
    /// Name of the matched known crawler, if any.
    pub crawler_name: Option<String>,
    /// True when the crawler claim was confirmed, or the crawler requires no
    /// verification.
    pub verified: bool,
    /// How `verified` was established.
    pub verification_method: VerificationMethod,
    /// The UA string matched a known-crawler UA regex.
    pub user_agent_match: bool,
    /// The PTR hostname matched the crawler's reverse-DNS regex.
    pub reverse_dns_match: bool,
    /// The client IP fell inside the crawler's published IP ranges.
    pub ip_range_match: bool,
    /// Something about this request warrants elevated scrutiny.
    pub suspicious: bool,
    /// Human-readable explanations for `suspicious`.
    pub suspicion_reasons: Vec<Cow<'static, str>>,
    /// Name of the matched bad-bot signature, if any.
    pub bad_bot_match: Option<String>,
    /// Severity of the matched bad-bot signature, if any.
    pub bad_bot_severity: Option<BadBotSeverity>,
    /// The UA was rejected before analysis (exceeded `MAX_USER_AGENT_LENGTH`).
    pub input_rejected: bool,
    /// Risk penalty applied when DNS verification was unavailable and the
    /// configured policy is `ApplyRiskPenalty`; 0 otherwise.
    pub dns_failure_penalty: u32,
}
113
114impl Default for CrawlerVerificationResult {
115 fn default() -> Self {
116 Self {
117 is_crawler: false,
118 crawler_name: None,
119 verified: false,
120 verification_method: VerificationMethod::Unverified,
121 user_agent_match: false,
122 reverse_dns_match: false,
123 ip_range_match: false,
124 suspicious: false,
125 suspicion_reasons: Vec::new(),
126 bad_bot_match: None,
127 bad_bot_severity: None,
128 input_rejected: false,
129 dns_failure_penalty: 0,
130 }
131 }
132}
133
/// Lock-free running counters for the detector; updated with relaxed atomics
/// on every verification.
#[derive(Debug)]
pub struct CrawlerStats {
    /// Total calls to `verify` (including cache hits and rejections).
    pub total_verifications: AtomicU64,
    /// Crawler claims confirmed via DNS or IP range.
    pub verified_crawlers: AtomicU64,
    /// Crawler matches that were not (or could not be) confirmed.
    pub unverified_crawlers: AtomicU64,
    /// User agents matching a bad-bot signature.
    pub bad_bots: AtomicU64,
    /// Verification results served from the cache.
    pub cache_hits: AtomicU64,
    /// Verifications computed fresh (cache miss).
    pub cache_misses: AtomicU64,
    /// Successful reverse-DNS lookups.
    pub dns_successes: AtomicU64,
    /// Failed reverse-DNS lookups (excluding rate limiting).
    pub dns_failures: AtomicU64,
    /// DNS lookups refused by the resolver's rate limiter.
    pub dns_rate_limited: AtomicU64,
    /// User agents rejected before analysis (oversized input).
    pub input_rejected: AtomicU64,
    /// Per-crawler hit counts; bounded by `max_stats_entries`.
    pub by_crawler_name: DashMap<String, u64>,
    /// Per-bad-bot hit counts; bounded by `max_stats_entries`.
    pub by_bad_bot: DashMap<String, u64>,
}
152
153impl CrawlerStats {
154 pub fn new() -> Self {
155 Self {
156 total_verifications: AtomicU64::new(0),
157 verified_crawlers: AtomicU64::new(0),
158 unverified_crawlers: AtomicU64::new(0),
159 bad_bots: AtomicU64::new(0),
160 cache_hits: AtomicU64::new(0),
161 cache_misses: AtomicU64::new(0),
162 dns_successes: AtomicU64::new(0),
163 dns_failures: AtomicU64::new(0),
164 dns_rate_limited: AtomicU64::new(0),
165 input_rejected: AtomicU64::new(0),
166 by_crawler_name: DashMap::new(),
167 by_bad_bot: DashMap::new(),
168 }
169 }
170}
171
172impl Default for CrawlerStats {
173 fn default() -> Self {
174 Self::new()
175 }
176}
177
/// Serializable point-in-time copy of [`CrawlerStats`]' scalar counters
/// (per-name maps are exposed via the distribution getters instead).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerStatsSnapshot {
    pub total_verifications: u64,
    pub verified_crawlers: u64,
    pub unverified_crawlers: u64,
    pub bad_bots: u64,
    pub cache_hits: u64,
    pub cache_misses: u64,
    pub dns_successes: u64,
    pub dns_failures: u64,
    pub dns_rate_limited: u64,
    pub input_rejected: u64,
}
192
193impl From<&CrawlerStats> for CrawlerStatsSnapshot {
194 fn from(stats: &CrawlerStats) -> Self {
195 Self {
196 total_verifications: stats.total_verifications.load(Ordering::Relaxed),
197 verified_crawlers: stats.verified_crawlers.load(Ordering::Relaxed),
198 unverified_crawlers: stats.unverified_crawlers.load(Ordering::Relaxed),
199 bad_bots: stats.bad_bots.load(Ordering::Relaxed),
200 cache_hits: stats.cache_hits.load(Ordering::Relaxed),
201 cache_misses: stats.cache_misses.load(Ordering::Relaxed),
202 dns_successes: stats.dns_successes.load(Ordering::Relaxed),
203 dns_failures: stats.dns_failures.load(Ordering::Relaxed),
204 dns_rate_limited: stats.dns_rate_limited.load(Ordering::Relaxed),
205 input_rejected: stats.input_rejected.load(Ordering::Relaxed),
206 }
207 }
208}
209
/// A known-crawler definition with its user-agent and reverse-DNS regexes
/// compiled ahead of time (done once in `CrawlerDetector::new`).
#[derive(Debug)]
struct CompiledCrawlerPattern {
    /// Matches the crawler's claimed User-Agent string.
    ua_regex: Regex,
    /// Validates the PTR hostname during DNS verification.
    dns_regex: Regex,
    /// The static definition this pattern was compiled from.
    definition: &'static CrawlerDefinition,
}
217
/// A bad-bot signature with its detection regex compiled ahead of time.
#[derive(Debug)]
struct CompiledBadBotPattern {
    /// Matches user agents of the malicious/abusive tool.
    regex: Regex,
    /// The static signature (name, severity, pattern) this was compiled from.
    signature: &'static BadBotSignature,
}
224
/// Snapshot cache for the distribution getters: the instant the snapshot was
/// built plus the full count-descending (name, count) list; `None` until the
/// first build.
type BotDistributionCache = Option<(Instant, Vec<(String, u64)>)>;
228
/// Detects and verifies web crawlers from (user agent, client IP) pairs and
/// flags known bad bots, caching results and keeping running statistics.
pub struct CrawlerDetector {
    /// Behavior switches: enablement, blocking, DNS policy, cache/stat limits.
    config: CrawlerConfig,
    /// Present only when `verify_legitimate_crawlers` is enabled.
    dns: Option<DnsResolver>,
    /// Lock-free counters updated on every verification.
    stats: CrawlerStats,
    /// Memoizes full verification results keyed by (UA, IP).
    cache: VerificationCache,
    /// Pre-compiled UA + reverse-DNS regexes for known crawlers.
    crawler_patterns: Vec<CompiledCrawlerPattern>,
    /// Pre-compiled bad-bot signature regexes.
    bad_bot_patterns: Vec<CompiledBadBotPattern>,

    /// One-second snapshot cache for `get_crawler_distribution`.
    crawler_dist_cache: RwLock<BotDistributionCache>,
    /// One-second snapshot cache for `get_bad_bot_distribution`.
    bad_bot_dist_cache: RwLock<BotDistributionCache>,
}
243
244impl CrawlerDetector {
245 pub async fn new(config: CrawlerConfig) -> Result<Self, String> {
247 config.validate()?;
249
250 let mut crawler_patterns = Vec::new();
252 for def in KNOWN_CRAWLERS {
253 let ua_regex = Regex::new(def.user_agent_pattern)
254 .map_err(|e| format!("Invalid UA pattern for {}: {}", def.name, e))?;
255 let dns_regex = Regex::new(def.reverse_dns_pattern)
256 .map_err(|e| format!("Invalid DNS pattern for {}: {}", def.name, e))?;
257 crawler_patterns.push(CompiledCrawlerPattern {
258 ua_regex,
259 dns_regex,
260 definition: def,
261 });
262 }
263
264 let mut bad_bot_patterns = Vec::new();
266 for sig in BAD_BOT_SIGNATURES {
267 let regex = Regex::new(sig.pattern)
268 .map_err(|e| format!("Invalid bad bot pattern for {}: {}", sig.name, e))?;
269 bad_bot_patterns.push(CompiledBadBotPattern {
270 regex,
271 signature: sig,
272 });
273 }
274
275 let dns = if config.verify_legitimate_crawlers {
277 Some(
278 DnsResolver::new(config.dns_timeout_ms, config.max_concurrent_dns_lookups)
279 .await
280 .map_err(|e| format!("Failed to create DNS resolver: {}", e))?,
281 )
282 } else {
283 None
284 };
285
286 let cache = VerificationCache::new(&config);
287
288 Ok(Self {
289 config,
290 cache,
291 dns,
292 stats: CrawlerStats::new(),
293 crawler_patterns,
294 bad_bot_patterns,
295 crawler_dist_cache: RwLock::new(None),
296 bad_bot_dist_cache: RwLock::new(None),
297 })
298 }
299
300 pub fn disabled() -> Self {
302 let mut config = CrawlerConfig::default();
303 config.enabled = false;
304 config.verify_legitimate_crawlers = false;
305 config.block_bad_bots = false;
306
307 Self {
308 cache: VerificationCache::new(&config),
309 dns: None,
310 stats: CrawlerStats::new(),
311 crawler_patterns: Vec::new(),
312 bad_bot_patterns: Vec::new(),
313 config,
314 crawler_dist_cache: RwLock::new(None),
315 bad_bot_dist_cache: RwLock::new(None),
316 }
317 }
318
319 pub async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult {
326 self.stats
327 .total_verifications
328 .fetch_add(1, Ordering::Relaxed);
329
330 if user_agent.len() > MAX_USER_AGENT_LENGTH {
332 self.stats.input_rejected.fetch_add(1, Ordering::Relaxed);
333 tracing::warn!(
334 ip = %client_ip,
335 ua_len = user_agent.len(),
336 "Rejected oversized User-Agent (max {})",
337 MAX_USER_AGENT_LENGTH
338 );
339 return CrawlerVerificationResult {
340 suspicious: true,
341 input_rejected: true,
342 suspicion_reasons: vec![Cow::Borrowed("User-Agent exceeds maximum allowed length")],
343 ..Default::default()
344 };
345 }
346
347 let cache_key = VerificationCache::cache_key(user_agent, client_ip);
349 if let Some(cached) = self.cache.get_verification(&cache_key) {
350 self.stats.cache_hits.fetch_add(1, Ordering::Relaxed);
351 return cached;
352 }
353 self.stats.cache_misses.fetch_add(1, Ordering::Relaxed);
354
355 let mut result = CrawlerVerificationResult::default();
356
357 if let Some(bad_bot) = self.check_bad_bot(user_agent) {
359 self.stats.bad_bots.fetch_add(1, Ordering::Relaxed);
360 self.record_bad_bot_stat(bad_bot.name);
362
363 result.bad_bot_match = Some(bad_bot.name.to_string());
364 result.bad_bot_severity = Some(bad_bot.severity);
365 result.suspicious = true;
366 result
368 .suspicion_reasons
369 .push(Cow::Borrowed("Matched known malicious bot signature"));
370
371 self.cache.put_verification(cache_key, result.clone());
373 return result;
374 }
375
376 let crawler_match = self.match_crawler_ua(user_agent);
378 if let Some(pattern) = crawler_match {
379 result.is_crawler = true;
380 result.crawler_name = Some(pattern.definition.name.to_string());
381 result.user_agent_match = true;
382
383 if pattern.definition.verification_required && self.config.verify_legitimate_crawlers {
385 result = self.verify_crawler(result, pattern, client_ip).await;
386 } else {
387 result.verified = !pattern.definition.verification_required;
388 result.verification_method = VerificationMethod::Unverified;
389 self.stats
390 .unverified_crawlers
391 .fetch_add(1, Ordering::Relaxed);
392 }
393
394 self.record_crawler_stat(pattern.definition.name);
396 }
397
398 self.cache.put_verification(cache_key, result.clone());
400 result
401 }
402
403 fn record_crawler_stat(&self, name: &str) {
405 if self.stats.by_crawler_name.len() < self.config.max_stats_entries {
406 *self
407 .stats
408 .by_crawler_name
409 .entry(name.to_string())
410 .or_insert(0) += 1;
411 } else if self.stats.by_crawler_name.contains_key(name) {
412 *self
414 .stats
415 .by_crawler_name
416 .entry(name.to_string())
417 .or_insert(0) += 1;
418 }
419 }
421
422 fn record_bad_bot_stat(&self, name: &str) {
424 if self.stats.by_bad_bot.len() < self.config.max_stats_entries
425 || self.stats.by_bad_bot.contains_key(name)
426 {
427 *self.stats.by_bad_bot.entry(name.to_string()).or_insert(0) += 1;
428 }
429 }
430
431 fn match_crawler_ua(&self, user_agent: &str) -> Option<&CompiledCrawlerPattern> {
433 self.crawler_patterns
434 .iter()
435 .find(|p| p.ua_regex.is_match(user_agent))
436 }
437
438 async fn verify_crawler(
444 &self,
445 mut result: CrawlerVerificationResult,
446 pattern: &CompiledCrawlerPattern,
447 client_ip: IpAddr,
448 ) -> CrawlerVerificationResult {
449 let dns = match &self.dns {
450 Some(d) => d,
451 None => {
452 result.verification_method = VerificationMethod::Unverified;
453 return result;
454 }
455 };
456
457 if let Some(ranges) = pattern.definition.ip_ranges {
459 if self.check_ip_ranges(client_ip, ranges) {
460 result.verified = true;
461 result.ip_range_match = true;
462 result.verification_method = VerificationMethod::IpRange;
463 self.stats.verified_crawlers.fetch_add(1, Ordering::Relaxed);
464 return result;
465 }
466 }
467
468 match dns.verify_ip(client_ip).await {
470 Ok((verified, hostname)) => {
471 self.stats.dns_successes.fetch_add(1, Ordering::Relaxed);
472
473 if let Some(ref hostname) = hostname {
474 result.reverse_dns_match = pattern.dns_regex.is_match(hostname);
475
476 if verified && result.reverse_dns_match {
477 result.verified = true;
478 result.verification_method = VerificationMethod::Dns;
479 self.stats.verified_crawlers.fetch_add(1, Ordering::Relaxed);
480 } else {
481 tracing::warn!(
484 ip = %client_ip,
485 claimed_crawler = %pattern.definition.name,
486 hostname = %hostname,
487 "Crawler verification failed: DNS hostname mismatch"
488 );
489 result.suspicious = true;
490 result
491 .suspicion_reasons
492 .push(Cow::Borrowed("Crawler claim could not be verified via DNS"));
493 self.stats
494 .unverified_crawlers
495 .fetch_add(1, Ordering::Relaxed);
496 }
497 } else {
498 tracing::warn!(
499 ip = %client_ip,
500 claimed_crawler = %pattern.definition.name,
501 "Crawler verification failed: no PTR record"
502 );
503 result.suspicious = true;
504 result.suspicion_reasons.push(Cow::Borrowed(
505 "Crawler claim could not be verified: no reverse DNS",
506 ));
507 self.stats
508 .unverified_crawlers
509 .fetch_add(1, Ordering::Relaxed);
510 }
511 }
512 Err(DnsError::RateLimited) => {
513 self.stats.dns_rate_limited.fetch_add(1, Ordering::Relaxed);
514 tracing::warn!(ip = %client_ip, "DNS verification rate limited");
515 self.apply_dns_failure_policy(&mut result, pattern.definition.name);
517 }
518 Err(e) => {
519 self.stats.dns_failures.fetch_add(1, Ordering::Relaxed);
520 tracing::debug!(ip = %client_ip, error = %e, "DNS verification failed");
521 self.apply_dns_failure_policy(&mut result, pattern.definition.name);
523 }
524 }
525
526 result
527 }
528
529 fn apply_dns_failure_policy(
534 &self,
535 result: &mut CrawlerVerificationResult,
536 _crawler_name: &str,
537 ) {
538 match self.config.dns_failure_policy {
539 DnsFailurePolicy::Allow => {
540 tracing::debug!("DNS failure policy: allowing unverified crawler");
542 self.stats
543 .unverified_crawlers
544 .fetch_add(1, Ordering::Relaxed);
545 }
546 DnsFailurePolicy::ApplyRiskPenalty => {
547 result.dns_failure_penalty = self.config.dns_failure_risk_penalty;
549 result.suspicion_reasons.push(Cow::Borrowed(
550 "DNS verification unavailable - temporary penalty applied",
551 ));
552 self.stats
553 .unverified_crawlers
554 .fetch_add(1, Ordering::Relaxed);
555 }
556 DnsFailurePolicy::Block => {
557 result.suspicious = true;
559 result
560 .suspicion_reasons
561 .push(Cow::Borrowed("DNS verification required but unavailable"));
562 self.stats
563 .unverified_crawlers
564 .fetch_add(1, Ordering::Relaxed);
565 }
566 }
567 }
568
569 fn check_ip_ranges(&self, ip: IpAddr, ranges: &[&str]) -> bool {
571 for range in ranges {
572 if let Ok(network) = range.parse::<ipnet::IpNet>() {
573 if network.contains(&ip) {
574 return true;
575 }
576 }
577 }
578 false
579 }
580
581 pub fn check_bad_bot(&self, user_agent: &str) -> Option<&'static BadBotSignature> {
587 let ua_lower = user_agent.to_lowercase();
588
589 self.bad_bot_patterns
590 .iter()
591 .find(|p| {
592 if !p.regex.is_match(user_agent) {
594 return false;
595 }
596
597 let exclusions = get_exclusions(p.signature.name);
599 for excluded in exclusions {
600 if ua_lower.contains(excluded) {
601 return false;
602 }
603 }
604
605 true
606 })
607 .map(|p| p.signature)
608 }
609
610 pub fn stats(&self) -> CrawlerStatsSnapshot {
612 CrawlerStatsSnapshot::from(&self.stats)
613 }
614
615 pub fn is_enabled(&self) -> bool {
617 self.config.enabled
618 }
619
620 pub fn config(&self) -> &CrawlerConfig {
622 &self.config
623 }
624
625 pub fn should_block_bad_bots(&self) -> bool {
627 self.config.block_bad_bots
628 }
629
630 pub fn get_crawler_distribution(&self, limit: usize) -> Vec<(String, u64)> {
632 {
633 let cache = self.crawler_dist_cache.read();
634 if let Some((timestamp, data)) = &*cache {
635 if timestamp.elapsed() < Duration::from_secs(1) {
636 let mut result = data.clone();
637 result.truncate(limit);
638 return result;
639 }
640 }
641 }
642
643 let mut dist: Vec<_> = self
644 .stats
645 .by_crawler_name
646 .iter()
647 .map(|entry| (entry.key().clone(), *entry.value()))
648 .collect();
649 dist.sort_by(|a, b| b.1.cmp(&a.1));
650
651 {
652 let mut cache = self.crawler_dist_cache.write();
653 *cache = Some((Instant::now(), dist.clone()));
654 }
655
656 dist.truncate(limit);
657 dist
658 }
659
660 pub fn get_bad_bot_distribution(&self, limit: usize) -> Vec<(String, u64)> {
662 {
663 let cache = self.bad_bot_dist_cache.read();
664 if let Some((timestamp, data)) = &*cache {
665 if timestamp.elapsed() < Duration::from_secs(1) {
666 let mut result = data.clone();
667 result.truncate(limit);
668 return result;
669 }
670 }
671 }
672
673 let mut dist: Vec<_> = self
674 .stats
675 .by_bad_bot
676 .iter()
677 .map(|entry| (entry.key().clone(), *entry.value()))
678 .collect();
679 dist.sort_by(|a, b| b.1.cmp(&a.1));
680
681 {
682 let mut cache = self.bad_bot_dist_cache.write();
683 *cache = Some((Instant::now(), dist.clone()));
684 }
685
686 dist.truncate(limit);
687 dist
688 }
689}
690
691#[async_trait::async_trait]
693impl CrawlerDetection for CrawlerDetector {
694 async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult {
695 CrawlerDetector::verify(self, user_agent, client_ip).await
697 }
698
699 fn is_enabled(&self) -> bool {
700 self.config.enabled
701 }
702
703 fn should_block_bad_bots(&self) -> bool {
704 self.config.block_bad_bots
705 }
706
707 fn stats(&self) -> CrawlerStatsSnapshot {
708 CrawlerStatsSnapshot::from(&self.stats)
709 }
710}
711
/// Test double for [`CrawlerDetection`]: returns canned results keyed by the
/// exact user-agent string.
#[cfg(test)]
pub struct MockCrawlerDetector {
    /// Value reported by `is_enabled`.
    pub enabled: bool,
    /// Value reported by `should_block_bad_bots`.
    pub block_bad_bots: bool,
    /// Canned verification results, keyed by user agent.
    pub results: std::collections::HashMap<String, CrawlerVerificationResult>,
}
719
#[cfg(test)]
impl MockCrawlerDetector {
    /// An enabled, blocking mock with no canned results registered.
    pub fn new() -> Self {
        MockCrawlerDetector {
            results: std::collections::HashMap::new(),
            enabled: true,
            block_bad_bots: true,
        }
    }

    /// Builder-style helper: registers `result` to be returned for the exact
    /// string `user_agent`.
    pub fn with_result(mut self, user_agent: &str, result: CrawlerVerificationResult) -> Self {
        self.results.insert(user_agent.to_owned(), result);
        self
    }
}
735
#[cfg(test)]
#[async_trait::async_trait]
impl CrawlerDetection for MockCrawlerDetector {
    /// Returns the canned result for `user_agent`; unknown agents get the
    /// default (nothing-detected) result. The IP is ignored.
    async fn verify(&self, user_agent: &str, _client_ip: IpAddr) -> CrawlerVerificationResult {
        match self.results.get(user_agent) {
            Some(result) => result.clone(),
            None => CrawlerVerificationResult::default(),
        }
    }

    fn is_enabled(&self) -> bool {
        self.enabled
    }

    fn should_block_bad_bots(&self) -> bool {
        self.block_bad_bots
    }

    /// The mock reports all-zero counters.
    fn stats(&self) -> CrawlerStatsSnapshot {
        CrawlerStatsSnapshot {
            total_verifications: 0,
            verified_crawlers: 0,
            unverified_crawlers: 0,
            bad_bots: 0,
            cache_hits: 0,
            cache_misses: 0,
            dns_successes: 0,
            dns_failures: 0,
            dns_rate_limited: 0,
            input_rejected: 0,
        }
    }
}
766
#[cfg(test)]
mod tests {
    use super::*;

    /// The sqlmap UA must hit the SQLMap signature (and no earlier one).
    #[test]
    fn test_bad_bot_detection() {
        let ua = "sqlmap/1.0";
        let hit = BAD_BOT_SIGNATURES
            .iter()
            .find(|sig| Regex::new(sig.pattern).unwrap().is_match(ua))
            .expect("SQLMap not detected");
        assert_eq!(hit.name, "SQLMap");
        assert_eq!(hit.severity, BadBotSeverity::High);
    }

    /// A genuine Googlebot UA must match the Googlebot definition first.
    #[test]
    fn test_crawler_pattern_matching() {
        let ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
        let hit = KNOWN_CRAWLERS
            .iter()
            .find(|def| Regex::new(def.user_agent_pattern).unwrap().is_match(ua))
            .expect("Googlebot not detected");
        assert_eq!(hit.name, "Googlebot");
    }

    /// An ordinary browser UA must match no bad-bot signature.
    #[test]
    fn test_normal_ua_not_detected() {
        let ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

        // Skip the empty-UA signature ("^$"): it exists to catch blank user
        // agents and cannot meaningfully apply to a non-empty string.
        for sig in BAD_BOT_SIGNATURES.iter().filter(|s| s.pattern != "^$") {
            let regex = Regex::new(sig.pattern).unwrap();
            assert!(
                !regex.is_match(ua),
                "Normal UA matched bad bot: {}",
                sig.name
            );
        }
    }
}