1use dashmap::DashMap;
9use parking_lot::RwLock;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::borrow::Cow;
13use std::net::IpAddr;
14use std::sync::atomic::{AtomicU64, Ordering};
15use std::time::{Duration, Instant};
16
17pub use super::bad_bots::{BadBotSeverity, BadBotSignature, BAD_BOT_SIGNATURES};
18use super::cache::VerificationCache;
19use super::config::{CrawlerConfig, DnsFailurePolicy};
20use super::dns_resolver::{DnsError, DnsResolver};
21use super::known_crawlers::{CrawlerDefinition, KNOWN_CRAWLERS};
22
/// User-Agent strings longer than this are rejected outright by `verify`
/// and flagged as suspicious (`input_rejected = true`).
pub const MAX_USER_AGENT_LENGTH: usize = 512;
25
/// Abstraction over crawler detection so callers can use either the real
/// [`CrawlerDetector`] or a test double (see `MockCrawlerDetector`).
#[async_trait::async_trait]
pub trait CrawlerDetection: Send + Sync {
    /// Classifies `user_agent` and `client_ip`, verifying crawler identity
    /// claims where the implementation supports it.
    async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult;

    /// Whether detection is enabled at all.
    fn is_enabled(&self) -> bool;

    /// Whether callers should block user agents that match bad-bot signatures.
    fn should_block_bad_bots(&self) -> bool;

    /// Point-in-time snapshot of the implementation's counters.
    fn stats(&self) -> CrawlerStatsSnapshot;
}
45
/// Exclusion tokens for generic bad-bot signatures.
///
/// If the lowercased user agent contains any returned token, the signature is
/// vetoed — this keeps broad patterns (e.g. generic "bot"/"crawler" matches)
/// from flagging legitimate, well-known crawlers. Unknown signature names get
/// an empty list (no exclusions).
fn get_exclusions(signature_name: &str) -> &'static [&'static str] {
    // Major search engines exempted from the generic crawler/spider patterns.
    const SEARCH_ENGINES: &[&str] =
        &["googlebot", "bingbot", "yandexbot", "baiduspider", "slurp"];

    // Wider allow-list for the catch-all "GenericBot" signature: search
    // engines plus social/preview bots.
    const WELL_KNOWN_BOTS: &[&str] = &[
        "googlebot",
        "bingbot",
        "yandexbot",
        "baiduspider",
        "facebookexternalhit",
        "twitterbot",
        "linkedinbot",
        "applebot",
        "pinterestbot",
        "slackbot",
        "discordbot",
    ];

    match signature_name {
        "GenericBot" => WELL_KNOWN_BOTS,
        "GenericCrawler" | "GenericSpider" => SEARCH_ENGINES,
        "PythonUrllib" => &["googlebot"],
        _ => &[],
    }
}
70
/// How a crawler's identity claim was (or was not) confirmed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VerificationMethod {
    /// Confirmed via a reverse-DNS lookup whose hostname matched the
    /// crawler's expected DNS pattern.
    Dns,
    /// Confirmed because the client IP fell inside the crawler's published
    /// IP ranges.
    IpRange,
    /// No confirmation was performed or possible.
    Unverified,
}
82
/// Outcome of a single user-agent / client-IP verification pass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerVerificationResult {
    /// True when the UA matched a known-crawler pattern.
    pub is_crawler: bool,
    /// Name of the matched known crawler, if any.
    pub crawler_name: Option<String>,
    /// True when the crawler claim was confirmed, or the crawler requires no
    /// verification.
    pub verified: bool,
    /// How `verified` was established.
    pub verification_method: VerificationMethod,
    /// The UA string matched a known-crawler UA regex.
    pub user_agent_match: bool,
    /// The PTR hostname matched the crawler's reverse-DNS regex.
    pub reverse_dns_match: bool,
    /// The client IP fell inside the crawler's published IP ranges.
    pub ip_range_match: bool,
    /// Something about this request warrants elevated scrutiny.
    pub suspicious: bool,
    /// Human-readable explanations for `suspicious`.
    pub suspicion_reasons: Vec<Cow<'static, str>>,
    /// Name of the matched bad-bot signature, if any.
    pub bad_bot_match: Option<String>,
    /// Severity of the matched bad-bot signature, if any.
    pub bad_bot_severity: Option<BadBotSeverity>,
    /// The UA was rejected before analysis (exceeded `MAX_USER_AGENT_LENGTH`).
    pub input_rejected: bool,
    /// Risk penalty applied when DNS verification was unavailable and the
    /// configured policy is `ApplyRiskPenalty`; 0 otherwise.
    pub dns_failure_penalty: u32,
}
113
114impl Default for CrawlerVerificationResult {
115 fn default() -> Self {
116 Self {
117 is_crawler: false,
118 crawler_name: None,
119 verified: false,
120 verification_method: VerificationMethod::Unverified,
121 user_agent_match: false,
122 reverse_dns_match: false,
123 ip_range_match: false,
124 suspicious: false,
125 suspicion_reasons: Vec::new(),
126 bad_bot_match: None,
127 bad_bot_severity: None,
128 input_rejected: false,
129 dns_failure_penalty: 0,
130 }
131 }
132}
133
/// Lock-free running counters for the detector; updated with relaxed atomics
/// on every verification.
#[derive(Debug)]
pub struct CrawlerStats {
    /// Total calls to `verify` (including cache hits and rejections).
    pub total_verifications: AtomicU64,
    /// Crawler claims confirmed via DNS or IP range.
    pub verified_crawlers: AtomicU64,
    /// Crawler matches that were not (or could not be) confirmed.
    pub unverified_crawlers: AtomicU64,
    /// User agents matching a bad-bot signature.
    pub bad_bots: AtomicU64,
    /// Verification results served from the cache.
    pub cache_hits: AtomicU64,
    /// Verifications computed fresh (cache miss).
    pub cache_misses: AtomicU64,
    /// Successful reverse-DNS lookups.
    pub dns_successes: AtomicU64,
    /// Failed reverse-DNS lookups (excluding rate limiting).
    pub dns_failures: AtomicU64,
    /// DNS lookups refused by the resolver's rate limiter.
    pub dns_rate_limited: AtomicU64,
    /// User agents rejected before analysis (oversized input).
    pub input_rejected: AtomicU64,
    /// Per-crawler hit counts; bounded by `max_stats_entries`.
    pub by_crawler_name: DashMap<String, u64>,
    /// Per-bad-bot hit counts; bounded by `max_stats_entries`.
    pub by_bad_bot: DashMap<String, u64>,
}
152
153impl CrawlerStats {
154 pub fn new() -> Self {
155 Self {
156 total_verifications: AtomicU64::new(0),
157 verified_crawlers: AtomicU64::new(0),
158 unverified_crawlers: AtomicU64::new(0),
159 bad_bots: AtomicU64::new(0),
160 cache_hits: AtomicU64::new(0),
161 cache_misses: AtomicU64::new(0),
162 dns_successes: AtomicU64::new(0),
163 dns_failures: AtomicU64::new(0),
164 dns_rate_limited: AtomicU64::new(0),
165 input_rejected: AtomicU64::new(0),
166 by_crawler_name: DashMap::new(),
167 by_bad_bot: DashMap::new(),
168 }
169 }
170}
171
172impl Default for CrawlerStats {
173 fn default() -> Self {
174 Self::new()
175 }
176}
177
/// Serializable point-in-time copy of [`CrawlerStats`]' scalar counters
/// (per-name maps are exposed via the distribution getters instead).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerStatsSnapshot {
    pub total_verifications: u64,
    pub verified_crawlers: u64,
    pub unverified_crawlers: u64,
    pub bad_bots: u64,
    pub cache_hits: u64,
    pub cache_misses: u64,
    pub dns_successes: u64,
    pub dns_failures: u64,
    pub dns_rate_limited: u64,
    pub input_rejected: u64,
}
192
193impl From<&CrawlerStats> for CrawlerStatsSnapshot {
194 fn from(stats: &CrawlerStats) -> Self {
195 Self {
196 total_verifications: stats.total_verifications.load(Ordering::Relaxed),
197 verified_crawlers: stats.verified_crawlers.load(Ordering::Relaxed),
198 unverified_crawlers: stats.unverified_crawlers.load(Ordering::Relaxed),
199 bad_bots: stats.bad_bots.load(Ordering::Relaxed),
200 cache_hits: stats.cache_hits.load(Ordering::Relaxed),
201 cache_misses: stats.cache_misses.load(Ordering::Relaxed),
202 dns_successes: stats.dns_successes.load(Ordering::Relaxed),
203 dns_failures: stats.dns_failures.load(Ordering::Relaxed),
204 dns_rate_limited: stats.dns_rate_limited.load(Ordering::Relaxed),
205 input_rejected: stats.input_rejected.load(Ordering::Relaxed),
206 }
207 }
208}
209
/// A known-crawler definition with its user-agent and reverse-DNS regexes
/// compiled ahead of time (done once in `CrawlerDetector::new`).
#[derive(Debug)]
struct CompiledCrawlerPattern {
    /// Matches the crawler's claimed User-Agent string.
    ua_regex: Regex,
    /// Validates the PTR hostname during DNS verification.
    dns_regex: Regex,
    /// The static definition this pattern was compiled from.
    definition: &'static CrawlerDefinition,
}
217
/// A bad-bot signature with its detection regex compiled ahead of time.
#[derive(Debug)]
struct CompiledBadBotPattern {
    /// Matches user agents of the malicious/abusive tool.
    regex: Regex,
    /// The static signature (name, severity, pattern) this was compiled from.
    signature: &'static BadBotSignature,
}
224
/// Snapshot cache for the distribution getters: the instant the snapshot was
/// built plus the full count-descending (name, count) list; `None` until the
/// first build.
type BotDistributionCache = Option<(Instant, Vec<(String, u64)>)>;
228
/// Detects and verifies web crawlers from (user agent, client IP) pairs and
/// flags known bad bots, caching results and keeping running statistics.
pub struct CrawlerDetector {
    /// Behavior switches: enablement, blocking, DNS policy, cache/stat limits.
    config: CrawlerConfig,
    /// Present only when `verify_legitimate_crawlers` is enabled.
    dns: Option<DnsResolver>,
    /// Lock-free counters updated on every verification.
    stats: CrawlerStats,
    /// Memoizes full verification results keyed by (UA, IP).
    cache: VerificationCache,
    /// Pre-compiled UA + reverse-DNS regexes for known crawlers.
    crawler_patterns: Vec<CompiledCrawlerPattern>,
    /// Pre-compiled bad-bot signature regexes.
    bad_bot_patterns: Vec<CompiledBadBotPattern>,

    /// One-second snapshot cache for `get_crawler_distribution`.
    crawler_dist_cache: RwLock<BotDistributionCache>,
    /// One-second snapshot cache for `get_bad_bot_distribution`.
    bad_bot_dist_cache: RwLock<BotDistributionCache>,
}
243
244impl CrawlerDetector {
245 pub async fn new(config: CrawlerConfig) -> Result<Self, String> {
247 config.validate()?;
249
250 let mut crawler_patterns = Vec::new();
252 for def in KNOWN_CRAWLERS {
253 let ua_regex = Regex::new(def.user_agent_pattern)
254 .map_err(|e| format!("Invalid UA pattern for {}: {}", def.name, e))?;
255 let dns_regex = Regex::new(def.reverse_dns_pattern)
256 .map_err(|e| format!("Invalid DNS pattern for {}: {}", def.name, e))?;
257 crawler_patterns.push(CompiledCrawlerPattern {
258 ua_regex,
259 dns_regex,
260 definition: def,
261 });
262 }
263
264 let mut bad_bot_patterns = Vec::new();
266 for sig in BAD_BOT_SIGNATURES {
267 let regex = Regex::new(sig.pattern)
268 .map_err(|e| format!("Invalid bad bot pattern for {}: {}", sig.name, e))?;
269 bad_bot_patterns.push(CompiledBadBotPattern {
270 regex,
271 signature: sig,
272 });
273 }
274
275 let dns = if config.verify_legitimate_crawlers {
277 Some(
278 DnsResolver::new(config.dns_timeout_ms, config.max_concurrent_dns_lookups)
279 .await
280 .map_err(|e| format!("Failed to create DNS resolver: {}", e))?,
281 )
282 } else {
283 None
284 };
285
286 let cache = VerificationCache::new(&config);
287
288 Ok(Self {
289 config,
290 cache,
291 dns,
292 stats: CrawlerStats::new(),
293 crawler_patterns,
294 bad_bot_patterns,
295 crawler_dist_cache: RwLock::new(None),
296 bad_bot_dist_cache: RwLock::new(None),
297 })
298 }
299
300 pub fn disabled() -> Self {
302 let mut config = CrawlerConfig::default();
303 config.enabled = false;
304 config.verify_legitimate_crawlers = false;
305 config.block_bad_bots = false;
306
307 Self {
308 cache: VerificationCache::new(&config),
309 dns: None,
310 stats: CrawlerStats::new(),
311 crawler_patterns: Vec::new(),
312 bad_bot_patterns: Vec::new(),
313 config,
314 crawler_dist_cache: RwLock::new(None),
315 bad_bot_dist_cache: RwLock::new(None),
316 }
317 }
318
319 pub async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult {
326 self.stats
327 .total_verifications
328 .fetch_add(1, Ordering::Relaxed);
329
330 if user_agent.len() > MAX_USER_AGENT_LENGTH {
332 self.stats.input_rejected.fetch_add(1, Ordering::Relaxed);
333 tracing::warn!(
334 ip = %client_ip,
335 ua_len = user_agent.len(),
336 "Rejected oversized User-Agent (max {})",
337 MAX_USER_AGENT_LENGTH
338 );
339 return CrawlerVerificationResult {
340 suspicious: true,
341 input_rejected: true,
342 suspicion_reasons: vec![Cow::Borrowed("User-Agent exceeds maximum allowed length")],
343 ..Default::default()
344 };
345 }
346
347 let cache_key = VerificationCache::cache_key(user_agent, client_ip);
349 if let Some(cached) = self.cache.get_verification(&cache_key) {
350 self.stats.cache_hits.fetch_add(1, Ordering::Relaxed);
351 return cached;
352 }
353 self.stats.cache_misses.fetch_add(1, Ordering::Relaxed);
354
355 let mut result = CrawlerVerificationResult::default();
356
357 if let Some(bad_bot) = self.check_bad_bot(user_agent) {
359 self.stats.bad_bots.fetch_add(1, Ordering::Relaxed);
360 self.record_bad_bot_stat(bad_bot.name);
362
363 result.bad_bot_match = Some(bad_bot.name.to_string());
364 result.bad_bot_severity = Some(bad_bot.severity);
365 result.suspicious = true;
366 result
368 .suspicion_reasons
369 .push(Cow::Borrowed("Matched known malicious bot signature"));
370
371 self.cache.put_verification(cache_key, result.clone());
373 return result;
374 }
375
376 let crawler_match = self.match_crawler_ua(user_agent);
378 if let Some(pattern) = crawler_match {
379 result.is_crawler = true;
380 result.crawler_name = Some(pattern.definition.name.to_string());
381 result.user_agent_match = true;
382
383 if pattern.definition.verification_required && self.config.verify_legitimate_crawlers {
385 result = self.verify_crawler(result, pattern, client_ip).await;
386 } else {
387 result.verified = !pattern.definition.verification_required;
388 result.verification_method = VerificationMethod::Unverified;
389 self.stats
390 .unverified_crawlers
391 .fetch_add(1, Ordering::Relaxed);
392 }
393
394 self.record_crawler_stat(pattern.definition.name);
396 }
397
398 self.cache.put_verification(cache_key, result.clone());
400 result
401 }
402
403 fn record_crawler_stat(&self, name: &str) {
405 if self.stats.by_crawler_name.len() < self.config.max_stats_entries {
406 *self
407 .stats
408 .by_crawler_name
409 .entry(name.to_string())
410 .or_insert(0) += 1;
411 } else if self.stats.by_crawler_name.contains_key(name) {
412 *self
414 .stats
415 .by_crawler_name
416 .entry(name.to_string())
417 .or_insert(0) += 1;
418 }
419 }
421
422 fn record_bad_bot_stat(&self, name: &str) {
424 if self.stats.by_bad_bot.len() < self.config.max_stats_entries
425 || self.stats.by_bad_bot.contains_key(name)
426 {
427 *self.stats.by_bad_bot.entry(name.to_string()).or_insert(0) += 1;
428 }
429 }
430
431 fn match_crawler_ua(&self, user_agent: &str) -> Option<&CompiledCrawlerPattern> {
433 self.crawler_patterns
434 .iter()
435 .find(|p| p.ua_regex.is_match(user_agent))
436 }
437
438 async fn verify_crawler(
444 &self,
445 mut result: CrawlerVerificationResult,
446 pattern: &CompiledCrawlerPattern,
447 client_ip: IpAddr,
448 ) -> CrawlerVerificationResult {
449 let dns = match &self.dns {
450 Some(d) => d,
451 None => {
452 result.verification_method = VerificationMethod::Unverified;
453 return result;
454 }
455 };
456
457 if let Some(ranges) = pattern.definition.ip_ranges {
459 if self.check_ip_ranges(client_ip, ranges) {
460 result.verified = true;
461 result.ip_range_match = true;
462 result.verification_method = VerificationMethod::IpRange;
463 self.stats.verified_crawlers.fetch_add(1, Ordering::Relaxed);
464 return result;
465 }
466 }
467
468 match dns.verify_ip(client_ip).await {
470 Ok((verified, hostname)) => {
471 self.stats.dns_successes.fetch_add(1, Ordering::Relaxed);
472
473 if let Some(ref hostname) = hostname {
474 result.reverse_dns_match = pattern.dns_regex.is_match(hostname);
475
476 if verified && result.reverse_dns_match {
477 result.verified = true;
478 result.verification_method = VerificationMethod::Dns;
479 self.stats.verified_crawlers.fetch_add(1, Ordering::Relaxed);
480 } else {
481 tracing::warn!(
484 ip = %client_ip,
485 claimed_crawler = %pattern.definition.name,
486 hostname = %hostname,
487 "Crawler verification failed: DNS hostname mismatch"
488 );
489 result.suspicious = true;
490 result
491 .suspicion_reasons
492 .push(Cow::Borrowed("Crawler claim could not be verified via DNS"));
493 self.stats
494 .unverified_crawlers
495 .fetch_add(1, Ordering::Relaxed);
496 }
497 } else {
498 tracing::warn!(
499 ip = %client_ip,
500 claimed_crawler = %pattern.definition.name,
501 "Crawler verification failed: no PTR record"
502 );
503 result.suspicious = true;
504 result.suspicion_reasons.push(Cow::Borrowed(
505 "Crawler claim could not be verified: no reverse DNS",
506 ));
507 self.stats
508 .unverified_crawlers
509 .fetch_add(1, Ordering::Relaxed);
510 }
511 }
512 Err(DnsError::RateLimited) => {
513 self.stats.dns_rate_limited.fetch_add(1, Ordering::Relaxed);
514 tracing::warn!(ip = %client_ip, "DNS verification rate limited");
515 self.apply_dns_failure_policy(&mut result, pattern.definition.name);
517 }
518 Err(e) => {
519 self.stats.dns_failures.fetch_add(1, Ordering::Relaxed);
520 tracing::debug!(ip = %client_ip, error = %e, "DNS verification failed");
521 self.apply_dns_failure_policy(&mut result, pattern.definition.name);
523 }
524 }
525
526 result
527 }
528
529 fn apply_dns_failure_policy(
534 &self,
535 result: &mut CrawlerVerificationResult,
536 _crawler_name: &str,
537 ) {
538 match self.config.dns_failure_policy {
539 DnsFailurePolicy::Allow => {
540 tracing::debug!("DNS failure policy: allowing unverified crawler");
542 self.stats
543 .unverified_crawlers
544 .fetch_add(1, Ordering::Relaxed);
545 }
546 DnsFailurePolicy::ApplyRiskPenalty => {
547 result.dns_failure_penalty = self.config.dns_failure_risk_penalty;
549 result.suspicion_reasons.push(Cow::Borrowed(
550 "DNS verification unavailable - temporary penalty applied",
551 ));
552 self.stats
553 .unverified_crawlers
554 .fetch_add(1, Ordering::Relaxed);
555 }
556 DnsFailurePolicy::Block => {
557 result.suspicious = true;
559 result
560 .suspicion_reasons
561 .push(Cow::Borrowed("DNS verification required but unavailable"));
562 self.stats
563 .unverified_crawlers
564 .fetch_add(1, Ordering::Relaxed);
565 }
566 }
567 }
568
569 fn check_ip_ranges(&self, ip: IpAddr, ranges: &[&str]) -> bool {
571 for range in ranges {
572 if let Ok(network) = range.parse::<ipnet::IpNet>() {
573 if network.contains(&ip) {
574 return true;
575 }
576 }
577 }
578 false
579 }
580
581 pub fn check_bad_bot(&self, user_agent: &str) -> Option<&'static BadBotSignature> {
587 let ua_lower = user_agent.to_lowercase();
588
589 self.bad_bot_patterns
590 .iter()
591 .find(|p| {
592 if !p.regex.is_match(user_agent) {
594 return false;
595 }
596
597 let exclusions = get_exclusions(p.signature.name);
599 for excluded in exclusions {
600 if ua_lower.contains(excluded) {
601 return false;
602 }
603 }
604
605 true
606 })
607 .map(|p| p.signature)
608 }
609
610 pub fn stats(&self) -> CrawlerStatsSnapshot {
612 CrawlerStatsSnapshot::from(&self.stats)
613 }
614
615 pub fn is_enabled(&self) -> bool {
617 self.config.enabled
618 }
619
620 pub fn config(&self) -> &CrawlerConfig {
622 &self.config
623 }
624
625 pub fn should_block_bad_bots(&self) -> bool {
627 self.config.block_bad_bots
628 }
629
630 pub fn get_crawler_distribution(&self, limit: usize) -> Vec<(String, u64)> {
632 {
633 let cache = self.crawler_dist_cache.read();
634 if let Some((timestamp, data)) = &*cache {
635 if timestamp.elapsed() < Duration::from_secs(1) {
636 let mut result = data.clone();
637 result.truncate(limit);
638 return result;
639 }
640 }
641 }
642
643 let mut dist: Vec<_> = self
644 .stats
645 .by_crawler_name
646 .iter()
647 .map(|entry| (entry.key().clone(), *entry.value()))
648 .collect();
649 dist.sort_by(|a, b| b.1.cmp(&a.1));
650
651 {
652 let mut cache = self.crawler_dist_cache.write();
653 *cache = Some((Instant::now(), dist.clone()));
654 }
655
656 dist.truncate(limit);
657 dist
658 }
659
660 pub fn get_bad_bot_distribution(&self, limit: usize) -> Vec<(String, u64)> {
662 {
663 let cache = self.bad_bot_dist_cache.read();
664 if let Some((timestamp, data)) = &*cache {
665 if timestamp.elapsed() < Duration::from_secs(1) {
666 let mut result = data.clone();
667 result.truncate(limit);
668 return result;
669 }
670 }
671 }
672
673 let mut dist: Vec<_> = self
674 .stats
675 .by_bad_bot
676 .iter()
677 .map(|entry| (entry.key().clone(), *entry.value()))
678 .collect();
679 dist.sort_by(|a, b| b.1.cmp(&a.1));
680
681 {
682 let mut cache = self.bad_bot_dist_cache.write();
683 *cache = Some((Instant::now(), dist.clone()));
684 }
685
686 dist.truncate(limit);
687 dist
688 }
689}
690
691#[async_trait::async_trait]
693impl CrawlerDetection for CrawlerDetector {
694 async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult {
695 CrawlerDetector::verify(self, user_agent, client_ip).await
697 }
698
699 fn is_enabled(&self) -> bool {
700 self.config.enabled
701 }
702
703 fn should_block_bad_bots(&self) -> bool {
704 self.config.block_bad_bots
705 }
706
707 fn stats(&self) -> CrawlerStatsSnapshot {
708 CrawlerStatsSnapshot::from(&self.stats)
709 }
710}
711
/// Test double for [`CrawlerDetection`]: returns canned results keyed by the
/// exact user-agent string.
#[cfg(test)]
pub struct MockCrawlerDetector {
    /// Value reported by `is_enabled`.
    pub enabled: bool,
    /// Value reported by `should_block_bad_bots`.
    pub block_bad_bots: bool,
    /// Canned verification results, keyed by user agent.
    pub results: std::collections::HashMap<String, CrawlerVerificationResult>,
}
719
#[cfg(test)]
impl MockCrawlerDetector {
    /// An enabled, blocking mock with no canned results registered.
    pub fn new() -> Self {
        MockCrawlerDetector {
            results: std::collections::HashMap::new(),
            enabled: true,
            block_bad_bots: true,
        }
    }

    /// Builder-style helper: registers `result` to be returned for the exact
    /// string `user_agent`.
    pub fn with_result(mut self, user_agent: &str, result: CrawlerVerificationResult) -> Self {
        self.results.insert(user_agent.to_owned(), result);
        self
    }
}
735
#[cfg(test)]
#[async_trait::async_trait]
impl CrawlerDetection for MockCrawlerDetector {
    /// Returns the canned result for `user_agent`; unknown agents get the
    /// default (nothing-detected) result. The IP is ignored.
    async fn verify(&self, user_agent: &str, _client_ip: IpAddr) -> CrawlerVerificationResult {
        match self.results.get(user_agent) {
            Some(result) => result.clone(),
            None => CrawlerVerificationResult::default(),
        }
    }

    fn is_enabled(&self) -> bool {
        self.enabled
    }

    fn should_block_bad_bots(&self) -> bool {
        self.block_bad_bots
    }

    /// The mock reports all-zero counters.
    fn stats(&self) -> CrawlerStatsSnapshot {
        CrawlerStatsSnapshot {
            total_verifications: 0,
            verified_crawlers: 0,
            unverified_crawlers: 0,
            bad_bots: 0,
            cache_hits: 0,
            cache_misses: 0,
            dns_successes: 0,
            dns_failures: 0,
            dns_rate_limited: 0,
            input_rejected: 0,
        }
    }
}
766
#[cfg(test)]
mod tests {
    use super::*;

    /// The sqlmap UA must hit the SQLMap signature (and no earlier one).
    #[test]
    fn test_bad_bot_detection() {
        let ua = "sqlmap/1.0";
        let hit = BAD_BOT_SIGNATURES
            .iter()
            .find(|sig| Regex::new(sig.pattern).unwrap().is_match(ua))
            .expect("SQLMap not detected");
        assert_eq!(hit.name, "SQLMap");
        assert_eq!(hit.severity, BadBotSeverity::High);
    }

    /// A genuine Googlebot UA must match the Googlebot definition first.
    #[test]
    fn test_crawler_pattern_matching() {
        let ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
        let hit = KNOWN_CRAWLERS
            .iter()
            .find(|def| Regex::new(def.user_agent_pattern).unwrap().is_match(ua))
            .expect("Googlebot not detected");
        assert_eq!(hit.name, "Googlebot");
    }

    /// An ordinary browser UA must match no bad-bot signature.
    #[test]
    fn test_normal_ua_not_detected() {
        let ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

        // Skip the empty-UA signature ("^$"): it exists to catch blank user
        // agents and cannot meaningfully apply to a non-empty string.
        for sig in BAD_BOT_SIGNATURES.iter().filter(|s| s.pattern != "^$") {
            let regex = Regex::new(sig.pattern).unwrap();
            assert!(
                !regex.is_match(ua),
                "Normal UA matched bad bot: {}",
                sig.name
            );
        }
    }
}