use dashmap::DashMap;
use parking_lot::RwLock;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::net::IpAddr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};
pub use super::bad_bots::{BadBotSeverity, BadBotSignature, BAD_BOT_SIGNATURES};
use super::cache::VerificationCache;
use super::config::{CrawlerConfig, DnsFailurePolicy};
use super::dns_resolver::{DnsError, DnsResolver};
use super::known_crawlers::{CrawlerDefinition, KNOWN_CRAWLERS};
/// Maximum accepted `User-Agent` length, measured in bytes (`str::len`).
///
/// `CrawlerDetector::verify` rejects longer inputs before any cache lookup or
/// regex matching, returning a result with `suspicious` and `input_rejected` set.
pub const MAX_USER_AGENT_LENGTH: usize = 512;
/// Abstraction over crawler detection.
///
/// Implemented in this file by `CrawlerDetector` and, in test builds, by
/// `MockCrawlerDetector`.
#[async_trait::async_trait]
pub trait CrawlerDetection: Send + Sync {
/// Classifies a request from its `User-Agent` string and client IP address.
async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult;
/// Reports whether detection is switched on.
fn is_enabled(&self) -> bool;
/// Reports whether requests matching a bad-bot signature should be blocked.
fn should_block_bad_bots(&self) -> bool;
/// Returns a copy of the detector's counters.
fn stats(&self) -> CrawlerStatsSnapshot;
}
/// Returns the lowercase substrings that exempt a User-Agent from the bad-bot
/// signature called `signature_name`.
///
/// The lookup is an exact, case-sensitive match on the signature name; any
/// name without a dedicated table yields an empty slice.
fn get_exclusions(signature_name: &str) -> &'static [&'static str] {
    // Exemptions for the broad "bot" signature.
    const GENERIC_BOT: &[&str] = &[
        "googlebot",
        "bingbot",
        "yandexbot",
        "baiduspider",
        "facebookexternalhit",
        "twitterbot",
        "linkedinbot",
        "applebot",
        "pinterestbot",
        "slackbot",
        "discordbot",
    ];
    // Shared by the generic "crawler" and "spider" signatures.
    const GENERIC_CRAWLER_OR_SPIDER: &[&str] =
        &["googlebot", "bingbot", "yandexbot", "baiduspider", "slurp"];
    const PYTHON_URLLIB: &[&str] = &["googlebot"];
    const NO_EXCLUSIONS: &[&str] = &[];

    match signature_name {
        "GenericBot" => GENERIC_BOT,
        "GenericCrawler" | "GenericSpider" => GENERIC_CRAWLER_OR_SPIDER,
        "PythonUrllib" => PYTHON_URLLIB,
        _ => NO_EXCLUSIONS,
    }
}
/// How a crawler claim was confirmed, if at all. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VerificationMethod {
/// The resolver reported the IP as verified and its hostname matched the
/// crawler's reverse-DNS pattern.
Dns,
/// The client IP fell inside one of the crawler definition's `ip_ranges`.
IpRange,
/// No verification succeeded, or none was attempted. This is the default.
Unverified,
}
/// Outcome of a single `CrawlerDetector::verify` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerVerificationResult {
/// The User-Agent matched a known crawler pattern.
pub is_crawler: bool,
/// Name of the matched crawler definition, when `is_crawler` is set.
pub crawler_name: Option<String>,
/// The crawler claim was confirmed (by IP range or DNS), or the matched
/// definition does not require verification.
pub verified: bool,
/// Which mechanism produced `verified`.
pub verification_method: VerificationMethod,
/// The User-Agent matched a known crawler's UA pattern.
pub user_agent_match: bool,
/// The reverse-DNS hostname matched the crawler's DNS pattern.
pub reverse_dns_match: bool,
/// The client IP fell inside one of the crawler's IP ranges.
pub ip_range_match: bool,
/// The request looks hostile or spoofed; see `suspicion_reasons`.
pub suspicious: bool,
/// Human-readable notes explaining the verdict.
pub suspicion_reasons: Vec<Cow<'static, str>>,
/// Name of the matched bad-bot signature, if any.
pub bad_bot_match: Option<String>,
/// Severity of the matched bad-bot signature, if any.
pub bad_bot_severity: Option<BadBotSeverity>,
/// The User-Agent exceeded `MAX_USER_AGENT_LENGTH` and was not analysed.
pub input_rejected: bool,
/// Risk penalty copied from the configuration when DNS verification failed
/// under the `ApplyRiskPenalty` policy; `0` otherwise.
pub dns_failure_penalty: u32,
}
impl Default for CrawlerVerificationResult {
    /// Builds the neutral result: not a crawler, not verified, not suspicious,
    /// no bad-bot match, no penalty.
    fn default() -> Self {
        Self {
            // Crawler identification.
            is_crawler: false,
            crawler_name: None,
            user_agent_match: false,

            // Verification outcome.
            verified: false,
            verification_method: VerificationMethod::Unverified,
            reverse_dns_match: false,
            ip_range_match: false,
            dns_failure_penalty: 0,

            // Threat signals.
            suspicious: false,
            suspicion_reasons: Vec::new(),
            bad_bot_match: None,
            bad_bot_severity: None,
            input_rejected: false,
        }
    }
}
/// Live counters updated by `CrawlerDetector` through shared references.
#[derive(Debug)]
pub struct CrawlerStats {
/// Every `verify` call, including rejected inputs and cache hits.
pub total_verifications: AtomicU64,
/// Crawler claims confirmed by IP range or DNS.
pub verified_crawlers: AtomicU64,
/// Crawler User-Agents that were not confirmed by IP range or DNS (this
/// includes crawlers for which no verification was attempted).
pub unverified_crawlers: AtomicU64,
/// Requests that matched a bad-bot signature.
pub bad_bots: AtomicU64,
/// Results served from the verification cache.
pub cache_hits: AtomicU64,
/// Lookups that missed the verification cache.
pub cache_misses: AtomicU64,
/// DNS lookups that returned `Ok`, whether or not the hostname matched.
pub dns_successes: AtomicU64,
/// DNS lookups that failed for a reason other than rate limiting.
pub dns_failures: AtomicU64,
/// DNS lookups refused with `DnsError::RateLimited`.
pub dns_rate_limited: AtomicU64,
/// User-Agents rejected for exceeding `MAX_USER_AGENT_LENGTH`.
pub input_rejected: AtomicU64,
/// Sightings per crawler name; new names are only added while the map holds
/// fewer than `max_stats_entries` entries.
pub by_crawler_name: DashMap<String, u64>,
/// Sightings per bad-bot signature name, bounded the same way.
pub by_bad_bot: DashMap<String, u64>,
}
impl CrawlerStats {
    /// Creates a statistics block with every counter at zero and both
    /// per-name maps empty.
    pub fn new() -> Self {
        let zero = || AtomicU64::new(0);
        Self {
            total_verifications: zero(),
            verified_crawlers: zero(),
            unverified_crawlers: zero(),
            bad_bots: zero(),
            cache_hits: zero(),
            cache_misses: zero(),
            dns_successes: zero(),
            dns_failures: zero(),
            dns_rate_limited: zero(),
            input_rejected: zero(),
            by_crawler_name: DashMap::new(),
            by_bad_bot: DashMap::new(),
        }
    }
}
impl Default for CrawlerStats {
fn default() -> Self {
Self::new()
}
}
/// Plain-value, serializable copy of the scalar counters in `CrawlerStats`.
///
/// The per-name maps (`by_crawler_name`, `by_bad_bot`) are not part of the
/// snapshot. Each field mirrors the counter of the same name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerStatsSnapshot {
pub total_verifications: u64,
pub verified_crawlers: u64,
pub unverified_crawlers: u64,
pub bad_bots: u64,
pub cache_hits: u64,
pub cache_misses: u64,
pub dns_successes: u64,
pub dns_failures: u64,
pub dns_rate_limited: u64,
pub input_rejected: u64,
}
impl From<&CrawlerStats> for CrawlerStatsSnapshot {
fn from(stats: &CrawlerStats) -> Self {
Self {
total_verifications: stats.total_verifications.load(Ordering::Relaxed),
verified_crawlers: stats.verified_crawlers.load(Ordering::Relaxed),
unverified_crawlers: stats.unverified_crawlers.load(Ordering::Relaxed),
bad_bots: stats.bad_bots.load(Ordering::Relaxed),
cache_hits: stats.cache_hits.load(Ordering::Relaxed),
cache_misses: stats.cache_misses.load(Ordering::Relaxed),
dns_successes: stats.dns_successes.load(Ordering::Relaxed),
dns_failures: stats.dns_failures.load(Ordering::Relaxed),
dns_rate_limited: stats.dns_rate_limited.load(Ordering::Relaxed),
input_rejected: stats.input_rejected.load(Ordering::Relaxed),
}
}
}
/// A known-crawler definition together with its pre-compiled regexes.
#[derive(Debug)]
struct CompiledCrawlerPattern {
/// Compiled from the definition's `user_agent_pattern`.
ua_regex: Regex,
/// Compiled from the definition's `reverse_dns_pattern`.
dns_regex: Regex,
/// The static definition these regexes were built from.
definition: &'static CrawlerDefinition,
}
/// A bad-bot signature together with its pre-compiled User-Agent regex.
#[derive(Debug)]
struct CompiledBadBotPattern {
/// Compiled from the signature's `pattern`.
regex: Regex,
/// The static signature this regex was built from.
signature: &'static BadBotSignature,
}
/// Memoised distribution: the instant it was built plus `(name, count)` pairs
/// sorted by descending count. `None` until the first distribution query.
type BotDistributionCache = Option<(Instant, Vec<(String, u64)>)>;
/// Detects known crawlers and bad bots from the User-Agent and, when
/// configured, verifies crawler claims by IP range or DNS.
pub struct CrawlerDetector {
/// Configuration the detector was built with.
config: CrawlerConfig,
/// Resolver for crawler verification; `Some` only when the detector was
/// built by `new` with `verify_legitimate_crawlers` enabled.
dns: Option<DnsResolver>,
/// Live counters.
stats: CrawlerStats,
/// Cache of previous verification results.
cache: VerificationCache,
/// Compiled patterns, one per entry of `KNOWN_CRAWLERS`, in the same order.
crawler_patterns: Vec<CompiledCrawlerPattern>,
/// Compiled patterns, one per entry of `BAD_BOT_SIGNATURES`, in the same order.
bad_bot_patterns: Vec<CompiledBadBotPattern>,
/// Short-lived cache behind `get_crawler_distribution`.
crawler_dist_cache: RwLock<BotDistributionCache>,
/// Short-lived cache behind `get_bad_bot_distribution`.
bad_bot_dist_cache: RwLock<BotDistributionCache>,
}
impl CrawlerDetector {
pub async fn new(config: CrawlerConfig) -> Result<Self, String> {
config.validate()?;
let mut crawler_patterns = Vec::new();
for def in KNOWN_CRAWLERS {
let ua_regex = Regex::new(def.user_agent_pattern)
.map_err(|e| format!("Invalid UA pattern for {}: {}", def.name, e))?;
let dns_regex = Regex::new(def.reverse_dns_pattern)
.map_err(|e| format!("Invalid DNS pattern for {}: {}", def.name, e))?;
crawler_patterns.push(CompiledCrawlerPattern {
ua_regex,
dns_regex,
definition: def,
});
}
let mut bad_bot_patterns = Vec::new();
for sig in BAD_BOT_SIGNATURES {
let regex = Regex::new(sig.pattern)
.map_err(|e| format!("Invalid bad bot pattern for {}: {}", sig.name, e))?;
bad_bot_patterns.push(CompiledBadBotPattern {
regex,
signature: sig,
});
}
let dns = if config.verify_legitimate_crawlers {
Some(
DnsResolver::new(config.dns_timeout_ms, config.max_concurrent_dns_lookups)
.await
.map_err(|e| format!("Failed to create DNS resolver: {}", e))?,
)
} else {
None
};
let cache = VerificationCache::new(&config);
Ok(Self {
config,
cache,
dns,
stats: CrawlerStats::new(),
crawler_patterns,
bad_bot_patterns,
crawler_dist_cache: RwLock::new(None),
bad_bot_dist_cache: RwLock::new(None),
})
}
/// Builds an inert detector.
///
/// It starts from the default configuration with `enabled`,
/// `verify_legitimate_crawlers` and `block_bad_bots` all switched off, and has
/// no compiled patterns and no DNS resolver.
pub fn disabled() -> Self {
    let config = {
        let mut off = CrawlerConfig::default();
        off.enabled = false;
        off.verify_legitimate_crawlers = false;
        off.block_bad_bots = false;
        off
    };
    let cache = VerificationCache::new(&config);

    Self {
        config,
        cache,
        dns: None,
        stats: CrawlerStats::new(),
        crawler_patterns: Vec::new(),
        bad_bot_patterns: Vec::new(),
        crawler_dist_cache: RwLock::new(None),
        bad_bot_dist_cache: RwLock::new(None),
    }
}
/// Classifies a request from its `User-Agent` and client IP.
///
/// Steps, in order:
/// 1. Reject User-Agents longer than `MAX_USER_AGENT_LENGTH` (not cached).
/// 2. Return a cached result for this (User-Agent, IP) pair if one exists.
/// 3. Flag the request if it matches a bad-bot signature.
/// 4. Otherwise look for a known crawler and, when both the definition and the
///    configuration ask for it, verify the claim via `verify_crawler`.
///
/// Results from steps 3 and 4 are stored in the verification cache, including
/// the neutral result for requests that match nothing.
pub async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult {
    let stats = &self.stats;
    stats.total_verifications.fetch_add(1, Ordering::Relaxed);

    // Oversized input is refused before it reaches the cache or any regex.
    if user_agent.len() > MAX_USER_AGENT_LENGTH {
        stats.input_rejected.fetch_add(1, Ordering::Relaxed);
        tracing::warn!(
            ip = %client_ip,
            ua_len = user_agent.len(),
            "Rejected oversized User-Agent (max {})",
            MAX_USER_AGENT_LENGTH
        );
        let mut rejected = CrawlerVerificationResult::default();
        rejected.suspicious = true;
        rejected.input_rejected = true;
        rejected
            .suspicion_reasons
            .push(Cow::Borrowed("User-Agent exceeds maximum allowed length"));
        return rejected;
    }

    let cache_key = VerificationCache::cache_key(user_agent, client_ip);
    if let Some(hit) = self.cache.get_verification(&cache_key) {
        stats.cache_hits.fetch_add(1, Ordering::Relaxed);
        return hit;
    }
    stats.cache_misses.fetch_add(1, Ordering::Relaxed);

    // Bad-bot signatures take precedence over crawler recognition.
    if let Some(signature) = self.check_bad_bot(user_agent) {
        stats.bad_bots.fetch_add(1, Ordering::Relaxed);
        self.record_bad_bot_stat(signature.name);
        let flagged = CrawlerVerificationResult {
            bad_bot_match: Some(signature.name.to_string()),
            bad_bot_severity: Some(signature.severity),
            suspicious: true,
            suspicion_reasons: vec![Cow::Borrowed("Matched known malicious bot signature")],
            ..Default::default()
        };
        self.cache.put_verification(cache_key, flagged.clone());
        return flagged;
    }

    let mut outcome = CrawlerVerificationResult::default();
    if let Some(pattern) = self.match_crawler_ua(user_agent) {
        let definition = pattern.definition;
        let must_verify = definition.verification_required;

        outcome.is_crawler = true;
        outcome.crawler_name = Some(definition.name.to_string());
        outcome.user_agent_match = true;

        if must_verify && self.config.verify_legitimate_crawlers {
            outcome = self.verify_crawler(outcome, pattern, client_ip).await;
        } else {
            // No check is run: trust the claim only if the definition says
            // verification is unnecessary.
            outcome.verified = !must_verify;
            outcome.verification_method = VerificationMethod::Unverified;
            stats.unverified_crawlers.fetch_add(1, Ordering::Relaxed);
        }
        self.record_crawler_stat(definition.name);
    }

    self.cache.put_verification(cache_key, outcome.clone());
    outcome
}
fn record_crawler_stat(&self, name: &str) {
if self.stats.by_crawler_name.len() < self.config.max_stats_entries {
*self
.stats
.by_crawler_name
.entry(name.to_string())
.or_insert(0) += 1;
} else if self.stats.by_crawler_name.contains_key(name) {
*self
.stats
.by_crawler_name
.entry(name.to_string())
.or_insert(0) += 1;
}
}
fn record_bad_bot_stat(&self, name: &str) {
if self.stats.by_bad_bot.len() < self.config.max_stats_entries
|| self.stats.by_bad_bot.contains_key(name)
{
*self.stats.by_bad_bot.entry(name.to_string()).or_insert(0) += 1;
}
}
/// Returns the first compiled crawler pattern whose User-Agent regex matches
/// `user_agent`, scanning in `crawler_patterns` order, or `None`.
fn match_crawler_ua(&self, user_agent: &str) -> Option<&CompiledCrawlerPattern> {
    for candidate in &self.crawler_patterns {
        if candidate.ua_regex.is_match(user_agent) {
            return Some(candidate);
        }
    }
    None
}
/// Tries to confirm that `client_ip` belongs to the crawler matched by `pattern`.
///
/// Checks run in this order:
/// 1. If the detector has no DNS resolver, return `result` as unverified.
/// 2. If the definition lists IP ranges and one contains `client_ip`, mark the
///    result verified by IP range.
/// 3. Otherwise ask the resolver; the claim is verified by DNS only when the
///    resolver reports the IP as verified *and* the hostname matches the
///    crawler's reverse-DNS pattern.
/// 4. On resolver errors, defer to `apply_dns_failure_policy`.
///
/// Takes `result` by value and returns the updated copy.
async fn verify_crawler(
&self,
mut result: CrawlerVerificationResult,
pattern: &CompiledCrawlerPattern,
client_ip: IpAddr,
) -> CrawlerVerificationResult {
// NOTE(review): this early return skips the IP-range check and does not
// touch any counter. It looks unreachable through `verify` for detectors
// built by `new`/`disabled` — confirm before relying on it.
let dns = match &self.dns {
Some(d) => d,
None => {
result.verification_method = VerificationMethod::Unverified;
return result;
}
};
// An IP-range hit needs no DNS lookup.
if let Some(ranges) = pattern.definition.ip_ranges {
if self.check_ip_ranges(client_ip, ranges) {
result.verified = true;
result.ip_range_match = true;
result.verification_method = VerificationMethod::IpRange;
self.stats.verified_crawlers.fetch_add(1, Ordering::Relaxed);
return result;
}
}
match dns.verify_ip(client_ip).await {
Ok((verified, hostname)) => {
// Counted as a DNS success even if the hostname turns out not to match.
self.stats.dns_successes.fetch_add(1, Ordering::Relaxed);
if let Some(ref hostname) = hostname {
result.reverse_dns_match = pattern.dns_regex.is_match(hostname);
if verified && result.reverse_dns_match {
result.verified = true;
result.verification_method = VerificationMethod::Dns;
self.stats.verified_crawlers.fetch_add(1, Ordering::Relaxed);
} else {
// NOTE(review): this branch is also taken when the hostname matched
// but `verified` was false, so "hostname mismatch" can be misleading.
tracing::warn!(
ip = %client_ip,
claimed_crawler = %pattern.definition.name,
hostname = %hostname,
"Crawler verification failed: DNS hostname mismatch"
);
result.suspicious = true;
result
.suspicion_reasons
.push(Cow::Borrowed("Crawler claim could not be verified via DNS"));
self.stats
.unverified_crawlers
.fetch_add(1, Ordering::Relaxed);
}
} else {
// The lookup succeeded but produced no hostname.
tracing::warn!(
ip = %client_ip,
claimed_crawler = %pattern.definition.name,
"Crawler verification failed: no PTR record"
);
result.suspicious = true;
result.suspicion_reasons.push(Cow::Borrowed(
"Crawler claim could not be verified: no reverse DNS",
));
self.stats
.unverified_crawlers
.fetch_add(1, Ordering::Relaxed);
}
}
// Rate limiting is tracked separately from other resolver errors, but both
// are resolved by the configured DNS failure policy.
Err(DnsError::RateLimited) => {
self.stats.dns_rate_limited.fetch_add(1, Ordering::Relaxed);
tracing::warn!(ip = %client_ip, "DNS verification rate limited");
self.apply_dns_failure_policy(&mut result, pattern.definition.name);
}
Err(e) => {
self.stats.dns_failures.fetch_add(1, Ordering::Relaxed);
tracing::debug!(ip = %client_ip, error = %e, "DNS verification failed");
self.apply_dns_failure_policy(&mut result, pattern.definition.name);
}
}
result
}
/// Applies `config.dns_failure_policy` to a crawler whose DNS check could not
/// be completed.
///
/// * `Allow` leaves `result` untouched.
/// * `ApplyRiskPenalty` copies `config.dns_failure_risk_penalty` into the
///   result and records a reason, without marking it suspicious.
/// * `Block` marks the result suspicious and records a reason.
///
/// Every policy counts the crawler as unverified. `_crawler_name` is currently
/// unused.
fn apply_dns_failure_policy(
    &self,
    result: &mut CrawlerVerificationResult,
    _crawler_name: &str,
) {
    let reason: Option<&'static str> = match self.config.dns_failure_policy {
        DnsFailurePolicy::Allow => {
            tracing::debug!("DNS failure policy: allowing unverified crawler");
            None
        }
        DnsFailurePolicy::ApplyRiskPenalty => {
            result.dns_failure_penalty = self.config.dns_failure_risk_penalty;
            Some("DNS verification unavailable - temporary penalty applied")
        }
        DnsFailurePolicy::Block => {
            result.suspicious = true;
            Some("DNS verification required but unavailable")
        }
    };

    if let Some(text) = reason {
        result.suspicion_reasons.push(Cow::Borrowed(text));
    }
    self.stats
        .unverified_crawlers
        .fetch_add(1, Ordering::Relaxed);
}
/// Reports whether `ip` lies inside any of the CIDR networks in `ranges`.
///
/// Entries that do not parse as an `ipnet::IpNet` are silently ignored.
fn check_ip_ranges(&self, ip: IpAddr, ranges: &[&str]) -> bool {
    ranges
        .iter()
        .filter_map(|cidr| cidr.parse::<ipnet::IpNet>().ok())
        .any(|network| network.contains(&ip))
}
/// Returns the first bad-bot signature that matches `user_agent`, or `None`.
///
/// Signatures are tried in `bad_bot_patterns` order. A signature whose regex
/// matches is still skipped when the lowercased User-Agent contains one of the
/// exclusion substrings returned by `get_exclusions` for that signature; this
/// keeps the generic patterns from flagging well-known crawlers.
pub fn check_bad_bot(&self, user_agent: &str) -> Option<&'static BadBotSignature> {
    // Lowercased on first use only: most User-Agents match no signature, and
    // most signatures have no exclusions, so the copy is usually unnecessary.
    let mut lowered: Option<String> = None;

    self.bad_bot_patterns
        .iter()
        .find(|candidate| {
            if !candidate.regex.is_match(user_agent) {
                return false;
            }
            let exclusions = get_exclusions(candidate.signature.name);
            if exclusions.is_empty() {
                return true;
            }
            let ua_lower = lowered.get_or_insert_with(|| user_agent.to_lowercase());
            !exclusions
                .iter()
                .any(|excluded| ua_lower.contains(*excluded))
        })
        .map(|candidate| candidate.signature)
}
/// Returns a copy of the scalar counters as they stand right now.
pub fn stats(&self) -> CrawlerStatsSnapshot {
CrawlerStatsSnapshot::from(&self.stats)
}
/// Returns the `enabled` flag from the configuration.
pub fn is_enabled(&self) -> bool {
self.config.enabled
}
/// Borrows the configuration this detector was built with.
pub fn config(&self) -> &CrawlerConfig {
&self.config
}
/// Returns the `block_bad_bots` flag from the configuration.
pub fn should_block_bad_bots(&self) -> bool {
self.config.block_bad_bots
}
pub fn get_crawler_distribution(&self, limit: usize) -> Vec<(String, u64)> {
{
let cache = self.crawler_dist_cache.read();
if let Some((timestamp, data)) = &*cache {
if timestamp.elapsed() < Duration::from_secs(1) {
let mut result = data.clone();
result.truncate(limit);
return result;
}
}
}
let mut dist: Vec<_> = self
.stats
.by_crawler_name
.iter()
.map(|entry| (entry.key().clone(), *entry.value()))
.collect();
dist.sort_by(|a, b| b.1.cmp(&a.1));
{
let mut cache = self.crawler_dist_cache.write();
*cache = Some((Instant::now(), dist.clone()));
}
dist.truncate(limit);
dist
}
pub fn get_bad_bot_distribution(&self, limit: usize) -> Vec<(String, u64)> {
{
let cache = self.bad_bot_dist_cache.read();
if let Some((timestamp, data)) = &*cache {
if timestamp.elapsed() < Duration::from_secs(1) {
let mut result = data.clone();
result.truncate(limit);
return result;
}
}
}
let mut dist: Vec<_> = self
.stats
.by_bad_bot
.iter()
.map(|entry| (entry.key().clone(), *entry.value()))
.collect();
dist.sort_by(|a, b| b.1.cmp(&a.1));
{
let mut cache = self.bad_bot_dist_cache.write();
*cache = Some((Instant::now(), dist.clone()));
}
dist.truncate(limit);
dist
}
}
/// Trait facade over `CrawlerDetector`; every method forwards to the inherent
/// method of the same name.
#[async_trait::async_trait]
impl CrawlerDetection for CrawlerDetector {
    async fn verify(&self, user_agent: &str, client_ip: IpAddr) -> CrawlerVerificationResult {
        CrawlerDetector::verify(self, user_agent, client_ip).await
    }

    fn is_enabled(&self) -> bool {
        CrawlerDetector::is_enabled(self)
    }

    fn should_block_bad_bots(&self) -> bool {
        CrawlerDetector::should_block_bad_bots(self)
    }

    fn stats(&self) -> CrawlerStatsSnapshot {
        CrawlerDetector::stats(self)
    }
}
/// Test double for `CrawlerDetection` that replays canned results.
#[cfg(test)]
pub struct MockCrawlerDetector {
/// Value returned by `is_enabled`.
pub enabled: bool,
/// Value returned by `should_block_bad_bots`.
pub block_bad_bots: bool,
/// Canned results keyed by the exact User-Agent string.
pub results: std::collections::HashMap<String, CrawlerVerificationResult>,
}
#[cfg(test)]
impl MockCrawlerDetector {
    /// Creates a mock that is enabled, blocks bad bots and has no canned
    /// results.
    pub fn new() -> Self {
        let results = std::collections::HashMap::new();
        Self {
            enabled: true,
            block_bad_bots: true,
            results,
        }
    }

    /// Registers `result` as the answer for exactly `user_agent`, replacing any
    /// earlier entry, and returns the mock for chaining.
    pub fn with_result(mut self, user_agent: &str, result: CrawlerVerificationResult) -> Self {
        let key = user_agent.to_owned();
        self.results.insert(key, result);
        self
    }
}
#[cfg(test)]
#[async_trait::async_trait]
impl CrawlerDetection for MockCrawlerDetector {
    /// Returns the canned result registered for `user_agent`, or the default
    /// result when none was registered. The client IP is ignored.
    async fn verify(&self, user_agent: &str, _client_ip: IpAddr) -> CrawlerVerificationResult {
        match self.results.get(user_agent) {
            Some(canned) => canned.clone(),
            None => CrawlerVerificationResult::default(),
        }
    }

    fn is_enabled(&self) -> bool {
        self.enabled
    }

    fn should_block_bad_bots(&self) -> bool {
        self.block_bad_bots
    }

    /// The mock keeps no counters, so the snapshot is always all zeros.
    fn stats(&self) -> CrawlerStatsSnapshot {
        CrawlerStatsSnapshot::from(&CrawlerStats::new())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The first signature matching a sqlmap User-Agent must be `SQLMap` with
    /// high severity.
    #[test]
    fn test_bad_bot_detection() {
        let ua = "sqlmap/1.0";
        let first_hit = BAD_BOT_SIGNATURES
            .iter()
            .find(|sig| Regex::new(sig.pattern).unwrap().is_match(ua))
            .expect("SQLMap not detected");
        assert_eq!(first_hit.name, "SQLMap");
        assert_eq!(first_hit.severity, BadBotSeverity::High);
    }

    /// The first known-crawler definition matching a Googlebot User-Agent must
    /// be `Googlebot`.
    #[test]
    fn test_crawler_pattern_matching() {
        let ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
        let first_hit = KNOWN_CRAWLERS
            .iter()
            .find(|def| Regex::new(def.user_agent_pattern).unwrap().is_match(ua))
            .expect("Googlebot not detected");
        assert_eq!(first_hit.name, "Googlebot");
    }

    /// An ordinary browser User-Agent must not match any bad-bot signature;
    /// the empty-string signature (`^$`) is skipped.
    #[test]
    fn test_normal_ua_not_detected() {
        let ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
        for sig in BAD_BOT_SIGNATURES.iter().filter(|sig| sig.pattern != "^$") {
            let regex = Regex::new(sig.pattern).unwrap();
            assert!(
                !regex.is_match(ua),
                "Normal UA matched bad bot: {}",
                sig.name
            );
        }
    }
}