use std::net::{IpAddr, Ipv4Addr};
use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
use std::sync::Arc;
use synapse_pingora::crawler::{
BadBotSeverity, CrawlerConfig, CrawlerDetector, DnsFailurePolicy, VerificationMethod,
};
use tokio::sync::Mutex;
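/// Test-local mock of a DNS resolver. It records reverse and forward lookup
/// counts, serves canned results, and can simulate lookup failures so the
/// reverse -> forward round-trip verification logic can be exercised without
/// real network access.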
struct MockDnsResolver {
reverse_lookups: Arc<AtomicUsize>,
forward_lookups: Arc<AtomicUsize>,
reverse_results: Arc<Mutex<std::collections::HashMap<String, Option<String>>>>,
forward_results: Arc<Mutex<std::collections::HashMap<String, Vec<IpAddr>>>>,
dns_failures: Arc<Mutex<std::collections::HashSet<String>>>,
dns_timeouts: Arc<AtomicU32>,
}
impl MockDnsResolver {
fn new() -> Self {
Self {
reverse_lookups: Arc::new(AtomicUsize::new(0)),
forward_lookups: Arc::new(AtomicUsize::new(0)),
reverse_results: Arc::new(Mutex::new(std::collections::HashMap::new())),
forward_results: Arc::new(Mutex::new(std::collections::HashMap::new())),
dns_failures: Arc::new(Mutex::new(std::collections::HashSet::new())),
dns_timeouts: Arc::new(AtomicU32::new(0)),
}
}
async fn set_reverse_result(&self, ip: IpAddr, hostname: Option<String>) {
let mut results = self.reverse_results.lock().await;
results.insert(ip.to_string(), hostname);
}
async fn set_forward_result(&self, hostname: String, ips: Vec<IpAddr>) {
let mut results = self.forward_results.lock().await;
results.insert(hostname, ips);
}
async fn set_dns_failure(&self, target: String) {
let mut failures = self.dns_failures.lock().await;
failures.insert(target);
}
fn reverse_lookup_count(&self) -> usize {
self.reverse_lookups.load(Ordering::SeqCst)
}
fn forward_lookup_count(&self) -> usize {
self.forward_lookups.load(Ordering::SeqCst)
}
fn timeout_count(&self) -> u32 {
self.dns_timeouts.load(Ordering::SeqCst)
}
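// Simulates a reverse (PTR) lookup against the canned results table.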
async fn reverse_lookup_mock(&self, ip: IpAddr) -> Result<Option<String>, String> {
self.reverse_lookups.fetch_add(1, Ordering::SeqCst);
let results = self.reverse_results.lock().await;
if let Some(result) = results.get(&ip.to_string()) {
Ok(result.clone())
} else {
Ok(None)
}
}
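// Simulates a forward (A/AAAA) lookup; hostnames registered via
// set_dns_failure are treated as timeouts and bump the timeout counter.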
async fn forward_lookup_mock(&self, hostname: &str) -> Result<Vec<IpAddr>, String> {
self.forward_lookups.fetch_add(1, Ordering::SeqCst);
let failures = self.dns_failures.lock().await;
if failures.contains(hostname) {
self.dns_timeouts.fetch_add(1, Ordering::SeqCst);
return Err("DNS Timeout".to_string());
}
drop(failures);
let results = self.forward_results.lock().await;
if let Some(ips) = results.get(hostname) {
Ok(ips.clone())
} else {
Ok(Vec::new())
}
}
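// Mirrors the reverse -> forward round-trip check used for crawler IP
// verification: resolve the hostname for the IP, then confirm the hostname
// resolves back to the original IP.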
async fn verify_ip_mock(&self, ip: IpAddr) -> Result<(bool, Option<String>), String> {
let hostname = match self.reverse_lookup_mock(ip).await? {
Some(h) => h,
None => return Ok((false, None)),
};
let resolved_ips = self.forward_lookup_mock(&hostname).await?;
let verified = resolved_ips.contains(&ip);
Ok((verified, Some(hostname)))
}
}
#[tokio::test]
async fn test_crawler_spoofing_ua_dns_mismatch() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = true;
config.dns_failure_policy = DnsFailurePolicy::ApplyRiskPenalty;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let spoofed_ip = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1));
let googlebot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let result = detector.verify(googlebot_ua, spoofed_ip).await;
assert!(result.is_crawler, "Should detect as crawler");
assert_eq!(
result.crawler_name,
Some("Googlebot".to_string()),
"Should identify as Googlebot"
);
assert!(result.user_agent_match, "Should match UA pattern");
assert!(
result.suspicious || !result.verified,
"Should be suspicious or unverified due to DNS mismatch"
);
println!("Spoofing test result: {:?}", result);
}
#[tokio::test]
async fn test_dns_verification_reverse_forward_roundtrip() {
let mock_resolver = MockDnsResolver::new();
let legitimate_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let hostname = "googlebot.example.com".to_string();
mock_resolver
.set_reverse_result(legitimate_ip, Some(hostname.clone()))
.await;
mock_resolver
.set_forward_result(hostname.clone(), vec![legitimate_ip])
.await;
let reverse_result = mock_resolver.reverse_lookup_mock(legitimate_ip).await;
assert!(reverse_result.is_ok(), "Reverse lookup should succeed");
assert_eq!(
reverse_result.unwrap(),
Some(hostname.clone()),
"Should return correct hostname"
);
assert_eq!(
mock_resolver.reverse_lookup_count(),
1,
"Should have performed 1 reverse lookup"
);
let forward_result = mock_resolver.forward_lookup_mock(&hostname).await;
assert!(forward_result.is_ok(), "Forward lookup should succeed");
assert!(
forward_result.unwrap().contains(&legitimate_ip),
"Forward lookup should return original IP"
);
assert_eq!(
mock_resolver.forward_lookup_count(),
1,
"Should have performed 1 forward lookup"
);
let verify_result = mock_resolver.verify_ip_mock(legitimate_ip).await;
assert!(verify_result.is_ok(), "Verification should succeed");
let (verified, resolved_hostname) = verify_result.unwrap();
assert!(verified, "Round-trip verification should succeed");
assert_eq!(
resolved_hostname,
Some(hostname.clone()),
"Should return correct hostname"
);
}
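// Companion sketch to the round-trip test above, using only the
// MockDnsResolver defined in this file (no CrawlerDetector involved): the
// claimed IP reverse-resolves to a crawler-looking hostname, but the forward
// lookup points elsewhere, so round-trip verification must fail.
#[tokio::test]
async fn test_dns_verification_reverse_forward_mismatch() {
    let mock_resolver = MockDnsResolver::new();
    let claimed_ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 50));
    let real_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 50));
    let hostname = "googlebot.example.com".to_string();
    mock_resolver
        .set_reverse_result(claimed_ip, Some(hostname.clone()))
        .await;
    mock_resolver
        .set_forward_result(hostname.clone(), vec![real_ip])
        .await;
    let (verified, resolved_hostname) = mock_resolver
        .verify_ip_mock(claimed_ip)
        .await
        .expect("Mock verification should not error");
    assert!(
        !verified,
        "Round-trip verification should fail when the forward lookup returns a different IP"
    );
    assert_eq!(
        resolved_hostname,
        Some(hostname),
        "Hostname from the reverse lookup should still be reported"
    );
    assert_eq!(mock_resolver.reverse_lookup_count(), 1);
    assert_eq!(mock_resolver.forward_lookup_count(), 1);
}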
#[tokio::test]
async fn test_dns_cache_ttl_behavior() {
let mut config = CrawlerConfig::default();
config.dns_cache_ttl_secs = 1;
config.verify_legitimate_crawlers = true;
config.dns_failure_policy = DnsFailurePolicy::ApplyRiskPenalty;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let googlebot_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 100));
let googlebot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let initial_stats = detector.stats();
let first_result = detector.verify(googlebot_ua, googlebot_ip).await;
let stats_after_first = detector.stats();
println!("Stats after first verify: {:?}", stats_after_first);
let second_result = detector.verify(googlebot_ua, googlebot_ip).await;
let stats_after_second = detector.stats();
println!("Stats after second verify: {:?}", stats_after_second);
assert!(
stats_after_second.cache_hits > initial_stats.cache_hits,
"Should have cache hit on second request"
);
assert_eq!(
first_result.is_crawler, second_result.is_crawler,
"Cache should return same crawler status"
);
assert_eq!(
first_result.crawler_name, second_result.crawler_name,
"Cache should return same crawler name"
);
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
let _third_result = detector.verify(googlebot_ua, googlebot_ip).await;
let stats_after_third = detector.stats();
println!(
"Stats after third verify (post-TTL): {:?}",
stats_after_third
);
assert!(
stats_after_third.cache_misses >= stats_after_second.cache_misses,
"Should eventually have cache misses after TTL expiration"
);
}
#[tokio::test]
async fn test_bad_bot_blocking_dns_failure_policy_block() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
config.verify_legitimate_crawlers = true;
config.dns_failure_policy = DnsFailurePolicy::Block;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let bad_bot_ua = "sqlmap/1.0";
let client_ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1));
let result = detector.verify(bad_bot_ua, client_ip).await;
assert!(result.suspicious, "Bad bot should be marked suspicious");
assert_eq!(
result.bad_bot_match,
Some("SQLMap".to_string()),
"Should detect SQLMap"
);
assert_eq!(
result.bad_bot_severity,
Some(BadBotSeverity::High),
"SQLMap should have HIGH severity"
);
let stats = detector.stats();
assert!(stats.bad_bots > 0, "Stats should track bad bot detection");
println!("Bad bot detection result: {:?}", result);
println!("Detector stats: {:?}", stats);
}
#[tokio::test]
async fn test_dns_failure_policy_block() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = true;
config.dns_failure_policy = DnsFailurePolicy::Block;
config.dns_timeout_ms = 100;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let googlebot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let client_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let result = detector.verify(googlebot_ua, client_ip).await;
assert!(
result.suspicious || !result.verified,
"Should be suspicious or unverified when DNS fails with Block policy"
);
println!("DNS failure block policy result: {:?}", result);
println!("Suspicion reasons: {:?}", result.suspicion_reasons);
}
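// Hedged sketch exercising MockDnsResolver's failure-injection helpers
// (set_dns_failure / timeout_count), which the detector-level test above does
// not touch. It relies only on the mock defined in this file and makes no
// assumptions about CrawlerDetector behaviour.
#[tokio::test]
async fn test_mock_dns_failure_injection_counts_timeouts() {
    let mock_resolver = MockDnsResolver::new();
    let failing_hostname = "crawl-66-249-66-1.googlebot.com".to_string();
    // Registering the hostname makes subsequent forward lookups fail as timeouts.
    mock_resolver.set_dns_failure(failing_hostname.clone()).await;
    let result = mock_resolver.forward_lookup_mock(&failing_hostname).await;
    assert!(result.is_err(), "Injected failure should surface as an error");
    assert_eq!(
        mock_resolver.timeout_count(),
        1,
        "Timeout counter should increment on injected failure"
    );
    assert_eq!(
        mock_resolver.forward_lookup_count(),
        1,
        "The failed attempt should still be counted as a forward lookup"
    );
}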
#[tokio::test]
async fn test_dns_failure_policy_apply_risk_penalty() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = true;
config.dns_failure_policy = DnsFailurePolicy::ApplyRiskPenalty;
config.dns_failure_risk_penalty = 75;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let googlebot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let client_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let result = detector.verify(googlebot_ua, client_ip).await;
assert!(
result.dns_failure_penalty > 0
|| result.suspicion_reasons.iter().any(|r| r.contains("DNS")),
"Should apply risk penalty or include DNS-related suspicion reason"
);
println!("DNS failure risk penalty result: {:?}", result);
}
#[tokio::test]
async fn test_dns_failure_policy_allow() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = true;
config.dns_failure_policy = DnsFailurePolicy::Allow;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let googlebot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let client_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let result = detector.verify(googlebot_ua, client_ip).await;
assert!(
result.is_crawler,
"Should still recognize as crawler with Allow policy"
);
println!("DNS failure allow policy result: {:?}", result);
}
#[tokio::test]
async fn test_bad_bot_severity_known_crawler() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
config.verify_legitimate_crawlers = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let googlebot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let client_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let result = detector.verify(googlebot_ua, client_ip).await;
assert!(result.is_crawler, "Should detect as legitimate crawler");
assert_eq!(
result.crawler_name,
Some("Googlebot".to_string()),
"Should identify as Googlebot"
);
assert!(
result.bad_bot_match.is_none(),
"Should NOT match as bad bot"
);
println!("Known crawler result: {:?}", result);
}
#[tokio::test]
async fn test_bad_bot_severity_high() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let sqlmap_ua = "sqlmap/1.0";
let client_ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1));
let result = detector.verify(sqlmap_ua, client_ip).await;
assert!(result.suspicious, "SQLMap should be suspicious");
assert_eq!(
result.bad_bot_severity,
Some(BadBotSeverity::High),
"SQLMap should have HIGH severity"
);
println!("HIGH severity bad bot result: {:?}", result);
}
#[tokio::test]
async fn test_bad_bot_severity_medium() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let burp_ua = "Burp Suite";
let client_ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 2));
let result = detector.verify(burp_ua, client_ip).await;
assert!(result.suspicious, "Burp should be suspicious");
assert_eq!(
result.bad_bot_severity,
Some(BadBotSeverity::Medium),
"Burp Suite should have MEDIUM severity"
);
println!("MEDIUM severity bad bot result: {:?}", result);
}
#[tokio::test]
async fn test_bad_bot_severity_low() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let python_ua = "python-urllib";
let client_ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 3));
let result = detector.verify(python_ua, client_ip).await;
assert!(result.suspicious, "Python scraper should be suspicious");
assert_eq!(
result.bad_bot_severity,
Some(BadBotSeverity::Low),
"Python urllib should have LOW severity"
);
println!("LOW severity bad bot result: {:?}", result);
}
#[tokio::test]
async fn test_legitimate_crawler_no_bad_bot_match() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
config.verify_legitimate_crawlers = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let legitimate_crawlers = vec![
(
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Googlebot",
),
(
"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
"Bingbot",
),
(
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Baiduspider",
),
(
"Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
"YandexBot",
),
];
for (ua, expected_name) in legitimate_crawlers {
let client_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 100));
let result = detector.verify(ua, client_ip).await;
assert!(
result.is_crawler,
"Should detect {} as crawler",
expected_name
);
assert_eq!(
result.crawler_name,
Some(expected_name.to_string()),
"Should identify as {}",
expected_name
);
assert!(
result.bad_bot_match.is_none(),
"Should NOT match as bad bot: {}",
expected_name
);
println!(
"Legitimate crawler {}: verified={}, suspicious={}",
expected_name, result.verified, result.suspicious
);
}
}
#[tokio::test]
async fn test_cache_hits_and_misses() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let initial_stats = detector.stats();
assert_eq!(initial_stats.cache_hits, 0);
assert_eq!(initial_stats.cache_misses, 0);
detector.verify(ua, ip).await;
let stats_after_first = detector.stats();
assert_eq!(
stats_after_first.cache_misses, 1,
"First request should be cache miss"
);
detector.verify(ua, ip).await;
let stats_after_second = detector.stats();
assert_eq!(
stats_after_second.cache_hits, 1,
"Second request should be cache hit"
);
assert_eq!(
stats_after_second.cache_misses, 1,
"Cache miss count should not increase"
);
detector.verify(ua, ip).await;
let stats_after_third = detector.stats();
assert_eq!(
stats_after_third.cache_hits, 2,
"Third request should be cache hit"
);
assert_eq!(
stats_after_third.cache_misses, 1,
"Cache miss count should remain unchanged"
);
println!("Cache statistics: {:?}", stats_after_third);
}
#[tokio::test]
async fn test_oversized_user_agent_rejection() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let oversized_ua = "a".repeat(513);
let client_ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let result = detector.verify(&oversized_ua, client_ip).await;
assert!(result.input_rejected, "Oversized UA should be rejected");
assert!(result.suspicious, "Oversized UA should be suspicious");
assert!(
result
.suspicion_reasons
.iter()
.any(|r| r.contains("exceeds maximum")),
"Should include rejection reason"
);
let stats = detector.stats();
assert!(
stats.input_rejected > 0,
"Stats should track rejected input"
);
println!("Oversized UA result: {:?}", result);
}
#[tokio::test]
async fn test_crawler_stats_distribution() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let test_cases = vec![
(
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Googlebot",
),
(
"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
"Bingbot",
),
(
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Googlebot",
),
(
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Googlebot",
),
];
for (idx, (ua, _name)) in test_cases.iter().enumerate() {
let ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, idx as u8 + 1));
detector.verify(ua, ip).await;
}
let distribution = detector.get_crawler_distribution(10);
println!("Crawler distribution: {:?}", distribution);
assert!(!distribution.is_empty(), "Distribution should have entries");
assert_eq!(
distribution.first().map(|(_, count)| *count),
Some(3),
"Most frequent crawler should have 3 hits"
);
}
#[tokio::test]
async fn test_bad_bot_stats_distribution() {
let mut config = CrawlerConfig::default();
config.block_bad_bots = true;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let test_cases = vec!["sqlmap/1.0", "sqlmap/1.0", "nikto/2.0", "sqlmap/1.0"];
for (idx, ua) in test_cases.iter().enumerate() {
let ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, idx as u8 + 1));
detector.verify(ua, ip).await;
}
let distribution = detector.get_bad_bot_distribution(10);
println!("Bad bot distribution: {:?}", distribution);
assert!(!distribution.is_empty(), "Distribution should have entries");
assert_eq!(
distribution.first().map(|(_, count)| *count),
Some(3),
"Most frequent bad bot should have 3 hits"
);
}
#[tokio::test]
async fn test_verification_method_unverified() {
let mut config = CrawlerConfig::default();
config.verify_legitimate_crawlers = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
let ip = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1));
let result = detector.verify(ua, ip).await;
assert_eq!(
result.verification_method,
VerificationMethod::Unverified,
"Should be unverified when DNS verification is disabled"
);
assert!(result.user_agent_match, "Should have matched UA");
assert!(!result.reverse_dns_match, "Should not have checked DNS");
println!("Unverified method result: {:?}", result);
}
#[tokio::test]
async fn test_disabled_detector() {
let mut config = CrawlerConfig::default();
config.enabled = false;
config.block_bad_bots = false;
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
assert!(!detector.is_enabled(), "Detector should be disabled");
let ua = "sqlmap/1.0";
let ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1));
let result = detector.verify(ua, ip).await;
assert!(
!detector.is_enabled(),
"Detector should remain disabled after a verify call"
);
println!("Disabled detector result: {:?}", result);
}
#[tokio::test]
async fn test_normal_browser_ua() {
let config = CrawlerConfig::default();
let detector = CrawlerDetector::new(config)
.await
.expect("Failed to create detector");
let normal_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
let client_ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 100));
let result = detector.verify(normal_ua, client_ip).await;
assert!(
!result.is_crawler,
"Normal browser should not be detected as crawler"
);
assert!(
result.bad_bot_match.is_none(),
"Normal browser should not match bad bot"
);
assert!(
!result.suspicious,
"Normal browser should not be suspicious"
);
println!("Normal browser result: {:?}", result);
}