use regex::RegexSet;
use serde::{Deserialize, Serialize};
use crate::{WafDecision, WafRequest};
/// Action taken when a request reaches the configured bot-signal threshold.
///
/// Variants are (de)serialized with snake_case names: `block`, `challenge`,
/// `log_only`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BotMode {
/// Reject the request with a 403 block decision. This is the default mode.
#[default]
Block,
/// Answer with the JavaScript challenge page instead of a hard block.
Challenge,
/// Emit a log event only; the request is still allowed through.
LogOnly,
}
/// Settings for the bot-detection check.
///
/// Every field carries a serde default, so a partial (or empty) configuration
/// section still deserializes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BotConfig {
/// Master switch. When `false` (the default) the detector allows every request.
#[serde(default)]
pub enabled: bool,
/// What to do with a request that reaches `signal_threshold`; defaults to `BotMode::Block`.
#[serde(default)]
pub mode: BotMode,
/// Crawler names that bypass detection. Each entry is matched literally and
/// case-insensitively as a substring of the User-Agent header.
#[serde(default = "default_good_bots")]
pub good_bots: Vec<String>,
/// Number of suspicious signals at which a request is acted upon (default 2).
/// A request is flagged when its signal count is greater than or equal to this value.
#[serde(default = "default_signal_threshold")]
pub signal_threshold: u32,
}
/// Returns the built-in allowlist of well-known crawler names.
///
/// Used both as the serde default for `BotConfig::good_bots` and by
/// `BotConfig::default()`.
fn default_good_bots() -> Vec<String> {
    const WELL_KNOWN_CRAWLERS: [&str; 10] = [
        "Googlebot",
        "Bingbot",
        "Slurp",
        "DuckDuckBot",
        "facebookexternalhit",
        "Twitterbot",
        "LinkedInBot",
        "Slackbot",
        "Applebot",
        "YandexBot",
    ];

    WELL_KNOWN_CRAWLERS
        .iter()
        .map(|name| (*name).to_owned())
        .collect()
}
/// Returns the default number of suspicious signals at which a request is flagged.
///
/// Used both as the serde default for `BotConfig::signal_threshold` and by
/// `BotConfig::default()`.
fn default_signal_threshold() -> u32 {
    const DEFAULT_SIGNAL_THRESHOLD: u32 = 2;
    DEFAULT_SIGNAL_THRESHOLD
}
impl Default for BotConfig {
fn default() -> Self {
Self {
enabled: false,
mode: BotMode::Block,
good_bots: default_good_bots(),
signal_threshold: default_signal_threshold(),
}
}
}
/// Heuristic bot detector: counts suspicious request signals and turns them
/// into a decision according to its `BotConfig`.
pub struct BotDetector {
/// Settings supplied to `BotDetector::new`.
config: BotConfig,
/// Case-insensitive patterns compiled from `config.good_bots`; a User-Agent
/// matching any of them skips detection entirely.
good_bot_patterns: RegexSet,
}
impl BotDetector {
    /// Builds a detector from `config`, compiling the good-bot allowlist once.
    ///
    /// Each `good_bots` entry is regex-escaped and matched as a
    /// case-insensitive substring of the User-Agent. Entries that are empty or
    /// whitespace-only are skipped: an empty pattern matches every string, so
    /// a stray `""` in the configuration would otherwise allowlist all traffic.
    ///
    /// # Panics
    ///
    /// Panics if the pattern set cannot be compiled.
    pub fn new(config: BotConfig) -> Self {
        let patterns: Vec<String> = config
            .good_bots
            .iter()
            .filter(|name| !name.trim().is_empty())
            .map(|name| format!("(?i){}", regex::escape(name)))
            .collect();

        // An empty pattern list yields a set that never matches, so no
        // special case is needed when the allowlist is empty.
        let good_bot_patterns =
            RegexSet::new(&patterns).expect("good bot patterns must compile");

        Self {
            config,
            good_bot_patterns,
        }
    }

    /// Returns `true` when `ua` contains any configured good-bot name.
    fn is_good_bot(&self, ua: &str) -> bool {
        self.good_bot_patterns.is_match(ua)
    }

    /// Collects the suspicious signals present on `req`.
    ///
    /// Returns the number of signals together with their human-readable
    /// descriptions, in a fixed order: User-Agent problems, missing `Accept`,
    /// missing `Accept-Language`, known scanner User-Agent, missing
    /// `Accept-Encoding`.
    fn compute_signals(&self, req: &WafRequest) -> (u32, Vec<&'static str>) {
        let mut signals: Vec<&'static str> = Vec::new();

        // Header names are compared case-insensitively.
        let has_header =
            |name: &str| req.headers.keys().any(|k| k.eq_ignore_ascii_case(name));

        match &req.user_agent {
            None => signals.push("missing User-Agent"),
            Some(ua) if ua.trim().is_empty() => signals.push("empty User-Agent"),
            _ => {}
        }
        if !has_header("accept") {
            signals.push("missing Accept header");
        }
        if !has_header("accept-language") {
            signals.push("missing Accept-Language header");
        }
        if let Some(ref ua) = req.user_agent {
            if crate::rules::scanner::check_scanner(ua).is_some() {
                signals.push("known scanner user-agent");
            }
        }
        if !has_header("accept-encoding") {
            signals.push("missing Accept-Encoding header");
        }

        // At most five signals are ever pushed above, so the cast cannot truncate.
        let count = signals.len() as u32;
        (count, signals)
    }

    /// Evaluates `req` and returns a decision when it looks like a bot.
    ///
    /// Returns `None` (allow) when detection is disabled, when the User-Agent
    /// matches a configured good bot, when fewer than `signal_threshold`
    /// signals are present, or when the mode is `BotMode::LogOnly` (the
    /// detection is logged instead). Otherwise returns a 403 block decision or
    /// the challenge page, depending on the configured mode.
    ///
    /// A `signal_threshold` of 0 flags every request that is not allowlisted.
    ///
    /// NOTE(review): the good-bot bypass relies on the User-Agent string alone.
    /// NOTE(review): this method does not look at the `__waf_challenge` cookie
    /// set by the challenge page; presumably it is verified elsewhere — confirm.
    pub fn check(&self, req: &WafRequest) -> Option<WafDecision> {
        if !self.config.enabled {
            return None;
        }
        if let Some(ref ua) = req.user_agent {
            if self.is_good_bot(ua) {
                return None;
            }
        }

        let (signal_count, signals) = self.compute_signals(req);
        if signal_count < self.config.signal_threshold {
            return None;
        }

        match self.config.mode {
            BotMode::Block => {
                // Only the block decision carries a reason, so build it here
                // rather than on every flagged request.
                let reason = format!(
                    "bot detected ({signal_count} signals: {})",
                    signals.join(", ")
                );
                Some(WafDecision::Block {
                    status: 403,
                    reason,
                    rule: "bot_detection".into(),
                })
            }
            BotMode::Challenge => Some(WafDecision::Challenge {
                html: generate_challenge_html(),
            }),
            BotMode::LogOnly => {
                tracing::info!(signals = signal_count, details = ?signals, "bot detected (log-only)");
                None
            }
        }
    }
}
/// Returns the HTML page served for a challenge decision.
///
/// The page's inline script stores a `__waf_challenge` cookie (valid for one
/// hour, `SameSite=Strict`) and reloads the current location after 500 ms.
/// Clients without JavaScript see a notice asking them to enable it.
fn generate_challenge_html() -> String {
    const CHALLENGE_PAGE: &str = r#"<!DOCTYPE html>
<html>
<head><title>Checking your browser</title></head>
<body>
<noscript>Please enable JavaScript to continue.</noscript>
<p id="msg">Verifying your browser...</p>
<script>
(function(){
var ts = Date.now();
var token = btoa(String(ts) + ':' + navigator.userAgent.length);
document.cookie = '__waf_challenge=' + token + '; path=/; max-age=3600; SameSite=Strict';
document.getElementById('msg').textContent = 'Redirecting...';
setTimeout(function(){ location.reload(); }, 500);
})();
</script>
</body>
</html>"#;

    String::from(CHALLENGE_PAGE)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Header set resembling what a regular browser sends.
    const BROWSER_HEADERS: [(&str, &str); 3] = [
        ("Accept", "text/html,application/xhtml+xml"),
        ("Accept-Language", "en-US,en;q=0.9"),
        ("Accept-Encoding", "gzip, deflate, br"),
    ];

    /// Builds a `GET /` request from 10.0.0.1 with the given User-Agent and headers.
    fn request(ua: Option<&str>, headers: &[(&str, &str)]) -> WafRequest {
        WafRequest {
            client_ip: "10.0.0.1".parse().unwrap(),
            method: "GET".into(),
            path: "/".into(),
            query: None,
            headers: headers
                .iter()
                .map(|&(k, v)| (k.into(), v.into()))
                .collect(),
            body: None,
            user_agent: ua.map(String::from),
        }
    }

    /// Builds an enabled detector with the given mode and threshold and the
    /// default good-bot list.
    fn detector(mode: BotMode, signal_threshold: u32) -> BotDetector {
        BotDetector::new(BotConfig {
            enabled: true,
            mode,
            signal_threshold,
            ..Default::default()
        })
    }

    #[test]
    fn disabled_allows_all() {
        let disabled = BotDetector::new(BotConfig::default());
        assert!(disabled.check(&request(None, &[])).is_none());
    }

    #[test]
    fn good_bot_allowed() {
        let req = request(
            Some("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
            &[],
        );
        assert!(detector(BotMode::Block, 1).check(&req).is_none());
    }

    #[test]
    fn real_browser_allowed() {
        let req = request(
            Some("Mozilla/5.0 (Windows NT 10.0) Chrome/120.0"),
            &BROWSER_HEADERS,
        );
        assert!(detector(BotMode::Block, 2).check(&req).is_none());
    }

    #[test]
    fn no_ua_and_no_headers_detected() {
        let outcome = detector(BotMode::Block, 2).check(&request(None, &[]));
        assert!(matches!(outcome, Some(WafDecision::Block { .. })));
    }

    #[test]
    fn scanner_ua_detected() {
        let outcome = detector(BotMode::Block, 2).check(&request(Some("sqlmap/1.5"), &[]));
        assert!(outcome.is_some());
    }

    #[test]
    fn challenge_mode_returns_html() {
        let outcome = detector(BotMode::Challenge, 2).check(&request(None, &[]));
        if let Some(WafDecision::Challenge { html }) = outcome {
            assert!(html.contains("__waf_challenge"));
            assert!(html.contains("<script>"));
        } else {
            panic!("expected Challenge decision");
        }
    }

    #[test]
    fn log_only_mode_allows() {
        let outcome = detector(BotMode::LogOnly, 1).check(&request(None, &[]));
        assert!(outcome.is_none());
    }

    #[test]
    fn below_threshold_allowed() {
        // Only the missing User-Agent counts here, well under the threshold of 5.
        let req = request(
            None,
            &[
                ("Accept", "text/html"),
                ("Accept-Language", "en"),
                ("Accept-Encoding", "gzip"),
            ],
        );
        assert!(detector(BotMode::Block, 5).check(&req).is_none());
    }

    #[test]
    fn challenge_html_is_valid() {
        let page = generate_challenge_html();
        for needle in ["<!DOCTYPE html>", "__waf_challenge", "location.reload()"].iter() {
            assert!(page.contains(needle));
        }
    }
}