Skip to main content

bext_waf/
bot.rs

//! Multi-signal bot detection with configurable response modes.
//!
//! Scores incoming requests against five behavioural signals: a missing or
//! blank User-Agent, missing Accept, Accept-Language and Accept-Encoding
//! headers, and known scanner user-agent strings.  When the signal count meets
//! the configured threshold, the request is blocked, challenged with
//! JavaScript, or only logged, depending on [`BotMode`].  Known good bots
//! (Googlebot, Bingbot, etc., matched by User-Agent substring) are exempted by default.
9
10use regex::RegexSet;
11use serde::{Deserialize, Serialize};
12
13use crate::{WafDecision, WafRequest};
14
/// Action taken by [`BotDetector::check`] once a request reaches the signal
/// threshold.
///
/// Serialized in `snake_case` (`block`, `challenge`, `log_only`); the default
/// variant is [`BotMode::Block`].
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BotMode {
    /// Reject the request: `check` returns a `WafDecision::Block` with status 403.
    #[default]
    Block,
    /// Answer with a JavaScript challenge page: `check` returns a
    /// `WafDecision::Challenge` carrying the HTML.
    Challenge,
    /// Emit a `tracing` info event and let the request through (`check`
    /// returns `None`).
    LogOnly,
}
27
/// Configuration for bot detection.
///
/// Every field has a serde default, so an empty config section deserializes
/// to a disabled detector with the built-in good-bot list and a threshold of 2.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BotConfig {
    /// Master switch. When `false` (the default) `BotDetector::check` always
    /// returns `None`.
    #[serde(default)]
    pub enabled: bool,
    /// What to do with a request classified as a bot. Defaults to
    /// [`BotMode::Block`].
    #[serde(default)]
    pub mode: BotMode,
    /// User-agent substrings for known good bots (always allowed).
    ///
    /// Entries are treated as literal text (regex metacharacters are escaped)
    /// and matched case-insensitively anywhere in the User-Agent.
    #[serde(default = "default_good_bots")]
    pub good_bots: Vec<String>,
    /// Minimum number of signals to trigger bot detection. Defaults to 2.
    #[serde(default = "default_signal_threshold")]
    pub signal_threshold: u32,
}
42
/// Returns the built-in allowlist of well-known crawler User-Agent substrings.
///
/// Used both as the serde default for `BotConfig::good_bots` and by
/// `BotConfig::default()`.
fn default_good_bots() -> Vec<String> {
    // Kept as a static table so the list reads as data rather than code.
    const KNOWN_CRAWLERS: [&str; 10] = [
        "Googlebot",
        "Bingbot",
        "Slurp",
        "DuckDuckBot",
        "facebookexternalhit",
        "Twitterbot",
        "LinkedInBot",
        "Slackbot",
        "Applebot",
        "YandexBot",
    ];

    KNOWN_CRAWLERS
        .iter()
        .map(|name| (*name).to_owned())
        .collect()
}
57
/// Returns the default number of signals required before a request is
/// treated as a bot (serde default for `BotConfig::signal_threshold`).
fn default_signal_threshold() -> u32 {
    const DEFAULT_SIGNAL_THRESHOLD: u32 = 2;
    DEFAULT_SIGNAL_THRESHOLD
}
61
62impl Default for BotConfig {
63    fn default() -> Self {
64        Self {
65            enabled: false,
66            mode: BotMode::Block,
67            good_bots: default_good_bots(),
68            signal_threshold: default_signal_threshold(),
69        }
70    }
71}
72
/// Bot detection engine.
///
/// Built once from a [`BotConfig`] via `BotDetector::new`; the good-bot
/// allowlist is compiled at construction time so `check` only has to run the
/// pre-built matcher.
pub struct BotDetector {
    /// The configuration this detector was built from.
    config: BotConfig,
    /// Case-insensitive literal patterns compiled from `config.good_bots`.
    good_bot_patterns: RegexSet,
}
78
79impl BotDetector {
80    pub fn new(config: BotConfig) -> Self {
81        // Build the good-bot regex set from this instance's config.
82        let patterns: Vec<String> = config
83            .good_bots
84            .iter()
85            .map(|b| format!("(?i){}", regex::escape(b)))
86            .collect();
87        let good_bot_patterns = if patterns.is_empty() {
88            RegexSet::new::<&[&str], &&str>(&[]).expect("empty regex set")
89        } else {
90            RegexSet::new(&patterns).expect("good bot patterns must compile")
91        };
92
93        Self {
94            config,
95            good_bot_patterns,
96        }
97    }
98
99    fn is_good_bot(&self, ua: &str) -> bool {
100        self.good_bot_patterns.is_match(ua)
101    }
102
103    /// Compute bot signals from the request.
104    /// Returns (signal_count, signal_descriptions).
105    fn compute_signals(&self, req: &WafRequest) -> (u32, Vec<&'static str>) {
106        let mut signals = Vec::new();
107        let mut count = 0u32;
108
109        // Signal 1: No User-Agent at all.
110        match &req.user_agent {
111            None => {
112                signals.push("missing User-Agent");
113                count += 1;
114            }
115            Some(ua) if ua.trim().is_empty() => {
116                signals.push("empty User-Agent");
117                count += 1;
118            }
119            _ => {}
120        }
121
122        // Signal 2: Missing Accept header.
123        let has_accept = req.headers.keys().any(|k| k.eq_ignore_ascii_case("accept"));
124        if !has_accept {
125            signals.push("missing Accept header");
126            count += 1;
127        }
128
129        // Signal 3: Missing Accept-Language header.
130        let has_lang = req
131            .headers
132            .keys()
133            .any(|k| k.eq_ignore_ascii_case("accept-language"));
134        if !has_lang {
135            signals.push("missing Accept-Language header");
136            count += 1;
137        }
138
139        // Signal 4: Known scanner user-agent.
140        if let Some(ref ua) = req.user_agent {
141            if crate::rules::scanner::check_scanner(ua).is_some() {
142                signals.push("known scanner user-agent");
143                count += 1;
144            }
145        }
146
147        // Signal 5: Missing Accept-Encoding header (real browsers always send this).
148        let has_encoding = req
149            .headers
150            .keys()
151            .any(|k| k.eq_ignore_ascii_case("accept-encoding"));
152        if !has_encoding {
153            signals.push("missing Accept-Encoding header");
154            count += 1;
155        }
156
157        (count, signals)
158    }
159
160    /// Check a request for bot-like behavior.
161    pub fn check(&self, req: &WafRequest) -> Option<WafDecision> {
162        if !self.config.enabled {
163            return None;
164        }
165
166        // Allow known good bots.
167        if let Some(ref ua) = req.user_agent {
168            if self.is_good_bot(ua) {
169                return None;
170            }
171        }
172
173        let (signal_count, signals) = self.compute_signals(req);
174
175        if signal_count < self.config.signal_threshold {
176            return None;
177        }
178
179        let reason = format!(
180            "bot detected ({signal_count} signals: {})",
181            signals.join(", ")
182        );
183
184        match self.config.mode {
185            BotMode::Block => Some(WafDecision::Block {
186                status: 403,
187                reason,
188                rule: "bot_detection".into(),
189            }),
190            BotMode::Challenge => Some(WafDecision::Challenge {
191                html: generate_challenge_html(),
192            }),
193            BotMode::LogOnly => {
194                tracing::info!(signals = signal_count, details = ?signals, "bot detected (log-only)");
195                None
196            }
197        }
198    }
199}
200
/// Generate a minimal JS challenge page. Bots without a JS engine will fail.
///
/// The inline script stores a `__waf_challenge` cookie (base64 of the current
/// timestamp and the User-Agent length, valid for one hour) and reloads the
/// page after 500 ms so the cookie is sent with the retried request.
///
/// The reload only happens if the cookie was actually stored. When the
/// browser rejects cookies the page asks the user to enable them instead of
/// reloading forever and hammering the server twice a second.
fn generate_challenge_html() -> String {
    r#"<!DOCTYPE html>
<html>
<head><title>Checking your browser</title></head>
<body>
<noscript>Please enable JavaScript to continue.</noscript>
<p id="msg">Verifying your browser...</p>
<script>
(function(){
  var msg = document.getElementById('msg');
  var ts = Date.now();
  var token = btoa(String(ts) + ':' + navigator.userAgent.length);
  document.cookie = '__waf_challenge=' + token + '; path=/; max-age=3600; SameSite=Strict';
  if (document.cookie.indexOf('__waf_challenge=') === -1) {
    msg.textContent = 'Please enable cookies to continue.';
    return;
  }
  msg.textContent = 'Redirecting...';
  setTimeout(function(){ location.reload(); }, 500);
})();
</script>
</body>
</html>"#
        .to_string()
}
222
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `GET /` request from 10.0.0.1 with the given User-Agent and headers.
    fn make_req_with_headers(ua: Option<&str>, headers: Vec<(&str, &str)>) -> WafRequest {
        WafRequest {
            client_ip: "10.0.0.1".parse().unwrap(),
            method: "GET".into(),
            path: "/".into(),
            query: None,
            headers: headers
                .into_iter()
                .map(|(name, value)| (name.into(), value.into()))
                .collect(),
            body: None,
            user_agent: ua.map(String::from),
        }
    }

    /// The three content-negotiation headers a typical browser sends.
    fn browser_headers() -> Vec<(&'static str, &'static str)> {
        vec![
            ("Accept", "text/html,application/xhtml+xml"),
            ("Accept-Language", "en-US,en;q=0.9"),
            ("Accept-Encoding", "gzip, deflate, br"),
        ]
    }

    /// An enabled detector with the default good-bot list.
    fn enabled_detector(mode: BotMode, signal_threshold: u32) -> BotDetector {
        BotDetector::new(BotConfig {
            enabled: true,
            mode,
            signal_threshold,
            ..Default::default()
        })
    }

    #[test]
    fn disabled_allows_all() {
        // The default config has detection switched off.
        let detector = BotDetector::new(BotConfig::default());
        let bare_request = make_req_with_headers(None, vec![]);
        assert!(detector.check(&bare_request).is_none());
    }

    #[test]
    fn good_bot_allowed() {
        let detector = enabled_detector(BotMode::Block, 1);
        // Missing all headers, but it's a good bot.
        let crawler = make_req_with_headers(
            Some("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
            vec![],
        );
        assert!(detector.check(&crawler).is_none());
    }

    #[test]
    fn real_browser_allowed() {
        let detector = enabled_detector(BotMode::Block, 2);
        let browser = make_req_with_headers(
            Some("Mozilla/5.0 (Windows NT 10.0) Chrome/120.0"),
            browser_headers(),
        );
        assert!(detector.check(&browser).is_none());
    }

    #[test]
    fn no_ua_and_no_headers_detected() {
        let detector = enabled_detector(BotMode::Block, 2);
        let bare_request = make_req_with_headers(None, vec![]);
        assert!(matches!(
            detector.check(&bare_request),
            Some(WafDecision::Block { .. })
        ));
    }

    #[test]
    fn scanner_ua_detected() {
        let detector = enabled_detector(BotMode::Block, 2);
        let scanner = make_req_with_headers(Some("sqlmap/1.5"), vec![]);
        assert!(detector.check(&scanner).is_some());
    }

    #[test]
    fn challenge_mode_returns_html() {
        let detector = enabled_detector(BotMode::Challenge, 2);
        let bare_request = make_req_with_headers(None, vec![]);
        if let Some(WafDecision::Challenge { html }) = detector.check(&bare_request) {
            assert!(html.contains("__waf_challenge"));
            assert!(html.contains("<script>"));
        } else {
            panic!("expected Challenge decision");
        }
    }

    #[test]
    fn log_only_mode_allows() {
        let detector = enabled_detector(BotMode::LogOnly, 1);
        let bare_request = make_req_with_headers(None, vec![]);
        // Log-only should return None (allow).
        assert!(detector.check(&bare_request).is_none());
    }

    #[test]
    fn below_threshold_allowed() {
        // Very high threshold.
        let detector = enabled_detector(BotMode::Block, 5);
        // Only 1 signal (missing UA), but threshold is 5.
        let headers = vec![
            ("Accept", "text/html"),
            ("Accept-Language", "en"),
            ("Accept-Encoding", "gzip"),
        ];
        let request = make_req_with_headers(None, headers);
        assert!(detector.check(&request).is_none());
    }

    #[test]
    fn challenge_html_is_valid() {
        let page = generate_challenge_html();
        for needle in ["<!DOCTYPE html>", "__waf_challenge", "location.reload()"] {
            assert!(page.contains(needle));
        }
    }
}