// bext-waf 0.2.0
//
// Web Application Firewall for bext — rate limiting, IP filtering, GeoIP, rule engine
// Documentation
//! Multi-signal bot detection with configurable response modes.
//!
//! Scores incoming requests against five behavioural signals (missing
//! User-Agent, Accept, Accept-Language, Accept-Encoding headers and known
//! scanner user-agent strings).  When the signal count meets the configured
//! threshold, the request is blocked, challenged with JavaScript, or logged
//! depending on [`BotMode`].  Known good bots (Googlebot, Bingbot, etc.) are
//! exempted by default.

use regex::RegexSet;
use serde::{Deserialize, Serialize};

use crate::{WafDecision, WafRequest};

/// Bot detection mode.
///
/// Selects what [`BotDetector::check`] does once a request reaches the
/// configured signal threshold. Variants are (de)serialized in `snake_case`
/// (`block`, `challenge`, `log_only`).
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BotMode {
    /// Block detected bots with a 403 response. This is the default mode.
    #[default]
    Block,
    /// Issue a JS challenge page to suspected bots instead of blocking them.
    Challenge,
    /// Log only; do not block. The request is allowed through.
    LogOnly,
}

/// Configuration for bot detection.
///
/// Every field has a serde default, so an empty configuration section
/// deserializes to a disabled detector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BotConfig {
    /// Master switch; when `false` (the default) no request is inspected.
    #[serde(default)]
    pub enabled: bool,
    /// What to do with a request that reaches the signal threshold.
    /// Defaults to [`BotMode::Block`].
    #[serde(default)]
    pub mode: BotMode,
    /// User-agent substrings for known good bots (always allowed).
    /// Entries are matched case-insensitively as literal text.
    #[serde(default = "default_good_bots")]
    pub good_bots: Vec<String>,
    /// Minimum number of signals to trigger bot detection (default: 2).
    ///
    /// The comparison is `signal_count < signal_threshold`, so a threshold of
    /// `0` makes every request that is not a good bot count as detected.
    #[serde(default = "default_signal_threshold")]
    pub signal_threshold: u32,
}

/// Built-in allow-list of crawler User-Agent markers, used when the
/// configuration does not supply its own `good_bots` list.
fn default_good_bots() -> Vec<String> {
    const NAMES: [&str; 10] = [
        "Googlebot",
        "Bingbot",
        "Slurp",
        "DuckDuckBot",
        "facebookexternalhit",
        "Twitterbot",
        "LinkedInBot",
        "Slackbot",
        "Applebot",
        "YandexBot",
    ];

    NAMES.iter().map(|name| (*name).to_owned()).collect()
}

/// Serde default for `BotConfig::signal_threshold`: two signals.
fn default_signal_threshold() -> u32 {
    const DEFAULT_THRESHOLD: u32 = 2;
    DEFAULT_THRESHOLD
}

impl Default for BotConfig {
    fn default() -> Self {
        Self {
            enabled: false,
            mode: BotMode::Block,
            good_bots: default_good_bots(),
            signal_threshold: default_signal_threshold(),
        }
    }
}

/// Bot detection engine.
///
/// Holds the configuration together with a regex set that is compiled once
/// in [`BotDetector::new`] from `config.good_bots`.
pub struct BotDetector {
    /// Settings this detector was built with.
    config: BotConfig,
    /// Case-insensitive literal patterns built from the configured good-bot names.
    good_bot_patterns: RegexSet,
}

impl BotDetector {
    /// Build a detector from `config`, compiling the good-bot allow-list once.
    ///
    /// Each `good_bots` entry is regex-escaped and matched as a
    /// case-insensitive literal substring of the User-Agent. Blank
    /// (empty or whitespace-only) entries are skipped: an empty pattern
    /// matches every string and would silently exempt all traffic.
    ///
    /// # Panics
    ///
    /// Panics if the regex set cannot be compiled from the escaped entries.
    pub fn new(config: BotConfig) -> Self {
        let patterns = config
            .good_bots
            .iter()
            .filter(|name| !name.trim().is_empty())
            .map(|name| format!("(?i){}", regex::escape(name)));

        // `RegexSet::new` accepts an empty pattern list (yielding a set that
        // never matches), so no special case is needed for an empty allow-list.
        let good_bot_patterns = RegexSet::new(patterns).expect("good bot patterns must compile");

        Self {
            config,
            good_bot_patterns,
        }
    }

    /// Whether `ua` contains any configured good-bot name (case-insensitive).
    fn is_good_bot(&self, ua: &str) -> bool {
        self.good_bot_patterns.is_match(ua)
    }

    /// Compute bot signals from the request.
    ///
    /// Returns `(signal_count, signal_descriptions)`; the count always equals
    /// the number of descriptions. Descriptions are listed in a fixed order:
    /// User-Agent, Accept, Accept-Language, scanner, Accept-Encoding.
    fn compute_signals(&self, req: &WafRequest) -> (u32, Vec<&'static str>) {
        // Header names are compared ASCII case-insensitively.
        let has_header = |wanted: &str| {
            req.headers
                .keys()
                .any(|name| name.eq_ignore_ascii_case(wanted))
        };

        // At most five signals can fire.
        let mut signals: Vec<&'static str> = Vec::with_capacity(5);

        // Signal 1: No User-Agent at all, or one that is blank.
        match req.user_agent.as_deref() {
            None => signals.push("missing User-Agent"),
            Some(ua) if ua.trim().is_empty() => signals.push("empty User-Agent"),
            Some(_) => {}
        }

        // Signal 2: Missing Accept header.
        if !has_header("accept") {
            signals.push("missing Accept header");
        }

        // Signal 3: Missing Accept-Language header.
        if !has_header("accept-language") {
            signals.push("missing Accept-Language header");
        }

        // Signal 4: Known scanner user-agent.
        if let Some(ref ua) = req.user_agent {
            if crate::rules::scanner::check_scanner(ua).is_some() {
                signals.push("known scanner user-agent");
            }
        }

        // Signal 5: Missing Accept-Encoding header (real browsers always send this).
        if !has_header("accept-encoding") {
            signals.push("missing Accept-Encoding header");
        }

        // Cannot truncate: the list holds at most five entries.
        let count = signals.len() as u32;
        (count, signals)
    }

    /// Check a request for bot-like behavior.
    ///
    /// Returns `None` (allow) when detection is disabled, when the
    /// User-Agent matches a good-bot entry, when fewer than
    /// `signal_threshold` signals fire, or when the mode is
    /// [`BotMode::LogOnly`] (the detection is logged instead). Otherwise
    /// returns a 403 [`WafDecision::Block`] or a [`WafDecision::Challenge`]
    /// page, depending on the configured mode.
    ///
    /// The good-bot exemption is based on the User-Agent string alone.
    /// This method does not inspect the `__waf_challenge` cookie set by the
    /// challenge page.
    // NOTE(review): presumably the caller validates the challenge cookie
    // before invoking this check — confirm, otherwise a challenged client
    // that still meets the threshold is challenged again on reload.
    pub fn check(&self, req: &WafRequest) -> Option<WafDecision> {
        if !self.config.enabled {
            return None;
        }

        // Allow known good bots.
        if let Some(ref ua) = req.user_agent {
            if self.is_good_bot(ua) {
                return None;
            }
        }

        let (signal_count, signals) = self.compute_signals(req);

        if signal_count < self.config.signal_threshold {
            return None;
        }

        match self.config.mode {
            BotMode::Block => Some(WafDecision::Block {
                status: 403,
                // Only the block decision carries the reason, so it is
                // formatted here rather than for every mode.
                reason: format!(
                    "bot detected ({signal_count} signals: {})",
                    signals.join(", ")
                ),
                rule: "bot_detection".into(),
            }),
            BotMode::Challenge => Some(WafDecision::Challenge {
                html: generate_challenge_html(),
            }),
            BotMode::LogOnly => {
                tracing::info!(signals = signal_count, details = ?signals, "bot detected (log-only)");
                None
            }
        }
    }
}

/// Return the static HTML page served for `BotMode::Challenge`.
///
/// The inline script stores a `__waf_challenge` cookie (base64 of the current
/// timestamp and the User-Agent length) with a one-hour lifetime and reloads
/// the page after 500 ms. Clients that do not execute JavaScript never set
/// the cookie and only see the `<noscript>` notice.
fn generate_challenge_html() -> String {
    const CHALLENGE_PAGE: &str = r#"<!DOCTYPE html>
<html>
<head><title>Checking your browser</title></head>
<body>
<noscript>Please enable JavaScript to continue.</noscript>
<p id="msg">Verifying your browser...</p>
<script>
(function(){
  var ts = Date.now();
  var token = btoa(String(ts) + ':' + navigator.userAgent.length);
  document.cookie = '__waf_challenge=' + token + '; path=/; max-age=3600; SameSite=Strict';
  document.getElementById('msg').textContent = 'Redirecting...';
  setTimeout(function(){ location.reload(); }, 500);
})();
</script>
</body>
</html>"#;

    String::from(CHALLENGE_PAGE)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `GET /` request from 10.0.0.1 with the given User-Agent and headers.
    fn make_req_with_headers(ua: Option<&str>, headers: Vec<(&str, &str)>) -> WafRequest {
        WafRequest {
            client_ip: "10.0.0.1".parse().unwrap(),
            method: "GET".into(),
            path: "/".into(),
            query: None,
            headers: headers
                .into_iter()
                .map(|(name, value)| (name.into(), value.into()))
                .collect(),
            body: None,
            user_agent: ua.map(String::from),
        }
    }

    /// The three content-negotiation headers the detector looks for.
    fn browser_headers() -> Vec<(&'static str, &'static str)> {
        vec![
            ("Accept", "text/html,application/xhtml+xml"),
            ("Accept-Language", "en-US,en;q=0.9"),
            ("Accept-Encoding", "gzip, deflate, br"),
        ]
    }

    /// Enabled detector with the default good-bot list and the given mode/threshold.
    fn enabled_detector(mode: BotMode, signal_threshold: u32) -> BotDetector {
        BotDetector::new(BotConfig {
            enabled: true,
            mode,
            signal_threshold,
            ..Default::default()
        })
    }

    #[test]
    fn disabled_allows_all() {
        // The default configuration has `enabled: false`.
        let detector = BotDetector::new(BotConfig::default());
        let bare_request = make_req_with_headers(None, vec![]);
        assert!(detector.check(&bare_request).is_none());
    }

    #[test]
    fn good_bot_allowed() {
        let detector = enabled_detector(BotMode::Block, 1);
        // Missing all headers, but it's a good bot.
        let googlebot = make_req_with_headers(
            Some("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
            vec![],
        );
        assert!(detector.check(&googlebot).is_none());
    }

    #[test]
    fn real_browser_allowed() {
        let detector = enabled_detector(BotMode::Block, 2);
        let browser = make_req_with_headers(
            Some("Mozilla/5.0 (Windows NT 10.0) Chrome/120.0"),
            browser_headers(),
        );
        assert!(detector.check(&browser).is_none());
    }

    #[test]
    fn no_ua_and_no_headers_detected() {
        let detector = enabled_detector(BotMode::Block, 2);
        let bare_request = make_req_with_headers(None, vec![]);
        let outcome = detector.check(&bare_request);
        assert!(matches!(outcome, Some(WafDecision::Block { .. })));
    }

    #[test]
    fn scanner_ua_detected() {
        let detector = enabled_detector(BotMode::Block, 2);
        // NOTE(review): with no headers, the three missing-header signals
        // already reach the threshold of 2, so this passes whether or not
        // the scanner signal fires.
        let scanner = make_req_with_headers(Some("sqlmap/1.5"), vec![]);
        assert!(detector.check(&scanner).is_some());
    }

    #[test]
    fn challenge_mode_returns_html() {
        let detector = enabled_detector(BotMode::Challenge, 2);
        let bare_request = make_req_with_headers(None, vec![]);

        if let Some(WafDecision::Challenge { html }) = detector.check(&bare_request) {
            assert!(html.contains("__waf_challenge"));
            assert!(html.contains("<script>"));
        } else {
            panic!("expected Challenge decision");
        }
    }

    #[test]
    fn log_only_mode_allows() {
        let detector = enabled_detector(BotMode::LogOnly, 1);
        let bare_request = make_req_with_headers(None, vec![]);
        // Log-only should return None (allow).
        assert!(detector.check(&bare_request).is_none());
    }

    #[test]
    fn below_threshold_allowed() {
        // Very high threshold.
        let detector = enabled_detector(BotMode::Block, 5);
        // Only 1 signal (missing UA), but threshold is 5.
        let headers = vec![
            ("Accept", "text/html"),
            ("Accept-Language", "en"),
            ("Accept-Encoding", "gzip"),
        ];
        let request = make_req_with_headers(None, headers);
        assert!(detector.check(&request).is_none());
    }

    #[test]
    fn challenge_html_is_valid() {
        let page = generate_challenge_html();
        for fragment in ["<!DOCTYPE html>", "__waf_challenge", "location.reload()"] {
            assert!(page.contains(fragment));
        }
    }
}