Skip to main content

forge_runtime/signals/
bot.rs

1//! Bot detection via User-Agent pattern matching.
2//!
3//! Classifies requests as bot or human based on known crawler patterns.
4//! Bot events are stored with `is_bot = true` for dashboard filtering.
5
/// Substrings that mark a User-Agent as automated traffic.
///
/// Every entry is written in lowercase: [`is_bot_lower`] performs a plain
/// substring search, so callers are expected to lowercase the header first
/// (as [`is_bot`] does), which makes the overall match case-insensitive.
const BOT_PATTERNS: &[&str] = &[
    // --- Search engine crawlers ---
    "googlebot",
    "bingbot",
    "yandexbot",
    "baiduspider",
    "duckduckbot",
    "slurp",
    "sogou",
    "exabot",
    "ia_archiver",
    // --- Social media link-preview fetchers ---
    "facebookexternalhit",
    "twitterbot",
    "linkedinbot",
    "whatsapp",
    "telegrambot",
    "discordbot",
    "slackbot",
    // --- Uptime monitors, SEO crawlers and auditing tools ---
    "uptimerobot",
    "pingdom",
    "site24x7",
    "statuscake",
    "semrushbot",
    "ahrefsbot",
    "mj12bot",
    "dotbot",
    "rogerbot",
    "screaming frog",
    // --- Headless / automated browsers ---
    "headlesschrome",
    "phantomjs",
    "puppeteer",
    "playwright",
    // --- Generic markers and HTTP client libraries ---
    // The trailing `/` or `;` on some entries keeps the match anchored to a
    // product token (e.g. `curl/7.68.0`) instead of any word containing it.
    "bot/",
    "bot;",
    "crawler",
    "spider",
    "scraper",
    "http-client",
    "python-requests",
    "python-urllib",
    // Already covered by "http-client" above; listed explicitly as well.
    "go-http-client",
    "java/",
    "wget",
    "curl/",
    "libwww",
    "apache-httpclient",
    "okhttp",
    "node-fetch",
    "axios",
    "postman",
];
62
63/// Check if a User-Agent string belongs to a known bot.
64pub fn is_bot(user_agent: Option<&str>) -> bool {
65    let ua = match user_agent {
66        Some(ua) if !ua.is_empty() => ua,
67        _ => return false,
68    };
69    is_bot_lower(&ua.to_ascii_lowercase())
70}
71
72/// Check if a pre-lowercased User-Agent string belongs to a known bot.
73pub fn is_bot_lower(ua_lower: &str) -> bool {
74    BOT_PATTERNS
75        .iter()
76        .any(|pattern| ua_lower.contains(pattern))
77}
78
// Unit tests for User-Agent classification.
//
// The functions under test are synchronous and never await anything, so the
// tests use the plain `#[test]` harness rather than an async runtime.
#[cfg(test)]
// NOTE(review): no test below currently uses `unwrap` or indexing; the allow
// is kept as-is, presumably as a convention for test modules — confirm.
#[allow(clippy::unwrap_used, clippy::indexing_slicing)]
mod tests {
    use super::*;

    #[test]
    fn detects_googlebot() {
        assert!(is_bot(Some(
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        )));
    }

    #[test]
    fn detects_headless_chrome() {
        assert!(is_bot(Some("Mozilla/5.0 HeadlessChrome/90.0.4430.212")));
    }

    #[test]
    fn detects_curl() {
        assert!(is_bot(Some("curl/7.68.0")));
    }

    #[test]
    fn allows_real_browser() {
        assert!(!is_bot(Some(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )));
    }

    #[test]
    fn allows_mobile_browser() {
        assert!(!is_bot(Some(
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
        )));
    }

    #[test]
    fn missing_ua_is_not_bot() {
        assert!(!is_bot(None));
        assert!(!is_bot(Some("")));
    }

    #[test]
    fn matching_is_case_insensitive() {
        assert!(is_bot(Some("GOOGLEBOT/2.1")));
        assert!(is_bot(Some("Python-Requests/2.31.0")));
    }

    #[test]
    fn lower_variant_matches_prelowered_input() {
        assert!(is_bot_lower("curl/7.68.0"));
        assert!(!is_bot_lower(""));
    }
}