Skip to main content

systemprompt_api/services/middleware/
bot_detector.rs

1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5use systemprompt_analytics::matches_bot_pattern;
6
/// Dotted IPv4 prefixes that [`is_datacenter_ip`] compares against the start of
/// a client address string.
///
/// Every entry ends with a `.` so that a prefix only matches whole octets
/// (for example `"119.3."` cannot match `119.30.x.x`).
// NOTE(review): these look like hosting/cloud provider ranges — confirm the
// provenance of the list before extending it.
const DATACENTER_IP_PREFIXES: &[&str] = &[
    // Isolated 47.x entries.
    "47.79.",
    "47.82.",
    // 47.88 through 47.119, contiguous.
    "47.88.",
    "47.89.",
    "47.90.",
    "47.91.",
    "47.92.",
    "47.93.",
    "47.94.",
    "47.95.",
    "47.96.",
    "47.97.",
    "47.98.",
    "47.99.",
    "47.100.",
    "47.101.",
    "47.102.",
    "47.103.",
    "47.104.",
    "47.105.",
    "47.106.",
    "47.107.",
    "47.108.",
    "47.109.",
    "47.110.",
    "47.111.",
    "47.112.",
    "47.113.",
    "47.114.",
    "47.115.",
    "47.116.",
    "47.117.",
    "47.118.",
    "47.119.",
    // Remaining individual ranges.
    "119.29.",
    "129.28.",
    "162.14.",
    "119.3.",
    "122.112.",
];
16
/// Lowest Chrome major version that is still treated as current; a user agent
/// advertising a smaller major version is reported by `is_outdated_browser`.
const CHROME_MIN_VERSION: i32 = 120;
18
19#[derive(Clone, Debug)]
20pub struct BotMarker {
21    pub is_bot: bool,
22    pub bot_type: BotType,
23    pub user_agent: String,
24    pub ip_address: Option<String>,
25}
26
/// Category assigned to a request by `detect_bots_early`.
///
/// The rules are evaluated in the order `KnownBot`, `Suspicious`, `Scanner`;
/// the first one that matches wins and `Human` is the fallback.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent matched `matches_bot_pattern`.
    KnownBot,
    /// The request path or user agent contained a scanner marker.
    Scanner,
    /// The client address matched a datacenter prefix, or the user agent
    /// advertised an outdated Chrome version.
    Suspicious,
    /// No rule matched.
    Human,
}
34
35pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
36    let user_agent = req
37        .headers()
38        .get("user-agent")
39        .and_then(|h| {
40            h.to_str()
41                .map_err(|e| {
42                    tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
43                    e
44                })
45                .ok()
46        })
47        .unwrap_or("")
48        .to_string();
49
50    let ip_address = extract_ip_address(&req);
51    let uri_path = req.uri().path().to_string();
52
53    let marker = if is_known_bot(&user_agent) {
54        BotMarker {
55            is_bot: true,
56            bot_type: BotType::KnownBot,
57            user_agent: user_agent.clone(),
58            ip_address: ip_address.clone(),
59        }
60    } else if is_datacenter_ip(ip_address.as_deref()) || is_outdated_browser(&user_agent) {
61        BotMarker {
62            is_bot: true,
63            bot_type: BotType::Suspicious,
64            user_agent: user_agent.clone(),
65            ip_address: ip_address.clone(),
66        }
67    } else if is_scanner_request(&uri_path, &user_agent) {
68        BotMarker {
69            is_bot: false,
70            bot_type: BotType::Scanner,
71            user_agent: user_agent.clone(),
72            ip_address: ip_address.clone(),
73        }
74    } else {
75        BotMarker {
76            is_bot: false,
77            bot_type: BotType::Human,
78            user_agent: user_agent.clone(),
79            ip_address,
80        }
81    };
82
83    req.extensions_mut().insert(Arc::new(marker));
84    next.run(req).await
85}
86
87fn extract_ip_address(req: &Request) -> Option<String> {
88    req.headers()
89        .get("x-forwarded-for")
90        .and_then(|v| v.to_str().ok())
91        .and_then(|s| s.split(',').next())
92        .map(|s| s.trim().to_string())
93        .or_else(|| {
94            req.headers()
95                .get("x-real-ip")
96                .and_then(|v| v.to_str().ok())
97                .map(ToString::to_string)
98        })
99        .or_else(|| {
100            req.headers()
101                .get("cf-connecting-ip")
102                .and_then(|v| v.to_str().ok())
103                .map(ToString::to_string)
104        })
105}
106
107fn is_datacenter_ip(ip: Option<&str>) -> bool {
108    ip.is_some_and(|ip_addr| {
109        DATACENTER_IP_PREFIXES
110            .iter()
111            .any(|prefix| ip_addr.starts_with(prefix))
112    })
113}
114
115fn is_known_bot(user_agent: &str) -> bool {
116    matches_bot_pattern(user_agent)
117}
118
119fn is_outdated_browser(user_agent: &str) -> bool {
120    let ua_lower = user_agent.to_lowercase();
121
122    if let Some(pos) = ua_lower.find("chrome/") {
123        let version_str = &ua_lower[pos + 7..];
124        if let Some(dot_pos) = version_str.find('.') {
125            if let Ok(major) = version_str[..dot_pos].parse::<i32>() {
126                return major < CHROME_MIN_VERSION;
127            }
128        }
129    }
130
131    false
132}
133
/// Reports whether the request looks like it comes from a vulnerability
/// scanner.
///
/// Both inputs are lowercased and then tested with plain substring matching:
/// the result is `true` when `path` contains any path marker or `user_agent`
/// contains any agent marker. Because the match is a substring test, a marker
/// may match in the middle of a longer word.
fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    const PATH_MARKERS: [&str; 12] = [
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    const AGENT_MARKERS: [&str; 10] = [
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    let normalized_path = path.to_lowercase();
    let normalized_agent = user_agent.to_lowercase();

    for marker in PATH_MARKERS.iter() {
        if normalized_path.contains(*marker) {
            return true;
        }
    }

    for marker in AGENT_MARKERS.iter() {
        if normalized_agent.contains(*marker) {
            return true;
        }
    }

    false
}