Skip to main content

systemprompt_api/services/middleware/
bot_detector.rs

1//! Early bot-classification middleware.
2//!
3//! [`detect_bots_early`] inspects the user agent, client IP, and request path
4//! before routing, attaching a [`BotMarker`] (with its [`BotType`]) to the
5//! request extensions so downstream layers can branch on known bots, scanners,
6//! and suspicious traffic without re-parsing headers.
7
8use axum::extract::{ConnectInfo, Request};
9use axum::middleware::Next;
10use axum::response::Response;
11use ipnet::IpNet;
12use std::net::SocketAddr;
13use std::sync::Arc;
14use systemprompt_analytics::matches_bot_pattern;
15
16use super::client_addr::resolve_client_ip;
17
/// Textual IPv4 prefixes (leading octets followed by a dot) that
/// [`is_datacenter_ip`] compares against the start of a client address string.
///
/// Every entry ends with `.` so that, for example, `"119.3."` cannot match
/// `119.30.x.x`.
///
/// NOTE(review): these look like cloud/hosting allocations — confirm the
/// owning providers before extending the list.
const DATACENTER_IP_PREFIXES: &[&str] = &[
    // 47.x ranges
    "47.79.",
    "47.82.",
    "47.88.",
    "47.89.",
    "47.90.",
    "47.91.",
    "47.92.",
    "47.93.",
    "47.94.",
    "47.95.",
    "47.96.",
    "47.97.",
    "47.98.",
    "47.99.",
    "47.100.",
    "47.101.",
    "47.102.",
    "47.103.",
    "47.104.",
    "47.105.",
    "47.106.",
    "47.107.",
    "47.108.",
    "47.109.",
    "47.110.",
    "47.111.",
    "47.112.",
    "47.113.",
    "47.114.",
    "47.115.",
    "47.116.",
    "47.117.",
    "47.118.",
    "47.119.",
    // Remaining ranges
    "119.29.",
    "129.28.",
    "162.14.",
    "119.3.",
    "122.112.",
];
25
26pub(super) const CHROME_MIN_VERSION: i32 = 120;
27
28#[derive(Clone, Debug)]
29pub struct BotMarker {
30    pub is_bot: bool,
31    pub bot_type: BotType,
32    pub user_agent: String,
33    pub ip_address: Option<String>,
34}
35
/// Coarse traffic category assigned by [`detect_bots_early`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent was accepted by [`is_known_bot`].
    KnownBot,
    /// The path or user agent was flagged by [`is_scanner_request`].
    Scanner,
    /// The client address was flagged by [`is_datacenter_ip`] or the user
    /// agent by [`is_outdated_browser`].
    Suspicious,
    /// None of the detection rules matched.
    Human,
}
43
44pub async fn detect_bots_early(
45    mut req: Request,
46    next: Next,
47    trusted_proxies: Arc<Vec<IpNet>>,
48) -> Response {
49    let user_agent = req
50        .headers()
51        .get("user-agent")
52        .and_then(|h| {
53            h.to_str()
54                .map_err(|e| {
55                    tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
56                    e
57                })
58                .ok()
59        })
60        .unwrap_or("")
61        .to_owned();
62
63    let ip_address = resolve_client_ip(
64        req.headers(),
65        req.extensions().get::<ConnectInfo<SocketAddr>>(),
66        &trusted_proxies,
67    )
68    .map(|a| a.to_string());
69    let uri_path = req.uri().path().to_owned();
70
71    let marker = if is_known_bot(&user_agent) {
72        BotMarker {
73            is_bot: true,
74            bot_type: BotType::KnownBot,
75            user_agent: user_agent.clone(),
76            ip_address: ip_address.clone(),
77        }
78    } else if is_datacenter_ip(ip_address.as_deref()) || is_outdated_browser(&user_agent) {
79        BotMarker {
80            is_bot: true,
81            bot_type: BotType::Suspicious,
82            user_agent: user_agent.clone(),
83            ip_address: ip_address.clone(),
84        }
85    } else if is_scanner_request(&uri_path, &user_agent) {
86        BotMarker {
87            is_bot: false,
88            bot_type: BotType::Scanner,
89            user_agent: user_agent.clone(),
90            ip_address: ip_address.clone(),
91        }
92    } else {
93        BotMarker {
94            is_bot: false,
95            bot_type: BotType::Human,
96            user_agent: user_agent.clone(),
97            ip_address,
98        }
99    };
100
101    req.extensions_mut().insert(Arc::new(marker));
102    next.run(req).await
103}
104
105pub fn is_datacenter_ip(ip: Option<&str>) -> bool {
106    ip.is_some_and(|ip_addr| {
107        DATACENTER_IP_PREFIXES
108            .iter()
109            .any(|prefix| ip_addr.starts_with(prefix))
110    })
111}
112
113pub fn is_known_bot(user_agent: &str) -> bool {
114    matches_bot_pattern(user_agent)
115}
116
117pub fn is_outdated_browser(user_agent: &str) -> bool {
118    let ua_lower = user_agent.to_lowercase();
119
120    if let Some(pos) = ua_lower.find("chrome/") {
121        let version_str = &ua_lower[pos + 7..];
122        if let Some(dot_pos) = version_str.find('.') {
123            if let Ok(major) = version_str[..dot_pos].parse::<i32>() {
124                return major < CHROME_MIN_VERSION;
125            }
126        }
127    }
128
129    false
130}
131
/// Heuristically reports whether a request looks like vulnerability scanning.
///
/// Returns `true` when the lowercased `path` contains any of the probe
/// markers, or the lowercased `user_agent` contains any of the scanner tool
/// names. Both checks are plain substring tests, so a marker also matches
/// inside a longer word (for example `admin` within `/badminton`).
pub fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    /// Path fragments commonly requested by automated probes.
    const PATH_MARKERS: &[&str] = &[
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    /// User-agent fragments identifying scanning tools.
    const AGENT_MARKERS: &[&str] = &[
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    let lowered_path = path.to_lowercase();
    if PATH_MARKERS
        .iter()
        .any(|marker| lowered_path.contains(marker))
    {
        return true;
    }

    let lowered_agent = user_agent.to_lowercase();
    AGENT_MARKERS
        .iter()
        .any(|marker| lowered_agent.contains(marker))
}