Skip to main content

systemprompt_api/services/middleware/
bot_detector.rs

1use axum::extract::{ConnectInfo, Request};
2use axum::middleware::Next;
3use axum::response::Response;
4use ipnet::IpNet;
5use std::net::SocketAddr;
6use std::sync::Arc;
7use systemprompt_analytics::matches_bot_pattern;
8
9use super::client_addr::resolve_client_ip;
10
/// Textual IPv4 prefixes that `is_datacenter_ip` treats as datacenter traffic.
///
/// Each entry ends with a `.` so a plain `starts_with` comparison stops at an octet
/// boundary (`"119.3."` matches `119.3.x.y` but not `119.30.x.y`).
// NOTE(review): the list presumably covers cloud/hosting provider ranges — confirm
// ownership of a range before adding or removing entries.
const DATACENTER_IP_PREFIXES: &[&str] = &[
    // 47.79 – 47.89
    "47.79.", "47.82.", "47.88.", "47.89.",
    // 47.90 – 47.99
    "47.90.", "47.91.", "47.92.", "47.93.", "47.94.",
    "47.95.", "47.96.", "47.97.", "47.98.", "47.99.",
    // 47.100 – 47.119
    "47.100.", "47.101.", "47.102.", "47.103.", "47.104.",
    "47.105.", "47.106.", "47.107.", "47.108.", "47.109.",
    "47.110.", "47.111.", "47.112.", "47.113.", "47.114.",
    "47.115.", "47.116.", "47.117.", "47.118.", "47.119.",
    // Remaining individual /16-style prefixes
    "119.29.", "129.28.", "162.14.", "119.3.", "122.112.",
];

/// Lowest Chrome major version that `is_outdated_browser` does not flag; any
/// smaller major version is reported as outdated.
const CHROME_MIN_VERSION: i32 = 120;
20
21#[derive(Clone, Debug)]
22pub struct BotMarker {
23    pub is_bot: bool,
24    pub bot_type: BotType,
25    pub user_agent: String,
26    pub ip_address: Option<String>,
27}
28
/// Category assigned to a request by `detect_bots_early`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent was recognised by `is_known_bot`.
    KnownBot,
    /// The path or user agent matched `is_scanner_request`.
    Scanner,
    /// The client IP matched `is_datacenter_ip` or the user agent matched
    /// `is_outdated_browser`.
    Suspicious,
    /// None of the detection checks matched.
    Human,
}
36
37pub async fn detect_bots_early(
38    mut req: Request,
39    next: Next,
40    trusted_proxies: Arc<Vec<IpNet>>,
41) -> Response {
42    let user_agent = req
43        .headers()
44        .get("user-agent")
45        .and_then(|h| {
46            h.to_str()
47                .map_err(|e| {
48                    tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
49                    e
50                })
51                .ok()
52        })
53        .unwrap_or("")
54        .to_string();
55
56    let ip_address = resolve_client_ip(
57        req.headers(),
58        req.extensions().get::<ConnectInfo<SocketAddr>>(),
59        &trusted_proxies,
60    )
61    .map(|a| a.to_string());
62    let uri_path = req.uri().path().to_string();
63
64    let marker = if is_known_bot(&user_agent) {
65        BotMarker {
66            is_bot: true,
67            bot_type: BotType::KnownBot,
68            user_agent: user_agent.clone(),
69            ip_address: ip_address.clone(),
70        }
71    } else if is_datacenter_ip(ip_address.as_deref()) || is_outdated_browser(&user_agent) {
72        BotMarker {
73            is_bot: true,
74            bot_type: BotType::Suspicious,
75            user_agent: user_agent.clone(),
76            ip_address: ip_address.clone(),
77        }
78    } else if is_scanner_request(&uri_path, &user_agent) {
79        BotMarker {
80            is_bot: false,
81            bot_type: BotType::Scanner,
82            user_agent: user_agent.clone(),
83            ip_address: ip_address.clone(),
84        }
85    } else {
86        BotMarker {
87            is_bot: false,
88            bot_type: BotType::Human,
89            user_agent: user_agent.clone(),
90            ip_address,
91        }
92    };
93
94    req.extensions_mut().insert(Arc::new(marker));
95    next.run(req).await
96}
97
98pub fn is_datacenter_ip(ip: Option<&str>) -> bool {
99    ip.is_some_and(|ip_addr| {
100        DATACENTER_IP_PREFIXES
101            .iter()
102            .any(|prefix| ip_addr.starts_with(prefix))
103    })
104}
105
/// Returns `true` when `user_agent` is recognised as a known bot.
///
/// Thin wrapper that forwards to `systemprompt_analytics::matches_bot_pattern`; the
/// matching rules live in that crate and are not visible from this module.
pub fn is_known_bot(user_agent: &str) -> bool {
    matches_bot_pattern(user_agent)
}
109
110pub fn is_outdated_browser(user_agent: &str) -> bool {
111    let ua_lower = user_agent.to_lowercase();
112
113    if let Some(pos) = ua_lower.find("chrome/") {
114        let version_str = &ua_lower[pos + 7..];
115        if let Some(dot_pos) = version_str.find('.') {
116            if let Ok(major) = version_str[..dot_pos].parse::<i32>() {
117                return major < CHROME_MIN_VERSION;
118            }
119        }
120    }
121
122    false
123}
124
/// Returns `true` when the request looks like it comes from a vulnerability scanner.
///
/// A request matches when its lower-cased `path` contains any of the probe path
/// fragments, or when its lower-cased `user_agent` contains any of the scanner tool
/// names. Both tests are plain substring checks.
pub fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    /// Path fragments that are looked for anywhere in the request path.
    const PATH_MARKERS: [&str; 12] = [
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    /// Tool names that are looked for anywhere in the user agent.
    const AGENT_MARKERS: [&str; 10] = [
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    let normalized_path = path.to_lowercase();
    if PATH_MARKERS
        .iter()
        .copied()
        .any(|marker| normalized_path.contains(marker))
    {
        return true;
    }

    let normalized_agent = user_agent.to_lowercase();
    AGENT_MARKERS
        .iter()
        .copied()
        .any(|marker| normalized_agent.contains(marker))
}