Skip to main content

systemprompt_api/services/middleware/
bot_detector.rs

1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5use systemprompt_analytics::matches_bot_pattern;
6
/// Textual IP prefixes that `is_datacenter_ip` compares against the start of
/// the client IP string.
///
/// Matching is a plain string-prefix test, so only addresses written in
/// dotted IPv4 form can match.
// NOTE(review): presumably these ranges were observed sending automated
// traffic; the provenance is not recorded here — confirm before extending.
const DATACENTER_IP_PREFIXES: &[&str] = &["47.79.", "47.82."];
8
9#[derive(Clone, Debug)]
10pub struct BotMarker {
11    pub is_bot: bool,
12    pub bot_type: BotType,
13    pub user_agent: String,
14    pub ip_address: Option<String>,
15}
16
/// Category assigned to a request by `detect_bots_early`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent matched a known bot pattern.
    KnownBot,
    /// The request path or user agent matched one of the scanner fragments.
    Scanner,
    /// The client IP starts with one of the datacenter prefixes.
    Suspicious,
    /// None of the bot, datacenter or scanner rules matched.
    Human,
}
24
25pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
26    let user_agent = req
27        .headers()
28        .get("user-agent")
29        .and_then(|h| {
30            h.to_str()
31                .map_err(|e| {
32                    tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
33                    e
34                })
35                .ok()
36        })
37        .unwrap_or("")
38        .to_string();
39
40    let ip_address = extract_ip_address(&req);
41    let uri_path = req.uri().path().to_string();
42
43    let marker = if is_known_bot(&user_agent) {
44        BotMarker {
45            is_bot: true,
46            bot_type: BotType::KnownBot,
47            user_agent: user_agent.clone(),
48            ip_address: ip_address.clone(),
49        }
50    } else if is_datacenter_ip(ip_address.as_deref()) {
51        BotMarker {
52            is_bot: true,
53            bot_type: BotType::Suspicious,
54            user_agent: user_agent.clone(),
55            ip_address: ip_address.clone(),
56        }
57    } else if is_scanner_request(&uri_path, &user_agent) {
58        BotMarker {
59            is_bot: false,
60            bot_type: BotType::Scanner,
61            user_agent: user_agent.clone(),
62            ip_address: ip_address.clone(),
63        }
64    } else {
65        BotMarker {
66            is_bot: false,
67            bot_type: BotType::Human,
68            user_agent: user_agent.clone(),
69            ip_address,
70        }
71    };
72
73    req.extensions_mut().insert(Arc::new(marker));
74    next.run(req).await
75}
76
77fn extract_ip_address(req: &Request) -> Option<String> {
78    req.headers()
79        .get("x-forwarded-for")
80        .and_then(|v| v.to_str().ok())
81        .and_then(|s| s.split(',').next())
82        .map(|s| s.trim().to_string())
83        .or_else(|| {
84            req.headers()
85                .get("x-real-ip")
86                .and_then(|v| v.to_str().ok())
87                .map(ToString::to_string)
88        })
89        .or_else(|| {
90            req.headers()
91                .get("cf-connecting-ip")
92                .and_then(|v| v.to_str().ok())
93                .map(ToString::to_string)
94        })
95}
96
97fn is_datacenter_ip(ip: Option<&str>) -> bool {
98    ip.is_some_and(|ip_addr| {
99        DATACENTER_IP_PREFIXES
100            .iter()
101            .any(|prefix| ip_addr.starts_with(prefix))
102    })
103}
104
/// Reports whether `user_agent` is recognised as a known bot.
///
/// This is a thin wrapper around `systemprompt_analytics::matches_bot_pattern`;
/// the actual matching rules live in that crate.
fn is_known_bot(user_agent: &str) -> bool {
    matches_bot_pattern(user_agent)
}
108
/// Reports whether a request looks like it comes from a vulnerability scanner.
///
/// A request qualifies when its path contains any of the probe fragments
/// below, or its user agent contains any of the scanner tool names. Both
/// checks are case-insensitive substring matches, so they are deliberately
/// loose: for example a path containing `admin` anywhere, or a user agent
/// containing `zap` anywhere, is enough to match.
fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    // Path fragments commonly requested when probing for exposed files or
    // admin panels.
    const PROBE_PATH_FRAGMENTS: [&str; 12] = [
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    // Names of scanning tools as they appear in user-agent strings.
    const SCANNER_AGENT_FRAGMENTS: [&str; 10] = [
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    // Case-insensitive "does `text` contain any of `fragments`" check.
    fn contains_any(text: &str, fragments: &[&str]) -> bool {
        let lowered = text.to_lowercase();
        fragments.iter().any(|&fragment| lowered.contains(fragment))
    }

    contains_any(path, &PROBE_PATH_FRAGMENTS)
        || contains_any(user_agent, &SCANNER_AGENT_FRAGMENTS)
}
139    let ua_lower = user_agent.to_lowercase();
140
141    scanner_paths.iter().any(|p| path_lower.contains(p))
142        || scanner_agents.iter().any(|a| ua_lower.contains(a))
143}