Skip to main content

systemprompt_api/services/middleware/
bot_detector.rs

1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5
/// Textual IPv4 prefixes whose traffic is classified as coming from a
/// datacenter.
///
/// Entries are compared against the client IP string with `starts_with`, so
/// each one should end in a `.` to avoid matching a longer octet
/// (for example `"47.79."` must not match `47.790.x.x`-style garbage).
const DATACENTER_IP_PREFIXES: &[&str] = &[
    "47.79.",
    "47.82.",
];
7
8#[derive(Clone, Debug)]
9pub struct BotMarker {
10    pub is_bot: bool,
11    pub bot_type: BotType,
12    pub user_agent: String,
13    pub ip_address: Option<String>,
14}
15
/// Category assigned to a request by `detect_bots_early`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent matched one of the known crawler / HTTP-client patterns.
    KnownBot,
    /// The request path or user agent matched a scanner pattern.
    Scanner,
    /// The client IP matched a datacenter prefix.
    Suspicious,
    /// No detection rule matched.
    Human,
}
23
24pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
25    let user_agent = req
26        .headers()
27        .get("user-agent")
28        .and_then(|h| {
29            h.to_str()
30                .map_err(|e| {
31                    tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
32                    e
33                })
34                .ok()
35        })
36        .unwrap_or("")
37        .to_string();
38
39    let ip_address = extract_ip_address(&req);
40    let uri_path = req.uri().path().to_string();
41
42    let marker = if is_known_bot(&user_agent) {
43        BotMarker {
44            is_bot: true,
45            bot_type: BotType::KnownBot,
46            user_agent: user_agent.clone(),
47            ip_address: ip_address.clone(),
48        }
49    } else if is_datacenter_ip(ip_address.as_deref()) {
50        BotMarker {
51            is_bot: true,
52            bot_type: BotType::Suspicious,
53            user_agent: user_agent.clone(),
54            ip_address: ip_address.clone(),
55        }
56    } else if is_scanner_request(&uri_path, &user_agent) {
57        BotMarker {
58            is_bot: false,
59            bot_type: BotType::Scanner,
60            user_agent: user_agent.clone(),
61            ip_address: ip_address.clone(),
62        }
63    } else {
64        BotMarker {
65            is_bot: false,
66            bot_type: BotType::Human,
67            user_agent: user_agent.clone(),
68            ip_address,
69        }
70    };
71
72    req.extensions_mut().insert(Arc::new(marker));
73    next.run(req).await
74}
75
76fn extract_ip_address(req: &Request) -> Option<String> {
77    req.headers()
78        .get("x-forwarded-for")
79        .and_then(|v| v.to_str().ok())
80        .and_then(|s| s.split(',').next())
81        .map(|s| s.trim().to_string())
82        .or_else(|| {
83            req.headers()
84                .get("x-real-ip")
85                .and_then(|v| v.to_str().ok())
86                .map(ToString::to_string)
87        })
88        .or_else(|| {
89            req.headers()
90                .get("cf-connecting-ip")
91                .and_then(|v| v.to_str().ok())
92                .map(ToString::to_string)
93        })
94}
95
96fn is_datacenter_ip(ip: Option<&str>) -> bool {
97    ip.is_some_and(|ip_addr| {
98        DATACENTER_IP_PREFIXES
99            .iter()
100            .any(|prefix| ip_addr.starts_with(prefix))
101    })
102}
103
/// Reports whether `user_agent` contains a known crawler or HTTP-client
/// signature.
///
/// Matching is a case-insensitive substring test: the user agent is lowercased
/// once and checked against every entry of the pattern table. An empty user
/// agent never matches.
fn is_known_bot(user_agent: &str) -> bool {
    // Stored already lowercased so no per-pattern allocation is needed on each
    // request; the user agent is normalised to the same case below.
    const BOT_PATTERNS_LOWER: &[&str] = &[
        "googlebot",
        "bingbot",
        "slurp",
        "duckduckbot",
        "baiduspider",
        "yandexbot",
        "facebookexternalhit",
        "twitterbot",
        "linkedinbot",
        "whatsapp",
        "telegrambot",
        "discordbot",
        "ia_archiver",
        "curl",
        "wget",
        "python",
        "java",
        "perl",
        "ruby",
        "go-http-client",
        "node",
        "scrapy",
        "urllib",
        "requests",
        "okhttp",
        "httpclient",
    ];

    let ua_lower = user_agent.to_lowercase();
    BOT_PATTERNS_LOWER
        .iter()
        .any(|pattern| ua_lower.contains(*pattern))
}
139
/// Reports whether a request looks like it comes from a vulnerability scanner.
///
/// A request matches when its path contains one of the probe fragments or its
/// user agent contains one of the scanner tool names. Both comparisons are
/// case-insensitive substring tests.
///
/// NOTE(review): several fragments are broad (`admin`, `.well-known`, `zap`)
/// and will also match legitimate paths or agents that merely contain them —
/// confirm that this is acceptable for the routes served here.
fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    const PROBE_PATH_FRAGMENTS: &[&str] = &[
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    const SCANNER_AGENT_FRAGMENTS: &[&str] = &[
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    // Check the path first; a hit there settles the question.
    let normalized_path = path.to_lowercase();
    if PROBE_PATH_FRAGMENTS
        .iter()
        .any(|fragment| normalized_path.contains(*fragment))
    {
        return true;
    }

    let normalized_agent = user_agent.to_lowercase();
    SCANNER_AGENT_FRAGMENTS
        .iter()
        .any(|fragment| normalized_agent.contains(*fragment))
}