systemprompt_api/services/middleware/
bot_detector.rs1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5use systemprompt_analytics::matches_bot_pattern;
6
/// Dotted-decimal IPv4 prefixes that cause a request to be classified as
/// coming from a datacenter.
///
/// Entries are compared with a plain textual `starts_with`, so every prefix
/// ends in `.` to stop `"47.9."` style entries from matching longer octets.
///
/// NOTE(review): the list does not record which hosting provider each range
/// belongs to — confirm against whatever source it was compiled from before
/// adding or removing entries.
const DATACENTER_IP_PREFIXES: &[&str] = &[
    // Isolated 47.x ranges.
    "47.79.", "47.82.",
    // Contiguous run 47.88 – 47.119.
    "47.88.", "47.89.", "47.90.", "47.91.",
    "47.92.", "47.93.", "47.94.", "47.95.",
    "47.96.", "47.97.", "47.98.", "47.99.",
    "47.100.", "47.101.", "47.102.", "47.103.",
    "47.104.", "47.105.", "47.106.", "47.107.",
    "47.108.", "47.109.", "47.110.", "47.111.",
    "47.112.", "47.113.", "47.114.", "47.115.",
    "47.116.", "47.117.", "47.118.", "47.119.",
    // Remaining individual ranges.
    "119.29.", "129.28.", "162.14.",
    "119.3.", "122.112.",
];
16
/// Lowest Chrome major version that is not treated as outdated; user agents
/// advertising a smaller `Chrome/<major>` are flagged by `is_outdated_browser`.
const CHROME_MIN_VERSION: i32 = 120;
18
19#[derive(Clone, Debug)]
20pub struct BotMarker {
21 pub is_bot: bool,
22 pub bot_type: BotType,
23 pub user_agent: String,
24 pub ip_address: Option<String>,
25}
26
/// Category assigned to a request by [`detect_bots_early`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent matched the shared bot pattern list.
    KnownBot,
    /// The path or user agent matched one of the scanner markers.
    Scanner,
    /// The client address matched a datacenter prefix, or the user agent
    /// advertised an outdated Chrome version.
    Suspicious,
    /// None of the bot heuristics matched.
    Human,
}
34
35pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
36 let user_agent = req
37 .headers()
38 .get("user-agent")
39 .and_then(|h| {
40 h.to_str()
41 .map_err(|e| {
42 tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
43 e
44 })
45 .ok()
46 })
47 .unwrap_or("")
48 .to_string();
49
50 let ip_address = extract_ip_address(&req);
51 let uri_path = req.uri().path().to_string();
52
53 let marker = if is_known_bot(&user_agent) {
54 BotMarker {
55 is_bot: true,
56 bot_type: BotType::KnownBot,
57 user_agent: user_agent.clone(),
58 ip_address: ip_address.clone(),
59 }
60 } else if is_datacenter_ip(ip_address.as_deref()) || is_outdated_browser(&user_agent) {
61 BotMarker {
62 is_bot: true,
63 bot_type: BotType::Suspicious,
64 user_agent: user_agent.clone(),
65 ip_address: ip_address.clone(),
66 }
67 } else if is_scanner_request(&uri_path, &user_agent) {
68 BotMarker {
69 is_bot: false,
70 bot_type: BotType::Scanner,
71 user_agent: user_agent.clone(),
72 ip_address: ip_address.clone(),
73 }
74 } else {
75 BotMarker {
76 is_bot: false,
77 bot_type: BotType::Human,
78 user_agent: user_agent.clone(),
79 ip_address,
80 }
81 };
82
83 req.extensions_mut().insert(Arc::new(marker));
84 next.run(req).await
85}
86
87fn extract_ip_address(req: &Request) -> Option<String> {
88 req.headers()
89 .get("x-forwarded-for")
90 .and_then(|v| v.to_str().ok())
91 .and_then(|s| s.split(',').next())
92 .map(|s| s.trim().to_string())
93 .or_else(|| {
94 req.headers()
95 .get("x-real-ip")
96 .and_then(|v| v.to_str().ok())
97 .map(ToString::to_string)
98 })
99 .or_else(|| {
100 req.headers()
101 .get("cf-connecting-ip")
102 .and_then(|v| v.to_str().ok())
103 .map(ToString::to_string)
104 })
105}
106
107fn is_datacenter_ip(ip: Option<&str>) -> bool {
108 ip.is_some_and(|ip_addr| {
109 DATACENTER_IP_PREFIXES
110 .iter()
111 .any(|prefix| ip_addr.starts_with(prefix))
112 })
113}
114
/// Reports whether `user_agent` is recognised as a known bot.
///
/// Delegates entirely to `systemprompt_analytics::matches_bot_pattern`; the
/// matching rules live in that crate.
fn is_known_bot(user_agent: &str) -> bool {
    matches_bot_pattern(user_agent)
}
118
119fn is_outdated_browser(user_agent: &str) -> bool {
120 let ua_lower = user_agent.to_lowercase();
121
122 if let Some(pos) = ua_lower.find("chrome/") {
123 let version_str = &ua_lower[pos + 7..];
124 if let Some(dot_pos) = version_str.find('.') {
125 if let Ok(major) = version_str[..dot_pos].parse::<i32>() {
126 return major < CHROME_MIN_VERSION;
127 }
128 }
129 }
130
131 false
132}
133
/// Reports whether the request looks like it came from a vulnerability
/// scanner.
///
/// A request matches when its lower-cased `path` contains any of the path
/// markers, or its lower-cased `user_agent` contains any of the agent markers.
/// Both checks are plain substring tests.
fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    /// Path fragments that are checked against the request path.
    const PATH_MARKERS: &[&str] = &[
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    /// Tool names that are checked against the user agent.
    const AGENT_MARKERS: &[&str] = &[
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    /// Case-insensitive "does `haystack` contain any of `needles`" test.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        let lowered = haystack.to_lowercase();
        needles.iter().any(|&needle| lowered.contains(needle))
    }

    contains_any(path, PATH_MARKERS) || contains_any(user_agent, AGENT_MARKERS)
}
168}