systemprompt_api/services/middleware/
bot_detector.rs1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5use systemprompt_analytics::matches_bot_pattern;
6
/// IPv4 address prefixes that `is_datacenter_ip` treats as datacenter-hosted.
///
/// Entries are matched textually with `str::starts_with`, so every prefix
/// ends in `.`; that keeps `"119.3."` from also matching `119.30.x.x`.
///
/// NOTE(review): these look like cloud-provider allocations — confirm the
/// origin of the list before extending it.
const DATACENTER_IP_PREFIXES: &[&str] = &[
    // 47.79 – 47.89
    "47.79.", "47.82.", "47.88.", "47.89.",
    // 47.90 – 47.99
    "47.90.", "47.91.", "47.92.", "47.93.", "47.94.",
    "47.95.", "47.96.", "47.97.", "47.98.", "47.99.",
    // 47.100 – 47.109
    "47.100.", "47.101.", "47.102.", "47.103.", "47.104.",
    "47.105.", "47.106.", "47.107.", "47.108.", "47.109.",
    // 47.110 – 47.119
    "47.110.", "47.111.", "47.112.", "47.113.", "47.114.",
    "47.115.", "47.116.", "47.117.", "47.118.", "47.119.",
    // Remaining individual ranges
    "119.29.", "129.28.", "162.14.", "119.3.", "122.112.",
];
14
/// Lowest Chrome major version that `is_outdated_browser` still accepts;
/// any advertised major version strictly below this counts as outdated.
const CHROME_MIN_VERSION: i32 = 120;
16
17#[derive(Clone, Debug)]
18pub struct BotMarker {
19 pub is_bot: bool,
20 pub bot_type: BotType,
21 pub user_agent: String,
22 pub ip_address: Option<String>,
23}
24
/// Category assigned to a request by `detect_bots_early`.
///
/// The middleware checks the rules in the order the variants are described
/// below and stops at the first match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BotType {
    /// The user agent was recognised by `is_known_bot`.
    KnownBot,
    /// The path or user agent matched `is_scanner_request`.
    Scanner,
    /// The IP matched `is_datacenter_ip` or the user agent matched
    /// `is_outdated_browser`.
    Suspicious,
    /// None of the detection rules matched.
    Human,
}
32
33pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
34 let user_agent = req
35 .headers()
36 .get("user-agent")
37 .and_then(|h| {
38 h.to_str()
39 .map_err(|e| {
40 tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
41 e
42 })
43 .ok()
44 })
45 .unwrap_or("")
46 .to_string();
47
48 let ip_address = extract_ip_address(&req);
49 let uri_path = req.uri().path().to_string();
50
51 let marker = if is_known_bot(&user_agent) {
52 BotMarker {
53 is_bot: true,
54 bot_type: BotType::KnownBot,
55 user_agent: user_agent.clone(),
56 ip_address: ip_address.clone(),
57 }
58 } else if is_datacenter_ip(ip_address.as_deref()) || is_outdated_browser(&user_agent) {
59 BotMarker {
60 is_bot: true,
61 bot_type: BotType::Suspicious,
62 user_agent: user_agent.clone(),
63 ip_address: ip_address.clone(),
64 }
65 } else if is_scanner_request(&uri_path, &user_agent) {
66 BotMarker {
67 is_bot: false,
68 bot_type: BotType::Scanner,
69 user_agent: user_agent.clone(),
70 ip_address: ip_address.clone(),
71 }
72 } else {
73 BotMarker {
74 is_bot: false,
75 bot_type: BotType::Human,
76 user_agent: user_agent.clone(),
77 ip_address,
78 }
79 };
80
81 req.extensions_mut().insert(Arc::new(marker));
82 next.run(req).await
83}
84
85fn extract_ip_address(req: &Request) -> Option<String> {
86 req.headers()
87 .get("x-forwarded-for")
88 .and_then(|v| v.to_str().ok())
89 .and_then(|s| s.split(',').next())
90 .map(|s| s.trim().to_string())
91 .or_else(|| {
92 req.headers()
93 .get("x-real-ip")
94 .and_then(|v| v.to_str().ok())
95 .map(ToString::to_string)
96 })
97 .or_else(|| {
98 req.headers()
99 .get("cf-connecting-ip")
100 .and_then(|v| v.to_str().ok())
101 .map(ToString::to_string)
102 })
103 .or_else(|| {
104 req.extensions()
105 .get::<axum::extract::ConnectInfo<std::net::SocketAddr>>()
106 .map(|ci| ci.0.ip().to_string())
107 })
108}
109
110pub fn is_datacenter_ip(ip: Option<&str>) -> bool {
111 ip.is_some_and(|ip_addr| {
112 DATACENTER_IP_PREFIXES
113 .iter()
114 .any(|prefix| ip_addr.starts_with(prefix))
115 })
116}
117
118pub fn is_known_bot(user_agent: &str) -> bool {
119 matches_bot_pattern(user_agent)
120}
121
122pub fn is_outdated_browser(user_agent: &str) -> bool {
123 let ua_lower = user_agent.to_lowercase();
124
125 if let Some(pos) = ua_lower.find("chrome/") {
126 let version_str = &ua_lower[pos + 7..];
127 if let Some(dot_pos) = version_str.find('.') {
128 if let Ok(major) = version_str[..dot_pos].parse::<i32>() {
129 return major < CHROME_MIN_VERSION;
130 }
131 }
132 }
133
134 false
135}
136
/// Heuristically flags requests that look like vulnerability scanning.
///
/// Returns `true` when the lower-cased `path` contains any probe marker, or
/// when the lower-cased `user_agent` contains any scanner tool name. Both
/// checks are plain substring matches.
///
/// NOTE(review): some markers are broad (`admin`, `.well-known`, `zap`) and
/// will also match ordinary paths and agents — confirm that is intended.
pub fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    const PROBE_PATH_MARKERS: [&str; 12] = [
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    const SCANNER_TOOL_MARKERS: [&str; 10] = [
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    let normalized_path = path.to_lowercase();
    let path_is_probe = PROBE_PATH_MARKERS
        .iter()
        .any(|marker| normalized_path.contains(*marker));
    if path_is_probe {
        return true;
    }

    let normalized_agent = user_agent.to_lowercase();
    SCANNER_TOOL_MARKERS
        .iter()
        .any(|marker| normalized_agent.contains(*marker))
}