systemprompt_api/services/middleware/
bot_detector.rs1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5
/// Textual IPv4 prefixes that `is_datacenter_ip` matches against the start of
/// the client address string. Each prefix ends with a dot, so `"47.79."`
/// matches `47.79.x.y` but not `47.790.x.y`.
const DATACENTER_IP_PREFIXES: &[&str] = &["47.79.", "47.82."];
7
/// Classification result that `detect_bots_early` stores in the request
/// extensions (wrapped in an `Arc`).
#[derive(Clone, Debug)]
pub struct BotMarker {
    /// `true` for `BotType::KnownBot` and `BotType::Suspicious`; `false` for
    /// `BotType::Scanner` and `BotType::Human`, as set by `detect_bots_early`.
    pub is_bot: bool,
    /// Which classification rule matched the request.
    pub bot_type: BotType,
    /// Value of the `user-agent` header, or an empty string when the header is
    /// missing or could not be converted to a string.
    pub user_agent: String,
    /// Client address taken from the forwarding headers by
    /// `extract_ip_address`, or `None` when none of them yielded a value.
    pub ip_address: Option<String>,
}
15
/// Category assigned to a request by `detect_bots_early`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BotType {
    /// The user agent matched one of the `is_known_bot` patterns.
    KnownBot,
    /// The request path or user agent matched `is_scanner_request`.
    Scanner,
    /// The client address matched `is_datacenter_ip`.
    Suspicious,
    /// None of the detection rules matched.
    Human,
}
23
24pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
25 let user_agent = req
26 .headers()
27 .get("user-agent")
28 .and_then(|h| {
29 h.to_str()
30 .map_err(|e| {
31 tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
32 e
33 })
34 .ok()
35 })
36 .unwrap_or("")
37 .to_string();
38
39 let ip_address = extract_ip_address(&req);
40 let uri_path = req.uri().path().to_string();
41
42 let marker = if is_known_bot(&user_agent) {
43 BotMarker {
44 is_bot: true,
45 bot_type: BotType::KnownBot,
46 user_agent: user_agent.clone(),
47 ip_address: ip_address.clone(),
48 }
49 } else if is_datacenter_ip(ip_address.as_deref()) {
50 BotMarker {
51 is_bot: true,
52 bot_type: BotType::Suspicious,
53 user_agent: user_agent.clone(),
54 ip_address: ip_address.clone(),
55 }
56 } else if is_scanner_request(&uri_path, &user_agent) {
57 BotMarker {
58 is_bot: false,
59 bot_type: BotType::Scanner,
60 user_agent: user_agent.clone(),
61 ip_address: ip_address.clone(),
62 }
63 } else {
64 BotMarker {
65 is_bot: false,
66 bot_type: BotType::Human,
67 user_agent: user_agent.clone(),
68 ip_address,
69 }
70 };
71
72 req.extensions_mut().insert(Arc::new(marker));
73 next.run(req).await
74}
75
76fn extract_ip_address(req: &Request) -> Option<String> {
77 req.headers()
78 .get("x-forwarded-for")
79 .and_then(|v| v.to_str().ok())
80 .and_then(|s| s.split(',').next())
81 .map(|s| s.trim().to_string())
82 .or_else(|| {
83 req.headers()
84 .get("x-real-ip")
85 .and_then(|v| v.to_str().ok())
86 .map(ToString::to_string)
87 })
88 .or_else(|| {
89 req.headers()
90 .get("cf-connecting-ip")
91 .and_then(|v| v.to_str().ok())
92 .map(ToString::to_string)
93 })
94}
95
96fn is_datacenter_ip(ip: Option<&str>) -> bool {
97 ip.is_some_and(|ip_addr| {
98 DATACENTER_IP_PREFIXES
99 .iter()
100 .any(|prefix| ip_addr.starts_with(prefix))
101 })
102}
103
/// Returns `true` when `user_agent` contains, case-insensitively, any of the
/// known crawler, link-preview or HTTP-library markers listed below.
///
/// Matching is a plain substring test, so short markers such as `java` or
/// `node` match anywhere inside the user agent string.
fn is_known_bot(user_agent: &str) -> bool {
    // Stored already lowercased so the only per-call allocation is the
    // lowercased copy of the user agent, not one string per pattern.
    const BOT_PATTERNS: &[&str] = &[
        "googlebot",
        "bingbot",
        "slurp",
        "duckduckbot",
        "baiduspider",
        "yandexbot",
        "facebookexternalhit",
        "twitterbot",
        "linkedinbot",
        "whatsapp",
        "telegrambot",
        "discordbot",
        "ia_archiver",
        "curl",
        "wget",
        "python",
        "java",
        "perl",
        "ruby",
        "go-http-client",
        "node",
        "scrapy",
        "urllib",
        "requests",
        "okhttp",
        "httpclient",
    ];

    let ua_lower = user_agent.to_lowercase();
    BOT_PATTERNS
        .iter()
        .any(|pattern| ua_lower.contains(*pattern))
}
139
/// Reports whether a request looks like a vulnerability probe.
///
/// A request qualifies when its lowercased `path` contains one of the
/// commonly probed path fragments, or when its lowercased `user_agent`
/// contains the name of a scanning tool. Both checks are substring tests.
fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    const PROBED_PATH_FRAGMENTS: &[&str] = &[
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    const SCANNER_TOOL_NAMES: &[&str] = &[
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    // Path check first; a hit here decides the result on its own.
    let normalized_path = path.to_lowercase();
    for &fragment in PROBED_PATH_FRAGMENTS {
        if normalized_path.contains(fragment) {
            return true;
        }
    }

    // Otherwise fall back to the user agent.
    let normalized_agent = user_agent.to_lowercase();
    for &tool in SCANNER_TOOL_NAMES {
        if normalized_agent.contains(tool) {
            return true;
        }
    }

    false
}