systemprompt_api/services/middleware/
bot_detector.rs1use axum::extract::Request;
2use axum::middleware::Next;
3use axum::response::Response;
4use std::sync::Arc;
5use systemprompt_analytics::matches_bot_pattern;
6
/// Textual prefixes of client IP addresses that `is_datacenter_ip` treats as
/// datacenter traffic. Matching is a plain string-prefix comparison.
const DATACENTER_IP_PREFIXES: &[&str] = &["47.79.", "47.82."];
8
9#[derive(Clone, Debug)]
10pub struct BotMarker {
11 pub is_bot: bool,
12 pub bot_type: BotType,
13 pub user_agent: String,
14 pub ip_address: Option<String>,
15}
16
/// Category assigned to a request by `detect_bots_early`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BotType {
    /// The user agent matched a known bot pattern.
    KnownBot,
    /// The path or user agent matched one of the scanner signatures.
    Scanner,
    /// The client address starts with one of the datacenter IP prefixes.
    Suspicious,
    /// None of the detection rules matched.
    Human,
}
24
25pub async fn detect_bots_early(mut req: Request, next: Next) -> Response {
26 let user_agent = req
27 .headers()
28 .get("user-agent")
29 .and_then(|h| {
30 h.to_str()
31 .map_err(|e| {
32 tracing::trace!(error = %e, "Invalid UTF-8 in user-agent header");
33 e
34 })
35 .ok()
36 })
37 .unwrap_or("")
38 .to_string();
39
40 let ip_address = extract_ip_address(&req);
41 let uri_path = req.uri().path().to_string();
42
43 let marker = if is_known_bot(&user_agent) {
44 BotMarker {
45 is_bot: true,
46 bot_type: BotType::KnownBot,
47 user_agent: user_agent.clone(),
48 ip_address: ip_address.clone(),
49 }
50 } else if is_datacenter_ip(ip_address.as_deref()) {
51 BotMarker {
52 is_bot: true,
53 bot_type: BotType::Suspicious,
54 user_agent: user_agent.clone(),
55 ip_address: ip_address.clone(),
56 }
57 } else if is_scanner_request(&uri_path, &user_agent) {
58 BotMarker {
59 is_bot: false,
60 bot_type: BotType::Scanner,
61 user_agent: user_agent.clone(),
62 ip_address: ip_address.clone(),
63 }
64 } else {
65 BotMarker {
66 is_bot: false,
67 bot_type: BotType::Human,
68 user_agent: user_agent.clone(),
69 ip_address,
70 }
71 };
72
73 req.extensions_mut().insert(Arc::new(marker));
74 next.run(req).await
75}
76
77fn extract_ip_address(req: &Request) -> Option<String> {
78 req.headers()
79 .get("x-forwarded-for")
80 .and_then(|v| v.to_str().ok())
81 .and_then(|s| s.split(',').next())
82 .map(|s| s.trim().to_string())
83 .or_else(|| {
84 req.headers()
85 .get("x-real-ip")
86 .and_then(|v| v.to_str().ok())
87 .map(ToString::to_string)
88 })
89 .or_else(|| {
90 req.headers()
91 .get("cf-connecting-ip")
92 .and_then(|v| v.to_str().ok())
93 .map(ToString::to_string)
94 })
95}
96
97fn is_datacenter_ip(ip: Option<&str>) -> bool {
98 ip.is_some_and(|ip_addr| {
99 DATACENTER_IP_PREFIXES
100 .iter()
101 .any(|prefix| ip_addr.starts_with(prefix))
102 })
103}
104
105fn is_known_bot(user_agent: &str) -> bool {
106 matches_bot_pattern(user_agent)
107}
108
/// Reports whether a request looks like it was sent by a vulnerability scanner.
///
/// Both inputs are lowercased and then checked by substring: the request is
/// flagged when `path` contains any probe fragment or when `user_agent`
/// contains any scanner tool name. Because this is substring matching, a
/// fragment such as `admin` or `zap` also matches longer words containing it.
fn is_scanner_request(path: &str, user_agent: &str) -> bool {
    // Path fragments that are checked against the lowercased request path.
    const PROBE_PATH_FRAGMENTS: &[&str] = &[
        ".env",
        ".git",
        ".php",
        "admin",
        "wp-admin",
        "wp-login",
        "administrator",
        ".sql",
        ".backup",
        "config.php",
        "web.config",
        ".well-known",
    ];

    // Tool names that are checked against the lowercased user agent.
    const SCANNER_TOOL_NAMES: &[&str] = &[
        "masscan",
        "nmap",
        "nikto",
        "sqlmap",
        "metasploit",
        "nessus",
        "openvas",
        "zap",
        "burp",
        "qualys",
    ];

    /// True when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|&needle| haystack.contains(needle))
    }

    if contains_any(&path.to_lowercase(), PROBE_PATH_FRAGMENTS) {
        return true;
    }

    contains_any(&user_agent.to_lowercase(), SCANNER_TOOL_NAMES)
}