Skip to main content

synapse_pingora/crawler/
bad_bots.rs

1//! Bad bot signatures for malicious traffic identification.
2//!
3//! ## Security
4//! All regex patterns are designed to avoid catastrophic backtracking (ReDoS):
5//! - No nested quantifiers (e.g., (a+)+)
6//! - No overlapping alternations with quantifiers
7//! - Negative lookaheads use anchored patterns where possible
8//! - Complex exclusion logic is handled in code, not regex
9
10use serde::{Deserialize, Serialize};
11
12/// Severity level for bad bot signatures.
13#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
14#[serde(rename_all = "lowercase")]
15pub enum BadBotSeverity {
16    Low,
17    Medium,
18    High,
19}
20
21/// Signature for detecting a bad bot.
22#[derive(Debug, Clone)]
23pub struct BadBotSignature {
24    /// Signature name
25    pub name: &'static str,
26    /// Regex pattern to match user agent (ReDoS-safe)
27    pub pattern: &'static str,
28    /// Severity level
29    pub severity: BadBotSeverity,
30    /// Description
31    pub description: &'static str,
32}
33
34/// Known bad bot signatures.
35///
36/// Patterns are designed to be ReDoS-safe:
37/// - Simple literal matches with case-insensitive flag
38/// - Exclusion logic moved to code
39/// - No nested quantifiers or complex alternations
40pub static BAD_BOT_SIGNATURES: &[BadBotSignature] = &[
41    // Attack tools (HIGH severity)
42    BadBotSignature {
43        name: "SQLMap",
44        pattern: r"(?i)sqlmap",
45        severity: BadBotSeverity::High,
46        description: "SQL injection testing tool",
47    },
48    BadBotSignature {
49        name: "Nikto",
50        pattern: r"(?i)nikto",
51        severity: BadBotSeverity::High,
52        description: "Web server scanner",
53    },
54    BadBotSignature {
55        name: "Nmap",
56        pattern: r"(?i)nmap",
57        severity: BadBotSeverity::High,
58        description: "Network scanner",
59    },
60    BadBotSignature {
61        name: "Acunetix",
62        pattern: r"(?i)acunetix",
63        severity: BadBotSeverity::High,
64        description: "Vulnerability scanner",
65    },
66    BadBotSignature {
67        name: "Nessus",
68        pattern: r"(?i)nessus",
69        severity: BadBotSeverity::High,
70        description: "Vulnerability scanner",
71    },
72    BadBotSignature {
73        name: "OpenVAS",
74        pattern: r"(?i)openvas",
75        severity: BadBotSeverity::High,
76        description: "Vulnerability scanner",
77    },
78    BadBotSignature {
79        name: "Metasploit",
80        pattern: r"(?i)metasploit",
81        severity: BadBotSeverity::High,
82        description: "Penetration testing framework",
83    },
84    BadBotSignature {
85        name: "w3af",
86        pattern: r"(?i)w3af",
87        severity: BadBotSeverity::High,
88        description: "Web application attack framework",
89    },
90    // Security testing tools (MEDIUM severity)
91    BadBotSignature {
92        name: "ZAP",
93        // Simplified: removed .* between terms to prevent backtracking
94        pattern: r"(?i)owasp[- ]?zap",
95        severity: BadBotSeverity::Medium,
96        description: "OWASP ZAP security scanner",
97    },
98    BadBotSignature {
99        name: "BurpSuite",
100        pattern: r"(?i)burp",
101        severity: BadBotSeverity::Medium,
102        description: "Security testing tool",
103    },
104    // Generic scrapers (LOW severity)
105    BadBotSignature {
106        name: "PythonUrllib",
107        // Simplified: exclusions handled in code
108        pattern: r"(?i)python-urllib",
109        severity: BadBotSeverity::Low,
110        description: "Generic Python scraper",
111    },
112    BadBotSignature {
113        name: "PythonRequests",
114        pattern: r"(?i)python-requests",
115        severity: BadBotSeverity::Low,
116        description: "Generic Python scraper",
117    },
118    BadBotSignature {
119        name: "Curl",
120        pattern: r"^curl/",
121        severity: BadBotSeverity::Low,
122        description: "Generic curl client",
123    },
124    BadBotSignature {
125        name: "Wget",
126        pattern: r"^wget/",
127        severity: BadBotSeverity::Low,
128        description: "Generic wget client",
129    },
130    BadBotSignature {
131        name: "Scrapy",
132        pattern: r"(?i)scrapy",
133        severity: BadBotSeverity::Medium,
134        description: "Python scraping framework",
135    },
136    BadBotSignature {
137        name: "BeautifulSoup",
138        pattern: r"(?i)beautifulsoup",
139        severity: BadBotSeverity::Low,
140        description: "Python scraping library",
141    },
142    BadBotSignature {
143        name: "Mechanize",
144        pattern: r"(?i)mechanize",
145        severity: BadBotSeverity::Low,
146        description: "Python web scraping library",
147    },
148    BadBotSignature {
149        name: "Selenium",
150        pattern: r"(?i)selenium",
151        severity: BadBotSeverity::Low,
152        description: "Browser automation tool",
153    },
154    BadBotSignature {
155        name: "Puppeteer",
156        // Simplified: removed .* - now just checks for both terms
157        pattern: r"(?i)puppeteer",
158        severity: BadBotSeverity::Low,
159        description: "Browser automation tool",
160    },
161    BadBotSignature {
162        name: "PhantomJS",
163        pattern: r"(?i)phantomjs",
164        severity: BadBotSeverity::Low,
165        description: "Headless browser",
166    },
167    // SEO spam bots (MEDIUM severity)
168    BadBotSignature {
169        name: "SEMrushUnauthorized",
170        pattern: r"(?i)semrush",
171        severity: BadBotSeverity::Medium,
172        description: "SEO tool when used without permission",
173    },
174    BadBotSignature {
175        name: "AhrefsUnauthorized",
176        pattern: r"(?i)ahrefs",
177        severity: BadBotSeverity::Medium,
178        description: "SEO tool when used without permission",
179    },
180    BadBotSignature {
181        name: "MajesticUnauthorized",
182        pattern: r"(?i)majestic",
183        severity: BadBotSeverity::Medium,
184        description: "SEO tool when used without permission",
185    },
186    // Content scrapers (MEDIUM severity)
187    BadBotSignature {
188        name: "HTTrack",
189        pattern: r"(?i)httrack",
190        severity: BadBotSeverity::Medium,
191        description: "Website downloader",
192    },
193    BadBotSignature {
194        name: "WebCopier",
195        pattern: r"(?i)webcopier",
196        severity: BadBotSeverity::Medium,
197        description: "Website copier tool",
198    },
199    BadBotSignature {
200        name: "WebReaper",
201        pattern: r"(?i)webreaper",
202        severity: BadBotSeverity::Medium,
203        description: "Website downloader",
204    },
205    BadBotSignature {
206        name: "WebZIP",
207        pattern: r"(?i)webzip",
208        severity: BadBotSeverity::Medium,
209        description: "Website downloader",
210    },
211    BadBotSignature {
212        name: "OfflineExplorer",
213        // Simplified: use optional space/hyphen instead of \s*
214        pattern: r"(?i)offline[- ]?explorer",
215        severity: BadBotSeverity::Medium,
216        description: "Website downloader",
217    },
218    BadBotSignature {
219        name: "TeleportPro",
220        pattern: r"(?i)teleport[- ]?pro",
221        severity: BadBotSeverity::Medium,
222        description: "Website downloader",
223    },
224    // Email harvesters (HIGH severity)
225    BadBotSignature {
226        name: "EmailCollector",
227        // Split into two patterns to avoid .* - this matches the literal
228        pattern: r"(?i)emailcollector",
229        severity: BadBotSeverity::High,
230        description: "Email harvesting tool",
231    },
232    BadBotSignature {
233        name: "EmailHarvester",
234        pattern: r"(?i)emailharvest",
235        severity: BadBotSeverity::High,
236        description: "Email harvesting tool",
237    },
238    BadBotSignature {
239        name: "EmailSiphon",
240        pattern: r"(?i)emailsiphon",
241        severity: BadBotSeverity::High,
242        description: "Email harvesting tool",
243    },
244    // Link checkers (LOW severity)
245    BadBotSignature {
246        name: "LinkChecker",
247        pattern: r"(?i)linkchecker",
248        severity: BadBotSeverity::Low,
249        description: "Automated link checker",
250    },
251    BadBotSignature {
252        name: "Xenu",
253        pattern: r"(?i)xenu",
254        severity: BadBotSeverity::Low,
255        description: "Link checker",
256    },
257    // Aggressive crawlers (MEDIUM/HIGH severity)
258    BadBotSignature {
259        name: "WebStripper",
260        pattern: r"(?i)webstripper",
261        severity: BadBotSeverity::Medium,
262        description: "Content stripper",
263    },
264    BadBotSignature {
265        name: "WebAuto",
266        pattern: r"(?i)webauto",
267        severity: BadBotSeverity::Medium,
268        description: "Automated web tool",
269    },
270    BadBotSignature {
271        name: "WebBandit",
272        pattern: r"(?i)webbandit",
273        severity: BadBotSeverity::High,
274        description: "Aggressive scraper",
275    },
276    // Suspicious patterns (MEDIUM severity)
277    BadBotSignature {
278        name: "EmptyUserAgent",
279        pattern: r"^$",
280        severity: BadBotSeverity::Medium,
281        description: "Missing user agent string",
282    },
283    // REFACTORED: Complex negative lookaheads removed
284    // Exclusion logic now handled in code
285    BadBotSignature {
286        name: "GenericBot",
287        // Simple pattern - exclusions handled in code
288        pattern: r"(?i)\bbot\b",
289        severity: BadBotSeverity::Low,
290        description: "Generic bot pattern",
291    },
292    BadBotSignature {
293        name: "GenericCrawler",
294        // Simple pattern - exclusions handled in code
295        pattern: r"(?i)\bcrawler\b",
296        severity: BadBotSeverity::Low,
297        description: "Generic crawler pattern",
298    },
299    BadBotSignature {
300        name: "GenericSpider",
301        // Simple pattern - exclusions handled in code
302        pattern: r"(?i)\bspider\b",
303        severity: BadBotSeverity::Low,
304        description: "Generic spider pattern",
305    },
306    // DDoS tools (HIGH severity)
307    BadBotSignature {
308        name: "LOIC",
309        pattern: r"(?i)loic",
310        severity: BadBotSeverity::High,
311        description: "DDoS tool",
312    },
313    BadBotSignature {
314        name: "Slowloris",
315        pattern: r"(?i)slowloris",
316        severity: BadBotSeverity::High,
317        description: "DDoS tool",
318    },
319    // Credential stuffing tools
320    BadBotSignature {
321        name: "SentryMBA",
322        // Simplified: removed .* - use word boundary instead
323        pattern: r"(?i)sentry[- ]?mba",
324        severity: BadBotSeverity::High,
325        description: "Credential stuffing tool",
326    },
327    BadBotSignature {
328        name: "STORM",
329        pattern: r"(?i)storm[- ]?cracker",
330        severity: BadBotSeverity::High,
331        description: "Credential stuffing tool",
332    },
333];