1use regex::Regex;
47use std::{collections::HashSet, fmt::Debug};
48
49#[derive(Debug)]
59pub struct Bots {
60 user_agent_patterns: HashSet<String>,
61 user_agents_regex: Regex,
62}
63
/// Default pattern list when the `include-default-bots` feature is disabled:
/// no patterns at all.
#[cfg(not(feature = "include-default-bots"))]
const BOT_PATTERNS: &str = "";

/// Default pattern list when the `include-default-bots` feature is enabled:
/// the contents of `bot_regex_patterns.txt`, embedded at compile time.
#[cfg(feature = "include-default-bots")]
const BOT_PATTERNS: &str = include_str!("bot_regex_patterns.txt");
71
72impl Default for Bots {
73 fn default() -> Self {
85 Bots::new(BOT_PATTERNS)
86 }
87}
88
89impl Bots {
90 pub fn new(bot_entries: &str) -> Self {
109 let user_agent_patterns = Bots::parse_lines(&bot_entries.to_ascii_lowercase());
110 let combined_user_agent_regex = Bots::to_regex(&user_agent_patterns);
111 Bots {
112 user_agent_patterns,
113 user_agents_regex: combined_user_agent_regex,
114 }
115 }
116
117 pub fn is_bot(&self, user_agent: &str) -> bool {
132 self.user_agents_regex
133 .is_match(&user_agent.to_ascii_lowercase())
134 }
135
136 pub fn append(&mut self, bots: &[&str]) {
155 for bot in bots {
156 self.user_agent_patterns.insert(bot.to_ascii_lowercase());
157 }
158 self.update_regex()
159 }
160
161 pub fn remove(&mut self, bots: &[&str]) {
181 for bot in bots {
182 self.user_agent_patterns.remove(&bot.to_ascii_lowercase());
183 }
184 self.update_regex()
185 }
186
187 fn update_regex(&mut self) {
188 self.user_agents_regex = Bots::to_regex(&self.user_agent_patterns)
189 }
190
191 fn parse_lines(bot_regex_entries: &str) -> HashSet<String> {
192 HashSet::from_iter(
193 bot_regex_entries
194 .lines()
195 .filter(|l| !l.trim().is_empty())
196 .map(ToString::to_string),
197 )
198 }
199
200 fn to_regex(regex_entries: &HashSet<String>) -> Regex {
201 let pattern = regex_entries
202 .iter()
203 .cloned()
204 .collect::<Vec<String>>()
205 .join("|");
206
207 if pattern.is_empty() {
208 return Regex::new("^$").unwrap();
209 }
210
211 Regex::new(&pattern).unwrap()
212 }
213}
214
#[cfg(test)]
mod tests {
    use crate::Bots;

    // Fixtures and tests that depend on the bundled pattern list are gated on
    // the `include-default-bots` feature: without it `Bots::default()` has no
    // patterns and these assertions cannot hold.

    /// Real-world user agent that the bundled patterns flag via `Chrome-Lighthouse`.
    #[cfg(feature = "include-default-bots")]
    const LIGHTHOUSE_UA: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse";

    /// User agents the bundled patterns must classify as bots.
    #[cfg(feature = "include-default-bots")]
    static GOOD_BOTS: [&str; 7] = [
        "Googlebot",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
        LIGHTHOUSE_UA,
    ];

    /// User agents the bundled patterns must NOT classify as bots.
    #[cfg(feature = "include-default-bots")]
    static NOT_BOTS: [&str; 6] = [
        "",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1",
        "Mozilla/5.0 (Linux; Android 5.0; SAMSUNG SM-N900 Build/LRX21V) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/2.1 Chrome/34.0.1847.76 Mobile Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    ];

    #[test]
    #[cfg(feature = "include-default-bots")]
    fn good_bots() {
        let bots = Bots::default();
        for bot in GOOD_BOTS {
            assert!(
                bots.is_bot(bot),
                "Expected user agent to be detected as a bot: '{}'",
                bot
            );
        }
    }

    #[test]
    #[cfg(feature = "include-default-bots")]
    fn not_bots() {
        let bots = Bots::default();
        for bot in NOT_BOTS {
            assert!(
                !bots.is_bot(bot),
                "Expected user agent NOT to be detected as a bot: '{}'",
                bot
            );
        }
    }

    #[test]
    fn custom_user_agent_patterns() {
        let custom_user_agent_patterns = "\
            ^Simplebot\n\
            anything\\s+bot\n\
            Numerical\\d{4}\\.\\d{4}\\.\\d{4}\\.\\d{4}";
        let bots = Bots::new(custom_user_agent_patterns);
        assert!(!bots.is_bot("InvalidBot"));
        assert!(!bots.is_bot("Googlebot"));
        assert!(bots.is_bot("Simplebot/1.2"));
        assert!(!bots.is_bot(" Simplebot/1.2"));
        assert!(bots.is_bot("Anything Bot"));
        assert!(!bots.is_bot("AnythingBot"));
        assert!(bots.is_bot("numerical1101.2001.3987.4781"));
        assert!(!bots.is_bot("numerical1.2.3.4"));
    }

    #[test]
    fn empty_user_agent_patterns() {
        let empty_user_agent_patterns = "";
        let bots = Bots::new(empty_user_agent_patterns);
        // With no patterns the detector falls back to `^$`, which matches only
        // the empty user agent.
        assert!(bots.is_bot(""));
        assert!(!bots.is_bot("1"));
        assert!(!bots.is_bot("Googlebot"));
    }

    #[test]
    fn single_user_agent_patterns() {
        let single_user_agent_patterns = "me";
        let bots = Bots::new(single_user_agent_patterns);
        assert!(!bots.is_bot(""));
        assert!(!bots.is_bot("M"));
        assert!(bots.is_bot("Me"));
        assert!(!bots.is_bot("Googlebot"));
    }

    #[test]
    fn add_pattern() {
        let mut bots = Bots::default();
        assert!(!bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
        bots.append(&[r"FancyNewTestB0T\s/\d\.\d"]);
        assert!(bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
    }

    #[test]
    fn add_multiple_patterns() {
        let mut bots = Bots::default();
        assert!(!bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
        assert!(!bots.is_bot("Special/1.0"));
        assert!(!bots.is_bot("GoogleMetaverse/2.1 (experimental)"));

        let new_bot_patterns = ["FancyNewTestB0T", "^GoogleMetaverse", "^Special/"];
        bots.append(&new_bot_patterns);

        assert!(bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
        assert!(bots.is_bot("Special/1.0"));
        assert!(bots.is_bot("GoogleMetaverse/2.1 (experimental)"));
    }

    #[test]
    #[cfg(feature = "include-default-bots")]
    fn remove_pattern() {
        let mut bots = Bots::default();
        assert!(bots.is_bot(LIGHTHOUSE_UA));
        bots.remove(&["Chrome-Lighthouse"]);
        assert!(!bots.is_bot(LIGHTHOUSE_UA));
        assert!(!bots.is_bot("Chrome-Lighthouse"));
        // Unrelated bundled patterns must be unaffected by the removal.
        assert!(bots.is_bot("Mozilla/5.0 (Windows NT 10.0; Win64; x64) adbeat.com/policy AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"));
    }

    #[test]
    #[cfg(feature = "include-default-bots")]
    fn remove_multiple_patterns() {
        let mut bots = Bots::default();
        assert!(bots.is_bot("Datadog Agent/5.10.1"));
        assert!(bots.is_bot(LIGHTHOUSE_UA));
        assert!(bots.is_bot("Mozilla/5.0 (Java) outbrain"));
        assert!(bots.is_bot("Mozilla/5.0 (compatible; Google-Site-Verification/1.0)"));

        let bot_patterns_to_remove =
            ["datadog agent", "Chrome-Lighthouse", "outbrain", "google-"];
        bots.remove(&bot_patterns_to_remove);

        assert!(!bots.is_bot("Datadog Agent/5.10.1"));
        assert!(!bots.is_bot(LIGHTHOUSE_UA));
        assert!(!bots.is_bot("Mozilla/5.0 (Java) outbrain"));
        assert!(!bots.is_bot("Mozilla/5.0 (compatible; Google-Site-Verification/1.0)"));
    }
}