isbot/
lib.rs

1//! [![github]](https://github.com/BryanMorgan/isbot) [![crates-io]](https://crates.io/crates/isbot)
2//!
3//! [github]: <https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github>
4//! [crates-io]: <https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust>
5//!
6//! Detect bots or crawlers identified by matching a user-agent to a collection of known bot patterns.
7//!
8//! User-agent patterns are maintained as a single regular expression for fast validation.
9//!
10//! The default list of user-agent patterns balances a large set of known bots
11//! while ensuring real browsers are not falsely identified as bots.
12//!
13//! # Examples
14//!
15//! ```
16//! use isbot::Bots;
17//!
18//! let bots = Bots::default();
19//! assert!(bots.is_bot("Googlebot-Image/1.0"));
20//! assert!(!bots.is_bot("Opera/9.60 (Windows NT 6.0; U; en) Presto/2.1.1"));
21//! ```
22//!
23//! User-agent regular expressions can be added or removed for specific use cases.
24//! For example, you could remove the Chrome Lighthouse bot from the list of known bots:
25//!
26//! ```
27//! let mut bots = isbot::Bots::default();
28//!
29//! // By default Chrome Lighthouse is considered a bot
30//! assert!(bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"));
31//! // Remove the Chrome Lighthouse regular expression pattern to indicate it is not a bot
32//! bots.remove(&["Chrome-Lighthouse"]);
33//! assert!(!bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"));
34//! ```
35//!
36//! Or append a new user-agent to detect a custom bot:
37//! ```
38//! let mut bots = isbot::Bots::default();
39//!
40//! // Append a new custom bot user-agent regular expression
41//! assert!(!bots.is_bot("Mozilla/5.0 (CustomNewTestB0T /1.2)"));
42//! bots.append(&[r"CustomNewTestB0T\s/\d\.\d"]);
43//! assert!(bots.is_bot("Mozilla/5.0 (CustomNewTestB0T /1.2)"));
44//! ```
45
46use regex::Regex;
47use std::{collections::HashSet, fmt::Debug};
48
49/// Wrapper struct to maintain bot regular expression patterns
50///
51/// # Example
52///
53/// ```
54/// use isbot::Bots;
55///
56/// let bots = Bots::default();
57/// ```
58#[derive(Debug)]
59pub struct Bots {
60    user_agent_patterns: HashSet<String>,
61    user_agents_regex: Regex,
62}
63
/// With the `include-default-bots` feature disabled, no default user-agent
/// patterns are compiled into the library, so the default pattern list is empty.
#[cfg(not(feature = "include-default-bots"))]
const BOT_PATTERNS: &str = "";

/// With the `include-default-bots` feature enabled, the default newline-delimited
/// bot user-agent regular expressions are embedded at compile time from the
/// local `bot_regex_patterns.txt` file.
#[cfg(feature = "include-default-bots")]
const BOT_PATTERNS: &str = include_str!("bot_regex_patterns.txt");
71
72impl Default for Bots {
73    /// Constructs a new instance with default user-agent patterns.
74    ///
75    /// # Example
76    ///
77    /// ```
78    /// use isbot::Bots;
79    ///
80    /// let bots = Bots::default();
81    ///
82    /// assert!(bots.is_bot("Googlebot"));
83    /// ```
84    fn default() -> Self {
85        Bots::new(BOT_PATTERNS)
86    }
87}
88
89impl Bots {
90    /// Constructs a new instance with bot user-agent regular expression entries delimited by a newline
91    ///
92    /// All user-agent regular expressions are converted to lowercase.
93    ///
94    /// # Example
95    ///
96    /// ```
97    /// use isbot::Bots;
98    ///
99    /// let custom_user_agent_patterns = r#"
100    /// ^Googlebot-Image/
101    /// bingpreview/"#;
102    /// let bots = Bots::new(custom_user_agent_patterns);
103    ///
104    /// assert!(bots.is_bot("Googlebot-Image/1.0"));
105    /// assert!(bots.is_bot("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534+ (KHTML, like Gecko) BingPreview/1.0b"));
106    /// assert!(!bots.is_bot("Googlebot"));
107    /// ```
108    pub fn new(bot_entries: &str) -> Self {
109        let user_agent_patterns = Bots::parse_lines(&bot_entries.to_ascii_lowercase());
110        let combined_user_agent_regex = Bots::to_regex(&user_agent_patterns);
111        Bots {
112            user_agent_patterns,
113            user_agents_regex: combined_user_agent_regex,
114        }
115    }
116
117    /// Returns `true` the user-agent is a known bot.
118    ///
119    /// The user-agent comparison is done using lowercase.
120    ///
121    /// # Example
122    ///
123    /// ```
124    /// use isbot::Bots;
125    ///
126    /// let bots = Bots::default();
127    ///
128    /// assert!(bots.is_bot("Googlebot/2.1 (+http://www.google.com/bot.html)"));
129    /// assert!(!bots.is_bot("Dalvik/2.1.0 (Linux; U; Android 8.0.0; SM-G930F Build/R16NW)"));
130    /// ```    
131    pub fn is_bot(&self, user_agent: &str) -> bool {
132        self.user_agents_regex
133            .is_match(&user_agent.to_ascii_lowercase())
134    }
135
136    /// Appends bot user-agent regular expressions patterns.
137    ///
138    /// Duplicates are ignored.
139    ///
140    /// # Example
141    ///
142    /// ```
143    /// use isbot::Bots;
144    ///
145    /// let mut bots = Bots::default();
146    /// assert!(!bots.is_bot("Mozilla/5.0 (CustomNewTestB0T /1.2)"));
147    /// bots.append(&[r"CustomNewTestB0T\s/\d\.\d"]);
148    /// assert!(bots.is_bot("Mozilla/5.0 (CustomNewTestB0T /1.2)"));
149    ///
150    /// let new_bot_patterns = vec!["GoogleMetaverse", "^Special/"];
151    /// bots.append(&new_bot_patterns);
152    /// assert!(bots.is_bot("Mozilla/5.0 (GoogleMetaverse/1.0)"));
153    /// ```
154    pub fn append(&mut self, bots: &[&str]) {
155        for bot in bots {
156            self.user_agent_patterns.insert(bot.to_ascii_lowercase());
157        }
158        self.update_regex()
159    }
160
161    /// Removes bot user-agent regular expressions.
162    ///
163    /// # Example
164    ///
165    /// ```
166    /// use isbot::Bots;
167    ///
168    /// let mut bots = Bots::default();
169    ///
170    ///
171    /// assert!(bots.is_bot("Chrome-Lighthouse"));
172    /// bots.remove(&["Chrome-Lighthouse"]);
173    /// assert!(!bots.is_bot("Chrome-Lighthouse"));
174    ///
175    /// let bot_patterns_to_remove = vec!["bingpreview/", "Google Favicon"];
176    /// bots.remove(&bot_patterns_to_remove);
177    /// assert!(!bots.is_bot("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534+ (KHTML, like Gecko) BingPreview/1.0b"));
178    /// assert!(!bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 Google Favicon"));
179    /// ```
180    pub fn remove(&mut self, bots: &[&str]) {
181        for bot in bots {
182            self.user_agent_patterns.remove(&bot.to_ascii_lowercase());
183        }
184        self.update_regex()
185    }
186
187    fn update_regex(&mut self) {
188        self.user_agents_regex = Bots::to_regex(&self.user_agent_patterns)
189    }
190
191    fn parse_lines(bot_regex_entries: &str) -> HashSet<String> {
192        HashSet::from_iter(
193            bot_regex_entries
194                .lines()
195                .filter(|l| !l.trim().is_empty())
196                .map(ToString::to_string),
197        )
198    }
199
200    fn to_regex(regex_entries: &HashSet<String>) -> Regex {
201        let pattern = regex_entries
202            .iter()
203            .cloned()
204            .collect::<Vec<String>>()
205            .join("|");
206
207        if pattern.is_empty() {
208            return Regex::new("^$").unwrap();
209        }
210
211        Regex::new(&pattern).unwrap()
212    }
213}
214
#[cfg(test)]
mod tests {
    use crate::Bots;

    // User agents that the default patterns must identify as bots.
    static GOOD_BOTS: [&str; 7] = [
        "Googlebot",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"
    ];

    // User agents that the default patterns must NOT identify as bots.
    static NOT_BOTS: [&str; 6] = [
        "",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1",
        "Mozilla/5.0 (Linux; Android 5.0; SAMSUNG SM-N900 Build/LRX21V) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/2.1 Chrome/34.0.1847.76 Mobile Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    ];

    #[test]
    fn good_bots() {
        let bots = Bots::default();
        for bot in GOOD_BOTS {
            assert!(bots.is_bot(bot), "Invalid bot: '{}'", bot);
        }
    }

    #[test]
    fn not_bots() {
        let bots = Bots::default();
        for bot in NOT_BOTS {
            assert!(!bots.is_bot(bot), "Incorrectly detected as a bot: '{}'", bot);
        }
    }

    #[test]
    fn custom_user_agent_patterns() {
        let custom_user_agent_patterns = "\
            ^Simplebot\n\
            anything\\s+bot\n\
            Numerical\\d{4}\\.\\d{4}\\.\\d{4}\\.\\d{4}";
        let bots = Bots::new(custom_user_agent_patterns);
        assert!(!bots.is_bot("InvalidBot"));
        assert!(!bots.is_bot("Googlebot"));
        assert!(bots.is_bot("Simplebot/1.2"));
        assert!(!bots.is_bot(" Simplebot/1.2"));
        assert!(bots.is_bot("Anything  Bot"));
        assert!(!bots.is_bot("AnythingBot"));
        assert!(bots.is_bot("numerical1101.2001.3987.4781"));
        assert!(!bots.is_bot("numerical1.2.3.4"));
    }

    #[test]
    fn empty_user_agent_patterns() {
        // With no patterns the fallback expression only matches an empty user-agent.
        let empty_user_agent_patterns = "";
        let bots = Bots::new(empty_user_agent_patterns);
        assert!(bots.is_bot(""));
        assert!(!bots.is_bot("1"));
        assert!(!bots.is_bot("Googlebot"));
    }

    #[test]
    fn single_user_agent_patterns() {
        let single_user_agent_patterns = "me";
        let bots = Bots::new(single_user_agent_patterns);
        assert!(!bots.is_bot(""));
        assert!(!bots.is_bot("M"));
        assert!(bots.is_bot("Me"));
        assert!(!bots.is_bot("Googlebot"));
    }

    #[test]
    fn add_pattern() {
        let mut bots = Bots::default();
        assert!(!bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
        bots.append(&[r"FancyNewTestB0T\s/\d\.\d"]);
        assert!(bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
    }

    #[test]
    fn add_multiple_patterns() {
        let mut bots = Bots::default();
        assert!(!bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
        assert!(!bots.is_bot("Special/1.0"));
        assert!(!bots.is_bot("GoogleMetaverse/2.1 (experimental)"));

        let new_bot_patterns = vec!["FancyNewTestB0T", "^GoogleMetaverse", "^Special/"];
        bots.append(&new_bot_patterns);

        assert!(bots.is_bot("Mozilla/5.0 (FancyNewTestB0T /1.2)"));
        assert!(bots.is_bot("Special/1.0"));
        assert!(bots.is_bot("GoogleMetaverse/2.1 (experimental)"));
    }

    #[test]
    fn remove_pattern() {
        let mut bots = Bots::default();
        assert!(bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"));
        bots.remove(&["Chrome-Lighthouse"]);
        assert!(!bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"));
        assert!(!bots.is_bot("Chrome-Lighthouse"));
        // Removing one pattern must leave the remaining patterns in effect.
        assert!(bots.is_bot("Mozilla/5.0 (Windows NT 10.0; Win64; x64) adbeat.com/policy AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"));
    }

    #[test]
    fn remove_multiple_patterns() {
        let mut bots = Bots::default();
        assert!(bots.is_bot("Datadog Agent/5.10.1"));
        assert!(bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"));
        assert!(bots.is_bot("Mozilla/5.0 (Java) outbrain"));
        assert!(bots.is_bot("Mozilla/5.0 (compatible; Google-Site-Verification/1.0)"));

        let bot_patterns_to_remove =
            vec!["datadog agent", "Chrome-Lighthouse", "outbrain", "google-"];
        bots.remove(&bot_patterns_to_remove);

        assert!(!bots.is_bot("Datadog Agent/5.10.1"));
        assert!(!bots.is_bot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36 Chrome-Lighthouse"));
        assert!(!bots.is_bot("Mozilla/5.0 (Java) outbrain"));
        assert!(!bots.is_bot("Mozilla/5.0 (compatible; Google-Site-Verification/1.0)"));
    }
}