spider/packages/robotparser/
parser.rs

//! robots.txt parser for Rust.
//!
//! This package initially started as a fork of <https://docs.rs/robotparser/latest/robotparser/>
//! with improvements for speed.
//!
//! The robots.txt Exclusion Protocol is implemented as specified in
//! <http://www.robotstxt.org/norobots-rfc.txt>
//!
//! Add the ``spider`` crate to your project and you're good to go!
//!
//! # Examples
//!
//! ```rust,ignore
//! use spider::packages::robotparser::RobotFileParser;
//! use spider::Client;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut parser = RobotFileParser::new();
//!     let client = Client::new();
//!     // `read` appends "robots.txt", so pass the site root
//!     parser.read(&client, "http://www.python.org/").await;
//!     assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
//! }
//! ```

use crate::compact_str::CompactString;
use crate::Client;
#[cfg(feature = "regex")]
use hashbrown::HashSet;
#[cfg(feature = "regex")]
use regex::RegexSet;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// A rule line is a single "Allow:" (allowance == true) or "Disallow:"
/// (allowance == false) directive followed by a path.
#[derive(Debug, Eq, PartialEq, Clone)]
#[cfg(not(feature = "regex"))]
pub struct RuleLine {
    /// Path of the rule
    pub path: String,
    /// Is the rule allowed?
    pub allowance: bool,
}

/// A rule line is a single "Allow:" (allowance == true) or "Disallow:"
/// (allowance == false) directive followed by a path.
#[derive(Debug, Clone)]
#[cfg(feature = "regex")]
pub struct RuleLine {
    /// Path of the rule
    pub path: Option<regex::Regex>,
    /// Is the rule allowed?
    pub allowance: bool,
}

#[derive(Debug, Eq, PartialEq, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
/// The request rate allowed between navigations or crawls.
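///
/// For example, a `Request-rate: 1/5` directive in robots.txt is parsed
/// into the following value (sketch, not a compiled doctest):
///
/// ```rust,ignore
/// RequestRate { requests: 1, seconds: 5 }
/// ```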
pub struct RequestRate {
    /// Number of requests allowed within the duration
    pub requests: usize,
    /// Duration in seconds between requests
    pub seconds: usize,
}

/// An entry has one or more user agents and zero or more rule lines.
#[derive(Debug, Clone)]
#[cfg_attr(not(feature = "regex"), derive(Eq, PartialEq))]
pub struct Entry {
    /// User agents this entry applies to
    pub useragents: Vec<String>,
    /// Allow/Disallow rules for these user agents
    pub rulelines: Vec<RuleLine>,
    /// Time to wait in between crawls
    pub crawl_delay: Option<Duration>,
    /// The request rate to respect
    pub req_rate: Option<RequestRate>,
}

/// robots.txt file parser
#[derive(Debug, Clone)]
#[cfg_attr(not(feature = "regex"), derive(Eq, PartialEq))]
pub struct RobotFileParser {
    /// Entries parsed from the robots.txt file
    entries: Vec<Entry>,
    /// The default (wildcard user agent) entry, consulted last
    default_entry: Entry,
    /// Disallow all links regardless of robots.txt
    pub disallow_all: bool,
    /// Allow all links regardless of robots.txt
    pub allow_all: bool,
    /// Time the robots.txt file was last checked (Unix seconds)
    pub last_checked: i64,
    /// Disallow list of regex paths to ignore.
    #[cfg(feature = "regex")]
    pub disallow_paths_regex: RegexSet,
    /// Disallow list of paths to ignore.
    #[cfg(feature = "regex")]
    pub disallow_paths: HashSet<String>,
    /// Disallow list of regex agents to ignore.
    #[cfg(feature = "regex")]
    pub disallow_agents_regex: RegexSet,
    /// Wildcard user agent provided.
    #[cfg(feature = "regex")]
    pub wild_card_agent: bool,
    /// Disallow list of agents to ignore.
    #[cfg(feature = "regex")]
    pub disallow_agents: HashSet<String>,
}

impl RuleLine {
    #[cfg(feature = "regex")]
    fn new(path: &str, allowance: bool) -> RuleLine {
        use regex::Regex;

        RuleLine {
            // an invalid pattern is stored as `None` and simply never matches
            path: match Regex::new(path) {
                Ok(r) => Some(r),
                _ => None,
            },
            // an empty "Disallow:" path is treated as allow-all
            allowance: path.is_empty() && !allowance || allowance,
        }
    }

    #[cfg(not(feature = "regex"))]
    fn new(path: &str, allowance: bool) -> RuleLine {
        RuleLine {
            path: path.into(),
            // an empty "Disallow:" path is treated as allow-all
            allowance: path.is_empty() && !allowance || allowance,
        }
    }

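    /// A rough sketch of the matching semantics for the default (non-regex)
    /// build; not a compiled doctest since the constructor is private:
    ///
    /// ```rust,ignore
    /// let rule = RuleLine::new("/private/", false);
    /// assert!(rule.applies_to("/private/page"));
    /// assert!(!rule.applies_to("/public"));
    /// // "Disallow:" with an empty path allows everything
    /// assert!(RuleLine::new("", false).allowance);
    /// ```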
    #[cfg(not(feature = "regex"))]
    fn applies_to(&self, pathname: &str) -> bool {
        if self.path == "*"
            || self.path == "/" && pathname == "/"
            || self.path.ends_with("/") && pathname.starts_with(&self.path)
        {
            true
        } else {
            self.path
                .strip_suffix('*')
                .map_or(false, |prefix| pathname.starts_with(prefix))
                || pathname == self.path
        }
    }

    #[cfg(feature = "regex")]
    fn applies_to(&self, pathname: &str) -> bool {
        match self.path {
            Some(ref regex) => regex.is_match(pathname),
            _ => false,
        }
    }
}

impl Entry {
    /// Base collection to manage robots.txt data
    fn new() -> Entry {
        Entry {
            useragents: vec![],
            rulelines: vec![],
            crawl_delay: None,
            req_rate: None,
        }
    }

    /// Check if this entry applies to the specified agent.
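    ///
    /// A sketch of the matching behavior (the user agent is reduced to its
    /// product token and lowercased before a substring comparison; the
    /// `entry` value below is hypothetical):
    ///
    /// ```rust,ignore
    /// // assuming an entry parsed from "User-agent: googlebot"
    /// assert!(entry.applies_to("Googlebot/2.1 (+http://www.google.com/bot.html)"));
    /// // a "*" user agent matches everything
    /// ```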
    fn applies_to(&self, useragent: &str) -> bool {
        let ua = useragent
            .split('/')
            .nth(0)
            .unwrap_or_default()
            .to_lowercase();

        for agent in &self.useragents {
            if agent == "*" || ua.contains(agent) {
                return true;
            }
        }

        false
    }

    /// Preconditions:
    /// - our agent applies to this entry
    /// - filename is URL decoded
    fn allowance(&self, filename: &str) -> bool {
        for line in &self.rulelines {
            if line.applies_to(filename) {
                return line.allowance;
            }
        }
        true
    }

    /// Add to user agent list
    fn push_useragent(&mut self, useragent: &str) {
        self.useragents.push(useragent.to_lowercase());
    }

    /// Add rule to list
    fn push_ruleline(&mut self, ruleline: RuleLine) {
        self.rulelines.push(ruleline);
    }

    /// Determine if the wildcard ("*") user agent is present
    fn has_useragent(&self) -> bool {
        self.useragents.iter().any(|a| a == "*")
    }

    /// Are the user-agent and rule lists both empty?
    fn is_empty(&self) -> bool {
        self.useragents.is_empty() && self.rulelines.is_empty()
    }

    /// Set the crawl delay for the website
    fn set_crawl_delay(&mut self, delay: Duration) {
        self.crawl_delay = Some(delay);
    }

    /// Determine the crawl delay for the website
    fn get_crawl_delay(&self) -> Option<Duration> {
        self.crawl_delay
    }

    /// Set the request rate to respect while crawling
    fn set_req_rate(&mut self, req_rate: RequestRate) {
        self.req_rate = Some(req_rate);
    }

    /// Get the request rate allowed before being rate limited.
    fn get_req_rate(&self) -> Option<RequestRate> {
        self.req_rate.clone()
    }
}

impl Default for Entry {
    fn default() -> Entry {
        Entry::new()
    }
}

/// Extract the path component of a URL, dropping any query string.
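/// A sketch of the expected output (internal helper, not a compiled doctest):
///
/// ```rust,ignore
/// assert_eq!(extract_path("https://example.com/a/b?page=1"), "/a/b");
/// assert_eq!(extract_path("https://example.com"), "/");
/// ```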
fn extract_path(url: &str) -> &str {
    if !url.is_empty() {
        let prefix = if url.starts_with("https://") {
            8
        } else if url.starts_with("http://") {
            7
        } else {
            0
        };

        let url_slice = &url[prefix..];

        if let Some(path_start) = url_slice.find('/') {
            let path = &url_slice[path_start..];

            if let Some(query_start) = path.find('?') {
                &path[..query_start]
            } else {
                path
            }
        } else {
            "/"
        }
    } else {
        "/"
    }
}

impl RobotFileParser {
    /// Establish a new robotparser for a website domain
    #[cfg(not(feature = "regex"))]
    pub fn new() -> Box<RobotFileParser> {
        RobotFileParser {
            entries: vec![],
            default_entry: Entry::new(),
            disallow_all: false,
            allow_all: false,
            last_checked: 0i64,
        }
        .into()
    }

    /// Establish a new robotparser for a website domain
    #[cfg(feature = "regex")]
    pub fn new() -> Box<RobotFileParser> {
        RobotFileParser {
            entries: vec![],
            default_entry: Entry::new(),
            disallow_all: false,
            disallow_paths_regex: RegexSet::default(),
            disallow_agents_regex: RegexSet::default(),
            disallow_paths: Default::default(),
            disallow_agents: Default::default(),
            wild_card_agent: false,
            allow_all: false,
            last_checked: 0i64,
        }
        .into()
    }

    /// Returns the time the robots.txt file was last fetched.
    ///
    /// This is useful for long-running web spiders that need to
    /// check for new robots.txt files periodically.
    pub fn mtime(&self) -> i64 {
        self.last_checked
    }

    /// Sets the time the robots.txt file was last fetched to the
    /// current time.
    pub fn modified(&mut self) {
        if let Ok(time) = SystemTime::now().duration_since(UNIX_EPOCH) {
            self.last_checked = time.as_secs() as i64;
        }
    }

    /// Get the entries inserted.
    pub fn get_entries(&self) -> &Vec<Entry> {
        &self.entries
    }

    /// Get the base entry inserted.
    pub fn get_base_entry(&self) -> &Entry {
        &self.default_entry
    }

    /// Reads the robots.txt file for the site and feeds it to the parser;
    /// `url` should be the site root, since "robots.txt" is appended to it.
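    ///
    /// A minimal usage sketch (assuming a tokio runtime and an already
    /// constructed `client`):
    ///
    /// ```rust,ignore
    /// let mut parser = RobotFileParser::new();
    /// parser.read(&client, "https://www.example.com/").await;
    /// ```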
    pub async fn read(&mut self, client: &Client, url: &str) {
        use crate::client::StatusCode;
        self.modified();

        let request = client.get(string_concat!(url, "robots.txt"));

        let res = match request.send().await {
            Ok(res) => res,
            Err(_) => {
                return;
            }
        };
        let status = res.status();

        match status {
            StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
                self.disallow_all = true;
            }
            status
                if status >= StatusCode::BAD_REQUEST
                    && status < StatusCode::INTERNAL_SERVER_ERROR =>
            {
                self.allow_all = true;
            }
            StatusCode::OK => self.from_response(res).await,
            _ => (),
        }
    }

    /// Reads the HTTP response and feeds it to the parser.
    pub async fn from_response(&mut self, response: crate::client::Response) {
        match response.text().await {
            Ok(buf) => {
                let lines: Vec<&str> = buf.split('\n').collect();

                self.parse(&lines);
            }
            _ => {
                self.allow_all = true;
            }
        }
    }

    fn _add_entry(&mut self, entry: Entry) {
        if entry.has_useragent() {
            // the default entry is considered last
            if self.default_entry.is_empty() {
                // the first default entry wins
                self.default_entry = entry;
            }
        } else {
            self.entries.push(entry);
        }
    }

    ///
    /// Parse the input lines from a robots.txt file
    ///
    /// We allow that a user-agent: line is not preceded by
    /// one or more blank lines.
    ///
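    /// A minimal sketch of feeding pre-fetched lines to the parser
    /// (default, non-regex build assumed):
    ///
    /// ```rust,ignore
    /// let mut parser = RobotFileParser::new();
    /// // mark the file as checked so `can_fetch` does not short-circuit to `false`
    /// parser.modified();
    /// let body = "User-agent: *\nDisallow: /private/\nCrawl-delay: 2";
    /// let lines: Vec<&str> = body.split('\n').collect();
    /// parser.parse(&lines);
    /// assert!(!parser.can_fetch("*", "https://example.com/private/page"));
    /// assert!(parser.can_fetch("*", "https://example.com/index.html"));
    /// ```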
    pub fn parse<T: AsRef<str>>(&mut self, lines: &[T]) {
        use percent_encoding::percent_decode;

        // states:
        //   0: start state
        //   1: saw user-agent line
        //   2: saw an allow or disallow line
        let mut state = 0;
        let mut entry = Entry::new();

        for line in lines {
            let mut ln = line.as_ref();
            if ln.is_empty() {
                match state {
                    1 => {
                        entry = Entry::new();
                        state = 0;
                    }
                    2 => {
                        self._add_entry(entry);
                        entry = Entry::new();
                        state = 0;
                    }
                    _ => {}
                }
            }
            // remove optional comment and strip line
            if let Some(i) = ln.find('#') {
                ln = &ln[0..i];
            }
            ln = ln.trim();
            if ln.is_empty() {
                continue;
            }
            let parts: Vec<&str> = ln.splitn(2, ':').collect();

            if parts.len() == 2 {
                // directive names are case-insensitive; values are percent-decoded
                let part0 = parts[0].trim().to_lowercase();
                let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect())
                    .unwrap_or_default();
                match part0.as_str() {
                    "user-agent" => {
                        if state == 2 {
                            self._add_entry(entry);
                            entry = Entry::new();
                        }
                        entry.push_useragent(&part1);
                        state = 1;
                        self.set_disallow_agents_list(&part1);
                    }
                    "disallow" => {
                        if state != 0 {
                            entry.push_ruleline(RuleLine::new(&part1, false));
                            state = 2;
                            self.set_disallow_list(&part1);
                        }
                    }
                    "allow" => {
                        if state != 0 {
                            entry.push_ruleline(RuleLine::new(&part1, true));
                            state = 2;
                        }
                    }
                    "crawl-delay" => {
                        if state != 0 {
                            if let Ok(delay) = part1.parse::<f64>() {
                                let delay_seconds = delay.trunc();
                                let delay_nanoseconds = delay.fract() * 10f64.powi(9);
                                let delay =
                                    Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
                                entry.set_crawl_delay(delay);
                            }
                            state = 2;
                        }
                    }
                    "sitemap" => {
                        if state != 0 {
                            state = 2;
                        }
                    }
                    "request-rate" => {
                        if state != 0 {
                            let numbers: Vec<Result<usize, _>> =
                                part1.split('/').map(|x| x.parse::<usize>()).collect();
                            if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() {
                                let req_rate = RequestRate {
                                    requests: numbers[0].clone().unwrap(),
                                    seconds: numbers[1].clone().unwrap(),
                                };
                                entry.set_req_rate(req_rate);
                            }
                            state = 2;
                        }
                    }
                    _ => {}
                }
            }
        }

        if state == 2 {
            self._add_entry(entry);
        }

        self.build_disallow_list()
    }

    /// Include the disallow paths in the regex set. This does nothing without the 'regex' feature.
    #[cfg(not(feature = "regex"))]
    pub fn set_disallow_list(&mut self, _path: &str) {}

    /// Include the disallow paths in the regex set. This does nothing without the 'regex' feature.
    #[cfg(feature = "regex")]
    pub fn set_disallow_list(&mut self, path: &str) {
        if !path.is_empty() {
            self.disallow_paths.insert(path.into());
        }
    }

    /// Include the disallow agents in the regex set. This does nothing without the 'regex' feature.
    #[cfg(not(feature = "regex"))]
    pub fn set_disallow_agents_list(&mut self, _agent: &str) {}

    /// Include the disallow agents in the regex set. This does nothing without the 'regex' feature.
    #[cfg(feature = "regex")]
    pub fn set_disallow_agents_list(&mut self, agent: &str) {
        if !agent.is_empty() {
            if agent == "*" {
                self.wild_card_agent = true;
            }
            self.disallow_agents.insert(agent.into());
        }
    }

    /// Build the regex disallow list. This does nothing without the 'regex' feature.
    #[cfg(not(feature = "regex"))]
    pub fn build_disallow_list(&mut self) {}

    /// Build the regex disallow list. This does nothing without the 'regex' feature.
    #[cfg(feature = "regex")]
    pub fn build_disallow_list(&mut self) {
        if !self.disallow_paths.is_empty() {
            if let Ok(s) = RegexSet::new(&self.disallow_paths) {
                self.disallow_paths_regex = s;
            }
        }
        if !self.disallow_agents.is_empty() {
            if let Ok(s) = RegexSet::new(&self.disallow_agents) {
                self.disallow_agents_regex = s;
            }
        }
    }

    /// Using the parsed robots.txt, decide if `useragent` can fetch `url`.
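    ///
    /// Only the path component of `url` is compared against the rules
    /// (sketch, assuming robots.txt was already read for the site):
    ///
    /// ```rust,ignore
    /// // equivalent checks: the scheme, host, and query string are ignored
    /// parser.can_fetch("*", "https://www.example.com/private/page?x=1");
    /// parser.can_fetch("*", "/private/page");
    /// ```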
    pub fn can_fetch<T: AsRef<str>>(&self, useragent: T, url: &str) -> bool {
        // Until the robots.txt file has been read or found not
        // to exist, we must assume that no url is allowable.
        // This prevents false positives when a user erroneously
        // calls can_fetch() before calling read().
        if self.allow_all {
            true
        } else if self.last_checked == 0 || self.disallow_all {
            false
        } else {
            // search for given user agent matches
            // the first match counts
            let url_str = extract_path(url);

            if self.entry_allowed(&useragent, url_str) {
                true
            } else {
                // try the default entry last
                let default_entry = &self.default_entry;

                if !default_entry.is_empty() {
                    default_entry.allowance(url_str)
                } else {
                    // agent not found ==> access granted
                    true
                }
            }
        }
    }

    /// Does any configured entry apply to the given user agent and allow the url?
    #[cfg(not(feature = "regex"))]
    pub fn entry_allowed<T: AsRef<str>>(&self, useragent: &T, url_str: &str) -> bool {
        for entry in &self.entries {
            if entry.applies_to(useragent.as_ref()) {
                return entry.allowance(url_str);
            }
        }
        false
    }

    /// Does any configured entry apply to the given user agent and allow the url?
    #[cfg(feature = "regex")]
    pub fn entry_allowed<T: AsRef<str>>(&self, useragent: &T, url_str: &str) -> bool {
        // the url is allowed unless an agent that applies also has a matching disallow path
        let agent_checked =
            self.wild_card_agent || self.disallow_agents_regex.is_match(useragent.as_ref());
        let disallow = agent_checked && self.disallow_paths_regex.is_match(url_str);

        !disallow
    }

    /// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined.
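    ///
    /// A minimal sketch (the agent name is hypothetical):
    ///
    /// ```rust,ignore
    /// use spider::compact_str::CompactString;
    ///
    /// let agent = Some(Box::new(CompactString::new("bingbot")));
    /// // Some(Duration::from_secs(2)) if a "Crawl-delay: 2" rule applied to bingbot
    /// let delay = parser.get_crawl_delay(&agent);
    /// ```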
    pub fn get_crawl_delay(&self, useragent: &Option<Box<CompactString>>) -> Option<Duration> {
        if self.last_checked == 0 {
            None
        } else {
            let useragent = useragent.as_ref();
            let crawl_delay: Option<Duration> = match useragent {
                Some(ua) => {
                    for entry in &self.entries {
                        if entry.applies_to(ua) {
                            return entry.get_crawl_delay();
                        }
                    }
                    None
                }
                _ => None,
            };

            if crawl_delay.is_some() {
                crawl_delay
            } else {
                let default_entry = &self.default_entry;

                if !default_entry.is_empty() {
                    return default_entry.get_crawl_delay();
                }

                None
            }
        }
    }

    /// Returns the request rate for this user agent as a `RequestRate`, or None if no request rate is defined.
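    ///
    /// A minimal sketch (the agent name is hypothetical):
    ///
    /// ```rust,ignore
    /// // Some(RequestRate { requests: 1, seconds: 5 }) if "Request-rate: 1/5" applied
    /// let rate = parser.get_req_rate("googlebot");
    /// ```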
    pub fn get_req_rate<T: AsRef<str>>(&self, useragent: T) -> Option<RequestRate> {
        let useragent = useragent.as_ref();
        if self.last_checked == 0 {
            return None;
        }
        for entry in &self.entries {
            if entry.applies_to(useragent) {
                return entry.get_req_rate();
            }
        }
        None
    }
}