// robotstxt_with_cache/matcher.rs

1// Copyright 2020 Folyd
2// Copyright 1999 Google LLC
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//     https://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16
17use crate::RobotsParseHandler;
18
/// Records the quality of a rule match rather than a plain yes/no.
///
/// Instead of only remembering whether a given line matched, we keep the
/// maximum number of characters matched by that pattern (the priority)
/// together with the robots.txt line that produced the match.
///
/// The priority starts out negative so that a genuine match of priority 0
/// still outranks "no match at all".
pub struct Match {
    priority: i32,
    line: u32,
}

impl Default for Match {
    fn default() -> Self {
        Self {
            priority: Self::NO_MATCH_PRIORITY,
            line: 0,
        }
    }
}

impl Match {
    const NO_MATCH_PRIORITY: i32 = -1;

    /// Create a record for a match of the given priority on the given line.
    pub fn new(priority: i32, line: u32) -> Match {
        Match { priority, line }
    }

    /// Overwrite this record with a new priority/line pair.
    pub fn set(&mut self, priority: i32, line: u32) {
        *self = Match { priority, line };
    }

    /// Reset to the no-match state (priority -1, line 0).
    pub fn clear(&mut self) {
        *self = Match::default();
    }

    /// Line number of the rule behind this match (0 when unset).
    pub fn line(&self) -> u32 {
        self.line
    }

    /// Number of characters matched, or -1 when nothing has matched yet.
    pub fn priority(&self) -> i32 {
        self.priority
    }

    /// Of two matches, pick the one with the strictly higher priority;
    /// on a tie, `b` wins.
    pub fn higher_priority_match<'a>(a: &'a Match, b: &'a Match) -> &'a Match {
        if b.priority() >= a.priority() {
            b
        } else {
            a
        }
    }
}
70
/// Pair of `Match` records kept for one rule kind (Allow or Disallow):
/// one for rules seen in global (`User-agent: *`) groups and one for
/// groups naming one of our specific user-agents.
#[derive(Default)]
struct MatchHierarchy {
    // Best match from wildcard user-agent groups.
    global: Match,
    // Best match from groups naming our specific agent.
    specific: Match,
}
76
77impl MatchHierarchy {
78    pub fn clear(&mut self) {
79        self.global.clear();
80        self.specific.clear();
81    }
82}
83
/// Create a RobotsMatcher with the default matching strategy.
///
/// The default matching strategy is longest-match as opposed to the former internet draft
/// that provisioned first-match strategy. Analysis shows that longest-match,
/// while more restrictive for crawlers, is what webmasters assume when writing
/// directives. For example, in case of conflicting matches (both Allow and
/// Disallow), the longest match is the one the user wants. For example, in
/// case of a robots.txt file that has the following rules
/// ```txt
///   Allow: /
///   Disallow: /cgi-bin
/// ```
/// it's pretty obvious what the webmaster wants: they want to allow crawl of
/// every URI except /cgi-bin. However, according to the expired internet
/// standard, crawlers should be allowed to crawl everything with such a rule.
pub trait RobotsMatchStrategy: Default {
    /// Priority of an `Allow` rule with pattern `pattern` against `path`;
    /// negative means no match.
    fn match_allow(&self, path: &str, pattern: &str) -> i32;

    /// Priority of a `Disallow` rule with pattern `pattern` against `path`;
    /// negative means no match.
    fn match_disallow(&self, path: &str, pattern: &str) -> i32;

    /// Returns true if URI path matches the specified pattern. Pattern is anchored
    /// at the beginning of path. '$' is special only at the end of pattern.
    ///
    /// Since 'path' and 'pattern' are both externally determined (by the webmaster),
    /// we make sure to have acceptable worst-case performance.
    /// ```rust
    /// use robotstxt::matcher::{LongestMatchRobotsMatchStrategy, RobotsMatchStrategy};
    ///
    /// type Target = LongestMatchRobotsMatchStrategy;
    /// assert_eq!(true, Target::matches("/", "/"));
    /// assert_eq!(true, Target::matches("/abc", "/"));
    /// assert_eq!(false, Target::matches("/", "/abc"));
    /// assert_eq!(
    ///     true,
    ///     Target::matches("/google/robotstxt/tree/master", "/*/*/tree/master")
    /// );
    /// assert_eq!(
    ///     true,
    ///     Target::matches(
    ///         "/google/robotstxt/tree/master/index.html",
    ///         "/*/*/tree/master",
    ///     )
    /// );
    /// assert_eq!(
    ///     true,
    ///     Target::matches("/google/robotstxt/tree/master", "/*/*/tree/master$")
    /// );
    /// assert_eq!(
    ///     false,
    ///     Target::matches("/google/robotstxt/tree/master/abc", "/*/*/tree/master$")
    /// );
    /// assert_eq!(
    ///     false,
    ///     Target::matches("/google/robotstxt/tree/abc", "/*/*/tree/master")
    /// );
    /// ```
    fn matches(path: &str, pattern: &str) -> bool {
        // Compare bytes, not chars: %-encoded paths are matched byte-wise,
        // and byte indexing avoids the O(n) cost of chars().nth() as well as
        // the char-index/byte-length mismatch the old code had on non-ASCII
        // input.
        let path = path.as_bytes();
        let pattern = pattern.as_bytes();
        let pathlen = path.len();

        // pos[0..numpos] holds a sorted list of prefix lengths of 'path'
        // which can match the prefix of 'pattern' consumed so far. If this
        // list ever becomes empty, return false. If we reach the end of
        // 'pattern' with at least one element left, return true.
        // numpos never exceeds pathlen + 1, so the buffer is sized once.
        let mut pos: Vec<usize> = vec![0; pathlen + 1];
        let mut numpos: usize = 1;

        for (index, &pat) in pattern.iter().enumerate() {
            // '$' anchors to the end of the path, but only as the very last
            // pattern byte; elsewhere it is an ordinary literal.
            if pat == b'$' && index + 1 == pattern.len() {
                return pos[numpos - 1] == pathlen;
            }

            if pat == b'*' {
                // '*' matches any (possibly empty) run: every position from
                // the smallest current candidate to the end of path survives.
                numpos = pathlen - pos[0] + 1;
                for i in 1..numpos {
                    pos[i] = pos[i - 1] + 1;
                }
            } else {
                // Literal byte (includes '$' when not at end of pattern):
                // keep only candidates whose next path byte equals it.
                // Overwrite in place — inserting here (as the old code did)
                // shifted the array and kept only the first surviving
                // candidate, losing later ones.
                let mut new_numpos = 0;
                for i in 0..numpos {
                    if pos[i] < pathlen && path[pos[i]] == pat {
                        pos[new_numpos] = pos[i] + 1;
                        new_numpos += 1;
                    }
                }
                numpos = new_numpos;

                if numpos == 0 {
                    return false;
                }
            }
        }
        true
    }
}

/// Implements the default robots.txt matching strategy. The maximum number of
/// characters matched by a pattern is returned as its match priority.
#[derive(Default)]
pub struct LongestMatchRobotsMatchStrategy;

impl RobotsMatchStrategy for LongestMatchRobotsMatchStrategy {
    /// The pattern's length when it matches `path`, otherwise -1.
    fn match_allow(&self, path: &str, pattern: &str) -> i32 {
        if Self::matches(path, pattern) {
            pattern.len() as i32
        } else {
            -1
        }
    }

    /// Same scoring as `match_allow`: longest matching pattern wins.
    fn match_disallow(&self, path: &str, pattern: &str) -> i32 {
        if Self::matches(path, pattern) {
            pattern.len() as i32
        } else {
            -1
        }
    }
}
206
/// RobotsMatcher - matches robots.txt against URLs.
///
/// The Matcher uses a default match strategy for Allow/Disallow patterns which
/// is the official way of Google crawler to match robots.txt. It is also
/// possible to provide a custom match strategy.
///
/// The entry point for the user is to call one of the [allowed_by_robots](RobotsMatcher::allowed_by_robots())
/// methods that return directly if a URL is being allowed according to the
/// robots.txt and the crawl agent.
/// The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
#[derive(Default)]
pub struct RobotsMatcher<S: RobotsMatchStrategy> {
    /// Characters of 'url' matching Allow.
    allow: MatchHierarchy,
    /// Characters of 'url' matching Disallow.
    disallow: MatchHierarchy,
    /// True if processing global agent rules.
    seen_global_agent: bool,
    /// True if processing our specific agent.
    seen_specific_agent: bool,
    /// True if we ever saw a block for our agent.
    ever_seen_specific_agent: bool,
    /// True if saw any key: value pair.
    seen_separator: bool,

    /// Path (plus params/query) of the URL currently being checked.
    path: String,
    /// The user-agent names this matcher answers for.
    user_agents: Vec<String>,
    /// Strategy used to score Allow/Disallow pattern matches.
    match_strategy: S,
}
236
/// One recorded parser callback. `CachingRobotsParseHandler` stores these so
/// a single parse of a robots.txt body can be replayed against many URLs.
enum ParseInvoke {
    /// A `User-agent:` line (see `handle_user_agent`).
    UserAgent {
        line_num: u32,
        user_agent: String,
    },
    /// An `Allow:` rule (see `handle_allow`).
    Allow {
        line_num: u32,
        value: String,
    },
    /// A `Disallow:` rule (see `handle_disallow`).
    Disallow {
        line_num: u32,
        value: String,
    },
    /// A `Sitemap:` line (see `handle_sitemap`).
    Sitemap {
        line_num: u32,
        value: String,
    },
    /// Any other `key: value` line the parser reported.
    UnknownAction {
        line_num: u32,
        action: String,
        value: String,
    },
}
260
/// A `RobotsParseHandler` that records every parse callback instead of acting
/// on it immediately, so the wrapped matcher can be driven repeatedly via
/// replay without re-parsing the robots.txt body.
struct CachingRobotsParseHandler<S: RobotsMatchStrategy> {
    /// Parse events in the order they were received from the parser.
    invokes: Vec<ParseInvoke>,
    /// The matcher the recorded events are replayed into.
    matcher: RobotsMatcher<S>,
}
265
266impl<S: RobotsMatchStrategy> CachingRobotsParseHandler<S> {
267    pub fn new(matcher: RobotsMatcher<S>) -> Self {
268        Self {
269            invokes: vec![],
270            matcher,
271        }
272    }
273
274    fn replay(&mut self) {
275        self.matcher.handle_robots_start();
276        for invoke in &self.invokes {
277            match invoke {
278                ParseInvoke::UserAgent {
279                    line_num,
280                    user_agent,
281                } => self.matcher.handle_user_agent(*line_num, &user_agent),
282                ParseInvoke::Allow { line_num, value } => {
283                    self.matcher.handle_allow(*line_num, &value)
284                }
285                ParseInvoke::Disallow { line_num, value } => {
286                    self.matcher.handle_disallow(*line_num, &value)
287                }
288                ParseInvoke::Sitemap { line_num, value } => {
289                    self.matcher.handle_sitemap(*line_num, &value)
290                }
291                ParseInvoke::UnknownAction {
292                    line_num,
293                    action,
294                    value,
295                } => self
296                    .matcher
297                    .handle_unknown_action(*line_num, &action, &value),
298            }
299        }
300        self.matcher.handle_robots_end();
301    }
302
303    pub fn allowed_by_robots(&mut self, user_agents: Vec<&str>, url: &str) -> bool {
304        let path = super::get_path_params_query(&url);
305        self.matcher.init_user_agents_and_path(user_agents, &path);
306        self.replay();
307        !self.matcher.disallow()
308    }
309}
310
311impl<S: RobotsMatchStrategy> RobotsParseHandler for CachingRobotsParseHandler<S> {
312    fn handle_robots_start(&mut self) {}
313
314    fn handle_robots_end(&mut self) {}
315
316    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
317        self.invokes.push(ParseInvoke::UserAgent {
318            line_num,
319            user_agent: String::from(user_agent),
320        })
321    }
322
323    fn handle_allow(&mut self, line_num: u32, value: &str) {
324        self.invokes.push(ParseInvoke::Allow {
325            line_num,
326            value: String::from(value),
327        })
328    }
329
330    fn handle_disallow(&mut self, line_num: u32, value: &str) {
331        self.invokes.push(ParseInvoke::Disallow {
332            line_num,
333            value: String::from(value),
334        })
335    }
336
337    fn handle_sitemap(&mut self, line_num: u32, value: &str) {
338        self.invokes.push(ParseInvoke::Sitemap {
339            line_num,
340            value: String::from(value),
341        })
342    }
343
344    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
345        self.invokes.push(ParseInvoke::UnknownAction {
346            line_num,
347            action: String::from(action),
348            value: String::from(value),
349        })
350    }
351}
352
/// Matcher that parses a robots.txt body once (via `parse`) and can then
/// answer `allowed_by_robots` for many URLs by replaying the cached parse
/// events, instead of re-parsing the body for every query.
pub struct CachingRobotsMatcher<S: RobotsMatchStrategy> {
    /// Records parse events and owns the wrapped `RobotsMatcher`.
    parse_handler: CachingRobotsParseHandler<S>,
}
356
357impl<S: RobotsMatchStrategy> CachingRobotsMatcher<S> {
358    pub fn new(matcher: RobotsMatcher<S>) -> Self {
359        Self {
360            parse_handler: CachingRobotsParseHandler::new(matcher),
361        }
362    }
363
364    pub fn parse(&mut self, robots_body: &str) {
365        super::parse_robotstxt(robots_body, &mut self.parse_handler);
366    }
367
368    pub fn allowed_by_robots(&mut self, user_agents: Vec<&str>, url: &str) -> bool {
369        self.parse_handler.allowed_by_robots(user_agents, url)
370    }
371
372    pub fn one_agent_allowed_by_robots(&mut self, user_agent: &str, url: &str) -> bool {
373        self.parse_handler.allowed_by_robots(vec![user_agent], url)
374    }
375}
376
377impl<'a, S: RobotsMatchStrategy> RobotsMatcher<S> {
378    /// Initialize next path and user-agents to check. Path must contain only the
379    /// path, params, and query (if any) of the url and must start with a '/'.
380    fn init_user_agents_and_path(&mut self, user_agents: Vec<&str>, path: &str) {
381        self.path = String::from(path);
382        self.user_agents = user_agents.into_iter().map(String::from).collect();
383    }
384
385    /// Returns true if 'url' is allowed to be fetched by any member of the
386    /// "user_agents" vector. 'url' must be %-encoded according to RFC3986.
387    pub fn allowed_by_robots(
388        &mut self,
389        robots_body: &str,
390        user_agents: Vec<&str>,
391        url: &str,
392    ) -> bool
393    where
394        Self: RobotsParseHandler,
395    {
396        // The url is not normalized (escaped, percent encoded) here because the user
397        // is asked to provide it in escaped form already.
398        let path = super::get_path_params_query(url);
399        self.init_user_agents_and_path(user_agents, &path);
400        super::parse_robotstxt(&robots_body, self);
401        !self.disallow()
402    }
403
404    /// Do robots check for 'url' when there is only one user agent. 'url' must
405    /// be %-encoded according to RFC3986.
406    pub fn one_agent_allowed_by_robots(
407        &mut self,
408        robots_txt: &str,
409        user_agent: &str,
410        url: &str,
411    ) -> bool
412    where
413        Self: RobotsParseHandler,
414    {
415        self.allowed_by_robots(robots_txt, vec![user_agent], url)
416    }
417
418    /// Returns true if we are disallowed from crawling a matching URI.
419    fn disallow(&self) -> bool {
420        if self.allow.specific.priority() > 0 || self.disallow.specific.priority() > 0 {
421            return self.disallow.specific.priority() > self.allow.specific.priority();
422        }
423
424        if self.ever_seen_specific_agent {
425            // Matching group for user-agent but either without disallow or empty one,
426            // i.e. priority == 0.
427            return false;
428        }
429
430        if self.disallow.global.priority() > 0 || self.allow.global.priority() > 0 {
431            return self.disallow.global.priority() > self.allow.global.priority();
432        }
433
434        false
435    }
436
437    /// Returns true if any user-agent was seen.
438    fn seen_any_agent(&self) -> bool {
439        self.seen_global_agent || self.seen_specific_agent
440    }
441
442    /// Extract the matchable part of a user agent string, essentially stopping at
443    /// the first invalid character.
444    /// Example: 'Googlebot/2.1' becomes 'Googlebot'
445    fn extract_user_agent(user_agent: &str) -> &str {
446        // Allowed characters in user-agent are [a-zA-Z_-].
447        if let Some(end) =
448            user_agent.find(|c: char| !(c.is_ascii_alphabetic() || c == '-' || c == '_'))
449        {
450            &user_agent[..end]
451        } else {
452            user_agent
453        }
454    }
455
456    /// Verifies that the given user agent is valid to be matched against
457    /// robots.txt. Valid user agent strings only contain the characters
458    /// [a-zA-Z_-].
459    pub fn is_valid_user_agent_to_obey(user_agent: &str) -> bool {
460        !user_agent.is_empty() && Self::extract_user_agent(user_agent) == user_agent
461    }
462
463    /// Returns the line that matched or 0 if none matched.
464    pub fn matching_line(&self) -> u32 {
465        if self.ever_seen_specific_agent {
466            return Match::higher_priority_match(&self.disallow.specific, &self.allow.specific)
467                .line();
468        }
469        Match::higher_priority_match(&self.disallow.global, &self.allow.global).line()
470    }
471}
472
473impl<S: RobotsMatchStrategy> RobotsParseHandler for RobotsMatcher<S> {
474    fn handle_robots_start(&mut self) {
475        // This is a new robots.txt file, so we need to reset all the instance member
476        // variables. We do it in the same order the instance member variables are
477        // declared, so it's easier to keep track of which ones we have (or maybe
478        // haven't!) done.
479        self.allow.clear();
480        self.disallow.clear();
481
482        self.seen_global_agent = false;
483        self.seen_specific_agent = false;
484        self.ever_seen_specific_agent = false;
485        self.seen_separator = false;
486    }
487
488    fn handle_robots_end(&mut self) {}
489
490    fn handle_user_agent(&mut self, _line_num: u32, user_agent: &str) {
491        if self.seen_separator {
492            self.seen_specific_agent = false;
493            self.seen_global_agent = false;
494            self.seen_separator = false;
495        }
496
497        // Google-specific optimization: a '*' followed by space and more characters
498        // in a user-agent record is still regarded a global rule.
499        if !user_agent.is_empty()
500            && user_agent.starts_with('*')
501            && (user_agent.len() == 1 || user_agent[1..].starts_with(char::is_whitespace))
502        {
503            self.seen_global_agent = true;
504        } else {
505            let user_agent = Self::extract_user_agent(user_agent);
506            for agent in &self.user_agents {
507                if user_agent.eq_ignore_ascii_case(&agent) {
508                    self.ever_seen_specific_agent = true;
509                    self.seen_specific_agent = true;
510                    break;
511                }
512            }
513        }
514    }
515
516    fn handle_allow(&mut self, line_num: u32, value: &str) {
517        if !self.seen_any_agent() {
518            return;
519        }
520
521        self.seen_separator = true;
522        let priority = self.match_strategy.match_disallow(&self.path, value);
523        if priority >= 0 {
524            if self.seen_specific_agent {
525                if self.allow.specific.priority() < priority {
526                    self.allow.specific.set(priority, line_num);
527                }
528            } else if self.allow.global.priority() < priority {
529                self.allow.global.set(priority, line_num);
530            }
531        } else {
532            // Google-specific optimization: 'index.htm' and 'index.html' are normalized to '/'.
533            let slash_pos = value.rfind('/');
534
535            if let Some(slash_pos) = slash_pos {
536                if value[slash_pos..].starts_with("/index.htm") {
537                    let new_pattern = format!("{}{}", &value[..(slash_pos + 1)], "$");
538                    self.handle_allow(line_num, &new_pattern);
539                }
540            }
541        }
542    }
543
544    fn handle_disallow(&mut self, line_num: u32, value: &str) {
545        if !self.seen_any_agent() {
546            return;
547        }
548
549        self.seen_separator = true;
550        let priority = self.match_strategy.match_disallow(&self.path, value);
551        if priority >= 0 {
552            if self.seen_specific_agent {
553                if self.disallow.specific.priority() < priority {
554                    self.disallow.specific.set(priority, line_num);
555                }
556            } else if self.disallow.global.priority() < priority {
557                self.disallow.global.set(priority, line_num);
558            }
559        }
560    }
561
562    fn handle_sitemap(&mut self, _line_num: u32, _value: &str) {
563        self.seen_separator = true;
564    }
565
566    fn handle_unknown_action(&mut self, _line_num: u32, _action: &str, _value: &str) {
567        self.seen_separator = true;
568    }
569}
570
#[cfg(test)]
mod test {
    use crate::matcher::*;

    /// `extract_user_agent` keeps only the leading run of [a-zA-Z_-].
    /// (The fn previously declared a useless generic lifetime `<'a>`.)
    #[test]
    fn test_extract_user_agent() {
        // Example: 'Googlebot/2.1' becomes 'Googlebot'
        type Target = RobotsMatcher<LongestMatchRobotsMatchStrategy>;
        assert_eq!("Googlebot", Target::extract_user_agent("Googlebot/2.1"));
        assert_eq!("Googlebot", Target::extract_user_agent("Googlebot"));
        assert_eq!("Googlebot-", Target::extract_user_agent("Googlebot-"));
        assert_eq!("Googlebot_", Target::extract_user_agent("Googlebot_"));
        assert_eq!("Googlebot_", Target::extract_user_agent("Googlebot_2.1"));
        assert_eq!("", Target::extract_user_agent("1Googlebot_2.1"));
        assert_eq!("Goo", Target::extract_user_agent("Goo1glebot_2.1"));
    }
}
587}