// robotstxt_with_cache/parser.rs

// Copyright 2020 Folyd
// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

use crate::RobotsParseHandler;
18
/// An enum representing the key types in a robots.txt file.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ParseKeyType {
    // Generic highlevel fields.
    UserAgent,
    Sitemap,

    // Fields within a user-agent.
    Allow,
    Disallow,

    /// Unrecognized field; kept as-is. High number so that additions to the
    /// enumeration above does not change the serialization.
    Unknown = 128,
}

/// A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
/// a key.
///
/// This type can parse a text representation of a key (including common
/// typos) into an enumeration which allows for faster processing afterwards.
/// For unparsable keys, the original string representation is kept.
pub struct ParsedRobotsKey {
    // Detected key type; `Unknown` until `parse` recognizes something.
    type_: ParseKeyType,
    // Original key text; only populated for `Unknown` keys.
    key_text: String,
    /// Allow for typos such as DISALOW in robots.txt.
    allow_typo: bool,
}

impl Default for ParsedRobotsKey {
    fn default() -> Self {
        ParsedRobotsKey {
            type_: ParseKeyType::Unknown,
            allow_typo: true,
            key_text: String::new(),
        }
    }
}

impl ParsedRobotsKey {
    /// Parse the given key text and store the detected key type.
    ///
    /// Matching is case-insensitive and, when `allow_typo` is set, also
    /// accepts common misspellings. For unrecognized keys a copy of the
    /// original text is kept (see `get_unknown_text`).
    pub fn parse(&mut self, key: &str) {
        if self.validate_key(key, &["user-agent"], Some(&["useragent", "user agent"])) {
            self.type_ = ParseKeyType::UserAgent;
        } else if self.validate_key(key, &["allow"], None) {
            self.type_ = ParseKeyType::Allow;
        } else if self.validate_key(
            key,
            &["disallow"],
            Some(&["dissallow", "dissalow", "disalow", "diasllow", "disallaw"]),
        ) {
            self.type_ = ParseKeyType::Disallow;
        } else if self.validate_key(key, &["sitemap", "site-map"], None) {
            self.type_ = ParseKeyType::Sitemap;
        } else {
            self.type_ = ParseKeyType::Unknown;
            self.key_text = key.to_string();
        }
    }

    /// Returns the type of key.
    pub fn get_type(&self) -> &ParseKeyType {
        &self.type_
    }

    /// If this is an unknown key, get the original key text.
    pub fn get_unknown_text(&self) -> String {
        self.key_text.clone()
    }

    /// Returns true if `key` starts (case-insensitively) with one of
    /// `targets`, or — when typo tolerance is enabled — with one of
    /// `typo_targets`.
    fn validate_key(&self, key: &str, targets: &[&str], typo_targets: Option<&[&str]>) -> bool {
        let key = key.to_lowercase();
        let matches_key = |target: &&str| key.starts_with(&target.to_lowercase());
        targets.iter().any(matches_key)
            || (self.allow_typo
                && typo_targets.map_or(false, |typos| typos.iter().any(matches_key)))
    }
}
100
/// A robotstxt parser: walks a robots.txt body line by line and reports
/// every recognized directive to the supplied `RobotsParseHandler`.
pub struct RobotsTxtParser<'a, Handler: RobotsParseHandler> {
    // Full robots.txt text; borrowed for the parser's lifetime.
    robots_body: &'a str,
    // Callback sink receiving parse events (start/end, directives).
    handler: &'a mut Handler,
}
106
107impl<'a, Handler: RobotsParseHandler> RobotsTxtParser<'a, Handler> {
108    pub fn new(robots_body: &'a str, handler: &'a mut Handler) -> Self {
109        RobotsTxtParser {
110            robots_body,
111            handler,
112        }
113    }
114
115    /// Parse body of this Parser's robots.txt and emit parse callbacks. This will accept
116    /// typical typos found in robots.txt, such as 'disalow'.
117    ///
118    /// Note, this function will accept all kind of input but will skip
119    /// everything that does not look like a robots directive.
120    pub fn parse(&mut self) {
121        let utf_bom = [0xEF, 0xBB, 0xBF];
122        // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
123        // fairly safe to assume any valid line isn't going to be more than many times
124        // that max url length of 2KB. We want some padding for
125        // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
126        // If so, we can ignore the chars on a line past that.
127        let max_line_len = 2083 * 8;
128        let mut line_num = 0;
129        let mut bom_pos = 0;
130        let mut last_was_carriage_return = false;
131        self.handler.handle_robots_start();
132
133        let mut start = 0;
134        let mut end = 0;
135        // We should skip the rest part which exceed max_line_len
136        // in the current line.
137        let mut skip_exceed = 0;
138        for (ch, char_len_utf8) in self
139            .robots_body
140            .chars()
141            .map(|ch| (ch as usize, ch.len_utf8()))
142        {
143            // Google-specific optimization: UTF-8 byte order marks should never
144            // appear in a robots.txt file, but they do nevertheless. Skipping
145            // possible BOM-prefix in the first bytes of the input.
146            if bom_pos < utf_bom.len() && ch == utf_bom[bom_pos] {
147                bom_pos += 1;
148                start += char_len_utf8;
149                end += char_len_utf8;
150                continue;
151            }
152            bom_pos = utf_bom.len();
153
154            if ch != 0x0A && ch != 0x0D {
155                // Non-line-ending char case.
156                // Put in next spot on current line, as long as there's room.
157                if (end - start) < max_line_len - 1 {
158                    end += char_len_utf8;
159                } else {
160                    skip_exceed += 1;
161                }
162            } else {
163                // Line-ending character char case.
164                // Only emit an empty line if this was not due to the second character
165                // of the DOS line-ending \r\n .
166                let is_crlf_continuation = end == start && last_was_carriage_return && ch == 0x0A;
167                if !is_crlf_continuation {
168                    line_num += 1;
169                    self.parse_and_emit_line(line_num, &self.robots_body[start..end]);
170                }
171                // Add skip_exceed to skip those chars.
172                end += skip_exceed + char_len_utf8;
173                start = end;
174                last_was_carriage_return = ch == 0x0D;
175                skip_exceed = 0;
176            }
177        }
178        line_num += 1;
179        self.parse_and_emit_line(line_num, &self.robots_body[start..end]);
180        self.handler.handle_robots_end();
181    }
182
183    /// Attempts to parse a line of robots.txt into a key/value pair.
184    ///
185    /// On success, the parsed key and value, and true, are returned. If parsing is
186    /// unsuccessful, `parse_key_value` returns two empty strings and false.
187    pub fn parse_key_value(line: &str) -> (&str, &str, bool) {
188        let mut line = line;
189        // Remove comments from the current robots.txt line.
190        if let Some(comment) = line.find('#') {
191            line = &line[..comment].trim();
192        }
193
194        // Rules must match the following pattern:
195        //   <key>[ \t]*:[ \t]*<value>
196        let mut sep = line.find(':');
197        if sep.is_none() {
198            // Google-specific optimization: some people forget the colon, so we need to
199            // accept whitespace in its stead.
200            let white = " \t";
201
202            sep = line.find(|c| white.contains(c));
203            if let Some(sep) = sep {
204                let val = &line[sep..].trim();
205                if val.is_empty() || val.find(|c| white.contains(c)).is_some() {
206                    // We only accept whitespace as a separator if there are exactly two
207                    // sequences of non-whitespace characters.  If we get here, there were
208                    // more than 2 such sequences since we stripped trailing whitespace
209                    // above.
210                    return ("", "", false);
211                }
212            }
213        }
214
215        if let Some(sep) = sep {
216            // Key starts at beginning of line.
217            let key = &line[..sep];
218            if key.is_empty() {
219                return ("", "", false);
220            }
221
222            // Value starts after the separator.
223            let value = &line[(sep + 1)..];
224            (key.trim(), value.trim(), true)
225        } else {
226            // Couldn't find a separator.
227            ("", "", false)
228        }
229    }
230
231    pub fn need_escape_value_for_key(key: &ParsedRobotsKey) -> bool {
232        !matches!(
233            key.get_type(),
234            ParseKeyType::UserAgent | ParseKeyType::Sitemap
235        )
236    }
237
238    fn parse_and_emit_line(&mut self, current_line: u32, line: &str) {
239        match Self::parse_key_value(line) {
240            (_, _, false) => {}
241            (string_key, value, true) => {
242                let mut key = ParsedRobotsKey::default();
243                key.parse(string_key);
244                if Self::need_escape_value_for_key(&key) {
245                    let value = escape_pattern(value);
246                    self.emit(current_line, &key, &value);
247                } else {
248                    self.emit(current_line, &key, value);
249                }
250            }
251        }
252    }
253
254    fn emit(&mut self, line: u32, key: &ParsedRobotsKey, value: &str) {
255        match key.get_type() {
256            ParseKeyType::UserAgent => self.handler.handle_user_agent(line, value),
257            ParseKeyType::Sitemap => self.handler.handle_sitemap(line, value),
258            ParseKeyType::Allow => self.handler.handle_allow(line, value),
259            ParseKeyType::Disallow => self.handler.handle_disallow(line, value),
260            ParseKeyType::Unknown => {
261                self.handler
262                    .handle_unknown_action(line, &key.get_unknown_text(), value)
263            }
264        }
265    }
266}
267
const HEX_DIGITS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
];

/// Canonicalize the allowed/disallowed path patterns.
///
/// Bytes outside the ASCII range (e.g. UTF-8 multibyte sequences) are
/// percent-encoded, and existing percent-escapes have their hex digits
/// normalised to uppercase. A `%` that is NOT followed by two hex digits is
/// copied through unchanged rather than treated as an escape.
///
/// For example:
/// ```txt
///     /SanJoséSellers ==> /SanJos%C3%A9Sellers
///     %aa ==> %AA
/// ```
/// If the given path pattern is already adequately escaped,
/// the original string is returned unchanged.
pub fn escape_pattern(path: &str) -> String {
    let bytes = path.as_bytes();

    // True when `bytes[i]` starts a complete %-escape (`%` plus two hex
    // digits). Mirrors the C++ reference, which never consumes the following
    // two bytes unless BOTH are hex digits; the previous iterator-based port
    // consumed them unconditionally and silently dropped e.g. "%zb".
    let is_escape_at = |i: usize| {
        bytes[i] == b'%'
            && i + 2 < bytes.len()
            && bytes[i + 1].is_ascii_hexdigit()
            && bytes[i + 2].is_ascii_hexdigit()
    };

    // First, scan the buffer to see if changes are needed. Most don't.
    let mut num_to_escape = 0;
    let mut need_capitalize = false;
    let mut i = 0;
    while i < bytes.len() {
        if is_escape_at(i) {
            // (a) Existing %-escape: may need its hex digits uppercased.
            if bytes[i + 1].is_ascii_lowercase() || bytes[i + 2].is_ascii_lowercase() {
                need_capitalize = true;
            }
            i += 3;
        } else {
            // (b) Octets with the high bit set need %-escaping.
            if bytes[i] >= 0x80 {
                num_to_escape += 1;
            }
            i += 1;
        }
    }
    // Return if no changes needed.
    if num_to_escape == 0 && !need_capitalize {
        return path.to_string();
    }

    // Each escaped byte grows from 1 byte to 3 chars ("%XX"), hence +2 apiece.
    let mut dest = String::with_capacity(num_to_escape * 2 + path.len());
    let mut i = 0;
    while i < bytes.len() {
        let b = bytes[i];
        if is_escape_at(i) {
            // (a) Normalize a %-escaped sequence (eg. %2f -> %2F).
            dest.push('%');
            dest.push(bytes[i + 1].to_ascii_uppercase() as char);
            dest.push(bytes[i + 2].to_ascii_uppercase() as char);
            i += 3;
        } else if b >= 0x80 {
            // (b) %-escape octets whose highest bit is set (outside ASCII).
            dest.push('%');
            dest.push(HEX_DIGITS[(b >> 4) as usize]);
            dest.push(HEX_DIGITS[(b & 0xf) as usize]);
            i += 1;
        } else {
            // (c) Plain ASCII byte, no modification needed.
            dest.push(b as char);
            i += 1;
        }
    }
    dest
}
358
#[cfg(test)]
mod tests {
    use crate::parser::*;
    use crate::RobotsParseHandler;

    /// Handler stub: these tests only exercise the associated function
    /// `parse_key_value` and the free function `escape_pattern`, so no
    /// callback should ever be reached.
    struct FooHandler;

    impl RobotsParseHandler for FooHandler {
        fn handle_robots_start(&mut self) {
            unimplemented!()
        }

        fn handle_robots_end(&mut self) {
            unimplemented!()
        }

        fn handle_user_agent(&mut self, _line_num: u32, _user_agent: &str) {
            unimplemented!()
        }

        fn handle_allow(&mut self, _line_num: u32, _value: &str) {
            unimplemented!()
        }

        fn handle_disallow(&mut self, _line_num: u32, _value: &str) {
            unimplemented!()
        }

        fn handle_sitemap(&mut self, _line_num: u32, _value: &str) {
            unimplemented!()
        }

        fn handle_unknown_action(&mut self, _line_num: u32, _action: &str, _value: &str) {
            unimplemented!()
        }
    }

    #[test]
    fn test_parse_key_value() {
        // `#[test]` functions must not carry generic parameters; the lifetime
        // lives on the local type alias instead.
        type Target<'a> = RobotsTxtParser<'a, FooHandler>;
        let negative = ("", "", false);
        let positive = ("User-agent", "Googlebot", true);

        // Comment-only lines never yield a key/value pair.
        assert_eq!(negative, Target::parse_key_value("# "));
        assert_eq!(negative, Target::parse_key_value("# User-agent: Googlebot"));

        // Both ':' and bare whitespace are accepted as separators, and
        // trailing comments are stripped first.
        assert_eq!(positive, Target::parse_key_value("User-agent: Googlebot"));
        assert_eq!(positive, Target::parse_key_value("User-agent  Googlebot"));
        assert_eq!(positive, Target::parse_key_value("User-agent \t Googlebot"));
        assert_eq!(positive, Target::parse_key_value("User-agent\tGooglebot"));
        assert_eq!(
            positive,
            Target::parse_key_value("User-agent: Googlebot # 123")
        );
        assert_eq!(
            positive,
            Target::parse_key_value("User-agent\tGooglebot # 123")
        );
    }

    #[test]
    fn test_escape_pattern() {
        assert_eq!(
            "http://www.example.com",
            &escape_pattern("http://www.example.com")
        );
        assert_eq!("/a/b/c", &escape_pattern("/a/b/c"));
        assert_eq!("%AA", &escape_pattern("%aa"));
        assert_eq!("%AA", &escape_pattern("%aA"));
        assert_eq!("/Sanjos%C3%A9Sellers", &escape_pattern("/SanjoséSellers"));
        assert_eq!("%C3%A1", &escape_pattern("á"));
    }
}
433}