// robotstxt_with_cache/lib.rs

1// Copyright 2020 Folyd
2// Copyright 1999 Google LLC
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//     https://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16//!
17//! A native Rust port of [Google's robots.txt parser and matcher C++ library](https://github.com/google/robotstxt).
18//!
//! - Native Rust port, no third-party crate dependency
20//! - Preserves all behaviour of original library
21//! - 100% google original test passed
22//!
23//! # Quick start
24//! ```rust
25//! use robotstxt::DefaultMatcher;
26//!
27//! let mut matcher = DefaultMatcher::default();
28//! let robots_body = "user-agent: FooBot\n\
29//!                    disallow: /\n";
30//! assert_eq!(false, matcher.one_agent_allowed_by_robots(robots_body, "FooBot", "https://foo.com/"));
31//! ```
32
33pub mod matcher;
34pub mod parser;
35
36use crate::matcher::{CachingRobotsMatcher, LongestMatchRobotsMatchStrategy, RobotsMatcher};
37use crate::parser::RobotsTxtParser;
38
/// Non-caching matcher using the longest-match precedence strategy
/// ported from Google's robots.txt library.
pub type DefaultMatcher = RobotsMatcher<LongestMatchRobotsMatchStrategy>;
/// Caching variant of [`DefaultMatcher`]; wraps the same longest-match
/// strategy (see [`matcher::CachingRobotsMatcher`]).
pub type DefaultCachingMatcher = CachingRobotsMatcher<LongestMatchRobotsMatchStrategy>;
41
/// Handler for directives found in robots.txt.
///
/// Implementors receive one callback per directive as [`parse_robotstxt`]
/// walks the body; `line_num` is the line the directive was found on
/// (1-based, per the line-counting tests in this crate).
pub trait RobotsParseHandler {
    /// Called once before any directive callback is emitted.
    fn handle_robots_start(&mut self);
    /// Called once after the whole body has been processed.
    fn handle_robots_end(&mut self);
    /// A `User-agent:` line with the given agent value.
    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str);
    /// An `Allow:` line with the given path pattern value.
    fn handle_allow(&mut self, line_num: u32, value: &str);
    /// A `Disallow:` line with the given path pattern value.
    fn handle_disallow(&mut self, line_num: u32, value: &str);
    /// A `Sitemap:` line with the given location value.
    fn handle_sitemap(&mut self, line_num: u32, value: &str);
    /// Any other unrecognized name/value pairs.
    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str);
}
53
/// Extracts path (with params) and query part from URL. Removes scheme,
/// authority, and fragment. Result always starts with "/".
/// Returns "/" if the url doesn't have a path or is not valid.
/// ```rust
/// use robotstxt::get_path_params_query;
///
/// let f = get_path_params_query;
/// assert_eq!("/", f(""));
/// assert_eq!("/", f("http://www.example.com"));
/// assert_eq!("/", f("http://www.example.com/"));
/// assert_eq!("/a", f("http://www.example.com/a"));
/// assert_eq!("/a/", f("http://www.example.com/a/"));
/// assert_eq!(
///     "/a/b?c=http://d.e/",
///     f("http://www.example.com/a/b?c=http://d.e/")
/// );
/// assert_eq!(
///     "/a/b?c=d&e=f",
///     f("http://www.example.com/a/b?c=d&e=f#fragment")
/// );
/// assert_eq!("/", f("example.com"));
/// assert_eq!("/", f("example.com/"));
/// assert_eq!("/a", f("example.com/a"));
/// assert_eq!("/a/", f("example.com/a/"));
/// assert_eq!("/a/b?c=d&e=f", f("example.com/a/b?c=d&e=f#fragment"));
/// assert_eq!("/", f("a"));
/// assert_eq!("/", f("a/"));
/// assert_eq!("/a", f("/a"));
/// assert_eq!("/b", f("a/b"));
/// assert_eq!("/?a", f("example.com?a"));
/// assert_eq!("/a;b", f("example.com/a;b#c"));
/// assert_eq!("/b/c", f("//a/b/c"));
/// ```
pub fn get_path_params_query(url: &str) -> String {
    /// Byte index of the first occurrence, at or after `start_position`,
    /// of any character contained in `pattern`.
    fn find_first_of(s: &str, pattern: &str, start_position: usize) -> Option<usize> {
        s[start_position..]
            .find(|c| pattern.contains(c))
            .map(|pos| pos + start_position)
    }
    /// Byte index of the first occurrence of `pattern` at or after `start_position`.
    fn find(s: &str, pattern: &str, start_position: usize) -> Option<usize> {
        s[start_position..]
            .find(pattern)
            .map(|pos| pos + start_position)
    }

    // Initial two slashes are ignored (protocol-relative references, "//a/b/c").
    let search_start = if url.starts_with("//") { 2 } else { 0 };

    let early_path = find_first_of(url, "/?;", search_start);
    // "://" only denotes a protocol when no path, param, or query starts
    // before it; otherwise it is just part of the path/query.
    let protocol_end = find(url, "://", search_start)
        .filter(|&pos| early_path.map_or(true, |p| p >= pos));
    // Begin the path search right after "://", or at `search_start` when the
    // URL has no scheme.
    let path_search_start = protocol_end.map_or(search_start, |pos| pos + 3);

    if let Some(path_start) = find_first_of(url, "/?;", path_search_start) {
        let hash_pos = find(url, "#", search_start);
        if hash_pos.map_or(false, |pos| pos < path_start) {
            // The fragment begins before the path does: no usable path.
            return String::from("/");
        }

        // Strip the fragment (everything from '#' on), if present.
        let path_end = hash_pos.unwrap_or(url.len());
        if url.get(path_start..=path_start) != Some("/") {
            // Prepend a slash if the result would start e.g. with '?'.
            return format!("/{}", &url[path_start..path_end]);
        }
        return url[path_start..path_end].to_string();
    }

    String::from("/")
}
134
135/// Parses body of a robots.txt and emits parse callbacks. This will accept
136/// typical typos found in robots.txt, such as 'disalow'.
137///
138/// Note, this function will accept all kind of input but will skip
139/// everything that does not look like a robots directive.
140pub fn parse_robotstxt(robots_body: &str, parse_callback: &mut impl RobotsParseHandler) {
141    let mut parser = RobotsTxtParser::new(robots_body, parse_callback);
142    parser.parse();
143}
144
#[cfg(test)]
mod tests {
    #![allow(unused_variables)]

    use super::*;

    /// Gathers statistics from the parser callbacks so tests can assert on
    /// line counting and directive classification.
    #[derive(Default)]
    struct RobotsStatsReporter {
        // Line number of the most recent directive callback.
        last_line_seen: u32,
        // Count of recognized directives (user-agent/allow/disallow/sitemap).
        valid_directives: u32,
        // Count of unrecognized name/value pairs.
        unknown_directives: u32,
        // Concatenation of all sitemap values seen, in callback order.
        sitemap: String,
    }

    impl RobotsStatsReporter {
        /// Records one recognized directive, asserting that the reported
        /// line numbers never decrease over the course of a parse.
        fn digest(&mut self, line_num: u32) {
            assert!(line_num >= self.last_line_seen);
            self.last_line_seen = line_num;
            self.valid_directives += 1;
        }
    }

    impl RobotsParseHandler for RobotsStatsReporter {
        // Reset all state so one reporter instance can be reused across files.
        fn handle_robots_start(&mut self) {
            self.last_line_seen = 0;
            self.valid_directives = 0;
            self.unknown_directives = 0;
            self.sitemap.clear();
        }

        fn handle_robots_end(&mut self) {}

        fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
            self.digest(line_num);
        }

        fn handle_allow(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
        }

        fn handle_disallow(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
        }

        fn handle_sitemap(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
            self.sitemap.push_str(value);
        }

        // Any other unrecognized name/value pairs.
        fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
            self.last_line_seen = line_num;
            self.unknown_directives += 1;
        }
    }

    #[test]
    // Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
    fn test_lines_numbers_are_counted_correctly() {
        let mut report = RobotsStatsReporter::default();
        let unix_file = "User-Agent: foo\n\
        Allow: /some/path\n\
        User-Agent: bar\n\
        \n\
        \n\
        Disallow: /\n";
        super::parse_robotstxt(unix_file, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);

        let mac_file = "User-Agent: foo\r\
        Allow: /some/path\r\
        User-Agent: bar\r\
        \r\
        \r\
        Disallow: /\r";
        super::parse_robotstxt(mac_file, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);

        // A missing trailing newline must not drop the final directive.
        let no_final_new_line = "User-Agent: foo\n\
        Allow: /some/path\n\
        User-Agent: bar\n\
        \n\
        \n\
        Disallow: /";
        super::parse_robotstxt(no_final_new_line, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);

        let mixed_file = "User-Agent: foo\n\
        Allow: /some/path\r\n\
        User-Agent: bar\n\
        \r\n\
        \n\
        Disallow: /";
        super::parse_robotstxt(mixed_file, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);
    }

    #[test]
    // BOM characters are unparseable and thus skipped. The rules following the line
    // are used.
    // NOTE(review): "\u{EF}\u{BB}\u{BF}" is the chars U+00EF U+00BB U+00BF,
    // not the single U+FEFF BOM code point; presumably this mirrors the C++
    // test's raw bytes 0xEF 0xBB 0xBF — confirm against the parser's BOM
    // handling in parser.rs.
    fn test_utf8_byte_order_mark_is_skipped() {
        let mut report = RobotsStatsReporter::default();
        let utf8_file_full_bom = "\u{EF}\u{BB}\u{BF}\
        User-Agent: foo\n\
        Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_full_bom, &mut report);
        assert_eq!(2, report.valid_directives);
        assert_eq!(0, report.unknown_directives);

        // We allow as well partial ByteOrderMarks.
        let utf8_file_partial_2bom = "\u{EF}\u{BB}\
        User-Agent: foo\n\
        Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_partial_2bom, &mut report);
        assert_eq!(2, report.valid_directives);
        assert_eq!(0, report.unknown_directives);

        let utf8_file_partial_1bom = "\u{EF}\
        User-Agent: foo\n\
        Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_partial_1bom, &mut report);
        assert_eq!(2, report.valid_directives);
        assert_eq!(0, report.unknown_directives);

        // If the BOM is not the right sequence, the first line looks like garbage
        // that is skipped (we essentially see "\x11\xBFUser-Agent").
        let utf8_file_broken_bom = "\u{EF}\u{11}\u{BF}\
        User-Agent: foo\n\
        Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_broken_bom, &mut report);
        assert_eq!(1, report.valid_directives);
        // We get one broken line.
        assert_eq!(1, report.unknown_directives);

        // Some other messed up file: BOMs only valid in the beginning of the file.
        let utf8_bom_somewhere_in_middle_of_file = "User-Agent: foo\n\
        \u{EF}\u{BB}\u{BF}\
        Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_bom_somewhere_in_middle_of_file, &mut report);
        assert_eq!(1, report.valid_directives);
        assert_eq!(1, report.unknown_directives);
    }

    #[test]
    // Google specific: the I-D allows any line that crawlers might need, such as
    // sitemaps, which Google supports.
    // See REP I-D section "Other records".
    // https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
    fn test_non_standard_line_example_sitemap() {
        let mut report = RobotsStatsReporter::default();

        {
            let sitemap_loc = "http://foo.bar/sitemap.xml";
            let mut robotstxt: String = "User-Agent: foo\n\
        Allow: /some/path\n\
        User-Agent: bar\n\
        \n\
        \n"
            .into();
            robotstxt.push_str(&format!("Sitemap: {}\n", sitemap_loc));

            super::parse_robotstxt(&robotstxt, &mut report);
            assert_eq!(sitemap_loc, report.sitemap.as_str());
        }

        {
            // A sitemap line may appear anywhere in the file.
            let mut robotstxt = String::new();
            let sitemap_loc = "http://foo.bar/sitemap.xml";
            let robotstxt_temp = "User-Agent: foo\n\
            Allow: /some/path\n\
            User-Agent: bar\n\
            \n\
            \n";
            robotstxt.push_str(&format!("Sitemap: {}\n{}", sitemap_loc, robotstxt_temp));

            super::parse_robotstxt(&robotstxt, &mut report);
            assert_eq!(sitemap_loc, report.sitemap.as_str());
        }
    }

    #[test]
    // A blank line inside a group must not break matching of later records.
    fn test_blank_line_case() {
        let robots_content = r#"User-agent: *
Disallow: /*q=
Disallow: /users/*?
Disallow: /join/*?
Disallow: /morelikethis/
Disallow: /download/
Disallow: /checkout/
Disallow: /global/
Disallow: /api/
Disallow: /critiques/
Sitemap: http://sitemaps.test.net/sitemap-index.xml.gz"#;
        let mut matcher = DefaultMatcher::default();
        assert!(matcher.one_agent_allowed_by_robots(
            robots_content,
            "bot",
            "https://www.test.com/"
        ));
    }

    #[test]
    // A body that is not robots.txt at all (e.g. a shell script) must be
    // treated as allowing everything.
    fn test_unknown_robotstxt_case() {
        let robots_content = "#!/usr/bin/env bash\n\
# Make sure you have `curl` installed\n\
\n\
######## VARIABLES #########\n\
abc";
        let mut matcher = DefaultMatcher::default();
        assert!(matcher.one_agent_allowed_by_robots(
            robots_content,
            "bot",
            "https://www.test.com/"
        ));
    }
}
366}