Skip to main content

robotstxt/
lib.rs

1// Copyright 2020 Folyd
2// Copyright 1999 Google LLC
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//     https://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16//!
17//! A native Rust port of [Google's robots.txt parser and matcher C++ library](https://github.com/google/robotstxt).
18//!
//! - Native Rust port, no third-party crate dependencies
//! - Preserves all behaviour of the original library
//! - Passes 100% of Google's original tests
22//!
23//! # Quick start
24//! ```rust
25//! use robotstxt::DefaultMatcher;
26//!
27//! let mut matcher = DefaultMatcher::default();
28//! let robots_body = "user-agent: FooBot\n\
29//!                    disallow: /\n";
30//! assert_eq!(false, matcher.one_agent_allowed_by_robots(robots_body, "FooBot", "https://foo.com/"));
31//! ```
32
33use std::borrow::Cow;
34
35use matcher::{LongestMatchRobotsMatchStrategy, RobotsMatcher};
36use parser::RobotsTxtParser;
37
38/// A matcher module.
39pub mod matcher;
40/// A parser module.
41pub mod parser;
42
43/// A default [RobotsMatcher] with [LongestMatchRobotsMatchStrategy].
44pub type DefaultMatcher<'a> = RobotsMatcher<'a, LongestMatchRobotsMatchStrategy>;
45
46/// Handler for directives found in robots.txt.
pub trait RobotsParseHandler {
    /// Signals the beginning of a robots.txt body.
    ///
    /// Implementations typically reset per-document state here; presumably the
    /// parser calls this once before any directive callback (confirm in `parser`).
    fn handle_robots_start(&mut self);
    /// Signals the end of a robots.txt body.
    ///
    /// Presumably called once after the last directive callback (confirm in `parser`).
    fn handle_robots_end(&mut self);
    /// Reports a `user-agent` directive.
    ///
    /// `line_num` is the line number supplied by the parser and `user_agent`
    /// is the directive's value.
    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str);
    /// Reports an `allow` directive with its `value` at `line_num`.
    fn handle_allow(&mut self, line_num: u32, value: &str);
    /// Reports a `disallow` directive with its `value` at `line_num`.
    fn handle_disallow(&mut self, line_num: u32, value: &str);
    /// Reports a `sitemap` directive with its `value` at `line_num`.
    fn handle_sitemap(&mut self, line_num: u32, value: &str);
    /// Any other unrecognized name/value pairs.
    ///
    /// `action` is the unrecognized directive name and `value` its value.
    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str);
}
57
58/// Extracts path (with params) and query part from URL. Removes scheme,
59/// authority, and fragment. Result always starts with "/".
60/// Returns "/" if the url doesn't have a path or is not valid.
61/// ```rust
62///use robotstxt::get_path_params_query;
63///
64///let f= get_path_params_query;
65///assert_eq!("/", f(""));
66///assert_eq!("/", f("http://www.example.com"));
67///assert_eq!("/", f("http://www.example.com/"));
68///assert_eq!("/a", f("http://www.example.com/a"));
69///assert_eq!("/a/", f("http://www.example.com/a/"));
70///assert_eq!(
71///    "/a/b?c=http://d.e/",
72///    f("http://www.example.com/a/b?c=http://d.e/")
73///);
74///assert_eq!(
75///    "/a/b?c=d&e=f",
76///    f("http://www.example.com/a/b?c=d&e=f#fragment")
77///);
78///assert_eq!("/", f("example.com"));
79///assert_eq!("/", f("example.com/"));
80///assert_eq!("/a", f("example.com/a"));
81///assert_eq!("/a/", f("example.com/a/"));
82///assert_eq!("/a/b?c=d&e=f", f("example.com/a/b?c=d&e=f#fragment"));
83///assert_eq!("/", f("a"));
84///assert_eq!("/", f("a/"));
85///assert_eq!("/a", f("/a"));
86///assert_eq!("/b", f("a/b"));
87///assert_eq!("/?a", f("example.com?a"));
88///assert_eq!("/a;b", f("example.com/a;b#c"));
89///assert_eq!("/b/c", f("//a/b/c"));
90/// ```
pub fn get_path_params_query(url: &str) -> Cow<str> {
    // A path, params or query section starts at one of these characters.
    fn is_path_delimiter(c: char) -> bool {
        matches!(c, '/' | '?' | ';')
    }
    // Byte offset of the first path delimiter located at or after `from`.
    let first_delimiter_from =
        |from: usize| url[from..].find(is_path_delimiter).map(|offset| from + offset);

    // A leading "//" (scheme-relative URL) is not part of the path.
    let scan_from = if url.starts_with("//") { 2 } else { 0 };

    // Work out where the authority begins: right after "://" when that marker
    // really terminates a scheme, otherwise at the start of the scanned text.
    let authority_start = match url[scan_from..].find("://") {
        Some(offset) => {
            let scheme_end = scan_from + offset;
            // A path/params/query delimiter in front of "://" means the marker
            // belongs to the path or query, not to a scheme.
            let delimiter_precedes_scheme =
                first_delimiter_from(scan_from).map_or(false, |d| d < scheme_end);
            if delimiter_precedes_scheme {
                scan_from
            } else {
                scheme_end + 3
            }
        }
        None => scan_from,
    };

    // No delimiter after the authority: the URL has no path at all.
    let path_start = match first_delimiter_from(authority_start) {
        Some(index) => index,
        None => return Cow::Borrowed("/"),
    };

    // The fragment is cut off; a '#' ahead of the path leaves nothing to return.
    let path_end = match url[scan_from..].find('#').map(|offset| scan_from + offset) {
        Some(hash) if hash < path_start => return Cow::Borrowed("/"),
        Some(hash) => hash,
        None => url.len(),
    };

    let path = &url[path_start..path_end];
    if path.starts_with('/') {
        Cow::Borrowed(path)
    } else {
        // Prepend a slash when the result would start with '?' or ';'.
        Cow::Owned(format!("/{}", path))
    }
}
138
139/// Parses body of a robots.txt and emits parse callbacks. This will accept
140/// typical typos found in robots.txt, such as 'disalow'.
141///
/// Note, this function will accept all kinds of input but will skip
/// everything that does not look like a robots directive.
144pub fn parse_robotstxt(robots_body: &str, parse_callback: &mut impl RobotsParseHandler) {
145    let mut parser = RobotsTxtParser::new(robots_body, parse_callback);
146    parser.parse();
147}
148
149#[cfg(test)]
150mod tests {
151    #![allow(unused_variables)]
152
153    use super::*;
154
    /// Test-only parse handler that tallies the callbacks it receives so the
    /// tests can assert on directive counts, line numbers and sitemap values.
    #[derive(Default)]
    struct RobotsStatsReporter {
        // Line number passed to the most recent `digest` or
        // `handle_unknown_action` call.
        last_line_seen: u32,
        // Number of `digest` calls, i.e. user-agent / allow / disallow /
        // sitemap callbacks received.
        valid_directives: u32,
        // Number of `handle_unknown_action` callbacks received.
        unknown_directives: u32,
        // Values passed to `handle_sitemap`, appended in call order.
        sitemap: String,
    }
162
163    impl RobotsStatsReporter {
164        fn digest(&mut self, line_num: u32) {
165            assert!(line_num >= self.last_line_seen);
166            self.last_line_seen = line_num;
167            self.valid_directives += 1;
168        }
169    }
170
171    impl RobotsParseHandler for RobotsStatsReporter {
172        fn handle_robots_start(&mut self) {
173            self.last_line_seen = 0;
174            self.valid_directives = 0;
175            self.unknown_directives = 0;
176            self.sitemap.clear();
177        }
178
179        fn handle_robots_end(&mut self) {}
180
181        fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
182            self.digest(line_num);
183        }
184
185        fn handle_allow(&mut self, line_num: u32, value: &str) {
186            self.digest(line_num);
187        }
188
189        fn handle_disallow(&mut self, line_num: u32, value: &str) {
190            self.digest(line_num);
191        }
192
193        fn handle_sitemap(&mut self, line_num: u32, value: &str) {
194            self.digest(line_num);
195            self.sitemap.push_str(value);
196        }
197
198        // Any other unrecognized name/v pairs.
199        fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
200            self.last_line_seen = line_num;
201            self.unknown_directives += 1;
202        }
203    }
204
205    #[test]
206    // Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
207    fn test_lines_numbers_are_counted_correctly() {
208        let mut report = RobotsStatsReporter::default();
209        let unix_file = "User-Agent: foo\n\
210        Allow: /some/path\n\
211        User-Agent: bar\n\
212        \n\
213        \n\
214        Disallow: /\n";
215        super::parse_robotstxt(unix_file, &mut report);
216        assert_eq!(4, report.valid_directives);
217        assert_eq!(6, report.last_line_seen);
218
219        let mac_file = "User-Agent: foo\r\
220        Allow: /some/path\r\
221        User-Agent: bar\r\
222        \r\
223        \r\
224        Disallow: /\r";
225        super::parse_robotstxt(mac_file, &mut report);
226        assert_eq!(4, report.valid_directives);
227        assert_eq!(6, report.last_line_seen);
228
229        let no_final_new_line = "User-Agent: foo\n\
230        Allow: /some/path\n\
231        User-Agent: bar\n\
232        \n\
233        \n\
234        Disallow: /";
235        super::parse_robotstxt(no_final_new_line, &mut report);
236        assert_eq!(4, report.valid_directives);
237        assert_eq!(6, report.last_line_seen);
238
239        let mixed_file = "User-Agent: foo\n\
240        Allow: /some/path\r\n\
241        User-Agent: bar\n\
242        \r\n\
243        \n\
244        Disallow: /";
245        super::parse_robotstxt(mixed_file, &mut report);
246        assert_eq!(4, report.valid_directives);
247        assert_eq!(6, report.last_line_seen);
248    }
249
250    #[test]
251    // BOM characters are unparseable and thus skipped. The rules following the line
252    // are used.
253    fn test_utf8_byte_order_mark_is_skipped() {
254        let mut report = RobotsStatsReporter::default();
255        let utf8_file_full_bom = "\u{EF}\u{BB}\u{BF}\
256        User-Agent: foo\n\
257        Allow: /AnyValue\n";
258        super::parse_robotstxt(utf8_file_full_bom, &mut report);
259        assert_eq!(2, report.valid_directives);
260        assert_eq!(0, report.unknown_directives);
261
262        // We allow as well partial ByteOrderMarks.
263        let utf8_file_partial_2bom = "\u{EF}\u{BB}\
264        User-Agent: foo\n\
265        Allow: /AnyValue\n";
266        super::parse_robotstxt(utf8_file_partial_2bom, &mut report);
267        assert_eq!(2, report.valid_directives);
268        assert_eq!(0, report.unknown_directives);
269
270        let utf8_file_partial_1bom = "\u{EF}\
271        User-Agent: foo\n\
272        Allow: /AnyValue\n";
273        super::parse_robotstxt(utf8_file_partial_1bom, &mut report);
274        assert_eq!(2, report.valid_directives);
275        assert_eq!(0, report.unknown_directives);
276
277        // If the BOM is not the right sequence, the first line looks like garbage
278        // that is skipped (we essentially see "\x11\xBFUser-Agent").
279        let utf8_file_broken_bom = "\u{EF}\u{11}\u{BF}\
280        User-Agent: foo\n\
281        Allow: /AnyValue\n";
282        super::parse_robotstxt(utf8_file_broken_bom, &mut report);
283        assert_eq!(1, report.valid_directives);
284        // // We get one broken line.
285        assert_eq!(1, report.unknown_directives);
286
287        // Some other messed up file: BOMs only valid in the beginning of the file.
288        let utf8_bom_somewhere_in_middle_of_file = "User-Agent: foo\n\
289        \u{EF}\u{BB}\u{BF}\
290        Allow: /AnyValue\n";
291        super::parse_robotstxt(utf8_bom_somewhere_in_middle_of_file, &mut report);
292        assert_eq!(1, report.valid_directives);
293        assert_eq!(1, report.unknown_directives);
294    }
295
296    #[test]
297    // Google specific: the I-D allows any line that crawlers might need, such as
298    // sitemaps, which Google supports.
299    // See REP I-D section "Other records".
300    // https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
301    fn test_non_standard_line_example_sitemap() {
302        let mut report = RobotsStatsReporter::default();
303
304        {
305            let sitemap_loc = "http://foo.bar/sitemap.xml";
306            let mut robotstxt: String = "User-Agent: foo\n\
307        Allow: /some/path\n\
308        User-Agent: bar\n\
309        \n\
310        \n"
311            .into();
312            robotstxt.push_str(&format!("Sitemap: {}\n", sitemap_loc));
313
314            super::parse_robotstxt(&robotstxt, &mut report);
315            assert_eq!(sitemap_loc, report.sitemap.as_str());
316        }
317
318        {
319            // A sitemap line may appear anywhere in the file.
320            let mut robotstxt = String::new();
321            let sitemap_loc = "http://foo.bar/sitemap.xml";
322            let robotstxt_temp = "User-Agent: foo\n\
323            Allow: /some/path\n\
324            User-Agent: bar\n\
325            \n\
326            \n";
327            robotstxt.push_str(&format!("Sitemap: {}\n{}", sitemap_loc, robotstxt_temp));
328
329            super::parse_robotstxt(&robotstxt, &mut report);
330            assert_eq!(sitemap_loc, report.sitemap.as_str());
331        }
332    }
333
334    #[test]
335    fn test_blank_line_case() {
336        let robots_content = r#"User-agent: *
337Disallow: /*q=
338Disallow: /users/*?
339Disallow: /join/*?
340Disallow: /morelikethis/
341Disallow: /download/
342Disallow: /checkout/
343Disallow: /global/
344Disallow: /api/
345Disallow: /critiques/
346 
347Sitemap: http://sitemaps.test.net/sitemap-index.xml.gz"#;
348        let mut matcher = DefaultMatcher::default();
349        assert!(matcher.one_agent_allowed_by_robots(
350            &robots_content,
351            "bot",
352            "https://www.test.com/"
353        ));
354    }
355
356    #[test]
357    fn test_unknown_robotstxt_case() {
358        let robots_content = "#!/usr/bin/env bash\n\
359# Make sure you have `curl` installed\n\
360\n\
361######## VARIABLES #########\n\
362abc";
363        let mut matcher = DefaultMatcher::default();
364        assert!(matcher.one_agent_allowed_by_robots(
365            &robots_content,
366            "bot",
367            "https://www.test.com/"
368        ));
369    }
370}