1use std::borrow::Cow;
34
35use matcher::{LongestMatchRobotsMatchStrategy, RobotsMatcher};
36use parser::RobotsTxtParser;
37
38pub mod matcher;
40pub mod parser;
42
/// Convenience alias for a [`RobotsMatcher`] parameterized with
/// [`LongestMatchRobotsMatchStrategy`], so callers do not have to spell out
/// the strategy type themselves.
pub type DefaultMatcher<'a> = RobotsMatcher<'a, LongestMatchRobotsMatchStrategy>;
45
/// Callback interface through which a robots.txt parse reports what it found.
///
/// [`parse_robotstxt`] passes an implementor to `RobotsTxtParser::new`; the
/// parser itself lives in the `parser` module, so the exact call sequence is
/// not visible from this file. The tests in this file rely on the following:
/// `handle_robots_start` runs once per parse before any directive callback,
/// and `line_num` is 1-based (a directive on the sixth line reports `6`).
pub trait RobotsParseHandler {
    /// Signals the beginning of a parse; implementors typically reset any
    /// state accumulated from a previous parse here.
    fn handle_robots_start(&mut self);
    /// Signals the end of a parse (presumably after the last directive has
    /// been reported — confirm against the parser).
    fn handle_robots_end(&mut self);
    /// Reports a user-agent directive found on line `line_num`;
    /// `user_agent` is the directive's value.
    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str);
    /// Reports an allow directive found on line `line_num` with the given
    /// `value` (e.g. `/some/path`).
    fn handle_allow(&mut self, line_num: u32, value: &str);
    /// Reports a disallow directive found on line `line_num` with the given
    /// `value`.
    fn handle_disallow(&mut self, line_num: u32, value: &str);
    /// Reports a sitemap directive found on line `line_num`; `value` is the
    /// sitemap location (e.g. `http://foo.bar/sitemap.xml`).
    fn handle_sitemap(&mut self, line_num: u32, value: &str);
    /// Reports a line whose directive name was not recognized; `action` is
    /// presumably the unrecognized key and `value` the text that followed it
    /// (confirm against the parser).
    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str);
}
57
/// Extracts the path (including params) and query from `url`, dropping the
/// scheme, the authority and any `#fragment`.
///
/// The result always starts with `/`:
///
/// * `http://www.example.com/a/b?c=d&e=f#fragment` gives `/a/b?c=d&e=f`
/// * `example.com?a` gives `/?a` (a slash is prepended)
/// * `//a/b/c` gives `/b/c` (two leading slashes are ignored)
/// * `example.com`, `a` and the empty string give `/`
///
/// The input is borrowed whenever possible; a new `String` is allocated only
/// when a leading slash has to be prepended.
pub fn get_path_params_query(url: &str) -> Cow<'_, str> {
    // Characters that can open the path, params or query component.
    const PATH_START_CHARS: &[char] = &['/', '?', ';'];
    const SCHEME_SEPARATOR: &str = "://";

    // Initial two slashes are ignored.
    let search_start = if url.starts_with("//") { 2 } else { 0 };
    let tail = &url[search_start..];

    // Where the authority (host) begins. If a path, params or query character
    // shows up before "://", that "://" does not indicate a protocol.
    let authority_start = match (tail.find(PATH_START_CHARS), tail.find(SCHEME_SEPARATOR)) {
        (Some(path), Some(scheme)) if path < scheme => search_start,
        (_, Some(scheme)) => search_start + scheme + SCHEME_SEPARATOR.len(),
        (_, None) => search_start,
    };

    let path_start = match url[authority_start..].find(PATH_START_CHARS) {
        Some(offset) => authority_start + offset,
        None => return Cow::Borrowed("/"),
    };

    // The path ends at the fragment, if any. A fragment that starts before
    // the path means everything after the authority is fragment text.
    let path_end = match tail.find('#').map(|offset| search_start + offset) {
        Some(hash_pos) if hash_pos < path_start => return Cow::Borrowed("/"),
        Some(hash_pos) => hash_pos,
        None => url.len(),
    };

    let path = &url[path_start..path_end];
    if path.starts_with('/') {
        Cow::Borrowed(path)
    } else {
        // Prepend a slash if the result would start e.g. with '?'.
        Cow::Owned(format!("/{}", path))
    }
}
138
139pub fn parse_robotstxt(robots_body: &str, parse_callback: &mut impl RobotsParseHandler) {
145 let mut parser = RobotsTxtParser::new(robots_body, parse_callback);
146 parser.parse();
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Test handler that tallies the callbacks it receives during a parse.
    #[derive(Default)]
    struct RobotsStatsReporter {
        /// Line number carried by the most recent callback.
        last_line_seen: u32,
        /// Number of user-agent, allow, disallow and sitemap callbacks.
        valid_directives: u32,
        /// Number of `handle_unknown_action` callbacks.
        unknown_directives: u32,
        /// Concatenation of every sitemap value reported since the last start.
        sitemap: String,
    }

    impl RobotsStatsReporter {
        /// Records one recognized directive and checks that line numbers
        /// never go backwards.
        fn digest(&mut self, line_num: u32) {
            assert!(line_num >= self.last_line_seen);
            self.last_line_seen = line_num;
            self.valid_directives += 1;
        }
    }

    // Parameters the reporter does not inspect are prefixed with `_` instead
    // of silencing `unused_variables` for the whole module.
    impl RobotsParseHandler for RobotsStatsReporter {
        fn handle_robots_start(&mut self) {
            // Reset everything so one reporter can be reused across parses.
            self.last_line_seen = 0;
            self.valid_directives = 0;
            self.unknown_directives = 0;
            self.sitemap.clear();
        }

        fn handle_robots_end(&mut self) {}

        fn handle_user_agent(&mut self, line_num: u32, _user_agent: &str) {
            self.digest(line_num);
        }

        fn handle_allow(&mut self, line_num: u32, _value: &str) {
            self.digest(line_num);
        }

        fn handle_disallow(&mut self, line_num: u32, _value: &str) {
            self.digest(line_num);
        }

        fn handle_sitemap(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
            self.sitemap.push_str(value);
        }

        fn handle_unknown_action(&mut self, line_num: u32, _action: &str, _value: &str) {
            self.last_line_seen = line_num;
            self.unknown_directives += 1;
        }
    }

    /// The same six-line file must yield the same counts regardless of the
    /// line-ending convention.
    #[test]
    fn test_lines_numbers_are_counted_correctly() {
        let unix_file = "User-Agent: foo\n\
                         Allow: /some/path\n\
                         User-Agent: bar\n\
                         \n\
                         \n\
                         Disallow: /\n";
        let mac_file = "User-Agent: foo\r\
                        Allow: /some/path\r\
                        User-Agent: bar\r\
                        \r\
                        \r\
                        Disallow: /\r";
        let no_final_new_line = "User-Agent: foo\n\
                                 Allow: /some/path\n\
                                 User-Agent: bar\n\
                                 \n\
                                 \n\
                                 Disallow: /";
        let mixed_file = "User-Agent: foo\n\
                          Allow: /some/path\r\n\
                          User-Agent: bar\n\
                          \r\n\
                          \n\
                          Disallow: /";

        let cases = [
            ("unix_file", unix_file),
            ("mac_file", mac_file),
            ("no_final_new_line", no_final_new_line),
            ("mixed_file", mixed_file),
        ];

        // One reporter is reused on purpose: `handle_robots_start` must reset it.
        let mut report = RobotsStatsReporter::default();
        for &(name, body) in cases.iter() {
            parse_robotstxt(body, &mut report);
            assert_eq!(4, report.valid_directives, "case: {}", name);
            assert_eq!(6, report.last_line_seen, "case: {}", name);
        }
    }

    /// A (possibly truncated) byte-order-mark prefix at the very start of the
    /// file is skipped; anything else ends up as part of the first key.
    ///
    /// NOTE(review): in a Rust string literal `\u{EF}\u{BB}\u{BF}` denotes the
    /// three code points U+00EF U+00BB U+00BF, not the UTF-8 BOM byte sequence
    /// EF BB BF (which would be written `\u{FEFF}`). The literals are kept
    /// as-is; confirm that this is what the parser is meant to match.
    #[test]
    fn test_utf8_byte_order_mark_is_skipped() {
        let utf8_file_full_bom = "\u{EF}\u{BB}\u{BF}\
                                  User-Agent: foo\n\
                                  Allow: /AnyValue\n";
        // The parser is expected to also accept a partial BOM prefix.
        let utf8_file_partial_2bom = "\u{EF}\u{BB}\
                                      User-Agent: foo\n\
                                      Allow: /AnyValue\n";
        let utf8_file_partial_1bom = "\u{EF}\
                                      User-Agent: foo\n\
                                      Allow: /AnyValue\n";
        // A broken prefix is not skipped in full, so the first line is
        // reported as an unknown directive.
        let utf8_file_broken_bom = "\u{EF}\u{11}\u{BF}\
                                    User-Agent: foo\n\
                                    Allow: /AnyValue\n";
        // The prefix is only skipped at the start of the file.
        let utf8_bom_somewhere_in_middle_of_file = "User-Agent: foo\n\
                                                    \u{EF}\u{BB}\u{BF}\
                                                    Allow: /AnyValue\n";

        // (case name, robots.txt body, expected valid, expected unknown)
        let cases = [
            ("utf8_file_full_bom", utf8_file_full_bom, 2, 0),
            ("utf8_file_partial_2bom", utf8_file_partial_2bom, 2, 0),
            ("utf8_file_partial_1bom", utf8_file_partial_1bom, 2, 0),
            ("utf8_file_broken_bom", utf8_file_broken_bom, 1, 1),
            (
                "utf8_bom_somewhere_in_middle_of_file",
                utf8_bom_somewhere_in_middle_of_file,
                1,
                1,
            ),
        ];

        let mut report = RobotsStatsReporter::default();
        for &(name, body, valid, unknown) in cases.iter() {
            parse_robotstxt(body, &mut report);
            assert_eq!(valid, report.valid_directives, "case: {}", name);
            assert_eq!(unknown, report.unknown_directives, "case: {}", name);
        }
    }

    /// A `Sitemap:` line is reported whether it comes after or before the
    /// user-agent groups.
    #[test]
    fn test_non_standard_line_example_sitemap() {
        let sitemap_loc = "http://foo.bar/sitemap.xml";
        let groups = "User-Agent: foo\n\
                      Allow: /some/path\n\
                      User-Agent: bar\n\
                      \n\
                      \n";
        let mut report = RobotsStatsReporter::default();

        // Sitemap at the end of the file.
        let sitemap_last = format!("{}Sitemap: {}\n", groups, sitemap_loc);
        parse_robotstxt(&sitemap_last, &mut report);
        assert_eq!(sitemap_loc, report.sitemap.as_str());

        // Sitemap at the beginning of the file.
        let sitemap_first = format!("Sitemap: {}\n{}", sitemap_loc, groups);
        parse_robotstxt(&sitemap_first, &mut report);
        assert_eq!(sitemap_loc, report.sitemap.as_str());
    }

    /// A blank line before `Sitemap:` must not stop the root URL from being
    /// allowed.
    #[test]
    fn test_blank_line_case() {
        // The raw string is deliberately flush-left: its whitespace is data.
        let robots_content = r#"User-agent: *
Disallow: /*q=
Disallow: /users/*?
Disallow: /join/*?
Disallow: /morelikethis/
Disallow: /download/
Disallow: /checkout/
Disallow: /global/
Disallow: /api/
Disallow: /critiques/

Sitemap: http://sitemaps.test.net/sitemap-index.xml.gz"#;
        let mut matcher = DefaultMatcher::default();
        assert!(matcher.one_agent_allowed_by_robots(
            robots_content,
            "bot",
            "https://www.test.com/"
        ));
    }

    /// Content that is not a robots.txt file at all (here a shell script)
    /// must leave the URL allowed.
    #[test]
    fn test_unknown_robotstxt_case() {
        let robots_content = "#!/usr/bin/env bash\n\
                              # Make sure you have `curl` installed\n\
                              \n\
                              ######## VARIABLES #########\n\
                              abc";
        let mut matcher = DefaultMatcher::default();
        assert!(matcher.one_agent_allowed_by_robots(
            robots_content,
            "bot",
            "https://www.test.com/"
        ));
    }
}