pub mod matcher;
34pub mod parser;
35
36use crate::matcher::{CachingRobotsMatcher, LongestMatchRobotsMatchStrategy, RobotsMatcher};
37use crate::parser::RobotsTxtParser;
38
/// Robots matcher using the longest-match precedence strategy for
/// resolving competing `Allow:`/`Disallow:` rules.
pub type DefaultMatcher = RobotsMatcher<LongestMatchRobotsMatchStrategy>;
/// [`DefaultMatcher`] wrapped in the caching matcher from the [`matcher`]
/// module.
pub type DefaultCachingMatcher = CachingRobotsMatcher<LongestMatchRobotsMatchStrategy>;
41
/// Receiver of robots.txt parse events.
///
/// [`parse_robotstxt`] scans a robots.txt body and invokes one callback per
/// directive it encounters. All `line_num` arguments are 1-based (see the
/// line-counting tests in this file).
pub trait RobotsParseHandler {
    /// Invoked once before the first line is processed.
    fn handle_robots_start(&mut self);
    /// Invoked once after the last line has been processed.
    fn handle_robots_end(&mut self);
    /// Invoked for each `User-agent:` directive.
    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str);
    /// Invoked for each `Allow:` directive; `value` is the rule's path pattern.
    fn handle_allow(&mut self, line_num: u32, value: &str);
    /// Invoked for each `Disallow:` directive; `value` is the rule's path pattern.
    fn handle_disallow(&mut self, line_num: u32, value: &str);
    /// Invoked for each `Sitemap:` directive; `value` is the sitemap location.
    fn handle_sitemap(&mut self, line_num: u32, value: &str);
    /// Invoked for any line whose directive is not one of the above.
    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str);
}
53
/// Extracts the path (plus any params and query) component of a URL,
/// dropping the scheme, authority, and fragment.
///
/// The result always begins with `/`; `"/"` is returned when the URL has no
/// path. Everything from a `#` fragment marker onward is stripped, and a
/// fragment appearing before the path hides the path entirely.
pub fn get_path_params_query(url: &str) -> String {
    // Byte offset of the first character from `pattern` at or after `start`.
    fn find_first_of(s: &str, pattern: &str, start: usize) -> Option<usize> {
        s[start..].find(|c| pattern.contains(c)).map(|i| i + start)
    }
    // Byte offset of the first occurrence of `pattern` at or after `start`.
    fn find(s: &str, pattern: &str, start: usize) -> Option<usize> {
        s[start..].find(pattern).map(|i| i + start)
    }

    // Initial two slashes (scheme-relative URL) are skipped so the "://"
    // search below cannot match inside the authority.
    let search_start = if url.starts_with("//") { 2 } else { 0 };

    // A path character occurring before "://" means that "://" is not a real
    // scheme separator, in which case the path scan starts at `search_start`.
    let early_path = find_first_of(url, "/?;", search_start);
    let path_search_from = match find(url, "://", search_start) {
        Some(pos) if early_path.map_or(true, |p| p >= pos) => pos + 3,
        _ => search_start,
    };

    let path_start = match find_first_of(url, "/?;", path_search_from) {
        Some(pos) => pos,
        None => return String::from("/"),
    };

    // A fragment marker before the path hides the whole path.
    let hash_pos = find(url, "#", search_start);
    if hash_pos.map_or(false, |h| h < path_start) {
        return String::from("/");
    }

    // The fragment (if any) is excluded from the result.
    let path_end = hash_pos.unwrap_or(url.len());
    // `path_start` indexes one of the ASCII chars '/', '?', ';', so a byte
    // comparison is safe here.
    if url.as_bytes()[path_start] == b'/' {
        url[path_start..path_end].to_string()
    } else {
        // Path starts with '?' or ';': normalize by prefixing a slash.
        format!("/{}", &url[path_start..path_end])
    }
}
134
135pub fn parse_robotstxt(robots_body: &str, parse_callback: &mut impl RobotsParseHandler) {
141 let mut parser = RobotsTxtParser::new(robots_body, parse_callback);
142 parser.parse();
143}
144
#[cfg(test)]
mod tests {
    // Several handler callbacks deliberately ignore their arguments.
    #![allow(unused_variables)]

    use super::*;

    /// Collects statistics about one parse run: counts of recognized vs.
    /// unknown directives, the last line number reported, and the
    /// accumulated `Sitemap:` value.
    #[derive(Default)]
    struct RobotsStatsReporter {
        last_line_seen: u32,
        valid_directives: u32,
        unknown_directives: u32,
        sitemap: String,
    }

    impl RobotsStatsReporter {
        // Records one recognized directive; asserts that the parser reports
        // line numbers in non-decreasing order.
        fn digest(&mut self, line_num: u32) {
            assert!(line_num >= self.last_line_seen);
            self.last_line_seen = line_num;
            self.valid_directives += 1;
        }
    }

    impl RobotsParseHandler for RobotsStatsReporter {
        // All counters reset at parse start, so one reporter instance can be
        // reused across multiple parse_robotstxt calls.
        fn handle_robots_start(&mut self) {
            self.last_line_seen = 0;
            self.valid_directives = 0;
            self.unknown_directives = 0;
            self.sitemap.clear();
        }

        fn handle_robots_end(&mut self) {}

        fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
            self.digest(line_num);
        }

        fn handle_allow(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
        }

        fn handle_disallow(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
        }

        fn handle_sitemap(&mut self, line_num: u32, value: &str) {
            self.digest(line_num);
            self.sitemap.push_str(value);
        }

        // Unknown directives are tallied separately and do not count as
        // valid; they still advance the last line seen.
        fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
            self.last_line_seen = line_num;
            self.unknown_directives += 1;
        }
    }

    // Line numbering must be identical for LF, CR-only, and mixed CRLF line
    // endings, with or without a final newline: 4 directives on 6 lines.
    #[test]
    fn test_lines_numbers_are_counted_correctly() {
        let mut report = RobotsStatsReporter::default();
        let unix_file = "User-Agent: foo\n\
                         Allow: /some/path\n\
                         User-Agent: bar\n\
                         \n\
                         \n\
                         Disallow: /\n";
        super::parse_robotstxt(unix_file, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);

        // Classic Mac line endings (CR only).
        let mac_file = "User-Agent: foo\r\
                        Allow: /some/path\r\
                        User-Agent: bar\r\
                        \r\
                        \r\
                        Disallow: /\r";
        super::parse_robotstxt(mac_file, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);

        // Missing trailing newline must not drop the last directive.
        let no_final_new_line = "User-Agent: foo\n\
                                 Allow: /some/path\n\
                                 User-Agent: bar\n\
                                 \n\
                                 \n\
                                 Disallow: /";
        super::parse_robotstxt(no_final_new_line, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);

        // Mixed LF and CRLF endings within one file.
        let mixed_file = "User-Agent: foo\n\
                          Allow: /some/path\r\n\
                          User-Agent: bar\n\
                          \r\n\
                          \n\
                          Disallow: /";
        super::parse_robotstxt(mixed_file, &mut report);
        assert_eq!(4, report.valid_directives);
        assert_eq!(6, report.last_line_seen);
    }

    // NOTE(review): the "BOM" in these fixtures is written as the three code
    // points U+00EF U+00BB U+00BF (the UTF-8 BOM's byte values transliterated
    // to chars), not as the actual BOM character U+FEFF — presumably this is
    // the representation the parser strips; confirm against
    // parser::RobotsTxtParser.
    #[test]
    fn test_utf8_byte_order_mark_is_skipped() {
        let mut report = RobotsStatsReporter::default();
        // Full 3-"byte" BOM: skipped entirely, both directives parse.
        let utf8_file_full_bom = "\u{EF}\u{BB}\u{BF}\
                                  User-Agent: foo\n\
                                  Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_full_bom, &mut report);
        assert_eq!(2, report.valid_directives);
        assert_eq!(0, report.unknown_directives);

        // Truncated BOMs (2 or 1 leading "bytes") are also tolerated.
        let utf8_file_partial_2bom = "\u{EF}\u{BB}\
                                      User-Agent: foo\n\
                                      Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_partial_2bom, &mut report);
        assert_eq!(2, report.valid_directives);
        assert_eq!(0, report.unknown_directives);

        let utf8_file_partial_1bom = "\u{EF}\
                                      User-Agent: foo\n\
                                      Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_partial_1bom, &mut report);
        assert_eq!(2, report.valid_directives);
        assert_eq!(0, report.unknown_directives);

        // A broken BOM corrupts the first line, turning its directive into an
        // unknown one; the second line still parses.
        let utf8_file_broken_bom = "\u{EF}\u{11}\u{BF}\
                                    User-Agent: foo\n\
                                    Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_file_broken_bom, &mut report);
        assert_eq!(1, report.valid_directives);
        assert_eq!(1, report.unknown_directives);

        // A BOM is only skipped at the start of the file, not mid-file.
        let utf8_bom_somewhere_in_middle_of_file = "User-Agent: foo\n\
                                                    \u{EF}\u{BB}\u{BF}\
                                                    Allow: /AnyValue\n";
        super::parse_robotstxt(utf8_bom_somewhere_in_middle_of_file, &mut report);
        assert_eq!(1, report.valid_directives);
        assert_eq!(1, report.unknown_directives);
    }

    // `Sitemap:` is a non-group directive: it must be captured whether it
    // appears at the end or the start of the file.
    #[test]
    fn test_non_standard_line_example_sitemap() {
        let mut report = RobotsStatsReporter::default();

        // Sitemap line at the end of the file.
        {
            let sitemap_loc = "http://foo.bar/sitemap.xml";
            let mut robotstxt: String = "User-Agent: foo\n\
                                         Allow: /some/path\n\
                                         User-Agent: bar\n\
                                         \n\
                                         \n"
                .into();
            robotstxt.push_str(&format!("Sitemap: {}\n", sitemap_loc));

            super::parse_robotstxt(&robotstxt, &mut report);
            assert_eq!(sitemap_loc, report.sitemap.as_str());
        }

        // Sitemap line at the start of the file.
        {
            let mut robotstxt = String::new();
            let sitemap_loc = "http://foo.bar/sitemap.xml";
            let robotstxt_temp = "User-Agent: foo\n\
                                  Allow: /some/path\n\
                                  User-Agent: bar\n\
                                  \n\
                                  \n";
            robotstxt.push_str(&format!("Sitemap: {}\n{}", sitemap_loc, robotstxt_temp));

            super::parse_robotstxt(&robotstxt, &mut report);
            assert_eq!(sitemap_loc, report.sitemap.as_str());
        }
    }

    // A blank line inside the file (here between the rule group and the
    // Sitemap line) must not break matching: "bot" is not named by the `*`
    // group's disallow patterns for "/", so the URL stays allowed.
    #[test]
    fn test_blank_line_case() {
        let robots_content = r#"User-agent: *
Disallow: /*q=
Disallow: /users/*?
Disallow: /join/*?
Disallow: /morelikethis/
Disallow: /download/
Disallow: /checkout/
Disallow: /global/
Disallow: /api/
Disallow: /critiques/

Sitemap: http://sitemaps.test.net/sitemap-index.xml.gz"#;
        let mut matcher = DefaultMatcher::default();
        assert!(matcher.one_agent_allowed_by_robots(
            robots_content,
            "bot",
            "https://www.test.com/"
        ));
    }

    // Content that is not robots.txt at all (a shell script) yields no
    // disallow rules, so every URL is allowed.
    #[test]
    fn test_unknown_robotstxt_case() {
        let robots_content = "#!/usr/bin/env bash\n\
# Make sure you have `curl` installed\n\
\n\
######## VARIABLES #########\n\
abc";
        let mut matcher = DefaultMatcher::default();
        assert!(matcher.one_agent_allowed_by_robots(
            robots_content,
            "bot",
            "https://www.test.com/"
        ));
    }
}