1use crate::RobotsParseHandler;
18
/// Outcome of testing one rule against the path: the priority assigned to the
/// match and the robots.txt line number that produced it.
pub struct Match {
    priority: i32,
    line: u32,
}

impl Default for Match {
    /// The default value is the "nothing matched" state: priority
    /// `NO_MATCH_PRIORITY`, line `0`.
    fn default() -> Self {
        Self {
            priority: Self::NO_MATCH_PRIORITY,
            line: 0,
        }
    }
}

impl Match {
    /// Priority stored while no rule has matched.
    const NO_MATCH_PRIORITY: i32 = -1;

    /// Builds a match with the given priority and line number.
    pub fn new(priority: i32, line: u32) -> Match {
        Self { priority, line }
    }

    /// Overwrites both the priority and the line number.
    pub fn set(&mut self, priority: i32, line: u32) {
        *self = Self::new(priority, line);
    }

    /// Resets this value to the "nothing matched" state.
    pub fn clear(&mut self) {
        *self = Self::default();
    }

    /// Line number recorded for this match.
    pub fn line(&self) -> u32 {
        self.line
    }

    /// Priority recorded for this match.
    pub fn priority(&self) -> i32 {
        self.priority
    }

    /// Returns whichever of the two matches has the strictly higher priority;
    /// when the priorities are equal, `b` is returned.
    pub fn higher_priority_match<'a>(a: &'a Match, b: &'a Match) -> &'a Match {
        if b.priority() >= a.priority() {
            b
        } else {
            a
        }
    }
}
70
71#[derive(Default)]
72struct MatchHierarchy {
73 global: Match,
74 specific: Match,
75}
76
77impl MatchHierarchy {
78 pub fn clear(&mut self) {
79 self.global.clear();
80 self.specific.clear();
81 }
82}
83
/// Strategy that scores how well a robots.txt rule pattern matches a path.
///
/// Implementors supply the scoring for `Allow` and `Disallow` rules; the
/// provided [`RobotsMatchStrategy::matches`] implements the pattern matching
/// itself.
pub trait RobotsMatchStrategy: Default {
    /// Returns the priority of `pattern` as an allow rule for `path`.
    /// Callers in this file treat a negative value as "no match".
    fn match_allow(&self, path: &str, pattern: &str) -> i32;

    /// Returns the priority of `pattern` as a disallow rule for `path`.
    /// Callers in this file treat a negative value as "no match".
    fn match_disallow(&self, path: &str, pattern: &str) -> i32;

    /// Returns `true` if `pattern` matches a prefix of `path`.
    ///
    /// Pattern syntax:
    /// * `*` matches any sequence of bytes, including the empty one;
    /// * `$` as the **last** byte of the pattern anchors the match to the end
    ///   of `path`; anywhere else it is an ordinary literal;
    /// * every other byte must match literally.
    ///
    /// Matching is done on the UTF-8 bytes of both strings, so lengths and
    /// offsets always refer to the same unit.
    fn matches(path: &str, pattern: &str) -> bool {
        let path = path.as_bytes();
        let pattern = pattern.as_bytes();
        let pathlen = path.len();

        // `pos[..numpos]` is a sorted list of offsets into `path`: every
        // prefix length of `path` that the pattern consumed so far can match.
        // If the list becomes empty the pattern cannot match; if the pattern
        // is exhausted while the list is non-empty, it matches.
        let mut pos = vec![0usize; pathlen + 1];
        let mut numpos: usize = 1;

        for (index, &pat) in pattern.iter().enumerate() {
            if pat == b'$' && index + 1 == pattern.len() {
                // The list is sorted, so the last entry is the longest prefix.
                return pos[numpos - 1] == pathlen;
            }

            if pat == b'*' {
                // A wildcard extends the smallest candidate to every offset
                // from there up to and including the end of the path.
                numpos = pathlen - pos[0] + 1;
                for i in 1..numpos {
                    pos[i] = pos[i - 1] + 1;
                }
            } else {
                // Literal byte (this includes '$' when not at the end).
                // Surviving candidates are compacted to the front in place;
                // the write index never overtakes the read index, so no
                // unread candidate is overwritten.
                let mut new_numpos = 0;
                for i in 0..numpos {
                    if pos[i] < pathlen && path[pos[i]] == pat {
                        pos[new_numpos] = pos[i] + 1;
                        new_numpos += 1;
                    }
                }
                numpos = new_numpos;

                if numpos == 0 {
                    return false;
                }
            }
        }
        true
    }
}
183
184#[derive(Default)]
187pub struct LongestMatchRobotsMatchStrategy;
188
189impl RobotsMatchStrategy for LongestMatchRobotsMatchStrategy {
190 fn match_allow(&self, path: &str, pattern: &str) -> i32 {
191 if Self::matches(path, pattern) {
192 pattern.len() as i32
193 } else {
194 -1
195 }
196 }
197
198 fn match_disallow(&self, path: &str, pattern: &str) -> i32 {
199 if Self::matches(path, pattern) {
200 pattern.len() as i32
201 } else {
202 -1
203 }
204 }
205}
206
207#[derive(Default)]
218pub struct RobotsMatcher<S: RobotsMatchStrategy> {
219 allow: MatchHierarchy,
221 disallow: MatchHierarchy,
223 seen_global_agent: bool,
225 seen_specific_agent: bool,
227 ever_seen_specific_agent: bool,
229 seen_separator: bool,
231
232 path: String,
233 user_agents: Vec<String>,
234 match_strategy: S,
235}
236
/// One recorded parser callback together with the arguments it was invoked
/// with, stored as owned data so it can be replayed later.
enum ParseInvoke {
    /// A recorded `handle_user_agent` call.
    UserAgent {
        line_num: u32,
        user_agent: String,
    },
    /// A recorded `handle_allow` call.
    Allow {
        line_num: u32,
        value: String,
    },
    /// A recorded `handle_disallow` call.
    Disallow {
        line_num: u32,
        value: String,
    },
    /// A recorded `handle_sitemap` call.
    Sitemap {
        line_num: u32,
        value: String,
    },
    /// A recorded `handle_unknown_action` call.
    UnknownAction {
        line_num: u32,
        action: String,
        value: String,
    },
}
260
261struct CachingRobotsParseHandler<S: RobotsMatchStrategy> {
262 invokes: Vec<ParseInvoke>,
263 matcher: RobotsMatcher<S>,
264}
265
266impl<S: RobotsMatchStrategy> CachingRobotsParseHandler<S> {
267 pub fn new(matcher: RobotsMatcher<S>) -> Self {
268 Self {
269 invokes: vec![],
270 matcher,
271 }
272 }
273
274 fn replay(&mut self) {
275 self.matcher.handle_robots_start();
276 for invoke in &self.invokes {
277 match invoke {
278 ParseInvoke::UserAgent {
279 line_num,
280 user_agent,
281 } => self.matcher.handle_user_agent(*line_num, &user_agent),
282 ParseInvoke::Allow { line_num, value } => {
283 self.matcher.handle_allow(*line_num, &value)
284 }
285 ParseInvoke::Disallow { line_num, value } => {
286 self.matcher.handle_disallow(*line_num, &value)
287 }
288 ParseInvoke::Sitemap { line_num, value } => {
289 self.matcher.handle_sitemap(*line_num, &value)
290 }
291 ParseInvoke::UnknownAction {
292 line_num,
293 action,
294 value,
295 } => self
296 .matcher
297 .handle_unknown_action(*line_num, &action, &value),
298 }
299 }
300 self.matcher.handle_robots_end();
301 }
302
303 pub fn allowed_by_robots(&mut self, user_agents: Vec<&str>, url: &str) -> bool {
304 let path = super::get_path_params_query(&url);
305 self.matcher.init_user_agents_and_path(user_agents, &path);
306 self.replay();
307 !self.matcher.disallow()
308 }
309}
310
311impl<S: RobotsMatchStrategy> RobotsParseHandler for CachingRobotsParseHandler<S> {
312 fn handle_robots_start(&mut self) {}
313
314 fn handle_robots_end(&mut self) {}
315
316 fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
317 self.invokes.push(ParseInvoke::UserAgent {
318 line_num,
319 user_agent: String::from(user_agent),
320 })
321 }
322
323 fn handle_allow(&mut self, line_num: u32, value: &str) {
324 self.invokes.push(ParseInvoke::Allow {
325 line_num,
326 value: String::from(value),
327 })
328 }
329
330 fn handle_disallow(&mut self, line_num: u32, value: &str) {
331 self.invokes.push(ParseInvoke::Disallow {
332 line_num,
333 value: String::from(value),
334 })
335 }
336
337 fn handle_sitemap(&mut self, line_num: u32, value: &str) {
338 self.invokes.push(ParseInvoke::Sitemap {
339 line_num,
340 value: String::from(value),
341 })
342 }
343
344 fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
345 self.invokes.push(ParseInvoke::UnknownAction {
346 line_num,
347 action: String::from(action),
348 value: String::from(value),
349 })
350 }
351}
352
353pub struct CachingRobotsMatcher<S: RobotsMatchStrategy> {
354 parse_handler: CachingRobotsParseHandler<S>,
355}
356
357impl<S: RobotsMatchStrategy> CachingRobotsMatcher<S> {
358 pub fn new(matcher: RobotsMatcher<S>) -> Self {
359 Self {
360 parse_handler: CachingRobotsParseHandler::new(matcher),
361 }
362 }
363
364 pub fn parse(&mut self, robots_body: &str) {
365 super::parse_robotstxt(robots_body, &mut self.parse_handler);
366 }
367
368 pub fn allowed_by_robots(&mut self, user_agents: Vec<&str>, url: &str) -> bool {
369 self.parse_handler.allowed_by_robots(user_agents, url)
370 }
371
372 pub fn one_agent_allowed_by_robots(&mut self, user_agent: &str, url: &str) -> bool {
373 self.parse_handler.allowed_by_robots(vec![user_agent], url)
374 }
375}
376
377impl<'a, S: RobotsMatchStrategy> RobotsMatcher<S> {
378 fn init_user_agents_and_path(&mut self, user_agents: Vec<&str>, path: &str) {
381 self.path = String::from(path);
382 self.user_agents = user_agents.into_iter().map(String::from).collect();
383 }
384
385 pub fn allowed_by_robots(
388 &mut self,
389 robots_body: &str,
390 user_agents: Vec<&str>,
391 url: &str,
392 ) -> bool
393 where
394 Self: RobotsParseHandler,
395 {
396 let path = super::get_path_params_query(url);
399 self.init_user_agents_and_path(user_agents, &path);
400 super::parse_robotstxt(&robots_body, self);
401 !self.disallow()
402 }
403
404 pub fn one_agent_allowed_by_robots(
407 &mut self,
408 robots_txt: &str,
409 user_agent: &str,
410 url: &str,
411 ) -> bool
412 where
413 Self: RobotsParseHandler,
414 {
415 self.allowed_by_robots(robots_txt, vec![user_agent], url)
416 }
417
418 fn disallow(&self) -> bool {
420 if self.allow.specific.priority() > 0 || self.disallow.specific.priority() > 0 {
421 return self.disallow.specific.priority() > self.allow.specific.priority();
422 }
423
424 if self.ever_seen_specific_agent {
425 return false;
428 }
429
430 if self.disallow.global.priority() > 0 || self.allow.global.priority() > 0 {
431 return self.disallow.global.priority() > self.allow.global.priority();
432 }
433
434 false
435 }
436
437 fn seen_any_agent(&self) -> bool {
439 self.seen_global_agent || self.seen_specific_agent
440 }
441
442 fn extract_user_agent(user_agent: &str) -> &str {
446 if let Some(end) =
448 user_agent.find(|c: char| !(c.is_ascii_alphabetic() || c == '-' || c == '_'))
449 {
450 &user_agent[..end]
451 } else {
452 user_agent
453 }
454 }
455
456 pub fn is_valid_user_agent_to_obey(user_agent: &str) -> bool {
460 !user_agent.is_empty() && Self::extract_user_agent(user_agent) == user_agent
461 }
462
463 pub fn matching_line(&self) -> u32 {
465 if self.ever_seen_specific_agent {
466 return Match::higher_priority_match(&self.disallow.specific, &self.allow.specific)
467 .line();
468 }
469 Match::higher_priority_match(&self.disallow.global, &self.allow.global).line()
470 }
471}
472
473impl<S: RobotsMatchStrategy> RobotsParseHandler for RobotsMatcher<S> {
474 fn handle_robots_start(&mut self) {
475 self.allow.clear();
480 self.disallow.clear();
481
482 self.seen_global_agent = false;
483 self.seen_specific_agent = false;
484 self.ever_seen_specific_agent = false;
485 self.seen_separator = false;
486 }
487
488 fn handle_robots_end(&mut self) {}
489
490 fn handle_user_agent(&mut self, _line_num: u32, user_agent: &str) {
491 if self.seen_separator {
492 self.seen_specific_agent = false;
493 self.seen_global_agent = false;
494 self.seen_separator = false;
495 }
496
497 if !user_agent.is_empty()
500 && user_agent.starts_with('*')
501 && (user_agent.len() == 1 || user_agent[1..].starts_with(char::is_whitespace))
502 {
503 self.seen_global_agent = true;
504 } else {
505 let user_agent = Self::extract_user_agent(user_agent);
506 for agent in &self.user_agents {
507 if user_agent.eq_ignore_ascii_case(&agent) {
508 self.ever_seen_specific_agent = true;
509 self.seen_specific_agent = true;
510 break;
511 }
512 }
513 }
514 }
515
516 fn handle_allow(&mut self, line_num: u32, value: &str) {
517 if !self.seen_any_agent() {
518 return;
519 }
520
521 self.seen_separator = true;
522 let priority = self.match_strategy.match_disallow(&self.path, value);
523 if priority >= 0 {
524 if self.seen_specific_agent {
525 if self.allow.specific.priority() < priority {
526 self.allow.specific.set(priority, line_num);
527 }
528 } else if self.allow.global.priority() < priority {
529 self.allow.global.set(priority, line_num);
530 }
531 } else {
532 let slash_pos = value.rfind('/');
534
535 if let Some(slash_pos) = slash_pos {
536 if value[slash_pos..].starts_with("/index.htm") {
537 let new_pattern = format!("{}{}", &value[..(slash_pos + 1)], "$");
538 self.handle_allow(line_num, &new_pattern);
539 }
540 }
541 }
542 }
543
544 fn handle_disallow(&mut self, line_num: u32, value: &str) {
545 if !self.seen_any_agent() {
546 return;
547 }
548
549 self.seen_separator = true;
550 let priority = self.match_strategy.match_disallow(&self.path, value);
551 if priority >= 0 {
552 if self.seen_specific_agent {
553 if self.disallow.specific.priority() < priority {
554 self.disallow.specific.set(priority, line_num);
555 }
556 } else if self.disallow.global.priority() < priority {
557 self.disallow.global.set(priority, line_num);
558 }
559 }
560 }
561
562 fn handle_sitemap(&mut self, _line_num: u32, _value: &str) {
563 self.seen_separator = true;
564 }
565
566 fn handle_unknown_action(&mut self, _line_num: u32, _action: &str, _value: &str) {
567 self.seen_separator = true;
568 }
569}
570
#[cfg(test)]
mod test {
    use crate::matcher::*;

    /// `extract_user_agent` keeps the leading run of ASCII letters, `-` and
    /// `_`, and stops at the first other character.
    #[test]
    fn test_extract_user_agent() {
        type Target = RobotsMatcher<LongestMatchRobotsMatchStrategy>;

        // (input, expected prefix)
        let cases = [
            ("Googlebot/2.1", "Googlebot"),
            ("Googlebot", "Googlebot"),
            ("Googlebot-", "Googlebot-"),
            ("Googlebot_", "Googlebot_"),
            ("Googlebot_2.1", "Googlebot_"),
            ("1Googlebot_2.1", ""),
            ("Goo1glebot_2.1", "Goo"),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(*expected, Target::extract_user_agent(input));
        }
    }
}