1#![deny(missing_docs)]
22#![deny(missing_debug_implementations)]
23#![deny(unsafe_code)]
24
25use std::{fmt, hash::Hash, ops::Deref, str::FromStr};
26
27use camino::Utf8Path;
28
29pub mod error;
30use crate::error::DirectiveParseError;
31use crate::error::DirectivePathParseError;
32use crate::error::RobotParseError;
33use crate::error::UserAgentParseError;
34
35#[doc(hidden)]
36mod readme {
37 #![doc = include_str!("../README.md")]
38}
39
/// The `tspecials` separator characters listed by RFC 1945, including space
/// and horizontal tab.
const TSPECIALS: &str = "()<>@,;:\\\"/[]?={} \t";
/// Punctuation accepted unescaped in a directive path.
///
/// NOTE(review): this looks like the RFC 1945 `safe` class with `+` and `~`
/// added — confirm against the grammar this crate targets.
const PSAFE: &str = "$-_.+~";
/// Further punctuation accepted unescaped in a directive path.
const PEXTRA: &str = "!*'(),";

/// Reports whether `c` is accepted inside a user-agent name or rule name.
///
/// Accepts every printable ASCII character (space through `~`) and every
/// character in [`TSPECIALS`]; since all of the latter except horizontal tab
/// are already printable ASCII, the second clause only adds the tab.
///
/// NOTE(review): RFC 1945 defines a token as *excluding* `tspecials`, so this
/// predicate is more permissive than its name suggests. Full user-agent header
/// strings (with spaces, slashes and parentheses) are parsed through it, so
/// confirm the intent before tightening it.
fn is_rfc1945_token(c: char) -> bool {
    let printable_ascii = (' '..='~').contains(&c);
    printable_ascii || TSPECIALS.contains(c)
}

/// Reports whether `c` may appear in a directive path: `/`, `%`, an ASCII
/// letter or digit, or one of the [`PSAFE`] / [`PEXTRA`] punctuation marks.
fn is_rfc1945_path(c: char) -> bool {
    match c {
        '/' | '%' => true,
        _ if c.is_ascii_alphanumeric() => true,
        _ => PSAFE.contains(c) || PEXTRA.contains(c),
    }
}
50
/// A robot's user-agent name, or the `*` wildcard.
///
/// The wildcard is stored as `None`; a specific name is stored as `Some`.
#[derive(Debug, Clone)]
pub struct UserAgent(Option<Box<str>>);

impl UserAgent {
    /// The wildcard user agent, written `*` in a robots.txt file.
    pub const ANY: UserAgent = UserAgent(None);

    /// Reports whether this is the wildcard rather than a named agent.
    fn is_wildcard(&self) -> bool {
        let UserAgent(name) = self;
        name.is_none()
    }
}
82
83impl PartialEq for UserAgent {
84 fn eq(&self, other: &Self) -> bool {
85 match (&self.0, &other.0) {
86 (Some(a), Some(b)) => a == b,
87 (None, _) => true,
88 (_, None) => true,
89 }
90 }
91}
92
93impl Eq for UserAgent {}
94
95impl Hash for UserAgent {
96 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
97 match &self.0 {
98 Some(agent) => agent.hash(state),
99 None => "*".hash(state),
100 }
101 }
102}
103
104impl FromStr for UserAgent {
105 type Err = UserAgentParseError;
106
107 fn from_str(s: &str) -> Result<Self, Self::Err> {
108 if s == "*" {
109 return Ok(UserAgent(None));
110 }
111
112 if s.is_empty() {
113 return Err(UserAgentParseError::EmptyUserAgent);
114 }
115
116 if !s.is_ascii() {
117 return Err(UserAgentParseError::InvalidUserAgentEncoding);
118 }
119
120 if !s.chars().all(is_rfc1945_token) {
121 return Err(UserAgentParseError::InvalidCharacters);
122 }
123
124 Ok(UserAgent(Some(s.into())))
125 }
126}
127
128impl fmt::Display for UserAgent {
129 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
130 match &self.0 {
131 Some(agent) => write!(f, "{}", agent),
132 None => write!(f, "*"),
133 }
134 }
135}
136
/// Internal representation behind `DirectivePath`.
#[derive(Debug, Clone, Hash)]
enum PathInner {
    /// No path (an empty directive value); matches nothing.
    None,
    /// The root path `/`; matches every path.
    Any,
    /// A specific path prefix. The parser stores it with a trailing `/`
    /// appended, and `Display` trims trailing slashes again.
    Path(Box<Utf8Path>),
    /// The `/robots.txt` file itself.
    Robots,
}
144
145#[derive(Debug, Clone)]
163pub struct DirectivePath(PathInner);
164
165impl DirectivePath {
166 pub const ANY: DirectivePath = DirectivePath(PathInner::Any);
168
169 pub const NONE: DirectivePath = DirectivePath(PathInner::None);
171
172 pub const ROBOTS: DirectivePath = DirectivePath(PathInner::Robots);
174
175 pub fn matches(&self, path: &str) -> bool {
177 match &self.0 {
178 PathInner::None => false,
179 PathInner::Any => true,
180 PathInner::Path(pattern) => {
181 let path = Utf8Path::new(path);
182 path.starts_with(pattern.deref())
183 }
184 PathInner::Robots => {
185 let path = Utf8Path::new(path);
186 path == Utf8Path::new("/robots.txt")
187 }
188 }
189 }
190
191 pub fn is_none(&self) -> bool {
193 matches!(self.0, PathInner::None)
194 }
195
196 pub fn is_any(&self) -> bool {
198 matches!(self.0, PathInner::Any)
199 }
200
201 pub fn is_robots(&self) -> bool {
203 matches!(self.0, PathInner::Robots)
204 }
205}
206
207impl fmt::Display for DirectivePath {
208 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
209 match &self.0 {
210 PathInner::None => write!(f, ""),
211 PathInner::Any => write!(f, "/"),
212 PathInner::Path(path) => write!(f, "{}", path.as_str().trim_end_matches('/')),
213 PathInner::Robots => write!(f, "/robots.txt"),
214 }
215 }
216}
217
218impl PartialEq for DirectivePath {
219 fn eq(&self, other: &Self) -> bool {
220 match (&self.0, &other.0) {
221 (PathInner::None, _) | (_, PathInner::None) => false,
222 (PathInner::Any, _) | (_, PathInner::Any) => true,
223 (PathInner::Path(a), PathInner::Path(b)) => a == b,
224 (PathInner::Robots, PathInner::Robots) => true,
225 _ => false,
226 }
227 }
228}
229
230impl FromStr for DirectivePath {
231 type Err = DirectivePathParseError;
232
233 fn from_str(s: &str) -> Result<Self, Self::Err> {
234 let path = s.trim();
235
236 if path == "/" {
237 return Ok(DirectivePath::ANY);
238 }
239
240 if path == "/robots.txt" || path == "robots.txt" {
241 return Ok(DirectivePath::ROBOTS);
242 }
243
244 if path.is_empty() {
245 return Ok(DirectivePath::NONE);
246 }
247
248 if !path.is_ascii() {
249 return Err(DirectivePathParseError::InvalidCharacters);
250 }
251
252 if !path.starts_with('/') {
253 return Err(DirectivePathParseError::InvalidCharacters);
254 }
255
256 if !path.chars().all(is_rfc1945_path) {
257 return Err(DirectivePathParseError::InvalidPathEncoding);
258 }
259
260 Ok(DirectivePath(PathInner::Path(
261 (path.to_string() + "/").as_str().into(),
262 )))
263 }
264}
265
/// The kind of rule a directive line expresses.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DirectiveType {
    /// An `Allow` rule.
    Allow,

    /// A `Disallow` rule.
    Disallow,

    /// Any other rule name, kept verbatim.
    Extension(Box<str>),
}

/// Renders the rule name as written before the colon of a directive line.
impl fmt::Display for DirectiveType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label: &str = match self {
            Self::Allow => "Allow",
            Self::Disallow => "Disallow",
            Self::Extension(name) => &**name,
        };
        f.write_str(label)
    }
}
292
293#[derive(Debug, Clone, PartialEq)]
295pub struct Directive {
296 path: DirectivePath,
297 rule: DirectiveType,
298}
299
300impl FromStr for Directive {
301 type Err = DirectiveParseError;
302
303 fn from_str(s: &str) -> Result<Self, Self::Err> {
304 let d = s.split('#').next().unwrap_or("").trim();
305
306 let mut parts = d.splitn(2, ':');
307 let rule = match parts.next() {
308 Some("Allow") => DirectiveType::Allow,
309 Some("Disallow") => DirectiveType::Disallow,
310 Some(extension) if extension.chars().all(is_rfc1945_token) => {
311 DirectiveType::Extension(extension.into())
312 }
313 _ => return Err(DirectiveParseError::InvalidRule),
314 };
315
316 let path: DirectivePath = match parts.next() {
317 Some(path) => path.parse()?,
318 None => DirectivePath::NONE,
319 };
320
321 Ok(Directive { path, rule })
322 }
323}
324
325impl fmt::Display for Directive {
326 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
327 if self.path.is_none() {
328 write!(f, "{}:", self.rule)
330 } else {
331 write!(f, "{}: {}", self.rule, self.path)
332 }
333 }
334}
335
336#[derive(Debug, Clone, PartialEq)]
343pub struct RobotAgent {
344 agents: Vec<UserAgent>,
345 directives: Vec<Directive>,
346}
347
348impl fmt::Display for RobotAgent {
349 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
350 for agent in &self.agents {
351 writeln!(f, "User-agent: {}", agent)?;
352 }
353 for directive in &self.directives {
354 writeln!(f, "{}", directive)?;
355 }
356 Ok(())
357 }
358}
359
360#[derive(Debug, Clone, Default, PartialEq)]
390pub struct Robots {
391 pub wildcard: Vec<Directive>,
396
397 pub agents: Vec<RobotAgent>,
399}
400
401impl Robots {
402 fn push(&mut self, mut agent: RobotAgent) {
403 if agent.agents.iter().any(|a| a.is_wildcard()) {
404 if self.wildcard.is_empty() {
405 self.wildcard.extend(agent.directives.iter().cloned());
406 }
407 agent.agents.retain(|a| !a.is_wildcard());
408
409 if !agent.agents.is_empty() {
410 self.agents.push(agent);
411 }
412 } else {
413 self.agents.push(agent);
414 }
415 }
416}
417
418impl FromStr for Robots {
419 type Err = RobotParseError;
420
421 fn from_str(s: &str) -> Result<Self, Self::Err> {
422 let mut robots = Robots::default();
423
424 let mut agents = Vec::new();
425 let mut directives = Vec::new();
426
427 for line in s.lines() {
428 let line = line.split('#').next().unwrap_or("").trim();
429
430 if line.is_empty() {
431 continue;
432 }
433
434 if line.to_ascii_lowercase().starts_with("user-agent") {
436 if !directives.is_empty() {
437 robots.push(RobotAgent {
438 agents: agents.clone(),
439 directives: directives.clone(),
440 });
441 agents.clear();
442 directives.clear();
443 }
444
445 let agent = line.split_once(':').map(|x| x.1).unwrap_or("").trim();
446 agents.push(
447 agent
448 .parse()
449 .map_err(|err| RobotParseError::InvalidUserAgent(err, agent.to_string()))?,
450 );
451 } else {
452 directives.push(
453 line.parse()
454 .map_err(|err| RobotParseError::InvalidDirective(err, line.to_string()))?,
455 );
456 }
457 }
458
459 if !(agents.is_empty() && directives.is_empty()) {
460 robots.push(RobotAgent {
461 agents: agents.clone(),
462 directives: directives.clone(),
463 });
464 }
465
466 Ok(robots)
467 }
468}
469
470impl fmt::Display for Robots {
471 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
472 if let Some((last, remainder)) = self.agents.split_last() {
473 for agent in remainder {
474 writeln!(f, "{}", agent)?;
475 }
476
477 write!(f, "{}", last)?;
478 };
479
480 if !self.wildcard.is_empty() {
481 if !self.agents.is_empty() {
482 writeln!(f)?;
483 }
484 writeln!(f, "User-agent: *")?;
485 for directive in &self.wildcard {
486 writeln!(f, "{}", directive)?;
487 }
488 }
489 Ok(())
490 }
491}
492
493impl Robots {
494 pub fn deny() -> Self {
496 Self {
497 wildcard: vec![Directive {
498 path: DirectivePath::ANY,
499 rule: DirectiveType::Disallow,
500 }],
501 agents: Vec::new(),
502 }
503 }
504
505 pub fn allow() -> Self {
507 Self {
508 wildcard: vec![Directive {
509 path: DirectivePath::ANY,
510 rule: DirectiveType::Allow,
511 }],
512 agents: Vec::new(),
513 }
514 }
515
516 pub fn is_allowed(&self, user_agent: &UserAgent, path: &str) -> bool {
518 if DirectivePath::ROBOTS.matches(path) {
520 return true;
521 }
522
523 for agent in &self.agents {
524 if agent.agents.iter().any(|a| a == user_agent) {
526 for directive in &agent.directives {
528 if directive.path.matches(path) {
529 match directive.rule {
530 DirectiveType::Allow => return true,
531 DirectiveType::Disallow => return false,
532 DirectiveType::Extension(_) => {}
533 }
534 }
535 }
536
537 return true;
539 }
540 }
541
542 for directive in &self.wildcard {
544 if directive.path.matches(path) {
545 match directive.rule {
546 DirectiveType::Allow => return true,
547 DirectiveType::Disallow => return false,
548 DirectiveType::Extension(_) => {}
549 }
550 }
551 }
552
553 true
555 }
556}
557
#[cfg(test)]
mod test {
    use super::*;

    use indoc::indoc;

    /// Parses a user-agent value, panicking when it is invalid.
    fn agent(name: &str) -> UserAgent {
        name.parse().unwrap()
    }

    /// Parses a single directive line, panicking when it is invalid.
    fn parse_directive(line: &str) -> Directive {
        line.parse().unwrap()
    }

    /// Asserts that parsing `text` and rendering the result gives back `text`.
    fn assert_round_trip(text: &str) {
        let parsed: Robots = text.parse().unwrap();
        assert_eq!(parsed.to_string(), text);
    }

    #[test]
    fn user_agent() {
        // A full header-style string parses, and any agent equals the wildcard.
        let full = agent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
        assert_eq!(full, UserAgent::ANY);
        assert_ne!(full, agent("Mozilla/5.0"));

        let excite = agent("excite");
        assert_ne!(&agent("googlebot"), &excite);
        let wildcard = agent("*");
        assert_eq!(&agent("googlebot"), &wildcard);
    }

    #[test]
    fn directive_path() {
        let prefix = "/foo/bar".parse::<DirectivePath>().unwrap();
        assert!(prefix.matches("/foo/bar/baz"));
        assert!(!prefix.matches("/foo"));

        let any = DirectivePath::ANY;
        assert!(any.matches("/foo/bar/baz"));
        assert!(any.matches("/foo"));

        let none = DirectivePath::NONE;
        assert!(!none.matches("/foo/bar/baz"));
        assert!(!none.matches("/foo"));
        assert!(!none.matches(""));
    }

    #[test]
    fn directive() {
        let allow = parse_directive("Allow: /foo/bar");
        assert_eq!(allow.rule, DirectiveType::Allow);
        assert!(matches!(allow.path, DirectivePath(PathInner::Path(_))));
        assert!(allow.path.matches("/foo/bar/baz"));
        assert!(!allow.path.matches("/foo"));

        let disallow = parse_directive("Disallow: /foo/bar");
        assert_eq!(disallow.rule, DirectiveType::Disallow);
        assert!(disallow.path.matches("/foo/bar/baz"));
        assert!(!disallow.path.matches("/foo"));

        let allow_again = parse_directive("Allow: /foo/bar");
        assert_eq!(allow_again.rule, DirectiveType::Allow);
        assert!(allow_again.path.matches("/foo/bar/baz"));
        assert!(!allow_again.path.matches("/foo"));

        // An empty value matches nothing.
        let allow_nothing = parse_directive("Allow:");
        assert_eq!(allow_nothing.rule, DirectiveType::Allow);
        assert!(!allow_nothing.path.matches("/foo/bar/baz"));
        assert!(!allow_nothing.path.matches("/foo"));

        // The root path matches everything.
        let allow_all = parse_directive("Allow: /");
        assert_eq!(allow_all.rule, DirectiveType::Allow);
        assert!(allow_all.path.matches("/foo/bar/baz"));
        assert!(allow_all.path.matches("/foo"));
    }

    #[test]
    fn robot_txt() {
        let rules = indoc! {
            r#"
            # /robots.txt for http://www.fict.org/
            # comments to webmaster@fict.org

            User-agent: unhipbot
            Disallow: /

            User-agent: webcrawler
            User-agent: excite
            Disallow:

            User-agent: *
            Disallow: /org/plans.html
            Allow: /org/
            Allow: /serv
            Allow: /~mak
            Disallow: /
            "#
        }
        .parse::<Robots>()
        .unwrap();

        assert!(!rules.is_allowed(&agent("unhipbot"), "/org/plans.html"));
        assert!(rules.is_allowed(&agent("unhipbot"), "/robots.txt"));

        assert!(rules.is_allowed(&agent("webcrawler"), "/org/plans.html"));
        assert!(DirectivePath::ANY.matches("/org/plans.html"));
        assert!(rules.is_allowed(&agent("excite"), "/org/plans.html"));

        // Agents without their own group fall back to the wildcard rules.
        assert!(rules.is_allowed(&agent("googlebot"), "/org/about.html"));
        assert!(!rules.is_allowed(&agent("googlebot"), "/org/plans.html"));
    }

    #[test]
    fn default_deny() {
        let rules = Robots::deny();
        let googlebot = agent("googlebot");
        assert!(!rules.is_allowed(&googlebot, "/"));
        assert!(!rules.is_allowed(&googlebot, "/foo"));
        assert!(!rules.is_allowed(&googlebot, "/foo/bar"));
        assert!(rules.is_allowed(&googlebot, "/robots.txt"));

        let expected = indoc! {
            r#"
            User-agent: *
            Disallow: /
            "#
        };

        assert_eq!(rules.to_string().trim(), expected.trim());
    }

    #[test]
    fn format_path() {
        assert_round_trip(indoc! {
            r#"User-agent: *
            Disallow: /foo/bar
            Allow: /hello
            "#
        });
    }

    #[test]
    fn format_blank_last() {
        assert_round_trip(indoc! {
            r#"User-agent: sus
            Allow: /boobytrap
            Disallow: /

            User-agent: cool
            Disallow: /secret
            Disallow:
            "#
        });
    }

    #[test]
    fn format_wildcard() {
        assert_round_trip(indoc! {
            r#"User-agent: sus
            Disallow: /

            User-agent: cool
            Allow:

            User-agent: *
            Disallow: /foo/bar
            Allow: /hello
            "#
        });
    }
}