#![cfg_attr(docsrs, feature(doc_cfg))]
use std::collections::HashMap;

use memchr::{memchr, memmem};
use thiserror::Error;

/// Default input-size cap (512 KiB) used by the size-checked parse entry
/// points; see [`ParseOptions::max_bytes`].
pub const DEFAULT_MAX_BYTES: usize = 512 * 1024;

/// Hard failures returned by the fallible parse entry points.
///
/// Malformed robots.txt *content* never produces an error (it only yields
/// warnings via the diagnostic entry points); these variants cover input
/// that cannot be parsed at all.
#[derive(Debug, Error)]
pub enum ParseError {
    /// The input bytes were not valid UTF-8 (only the `parse_bytes*`
    /// entry points can produce this).
    #[error("robots.txt is not valid UTF-8")]
    Utf8(#[from] std::str::Utf8Error),

    /// The input exceeded the configured [`ParseOptions::max_bytes`] limit.
    #[error("robots.txt is too large: {len} bytes exceeds limit of {max} bytes")]
    TooLarge {
        /// Actual input length in bytes.
        len: usize,
        /// The configured limit that was exceeded.
        max: usize,
    },
}
124
/// Options for the size-checked parse entry points.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseOptions {
    /// Maximum accepted input size in bytes; `None` disables the check.
    /// Defaults to [`DEFAULT_MAX_BYTES`].
    pub max_bytes: Option<usize>,
}
150
151impl Default for ParseOptions {
152 fn default() -> Self {
153 Self {
154 max_bytes: Some(DEFAULT_MAX_BYTES),
155 }
156 }
157}
158
/// Result of a diagnostic parse: the parsed document plus any soft issues
/// encountered along the way.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseReport<'a> {
    /// The parsed document (always produced, even with warnings).
    pub robots: RobotsTxt<'a>,
    /// Soft parse issues, in input order; empty for clean input.
    pub warnings: Vec<ParseWarning<'a>>,
}
170
/// One soft parse issue, tied to the input line that caused it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseWarning<'a> {
    /// 1-based line number in the input.
    pub line: usize,
    /// What went wrong on that line.
    pub kind: ParseWarningKind<'a>,
}
179
/// The kinds of soft issues the diagnostic parser reports.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseWarningKind<'a> {
    /// A non-empty line contained no `:` separator.
    MissingSeparator {
        /// The offending line, comment-stripped and trimmed.
        line: &'a str,
    },
    /// A line had a `:` but nothing before it.
    EmptyDirectiveKey,
    /// A `User-agent:` line with an empty value; the line is skipped.
    EmptyUserAgent,
    /// An `Allow`/`Disallow` line appeared before any `User-agent` line
    /// and was therefore ignored.
    RuleBeforeUserAgent {
        /// The directive key as written (e.g. `"Disallow"`).
        key: &'a str,
    },
}
198
/// A parsed robots.txt document, borrowing all text from the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RobotsTxt<'a> {
    /// Rule groups in document order.
    pub groups: Vec<Group<'a>>,
    /// Non-rule directives (sitemaps, crawl delays, hosts, ...) collected
    /// when the `extensions` feature is enabled.
    #[cfg(feature = "extensions")]
    #[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
    pub extensions: Extensions<'a>,
}
225
/// A group of rules that applies to one or more user agents.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Group<'a> {
    /// User-agent names this group applies to (`*` means any agent).
    pub agents: Vec<&'a str>,
    /// Allow/Disallow rules in document order.
    pub rules: Vec<Rule<'a>>,
}
238
/// A single Allow or Disallow line.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Rule<'a> {
    /// Whether this rule allows or disallows matching paths.
    pub kind: RuleKind,
    /// The raw path pattern, possibly containing `*` wildcards and an
    /// optional trailing `$` end anchor.
    pub pattern: &'a str,
}
249
/// Whether a matching [`Rule`] permits or forbids a path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RuleKind {
    /// Matching paths may be fetched.
    Allow,
    /// Matching paths must not be fetched.
    Disallow,
}
258
/// Precompiled lookup structure built by [`RobotsTxt::matcher`] for
/// repeated `is_allowed` queries.
#[derive(Debug, Clone)]
pub struct RobotsMatcher<'a> {
    /// Lowercased agent name -> indexes of the groups naming that agent.
    agent_groups: HashMap<String, Vec<usize>>,
    /// Indexes of groups whose agent list contains `*`.
    fallback_groups: Vec<usize>,
    /// Per-group pre-compiled rules, parallel to the source groups.
    compiled_rules: Vec<Vec<CompiledRule<'a>>>,
}
271
/// A rule pre-processed for matching: the `$` anchor is stripped and the
/// wildcard presence plus specificity are cached.
#[derive(Debug, Clone, Copy)]
struct CompiledRule<'a> {
    kind: RuleKind,
    /// Pattern with any trailing `$` removed.
    pattern: &'a str,
    /// True when the original pattern ended with `$`.
    anchored: bool,
    /// True when `pattern` still contains a `*`.
    has_wildcard: bool,
    /// Length of the stripped pattern; longer patterns win.
    specificity: usize,
}
280
/// Non-rule directives collected from the whole document (only with the
/// `extensions` feature).
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Extensions<'a> {
    /// Values of non-empty `Sitemap:` lines, in document order.
    pub sitemaps: Vec<&'a str>,
    /// Every `Crawl-delay:` line, with the agents of its enclosing group.
    pub crawl_delays: Vec<CrawlDelay<'a>>,
    /// Values of non-empty `Host:` lines.
    pub hosts: Vec<&'a str>,
    /// Values of non-empty `Clean-param:` lines.
    pub clean_params: Vec<CleanParam<'a>>,
    /// All other unrecognized directives as raw key/value pairs.
    pub other: Vec<Directive<'a>>,
}
317
/// A `Crawl-delay:` directive and the agents it applied to.
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CrawlDelay<'a> {
    /// Agents of the group the directive appeared in; empty when it
    /// appeared before any `User-agent` line.
    pub agents: Vec<&'a str>,
    /// Raw directive value; not parsed into a number here.
    pub value: &'a str,
}
330
/// A `Clean-param:` directive (Yandex extension), kept as raw text.
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CleanParam<'a> {
    /// Raw directive value (e.g. `"ref /shop"`).
    pub value: &'a str,
}
339
/// Any directive not otherwise recognized, preserved verbatim.
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Directive<'a> {
    /// The key as written in the input (trimmed).
    pub key: &'a str,
    /// The value as written in the input (trimmed).
    pub value: &'a str,
}
350
impl<'a> RobotsTxt<'a> {
    /// Parses `input` without any size check; never fails.
    ///
    /// Malformed lines are silently skipped; use
    /// [`Self::parse_with_diagnostics`] to observe them as warnings.
    pub fn parse(input: &'a str) -> Self {
        parse_inner(input, false).robots
    }

    /// Parses raw bytes with the default size limit.
    ///
    /// # Errors
    ///
    /// [`ParseError::TooLarge`] if `input` exceeds [`DEFAULT_MAX_BYTES`],
    /// [`ParseError::Utf8`] if it is not valid UTF-8.
    pub fn parse_bytes(input: &'a [u8]) -> Result<Self, ParseError> {
        Self::parse_bytes_with_options(input, ParseOptions::default())
    }

    /// Parses raw bytes with caller-provided [`ParseOptions`].
    ///
    /// # Errors
    ///
    /// [`ParseError::TooLarge`] or [`ParseError::Utf8`], as for
    /// [`Self::parse_bytes`]. The size check runs before UTF-8 validation.
    pub fn parse_bytes_with_options(
        input: &'a [u8],
        options: ParseOptions,
    ) -> Result<Self, ParseError> {
        check_size(input.len(), options)?;
        let input = std::str::from_utf8(input)?;
        Ok(Self::parse(input))
    }

    /// Parses a string with caller-provided [`ParseOptions`].
    ///
    /// # Errors
    ///
    /// [`ParseError::TooLarge`] if `input` exceeds `options.max_bytes`.
    pub fn parse_with_options(input: &'a str, options: ParseOptions) -> Result<Self, ParseError> {
        check_size(input.len(), options)?;
        Ok(Self::parse(input))
    }

    /// Parses `input` and records soft issues as [`ParseWarning`]s;
    /// never fails.
    pub fn parse_with_diagnostics(input: &'a str) -> ParseReport<'a> {
        parse_inner(input, true)
    }

    /// Diagnostic parse with a size check.
    ///
    /// # Errors
    ///
    /// [`ParseError::TooLarge`] if `input` exceeds `options.max_bytes`.
    pub fn parse_with_diagnostics_options(
        input: &'a str,
        options: ParseOptions,
    ) -> Result<ParseReport<'a>, ParseError> {
        check_size(input.len(), options)?;
        Ok(parse_inner(input, true))
    }

    /// Diagnostic parse of raw bytes with the default size limit.
    ///
    /// # Errors
    ///
    /// [`ParseError::TooLarge`] or [`ParseError::Utf8`].
    pub fn parse_bytes_with_diagnostics(input: &'a [u8]) -> Result<ParseReport<'a>, ParseError> {
        Self::parse_bytes_with_diagnostics_options(input, ParseOptions::default())
    }

    /// Diagnostic parse of raw bytes with caller-provided options.
    ///
    /// # Errors
    ///
    /// [`ParseError::TooLarge`] or [`ParseError::Utf8`]. The size check
    /// runs before UTF-8 validation.
    pub fn parse_bytes_with_diagnostics_options(
        input: &'a [u8],
        options: ParseOptions,
    ) -> Result<ParseReport<'a>, ParseError> {
        check_size(input.len(), options)?;
        let input = std::str::from_utf8(input)?;
        Ok(parse_inner(input, true))
    }

    /// Builds a [`RobotsMatcher`] that pre-indexes agents and pre-compiles
    /// rules, amortizing work across many `is_allowed` queries.
    pub fn matcher(&'a self) -> RobotsMatcher<'a> {
        RobotsMatcher::new(self)
    }

    /// Decides whether `user_agent` may fetch `path`.
    ///
    /// `/robots.txt` itself is always allowed. Groups whose agent list
    /// contains a case-insensitive match for `user_agent` (excluding `*`)
    /// are consulted first; only when none match do the `*` groups apply.
    /// Among matching rules the longest pattern wins; on a specificity tie
    /// `Allow` beats `Disallow`; with no matching rule the path is allowed.
    pub fn is_allowed(&self, user_agent: &str, path: &str) -> bool {
        if path == "/robots.txt" {
            return true;
        }

        let mut exact_match = false;
        let mut best: Option<(usize, RuleKind)> = None;

        // First pass: every group that names this agent explicitly.
        for group in &self.groups {
            if group
                .agents
                .iter()
                .any(|agent| *agent != "*" && agent.eq_ignore_ascii_case(user_agent))
            {
                exact_match = true;
                apply_group_rules(group, path, &mut best);
            }
        }

        // Fallback: `*` groups apply only when no explicit group matched.
        if !exact_match {
            for group in &self.groups {
                if group.agents.contains(&"*") {
                    apply_group_rules(group, path, &mut best);
                }
            }
        }

        rule_decision(best)
    }
}
622
impl<'a> RobotsMatcher<'a> {
    /// Indexes `robots` for repeated queries: lowercased agent names map to
    /// the groups that name them, `*` groups form the fallback set, and
    /// every group's rules are compiled once up front.
    fn new(robots: &'a RobotsTxt<'a>) -> Self {
        let groups = robots.groups.as_slice();
        let mut agent_groups: HashMap<String, Vec<usize>> = HashMap::new();
        let mut fallback_groups = Vec::new();
        let mut compiled_rules = Vec::with_capacity(groups.len());

        for (group_index, group) in groups.iter().enumerate() {
            for agent in &group.agents {
                if *agent == "*" {
                    fallback_groups.push(group_index);
                } else {
                    let indexes = agent_groups.entry(agent.to_ascii_lowercase()).or_default();
                    // A group may list the same agent twice; index it once.
                    if !indexes.contains(&group_index) {
                        indexes.push(group_index);
                    }
                }
            }

            compiled_rules.push(group.rules.iter().filter_map(CompiledRule::new).collect());
        }

        Self {
            agent_groups,
            fallback_groups,
            compiled_rules,
        }
    }

    /// Same decision procedure as [`RobotsTxt::is_allowed`], served from the
    /// precomputed index: `/robots.txt` is always allowed; groups naming the
    /// agent (case-insensitively) are used when any exist, otherwise the `*`
    /// fallback groups; longest matching rule wins, `Allow` wins ties, and
    /// no match means allowed.
    pub fn is_allowed(&self, user_agent: &str, path: &str) -> bool {
        if path == "/robots.txt" {
            return true;
        }

        let mut best: Option<(usize, RuleKind)> = None;
        let agent = user_agent.to_ascii_lowercase();

        if let Some(group_indexes) = self.agent_groups.get(&agent) {
            self.apply_group_indexes(group_indexes, path, &mut best);
        } else {
            self.apply_group_indexes(&self.fallback_groups, path, &mut best);
        }

        rule_decision(best)
    }

    /// Folds the rules of each listed group into the running `best` match.
    fn apply_group_indexes(
        &self,
        group_indexes: &[usize],
        path: &str,
        best: &mut Option<(usize, RuleKind)>,
    ) {
        for &group_index in group_indexes {
            apply_compiled_rules(&self.compiled_rules[group_index], path, best);
        }
    }
}
685
686impl<'a> CompiledRule<'a> {
687 fn new(rule: &Rule<'a>) -> Option<Self> {
688 if rule.pattern.is_empty() {
689 return None;
690 }
691
692 let (pattern, anchored) = strip_end_anchor(rule.pattern);
693
694 Some(Self {
695 kind: rule.kind,
696 pattern,
697 anchored,
698 has_wildcard: pattern.as_bytes().contains(&b'*'),
699 specificity: pattern.len(),
700 })
701 }
702
703 fn matching_specificity(self, path: &str) -> Option<usize> {
704 let matched = if self.has_wildcard {
705 glob_matches(self.pattern.as_bytes(), path.as_bytes(), self.anchored)
706 } else if self.anchored {
707 path == self.pattern
708 } else {
709 path.starts_with(self.pattern)
710 };
711
712 matched.then_some(self.specificity)
713 }
714}
715
716fn check_size(len: usize, options: ParseOptions) -> Result<(), ParseError> {
718 if let Some(max) = options.max_bytes {
719 if len > max {
720 return Err(ParseError::TooLarge { len, max });
721 }
722 }
723
724 Ok(())
725}
726
/// Internal classification of a directive line's key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DirectiveKind {
    UserAgent,
    Allow,
    Disallow,
    #[cfg(feature = "extensions")]
    Sitemap,
    #[cfg(feature = "extensions")]
    CrawlDelay,
    #[cfg(feature = "extensions")]
    Host,
    #[cfg(feature = "extensions")]
    CleanParam,
    /// Any key not recognized above.
    Other,
}
742
/// Maps a directive key to its [`DirectiveKind`].
///
/// The two common spellings of each key (capitalized and all-lowercase) are
/// matched byte-for-byte on the hot path; any other casing falls through to
/// the out-of-line case-insensitive comparison.
fn classify_directive_key(key: &str) -> DirectiveKind {
    match key.as_bytes() {
        b"Allow" | b"allow" => return DirectiveKind::Allow,
        b"Disallow" | b"disallow" => return DirectiveKind::Disallow,
        b"User-agent" | b"user-agent" => return DirectiveKind::UserAgent,
        #[cfg(feature = "extensions")]
        b"Host" | b"host" => return DirectiveKind::Host,
        #[cfg(feature = "extensions")]
        b"Sitemap" | b"sitemap" => return DirectiveKind::Sitemap,
        #[cfg(feature = "extensions")]
        b"Crawl-delay" | b"crawl-delay" => return DirectiveKind::CrawlDelay,
        #[cfg(feature = "extensions")]
        b"Clean-param" | b"clean-param" => return DirectiveKind::CleanParam,
        _ => {}
    }

    classify_directive_key_ignore_case(key)
}
761
/// Slow path for mixed-case keys: a cheap length check gates each
/// case-insensitive comparison. `#[cold]`/`#[inline(never)]` keep this out
/// of the hot classification path.
#[cold]
#[inline(never)]
fn classify_directive_key_ignore_case(key: &str) -> DirectiveKind {
    match key.len() {
        5 if key.eq_ignore_ascii_case("allow") => DirectiveKind::Allow,
        8 if key.eq_ignore_ascii_case("disallow") => DirectiveKind::Disallow,
        10 if key.eq_ignore_ascii_case("user-agent") => DirectiveKind::UserAgent,
        #[cfg(feature = "extensions")]
        4 if key.eq_ignore_ascii_case("host") => DirectiveKind::Host,
        #[cfg(feature = "extensions")]
        7 if key.eq_ignore_ascii_case("sitemap") => DirectiveKind::Sitemap,
        #[cfg(feature = "extensions")]
        11 if key.eq_ignore_ascii_case("crawl-delay") => DirectiveKind::CrawlDelay,
        #[cfg(feature = "extensions")]
        11 if key.eq_ignore_ascii_case("clean-param") => DirectiveKind::CleanParam,
        _ => DirectiveKind::Other,
    }
}
780
781fn new_group<'a>(agent: &'a str) -> Group<'a> {
782 Group {
783 agents: vec![agent],
784 rules: Vec::with_capacity(4),
785 }
786}
787
/// Single-pass parser shared by every public entry point.
///
/// Builds groups line by line. `diagnostics` controls whether soft issues
/// (missing `:`, empty keys, rules before any `User-agent`, ...) are
/// recorded as warnings or silently skipped; the parsed [`RobotsTxt`] is
/// identical either way.
fn parse_inner<'a>(input: &'a str, diagnostics: bool) -> ParseReport<'a> {
    let mut groups = vec![];
    // Group currently being built; flushed into `groups` when a new group
    // starts and again at end of input.
    let mut current: Option<Group<'a>> = None;
    // Once a group has rules, a later `User-agent` line starts a NEW group
    // instead of extending the current group's agent list.
    let mut current_has_rules = false;
    let mut warnings = vec![];

    #[cfg(feature = "extensions")]
    let mut extensions = Extensions::default();

    for (line_number, line) in Lines::new(input) {
        // `#` starts a comment; surrounding spaces/tabs are insignificant.
        let line = trim_ascii(strip_comment(line));
        if line.is_empty() {
            continue;
        }

        let Some((key, value)) = split_directive(line) else {
            if diagnostics {
                warnings.push(ParseWarning {
                    line: line_number,
                    kind: ParseWarningKind::MissingSeparator { line },
                });
            }
            continue;
        };

        let key = trim_ascii(key);
        let value = trim_ascii(value);
        if key.is_empty() {
            if diagnostics {
                warnings.push(ParseWarning {
                    line: line_number,
                    kind: ParseWarningKind::EmptyDirectiveKey,
                });
            }
            continue;
        }

        let directive = classify_directive_key(key);

        match directive {
            DirectiveKind::UserAgent => {
                if value.is_empty() {
                    if diagnostics {
                        warnings.push(ParseWarning {
                            line: line_number,
                            kind: ParseWarningKind::EmptyUserAgent,
                        });
                    }
                    continue;
                };

                match current.as_mut() {
                    // Consecutive `User-agent` lines share one group.
                    Some(group) if !current_has_rules => group.agents.push(value),
                    // Rules were seen: flush the group and start a new one.
                    Some(_) => {
                        groups.push(current.take().expect("current group exists"));
                        current = Some(new_group(value));
                        current_has_rules = false;
                    }
                    None => {
                        current = Some(new_group(value));
                    }
                }
            }
            DirectiveKind::Allow | DirectiveKind::Disallow => {
                // Rules before the first `User-agent` line are ignored.
                let Some(group) = current.as_mut() else {
                    if diagnostics {
                        warnings.push(ParseWarning {
                            line: line_number,
                            kind: ParseWarningKind::RuleBeforeUserAgent { key },
                        });
                    }
                    continue;
                };

                let kind = match directive {
                    DirectiveKind::Allow => RuleKind::Allow,
                    DirectiveKind::Disallow => RuleKind::Disallow,
                    _ => unreachable!("only allow/disallow directives reach this branch"),
                };

                group.rules.push(Rule {
                    kind,
                    pattern: value,
                });
                current_has_rules = true;
            }
            _ => {
                #[cfg(feature = "extensions")]
                collect_extension(&mut extensions, current.as_ref(), directive, key, value);
            }
        }
    }

    // Flush the trailing group, if any.
    if let Some(group) = current {
        groups.push(group);
    }

    ParseReport {
        robots: RobotsTxt {
            groups,
            #[cfg(feature = "extensions")]
            extensions,
        },
        warnings,
    }
}
899
900fn apply_group_rules(group: &Group<'_>, path: &str, best: &mut Option<(usize, RuleKind)>) {
906 for rule in &group.rules {
907 let Some(specificity) = matching_specificity(rule.pattern, path) else {
908 continue;
909 };
910
911 apply_rule_decision(specificity, rule.kind, best);
912 }
913}
914
915fn apply_compiled_rules(
916 rules: &[CompiledRule<'_>],
917 path: &str,
918 best: &mut Option<(usize, RuleKind)>,
919) {
920 for rule in rules {
921 let Some(specificity) = rule.matching_specificity(path) else {
922 continue;
923 };
924
925 apply_rule_decision(specificity, rule.kind, best);
926 }
927}
928
929fn apply_rule_decision(specificity: usize, kind: RuleKind, best: &mut Option<(usize, RuleKind)>) {
930 let should_replace = !matches!(
931 *best,
932 Some((best_specificity, best_kind))
933 if specificity < best_specificity
934 || (specificity == best_specificity
935 && !(kind == RuleKind::Allow && best_kind == RuleKind::Disallow))
936 );
937
938 if should_replace {
939 *best = Some((specificity, kind));
940 }
941}
942
943fn rule_decision(best: Option<(usize, RuleKind)>) -> bool {
944 match best {
945 Some((_, RuleKind::Allow)) | None => true,
946 Some((_, RuleKind::Disallow)) => false,
947 }
948}
949
950fn matching_specificity(pattern: &str, path: &str) -> Option<usize> {
956 if pattern.is_empty() {
957 return None;
958 }
959
960 let (pattern, anchored) = strip_end_anchor(pattern);
961 let matched = if pattern.as_bytes().contains(&b'*') {
962 glob_matches(pattern.as_bytes(), path.as_bytes(), anchored)
963 } else if anchored {
964 path == pattern
965 } else {
966 path.starts_with(pattern)
967 };
968
969 matched.then_some(pattern.len())
970}
971
972fn glob_matches(pattern: &[u8], path: &[u8], anchored: bool) -> bool {
977 let mut parts = pattern.split(|byte| *byte == b'*');
978 let Some(first) = parts.next() else {
979 return true;
980 };
981
982 if !path.starts_with(first) {
983 return false;
984 }
985
986 let mut offset = first.len();
987 let mut ends_with_star = pattern.last() == Some(&b'*');
988
989 for part in parts {
990 if part.is_empty() {
991 ends_with_star = true;
992 continue;
993 }
994
995 ends_with_star = false;
996 let Some(found) = memmem::find(&path[offset..], part) else {
997 return false;
998 };
999 offset += found + part.len();
1000 }
1001
1002 !anchored || ends_with_star || offset == path.len()
1003}
1004
/// Splits off a single trailing `$` end anchor, returning the remaining
/// pattern and whether the anchor was present.
fn strip_end_anchor(pattern: &str) -> (&str, bool) {
    pattern
        .strip_suffix('$')
        .map_or((pattern, false), |stripped| (stripped, true))
}
1011
/// Routes a non-rule directive into the right [`Extensions`] bucket.
///
/// `Sitemap`/`Host`/`Clean-param` values are kept only when non-empty;
/// `Crawl-delay` is always recorded together with the agents of the group
/// it appeared in (empty when it preceded any group); everything else goes
/// into `other` verbatim.
#[cfg(feature = "extensions")]
fn collect_extension<'a>(
    extensions: &mut Extensions<'a>,
    current: Option<&Group<'a>>,
    directive: DirectiveKind,
    key: &'a str,
    value: &'a str,
) {
    match directive {
        DirectiveKind::Sitemap if !value.is_empty() => extensions.sitemaps.push(value),
        DirectiveKind::Host if !value.is_empty() => extensions.hosts.push(value),
        DirectiveKind::CleanParam if !value.is_empty() => {
            extensions.clean_params.push(CleanParam { value });
        }
        DirectiveKind::CrawlDelay => {
            let agents = current.map_or_else(Vec::new, |group| group.agents.clone());
            extensions.crawl_delays.push(CrawlDelay { agents, value });
        }
        // Empty-valued sitemap/host/clean-param lines are dropped entirely.
        DirectiveKind::Sitemap | DirectiveKind::Host | DirectiveKind::CleanParam => {}
        _ => extensions.other.push(Directive { key, value }),
    }
}
1054
1055fn strip_comment(line: &str) -> &str {
1057 match memchr(b'#', line.as_bytes()) {
1058 Some(index) => &line[..index],
1059 None => line,
1060 }
1061}
1062
1063fn split_directive(line: &str) -> Option<(&str, &str)> {
1067 let index = memchr(b':', line.as_bytes())?;
1068 Some((&line[..index], &line[index + 1..]))
1069}
1070
/// Trims ASCII spaces and tabs — only those two bytes, matching robots.txt
/// field padding — from both ends of `value`, without allocating.
fn trim_ascii(value: &str) -> &str {
    fn is_padding(byte: u8) -> bool {
        byte == b' ' || byte == b'\t'
    }

    let bytes = value.as_bytes();
    match bytes.iter().position(|&byte| !is_padding(byte)) {
        // Entirely padding (or empty): nothing survives the trim.
        None => "",
        Some(start) => {
            let end = bytes
                .iter()
                .rposition(|&byte| !is_padding(byte))
                .expect("forward scan already found a non-padding byte")
                + 1;
            &value[start..end]
        }
    }
}
1098
/// Zero-copy line iterator yielding `(1-based line number, line)`, with
/// `\n` terminators consumed and a trailing `\r` stripped from each line.
struct Lines<'a> {
    input: &'a str,
    /// Byte offset of the next unread line; pushed past `input.len()` once
    /// iteration is finished (see `next`).
    offset: usize,
    /// Number of the next line to yield (starts at 1).
    line: usize,
}
1108
1109impl<'a> Lines<'a> {
1110 fn new(input: &'a str) -> Self {
1112 Self {
1113 input,
1114 offset: 0,
1115 line: 1,
1116 }
1117 }
1118}
1119
impl<'a> Iterator for Lines<'a> {
    type Item = (usize, &'a str);

    /// Yields the next line (without its `\n`/`\r\n` terminator) and its
    /// 1-based number.
    fn next(&mut self) -> Option<Self::Item> {
        // `offset` is bumped one past `input.len()` once the final line has
        // been handed out, so this guard terminates the iterator.
        if self.offset > self.input.len() {
            return None;
        }

        let remaining = &self.input[self.offset..];
        if remaining.is_empty() {
            // Input ended exactly on a '\n': don't yield a trailing empty
            // line; bump `offset` so the guard above fires from now on.
            self.offset += 1;
            return None;
        }

        // A final line without '\n' runs to the end of the input.
        let line_end = memchr(b'\n', remaining.as_bytes()).unwrap_or(remaining.len());
        let mut line = &remaining[..line_end];
        // CRLF input: drop the '\r' sitting in front of the '\n'.
        if let Some(stripped) = line.strip_suffix('\r') {
            line = stripped;
        }

        let line_number = self.line;
        self.line += 1;
        self.offset += line_end + 1;
        Some((line_number, line))
    }
}
1147
// Unit tests cover parsing (groups, comments, CRLF, case-insensitivity),
// the matching rules (longest-match, Allow-wins-ties, wildcards/anchors),
// agreement between `RobotsTxt::is_allowed` and the precompiled
// `RobotsMatcher`, the fallible/size-checked entry points, diagnostics,
// and (feature-gated) extension directives.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_groups_comments_and_crlf() {
        let robots = RobotsTxt::parse(
            "# ignored\r\nUser-agent: FooBot\r\nUser-agent: BarBot # same group\r\nDisallow: /private\r\nAllow: /private/public\r\n",
        );

        assert_eq!(robots.groups.len(), 1);
        assert_eq!(robots.groups[0].agents, vec!["FooBot", "BarBot"]);
        assert_eq!(robots.groups[0].rules.len(), 2);
        assert!(!robots.is_allowed("FooBot", "/private/file"));
        assert!(robots.is_allowed("FooBot", "/private/public/file"));
    }

    #[test]
    fn parses_directive_keys_case_insensitively() {
        let robots =
            RobotsTxt::parse("uSeR-aGeNt: FooBot\nDiSaLlOw: /private\nAlLoW: /private/public\n");

        assert!(!robots.is_allowed("FooBot", "/private/file"));
        assert!(robots.is_allowed("FooBot", "/private/public/file"));
    }

    #[test]
    fn ignores_rules_before_first_user_agent() {
        let robots = RobotsTxt::parse("Disallow: /\nUser-agent: *\nAllow: /\n");

        assert!(robots.is_allowed("AnyBot", "/anything"));
    }

    #[test]
    fn starts_new_group_after_rules() {
        let robots = RobotsTxt::parse(
            "User-agent: FooBot\nDisallow: /foo\nUser-agent: BarBot\nDisallow: /bar\n",
        );

        assert_eq!(robots.groups.len(), 2);
        assert!(!robots.is_allowed("FooBot", "/foo"));
        assert!(robots.is_allowed("FooBot", "/bar"));
        assert!(!robots.is_allowed("BarBot", "/bar"));
    }

    #[test]
    fn merges_multiple_exact_matching_groups() {
        let robots = RobotsTxt::parse(
            "User-agent: FooBot\nDisallow: /foo\n\nUser-agent: FooBot\nDisallow: /bar\n",
        );

        assert!(!robots.is_allowed("FooBot", "/foo"));
        assert!(!robots.is_allowed("FooBot", "/bar"));
    }

    #[test]
    fn falls_back_to_star_group() {
        let robots =
            RobotsTxt::parse("User-agent: *\nDisallow: /all\nUser-agent: FooBot\nAllow: /\n");

        assert!(!robots.is_allowed("OtherBot", "/all"));
        assert!(robots.is_allowed("FooBot", "/all"));
    }

    #[test]
    fn longest_match_wins_and_allow_wins_ties() {
        let robots = RobotsTxt::parse(
            "User-agent: *\nDisallow: /example/\nAllow: /example/public\nDisallow: /tie\nAllow: /tie\n",
        );

        assert!(!robots.is_allowed("AnyBot", "/example/private"));
        assert!(robots.is_allowed("AnyBot", "/example/public/page"));
        assert!(robots.is_allowed("AnyBot", "/tie"));
    }

    #[test]
    fn supports_wildcard_and_end_anchor() {
        let robots = RobotsTxt::parse("User-agent: *\nDisallow: /*.gif$\nAllow: /public/*.gif$\n");

        assert!(!robots.is_allowed("AnyBot", "/images/a.gif"));
        assert!(robots.is_allowed("AnyBot", "/images/a.gif?size=large"));
        assert!(robots.is_allowed("AnyBot", "/public/a.gif"));
    }

    #[test]
    fn empty_disallow_does_not_block() {
        let robots = RobotsTxt::parse("User-agent: *\nDisallow:\n");

        assert!(robots.is_allowed("AnyBot", "/anything"));
    }

    #[test]
    fn robots_txt_is_implicitly_allowed() {
        let robots = RobotsTxt::parse("User-agent: *\nDisallow: /\n");

        assert!(robots.is_allowed("AnyBot", "/robots.txt"));
    }

    // The compiled matcher must agree with the direct matcher on every
    // combination of exact-agent groups, wildcards/anchors, and fallback.
    #[test]
    fn compiled_matcher_matches_regular_matcher_for_core_rules() {
        let robots = RobotsTxt::parse(
            "User-agent: FooBot\n\
             Disallow: /foo\n\
             \n\
             User-agent: FooBot\n\
             Disallow: /bar\n\
             Allow: /bar/public\n\
             Disallow: /tie\n\
             Allow: /tie\n\
             \n\
             User-agent: ImageBot\n\
             Disallow: /*.gif$\n\
             Allow: /public/*.gif$\n\
             \n\
             User-agent: *\n\
             Disallow: /fallback\n",
        );
        let matcher = robots.matcher();

        for (agent, path) in [
            ("FooBot", "/foo/page"),
            ("FooBot", "/bar/page"),
            ("FooBot", "/bar/public/page"),
            ("FooBot", "/tie"),
            ("ImageBot", "/images/a.gif"),
            ("ImageBot", "/images/a.gif?size=large"),
            ("ImageBot", "/public/a.gif"),
            ("OtherBot", "/fallback/page"),
            ("OtherBot", "/public/page"),
            ("OtherBot", "/robots.txt"),
        ] {
            assert_eq!(
                matcher.is_allowed(agent, path),
                robots.is_allowed(agent, path),
                "compiled matcher differed for {agent} {path}"
            );
        }
    }

    #[test]
    fn parse_bytes_rejects_invalid_utf8() {
        let error = RobotsTxt::parse_bytes(&[0xff]).expect_err("invalid UTF-8 should fail");

        assert!(matches!(error, ParseError::Utf8(_)));
    }

    #[test]
    fn parse_with_options_rejects_oversized_input() {
        let error =
            RobotsTxt::parse_with_options("User-agent: *\n", ParseOptions { max_bytes: Some(4) })
                .expect_err("oversized input should fail");

        assert!(matches!(error, ParseError::TooLarge { len: 14, max: 4 }));
    }

    #[test]
    fn parse_with_options_allows_disabled_limit() {
        let robots = RobotsTxt::parse_with_options(
            "User-agent: *\nDisallow: /private\n",
            ParseOptions { max_bytes: None },
        )
        .expect("disabled size limit should parse");

        assert!(!robots.is_allowed("AnyBot", "/private"));
    }

    #[test]
    fn diagnostics_report_soft_parse_issues() {
        let report = RobotsTxt::parse_with_diagnostics(
            "Disallow: /\nMissing separator\n: value\nUser-agent:\nUser-agent: *\nDisallow: /private\n",
        );

        assert_eq!(report.warnings.len(), 4);
        assert_eq!(
            report.warnings,
            vec![
                ParseWarning {
                    line: 1,
                    kind: ParseWarningKind::RuleBeforeUserAgent { key: "Disallow" },
                },
                ParseWarning {
                    line: 2,
                    kind: ParseWarningKind::MissingSeparator {
                        line: "Missing separator",
                    },
                },
                ParseWarning {
                    line: 3,
                    kind: ParseWarningKind::EmptyDirectiveKey,
                },
                ParseWarning {
                    line: 4,
                    kind: ParseWarningKind::EmptyUserAgent,
                },
            ]
        );
        assert!(!report.robots.is_allowed("AnyBot", "/private"));
    }

    #[cfg(feature = "extensions")]
    #[test]
    fn collects_extensions_without_changing_groups() {
        let robots = RobotsTxt::parse(
            "Sitemap: https://example.com/sitemap.xml\nUser-agent: Bingbot\nCrawl-delay: 5\nDisallow: /slow\nHost: example.com\nClean-param: ref /shop\nX-Test: value\n",
        );

        assert_eq!(
            robots.extensions.sitemaps,
            vec!["https://example.com/sitemap.xml"]
        );
        assert_eq!(robots.extensions.crawl_delays.len(), 1);
        assert_eq!(robots.extensions.crawl_delays[0].agents, vec!["Bingbot"]);
        assert_eq!(robots.extensions.crawl_delays[0].value, "5");
        assert_eq!(robots.extensions.hosts, vec!["example.com"]);
        assert_eq!(robots.extensions.clean_params[0].value, "ref /shop");
        assert_eq!(robots.extensions.other[0].key, "X-Test");
        assert!(!robots.is_allowed("Bingbot", "/slow"));
    }
}