1use std::time::Duration;
2
/// A single robots.txt directive: a path pattern that is either allowed
/// or disallowed for the selected user agent.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Rule {
    /// `Allow:` directive — paths matching the pattern may be fetched.
    Allow(String),
    /// `Disallow:` directive — paths matching the pattern must not be fetched.
    Disallow(String),
}

/// The subset of a robots.txt file that applies to one user agent:
/// its path rules plus an optional crawl delay.
#[derive(Debug, Clone)]
pub struct RobotsTxt {
    /// Rules collected from the first user-agent group that matched.
    rules: Vec<Rule>,
    /// Value of the group's `Crawl-delay:` line, if present and valid.
    crawl_delay: Option<Duration>,
}

impl RobotsTxt {
    /// Parses a robots.txt body and keeps only the directives that apply
    /// to `user_agent`.
    ///
    /// Agent matching is case-insensitive and deliberately loose: a group
    /// applies when its token is `*`, is contained in `user_agent`, or
    /// contains it. Consecutive `User-agent` lines share a single group
    /// (RFC 9309); the first matching group's directives win.
    /// A file that names no user agent at all allows everything.
    pub fn parse(content: &str, user_agent: &str) -> Self {
        let mut rules = Vec::new();
        let mut crawl_delay = None;
        let mut in_matching_section = false;
        let mut in_any_section = false;
        // Set once the current group has seen a directive; the next
        // `User-agent` line then starts a NEW group instead of extending
        // this one.
        let mut group_has_directives = false;
        let ua_lower = user_agent.to_lowercase();

        for raw in content.lines() {
            let line = raw.trim();

            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // Drop inline comments ("Disallow: /x # note").
            let line = match line.find('#') {
                Some(pos) => line[..pos].trim(),
                None => line,
            };

            let (key, value) = match line.split_once(':') {
                Some((k, v)) => (k.trim().to_lowercase(), v.trim()),
                None => continue, // not a "key: value" line; ignore
            };

            match key.as_str() {
                "user-agent" => {
                    if group_has_directives {
                        // The previous group is complete; this line starts
                        // a new one.
                        if in_matching_section {
                            // First matching group wins.
                            // NOTE(review): RFC 9309 prefers the *most
                            // specific* matching group; first-match is kept
                            // for compatibility with existing behavior.
                            break;
                        }
                        group_has_directives = false;
                    }
                    in_any_section = true;
                    let agent_pattern = value.to_lowercase();
                    let matches = agent_pattern == "*"
                        || ua_lower.contains(&agent_pattern)
                        || agent_pattern.contains(&ua_lower);
                    // `||` so any of a group's consecutive agent lines can
                    // select the group (previously a second `User-agent`
                    // line aborted the scan and dropped the group's rules).
                    in_matching_section = in_matching_section || matches;
                }
                "allow" | "disallow" | "crawl-delay" => {
                    // Track directives even in non-matching groups so the
                    // next `User-agent` line is recognized as a new group.
                    group_has_directives = true;
                    if in_matching_section {
                        match key.as_str() {
                            "allow" => rules.push(Rule::Allow(value.to_string())),
                            "disallow" => rules.push(Rule::Disallow(value.to_string())),
                            _ => {
                                // crawl-delay: reject NaN/inf/negative values —
                                // Duration::from_secs_f64 panics on them, so a
                                // hostile robots.txt could crash the crawler.
                                if let Ok(seconds) = value.parse::<f64>() {
                                    if seconds.is_finite() && seconds >= 0.0 {
                                        crawl_delay =
                                            Some(Duration::from_secs_f64(seconds));
                                    }
                                }
                            }
                        }
                    }
                }
                _ => {}
            }
        }

        // No `User-agent` line at all: treat the file as fully permissive.
        if !in_any_section {
            rules.push(Rule::Allow("/".to_string()));
        }

        Self { rules, crawl_delay }
    }

    /// Returns whether `path` may be fetched.
    ///
    /// The longest matching pattern decides; on a tie between an Allow and
    /// a Disallow of equal length, Allow wins (Google REP semantics).
    /// A path matched by no rule is allowed.
    pub fn is_allowed(&self, path: &str) -> bool {
        let mut allowed = true;
        let mut best_match_len = 0;

        for rule in &self.rules {
            let (is_allow, pattern) = match rule {
                Rule::Allow(p) => (true, p),
                Rule::Disallow(p) => (false, p),
            };

            // An empty value (e.g. bare "Disallow:") matches nothing.
            if pattern.is_empty() {
                continue;
            }

            if self.matches_pattern(path, pattern) {
                let len = pattern.len();
                // Longer pattern = more specific; equal length resolves in
                // favor of Allow (least-restrictive rule wins ties).
                if len > best_match_len || (len == best_match_len && is_allow) {
                    best_match_len = len;
                    allowed = is_allow;
                }
            }
        }

        allowed
    }

    /// Tests `path` against one robots.txt pattern.
    ///
    /// Supports `*` wildcards anywhere in the pattern and a trailing `$`
    /// end-of-path anchor. Without `$` the pattern matches any path it is
    /// a (wildcard-expanded) prefix of.
    fn matches_pattern(&self, path: &str, pattern: &str) -> bool {
        // "/" matches every path.
        if pattern == "/" {
            return true;
        }

        // A trailing '$' anchors the pattern to the end of the path.
        let (pattern, anchored) = match pattern.strip_suffix('$') {
            Some(p) => (p, true),
            None => (pattern, false),
        };

        // Greedy left-to-right matching: the literal segments between '*'s
        // must occur in order; an anchored final segment must end the path.
        let mut segments = pattern.split('*');
        let first = segments.next().unwrap_or("");
        let mut rest = match path.strip_prefix(first) {
            Some(r) => r,
            None => return false,
        };
        let segments: Vec<&str> = segments.collect();
        let count = segments.len();
        for (i, &segment) in segments.iter().enumerate() {
            if anchored && i + 1 == count {
                return rest.ends_with(segment);
            }
            match rest.find(segment) {
                Some(pos) => rest = &rest[pos + segment.len()..],
                None => return false,
            }
        }

        // Pattern contained no '*': plain prefix match, or exact match
        // when anchored.
        !anchored || rest.is_empty()
    }

    /// The `Crawl-delay:` for the matched group, if one was given.
    pub fn crawl_delay(&self) -> Option<Duration> {
        self.crawl_delay
    }
}
123
#[cfg(test)]
mod tests {
    use super::*;

    // An empty file has no user-agent groups, so everything is allowed.
    #[test]
    fn parse_empty_robots_txt() {
        let robots = RobotsTxt::parse("", "TestBot");
        assert!(robots.is_allowed("/"));
        assert!(robots.is_allowed("/anything"));
    }

    // A "*" group applies to every agent; the longer Allow pattern
    // overrides the shorter Disallow for paths it covers.
    #[test]
    fn parse_wildcard_user_agent() {
        let content = r#"
User-agent: *
Disallow: /admin/
Allow: /admin/public
"#;
        let robots = RobotsTxt::parse(content, "TestBot");
        assert!(!robots.is_allowed("/admin/"));
        assert!(!robots.is_allowed("/admin/secret"));
        assert!(robots.is_allowed("/admin/public"));
        assert!(robots.is_allowed("/"));
    }

    // Only the group whose agent token matches the crawler is used;
    // the BadBot group's blanket Disallow must not leak into GoodBot.
    #[test]
    fn parse_specific_user_agent() {
        let content = r#"
User-agent: BadBot
Disallow: /

User-agent: GoodBot
Disallow: /private/
"#;
        let robots = RobotsTxt::parse(content, "GoodBot");
        assert!(!robots.is_allowed("/private/"));
        assert!(robots.is_allowed("/public/"));
    }

    // Fractional crawl delays are parsed as f64 seconds.
    #[test]
    fn parse_crawl_delay() {
        let content = r#"
User-agent: *
Crawl-delay: 2.5
"#;
        let robots = RobotsTxt::parse(content, "TestBot");
        assert_eq!(robots.crawl_delay(), Some(Duration::from_secs_f64(2.5)));
    }

    // A trailing '*' makes the pattern a pure prefix match.
    #[test]
    fn pattern_matching_wildcard() {
        let content = r#"
User-agent: *
Disallow: /temp*
"#;
        let robots = RobotsTxt::parse(content, "TestBot");
        assert!(!robots.is_allowed("/temp"));
        assert!(!robots.is_allowed("/temporary"));
        assert!(robots.is_allowed("/other"));
    }

    // A trailing '$' anchors the match at the end of the path, so a
    // query string defeats the pattern.
    #[test]
    fn pattern_matching_end_anchor() {
        let content = r#"
User-agent: *
Disallow: /file.html$
"#;
        let robots = RobotsTxt::parse(content, "TestBot");
        assert!(!robots.is_allowed("/file.html"));
        assert!(robots.is_allowed("/file.html?query=1"));
    }

    // The longest matching pattern decides, regardless of rule order.
    #[test]
    fn longest_match_wins() {
        let content = r#"
User-agent: *
Disallow: /admin/
Allow: /admin/public/
"#;
        let robots = RobotsTxt::parse(content, "TestBot");
        assert!(!robots.is_allowed("/admin/"));
        assert!(!robots.is_allowed("/admin/secret"));
        assert!(robots.is_allowed("/admin/public/"));
        assert!(robots.is_allowed("/admin/public/page"));
    }

    // Agent comparison is case-insensitive on both sides.
    #[test]
    fn case_insensitive_user_agent() {
        let content = r#"
User-agent: googlebot
Disallow: /private/
"#;
        let robots = RobotsTxt::parse(content, "GoogleBot");
        assert!(!robots.is_allowed("/private/"));
    }

    // Full-line and inline '#' comments are stripped before parsing.
    #[test]
    fn comments_ignored() {
        let content = r#"
# This is a comment
User-agent: *
Disallow: /admin/ # inline comment
"#;
        let robots = RobotsTxt::parse(content, "TestBot");
        assert!(!robots.is_allowed("/admin/"));
    }
}