halldyll_robots/
parser.rs1use crate::types::{Group, RequestRate, Rule, RuleKind, RobotsPolicy, FetchStatus};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11use tracing::{debug, warn};
12
/// Maximum number of bytes of robots.txt content that will be parsed;
/// anything beyond this limit is truncated before parsing (512 KiB).
pub const MAX_ROBOTS_SIZE: usize = 512 * 1024;

/// The UTF-8 encoding of the byte-order mark (U+FEFF), which some servers
/// prepend to robots.txt; it is stripped before parsing.
const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
18
/// Current wall-clock time as milliseconds since the Unix epoch.
/// If the system clock reads earlier than the epoch, returns 0 instead
/// of panicking.
fn now_millis() -> u64 {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default();
    since_epoch.as_millis() as u64
}
26
/// Removes a leading UTF-8 byte-order mark from `content`, if present;
/// otherwise returns the input unchanged.
///
/// U+FEFF encodes to the three bytes `EF BB BF`, so this single
/// `strip_prefix` call matches exactly what the former manual byte
/// comparison and `&content[3..]` slice did, without hand-rolled
/// index arithmetic.
fn strip_bom(content: &str) -> &str {
    content.strip_prefix('\u{feff}').unwrap_or(content)
}
35
/// Parser for robots.txt content, producing a `RobotsPolicy`.
pub struct RobotsParser {
    // Maximum number of bytes of input parsed; longer content is truncated.
    max_size: usize,
}
41
impl Default for RobotsParser {
    /// Equivalent to `RobotsParser::new()`: uses the default size limit.
    fn default() -> Self {
        Self::new()
    }
}
47
48impl RobotsParser {
49 pub fn new() -> Self {
51 Self {
52 max_size: MAX_ROBOTS_SIZE,
53 }
54 }
55
56 pub fn with_max_size(max_size: usize) -> Self {
58 Self { max_size }
59 }
60
61 pub fn parse(&self, content: &str, ttl: Duration) -> RobotsPolicy {
63 let now = now_millis();
64
65 let content = strip_bom(content);
67
68 let content = if content.len() > self.max_size {
70 warn!(
71 "robots.txt exceeds size limit ({} > {}), truncating",
72 content.len(),
73 self.max_size
74 );
75 &content[..self.max_size]
76 } else {
77 content
78 };
79
80 let content_size = content.len();
81 let mut groups: Vec<Group> = Vec::new();
82 let mut sitemaps: Vec<String> = Vec::new();
83 let mut current_group: Option<Group> = None;
84
85 for line in content.lines() {
86 let line = self.clean_line(line);
87 if line.is_empty() {
88 continue;
89 }
90
91 if let Some((directive, value)) = self.parse_directive(&line) {
93 match directive.to_lowercase().as_str() {
94 "user-agent" => {
95 if let Some(ref mut group) = current_group {
97 if group.rules.is_empty() {
98 group.user_agents.push(value.to_string());
100 } else {
101 groups.push(current_group.take().unwrap());
103 current_group = Some(Group {
104 user_agents: vec![value.to_string()],
105 rules: Vec::new(),
106 crawl_delay: None,
107 request_rate: None,
108 });
109 }
110 } else {
111 current_group = Some(Group {
112 user_agents: vec![value.to_string()],
113 rules: Vec::new(),
114 crawl_delay: None,
115 request_rate: None,
116 });
117 }
118 }
119 "allow" => {
120 if let Some(ref mut group) = current_group {
121 let pattern = self.normalize_pattern(value);
122 if !pattern.is_empty() {
123 group.rules.push(Rule::new(RuleKind::Allow, pattern));
124 }
125 }
126 }
127 "disallow" => {
128 if let Some(ref mut group) = current_group {
129 let pattern = self.normalize_pattern(value);
130 if !pattern.is_empty() {
132 group.rules.push(Rule::new(RuleKind::Disallow, pattern));
133 }
134 }
135 }
136 "crawl-delay" => {
137 if let Some(ref mut group) = current_group {
138 if let Ok(delay) = value.trim().parse::<f64>() {
139 if delay >= 0.0 {
140 group.crawl_delay = Some(delay);
141 }
142 }
143 }
144 }
145 "request-rate" => {
146 if let Some(ref mut group) = current_group {
148 if let Some(rate) = Self::parse_request_rate(value) {
149 group.request_rate = Some(rate);
150 }
151 }
152 }
153 "sitemap" => {
154 let sitemap_url = value.trim().to_string();
156 if !sitemap_url.is_empty() {
157 sitemaps.push(sitemap_url);
158 }
159 }
160 _ => {
161 debug!("Ignoring unknown robots.txt directive: {}", directive);
163 }
164 }
165 }
166 }
167
168 if let Some(group) = current_group {
170 if !group.user_agents.is_empty() {
171 groups.push(group);
172 }
173 }
174
175 RobotsPolicy {
176 fetched_at_ms: now,
177 expires_at_ms: now + ttl.as_millis() as u64,
178 fetch_status: FetchStatus::Success,
179 groups,
180 sitemaps,
181 content_size,
182 etag: None,
183 last_modified: None,
184 }
185 }
186
187 fn parse_request_rate(value: &str) -> Option<RequestRate> {
189 let parts: Vec<&str> = value.trim().split('/').collect();
190 if parts.len() == 2 {
191 let requests = parts[0].trim().parse::<u32>().ok()?;
192 let seconds = parts[1].trim().parse::<u32>().ok()?;
193 if requests > 0 && seconds > 0 {
194 return Some(RequestRate::new(requests, seconds));
195 }
196 }
197 None
198 }
199
200 fn clean_line(&self, line: &str) -> String {
202 let line = match line.find('#') {
204 Some(pos) => &line[..pos],
205 None => line,
206 };
207 line.trim().to_string()
208 }
209
210 fn parse_directive<'a>(&self, line: &'a str) -> Option<(&'a str, &'a str)> {
212 let colon_pos = line.find(':')?;
213 let directive = line[..colon_pos].trim();
214 let value = line[colon_pos + 1..].trim();
215
216 if directive.is_empty() {
217 return None;
218 }
219
220 Some((directive, value))
221 }
222
223 fn normalize_pattern(&self, pattern: &str) -> String {
225 let pattern = pattern.trim();
226
227 if pattern.is_empty() {
229 return String::new();
230 }
231
232 if !pattern.starts_with('/') && !pattern.starts_with('*') {
234 format!("/{}", pattern)
235 } else {
236 pattern.to_string()
237 }
238 }
239}
240
/// Helpers for percent-encoding normalization of URL paths, so that
/// robots.txt patterns and request paths compare consistently.
pub mod encoding {
    /// Decodes percent-escapes of RFC 3986 "unreserved" characters
    /// (`A-Z a-z 0-9 - . _ ~`) so that e.g. `%2D` matches a literal `-`.
    /// All other escapes — and malformed `%` sequences (truncated or
    /// non-hex) — are emitted unchanged.
    pub fn normalize_path_for_matching(path: &str) -> String {
        let mut result = String::with_capacity(path.len());
        let mut chars = path.chars();

        while let Some(c) = chars.next() {
            if c != '%' {
                result.push(c);
                continue;
            }
            // Grab up to two characters of the escape body.
            let hex: String = chars.by_ref().take(2).collect();
            if hex.len() == 2 {
                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                    // Bytes >= 0x80 map to non-ASCII chars, which
                    // is_unreserved rejects, so they stay encoded.
                    let decoded = byte as char;
                    if is_unreserved(decoded) {
                        result.push(decoded);
                        continue;
                    }
                }
            }
            // Not a decodable unreserved escape: re-emit as seen.
            result.push('%');
            result.push_str(&hex);
        }

        result
    }

    /// RFC 3986 §2.3 "unreserved" characters, which are safe to decode.
    fn is_unreserved(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '-' || c == '.' || c == '_' || c == '~'
    }

    /// Full normalization for comparisons: decode unreserved escapes,
    /// then canonicalize remaining escapes to uppercase hex.
    pub fn normalize_for_comparison(s: &str) -> String {
        let decoded = normalize_path_for_matching(s);
        uppercase_percent_encoding(&decoded)
    }

    /// Uppercases the hex digits of percent-escapes (`%2f` -> `%2F`),
    /// the canonical form per RFC 3986 §6.2.2.1.
    ///
    /// BUGFIX: the previous version uppercased *whatever* two characters
    /// followed a `%`, so a stray percent sign mangled ordinary text
    /// (e.g. `"50%off"` became `"50%OFf"`). Only well-formed hex digits
    /// are uppercased now; everything else is copied through untouched.
    fn uppercase_percent_encoding(s: &str) -> String {
        let mut result = String::with_capacity(s.len());
        let mut chars = s.chars().peekable();

        while let Some(c) = chars.next() {
            result.push(c);
            if c == '%' {
                for _ in 0..2 {
                    match chars.peek() {
                        Some(h) if h.is_ascii_hexdigit() => {
                            let h = chars.next().expect("peeked value exists");
                            result.push(h.to_ascii_uppercase());
                        }
                        _ => break,
                    }
                }
            }
        }

        result
    }
}
309
#[cfg(test)]
mod tests {
    use super::*;

    // One group, one wildcard UA, mixed Allow/Disallow, integer crawl delay.
    #[test]
    fn test_parse_simple() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
Crawl-delay: 2
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].user_agents, vec!["*"]);
        assert_eq!(policy.groups[0].rules.len(), 2);
        assert_eq!(policy.groups[0].crawl_delay, Some(2.0));
    }

    // Consecutive User-agent lines merge into one group; a User-agent
    // appearing after rules opens a second group.
    #[test]
    fn test_parse_multiple_groups() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: Googlebot
User-agent: Bingbot
Disallow: /search

User-agent: *
Disallow: /admin
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups.len(), 2);
        assert_eq!(policy.groups[0].user_agents, vec!["Googlebot", "Bingbot"]);
        assert_eq!(policy.groups[1].user_agents, vec!["*"]);
    }

    // Sitemap directives are collected in order, outside of any group.
    #[test]
    fn test_parse_sitemaps() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow:

Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap2.xml
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.sitemaps.len(), 2);
        assert_eq!(policy.sitemaps[0], "https://example.com/sitemap.xml");
    }

    // Full-line and inline `#` comments are stripped before parsing.
    #[test]
    fn test_parse_comments() {
        let parser = RobotsParser::new();
        let content = r#"
# This is a comment
User-agent: * # inline comment
Disallow: /private # another comment
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].rules.len(), 1);
    }

    // An empty Disallow value ("allow everything") produces no rule.
    #[test]
    fn test_parse_empty_disallow() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow:
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups[0].rules.len(), 0);
    }

    // Patterns get a leading `/` unless already rooted or a wildcard.
    #[test]
    fn test_normalize_pattern() {
        let parser = RobotsParser::new();
        assert_eq!(parser.normalize_pattern("/path"), "/path");
        assert_eq!(parser.normalize_pattern("path"), "/path");
        assert_eq!(parser.normalize_pattern("*"), "*");
        assert_eq!(parser.normalize_pattern(""), "");
    }

    // Unreserved escapes decode (%2D -> '-'); reserved ones (%2F) do not.
    #[test]
    fn test_encoding_normalize() {
        use encoding::normalize_path_for_matching;

        assert_eq!(normalize_path_for_matching("/path%2Dtest"), "/path-test");

        assert_eq!(normalize_path_for_matching("/path%2Ftest"), "/path%2Ftest");
    }

    // A leading U+FEFF BOM is stripped before the first line is parsed.
    #[test]
    fn test_bom_stripping() {
        let parser = RobotsParser::new();
        let content = "\u{FEFF}User-agent: *\nDisallow: /private";
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].user_agents, vec!["*"]);
    }

    // Request-rate `1/10` parses to (requests=1, seconds=10) and a 10s delay.
    #[test]
    fn test_request_rate_parsing() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow: /private
Request-rate: 1/10
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups.len(), 1);
        let rate = policy.groups[0].request_rate.unwrap();
        assert_eq!(rate.requests, 1);
        assert_eq!(rate.seconds, 10);
        assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
    }

    // Crawl-delay accepts fractional seconds.
    #[test]
    fn test_crawl_delay_float() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Crawl-delay: 0.5
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));

        assert_eq!(policy.groups[0].crawl_delay, Some(0.5));
    }
}