// halldyll_robots/parser.rs

1//! Parser - RFC 9309 compliant robots.txt parser
2//!
3//! This module provides a robust parser for robots.txt files that handles:
4//! - UTF-8 BOM stripping
5//! - Request-rate directive (non-standard but common)
6//! - Proper percent-encoding normalization
7//! - Size limits per RFC 9309
8
9use crate::types::{Group, RequestRate, Rule, RuleKind, RobotsPolicy, FetchStatus};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11use tracing::{debug, warn};
12
/// Maximum robots.txt size to parse, in bytes. RFC 9309 requires crawlers
/// to process at least 500 KiB, so 512 KiB leaves a little headroom.
pub const MAX_ROBOTS_SIZE: usize = 512 * 1024;

/// The three-byte UTF-8 encoding of U+FEFF (byte order mark).
const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
18
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Falls back to 0 if the system clock reports a time before the epoch.
fn now_millis() -> u64 {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or(Duration::ZERO);
    since_epoch.as_millis() as u64
}
26
/// Strip a single leading UTF-8 byte-order mark (U+FEFF), if present.
///
/// Some origins serve robots.txt with a BOM; it must be ignored before
/// parsing, otherwise the first directive line fails to match.
fn strip_bom(content: &str) -> &str {
    // `strip_prefix('\u{FEFF}')` removes exactly the EF BB BF byte
    // sequence (see `UTF8_BOM`) without a manual index slice.
    content.strip_prefix('\u{FEFF}').unwrap_or(content)
}
35
/// Parser for robots.txt files.
///
/// Construct with [`RobotsParser::new`] for the RFC 9309 default size
/// limit, or [`RobotsParser::with_max_size`] to override it.
pub struct RobotsParser {
    /// Maximum content size in bytes; input beyond this is truncated.
    max_size: usize,
}
41
42impl Default for RobotsParser {
43    fn default() -> Self {
44        Self::new()
45    }
46}
47
48impl RobotsParser {
49    /// Create a new parser with default settings
50    pub fn new() -> Self {
51        Self {
52            max_size: MAX_ROBOTS_SIZE,
53        }
54    }
55
56    /// Create a parser with custom max size
57    pub fn with_max_size(max_size: usize) -> Self {
58        Self { max_size }
59    }
60
61    /// Parse robots.txt content into a policy
62    pub fn parse(&self, content: &str, ttl: Duration) -> RobotsPolicy {
63        let now = now_millis();
64        
65        // Strip UTF-8 BOM if present
66        let content = strip_bom(content);
67        
68        // Enforce size limit
69        let content = if content.len() > self.max_size {
70            warn!(
71                "robots.txt exceeds size limit ({} > {}), truncating",
72                content.len(),
73                self.max_size
74            );
75            &content[..self.max_size]
76        } else {
77            content
78        };
79
80        let content_size = content.len();
81        let mut groups: Vec<Group> = Vec::new();
82        let mut sitemaps: Vec<String> = Vec::new();
83        let mut current_group: Option<Group> = None;
84
85        for line in content.lines() {
86            let line = self.clean_line(line);
87            if line.is_empty() {
88                continue;
89            }
90
91            // Parse the line
92            if let Some((directive, value)) = self.parse_directive(&line) {
93                match directive.to_lowercase().as_str() {
94                    "user-agent" => {
95                        // Start a new group or add to current
96                        if let Some(ref mut group) = current_group {
97                            if group.rules.is_empty() {
98                                // No rules yet, can add more user-agents
99                                group.user_agents.push(value.to_string());
100                            } else {
101                                // Rules exist, save current and start new
102                                groups.push(current_group.take().unwrap());
103                                current_group = Some(Group {
104                                    user_agents: vec![value.to_string()],
105                                    rules: Vec::new(),
106                                    crawl_delay: None,
107                                    request_rate: None,
108                                });
109                            }
110                        } else {
111                            current_group = Some(Group {
112                                user_agents: vec![value.to_string()],
113                                rules: Vec::new(),
114                                crawl_delay: None,
115                                request_rate: None,
116                            });
117                        }
118                    }
119                    "allow" => {
120                        if let Some(ref mut group) = current_group {
121                            let pattern = self.normalize_pattern(value);
122                            if !pattern.is_empty() {
123                                group.rules.push(Rule::new(RuleKind::Allow, pattern));
124                            }
125                        }
126                    }
127                    "disallow" => {
128                        if let Some(ref mut group) = current_group {
129                            let pattern = self.normalize_pattern(value);
130                            // Empty disallow means allow all
131                            if !pattern.is_empty() {
132                                group.rules.push(Rule::new(RuleKind::Disallow, pattern));
133                            }
134                        }
135                    }
136                    "crawl-delay" => {
137                        if let Some(ref mut group) = current_group {
138                            if let Ok(delay) = value.trim().parse::<f64>() {
139                                if delay >= 0.0 {
140                                    group.crawl_delay = Some(delay);
141                                }
142                            }
143                        }
144                    }
145                    "request-rate" => {
146                        // Non-standard but common: "requests/seconds" e.g., "1/10"
147                        if let Some(ref mut group) = current_group {
148                            if let Some(rate) = Self::parse_request_rate(value) {
149                                group.request_rate = Some(rate);
150                            }
151                        }
152                    }
153                    "sitemap" => {
154                        // Sitemap is not part of a group
155                        let sitemap_url = value.trim().to_string();
156                        if !sitemap_url.is_empty() {
157                            sitemaps.push(sitemap_url);
158                        }
159                    }
160                    _ => {
161                        // Unknown directive, ignore per RFC
162                        debug!("Ignoring unknown robots.txt directive: {}", directive);
163                    }
164                }
165            }
166        }
167
168        // Save the last group
169        if let Some(group) = current_group {
170            if !group.user_agents.is_empty() {
171                groups.push(group);
172            }
173        }
174
175        RobotsPolicy {
176            fetched_at_ms: now,
177            expires_at_ms: now + ttl.as_millis() as u64,
178            fetch_status: FetchStatus::Success,
179            groups,
180            sitemaps,
181            content_size,
182            etag: None,
183            last_modified: None,
184        }
185    }
186
187    /// Parse a request-rate value like "1/10" (1 request per 10 seconds)
188    fn parse_request_rate(value: &str) -> Option<RequestRate> {
189        let parts: Vec<&str> = value.trim().split('/').collect();
190        if parts.len() == 2 {
191            let requests = parts[0].trim().parse::<u32>().ok()?;
192            let seconds = parts[1].trim().parse::<u32>().ok()?;
193            if requests > 0 && seconds > 0 {
194                return Some(RequestRate::new(requests, seconds));
195            }
196        }
197        None
198    }
199
200    /// Clean a line (remove comments, trim whitespace)
201    fn clean_line(&self, line: &str) -> String {
202        // Remove comments
203        let line = match line.find('#') {
204            Some(pos) => &line[..pos],
205            None => line,
206        };
207        line.trim().to_string()
208    }
209
210    /// Parse a directive line into (directive, value)
211    fn parse_directive<'a>(&self, line: &'a str) -> Option<(&'a str, &'a str)> {
212        let colon_pos = line.find(':')?;
213        let directive = line[..colon_pos].trim();
214        let value = line[colon_pos + 1..].trim();
215        
216        if directive.is_empty() {
217            return None;
218        }
219
220        Some((directive, value))
221    }
222
223    /// Normalize a pattern for matching
224    fn normalize_pattern(&self, pattern: &str) -> String {
225        let pattern = pattern.trim();
226        
227        // Handle empty pattern
228        if pattern.is_empty() {
229            return String::new();
230        }
231
232        // Ensure pattern starts with /
233        if !pattern.starts_with('/') && !pattern.starts_with('*') {
234            format!("/{}", pattern)
235        } else {
236            pattern.to_string()
237        }
238    }
239}
240
/// Utility functions for percent-encoding handling
pub mod encoding {
    /// Decode percent-encoded characters for matching.
    ///
    /// Only octets that decode to RFC 3986 "unreserved" characters
    /// (ALPHA / DIGIT / "-" / "." / "_" / "~") are decoded; every other
    /// escape — including malformed or truncated ones — passes through
    /// unchanged.
    pub fn normalize_path_for_matching(path: &str) -> String {
        let mut result = String::with_capacity(path.len());
        let mut chars = path.chars();

        while let Some(c) = chars.next() {
            if c != '%' {
                result.push(c);
                continue;
            }
            // Collect up to two characters following '%' as hex digits.
            let hex: String = chars.by_ref().take(2).collect();
            if hex.len() == 2 {
                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                    let decoded = byte as char;
                    if is_unreserved(decoded) {
                        result.push(decoded);
                        continue;
                    }
                }
            }
            // Not a decodable unreserved escape: keep the original text.
            result.push('%');
            result.push_str(&hex);
        }

        result
    }

    /// Check if a character is "unreserved" per RFC 3986 §2.3.
    fn is_unreserved(c: char) -> bool {
        c.is_ascii_alphanumeric() || matches!(c, '-' | '.' | '_' | '~')
    }

    /// Normalize both pattern and path for comparison: decode unreserved
    /// escapes, then uppercase the hex digits of remaining escapes so
    /// "%2f" and "%2F" compare equal.
    pub fn normalize_for_comparison(s: &str) -> String {
        let decoded = normalize_path_for_matching(s);
        uppercase_percent_encoding(&decoded)
    }

    /// Uppercase the two characters following each '%'.
    fn uppercase_percent_encoding(s: &str) -> String {
        let mut result = String::with_capacity(s.len());
        let mut chars = s.chars();

        while let Some(c) = chars.next() {
            result.push(c);
            if c == '%' {
                for hex_char in chars.by_ref().take(2) {
                    result.push(hex_char.to_ascii_uppercase());
                }
            }
        }

        result
    }
}
309
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `content` with a default parser and a one-hour TTL.
    fn parse(content: &str) -> RobotsPolicy {
        RobotsParser::new().parse(content, Duration::from_secs(3600))
    }

    #[test]
    fn test_parse_simple() {
        let policy = parse(
            r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
Crawl-delay: 2
"#,
        );

        assert_eq!(policy.groups.len(), 1);
        let group = &policy.groups[0];
        assert_eq!(group.user_agents, vec!["*"]);
        assert_eq!(group.rules.len(), 2);
        assert_eq!(group.crawl_delay, Some(2.0));
    }

    #[test]
    fn test_parse_multiple_groups() {
        let policy = parse(
            r#"
User-agent: Googlebot
User-agent: Bingbot
Disallow: /search

User-agent: *
Disallow: /admin
"#,
        );

        assert_eq!(policy.groups.len(), 2);
        assert_eq!(policy.groups[0].user_agents, vec!["Googlebot", "Bingbot"]);
        assert_eq!(policy.groups[1].user_agents, vec!["*"]);
    }

    #[test]
    fn test_parse_sitemaps() {
        let policy = parse(
            r#"
User-agent: *
Disallow:

Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap2.xml
"#,
        );

        assert_eq!(policy.sitemaps.len(), 2);
        assert_eq!(policy.sitemaps[0], "https://example.com/sitemap.xml");
    }

    #[test]
    fn test_parse_comments() {
        let policy = parse(
            r#"
# This is a comment
User-agent: * # inline comment
Disallow: /private # another comment
"#,
        );

        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].rules.len(), 1);
    }

    #[test]
    fn test_parse_empty_disallow() {
        let policy = parse("\nUser-agent: *\nDisallow:\n");

        // An empty Disallow value must not produce a rule.
        assert_eq!(policy.groups[0].rules.len(), 0);
    }

    #[test]
    fn test_normalize_pattern() {
        let p = RobotsParser::new();
        assert_eq!(p.normalize_pattern(""), "");
        assert_eq!(p.normalize_pattern("*"), "*");
        assert_eq!(p.normalize_pattern("path"), "/path");
        assert_eq!(p.normalize_pattern("/path"), "/path");
    }

    #[test]
    fn test_encoding_normalize() {
        // Unreserved characters are decoded ...
        assert_eq!(
            encoding::normalize_path_for_matching("/path%2Dtest"),
            "/path-test"
        );
        // ... while reserved ones stay percent-encoded.
        assert_eq!(
            encoding::normalize_path_for_matching("/path%2Ftest"),
            "/path%2Ftest"
        );
    }

    #[test]
    fn test_bom_stripping() {
        // A UTF-8 BOM must not hide the first directive line.
        let policy = parse("\u{FEFF}User-agent: *\nDisallow: /private");

        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].user_agents, vec!["*"]);
    }

    #[test]
    fn test_request_rate_parsing() {
        let policy = parse(
            r#"
User-agent: *
Disallow: /private
Request-rate: 1/10
"#,
        );

        assert_eq!(policy.groups.len(), 1);
        let rate = policy.groups[0].request_rate.unwrap();
        assert_eq!(rate.requests, 1);
        assert_eq!(rate.seconds, 10);
        assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
    }

    #[test]
    fn test_crawl_delay_float() {
        let policy = parse("\nUser-agent: *\nCrawl-delay: 0.5\n");

        assert_eq!(policy.groups[0].crawl_delay, Some(0.5));
    }
}