halldyll_robots/
matcher.rs

1//! Matcher - RFC 9309 compliant path matching
2
3use crate::parser::encoding;
4use crate::types::{Decision, DecisionReason, EffectiveRules, Group, Rule, RuleKind, RobotsPolicy};
5use tracing::debug;
6
7/// Matcher for checking if paths are allowed
8pub struct RobotsMatcher {
9    /// User-agent token to match against
10    user_agent: String,
11}
12
13impl RobotsMatcher {
14    /// Create a new matcher for a user-agent
15    pub fn new(user_agent: &str) -> Self {
16        Self {
17            user_agent: user_agent.to_string(),
18        }
19    }
20
21    /// Get effective rules for this user-agent from a policy
22    pub fn effective_rules(&self, policy: &RobotsPolicy) -> EffectiveRules {
23        let mut matched_groups: Vec<&Group> = Vec::new();
24        let mut wildcard_group: Option<&Group> = None;
25
26        // Find matching groups
27        for group in &policy.groups {
28            if group.is_wildcard() {
29                wildcard_group = Some(group);
30            } else if group.matches_user_agent(&self.user_agent) {
31                matched_groups.push(group);
32            }
33        }
34
35        // If no specific groups match, use wildcard
36        if matched_groups.is_empty() {
37            if let Some(wg) = wildcard_group {
38                matched_groups.push(wg);
39            }
40        }
41
42        // Merge rules from all matching groups
43        let mut rules: Vec<Rule> = Vec::new();
44        let mut crawl_delay: Option<f64> = None;
45        let mut request_rate = None;
46        let mut matched_agents: Vec<String> = Vec::new();
47
48        for group in matched_groups {
49            matched_agents.extend(group.user_agents.clone());
50            rules.extend(group.rules.clone());
51            if crawl_delay.is_none() && group.crawl_delay.is_some() {
52                crawl_delay = group.crawl_delay;
53            }
54            if request_rate.is_none() && group.request_rate.is_some() {
55                request_rate = group.request_rate;
56            }
57        }
58
59        EffectiveRules {
60            rules,
61            crawl_delay,
62            request_rate,
63            matched_agents,
64        }
65    }
66
67    /// Check if a path is allowed by the policy
68    pub fn is_allowed(&self, policy: &RobotsPolicy, path: &str) -> Decision {
69        // Special case: /robots.txt is always allowed
70        if path == "/robots.txt" || path.starts_with("/robots.txt?") {
71            return Decision::allow(DecisionReason::RobotsTxtPath);
72        }
73
74        // Handle fetch status
75        match &policy.fetch_status {
76            crate::types::FetchStatus::Success => {
77                // Normal matching
78            }
79            crate::types::FetchStatus::NotModified => {
80                // 304 - should have been handled by caller using cached policy
81                // If we get here, treat as success (shouldn't happen normally)
82            }
83            crate::types::FetchStatus::Unavailable { .. } => {
84                return Decision::allow(DecisionReason::RobotsUnavailable);
85            }
86            crate::types::FetchStatus::Unreachable { .. } => {
87                return Decision::deny(DecisionReason::RobotsUnreachable);
88            }
89            crate::types::FetchStatus::Protected { .. } => {
90                return Decision::deny(DecisionReason::RobotsProtected);
91            }
92        }
93
94        // Get effective rules
95        let effective = self.effective_rules(policy);
96
97        if effective.rules.is_empty() {
98            // No rules means allowed
99            return Decision::allow(DecisionReason::NoMatchingRule);
100        }
101
102        // Normalize path for matching
103        let normalized_path = encoding::normalize_for_comparison(path);
104
105        // Find the best matching rule
106        self.find_best_match(&effective.rules, &normalized_path)
107    }
108
109    /// Find the best matching rule using longest match wins
110    fn find_best_match(&self, rules: &[Rule], path: &str) -> Decision {
111        let mut best_match: Option<(&Rule, usize)> = None;
112
113        for rule in rules {
114            // Normalize rule pattern for matching
115            let normalized_pattern = encoding::normalize_for_comparison(&rule.pattern);
116
117            // Check if rule matches
118            if rule.matches(path) {
119                let specificity = self.calculate_specificity(&normalized_pattern, path);
120                
121                debug!(
122                    "Rule {:?} {} matches {} with specificity {}",
123                    rule.kind, rule.pattern, path, specificity
124                );
125
126                match best_match {
127                    None => {
128                        best_match = Some((rule, specificity));
129                    }
130                    Some((_, best_spec)) => {
131                        if specificity > best_spec {
132                            // Longer match wins
133                            best_match = Some((rule, specificity));
134                        } else if specificity == best_spec {
135                            // Equal length: Allow wins (RFC 9309 SHOULD)
136                            if let Some((best_rule, _)) = best_match {
137                                if rule.kind == RuleKind::Allow && best_rule.kind == RuleKind::Disallow {
138                                    best_match = Some((rule, specificity));
139                                }
140                            }
141                        }
142                    }
143                }
144            }
145        }
146
147        match best_match {
148            Some((rule, _)) => {
149                let allowed = rule.kind == RuleKind::Allow;
150                let reason = if allowed {
151                    DecisionReason::AllowRuleMatched
152                } else {
153                    DecisionReason::DisallowRuleMatched
154                };
155                Decision {
156                    allowed,
157                    matched_rule: Some(rule.clone()),
158                    reason,
159                }
160            }
161            None => {
162                // No rule matched, default allow
163                Decision::allow(DecisionReason::NoMatchingRule)
164            }
165        }
166    }
167
168    /// Calculate specificity of a match (length of matched portion)
169    fn calculate_specificity(&self, pattern: &str, _path: &str) -> usize {
170        // RFC 9309: longest match wins
171        // We count the pattern length excluding wildcards
172        pattern
173            .chars()
174            .filter(|&c| c != '*' && c != '$')
175            .count()
176    }
177}
178
179/// Check if a URL is allowed (convenience function)
180pub fn is_allowed(policy: &RobotsPolicy, user_agent: &str, path: &str) -> Decision {
181    let matcher = RobotsMatcher::new(user_agent);
182    matcher.is_allowed(policy, path)
183}
184
185/// Get crawl delay for a user-agent (convenience function)
186pub fn get_crawl_delay(policy: &RobotsPolicy, user_agent: &str) -> Option<f64> {
187    let matcher = RobotsMatcher::new(user_agent);
188    let effective = matcher.effective_rules(policy);
189    effective.crawl_delay
190}
191
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::RobotsParser;
    use std::time::Duration;

    /// Parse robots.txt text into a policy, passing a one-hour duration to the parser.
    fn parse_robots(content: &str) -> RobotsPolicy {
        RobotsParser::new().parse(content, Duration::from_secs(3600))
    }

    /// Assert, in order, that each `(path, expected)` pair yields the expected
    /// `allowed` flag for `agent` under `policy`.
    fn expect_decisions(policy: &RobotsPolicy, agent: &str, cases: &[(&str, bool)]) {
        let matcher = RobotsMatcher::new(agent);
        for &(path, expected) in cases {
            assert!(matcher.is_allowed(policy, path).allowed == expected);
        }
    }

    #[test]
    fn test_basic_disallow() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /private/
"#);

        expect_decisions(
            &policy,
            "TestBot",
            &[
                ("/private/", false),
                ("/private/secret", false),
                ("/public/", true),
            ],
        );
    }

    #[test]
    fn test_allow_overrides() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
"#);

        expect_decisions(
            &policy,
            "TestBot",
            &[
                ("/private/", false),
                ("/private/secret", false),
                ("/private/public/", true),
                ("/private/public/file.html", true),
            ],
        );
    }

    #[test]
    fn test_longest_match_wins() {
        let policy = parse_robots(r#"
User-agent: *
Allow: /private/public/
Disallow: /private/
"#);

        // The longer Allow pattern beats the shorter Disallow pattern.
        expect_decisions(
            &policy,
            "TestBot",
            &[("/private/public/", true), ("/private/secret", false)],
        );
    }

    #[test]
    fn test_equal_length_allow_wins() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /path
Allow: /path
"#);

        // Same pattern length on both sides: Allow takes precedence.
        expect_decisions(&policy, "TestBot", &[("/path", true)]);
    }

    #[test]
    fn test_wildcard_pattern() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /*.php
"#);

        expect_decisions(
            &policy,
            "TestBot",
            &[
                ("/index.php", false),
                ("/admin/login.php", false),
                ("/index.html", true),
            ],
        );
    }

    #[test]
    fn test_end_anchor() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /exact$
"#);

        expect_decisions(
            &policy,
            "TestBot",
            &[("/exact", false), ("/exact/", true), ("/exact/more", true)],
        );
    }

    #[test]
    fn test_specific_user_agent() {
        let policy = parse_robots(r#"
User-agent: Googlebot
Disallow: /google-only

User-agent: *
Disallow: /admin
"#);

        // Googlebot follows its own group, so /admin is not blocked for it.
        expect_decisions(
            &policy,
            "Googlebot",
            &[("/google-only", false), ("/admin", true)],
        );

        // Any other bot falls back to the wildcard group.
        expect_decisions(
            &policy,
            "OtherBot",
            &[("/google-only", true), ("/admin", false)],
        );
    }

    #[test]
    fn test_robots_txt_always_allowed() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /
"#);

        // Even under a blanket disallow, /robots.txt itself stays reachable.
        expect_decisions(
            &policy,
            "TestBot",
            &[("/robots.txt", true), ("/anything", false)],
        );
    }

    #[test]
    fn test_user_agent_prefix_match() {
        let policy = parse_robots(r#"
User-agent: Googlebot
Disallow: /
"#);

        // Googlebot-Image is expected to be covered by the Googlebot group.
        expect_decisions(&policy, "Googlebot-Image", &[("/test", false)]);
    }

    #[test]
    fn test_crawl_delay() {
        let policy = parse_robots(r#"
User-agent: *
Crawl-delay: 2.5
Disallow: /admin
"#);

        assert_eq!(get_crawl_delay(&policy, "TestBot"), Some(2.5));
    }

    #[test]
    fn test_no_rules_means_allowed() {
        let policy = parse_robots(r#"
User-agent: *
"#);

        expect_decisions(&policy, "TestBot", &[("/anything", true)]);
    }

    #[test]
    fn test_empty_robots() {
        let policy = parse_robots("");

        expect_decisions(&policy, "TestBot", &[("/anything", true)]);
    }
}
360}