//! halldyll-robots 0.1.0
//!
//! robots.txt parser and compliance for the halldyll scraper.
//!
//! Matcher - RFC 9309 compliant path matching

use crate::parser::encoding;
use crate::types::{Decision, DecisionReason, EffectiveRules, Group, Rule, RuleKind, RobotsPolicy};
use tracing::debug;

/// Matcher for checking if paths are allowed.
///
/// Bound to a single user-agent token at construction time; all queries
/// (`is_allowed`, `effective_rules`) resolve rule groups for that token.
pub struct RobotsMatcher {
    /// User-agent token to match against group `User-agent:` lines.
    /// Stored owned (copied from the `&str` passed to [`RobotsMatcher::new`]).
    user_agent: String,
}

impl RobotsMatcher {
    /// Create a new matcher for a user-agent
    pub fn new(user_agent: &str) -> Self {
        Self {
            user_agent: user_agent.to_string(),
        }
    }

    /// Get effective rules for this user-agent from a policy
    pub fn effective_rules(&self, policy: &RobotsPolicy) -> EffectiveRules {
        let mut matched_groups: Vec<&Group> = Vec::new();
        let mut wildcard_group: Option<&Group> = None;

        // Find matching groups
        for group in &policy.groups {
            if group.is_wildcard() {
                wildcard_group = Some(group);
            } else if group.matches_user_agent(&self.user_agent) {
                matched_groups.push(group);
            }
        }

        // If no specific groups match, use wildcard
        if matched_groups.is_empty() {
            if let Some(wg) = wildcard_group {
                matched_groups.push(wg);
            }
        }

        // Merge rules from all matching groups
        let mut rules: Vec<Rule> = Vec::new();
        let mut crawl_delay: Option<f64> = None;
        let mut request_rate = None;
        let mut matched_agents: Vec<String> = Vec::new();

        for group in matched_groups {
            matched_agents.extend(group.user_agents.clone());
            rules.extend(group.rules.clone());
            if crawl_delay.is_none() && group.crawl_delay.is_some() {
                crawl_delay = group.crawl_delay;
            }
            if request_rate.is_none() && group.request_rate.is_some() {
                request_rate = group.request_rate;
            }
        }

        EffectiveRules {
            rules,
            crawl_delay,
            request_rate,
            matched_agents,
        }
    }

    /// Check if a path is allowed by the policy
    pub fn is_allowed(&self, policy: &RobotsPolicy, path: &str) -> Decision {
        // Special case: /robots.txt is always allowed
        if path == "/robots.txt" || path.starts_with("/robots.txt?") {
            return Decision::allow(DecisionReason::RobotsTxtPath);
        }

        // Handle fetch status
        match &policy.fetch_status {
            crate::types::FetchStatus::Success => {
                // Normal matching
            }
            crate::types::FetchStatus::NotModified => {
                // 304 - should have been handled by caller using cached policy
                // If we get here, treat as success (shouldn't happen normally)
            }
            crate::types::FetchStatus::Unavailable { .. } => {
                return Decision::allow(DecisionReason::RobotsUnavailable);
            }
            crate::types::FetchStatus::Unreachable { .. } => {
                return Decision::deny(DecisionReason::RobotsUnreachable);
            }
            crate::types::FetchStatus::Protected { .. } => {
                return Decision::deny(DecisionReason::RobotsProtected);
            }
        }

        // Get effective rules
        let effective = self.effective_rules(policy);

        if effective.rules.is_empty() {
            // No rules means allowed
            return Decision::allow(DecisionReason::NoMatchingRule);
        }

        // Normalize path for matching
        let normalized_path = encoding::normalize_for_comparison(path);

        // Find the best matching rule
        self.find_best_match(&effective.rules, &normalized_path)
    }

    /// Find the best matching rule using longest match wins
    fn find_best_match(&self, rules: &[Rule], path: &str) -> Decision {
        let mut best_match: Option<(&Rule, usize)> = None;

        for rule in rules {
            // Normalize rule pattern for matching
            let normalized_pattern = encoding::normalize_for_comparison(&rule.pattern);

            // Check if rule matches
            if rule.matches(path) {
                let specificity = self.calculate_specificity(&normalized_pattern, path);
                
                debug!(
                    "Rule {:?} {} matches {} with specificity {}",
                    rule.kind, rule.pattern, path, specificity
                );

                match best_match {
                    None => {
                        best_match = Some((rule, specificity));
                    }
                    Some((_, best_spec)) => {
                        if specificity > best_spec {
                            // Longer match wins
                            best_match = Some((rule, specificity));
                        } else if specificity == best_spec {
                            // Equal length: Allow wins (RFC 9309 SHOULD)
                            if let Some((best_rule, _)) = best_match {
                                if rule.kind == RuleKind::Allow && best_rule.kind == RuleKind::Disallow {
                                    best_match = Some((rule, specificity));
                                }
                            }
                        }
                    }
                }
            }
        }

        match best_match {
            Some((rule, _)) => {
                let allowed = rule.kind == RuleKind::Allow;
                let reason = if allowed {
                    DecisionReason::AllowRuleMatched
                } else {
                    DecisionReason::DisallowRuleMatched
                };
                Decision {
                    allowed,
                    matched_rule: Some(rule.clone()),
                    reason,
                }
            }
            None => {
                // No rule matched, default allow
                Decision::allow(DecisionReason::NoMatchingRule)
            }
        }
    }

    /// Calculate specificity of a match (length of matched portion)
    fn calculate_specificity(&self, pattern: &str, _path: &str) -> usize {
        // RFC 9309: longest match wins
        // We count the pattern length excluding wildcards
        pattern
            .chars()
            .filter(|&c| c != '*' && c != '$')
            .count()
    }
}

/// Convenience free function: check whether `path` is allowed for
/// `user_agent` under `policy`, without keeping a matcher around.
pub fn is_allowed(policy: &RobotsPolicy, user_agent: &str, path: &str) -> Decision {
    RobotsMatcher::new(user_agent).is_allowed(policy, path)
}

/// Convenience free function: look up the effective crawl delay for
/// `user_agent` under `policy`, if any group declares one.
pub fn get_crawl_delay(policy: &RobotsPolicy, user_agent: &str) -> Option<f64> {
    RobotsMatcher::new(user_agent)
        .effective_rules(policy)
        .crawl_delay
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::RobotsParser;
    use std::time::Duration;

    /// Parse robots.txt text into a policy with a fixed one-hour TTL.
    fn parse_robots(content: &str) -> RobotsPolicy {
        RobotsParser::new().parse(content, Duration::from_secs(3600))
    }

    /// Shorthand: is `path` allowed for `agent` under `policy`?
    fn allowed(policy: &RobotsPolicy, agent: &str, path: &str) -> bool {
        RobotsMatcher::new(agent).is_allowed(policy, path).allowed
    }

    #[test]
    fn test_basic_disallow() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /private/
"#);
        assert!(!allowed(&policy, "TestBot", "/private/"));
        assert!(!allowed(&policy, "TestBot", "/private/secret"));
        assert!(allowed(&policy, "TestBot", "/public/"));
    }

    #[test]
    fn test_allow_overrides() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
"#);
        assert!(!allowed(&policy, "TestBot", "/private/"));
        assert!(!allowed(&policy, "TestBot", "/private/secret"));
        assert!(allowed(&policy, "TestBot", "/private/public/"));
        assert!(allowed(&policy, "TestBot", "/private/public/file.html"));
    }

    #[test]
    fn test_longest_match_wins() {
        let policy = parse_robots(r#"
User-agent: *
Allow: /private/public/
Disallow: /private/
"#);
        // The longer Allow pattern beats the shorter Disallow.
        assert!(allowed(&policy, "TestBot", "/private/public/"));
        assert!(!allowed(&policy, "TestBot", "/private/secret"));
    }

    #[test]
    fn test_equal_length_allow_wins() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /path
Allow: /path
"#);
        // Equal specificity: Allow is the tie-break winner.
        assert!(allowed(&policy, "TestBot", "/path"));
    }

    #[test]
    fn test_wildcard_pattern() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /*.php
"#);
        assert!(!allowed(&policy, "TestBot", "/index.php"));
        assert!(!allowed(&policy, "TestBot", "/admin/login.php"));
        assert!(allowed(&policy, "TestBot", "/index.html"));
    }

    #[test]
    fn test_end_anchor() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /exact$
"#);
        assert!(!allowed(&policy, "TestBot", "/exact"));
        assert!(allowed(&policy, "TestBot", "/exact/"));
        assert!(allowed(&policy, "TestBot", "/exact/more"));
    }

    #[test]
    fn test_specific_user_agent() {
        let policy = parse_robots(r#"
User-agent: Googlebot
Disallow: /google-only

User-agent: *
Disallow: /admin
"#);
        // Googlebot gets only its own group, not the wildcard group.
        assert!(!allowed(&policy, "Googlebot", "/google-only"));
        assert!(allowed(&policy, "Googlebot", "/admin")); // Not blocked for Googlebot

        // Everyone else falls through to the wildcard group.
        assert!(allowed(&policy, "OtherBot", "/google-only"));
        assert!(!allowed(&policy, "OtherBot", "/admin"));
    }

    #[test]
    fn test_robots_txt_always_allowed() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /
"#);
        // Even under a blanket Disallow, /robots.txt stays reachable.
        assert!(allowed(&policy, "TestBot", "/robots.txt"));
        assert!(!allowed(&policy, "TestBot", "/anything"));
    }

    #[test]
    fn test_user_agent_prefix_match() {
        let policy = parse_robots(r#"
User-agent: Googlebot
Disallow: /
"#);
        // "Googlebot-Image" should match the "Googlebot" group.
        assert!(!allowed(&policy, "Googlebot-Image", "/test"));
    }

    #[test]
    fn test_crawl_delay() {
        let policy = parse_robots(r#"
User-agent: *
Crawl-delay: 2.5
Disallow: /admin
"#);
        assert_eq!(get_crawl_delay(&policy, "TestBot"), Some(2.5));
    }

    #[test]
    fn test_no_rules_means_allowed() {
        let policy = parse_robots(r#"
User-agent: *
"#);
        assert!(allowed(&policy, "TestBot", "/anything"));
    }

    #[test]
    fn test_empty_robots() {
        let policy = parse_robots("");
        assert!(allowed(&policy, "TestBot", "/anything"));
    }
}