halldyll-robots 0.1.0

robots.txt parsing and compliance for the halldyll scraper
//! Parser - RFC 9309 compliant robots.txt parser
//!
//! This module provides a robust parser for robots.txt files that handles:
//! - UTF-8 BOM stripping
//! - Request-rate directive (non-standard but common)
//! - Proper percent-encoding normalization
//! - Size limits per RFC 9309
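//!
//! # Example
//!
//! A minimal usage sketch (the `halldyll_robots::parser` path is an assumption
//! based on the crate name; adjust to the actual re-export):
//!
//! ```ignore
//! use halldyll_robots::parser::RobotsParser;
//! use std::time::Duration;
//!
//! let parser = RobotsParser::new();
//! let policy = parser.parse(
//!     "User-agent: *\nDisallow: /private/\nAllow: /private/public/",
//!     Duration::from_secs(3600),
//! );
//! assert_eq!(policy.groups.len(), 1);
//! assert_eq!(policy.groups[0].rules.len(), 2);
//! ```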

use crate::types::{Group, RequestRate, Rule, RuleKind, RobotsPolicy, FetchStatus};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::{debug, warn};

/// Maximum robots.txt size to parse (RFC 9309 requires handling at least 500 KiB)
pub const MAX_ROBOTS_SIZE: usize = 512 * 1024;

/// UTF-8 BOM bytes
const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

/// Get current timestamp in milliseconds
fn now_millis() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_millis() as u64
}

/// Strip UTF-8 BOM if present
fn strip_bom(content: &str) -> &str {
    if content.as_bytes().starts_with(UTF8_BOM) {
        &content[3..]
    } else {
        content
    }
}

/// Parser for robots.txt files
pub struct RobotsParser {
    /// Maximum content size to parse
    max_size: usize,
}

impl Default for RobotsParser {
    fn default() -> Self {
        Self::new()
    }
}

impl RobotsParser {
    /// Create a new parser with default settings
    pub fn new() -> Self {
        Self {
            max_size: MAX_ROBOTS_SIZE,
        }
    }

    /// Create a parser with custom max size
    pub fn with_max_size(max_size: usize) -> Self {
        Self { max_size }
    }

    /// Parse robots.txt content into a policy
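    ///
    /// A minimal sketch of the expected behaviour (illustrative; mirrors the unit
    /// tests below):
    ///
    /// ```ignore
    /// let parser = RobotsParser::new();
    /// let policy = parser.parse(
    ///     "User-agent: *\nCrawl-delay: 2\nSitemap: https://example.com/sitemap.xml",
    ///     Duration::from_secs(3600),
    /// );
    /// assert_eq!(policy.groups[0].crawl_delay, Some(2.0));
    /// assert_eq!(policy.sitemaps, vec!["https://example.com/sitemap.xml"]);
    /// ```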
    pub fn parse(&self, content: &str, ttl: Duration) -> RobotsPolicy {
        let now = now_millis();
        
        // Strip UTF-8 BOM if present
        let content = strip_bom(content);
        
        // Enforce size limit, truncating on a char boundary so slicing cannot
        // panic when the limit falls inside a multi-byte UTF-8 sequence
        let content = if content.len() > self.max_size {
            warn!(
                "robots.txt exceeds size limit ({} > {}), truncating",
                content.len(),
                self.max_size
            );
            let mut end = self.max_size;
            while !content.is_char_boundary(end) {
                end -= 1;
            }
            &content[..end]
        } else {
            content
        };

        let content_size = content.len();
        let mut groups: Vec<Group> = Vec::new();
        let mut sitemaps: Vec<String> = Vec::new();
        let mut current_group: Option<Group> = None;

        for line in content.lines() {
            let line = self.clean_line(line);
            if line.is_empty() {
                continue;
            }

            // Parse the line
            if let Some((directive, value)) = self.parse_directive(&line) {
                match directive.to_lowercase().as_str() {
                    "user-agent" => {
                        // Start a new group or add to current
                        if let Some(ref mut group) = current_group {
                            if group.rules.is_empty() {
                                // No rules yet, can add more user-agents
                                group.user_agents.push(value.to_string());
                            } else {
                                // Rules exist, save current and start new
                                groups.push(current_group.take().unwrap());
                                current_group = Some(Group {
                                    user_agents: vec![value.to_string()],
                                    rules: Vec::new(),
                                    crawl_delay: None,
                                    request_rate: None,
                                });
                            }
                        } else {
                            current_group = Some(Group {
                                user_agents: vec![value.to_string()],
                                rules: Vec::new(),
                                crawl_delay: None,
                                request_rate: None,
                            });
                        }
                    }
                    "allow" => {
                        if let Some(ref mut group) = current_group {
                            let pattern = self.normalize_pattern(value);
                            if !pattern.is_empty() {
                                group.rules.push(Rule::new(RuleKind::Allow, pattern));
                            }
                        }
                    }
                    "disallow" => {
                        if let Some(ref mut group) = current_group {
                            let pattern = self.normalize_pattern(value);
                            // Empty disallow means allow all
                            if !pattern.is_empty() {
                                group.rules.push(Rule::new(RuleKind::Disallow, pattern));
                            }
                        }
                    }
                    "crawl-delay" => {
                        if let Some(ref mut group) = current_group {
                            if let Ok(delay) = value.trim().parse::<f64>() {
                                if delay >= 0.0 {
                                    group.crawl_delay = Some(delay);
                                }
                            }
                        }
                    }
                    "request-rate" => {
                        // Non-standard but common: "requests/seconds" e.g., "1/10"
                        if let Some(ref mut group) = current_group {
                            if let Some(rate) = Self::parse_request_rate(value) {
                                group.request_rate = Some(rate);
                            }
                        }
                    }
                    "sitemap" => {
                        // Sitemap is not part of a group
                        let sitemap_url = value.trim().to_string();
                        if !sitemap_url.is_empty() {
                            sitemaps.push(sitemap_url);
                        }
                    }
                    _ => {
                        // Unknown directive, ignore per RFC
                        debug!("Ignoring unknown robots.txt directive: {}", directive);
                    }
                }
            }
        }

        // Save the last group
        if let Some(group) = current_group {
            if !group.user_agents.is_empty() {
                groups.push(group);
            }
        }

        RobotsPolicy {
            fetched_at_ms: now,
            expires_at_ms: now + ttl.as_millis() as u64,
            fetch_status: FetchStatus::Success,
            groups,
            sitemaps,
            content_size,
            etag: None,
            last_modified: None,
        }
    }

    /// Parse a request-rate value like "1/10" (1 request per 10 seconds)
    fn parse_request_rate(value: &str) -> Option<RequestRate> {
        let parts: Vec<&str> = value.trim().split('/').collect();
        if parts.len() == 2 {
            let requests = parts[0].trim().parse::<u32>().ok()?;
            let seconds = parts[1].trim().parse::<u32>().ok()?;
            if requests > 0 && seconds > 0 {
                return Some(RequestRate::new(requests, seconds));
            }
        }
        None
    }

    /// Clean a line (remove comments, trim whitespace)
    fn clean_line(&self, line: &str) -> String {
        // Remove comments
        let line = match line.find('#') {
            Some(pos) => &line[..pos],
            None => line,
        };
        line.trim().to_string()
    }

    /// Parse a directive line into (directive, value)
    fn parse_directive<'a>(&self, line: &'a str) -> Option<(&'a str, &'a str)> {
        let colon_pos = line.find(':')?;
        let directive = line[..colon_pos].trim();
        let value = line[colon_pos + 1..].trim();
        
        if directive.is_empty() {
            return None;
        }

        Some((directive, value))
    }

    /// Normalize a pattern for matching
    fn normalize_pattern(&self, pattern: &str) -> String {
        let pattern = pattern.trim();
        
        // Handle empty pattern
        if pattern.is_empty() {
            return String::new();
        }

        // Ensure pattern starts with /
        if !pattern.starts_with('/') && !pattern.starts_with('*') {
            format!("/{}", pattern)
        } else {
            pattern.to_string()
        }
    }
}

/// Utility functions for percent-encoding handling
pub mod encoding {
    /// Decode percent-encoded characters for matching
    /// Only decodes unreserved characters per RFC 3986
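    ///
    /// A short sketch of the behaviour (illustrative; see also the unit tests below):
    ///
    /// ```ignore
    /// // "%2D" decodes to the unreserved '-', while "%2F" ('/') is reserved and kept encoded.
    /// assert_eq!(normalize_path_for_matching("/a%2Db%2Fc"), "/a-b%2Fc");
    /// ```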
    pub fn normalize_path_for_matching(path: &str) -> String {
        let mut result = String::with_capacity(path.len());
        let mut chars = path.chars().peekable();

        while let Some(c) = chars.next() {
            if c == '%' {
                // Try to decode
                let hex: String = chars.by_ref().take(2).collect();
                if hex.len() == 2 {
                    if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                        let decoded = byte as char;
                        // Only decode unreserved characters
                        if is_unreserved(decoded) {
                            result.push(decoded);
                            continue;
                        }
                    }
                }
                // Keep as-is if can't decode
                result.push('%');
                result.push_str(&hex);
            } else {
                result.push(c);
            }
        }

        result
    }

    /// Check if a character is unreserved per RFC 3986
    fn is_unreserved(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '-' || c == '.' || c == '_' || c == '~'
    }

    /// Normalize both pattern and path for comparison
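    ///
    /// A short sketch of the intended behaviour (illustrative):
    ///
    /// ```ignore
    /// // "%7e" decodes to the unreserved '~'; the reserved "%2f" stays encoded
    /// // but is uppercased to "%2F" so equivalent escapes compare equal.
    /// assert_eq!(normalize_for_comparison("/x%7e/%2fy"), "/x~/%2Fy");
    /// ```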
    pub fn normalize_for_comparison(s: &str) -> String {
        // First decode unreserved characters
        let decoded = normalize_path_for_matching(s);
        // Uppercase remaining percent-encoding for consistency
        uppercase_percent_encoding(&decoded)
    }

    /// Uppercase percent-encoding hex digits
    fn uppercase_percent_encoding(s: &str) -> String {
        let mut result = String::with_capacity(s.len());
        let mut chars = s.chars().peekable();

        while let Some(c) = chars.next() {
            if c == '%' {
                result.push('%');
                // Uppercase next two characters
                for _ in 0..2 {
                    if let Some(hex_char) = chars.next() {
                        result.push(hex_char.to_ascii_uppercase());
                    }
                }
            } else {
                result.push(c);
            }
        }

        result
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_simple() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
Crawl-delay: 2
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].user_agents, vec!["*"]);
        assert_eq!(policy.groups[0].rules.len(), 2);
        assert_eq!(policy.groups[0].crawl_delay, Some(2.0));
    }

    #[test]
    fn test_parse_multiple_groups() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: Googlebot
User-agent: Bingbot
Disallow: /search

User-agent: *
Disallow: /admin
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.groups.len(), 2);
        assert_eq!(policy.groups[0].user_agents, vec!["Googlebot", "Bingbot"]);
        assert_eq!(policy.groups[1].user_agents, vec!["*"]);
    }

    #[test]
    fn test_parse_sitemaps() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow:

Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap2.xml
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.sitemaps.len(), 2);
        assert_eq!(policy.sitemaps[0], "https://example.com/sitemap.xml");
    }

    #[test]
    fn test_parse_comments() {
        let parser = RobotsParser::new();
        let content = r#"
# This is a comment
User-agent: * # inline comment
Disallow: /private # another comment
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].rules.len(), 1);
    }

    #[test]
    fn test_parse_empty_disallow() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow:
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        // Empty disallow should not create a rule
        assert_eq!(policy.groups[0].rules.len(), 0);
    }

    #[test]
    fn test_normalize_pattern() {
        let parser = RobotsParser::new();
        assert_eq!(parser.normalize_pattern("/path"), "/path");
        assert_eq!(parser.normalize_pattern("path"), "/path");
        assert_eq!(parser.normalize_pattern("*"), "*");
        assert_eq!(parser.normalize_pattern(""), "");
    }

    #[test]
    fn test_encoding_normalize() {
        use encoding::normalize_path_for_matching;
        
        // Unreserved characters should be decoded
        assert_eq!(normalize_path_for_matching("/path%2Dtest"), "/path-test");
        
        // Reserved characters should stay encoded
        assert_eq!(normalize_path_for_matching("/path%2Ftest"), "/path%2Ftest");
    }

    #[test]
    fn test_bom_stripping() {
        let parser = RobotsParser::new();
        // UTF-8 BOM followed by valid content
        let content = "\u{FEFF}User-agent: *\nDisallow: /private";
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.groups.len(), 1);
        assert_eq!(policy.groups[0].user_agents, vec!["*"]);
    }

    #[test]
    fn test_request_rate_parsing() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Disallow: /private
Request-rate: 1/10
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.groups.len(), 1);
        let rate = policy.groups[0].request_rate.unwrap();
        assert_eq!(rate.requests, 1);
        assert_eq!(rate.seconds, 10);
        assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
    }

    #[test]
    fn test_crawl_delay_float() {
        let parser = RobotsParser::new();
        let content = r#"
User-agent: *
Crawl-delay: 0.5
"#;
        let policy = parser.parse(content, Duration::from_secs(3600));
        
        assert_eq!(policy.groups[0].crawl_delay, Some(0.5));
    }
}