halldyll-robots 0.1.0

//! Types - Core types for robots.txt handling (RFC 9309)

use serde::{Deserialize, Serialize};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use url::Url;

/// Cache key for robots.txt (scheme + authority)
///
/// The derived `Eq`/`Hash` impls compare both fields, so two keys are equal
/// only when scheme and authority are both equal. `from_url` stores both
/// fields lowercased.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct RobotsCacheKey {
    /// URL scheme (http/https)
    pub scheme: String,
    /// Authority (host + optional port)
    pub authority: String,
}

/// Request-rate directive (non-standard but common)
/// Format: "requests/seconds" e.g., "1/10" means 1 request per 10 seconds
///
/// Both fields are stored as given; no validation is performed here, so
/// `requests` may be zero.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct RequestRate {
    /// Number of requests allowed
    pub requests: u32,
    /// Time period in seconds
    pub seconds: u32,
}

impl RequestRate {
    /// Create a new request rate of `requests` requests per `seconds` seconds
    pub fn new(requests: u32, seconds: u32) -> Self {
        Self { requests, seconds }
    }

    /// Get the minimum delay between requests in seconds
    ///
    /// Returns `f64::MAX` when `requests` is zero, because such a rate never
    /// permits another request.
    pub fn delay_seconds(&self) -> f64 {
        if self.requests == 0 {
            f64::MAX
        } else {
            f64::from(self.seconds) / f64::from(self.requests)
        }
    }

    /// Get the delay as Duration
    ///
    /// A rate of zero requests saturates to `Duration::MAX` instead of
    /// panicking.
    pub fn delay(&self) -> Duration {
        if self.requests == 0 {
            // `delay_seconds()` is `f64::MAX` here, which does not fit in a
            // `Duration`; `Duration::from_secs_f64` would panic on it.
            return Duration::MAX;
        }
        Duration::from_secs_f64(self.delay_seconds())
    }
}

impl RobotsCacheKey {
    /// Build the cache key for the origin that `url` belongs to
    ///
    /// Returns `None` when the URL has no host. The scheme and the authority
    /// (host, plus `:port` when `url.port()` reports one) are stored
    /// lowercased.
    pub fn from_url(url: &Url) -> Option<Self> {
        let host = url.host_str()?;
        let raw_authority = if let Some(port) = url.port() {
            format!("{}:{}", host, port)
        } else {
            String::from(host)
        };
        let scheme = url.scheme().to_lowercase();
        let authority = raw_authority.to_lowercase();
        Some(Self { scheme, authority })
    }

    /// Build the location of the robots.txt file for this origin
    pub fn robots_url(&self) -> String {
        let mut location = String::new();
        location.push_str(&self.scheme);
        location.push_str("://");
        location.push_str(&self.authority);
        location.push_str("/robots.txt");
        location
    }
}

/// Fetch status distinguishing Unavailable vs Unreachable (RFC 9309)
///
/// `allows_all` is true only for `Unavailable`; `denies_all` is true for
/// `Unreachable` and `Protected`. `Success` and `NotModified` are neither.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FetchStatus {
    /// Successfully fetched and parsed
    Success,
    /// Not modified (304) - use cached version
    NotModified,
    /// Unavailable (4xx) - crawler can access everything
    Unavailable {
        /// HTTP status code
        status_code: u16,
    },
    /// Unreachable (5xx, network error) - assume DISALLOW all
    Unreachable {
        /// Error description
        reason: String,
    },
    /// Protected (401/403) - safe mode treats as deny
    Protected {
        /// HTTP status code
        status_code: u16,
    },
}

impl FetchStatus {
    /// Whether the crawler may fetch every path (RFC 9309: Unavailable)
    pub fn allows_all(&self) -> bool {
        match self {
            FetchStatus::Unavailable { .. } => true,
            FetchStatus::Success
            | FetchStatus::NotModified
            | FetchStatus::Unreachable { .. }
            | FetchStatus::Protected { .. } => false,
        }
    }

    /// Whether the crawler must stay away from every path
    ///
    /// True for both `Unreachable` (RFC 9309) and `Protected`.
    pub fn denies_all(&self) -> bool {
        match self {
            FetchStatus::Unreachable { .. } | FetchStatus::Protected { .. } => true,
            FetchStatus::Success
            | FetchStatus::NotModified
            | FetchStatus::Unavailable { .. } => false,
        }
    }

    /// Whether the server answered 304 Not Modified
    pub fn is_not_modified(&self) -> bool {
        if let FetchStatus::NotModified = self {
            true
        } else {
            false
        }
    }
}

/// A single rule (Allow or Disallow)
///
/// Only `kind` and `pattern` are serialized; `compiled` is a derived cache.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
    /// Rule kind
    pub kind: RuleKind,
    /// Original pattern from robots.txt
    pub pattern: String,
    /// Compiled pattern for matching (with wildcards expanded)
    ///
    /// Skipped by serde, so it is `None` on a deserialized rule;
    /// `Rule::matches` then compiles `pattern` on demand.
    #[serde(skip)]
    pub compiled: Option<CompiledPattern>,
}

impl Rule {
    /// Build a rule and eagerly compile its pattern
    pub fn new(kind: RuleKind, pattern: String) -> Self {
        let compiled = Some(CompiledPattern::compile(&pattern));
        Self {
            kind,
            pattern,
            compiled,
        }
    }

    /// Test `path` against this rule's pattern
    ///
    /// Uses the cached compiled pattern when present; otherwise the pattern
    /// is compiled for this call only (the cache is not filled in).
    pub fn matches(&self, path: &str) -> bool {
        if let Some(ready) = self.compiled.as_ref() {
            return ready.matches(path);
        }
        let on_demand = CompiledPattern::compile(&self.pattern);
        on_demand.matches(path)
    }

    /// Get the specificity of this rule's pattern
    ///
    /// This is the number of characters in the pattern, not counting `*`
    /// wildcards.
    pub fn specificity(&self) -> usize {
        let mut significant = 0;
        for ch in self.pattern.chars() {
            if ch != '*' {
                significant += 1;
            }
        }
        significant
    }
}

/// Rule kind
///
/// Tags a `Rule` as permitting or forbidding the paths its pattern matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RuleKind {
    /// Allow crawling
    Allow,
    /// Disallow crawling
    Disallow,
}

/// Compiled pattern for efficient matching
///
/// Produced by `CompiledPattern::compile`: the pattern text is split at `*`
/// into literal and wildcard segments, and a trailing `$` is recorded in
/// `anchored_end` instead of being kept as text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompiledPattern {
    /// Pattern segments (between wildcards)
    segments: Vec<PatternSegment>,
    /// Whether pattern ends with $
    anchored_end: bool,
}

/// A segment of the pattern
///
/// `compile` never emits an empty literal and collapses runs of `*` into a
/// single `Wildcard`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum PatternSegment {
    /// Literal text to match
    Literal(String),
    /// Wildcard (matches 0+ characters)
    Wildcard,
}

impl CompiledPattern {
    /// Compile a pattern string
    ///
    /// A trailing `$` anchors the pattern to the end of the path and is not
    /// kept as literal text (a `$` anywhere else is an ordinary character).
    /// Each `*` becomes a wildcard segment; consecutive `*` collapse into one.
    pub fn compile(pattern: &str) -> Self {
        let (body, anchored_end) = match pattern.strip_suffix('$') {
            Some(body) => (body, true),
            None => (pattern, false),
        };

        let mut segments = Vec::new();
        let mut literal = String::new();

        for c in body.chars() {
            if c == '*' {
                if !literal.is_empty() {
                    // Move the accumulated text out, leaving an empty buffer.
                    segments.push(PatternSegment::Literal(std::mem::take(&mut literal)));
                }
                // Collapse consecutive wildcards
                if !matches!(segments.last(), Some(PatternSegment::Wildcard)) {
                    segments.push(PatternSegment::Wildcard);
                }
            } else {
                literal.push(c);
            }
        }

        if !literal.is_empty() {
            segments.push(PatternSegment::Literal(literal));
        }

        Self {
            segments,
            anchored_end,
        }
    }

    /// Check if this pattern matches a path
    ///
    /// The pattern must match starting at the beginning of `path`. Without a
    /// `$` anchor anything may follow the matched part; with it, the pattern
    /// must consume the whole path. A `*` matches any run of characters,
    /// including none.
    ///
    /// Matching is a single left-to-right pass: a literal that follows a
    /// wildcard is placed at its leftmost occurrence, which keeps the longest
    /// possible remainder for later segments and is therefore sufficient when
    /// `*` is the only wildcard. The work is bounded by the number of
    /// segments times the path length, so a pattern with many wildcards
    /// cannot trigger exponential backtracking. All slicing happens at
    /// offsets returned by string searches, so multi-byte UTF-8 paths are
    /// safe.
    pub fn matches(&self, path: &str) -> bool {
        let last_idx = self.segments.len().saturating_sub(1);
        let mut rest = path;
        // True while the most recently consumed segment is a wildcard.
        let mut after_wildcard = false;

        for (idx, segment) in self.segments.iter().enumerate() {
            match segment {
                PatternSegment::Wildcard => after_wildcard = true,
                PatternSegment::Literal(lit) => {
                    let lit = lit.as_str();
                    if !after_wildcard {
                        // No wildcard to absorb a gap: must match right here.
                        match rest.strip_prefix(lit) {
                            Some(tail) => rest = tail,
                            None => return false,
                        }
                    } else if self.anchored_end && idx == last_idx {
                        // Final literal of an anchored pattern: the wildcard
                        // absorbs everything before it, so it only has to sit
                        // at the very end of what is left.
                        return rest.ends_with(lit);
                    } else {
                        match rest.find(lit) {
                            Some(at) => rest = &rest[at + lit.len()..],
                            None => return false,
                        }
                    }
                    after_wildcard = false;
                }
            }
        }

        // Unanchored patterns accept any remainder; anchored ones need either
        // a trailing wildcard to swallow it or nothing left at all.
        !self.anchored_end || after_wildcard || rest.is_empty()
    }
}

/// A group of rules for specific user-agents
///
/// The derived `Default` is an empty group: no user-agents, no rules and no
/// delay directives.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Group {
    /// User-agent tokens this group applies to
    pub user_agents: Vec<String>,
    /// Rules in this group (in order)
    pub rules: Vec<Rule>,
    /// Crawl-delay directive (optional, in seconds)
    pub crawl_delay: Option<f64>,
    /// Request-rate directive (optional, non-standard)
    pub request_rate: Option<RequestRate>,
}

impl Group {
    /// Check if this group matches a user-agent token
    ///
    /// Matching is case-insensitive. A group entry matches when it is `*` or
    /// when it is a prefix of `token` (so the entry "Googlebot" matches the
    /// token "Googlebot-Image"). Empty entries are ignored: an empty string
    /// is a prefix of every token and would otherwise turn the group into an
    /// unintended catch-all.
    pub fn matches_user_agent(&self, token: &str) -> bool {
        let token_lower = token.to_lowercase();
        self.user_agents.iter().any(|ua| {
            if ua.is_empty() {
                return false;
            }
            // Prefix match: "Googlebot" matches "Googlebot-Image"
            ua == "*" || token_lower.starts_with(&ua.to_lowercase())
        })
    }

    /// Check if this is the wildcard group
    ///
    /// True when any entry is exactly `*`.
    pub fn is_wildcard(&self) -> bool {
        self.user_agents.iter().any(|ua| ua == "*")
    }
}

/// Complete robots.txt policy
///
/// Bundles the outcome of one robots.txt fetch with its cache bookkeeping.
/// Timestamps are milliseconds since the Unix epoch.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsPolicy {
    /// When the robots.txt was fetched (Unix timestamp millis)
    pub fetched_at_ms: u64,
    /// When this policy expires (Unix timestamp millis)
    pub expires_at_ms: u64,
    /// Fetch status
    pub fetch_status: FetchStatus,
    /// Parsed groups
    pub groups: Vec<Group>,
    /// Sitemap URLs found
    pub sitemaps: Vec<String>,
    /// Original robots.txt size in bytes
    pub content_size: usize,
    /// ETag header for conditional GET
    // Left out of the serialized form entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub etag: Option<String>,
    /// Last-Modified header for conditional GET
    // Left out of the serialized form entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_modified: Option<String>,
}

impl RobotsPolicy {
    /// Get current timestamp in milliseconds
    fn now_millis() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64
    }

    /// Create a policy for unavailable robots.txt (allow all)
    pub fn unavailable(status_code: u16, ttl: Duration) -> Self {
        let now = Self::now_millis();
        Self {
            fetched_at_ms: now,
            expires_at_ms: now + ttl.as_millis() as u64,
            fetch_status: FetchStatus::Unavailable { status_code },
            groups: Vec::new(),
            sitemaps: Vec::new(),
            content_size: 0,
            etag: None,
            last_modified: None,
        }
    }

    /// Create a policy for unreachable robots.txt (deny all)
    pub fn unreachable(reason: String, ttl: Duration) -> Self {
        let now = Self::now_millis();
        Self {
            fetched_at_ms: now,
            expires_at_ms: now + ttl.as_millis() as u64,
            fetch_status: FetchStatus::Unreachable { reason },
            groups: Vec::new(),
            sitemaps: Vec::new(),
            content_size: 0,
            etag: None,
            last_modified: None,
        }
    }

    /// Create a policy for protected robots.txt (deny all in safe mode)
    pub fn protected(status_code: u16, ttl: Duration) -> Self {
        let now = Self::now_millis();
        Self {
            fetched_at_ms: now,
            expires_at_ms: now + ttl.as_millis() as u64,
            fetch_status: FetchStatus::Protected { status_code },
            groups: Vec::new(),
            sitemaps: Vec::new(),
            content_size: 0,
            etag: None,
            last_modified: None,
        }
    }

    /// Create a policy indicating 304 Not Modified
    /// The caller should extend the existing cached policy's TTL
    pub fn not_modified(ttl: Duration) -> Self {
        let now = Self::now_millis();
        Self {
            fetched_at_ms: now,
            expires_at_ms: now + ttl.as_millis() as u64,
            fetch_status: FetchStatus::NotModified,
            groups: Vec::new(),
            sitemaps: Vec::new(),
            content_size: 0,
            etag: None,
            last_modified: None,
        }
    }

    /// Extend the expiration time by the given TTL
    pub fn extend_ttl(&mut self, ttl: Duration) {
        self.expires_at_ms = Self::now_millis() + ttl.as_millis() as u64;
    }

    /// Check if this policy has expired
    pub fn is_expired(&self) -> bool {
        Self::now_millis() > self.expires_at_ms
    }

    /// Get time until expiration
    pub fn ttl(&self) -> Duration {
        let now = Self::now_millis();
        if self.expires_at_ms > now {
            Duration::from_millis(self.expires_at_ms - now)
        } else {
            Duration::ZERO
        }
    }
}

/// Decision returned by is_allowed
///
/// Pairs the allow/deny verdict with the reason for it and, when a specific
/// rule decided the outcome, a copy of that rule.
#[derive(Debug, Clone)]
pub struct Decision {
    /// Whether access is allowed
    pub allowed: bool,
    /// The rule that matched (if any)
    pub matched_rule: Option<Rule>,
    /// Reason for the decision
    pub reason: DecisionReason,
}

impl Decision {
    /// Build a decision that permits access, with no matched rule attached
    pub fn allow(reason: DecisionReason) -> Self {
        let allowed = true;
        Self {
            allowed,
            reason,
            matched_rule: None,
        }
    }

    /// Build a decision that refuses access, with no matched rule attached
    pub fn deny(reason: DecisionReason) -> Self {
        let allowed = false;
        Self {
            allowed,
            reason,
            matched_rule: None,
        }
    }

    /// Attach the rule that produced this decision
    ///
    /// Consumes the decision and returns it with `matched_rule` replaced;
    /// `allowed` and `reason` are left as they were.
    pub fn with_rule(self, rule: Rule) -> Self {
        Self {
            matched_rule: Some(rule),
            ..self
        }
    }
}

/// Reason for the decision
///
/// Carried in `Decision::reason` to explain why access was allowed or
/// denied.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecisionReason {
    /// Robots.txt unavailable (4xx), allow all
    RobotsUnavailable,
    /// Robots.txt unreachable (5xx/network), deny all
    RobotsUnreachable,
    /// Robots.txt protected (401/403 in safe mode), deny all
    RobotsProtected,
    /// No matching rule found, default allow
    NoMatchingRule,
    /// Matched an allow rule
    AllowRuleMatched,
    /// Matched a disallow rule
    DisallowRuleMatched,
    /// Path is /robots.txt (always allowed)
    RobotsTxtPath,
    /// Robots.txt respect is disabled
    RobotsDisabled,
}

/// Effective rules for a specific user-agent (after merging groups)
///
/// `effective_delay` derives the delay between requests from `crawl_delay`
/// and `request_rate`.
#[derive(Debug, Clone)]
pub struct EffectiveRules {
    /// Merged rules from all matching groups
    pub rules: Vec<Rule>,
    /// Crawl delay (from first group that specifies it)
    pub crawl_delay: Option<f64>,
    /// Request rate (from first group that specifies it)
    pub request_rate: Option<RequestRate>,
    /// Which user-agents were matched
    pub matched_agents: Vec<String>,
}

impl EffectiveRules {
    /// Create empty effective rules
    pub fn empty() -> Self {
        Self {
            rules: Vec::new(),
            crawl_delay: None,
            request_rate: None,
            matched_agents: Vec::new(),
        }
    }

    /// Get the effective delay between requests
    /// Uses crawl_delay if set, otherwise request_rate, otherwise None
    ///
    /// These values originate from robots.txt, so they are validated rather
    /// than trusted: a negative or NaN `crawl_delay` is treated as unset (the
    /// request rate is consulted next), and values too large for a
    /// `Duration` — including infinity and a request rate of zero requests —
    /// saturate to `Duration::MAX`. This never panics.
    pub fn effective_delay(&self) -> Option<Duration> {
        self.crawl_delay
            .and_then(Self::delay_from_secs)
            .or_else(|| {
                self.request_rate
                    .and_then(|rate| Self::delay_from_secs(rate.delay_seconds()))
            })
    }

    /// Convert a seconds value to a `Duration` without panicking
    ///
    /// Returns `None` for NaN and negative input and `Duration::MAX` for
    /// anything at or beyond the representable range.
    fn delay_from_secs(secs: f64) -> Option<Duration> {
        if secs.is_nan() || secs < 0.0 {
            return None;
        }
        // `Duration::from_secs_f64` panics on out-of-range input, so clamp
        // before calling it; this branch also covers positive infinity.
        if secs >= Duration::MAX.as_secs_f64() {
            return Some(Duration::MAX);
        }
        Some(Duration::from_secs_f64(secs))
    }
}

/// Configuration for robots.txt handling
///
/// `RobotsConfig::default()` supplies a ready-to-use set of values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsConfig {
    /// User-agent token to identify as
    pub user_agent: String,
    /// Cache TTL in seconds (max 24 hours per RFC)
    pub cache_ttl_secs: u64,
    /// Whether to respect robots.txt
    pub respect_robots: bool,
    /// Default crawl delay in milliseconds
    pub default_crawl_delay_ms: u64,
    /// Maximum robots.txt size in bytes (min 500 KiB per RFC)
    pub max_robots_size: usize,
    /// Maximum redirects to follow
    pub max_redirects: u32,
    /// Fetch timeout in seconds
    pub fetch_timeout_secs: u64,
    /// Use safe mode (treat 401/403 as deny)
    pub safe_mode: bool,
}

impl Default for RobotsConfig {
    /// Defaults: respect robots.txt, one-hour cache, safe mode enabled
    fn default() -> Self {
        const ONE_HOUR_SECS: u64 = 3600;
        // RFC minimum is 500 KiB; 512 KiB leaves a little headroom.
        const SIZE_LIMIT_BYTES: usize = 512 * 1024;

        let user_agent = String::from("HalldyllBot/1.0");
        Self {
            user_agent,
            cache_ttl_secs: ONE_HOUR_SECS,
            respect_robots: true,
            default_crawl_delay_ms: 100,
            max_robots_size: SIZE_LIMIT_BYTES,
            max_redirects: 5,
            fetch_timeout_secs: 10,
            // Safer default for production
            safe_mode: true,
        }
    }
}

// Unit tests for the types in this module: cache keys, pattern compilation
// and matching, user-agent matching, delay computation and policy TTLs.
#[cfg(test)]
mod tests {
    use super::*;

    // An explicit port is kept in the authority and in the robots.txt URL.
    #[test]
    fn test_cache_key_from_url() {
        let url = Url::parse("https://example.com:8080/path").unwrap();
        let key = RobotsCacheKey::from_url(&url).unwrap();
        assert_eq!(key.scheme, "https");
        assert_eq!(key.authority, "example.com:8080");
        assert_eq!(key.robots_url(), "https://example.com:8080/robots.txt");
    }

    // A pattern without wildcards is a plain prefix match.
    #[test]
    fn test_compiled_pattern_literal() {
        let pattern = CompiledPattern::compile("/admin/");
        assert!(pattern.matches("/admin/"));
        assert!(pattern.matches("/admin/users"));
        assert!(!pattern.matches("/administrator"));
    }

    // `*` matches any run of characters, including an empty one, but the
    // literal text on both sides must still be present.
    #[test]
    fn test_compiled_pattern_wildcard() {
        let pattern = CompiledPattern::compile("/api/*/users");
        assert!(pattern.matches("/api/v1/users"));
        assert!(pattern.matches("/api/v2/users"));
        assert!(pattern.matches("/api//users"));
        assert!(!pattern.matches("/api/users"));
    }

    // A trailing `$` requires the path to end where the pattern ends.
    #[test]
    fn test_compiled_pattern_anchored() {
        let pattern = CompiledPattern::compile("/exact$");
        assert!(pattern.matches("/exact"));
        assert!(!pattern.matches("/exact/"));
        assert!(!pattern.matches("/exact/more"));
    }

    // Wildcard and end anchor combined: any path ending in ".php".
    #[test]
    fn test_compiled_pattern_complex() {
        let pattern = CompiledPattern::compile("/*.php$");
        assert!(pattern.matches("/index.php"));
        assert!(pattern.matches("/admin/login.php"));
        assert!(!pattern.matches("/index.php5"));
        assert!(!pattern.matches("/index.php/extra"));
    }

    // A longer pattern is more specific than a shorter one.
    #[test]
    fn test_rule_specificity() {
        let rule1 = Rule::new(RuleKind::Disallow, "/admin".to_string());
        let rule2 = Rule::new(RuleKind::Allow, "/admin/public".to_string());
        assert!(rule2.specificity() > rule1.specificity());
    }

    #[test]
    fn test_group_matches_user_agent() {
        let group = Group {
            user_agents: vec!["Googlebot".to_string(), "Bingbot".to_string()],
            rules: vec![],
            crawl_delay: None,
            request_rate: None,
        };
        assert!(group.matches_user_agent("Googlebot"));
        assert!(group.matches_user_agent("googlebot")); // case insensitive
        assert!(group.matches_user_agent("Googlebot-Image")); // prefix match
        assert!(!group.matches_user_agent("Yandexbot"));
    }

    // The delay is seconds divided by requests.
    #[test]
    fn test_request_rate() {
        let rate = RequestRate::new(1, 10);
        assert_eq!(rate.requests, 1);
        assert_eq!(rate.seconds, 10);
        assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
        assert_eq!(rate.delay(), Duration::from_secs(10));

        let rate2 = RequestRate::new(2, 10);
        assert!((rate2.delay_seconds() - 5.0).abs() < 0.001);
    }

    #[test]
    fn test_effective_rules_delay() {
        let mut rules = EffectiveRules::empty();
        assert!(rules.effective_delay().is_none());

        rules.crawl_delay = Some(2.5);
        assert_eq!(rules.effective_delay(), Some(Duration::from_secs_f64(2.5)));

        // crawl_delay takes precedence over request_rate
        rules.request_rate = Some(RequestRate::new(1, 10));
        assert_eq!(rules.effective_delay(), Some(Duration::from_secs_f64(2.5)));

        // Without crawl_delay, request_rate is used
        rules.crawl_delay = None;
        assert_eq!(rules.effective_delay(), Some(Duration::from_secs(10)));
    }

    // NotModified is neither an allow-all nor a deny-all status.
    #[test]
    fn test_fetch_status_not_modified() {
        let status = FetchStatus::NotModified;
        assert!(status.is_not_modified());
        assert!(!status.allows_all());
        assert!(!status.denies_all());
    }

    #[test]
    fn test_policy_extend_ttl() {
        let mut policy = RobotsPolicy::unavailable(404, Duration::from_secs(60));
        let original_expires = policy.expires_at_ms;
        
        // Let the millisecond clock advance before extending.
        std::thread::sleep(Duration::from_millis(10));
        policy.extend_ttl(Duration::from_secs(3600));
        
        assert!(policy.expires_at_ms > original_expires);
    }
}