// halldyll-robots 0.1.0
//
// robots.txt parser and compliance layer for the halldyll scraper.
//! Main robots.txt checker with caching and fetching.
//!
//! The `RobotsChecker` is the primary entry point for checking URLs against
//! robots.txt policies. It handles fetching, caching, parsing, and matching.

use std::collections::HashSet;
use std::time::Duration;

use tracing::debug;
use url::Url;

use crate::cache::{CacheStatsSnapshot, RobotsCache};
use crate::fetcher::{FetchStatsSnapshot, RobotsFetcher};
use crate::matcher::RobotsMatcher;
use crate::types::{
    Decision, DecisionReason, EffectiveRules, FetchStatus, RobotsCacheKey,
    RobotsConfig, RobotsPolicy, Rule,
};

// ============================================================================
// RobotsChecker
// ============================================================================

/// Main robots.txt checker with caching and fetching.
///
/// This is the primary entry point for checking URLs against robots.txt.
/// It combines fetching, caching, parsing, and matching into a single API.
///
/// # Example
///
/// ```rust,no_run
/// use halldyll_robots::{RobotsChecker, RobotsConfig};
/// use url::Url;
///
/// #[tokio::main]
/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
///     let config = RobotsConfig::default();
///     let checker = RobotsChecker::new(config);
///     
///     let url = Url::parse("https://example.com/some/path")?;
///     let decision = checker.is_allowed(&url).await;
///     
///     if decision.allowed {
///         println!("URL is allowed");
///     } else {
///         println!("URL is blocked: {:?}", decision.reason);
///     }
///     
///     Ok(())
/// }
/// ```
pub struct RobotsChecker {
    // Runtime configuration (user-agent, TTLs, respect_robots flag, etc.).
    config: RobotsConfig,
    // Performs HTTP fetches of robots.txt, including conditional GETs.
    fetcher: RobotsFetcher,
    // TTL-based policy cache keyed by origin; optionally disk-persisted.
    cache: RobotsCache,
    // Matches URL paths against parsed rules for the configured user-agent.
    matcher: RobotsMatcher,
}

impl RobotsChecker {
    /// Create a new robots.txt checker with the given configuration.
    pub fn new(config: RobotsConfig) -> Self {
        let matcher = RobotsMatcher::new(&config.user_agent);
        let fetcher = RobotsFetcher::new(config.clone());
        let cache = RobotsCache::new(Duration::from_secs(config.cache_ttl_secs));

        Self {
            config,
            fetcher,
            cache,
            matcher,
        }
    }

    /// Create a checker with file-based cache persistence.
    ///
    /// The cache will be saved to and loaded from the specified directory.
    pub fn with_persistence(config: RobotsConfig, persist_dir: &str) -> Self {
        let matcher = RobotsMatcher::new(&config.user_agent);
        let fetcher = RobotsFetcher::new(config.clone());
        let cache = RobotsCache::with_persistence(
            Duration::from_secs(config.cache_ttl_secs),
            persist_dir,
        );

        Self {
            config,
            fetcher,
            cache,
            matcher,
        }
    }

    // ------------------------------------------------------------------------
    // Core API
    // ------------------------------------------------------------------------

    /// Check if a URL is allowed by robots.txt.
    ///
    /// This is the main method for checking crawl permissions. It will:
    /// 1. Extract the origin (scheme + authority) from the URL
    /// 2. Check the cache for an existing policy
    /// 3. Fetch robots.txt if not cached
    /// 4. Match the URL path against the rules
    ///
    /// # Returns
    ///
    /// A `Decision` with `allowed` status and the reason for the decision.
    pub async fn is_allowed(&self, url: &Url) -> Decision {
        // Skip if robots.txt is disabled
        if !self.config.respect_robots {
            return Decision::allow(DecisionReason::RobotsDisabled);
        }

        // Get the policy (from cache or fetch)
        let policy = self.get_policy(url).await;

        // Build full path with query string
        let full_path = Self::build_full_path(url);

        self.matcher.is_allowed(&policy, &full_path)
    }

    /// Check if a path is allowed for a given origin.
    ///
    /// Use this when you already have the origin URL and want to check
    /// multiple paths without re-fetching robots.txt.
    pub async fn is_path_allowed(&self, origin: &Url, path: &str) -> Decision {
        if !self.config.respect_robots {
            return Decision::allow(DecisionReason::RobotsDisabled);
        }

        let policy = self.get_policy(origin).await;
        self.matcher.is_allowed(&policy, path)
    }

    /// Get the robots.txt policy for a URL (cached or fetched).
    ///
    /// This fetches and parses the robots.txt if not already cached,
    /// and returns the parsed policy. Uses conditional GET (If-None-Match,
    /// If-Modified-Since) for cache revalidation to save bandwidth.
    pub async fn get_policy(&self, url: &Url) -> RobotsPolicy {
        let key = match RobotsCacheKey::from_url(url) {
            Some(k) => k,
            None => {
                return RobotsPolicy::unreachable(
                    "Invalid URL".to_string(),
                    Duration::from_secs(60),
                );
            }
        };

        // Check cache first
        if let Some(cached_policy) = self.cache.get(&key) {
            // If not expired, return cached
            if !cached_policy.is_expired() {
                return cached_policy;
            }

            // Expired: try conditional GET if we have ETag or Last-Modified
            if cached_policy.etag.is_some() || cached_policy.last_modified.is_some() {
                let new_policy = self.fetcher.fetch_for_key_conditional(
                    &key,
                    cached_policy.etag.as_deref(),
                    cached_policy.last_modified.as_deref(),
                ).await;

                // If 304 Not Modified, extend the cached policy's TTL
                if new_policy.fetch_status.is_not_modified() {
                    let mut refreshed = cached_policy.clone();
                    refreshed.extend_ttl(new_policy.ttl());
                    self.cache.insert(key, refreshed.clone());
                    return refreshed;
                }

                // Otherwise use the new policy
                self.cache.insert(key, new_policy.clone());
                return new_policy;
            }
        }

        // No cache or no conditional headers: full fetch
        let policy = self.fetcher.fetch_for_key(&key).await;
        self.cache.insert(key, policy.clone());
        policy
    }

    // ------------------------------------------------------------------------
    // Additional API
    // ------------------------------------------------------------------------

    /// Get crawl delay for a URL.
    ///
    /// Returns the effective delay between requests, using crawl-delay if set,
    /// otherwise request-rate, otherwise `None`.
    pub async fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
        let policy = self.get_policy(url).await;
        let effective = self.matcher.effective_rules(&policy);
        effective.effective_delay()
    }

    /// Get request rate for a URL.
    ///
    /// Returns the request-rate directive if specified in robots.txt.
    pub async fn get_request_rate(&self, url: &Url) -> Option<crate::types::RequestRate> {
        let policy = self.get_policy(url).await;
        let effective = self.matcher.effective_rules(&policy);
        effective.request_rate
    }

    /// Get sitemaps listed in robots.txt.
    ///
    /// Returns all Sitemap directives found in the robots.txt.
    pub async fn get_sitemaps(&self, url: &Url) -> Vec<String> {
        let policy = self.get_policy(url).await;
        policy.sitemaps
    }

    /// Get effective rules for a URL.
    ///
    /// Returns the merged rules that apply to this user-agent.
    pub async fn get_effective_rules(&self, url: &Url) -> EffectiveRules {
        let policy = self.get_policy(url).await;
        self.matcher.effective_rules(&policy)
    }

    /// Get detailed diagnostics for a URL check.
    ///
    /// Useful for debugging why a URL was allowed or blocked.
    pub async fn diagnose(&self, url: &Url) -> RobotsDiagnostics {
        let policy = self.get_policy(url).await;
        let effective = self.matcher.effective_rules(&policy);
        let full_path = Self::build_full_path(url);
        let decision = self.matcher.is_allowed(&policy, &full_path);

        RobotsDiagnostics {
            url: url.to_string(),
            robots_url: RobotsCacheKey::from_url(url)
                .map(|k| k.robots_url())
                .unwrap_or_default(),
            fetch_status: policy.fetch_status.clone(),
            groups_count: policy.groups.len(),
            matched_agents: effective.matched_agents,
            applicable_rules: effective.rules,
            decision,
        }
    }

    // ------------------------------------------------------------------------
    // Cache Management
    // ------------------------------------------------------------------------

    /// Get cache statistics.
    pub fn cache_stats(&self) -> CacheStatsSnapshot {
        self.cache.stats().snapshot()
    }

    /// Get fetch statistics.
    pub fn fetch_stats(&self) -> FetchStatsSnapshot {
        self.fetcher.stats().snapshot()
    }

    /// Clear all cached policies.
    pub fn clear_cache(&self) {
        self.cache.clear();
    }

    /// Evict expired cache entries.
    ///
    /// Returns the number of entries evicted.
    pub fn evict_expired(&self) -> usize {
        self.cache.evict_expired()
    }

    /// Save cache to disk (if persistence enabled).
    pub async fn save_cache(&self) -> std::io::Result<usize> {
        self.cache.save_to_disk().await
    }

    /// Load cache from disk (if persistence enabled).
    pub async fn load_cache(&self) -> std::io::Result<usize> {
        self.cache.load_from_disk().await
    }

    /// Get list of cached domains.
    pub fn cached_domains(&self) -> Vec<String> {
        self.cache.domains()
    }

    // ------------------------------------------------------------------------
    // Configuration
    // ------------------------------------------------------------------------

    /// Get the configuration.
    pub fn config(&self) -> &RobotsConfig {
        &self.config
    }

    // ------------------------------------------------------------------------
    // Bulk Operations
    // ------------------------------------------------------------------------

    /// Prefetch robots.txt for a list of URLs.
    ///
    /// This is useful for warming the cache before crawling.
    pub async fn prefetch(&self, urls: &[Url]) {
        let mut seen = HashSet::new();

        for url in urls {
            if let Some(key) = RobotsCacheKey::from_url(url) {
                if seen.insert(key.clone()) && self.cache.get(&key).is_none() {
                    debug!("Prefetching robots.txt for {}", key.authority);
                    let policy = self.fetcher.fetch_for_key(&key).await;
                    self.cache.insert(key, policy);
                }
            }
        }
    }

    // ------------------------------------------------------------------------
    // Helpers
    // ------------------------------------------------------------------------

    fn build_full_path(url: &Url) -> String {
        let path = url.path();
        match url.query() {
            Some(q) => format!("{}?{}", path, q),
            None => path.to_string(),
        }
    }
}

// ============================================================================
// RobotsDiagnostics
// ============================================================================

/// Diagnostics for debugging robots.txt decisions.
/// Diagnostics for debugging robots.txt decisions.
///
/// Produced by `RobotsChecker::diagnose`; captures every intermediate step
/// of a check so a blocked (or allowed) URL can be explained after the fact.
#[derive(Debug, Clone)]
pub struct RobotsDiagnostics {
    /// The URL being checked, as a string.
    pub url: String,
    /// The robots.txt URL derived from the origin (empty if the URL had
    /// no usable origin).
    pub robots_url: String,
    /// Fetch status of robots.txt (success, not-modified, error, ...).
    pub fetch_status: FetchStatus,
    /// Number of user-agent groups parsed from the policy.
    pub groups_count: usize,
    /// User-agent names from the policy that matched the configured agent.
    pub matched_agents: Vec<String>,
    /// The merged rules that applied to this user-agent.
    pub applicable_rules: Vec<Rule>,
    /// Final allow/deny decision, including the reason.
    pub decision: Decision,
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    /// Default config should be conservative: robots respected, safe mode on,
    /// and a reasonably large robots.txt size limit.
    #[test]
    fn test_config_default() {
        let cfg = RobotsConfig::default();
        assert!(cfg.respect_robots);
        assert!(cfg.safe_mode);
        assert!(cfg.max_robots_size >= 500 * 1024);
    }

    /// Constructing a checker from the default config must not panic.
    #[tokio::test]
    async fn test_checker_creation() {
        let _checker = RobotsChecker::new(RobotsConfig::default());
    }

    /// With `respect_robots` off, every URL is allowed with the
    /// `RobotsDisabled` reason and no fetch is needed.
    #[tokio::test]
    async fn test_disabled_robots() {
        let checker = RobotsChecker::new(RobotsConfig {
            respect_robots: false,
            ..Default::default()
        });

        let target = Url::parse("https://example.com/blocked").unwrap();
        let decision = checker.is_allowed(&target).await;

        assert!(decision.allowed);
        assert_eq!(decision.reason, DecisionReason::RobotsDisabled);
    }

    /// The matching path includes the query string when present.
    #[test]
    fn test_build_full_path() {
        let with_query = Url::parse("https://example.com/path?query=1").unwrap();
        let bare = Url::parse("https://example.com/path").unwrap();

        assert_eq!(RobotsChecker::build_full_path(&with_query), "/path?query=1");
        assert_eq!(RobotsChecker::build_full_path(&bare), "/path");
    }
}