halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
//! Robots - Parsing and respecting robots.txt (RFC 9309)

use std::collections::HashMap;
use std::sync::RwLock;
use std::time::{Duration, Instant};
use texting_robots::Robot;
use url::Url;

/// Robots.txt cache
pub struct RobotsCache {
    /// Cached entries
    entries: RwLock<HashMap<String, RobotsEntry>>,
    /// Cache TTL (seconds)
    ttl_secs: u64,
}

/// Cached robots.txt entry
struct RobotsEntry {
    /// Raw robots.txt content
    content: String,
    /// Caching timestamp
    cached_at: Instant,
    /// Crawl-delay in seconds
    crawl_delay: Option<f64>,
}

impl Default for RobotsCache {
    fn default() -> Self {
        Self::new(3600) // 1 hour by default
    }
}

impl RobotsCache {
    /// New cache with TTL
    pub fn new(ttl_secs: u64) -> Self {
        Self {
            entries: RwLock::new(HashMap::new()),
            ttl_secs,
        }
    }

    /// Cache key (scheme + host)
    fn cache_key(url: &Url) -> String {
        format!("{}://{}", url.scheme(), url.host_str().unwrap_or(""))
    }

    /// Retrieves an entry (if not expired)
    pub fn get(&self, url: &Url) -> Option<String> {
        let key = Self::cache_key(url);
        let entries = self.entries.read().unwrap();
        
        entries.get(&key).and_then(|entry| {
            if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
                Some(entry.content.clone())
            } else {
                None
            }
        })
    }

    /// Retrieves the cached crawl-delay (if not expired)
    pub fn get_crawl_delay(&self, url: &Url) -> Option<f64> {
        let key = Self::cache_key(url);
        let entries = self.entries.read().unwrap();
        
        entries.get(&key).and_then(|entry| {
            if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
                entry.crawl_delay
            } else {
                None
            }
        })
    }

    /// Caches an entry
    pub fn set(&self, url: &Url, content: String, crawl_delay: Option<f64>) {
        let key = Self::cache_key(url);
        let entry = RobotsEntry {
            content,
            cached_at: Instant::now(),
            crawl_delay,
        };
        
        self.entries.write().unwrap().insert(key, entry);
    }

    /// Removes an entry
    pub fn remove(&self, url: &Url) {
        let key = Self::cache_key(url);
        self.entries.write().unwrap().remove(&key);
    }

    /// Clears the cache
    pub fn clear(&self) {
        self.entries.write().unwrap().clear();
    }

    /// Cache size
    pub fn len(&self) -> usize {
        self.entries.read().unwrap().len()
    }

    /// Is cache empty?
    pub fn is_empty(&self) -> bool {
        self.entries.read().unwrap().is_empty()
    }
}
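
// A minimal usage sketch for `RobotsCache` (illustrative; the test module and
// names below are hypothetical, not part of the crate's API). Entries are keyed
// by scheme + host, so every URL on the same origin shares one cached body.
#[cfg(test)]
mod robots_cache_sketch {
    use super::*;

    #[test]
    fn round_trips_an_entry_per_origin() {
        let cache = RobotsCache::new(60);
        let url = Url::parse("https://example.com/some/page").unwrap();

        // Store the raw body together with an optional crawl-delay.
        cache.set(&url, "User-agent: *\nDisallow: /private".to_string(), Some(1.5));

        // A different path on the same scheme + host hits the same entry.
        let other = Url::parse("https://example.com/other").unwrap();
        assert!(cache.get(&other).is_some());
        assert_eq!(cache.get_crawl_delay(&other), Some(1.5));
        assert_eq!(cache.len(), 1);
    }
}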

/// Robots.txt checker
pub struct RobotsChecker {
    /// User-Agent to use
    user_agent: String,
    /// Robots.txt cache
    cache: RobotsCache,
    /// Respect robots.txt?
    enabled: bool,
}

impl RobotsChecker {
    /// New checker
    pub fn new(user_agent: &str, cache_ttl_secs: u64) -> Self {
        Self {
            user_agent: user_agent.to_string(),
            cache: RobotsCache::new(cache_ttl_secs),
            enabled: true,
        }
    }

    /// Enable/disable robots.txt respect
    pub fn set_enabled(&mut self, enabled: bool) {
        self.enabled = enabled;
    }

    /// Is robots.txt respected?
    pub fn is_enabled(&self) -> bool {
        self.enabled
    }

    /// URL of the robots.txt for a site
    pub fn robots_url(url: &Url) -> Option<Url> {
        let base = format!("{}://{}/robots.txt", url.scheme(), url.host_str()?);
        Url::parse(&base).ok()
    }

    /// Caches a robots.txt
    pub fn cache_robots(&self, url: &Url, content: &str) {
        // Extract crawl-delay
        let crawl_delay = self.parse_crawl_delay(content);
        self.cache.set(url, content.to_string(), crawl_delay);
    }

    /// Parses the crawl-delay from robots.txt, preferring the section that
    /// matches our User-Agent over the wildcard (`*`) section
    fn parse_crawl_delay(&self, content: &str) -> Option<f64> {
        let ua_lower = self.user_agent.to_lowercase();
        // Which user-agent group the current directives apply to
        let mut in_specific_section = false;
        let mut in_wildcard_section = false;
        let mut specific_delay: Option<f64> = None;
        let mut wildcard_delay: Option<f64> = None;

        for line in content.lines() {
            let line = line.trim();

            // Ignore comments and blank lines
            if line.starts_with('#') || line.is_empty() {
                continue;
            }

            let parts: Vec<&str> = line.splitn(2, ':').collect();
            if parts.len() != 2 {
                continue;
            }

            let directive = parts[0].trim().to_lowercase();
            let value = parts[1].trim();

            if directive == "user-agent" {
                let ua = value.to_lowercase();
                in_wildcard_section = ua == "*";
                in_specific_section =
                    !in_wildcard_section && (ua_lower.contains(&ua) || ua.contains(&ua_lower));
            } else if directive == "crawl-delay" {
                if let Ok(delay) = value.parse::<f64>() {
                    if in_specific_section {
                        specific_delay = Some(delay);
                    } else if in_wildcard_section {
                        wildcard_delay = Some(delay);
                    }
                }
            }
        }

        // A delay from a section matching our User-Agent wins over the wildcard one
        specific_delay.or(wildcard_delay)
    }

    /// Checks if a URL is allowed
    pub fn is_allowed(&self, url: &Url, robots_content: Option<&str>) -> bool {
        if !self.enabled {
            return true;
        }

        // Retrieve robots.txt content
        let content = match robots_content {
            Some(c) => c.to_string(),
            None => match self.cache.get(url) {
                Some(c) => c,
                None => return true, // No robots.txt = everything allowed
            },
        };

        // Parse with texting_robots
        let robot = match Robot::new(&self.user_agent, content.as_bytes()) {
            Ok(r) => r,
            Err(_) => return true, // Parsing error = allowed
        };

        robot.allowed(url.path())
    }

    /// Retrieves the crawl-delay
    pub fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
        if !self.enabled {
            return None;
        }

        self.cache
            .get_crawl_delay(url)
            .map(Duration::from_secs_f64)
    }

    /// Access to the cache
    pub fn cache(&self) -> &RobotsCache {
        &self.cache
    }
}
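
// Sketch of a typical checker flow (assumed usage, not prescribed by the crate;
// the user-agent string and test module name are made up): the caller fetches
// robots.txt itself, hands the body to `cache_robots`, then consults
// `is_allowed` / `get_crawl_delay` before each request.
#[cfg(test)]
mod robots_checker_sketch {
    use super::*;

    #[test]
    fn blocks_disallowed_paths_and_exposes_crawl_delay() {
        let checker = RobotsChecker::new("halldyllbot", 3600);
        let robots = "User-agent: *\nDisallow: /admin\nCrawl-delay: 2";
        let admin = Url::parse("https://example.com/admin/panel").unwrap();

        // Cache the body so later calls can omit the explicit content argument.
        checker.cache_robots(&admin, robots);

        assert!(!checker.is_allowed(&admin, Some(robots)));

        // Cache hit: the same origin is looked up transparently.
        let blog = Url::parse("https://example.com/blog").unwrap();
        assert!(checker.is_allowed(&blog, None));

        assert_eq!(
            checker.get_crawl_delay(&admin),
            Some(Duration::from_secs_f64(2.0))
        );
    }
}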

/// Extracts sitemap URLs declared in robots.txt
pub fn extract_sitemaps(robots_content: &str) -> Vec<String> {
    robots_content
        .lines()
        .filter_map(|line| {
            let line = line.trim();
            if line.to_lowercase().starts_with("sitemap:") {
                Some(line[8..].trim().to_string())
            } else {
                None
            }
        })
        .collect()
}
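
// Quick sketch of `extract_sitemaps` on a typical robots.txt body (example data only).
#[cfg(test)]
mod sitemap_extraction_sketch {
    use super::*;

    #[test]
    fn collects_sitemap_urls() {
        let robots = "User-agent: *\nDisallow:\nSitemap: https://example.com/sitemap.xml";
        assert_eq!(
            extract_sitemaps(robots),
            vec!["https://example.com/sitemap.xml".to_string()]
        );
    }
}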