halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Allowlist - List of allowed/blocked domains

use regex::Regex;
use std::collections::HashSet;
use url::Url;

/// Domain filter built from exact-match domain sets and regex patterns,
/// for both allowed and blocked hosts.
///
/// Block rules always take priority over allow rules.
#[derive(Debug, Clone)]
pub struct DomainAllowlist {
    /// Allowed domains, stored lowercased. When this set and
    /// `allowed_patterns` are both empty, every non-blocked domain is allowed.
    allowed: HashSet<String>,
    /// Blocked domains, stored lowercased
    blocked: HashSet<String>,
    /// Regex patterns; a domain matching any of them is allowed
    allowed_patterns: Vec<Regex>,
    /// Regex patterns; a domain matching any of them is blocked
    blocked_patterns: Vec<Regex>,
}

impl Default for DomainAllowlist {
    fn default() -> Self {
        Self::new()
    }
}

impl DomainAllowlist {
    /// Create an empty list.
    ///
    /// With no allowed domains and no allowed patterns configured, every
    /// domain that is not explicitly blocked is accepted.
    pub fn new() -> Self {
        Self {
            allowed: HashSet::new(),
            blocked: HashSet::new(),
            allowed_patterns: Vec::new(),
            blocked_patterns: Vec::new(),
        }
    }

    /// Add an allowed domain.
    ///
    /// The domain is stored lowercased. Subdomains of an allowed domain are
    /// accepted as well (allowing `example.com` also allows `a.example.com`).
    pub fn allow_domain(&mut self, domain: &str) {
        self.allowed.insert(domain.to_lowercase());
    }

    /// Add multiple allowed domains (see [`allow_domain`](Self::allow_domain)).
    pub fn allow_domains(&mut self, domains: &[&str]) {
        self.allowed
            .extend(domains.iter().map(|domain| domain.to_lowercase()));
    }

    /// Add a blocked domain.
    ///
    /// The domain is stored lowercased. Blocked domains are matched exactly:
    /// subdomains of a blocked domain are *not* blocked unless they are added
    /// separately or covered by a blocked pattern.
    pub fn block_domain(&mut self, domain: &str) {
        self.blocked.insert(domain.to_lowercase());
    }

    /// Add multiple blocked domains (see [`block_domain`](Self::block_domain)).
    pub fn block_domains(&mut self, domains: &[&str]) {
        self.blocked
            .extend(domains.iter().map(|domain| domain.to_lowercase()));
    }

    /// Add an allowed regex pattern.
    ///
    /// Patterns are matched against the lowercased domain.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` produced when `pattern` fails to compile;
    /// the list is left unchanged in that case.
    pub fn allow_pattern(&mut self, pattern: &str) -> Result<(), regex::Error> {
        self.allowed_patterns.push(Regex::new(pattern)?);
        Ok(())
    }

    /// Add a blocked regex pattern.
    ///
    /// Patterns are matched against the lowercased domain.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` produced when `pattern` fails to compile;
    /// the list is left unchanged in that case.
    pub fn block_pattern(&mut self, pattern: &str) -> Result<(), regex::Error> {
        self.blocked_patterns.push(Regex::new(pattern)?);
        Ok(())
    }

    /// Check if a URL is allowed.
    ///
    /// Returns `false` when the URL has no host; otherwise applies the same
    /// rules as [`is_domain_allowed`](Self::is_domain_allowed) to the host.
    pub fn is_allowed(&self, url: &Url) -> bool {
        // Single source of truth: the domain check holds all the rule logic.
        match url.host_str() {
            Some(host) => self.is_domain_allowed(host),
            None => false,
        }
    }

    /// Check if a domain is allowed.
    ///
    /// The domain is lowercased, then evaluated in this order:
    ///
    /// 1. An exact match in the blocked set, or a match on any blocked
    ///    pattern, rejects the domain.
    /// 2. If no allowed domains and no allowed patterns are configured, the
    ///    domain is accepted.
    /// 3. Otherwise the domain is accepted only if it equals an allowed
    ///    domain, is a subdomain of one, or matches an allowed pattern.
    pub fn is_domain_allowed(&self, domain: &str) -> bool {
        let domain = domain.to_lowercase();

        // Block rules take priority over every allow rule.
        if self.blocked.contains(&domain)
            || self.blocked_patterns.iter().any(|re| re.is_match(&domain))
        {
            return false;
        }

        // No allow rules configured: everything not blocked passes.
        if self.allowed.is_empty() && self.allowed_patterns.is_empty() {
            return true;
        }

        self.allowed.contains(&domain)
            || self
                .allowed
                .iter()
                .any(|parent| Self::is_subdomain_of(&domain, parent))
            || self.allowed_patterns.iter().any(|re| re.is_match(&domain))
    }

    /// Return the allowed domains
    pub fn allowed_domains(&self) -> &HashSet<String> {
        &self.allowed
    }

    /// Return the blocked domains
    pub fn blocked_domains(&self) -> &HashSet<String> {
        &self.blocked
    }

    /// True when `domain` ends with `.` followed by `parent`.
    ///
    /// Equivalent to `domain.ends_with(&format!(".{parent}"))` but without
    /// allocating a temporary string for each comparison.
    fn is_subdomain_of(domain: &str, parent: &str) -> bool {
        match domain.strip_suffix(parent) {
            Some(prefix) => prefix.ends_with('.'),
            None => false,
        }
    }
}