halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! IP Block - Blocking private/local IPs

use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use url::Url;

/// Policy describing which categories of non-public IP addresses are rejected.
///
/// Each flag switches one category on or off. The flags are private; build a
/// value with [`IpBlocker::new`], [`IpBlocker::configure`] or `Default`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IpBlocker {
    /// When `true`, private addresses are blocked: the IPv4 private ranges
    /// and the IPv6 unique local range (`fc00::/7`).
    block_private: bool,
    /// When `true`, loopback addresses (`127.0.0.0/8`, `::1`) and local
    /// hostnames such as `localhost` are blocked.
    block_loopback: bool,
    /// When `true`, link-local addresses (`169.254.0.0/16`, `fe80::/10`) are blocked.
    block_link_local: bool,
    /// When `true`, multicast addresses (`224.0.0.0/4`, `ff00::/8`) are blocked.
    block_multicast: bool,
}

impl Default for IpBlocker {
    /// Returns the strictest policy: every configurable category is blocked.
    fn default() -> Self {
        // All four switches start enabled; `configure` is the way to relax them.
        let enabled = true;
        Self {
            block_loopback: enabled,
            block_private: enabled,
            block_multicast: enabled,
            block_link_local: enabled,
        }
    }
}

impl IpBlocker {
    /// Creates a blocker with every configurable category enabled.
    ///
    /// Equivalent to `IpBlocker::default()`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a blocker with each category switched on or off explicitly.
    ///
    /// * `block_private` - block IPv4 private ranges and IPv6 unique local (`fc00::/7`).
    /// * `block_loopback` - block loopback addresses and local hostnames.
    /// * `block_link_local` - block `169.254.0.0/16` and `fe80::/10`.
    /// * `block_multicast` - block `224.0.0.0/4` and `ff00::/8`.
    ///
    /// Broadcast, unspecified and documentation addresses are rejected
    /// regardless of these flags.
    pub fn configure(
        block_private: bool,
        block_loopback: bool,
        block_link_local: bool,
        block_multicast: bool,
    ) -> Self {
        Self {
            block_private,
            block_loopback,
            block_link_local,
            block_multicast,
        }
    }

    /// Returns `true` when `ip` falls into a category this blocker rejects.
    ///
    /// Agrees with [`IpBlocker::block_reason`]: an address is blocked exactly
    /// when a reason is reported for it.
    pub fn is_blocked(&self, ip: &IpAddr) -> bool {
        match ip {
            IpAddr::V4(ipv4) => self.is_blocked_ipv4(ipv4),
            IpAddr::V6(ipv6) => self.is_blocked_ipv6(ipv6),
        }
    }

    /// Returns `true` when the IPv4 address is rejected.
    fn is_blocked_ipv4(&self, ip: &Ipv4Addr) -> bool {
        self.ipv4_block_reason(ip).is_some()
    }

    /// Returns `true` when the IPv6 address is rejected.
    ///
    /// IPv4-mapped addresses (`::ffff:a.b.c.d`) are judged by the IPv4 rules.
    fn is_blocked_ipv6(&self, ip: &Ipv6Addr) -> bool {
        self.ipv6_block_reason(ip).is_some()
    }

    /// Checks whether the host written in `url` is blocked.
    ///
    /// Only the literal host is inspected: an IP literal is checked with
    /// [`IpBlocker::is_blocked`], and the names `localhost`,
    /// `localhost.localdomain`, `*.local` and `*.localhost` are blocked when
    /// loopback blocking is enabled. No DNS resolution is performed, so any
    /// other hostname yields `false` here; resolved addresses have to be
    /// checked separately with [`IpBlocker::is_blocked`].
    pub fn is_url_hostname_blocked(&self, url: &Url) -> bool {
        let host = match url.host_str() {
            Some(host) => host,
            None => return false,
        };

        // `Url::host_str` renders IPv6 literals inside brackets ("[::1]"),
        // which `IpAddr::from_str` rejects; unwrap them before parsing so
        // IPv6 literals cannot slip past the check.
        let literal = host
            .strip_prefix('[')
            .and_then(|inner| inner.strip_suffix(']'))
            .unwrap_or(host);
        if let Ok(ip) = literal.parse::<IpAddr>() {
            return self.is_blocked(&ip);
        }

        // A single trailing dot only marks the name as fully qualified
        // ("localhost." is the same host as "localhost").
        let name = host.strip_suffix('.').unwrap_or(host).to_lowercase();
        let is_local_name = name == "localhost"
            || name == "localhost.localdomain"
            || name.ends_with(".local")
            || name.ends_with(".localhost");

        is_local_name && self.block_loopback
    }

    /// Describes why `ip` is blocked, or returns `None` when it is allowed.
    ///
    /// Possible reasons: `"loopback address"`, `"private address"`,
    /// `"link-local address"`, `"multicast address"`, `"broadcast address"`,
    /// `"unspecified address"` and `"documentation address"`.
    pub fn block_reason(&self, ip: &IpAddr) -> Option<String> {
        let reason = match ip {
            IpAddr::V4(ipv4) => self.ipv4_block_reason(ipv4),
            IpAddr::V6(ipv6) => self.ipv6_block_reason(ipv6),
        };
        reason.map(String::from)
    }

    /// Single source of truth for the IPv4 rules, shared by the boolean
    /// check and the textual reason so the two can never disagree.
    fn ipv4_block_reason(&self, ip: &Ipv4Addr) -> Option<&'static str> {
        if self.block_loopback && ip.is_loopback() {
            // 127.0.0.0/8
            Some("loopback address")
        } else if self.block_private && ip.is_private() {
            // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
            Some("private address")
        } else if self.block_link_local && ip.is_link_local() {
            // 169.254.0.0/16
            Some("link-local address")
        } else if self.block_multicast && ip.is_multicast() {
            // 224.0.0.0/4
            Some("multicast address")
        } else if ip.is_broadcast() {
            // 255.255.255.255
            Some("broadcast address")
        } else if ip.is_unspecified() {
            // 0.0.0.0
            Some("unspecified address")
        } else if ip.is_documentation() {
            // 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24
            Some("documentation address")
        } else {
            None
        }
    }

    /// Single source of truth for the IPv6 rules, shared by the boolean
    /// check and the textual reason so the two can never disagree.
    fn ipv6_block_reason(&self, ip: &Ipv6Addr) -> Option<&'static str> {
        // `::ffff:a.b.c.d` reaches the embedded IPv4 host, so it must obey
        // the IPv4 rules; otherwise `::ffff:127.0.0.1` bypasses the blocker.
        if let Some(mapped) = ip.to_ipv4_mapped() {
            return self.ipv4_block_reason(&mapped);
        }

        let [first, second, ..] = ip.segments();

        if self.block_loopback && ip.is_loopback() {
            // ::1
            Some("loopback address")
        } else if self.block_link_local && (first & 0xffc0) == 0xfe80 {
            // fe80::/10, checked by mask so no recent std helper is required.
            Some("link-local address")
        } else if self.block_multicast && ip.is_multicast() {
            // ff00::/8
            Some("multicast address")
        } else if ip.is_unspecified() {
            // ::
            Some("unspecified address")
        } else if self.block_private && (first & 0xfe00) == 0xfc00 {
            // Unique local fc00::/7, the IPv6 counterpart of private ranges.
            Some("private address")
        } else if first == 0x2001 && second == 0x0db8 {
            // 2001:db8::/32, mirroring the unconditional IPv4 documentation rule.
            Some("documentation address")
        } else {
            None
        }
    }
}