//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll — a high-performance async web scraper for AI agents.
//! Normalize - URL normalization (RFC 3986)

use url::Url;
use std::collections::BTreeMap;

/// URL normalizer.
///
/// Configurable syntax-based canonicalizer (in the spirit of RFC 3986
/// §6) used to collapse equivalent URLs to a single form. Construct via
/// [`UrlNormalizer::new`] or [`Default`]; all fields are private, so the
/// defaults below are the only configuration reachable from this file.
pub struct UrlNormalizer {
    /// Drop the `#fragment` component entirely.
    remove_fragments: bool,
    /// Sort query parameters by key so param order doesn't matter.
    sort_query_params: bool,
    /// Strip known tracking parameters from the query string.
    remove_tracking_params: bool,
    /// Query keys treated as tracking params (matched case-insensitively).
    tracking_params: Vec<String>,
    /// Rewrite `http://` URLs to `https://`.
    force_https: bool,
    /// Strip a leading `www.` label from the host.
    remove_www: bool,
    /// Drop a trailing `/` from the path (the root `/` is kept).
    remove_trailing_slash: bool,
    /// Drop an explicit port that equals the scheme default (80 / 443).
    remove_default_port: bool,
}

impl Default for UrlNormalizer {
    /// Conservative defaults: strip fragments, sort the query, remove
    /// common tracking parameters, and drop default ports — but leave
    /// scheme, `www.` prefix and trailing slash untouched.
    fn default() -> Self {
        const TRACKING: [&str; 9] = [
            "utm_source",
            "utm_medium",
            "utm_campaign",
            "utm_term",
            "utm_content",
            "fbclid",
            "gclid",
            "ref",
            "_ga",
        ];

        Self {
            remove_fragments: true,
            sort_query_params: true,
            remove_tracking_params: true,
            tracking_params: TRACKING.iter().map(|p| p.to_string()).collect(),
            force_https: false,
            remove_www: false,
            remove_trailing_slash: false,
            remove_default_port: true,
        }
    }
}

impl UrlNormalizer {
    /// New normalizer with default config
    pub fn new() -> Self {
        Self::default()
    }

    /// Normalize a URL
    pub fn normalize(&self, url: &Url) -> Url {
        let mut url = url.clone();

        // Remove fragment
        if self.remove_fragments {
            url.set_fragment(None);
        }

        // Remove default port
        if self.remove_default_port {
            if let Some(port) = url.port() {
                let default_port = match url.scheme() {
                    "http" => 80,
                    "https" => 443,
                    _ => 0,
                };
                if port == default_port {
                    let _ = url.set_port(None);
                }
            }
        }

        // Force HTTPS
        if self.force_https && url.scheme() == "http" {
            let _ = url.set_scheme("https");
        }

        // Remove www
        if self.remove_www {
            if let Some(host) = url.host_str() {
                if host.starts_with("www.") {
                    let new_host = host[4..].to_string();
                    let _ = url.set_host(Some(&new_host));
                }
            }
        }

        // Normalize path (remove trailing slash except for root)
        if self.remove_trailing_slash {
            let path = url.path().to_string();
            if path.len() > 1 && path.ends_with('/') {
                url.set_path(&path[..path.len() - 1]);
            }
        }

        // Sort and filter query params
        if self.sort_query_params || self.remove_tracking_params {
            let query_pairs: Vec<(String, String)> = url
                .query_pairs()
                .map(|(k, v)| (k.to_string(), v.to_string()))
                .collect();

            if !query_pairs.is_empty() {
                let mut filtered: BTreeMap<String, String> = BTreeMap::new();
                
                for (key, value) in query_pairs {
                    // Filter tracking params
                    if self.remove_tracking_params 
                        && self.tracking_params.iter().any(|t| t.eq_ignore_ascii_case(&key)) 
                    {
                        continue;
                    }
                    filtered.insert(key, value);
                }

                if filtered.is_empty() {
                    url.set_query(None);
                } else {
                    let query: String = filtered
                        .iter()
                        .map(|(k, v)| format!("{}={}", k, v))
                        .collect::<Vec<_>>()
                        .join("&");
                    url.set_query(Some(&query));
                }
            }
        }

        url
    }

    /// Resolve a relative URL against a base
    pub fn resolve(&self, base: &Url, relative: &str) -> Option<Url> {
        base.join(relative).ok().map(|u| self.normalize(&u))
    }

    /// Compare two URLs after normalization
    pub fn are_equal(&self, a: &Url, b: &Url) -> bool {
        self.normalize(a) == self.normalize(b)
    }
}

/// Extract the full host (domain) of a URL, if it has one.
pub fn extract_domain(url: &Url) -> Option<String> {
    url.host_str().map(|host| host.to_owned())
}

/// Extract the base domain (host without its subdomain labels), i.e.
/// the last two dot-separated labels of the host.
///
/// NOTE(review): this is a naive heuristic — for multi-part public
/// suffixes it is wrong (e.g. "shop.example.co.uk" yields "co.uk");
/// a public-suffix list would be needed for full correctness.
pub fn extract_base_domain(url: &Url) -> Option<String> {
    url.host_str().map(|host| {
        let labels: Vec<&str> = host.split('.').collect();
        match labels.as_slice() {
            // More than two labels: keep only the final two.
            // e.g. www.example.com -> example.com
            [.., second_last, last] if labels.len() > 2 => {
                format!("{}.{}", second_last, last)
            }
            _ => host.to_string(),
        }
    })
}

/// Check if two URLs are from the same domain
pub fn is_same_domain(a: &Url, b: &Url) -> bool {
    a.host_str() == b.host_str()
}

/// True when both URLs share the same base domain (last two labels),
/// ignoring any subdomain difference.
pub fn is_same_base_domain(a: &Url, b: &Url) -> bool {
    let lhs = extract_base_domain(a);
    let rhs = extract_base_domain(b);
    lhs == rhs
}