halldyll-core 0.1.0

Core scraping engine for Halldyll, a high-performance async web scraper for AI agents.
Documentation
//! Canonical URL management, plus extraction of `rel` link metadata
//! (canonical, next/prev pagination, hreflang alternates) from HTML.

use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;

/// Lookup table that maps page URLs to their registered canonical URLs.
///
/// The table is kept behind an [`RwLock`], so mappings can be added and read
/// through a shared reference.
#[derive(Debug)]
pub struct CanonicalResolver {
    /// Registered mappings, keyed by the serialized source URL.
    canonical_map: RwLock<HashMap<String, Url>>,
    /// When `false`, registering is a no-op and resolving returns the input URL unchanged.
    respect_canonicals: bool,
}

impl Default for CanonicalResolver {
    fn default() -> Self {
        Self::new(true)
    }
}

impl CanonicalResolver {
    /// Creates a resolver with an empty mapping table.
    ///
    /// With `respect_canonicals == false` the resolver is inert: nothing is
    /// stored and every URL resolves to itself.
    pub fn new(respect_canonicals: bool) -> Self {
        Self {
            canonical_map: RwLock::new(HashMap::new()),
            respect_canonicals,
        }
    }

    /// Acquires the table for reading.
    ///
    /// A poisoned lock is recovered instead of panicking: the table is only a
    /// cache of URL pairs, so continuing with whatever it holds is preferable
    /// to propagating an unrelated panic.
    fn read_map(&self) -> std::sync::RwLockReadGuard<'_, HashMap<String, Url>> {
        self.canonical_map
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Acquires the table for writing, recovering from poisoning like [`Self::read_map`].
    fn write_map(&self) -> std::sync::RwLockWriteGuard<'_, HashMap<String, Url>> {
        self.canonical_map
            .write()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Registers `canonical` as the canonical URL of `url`.
    ///
    /// An existing mapping for `url` is replaced. Does nothing when canonicals
    /// are not respected.
    pub fn register(&self, url: &Url, canonical: &Url) {
        if !self.respect_canonicals {
            return;
        }

        self.write_map()
            .insert(url.as_str().to_owned(), canonical.clone());
    }

    /// Returns the canonical URL registered for `url`, or a clone of `url`
    /// itself when none is registered or canonicals are not respected.
    pub fn resolve(&self, url: &Url) -> Url {
        if !self.respect_canonicals {
            return url.clone();
        }

        // `as_str()` borrows the serialized URL, so the lookup needs no allocation.
        self.read_map()
            .get(url.as_str())
            .cloned()
            .unwrap_or_else(|| url.clone())
    }

    /// Returns `true` when a canonical is registered for `url` and it differs from `url`.
    pub fn has_different_canonical(&self, url: &Url) -> bool {
        // Mirrors `resolve`; the table is never populated in this mode anyway,
        // because `register` stores nothing when canonicals are not respected.
        if !self.respect_canonicals {
            return false;
        }

        matches!(
            self.read_map().get(url.as_str()),
            Some(canonical) if canonical != url
        )
    }

    /// Returns the number of registered mappings.
    pub fn count(&self) -> usize {
        self.read_map().len()
    }

    /// Removes every registered mapping.
    pub fn clear(&self) {
        self.write_map().clear();
    }

    /// Extracts the canonical URL declared in `html`, resolved against `base_url`.
    ///
    /// Returns `None` when canonicals are not respected, when no canonical
    /// link is found, or when the canonical equals `base_url`. The result is
    /// not registered in the table.
    pub fn resolve_from_html(&self, html: &str, base_url: &Url) -> Option<Url> {
        if !self.respect_canonicals {
            return None;
        }

        extract_canonical_from_html(html, base_url).filter(|canonical| canonical != base_url)
    }
}

/// Extracts the canonical URL declared by an HTML document.
///
/// Considers `<link>` elements whose `rel` token list contains `canonical`
/// and returns the first one whose `href` can be resolved against `base_url`
/// (relative references are joined onto it).
///
/// Returns `None` when no such link exists.
pub fn extract_canonical_from_html(html: &str, base_url: &Url) -> Option<Url> {
    // `rel` holds space-separated tokens, so match with `~=`: an exact `=`
    // comparison would miss values such as `rel="canonical nofollow"`.
    let selector = scraper::Selector::parse(r#"link[rel~="canonical"]"#).ok()?;
    let document = scraper::Html::parse_document(html);

    // Skip candidates with a missing or unresolvable href rather than giving
    // up on the first matching element.
    document.select(&selector).find_map(|el| {
        let href = el.value().attr("href")?;
        base_url.join(href).ok()
    })
}

/// Pagination links declared by a page through `<link rel="next">` and
/// `<link rel="prev">`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PaginationLinks {
    /// URL of the next page (`rel="next"`), if one was declared.
    pub next: Option<Url>,
    /// URL of the previous page (`rel="prev"`), if one was declared.
    pub prev: Option<Url>,
}

/// Extracts the `rel="next"` / `rel="prev"` pagination links of an HTML document.
///
/// For each direction, the first `<link>` whose `rel` token list contains the
/// keyword and whose `href` resolves against `base_url` is used. A direction
/// with no usable link is `None`.
pub fn extract_pagination_from_html(html: &str, base_url: &Url) -> PaginationLinks {
    let document = scraper::Html::parse_document(html);

    // Shared lookup for both directions: first element matching `css` that
    // carries an href resolvable against the base URL.
    let first_link = |css: &str| -> Option<Url> {
        let selector = scraper::Selector::parse(css).ok()?;
        document.select(&selector).find_map(|el| {
            let href = el.value().attr("href")?;
            base_url.join(href).ok()
        })
    };

    // `~=` matches one token of the space-separated `rel` list, so combined
    // values such as `rel="next nofollow"` are recognised as well.
    PaginationLinks {
        next: first_link(r#"link[rel~="next"]"#),
        prev: first_link(r#"link[rel~="prev"]"#),
    }
}

/// Extracts the language alternates (`<link rel="alternate" hreflang="…">`) of
/// an HTML document.
///
/// Returns a map from the `hreflang` value, exactly as written in the markup,
/// to the link's `href` resolved against `base_url`. Links without an `href`,
/// or whose `href` cannot be resolved, are skipped. When the same `hreflang`
/// value occurs more than once, a later entry replaces an earlier one.
pub fn extract_hreflang_from_html(html: &str, base_url: &Url) -> HashMap<String, Url> {
    // `rel` is a space-separated token list; `~=` also accepts combined values
    // such as `rel="alternate nofollow"` that an exact match would miss.
    let selector = match scraper::Selector::parse(r#"link[rel~="alternate"][hreflang]"#) {
        Ok(selector) => selector,
        Err(_) => return HashMap::new(),
    };
    let document = scraper::Html::parse_document(html);

    document
        .select(&selector)
        .filter_map(|el| {
            let link = el.value();
            let lang = link.attr("hreflang")?;
            let url = base_url.join(link.attr("href")?).ok()?;
            Some((lang.to_owned(), url))
        })
        .collect()
}