//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll - a high-performance async web
//! scraper for AI agents.
//!
//! Links - Link extraction

use ego_tree::NodeId;
use scraper::{Html, Selector};
use std::collections::HashSet;
use url::Url;

use crate::types::document::{LinkType, OutLink};
use crate::crawl::normalize::UrlNormalizer;

/// Link extractor
/// Extracts outbound links (and related link metadata) from an HTML
/// document, resolving and normalizing them against a base URL.
pub struct LinkExtractor {
    /// Normalizer applied to every resolved URL before it is returned.
    normalizer: UrlNormalizer,
    /// Whether links found inside navigation zones should be kept.
    extract_navigation: bool,
    /// Whether links found inside footer zones should be kept.
    extract_footer: bool,
}

impl Default for LinkExtractor {
    fn default() -> Self {
        Self {
            normalizer: UrlNormalizer::default(),
            extract_navigation: true,
            extract_footer: true,
        }
    }
}

impl LinkExtractor {
    /// Creates an extractor with the default configuration
    /// (navigation and footer links both included).
    pub fn new() -> Self {
        Self::default()
    }

    /// Configures which page zones contribute links.
    ///
    /// * `extract_navigation` - keep links found inside navigation zones
    /// * `extract_footer` - keep links found inside footer zones
    pub fn with_options(mut self, extract_navigation: bool, extract_footer: bool) -> Self {
        self.extract_navigation = extract_navigation;
        self.extract_footer = extract_footer;
        self
    }

    /// Extracts all HTTP(S) links from `html`, resolved against `base_url`.
    ///
    /// Each link is resolved, normalized, classified by page context
    /// (navigation / content / other), filtered according to the
    /// extractor's configuration, then deduplicated and returned sorted
    /// by URL.
    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<OutLink> {
        let document = Html::parse_document(html);
        let mut links = Vec::new();

        // Static, known-valid selector strings: `unwrap` cannot fail here.
        let a_selector = Selector::parse("a[href]").unwrap();

        // Selectors used to identify page zones.
        let nav_selector = Selector::parse("nav, header, .navigation, .menu, [role=\"navigation\"]").unwrap();
        let footer_selector = Selector::parse("footer, .footer, [role=\"contentinfo\"]").unwrap();
        let aside_selector = Selector::parse("aside, .sidebar, [role=\"complementary\"]").unwrap();

        // Node ids of each zone root, so ancestry checks are O(1) per node.
        let nav_ids: HashSet<_> = document.select(&nav_selector).map(|e| e.id()).collect();
        let footer_ids: HashSet<_> = document.select(&footer_selector).map(|e| e.id()).collect();
        let aside_ids: HashSet<_> = document.select(&aside_selector).map(|e| e.id()).collect();

        for element in document.select(&a_selector) {
            // `a[href]` guarantees the attribute is present; stay defensive anyway.
            let href = match element.value().attr("href") {
                Some(h) => h,
                None => continue,
            };

            // Skip empty hrefs and non-navigable schemes.
            if href.is_empty()
                || href.starts_with("javascript:")
                || href.starts_with("mailto:")
                || href.starts_with("tel:")
            {
                continue;
            }

            // Resolve relative URLs against the page URL, then normalize.
            let url = match base_url.join(href) {
                Ok(u) => self.normalizer.normalize(&u),
                Err(_) => continue,
            };

            // Keep only http/https. (An exact comparison: the previous
            // `starts_with("http")` test would also accept unrelated
            // schemes such as "httpx".)
            if url.scheme() != "http" && url.scheme() != "https" {
                continue;
            }

            // Classify by page context. Footer links surface as
            // `LinkType::Navigation`, but we keep their footer origin so
            // the `extract_footer` flag can act on them independently.
            let (link_type, in_footer) =
                self.classify_context(&element, &nav_ids, &footer_ids, &aside_ids);

            // Apply zone filters. BUGFIX: the footer filter was previously
            // a no-op (empty body), so `extract_footer = false` had no
            // effect; footer links are now governed by `extract_footer`
            // and non-footer navigation links by `extract_navigation`.
            if in_footer {
                if !self.extract_footer {
                    continue;
                }
            } else if !self.extract_navigation && link_type == LinkType::Navigation {
                continue;
            }

            // Visible anchor text, whitespace-trimmed; `None` when empty.
            let anchor_text = element.text().collect::<Vec<_>>().join(" ").trim().to_string();
            let anchor_text = if anchor_text.is_empty() { None } else { Some(anchor_text) };

            // rel attribute, if present (e.g. "nofollow").
            let rel = element.value().attr("rel").map(String::from);

            // Same host as the page => internal link.
            let is_internal = url.host() == base_url.host();

            links.push(OutLink {
                url,
                anchor_text,
                rel,
                is_internal,
                link_type,
            });
        }

        // Deduplicate by URL; `dedup_by` only removes adjacent duplicates,
        // hence the sort first.
        links.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
        links.dedup_by(|a, b| a.url == b.url);

        links
    }

    /// Determines a link's type from its enclosing page zone.
    /// Thin wrapper over [`Self::classify_context`], which also reports
    /// footer membership.
    fn determine_link_type(
        &self,
        element: &scraper::ElementRef,
        nav_ids: &HashSet<NodeId>,
        footer_ids: &HashSet<NodeId>,
        aside_ids: &HashSet<NodeId>,
    ) -> LinkType {
        self.classify_context(element, nav_ids, footer_ids, aside_ids).0
    }

    /// Walks the element's ancestors and returns its link type together
    /// with whether it sits inside a footer zone.
    ///
    /// The nearest matching ancestor wins; at each level the zones are
    /// checked in nav -> footer -> aside order (same order as the
    /// original classification).
    fn classify_context(
        &self,
        element: &scraper::ElementRef,
        nav_ids: &HashSet<NodeId>,
        footer_ids: &HashSet<NodeId>,
        aside_ids: &HashSet<NodeId>,
    ) -> (LinkType, bool) {
        let mut ancestor = element.parent();
        while let Some(node) = ancestor {
            if nav_ids.contains(&node.id()) {
                return (LinkType::Navigation, false);
            }
            if footer_ids.contains(&node.id()) {
                // Footer links are also treated as navigation links.
                return (LinkType::Navigation, true);
            }
            if aside_ids.contains(&node.id()) {
                return (LinkType::Other, false);
            }
            ancestor = node.parent();
        }

        // Not inside any recognized zone: ordinary content link.
        (LinkType::Content, false)
    }

    /// Extracts the page's canonical URL (`<link rel="canonical">`),
    /// resolved against `base_url` and normalized.
    ///
    /// Returns `None` when the page declares no canonical link or its
    /// href cannot be resolved.
    pub fn extract_canonical(&self, html: &str, base_url: &Url) -> Option<Url> {
        let document = Html::parse_document(html);
        let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;

        document
            .select(&selector)
            .next()
            .and_then(|el| el.value().attr("href"))
            .and_then(|href| base_url.join(href).ok())
            .map(|u| self.normalizer.normalize(&u))
    }

    /// Extracts pagination links (`rel="next"` / `rel="prev"`, from either
    /// `<link>` or `<a>` elements), resolved against `base_url`.
    ///
    /// NOTE(review): unlike `extract_canonical`, these URLs are not passed
    /// through the normalizer — confirm whether that asymmetry is intended.
    pub fn extract_pagination(&self, html: &str, base_url: &Url) -> PaginationLinks {
        let document = Html::parse_document(html);

        let next = Selector::parse(r#"link[rel="next"], a[rel="next"]"#)
            .ok()
            .and_then(|sel| {
                document.select(&sel).next()
                    .and_then(|el| el.value().attr("href"))
                    .and_then(|href| base_url.join(href).ok())
            });

        let prev = Selector::parse(r#"link[rel="prev"], a[rel="prev"]"#)
            .ok()
            .and_then(|sel| {
                document.select(&sel).next()
                    .and_then(|el| el.value().attr("href"))
                    .and_then(|href| base_url.join(href).ok())
            });

        PaginationLinks { next, prev }
    }

    /// Extracts hreflang alternate links
    /// (`<link rel="alternate" hreflang="...">`), resolved against
    /// `base_url`. Entries with unresolvable hrefs are skipped.
    pub fn extract_hreflang(&self, html: &str, base_url: &Url) -> Vec<HreflangLink> {
        let document = Html::parse_document(html);
        let selector = match Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
            Ok(s) => s,
            Err(_) => return Vec::new(),
        };

        document
            .select(&selector)
            .filter_map(|el| {
                let lang = el.value().attr("hreflang")?;
                let href = el.value().attr("href")?;
                let url = base_url.join(href).ok()?;
                Some(HreflangLink {
                    lang: lang.to_string(),
                    url,
                })
            })
            .collect()
    }
}

/// Pagination links discovered on a page (see
/// `LinkExtractor::extract_pagination`).
#[derive(Debug, Clone)]
pub struct PaginationLinks {
    /// URL of the next page (`rel="next"`), if declared.
    pub next: Option<Url>,
    /// URL of the previous page (`rel="prev"`), if declared.
    pub prev: Option<Url>,
}

/// An hreflang alternate link, declaring the URL of another language
/// version of the page (see `LinkExtractor::extract_hreflang`).
#[derive(Debug, Clone)]
pub struct HreflangLink {
    /// Language code as declared in the `hreflang` attribute
    /// (e.g., "en", "fr-FR").
    pub lang: String,
    /// URL of the page version for this language.
    pub url: Url,
}