// halldyll-core 0.1.0
//
// Core scraping engine for Halldyll, a high-performance async web scraper for AI agents.
// Documentation
//! HTML - Robust HTML parsing

use scraper::{Html, Selector, ElementRef};
use encoding_rs::Encoding;

/// HTML parser wrapping a parsed [`Html`] document.
///
/// Build one with [`HtmlParser::parse`] (from text) or
/// [`HtmlParser::parse_with_encoding`] (from raw bytes), then query it with
/// CSS selectors through the helper methods.
#[derive(Debug)]
pub struct HtmlParser {
    /// The parsed document that every selector query runs against.
    document: Html,
}

impl HtmlParser {
    /// Parses an HTML document from already-decoded text.
    pub fn parse(html: &str) -> Self {
        Self {
            document: Html::parse_document(html),
        }
    }

    /// Decodes raw bytes to text and parses the result as an HTML document.
    ///
    /// `declared_charset` is an encoding label supplied by the caller; when it
    /// is `None` or not a recognised label, the encoding is looked up in the
    /// document itself and finally defaults to UTF-8.
    pub fn parse_with_encoding(bytes: &[u8], declared_charset: Option<&str>) -> Self {
        let (html, _) = Self::decode_html(bytes, declared_charset);
        Self::parse(&html)
    }

    /// Decodes `bytes` into a `String`, choosing the encoding in this order:
    ///
    /// 1. `declared_charset`, if it is a label `Encoding::for_label` recognises;
    /// 2. a `charset=` declaration found in the first 1024 bytes;
    /// 3. UTF-8.
    ///
    /// Returns the decoded text together with the encoding the decoder reports
    /// as actually used. `Encoding::decode` performs BOM sniffing, so that
    /// encoding can differ from the one selected by label.
    fn decode_html(bytes: &[u8], declared_charset: Option<&str>) -> (String, &'static Encoding) {
        let encoding = declared_charset
            .and_then(|label| Encoding::for_label(label.as_bytes()))
            .or_else(|| {
                // Only scan the head of the document, and only when the
                // declared charset did not resolve. The lossy conversion is
                // enough here because the declaration itself is ASCII.
                let peek_len = bytes.len().min(1024);
                let peek = String::from_utf8_lossy(&bytes[..peek_len]);
                Self::detect_meta_charset(&peek)
                    .and_then(|label| Encoding::for_label(label.as_bytes()))
            })
            .unwrap_or(encoding_rs::UTF_8);

        let (decoded, used, _had_errors) = encoding.decode(bytes);
        (decoded.into_owned(), used)
    }

    /// Extracts the value of the first non-empty `charset=` declaration in
    /// `html`, e.g. from `<meta charset="utf-8">` or
    /// `content="text/html; charset=utf-8"`.
    ///
    /// The match on `charset=` is ASCII case-insensitive. The returned label
    /// keeps the casing it has in `html`. Returns `None` when no declaration
    /// with a non-empty value is found.
    fn detect_meta_charset(html: &str) -> Option<String> {
        const NEEDLE: &str = "charset=";

        // ASCII-only lowercasing never changes byte lengths, so offsets found
        // in `lowered` are valid offsets into `html`. Full Unicode lowercasing
        // can grow or shrink characters and would misalign the slice below.
        let lowered = html.to_ascii_lowercase();

        lowered.match_indices(NEEDLE).find_map(|(pos, _)| {
            let rest = &html[pos + NEEDLE.len()..];
            let label: String = rest
                .chars()
                .skip_while(|&c| c == '"' || c == '\'' || c.is_ascii_whitespace())
                .take_while(|&c| {
                    !(c.is_ascii_whitespace() || matches!(c, '"' | '\'' | ';' | '>' | '/'))
                })
                .collect();

            if label.is_empty() {
                None
            } else {
                Some(label)
            }
        })
    }

    /// Returns a reference to the parsed document.
    pub fn document(&self) -> &Html {
        &self.document
    }

    /// Returns every element matching the CSS `selector`.
    ///
    /// An invalid selector yields an empty vector rather than an error.
    pub fn select(&self, selector: &str) -> Vec<ElementRef<'_>> {
        Selector::parse(selector)
            .map(|sel| self.document.select(&sel).collect::<Vec<_>>())
            .unwrap_or_default()
    }

    /// Returns the first element matching the CSS `selector`, or `None` when
    /// nothing matches or the selector is invalid.
    pub fn select_first(&self, selector: &str) -> Option<ElementRef<'_>> {
        Selector::parse(selector)
            .ok()
            .and_then(|sel| self.document.select(&sel).next())
    }

    /// Returns the text of the first element matching `selector`.
    ///
    /// The element's text chunks are joined with single spaces and the result
    /// is trimmed at both ends.
    pub fn text(&self, selector: &str) -> Option<String> {
        self.select_first(selector)
            .map(|el| el.text().collect::<Vec<_>>().join(" ").trim().to_string())
    }

    /// Returns the value of attribute `attr` on the first element matching
    /// `selector`, or `None` when the element or the attribute is missing.
    pub fn attr(&self, selector: &str, attr: &str) -> Option<String> {
        self.select_first(selector)
            .and_then(|el| el.value().attr(attr).map(String::from))
    }

    /// Returns the inner HTML of the first element matching `selector`.
    pub fn inner_html(&self, selector: &str) -> Option<String> {
        self.select_first(selector).map(|el| el.inner_html())
    }

    /// Returns `true` when at least one element matches `selector`.
    pub fn exists(&self, selector: &str) -> bool {
        self.select_first(selector).is_some()
    }

    /// Returns how many elements match `selector` (0 for an invalid selector).
    pub fn count(&self, selector: &str) -> usize {
        // Count straight off the iterator instead of collecting into a Vec.
        Selector::parse(selector)
            .map(|sel| self.document.select(&sel).count())
            .unwrap_or(0)
    }
}

/// Extracts the text of an HTML document, dropping non-content elements.
///
/// Text located anywhere inside `script`, `style`, `noscript`, `iframe`,
/// `svg`, `canvas`, `template`, `object`, `embed` or `applet` elements is
/// skipped. Every remaining text node is trimmed, empty ones are discarded,
/// and the rest are joined with single spaces.
///
/// Returns an empty string when the document has no such text.
pub fn sanitize_html(html: &str) -> String {
    // Tag names whose entire subtree is excluded from the output.
    let excluded_tags = [
        "script", "style", "noscript", "iframe", "svg", "canvas",
        "template", "object", "embed", "applet",
    ];

    let document = Html::parse_document(html);
    let mut output = String::new();

    for node in document.root_element().descendants() {
        // Text lives in text nodes, never in element nodes, so look at text
        // nodes and decide by their ancestors whether to keep them.
        if let Some(text) = node.value().as_text() {
            let text = text.trim();
            if text.is_empty() {
                continue;
            }

            let inside_excluded = node.ancestors().any(|ancestor| {
                ancestor
                    .value()
                    .as_element()
                    .map_or(false, |element| excluded_tags.contains(&element.name()))
            });
            if inside_excluded {
                continue;
            }

            if !output.is_empty() {
                output.push(' ');
            }
            output.push_str(text);
        }
    }

    output
}