halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Metadata extraction from HTML pages (title, meta tags, charset, canonical URL, language).

use scraper::{Html, Selector};

/// Metadata extractor.
///
/// This is a stateless unit type: it carries no configuration, so every
/// instance is interchangeable and copying it is free. `Default` is derived
/// rather than hand-written because the derived impl is identical.
#[derive(Debug, Clone, Copy, Default)]
pub struct MetadataExtractor;

impl MetadataExtractor {
    /// Create a new extractor.
    pub fn new() -> Self {
        Self
    }

    /// Parse `html` and collect all supported metadata into a [`PageMetadata`].
    ///
    /// The document is parsed once and every field is filled by its dedicated
    /// helper. Publication and modification times are looked up under the
    /// `article:*` key first and fall back to `datePublished` /
    /// `dateModified` when that yields nothing.
    pub fn extract(&self, html: &str) -> PageMetadata {
        let document = Html::parse_document(html);

        PageMetadata {
            title: self.extract_title(&document),
            description: self.extract_meta(&document, "description"),
            author: self.extract_meta(&document, "author"),
            keywords: self.extract_keywords(&document),
            published_time: self
                .extract_time(&document, "article:published_time")
                .or_else(|| self.extract_time(&document, "datePublished")),
            modified_time: self
                .extract_time(&document, "article:modified_time")
                .or_else(|| self.extract_time(&document, "dateModified")),
            robots: self.extract_meta(&document, "robots"),
            viewport: self.extract_meta(&document, "viewport"),
            charset: self.extract_charset(&document),
            canonical: self.extract_canonical(&document),
            language: self.extract_language(&document),
        }
    }

    /// Extract the trimmed text of the first `<title>` element.
    ///
    /// Returns `None` when the document has no `<title>` element.
    fn extract_title(&self, document: &Html) -> Option<String> {
        let selector = Selector::parse("title").ok()?;
        let element = document.select(&selector).next()?;
        let raw: String = element.text().collect();
        Some(raw.trim().to_string())
    }

    /// Extract the `content` attribute of the first `<meta name="{name}">` tag.
    ///
    /// Returns `None` when no such tag exists or it has no `content` attribute.
    fn extract_meta(&self, document: &Html, name: &str) -> Option<String> {
        let selector = Selector::parse(&format!(r#"meta[name="{}"]"#, name)).ok()?;
        document
            .select(&selector)
            .next()
            .and_then(|el| el.value().attr("content").map(String::from))
    }

    /// Extract the `keywords` meta tag as a list.
    ///
    /// The value is split on commas; each entry is trimmed and empty entries
    /// are dropped. A missing tag yields an empty vector.
    fn extract_keywords(&self, document: &Html) -> Vec<String> {
        self.extract_meta(document, "keywords")
            .map(|raw| {
                raw.split(',')
                    .map(str::trim)
                    .filter(|keyword| !keyword.is_empty())
                    .map(str::to_string)
                    .collect()
            })
            .unwrap_or_default()
    }

    /// Extract a date/time string published under `property`.
    ///
    /// Two markup styles are tried, in order:
    /// 1. `<meta property="{property}" content="...">` (OpenGraph style);
    /// 2. the first element carrying `itemprop="{property}"` (Schema.org
    ///    style), reading its `datetime` attribute and falling back to
    ///    `content`.
    ///
    /// The raw attribute value is returned unparsed.
    fn extract_time(&self, document: &Html, property: &str) -> Option<String> {
        // Try the `property` attribute first (OpenGraph style).
        if let Ok(selector) = Selector::parse(&format!(r#"meta[property="{}"]"#, property)) {
            let first = document.select(&selector).next();
            let content = first.and_then(|el| el.value().attr("content").map(String::from));
            if content.is_some() {
                return content;
            }
        }

        // Then try `itemprop` (Schema.org style).
        let selector = Selector::parse(&format!(r#"[itemprop="{}"]"#, property)).ok()?;
        let element = document.select(&selector).next()?;
        let attrs = element.value();
        // Prefer the `datetime` attribute; fall back to `content`.
        attrs
            .attr("datetime")
            .or_else(|| attrs.attr("content"))
            .map(String::from)
    }

    /// Extract the declared character encoding.
    ///
    /// Looks at `<meta charset="...">` first, then at the `charset=` parameter
    /// of `<meta http-equiv="Content-Type" content="...">`. Blank values are
    /// treated as absent so callers never receive `Some("")`.
    fn extract_charset(&self, document: &Html) -> Option<String> {
        // <meta charset="...">
        let charset_selector = Selector::parse("meta[charset]").ok()?;
        let declared = document
            .select(&charset_selector)
            .next()
            .and_then(|el| el.value().attr("charset").map(|v| v.trim().to_string()))
            .filter(|v| !v.is_empty());
        if declared.is_some() {
            return declared;
        }

        // <meta http-equiv="Content-Type" content="text/html; charset=...">
        let content_type_selector = Selector::parse(r#"meta[http-equiv="Content-Type"]"#).ok()?;
        let element = document.select(&content_type_selector).next()?;
        let content = element.value().attr("content")?;
        Self::charset_from_content_type(content)
    }

    /// Pull the `charset=` parameter out of a `Content-Type` value.
    ///
    /// The parameter name is matched ASCII-case-insensitively. Surrounding
    /// quotes and whitespace are stripped, and the value ends at the next
    /// `;`, quote or whitespace. Returns `None` when the parameter is missing
    /// or empty.
    fn charset_from_content_type(content: &str) -> Option<String> {
        const KEY: &str = "charset=";

        // ASCII-only lowercasing never changes byte lengths, so the offset found
        // in the lowered copy is a valid char boundary in `content`. Full Unicode
        // lowercasing can shift offsets and make the slice below panic.
        let start = content.to_ascii_lowercase().find(KEY)? + KEY.len();

        let value: String = content[start..]
            .trim_start_matches(|c: char| c.is_whitespace() || c == '"' || c == '\'')
            .chars()
            .take_while(|&c| !(c.is_whitespace() || c == ';' || c == '"' || c == '\''))
            .collect();

        if value.is_empty() {
            None
        } else {
            Some(value)
        }
    }

    /// Extract the `href` of the first `<link rel="canonical">` element.
    fn extract_canonical(&self, document: &Html) -> Option<String> {
        let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;
        document
            .select(&selector)
            .next()
            .and_then(|el| el.value().attr("href").map(String::from))
    }

    /// Extract the page language.
    ///
    /// Uses the `lang` attribute of the `<html>` element when present,
    /// otherwise falls back to `<meta name="language">`.
    fn extract_language(&self, document: &Html) -> Option<String> {
        // `lang` attribute on <html>.
        let html_selector = Selector::parse("html").ok()?;
        let root_lang = document
            .select(&html_selector)
            .next()
            .and_then(|root| root.value().attr("lang").map(String::from));

        // Meta language as a fallback.
        root_lang.or_else(|| self.extract_meta(document, "language"))
    }
}

/// Page metadata collected from an HTML document.
///
/// Every field is optional (or an empty list) because any given page may omit
/// the corresponding markup. Values are stored as the raw strings found in the
/// document; dates in particular are not parsed.
///
/// `PartialEq`/`Eq` are derived so results can be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// Trimmed text of the `<title>` element.
    pub title: Option<String>,
    /// Content of the `description` meta tag.
    pub description: Option<String>,
    /// Content of the `author` meta tag.
    pub author: Option<String>,
    /// Entries of the `keywords` meta tag, split on commas and trimmed.
    pub keywords: Vec<String>,
    /// Publication date, as the raw attribute string.
    pub published_time: Option<String>,
    /// Modification date, as the raw attribute string.
    pub modified_time: Option<String>,
    /// Content of the `robots` meta tag (robots directives).
    pub robots: Option<String>,
    /// Content of the `viewport` meta tag.
    pub viewport: Option<String>,
    /// Character encoding declared by the document.
    pub charset: Option<String>,
    /// Canonical URL from `<link rel="canonical">`.
    pub canonical: Option<String>,
    /// Language, from the `<html lang>` attribute or the `language` meta tag.
    pub language: Option<String>,
}