//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll - high-performance async web scraper for AI agents.
//!
//! # Documentation
//!
//! OpenGraph - OpenGraph metadata extraction

use scraper::{Html, Selector};
use std::collections::HashMap;

use crate::types::document::OpenGraph;

/// Stateless extractor for OpenGraph (`og:*`) and Twitter Card metadata.
///
/// Zero-sized; construct via [`OpenGraphExtractor::new`] or `Default`.
#[derive(Debug, Clone, Copy)]
pub struct OpenGraphExtractor;

impl Default for OpenGraphExtractor {
    /// Returns the unit extractor; equivalent to [`OpenGraphExtractor::new`].
    fn default() -> Self {
        Self
    }
}

impl OpenGraphExtractor {
    /// Creates a new extractor.
    pub fn new() -> Self {
        Self
    }

    /// Extracts all OpenGraph (`og:*`) metadata from an HTML document.
    ///
    /// Known properties are mapped onto the corresponding [`OpenGraph`]
    /// fields; unrecognized `og:*` properties are collected into the
    /// `extra` map. When a property is repeated (e.g. multiple
    /// `og:image` tags), the last occurrence wins. Dimension values that
    /// fail to parse as numbers are silently dropped.
    pub fn extract(&self, html: &str) -> OpenGraph {
        let document = Html::parse_document(html);
        // Static selector: parse failure would be a programming error.
        let selector = Selector::parse(r#"meta[property^="og:"]"#)
            .expect("static og: selector is valid");

        let mut og = OpenGraph::default();
        let mut extra = HashMap::new();

        for meta in document.select(&selector) {
            // The selector guarantees the "og:" prefix, but keep the
            // defensive fallback to the raw value.
            let property = match meta.value().attr("property") {
                Some(p) => p.strip_prefix("og:").unwrap_or(p),
                None => continue,
            };
            // Tags without a content attribute carry no value.
            let content = match meta.value().attr("content") {
                Some(c) => c.to_string(),
                None => continue,
            };

            match property {
                "title" => og.title = Some(content),
                "type" => og.og_type = Some(content),
                "url" => og.url = Some(content),
                "description" => og.description = Some(content),
                "image" => og.image = Some(content),
                "image:width" => og.image_width = content.parse().ok(),
                "image:height" => og.image_height = content.parse().ok(),
                "video" => og.video = Some(content),
                "video:type" => og.video_type = Some(content),
                "video:width" => og.video_width = content.parse().ok(),
                "video:height" => og.video_height = content.parse().ok(),
                "audio" => og.audio = Some(content),
                "site_name" => og.site_name = Some(content),
                "locale" => og.locale = Some(content),
                _ => {
                    extra.insert(property.to_string(), content);
                }
            }
        }

        og.extra = extra;
        og
    }

    /// Extracts Twitter Card (`twitter:*`) metadata.
    ///
    /// Twitter Cards are canonically declared with `name="twitter:..."`,
    /// but many sites emit them OpenGraph-style with
    /// `property="twitter:..."` instead; both forms are accepted. When a
    /// field is repeated, the last occurrence wins.
    pub fn extract_twitter_cards(&self, html: &str) -> TwitterCard {
        let document = Html::parse_document(html);
        // Match twitter: tags declared via either attribute.
        let selector =
            Selector::parse(r#"meta[name^="twitter:"], meta[property^="twitter:"]"#)
                .expect("static twitter: selector is valid");

        let mut card = TwitterCard::default();

        for meta in document.select(&selector) {
            // Prefer the canonical `name` attribute, then fall back to
            // `property`; skip tags where neither carries the prefix.
            let name = match meta
                .value()
                .attr("name")
                .or_else(|| meta.value().attr("property"))
                .and_then(|n| n.strip_prefix("twitter:"))
            {
                Some(n) => n,
                None => continue,
            };
            let content = match meta.value().attr("content") {
                Some(c) => c.to_string(),
                None => continue,
            };

            match name {
                "card" => card.card = Some(content),
                "site" => card.site = Some(content),
                "creator" => card.creator = Some(content),
                "title" => card.title = Some(content),
                "description" => card.description = Some(content),
                "image" => card.image = Some(content),
                "image:alt" => card.image_alt = Some(content),
                // Unknown twitter: fields are intentionally ignored.
                _ => {}
            }
        }

        card
    }
}

/// Twitter Card metadata extracted from `twitter:*` meta tags.
///
/// All fields are optional: a page may declare any subset of the card
/// properties. Derives equality so callers can compare/deduplicate cards.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct TwitterCard {
    /// Card type (summary, summary_large_image, player, app)
    pub card: Option<String>,
    /// Site @username
    pub site: Option<String>,
    /// Creator @username
    pub creator: Option<String>,
    /// Title
    pub title: Option<String>,
    /// Description
    pub description: Option<String>,
    /// Image URL
    pub image: Option<String>,
    /// Image alt text
    pub image_alt: Option<String>,
}