halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! JSON-LD structured data extraction (schema.org `FAQPage`, `HowTo` and `Article`)

use scraper::{Html, Selector};
use serde_json::Value;

/// Stateless extractor for JSON-LD structured data embedded in HTML.
///
/// The type carries no configuration, so `JsonLdExtractor::default()` and
/// `JsonLdExtractor::new()` are interchangeable.
#[derive(Debug, Clone, Default)]
pub struct JsonLdExtractor;

impl JsonLdExtractor {
    /// New extractor
    pub fn new() -> Self {
        Self
    }

    /// Extract all JSON-LD blocks
    pub fn extract(&self, html: &str) -> Vec<Value> {
        let document = Html::parse_document(html);
        let selector = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
        
        document
            .select(&selector)
            .filter_map(|script| {
                let text = script.text().collect::<Vec<_>>().join("");
                serde_json::from_str(&text).ok()
            })
            .collect()
    }

    /// Extract JSON-LD of a specific type
    pub fn extract_by_type(&self, html: &str, schema_type: &str) -> Vec<Value> {
        self.extract(html)
            .into_iter()
            .filter(|v| self.matches_type(v, schema_type))
            .collect()
    }

    /// Check if a JSON-LD matches a type
    fn matches_type(&self, value: &Value, schema_type: &str) -> bool {
        if let Some(t) = value.get("@type") {
            match t {
                Value::String(s) => s == schema_type,
                Value::Array(arr) => arr.iter().any(|v| {
                    v.as_str().map(|s| s == schema_type).unwrap_or(false)
                }),
                _ => false,
            }
        } else if let Some(graph) = value.get("@graph") {
            // Check in @graph
            if let Value::Array(items) = graph {
                items.iter().any(|item| self.matches_type(item, schema_type))
            } else {
                false
            }
        } else {
            false
        }
    }

    /// Extract FAQs (schema.org/FAQPage)
    pub fn extract_faqs(&self, html: &str) -> Vec<FaqItem> {
        let json_lds = self.extract_by_type(html, "FAQPage");
        let mut faqs = Vec::new();

        for json_ld in json_lds {
            if let Some(main_entity) = json_ld.get("mainEntity") {
                if let Value::Array(questions) = main_entity {
                    for q in questions {
                        if let (Some(question), Some(answer)) = (
                            q.get("name").and_then(|v| v.as_str()),
                            q.get("acceptedAnswer")
                                .and_then(|a| a.get("text"))
                                .and_then(|v| v.as_str()),
                        ) {
                            faqs.push(FaqItem {
                                question: question.to_string(),
                                answer: answer.to_string(),
                            });
                        }
                    }
                }
            }
        }

        faqs
    }

    /// Extract HowTos (schema.org/HowTo)
    pub fn extract_howtos(&self, html: &str) -> Vec<HowToItem> {
        let json_lds = self.extract_by_type(html, "HowTo");
        let mut howtos = Vec::new();

        for json_ld in json_lds {
            let name = json_ld.get("name").and_then(|v| v.as_str()).map(String::from);
            let description = json_ld.get("description").and_then(|v| v.as_str()).map(String::from);
            
            let mut steps = Vec::new();
            if let Some(Value::Array(step_list)) = json_ld.get("step") {
                for (i, step) in step_list.iter().enumerate() {
                    let step_name = step.get("name")
                        .or_else(|| step.get("text"))
                        .and_then(|v| v.as_str())
                        .map(String::from);
                    
                    if let Some(text) = step_name {
                        steps.push(HowToStep {
                            position: i + 1,
                            text,
                            image: step.get("image").and_then(|v| v.as_str()).map(String::from),
                        });
                    }
                }
            }

            howtos.push(HowToItem {
                name,
                description,
                steps,
            });
        }

        howtos
    }

    /// Extract articles (schema.org/Article)
    pub fn extract_articles(&self, html: &str) -> Vec<ArticleSchema> {
        let json_lds = self.extract_by_type(html, "Article");
        let mut articles = Vec::new();

        for json_ld in json_lds {
            articles.push(ArticleSchema {
                headline: json_ld.get("headline").and_then(|v| v.as_str()).map(String::from),
                author: self.extract_person_name(&json_ld, "author"),
                date_published: json_ld.get("datePublished").and_then(|v| v.as_str()).map(String::from),
                date_modified: json_ld.get("dateModified").and_then(|v| v.as_str()).map(String::from),
                description: json_ld.get("description").and_then(|v| v.as_str()).map(String::from),
                image: json_ld.get("image").and_then(|v| {
                    match v {
                        Value::String(s) => Some(s.clone()),
                        Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from),
                        Value::Array(a) => a.first().and_then(|i| i.as_str()).map(String::from),
                        _ => None,
                    }
                }),
            });
        }

        articles
    }

    /// Extract person name
    fn extract_person_name(&self, json_ld: &Value, field: &str) -> Option<String> {
        json_ld.get(field).and_then(|author| {
            match author {
                Value::String(s) => Some(s.clone()),
                Value::Object(o) => o.get("name").and_then(|v| v.as_str()).map(String::from),
                Value::Array(a) => a.first().and_then(|p| {
                    match p {
                        Value::String(s) => Some(s.clone()),
                        Value::Object(o) => o.get("name").and_then(|v| v.as_str()).map(String::from),
                        _ => None,
                    }
                }),
                _ => None,
            }
        })
    }
}

/// A single question/answer pair taken from `FAQPage` structured data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FaqItem {
    /// Question text (the `name` of the FAQ entry)
    pub question: String,
    /// Answer text (the `text` of the entry's `acceptedAnswer`)
    pub answer: String,
}

/// A `HowTo` entry taken from structured data.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HowToItem {
    /// HowTo name/title, if present
    pub name: Option<String>,
    /// HowTo description, if present
    pub description: Option<String>,
    /// Steps of the HowTo, in source order
    pub steps: Vec<HowToStep>,
}

/// A single step of a [`HowToItem`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HowToStep {
    /// Step position (1-based)
    pub position: usize,
    /// Step instruction text
    pub text: String,
    /// Optional image URL for this step
    pub image: Option<String>,
}

/// Article metadata taken from `Article` structured data.
///
/// Every field is optional; `ArticleSchema::default()` is the all-`None` value.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ArticleSchema {
    /// Article headline
    pub headline: Option<String>,
    /// Article author name
    pub author: Option<String>,
    /// Publication date, as the raw string found in the source data
    pub date_published: Option<String>,
    /// Last modification date, as the raw string found in the source data
    pub date_modified: Option<String>,
    /// Article description
    pub description: Option<String>,
    /// Article main image URL
    pub image: Option<String>,
}