halldyll_core/parse/
jsonld.rs

1//! JSON-LD - JSON-LD structured data extraction
2
3use scraper::{Html, Selector};
4use serde_json::Value;
5
/// Stateless extractor for JSON-LD structured data embedded in HTML.
///
/// The type carries no fields, so it is free to copy and every instance is
/// equal to every other one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct JsonLdExtractor;
8
9impl Default for JsonLdExtractor {
10    fn default() -> Self {
11        Self
12    }
13}
14
15impl JsonLdExtractor {
16    /// New extractor
17    pub fn new() -> Self {
18        Self
19    }
20
21    /// Extract all JSON-LD blocks
22    pub fn extract(&self, html: &str) -> Vec<Value> {
23        let document = Html::parse_document(html);
24        let selector = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
25        
26        document
27            .select(&selector)
28            .filter_map(|script| {
29                let text = script.text().collect::<Vec<_>>().join("");
30                serde_json::from_str(&text).ok()
31            })
32            .collect()
33    }
34
35    /// Extract JSON-LD of a specific type
36    pub fn extract_by_type(&self, html: &str, schema_type: &str) -> Vec<Value> {
37        self.extract(html)
38            .into_iter()
39            .filter(|v| self.matches_type(v, schema_type))
40            .collect()
41    }
42
43    /// Check if a JSON-LD matches a type
44    fn matches_type(&self, value: &Value, schema_type: &str) -> bool {
45        if let Some(t) = value.get("@type") {
46            match t {
47                Value::String(s) => s == schema_type,
48                Value::Array(arr) => arr.iter().any(|v| {
49                    v.as_str().map(|s| s == schema_type).unwrap_or(false)
50                }),
51                _ => false,
52            }
53        } else if let Some(graph) = value.get("@graph") {
54            // Check in @graph
55            if let Value::Array(items) = graph {
56                items.iter().any(|item| self.matches_type(item, schema_type))
57            } else {
58                false
59            }
60        } else {
61            false
62        }
63    }
64
65    /// Extract FAQs (schema.org/FAQPage)
66    pub fn extract_faqs(&self, html: &str) -> Vec<FaqItem> {
67        let json_lds = self.extract_by_type(html, "FAQPage");
68        let mut faqs = Vec::new();
69
70        for json_ld in json_lds {
71            if let Some(main_entity) = json_ld.get("mainEntity") {
72                if let Value::Array(questions) = main_entity {
73                    for q in questions {
74                        if let (Some(question), Some(answer)) = (
75                            q.get("name").and_then(|v| v.as_str()),
76                            q.get("acceptedAnswer")
77                                .and_then(|a| a.get("text"))
78                                .and_then(|v| v.as_str()),
79                        ) {
80                            faqs.push(FaqItem {
81                                question: question.to_string(),
82                                answer: answer.to_string(),
83                            });
84                        }
85                    }
86                }
87            }
88        }
89
90        faqs
91    }
92
93    /// Extract HowTos (schema.org/HowTo)
94    pub fn extract_howtos(&self, html: &str) -> Vec<HowToItem> {
95        let json_lds = self.extract_by_type(html, "HowTo");
96        let mut howtos = Vec::new();
97
98        for json_ld in json_lds {
99            let name = json_ld.get("name").and_then(|v| v.as_str()).map(String::from);
100            let description = json_ld.get("description").and_then(|v| v.as_str()).map(String::from);
101            
102            let mut steps = Vec::new();
103            if let Some(Value::Array(step_list)) = json_ld.get("step") {
104                for (i, step) in step_list.iter().enumerate() {
105                    let step_name = step.get("name")
106                        .or_else(|| step.get("text"))
107                        .and_then(|v| v.as_str())
108                        .map(String::from);
109                    
110                    if let Some(text) = step_name {
111                        steps.push(HowToStep {
112                            position: i + 1,
113                            text,
114                            image: step.get("image").and_then(|v| v.as_str()).map(String::from),
115                        });
116                    }
117                }
118            }
119
120            howtos.push(HowToItem {
121                name,
122                description,
123                steps,
124            });
125        }
126
127        howtos
128    }
129
130    /// Extract articles (schema.org/Article)
131    pub fn extract_articles(&self, html: &str) -> Vec<ArticleSchema> {
132        let json_lds = self.extract_by_type(html, "Article");
133        let mut articles = Vec::new();
134
135        for json_ld in json_lds {
136            articles.push(ArticleSchema {
137                headline: json_ld.get("headline").and_then(|v| v.as_str()).map(String::from),
138                author: self.extract_person_name(&json_ld, "author"),
139                date_published: json_ld.get("datePublished").and_then(|v| v.as_str()).map(String::from),
140                date_modified: json_ld.get("dateModified").and_then(|v| v.as_str()).map(String::from),
141                description: json_ld.get("description").and_then(|v| v.as_str()).map(String::from),
142                image: json_ld.get("image").and_then(|v| {
143                    match v {
144                        Value::String(s) => Some(s.clone()),
145                        Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from),
146                        Value::Array(a) => a.first().and_then(|i| i.as_str()).map(String::from),
147                        _ => None,
148                    }
149                }),
150            });
151        }
152
153        articles
154    }
155
156    /// Extract person name
157    fn extract_person_name(&self, json_ld: &Value, field: &str) -> Option<String> {
158        json_ld.get(field).and_then(|author| {
159            match author {
160                Value::String(s) => Some(s.clone()),
161                Value::Object(o) => o.get("name").and_then(|v| v.as_str()).map(String::from),
162                Value::Array(a) => a.first().and_then(|p| {
163                    match p {
164                        Value::String(s) => Some(s.clone()),
165                        Value::Object(o) => o.get("name").and_then(|v| v.as_str()).map(String::from),
166                        _ => None,
167                    }
168                }),
169                _ => None,
170            }
171        })
172    }
173}
174
/// A single question/answer pair taken from FAQ structured data.
///
/// Compared field by field, so extraction results can be asserted on directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FaqItem {
    /// Question text
    pub question: String,
    /// Answer text
    pub answer: String,
}
183
/// A HowTo taken from structured data.
///
/// `Default` yields an item with no name, no description and no steps.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HowToItem {
    /// HowTo name/title
    pub name: Option<String>,
    /// HowTo description
    pub description: Option<String>,
    /// Steps in the HowTo
    pub steps: Vec<HowToStep>,
}

/// A single step of a [`HowToItem`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HowToStep {
    /// Step position (1-based)
    pub position: usize,
    /// Step instruction text
    pub text: String,
    /// Optional image URL for this step
    pub image: Option<String>,
}
205
/// Article metadata taken from structured data.
///
/// Every field is optional; `Default` yields a value with all fields `None`,
/// which makes struct-update syntax convenient when only some are known.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ArticleSchema {
    /// Article headline
    pub headline: Option<String>,
    /// Article author
    pub author: Option<String>,
    /// Publication date
    pub date_published: Option<String>,
    /// Last modification date
    pub date_modified: Option<String>,
    /// Article description
    pub description: Option<String>,
    /// Article main image URL
    pub image: Option<String>,
}