halldyll_core/parse/
jsonld.rs1use scraper::{Html, Selector};
4use serde_json::Value;
5
/// Extractor for JSON-LD structured data embedded in HTML documents.
///
/// The type carries no state, so `JsonLdExtractor::default()` and
/// `JsonLdExtractor::new()` are interchangeable and instances are free to
/// create, clone and drop.
#[derive(Debug, Clone, Default)]
pub struct JsonLdExtractor;
14
15impl JsonLdExtractor {
16 pub fn new() -> Self {
18 Self
19 }
20
21 pub fn extract(&self, html: &str) -> Vec<Value> {
23 let document = Html::parse_document(html);
24 let selector = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
25
26 document
27 .select(&selector)
28 .filter_map(|script| {
29 let text = script.text().collect::<Vec<_>>().join("");
30 serde_json::from_str(&text).ok()
31 })
32 .collect()
33 }
34
35 pub fn extract_by_type(&self, html: &str, schema_type: &str) -> Vec<Value> {
37 self.extract(html)
38 .into_iter()
39 .filter(|v| self.matches_type(v, schema_type))
40 .collect()
41 }
42
43 fn matches_type(&self, value: &Value, schema_type: &str) -> bool {
45 if let Some(t) = value.get("@type") {
46 match t {
47 Value::String(s) => s == schema_type,
48 Value::Array(arr) => arr.iter().any(|v| {
49 v.as_str().map(|s| s == schema_type).unwrap_or(false)
50 }),
51 _ => false,
52 }
53 } else if let Some(graph) = value.get("@graph") {
54 if let Value::Array(items) = graph {
56 items.iter().any(|item| self.matches_type(item, schema_type))
57 } else {
58 false
59 }
60 } else {
61 false
62 }
63 }
64
65 pub fn extract_faqs(&self, html: &str) -> Vec<FaqItem> {
67 let json_lds = self.extract_by_type(html, "FAQPage");
68 let mut faqs = Vec::new();
69
70 for json_ld in json_lds {
71 if let Some(main_entity) = json_ld.get("mainEntity") {
72 if let Value::Array(questions) = main_entity {
73 for q in questions {
74 if let (Some(question), Some(answer)) = (
75 q.get("name").and_then(|v| v.as_str()),
76 q.get("acceptedAnswer")
77 .and_then(|a| a.get("text"))
78 .and_then(|v| v.as_str()),
79 ) {
80 faqs.push(FaqItem {
81 question: question.to_string(),
82 answer: answer.to_string(),
83 });
84 }
85 }
86 }
87 }
88 }
89
90 faqs
91 }
92
93 pub fn extract_howtos(&self, html: &str) -> Vec<HowToItem> {
95 let json_lds = self.extract_by_type(html, "HowTo");
96 let mut howtos = Vec::new();
97
98 for json_ld in json_lds {
99 let name = json_ld.get("name").and_then(|v| v.as_str()).map(String::from);
100 let description = json_ld.get("description").and_then(|v| v.as_str()).map(String::from);
101
102 let mut steps = Vec::new();
103 if let Some(Value::Array(step_list)) = json_ld.get("step") {
104 for (i, step) in step_list.iter().enumerate() {
105 let step_name = step.get("name")
106 .or_else(|| step.get("text"))
107 .and_then(|v| v.as_str())
108 .map(String::from);
109
110 if let Some(text) = step_name {
111 steps.push(HowToStep {
112 position: i + 1,
113 text,
114 image: step.get("image").and_then(|v| v.as_str()).map(String::from),
115 });
116 }
117 }
118 }
119
120 howtos.push(HowToItem {
121 name,
122 description,
123 steps,
124 });
125 }
126
127 howtos
128 }
129
130 pub fn extract_articles(&self, html: &str) -> Vec<ArticleSchema> {
132 let json_lds = self.extract_by_type(html, "Article");
133 let mut articles = Vec::new();
134
135 for json_ld in json_lds {
136 articles.push(ArticleSchema {
137 headline: json_ld.get("headline").and_then(|v| v.as_str()).map(String::from),
138 author: self.extract_person_name(&json_ld, "author"),
139 date_published: json_ld.get("datePublished").and_then(|v| v.as_str()).map(String::from),
140 date_modified: json_ld.get("dateModified").and_then(|v| v.as_str()).map(String::from),
141 description: json_ld.get("description").and_then(|v| v.as_str()).map(String::from),
142 image: json_ld.get("image").and_then(|v| {
143 match v {
144 Value::String(s) => Some(s.clone()),
145 Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from),
146 Value::Array(a) => a.first().and_then(|i| i.as_str()).map(String::from),
147 _ => None,
148 }
149 }),
150 });
151 }
152
153 articles
154 }
155
156 fn extract_person_name(&self, json_ld: &Value, field: &str) -> Option<String> {
158 json_ld.get(field).and_then(|author| {
159 match author {
160 Value::String(s) => Some(s.clone()),
161 Value::Object(o) => o.get("name").and_then(|v| v.as_str()).map(String::from),
162 Value::Array(a) => a.first().and_then(|p| {
163 match p {
164 Value::String(s) => Some(s.clone()),
165 Value::Object(o) => o.get("name").and_then(|v| v.as_str()).map(String::from),
166 _ => None,
167 }
168 }),
169 _ => None,
170 }
171 })
172 }
173}
174
/// A single question/answer pair taken from an `FAQPage` JSON-LD node.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FaqItem {
    /// The question text (the `name` of the question entry).
    pub question: String,
    /// The answer text (the `text` of the entry's `acceptedAnswer`).
    pub answer: String,
}
183
/// A `HowTo` JSON-LD node together with its extracted steps.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HowToItem {
    /// The `name` of the how-to, when present as a string.
    pub name: Option<String>,
    /// The `description` of the how-to, when present as a string.
    pub description: Option<String>,
    /// The steps that had usable text, in source order.
    pub steps: Vec<HowToStep>,
}

/// One step of a [`HowToItem`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HowToStep {
    /// 1-based index of the step in the source step list.
    pub position: usize,
    /// The step text (its `name`, or its `text` as a fallback).
    pub text: String,
    /// Image URL for the step, if one was found.
    pub image: Option<String>,
}
205
/// Metadata taken from an `Article` JSON-LD node.
///
/// Every field is optional because each is filled only when the matching
/// property is present in a supported shape.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ArticleSchema {
    /// The `headline` property.
    pub headline: Option<String>,
    /// The author's name, resolved from the `author` property.
    pub author: Option<String>,
    /// The `datePublished` property, kept as the raw string (not parsed).
    pub date_published: Option<String>,
    /// The `dateModified` property, kept as the raw string (not parsed).
    pub date_modified: Option<String>,
    /// The `description` property.
    pub description: Option<String>,
    /// URL of the article image, if one was found.
    pub image: Option<String>,
}