1use scraper::{Html, Selector};
9use serde_json::Value;
10use url::Url;
11
/// Deepest nesting level `walk` will visit inside a JSON-LD value; anything
/// nested further than this is ignored.
const MAX_DEPTH: usize = 8;

/// Schema.org `@type` names in descending priority order. `pick_primary`
/// selects the first collected node matching the earliest entry here, and that
/// node's fields populate the extracted metadata.
const PRIMARY_TYPES: &[&str] = &[
    "Article",
    "NewsArticle",
    "BlogPosting",
    "WebPage",
    "Product",
];
21
22#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
23pub struct ExtractedMetadata {
24 pub title: Option<String>,
25 pub description: Option<String>,
26 pub author: Option<String>,
27 pub published: Option<String>,
28 pub modified: Option<String>,
29 pub image: Option<String>,
30 pub og_type: Option<String>,
31 pub canonical: Option<String>,
32 pub language: Option<String>,
33 pub schema_types: Vec<String>,
35}
36
37impl ExtractedMetadata {
38 pub fn is_empty(&self) -> bool {
39 self.title.is_none()
40 && self.description.is_none()
41 && self.author.is_none()
42 && self.published.is_none()
43 && self.modified.is_none()
44 && self.image.is_none()
45 && self.og_type.is_none()
46 && self.canonical.is_none()
47 && self.language.is_none()
48 && self.schema_types.is_empty()
49 }
50
51 fn merge_in(&mut self, other: ExtractedMetadata) {
53 if self.title.is_none() {
54 self.title = other.title;
55 }
56 if self.description.is_none() {
57 self.description = other.description;
58 }
59 if self.author.is_none() {
60 self.author = other.author;
61 }
62 if self.published.is_none() {
63 self.published = other.published;
64 }
65 if self.modified.is_none() {
66 self.modified = other.modified;
67 }
68 if self.image.is_none() {
69 self.image = other.image;
70 }
71 if self.og_type.is_none() {
72 self.og_type = other.og_type;
73 }
74 if self.canonical.is_none() {
75 self.canonical = other.canonical;
76 }
77 if self.language.is_none() {
78 self.language = other.language;
79 }
80 for t in other.schema_types {
81 if !self.schema_types.contains(&t) {
82 self.schema_types.push(t);
83 }
84 }
85 }
86}
87
88pub fn extract(html: &str, base: &Url) -> ExtractedMetadata {
89 let doc = Html::parse_document(html);
90 let mut out = ExtractedMetadata::default();
91 out.merge_in(extract_jsonld(&doc));
92 out.merge_in(extract_open_graph(&doc));
93 out.merge_in(extract_twitter(&doc));
94 out.merge_in(extract_meta_description(&doc));
95 out.merge_in(extract_html_lang(&doc));
96 out.merge_in(extract_canonical(&doc, base));
97 out
98}
99
100fn meta_content(doc: &Html, sel: &str) -> Option<String> {
101 let selector = Selector::parse(sel).ok()?;
102 doc.select(&selector)
103 .next()
104 .and_then(|el| el.value().attr("content"))
105 .map(|s| s.to_string())
106 .filter(|s| !s.is_empty())
107}
108
109fn extract_open_graph(doc: &Html) -> ExtractedMetadata {
110 ExtractedMetadata {
111 title: meta_content(doc, r#"meta[property="og:title"]"#),
112 description: meta_content(doc, r#"meta[property="og:description"]"#),
113 image: meta_content(doc, r#"meta[property="og:image"]"#),
114 og_type: meta_content(doc, r#"meta[property="og:type"]"#),
115 published: meta_content(doc, r#"meta[property="article:published_time"]"#),
116 modified: meta_content(doc, r#"meta[property="article:modified_time"]"#),
117 author: meta_content(doc, r#"meta[property="article:author"]"#),
118 ..Default::default()
119 }
120}
121
122fn extract_twitter(doc: &Html) -> ExtractedMetadata {
123 ExtractedMetadata {
124 title: meta_content(doc, r#"meta[name="twitter:title"]"#),
125 description: meta_content(doc, r#"meta[name="twitter:description"]"#),
126 image: meta_content(doc, r#"meta[name="twitter:image"]"#),
127 ..Default::default()
128 }
129}
130
131fn extract_meta_description(doc: &Html) -> ExtractedMetadata {
132 ExtractedMetadata {
133 description: meta_content(doc, r#"meta[name="description"]"#),
134 ..Default::default()
135 }
136}
137
138fn extract_html_lang(doc: &Html) -> ExtractedMetadata {
139 let selector = Selector::parse("html").unwrap();
140 let language = doc
141 .select(&selector)
142 .next()
143 .and_then(|el| el.value().attr("lang"))
144 .map(|s| s.to_string())
145 .filter(|s| !s.is_empty());
146 ExtractedMetadata {
147 language,
148 ..Default::default()
149 }
150}
151
152fn extract_canonical(doc: &Html, base: &Url) -> ExtractedMetadata {
153 let selector = Selector::parse(r#"link[rel="canonical"]"#).unwrap();
154 let canonical = doc
155 .select(&selector)
156 .next()
157 .and_then(|el| el.value().attr("href"))
158 .and_then(|href| base.join(href).ok())
159 .map(|u| u.to_string());
160 ExtractedMetadata {
161 canonical,
162 ..Default::default()
163 }
164}
165
166fn extract_jsonld(doc: &Html) -> ExtractedMetadata {
167 let mut out = ExtractedMetadata::default();
168 let selector = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
169
170 let mut nodes_with_type: Vec<Value> = Vec::new();
172 let mut all_types: Vec<String> = Vec::new();
173
174 for el in doc.select(&selector) {
175 let text = el.text().collect::<String>();
176 let value: Value = match serde_json::from_str(&text) {
177 Ok(v) => v,
178 Err(e) => {
179 tracing::warn!(target: "rover::extractor", err = %e, "malformed JSON-LD block; skipping");
180 continue;
181 }
182 };
183 walk(&value, 0, &mut nodes_with_type, &mut all_types);
184 }
185
186 let primary = pick_primary(&nodes_with_type);
188 if let Some(node) = primary {
189 out.title = scalar(node, "headline").or_else(|| scalar(node, "name"));
190 out.description = scalar(node, "description");
191 out.author = scalar_or_person_name(node, "author");
192 out.published = scalar(node, "datePublished");
193 out.modified = scalar(node, "dateModified");
194 out.image = scalar_or_image_url(node, "image");
195 }
196
197 for t in all_types {
198 if !out.schema_types.contains(&t) {
199 out.schema_types.push(t);
200 }
201 }
202 out
203}
204
205fn walk(v: &Value, depth: usize, nodes: &mut Vec<Value>, all_types: &mut Vec<String>) {
218 if depth > MAX_DEPTH {
219 return;
220 }
221 match v {
222 Value::Object(map) => {
223 let typed = map.get("@type").map(type_names).unwrap_or_default();
224 if !typed.is_empty() {
225 nodes.push(v.clone());
226 for n in typed {
227 all_types.push(n);
228 }
229 if let Some(graph) = map.get("@graph") {
233 walk(graph, depth + 1, nodes, all_types);
234 }
235 } else {
236 for (_k, child) in map {
239 walk(child, depth + 1, nodes, all_types);
240 }
241 }
242 }
243 Value::Array(items) => {
244 for item in items {
245 walk(item, depth + 1, nodes, all_types);
246 }
247 }
248 _ => {}
249 }
250}
251
252fn type_names(t: &Value) -> Vec<String> {
253 match t {
254 Value::String(s) => vec![s.clone()],
255 Value::Array(items) => items
256 .iter()
257 .filter_map(|v| v.as_str().map(|s| s.to_string()))
258 .collect(),
259 _ => Vec::new(),
260 }
261}
262
263fn pick_primary(nodes: &[Value]) -> Option<&Value> {
264 for want in PRIMARY_TYPES {
265 for n in nodes {
266 if type_names(&n["@type"]).iter().any(|s| s == *want) {
267 return Some(n);
268 }
269 }
270 }
271 nodes.first()
272}
273
274fn scalar(node: &Value, key: &str) -> Option<String> {
275 node.get(key)
276 .and_then(|v| v.as_str())
277 .filter(|s| !s.is_empty())
278 .map(|s| s.to_string())
279}
280
281fn scalar_or_person_name(node: &Value, key: &str) -> Option<String> {
282 let v = node.get(key)?;
283 if let Some(s) = v.as_str() {
284 return (!s.is_empty()).then(|| s.to_string());
285 }
286 if let Some(obj) = v.as_object()
287 && let Some(name) = obj.get("name").and_then(|n| n.as_str())
288 {
289 return Some(name.to_string());
290 }
291 if let Some(arr) = v.as_array() {
292 for item in arr {
293 if let Some(name) = item.as_str() {
294 return Some(name.to_string());
295 }
296 if let Some(name) = item.get("name").and_then(|n| n.as_str()) {
297 return Some(name.to_string());
298 }
299 }
300 }
301 None
302}
303
304fn scalar_or_image_url(node: &Value, key: &str) -> Option<String> {
305 let v = node.get(key)?;
306 if let Some(s) = v.as_str() {
307 return (!s.is_empty()).then(|| s.to_string());
308 }
309 if let Some(obj) = v.as_object() {
310 return obj.get("url").and_then(|u| u.as_str()).map(String::from);
311 }
312 if let Some(arr) = v.as_array() {
313 for item in arr {
314 if let Some(s) = item.as_str() {
315 return Some(s.to_string());
316 }
317 if let Some(u) = item.get("url").and_then(|u| u.as_str()) {
318 return Some(u.to_string());
319 }
320 }
321 }
322 None
323}
324
/// Tests for the JSON-LD extraction path, driven through `extract`.
#[cfg(test)]
mod jsonld_tests {
    use super::*;
    use url::Url;

    /// Base URL handed to `extract` by every test in this module.
    fn base() -> Url {
        Url::parse("https://example.com/article").unwrap()
    }

    /// A single top-level `Article` node with every scalar field populated.
    const ARTICLE_HTML: &str = r#"<!doctype html><html><head>
        <script type="application/ld+json">
        {
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "Title from JSON-LD",
            "description": "Desc from JSON-LD",
            "author": {"@type":"Person","name":"Ada Lovelace"},
            "datePublished": "2026-01-01T00:00:00Z",
            "dateModified": "2026-02-01T00:00:00Z",
            "image": "https://example.com/og.png"
        }
        </script></head><body></body></html>"#;

    #[test]
    fn extracts_article_scalar_fields() {
        let meta = extract(ARTICLE_HTML, &base());

        let expectations = [
            (&meta.title, "Title from JSON-LD"),
            (&meta.description, "Desc from JSON-LD"),
            (&meta.author, "Ada Lovelace"),
            (&meta.published, "2026-01-01T00:00:00Z"),
            (&meta.modified, "2026-02-01T00:00:00Z"),
            (&meta.image, "https://example.com/og.png"),
        ];
        for (actual, expected) in expectations {
            assert_eq!(actual.as_deref(), Some(expected));
        }
        // The nested author `Person` type is not reported.
        assert_eq!(meta.schema_types, vec!["Article".to_string()]);
    }

    /// A `@graph` holding a `WebPage` followed by a `NewsArticle`.
    const GRAPH_HTML: &str = r#"<!doctype html><html><head>
        <script type="application/ld+json">
        {"@context":"https://schema.org","@graph":[
            {"@type":"WebPage","name":"Should be skipped"},
            {"@type":"NewsArticle","headline":"News title","author":"Reuters"}
        ]}
        </script></head><body></body></html>"#;

    #[test]
    fn prefers_article_like_type_in_graph() {
        let meta = extract(GRAPH_HTML, &base());

        assert_eq!(meta.title.as_deref(), Some("News title"));
        assert_eq!(meta.author.as_deref(), Some("Reuters"));
        for expected in ["WebPage", "NewsArticle"] {
            assert!(meta.schema_types.contains(&expected.to_string()));
        }
    }

    #[test]
    fn depth_cap_does_not_stack_overflow() {
        // A typed leaf buried under 20 untyped `nested` wrappers, inside `@graph`.
        let wrappers = 20;
        let payload = format!(
            r#"{{"@graph":[{open}{{"@type":"Leaf"}}{close}]}}"#,
            open = r#"{"nested":"#.repeat(wrappers),
            close = "}".repeat(wrappers),
        );
        let html = format!(
            r#"<!doctype html><html><head><script type="application/ld+json">{payload}</script></head><body></body></html>"#
        );

        let meta = extract(&html, &base());

        assert!(
            meta.schema_types.is_empty(),
            "expected cap to prevent deep walk, got {:?}",
            meta.schema_types
        );
    }

    #[test]
    fn malformed_jsonld_does_not_panic() {
        let html = r#"<!doctype html><html><head>
            <script type="application/ld+json">{ this is not json }</script>
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert!(meta.is_empty());
    }
}
409
/// Tests for the meta-tag sources (Open Graph, Twitter cards, plain
/// description, `<html lang>`, canonical link) and their precedence.
#[cfg(test)]
mod og_twitter_tests {
    use super::*;
    use url::Url;

    /// Base URL handed to `extract` by every test in this module.
    fn base() -> Url {
        Url::parse("https://example.com/").unwrap()
    }

    #[test]
    fn reads_open_graph_metatags() {
        let html = r#"<!doctype html><html lang="en"><head>
            <meta property="og:title" content="OG Title">
            <meta property="og:description" content="OG Desc">
            <meta property="og:image" content="https://x/og.png">
            <meta property="og:type" content="article">
            <meta property="article:published_time" content="2026-03-01T00:00:00Z">
            <meta property="article:modified_time" content="2026-03-02T00:00:00Z">
            <meta property="article:author" content="Grace Hopper">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        let expectations = [
            (&meta.title, "OG Title"),
            (&meta.description, "OG Desc"),
            (&meta.image, "https://x/og.png"),
            (&meta.og_type, "article"),
            (&meta.published, "2026-03-01T00:00:00Z"),
            (&meta.modified, "2026-03-02T00:00:00Z"),
            (&meta.author, "Grace Hopper"),
            (&meta.language, "en"),
        ];
        for (actual, expected) in expectations {
            assert_eq!(actual.as_deref(), Some(expected));
        }
    }

    #[test]
    fn twitter_fills_holes_left_by_og() {
        let html = r#"<!doctype html><html><head>
            <meta name="twitter:title" content="Twitter Title">
            <meta name="twitter:description" content="Twitter Desc">
            <meta name="twitter:image" content="https://x/tc.png">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert_eq!(meta.title.as_deref(), Some("Twitter Title"));
        assert_eq!(meta.description.as_deref(), Some("Twitter Desc"));
        assert_eq!(meta.image.as_deref(), Some("https://x/tc.png"));
    }

    #[test]
    fn jsonld_wins_over_og_wins_over_twitter() {
        let html = r#"<!doctype html><html><head>
            <script type="application/ld+json">
            {"@type":"Article","headline":"JSON-LD Title"}
            </script>
            <meta property="og:title" content="OG Title">
            <meta name="twitter:title" content="Twitter Title">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert_eq!(meta.title.as_deref(), Some("JSON-LD Title"));
    }

    #[test]
    fn description_meta_fills_when_others_missing() {
        let html = r#"<!doctype html><html><head>
            <meta name="description" content="Plain meta desc">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert_eq!(meta.description.as_deref(), Some("Plain meta desc"));
    }

    #[test]
    fn canonical_absolutized_against_base() {
        let html = r#"<!doctype html><html><head>
            <link rel="canonical" href="/article">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert_eq!(meta.canonical.as_deref(), Some("https://example.com/article"));
    }
}