Skip to main content

rover/extractor/
metadata.rs

//! Structured-metadata extraction (JSON-LD + Open Graph + Twitter Cards).
//!
//! The JSON-LD walker flattens `@graph` arrays and untyped wrapper objects
//! up to depth 8, picks the primary node by `PRIMARY_TYPES` priority
//! (falling back to the first typed node), and surfaces its scalar fields.
//! Open Graph, Twitter Cards, meta description, html[lang], and canonical
//! are also extracted; they only fill fields that JSON-LD left unset.
7
8use scraper::{Html, Selector};
9use serde_json::Value;
10use url::Url;
11
/// Recursion cap for the JSON-LD walker: `walk` returns without visiting a
/// value once its `depth` argument exceeds this number.
const MAX_DEPTH: usize = 8;
13
/// Schema.org types that may describe the page itself.
///
/// Order is significant: `pick_primary` tries each entry in turn, so a node
/// typed with an earlier entry wins over one typed with a later entry.
const PRIMARY_TYPES: &[&str] = &[
    "Article",
    "NewsArticle",
    "BlogPosting",
    "WebPage",
    "Product",
];
21
22#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
23pub struct ExtractedMetadata {
24    pub title: Option<String>,
25    pub description: Option<String>,
26    pub author: Option<String>,
27    pub published: Option<String>,
28    pub modified: Option<String>,
29    pub image: Option<String>,
30    pub og_type: Option<String>,
31    pub canonical: Option<String>,
32    pub language: Option<String>,
33    /// Schema.org `@type` values in first-seen order, deduplicated.
34    pub schema_types: Vec<String>,
35}
36
37impl ExtractedMetadata {
38    pub fn is_empty(&self) -> bool {
39        self.title.is_none()
40            && self.description.is_none()
41            && self.author.is_none()
42            && self.published.is_none()
43            && self.modified.is_none()
44            && self.image.is_none()
45            && self.og_type.is_none()
46            && self.canonical.is_none()
47            && self.language.is_none()
48            && self.schema_types.is_empty()
49    }
50
51    /// Fill missing fields from `other`; existing fields are not overwritten.
52    fn merge_in(&mut self, other: ExtractedMetadata) {
53        if self.title.is_none() {
54            self.title = other.title;
55        }
56        if self.description.is_none() {
57            self.description = other.description;
58        }
59        if self.author.is_none() {
60            self.author = other.author;
61        }
62        if self.published.is_none() {
63            self.published = other.published;
64        }
65        if self.modified.is_none() {
66            self.modified = other.modified;
67        }
68        if self.image.is_none() {
69            self.image = other.image;
70        }
71        if self.og_type.is_none() {
72            self.og_type = other.og_type;
73        }
74        if self.canonical.is_none() {
75            self.canonical = other.canonical;
76        }
77        if self.language.is_none() {
78            self.language = other.language;
79        }
80        for t in other.schema_types {
81            if !self.schema_types.contains(&t) {
82                self.schema_types.push(t);
83            }
84        }
85    }
86}
87
88pub fn extract(html: &str, base: &Url) -> ExtractedMetadata {
89    let doc = Html::parse_document(html);
90    let mut out = ExtractedMetadata::default();
91    out.merge_in(extract_jsonld(&doc));
92    out.merge_in(extract_open_graph(&doc));
93    out.merge_in(extract_twitter(&doc));
94    out.merge_in(extract_meta_description(&doc));
95    out.merge_in(extract_html_lang(&doc));
96    out.merge_in(extract_canonical(&doc, base));
97    out
98}
99
100fn meta_content(doc: &Html, sel: &str) -> Option<String> {
101    let selector = Selector::parse(sel).ok()?;
102    doc.select(&selector)
103        .next()
104        .and_then(|el| el.value().attr("content"))
105        .map(|s| s.to_string())
106        .filter(|s| !s.is_empty())
107}
108
109fn extract_open_graph(doc: &Html) -> ExtractedMetadata {
110    ExtractedMetadata {
111        title: meta_content(doc, r#"meta[property="og:title"]"#),
112        description: meta_content(doc, r#"meta[property="og:description"]"#),
113        image: meta_content(doc, r#"meta[property="og:image"]"#),
114        og_type: meta_content(doc, r#"meta[property="og:type"]"#),
115        published: meta_content(doc, r#"meta[property="article:published_time"]"#),
116        modified: meta_content(doc, r#"meta[property="article:modified_time"]"#),
117        author: meta_content(doc, r#"meta[property="article:author"]"#),
118        ..Default::default()
119    }
120}
121
122fn extract_twitter(doc: &Html) -> ExtractedMetadata {
123    ExtractedMetadata {
124        title: meta_content(doc, r#"meta[name="twitter:title"]"#),
125        description: meta_content(doc, r#"meta[name="twitter:description"]"#),
126        image: meta_content(doc, r#"meta[name="twitter:image"]"#),
127        ..Default::default()
128    }
129}
130
131fn extract_meta_description(doc: &Html) -> ExtractedMetadata {
132    ExtractedMetadata {
133        description: meta_content(doc, r#"meta[name="description"]"#),
134        ..Default::default()
135    }
136}
137
138fn extract_html_lang(doc: &Html) -> ExtractedMetadata {
139    let selector = Selector::parse("html").unwrap();
140    let language = doc
141        .select(&selector)
142        .next()
143        .and_then(|el| el.value().attr("lang"))
144        .map(|s| s.to_string())
145        .filter(|s| !s.is_empty());
146    ExtractedMetadata {
147        language,
148        ..Default::default()
149    }
150}
151
152fn extract_canonical(doc: &Html, base: &Url) -> ExtractedMetadata {
153    let selector = Selector::parse(r#"link[rel="canonical"]"#).unwrap();
154    let canonical = doc
155        .select(&selector)
156        .next()
157        .and_then(|el| el.value().attr("href"))
158        .and_then(|href| base.join(href).ok())
159        .map(|u| u.to_string());
160    ExtractedMetadata {
161        canonical,
162        ..Default::default()
163    }
164}
165
166fn extract_jsonld(doc: &Html) -> ExtractedMetadata {
167    let mut out = ExtractedMetadata::default();
168    let selector = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
169
170    // Collect all @type values across the page; pick the primary node from the first script that has one.
171    let mut nodes_with_type: Vec<Value> = Vec::new();
172    let mut all_types: Vec<String> = Vec::new();
173
174    for el in doc.select(&selector) {
175        let text = el.text().collect::<String>();
176        let value: Value = match serde_json::from_str(&text) {
177            Ok(v) => v,
178            Err(e) => {
179                tracing::warn!(target: "rover::extractor", err = %e, "malformed JSON-LD block; skipping");
180                continue;
181            }
182        };
183        walk(&value, 0, &mut nodes_with_type, &mut all_types);
184    }
185
186    // Pick primary node: prefer PRIMARY_TYPES order; else first node with any @type.
187    let primary = pick_primary(&nodes_with_type);
188    if let Some(node) = primary {
189        out.title = scalar(node, "headline").or_else(|| scalar(node, "name"));
190        out.description = scalar(node, "description");
191        out.author = scalar_or_person_name(node, "author");
192        out.published = scalar(node, "datePublished");
193        out.modified = scalar(node, "dateModified");
194        out.image = scalar_or_image_url(node, "image");
195    }
196
197    for t in all_types {
198        if !out.schema_types.contains(&t) {
199            out.schema_types.push(t);
200        }
201    }
202    out
203}
204
205/// Walk a JSON-LD value tree collecting typed nodes and `@type` strings.
206///
207/// **Recursion policy (deliberate deviation from a naive recurse-everywhere
208/// walk):** when an object has `@type`, we record it but only recurse into
209/// `@graph` from there. Typed nodes' other properties (`author`, `publisher`,
210/// `offers`, etc.) are NOT walked. This keeps `schema_types` focused on
211/// page-level classifications rather than leaking nested referenced entities
212/// (e.g. an Article's `author: Person` should not surface "Person" as a
213/// page type). Untyped containers (the document root, untyped wrappers)
214/// still recurse into all children.
215///
216/// `MAX_DEPTH` caps recursion to defend against pathological inputs.
217fn walk(v: &Value, depth: usize, nodes: &mut Vec<Value>, all_types: &mut Vec<String>) {
218    if depth > MAX_DEPTH {
219        return;
220    }
221    match v {
222        Value::Object(map) => {
223            let typed = map.get("@type").map(type_names).unwrap_or_default();
224            if !typed.is_empty() {
225                nodes.push(v.clone());
226                for n in typed {
227                    all_types.push(n);
228                }
229                // Don't recurse into a typed node's own properties — but DO follow
230                // an explicit @graph if present (some payloads nest a graph inside
231                // a typed wrapper).
232                if let Some(graph) = map.get("@graph") {
233                    walk(graph, depth + 1, nodes, all_types);
234                }
235            } else {
236                // Untyped container: descend into all children (covers top-level
237                // wrappers like `{"@context":..., "@graph":[...]}`).
238                for (_k, child) in map {
239                    walk(child, depth + 1, nodes, all_types);
240                }
241            }
242        }
243        Value::Array(items) => {
244            for item in items {
245                walk(item, depth + 1, nodes, all_types);
246            }
247        }
248        _ => {}
249    }
250}
251
252fn type_names(t: &Value) -> Vec<String> {
253    match t {
254        Value::String(s) => vec![s.clone()],
255        Value::Array(items) => items
256            .iter()
257            .filter_map(|v| v.as_str().map(|s| s.to_string()))
258            .collect(),
259        _ => Vec::new(),
260    }
261}
262
263fn pick_primary(nodes: &[Value]) -> Option<&Value> {
264    for want in PRIMARY_TYPES {
265        for n in nodes {
266            if type_names(&n["@type"]).iter().any(|s| s == *want) {
267                return Some(n);
268            }
269        }
270    }
271    nodes.first()
272}
273
274fn scalar(node: &Value, key: &str) -> Option<String> {
275    node.get(key)
276        .and_then(|v| v.as_str())
277        .filter(|s| !s.is_empty())
278        .map(|s| s.to_string())
279}
280
281fn scalar_or_person_name(node: &Value, key: &str) -> Option<String> {
282    let v = node.get(key)?;
283    if let Some(s) = v.as_str() {
284        return (!s.is_empty()).then(|| s.to_string());
285    }
286    if let Some(obj) = v.as_object()
287        && let Some(name) = obj.get("name").and_then(|n| n.as_str())
288    {
289        return Some(name.to_string());
290    }
291    if let Some(arr) = v.as_array() {
292        for item in arr {
293            if let Some(name) = item.as_str() {
294                return Some(name.to_string());
295            }
296            if let Some(name) = item.get("name").and_then(|n| n.as_str()) {
297                return Some(name.to_string());
298            }
299        }
300    }
301    None
302}
303
304fn scalar_or_image_url(node: &Value, key: &str) -> Option<String> {
305    let v = node.get(key)?;
306    if let Some(s) = v.as_str() {
307        return (!s.is_empty()).then(|| s.to_string());
308    }
309    if let Some(obj) = v.as_object() {
310        return obj.get("url").and_then(|u| u.as_str()).map(String::from);
311    }
312    if let Some(arr) = v.as_array() {
313        for item in arr {
314            if let Some(s) = item.as_str() {
315                return Some(s.to_string());
316            }
317            if let Some(u) = item.get("url").and_then(|u| u.as_str()) {
318                return Some(u.to_string());
319            }
320        }
321    }
322    None
323}
324
#[cfg(test)]
mod jsonld_tests {
    use super::*;
    use url::Url;

    /// One typed `Article` node with every surfaced scalar populated.
    const ARTICLE_HTML: &str = r#"<!doctype html><html><head>
        <script type="application/ld+json">
        {
          "@context": "https://schema.org",
          "@type": "Article",
          "headline": "Title from JSON-LD",
          "description": "Desc from JSON-LD",
          "author": {"@type":"Person","name":"Ada Lovelace"},
          "datePublished": "2026-01-01T00:00:00Z",
          "dateModified": "2026-02-01T00:00:00Z",
          "image": "https://example.com/og.png"
        }
        </script></head><body></body></html>"#;

    /// Untyped wrapper whose `@graph` lists a `WebPage` before a `NewsArticle`.
    const GRAPH_HTML: &str = r#"<!doctype html><html><head>
        <script type="application/ld+json">
        {"@context":"https://schema.org","@graph":[
            {"@type":"WebPage","name":"Should be skipped"},
            {"@type":"NewsArticle","headline":"News title","author":"Reuters"}
        ]}
        </script></head><body></body></html>"#;

    fn base() -> Url {
        Url::parse("https://example.com/article").expect("static test URL is valid")
    }

    #[test]
    fn extracts_article_scalar_fields() {
        let meta = extract(ARTICLE_HTML, &base());

        assert_eq!(meta.title.as_deref(), Some("Title from JSON-LD"));
        assert_eq!(meta.description.as_deref(), Some("Desc from JSON-LD"));
        assert_eq!(meta.author.as_deref(), Some("Ada Lovelace"));
        assert_eq!(meta.published.as_deref(), Some("2026-01-01T00:00:00Z"));
        assert_eq!(meta.modified.as_deref(), Some("2026-02-01T00:00:00Z"));
        assert_eq!(meta.image.as_deref(), Some("https://example.com/og.png"));
        // The nested author's "Person" type must not leak into the page types.
        assert_eq!(meta.schema_types, ["Article"]);
    }

    #[test]
    fn prefers_article_like_type_in_graph() {
        let meta = extract(GRAPH_HTML, &base());

        assert_eq!(meta.title.as_deref(), Some("News title"));
        assert_eq!(meta.author.as_deref(), Some("Reuters"));
        // Both graph members contribute their type.
        for expected in ["WebPage", "NewsArticle"] {
            assert!(meta.schema_types.iter().any(|ty| ty == expected));
        }
    }

    #[test]
    fn depth_cap_does_not_stack_overflow() {
        // Wrap a typed leaf in 20 untyped `nested` objects inside @graph so
        // the walker has to recurse to reach it.
        let leaf = String::from(r#"{"@type":"Leaf"}"#);
        let chain = (0..20).fold(leaf, |inner, _| format!(r#"{{"nested":{inner}}}"#));
        let payload = format!(r#"{{"@graph":[{chain}]}}"#);
        let html = format!(
            r#"<!doctype html><html><head><script type="application/ld+json">{payload}</script></head><body></body></html>"#
        );

        let meta = extract(&html, &base());

        // The leaf sits below MAX_DEPTH, so the walker gives up before
        // reaching it; the point is that doing so neither panics nor
        // overflows the stack.
        assert!(
            meta.schema_types.is_empty(),
            "expected cap to prevent deep walk, got {:?}",
            meta.schema_types
        );
    }

    #[test]
    fn malformed_jsonld_does_not_panic() {
        let html = r#"<!doctype html><html><head>
            <script type="application/ld+json">{ this is not json }</script>
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        // Soft failure: the broken block contributes nothing.
        assert!(meta.is_empty());
    }
}
409
#[cfg(test)]
mod og_twitter_tests {
    use super::*;
    use url::Url;

    fn base() -> Url {
        Url::parse("https://example.com/").expect("static test URL is valid")
    }

    #[test]
    fn reads_open_graph_metatags() {
        let html = r#"<!doctype html><html lang="en"><head>
            <meta property="og:title" content="OG Title">
            <meta property="og:description" content="OG Desc">
            <meta property="og:image" content="https://x/og.png">
            <meta property="og:type" content="article">
            <meta property="article:published_time" content="2026-03-01T00:00:00Z">
            <meta property="article:modified_time" content="2026-03-02T00:00:00Z">
            <meta property="article:author" content="Grace Hopper">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        let observed = [
            (meta.title.as_deref(), "OG Title"),
            (meta.description.as_deref(), "OG Desc"),
            (meta.image.as_deref(), "https://x/og.png"),
            (meta.og_type.as_deref(), "article"),
            (meta.published.as_deref(), "2026-03-01T00:00:00Z"),
            (meta.modified.as_deref(), "2026-03-02T00:00:00Z"),
            (meta.author.as_deref(), "Grace Hopper"),
            (meta.language.as_deref(), "en"),
        ];
        for (actual, expected) in observed {
            assert_eq!(actual, Some(expected));
        }
    }

    #[test]
    fn twitter_fills_holes_left_by_og() {
        let html = r#"<!doctype html><html><head>
            <meta name="twitter:title" content="Twitter Title">
            <meta name="twitter:description" content="Twitter Desc">
            <meta name="twitter:image" content="https://x/tc.png">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert_eq!(meta.title.as_deref(), Some("Twitter Title"));
        assert_eq!(meta.description.as_deref(), Some("Twitter Desc"));
        assert_eq!(meta.image.as_deref(), Some("https://x/tc.png"));
    }

    #[test]
    fn jsonld_wins_over_og_wins_over_twitter() {
        let html = r#"<!doctype html><html><head>
            <script type="application/ld+json">
            {"@type":"Article","headline":"JSON-LD Title"}
            </script>
            <meta property="og:title" content="OG Title">
            <meta name="twitter:title" content="Twitter Title">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        // All three sources offer a title; the highest-priority one is kept.
        assert_eq!(meta.title.as_deref(), Some("JSON-LD Title"));
    }

    #[test]
    fn description_meta_fills_when_others_missing() {
        let html = r#"<!doctype html><html><head>
            <meta name="description" content="Plain meta desc">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        assert_eq!(meta.description.as_deref(), Some("Plain meta desc"));
    }

    #[test]
    fn canonical_absolutized_against_base() {
        let html = r#"<!doctype html><html><head>
            <link rel="canonical" href="/article">
            </head><body></body></html>"#;

        let meta = extract(html, &base());

        // The root-relative href is resolved against the base URL.
        assert_eq!(meta.canonical.as_deref(), Some("https://example.com/article"));
    }
}
484}