webfetch/
extract.rs

1use scraper::{ElementRef, Html, Selector};
2
3use crate::types::Metadata;
4
5/// Pick the element most likely to contain the primary article content.
6///
7/// Heuristic, in priority order: `<article>`, `<main>`, `[role=main]`,
8/// then the largest `<div>` by text length, falling back to `<body>`.
9pub fn content_root(doc: &Html) -> Option<ElementRef<'_>> {
10    for sel in ["article", "main", "[role=main]"] {
11        if let Ok(selector) = Selector::parse(sel) {
12            if let Some(el) = doc.select(&selector).next() {
13                return Some(el);
14            }
15        }
16    }
17
18    // Fall back to the largest text-bearing <div>.
19    if let Ok(div_sel) = Selector::parse("div") {
20        let mut best: Option<(usize, ElementRef)> = None;
21        for el in doc.select(&div_sel) {
22            let len = el.text().map(|t| t.trim().len()).sum::<usize>();
23            if best.as_ref().is_none_or(|(b, _)| len > *b) {
24                best = Some((len, el));
25            }
26        }
27        if let Some((len, el)) = best {
28            if len > 0 {
29                return Some(el);
30            }
31        }
32    }
33
34    Selector::parse("body")
35        .ok()
36        .and_then(|sel| doc.select(&sel).next())
37}
38
39/// Extract the page title from `<title>` or the first `<h1>`.
40pub fn extract_title(doc: &Html) -> String {
41    for sel in ["title", "h1"] {
42        if let Ok(selector) = Selector::parse(sel) {
43            if let Some(el) = doc.select(&selector).next() {
44                let t = el.text().collect::<String>().trim().to_string();
45                if !t.is_empty() {
46                    return t;
47                }
48            }
49        }
50    }
51    String::new()
52}
53
54/// Read the `content` attribute of the first matching `<meta>` selector.
55fn meta(doc: &Html, selectors: &[&str]) -> Option<String> {
56    for sel in selectors {
57        if let Ok(selector) = Selector::parse(sel) {
58            if let Some(el) = doc.select(&selector).next() {
59                if let Some(c) = el.value().attr("content") {
60                    let c = c.trim();
61                    if !c.is_empty() {
62                        return Some(c.to_string());
63                    }
64                }
65            }
66        }
67    }
68    None
69}
70
71/// Extract citation-oriented metadata: description, author, publish date,
72/// language, and site name (from standard `<meta>`/OpenGraph tags).
73pub fn extract_metadata(doc: &Html) -> Metadata {
74    let lang = Selector::parse("html")
75        .ok()
76        .and_then(|sel| doc.select(&sel).next())
77        .and_then(|el| el.value().attr("lang"))
78        .map(|s| s.trim().to_string())
79        .filter(|s| !s.is_empty());
80
81    Metadata {
82        description: meta(
83            doc,
84            &["meta[name=description]", "meta[property='og:description']"],
85        ),
86        author: meta(
87            doc,
88            &["meta[name=author]", "meta[property='article:author']"],
89        ),
90        published: meta(
91            doc,
92            &[
93                "meta[property='article:published_time']",
94                "meta[name='date']",
95            ],
96        ),
97        site_name: meta(doc, &["meta[property='og:site_name']"]),
98        lang,
99    }
100}
webfetch/extract.rs

webfetch/
extract.rs