// webfetch/extract.rs

1use std::collections::HashMap;
2
3use ego_tree::{NodeId, NodeRef};
4use scraper::node::Node;
5use scraper::{ElementRef, Html, Selector};
6
7use crate::types::Metadata;
8
9/// Sum the trimmed length of every descendant text node, for every node in the
10/// tree, in a single bottom-up pass.
11///
12/// The previous "largest `<div>`" heuristic called `el.text()` (a full subtree
13/// walk) once per `<div>`; on nested DOMs the same text was re-summed at every
14/// ancestor, making it ~O(n²). Computing each node's subtree text length once
15/// and reading it back from the map keeps the identical "largest text-bearing
16/// container" semantics in O(n).
17fn subtree_text_lengths(root: NodeRef<Node>, out: &mut HashMap<NodeId, usize>) -> usize {
18    let mut total = match root.value() {
19        Node::Text(t) => t.trim().len(),
20        _ => 0,
21    };
22    for child in root.children() {
23        total += subtree_text_lengths(child, out);
24    }
25    out.insert(root.id(), total);
26    total
27}
28
29/// Pick the element most likely to contain the primary article content.
30///
31/// Heuristic, in priority order: `<article>`, `<main>`, `[role=main]`,
32/// then the largest `<div>` by text length, falling back to `<body>`.
33pub fn content_root(doc: &Html) -> Option<ElementRef<'_>> {
34    for sel in ["article", "main", "[role=main]"] {
35        if let Ok(selector) = Selector::parse(sel) {
36            if let Some(el) = doc.select(&selector).next() {
37                return Some(el);
38            }
39        }
40    }
41
42    // Fall back to the largest text-bearing <div>, using one bottom-up pass to
43    // compute every node's subtree text length up front.
44    if let Ok(div_sel) = Selector::parse("div") {
45        let mut lengths: HashMap<NodeId, usize> = HashMap::new();
46        subtree_text_lengths(doc.tree.root(), &mut lengths);
47
48        let mut best: Option<(usize, ElementRef)> = None;
49        for el in doc.select(&div_sel) {
50            let len = lengths.get(&el.id()).copied().unwrap_or(0);
51            if best.as_ref().is_none_or(|(b, _)| len > *b) {
52                best = Some((len, el));
53            }
54        }
55        if let Some((len, el)) = best {
56            if len > 0 {
57                return Some(el);
58            }
59        }
60    }
61
62    Selector::parse("body")
63        .ok()
64        .and_then(|sel| doc.select(&sel).next())
65}
66
67/// Extract the page title from `<title>` or the first `<h1>`.
68pub fn extract_title(doc: &Html) -> String {
69    for sel in ["title", "h1"] {
70        if let Ok(selector) = Selector::parse(sel) {
71            if let Some(el) = doc.select(&selector).next() {
72                let t = el.text().collect::<String>().trim().to_string();
73                if !t.is_empty() {
74                    return t;
75                }
76            }
77        }
78    }
79    String::new()
80}
81
82/// Read the `content` attribute of the first matching `<meta>` selector.
83fn meta(doc: &Html, selectors: &[&str]) -> Option<String> {
84    for sel in selectors {
85        if let Ok(selector) = Selector::parse(sel) {
86            if let Some(el) = doc.select(&selector).next() {
87                if let Some(c) = el.value().attr("content") {
88                    let c = c.trim();
89                    if !c.is_empty() {
90                        return Some(c.to_string());
91                    }
92                }
93            }
94        }
95    }
96    None
97}
98
99/// Extract citation-oriented metadata: description, author, publish date,
100/// language, and site name (from standard `<meta>`/OpenGraph tags).
101pub fn extract_metadata(doc: &Html) -> Metadata {
102    let lang = Selector::parse("html")
103        .ok()
104        .and_then(|sel| doc.select(&sel).next())
105        .and_then(|el| el.value().attr("lang"))
106        .map(|s| s.trim().to_string())
107        .filter(|s| !s.is_empty());
108
109    Metadata {
110        description: meta(
111            doc,
112            &["meta[name=description]", "meta[property='og:description']"],
113        ),
114        author: meta(
115            doc,
116            &["meta[name=author]", "meta[property='article:author']"],
117        ),
118        published: meta(
119            doc,
120            &[
121                "meta[property='article:published_time']",
122                "meta[name='date']",
123            ],
124        ),
125        site_name: meta(doc, &["meta[property='og:site_name']"]),
126        lang,
127    }
128}