1use std::collections::HashMap;
2
3use ego_tree::{NodeId, NodeRef};
4use scraper::node::Node;
5use scraper::{ElementRef, Html, Selector};
6
7use crate::types::Metadata;
8
9fn subtree_text_lengths(root: NodeRef<Node>, out: &mut HashMap<NodeId, usize>) -> usize {
18 let mut total = match root.value() {
19 Node::Text(t) => t.trim().len(),
20 _ => 0,
21 };
22 for child in root.children() {
23 total += subtree_text_lengths(child, out);
24 }
25 out.insert(root.id(), total);
26 total
27}
28
29pub fn content_root(doc: &Html) -> Option<ElementRef<'_>> {
34 for sel in ["article", "main", "[role=main]"] {
35 if let Ok(selector) = Selector::parse(sel) {
36 if let Some(el) = doc.select(&selector).next() {
37 return Some(el);
38 }
39 }
40 }
41
42 if let Ok(div_sel) = Selector::parse("div") {
45 let mut lengths: HashMap<NodeId, usize> = HashMap::new();
46 subtree_text_lengths(doc.tree.root(), &mut lengths);
47
48 let mut best: Option<(usize, ElementRef)> = None;
49 for el in doc.select(&div_sel) {
50 let len = lengths.get(&el.id()).copied().unwrap_or(0);
51 if best.as_ref().is_none_or(|(b, _)| len > *b) {
52 best = Some((len, el));
53 }
54 }
55 if let Some((len, el)) = best {
56 if len > 0 {
57 return Some(el);
58 }
59 }
60 }
61
62 Selector::parse("body")
63 .ok()
64 .and_then(|sel| doc.select(&sel).next())
65}
66
67pub fn extract_title(doc: &Html) -> String {
69 for sel in ["title", "h1"] {
70 if let Ok(selector) = Selector::parse(sel) {
71 if let Some(el) = doc.select(&selector).next() {
72 let t = el.text().collect::<String>().trim().to_string();
73 if !t.is_empty() {
74 return t;
75 }
76 }
77 }
78 }
79 String::new()
80}
81
82fn meta(doc: &Html, selectors: &[&str]) -> Option<String> {
84 for sel in selectors {
85 if let Ok(selector) = Selector::parse(sel) {
86 if let Some(el) = doc.select(&selector).next() {
87 if let Some(c) = el.value().attr("content") {
88 let c = c.trim();
89 if !c.is_empty() {
90 return Some(c.to_string());
91 }
92 }
93 }
94 }
95 }
96 None
97}
98
99pub fn extract_metadata(doc: &Html) -> Metadata {
102 let lang = Selector::parse("html")
103 .ok()
104 .and_then(|sel| doc.select(&sel).next())
105 .and_then(|el| el.value().attr("lang"))
106 .map(|s| s.trim().to_string())
107 .filter(|s| !s.is_empty());
108
109 Metadata {
110 description: meta(
111 doc,
112 &["meta[name=description]", "meta[property='og:description']"],
113 ),
114 author: meta(
115 doc,
116 &["meta[name=author]", "meta[property='article:author']"],
117 ),
118 published: meta(
119 doc,
120 &[
121 "meta[property='article:published_time']",
122 "meta[name='date']",
123 ],
124 ),
125 site_name: meta(doc, &["meta[property='og:site_name']"]),
126 lang,
127 }
128}