1use scraper::{ElementRef, Html, Selector};
2
3use crate::types::Metadata;
4
5pub fn content_root(doc: &Html) -> Option<ElementRef<'_>> {
10 for sel in ["article", "main", "[role=main]"] {
11 if let Ok(selector) = Selector::parse(sel) {
12 if let Some(el) = doc.select(&selector).next() {
13 return Some(el);
14 }
15 }
16 }
17
18 if let Ok(div_sel) = Selector::parse("div") {
20 let mut best: Option<(usize, ElementRef)> = None;
21 for el in doc.select(&div_sel) {
22 let len = el.text().map(|t| t.trim().len()).sum::<usize>();
23 if best.as_ref().is_none_or(|(b, _)| len > *b) {
24 best = Some((len, el));
25 }
26 }
27 if let Some((len, el)) = best {
28 if len > 0 {
29 return Some(el);
30 }
31 }
32 }
33
34 Selector::parse("body")
35 .ok()
36 .and_then(|sel| doc.select(&sel).next())
37}
38
39pub fn extract_title(doc: &Html) -> String {
41 for sel in ["title", "h1"] {
42 if let Ok(selector) = Selector::parse(sel) {
43 if let Some(el) = doc.select(&selector).next() {
44 let t = el.text().collect::<String>().trim().to_string();
45 if !t.is_empty() {
46 return t;
47 }
48 }
49 }
50 }
51 String::new()
52}
53
54fn meta(doc: &Html, selectors: &[&str]) -> Option<String> {
56 for sel in selectors {
57 if let Ok(selector) = Selector::parse(sel) {
58 if let Some(el) = doc.select(&selector).next() {
59 if let Some(c) = el.value().attr("content") {
60 let c = c.trim();
61 if !c.is_empty() {
62 return Some(c.to_string());
63 }
64 }
65 }
66 }
67 }
68 None
69}
70
71pub fn extract_metadata(doc: &Html) -> Metadata {
74 let lang = Selector::parse("html")
75 .ok()
76 .and_then(|sel| doc.select(&sel).next())
77 .and_then(|el| el.value().attr("lang"))
78 .map(|s| s.trim().to_string())
79 .filter(|s| !s.is_empty());
80
81 Metadata {
82 description: meta(
83 doc,
84 &["meta[name=description]", "meta[property='og:description']"],
85 ),
86 author: meta(
87 doc,
88 &["meta[name=author]", "meta[property='article:author']"],
89 ),
90 published: meta(
91 doc,
92 &[
93 "meta[property='article:published_time']",
94 "meta[name='date']",
95 ],
96 ),
97 site_name: meta(doc, &["meta[property='og:site_name']"]),
98 lang,
99 }
100}