use scraper::{ElementRef, Html, Selector};
use crate::types::Metadata;
/// Picks the element most likely to hold the page's main content.
///
/// Candidates are tried in this order, and the first hit is returned:
///
/// 1. The first element matching `article`, then `main`, then `[role=main]`.
/// 2. The `div` with the most text, scored as the summed byte length of the
///    trimmed fragments yielded by [`ElementRef::text`]. The earliest `div`
///    wins ties, and a `div` whose score is zero is never returned.
/// 3. The first `body` element.
///
/// Returns `None` when none of the steps produces an element. Selectors that
/// fail to parse are skipped silently.
pub fn content_root(doc: &Html) -> Option<ElementRef<'_>> {
    let landmark = ["article", "main", "[role=main]"]
        .iter()
        .filter_map(|css| Selector::parse(css).ok())
        .find_map(|selector| doc.select(&selector).next());
    if landmark.is_some() {
        return landmark;
    }

    let densest_div = Selector::parse("div").ok().and_then(|divs| {
        doc.select(&divs)
            .map(|node| {
                let weight: usize = node.text().map(|chunk| chunk.trim().len()).sum();
                (weight, node)
            })
            // Only a strictly larger score displaces the current leader, so
            // the earliest `div` wins ties.
            .reduce(|leader, rival| if rival.0 > leader.0 { rival } else { leader })
            .filter(|(weight, _)| *weight > 0)
            .map(|(_, node)| node)
    });
    if densest_div.is_some() {
        return densest_div;
    }

    Selector::parse("body")
        .ok()
        .and_then(|body| doc.select(&body).next())
}
/// Returns the document's title as a single line of plain text.
///
/// The first `title` element is consulted first, then the first `h1`. For
/// each, the element's text is gathered and its whitespace is normalized:
/// leading and trailing whitespace is removed and every internal run of
/// whitespace (including newlines and indentation from the markup) is
/// collapsed to one space. The first candidate whose normalized text is
/// non-empty is returned.
///
/// Returns an empty string when neither element yields any text.
pub fn extract_title(doc: &Html) -> String {
    ["title", "h1"]
        .iter()
        .filter_map(|css| Selector::parse(css).ok())
        .find_map(|selector| {
            let element = doc.select(&selector).next()?;
            let raw: String = element.text().collect();
            // Titles wrapped across several lines in the source would
            // otherwise keep embedded newlines and indentation.
            let title = raw.split_whitespace().collect::<Vec<_>>().join(" ");
            (!title.is_empty()).then_some(title)
        })
        .unwrap_or_default()
}
/// Looks up the first usable `content` attribute among the given selectors.
///
/// `selectors` are tried in order, so earlier entries take priority. For each
/// selector, every matching element is inspected in document order and the
/// first `content` attribute that is non-empty after trimming is returned
/// (trimmed). Scanning all matches, not just the first, means a leading
/// duplicate tag with a missing or blank `content` does not hide a later
/// usable one.
///
/// Returns `None` when no selector yields a non-empty `content` value.
/// Selectors that fail to parse are skipped silently.
fn meta(doc: &Html, selectors: &[&str]) -> Option<String> {
    selectors
        .iter()
        .filter_map(|css| Selector::parse(css).ok())
        .find_map(|selector| {
            doc.select(&selector).find_map(|element| {
                let content = element.value().attr("content")?.trim();
                (!content.is_empty()).then(|| content.to_owned())
            })
        })
}
/// Collects page-level metadata from `<meta>` tags and the `<html>` element.
///
/// Each field is looked up through `meta`, trying the listed selectors in
/// order:
///
/// - `description`: `meta[name=description]`, then `og:description`.
/// - `author`: `meta[name=author]`, then `article:author`.
/// - `published`: `article:published_time`, then `meta[name='date']`.
/// - `site_name`: `og:site_name`.
///
/// `lang` is the trimmed `lang` attribute of the first `html` element, or
/// `None` when that attribute is missing or blank.
pub fn extract_metadata(doc: &Html) -> Metadata {
    let description = meta(
        doc,
        &["meta[name=description]", "meta[property='og:description']"],
    );
    let author = meta(
        doc,
        &["meta[name=author]", "meta[property='article:author']"],
    );
    let published = meta(
        doc,
        &[
            "meta[property='article:published_time']",
            "meta[name='date']",
        ],
    );
    let site_name = meta(doc, &["meta[property='og:site_name']"]);

    let lang = Selector::parse("html").ok().and_then(|root| {
        let html = doc.select(&root).next()?;
        let code = html.value().attr("lang")?.trim();
        (!code.is_empty()).then(|| code.to_string())
    });

    Metadata {
        description,
        author,
        published,
        site_name,
        lang,
    }
}