use std::collections::HashSet;
use std::fmt::Write;
use scraper::Html;
use super::select::collect_inner_html;
/// Parses `html` as a full document and looks for a main-content container.
///
/// Returns whatever `collect_inner_html` yields for the first entry of
/// `MAIN_CONTENT_SELECTORS` that produces a value, or `None` when no
/// selector does.
#[must_use]
pub fn detect_main_content(html: &str) -> Option<String> {
    let doc = Html::parse_document(html);
    detect_main_content_doc(&doc)
}
/// Extracts the readable part of `html`.
///
/// The input is parsed once as a full document. If a main-content container
/// is detected its content is returned; otherwise the result is the input
/// with the elements matching `NOISE_SELECTORS` removed.
#[must_use]
pub fn readable_content(html: &str) -> String {
    readable_content_doc(&Html::parse_document(html), html)
}
/// Returns `html` with the elements matching `NOISE_SELECTORS` removed.
///
/// When nothing in the document matches, the input is returned unchanged.
/// Otherwise the result is re-serialized from the parsed tree, so it is not
/// guaranteed to be byte-identical to the input outside the removed parts.
#[must_use]
pub fn strip_noise(html: &str) -> String {
    strip_noise_doc(&Html::parse_document(html), html)
}
/// CSS selectors that may identify a page's main content container.
///
/// Order matters: `detect_main_content_doc` tries them front to back and
/// stops at the first one that yields content.
const MAIN_CONTENT_SELECTORS: &[&str] = &[
    "article",
    r#"[role="main"]"#,
    "main",
    ".post-content",
    ".entry-content",
    "#content",
    ".content",
];
/// CSS selectors for page chrome that is not part of the readable content.
///
/// Every element matched by any of these is omitted, together with its
/// subtree, when the document is re-serialized by `strip_noise_doc`.
const NOISE_SELECTORS: &[&str] = &[
    "nav",
    "footer",
    "aside",
    // Only the page-level header; headers nested deeper in the body are kept.
    "body > header",
    r#"[role="navigation"]"#,
    r#"[role="banner"]"#,
    r#"[role="contentinfo"]"#,
    r#"[role="complementary"]"#,
    r#"[role="search"]"#,
    r#"[aria-hidden="true"]"#,
];
/// Tries each entry of `MAIN_CONTENT_SELECTORS` in order against `doc`.
///
/// Returns the first `Some` produced by `collect_inner_html`, or `None` when
/// every selector comes back empty.
fn detect_main_content_doc(doc: &Html) -> Option<String> {
    for selector in MAIN_CONTENT_SELECTORS {
        if let Some(inner) = collect_inner_html(doc, selector) {
            return Some(inner);
        }
    }
    None
}
/// Readable-content extraction for an already parsed document.
///
/// `original` must be the source text `doc` was parsed from; it is handed to
/// `strip_noise_doc`, which is only invoked when no main-content container
/// is detected.
pub(crate) fn readable_content_doc(doc: &Html, original: &str) -> String {
    detect_main_content_doc(doc).unwrap_or_else(|| strip_noise_doc(doc, original))
}
/// Removes noise elements from an already parsed document.
///
/// `original` is the source text of `doc`. It is returned verbatim when no
/// element matches `NOISE_SELECTORS`; otherwise the tree is re-serialized
/// with the matched elements (and their subtrees) left out.
fn strip_noise_doc(doc: &Html, original: &str) -> String {
    let skipped = collect_noise_ids(doc);
    if skipped.is_empty() {
        // Nothing to remove: avoid a lossy parse/serialize round trip.
        original.to_owned()
    } else {
        // The stripped output cannot be longer than the input by much, so the
        // input length is a reasonable initial capacity.
        let mut rendered = String::with_capacity(original.len());
        render_children(doc.tree.root(), &skipped, &mut rendered);
        rendered
    }
}
/// Collects the node ids of every element in `doc` that matches one of
/// `NOISE_SELECTORS`.
///
/// Selectors that fail to parse are silently ignored.
fn collect_noise_ids(doc: &Html) -> HashSet<ego_tree::NodeId> {
    let mut ids = HashSet::new();
    let selectors = NOISE_SELECTORS
        .iter()
        .filter_map(|&raw| scraper::Selector::parse(raw).ok());
    for selector in selectors {
        ids.extend(doc.select(&selector).map(|el| el.id()));
    }
    ids
}
/// Serializes the children of `node` into `buf` as HTML, leaving out every
/// node whose id is in `skip` (together with its whole subtree).
///
/// * Elements are written as `<tag attr="value" ...>`, followed by their
///   children and a closing tag unless `is_void_element` reports the tag as
///   void. Attribute values go through `escape_attr_value`.
/// * Text nodes are written with `&`, `<` and `>` escaped, because the parsed
///   tree stores entity-decoded text; writing it back verbatim would turn
///   escaped text such as `&lt;b&gt;` into live markup. Text directly inside a
///   raw-text element (e.g. `script`, `style`) is written verbatim, since
///   entities are not decoded there when the output is parsed again.
/// * Any other node kind (comments, doctype, processing instructions) is
///   dropped.
fn render_children(
    node: ego_tree::NodeRef<'_, scraper::Node>,
    skip: &HashSet<ego_tree::NodeId>,
    buf: &mut String,
) {
    /// Elements whose text content is serialized literally. `noscript` is
    /// included on the assumption that the parser ran with scripting enabled.
    fn is_raw_text_element(tag: &str) -> bool {
        matches!(
            tag,
            "style"
                | "script"
                | "xmp"
                | "iframe"
                | "noembed"
                | "noframes"
                | "plaintext"
                | "noscript"
        )
    }

    /// Appends `text` to `buf`, escaping the characters that are significant
    /// in HTML text content.
    fn push_escaped_text(text: &str, buf: &mut String) {
        for c in text.chars() {
            match c {
                '&' => buf.push_str("&amp;"),
                '<' => buf.push_str("&lt;"),
                '>' => buf.push_str("&gt;"),
                _ => buf.push(c),
            }
        }
    }

    // Decided once per parent: whether direct text children are raw text.
    let raw_text_parent = matches!(
        node.value(),
        scraper::Node::Element(el) if is_raw_text_element(el.name())
    );

    for child in node.children() {
        if skip.contains(&child.id()) {
            // Not descending also removes the entire subtree.
            continue;
        }
        match child.value() {
            scraper::Node::Text(text) => {
                if raw_text_parent {
                    buf.push_str(text);
                } else {
                    push_escaped_text(text, buf);
                }
            }
            scraper::Node::Element(el) => {
                let tag = el.name();
                // Writing into a `String` cannot fail, so the results are ignored.
                _ = write!(buf, "<{tag}");
                for (name, val) in el.attrs() {
                    _ = write!(buf, " {name}=\"");
                    escape_attr_value(val, buf);
                    buf.push('"');
                }
                buf.push('>');
                if !is_void_element(tag) {
                    render_children(child, skip, buf);
                    _ = write!(buf, "</{tag}>");
                }
            }
            scraper::Node::Document | scraper::Node::Fragment => {
                render_children(child, skip, buf);
            }
            _ => {}
        }
    }
}
/// Appends `val` to `buf`, escaped for use inside a double-quoted HTML
/// attribute value.
///
/// `&`, `"`, `<` and `>` are replaced by `&amp;`, `&quot;`, `&lt;` and
/// `&gt;`; every other character is copied unchanged.
fn escape_attr_value(val: &str, buf: &mut String) {
    for c in val.chars() {
        match c {
            // `&` must be escaped so existing text is not read as an entity.
            '&' => buf.push_str("&amp;"),
            // `"` would otherwise terminate the attribute value.
            '"' => buf.push_str("&quot;"),
            '<' => buf.push_str("&lt;"),
            '>' => buf.push_str("&gt;"),
            _ => buf.push(c),
        }
    }
}
/// Reports whether `tag` names an element that is serialized without
/// children or an end tag.
///
/// The list follows the HTML fragment serialization algorithm, which besides
/// the current void elements also covers the obsolete `basefont`, `bgsound`,
/// `frame` and `keygen`. The comparison is case-sensitive; `tag` is expected
/// in lowercase.
fn is_void_element(tag: &str) -> bool {
    matches!(
        tag,
        "area"
            | "base"
            | "basefont"
            | "bgsound"
            | "br"
            | "col"
            | "embed"
            | "frame"
            | "hr"
            | "img"
            | "input"
            | "keygen"
            | "link"
            | "meta"
            | "param"
            | "source"
            | "track"
            | "wbr"
    )
}