use ego_tree::NodeId;
use scraper::{Html, Selector};
use crate::dom;
use crate::scorer;
/// CSS selector groups tried by `collect_candidates` when looking for the main content.
///
/// Order matters: a match from the entry at index `idx` receives a priority bonus of
/// `(len - idx) * 40.0`, so earlier entries outrank later ones. The last entry is
/// treated as the `body` catch-all when deciding whether only `body` matched.
const ENTRY_POINT_SELECTORS: &[&str] = &[
"#post, .post-content, .post-body",
".article-content, #article-content",
".article_post, .article-wrapper",
".entry-content, .content-article",
".instapaper_body",
".post",
".markdown-body",
".markdown-preview-sizer",
"article, [role=\"article\"]",
"main, [role=\"main\"]",
".article-body",
"#content",
// Catch-all; must stay last because `collect_candidates` identifies it by position.
"body",
];
/// Selector list scanned by `pick_fallback`, which `find_main_content` uses when no
/// entry-point selector other than `body` matched.
const FALLBACK_SELECTOR: &str = "div, section, article, main";
/// Word count a candidate's text must strictly exceed in `pick_best` before it is
/// returned in place of the top-scored candidate.
const MIN_WORDS_FOR_CHILD_PREFERENCE: usize = 50;
/// Locates the first element matching the `body` selector in `html`.
///
/// Returns `None` when the selector fails to parse or when the document
/// contains no matching element.
#[must_use]
pub fn find_body(html: &Html) -> Option<NodeId> {
    let body_selector = Selector::parse("body").ok()?;
    let first_match = html.select(&body_selector).next()?;
    Some(first_match.id())
}
/// Chooses the node that most likely holds the document's main content.
///
/// Candidates are gathered from `ENTRY_POINT_SELECTORS`. When at least one
/// selector other than `body` matched, the winner comes from `pick_best`;
/// otherwise `pick_fallback` is tried first and the `body` candidate second.
/// If none of that yields a node, the result of `find_body` is used, and as a
/// last resort the id of the tree root.
#[must_use]
pub fn find_main_content(html: &Html) -> NodeId {
    let (ranked, body_only) = collect_candidates(html, ENTRY_POINT_SELECTORS.len());
    let has_specific_match = !body_only;

    let primary = if has_specific_match {
        pick_best(html, &ranked)
    } else {
        pick_fallback(html).or_else(|| extract_body(&ranked, html))
    };
    if let Some(id) = primary {
        return id;
    }

    // Nothing usable among the candidates: degrade to <body>, then to the root.
    if let Some(body_id) = find_body(html) {
        return body_id;
    }
    html.tree.root().id()
}
/// An element matched by one of the entry-point selectors, with the data used to rank it.
struct Candidate {
/// Id of the matched element in the parsed document tree.
node_id: NodeId,
/// Selector priority bonus plus the value returned by `scorer::score_element`.
score: f64,
/// Index into `ENTRY_POINT_SELECTORS` of the selector that produced this match.
selector_index: usize,
}
/// Gathers every element matched by an entry-point selector and ranks the matches.
///
/// Each match is scored as a priority bonus of `(total_selectors - idx) * 40.0`
/// (so selectors listed earlier weigh more) plus `scorer::score_element`.
/// Selectors that fail to parse are skipped. An element matched by several
/// selectors yields one candidate per selector.
///
/// `total_selectors` is expected to equal `ENTRY_POINT_SELECTORS.len()`; the
/// selector at index `total_selectors - 1` is treated as the `body` catch-all.
///
/// Returns the candidates sorted by descending score (ties keep insertion
/// order, NaN scores sort last) and a flag that is `true` when no selector
/// other than the `body` one matched anything, including when nothing matched.
fn collect_candidates(html: &Html, total_selectors: usize) -> (Vec<Candidate>, bool) {
    /// Maps a score to its sort key, demoting NaN below every real score.
    fn sort_key(score: f64) -> f64 {
        if score.is_nan() {
            f64::NEG_INFINITY
        } else {
            score
        }
    }

    // `checked_sub` avoids an underflow panic should `total_selectors` be 0.
    let body_index = total_selectors.checked_sub(1);
    let mut candidates = Vec::new();
    let mut non_body_matched = false;

    for (idx, selector_str) in ENTRY_POINT_SELECTORS.iter().enumerate() {
        let Ok(sel) = Selector::parse(selector_str) else {
            continue;
        };

        // Both values depend only on the selector, so compute them once per selector.
        #[expect(clippy::cast_precision_loss)]
        let priority_bonus = total_selectors.saturating_sub(idx) as f64 * 40.0;
        let is_body_selector = body_index == Some(idx);

        for el_ref in html.select(&sel) {
            let node_id = el_ref.id();
            let score = priority_bonus + scorer::score_element(html, node_id);
            non_body_matched |= !is_body_selector;
            candidates.push(Candidate {
                node_id,
                score,
                selector_index: idx,
            });
        }
    }

    // `total_cmp` is a total order, unlike `partial_cmp(..).unwrap_or(Equal)`,
    // which is inconsistent when a NaN is present and may make the sort panic.
    // The stable sort keeps equal scores in insertion order.
    candidates.sort_by(|a, b| sort_key(b.score).total_cmp(&sort_key(a.score)));

    (candidates, !non_body_matched)
}
/// Picks the winning node from `candidates`, which must already be sorted best-first.
///
/// The first candidate wins by default. It is overridden by the first later
/// candidate that satisfies all of the following, checked in this order:
/// 1. it came from a selector with a smaller index than the leader's,
/// 2. `dom::is_ancestor(html, candidate, leader)` holds (presumably meaning the
///    candidate sits inside the leader; confirm the argument order against `dom`),
/// 3. its text has more than `MIN_WORDS_FOR_CHILD_PREFERENCE` words.
///
/// Returns `None` only when `candidates` is empty.
fn pick_best(html: &Html, candidates: &[Candidate]) -> Option<NodeId> {
    let (leader, runners_up) = candidates.split_first()?;

    // `&&` short-circuits, so text is only extracted for candidates that
    // already passed the two cheaper checks.
    let override_pick = runners_up.iter().find(|other| {
        other.selector_index < leader.selector_index
            && dom::is_ancestor(html, other.node_id, leader.node_id)
            && dom::count_words(&dom::text_content(html, other.node_id))
                > MIN_WORDS_FOR_CHILD_PREFERENCE
    });

    Some(override_pick.map_or(leader.node_id, |winner| winner.node_id))
}
/// Returns the highest-scoring element among those matching `FALLBACK_SELECTOR`.
///
/// Scores come from `scorer::score_element`. On equal scores the element
/// encountered first is kept, because a later element must score strictly
/// higher to take over. Returns `None` when the selector fails to parse, when
/// nothing matches, or when no score is greater than negative infinity.
fn pick_fallback(html: &Html) -> Option<NodeId> {
    let selector = Selector::parse(FALLBACK_SELECTOR).ok()?;

    let (leader, _top_score) = html
        .select(&selector)
        .map(|element| {
            let id = element.id();
            (id, scorer::score_element(html, id))
        })
        .fold(
            (None, f64::NEG_INFINITY),
            |(leader, top_score), (id, score)| {
                if score > top_score {
                    (Some(id), score)
                } else {
                    (leader, top_score)
                }
            },
        );

    leader
}
/// Returns the node id of the first candidate for which
/// `dom::is_tag(html, id, "body")` is true, or `None` if there is none.
fn extract_body(candidates: &[Candidate], html: &Html) -> Option<NodeId> {
    candidates
        .iter()
        .map(|candidate| candidate.node_id)
        .find(|&id| dom::is_tag(html, id, "body"))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `find_body` must return the `<body>` element itself, not merely some node.
    #[test]
    fn find_body_returns_body() {
        let html = Html::parse_document("<html><body><p>hello</p></body></html>");
        let tag = find_body(&html).and_then(|id| dom::tag_name(&html, id));
        assert_eq!(tag.as_deref(), Some("body"));
    }

    /// A long `<article>` should be chosen over surrounding navigation.
    #[test]
    fn find_main_content_picks_article_over_nav() {
        let words = "word ".repeat(120);
        let html_str = format!(
            r#"<html><body>
<nav><a href="/">Home</a><a href="/about">About</a></nav>
<article><p>{words}</p></article>
</body></html>"#
        );
        let doc = Html::parse_document(&html_str);
        let main = find_main_content(&doc);
        let tag = dom::tag_name(&doc, main);
        assert_eq!(tag.as_deref(), Some("article"));
    }

    /// A `.post-content` container should be chosen over the enclosing `<body>`.
    #[test]
    fn find_main_content_picks_post_content_over_body() {
        let words = "word ".repeat(120);
        let html_str = format!(
            r#"<html><body>
<div class="post-content"><p>{words}</p></div>
</body></html>"#
        );
        let doc = Html::parse_document(&html_str);
        let main = find_main_content(&doc);
        let class = dom::get_attr(&doc, main, "class");
        assert_eq!(class.as_deref(), Some("post-content"));
    }

    /// With no entry-point or fallback container in the document, the result
    /// must be the `<body>` element.
    #[test]
    fn find_main_content_falls_back_to_body() {
        let doc = Html::parse_document("<html><body><span>Just a span.</span></body></html>");
        let main = find_main_content(&doc);
        let tag = dom::tag_name(&doc, main);
        assert_eq!(
            tag.as_deref(),
            Some("body"),
            "fallback should return the body element"
        );
    }
}