use std::collections::HashSet;
use std::fmt::Write;
use scraper::Html;
/// Parses `html` and returns the document title.
///
/// The title is the trimmed text of the first element matching the `title`
/// selector. Returns `None` when no such element exists or its text is blank.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    let document = Html::parse_document(html);
    extract_title_doc(&document)
}
/// Parses `html` and returns the `href` value of every `a[href]` element,
/// in document order.
///
/// Empty hrefs and hrefs starting with `#` are skipped. No base URL is
/// supplied, so every returned value is exactly what the attribute contained.
#[must_use]
pub fn extract_links(html: &str) -> Vec<String> {
    let document = Html::parse_document(html);
    extract_links_doc(&document, None)
}
/// Parses `html` and returns its link targets, resolving them against
/// `base_url` where possible.
///
/// When `base_url` itself cannot be parsed as a URL, no resolution takes
/// place and the hrefs are returned as written, exactly as
/// [`extract_links`] would return them.
#[must_use]
pub fn extract_links_with_base(html: &str, base_url: &str) -> Vec<String> {
    let document = Html::parse_document(html);
    match url::Url::parse(base_url) {
        Ok(base) => extract_links_doc(&document, Some(&base)),
        // An unusable base is not an error: fall back to unresolved hrefs.
        Err(_) => extract_links_doc(&document, None),
    }
}
/// Parses `html` and returns the concatenated inner HTML of every element
/// matching the CSS `selector`.
///
/// Returns a copy of the original `html` when the selector is invalid or
/// when the collected inner HTML is empty or whitespace only.
#[must_use]
pub fn select(html: &str, selector: &str) -> String {
    let document = Html::parse_document(html);
    select_doc(&document, html, selector)
}
/// Parses `html` and looks for its main content region.
///
/// The selectors in `MAIN_CONTENT_SELECTORS` are tried in order; the inner
/// HTML for the first selector that yields non-blank content is returned.
/// Returns `None` when none of them does.
#[must_use]
pub fn detect_main_content(html: &str) -> Option<String> {
    let document = Html::parse_document(html);
    detect_main_content_doc(&document)
}
/// Parses `html` and returns its readable portion.
///
/// Prefers the detected main content region; when none is found, the
/// document is returned with noise elements stripped instead.
#[must_use]
pub fn readable_content(html: &str) -> String {
    readable_content_doc(&Html::parse_document(html), html)
}
/// Parses `html` and returns it with every element matching one of
/// `NOISE_SELECTORS` removed.
///
/// When the document contains no such element, the input is returned
/// unchanged (as an owned copy).
#[must_use]
pub fn strip_noise(html: &str) -> String {
    strip_noise_doc(&Html::parse_document(html), html)
}
/// Returns the trimmed text content of the first element matching `title`
/// in an already parsed document.
///
/// Yields `None` when there is no such element or when its text is empty
/// after trimming.
pub(crate) fn extract_title_doc(doc: &Html) -> Option<String> {
    let Ok(title_sel) = scraper::Selector::parse("title") else {
        return None;
    };
    let element = doc.select(&title_sel).next()?;
    // Join all descendant text nodes before trimming the outer whitespace.
    let text: String = element.text().collect();
    let title = text.trim();
    if title.is_empty() {
        None
    } else {
        Some(title.to_owned())
    }
}
/// Collects the link targets of every `a[href]` element in `doc`, in
/// document order.
///
/// Empty hrefs and hrefs beginning with `#` are skipped. Each remaining
/// href is passed through `resolve_href` together with the optional `base`.
pub(crate) fn extract_links_doc(doc: &Html, base: Option<&url::Url>) -> Vec<String> {
    let Some(anchor_sel) = scraper::Selector::parse("a[href]").ok() else {
        return Vec::new();
    };
    let mut links = Vec::new();
    for anchor in doc.select(&anchor_sel) {
        let Some(href) = anchor.value().attr("href") else {
            continue;
        };
        if href.is_empty() || href.starts_with('#') {
            continue;
        }
        links.push(resolve_href(href, base));
    }
    links
}
/// Returns the concatenated inner HTML of all elements in `doc` matching
/// `selector`, or a copy of `original` when nothing usable was selected
/// (invalid selector, or blank result).
pub(crate) fn select_doc(doc: &Html, original: &str, selector: &str) -> String {
    match collect_inner_html(doc, selector) {
        Some(fragment) => fragment,
        None => original.to_owned(),
    }
}
/// Returns the trimmed `lang` attribute of the first element matching
/// `html`, or `None` when the element or attribute is missing or the value
/// is blank.
pub(crate) fn extract_language_doc(doc: &Html) -> Option<String> {
    let Ok(root_sel) = scraper::Selector::parse("html") else {
        return None;
    };
    let root = doc.select(&root_sel).next()?;
    let lang = root.value().attr("lang")?.trim();
    if lang.is_empty() {
        None
    } else {
        Some(lang.to_owned())
    }
}
/// Returns the trimmed `content` attribute of the first
/// `meta[name="description"]` element, if present and non-blank.
pub(crate) fn extract_description_doc(doc: &Html) -> Option<String> {
    const DESCRIPTION_SELECTOR: &str = "meta[name=\"description\"]";
    extract_meta_attr(doc, DESCRIPTION_SELECTOR, "content")
}
/// Returns the trimmed `content` attribute of the first
/// `meta[property="og:image"]` element, if present and non-blank.
pub(crate) fn extract_og_image_doc(doc: &Html) -> Option<String> {
    const OG_IMAGE_SELECTOR: &str = "meta[property=\"og:image\"]";
    extract_meta_attr(doc, OG_IMAGE_SELECTOR, "content")
}
/// CSS selectors probed, in this order, when looking for a page's main
/// content. Order matters: the first selector that produces non-blank inner
/// HTML wins, so earlier entries take priority over later ones.
const MAIN_CONTENT_SELECTORS: &[&str] = &[
    "article",
    r#"[role="main"]"#,
    "main",
    ".post-content",
    ".entry-content",
    "#content",
    ".content",
];
/// CSS selectors for elements that are removed when stripping noise from a
/// document. Every element matching any of these is dropped together with
/// its subtree.
const NOISE_SELECTORS: &[&str] = &[
    "nav",
    "footer",
    "aside",
    // Only a header that is a direct child of <body> is listed here.
    "body > header",
    r#"[role="navigation"]"#,
    r#"[role="banner"]"#,
    r#"[role="contentinfo"]"#,
    r#"[role="complementary"]"#,
    r#"[role="search"]"#,
    r#"[aria-hidden="true"]"#,
];
/// Probes `doc` with each entry of `MAIN_CONTENT_SELECTORS` in order and
/// returns the inner HTML for the first selector that yields non-blank
/// content, or `None` when every selector comes up empty.
pub(crate) fn detect_main_content_doc(doc: &Html) -> Option<String> {
    for &candidate in MAIN_CONTENT_SELECTORS {
        if let Some(content) = collect_inner_html(doc, candidate) {
            return Some(content);
        }
    }
    None
}
/// Returns the readable portion of `doc`: the detected main content when
/// there is one, otherwise the document with noise elements stripped.
///
/// `original` is the source text `doc` was parsed from; it is only used by
/// the noise-stripping fallback.
pub(crate) fn readable_content_doc(doc: &Html, original: &str) -> String {
    detect_main_content_doc(doc).unwrap_or_else(|| strip_noise_doc(doc, original))
}
/// Re-serializes `doc` without the elements matched by `NOISE_SELECTORS`.
///
/// When no noise element is present, a copy of `original` is returned as-is
/// rather than a re-rendered tree, so the input text is preserved exactly.
pub(crate) fn strip_noise_doc(doc: &Html, original: &str) -> String {
    let skipped = collect_noise_ids(doc);
    if skipped.is_empty() {
        original.to_owned()
    } else {
        // The output cannot be longer than needed by much; the input length
        // is used as the initial capacity.
        let mut rendered = String::with_capacity(original.len());
        render_children(doc.tree.root(), &skipped, &mut rendered);
        rendered
    }
}
/// Concatenates the inner HTML of every element in `doc` that matches
/// `selector`, in document order.
///
/// Returns `None` when `selector` does not parse or when the concatenated
/// result is empty or consists only of whitespace.
fn collect_inner_html(doc: &Html, selector: &str) -> Option<String> {
    let Ok(parsed) = scraper::Selector::parse(selector) else {
        return None;
    };
    let combined: String = doc.select(&parsed).map(|el| el.inner_html()).collect();
    let has_content = !combined.trim().is_empty();
    has_content.then_some(combined)
}
/// Returns the trimmed value of attribute `attr` on the first element in
/// `doc` matching `selector`.
///
/// Yields `None` when the selector is invalid, nothing matches, the first
/// match lacks the attribute, or the trimmed value is empty. Only the first
/// matching element is inspected.
fn extract_meta_attr(doc: &Html, selector: &str, attr: &str) -> Option<String> {
    let Ok(parsed) = scraper::Selector::parse(selector) else {
        return None;
    };
    let element = doc.select(&parsed).next()?;
    let value = element.value().attr(attr)?.trim();
    if value.is_empty() {
        None
    } else {
        Some(value.to_owned())
    }
}
/// Turns `href` into the string reported for a link.
///
/// * Without a base URL the href is returned untouched.
/// * An href that already parses as a URL on its own is returned untouched.
/// * Otherwise the href is joined onto the base; if joining fails the
///   original href is returned.
fn resolve_href(href: &str, base_url: Option<&url::Url>) -> String {
    match base_url {
        None => href.to_owned(),
        // Already absolute: keep the author's spelling verbatim.
        Some(_) if url::Url::parse(href).is_ok() => href.to_owned(),
        Some(base) => match base.join(href) {
            Ok(resolved) => resolved.to_string(),
            Err(_) => href.to_owned(),
        },
    }
}
/// Gathers the node ids of every element in `doc` that matches at least one
/// of `NOISE_SELECTORS`.
///
/// Selectors that fail to parse are ignored.
fn collect_noise_ids(doc: &Html) -> HashSet<ego_tree::NodeId> {
    let mut noise = HashSet::new();
    for &pattern in NOISE_SELECTORS {
        let Ok(parsed) = scraper::Selector::parse(pattern) else {
            continue;
        };
        noise.extend(doc.select(&parsed).map(|el| el.id()));
    }
    noise
}
fn render_children(
node: ego_tree::NodeRef<'_, scraper::Node>,
skip: &HashSet<ego_tree::NodeId>,
buf: &mut String,
) {
for child in node.children() {
if skip.contains(&child.id()) {
continue;
}
match child.value() {
scraper::Node::Text(text) => buf.push_str(text),
scraper::Node::Element(el) => {
let tag = el.name();
let _ = write!(buf, "<{tag}");
for (name, val) in el.attrs() {
let escaped = val
.replace('&', "&")
.replace('"', """)
.replace('<', "<")
.replace('>', ">");
let _ = write!(buf, r#" {name}="{escaped}""#);
}
buf.push('>');
if !is_void_element(tag) {
render_children(child, skip, buf);
let _ = write!(buf, "</{tag}>");
}
}
scraper::Node::Document | scraper::Node::Fragment => {
render_children(child, skip, buf);
}
_ => {}
}
}
}
/// Reports whether `tag` names an HTML void element, i.e. one that is
/// written without content and without a closing tag.
///
/// The comparison is exact and case-sensitive.
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_ELEMENTS.contains(&tag)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitespace around the title text is trimmed away.
    #[test]
    fn title_basic() {
        let title =
            extract_title("<html><head><title> Test Page </title></head><body></body></html>");
        assert_eq!(title, Some("Test Page".to_owned()));
    }

    /// A document without a `<title>` element has no title.
    #[test]
    fn title_missing() {
        let title = extract_title("<html><body>no title</body></html>");
        assert!(title.is_none());
    }

    /// A whitespace-only title counts as missing.
    #[test]
    fn title_empty() {
        let title = extract_title("<html><head><title> </title></head></html>");
        assert!(title.is_none());
    }

    /// Relative and absolute hrefs are kept verbatim; fragment-only links
    /// are dropped.
    #[test]
    fn links_basic() {
        let html = r##"<a href="/a">A</a><a href="https://b.com">B</a><a href="#c">C</a>"##;
        assert_eq!(extract_links(html), ["/a", "https://b.com"]);
    }

    /// An anchor with an empty href contributes no link.
    #[test]
    fn links_empty_href_excluded() {
        let links = extract_links(r#"<a href="">empty</a>"#);
        assert_eq!(links.len(), 0);
    }

    /// Selecting returns the inner HTML of the matched element only.
    #[test]
    fn select_basic() {
        let html = r"<article>content</article><aside>nav</aside>";
        let selected = select(html, "article");
        assert_eq!(selected, "content");
    }

    /// A selector that matches nothing yields the untouched input.
    #[test]
    fn select_no_match_returns_original() {
        let html = "<p>hello</p>";
        let selected = select(html, ".missing");
        assert_eq!(selected, html);
    }

    /// A selector that cannot be parsed yields the untouched input.
    #[test]
    fn select_invalid_selector_returns_original() {
        let html = "<p>hello</p>";
        let selected = select(html, ":::invalid");
        assert_eq!(selected, html);
    }

    /// `<article>` is recognised as main content and its inner HTML returned.
    #[test]
    fn detect_main_content_article() {
        let html = "<nav>nav</nav><article><p>main</p></article>";
        let main = detect_main_content(html);
        assert_eq!(main, Some("<p>main</p>".to_owned()));
    }

    /// Plain markup with no recognised content container gives `None`.
    #[test]
    fn detect_main_content_none() {
        assert_eq!(detect_main_content("<div>plain</div>"), None);
    }
}