use scraper::{Html, Selector};
/// CSS selectors that may identify a page's primary content container.
///
/// `extract_main_content` tries these in the order listed and, for each
/// selector, looks only at the first matching element, so earlier entries take
/// priority over later ones.
const MAIN_SELECTORS: &[&str] = &[
"article#main-content",
"main",
".main-content",
"#content",
".content",
"article",
"[role='main']",
"#main",
".article-content",
];
/// CSS selectors for elements that `extract_main_content` strips out of the
/// serialized `<body>` when none of the `MAIN_SELECTORS` candidates is accepted.
const JUNK_SELECTORS: &[&str] = &[
"header",
"footer",
"nav",
"aside",
".sidebar",
"#sidebar",
".widget",
"form",
"iframe",
"noscript",
".advertisement",
".ads",
".cookie-notice",
".share-buttons",
// Substring match: any element whose `class` attribute contains "social".
"[class*='social']",
];
/// Length threshold, in bytes of the whitespace-trimmed HTML string, that a
/// candidate must strictly exceed to be returned by `extract_main_content`.
const MIN_CONTENT_LEN: usize = 100;
/// Extracts the main content region of an HTML page and returns it as HTML.
///
/// The strategy, in order:
///
/// 1. Try each selector in `MAIN_SELECTORS`. For each one, only the first
///    matching element is considered; its serialized HTML is returned if its
///    trimmed length exceeds `MIN_CONTENT_LEN` bytes.
/// 2. Otherwise serialize `<body>`, remove the HTML of every descendant that
///    matches a `JUNK_SELECTORS` entry, and return the result if it is still
///    longer than `MIN_CONTENT_LEN` bytes after trimming.
/// 3. Otherwise return `html` unchanged.
///
/// Selectors that fail to parse are skipped silently.
pub fn extract_main_content(html: &str) -> String {
    let document = Html::parse_document(html);

    // Pass 1: a dedicated main-content container, in priority order.
    for selector in MAIN_SELECTORS {
        if let Ok(sel) = Selector::parse(selector)
            && let Some(element) = document.select(&sel).next()
        {
            let content = element.html();
            if content.trim().len() > MIN_CONTENT_LEN {
                return content;
            }
        }
    }

    // Pass 2: the whole body with boilerplate stripped out.
    if let Ok(body_sel) = Selector::parse("body")
        && let Some(body) = document.select(&body_sel).next()
    {
        // Gather every junk fragment before touching the body text. Selecting
        // from `body` (descendants only) keeps a `<body>` that itself matches a
        // junk selector from erasing the entire content, and ignores elements
        // outside the body whose HTML could never appear in it anyway.
        let mut fragments: Vec<String> = Vec::new();
        for selector in JUNK_SELECTORS {
            if let Ok(sel) = Selector::parse(selector) {
                fragments.extend(
                    body.select(&sel)
                        .map(|element| element.html())
                        .filter(|fragment| !fragment.is_empty()),
                );
            }
        }

        // Remove the longest fragments first. An enclosing junk element always
        // serializes to a longer string than any junk nested inside it, so this
        // guarantees the outer element is still present verbatim when its turn
        // comes. Removing in selector order instead would strip e.g. an
        // `iframe` first and then fail to find its `.advertisement` wrapper.
        // The secondary ordering makes equal fragments adjacent for `dedup`.
        fragments.sort_unstable_by(|a, b| b.len().cmp(&a.len()).then_with(|| a.cmp(b)));
        fragments.dedup();

        let mut content = body.html();
        for fragment in &fragments {
            // Fragments nested in an already-removed element are gone; skip
            // them without reallocating the string.
            if content.contains(fragment.as_str()) {
                content = content.replace(fragment.as_str(), "");
            }
        }

        if content.trim().len() > MIN_CONTENT_LEN {
            return content;
        }
    }

    // Pass 3: nothing usable was found; hand back the input as-is.
    html.to_string()
}