use scraper::{Html, Selector};
/// Pull the main readable region out of a parsed HTML document.
///
/// Probes a priority-ordered list of selectors that commonly wrap article
/// bodies and returns the serialized HTML of the first element that matches.
/// Falls back to the `<body>` element, and finally to the whole document, so
/// the result is never empty for a non-empty input document.
pub fn extract_readable_content(document: &Html) -> String {
    // Ordered from most to least specific content container.
    const CANDIDATES: [&str; 10] = [
        "article",
        "main",
        "[role=\"main\"]",
        ".post-content",
        ".article-content",
        ".entry-content",
        ".content",
        "#content",
        ".post",
        ".article",
    ];

    for raw in CANDIDATES {
        // Every candidate is a valid selector literal; the guard is defensive.
        let Ok(selector) = Selector::parse(raw) else {
            continue;
        };
        if let Some(hit) = document.select(&selector).next() {
            return hit.html();
        }
    }

    // No recognised content container: fall back to the page body.
    if let Ok(body_selector) = Selector::parse("body") {
        if let Some(body) = document.select(&body_selector).next() {
            return body.html();
        }
    }

    // Last resort: serialize everything we were given.
    document.html()
}
pub fn html_to_text(html: &str) -> String {
let document = Html::parse_fragment(html);
let mut text = String::new();
fn extract_text(node: scraper::ElementRef, text: &mut String) {
for child in node.children() {
if let Some(element) = scraper::ElementRef::wrap(child) {
let tag = element.value().name();
if matches!(
tag,
"script" | "style" | "nav" | "header" | "footer" | "aside" | "noscript"
) {
continue;
}
if matches!(
tag,
"p" | "div" | "br" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li" | "tr"
) {
text.push('\n');
}
extract_text(element, text);
} else if let Some(t) = child.value().as_text() {
let trimmed = t.trim();
if !trimmed.is_empty() {
if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
text.push(' ');
}
text.push_str(trimmed);
}
}
}
}
if let Ok(root_selector) = Selector::parse(":root")
&& let Some(root) = document.select(&root_selector).next()
{
extract_text(root, &mut text);
}
text.lines()
.map(|l| l.trim())
.filter(|l| !l.is_empty())
.collect::<Vec<_>>()
.join("\n")
}
/// Parse raw HTML and return the plain text of its readable content.
///
/// Convenience wrapper: parses the document, isolates the main content
/// region, and flattens it to text.
pub fn extract_text_from_html(raw_html: &str) -> String {
    let parsed = Html::parse_document(raw_html);
    html_to_text(&extract_readable_content(&parsed))
}