use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
use spider_agent_types::{CleaningIntent, ContentAnalysis, HtmlCleaningProfile};
/// Returns an owned copy of `html` with no cleaning applied.
///
/// This is the pass-through used when the caller wants the markup untouched.
pub fn clean_html_raw(html: &str) -> String {
    String::from(html)
}
pub fn clean_html_base(html: &str) -> String {
match rewrite_str(
html,
RewriteStrSettings {
element_content_handlers: vec![
element!("script", |el| {
el.remove();
Ok(())
}),
element!("style", |el| {
el.remove();
Ok(())
}),
element!("link", |el| {
el.remove();
Ok(())
}),
element!("iframe", |el| {
el.remove();
Ok(())
}),
element!("[style*='display:none']", |el| {
el.remove();
Ok(())
}),
element!("[id*='ad']", |el| {
el.remove();
Ok(())
}),
element!("[class*='ad']", |el| {
el.remove();
Ok(())
}),
element!("[id*='tracking']", |el| {
el.remove();
Ok(())
}),
element!("[class*='tracking']", |el| {
el.remove();
Ok(())
}),
element!("meta", |el| {
if let Some(attribute) = el.get_attribute("name") {
if attribute != "title" && attribute != "description" {
el.remove();
}
} else {
el.remove();
}
Ok(())
}),
],
document_content_handlers: vec![doc_comments!(|c| {
c.remove();
Ok(())
})],
..RewriteStrSettings::new()
},
) {
Ok(r) => r,
_ => html.into(),
}
}
pub fn clean_html_slim(html: &str) -> String {
match rewrite_str(
html,
RewriteStrSettings {
element_content_handlers: vec![
element!("script", |el| {
el.remove();
Ok(())
}),
element!("style", |el| {
el.remove();
Ok(())
}),
element!("svg", |el| {
el.remove();
Ok(())
}),
element!("noscript", |el| {
el.remove();
Ok(())
}),
element!("link", |el| {
el.remove();
Ok(())
}),
element!("iframe", |el| {
el.remove();
Ok(())
}),
element!("canvas", |el| {
el.remove();
Ok(())
}),
element!("video", |el| {
el.remove();
Ok(())
}),
element!("img", |el| {
if let Some(src) = el.get_attribute("src") {
if src.starts_with("data:image") {
el.remove();
}
}
Ok(())
}),
element!("picture", |el| {
if let Some(src) = el.get_attribute("src") {
if src.starts_with("data:") {
el.remove();
}
}
Ok(())
}),
element!("[style*='display:none']", |el| {
el.remove();
Ok(())
}),
element!("[id*='ad']", |el| {
el.remove();
Ok(())
}),
element!("[class*='ad']", |el| {
el.remove();
Ok(())
}),
element!("[id*='tracking']", |el| {
el.remove();
Ok(())
}),
element!("[class*='tracking']", |el| {
el.remove();
Ok(())
}),
element!("meta", |el| {
if let Some(attribute) = el.get_attribute("name") {
if attribute != "title" && attribute != "description" {
el.remove();
}
} else {
el.remove();
}
Ok(())
}),
],
document_content_handlers: vec![doc_comments!(|c| {
c.remove();
Ok(())
})],
..RewriteStrSettings::new()
},
) {
Ok(r) => r,
_ => html.into(),
}
}
pub fn clean_html_full(html: &str) -> String {
match rewrite_str(
html,
RewriteStrSettings {
element_content_handlers: vec![
element!("script", |el| {
el.remove();
Ok(())
}),
element!("style", |el| {
el.remove();
Ok(())
}),
element!("svg", |el| {
el.remove();
Ok(())
}),
element!("nav", |el| {
el.remove();
Ok(())
}),
element!("footer", |el| {
el.remove();
Ok(())
}),
element!("noscript", |el| {
el.remove();
Ok(())
}),
element!("link", |el| {
el.remove();
Ok(())
}),
element!("iframe", |el| {
el.remove();
Ok(())
}),
element!("canvas", |el| {
el.remove();
Ok(())
}),
element!("video", |el| {
el.remove();
Ok(())
}),
element!("meta", |el| {
let name = el.get_attribute("name").map(|n| n.to_lowercase());
if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
el.remove();
}
Ok(())
}),
element!("*", |el| {
let mut to_remove: Vec<String> = Vec::new();
for attr in el.attributes().iter() {
let n = attr.name();
let keep = n == "id" || n == "class" || n.starts_with("data-");
if !keep {
to_remove.push(n);
}
}
for attr in to_remove {
el.remove_attribute(&attr);
}
Ok(())
}),
],
document_content_handlers: vec![doc_comments!(|c| {
c.remove();
Ok(())
})],
..RewriteStrSettings::new()
},
) {
Ok(r) => r,
_ => html.into(),
}
}
/// Cleans `html` with the default pass.
///
/// Currently this simply forwards to [`clean_html_base`].
#[inline]
pub fn clean_html(html: &str) -> String {
clean_html_base(html)
}
/// Cleans `html` using the given `profile`.
///
/// Forwards to [`clean_html_with_profile_and_intent`] with
/// `CleaningIntent::General` as the intent.
pub fn clean_html_with_profile(html: &str, profile: HtmlCleaningProfile) -> String {
clean_html_with_profile_and_intent(html, profile, CleaningIntent::General)
}
/// Cleans `html` with the pass selected by `profile`.
///
/// Profile dispatch:
/// - `Raw` -> [`clean_html_raw`]
/// - `Minimal` -> [`clean_html_base`]
/// - `Default` -> [`clean_html`]
/// - `Slim` -> [`clean_html_slim`]
/// - `Aggressive` -> [`clean_html_full`]
/// - `Auto` -> analyzes `html`, derives a concrete profile from that analysis
///   and `intent`, then dispatches again with the derived profile.
///
/// `intent` is only consulted when resolving `Auto`.
pub fn clean_html_with_profile_and_intent(
    html: &str,
    profile: HtmlCleaningProfile,
    intent: CleaningIntent,
) -> String {
    match profile {
        HtmlCleaningProfile::Auto => {
            // NOTE(review): termination assumes the resolver never yields `Auto`
            // again - confirm against `from_content_analysis_with_intent`.
            let resolved = HtmlCleaningProfile::from_content_analysis_with_intent(
                &ContentAnalysis::analyze(html),
                intent,
            );
            clean_html_with_profile_and_intent(html, resolved, intent)
        }
        HtmlCleaningProfile::Raw => clean_html_raw(html),
        HtmlCleaningProfile::Minimal => clean_html_base(html),
        HtmlCleaningProfile::Default => clean_html(html),
        HtmlCleaningProfile::Slim => clean_html_slim(html),
        HtmlCleaningProfile::Aggressive => clean_html_full(html),
    }
}
/// Cleans `html` with an automatically selected profile.
///
/// Forwards to [`clean_html_with_profile_and_intent`] with
/// `HtmlCleaningProfile::Auto` and the supplied `intent`.
pub fn smart_clean_html(html: &str, intent: CleaningIntent) -> String {
clean_html_with_profile_and_intent(html, HtmlCleaningProfile::Auto, intent)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The raw pass must hand its input back unchanged.
    #[test]
    fn test_clean_html_raw() {
        let html = "<script>alert(1)</script><p>Hello</p>";
        assert_eq!(clean_html_raw(html), html);
    }

    /// The base pass drops scripts and styles but keeps ordinary content.
    #[test]
    fn test_clean_html_base() {
        let html = "<script>alert(1)</script><p>Hello</p><style>.x{}</style>";
        let cleaned = clean_html_base(html);
        assert!(!cleaned.contains("<script>"));
        assert!(!cleaned.contains("<style>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// The slim pass additionally drops heavy embedded elements.
    #[test]
    fn test_clean_html_slim() {
        let html = "<svg><path/></svg><p>Hello</p><canvas></canvas>";
        let cleaned = clean_html_slim(html);
        assert!(!cleaned.contains("<svg>"));
        assert!(!cleaned.contains("<canvas>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// The full pass additionally drops navigation and footer chrome.
    #[test]
    fn test_clean_html_full() {
        let html = "<nav>Menu</nav><p>Hello</p><footer>Footer</footer>";
        let cleaned = clean_html_full(html);
        assert!(!cleaned.contains("<nav>"));
        assert!(!cleaned.contains("<footer>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Auto-selection must not lose plain paragraph text.
    ///
    /// None of the cleaning passes in this file removes an attribute-less `<p>`
    /// or its text, so the assertion holds whichever profile is selected.
    #[test]
    fn test_smart_clean_html() {
        let simple = "<html><body><p>Hello World!</p></body></html>";
        let cleaned = smart_clean_html(simple, CleaningIntent::General);
        assert!(cleaned.contains("Hello World!"));
    }
}