use crate::{Error, Result};
use ego_tree::NodeRef;
use html_escape::encode_double_quoted_attribute;
use scraper::{ElementRef, Html, node::Node};
/// Element names that `process_node_recursive` drops together with their whole subtree.
const TAGS_TO_REMOVE: &[&str] = &["script", "link", "style", "svg", "base"];
/// Element names that are dropped (outside of `<head>`) when, after their children have been
/// processed, nothing but whitespace is left inside them.
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
"div", "span", "p", "i", "b", "em", "strong", "section", "article", "header", "footer", "nav", "aside",
];
/// Substrings looked for (after lowercasing) in a `<meta>` element's `property` attribute;
/// `should_keep_meta` keeps the element when any of them is found.
const META_PROPERTY_KEYWORDS: &[&str] = &["title", "url", "image", "description"];
/// Attributes that `filter_and_write_attributes` keeps on `<meta>` elements in head context.
const ALLOWED_META_ATTRS: &[&str] = &["property", "content"];
/// Attributes that `filter_and_write_attributes` keeps on elements outside of head context.
const ALLOWED_BODY_ATTRS: &[&str] = &["class", "aria-label", "href", "title", "id"];
/// Decodes the HTML entities found in `content` and returns the result as an owned `String`.
///
/// This is a thin wrapper around `html_escape::decode_html_entities`.
pub fn decode_html_entities(content: &str) -> String {
    let decoded = html_escape::decode_html_entities(content);
    decoded.into_owned()
}
/// Parses `html_content` as a full HTML document and returns a slimmed-down serialization.
///
/// The document tree is walked by `process_node_recursive`, which decides what is kept, and the
/// resulting text then has its blank lines stripped by `remove_empty_lines`.
///
/// # Errors
///
/// Returns whatever error `process_node_recursive` or `remove_empty_lines` reports.
pub fn slim(html_content: &str) -> Result<String> {
    let document = Html::parse_document(html_content);

    let mut serialized = String::new();
    process_node_recursive(document.tree.root(), false, &mut serialized)?;

    remove_empty_lines(serialized)
}
/// Returns `content` without its blank lines (lines that are empty or whitespace-only).
///
/// The surviving lines are kept untrimmed and are joined with a single `\n`; the result has no
/// trailing newline. This function itself never produces an error.
fn remove_empty_lines(content: String) -> Result<String> {
    let mut kept = String::with_capacity(content.len());

    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }
        // A kept line is never empty, so a non-empty buffer means a line was already written.
        if !kept.is_empty() {
            kept.push('\n');
        }
        kept.push_str(line);
    }

    Ok(kept)
}
/// Returns `true` when `s` is empty or consists only of whitespace characters.
fn is_string_effectively_empty(s: &str) -> bool {
    s.chars().all(char::is_whitespace)
}
/// HTML void elements: they cannot have content, so they are written without an end tag.
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
];

/// Writes the slimmed serialization of `node` and its subtree into `output`.
///
/// - Document and fragment roots only recurse into their children (outside of head context).
/// - The doctype is re-emitted, including its public/system identifiers when they are non-empty.
/// - Comments and processing instructions are dropped.
/// - Text is appended as-is unless it is whitespace-only.
/// - Elements are handled by `process_element`.
///
/// `is_in_head_context` tells whether `node` sits below a `<head>` element.
///
/// # Errors
///
/// Propagates any error raised while processing an element of the subtree.
fn process_node_recursive(node: NodeRef<Node>, is_in_head_context: bool, output: &mut String) -> Result<()> {
    match node.value() {
        Node::Document | Node::Fragment => {
            for child in node.children() {
                process_node_recursive(child, false, output)?;
            }
        }
        Node::Doctype(doctype) => {
            output.push_str("<!DOCTYPE ");
            output.push_str(&doctype.name);

            let has_public = !doctype.public_id.is_empty();
            if has_public {
                output.push_str(" PUBLIC \"");
                output.push_str(&doctype.public_id);
                output.push('"');
            }
            if !doctype.system_id.is_empty() {
                // The SYSTEM keyword is only written when no PUBLIC identifier precedes it.
                if !has_public {
                    output.push_str(" SYSTEM");
                }
                output.push_str(" \"");
                output.push_str(&doctype.system_id);
                output.push('"');
            }
            output.push('>');
        }
        Node::Comment(_) | Node::ProcessingInstruction(_) => {}
        Node::Text(text) => {
            // Whitespace-only text is dropped; any other text keeps its surrounding whitespace.
            // NOTE(review): the text is written without re-escaping `&`, `<` or `>`. Presumably
            // intentional (callers may want decoded text) - confirm before changing.
            if !text.trim().is_empty() {
                output.push_str(text);
            }
        }
        Node::Element(_) => process_element(node, is_in_head_context, output)?,
    }
    Ok(())
}

/// Writes one element (start tag, filtered attributes, processed children, end tag) into
/// `output`, or nothing at all when the element is filtered out.
///
/// An element is filtered out when:
/// - its name is listed in `TAGS_TO_REMOVE`;
/// - it sits below `<head>` and is neither `<title>` nor a `<meta>` accepted by `should_keep_meta`;
/// - it is `<head>`, or a `REMOVABLE_EMPTY_TAGS` element outside of head context, and its
///   processed children are effectively empty.
///
/// Void elements without content are written as a start tag only (`<br>`, `<meta ...>`).
///
/// # Errors
///
/// Fails when `node` cannot be wrapped as an `ElementRef`, or when processing a child fails.
fn process_element(node: NodeRef<Node>, is_in_head_context: bool, output: &mut String) -> Result<()> {
    let el_ref = ElementRef::wrap(node).ok_or_else(|| Error::custom("Failed to wrap node as ElementRef"))?;
    let tag_name = el_ref.value().name();

    if TAGS_TO_REMOVE.contains(&tag_name) {
        return Ok(());
    }

    // Below <head>, only <title> and the accepted <meta> elements survive.
    if is_in_head_context {
        let keep = match tag_name {
            "title" => true,
            "meta" => should_keep_meta(el_ref),
            _ => false,
        };
        if !keep {
            return Ok(());
        }
    }

    let is_head = tag_name == "head";
    let child_context_is_in_head = is_in_head_context || is_head;

    // Children are rendered into a scratch buffer first so that emptiness can be decided
    // before anything is written for this element.
    let mut children_output = String::new();
    for child in node.children() {
        process_node_recursive(child, child_context_is_in_head, &mut children_output)?;
    }

    let is_removable_when_empty =
        is_head || (!child_context_is_in_head && REMOVABLE_EMPTY_TAGS.contains(&tag_name));
    if is_removable_when_empty && is_string_effectively_empty(&children_output) {
        return Ok(());
    }

    output.push('<');
    output.push_str(tag_name);
    filter_and_write_attributes(el_ref, child_context_is_in_head, output)?;
    output.push('>');

    // A void element has no end tag: `<br></br>` would be re-parsed as two line breaks.
    // The emptiness check guarantees that content is never silently discarded.
    if VOID_ELEMENTS.contains(&tag_name) && children_output.is_empty() {
        return Ok(());
    }

    output.push_str(&children_output);
    output.push_str("</");
    output.push_str(tag_name);
    output.push('>');

    Ok(())
}
/// Tells whether a `<meta>` element is worth keeping in the slimmed output.
///
/// The element is kept only when it is a `meta` element whose `property` attribute, once
/// lowercased, contains at least one of the `META_PROPERTY_KEYWORDS`. Elements without a
/// `property` attribute are rejected.
fn should_keep_meta(element: ElementRef) -> bool {
    let meta = element.value();
    if meta.name() != "meta" {
        return false;
    }

    meta.attr("property").map_or(false, |property| {
        let lowered = property.to_lowercase();
        META_PROPERTY_KEYWORDS.iter().copied().any(|keyword| lowered.contains(keyword))
    })
}
/// Appends the allowed attributes of `element` to `output`, each as ` name="value"`.
///
/// The allow-list depends on the context:
/// - in head context, `<meta>` keeps `ALLOWED_META_ATTRS` and every other element keeps nothing;
/// - outside of head context, every element keeps `ALLOWED_BODY_ATTRS`.
///
/// Values are escaped with `encode_double_quoted_attribute`. This function itself never
/// produces an error.
fn filter_and_write_attributes(element: ElementRef, is_in_head_context: bool, output: &mut String) -> Result<()> {
    let el = element.value();

    let allowed_attrs: &[&str] = match (is_in_head_context, el.name()) {
        (true, "meta") => ALLOWED_META_ATTRS,
        (true, _) => &[],
        (false, _) => ALLOWED_BODY_ATTRS,
    };

    for (name, value) in el.attrs() {
        if !allowed_attrs.contains(&name) {
            continue;
        }
        output.push(' ');
        output.push_str(name);
        output.push_str("=\"");
        output.push_str(&encode_double_quoted_attribute(value));
        output.push('"');
    }

    Ok(())
}
// Tests for `slim`. Each test feeds an HTML fixture through `slim` and checks the produced
// markup with substring assertions.
#[cfg(test)]
mod tests {
use super::*;
// Result type of the tests, so that `?` can be used on `slim`.
type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;
// End-to-end check on a representative page: head filtering (only the title and the
// og:* meta elements are expected to remain), removal of style/link/script/base/svg,
// removal of empty elements, attribute filtering and comment removal.
#[test]
fn test_slimmer2_slim_basic() -> TestResult<()> {
let fx_html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta property="og:title" content="Test Title">
<meta property="og:url" content="http://example.com">
<meta property="og:image" content="http://example.com/img.png">
<meta property="og:description" content="Test Description">
<meta name="keywords" content="test, html"> <!-- Should be removed -->
<title>Simple HTML Page</title>
<style> body{ color: red } </style>
<link rel="stylesheet" href="style.css">
<script> console.log("hi"); </script>
<base href="/"> <!-- Should be removed -->
</head>
<body class="main-body" aria-label="Page body">
<svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
<div>
<span></span> <!-- Should be removed (effectively empty after processing) -->
<p> <!-- Effectively empty after processing --> </p>
<b> </b> <!-- Effectively empty after processing -->
<i><!-- comment --></i> <!-- Effectively empty after processing -->
</div> <!-- Should be removed (effectively empty after children removed) -->
<section>Content Inside</section> <!-- Should be kept -->
<article> </article> <!-- Should be removed (empty after processing) -->
<h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
<p>This is a simple HTML page.</p>
<a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
<!-- Some Comment -->
</body>
</html>
"#;
// The expected strings list attributes in a different order than the fixture
// (`aria-label` before `class`, `content` before `property`).
// NOTE(review): presumably the parser yields attributes sorted by name - confirm against scraper.
let expected_body_content = r#"<body aria-label="Page body" class="main-body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a class="link-style" href="https://example.org">Link</a></body>"#;
let html = slim(fx_html)?;
// Head: the element itself, the og:* meta elements and the title must be present.
assert!(html.contains("<head>"));
assert!(html.contains("</head>"));
assert!(html.contains(r#"<meta content="Test Title" property="og:title">"#));
assert!(html.contains(r#"<meta content="http://example.com" property="og:url">"#));
assert!(html.contains(r#"<meta content="http://example.com/img.png" property="og:image">"#));
assert!(html.contains(r#"<meta content="Test Description" property="og:description">"#));
assert!(html.contains(r#"<title>Simple HTML Page</title>"#));
// Head: meta elements without a matching `property` and all non-content tags must be gone.
assert!(
!html.contains("<meta charset") && !html.contains("<meta name"),
"Should remove disallowed meta tags"
);
assert!(
!html.contains("<style") && !html.contains("<link") && !html.contains("<script") && !html.contains("<base"),
"Should remove style, link, script, base"
);
// Body: the allowed attributes are kept and the whole body matches the expected markup.
assert!(
html.contains("<body")
&& html.contains(r#"class="main-body""#)
&& html.contains(r#"aria-label="Page body""#)
&& html.contains(">")
);
assert!(html.contains(r#"</body>"#));
assert!(html.contains(expected_body_content));
// Body: removed elements, removed attributes and removed comments.
assert!(!html.contains("<svg>"), "Should remove svg");
assert!(!html.contains("<span>"), "Should remove empty span");
assert!(!html.contains("<p> </p>"), "Should remove empty p tag");
assert!(!html.contains("<b>"), "Should remove empty b");
assert!(!html.contains("<i>"), "Should remove empty i");
assert!(!html.contains("<div>"), "Should remove outer empty div");
assert!(!html.contains("<article>"), "Should remove empty article");
assert!(!html.contains("funky-attribute"), "Should remove funky-attribute");
assert!(!html.contains("extra=\"gone\""), "Should remove extra anchor attribute");
assert!(!html.contains("<!--"), "Should remove comments");
Ok(())
}
// A <head> whose children are all filtered out must itself disappear from the output.
#[test]
fn test_slimmer2_slim_empty_head_removed() -> TestResult<()> {
let fx_html = r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<link rel="icon" href="favicon.ico">
</head>
<body>
<p>Content</p>
</body>
</html>
"#;
let html = slim(fx_html)?;
assert!(
!html.contains("<head>"),
"Empty <head> tag should be removed after processing. Got: {}",
html
);
assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");
Ok(())
}
// A <head> that still holds a <title> after filtering must be kept, without the script.
#[test]
fn test_slimmer2_slim_keeps_head_if_title_present() -> TestResult<()> {
let fx_html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Only Title</title>
<script></script>
</head>
<body>
<p>Content</p>
</body>
</html>
"#;
let html = slim(fx_html)?;
assert!(
html.contains("<head><title>Only Title</title></head>"),
"<head> with only title should remain"
);
assert!(!html.contains("<script>"), "Script should be removed");
assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");
Ok(())
}
// Emptiness is decided after the children were processed, so containers holding only
// empty elements must be removed as well (inner and outer <div>).
#[test]
fn test_slimmer2_slim_nested_empty_removal() -> TestResult<()> {
let fx_html = r#"
<!DOCTYPE html>
<html>
<body>
<div> <!-- Will become empty after children removed -->
<p> </p> <!-- empty p -->
<div> <!-- Inner div, will become empty -->
<span><!-- comment --></span> <!-- empty span -->
</div>
</div>
<section>
<h1>Title</h1> <!-- Keep H1 -->
<div> </div> <!-- Remove empty div -->
</section>
</body>
</html>
"#;
let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;
let html = slim(fx_html)?;
assert!(
html.contains(expected_body),
"Should remove nested empty elements correctly after processing. Expected: '{}', Got: '{}'",
expected_body,
html
);
assert!(!html.contains("<p>"), "Empty <p> should be removed");
assert!(!html.contains("<span>"), "Empty <span> should be removed");
assert!(
!html.contains("<div>"),
"All empty <div> tags should be removed (inner and outer)"
);
assert!(html.contains("<section>"), "Section should remain");
assert!(html.contains("<h1>"), "H1 should remain");
Ok(())
}
// Elements that are not listed in `REMOVABLE_EMPTY_TAGS` (here <main> and the table
// structure) must be kept even when they have no content.
#[test]
fn test_slimmer2_slim_keep_empty_but_not_removable() -> TestResult<()> {
let fx_html = r#"
<!DOCTYPE html>
<html>
<body>
<main></main> <!-- Should keep 'main' even if empty -->
<table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
</body>
</html>
"#;
let expected_body_fragment1 = "<main></main>";
let html = slim(fx_html)?;
assert!(html.contains(expected_body_fragment1), "Should keep empty <main>");
assert!(
html.contains("<table>") && html.contains("<tr>") && html.contains("<td>") && html.contains("</table>"),
"Should keep empty table structure. Got: {}",
html
);
Ok(())
}
}