1use crate::{Error, Result};
2use ego_tree::NodeRef;
3use html_escape::encode_double_quoted_attribute;
4use scraper::{ElementRef, Html, node::Node};
5
/// Tag names whose elements are discarded together with everything inside them.
const TAGS_TO_REMOVE: &[&str] = &[
    "script",
    "link",
    "style",
    "svg",
    "base",
];

/// Tag names that are dropped (outside of `<head>`) when, after processing,
/// their content is nothing but whitespace.
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
    "div",
    "span",
    "p",
    "i",
    "b",
    "em",
    "strong",
    "section",
    "article",
    "header",
    "footer",
    "nav",
    "aside",
];

/// Substrings searched for (case-insensitively) in a `<meta property="...">`
/// value; a meta tag is kept when its property contains at least one of them.
const META_PROPERTY_KEYWORDS: &[&str] = &[
    "title",
    "url",
    "image",
    "description",
];

/// Attributes preserved on `<meta>` elements inside `<head>`.
const ALLOWED_META_ATTRS: &[&str] = &[
    "property",
    "content",
];

/// Attributes preserved on elements outside of `<head>`.
const ALLOWED_BODY_ATTRS: &[&str] = &[
    "class",
    "aria-label",
    "href",
    "title",
    "id",
];
27
28pub fn decode_html_entities(content: &str) -> String {
33 html_escape::decode_html_entities(content).to_string()
34}
35
36pub fn slim(html_content: &str) -> Result<String> {
62 let html = Html::parse_document(html_content);
63 let mut output = String::new();
64
65 process_node_recursive(html.tree.root(), false, &mut output)?;
67
68 let content = remove_empty_lines(output)?;
70
71 Ok(content)
72}
73
74fn remove_empty_lines(content: String) -> Result<String> {
76 let lines: Vec<&str> = content.lines().filter(|line| !line.trim().is_empty()).collect();
77 Ok(lines.join("\n"))
78}
79
/// Returns `true` when `s` is empty or consists solely of whitespace characters.
fn is_string_effectively_empty(s: &str) -> bool {
    s.chars().all(char::is_whitespace)
}
84
85fn process_node_recursive(node: NodeRef<Node>, is_in_head_context: bool, output: &mut String) -> Result<()> {
87 match node.value() {
88 Node::Document => {
89 for child in node.children() {
91 process_node_recursive(child, false, output)?; }
93 }
94
95 Node::Doctype(doctype) => {
96 output.push_str("<!DOCTYPE ");
98 output.push_str(&doctype.name);
99 let has_public = !doctype.public_id.is_empty();
100 let has_system = !doctype.system_id.is_empty();
101
102 if has_public {
103 output.push_str(" PUBLIC \"");
104 output.push_str(&doctype.public_id);
105 output.push('"');
106 }
107
108 if has_system {
109 if !has_public {
110 output.push_str(" SYSTEM");
112 }
113 output.push(' '); output.push('"');
115 output.push_str(&doctype.system_id);
116 output.push('"');
117 }
118 output.push('>');
119 }
122
123 Node::Comment(_) => { }
124
125 Node::Text(text) => {
126 let text_content = text.trim();
127 if !text_content.is_empty() {
128 output.push_str(text);
131 }
132 }
133
134 Node::Element(element) => {
135 let tag_name = element.name();
136 let current_node_is_head = tag_name == "head";
137 let child_context_is_in_head = is_in_head_context || current_node_is_head;
139
140 let el_ref = ElementRef::wrap(node).ok_or_else(|| Error::custom("Failed to wrap node as ElementRef"))?;
141
142 if !child_context_is_in_head && TAGS_TO_REMOVE.contains(&tag_name) {
147 return Ok(());
148 }
149 if matches!(tag_name, "script" | "style" | "link" | "base" | "svg") {
151 return Ok(());
152 }
153
154 if is_in_head_context {
156 if tag_name == "title" {
157 } else if tag_name == "meta" {
159 if !should_keep_meta(el_ref) {
160 return Ok(()); }
162 } else {
164 return Ok(()); }
166 }
167
168 let mut children_output = String::new();
170 for child in node.children() {
171 process_node_recursive(child, child_context_is_in_head, &mut children_output)?;
172 }
173
174 let is_empty_after_processing = is_string_effectively_empty(&children_output);
176
177 let is_removable_tag_when_empty = !child_context_is_in_head && REMOVABLE_EMPTY_TAGS.contains(&tag_name);
179
180 let is_empty_head_tag = current_node_is_head && is_empty_after_processing;
182
183 let should_remove_node = (is_removable_tag_when_empty && is_empty_after_processing) || is_empty_head_tag;
184
185 if !should_remove_node {
187 output.push('<');
189 output.push_str(tag_name);
190 filter_and_write_attributes(el_ref, child_context_is_in_head, output)?;
191 output.push('>');
192
193 output.push_str(&children_output);
195
196 output.push_str("</");
198 output.push_str(tag_name);
199 output.push('>');
200 }
201 }
202
203 Node::Fragment => {
204 for child in node.children() {
206 process_node_recursive(child, false, output)?;
207 }
208 }
209
210 Node::ProcessingInstruction(_) => { }
211 }
212 Ok(())
213}
214
215fn should_keep_meta(element: ElementRef) -> bool {
219 if element.value().name() != "meta" {
221 return false;
222 }
223
224 if let Some(prop_value) = element.value().attr("property") {
225 let value_lower = prop_value.to_lowercase();
226 META_PROPERTY_KEYWORDS.iter().any(|&keyword| value_lower.contains(keyword))
228 } else {
229 false
231 }
232}
233
234fn filter_and_write_attributes(element: ElementRef, is_in_head_context: bool, output: &mut String) -> Result<()> {
236 let tag_name = element.value().name();
237
238 let allowed_attrs: &[&str] = if is_in_head_context {
240 match tag_name {
241 "meta" => ALLOWED_META_ATTRS,
242 "title" => &[], _ => &[], }
245 } else {
246 ALLOWED_BODY_ATTRS
248 };
249
250 for (name, value) in element.value().attrs() {
252 if allowed_attrs.contains(&name) {
254 output.push(' ');
255 output.push_str(name);
256 output.push_str("=\"");
257 output.push_str(&encode_double_quoted_attribute(value));
259 output.push('"');
260 }
261 }
262
263 Ok(())
264}
265
// Unit tests for `slim`: each test feeds a small HTML document through it and checks the
// serialized result with substring assertions.
#[cfg(test)]
mod tests {
    use super::*;
    // Lets the tests use `?` on any error type.
    type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;

    /// End-to-end check covering head filtering, removed tags, empty-element pruning,
    /// attribute filtering and comment removal in a single document.
    #[test]
    fn test_slimmer2_slim_basic() -> TestResult<()> {
        // -- Setup & Fixtures
        let fx_html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <meta property="og:title" content="Test Title">
 <meta property="og:url" content="http://example.com">
 <meta property="og:image" content="http://example.com/img.png">
 <meta property="og:description" content="Test Description">
 <meta name="keywords" content="test, html"> <!-- Should be removed -->
 <title>Simple HTML Page</title>
 <style> body{ color: red } </style>
 <link rel="stylesheet" href="style.css">
 <script> console.log("hi"); </script>
 <base href="/"> <!-- Should be removed -->
</head>
<body class="main-body" aria-label="Page body">
 <svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
 <div>
 <span></span> <!-- Should be removed (effectively empty after processing) -->
 <p> <!-- Effectively empty after processing --> </p>
 <b> </b> <!-- Effectively empty after processing -->
 <i><!-- comment --></i> <!-- Effectively empty after processing -->
 </div> <!-- Should be removed (effectively empty after children removed) -->
 <section>Content Inside</section> <!-- Should be kept -->
 <article> </article> <!-- Should be removed (empty after processing) -->
 <h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
 <p>This is a simple HTML page.</p>
 <a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
 <!-- Some Comment -->
</body>
</html>
 "#;

        // The expected strings list attributes in the order the serializer is expected to
        // emit them (e.g. `aria-label` before `class`), not in the order of the input.
        let expected_body_content = r#"<body aria-label="Page body" class="main-body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a class="link-style" href="https://example.org">Link</a></body>"#;

        // -- Exec
        let html = slim(fx_html)?;

        // -- Check: head is kept, with only the `property` metas and the title
        assert!(html.contains("<head>"));
        assert!(html.contains("</head>"));
        assert!(html.contains(r#"<meta content="Test Title" property="og:title">"#));
        assert!(html.contains(r#"<meta content="http://example.com" property="og:url">"#));
        assert!(html.contains(r#"<meta content="http://example.com/img.png" property="og:image">"#));
        assert!(html.contains(r#"<meta content="Test Description" property="og:description">"#));
        assert!(html.contains(r#"<title>Simple HTML Page</title>"#));

        // -- Check: everything else in head is gone
        assert!(
            !html.contains("<meta charset") && !html.contains("<meta name"),
            "Should remove disallowed meta tags"
        );
        assert!(
            !html.contains("<style") && !html.contains("<link") && !html.contains("<script") && !html.contains("<base"),
            "Should remove style, link, script, base"
        );

        // -- Check: body keeps its allowed attributes
        assert!(
            html.contains("<body")
                && html.contains(r#"class="main-body""#)
                && html.contains(r#"aria-label="Page body""#)
                && html.contains(">")
        );
        assert!(html.contains(r#"</body>"#));
        // Exact body serialization: only non-empty elements and allowed attributes remain.
        assert!(html.contains(expected_body_content));

        // -- Check: removed elements, attributes and comments
        assert!(!html.contains("<svg>"), "Should remove svg");
        assert!(!html.contains("<span>"), "Should remove empty span");
        assert!(!html.contains("<p> </p>"), "Should remove empty p tag");
        assert!(!html.contains("<b>"), "Should remove empty b");
        assert!(!html.contains("<i>"), "Should remove empty i");
        assert!(!html.contains("<div>"), "Should remove outer empty div");
        assert!(!html.contains("<article>"), "Should remove empty article");
        assert!(!html.contains("funky-attribute"), "Should remove funky-attribute");
        assert!(!html.contains("extra=\"gone\""), "Should remove extra anchor attribute");
        assert!(!html.contains("<!--"), "Should remove comments");

        Ok(())
    }

    /// A `<head>` whose children are all filtered out must itself disappear.
    #[test]
    fn test_slimmer2_slim_empty_head_removed() -> TestResult<()> {
        // -- Setup & Fixtures
        // Head contains only a non-`property` meta and a link, both of which get dropped.
        let fx_html = r#"
 <!DOCTYPE html>
 <html>
 <head>
 <meta charset="utf-8">
 <link rel="icon" href="favicon.ico">
 </head>
 <body>
 <p>Content</p>
 </body>
 </html>
 "#;

        // -- Exec
        let html = slim(fx_html)?;

        // -- Check
        assert!(
            !html.contains("<head>"),
            "Empty <head> tag should be removed after processing. Got: {}",
            html
        );
        assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

        Ok(())
    }

    /// A `<head>` that still holds a `<title>` after filtering must be kept.
    #[test]
    fn test_slimmer2_slim_keeps_head_if_title_present() -> TestResult<()> {
        // -- Setup & Fixtures
        let fx_html = r#"
 <!DOCTYPE html>
 <html>
 <head>
 <title>Only Title</title>
 <script></script>
 </head>
 <body>
 <p>Content</p>
 </body>
 </html>
 "#;

        // -- Exec
        let html = slim(fx_html)?;

        // -- Check
        assert!(
            html.contains("<head><title>Only Title</title></head>"),
            "<head> with only title should remain"
        );
        assert!(!html.contains("<script>"), "Script should be removed");
        assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

        Ok(())
    }

    /// Emptiness is evaluated after children are processed, so containers that only hold
    /// empty elements are removed as well.
    #[test]
    fn test_slimmer2_slim_nested_empty_removal() -> TestResult<()> {
        // -- Setup & Fixtures
        let fx_html = r#"
 <!DOCTYPE html>
 <html>
 <body>
 <div> <!-- Will become empty after children removed -->
 <p> </p> <!-- empty p -->
 <div> <!-- Inner div, will become empty -->
 <span><!-- comment --></span> <!-- empty span -->
 </div>
 </div>
 <section>
 <h1>Title</h1> <!-- Keep H1 -->
 <div> </div> <!-- Remove empty div -->
 </section>
 </body>
 </html>
 "#;
        let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;

        // -- Exec
        let html = slim(fx_html)?;

        // -- Check
        assert!(
            html.contains(expected_body),
            "Should remove nested empty elements correctly after processing. Expected: '{}', Got: '{}'",
            expected_body,
            html
        );
        assert!(!html.contains("<p>"), "Empty <p> should be removed");
        assert!(!html.contains("<span>"), "Empty <span> should be removed");
        assert!(
            !html.contains("<div>"),
            "All empty <div> tags should be removed (inner and outer)"
        );
        assert!(html.contains("<section>"), "Section should remain");
        assert!(html.contains("<h1>"), "H1 should remain");

        Ok(())
    }

    /// Empty elements whose tag is not in `REMOVABLE_EMPTY_TAGS` (here `main` and the table
    /// elements) must be preserved.
    #[test]
    fn test_slimmer2_slim_keep_empty_but_not_removable() -> TestResult<()> {
        // -- Setup & Fixtures
        let fx_html = r#"
 <!DOCTYPE html>
 <html>
 <body>
 <main></main> <!-- Should keep 'main' even if empty -->
 <table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
 </body>
 </html>
 "#;
        let expected_body_fragment1 = "<main></main>";

        // -- Exec
        let html = slim(fx_html)?;

        // -- Check
        assert!(html.contains(expected_body_fragment1), "Should keep empty <main>");
        // Only the individual table tags are checked, not the exact nesting of the output.
        assert!(
            html.contains("<table>") && html.contains("<tr>") && html.contains("<td>") && html.contains("</table>"),
            "Should keep empty table structure. Got: {}",
            html
        );
        Ok(())
    }
}
520
521