1use crate::support::rcdom::{Handle, NodeData, RcDom, SerializableHandle};
2use crate::{Error, Result};
3use html5ever::driver::ParseOpts;
4use html5ever::parse_document;
5use html5ever::serialize::SerializeOpts;
6use html5ever::tendril::TendrilSink;
7
/// Element names that are dropped, together with their whole subtree, when they are
/// encountered outside the `<head>` context.
const TAGS_TO_REMOVE: &[&str] = &["script", "link", "style", "svg", "base"];

/// Element names that are dropped when, after their children have been processed, they
/// contain nothing but whitespace-only text and/or comments (only applied outside `<head>`).
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
    "div", "span", "p", "i", "b", "em", "strong", "section", "article", "header", "footer", "nav", "aside",
];

/// Substrings searched for in the lowercased value of a `<meta property="...">` attribute;
/// a `<meta>` inside `<head>` is kept only when one of them matches.
const META_PROPERTY_KEYWORDS: &[&str] = &["title", "url", "image", "description"];

/// Attribute names retained on `<meta>` elements inside the head context.
const ALLOWED_META_ATTRS: &[&str] = &["property", "content"];

/// Attribute names retained on elements outside the head context.
const ALLOWED_BODY_ATTRS: &[&str] = &["class", "aria-label", "href", "title", "id"];
27
28pub fn decode_html_entities(content: &str) -> String {
32 html_escape::decode_html_entities(content).to_string()
33}
34
35pub fn slim(html_content: &str) -> Result<String> {
60 let dom = parse_document(RcDom::default(), ParseOpts::default())
61 .from_utf8()
62 .read_from(&mut html_content.as_bytes())?;
63
64 process_node_recursive(&dom.document, false)?;
66
67 let document: SerializableHandle = dom.document.clone().into();
68 let serialize_opts = SerializeOpts {
69 ..Default::default()
73 };
74
75 let mut output = Vec::new();
76 html5ever::serialize(&mut output, &document, serialize_opts)?;
77
78 let content =
79 String::from_utf8(output).map_err(|err| Error::custom(format!("html5ever serialization non utf8. {err}")))?;
80 let content = remove_empty_lines(content)?;
81
82 Ok(content)
83}
84
85fn remove_empty_lines(content: String) -> Result<String> {
87 let lines: Vec<&str> = content.lines().filter(|line| !line.trim().is_empty()).collect();
88 Ok(lines.join("\n"))
89}
90
91fn process_node_recursive(handle: &Handle, is_in_head_context: bool) -> Result<bool> {
94 let should_keep = match &handle.data {
95 NodeData::Element { name, .. } => {
96 let tag_local_name_str = name.local.as_ref();
97 let current_node_is_head = tag_local_name_str == "head";
98 let child_context_is_in_head = is_in_head_context || current_node_is_head;
100
101 let mut keep_current_node: bool;
102
103 if is_in_head_context {
105 if tag_local_name_str == "title" {
107 keep_current_node = true; } else if tag_local_name_str == "meta" {
109 keep_current_node = should_keep_meta(handle); } else {
111 keep_current_node = false; }
113 } else {
114 if TAGS_TO_REMOVE.contains(&tag_local_name_str) {
117 keep_current_node = false; } else {
119 keep_current_node = true;
121 }
122 }
123
124 if keep_current_node {
126 let mut indices_to_remove = Vec::new();
127 let children_handles = handle.children.borrow().clone(); for (index, child) in children_handles.iter().enumerate() {
130 if !process_node_recursive(child, child_context_is_in_head)? {
132 indices_to_remove.push(index);
133 }
134 }
135
136 if !indices_to_remove.is_empty() {
138 let mut children_mut = handle
139 .children
140 .try_borrow_mut()
141 .map_err(|err| Error::custom(format!("Node children already borrowed mutably: {err}")))?;
142 for &index in indices_to_remove.iter().rev() {
143 if index < children_mut.len() {
145 children_mut.remove(index);
146 } else {
147 eprintln!("Warning: Attempted to remove child at invalid index {}", index);
149 }
150 }
151 }
152
153 filter_attributes(handle, child_context_is_in_head)?;
156
157 if (current_node_is_head && handle.children.borrow().is_empty())
162 || (!child_context_is_in_head && REMOVABLE_EMPTY_TAGS.contains(&tag_local_name_str) && is_effectively_empty(handle))
165 {
166 keep_current_node = false;
167 }
168 }
169 keep_current_node
171 }
172 NodeData::Comment { .. } => false, NodeData::Text { contents } => !contents.borrow().trim().is_empty(), NodeData::Document => {
175 let mut indices_to_remove = Vec::new();
177 let children_handles = handle.children.borrow().clone();
178 for (index, child) in children_handles.iter().enumerate() {
179 if !process_node_recursive(child, false)? {
180 indices_to_remove.push(index);
182 }
183 }
184 if !indices_to_remove.is_empty() {
185 let mut children_mut = handle
186 .children
187 .try_borrow_mut()
188 .map_err(|err| Error::custom(format!("Doc children already borrowed mutably: {err}")))?;
189 for &index in indices_to_remove.iter().rev() {
190 if index < children_mut.len() {
191 children_mut.remove(index);
192 }
193 }
194 }
195 true }
197 NodeData::Doctype { .. } => true, NodeData::ProcessingInstruction { .. } => false, };
200 Ok(should_keep)
201}
202
203fn is_effectively_empty(handle: &Handle) -> bool {
205 handle.children.borrow().iter().all(|child| match &child.data {
206 NodeData::Text { contents } => contents.borrow().trim().is_empty(),
207 NodeData::Comment { .. } => true, _ => false,
210 })
211}
212
213fn should_keep_meta(handle: &Handle) -> bool {
215 if let NodeData::Element { ref attrs, .. } = handle.data {
216 let attributes = attrs.borrow();
218 for attr in attributes.iter() {
219 if attr.name.local.as_ref() == "property" {
221 let value = attr.value.to_lowercase();
222 if META_PROPERTY_KEYWORDS.iter().any(|&keyword| value.contains(keyword)) {
224 return true; }
226 }
227 }
228 }
229 false }
231
232fn filter_attributes(handle: &Handle, is_in_head_context: bool) -> Result<()> {
234 if let NodeData::Element {
235 ref name, ref attrs, ..
236 } = handle.data
237 {
238 let mut attributes = attrs
240 .try_borrow_mut()
241 .map_err(|err| Error::custom(format!("Attrs already borrowed mutably for <{}>: {}", name.local, err)))?;
242
243 let tag_local_name_str = name.local.as_ref();
244
245 if is_in_head_context {
246 if tag_local_name_str == "meta" {
247 attributes.retain(|attr| ALLOWED_META_ATTRS.contains(&attr.name.local.as_ref()));
249 } else if tag_local_name_str == "title" {
250 attributes.clear();
252 } else {
253 attributes.clear();
255 }
256 } else {
257 attributes.retain(|attr| ALLOWED_BODY_ATTRS.contains(&attr.name.local.as_ref()));
259 }
260 }
261 Ok(())
262}
263
#[cfg(test)]
mod tests {
    use super::*;
    type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;

    /// Checks, in order, every `(needle, expected_present, message)` expectation against
    /// `html`, failing with `message` on the first one that does not hold.
    fn assert_expectations(html: &str, expectations: &[(&str, bool, &str)]) {
        for &(needle, expected_present, message) in expectations {
            assert!(html.contains(needle) == expected_present, "{}", message);
        }
    }

    /// Full-page scenario: head cleanup, tag removal, empty-element removal and
    /// attribute filtering all at once.
    #[test]
    fn test_slimmer_slim_basic() -> TestResult<()> {
        // -- Setup & Fixtures
        let input_html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta property="og:title" content="Test Title">
    <meta property="og:url" content="http://example.com">
    <meta property="og:image" content="http://example.com/img.png">
    <meta property="og:description" content="Test Description">
    <meta name="keywords" content="test, html"> <!-- Should be removed -->
    <title>Simple HTML Page</title>
    <style> body{ color: red } </style>
    <link rel="stylesheet" href="style.css">
    <script> console.log("hi"); </script>
    <base href="/"> <!-- Should be removed -->
</head>
<body class="main-body" aria-label="Page body">
    <svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
    <div>
        <span></span> <!-- Should be removed (effectively empty) -->
        <p> <!-- Effectively empty --> </p>
        <b> </b> <!-- Effectively empty -->
        <i><!-- comment --></i> <!-- Effectively empty -->
    </div> <!-- Should be removed (effectively empty after children removed) -->
    <section>Content Inside</section> <!-- Should be kept -->
    <article> </article> <!-- Should be removed -->
    <h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
    <p>This is a simple HTML page.</p>
    <a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
    <!-- Some Comment -->
</body>
</html>
        "#;

        let expected_head_content = r#"<head><meta property="og:title" content="Test Title"><meta property="og:url" content="http://example.com"><meta property="og:image" content="http://example.com/img.png"><meta property="og:description" content="Test Description"><title>Simple HTML Page</title></head>"#;
        let expected_body_content = r#"<body class="main-body" aria-label="Page body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a href="https://example.org" class="link-style">Link</a></body>"#;

        // -- Exec
        let slimmed = slim(input_html)?;
        println!("\n---\nSlimmed HTML (Basic + Empty Removal):\n{}\n---\n", slimmed);

        // -- Check: head
        assert_expectations(
            &slimmed,
            &[
                (expected_head_content, true, "Should contain cleaned head content"),
                ("<title>Simple HTML Page</title>", true, "Should keep title"),
                (r#"meta property="og:title""#, true, "Should keep meta title"),
                (r#"meta property="og:url""#, true, "Should keep meta url"),
                (r#"meta property="og:image""#, true, "Should keep meta image"),
                (r#"meta property="og:description""#, true, "Should keep meta description"),
                ("<meta charset", false, "Should remove meta charset"),
                ("<meta name", false, "Should remove meta name tags"),
                ("<style>", false, "Should remove style"),
                ("<link", false, "Should remove link"),
                ("<script", false, "Should remove script from head"),
                ("<base", false, "Should remove base"),
            ],
        );

        // -- Check: body
        assert_expectations(
            &slimmed,
            &[
                (
                    expected_body_content,
                    true,
                    "Should contain cleaned body content (with empty elements removed)",
                ),
                ("<svg>", false, "Should remove svg"),
                ("<span>", false, "Should remove empty span"),
                ("<p> </p>", false, "Should remove empty p"),
                ("<b>", false, "Should remove empty b"),
                ("<i>", false, "Should remove empty i"),
                ("<div>", false, "Should remove outer empty div"),
                ("<article>", false, "Should remove empty article"),
                ("<section>Content Inside</section>", true, "Should keep non-empty section"),
                ("<h1>Hello, World!</h1>", true, "Should keep h1"),
                ("funky-attribute", false, "Should remove funky-attribute"),
                (
                    r#"<body class="main-body" aria-label="Page body">"#,
                    true,
                    "Should keep body attributes",
                ),
                (
                    r#"<a href="https://example.org" class="link-style">Link</a>"#,
                    true,
                    "Should keep allowed anchor attributes",
                ),
                ("extra=\"gone\"", false, "Should remove extra anchor attribute"),
                ("<!--", false, "Should remove comments"),
            ],
        );

        Ok(())
    }

    /// A `<head>` whose children are all removed must disappear itself.
    #[test]
    fn test_slimmer_slim_empty_head_removed() -> TestResult<()> {
        // -- Setup & Fixtures
        let input_html = r#"
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <link rel="icon" href="favicon.ico">
        </head>
        <body>
            <p>Content</p>
        </body>
        </html>
        "#;

        // -- Exec
        let slimmed = slim(input_html)?;
        println!("\n---\nSlimmed HTML (Empty Head):\n{}\n---\n", slimmed);

        // -- Check
        assert_expectations(
            &slimmed,
            &[
                ("<head>", false, "Empty <head> tag should be removed after processing"),
                ("<body><p>Content</p></body>", true, "Body should remain"),
            ],
        );

        Ok(())
    }

    /// A `<head>` that still holds a `<title>` after cleanup must be kept.
    #[test]
    fn test_slimmer_slim_keeps_head_if_title_present() -> TestResult<()> {
        // -- Setup & Fixtures
        let input_html = r#"
        <!DOCTYPE html>
        <html>
        <head>
            <title>Only Title</title>
            <script></script>
        </head>
        <body>
            <p>Content</p>
        </body>
        </html>
        "#;

        // -- Exec
        let slimmed = slim(input_html)?;
        println!("\n---\nSlimmed HTML (Head with Title):\n{}\n---\n", slimmed);

        // -- Check
        assert_expectations(
            &slimmed,
            &[
                (
                    "<head><title>Only Title</title></head>",
                    true,
                    "<head> with only title should remain",
                ),
                ("<script>", false, "Script should be removed"),
                ("<body><p>Content</p></body>", true, "Body should remain"),
            ],
        );

        Ok(())
    }

    /// Containers that become empty once their empty children are removed are removed too.
    #[test]
    fn test_slimmer_slim_nested_empty_removal() -> TestResult<()> {
        // -- Setup & Fixtures
        let input_html = r#"
        <!DOCTYPE html>
        <html>
        <body>
            <div>
                <p> </p> <!-- empty p -->
                <div> <!-- Inner div -->
                    <span><!-- comment --></span> <!-- empty span -->
                </div>
            </div>
            <section>
                <h1>Title</h1> <!-- Keep H1 -->
                <div> </div> <!-- Remove empty div -->
            </section>
        </body>
        </html>
        "#;
        let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;

        // -- Exec
        let slimmed = slim(input_html)?;
        println!("\n---\nSlimmed HTML (Nested Empty):\n{}\n---\n", slimmed);

        // -- Check
        assert_expectations(
            &slimmed,
            &[
                (expected_body, true, "Should remove nested empty elements correctly"),
                ("<p>", false, "Empty <p> should be removed"),
                ("<span>", false, "Empty <span> should be removed"),
                (
                    "<div>",
                    false,
                    "All empty <div> tags should be removed (inner and outer)",
                ),
                ("<section>", true, "Section should remain"),
                ("<h1>", true, "H1 should remain"),
            ],
        );

        Ok(())
    }

    /// Empty elements that are not in the removable list must survive.
    #[test]
    fn test_slimmer_slim_keep_empty_but_not_removable() -> TestResult<()> {
        // -- Setup & Fixtures
        let input_html = r#"
        <!DOCTYPE html>
        <html>
        <body>
            <main></main> <!-- Should keep 'main' even if empty -->
            <table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
        </body>
        </html>
        "#;

        // -- Exec
        let slimmed = slim(input_html)?;
        println!("\n---\nSlimmed HTML (Keep Non-Removable Empty):\n{}\n---\n", slimmed);

        // -- Check
        assert_expectations(
            &slimmed,
            &[
                ("<main>", true, "Should keep empty <main>"),
                ("<table>", true, "Should keep empty <table>"),
                ("<tr>", true, "Should keep empty <tr>"),
                ("<td>", true, "Should keep empty <td>"),
            ],
        );

        Ok(())
    }
}
504
505