html_helpers/
slimmer.rs

1use crate::support::rcdom::{Handle, NodeData, RcDom, SerializableHandle};
2use crate::{Error, Result};
3use html5ever::driver::ParseOpts;
4use html5ever::parse_document;
5use html5ever::serialize::SerializeOpts;
6use html5ever::tendril::TendrilSink;
7
8// region:    --- Constants
9
/// Elements dropped outright (together with their whole subtree) when they are
/// encountered outside the `<head>` context.
///
/// Inside `<head>` a stricter rule is applied by the tree walker: only `<title>`
/// and selected `<meta>` elements survive, so this list is not consulted there.
const TAGS_TO_REMOVE: &[&str] = &[
	"script", //
	"link",
	"style",
	"svg",
	"base",
];

/// Elements that are discarded once, after their children have been processed,
/// nothing but whitespace text and/or comments is left inside them.
/// Only consulted for elements outside the `<head>` context.
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
	"div", "span", "p", "i", "b", "em", "strong", //
	"section", "article", "header", "footer", "nav", "aside",
];

/// A `<meta>` element in `<head>` is kept when the lowercased value of its
/// `property` attribute contains at least one of these substrings.
const META_PROPERTY_KEYWORDS: &[&str] = &["title", "url", "image", "description"];

/// Attributes that survive on `<meta>` elements kept inside `<head>`.
const ALLOWED_META_ATTRS: &[&str] = &["property", "content"];

/// Attributes that survive on elements outside the `<head>` context; every other
/// attribute is stripped.
const ALLOWED_BODY_ATTRS: &[&str] = &["class", "aria-label", "href", "title", "id"];
27
28// endregion: --- Constants
29
30/// Decodes HTML entities (e.g., `&lt;` becomes `<`).
31pub fn decode_html_entities(content: &str) -> String {
32	html_escape::decode_html_entities(content).to_string()
33}
34
35/// Strips non-content elements from the provided HTML content, preserving essential head tags,
36/// and returns the cleaned HTML as a string.
37///
38/// This function removes:
39/// - Non-visible tags like `<script>`, `<link>`, `<style>`, `<svg>`, `<base>`.
40/// - HTML comments.
41/// - Empty or whitespace-only text nodes.
42/// - Specific tags (like `<div>`, `<span>`, `<p>`, etc.) if they become effectively empty after processing children.
43/// - Attributes except for specific allowlists (`class`, `aria-label`, `href` outside head; `property`, `content` for relevant meta tags in head).
44///
45/// It preserves:
46/// - `<title>` tag within `<head>`.
47/// - `<meta>` tags within `<head>` if their `property` attribute matches keywords in `META_PROPERTY_KEYWORDS`.
48/// - Essential body content.
49///
50/// # Arguments
51///
52/// * `html_content` - A string slice containing the HTML content to be processed.
53///
54/// # Returns
55///
56/// A `Result<String>` which is:
57/// - `Ok(String)` containing the cleaned HTML content.
58/// - `Err` if any parsing or serialization errors occur.
59pub fn slim(html_content: &str) -> Result<String> {
60	let dom = parse_document(RcDom::default(), ParseOpts::default())
61		.from_utf8()
62		.read_from(&mut html_content.as_bytes())?;
63
64	// Process the document starting from the root, initially not inside <head>
65	process_node_recursive(&dom.document, false)?;
66
67	let document: SerializableHandle = dom.document.clone().into();
68	let serialize_opts = SerializeOpts {
69		// script_enabled: false, // Keep default, irrelevant as scripts are removed
70		// traversal_scope: markup5ever::serialize::TraversalScope::IncludeNode, // Default
71		// create_missing_html_ns: true, // Keep default
72		..Default::default()
73	};
74
75	let mut output = Vec::new();
76	html5ever::serialize(&mut output, &document, serialize_opts)?;
77
78	let content =
79		String::from_utf8(output).map_err(|err| Error::custom(format!("html5ever serialization non utf8. {err}")))?;
80	let content = remove_empty_lines(content)?;
81
82	Ok(content)
83}
84
85/// Removes empty lines from the given content, returning the cleaned string.
86fn remove_empty_lines(content: String) -> Result<String> {
87	let lines: Vec<&str> = content.lines().filter(|line| !line.trim().is_empty()).collect();
88	Ok(lines.join("\n"))
89}
90
91/// Recursively processes the DOM tree, removing unwanted nodes and attributes.
92/// Returns Ok(true) if the node should be kept, Ok(false) if it should be removed.
93fn process_node_recursive(handle: &Handle, is_in_head_context: bool) -> Result<bool> {
94	let should_keep = match &handle.data {
95		NodeData::Element { name, .. } => {
96			let tag_local_name_str = name.local.as_ref();
97			let current_node_is_head = tag_local_name_str == "head";
98			// Determine context for children: true if current node is <head> or if parent was already in <head>
99			let child_context_is_in_head = is_in_head_context || current_node_is_head;
100
101			let mut keep_current_node: bool;
102
103			// --- Determine if the current node itself should be kept (initial decision) ---
104			if is_in_head_context {
105				// Rules for nodes *directly* within <head> context
106				if tag_local_name_str == "title" {
107					keep_current_node = true; // Keep <title>
108				} else if tag_local_name_str == "meta" {
109					keep_current_node = should_keep_meta(handle); // Keep specific <meta> tags
110				} else {
111					keep_current_node = false; // Remove other tags within <head> context
112				}
113			} else {
114				// Rules for nodes *outside* <head> context OR the <head> tag itself
115				// Compare tag name string directly using the constant list
116				if TAGS_TO_REMOVE.contains(&tag_local_name_str) {
117					keep_current_node = false; // Remove explicitly listed tags
118				} else {
119					// Keep <head>, <body>, <html>, and other tags by default unless explicitly removed or emptied later.
120					keep_current_node = true;
121				}
122			}
123
124			// --- Process Children Recursively ---
125			if keep_current_node {
126				let mut indices_to_remove = Vec::new();
127				let children_handles = handle.children.borrow().clone(); // Clone Vec<Rc<Node>> for iteration
128
129				for (index, child) in children_handles.iter().enumerate() {
130					// Recurse and check if the child should be kept
131					if !process_node_recursive(child, child_context_is_in_head)? {
132						indices_to_remove.push(index);
133					}
134				}
135
136				// Remove children marked for removal after iteration
137				if !indices_to_remove.is_empty() {
138					let mut children_mut = handle
139						.children
140						.try_borrow_mut()
141						.map_err(|err| Error::custom(format!("Node children already borrowed mutably: {err}")))?;
142					for &index in indices_to_remove.iter().rev() {
143						// index must be valid as we iterated over the original length
144						if index < children_mut.len() {
145							children_mut.remove(index);
146						} else {
147							// This case should ideally not happen if indexing is correct
148							eprintln!("Warning: Attempted to remove child at invalid index {}", index);
149						}
150					}
151				}
152
153				// --- Filter Attributes of the current node (if kept) ---
154				// Pass the context where the node *lives* (is_in_head_context || current_node_is_head)
155				filter_attributes(handle, child_context_is_in_head)?;
156
157				// --- Re-evaluate if the current node should be kept (post-processing) ---
158
159				// Remove <head> if it became empty after processing children/attributes,
160				// or remove specific tags if they are effectively empty (only applies outside <head>)
161				if (current_node_is_head && handle.children.borrow().is_empty())
162					|| (!child_context_is_in_head // Check applies outside <head>
163    && REMOVABLE_EMPTY_TAGS.contains(&tag_local_name_str) // Compare string directly
164    && is_effectively_empty(handle))
165				{
166					keep_current_node = false;
167				}
168			}
169			// Return the final decision
170			keep_current_node
171		}
172		NodeData::Comment { .. } => false, // Remove comments
173		NodeData::Text { contents } => !contents.borrow().trim().is_empty(), // Keep non-empty text
174		NodeData::Document => {
175			// Process children of the document root, always keep the document node itself
176			let mut indices_to_remove = Vec::new();
177			let children_handles = handle.children.borrow().clone();
178			for (index, child) in children_handles.iter().enumerate() {
179				if !process_node_recursive(child, false)? {
180					// Start children with is_in_head_context = false
181					indices_to_remove.push(index);
182				}
183			}
184			if !indices_to_remove.is_empty() {
185				let mut children_mut = handle
186					.children
187					.try_borrow_mut()
188					.map_err(|err| Error::custom(format!("Doc children already borrowed mutably: {err}")))?;
189				for &index in indices_to_remove.iter().rev() {
190					if index < children_mut.len() {
191						children_mut.remove(index);
192					}
193				}
194			}
195			true // Keep the document node
196		}
197		NodeData::Doctype { .. } => true,                // Keep Doctype
198		NodeData::ProcessingInstruction { .. } => false, // Remove PIs
199	};
200	Ok(should_keep)
201}
202
203/// Checks if a node contains only whitespace text nodes or comments.
204fn is_effectively_empty(handle: &Handle) -> bool {
205	handle.children.borrow().iter().all(|child| match &child.data {
206		NodeData::Text { contents } => contents.borrow().trim().is_empty(),
207		NodeData::Comment { .. } => true, // Comments are ignored/removed elsewhere, treat as empty component
208		// Any other node type (Element, Doctype, PI) means it's not effectively empty
209		_ => false,
210	})
211}
212
213/// Checks if a `<meta>` tag handle should be kept based on its `property` attribute.
214fn should_keep_meta(handle: &Handle) -> bool {
215	if let NodeData::Element { ref attrs, .. } = handle.data {
216		// Borrow attributes immutably
217		let attributes = attrs.borrow();
218		for attr in attributes.iter() {
219			// Check if the attribute name is 'property'
220			if attr.name.local.as_ref() == "property" {
221				let value = attr.value.to_lowercase();
222				// Check if the property value contains any of the relevant keywords
223				if META_PROPERTY_KEYWORDS.iter().any(|&keyword| value.contains(keyword)) {
224					return true; // Keep this meta tag
225				}
226			}
227		}
228	}
229	false // Do not keep if not meta or property doesn't match
230}
231
232/// Filters attributes of an element node based on whether it's inside the `<head>` section context.
233fn filter_attributes(handle: &Handle, is_in_head_context: bool) -> Result<()> {
234	if let NodeData::Element {
235		ref name, ref attrs, ..
236	} = handle.data
237	{
238		// Borrow attributes mutably to retain specific ones
239		let mut attributes = attrs
240			.try_borrow_mut()
241			.map_err(|err| Error::custom(format!("Attrs already borrowed mutably for <{}>: {}", name.local, err)))?;
242
243		let tag_local_name_str = name.local.as_ref();
244
245		if is_in_head_context {
246			if tag_local_name_str == "meta" {
247				// For <meta> tags inside <head>, keep attributes from the allowed list
248				attributes.retain(|attr| ALLOWED_META_ATTRS.contains(&attr.name.local.as_ref()));
249			} else if tag_local_name_str == "title" {
250				// For <title> tags, remove all attributes
251				attributes.clear();
252			} else {
253				// For other unexpected tags potentially kept inside head, clear attributes just in case
254				attributes.clear();
255			}
256		} else {
257			// For elements outside <head>, keep attributes from the allowed list
258			attributes.retain(|attr| ALLOWED_BODY_ATTRS.contains(&attr.name.local.as_ref()));
259		}
260	}
261	Ok(())
262}
263
264// region:    --- Tests
265
266#[cfg(test)]
267mod tests {
268	use super::*;
269	// Result type alias for tests
270	type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;
271
272	#[test]
273	fn test_slimmer_slim_basic() -> TestResult<()> {
274		// -- Setup & Fixtures
275		let fx_html = r#"
276<!DOCTYPE html>
277<html lang="en">
278<head>
279    <meta charset="UTF-8">
280    <meta name="viewport" content="width=device-width, initial-scale=1.0">
281	<meta property="og:title" content="Test Title">
282	<meta property="og:url" content="http://example.com">
283	<meta property="og:image" content="http://example.com/img.png">
284	<meta property="og:description" content="Test Description">
285	<meta name="keywords" content="test, html"> <!-- Should be removed -->
286    <title>Simple HTML Page</title>
287	<style> body{ color: red } </style>
288	<link rel="stylesheet" href="style.css">
289	<script> console.log("hi"); </script>
290	<base href="/"> <!-- Should be removed -->
291</head>
292<body class="main-body" aria-label="Page body">
293	<svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
294	<div>
295		<span></span> <!-- Should be removed (effectively empty) -->
296		<p> <!-- Effectively empty --> </p>
297		<b>  </b> <!-- Effectively empty -->
298		<i><!-- comment --></i> <!-- Effectively empty -->
299	</div> <!-- Should be removed (effectively empty after children removed) -->
300	<section>Content Inside</section> <!-- Should be kept -->
301	<article>  </article> <!-- Should be removed -->
302    <h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
303    <p>This is a simple HTML page.</p>
304	<a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
305	<!-- Some Comment -->
306</body>
307</html>
308		"#;
309
310		let expected_head_content = r#"<head><meta property="og:title" content="Test Title"><meta property="og:url" content="http://example.com"><meta property="og:image" content="http://example.com/img.png"><meta property="og:description" content="Test Description"><title>Simple HTML Page</title></head>"#;
311		// Note: The outer <div>, inner <span>, <p>, <b>, <i> and <article> are now removed because they become empty.
312		let expected_body_content = r#"<body class="main-body" aria-label="Page body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a href="https://example.org" class="link-style">Link</a></body>"#;
313
314		// -- Exec
315		let html = slim(fx_html)?;
316		println!("\n---\nSlimmed HTML (Basic + Empty Removal):\n{}\n---\n", html);
317
318		// -- Check Head Content
319		assert!(
320			html.contains(expected_head_content),
321			"Should contain cleaned head content"
322		);
323		assert!(html.contains("<title>Simple HTML Page</title>"), "Should keep title");
324		assert!(html.contains(r#"meta property="og:title""#), "Should keep meta title");
325		assert!(html.contains(r#"meta property="og:url""#), "Should keep meta url");
326		assert!(html.contains(r#"meta property="og:image""#), "Should keep meta image");
327		assert!(
328			html.contains(r#"meta property="og:description""#),
329			"Should keep meta description"
330		);
331		assert!(!html.contains("<meta charset"), "Should remove meta charset");
332		assert!(!html.contains("<meta name"), "Should remove meta name tags");
333		assert!(!html.contains("<style>"), "Should remove style");
334		assert!(!html.contains("<link"), "Should remove link");
335		assert!(!html.contains("<script"), "Should remove script from head");
336		assert!(!html.contains("<base"), "Should remove base");
337
338		// -- Check Body Content
339		assert!(
340			html.contains(expected_body_content),
341			"Should contain cleaned body content (with empty elements removed)"
342		);
343		assert!(!html.contains("<svg>"), "Should remove svg");
344		assert!(!html.contains("<span>"), "Should remove empty span");
345		assert!(!html.contains("<p> </p>"), "Should remove empty p");
346		assert!(!html.contains("<b>"), "Should remove empty b");
347		assert!(!html.contains("<i>"), "Should remove empty i");
348		assert!(!html.contains("<div>"), "Should remove outer empty div");
349		assert!(!html.contains("<article>"), "Should remove empty article");
350		assert!(
351			html.contains("<section>Content Inside</section>"),
352			"Should keep non-empty section"
353		);
354		assert!(html.contains("<h1>Hello, World!</h1>"), "Should keep h1");
355		assert!(!html.contains("funky-attribute"), "Should remove funky-attribute");
356		assert!(
357			html.contains(r#"<body class="main-body" aria-label="Page body">"#),
358			"Should keep body attributes"
359		);
360		assert!(
361			html.contains(r#"<a href="https://example.org" class="link-style">Link</a>"#),
362			"Should keep allowed anchor attributes"
363		);
364		assert!(!html.contains("extra=\"gone\""), "Should remove extra anchor attribute");
365		assert!(!html.contains("<!--"), "Should remove comments");
366
367		Ok(())
368	}
369
370	#[test]
371	fn test_slimmer_slim_empty_head_removed() -> TestResult<()> {
372		// -- Setup & Fixtures
373		let fx_html = r#"
374		<!DOCTYPE html>
375		<html>
376		<head>
377			<meta charset="utf-8">
378			<link rel="icon" href="favicon.ico">
379		</head>
380		<body>
381			<p>Content</p>
382		</body>
383		</html>
384		"#;
385
386		// -- Exec
387		let html = slim(fx_html)?;
388		println!("\n---\nSlimmed HTML (Empty Head):\n{}\n---\n", html);
389
390		// -- Check
391		assert!(
392			!html.contains("<head>"),
393			"Empty <head> tag should be removed after processing"
394		);
395		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");
396
397		Ok(())
398	}
399
400	#[test]
401	fn test_slimmer_slim_keeps_head_if_title_present() -> TestResult<()> {
402		// -- Setup & Fixtures
403		let fx_html = r#"
404		<!DOCTYPE html>
405		<html>
406		<head>
407			<title>Only Title</title>
408			<script></script>
409		</head>
410		<body>
411			<p>Content</p>
412		</body>
413		</html>
414		"#;
415
416		// -- Exec
417		let html = slim(fx_html)?;
418		println!("\n---\nSlimmed HTML (Head with Title):\n{}\n---\n", html);
419
420		// -- Check
421		assert!(
422			html.contains("<head><title>Only Title</title></head>"),
423			"<head> with only title should remain"
424		);
425		assert!(!html.contains("<script>"), "Script should be removed");
426		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");
427
428		Ok(())
429	}
430
431	#[test]
432	fn test_slimmer_slim_nested_empty_removal() -> TestResult<()> {
433		// -- Setup & Fixtures
434		let fx_html = r#"
435		<!DOCTYPE html>
436		<html>
437		<body>
438			<div>
439				<p>  </p> <!-- empty p -->
440				<div> <!-- Inner div -->
441					<span><!-- comment --></span> <!-- empty span -->
442				</div>
443			</div>
444			<section>
445				<h1>Title</h1> <!-- Keep H1 -->
446				<div> </div> <!-- Remove empty div -->
447			</section>
448		</body>
449		</html>
450		"#;
451		// Expected: Outer div removed, inner div removed, p removed, span removed. Section and H1 remain.
452		let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;
453
454		// -- Exec
455		let html = slim(fx_html)?;
456		println!("\n---\nSlimmed HTML (Nested Empty):\n{}\n---\n", html);
457
458		// -- Check
459		assert!(
460			html.contains(expected_body),
461			"Should remove nested empty elements correctly"
462		);
463		assert!(!html.contains("<p>"), "Empty <p> should be removed");
464		assert!(!html.contains("<span>"), "Empty <span> should be removed");
465		assert!(
466			!html.contains("<div>"),
467			"All empty <div> tags should be removed (inner and outer)"
468		);
469		assert!(html.contains("<section>"), "Section should remain");
470		assert!(html.contains("<h1>"), "H1 should remain");
471
472		Ok(())
473	}
474
475	#[test]
476	fn test_slimmer_slim_keep_empty_but_not_removable() -> TestResult<()> {
477		// -- Setup & Fixtures
478		let fx_html = r#"
479		<!DOCTYPE html>
480		<html>
481		<body>
482			<main></main> <!-- Should keep 'main' even if empty -->
483			<table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
484		</body>
485		</html>
486		"#;
487		// let expected_body = r#"<body><main></main><table><tbody><tr><td></td></tr></tbody></table></body>"#;
488		// // Note: tbody is often inserted by parser
489
490		// -- Exec
491		let html = slim(fx_html)?;
492		println!("\n---\nSlimmed HTML (Keep Non-Removable Empty):\n{}\n---\n", html);
493
494		// -- Check
495		// Need a flexible check because the parser might add tbody
496		assert!(html.contains("<main>"), "Should keep empty <main>");
497		assert!(html.contains("<table>"), "Should keep empty <table>");
498		assert!(html.contains("<tr>"), "Should keep empty <tr>");
499		assert!(html.contains("<td>"), "Should keep empty <td>");
500
501		Ok(())
502	}
503}
504
505// endregion: --- Tests