html_helpers/
slimmer.rs

1use crate::{Error, Result};
2use ego_tree::NodeRef;
3use html_escape::encode_double_quoted_attribute;
4use scraper::{ElementRef, Html, node::Node};
5
// region:    --- Constants

// NOTE: These constants are duplicated from slimmer.rs. Consider refactoring if they need to be shared.

/// Tags that are dropped together with their whole subtree, regardless of content.
/// The same tag names are also dropped when they appear inside `<head>`.
const TAGS_TO_REMOVE: &[&str] = &["script", "link", "style", "svg", "base"];

/// Tags that are removed when they end up effectively empty (only whitespace/comments)
/// once their children have been processed. Applies only outside the `<head>` element.
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
	"div", "span", "p", "i", "b", "em", "strong", "section", "article", "header", "footer", "nav", "aside",
];

/// Keywords looked for (as substrings) in the `property` attribute of a `<meta>` tag;
/// a `<meta>` tag is kept only when at least one of them is present.
const META_PROPERTY_KEYWORDS: &[&str] = &["title", "url", "image", "description"];

/// Attribute names allowed on `<meta>` tags within the `<head>`.
const ALLOWED_META_ATTRS: &[&str] = &["property", "content"];

/// Attribute names allowed on elements outside the `<head>`.
const ALLOWED_BODY_ATTRS: &[&str] = &["class", "aria-label", "href", "title", "id"];

// endregion: --- Constants
29
30/// Decodes HTML entities (e.g., `&lt;` becomes `<`).
31/// Re-exporting from the original slimmer or using html-escape directly.
32pub fn decode_html_entities(content: &str) -> String {
33	html_escape::decode_html_entities(content).to_string()
34}
35
36/// Strips non-content elements from the provided HTML content using the `scraper` crate,
37/// preserving essential head tags, and returns the cleaned HTML as a string.
38///
39/// This function aims to replicate the behavior of `slimmer::slim` using `scraper`.
40/// It removes:
41/// - Non-visible tags like `<script>`, `<link>`, `<style>`, `<svg>`, `<base>`.
42/// - HTML comments.
43/// - Empty or whitespace-only text nodes.
44/// - Specific tags (like `<div>`, `<span>`, `<p>`, etc.) if they become effectively empty *after* processing children.
45/// - Attributes except for specific allowlists (`class`, `aria-label`, `href` outside head; `property`, `content` for relevant meta tags in head).
46///
47/// It preserves:
48/// - `<title>` tag within `<head>`.
49/// - `<meta>` tags within `<head>` if their `property` attribute matches keywords in `META_PROPERTY_KEYWORDS`.
50/// - Essential body content.
51///
52/// # Arguments
53///
54/// * `html_content` - A string slice containing the HTML content to be processed.
55///
56/// # Returns
57///
58/// A `Result<String>` which is:
59/// - `Ok(String)` containing the cleaned HTML content.
60/// - `Err` if any errors occur during processing.
61pub fn slim(html_content: &str) -> Result<String> {
62	let html = Html::parse_document(html_content);
63	let mut output = String::new();
64
65	// Process the root node (which should be the Document node)
66	process_node_recursive(html.tree.root(), false, &mut output)?;
67
68	// Final cleanup of empty lines
69	let content = remove_empty_lines(output)?;
70
71	Ok(content)
72}
73
74/// Removes empty lines from the given content, returning the cleaned string.
75fn remove_empty_lines(content: String) -> Result<String> {
76	let lines: Vec<&str> = content.lines().filter(|line| !line.trim().is_empty()).collect();
77	Ok(lines.join("\n"))
78}
79
/// Returns `true` when `s` is empty or consists solely of whitespace characters.
fn is_string_effectively_empty(s: &str) -> bool {
	// Stops at the first non-whitespace character.
	s.chars().all(char::is_whitespace)
}
84
85/// Recursively processes a node using `scraper`, writing allowed content to the output string.
86fn process_node_recursive(node: NodeRef<Node>, is_in_head_context: bool, output: &mut String) -> Result<()> {
87	match node.value() {
88		Node::Document => {
89			// Process children of the document (Doctype, root element <html>)
90			for child in node.children() {
91				process_node_recursive(child, false, output)?; // Start children with is_in_head_context = false
92			}
93		}
94
95		Node::Doctype(doctype) => {
96			// Serialize Doctype manually
97			output.push_str("<!DOCTYPE ");
98			output.push_str(&doctype.name);
99			let has_public = !doctype.public_id.is_empty();
100			let has_system = !doctype.system_id.is_empty();
101
102			if has_public {
103				output.push_str(" PUBLIC \"");
104				output.push_str(&doctype.public_id);
105				output.push('"');
106			}
107
108			if has_system {
109				if !has_public {
110					// Add SYSTEM keyword only if no PUBLIC id
111					output.push_str(" SYSTEM");
112				}
113				output.push(' '); // Always add space before system id string if it exists
114				output.push('"');
115				output.push_str(&doctype.system_id);
116				output.push('"');
117			}
118			output.push('>');
119			// Consider adding a newline if needed for formatting, but remove_empty_lines might handle it.
120			// output.push('\n');
121		}
122
123		Node::Comment(_) => { /* Skip comments */ }
124
125		Node::Text(text) => {
126			let text_content = text.trim();
127			if !text_content.is_empty() {
128				// Use the raw text provided by scraper, assuming it's decoded.
129				// Re-escaping is generally not needed for text nodes here.
130				output.push_str(text);
131			}
132		}
133
134		Node::Element(element) => {
135			let tag_name = element.name();
136			let current_node_is_head = tag_name == "head";
137			// Determine context for children: true if current node is <head> or if parent was already in <head>
138			let child_context_is_in_head = is_in_head_context || current_node_is_head;
139
140			let el_ref = ElementRef::wrap(node).ok_or_else(|| Error::custom("Failed to wrap node as ElementRef"))?;
141
142			// --- 1. Decide if this element should be skipped entirely (before processing children) ---
143
144			// Skip tags explicitly marked for removal (outside head context)
145			// Note: script/style/link/base removal handled separately for clarity.
146			if !child_context_is_in_head && TAGS_TO_REMOVE.contains(&tag_name) {
147				return Ok(());
148			}
149			// Skip specific non-content tags always
150			if matches!(tag_name, "script" | "style" | "link" | "base" | "svg") {
151				return Ok(());
152			}
153
154			// Skip elements within <head> context unless they are <title> or allowed <meta>
155			if is_in_head_context {
156				if tag_name == "title" {
157					// Keep title
158				} else if tag_name == "meta" {
159					if !should_keep_meta(el_ref) {
160						return Ok(()); // Remove disallowed meta tag
161					}
162					// Keep allowed meta
163				} else {
164					return Ok(()); // Remove other tags inside head context
165				}
166			}
167
168			// --- 2. Process Children Recursively into a temporary buffer ---
169			let mut children_output = String::new();
170			for child in node.children() {
171				process_node_recursive(child, child_context_is_in_head, &mut children_output)?;
172			}
173
174			// --- 3. Decide whether to keep the current node based on its content *after* processing children ---
175			let is_empty_after_processing = is_string_effectively_empty(&children_output);
176
177			// Check if it's a tag eligible for removal when empty (outside head)
178			let is_removable_tag_when_empty = !child_context_is_in_head && REMOVABLE_EMPTY_TAGS.contains(&tag_name);
179
180			// Check if it's the <head> tag itself and it's now empty
181			let is_empty_head_tag = current_node_is_head && is_empty_after_processing;
182
183			let should_remove_node = (is_removable_tag_when_empty && is_empty_after_processing) || is_empty_head_tag;
184
185			// --- 4. Construct Output if Node is Kept ---
186			if !should_remove_node {
187				// Build start tag
188				output.push('<');
189				output.push_str(tag_name);
190				filter_and_write_attributes(el_ref, child_context_is_in_head, output)?;
191				output.push('>');
192
193				// Append children's content
194				output.push_str(&children_output);
195
196				// Build end tag
197				output.push_str("</");
198				output.push_str(tag_name);
199				output.push('>');
200			}
201		}
202
203		Node::Fragment => {
204			// Should not happen with parse_document, but handle defensively
205			for child in node.children() {
206				process_node_recursive(child, false, output)?;
207			}
208		}
209
210		Node::ProcessingInstruction(_) => { /* Skip PIs */ }
211	}
212	Ok(())
213}
214
215// is_effectively_empty (on ElementRef) is no longer needed as we check the string output.
216
217/// Checks if a `<meta>` tag element should be kept based on its `property` attribute.
218fn should_keep_meta(element: ElementRef) -> bool {
219	// Check if the element is actually a <meta> tag
220	if element.value().name() != "meta" {
221		return false;
222	}
223
224	if let Some(prop_value) = element.value().attr("property") {
225		let value_lower = prop_value.to_lowercase();
226		// Check if the property value contains any of the relevant keywords
227		META_PROPERTY_KEYWORDS.iter().any(|&keyword| value_lower.contains(keyword))
228	} else {
229		// No 'property' attribute found
230		false
231	}
232}
233
234/// Filters attributes of an element and writes the allowed ones to the output string.
235fn filter_and_write_attributes(element: ElementRef, is_in_head_context: bool, output: &mut String) -> Result<()> {
236	let tag_name = element.value().name();
237
238	// Determine the correct list of allowed attributes based on context
239	let allowed_attrs: &[&str] = if is_in_head_context {
240		match tag_name {
241			"meta" => ALLOWED_META_ATTRS,
242			"title" => &[], // No attributes allowed on title
243			_ => &[],       // Default deny for other unexpected tags in head
244		}
245	} else {
246		// Outside head context
247		ALLOWED_BODY_ATTRS
248	};
249
250	// Iterate over attributes and append allowed ones
251	for (name, value) in element.value().attrs() {
252		// Check against the determined allowlist
253		if allowed_attrs.contains(&name) {
254			output.push(' ');
255			output.push_str(name);
256			output.push_str("=\"");
257			// Encode attribute value correctly
258			output.push_str(&encode_double_quoted_attribute(value));
259			output.push('"');
260		}
261	}
262
263	Ok(())
264}
265
// region:    --- Tests

#[cfg(test)]
mod tests {
	use super::*;
	// Result type alias for tests
	type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	// Copied and adapted tests from slimmer.rs.
	// The function under test is `slim`; only the test names carry the `test_slimmer2_` prefix.

	/// End-to-end check: head filtering, tag/attribute removal and empty-element pruning.
	#[test]
	fn test_slimmer2_slim_basic() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
	<meta property="og:title" content="Test Title">
	<meta property="og:url" content="http://example.com">
	<meta property="og:image" content="http://example.com/img.png">
	<meta property="og:description" content="Test Description">
	<meta name="keywords" content="test, html"> <!-- Should be removed -->
    <title>Simple HTML Page</title>
	<style> body{ color: red } </style>
	<link rel="stylesheet" href="style.css">
	<script> console.log("hi"); </script>
	<base href="/"> <!-- Should be removed -->
</head>
<body class="main-body" aria-label="Page body">
	<svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
	<div>
		<span></span> <!-- Should be removed (effectively empty after processing) -->
		<p> <!-- Effectively empty after processing --> </p>
		<b>  </b> <!-- Effectively empty after processing -->
		<i><!-- comment --></i> <!-- Effectively empty after processing -->
	</div> <!-- Should be removed (effectively empty after children removed) -->
	<section>Content Inside</section> <!-- Should be kept -->
	<article>  </article> <!-- Should be removed (empty after processing) -->
    <h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
    <p>This is a simple HTML page.</p>
	<a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
	<!-- Some Comment -->
</body>
</html>
		"#;

		// NOTE: The attribute order in the expected strings is the order the parser yields
		//       the attributes in, which is not necessarily the order used in the fixture.
		let expected_body_content = r#"<body aria-label="Page body" class="main-body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a class="link-style" href="https://example.org">Link</a></body>"#;

		// -- Exec
		let html = slim(fx_html)?;

		// -- Check Head Content
		assert!(html.contains("<head>"));
		assert!(html.contains("</head>"));
		assert!(html.contains(r#"<meta content="Test Title" property="og:title">"#));
		assert!(html.contains(r#"<meta content="http://example.com" property="og:url">"#));
		assert!(html.contains(r#"<meta content="http://example.com/img.png" property="og:image">"#));
		assert!(html.contains(r#"<meta content="Test Description" property="og:description">"#));
		assert!(html.contains(r#"<title>Simple HTML Page</title>"#));

		assert!(
			!html.contains("<meta charset") && !html.contains("<meta name"),
			"Should remove disallowed meta tags"
		);
		assert!(
			!html.contains("<style") && !html.contains("<link") && !html.contains("<script") && !html.contains("<base"),
			"Should remove style, link, script, base"
		);

		// -- Check Body Content
		assert!(
			html.contains("<body") && html.contains(r#"class="main-body""#) && html.contains(r#"aria-label="Page body""#)
		);
		assert!(html.contains(r#"</body>"#));
		assert!(html.contains(expected_body_content)); // Check the exact sequence for the rest

		// -- Check Removals
		assert!(!html.contains("<svg>"), "Should remove svg");
		assert!(!html.contains("<span>"), "Should remove empty span");
		assert!(!html.contains("<p> </p>"), "Should remove empty p tag");
		assert!(!html.contains("<b>"), "Should remove empty b");
		assert!(!html.contains("<i>"), "Should remove empty i");
		assert!(!html.contains("<div>"), "Should remove outer empty div");
		assert!(!html.contains("<article>"), "Should remove empty article");
		assert!(!html.contains("funky-attribute"), "Should remove funky-attribute");
		assert!(!html.contains("extra=\"gone\""), "Should remove extra anchor attribute");
		assert!(!html.contains("<!--"), "Should remove comments");

		Ok(())
	}

	/// A `<head>` whose children are all removed must disappear as well.
	#[test]
	fn test_slimmer2_slim_empty_head_removed() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<head>
			<meta charset="utf-8">
			<link rel="icon" href="favicon.ico">
		</head>
		<body>
			<p>Content</p>
		</body>
		</html>
		"#;

		// -- Exec
		let html = slim(fx_html)?;

		// -- Check
		// The <head> tag itself should be removed as it becomes empty after processing children.
		assert!(
			!html.contains("<head>"),
			"Empty <head> tag should be removed after processing. Got: {}",
			html
		);
		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

		Ok(())
	}

	/// A `<head>` that still contains a `<title>` is kept.
	#[test]
	fn test_slimmer2_slim_keeps_head_if_title_present() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<head>
			<title>Only Title</title>
			<script></script>
		</head>
		<body>
			<p>Content</p>
		</body>
		</html>
		"#;

		// -- Exec
		let html = slim(fx_html)?;

		// -- Check
		// Head should remain as title is kept.
		assert!(
			html.contains("<head><title>Only Title</title></head>"),
			"<head> with only title should remain"
		);
		assert!(!html.contains("<script>"), "Script should be removed");
		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

		Ok(())
	}

	/// Emptiness is evaluated after children are processed, so nested empty wrappers vanish.
	#[test]
	fn test_slimmer2_slim_nested_empty_removal() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<body>
			<div> <!-- Will become empty after children removed -->
				<p>  </p> <!-- empty p -->
				<div> <!-- Inner div, will become empty -->
					<span><!-- comment --></span> <!-- empty span -->
				</div>
			</div>
			<section>
				<h1>Title</h1> <!-- Keep H1 -->
				<div> </div> <!-- Remove empty div -->
			</section>
		</body>
		</html>
		"#;
		// Expected: Outer div removed, inner div removed, p removed, span removed. Section and H1 remain.
		let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;

		// -- Exec
		let html = slim(fx_html)?;

		// -- Check
		assert!(
			html.contains(expected_body),
			"Should remove nested empty elements correctly after processing. Expected: '{}', Got: '{}'",
			expected_body,
			html
		);
		assert!(!html.contains("<p>"), "Empty <p> should be removed");
		assert!(!html.contains("<span>"), "Empty <span> should be removed");
		assert!(
			!html.contains("<div>"),
			"All empty <div> tags should be removed (inner and outer)"
		);
		assert!(html.contains("<section>"), "Section should remain");
		assert!(html.contains("<h1>"), "H1 should remain");

		Ok(())
	}

	/// Empty elements that are not in the removable list must survive.
	#[test]
	fn test_slimmer2_slim_keep_empty_but_not_removable() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<body>
			<main></main> <!-- Should keep 'main' even if empty -->
			<table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
		</body>
		</html>
		"#;
		let expected_body_fragment1 = "<main></main>";

		// -- Exec
		let html = slim(fx_html)?;

		// -- Check
		assert!(html.contains(expected_body_fragment1), "Should keep empty <main>");
		// The parser may insert an implicit <tbody>, so only the individual tags are checked.
		assert!(
			html.contains("<table>") && html.contains("<tr>") && html.contains("<td>") && html.contains("</table>"),
			"Should keep empty table structure. Got: {}",
			html
		);

		Ok(())
	}
}

// endregion: --- Tests
html_helpers/slimmer.rs

html_helpers/
slimmer.rs