html-helpers 0.1.0

A collection of high-level utilities for cleaning, transforming, and converting HTML content.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
use crate::{Error, Result};
use ego_tree::NodeRef;
use html_escape::encode_double_quoted_attribute;
use scraper::{ElementRef, Html, node::Node};

// region:    --- Constants

// NOTE: These constants are duplicated from slimmer.rs. Consider refactoring if they need to be shared.

/// Tags to remove explicitly, regardless of content (unless within <head>).
const TAGS_TO_REMOVE: &[&str] = &["script", "link", "style", "svg", "base"];

/// Tags that should be removed if they become effectively empty (contain only whitespace/comments)
/// after processing children. Applies only outside the <head> element.
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
	"div", "span", "p", "i", "b", "em", "strong", "section", "article", "header", "footer", "nav", "aside",
];

/// Keywords to check within the 'property' attribute of <meta> tags to determine if they should be kept.
const META_PROPERTY_KEYWORDS: &[&str] = &["title", "url", "image", "description"];

/// Attribute names allowed on <meta> tags within the <head>.
const ALLOWED_META_ATTRS: &[&str] = &["property", "content"];

/// Attribute names allowed on elements outside the <head>.
const ALLOWED_BODY_ATTRS: &[&str] = &["class", "aria-label", "href", "title", "id"];

// endregion: --- Constants

/// Decodes HTML entities (e.g., `&lt;` becomes `<`).
/// Re-exporting from the original slimmer or using html-escape directly.
pub fn decode_html_entities(content: &str) -> String {
	html_escape::decode_html_entities(content).to_string()
}

/// Strips non-content elements from the provided HTML content using the `scraper` crate,
/// preserving essential head tags, and returns the cleaned HTML as a string.
///
/// This function aims to replicate the behavior of `slimmer::slim` using `scraper`.
/// It removes:
/// - Non-visible tags like `<script>`, `<link>`, `<style>`, `<svg>`, `<base>`.
/// - HTML comments.
/// - Empty or whitespace-only text nodes.
/// - Specific tags (like `<div>`, `<span>`, `<p>`, etc.) if they become effectively empty *after* processing children.
/// - Attributes except for specific allowlists (`class`, `aria-label`, `href` outside head; `property`, `content` for relevant meta tags in head).
///
/// It preserves:
/// - `<title>` tag within `<head>`.
/// - `<meta>` tags within `<head>` if their `property` attribute matches keywords in `META_PROPERTY_KEYWORDS`.
/// - Essential body content.
///
/// # Arguments
///
/// * `html_content` - A string slice containing the HTML content to be processed.
///
/// # Returns
///
/// A `Result<String>` which is:
/// - `Ok(String)` containing the cleaned HTML content.
/// - `Err` if any errors occur during processing.
pub fn slim(html_content: &str) -> Result<String> {
	let html = Html::parse_document(html_content);
	let mut output = String::new();

	// Process the root node (which should be the Document node)
	process_node_recursive(html.tree.root(), false, &mut output)?;

	// Final cleanup of empty lines
	let content = remove_empty_lines(output)?;

	Ok(content)
}

/// Removes empty lines from the given content, returning the cleaned string.
fn remove_empty_lines(content: String) -> Result<String> {
	let lines: Vec<&str> = content.lines().filter(|line| !line.trim().is_empty()).collect();
	Ok(lines.join("\n"))
}

/// Checks if a string contains only whitespace characters.
fn is_string_effectively_empty(s: &str) -> bool {
	s.trim().is_empty()
}

/// Recursively processes a node using `scraper`, writing allowed content to the output string.
fn process_node_recursive(node: NodeRef<Node>, is_in_head_context: bool, output: &mut String) -> Result<()> {
	match node.value() {
		Node::Document => {
			// Process children of the document (Doctype, root element <html>)
			for child in node.children() {
				process_node_recursive(child, false, output)?; // Start children with is_in_head_context = false
			}
		}

		Node::Doctype(doctype) => {
			// Serialize Doctype manually
			output.push_str("<!DOCTYPE ");
			output.push_str(&doctype.name);
			let has_public = !doctype.public_id.is_empty();
			let has_system = !doctype.system_id.is_empty();

			if has_public {
				output.push_str(" PUBLIC \"");
				output.push_str(&doctype.public_id);
				output.push('"');
			}

			if has_system {
				if !has_public {
					// Add SYSTEM keyword only if no PUBLIC id
					output.push_str(" SYSTEM");
				}
				output.push(' '); // Always add space before system id string if it exists
				output.push('"');
				output.push_str(&doctype.system_id);
				output.push('"');
			}
			output.push('>');
			// Consider adding a newline if needed for formatting, but remove_empty_lines might handle it.
			// output.push('\n');
		}

		Node::Comment(_) => { /* Skip comments */ }

		Node::Text(text) => {
			let text_content = text.trim();
			if !text_content.is_empty() {
				// Use the raw text provided by scraper, assuming it's decoded.
				// Re-escaping is generally not needed for text nodes here.
				output.push_str(text);
			}
		}

		Node::Element(element) => {
			let tag_name = element.name();
			let current_node_is_head = tag_name == "head";
			// Determine context for children: true if current node is <head> or if parent was already in <head>
			let child_context_is_in_head = is_in_head_context || current_node_is_head;

			let el_ref = ElementRef::wrap(node).ok_or_else(|| Error::custom("Failed to wrap node as ElementRef"))?;

			// --- 1. Decide if this element should be skipped entirely (before processing children) ---

			// Skip tags explicitly marked for removal (outside head context)
			// Note: script/style/link/base removal handled separately for clarity.
			if !child_context_is_in_head && TAGS_TO_REMOVE.contains(&tag_name) {
				return Ok(());
			}
			// Skip specific non-content tags always
			if matches!(tag_name, "script" | "style" | "link" | "base" | "svg") {
				return Ok(());
			}

			// Skip elements within <head> context unless they are <title> or allowed <meta>
			if is_in_head_context {
				if tag_name == "title" {
					// Keep title
				} else if tag_name == "meta" {
					if !should_keep_meta(el_ref) {
						return Ok(()); // Remove disallowed meta tag
					}
					// Keep allowed meta
				} else {
					return Ok(()); // Remove other tags inside head context
				}
			}

			// --- 2. Process Children Recursively into a temporary buffer ---
			let mut children_output = String::new();
			for child in node.children() {
				process_node_recursive(child, child_context_is_in_head, &mut children_output)?;
			}

			// --- 3. Decide whether to keep the current node based on its content *after* processing children ---
			let is_empty_after_processing = is_string_effectively_empty(&children_output);

			// Check if it's a tag eligible for removal when empty (outside head)
			let is_removable_tag_when_empty = !child_context_is_in_head && REMOVABLE_EMPTY_TAGS.contains(&tag_name);

			// Check if it's the <head> tag itself and it's now empty
			let is_empty_head_tag = current_node_is_head && is_empty_after_processing;

			let should_remove_node = (is_removable_tag_when_empty && is_empty_after_processing) || is_empty_head_tag;

			// --- 4. Construct Output if Node is Kept ---
			if !should_remove_node {
				// Build start tag
				output.push('<');
				output.push_str(tag_name);
				filter_and_write_attributes(el_ref, child_context_is_in_head, output)?;
				output.push('>');

				// Append children's content
				output.push_str(&children_output);

				// Build end tag
				output.push_str("</");
				output.push_str(tag_name);
				output.push('>');
			}
		}

		Node::Fragment => {
			// Should not happen with parse_document, but handle defensively
			for child in node.children() {
				process_node_recursive(child, false, output)?;
			}
		}

		Node::ProcessingInstruction(_) => { /* Skip PIs */ }
	}
	Ok(())
}

// is_effectively_empty (on ElementRef) is no longer needed as we check the string output.

/// Checks if a `<meta>` tag element should be kept based on its `property` attribute.
fn should_keep_meta(element: ElementRef) -> bool {
	// Check if the element is actually a <meta> tag
	if element.value().name() != "meta" {
		return false;
	}

	if let Some(prop_value) = element.value().attr("property") {
		let value_lower = prop_value.to_lowercase();
		// Check if the property value contains any of the relevant keywords
		META_PROPERTY_KEYWORDS.iter().any(|&keyword| value_lower.contains(keyword))
	} else {
		// No 'property' attribute found
		false
	}
}

/// Filters attributes of an element and writes the allowed ones to the output string.
fn filter_and_write_attributes(element: ElementRef, is_in_head_context: bool, output: &mut String) -> Result<()> {
	let tag_name = element.value().name();

	// Determine the correct list of allowed attributes based on context
	let allowed_attrs: &[&str] = if is_in_head_context {
		match tag_name {
			"meta" => ALLOWED_META_ATTRS,
			"title" => &[], // No attributes allowed on title
			_ => &[],       // Default deny for other unexpected tags in head
		}
	} else {
		// Outside head context
		ALLOWED_BODY_ATTRS
	};

	// Iterate over attributes and append allowed ones
	for (name, value) in element.value().attrs() {
		// Check against the determined allowlist
		if allowed_attrs.contains(&name) {
			output.push(' ');
			output.push_str(name);
			output.push_str("=\"");
			// Encode attribute value correctly
			output.push_str(&encode_double_quoted_attribute(value));
			output.push('"');
		}
	}

	Ok(())
}

// region:    --- Tests

#[cfg(test)]
mod tests {
	use super::*;
	// Result type alias for tests
	type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	// Copied and adapted tests from slimmer.rs
	// Renamed slim -> slim2 and test_slimmer_... -> test_slimmer2_...

	#[test]
	fn test_slimmer2_slim_basic() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
	<meta property="og:title" content="Test Title">
	<meta property="og:url" content="http://example.com">
	<meta property="og:image" content="http://example.com/img.png">
	<meta property="og:description" content="Test Description">
	<meta name="keywords" content="test, html"> <!-- Should be removed -->
    <title>Simple HTML Page</title>
	<style> body{ color: red } </style>
	<link rel="stylesheet" href="style.css">
	<script> console.log("hi"); </script>
	<base href="/"> <!-- Should be removed -->
</head>
<body class="main-body" aria-label="Page body">
	<svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
	<div>
		<span></span> <!-- Should be removed (effectively empty after processing) -->
		<p> <!-- Effectively empty after processing --> </p>
		<b>  </b> <!-- Effectively empty after processing -->
		<i><!-- comment --></i> <!-- Effectively empty after processing -->
	</div> <!-- Should be removed (effectively empty after children removed) -->
	<section>Content Inside</section> <!-- Should be kept -->
	<article>  </article> <!-- Should be removed (empty after processing) -->
    <h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
    <p>This is a simple HTML page.</p>
	<a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
	<!-- Some Comment -->
</body>
</html>
		"#;

		// Expected output should now match slimmer.rs more closely regarding empty element removal.
		// let expected_head_content = r#"<head><meta content="Test Title" property="og:title"><meta content="http://example.com" property="og:url"><meta content="http://example.com/img.png" property="og:image"><meta content="Test Description" property="og:description"><title>Simple HTML Page</title></head>"#;
		let expected_body_content = r#"<body aria-label="Page body" class="main-body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a class="link-style" href="https://example.org">Link</a></body>"#;
		// Note attribute order might differ slightly between scraper/html5ever & string building, but content should match.

		// -- Exec
		let html = slim(fx_html)?;
		// println!(
		// 	"\n---\nSlimmed HTML (Scraper - Basic + Post-Empty Removal):\n{}\n---\n",
		// 	html
		// );

		// -- Check Head Content (More precise check possible now)
		// Need flexible attribute order check for head
		assert!(html.contains("<head>"));
		assert!(html.contains("</head>"));
		assert!(html.contains(r#"<meta content="Test Title" property="og:title">"#));
		assert!(html.contains(r#"<meta content="http://example.com" property="og:url">"#));
		assert!(html.contains(r#"<meta content="http://example.com/img.png" property="og:image">"#));
		assert!(html.contains(r#"<meta content="Test Description" property="og:description">"#));
		assert!(html.contains(r#"<title>Simple HTML Page</title>"#));

		assert!(
			!html.contains("<meta charset") && !html.contains("<meta name"),
			"Should remove disallowed meta tags"
		);
		assert!(
			!html.contains("<style") && !html.contains("<link") && !html.contains("<script") && !html.contains("<base"),
			"Should remove style, link, script, base"
		);

		// -- Check Body Content (More precise check)
		// Allow for attribute order variations in body tag
		assert!(
			html.contains("<body")
				&& html.contains(r#"class="main-body""#)
				&& html.contains(r#"aria-label="Page body""#)
				&& html.contains(">")
		);
		assert!(html.contains(r#"</body>"#));
		assert!(html.contains(expected_body_content)); // Check the exact sequence for the rest

		// Check removals (should now match slimmer.rs)
		assert!(!html.contains("<svg>"), "Should remove svg");
		assert!(!html.contains("<span>"), "Should remove empty span");
		assert!(!html.contains("<p> </p>"), "Should remove empty p tag");
		assert!(!html.contains("<b>"), "Should remove empty b");
		assert!(!html.contains("<i>"), "Should remove empty i");
		assert!(!html.contains("<div>"), "Should remove outer empty div");
		assert!(!html.contains("<article>"), "Should remove empty article");
		assert!(!html.contains("funky-attribute"), "Should remove funky-attribute");
		assert!(!html.contains("extra=\"gone\""), "Should remove extra anchor attribute");
		assert!(!html.contains("<!--"), "Should remove comments");

		Ok(())
	}

	#[test]
	fn test_slimmer2_slim_empty_head_removed() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<head>
			<meta charset="utf-8">
			<link rel="icon" href="favicon.ico">
		</head>
		<body>
			<p>Content</p>
		</body>
		</html>
		"#;

		// -- Exec
		let html = slim(fx_html)?;
		// println!("\n---\nSlimmed HTML (Scraper - Empty Head Removed):\n{}\n---\n", html);

		// -- Check
		// The <head> tag itself should now be removed as it becomes empty after processing children.
		assert!(
			!html.contains("<head>"),
			"Empty <head> tag should be removed after processing. Got: {}",
			html
		);
		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

		Ok(())
	}

	#[test]
	fn test_slimmer2_slim_keeps_head_if_title_present() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<head>
			<title>Only Title</title>
			<script></script>
		</head>
		<body>
			<p>Content</p>
		</body>
		</html>
		"#;

		// -- Exec
		let html = slim(fx_html)?;
		// println!("\n---\nSlimmed HTML (Scraper - Head with Title Kept):\n{}\n---\n", html);

		// -- Check
		// Head should remain as title is kept.
		assert!(
			html.contains("<head><title>Only Title</title></head>"),
			"<head> with only title should remain"
		);
		assert!(!html.contains("<script>"), "Script should be removed");
		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

		Ok(())
	}

	#[test]
	fn test_slimmer2_slim_nested_empty_removal() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<body>
			<div> <!-- Will become empty after children removed -->
				<p>  </p> <!-- empty p -->
				<div> <!-- Inner div, will become empty -->
					<span><!-- comment --></span> <!-- empty span -->
				</div>
			</div>
			<section>
				<h1>Title</h1> <!-- Keep H1 -->
				<div> </div> <!-- Remove empty div -->
			</section>
		</body>
		</html>
		"#;
		// Expected: Outer div removed, inner div removed, p removed, span removed. Section and H1 remain.
		// This behaviour should now match html5ever version.
		let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;

		// -- Exec
		let html = slim(fx_html)?;
		// println!("\n---\nSlimmed HTML (Scraper - Nested Empty Removed):\n{}\n---\n", html);

		// -- Check
		assert!(
			html.contains(expected_body),
			"Should remove nested empty elements correctly after processing. Expected: '{}', Got: '{}'",
			expected_body,
			html
		);
		assert!(!html.contains("<p>"), "Empty <p> should be removed");
		assert!(!html.contains("<span>"), "Empty <span> should be removed");
		assert!(
			!html.contains("<div>"),
			"All empty <div> tags should be removed (inner and outer)"
		);
		assert!(html.contains("<section>"), "Section should remain");
		assert!(html.contains("<h1>"), "H1 should remain");

		Ok(())
	}

	#[test]
	fn test_slimmer2_slim_keep_empty_but_not_removable() -> TestResult<()> {
		// -- Setup & Fixtures
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<body>
			<main></main> <!-- Should keep 'main' even if empty -->
			<table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
		</body>
		</html>
		"#;
		let expected_body_fragment1 = "<main></main>";
		// Note: scraper often adds <tbody> implicitly, but the empty tags should still be present.
		// let expected_body_fragment_table = "<table><tbody><tr><td></td></tr></tbody></table>"; // Assuming tbody insertion

		// -- Exec
		let html = slim(fx_html)?;
		// println!(
		// 	"\n---\nSlimmed HTML (Scraper - Keep Non-Removable Empty):\n{}\n---\n",
		// 	html
		// );

		// -- Check
		assert!(html.contains(expected_body_fragment1), "Should keep empty <main>");
		// Be flexible with tbody insertion
		assert!(
			html.contains("<table>") && html.contains("<tr>") && html.contains("<td>") && html.contains("</table>"),
			"Should keep empty table structure. Got: {}",
			html
		);
		// If tbody is reliably inserted by the parser version used:
		// assert!(html.contains(expected_body_fragment_table), "Should keep empty table structure with tbody. Got: {}", html);

		Ok(())
	}
}

// endregion: --- Tests