1use crate::{Error, Result};
2use ego_tree::NodeRef;
3use html_escape::encode_double_quoted_attribute;
4use scraper::{ElementRef, Html, node::Node};
5
/// Element names that `process_node_recursive` drops entirely, together with their subtree.
const TAGS_TO_REMOVE: &[&str] = &["script", "link", "style", "svg", "base"];

/// Element names that are dropped when, outside of `<head>`, their processed content is
/// empty or whitespace-only.
const REMOVABLE_EMPTY_TAGS: &[&str] = &[
	"div", "span", "p", "i", "b", "em", "strong", "section", "article", "header", "footer", "nav", "aside",
];

/// Substrings looked for (after lowercasing) in the `property` attribute of a `<meta>` tag;
/// a `<meta>` inside `<head>` is kept only when at least one of them is found.
const META_PROPERTY_KEYWORDS: &[&str] = &["title", "url", "image", "description"];

/// Attributes written back for a `<meta>` element located inside `<head>`.
const ALLOWED_META_ATTRS: &[&str] = &["property", "content"];

/// Attributes written back for any element located outside of `<head>`.
const ALLOWED_BODY_ATTRS: &[&str] = &["class", "aria-label", "href", "title", "id"];
27
28pub fn decode_html_entities(content: &str) -> String {
33	html_escape::decode_html_entities(content).to_string()
34}
35
36pub fn slim(html_content: &str) -> Result<String> {
62	let html = Html::parse_document(html_content);
63	let mut output = String::new();
64
65	process_node_recursive(html.tree.root(), false, &mut output)?;
67
68	let content = remove_empty_lines(output)?;
70
71	Ok(content)
72}
73
74fn remove_empty_lines(content: String) -> Result<String> {
76	let lines: Vec<&str> = content.lines().filter(|line| !line.trim().is_empty()).collect();
77	Ok(lines.join("\n"))
78}
79
/// Returns `true` when `s` is empty or made up exclusively of whitespace characters.
fn is_string_effectively_empty(s: &str) -> bool {
	s.chars().all(char::is_whitespace)
}
84
85fn process_node_recursive(node: NodeRef<Node>, is_in_head_context: bool, output: &mut String) -> Result<()> {
87	match node.value() {
88		Node::Document => {
89			for child in node.children() {
91				process_node_recursive(child, false, output)?; }
93		}
94
95		Node::Doctype(doctype) => {
96			output.push_str("<!DOCTYPE ");
98			output.push_str(&doctype.name);
99			let has_public = !doctype.public_id.is_empty();
100			let has_system = !doctype.system_id.is_empty();
101
102			if has_public {
103				output.push_str(" PUBLIC \"");
104				output.push_str(&doctype.public_id);
105				output.push('"');
106			}
107
108			if has_system {
109				if !has_public {
110					output.push_str(" SYSTEM");
112				}
113				output.push(' '); output.push('"');
115				output.push_str(&doctype.system_id);
116				output.push('"');
117			}
118			output.push('>');
119			}
122
123		Node::Comment(_) => { }
124
125		Node::Text(text) => {
126			let text_content = text.trim();
127			if !text_content.is_empty() {
128				output.push_str(text);
131			}
132		}
133
134		Node::Element(element) => {
135			let tag_name = element.name();
136			let current_node_is_head = tag_name == "head";
137			let child_context_is_in_head = is_in_head_context || current_node_is_head;
139
140			let el_ref = ElementRef::wrap(node).ok_or_else(|| Error::custom("Failed to wrap node as ElementRef"))?;
141
142			if !child_context_is_in_head && TAGS_TO_REMOVE.contains(&tag_name) {
147				return Ok(());
148			}
149			if matches!(tag_name, "script" | "style" | "link" | "base" | "svg") {
151				return Ok(());
152			}
153
154			if is_in_head_context {
156				if tag_name == "title" {
157					} else if tag_name == "meta" {
159					if !should_keep_meta(el_ref) {
160						return Ok(()); }
162					} else {
164					return Ok(()); }
166			}
167
168			let mut children_output = String::new();
170			for child in node.children() {
171				process_node_recursive(child, child_context_is_in_head, &mut children_output)?;
172			}
173
174			let is_empty_after_processing = is_string_effectively_empty(&children_output);
176
177			let is_removable_tag_when_empty = !child_context_is_in_head && REMOVABLE_EMPTY_TAGS.contains(&tag_name);
179
180			let is_empty_head_tag = current_node_is_head && is_empty_after_processing;
182
183			let should_remove_node = (is_removable_tag_when_empty && is_empty_after_processing) || is_empty_head_tag;
184
185			if !should_remove_node {
187				output.push('<');
189				output.push_str(tag_name);
190				filter_and_write_attributes(el_ref, child_context_is_in_head, output)?;
191				output.push('>');
192
193				output.push_str(&children_output);
195
196				output.push_str("</");
198				output.push_str(tag_name);
199				output.push('>');
200			}
201		}
202
203		Node::Fragment => {
204			for child in node.children() {
206				process_node_recursive(child, false, output)?;
207			}
208		}
209
210		Node::ProcessingInstruction(_) => { }
211	}
212	Ok(())
213}
214
215fn should_keep_meta(element: ElementRef) -> bool {
219	if element.value().name() != "meta" {
221		return false;
222	}
223
224	if let Some(prop_value) = element.value().attr("property") {
225		let value_lower = prop_value.to_lowercase();
226		META_PROPERTY_KEYWORDS.iter().any(|&keyword| value_lower.contains(keyword))
228	} else {
229		false
231	}
232}
233
234fn filter_and_write_attributes(element: ElementRef, is_in_head_context: bool, output: &mut String) -> Result<()> {
236	let tag_name = element.value().name();
237
238	let allowed_attrs: &[&str] = if is_in_head_context {
240		match tag_name {
241			"meta" => ALLOWED_META_ATTRS,
242			"title" => &[], _ => &[],       }
245	} else {
246		ALLOWED_BODY_ATTRS
248	};
249
250	for (name, value) in element.value().attrs() {
252		if allowed_attrs.contains(&name) {
254			output.push(' ');
255			output.push_str(name);
256			output.push_str("=\"");
257			output.push_str(&encode_double_quoted_attribute(value));
259			output.push('"');
260		}
261	}
262
263	Ok(())
264}
265
// Unit tests for `slim`. All checks are substring checks on the serialized output, so they
// depend on the exact fixture text below (the HTML comments inside the raw strings are part
// of the input fed to `slim`, not Rust comments).
#[cfg(test)]
mod tests {
	use super::*;
	// Lets the tests use `?` on any error type.
	type TestResult<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	/// End-to-end check on a full document: head filtering (kept `og:*` meta and title,
	/// dropped style/link/script/base and other meta), attribute filtering, removal of
	/// svg, comments and elements that end up empty.
	#[test]
	fn test_slimmer2_slim_basic() -> TestResult<()> {
		let fx_html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
	<meta property="og:title" content="Test Title">
	<meta property="og:url" content="http://example.com">
	<meta property="og:image" content="http://example.com/img.png">
	<meta property="og:description" content="Test Description">
	<meta name="keywords" content="test, html"> <!-- Should be removed -->
    <title>Simple HTML Page</title>
	<style> body{ color: red } </style>
	<link rel="stylesheet" href="style.css">
	<script> console.log("hi"); </script>
	<base href="/"> <!-- Should be removed -->
</head>
<body class="main-body" aria-label="Page body">
	<svg><path d="M0 0 L 10 10"></path></svg> <!-- Should be removed -->
	<div>
		<span></span> <!-- Should be removed (effectively empty after processing) -->
		<p> <!-- Effectively empty after processing --> </p>
		<b>  </b> <!-- Effectively empty after processing -->
		<i><!-- comment --></i> <!-- Effectively empty after processing -->
	</div> <!-- Should be removed (effectively empty after children removed) -->
	<section>Content Inside</section> <!-- Should be kept -->
	<article>  </article> <!-- Should be removed (empty after processing) -->
    <h1 funky-attribute="removeme">Hello, World!</h1> <!-- funky-attribute removed -->
    <p>This is a simple HTML page.</p>
	<a href="https://example.org" class="link-style" extra="gone">Link</a> <!-- href and class kept -->
	<!-- Some Comment -->
</body>
</html>
		"#;

		// Expected serialization of the whole <body>; the attribute order written here is the
		// order the assertions below expect from the serializer.
		let expected_body_content = r#"<body aria-label="Page body" class="main-body"><section>Content Inside</section><h1>Hello, World!</h1><p>This is a simple HTML page.</p><a class="link-style" href="https://example.org">Link</a></body>"#;
		let html = slim(fx_html)?;
		// -- Head: kept, with only the `og:*` meta tags and the title.
		assert!(html.contains("<head>"));
		assert!(html.contains("</head>"));
		assert!(html.contains(r#"<meta content="Test Title" property="og:title">"#));
		assert!(html.contains(r#"<meta content="http://example.com" property="og:url">"#));
		assert!(html.contains(r#"<meta content="http://example.com/img.png" property="og:image">"#));
		assert!(html.contains(r#"<meta content="Test Description" property="og:description">"#));
		assert!(html.contains(r#"<title>Simple HTML Page</title>"#));

		assert!(
			!html.contains("<meta charset") && !html.contains("<meta name"),
			"Should remove disallowed meta tags"
		);
		assert!(
			!html.contains("<style") && !html.contains("<link") && !html.contains("<script") && !html.contains("<base"),
			"Should remove style, link, script, base"
		);

		// -- Body: allowed attributes kept, everything else filtered.
		assert!(
			html.contains("<body")
				&& html.contains(r#"class="main-body""#)
				&& html.contains(r#"aria-label="Page body""#)
				&& html.contains(">")
		);
		assert!(html.contains(r#"</body>"#));
		assert!(html.contains(expected_body_content)); assert!(!html.contains("<svg>"), "Should remove svg");
		assert!(!html.contains("<span>"), "Should remove empty span");
		assert!(!html.contains("<p> </p>"), "Should remove empty p tag");
		assert!(!html.contains("<b>"), "Should remove empty b");
		assert!(!html.contains("<i>"), "Should remove empty i");
		assert!(!html.contains("<div>"), "Should remove outer empty div");
		assert!(!html.contains("<article>"), "Should remove empty article");
		assert!(!html.contains("funky-attribute"), "Should remove funky-attribute");
		assert!(!html.contains("extra=\"gone\""), "Should remove extra anchor attribute");
		assert!(!html.contains("<!--"), "Should remove comments");

		Ok(())
	}

	/// A `<head>` whose children are all filtered out must itself disappear.
	#[test]
	fn test_slimmer2_slim_empty_head_removed() -> TestResult<()> {
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<head>
			<meta charset="utf-8">
			<link rel="icon" href="favicon.ico">
		</head>
		<body>
			<p>Content</p>
		</body>
		</html>
		"#;

		let html = slim(fx_html)?;
		assert!(
			!html.contains("<head>"),
			"Empty <head> tag should be removed after processing. Got: {}",
			html
		);
		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

		Ok(())
	}

	/// A `<head>` that still holds a `<title>` after filtering must be kept.
	#[test]
	fn test_slimmer2_slim_keeps_head_if_title_present() -> TestResult<()> {
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<head>
			<title>Only Title</title>
			<script></script>
		</head>
		<body>
			<p>Content</p>
		</body>
		</html>
		"#;

		let html = slim(fx_html)?;
		assert!(
			html.contains("<head><title>Only Title</title></head>"),
			"<head> with only title should remain"
		);
		assert!(!html.contains("<script>"), "Script should be removed");
		assert!(html.contains("<body><p>Content</p></body>"), "Body should remain");

		Ok(())
	}

	/// Emptiness is evaluated after the children are processed, so removal cascades upward
	/// through nested removable tags.
	#[test]
	fn test_slimmer2_slim_nested_empty_removal() -> TestResult<()> {
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<body>
			<div> <!-- Will become empty after children removed -->
				<p>  </p> <!-- empty p -->
				<div> <!-- Inner div, will become empty -->
					<span><!-- comment --></span> <!-- empty span -->
				</div>
			</div>
			<section>
				<h1>Title</h1> <!-- Keep H1 -->
				<div> </div> <!-- Remove empty div -->
			</section>
		</body>
		</html>
		"#;
		let expected_body = r#"<body><section><h1>Title</h1></section></body>"#;

		let html = slim(fx_html)?;
		assert!(
			html.contains(expected_body),
			"Should remove nested empty elements correctly after processing. Expected: '{}', Got: '{}'",
			expected_body,
			html
		);
		assert!(!html.contains("<p>"), "Empty <p> should be removed");
		assert!(!html.contains("<span>"), "Empty <span> should be removed");
		assert!(
			!html.contains("<div>"),
			"All empty <div> tags should be removed (inner and outer)"
		);
		assert!(html.contains("<section>"), "Section should remain");
		assert!(html.contains("<h1>"), "H1 should remain");

		Ok(())
	}

	/// Empty elements whose tag is not in `REMOVABLE_EMPTY_TAGS` (here `main` and the table
	/// structure) must be kept.
	#[test]
	fn test_slimmer2_slim_keep_empty_but_not_removable() -> TestResult<()> {
		let fx_html = r#"
		<!DOCTYPE html>
		<html>
		<body>
			<main></main> <!-- Should keep 'main' even if empty -->
			<table><tr><td></td></tr></table> <!-- Should keep table structure even if cells empty -->
		</body>
		</html>
		"#;
		let expected_body_fragment1 = "<main></main>";
		let html = slim(fx_html)?;
		assert!(html.contains(expected_body_fragment1), "Should keep empty <main>");
		// Only the presence of the table tags is checked, not the exact table serialization.
		assert!(
			html.contains("<table>") && html.contains("<tr>") && html.contains("<td>") && html.contains("</table>"),
			"Should keep empty table structure. Got: {}",
			html
		);
		Ok(())
	}
}
520
521