Skip to main content

reinhardt_utils/utils_core/
html.rs

1//! HTML utilities for escaping, sanitization, and manipulation
2
3use reinhardt_core::security::xss::strip_tags_safe;
4use std::borrow::Cow;
/// Replace the five HTML-significant characters with their entity forms.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&#x27;` respectively; every other character is copied through unchanged.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::escape;
///
/// assert_eq!(escape("Hello, World!"), "Hello, World!");
/// assert_eq!(escape("<script>alert('XSS')</script>"),
///            "&lt;script&gt;alert(&#x27;XSS&#x27;)&lt;/script&gt;");
/// assert_eq!(escape("5 < 10 & 10 > 5"), "5 &lt; 10 &amp; 10 &gt; 5");
/// ```
pub fn escape(text: &str) -> String {
	// A little headroom so that a handful of entities does not force a regrow.
	let mut escaped = String::with_capacity(text.len() + 10);
	for ch in text.chars() {
		let entity = match ch {
			'&' => "&amp;",
			'<' => "&lt;",
			'>' => "&gt;",
			'"' => "&quot;",
			'\'' => "&#x27;",
			plain => {
				escaped.push(plain);
				continue;
			}
		};
		escaped.push_str(entity);
	}
	escaped
}
/// Unescape HTML entities
///
/// Recognizes the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&apos;`, decimal numeric references (`&#39;`) and hexadecimal numeric
/// references (`&#x27;`, `&#X3C;`). Anything else is written back as
/// `&name;` unchanged.
///
/// An `&` that is not followed by a `;` treats the rest of the input as the
/// entity name: `"&lt"` still decodes to `"<"`, while an unrecognized name is
/// re-emitted with a `;` appended (so `"&"` becomes `"&;"`).
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::unescape;
///
/// assert_eq!(unescape("&lt;div&gt;"), "<div>");
/// assert_eq!(unescape("&amp;"), "&");
/// assert_eq!(unescape("&quot;test&quot;"), "\"test\"");
/// assert_eq!(unescape("&#x27;"), "'");
/// assert_eq!(unescape("&#x3C;"), "<");
/// ```
pub fn unescape(text: &str) -> String {
	let mut result = String::with_capacity(text.len());
	let mut chars = text.chars();

	while let Some(ch) = chars.next() {
		if ch != '&' {
			result.push(ch);
			continue;
		}

		// `take_while` also consumes the terminating `;` (or runs to the end
		// of the input when there is none).
		let entity: String = chars.by_ref().take_while(|&c| c != ';').collect();
		let decoded = match entity.as_str() {
			"amp" => Some('&'),
			"lt" => Some('<'),
			"gt" => Some('>'),
			"quot" => Some('"'),
			"apos" => Some('\''),
			other => other.strip_prefix('#').and_then(decode_numeric_reference),
		};

		match decoded {
			Some(c) => result.push(c),
			None => {
				// Unknown or malformed entity: emit it back as text.
				result.push('&');
				result.push_str(&entity);
				result.push(';');
			}
		}
	}
	result
}

/// Decode the part of a numeric character reference that follows `#`.
///
/// A leading `x`/`X` selects hexadecimal, otherwise the digits are decimal.
/// Returns `None` when the digits do not parse or the value is not a valid
/// Unicode scalar value (for example a surrogate).
fn decode_numeric_reference(body: &str) -> Option<char> {
	let code = match body.strip_prefix('x').or_else(|| body.strip_prefix('X')) {
		Some(hex) => u32::from_str_radix(hex, 16),
		None => body.parse::<u32>(),
	};
	code.ok().and_then(char::from_u32)
}
80/// Strip HTML tags from text
81///
82/// This function uses `strip_tags_safe` from `reinhardt_core::security::xss`
83/// which properly handles malformed HTML including:
84/// - `>` inside quoted attributes (e.g., `<a title="x>y">`)
85/// - Unclosed tags at end of input
86/// - HTML comments (`<!-- ... -->`)
87/// - Self-closing tags
88///
89/// # Examples
90///
91/// ```
92/// use reinhardt_utils::utils_core::html::strip_tags;
93///
94/// assert_eq!(strip_tags("<p>Hello <b>World</b></p>"), "Hello World");
95/// assert_eq!(strip_tags("<a href=\"#\">Link</a>"), "Link");
96/// assert_eq!(strip_tags("No tags here"), "No tags here");
97/// // Fixes #795: Handles > inside quoted attributes
98/// assert_eq!(strip_tags(r#"<a title="x>y">Link</a>"#), "Link");
99/// ```
100pub fn strip_tags(html: &str) -> String {
101	// Fixes #795: Delegate to secure implementation that handles malformed HTML
102	strip_tags_safe(html)
103}
/// Strip spaces between HTML tags
///
/// Removes runs of ASCII whitespace (space, tab, `\n`, `\r`) that sit directly
/// between the `>` that ends one tag and the `<` that starts the next.
/// Whitespace next to text, inside a tag, or at the start or end of the input
/// is preserved, so visible text is never glued together.
///
/// Tag boundaries are detected by scanning for `<` and `>` only; quoting
/// inside attributes is not interpreted.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::strip_spaces_between_tags;
///
/// assert_eq!(
///     strip_spaces_between_tags("<div>  <span>Test</span>  </div>"),
///     "<div><span>Test</span></div>"
/// );
/// assert_eq!(
///     strip_spaces_between_tags("<p>\n\n<b>Bold</b>\n\n</p>"),
///     "<p><b>Bold</b></p>"
/// );
/// // Whitespace between text and a tag is kept
/// assert_eq!(
///     strip_spaces_between_tags("<p>Hello <b>World</b></p>"),
///     "<p>Hello <b>World</b></p>"
/// );
/// ```
pub fn strip_spaces_between_tags(html: &str) -> String {
	let mut result = String::with_capacity(html.len());
	let mut in_tag = false;
	// Whitespace seen outside a tag whose fate is not yet decided.
	let mut pending = String::new();
	// True while the last non-whitespace character emitted was a `>`.
	let mut after_close = false;

	for ch in html.chars() {
		match ch {
			'<' => {
				// Only a run enclosed by `>` ... `<` is discarded.
				if !after_close {
					result.push_str(&pending);
				}
				pending.clear();
				in_tag = true;
				after_close = false;
				result.push(ch);
			}
			'>' => {
				// Keep any whitespace that preceded a stray `>` in its place.
				result.push_str(&pending);
				pending.clear();
				in_tag = false;
				after_close = true;
				result.push(ch);
			}
			' ' | '\t' | '\n' | '\r' if !in_tag => pending.push(ch),
			_ => {
				result.push_str(&pending);
				pending.clear();
				after_close = false;
				result.push(ch);
			}
		}
	}

	// Trailing whitespace is not followed by a tag, so it is kept.
	result.push_str(&pending);
	result
}
/// Escape text so it can be placed inside an HTML attribute value.
///
/// In addition to the five characters handled for element content (`&`, `<`,
/// `>`, `"`, `'`), line feed, carriage return and tab are rewritten as the
/// numeric references `&#10;`, `&#13;` and `&#9;`.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::escape_attr;
///
/// assert_eq!(escape_attr("value"), "value");
/// assert_eq!(escape_attr("value with \"quotes\""),
///            "value with &quot;quotes&quot;");
/// assert_eq!(escape_attr("line\nbreak"), "line&#10;break");
/// assert_eq!(escape_attr("tab\there"), "tab&#9;here");
/// ```
pub fn escape_attr(text: &str) -> String {
	let buffer = String::with_capacity(text.len() + 10);
	text.chars().fold(buffer, |mut out, ch| {
		let entity = match ch {
			'&' => Some("&amp;"),
			'<' => Some("&lt;"),
			'>' => Some("&gt;"),
			'"' => Some("&quot;"),
			'\'' => Some("&#x27;"),
			'\n' => Some("&#10;"),
			'\r' => Some("&#13;"),
			'\t' => Some("&#9;"),
			_ => None,
		};
		match entity {
			Some(replacement) => out.push_str(replacement),
			None => out.push(ch),
		}
		out
	})
}
180/// Format HTML template by substituting placeholder values with HTML-escaped content
181///
182/// All substituted values are automatically HTML-escaped to prevent XSS attacks.
183/// Placeholders are in the format `{key}` and are replaced with the escaped value.
184///
185/// # Security
186///
187/// This function escapes all special HTML characters in the values:
188/// - `&` → `&amp;`
189/// - `<` → `&lt;`
190/// - `>` → `&gt;`
191/// - `"` → `&quot;`
192/// - `'` → `&#x27;`
193///
194/// # Examples
195///
196/// ```
197/// use reinhardt_utils::utils_core::html::format_html;
198///
199/// let template = "<div class=\"{class}\">{content}</div>";
200/// let args = [("class", "container"), ("content", "Hello")];
201/// assert_eq!(
202///     format_html(template, &args),
203///     "<div class=\"container\">Hello</div>"
204/// );
205///
206/// // XSS attack is prevented by escaping
207/// let template = "<p>{user_input}</p>";
208/// let args = [("user_input", "<script>alert('xss')</script>")];
209/// assert_eq!(
210///     format_html(template, &args),
211///     "<p>&lt;script&gt;alert(&#x27;xss&#x27;)&lt;/script&gt;</p>"
212/// );
213/// ```
214pub fn format_html(template: &str, args: &[(&str, &str)]) -> String {
215	let mut result = template.to_string();
216	for (key, value) in args {
217		let placeholder = format!("{{{}}}", key);
218		let escaped_value = escape(value);
219		result = result.replace(&placeholder, &escaped_value);
220	}
221	result
222}
223/// Conditional escape - only escape if not already marked as safe
224///
225/// # Examples
226///
227/// ```
228/// use reinhardt_utils::utils_core::html::conditional_escape;
229///
230/// assert_eq!(conditional_escape("<script>", true), "&lt;script&gt;");
231/// assert_eq!(conditional_escape("<script>", false), "<script>");
232/// assert_eq!(conditional_escape("Hello", false), "Hello");
233/// ```
234pub fn conditional_escape(text: &str, autoescape: bool) -> Cow<'_, str> {
235	if autoescape {
236		Cow::Owned(escape(text))
237	} else {
238		Cow::Borrowed(text)
239	}
240}
241
/// A string flagged as already safe for HTML output (bypasses autoescaping)
#[derive(Debug, Clone)]
pub struct SafeString(String);

impl SafeString {
	/// Wrap `s` as a `SafeString`; the content is stored as given, with no
	/// escaping applied.
	///
	/// # Examples
	///
	/// ```
	/// use reinhardt_utils::utils_core::html::SafeString;
	///
	/// let safe = SafeString::new("<b>Bold</b>");
	/// assert_eq!(safe.as_str(), "<b>Bold</b>");
	/// ```
	pub fn new(s: impl Into<String>) -> Self {
		SafeString(s.into())
	}

	/// Borrow the wrapped string.
	///
	/// # Examples
	///
	/// ```
	/// use reinhardt_utils::utils_core::html::SafeString;
	///
	/// let safe = SafeString::new("<i>Italic</i>");
	/// assert_eq!(safe.as_str(), "<i>Italic</i>");
	/// ```
	pub fn as_str(&self) -> &str {
		self.0.as_str()
	}
}

impl From<String> for SafeString {
	/// Take ownership of an existing `String` without copying it.
	fn from(s: String) -> Self {
		SafeString(s)
	}
}

impl From<&str> for SafeString {
	/// Copy a string slice into a new `SafeString`.
	fn from(s: &str) -> Self {
		SafeString(s.to_owned())
	}
}
/// Truncate HTML to specified number of words, preserving tags
///
/// Text outside tags is split into words on ASCII whitespace (space, tab,
/// `\n`, `\r`) and on `<` / `>`. Everything between `<` and the next `>` is
/// copied verbatim and never counted. As soon as `num_words` words have been
/// completed, `"..."` is appended and the rest of the input is discarded; tags
/// that were open at that point are not closed.
///
/// A final word that runs to the end of the input without a following
/// separator is emitted without an ellipsis. `num_words == 0` yields an empty
/// string.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::truncate_html_words;
///
/// let html = "<p>This is a <b>test</b> sentence with many words.</p>";
/// let truncated = truncate_html_words(html, 5);
/// assert!(truncated.contains("This"));
/// assert!(truncated.contains("is"));
/// assert!(truncated.contains("..."));
///
/// let html2 = "<div>Hello <strong>world</strong> test</div>";
/// let truncated2 = truncate_html_words(html2, 2);
/// assert!(truncated2.contains("<div>"));
/// assert!(truncated2.contains("<strong>"));
/// ```
pub fn truncate_html_words(html: &str, num_words: usize) -> String {
	// Zero words requested: nothing may be emitted at all.
	if num_words == 0 {
		return String::new();
	}

	let mut result = String::new();
	let mut word_count = 0;
	let mut in_tag = false;
	let mut current_word = String::new();

	for ch in html.chars() {
		if in_tag {
			// Tag contents are copied through untouched; `>` ends the tag.
			in_tag = ch != '>';
			result.push(ch);
			continue;
		}

		match ch {
			'<' | '>' | ' ' | '\t' | '\n' | '\r' => {
				// A separator completes the word collected so far. Flushing
				// before emitting the separator keeps the original order.
				if !current_word.is_empty() {
					result.push_str(&current_word);
					current_word.clear();
					word_count += 1;
					if word_count >= num_words {
						result.push_str("...");
						return result;
					}
				}
				in_tag = ch == '<';
				result.push(ch);
			}
			_ => current_word.push(ch),
		}
	}

	// Reaching this point means the limit was never hit, so a trailing word
	// (if any) still belongs in the output.
	result.push_str(&current_word);
	result
}
355
#[cfg(test)]
mod tests {
	//! Unit tests for the HTML helpers; multi-case checks are table-driven.
	use super::*;

	#[test]
	fn test_escape() {
		let cases = [
			("Hello, World!", "Hello, World!"),
			(
				"<script>alert('XSS')</script>",
				"&lt;script&gt;alert(&#x27;XSS&#x27;)&lt;/script&gt;",
			),
			("5 < 10 & 10 > 5", "5 &lt; 10 &amp; 10 &gt; 5"),
			("\"quoted\"", "&quot;quoted&quot;"),
		];
		for (input, expected) in cases {
			assert_eq!(escape(input), expected);
		}
	}

	#[test]
	fn test_unescape() {
		let cases = [
			("&lt;div&gt;", "<div>"),
			("&amp;", "&"),
			("&quot;test&quot;", "\"test\""),
			("&#x27;", "'"),
			("&#39;", "'"),
		];
		for (input, expected) in cases {
			assert_eq!(unescape(input), expected);
		}
	}

	#[test]
	fn test_strip_tags() {
		let cases = [
			("<p>Hello <b>World</b></p>", "Hello World"),
			("<div><span>Test</span></div>", "Test"),
			("No tags here", "No tags here"),
			("<a href=\"#\">Link</a>", "Link"),
		];
		for (input, expected) in cases {
			assert_eq!(strip_tags(input), expected);
		}
	}

	#[test]
	fn test_strip_spaces_between_tags() {
		let compact = strip_spaces_between_tags("<div>  <span>Test</span>  </div>");
		assert_eq!(compact, "<div><span>Test</span></div>");
	}

	#[test]
	fn test_escape_attr() {
		let cases = [
			("value", "value"),
			("value with \"quotes\"", "value with &quot;quotes&quot;"),
			("line\nbreak", "line&#10;break"),
			("tab\there", "tab&#9;here"),
		];
		for (input, expected) in cases {
			assert_eq!(escape_attr(input), expected);
		}
	}

	#[test]
	fn test_format_html() {
		let rendered = format_html(
			"<div class=\"{class}\">{content}</div>",
			&[("class", "container"), ("content", "Hello")],
		);
		assert_eq!(rendered, "<div class=\"container\">Hello</div>");
	}

	#[test]
	fn test_conditional_escape() {
		let raw = "<script>";
		assert_eq!(conditional_escape(raw, true), "&lt;script&gt;");
		assert_eq!(conditional_escape(raw, false), "<script>");
	}

	#[test]
	fn test_safe_string() {
		let markup = "<b>Bold</b>";
		assert_eq!(SafeString::new(markup).as_str(), "<b>Bold</b>");
	}

	#[test]
	fn test_truncate_html_words() {
		let truncated =
			truncate_html_words("<p>This is a <b>test</b> sentence with many words.</p>", 5);
		for fragment in ["This", "is", "..."] {
			assert!(truncated.contains(fragment));
		}
	}

	#[test]
	fn test_truncate_html_preserves_tags() {
		let truncated = truncate_html_words("<div>Hello <strong>world</strong> test</div>", 2);
		for tag in ["<div>", "<strong>"] {
			assert!(truncated.contains(tag));
		}
	}

	#[test]
	fn test_safe_string_from_string() {
		let owned = String::from("<b>Bold</b>");
		assert_eq!(SafeString::from(owned).as_str(), "<b>Bold</b>");
	}

	#[test]
	fn test_safe_string_from_str() {
		assert_eq!(SafeString::from("<i>Italic</i>").as_str(), "<i>Italic</i>");
	}

	#[test]
	fn test_escape_empty_string() {
		assert_eq!(escape(""), "");
	}

	#[test]
	fn test_escape_multibyte() {
		assert_eq!(escape("こんにちは<>&"), "こんにちは&lt;&gt;&amp;");
	}

	#[test]
	fn test_unescape_incomplete_entity() {
		// Without a `;` the rest of the input is taken as the entity name:
		// a known name still decodes, while an unknown (here: empty) name is
		// written back followed by `;`, giving the "&;" pattern.
		assert_eq!(unescape("&lt"), "<");
		assert_eq!(unescape("&"), "&;");
	}

	#[test]
	fn test_unescape_unknown_entity() {
		assert_eq!(unescape("&unknown;"), "&unknown;");
	}

	#[test]
	fn test_strip_tags_nested() {
		assert_eq!(strip_tags("<div><p><span>Test</span></p></div>"), "Test");
	}

	#[test]
	fn test_strip_tags_empty() {
		assert_eq!(strip_tags(""), "");
	}

	#[test]
	fn test_strip_tags_quoted_attributes_with_angle_brackets() {
		let cases = [
			// Double-quoted attribute containing >
			(r#"<a title="x>y">Link</a>"#, "Link"),
			// Single-quoted attribute containing >
			("<a title='x>y'>Link</a>", "Link"),
			// Multiple quoted attributes with >
			(r#"<a title="a>b" data-value="c>d">Text</a>"#, "Text"),
			// Nested quotes: double inside single
			(r#"<a title='x"y'>Link</a>"#, "Link"),
			// Nested quotes: single inside double
			(r#"<a title="x'y">Link</a>"#, "Link"),
		];
		for (input, expected) in cases {
			assert_eq!(strip_tags(input), expected);
		}
	}

	#[test]
	fn test_strip_spaces_between_tags_multiple_spaces() {
		let compact =
			strip_spaces_between_tags("<div>   \n\t   <span>Test</span>   \n\t   </div>");
		assert_eq!(compact, "<div><span>Test</span></div>");
	}

	#[test]
	fn test_escape_attr_carriage_return() {
		assert_eq!(escape_attr("test\rvalue"), "test&#13;value");
	}

	#[test]
	fn test_format_html_multiple_replacements() {
		let rendered = format_html(
			"<div id=\"{id}\" class=\"{class}\">{content}</div>",
			&[("id", "main"), ("class", "container"), ("content", "Hello")],
		);
		assert_eq!(rendered, "<div id=\"main\" class=\"container\">Hello</div>");
	}

	#[test]
	fn test_format_html_no_replacements() {
		let no_args: [(&str, &str); 0] = [];
		assert_eq!(
			format_html("<div>Static content</div>", &no_args),
			"<div>Static content</div>"
		);
	}

	#[test]
	fn test_format_html_xss_prevention_script_tag() {
		// Arrange
		let args = [("content", "<script>alert('xss')</script>")];

		// Act
		let result = format_html("<p>{content}</p>", &args);

		// Assert - script tags must be escaped
		assert!(!result.contains("<script>"));
		for escaped in ["&lt;script&gt;", "&lt;/script&gt;", "&#x27;xss&#x27;"] {
			assert!(result.contains(escaped));
		}
	}

	#[test]
	fn test_format_html_xss_prevention_event_handler() {
		// Arrange
		let args = [
			("class", r#"container" onclick="alert('xss')"#),
			("content", "Safe content"),
		];

		// Act
		let result = format_html(r#"<div class="{class}">{content}</div>"#, &args);

		// Assert - quotes must be escaped to prevent event handler injection
		assert!(result.contains("&quot;"));
		assert!(!result.contains(r#"onclick="alert"#));
	}

	#[test]
	fn test_format_html_xss_prevention_ampersand() {
		// Arrange
		let args = [("query", "test&redirect=evil.com")];

		// Act
		let result = format_html("<a href=\"/search?q={query}\">Search</a>", &args);

		// Assert - ampersand must be escaped
		assert!(result.contains("&amp;"));
		assert!(!result.contains("test&redirect"));
	}

	#[test]
	fn test_format_html_xss_prevention_angle_brackets() {
		// Arrange
		let args = [("text", "<<SCRIPT>alert('XSS');//<</SCRIPT>")];

		// Act
		let result = format_html("<span>{text}</span>", &args);

		// Assert - all angle brackets must be escaped
		assert!(!result.contains("<SCRIPT>"));
		assert!(result.contains("&lt;"));
		assert!(result.contains("&gt;"));
	}

	#[test]
	fn test_format_html_safe_values_unchanged() {
		// Arrange - values without special characters should pass through unchanged
		let args = [
			("id", "main"),
			("class", "container"),
			("content", "Hello World"),
		];

		// Act
		let result = format_html("<div id=\"{id}\" class=\"{class}\">{content}</div>", &args);

		// Assert
		assert_eq!(
			result,
			"<div id=\"main\" class=\"container\">Hello World</div>"
		);
	}

	#[test]
	fn test_truncate_html_words_exact_count() {
		// The ellipsis is appended as soon as the word count reaches the
		// limit, even when no further words follow the last counted one.
		let truncated = truncate_html_words("<p>One two three</p>", 3);
		assert!(truncated.contains("..."));
	}

	#[test]
	fn test_truncate_html_words_empty() {
		assert_eq!(truncate_html_words("", 5), "");
	}
}
637
#[cfg(test)]
mod proptests {
	//! Property-based tests for the HTML helpers.
	//!
	//! Checks use `prop_assert!` / `prop_assert_eq!` so that a failing case is
	//! reported through proptest's own error path instead of a panic.
	use super::*;
	use proptest::prelude::*;

	proptest! {
		/// Input without special characters must not gain any when escaped.
		#[test]
		fn prop_escape_no_special_chars(s in "[^<>&\"']*") {
			let escaped = escape(&s);
			prop_assert!(!escaped.contains('<'));
			prop_assert!(!escaped.contains('>'));
			prop_assert!(!escaped.contains('&'));
			prop_assert!(!escaped.contains('"'));
			prop_assert!(!escaped.contains('\''));
		}

		/// No `<` may survive tag stripping.
		#[test]
		fn prop_strip_tags_no_angle_brackets(s in "\\PC*") {
			let stripped = strip_tags(&s);
			prop_assert!(!stripped.contains('<'));
		}

		/// Stripping tags never makes the text longer.
		#[test]
		fn prop_strip_tags_length_decrease(s in "\\PC*") {
			let stripped = strip_tags(&s);
			prop_assert!(stripped.len() <= s.len());
		}

		/// The truncated output stays close to the requested word limit.
		#[test]
		fn prop_truncate_html_words_respects_limit(html in "\\PC*", n in 1usize..20) {
			let truncated = truncate_html_words(&html, n);
			let word_count = truncated
				.split(|c: char| c.is_whitespace() || c == '<' || c == '>')
				.filter(|w| !w.is_empty() && !w.starts_with('/'))
				.filter(|w| !w.chars().all(|c| !c.is_alphanumeric()))
				.count();

			// Allow some flexibility due to HTML tags
			prop_assert!(word_count <= n + 5);
		}

		/// Attribute escaping leaves no raw newline, carriage return or tab.
		#[test]
		fn prop_escape_attr_no_newlines(s in "\\PC*") {
			let escaped = escape_attr(&s);
			prop_assert!(!escaped.contains('\n'));
			prop_assert!(!escaped.contains('\r'));
			prop_assert!(!escaped.contains('\t'));
		}

		/// With autoescape on, the result equals a direct `escape` call.
		#[test]
		fn prop_conditional_escape_when_true(s in "\\PC*") {
			let escaped_cond = conditional_escape(&s, true);
			let escaped_direct = escape(&s);
			prop_assert_eq!(&*escaped_cond, escaped_direct.as_str());
		}

		/// With autoescape off, the input comes back unchanged.
		#[test]
		fn prop_conditional_escape_when_false(s in "\\PC*") {
			let escaped = conditional_escape(&s, false);
			prop_assert_eq!(&*escaped, s.as_str());
		}

		/// Wrapping a string in `SafeString` does not alter its content.
		#[test]
		fn prop_safe_string_roundtrip(s in "\\PC*") {
			let safe = SafeString::from(s.clone());
			prop_assert_eq!(safe.as_str(), s.as_str());
		}

		/// Without arguments the template is returned verbatim.
		#[test]
		fn prop_format_html_preserves_non_placeholders(template in "\\PC*") {
			let args: [(&str, &str); 0] = [];
			let result = format_html(&template, &args);
			prop_assert_eq!(result.as_str(), template.as_str());
		}

		/// The function only ever removes characters, so the output can never
		/// be longer than the input.
		#[test]
		fn prop_strip_spaces_reduces_whitespace(s in "\\PC*") {
			let stripped = strip_spaces_between_tags(&s);
			prop_assert!(stripped.len() <= s.len());
		}
	}
}