Skip to main content

reinhardt_utils/utils_core/
html.rs

1//! HTML utilities for escaping, sanitization, and manipulation
2
3use reinhardt_core::security::xss::strip_tags_safe;
4use std::borrow::Cow;
/// Replace the five HTML-significant characters with entity references.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&#x27;` respectively. Every other character is copied through unchanged.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::escape;
///
/// assert_eq!(escape("Hello, World!"), "Hello, World!");
/// assert_eq!(escape("<script>alert('XSS')</script>"),
///            "&lt;script&gt;alert(&#x27;XSS&#x27;)&lt;/script&gt;");
/// assert_eq!(escape("5 < 10 & 10 > 5"), "5 &lt; 10 &amp; 10 &gt; 5");
/// ```
pub fn escape(text: &str) -> String {
	// Reserve a little more than the input length as headroom for replacements.
	let mut escaped = String::with_capacity(text.len() + 10);
	for c in text.chars() {
		let entity = match c {
			'&' => "&amp;",
			'<' => "&lt;",
			'>' => "&gt;",
			'"' => "&quot;",
			'\'' => "&#x27;",
			other => {
				// Not special: copy verbatim and move on.
				escaped.push(other);
				continue;
			}
		};
		escaped.push_str(entity);
	}
	escaped
}
/// Unescape HTML entities
///
/// Recognised references:
/// - the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;`
/// - decimal character references such as `&#39;`
/// - hexadecimal character references such as `&#x27;` or `&#X3C;`
///
/// Anything else (unknown names, malformed or out-of-range numbers) is written
/// back as `&name;` unchanged.
///
/// # Unterminated entities
///
/// The entity name is everything between `&` and the next `;`. When no `;`
/// follows, the rest of the input is taken as the name and handled exactly as
/// if it had been terminated: `"&lt"` yields `"<"`, while a lone `"&"` yields
/// `"&;"`. This behaviour is kept because the module's tests rely on it.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::unescape;
///
/// assert_eq!(unescape("&lt;div&gt;"), "<div>");
/// assert_eq!(unescape("&amp;"), "&");
/// assert_eq!(unescape("&quot;test&quot;"), "\"test\"");
/// assert_eq!(unescape("&#x27;"), "'");
/// assert_eq!(unescape("&#x3C;b&#x3E;"), "<b>");
/// assert_eq!(unescape("&unknown;"), "&unknown;");
/// ```
pub fn unescape(text: &str) -> String {
	let mut result = String::with_capacity(text.len());
	let mut chars = text.chars();

	// `while let` (not `for`) because the body also advances the iterator.
	while let Some(ch) = chars.next() {
		if ch != '&' {
			result.push(ch);
			continue;
		}

		// `take_while` also consumes the terminating ';' (or runs to the end
		// of the input when there is none).
		let entity: String = chars.by_ref().take_while(|&c| c != ';').collect();

		let decoded = match entity.as_str() {
			"amp" => Some('&'),
			"lt" => Some('<'),
			"gt" => Some('>'),
			"quot" => Some('"'),
			"apos" => Some('\''),
			other => other.strip_prefix('#').and_then(|body| {
				// `#x…` / `#X…` is hexadecimal, a bare `#…` is decimal.
				let (digits, radix) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
					Some(hex) => (hex, 16),
					None => (body, 10),
				};
				u32::from_str_radix(digits, radix)
					.ok()
					.and_then(char::from_u32)
			}),
		};

		match decoded {
			Some(c) => result.push(c),
			None => {
				// Not something we can decode: reproduce it literally.
				result.push('&');
				result.push_str(&entity);
				result.push(';');
			}
		}
	}
	result
}
80/// Strip HTML tags from text
81///
82/// This function uses `strip_tags_safe` from `reinhardt_core::security::xss`
83/// which properly handles malformed HTML including:
84/// - `>` inside quoted attributes (e.g., `<a title="x>y">`)
85/// - Unclosed tags at end of input
86/// - HTML comments (`<!-- ... -->`)
87/// - Self-closing tags
88///
89/// # Examples
90///
91/// ```
92/// use reinhardt_utils::utils_core::html::strip_tags;
93///
94/// assert_eq!(strip_tags("<p>Hello <b>World</b></p>"), "Hello World");
95/// assert_eq!(strip_tags("<a href=\"#\">Link</a>"), "Link");
96/// assert_eq!(strip_tags("No tags here"), "No tags here");
97/// // Fixes #795: Handles > inside quoted attributes
98/// assert_eq!(strip_tags(r#"<a title="x>y">Link</a>"#), "Link");
99/// ```
100pub fn strip_tags(html: &str) -> String {
101	// Fixes #795: Delegate to secure implementation that handles malformed HTML
102	strip_tags_safe(html)
103}
/// Strip whitespace between HTML tags
///
/// A run of spaces, tabs, carriage returns or newlines is removed only when
/// it sits directly between a `>` and the next `<`. Whitespace next to text,
/// inside a tag, or at the very start or end of the input is kept as is, so
/// visible word spacing such as `Hello <b>World</b>` is not altered.
///
/// Tags are recognised purely by the `<` and `>` characters; quoted attribute
/// values are not parsed.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::strip_spaces_between_tags;
///
/// assert_eq!(
///     strip_spaces_between_tags("<div>  <span>Test</span>  </div>"),
///     "<div><span>Test</span></div>"
/// );
/// assert_eq!(
///     strip_spaces_between_tags("<p>\n\n<b>Bold</b>\n\n</p>"),
///     "<p><b>Bold</b></p>"
/// );
/// assert_eq!(
///     strip_spaces_between_tags("<p>Hello <b>World</b></p>"),
///     "<p>Hello <b>World</b></p>"
/// );
/// ```
pub fn strip_spaces_between_tags(html: &str) -> String {
	let mut result = String::with_capacity(html.len());
	let mut in_tag = false;
	// Whitespace seen outside a tag whose fate is not yet decided.
	let mut pending = String::new();
	// True when the pending run started immediately after a '>'.
	let mut pending_follows_tag = false;
	// True when the previously emitted character was '>'.
	let mut prev_was_close = false;

	for ch in html.chars() {
		if !in_tag && matches!(ch, ' ' | '\t' | '\n' | '\r') {
			if pending.is_empty() {
				pending_follows_tag = prev_was_close;
			}
			pending.push(ch);
			continue;
		}

		if !pending.is_empty() {
			// Drop the run only when it is enclosed by '>' ... '<'.
			let between_tags = pending_follows_tag && ch == '<';
			if !between_tags {
				result.push_str(&pending);
			}
			pending.clear();
		}

		match ch {
			'<' => in_tag = true,
			'>' => in_tag = false,
			_ => {}
		}
		result.push(ch);
		prev_was_close = ch == '>';
	}

	// Trailing whitespace is not between two tags, so it is kept.
	result.push_str(&pending);
	result
}
150/// Escape attribute value for use in HTML
151///
152/// # Examples
153///
154/// ```
155/// use reinhardt_utils::utils_core::html::escape_attr;
156///
157/// assert_eq!(escape_attr("value"), "value");
158/// assert_eq!(escape_attr("value with \"quotes\""),
159///            "value with &quot;quotes&quot;");
160/// assert_eq!(escape_attr("line\nbreak"), "line&#10;break");
161/// assert_eq!(escape_attr("tab\there"), "tab&#9;here");
162/// ```
163pub fn escape_attr(text: &str) -> String {
164	let escaped = escape(text);
165	// Extend with whitespace escaping required for attribute context
166	let mut result = String::with_capacity(escaped.len());
167	for ch in escaped.chars() {
168		match ch {
169			'\n' => result.push_str("&#10;"),
170			'\r' => result.push_str("&#13;"),
171			'\t' => result.push_str("&#9;"),
172			_ => result.push(ch),
173		}
174	}
175	result
176}
177/// Format HTML template by substituting placeholder values with HTML-escaped content
178///
179/// All substituted values are automatically HTML-escaped to prevent XSS attacks.
180/// Placeholders are in the format `{key}` and are replaced with the escaped value.
181///
182/// # Security
183///
184/// This function escapes all special HTML characters in the values:
185/// - `&` → `&amp;`
186/// - `<` → `&lt;`
187/// - `>` → `&gt;`
188/// - `"` → `&quot;`
189/// - `'` → `&#x27;`
190///
191/// # Examples
192///
193/// ```
194/// use reinhardt_utils::utils_core::html::format_html;
195///
196/// let template = "<div class=\"{class}\">{content}</div>";
197/// let args = [("class", "container"), ("content", "Hello")];
198/// assert_eq!(
199///     format_html(template, &args),
200///     "<div class=\"container\">Hello</div>"
201/// );
202///
203/// // XSS attack is prevented by escaping
204/// let template = "<p>{user_input}</p>";
205/// let args = [("user_input", "<script>alert('xss')</script>")];
206/// assert_eq!(
207///     format_html(template, &args),
208///     "<p>&lt;script&gt;alert(&#x27;xss&#x27;)&lt;/script&gt;</p>"
209/// );
210/// ```
211pub fn format_html(template: &str, args: &[(&str, &str)]) -> String {
212	let mut result = template.to_string();
213	for (key, value) in args {
214		let placeholder = format!("{{{}}}", key);
215		let escaped_value = escape(value);
216		result = result.replace(&placeholder, &escaped_value);
217	}
218	result
219}
220/// Conditional escape - only escape if not already marked as safe
221///
222/// # Examples
223///
224/// ```
225/// use reinhardt_utils::utils_core::html::conditional_escape;
226///
227/// assert_eq!(conditional_escape("<script>", true), "&lt;script&gt;");
228/// assert_eq!(conditional_escape("<script>", false), "<script>");
229/// assert_eq!(conditional_escape("Hello", false), "Hello");
230/// ```
231pub fn conditional_escape(text: &str, autoescape: bool) -> Cow<'_, str> {
232	if autoescape {
233		Cow::Owned(escape(text))
234	} else {
235		Cow::Borrowed(text)
236	}
237}
238
/// A string that the caller declares to be safe HTML (bypasses autoescaping).
///
/// The wrapper stores the text exactly as given; nothing here escapes or
/// validates it.
#[derive(Debug, Clone)]
pub struct SafeString(String);

impl SafeString {
	/// Wrap `s` as a `SafeString` without modifying it.
	///
	/// # Examples
	///
	/// ```
	/// use reinhardt_utils::utils_core::html::SafeString;
	///
	/// let safe = SafeString::new("<b>Bold</b>");
	/// assert_eq!(safe.as_str(), "<b>Bold</b>");
	/// ```
	pub fn new(s: impl Into<String>) -> Self {
		let owned: String = s.into();
		SafeString(owned)
	}

	/// Borrow the wrapped text.
	///
	/// # Examples
	///
	/// ```
	/// use reinhardt_utils::utils_core::html::SafeString;
	///
	/// let safe = SafeString::new("<i>Italic</i>");
	/// assert_eq!(safe.as_str(), "<i>Italic</i>");
	/// ```
	pub fn as_str(&self) -> &str {
		self.0.as_str()
	}
}

impl From<String> for SafeString {
	/// Take ownership of `s` without copying.
	fn from(s: String) -> Self {
		SafeString(s)
	}
}

impl From<&str> for SafeString {
	/// Copy `s` into a new owned `SafeString`.
	fn from(s: &str) -> Self {
		SafeString(s.to_owned())
	}
}
/// Truncate HTML to specified number of words, preserving tags
///
/// Words are runs of characters outside tags, separated by spaces, tabs,
/// carriage returns, newlines or the start of a tag. Tag markup (`<...>`) is
/// copied through and never counted. As soon as the `num_words`-th word is
/// followed by whitespace or a tag, `"..."` is appended and the function
/// returns; this happens even when no further words would have followed.
/// Input with fewer words, or whose last counted word runs to the end of the
/// input, is returned without an ellipsis.
///
/// Notes:
/// - `num_words == 0` returns an empty string.
/// - Tags that are open at the cut-off point are not closed.
/// - A `>` that does not close a tag is treated as part of the surrounding
///   word.
///
/// # Examples
///
/// ```
/// use reinhardt_utils::utils_core::html::truncate_html_words;
///
/// let html = "<p>This is a <b>test</b> sentence with many words.</p>";
/// let truncated = truncate_html_words(html, 5);
/// assert!(truncated.contains("This"));
/// assert!(truncated.contains("is"));
/// assert!(truncated.contains("..."));
///
/// let html2 = "<div>Hello <strong>world</strong> test</div>";
/// let truncated2 = truncate_html_words(html2, 2);
/// assert!(truncated2.contains("<div>"));
/// assert!(truncated2.contains("<strong>"));
///
/// assert_eq!(truncate_html_words("<p>Hi there</p>", 0), "");
/// ```
pub fn truncate_html_words(html: &str, num_words: usize) -> String {
	// Asking for zero words must not leak the first word.
	if num_words == 0 {
		return String::new();
	}

	let mut result = String::with_capacity(html.len());
	let mut word_count = 0usize;
	let mut in_tag = false;
	let mut current_word = String::new();

	for ch in html.chars() {
		// A word ends at the start of a tag or at whitespace outside a tag.
		let ends_word = ch == '<' || (!in_tag && matches!(ch, ' ' | '\t' | '\n' | '\r'));

		if ends_word && !current_word.is_empty() {
			result.push_str(&current_word);
			current_word.clear();
			word_count += 1;
			if word_count >= num_words {
				result.push_str("...");
				return result;
			}
		}

		match ch {
			'<' => {
				in_tag = true;
				result.push(ch);
			}
			'>' if in_tag => {
				in_tag = false;
				result.push(ch);
			}
			// Tag contents and word separators go straight to the output.
			_ if in_tag || ends_word => result.push(ch),
			// Everything else (including a stray '>') belongs to the word.
			_ => current_word.push(ch),
		}
	}

	// Input ended before the limit was hit: emit the unfinished word, if any.
	result.push_str(&current_word);
	result
}
352
#[cfg(test)]
mod tests {
	use super::*;

	// ---- escape -----------------------------------------------------------

	#[test]
	fn test_escape() {
		let cases = [
			("Hello, World!", "Hello, World!"),
			(
				"<script>alert('XSS')</script>",
				"&lt;script&gt;alert(&#x27;XSS&#x27;)&lt;/script&gt;",
			),
			("5 < 10 & 10 > 5", "5 &lt; 10 &amp; 10 &gt; 5"),
			("\"quoted\"", "&quot;quoted&quot;"),
		];
		for (input, expected) in cases {
			assert_eq!(escape(input), expected);
		}
	}

	// ---- unescape ---------------------------------------------------------

	#[test]
	fn test_unescape() {
		let cases = [
			("&lt;div&gt;", "<div>"),
			("&amp;", "&"),
			("&quot;test&quot;", "\"test\""),
			("&#x27;", "'"),
			("&#39;", "'"),
		];
		for (input, expected) in cases {
			assert_eq!(unescape(input), expected);
		}
	}

	// ---- strip_tags -------------------------------------------------------

	#[test]
	fn test_strip_tags() {
		let cases = [
			("<p>Hello <b>World</b></p>", "Hello World"),
			("<div><span>Test</span></div>", "Test"),
			("No tags here", "No tags here"),
			("<a href=\"#\">Link</a>", "Link"),
		];
		for (input, expected) in cases {
			assert_eq!(strip_tags(input), expected);
		}
	}

	// ---- strip_spaces_between_tags ----------------------------------------

	#[test]
	fn test_strip_spaces_between_tags() {
		let compacted = strip_spaces_between_tags("<div>  <span>Test</span>  </div>");
		assert_eq!(compacted, "<div><span>Test</span></div>");
	}

	// ---- escape_attr ------------------------------------------------------

	#[test]
	fn test_escape_attr() {
		let cases = [
			("value", "value"),
			("value with \"quotes\"", "value with &quot;quotes&quot;"),
			("line\nbreak", "line&#10;break"),
			("tab\there", "tab&#9;here"),
		];
		for (input, expected) in cases {
			assert_eq!(escape_attr(input), expected);
		}
	}

	// ---- format_html ------------------------------------------------------

	#[test]
	fn test_format_html() {
		let rendered = format_html(
			"<div class=\"{class}\">{content}</div>",
			&[("class", "container"), ("content", "Hello")],
		);
		assert_eq!(rendered, "<div class=\"container\">Hello</div>");
	}

	// ---- conditional_escape -----------------------------------------------

	#[test]
	fn test_conditional_escape() {
		assert_eq!(conditional_escape("<script>", true), "&lt;script&gt;");
		assert_eq!(conditional_escape("<script>", false), "<script>");
	}

	// ---- SafeString -------------------------------------------------------

	#[test]
	fn test_safe_string() {
		let wrapped = SafeString::new("<b>Bold</b>");
		assert_eq!(wrapped.as_str(), "<b>Bold</b>");
	}

	// ---- truncate_html_words ----------------------------------------------

	#[test]
	fn test_truncate_html_words() {
		let shortened = truncate_html_words(
			"<p>This is a <b>test</b> sentence with many words.</p>",
			5,
		);
		for needle in ["This", "is", "..."] {
			assert!(shortened.contains(needle));
		}
	}

	#[test]
	fn test_truncate_html_preserves_tags() {
		let shortened = truncate_html_words("<div>Hello <strong>world</strong> test</div>", 2);
		for tag in ["<div>", "<strong>"] {
			assert!(shortened.contains(tag));
		}
	}

	#[test]
	fn test_safe_string_from_string() {
		let wrapped = SafeString::from(String::from("<b>Bold</b>"));
		assert_eq!(wrapped.as_str(), "<b>Bold</b>");
	}

	#[test]
	fn test_safe_string_from_str() {
		let wrapped = SafeString::from("<i>Italic</i>");
		assert_eq!(wrapped.as_str(), "<i>Italic</i>");
	}

	#[test]
	fn test_escape_empty_string() {
		assert_eq!(escape(""), "");
	}

	#[test]
	fn test_escape_multibyte() {
		assert_eq!(escape("こんにちは<>&"), "こんにちは&lt;&gt;&amp;");
	}

	#[test]
	fn test_unescape_incomplete_entity() {
		// Without a terminating ';' the rest of the input is taken as the
		// entity name and processed as if the ';' were present. A known name
		// is therefore still decoded, while an empty name is echoed back as
		// "&;".
		assert_eq!(unescape("&lt"), "<");
		assert_eq!(unescape("&"), "&;");
	}

	#[test]
	fn test_unescape_unknown_entity() {
		assert_eq!(unescape("&unknown;"), "&unknown;");
	}

	#[test]
	fn test_strip_tags_nested() {
		assert_eq!(strip_tags("<div><p><span>Test</span></p></div>"), "Test");
	}

	#[test]
	fn test_strip_tags_empty() {
		assert_eq!(strip_tags(""), "");
	}

	#[test]
	fn test_strip_tags_quoted_attributes_with_angle_brackets() {
		let cases = [
			// Double-quoted attribute containing >
			(r#"<a title="x>y">Link</a>"#, "Link"),
			// Single-quoted attribute containing >
			("<a title='x>y'>Link</a>", "Link"),
			// Multiple quoted attributes with >
			(r#"<a title="a>b" data-value="c>d">Text</a>"#, "Text"),
			// Nested quotes: double inside single
			(r#"<a title='x"y'>Link</a>"#, "Link"),
			// Nested quotes: single inside double
			(r#"<a title="x'y">Link</a>"#, "Link"),
		];
		for (input, expected) in cases {
			assert_eq!(strip_tags(input), expected);
		}
	}

	#[test]
	fn test_strip_spaces_between_tags_multiple_spaces() {
		let compacted =
			strip_spaces_between_tags("<div>   \n\t   <span>Test</span>   \n\t   </div>");
		assert_eq!(compacted, "<div><span>Test</span></div>");
	}

	#[test]
	fn test_escape_attr_carriage_return() {
		assert_eq!(escape_attr("test\rvalue"), "test&#13;value");
	}

	#[test]
	fn test_format_html_multiple_replacements() {
		let rendered = format_html(
			"<div id=\"{id}\" class=\"{class}\">{content}</div>",
			&[("id", "main"), ("class", "container"), ("content", "Hello")],
		);
		assert_eq!(rendered, "<div id=\"main\" class=\"container\">Hello</div>");
	}

	#[test]
	fn test_format_html_no_replacements() {
		let no_args: [(&str, &str); 0] = [];
		assert_eq!(
			format_html("<div>Static content</div>", &no_args),
			"<div>Static content</div>"
		);
	}

	#[test]
	fn test_format_html_xss_prevention_script_tag() {
		// Arrange
		let payload = [("content", "<script>alert('xss')</script>")];

		// Act
		let rendered = format_html("<p>{content}</p>", &payload);

		// Assert - script tags must be escaped
		assert!(!rendered.contains("<script>"));
		for fragment in ["&lt;script&gt;", "&lt;/script&gt;", "&#x27;xss&#x27;"] {
			assert!(rendered.contains(fragment));
		}
	}

	#[test]
	fn test_format_html_xss_prevention_event_handler() {
		// Arrange
		let payload = [
			("class", r#"container" onclick="alert('xss')"#),
			("content", "Safe content"),
		];

		// Act
		let rendered = format_html(r#"<div class="{class}">{content}</div>"#, &payload);

		// Assert - quotes must be escaped to prevent event handler injection
		assert!(rendered.contains("&quot;"));
		assert!(!rendered.contains(r#"onclick="alert"#));
	}

	#[test]
	fn test_format_html_xss_prevention_ampersand() {
		// Arrange
		let payload = [("query", "test&redirect=evil.com")];

		// Act
		let rendered = format_html("<a href=\"/search?q={query}\">Search</a>", &payload);

		// Assert - ampersand must be escaped
		assert!(rendered.contains("&amp;"));
		assert!(!rendered.contains("test&redirect"));
	}

	#[test]
	fn test_format_html_xss_prevention_angle_brackets() {
		// Arrange
		let payload = [("text", "<<SCRIPT>alert('XSS');//<</SCRIPT>")];

		// Act
		let rendered = format_html("<span>{text}</span>", &payload);

		// Assert - all angle brackets must be escaped
		assert!(!rendered.contains("<SCRIPT>"));
		assert!(rendered.contains("&lt;"));
		assert!(rendered.contains("&gt;"));
	}

	#[test]
	fn test_format_html_safe_values_unchanged() {
		// Arrange - values without special characters should pass through unchanged
		let payload = [
			("id", "main"),
			("class", "container"),
			("content", "Hello World"),
		];

		// Act
		let rendered = format_html(
			"<div id=\"{id}\" class=\"{class}\">{content}</div>",
			&payload,
		);

		// Assert
		assert_eq!(
			rendered,
			"<div id=\"main\" class=\"container\">Hello World</div>"
		);
	}

	#[test]
	fn test_truncate_html_words_exact_count() {
		// The ellipsis is appended as soon as the requested number of words
		// has been completed, even when no further words follow.
		let shortened = truncate_html_words("<p>One two three</p>", 3);
		assert!(shortened.contains("..."));
	}

	#[test]
	fn test_truncate_html_words_empty() {
		assert_eq!(truncate_html_words("", 5), "");
	}
}
634
#[cfg(test)]
mod proptests {
	use super::*;
	use proptest::prelude::*;

	// `prop_assert!` / `prop_assert_eq!` are used instead of the panicking
	// `assert!` family so that failures are reported to proptest as test-case
	// errors rather than panics.
	proptest! {
		// Text containing none of the five special characters must pass
		// through `escape` unchanged (and so still contain none of them).
		#[test]
		fn prop_escape_no_special_chars(s in "[^<>&\"']*") {
			let escaped = escape(&s);
			prop_assert_eq!(&escaped, &s);
			prop_assert!(
				!escaped.contains(|c: char| matches!(c, '<' | '>' | '&' | '"' | '\''))
			);
		}

		#[test]
		fn prop_strip_tags_no_angle_brackets(s in "\\PC*") {
			let stripped = strip_tags(&s);
			prop_assert!(!stripped.contains('<'));
		}

		#[test]
		fn prop_strip_tags_length_decrease(s in "\\PC*") {
			let stripped = strip_tags(&s);
			prop_assert!(stripped.len() <= s.len());
		}

		#[test]
		fn prop_truncate_html_words_respects_limit(html in "\\PC*", n in 1usize..20) {
			let truncated = truncate_html_words(&html, n);
			let word_count = truncated
				.split(|c: char| c.is_whitespace() || c == '<' || c == '>')
				.filter(|w| !w.is_empty() && !w.starts_with('/'))
				.filter(|w| w.chars().any(|c| c.is_alphanumeric()))
				.count();

			// Allow some flexibility due to HTML tags
			prop_assert!(word_count <= n + 5);
		}

		// `\PC` alone never produces control characters, so newline, carriage
		// return and tab are added to the generator explicitly; otherwise the
		// property could never fail.
		#[test]
		fn prop_escape_attr_no_newlines(s in "[\\PC\\n\\r\\t]*") {
			let escaped = escape_attr(&s);
			prop_assert!(!escaped.contains('\n'));
			prop_assert!(!escaped.contains('\r'));
			prop_assert!(!escaped.contains('\t'));
		}

		#[test]
		fn prop_conditional_escape_when_true(s in "\\PC*") {
			let escaped_cond = conditional_escape(&s, true);
			let escaped_direct = escape(&s);
			prop_assert_eq!(escaped_cond.as_ref(), escaped_direct.as_str());
		}

		#[test]
		fn prop_conditional_escape_when_false(s in "\\PC*") {
			let escaped = conditional_escape(&s, false);
			prop_assert_eq!(escaped.as_ref(), s.as_str());
		}

		#[test]
		fn prop_safe_string_roundtrip(s in "\\PC*") {
			let safe = SafeString::from(s.clone());
			prop_assert_eq!(safe.as_str(), s.as_str());
		}

		#[test]
		fn prop_format_html_preserves_non_placeholders(template in "\\PC*") {
			let args: [(&str, &str); 0] = [];
			let result = format_html(&template, &args);
			prop_assert_eq!(&result, &template);
		}

		// The function only ever copies or drops input characters, so the
		// output can never be longer than the input.
		#[test]
		fn prop_strip_spaces_reduces_whitespace(s in "\\PC*") {
			let stripped = strip_spaces_between_tags(&s);
			prop_assert!(stripped.len() <= s.len());
		}
	}
}