contextual_encoder/html.rs
1//! HTML / XML contextual output encoders.
2//!
3//! provides four encoding contexts with different safety guarantees:
4//!
5//! - [`for_html`] — safe for both text content and quoted attributes (most conservative)
6//! - [`for_html_content`] — safe for text content only (does not encode quotes)
7//! - [`for_html_attribute`] — safe for quoted attributes only (does not encode `>`)
8//! - [`for_html_unquoted_attribute`] — safe for unquoted attribute values (most aggressive)
9//!
10//! all encoders replace invalid XML characters (C0/C1 controls, DEL, unicode
11//! non-characters) with a replacement character (space or dash depending on
12//! context).
13//!
14//! # security notes
15//!
16//! - these encoders produce output safe for embedding in the specified context.
17//! they do not sanitize HTML — encoding is not a substitute for input validation.
18//! - never use `for_html_content` output in an attribute context.
19//! - never use `for_html_attribute` output in a text content context where `>` matters.
20//! - `for_html` is the safe default when the exact context is unknown.
21//! - tag names, attribute names, and event handler names must be validated
22//! separately — encoding cannot make arbitrary names safe.
23
24use std::fmt;
25
26use crate::engine::{encode_loop, is_invalid_for_xml, is_unicode_noncharacter};
27
28// ---------------------------------------------------------------------------
29// for_html — safe for text content AND quoted attributes
30// ---------------------------------------------------------------------------
31
32/// encodes `input` for safe embedding in HTML text content and quoted attributes.
33///
34/// this is the most conservative HTML encoder — it encodes characters needed
35/// for both text content and attribute contexts. use [`for_html_content`] or
36/// [`for_html_attribute`] for more minimal encoding when the exact context is
37/// known.
38///
39/// # encoded characters
40///
41/// | input | output |
42/// |-------|--------|
43/// | `&` | `&` |
44/// | `<` | `<` |
45/// | `>` | `>` |
46/// | `"` | `"` |
47/// | `'` | `'` |
48///
49/// invalid XML characters are replaced with a space.
50///
51/// # examples
52///
53/// ```
54/// use contextual_encoder::for_html;
55///
56/// assert_eq!(for_html("<script>alert('xss')</script>"),
57/// "<script>alert('xss')</script>");
58/// assert_eq!(for_html("safe text"), "safe text");
59/// ```
60pub fn for_html(input: &str) -> String {
61 let mut out = String::with_capacity(input.len());
62 write_html(&mut out, input).expect("writing to string cannot fail");
63 out
64}
65
66/// writes the HTML-encoded form of `input` to `out`.
67///
68/// see [`for_html`] for encoding rules.
69pub fn write_html<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
70 encode_loop(out, input, needs_html_encoding, write_html_encoded)
71}
72
73fn needs_html_encoding(c: char) -> bool {
74 matches!(c, '&' | '<' | '>' | '"' | '\'') || is_invalid_for_xml(c)
75}
76
/// writes the replacement for a single character `c` selected for encoding
/// in the text-content-and-quoted-attribute context.
///
/// `&`, `<` and `>` become named entities; `"` and `'` become decimal
/// character references. any other character handed to this function is
/// treated as an invalid XML character and replaced with a space.
/// `_next` is unused.
fn write_html_encoded<W: fmt::Write>(out: &mut W, c: char, _next: Option<char>) -> fmt::Result {
    match c {
        '&' => out.write_str("&amp;"),
        '<' => out.write_str("&lt;"),
        '>' => out.write_str("&gt;"),
        '"' => out.write_str("&#34;"),
        '\'' => out.write_str("&#39;"),
        // invalid XML char → space
        _ => out.write_char(' '),
    }
}
88
89// ---------------------------------------------------------------------------
90// for_html_content — safe for text content only (NOT attributes)
91// ---------------------------------------------------------------------------
92
93/// encodes `input` for safe embedding in HTML text content.
94///
95/// this encoder does **not** encode quote characters and is therefore
96/// **not safe for attribute values**. use [`for_html`] or
97/// [`for_html_attribute`] for attribute contexts.
98///
99/// # encoded characters
100///
101/// | input | output |
102/// |-------|--------|
103/// | `&` | `&` |
104/// | `<` | `<` |
105/// | `>` | `>` |
106///
107/// invalid XML characters are replaced with a space.
108///
109/// # examples
110///
111/// ```
112/// use contextual_encoder::for_html_content;
113///
114/// assert_eq!(for_html_content("1 < 2 & 3 > 0"), "1 < 2 & 3 > 0");
115/// // quotes are NOT encoded — do not use in attributes
116/// assert_eq!(for_html_content(r#"she said "hi""#), r#"she said "hi""#);
117/// ```
118pub fn for_html_content(input: &str) -> String {
119 let mut out = String::with_capacity(input.len());
120 write_html_content(&mut out, input).expect("writing to string cannot fail");
121 out
122}
123
124/// writes the HTML-content-encoded form of `input` to `out`.
125///
126/// see [`for_html_content`] for encoding rules.
127pub fn write_html_content<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
128 encode_loop(
129 out,
130 input,
131 needs_html_content_encoding,
132 write_html_content_encoded,
133 )
134}
135
136fn needs_html_content_encoding(c: char) -> bool {
137 matches!(c, '&' | '<' | '>') || is_invalid_for_xml(c)
138}
139
/// writes the replacement for a single character `c` selected for encoding
/// in the text-content context.
///
/// `&`, `<` and `>` become named entities. any other character handed to
/// this function is treated as an invalid XML character and replaced with a
/// space. `_next` is unused.
fn write_html_content_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    match c {
        '&' => out.write_str("&amp;"),
        '<' => out.write_str("&lt;"),
        '>' => out.write_str("&gt;"),
        // invalid XML char → space
        _ => out.write_char(' '),
    }
}
152
153// ---------------------------------------------------------------------------
154// for_html_attribute — safe for quoted attributes only
155// ---------------------------------------------------------------------------
156
157/// encodes `input` for safe embedding in a quoted HTML attribute value.
158///
159/// this encoder does **not** encode `>` (harmless inside quoted attributes)
160/// and is slightly more minimal than [`for_html`]. it encodes both `"` and
161/// `'` so the output is safe regardless of which quote delimiter is used.
162///
163/// **not safe for unquoted attributes** — use [`for_html_unquoted_attribute`]
164/// for that context.
165///
166/// # encoded characters
167///
168/// | input | output |
169/// |-------|--------|
170/// | `&` | `&` |
171/// | `<` | `<` |
172/// | `"` | `"` |
173/// | `'` | `'` |
174///
175/// invalid XML characters are replaced with a space.
176///
177/// # examples
178///
179/// ```
180/// use contextual_encoder::for_html_attribute;
181///
182/// // safe for both quote styles
183/// assert_eq!(
184/// for_html_attribute(r#"it's a "test""#),
185/// "it's a "test""
186/// );
187/// // > is not encoded
188/// assert_eq!(for_html_attribute("a > b"), "a > b");
189/// ```
190pub fn for_html_attribute(input: &str) -> String {
191 let mut out = String::with_capacity(input.len());
192 write_html_attribute(&mut out, input).expect("writing to string cannot fail");
193 out
194}
195
196/// writes the HTML-attribute-encoded form of `input` to `out`.
197///
198/// see [`for_html_attribute`] for encoding rules.
199pub fn write_html_attribute<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
200 encode_loop(
201 out,
202 input,
203 needs_html_attribute_encoding,
204 write_html_attribute_encoded,
205 )
206}
207
208fn needs_html_attribute_encoding(c: char) -> bool {
209 matches!(c, '&' | '<' | '"' | '\'') || is_invalid_for_xml(c)
210}
211
/// writes the replacement for a single character `c` selected for encoding
/// in the quoted-attribute context.
///
/// `&` and `<` become named entities; `"` and `'` become decimal character
/// references. any other character handed to this function is treated as an
/// invalid XML character and replaced with a space. `_next` is unused.
fn write_html_attribute_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    match c {
        '&' => out.write_str("&amp;"),
        '<' => out.write_str("&lt;"),
        '"' => out.write_str("&#34;"),
        '\'' => out.write_str("&#39;"),
        // invalid XML char → space
        _ => out.write_char(' '),
    }
}
225
226// ---------------------------------------------------------------------------
227// for_html_unquoted_attribute — safe for unquoted attribute values
228// ---------------------------------------------------------------------------
229
230/// encodes `input` for safe embedding in an unquoted HTML attribute value.
231///
232/// this is the most aggressive HTML encoder, encoding whitespace, quote
233/// characters, grave accents, and many punctuation characters that could
234/// terminate an unquoted attribute value.
235///
236/// **prefer quoted attributes** whenever possible. unquoted attributes are
237/// fragile and this encoder exists only for cases where quoting is not an
238/// option.
239///
240/// # caveat: grave accent
241///
242/// the grave accent (`` ` ``, U+0060) is encoded as ``` because
243/// unpatched internet explorer treats it as an attribute delimiter.
244/// however, numeric character references decode back to the original
245/// character, so this encoding cannot fully protect against the IE bug
246/// in all injection scenarios. the safest mitigation is to avoid
247/// unquoted attributes entirely.
248///
249/// # encoded characters (partial list)
250///
251/// | input | output |
252/// |--------|-----------|
253/// | tab | `	` |
254/// | LF | ` ` |
255/// | FF | `` |
256/// | CR | ` ` |
257/// | space | ` ` |
258/// | `&` | `&` |
259/// | `<` | `<` |
260/// | `>` | `>` |
261/// | `"` | `"` |
262/// | `'` | `'` |
263/// | `/` | `/` |
264/// | `=` | `=` |
265/// | `` ` ``| ``` |
266///
267/// C0/C1 control characters, DEL, and unicode non-characters are replaced
268/// with `-`. NEL (U+0085) is encoded as `…`. line separator (U+2028)
269/// and paragraph separator (U+2029) are encoded as `
` and `
`.
270///
271/// # examples
272///
273/// ```
274/// use contextual_encoder::for_html_unquoted_attribute;
275///
276/// assert_eq!(for_html_unquoted_attribute("hello world"), "hello world");
277/// assert_eq!(for_html_unquoted_attribute("a=b"), "a=b");
278/// ```
279pub fn for_html_unquoted_attribute(input: &str) -> String {
280 let mut out = String::with_capacity(input.len());
281 write_html_unquoted_attribute(&mut out, input).expect("writing to string cannot fail");
282 out
283}
284
285/// writes the unquoted-HTML-attribute-encoded form of `input` to `out`.
286///
287/// see [`for_html_unquoted_attribute`] for encoding rules.
288pub fn write_html_unquoted_attribute<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
289 encode_loop(
290 out,
291 input,
292 needs_html_unquoted_attribute_encoding,
293 write_html_unquoted_attribute_encoded,
294 )
295}
296
297fn needs_html_unquoted_attribute_encoding(c: char) -> bool {
298 let cp = c as u32;
299
300 // specific ASCII characters that need encoding
301 if matches!(
302 c,
303 '\t' | '\n' | '\x0C' | '\r' | ' ' | '&' | '<' | '>' | '"' | '\'' | '/' | '=' | '`'
304 ) {
305 return true;
306 }
307
308 // C0 controls not matched above
309 if cp <= 0x1F {
310 return true;
311 }
312
313 // DEL
314 if cp == 0x7F {
315 return true;
316 }
317
318 // C1 controls (includes NEL U+0085)
319 if (0x80..=0x9F).contains(&cp) {
320 return true;
321 }
322
323 // line / paragraph separators
324 if cp == 0x2028 || cp == 0x2029 {
325 return true;
326 }
327
328 // unicode non-characters
329 if is_unicode_noncharacter(cp) {
330 return true;
331 }
332
333 false
334}
335
/// writes the replacement for a single character `c` selected for encoding
/// in the unquoted-attribute context.
///
/// `&`, `<` and `>` become named entities. ASCII whitespace, quotes, `/`,
/// `=`, the grave accent, NEL (U+0085) and the line / paragraph separators
/// (U+2028, U+2029) become decimal character references. any other
/// character handed to this function is replaced with `-`. `_next` is
/// unused.
fn write_html_unquoted_attribute_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    match c {
        '\t' => out.write_str("&#9;"),
        '\n' => out.write_str("&#10;"),
        '\x0C' => out.write_str("&#12;"),
        '\r' => out.write_str("&#13;"),
        ' ' => out.write_str("&#32;"),
        '&' => out.write_str("&amp;"),
        '<' => out.write_str("&lt;"),
        '>' => out.write_str("&gt;"),
        '"' => out.write_str("&#34;"),
        '\'' => out.write_str("&#39;"),
        '/' => out.write_str("&#47;"),
        '=' => out.write_str("&#61;"),
        '`' => out.write_str("&#96;"),
        '\u{0085}' => out.write_str("&#133;"),
        '\u{2028}' => out.write_str("&#8232;"),
        '\u{2029}' => out.write_str("&#8233;"),
        // remaining: C0/C1 controls, DEL, non-characters → dash
        _ => out.write_char('-'),
    }
}
362
// unit tests pinning the exact replacement text produced in each HTML
// encoding context.
#[cfg(test)]
mod tests {
    use super::*;

    // -- for_html --

    #[test]
    fn html_no_encoding_needed() {
        assert_eq!(for_html("hello world"), "hello world");
        assert_eq!(for_html(""), "");
        assert_eq!(for_html("abc123"), "abc123");
    }

    #[test]
    fn html_encodes_ampersand() {
        assert_eq!(for_html("a&b"), "a&amp;b");
    }

    #[test]
    fn html_encodes_angle_brackets() {
        assert_eq!(for_html("<div>"), "&lt;div&gt;");
    }

    #[test]
    fn html_encodes_quotes() {
        assert_eq!(for_html(r#"a"b'c"#), "a&#34;b&#39;c");
    }

    #[test]
    fn html_replaces_controls_with_space() {
        assert_eq!(for_html("a\x01b"), "a b");
        assert_eq!(for_html("a\x7Fb"), "a b");
    }

    #[test]
    fn html_preserves_tab_lf_cr() {
        assert_eq!(for_html("a\tb\nc\rd"), "a\tb\nc\rd");
    }

    #[test]
    fn html_writer_variant() {
        let mut out = String::new();
        write_html(&mut out, "<b>").unwrap();
        assert_eq!(out, "&lt;b&gt;");
    }

    // -- for_html_content --

    #[test]
    fn html_content_does_not_encode_quotes() {
        assert_eq!(for_html_content(r#"a"b'c"#), r#"a"b'c"#);
    }

    #[test]
    fn html_content_encodes_angle_brackets_and_amp() {
        assert_eq!(for_html_content("a<b&c>d"), "a&lt;b&amp;c&gt;d");
    }

    // -- for_html_attribute --

    #[test]
    fn html_attribute_does_not_encode_gt() {
        assert_eq!(for_html_attribute("a>b"), "a>b");
    }

    #[test]
    fn html_attribute_encodes_quotes_and_amp_and_lt() {
        assert_eq!(
            for_html_attribute(r#"a"b'c&d<e"#),
            "a&#34;b&#39;c&amp;d&lt;e"
        );
    }

    // -- for_html_unquoted_attribute --

    #[test]
    fn unquoted_attr_encodes_whitespace() {
        assert_eq!(
            for_html_unquoted_attribute("a b\tc\nd"),
            "a&#32;b&#9;c&#10;d"
        );
    }

    #[test]
    fn unquoted_attr_encodes_grave_accent() {
        assert_eq!(for_html_unquoted_attribute("a`b"), "a&#96;b");
    }

    #[test]
    fn unquoted_attr_encodes_equals_and_slash() {
        assert_eq!(for_html_unquoted_attribute("a=b/c"), "a&#61;b&#47;c");
    }

    #[test]
    fn unquoted_attr_replaces_controls_with_dash() {
        assert_eq!(for_html_unquoted_attribute("a\x01b"), "a-b");
        assert_eq!(for_html_unquoted_attribute("a\x7Fb"), "a-b");
    }

    #[test]
    fn unquoted_attr_encodes_nel() {
        assert_eq!(for_html_unquoted_attribute("a\u{0085}b"), "a&#133;b");
    }

    #[test]
    fn unquoted_attr_encodes_line_separators() {
        assert_eq!(
            for_html_unquoted_attribute("a\u{2028}b\u{2029}c"),
            "a&#8232;b&#8233;c"
        );
    }

    #[test]
    fn unquoted_attr_passes_through_safe_chars() {
        let safe = "ABCxyz019!#$%()*+,-.[]\\^_}";
        assert_eq!(for_html_unquoted_attribute(safe), safe);
    }

    #[test]
    fn unquoted_attr_passes_through_non_ascii() {
        assert_eq!(for_html_unquoted_attribute("café"), "café");
        assert_eq!(for_html_unquoted_attribute("日本語"), "日本語");
    }
}