Skip to main content

contextual_encoder/
html.rs

1//! HTML / XML contextual output encoders.
2//!
3//! provides four encoding contexts with different safety guarantees:
4//!
5//! - [`for_html`] — safe for both text content and quoted attributes (most conservative)
6//! - [`for_html_content`] — safe for text content only (does not encode quotes)
7//! - [`for_html_attribute`] — safe for quoted attributes only (does not encode `>`)
8//! - [`for_html_unquoted_attribute`] — safe for unquoted attribute values (most aggressive)
9//!
10//! all encoders replace invalid XML characters (C0/C1 controls, DEL, unicode
11//! non-characters) with a replacement character (space or dash depending on
12//! context).
13//!
14//! # security notes
15//!
16//! - these encoders produce output safe for embedding in the specified context.
17//!   they do not sanitize HTML — encoding is not a substitute for input validation.
18//! - never use `for_html_content` output in an attribute context.
19//! - never use `for_html_attribute` output in a text content context where `>` matters.
20//! - `for_html` is the safe default when the exact context is unknown.
21//! - tag names, attribute names, and event handler names must be validated
22//!   separately — encoding cannot make arbitrary names safe.
23
24use std::fmt;
25
26use crate::engine::{encode_loop, is_invalid_for_xml, is_unicode_noncharacter};
27
28// ---------------------------------------------------------------------------
29// for_html — safe for text content AND quoted attributes
30// ---------------------------------------------------------------------------
31
32/// encodes `input` for safe embedding in HTML text content and quoted attributes.
33///
34/// this is the most conservative HTML encoder — it encodes characters needed
35/// for both text content and attribute contexts. use [`for_html_content`] or
36/// [`for_html_attribute`] for more minimal encoding when the exact context is
37/// known.
38///
39/// # encoded characters
40///
41/// | input | output |
42/// |-------|--------|
43/// | `&`   | `&`  |
44/// | `<`   | `&lt;`   |
45/// | `>`   | `&gt;`   |
46/// | `"`   | `&#34;`  |
47/// | `'`   | `&#39;`  |
48///
49/// invalid XML characters are replaced with a space.
50///
51/// # examples
52///
53/// ```
54/// use contextual_encoder::for_html;
55///
56/// assert_eq!(for_html("<script>alert('xss')</script>"),
57///            "&lt;script&gt;alert(&#39;xss&#39;)&lt;/script&gt;");
58/// assert_eq!(for_html("safe text"), "safe text");
59/// ```
60pub fn for_html(input: &str) -> String {
61    let mut out = String::with_capacity(input.len());
62    write_html(&mut out, input).expect("writing to string cannot fail");
63    out
64}
65
66/// writes the HTML-encoded form of `input` to `out`.
67///
68/// see [`for_html`] for encoding rules.
69pub fn write_html<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
70    encode_loop(out, input, needs_html_encoding, write_html_encoded)
71}
72
73fn needs_html_encoding(c: char) -> bool {
74    matches!(c, '&' | '<' | '>' | '"' | '\'') || is_invalid_for_xml(c)
75}
76
/// emits the replacement for a character flagged by `needs_html_encoding`.
///
/// the five markup-significant characters become entity references; any
/// other character reaching this function is written as a single space.
/// `_next` is part of the callback signature and is not used here.
fn write_html_encoded<W: fmt::Write>(out: &mut W, c: char, _next: Option<char>) -> fmt::Result {
    let entity = match c {
        '&' => "&amp;",
        '<' => "&lt;",
        '>' => "&gt;",
        '"' => "&#34;",
        '\'' => "&#39;",
        // anything else flagged by the predicate is an invalid XML char → space
        _ => return out.write_char(' '),
    };
    out.write_str(entity)
}
88
89// ---------------------------------------------------------------------------
90// for_html_content — safe for text content only (NOT attributes)
91// ---------------------------------------------------------------------------
92
93/// encodes `input` for safe embedding in HTML text content.
94///
95/// this encoder does **not** encode quote characters and is therefore
96/// **not safe for attribute values**. use [`for_html`] or
97/// [`for_html_attribute`] for attribute contexts.
98///
99/// # encoded characters
100///
101/// | input | output |
102/// |-------|--------|
103/// | `&`   | `&amp;` |
104/// | `<`   | `&lt;`  |
105/// | `>`   | `&gt;`  |
106///
107/// invalid XML characters are replaced with a space.
108///
109/// # examples
110///
111/// ```
112/// use contextual_encoder::for_html_content;
113///
114/// assert_eq!(for_html_content("1 < 2 & 3 > 0"), "1 &lt; 2 &amp; 3 &gt; 0");
115/// // quotes are NOT encoded — do not use in attributes
116/// assert_eq!(for_html_content(r#"she said "hi""#), r#"she said "hi""#);
117/// ```
118pub fn for_html_content(input: &str) -> String {
119    let mut out = String::with_capacity(input.len());
120    write_html_content(&mut out, input).expect("writing to string cannot fail");
121    out
122}
123
124/// writes the HTML-content-encoded form of `input` to `out`.
125///
126/// see [`for_html_content`] for encoding rules.
127pub fn write_html_content<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
128    encode_loop(
129        out,
130        input,
131        needs_html_content_encoding,
132        write_html_content_encoded,
133    )
134}
135
136fn needs_html_content_encoding(c: char) -> bool {
137    matches!(c, '&' | '<' | '>') || is_invalid_for_xml(c)
138}
139
/// emits the replacement for a character flagged by
/// `needs_html_content_encoding`.
///
/// `&`, `<` and `>` become entity references; any other character reaching
/// this function is written as a single space. `_next` is part of the
/// callback signature and is not used here.
fn write_html_content_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    let entity = match c {
        '&' => "&amp;",
        '<' => "&lt;",
        '>' => "&gt;",
        // anything else flagged by the predicate is an invalid XML char → space
        _ => return out.write_char(' '),
    };
    out.write_str(entity)
}
152
153// ---------------------------------------------------------------------------
154// for_html_attribute — safe for quoted attributes only
155// ---------------------------------------------------------------------------
156
157/// encodes `input` for safe embedding in a quoted HTML attribute value.
158///
159/// this encoder does **not** encode `>` (harmless inside quoted attributes)
160/// and is slightly more minimal than [`for_html`]. it encodes both `"` and
161/// `'` so the output is safe regardless of which quote delimiter is used.
162///
163/// **not safe for unquoted attributes** — use [`for_html_unquoted_attribute`]
164/// for that context.
165///
166/// # encoded characters
167///
168/// | input | output |
169/// |-------|--------|
170/// | `&`   | `&amp;` |
171/// | `<`   | `&lt;`  |
172/// | `"`   | `&#34;` |
173/// | `'`   | `&#39;` |
174///
175/// invalid XML characters are replaced with a space.
176///
177/// # examples
178///
179/// ```
180/// use contextual_encoder::for_html_attribute;
181///
182/// // safe for both quote styles
183/// assert_eq!(
184///     for_html_attribute(r#"it's a "test""#),
185///     "it&#39;s a &#34;test&#34;"
186/// );
187/// // > is not encoded
188/// assert_eq!(for_html_attribute("a > b"), "a > b");
189/// ```
190pub fn for_html_attribute(input: &str) -> String {
191    let mut out = String::with_capacity(input.len());
192    write_html_attribute(&mut out, input).expect("writing to string cannot fail");
193    out
194}
195
196/// writes the HTML-attribute-encoded form of `input` to `out`.
197///
198/// see [`for_html_attribute`] for encoding rules.
199pub fn write_html_attribute<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
200    encode_loop(
201        out,
202        input,
203        needs_html_attribute_encoding,
204        write_html_attribute_encoded,
205    )
206}
207
208fn needs_html_attribute_encoding(c: char) -> bool {
209    matches!(c, '&' | '<' | '"' | '\'') || is_invalid_for_xml(c)
210}
211
/// emits the replacement for a character flagged by
/// `needs_html_attribute_encoding`.
///
/// `&`, `<`, `"` and `'` become entity references; any other character
/// reaching this function is written as a single space. `_next` is part of
/// the callback signature and is not used here.
fn write_html_attribute_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    let entity = match c {
        '&' => "&amp;",
        '<' => "&lt;",
        '"' => "&#34;",
        '\'' => "&#39;",
        // anything else flagged by the predicate is an invalid XML char → space
        _ => return out.write_char(' '),
    };
    out.write_str(entity)
}
225
226// ---------------------------------------------------------------------------
227// for_html_unquoted_attribute — safe for unquoted attribute values
228// ---------------------------------------------------------------------------
229
230/// encodes `input` for safe embedding in an unquoted HTML attribute value.
231///
232/// this is the most aggressive HTML encoder, encoding whitespace, quote
233/// characters, grave accents, and many punctuation characters that could
234/// terminate an unquoted attribute value.
235///
236/// **prefer quoted attributes** whenever possible. unquoted attributes are
237/// fragile and this encoder exists only for cases where quoting is not an
238/// option.
239///
240/// # caveat: grave accent
241///
242/// the grave accent (`` ` ``, U+0060) is encoded as `&#96;` because
243/// unpatched internet explorer treats it as an attribute delimiter.
244/// however, numeric character references decode back to the original
245/// character, so this encoding cannot fully protect against the IE bug
246/// in all injection scenarios. the safest mitigation is to avoid
247/// unquoted attributes entirely.
248///
249/// # encoded characters (partial list)
250///
251/// | input  | output    |
252/// |--------|-----------|
253/// | tab    | `&#9;`    |
254/// | LF     | `&#10;`   |
255/// | FF     | `&#12;`   |
256/// | CR     | `&#13;`   |
257/// | space  | `&#32;`   |
258/// | `&`    | `&amp;`   |
259/// | `<`    | `&lt;`    |
260/// | `>`    | `&gt;`    |
261/// | `"`    | `&#34;`   |
262/// | `'`    | `&#39;`   |
263/// | `/`    | `&#47;`   |
264/// | `=`    | `&#61;`   |
265/// | `` ` ``| `&#96;`   |
266///
267/// C0/C1 control characters, DEL, and unicode non-characters are replaced
268/// with `-`. NEL (U+0085) is encoded as `&#133;`. line separator (U+2028)
269/// and paragraph separator (U+2029) are encoded as `&#8232;` and `&#8233;`.
270///
271/// # examples
272///
273/// ```
274/// use contextual_encoder::for_html_unquoted_attribute;
275///
276/// assert_eq!(for_html_unquoted_attribute("hello world"), "hello&#32;world");
277/// assert_eq!(for_html_unquoted_attribute("a=b"), "a&#61;b");
278/// ```
279pub fn for_html_unquoted_attribute(input: &str) -> String {
280    let mut out = String::with_capacity(input.len());
281    write_html_unquoted_attribute(&mut out, input).expect("writing to string cannot fail");
282    out
283}
284
285/// writes the unquoted-HTML-attribute-encoded form of `input` to `out`.
286///
287/// see [`for_html_unquoted_attribute`] for encoding rules.
288pub fn write_html_unquoted_attribute<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
289    encode_loop(
290        out,
291        input,
292        needs_html_unquoted_attribute_encoding,
293        write_html_unquoted_attribute_encoded,
294    )
295}
296
297fn needs_html_unquoted_attribute_encoding(c: char) -> bool {
298    let cp = c as u32;
299
300    // specific ASCII characters that need encoding
301    if matches!(
302        c,
303        '\t' | '\n' | '\x0C' | '\r' | ' ' | '&' | '<' | '>' | '"' | '\'' | '/' | '=' | '`'
304    ) {
305        return true;
306    }
307
308    // C0 controls not matched above
309    if cp <= 0x1F {
310        return true;
311    }
312
313    // DEL
314    if cp == 0x7F {
315        return true;
316    }
317
318    // C1 controls (includes NEL U+0085)
319    if (0x80..=0x9F).contains(&cp) {
320        return true;
321    }
322
323    // line / paragraph separators
324    if cp == 0x2028 || cp == 0x2029 {
325        return true;
326    }
327
328    // unicode non-characters
329    if is_unicode_noncharacter(cp) {
330        return true;
331    }
332
333    false
334}
335
/// emits the replacement for a character flagged by
/// `needs_html_unquoted_attribute_encoding`.
///
/// characters with a dedicated mapping become a named or numeric character
/// reference; any other character reaching this function is written as
/// `-`. `_next` is part of the callback signature and is not used here.
fn write_html_unquoted_attribute_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    let reference = match c {
        '\t' => "&#9;",
        '\n' => "&#10;",
        '\x0C' => "&#12;",
        '\r' => "&#13;",
        ' ' => "&#32;",
        '&' => "&amp;",
        '<' => "&lt;",
        '>' => "&gt;",
        '"' => "&#34;",
        '\'' => "&#39;",
        '/' => "&#47;",
        '=' => "&#61;",
        '`' => "&#96;",
        '\u{0085}' => "&#133;",
        '\u{2028}' => "&#8232;",
        '\u{2029}' => "&#8233;",
        // remaining: C0/C1 controls, DEL, non-characters → dash
        _ => return out.write_char('-'),
    };
    out.write_str(reference)
}
362
#[cfg(test)]
mod tests {
    use super::*;

    /// asserts that `encode` maps every `(input, expected)` pair as listed.
    fn check(encode: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(encode(input), expected);
        }
    }

    // -- for_html --

    #[test]
    fn html_no_encoding_needed() {
        check(
            for_html,
            &[("hello world", "hello world"), ("", ""), ("abc123", "abc123")],
        );
    }

    #[test]
    fn html_encodes_ampersand() {
        check(for_html, &[("a&b", "a&amp;b")]);
    }

    #[test]
    fn html_encodes_angle_brackets() {
        check(for_html, &[("<div>", "&lt;div&gt;")]);
    }

    #[test]
    fn html_encodes_quotes() {
        check(for_html, &[("a\"b'c", "a&#34;b&#39;c")]);
    }

    #[test]
    fn html_replaces_controls_with_space() {
        check(for_html, &[("a\x01b", "a b"), ("a\x7Fb", "a b")]);
    }

    #[test]
    fn html_preserves_tab_lf_cr() {
        // tab, LF and CR must come through unchanged
        let text = "a\tb\nc\rd";
        check(for_html, &[(text, text)]);
    }

    #[test]
    fn html_writer_variant() {
        let mut sink = String::new();
        write_html(&mut sink, "<b>").unwrap();
        assert_eq!(sink, "&lt;b&gt;");
    }

    // -- for_html_content --

    #[test]
    fn html_content_does_not_encode_quotes() {
        let quoted = "a\"b'c";
        check(for_html_content, &[(quoted, quoted)]);
    }

    #[test]
    fn html_content_encodes_angle_brackets_and_amp() {
        check(for_html_content, &[("a<b&c>d", "a&lt;b&amp;c&gt;d")]);
    }

    // -- for_html_attribute --

    #[test]
    fn html_attribute_does_not_encode_gt() {
        check(for_html_attribute, &[("a>b", "a>b")]);
    }

    #[test]
    fn html_attribute_encodes_quotes_and_amp_and_lt() {
        check(
            for_html_attribute,
            &[("a\"b'c&d<e", "a&#34;b&#39;c&amp;d&lt;e")],
        );
    }

    // -- for_html_unquoted_attribute --

    #[test]
    fn unquoted_attr_encodes_whitespace() {
        check(
            for_html_unquoted_attribute,
            &[("a b\tc\nd", "a&#32;b&#9;c&#10;d")],
        );
    }

    #[test]
    fn unquoted_attr_encodes_grave_accent() {
        check(for_html_unquoted_attribute, &[("a`b", "a&#96;b")]);
    }

    #[test]
    fn unquoted_attr_encodes_equals_and_slash() {
        check(for_html_unquoted_attribute, &[("a=b/c", "a&#61;b&#47;c")]);
    }

    #[test]
    fn unquoted_attr_replaces_controls_with_dash() {
        check(
            for_html_unquoted_attribute,
            &[("a\x01b", "a-b"), ("a\x7Fb", "a-b")],
        );
    }

    #[test]
    fn unquoted_attr_encodes_nel() {
        check(for_html_unquoted_attribute, &[("a\u{0085}b", "a&#133;b")]);
    }

    #[test]
    fn unquoted_attr_encodes_line_separators() {
        check(
            for_html_unquoted_attribute,
            &[("a\u{2028}b\u{2029}c", "a&#8232;b&#8233;c")],
        );
    }

    #[test]
    fn unquoted_attr_passes_through_safe_chars() {
        let safe = "ABCxyz019!#$%()*+,-.[]\\^_}";
        check(for_html_unquoted_attribute, &[(safe, safe)]);
    }

    #[test]
    fn unquoted_attr_passes_through_non_ascii() {
        check(
            for_html_unquoted_attribute,
            &[("café", "café"), ("日本語", "日本語")],
        );
    }
}