contextual_encoder/xml.rs
1//! XML-specific contextual output encoders.
2//!
3//! provides XML aliases for the HTML encoders, plus XML-only contexts:
4//!
5//! ## XML 1.0 aliases
6//!
7//! - [`for_xml`] — alias for [`crate::for_html`]
8//! - [`for_xml_content`] — alias for [`crate::for_html_content`]
9//! - [`for_xml_attribute`] — alias for [`crate::for_html_attribute`]
10//!
11//! ## XML-only contexts
12//!
13//! - [`for_xml_comment`] — safe for XML comment content
14//! - [`for_cdata`] — safe for CDATA section content
15//!
16//! ## XML 1.1
17//!
18//! - [`for_xml11`] — XML 1.1 content + attributes
19//! - [`for_xml11_content`] — XML 1.1 content only
20//! - [`for_xml11_attribute`] — XML 1.1 attributes only
21//!
22//! # security notes
23//!
24//! - `for_xml_comment` is **not safe for HTML comments**. HTML comments have
25//! vendor-specific extensions (e.g., `<!--[if IE]>`) that make safe encoding
26//! impractical. this encoder is for XML comments only.
27//! - `for_cdata` splits CDATA sections to prevent premature closing. the
28//! caller is responsible for wrapping the output in `<![CDATA[...]]>`.
29
30use std::fmt;
31
32use crate::engine::{encode_loop, is_invalid_for_xml, is_unicode_noncharacter};
33
34// ---------------------------------------------------------------------------
35// XML 1.0 aliases
36// ---------------------------------------------------------------------------
37
38/// encodes `input` for safe embedding in XML text content and quoted attributes.
39///
40/// this is an alias for [`crate::for_html`] — the encoding rules are identical.
41///
42/// # examples
43///
44/// ```
45/// use contextual_encoder::for_xml;
46///
47/// assert_eq!(for_xml("<root attr=\"val\">"), "<root attr="val">");
48/// ```
49pub fn for_xml(input: &str) -> String {
50 crate::html::for_html(input)
51}
52
53/// writes the XML-encoded form of `input` to `out`.
54///
55/// see [`for_xml`] for encoding rules.
56pub fn write_xml<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
57 crate::html::write_html(out, input)
58}
59
60/// encodes `input` for safe embedding in XML text content only.
61///
62/// this is an alias for [`crate::for_html_content`] — the encoding rules are
63/// identical. **not safe for attributes** (does not encode quotes).
64///
65/// # examples
66///
67/// ```
68/// use contextual_encoder::for_xml_content;
69///
70/// assert_eq!(for_xml_content("a < b & c"), "a < b & c");
71/// ```
72pub fn for_xml_content(input: &str) -> String {
73 crate::html::for_html_content(input)
74}
75
76/// writes the XML-content-encoded form of `input` to `out`.
77///
78/// see [`for_xml_content`] for encoding rules.
79pub fn write_xml_content<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
80 crate::html::write_html_content(out, input)
81}
82
83/// encodes `input` for safe embedding in a quoted XML attribute value.
84///
85/// this is an alias for [`crate::for_html_attribute`] — the encoding rules
86/// are identical. **not safe for text content** (does not encode `>`).
87///
88/// # examples
89///
90/// ```
91/// use contextual_encoder::for_xml_attribute;
92///
93/// assert_eq!(for_xml_attribute("a\"b"), "a"b");
94/// ```
95pub fn for_xml_attribute(input: &str) -> String {
96 crate::html::for_html_attribute(input)
97}
98
99/// writes the XML-attribute-encoded form of `input` to `out`.
100///
101/// see [`for_xml_attribute`] for encoding rules.
102pub fn write_xml_attribute<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
103 crate::html::write_html_attribute(out, input)
104}
105
106// ---------------------------------------------------------------------------
107// for_xml_comment — safe for XML comment content
108// ---------------------------------------------------------------------------
109
110/// encodes `input` for safe embedding in an XML comment (`<!-- ... -->`).
111///
112/// the XML specification forbids `--` inside comments and a trailing `-`
113/// (which would form `--->` with the closing delimiter). this encoder
114/// replaces the second hyphen in any `--` sequence with `~`, and replaces
115/// a trailing `-` with `~`.
116///
117/// invalid XML characters are replaced with a space.
118///
119/// # security warning
120///
121/// this encoder is **not safe for HTML comments**. browsers interpret
122/// vendor-specific extensions like `<!--[if IE]>` that cannot be neutralized
123/// by encoding. never embed untrusted data in HTML comments.
124///
125/// # examples
126///
127/// ```
128/// use contextual_encoder::for_xml_comment;
129///
130/// assert_eq!(for_xml_comment("safe text"), "safe text");
131/// assert_eq!(for_xml_comment("a--b"), "a-~b");
132/// assert_eq!(for_xml_comment("trailing-"), "trailing~");
133/// ```
134pub fn for_xml_comment(input: &str) -> String {
135 let mut out = String::with_capacity(input.len());
136 write_xml_comment(&mut out, input).expect("writing to string cannot fail");
137 out
138}
139
140/// writes the XML-comment-encoded form of `input` to `out`.
141///
142/// see [`for_xml_comment`] for encoding rules.
143pub fn write_xml_comment<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
144 let mut last_was_hyphen = false;
145 let mut chars = input.chars().peekable();
146
147 while let Some(c) = chars.next() {
148 if c == '-' {
149 if last_was_hyphen {
150 // second hyphen in -- sequence → replace with ~
151 out.write_char('~')?;
152 last_was_hyphen = false;
153 } else if chars.peek().is_none() {
154 // trailing hyphen → replace with ~
155 out.write_char('~')?;
156 } else {
157 out.write_char('-')?;
158 last_was_hyphen = true;
159 }
160 } else if is_invalid_for_xml(c) {
161 out.write_char(' ')?;
162 last_was_hyphen = false;
163 } else {
164 out.write_char(c)?;
165 last_was_hyphen = false;
166 }
167 }
168
169 Ok(())
170}
171
172// ---------------------------------------------------------------------------
173// for_cdata — safe for CDATA section content
174// ---------------------------------------------------------------------------
175
176/// encodes `input` for safe embedding in an XML CDATA section.
177///
178/// the CDATA closing delimiter `]]>` cannot appear in CDATA content. when
179/// this sequence is found, the encoder splits it by closing the current
180/// CDATA section and immediately opening a new one:
181///
182/// `]]>` → `]]]]><![CDATA[>`
183///
184/// the caller is responsible for wrapping the output in `<![CDATA[...]]>`.
185///
186/// invalid XML characters are replaced with a space.
187///
188/// # examples
189///
190/// ```
191/// use contextual_encoder::for_cdata;
192///
193/// assert_eq!(for_cdata("safe text"), "safe text");
194/// assert_eq!(for_cdata("a]]>b"), "a]]]]><![CDATA[>b");
195/// assert_eq!(for_cdata("]]"), "]]");
196/// ```
197pub fn for_cdata(input: &str) -> String {
198 let mut out = String::with_capacity(input.len());
199 write_cdata(&mut out, input).expect("writing to string cannot fail");
200 out
201}
202
203/// writes the CDATA-encoded form of `input` to `out`.
204///
205/// see [`for_cdata`] for encoding rules.
206pub fn write_cdata<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
207 let mut bracket_count: u32 = 0;
208
209 for c in input.chars() {
210 if c == ']' {
211 bracket_count += 1;
212 } else if c == '>' && bracket_count >= 2 {
213 // found ]]> — flush extra brackets, then split
214 for _ in 0..(bracket_count - 2) {
215 out.write_char(']')?;
216 }
217 out.write_str("]]]]><![CDATA[>")?;
218 bracket_count = 0;
219 } else {
220 // flush buffered brackets
221 for _ in 0..bracket_count {
222 out.write_char(']')?;
223 }
224 bracket_count = 0;
225
226 if is_invalid_for_xml(c) {
227 out.write_char(' ')?;
228 } else {
229 out.write_char(c)?;
230 }
231 }
232 }
233
234 // flush remaining brackets
235 for _ in 0..bracket_count {
236 out.write_char(']')?;
237 }
238
239 Ok(())
240}
241
242// ---------------------------------------------------------------------------
243// XML 1.1 encoders
244// ---------------------------------------------------------------------------
245
246/// encodes `input` for safe embedding in XML 1.1 text content and quoted
247/// attributes.
248///
249/// like [`for_xml`] but encodes restricted characters as `&#xHH;` character
250/// references instead of replacing them with space. NUL (U+0000) and unicode
251/// non-characters are still replaced with space (they are invalid in XML 1.1).
252///
253/// NEL (U+0085) is **not** restricted in XML 1.1 and passes through unchanged.
254///
255/// # examples
256///
257/// ```
258/// use contextual_encoder::for_xml11;
259///
260/// assert_eq!(for_xml11("<b>"), "<b>");
261/// // control chars get character references instead of space
262/// assert_eq!(for_xml11("a\x01b"), "ab");
263/// // NEL passes through in XML 1.1
264/// assert_eq!(for_xml11("a\u{0085}b"), "a\u{0085}b");
265/// ```
266pub fn for_xml11(input: &str) -> String {
267 let mut out = String::with_capacity(input.len());
268 write_xml11(&mut out, input).expect("writing to string cannot fail");
269 out
270}
271
272/// writes the XML-1.1-encoded form of `input` to `out`.
273///
274/// see [`for_xml11`] for encoding rules.
275pub fn write_xml11<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
276 encode_loop(out, input, needs_xml11_encoding, write_xml11_encoded)
277}
278
279/// encodes `input` for safe embedding in XML 1.1 text content only.
280///
281/// like [`for_xml_content`] but encodes restricted characters as `&#xHH;`
282/// character references. does **not** encode quotes — not safe for attributes.
283///
284/// # examples
285///
286/// ```
287/// use contextual_encoder::for_xml11_content;
288///
289/// assert_eq!(for_xml11_content("a\x01b"), "ab");
290/// assert_eq!(for_xml11_content(r#"a"b"#), r#"a"b"#);
291/// ```
292pub fn for_xml11_content(input: &str) -> String {
293 let mut out = String::with_capacity(input.len());
294 write_xml11_content(&mut out, input).expect("writing to string cannot fail");
295 out
296}
297
298/// writes the XML-1.1-content-encoded form of `input` to `out`.
299///
300/// see [`for_xml11_content`] for encoding rules.
301pub fn write_xml11_content<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
302 encode_loop(
303 out,
304 input,
305 needs_xml11_content_encoding,
306 write_xml11_content_encoded,
307 )
308}
309
310/// encodes `input` for safe embedding in a quoted XML 1.1 attribute value.
311///
312/// like [`for_xml_attribute`] but encodes restricted characters as `&#xHH;`
313/// character references. does **not** encode `>`.
314///
315/// # examples
316///
317/// ```
318/// use contextual_encoder::for_xml11_attribute;
319///
320/// assert_eq!(for_xml11_attribute("a\x01b"), "ab");
321/// assert_eq!(for_xml11_attribute("a>b"), "a>b");
322/// ```
323pub fn for_xml11_attribute(input: &str) -> String {
324 let mut out = String::with_capacity(input.len());
325 write_xml11_attribute(&mut out, input).expect("writing to string cannot fail");
326 out
327}
328
329/// writes the XML-1.1-attribute-encoded form of `input` to `out`.
330///
331/// see [`for_xml11_attribute`] for encoding rules.
332pub fn write_xml11_attribute<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
333 encode_loop(
334 out,
335 input,
336 needs_xml11_attribute_encoding,
337 write_xml11_attribute_encoded,
338 )
339}
340
341// ---------------------------------------------------------------------------
342// XML 1.1 shared helpers
343// ---------------------------------------------------------------------------
344
345/// returns true if the character is restricted in XML 1.1.
346///
347/// restricted characters are: U+0001-U+0008, U+000B-U+000C, U+000E-U+001F,
348/// U+007F-U+0084, U+0086-U+009F. note that NUL (U+0000) is not restricted
349/// but is *invalid* (not in the Char production). NEL (U+0085) is NOT
350/// restricted in XML 1.1.
351fn is_xml11_restricted_or_invalid(c: char) -> bool {
352 let cp = c as u32;
353 cp == 0
354 || (0x01..=0x08).contains(&cp)
355 || cp == 0x0B
356 || cp == 0x0C
357 || (0x0E..=0x1F).contains(&cp)
358 || (0x7F..=0x84).contains(&cp)
359 || (0x86..=0x9F).contains(&cp)
360 || is_unicode_noncharacter(cp)
361}
362
363// --- for_xml11 (content + attributes) ---
364
365fn needs_xml11_encoding(c: char) -> bool {
366 matches!(c, '&' | '<' | '>' | '"' | '\'') || is_xml11_restricted_or_invalid(c)
367}
368
369fn write_xml11_encoded<W: fmt::Write>(out: &mut W, c: char, _next: Option<char>) -> fmt::Result {
370 match c {
371 '&' => out.write_str("&"),
372 '<' => out.write_str("<"),
373 '>' => out.write_str(">"),
374 '"' => out.write_str("""),
375 '\'' => out.write_str("'"),
376 '\0' => out.write_char(' '),
377 c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
378 // restricted controls → hex character reference
379 c => write!(out, "&#x{:x};", c as u32),
380 }
381}
382
383// --- for_xml11_content ---
384
385fn needs_xml11_content_encoding(c: char) -> bool {
386 matches!(c, '&' | '<' | '>') || is_xml11_restricted_or_invalid(c)
387}
388
389fn write_xml11_content_encoded<W: fmt::Write>(
390 out: &mut W,
391 c: char,
392 _next: Option<char>,
393) -> fmt::Result {
394 match c {
395 '&' => out.write_str("&"),
396 '<' => out.write_str("<"),
397 '>' => out.write_str(">"),
398 '\0' => out.write_char(' '),
399 c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
400 c => write!(out, "&#x{:x};", c as u32),
401 }
402}
403
404// --- for_xml11_attribute ---
405
406fn needs_xml11_attribute_encoding(c: char) -> bool {
407 matches!(c, '&' | '<' | '"' | '\'') || is_xml11_restricted_or_invalid(c)
408}
409
410fn write_xml11_attribute_encoded<W: fmt::Write>(
411 out: &mut W,
412 c: char,
413 _next: Option<char>,
414) -> fmt::Result {
415 match c {
416 '&' => out.write_str("&"),
417 '<' => out.write_str("<"),
418 '"' => out.write_str("""),
419 '\'' => out.write_str("'"),
420 '\0' => out.write_char(' '),
421 c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
422 c => write!(out, "&#x{:x};", c as u32),
423 }
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    // -- XML 1.0 aliases --

    #[test]
    fn xml_aliases_match_html() {
        let input = r#"<b attr="val">&</b>"#;
        assert_eq!(for_xml(input), crate::html::for_html(input));
        assert_eq!(for_xml_content(input), crate::html::for_html_content(input));
        assert_eq!(
            for_xml_attribute(input),
            crate::html::for_html_attribute(input)
        );
    }

    // -- XML comment --

    #[test]
    fn comment_passthrough() {
        assert_eq!(for_xml_comment("safe text"), "safe text");
        assert_eq!(for_xml_comment(""), "");
    }

    #[test]
    fn comment_double_hyphen() {
        assert_eq!(for_xml_comment("a--b"), "a-~b");
        assert_eq!(for_xml_comment("--"), "-~");
        assert_eq!(for_xml_comment("---"), "-~~");
        assert_eq!(for_xml_comment("----"), "-~-~");
        assert_eq!(for_xml_comment("a--b--c"), "a-~b-~c");
    }

    #[test]
    fn comment_trailing_hyphen() {
        assert_eq!(for_xml_comment("trailing-"), "trailing~");
        assert_eq!(for_xml_comment("-"), "~");
    }

    #[test]
    fn comment_replaces_invalid_xml() {
        assert_eq!(for_xml_comment("a\x01b"), "a b");
        assert_eq!(for_xml_comment("a\x7Fb"), "a b");
    }

    #[test]
    fn comment_preserves_non_ascii() {
        assert_eq!(for_xml_comment("café"), "café");
    }

    #[test]
    fn comment_writer_variant() {
        let mut out = String::new();
        write_xml_comment(&mut out, "a--b").unwrap();
        assert_eq!(out, "a-~b");
    }

    // -- CDATA --

    #[test]
    fn cdata_passthrough() {
        assert_eq!(for_cdata("safe text"), "safe text");
        assert_eq!(for_cdata(""), "");
    }

    #[test]
    fn cdata_splits_closing_delimiter() {
        assert_eq!(for_cdata("a]]>b"), "a]]]]><![CDATA[>b");
    }

    #[test]
    fn cdata_double_split() {
        assert_eq!(for_cdata("a]]>b]]>c"), "a]]]]><![CDATA[>b]]]]><![CDATA[>c");
    }

    #[test]
    fn cdata_brackets_without_gt() {
        assert_eq!(for_cdata("]]"), "]]");
        assert_eq!(for_cdata("]"), "]");
        assert_eq!(for_cdata("]]a"), "]]a");
    }

    #[test]
    fn cdata_extra_brackets() {
        // ]]]> → ] + ]]> split
        assert_eq!(for_cdata("]]]>"), "]]]]]><![CDATA[>");
    }

    #[test]
    fn cdata_replaces_invalid_xml() {
        assert_eq!(for_cdata("a\x01b"), "a b");
    }

    #[test]
    fn cdata_single_bracket_gt() {
        // ]> is not ]]>, should pass through
        assert_eq!(for_cdata("]>"), "]>");
    }

    #[test]
    fn cdata_writer_variant() {
        let mut out = String::new();
        write_cdata(&mut out, "a]]>b").unwrap();
        assert_eq!(out, "a]]]]><![CDATA[>b");
    }

    // -- XML 1.1 --

    #[test]
    fn xml11_encodes_entities() {
        assert_eq!(for_xml11("<&>\"'"), "&lt;&amp;&gt;&#34;&#39;");
    }

    #[test]
    fn xml11_controls_as_references() {
        // C0 controls get &#xHH; instead of space
        assert_eq!(for_xml11("a\x01b"), "a&#x1;b");
        assert_eq!(for_xml11("a\x08b"), "a&#x8;b");
        assert_eq!(for_xml11("a\x0Bb"), "a&#xb;b");
        assert_eq!(for_xml11("a\x1Fb"), "a&#x1f;b");
    }

    #[test]
    fn xml11_nel_passes_through() {
        // NEL (U+0085) is NOT restricted in XML 1.1
        assert_eq!(for_xml11("a\u{0085}b"), "a\u{0085}b");
    }

    #[test]
    fn xml11_del_and_c1_as_references() {
        assert_eq!(for_xml11("a\x7Fb"), "a&#x7f;b");
        assert_eq!(for_xml11("a\u{0080}b"), "a&#x80;b");
        assert_eq!(for_xml11("a\u{009F}b"), "a&#x9f;b");
    }

    #[test]
    fn xml11_nul_replaced_with_space() {
        assert_eq!(for_xml11("a\x00b"), "a b");
    }

    #[test]
    fn xml11_nonchars_replaced_with_space() {
        assert_eq!(for_xml11("a\u{FDD0}b"), "a b");
    }

    #[test]
    fn xml11_preserves_tab_lf_cr() {
        assert_eq!(for_xml11("a\tb\nc\rd"), "a\tb\nc\rd");
    }

    #[test]
    fn xml11_content_no_quotes() {
        // quotes pass through untouched in text content
        assert_eq!(for_xml11_content(r#"a"b'c"#), r#"a"b'c"#);
        assert_eq!(for_xml11_content("a\x01b"), "a&#x1;b");
    }

    #[test]
    fn xml11_attribute_no_gt() {
        // `>` passes through untouched in attribute values
        assert_eq!(for_xml11_attribute("a>b"), "a>b");
        assert_eq!(for_xml11_attribute("a\x01b"), "a&#x1;b");
    }
}