speedy_xml/
escape.rs

1//! Functions for escaping and unescaping text in a RapidXML-compliant way.
2
3use std::borrow::Cow;
4
5use memchr::memchr2;
6
/// Tries to resolve the entity or character reference at the start of `text`.
///
/// `text` is expected to begin right *after* the introducing `&`. The following
/// forms are recognised:
///
/// - the named entities `lt;`, `gt;`, `amp;`, `apos;` and `quot;`,
/// - decimal character references `#<digits>;`,
/// - hexadecimal character references `#x<digits>;` (lowercase `x` only).
///
/// On success returns the character the reference stands for together with the
/// remainder of `text` following the terminating `;`.
///
/// Returns `None` if the text does not start with a complete, recognised
/// reference, if a numeric reference contains a non-digit, if its value does
/// not fit in a `u32`, or if the value is not a valid Unicode scalar value.
///
/// Note that a numeric reference without any digits (`#;` or `#x;`) resolves
/// to `'\0'`.
fn resolve_entity(text: &str) -> Option<(char, &str)> {
    let mut peek = text.chars();

    let result = match peek.next()? {
        'l' if peek.next()? == 't' => '<',
        'g' if peek.next()? == 't' => '>',
        'a' => match peek.next()? {
            'p' if peek.next()? == 'o' && peek.next()? == 's' => '\'',
            'm' if peek.next()? == 'p' => '&',
            _ => return None,
        },
        'q' if peek.next()? == 'u' && peek.next()? == 'o' && peek.next()? == 't' => '"',
        '#' => {
            let mut code: u32 = 0;
            let mut next = peek.next()?;
            let radix: u32 = if next == 'x' {
                next = peek.next()?;
                16
            } else {
                10
            };

            while next != ';' {
                let digit = next.to_digit(radix)?;
                // Checked arithmetic: an over-long reference must be rejected
                // instead of panicking (debug) or wrapping around to an
                // unrelated code point (release).
                code = code.checked_mul(radix)?.checked_add(digit)?;
                next = peek.next()?;
            }

            let result = char::from_u32(code)?;
            // NOTE: We've already consumed a ';' so we return early here.
            return Some((result, peek.as_str()));
        }
        _ => return None,
    };

    // Named entities still need their terminating ';'.
    if peek.next()? != ';' {
        None
    } else {
        Some((result, peek.as_str()))
    }
}
48
49/// Unescapes an XML escaped string. Keeps all unresolved entities unexpanded.
50///
51/// # Notes
52///
53/// Unlike RapidXML this will not insert invalid codepoints into the string and will keep
54/// character references that would expand to them unexpanded.
55pub fn unescape(string: &'_ str) -> Cow<'_, str> {
56    let mut replaced = String::new();
57
58    let mut current = string;
59    while let Some(next) = memchr2(b'&', b'\0', current.as_bytes()) {
60        match current.as_bytes()[next] {
61            b'&' => {
62                if let Some((chr, rest)) = resolve_entity(&current[next + 1..]) {
63                    replaced.push_str(&current[..next]);
64
65                    if chr == '\0' {
66                        return Cow::Owned(replaced);
67                    }
68
69                    replaced.push(chr);
70                    current = rest;
71                } else {
72                    current = &current[1..];
73                }
74            }
75            _ => {
76                return if replaced.is_empty() {
77                    Cow::Borrowed(string)
78                } else {
79                    replaced.push_str(&current[..next]);
80                    Cow::Owned(replaced)
81                };
82            }
83        }
84    }
85
86    if replaced.is_empty() {
87        Cow::Borrowed(string)
88    } else {
89        replaced.push_str(current);
90        Cow::Owned(replaced)
91    }
92}
93
/// Shared implementation of the public escaping functions.
///
/// `next` is called with the not yet processed tail of `string` and must
/// return the byte offset of the next character to escape within that tail,
/// or `None` once nothing is left to escape. The byte at the returned offset
/// has to be one of `<`, `>`, `&` or `"`; it is replaced by the matching
/// entity (`&lt;`, `&gt;`, `&amp;`, `&quot;`).
///
/// Returns [`Cow::Borrowed`] if nothing was escaped.
///
/// # Panics
///
/// Panics if `next` returns an offset that is out of bounds or that does not
/// point at one of the four characters listed above.
fn escape(string: &str, next: impl Fn(&str) -> Option<usize>) -> Cow<'_, str> {
    let mut output = String::new();
    let mut remaining = string;

    while let Some(index) = next(remaining) {
        let (literal, tail) = remaining.split_at(index);
        let entity = match tail.as_bytes()[0] {
            b'<' => "&lt;",
            b'>' => "&gt;",
            b'&' => "&amp;",
            b'"' => "&quot;",
            _ => unreachable!(),
        };

        output.push_str(literal);
        output.push_str(entity);
        // The escaped character is ASCII, so skipping one byte is enough.
        remaining = &tail[1..];
    }

    // Every replacement is non-empty, so an empty buffer means no escapes.
    if output.is_empty() {
        return Cow::Borrowed(string);
    }

    output.push_str(remaining);
    Cow::Owned(output)
}
117
118/// Escapes the string so that it is a valid `"`-quoted attribute value.
119pub fn attribute_value_escape(string: &'_ str) -> Cow<'_, str> {
120    escape(string, |text| {
121        memchr::memchr3(b'<', b'&', b'"', text.as_bytes())
122    })
123}
124
125/// Escapes the string so that it is valid as a text node.
126pub fn content_escape(string: &'_ str) -> Cow<'_, str> {
127    escape(string, |text| memchr::memchr2(b'<', b'&', text.as_bytes()))
128}
129
130/// Escapes the string so that it is valid inside a comment.
131pub fn comment_escape(string: &'_ str) -> Cow<'_, str> {
132    escape(string, |text| memchr::memchr(b'>', text.as_bytes()))
133}
134
#[cfg(test)]
mod test {
    use super::{content_escape, unescape};

    /// Unescapes each sample and then re-escapes the result as text content,
    /// checking both intermediate strings.
    #[test]
    fn simple_unescape_escape() {
        // Each case is (input, expected `unescape` output, expected
        // `content_escape` output of the unescaped text).
        let cases: [(&str, &str, &str); 3] = [
            (
                "&quot; hello &amp; world &apos;",
                "\" hello & world '",
                "\" hello &amp; world '",
            ),
            (
                "&#11088; &lt;hello world&gt; &#x2B50;",
                "⭐ <hello world> ⭐",
                "⭐ &lt;hello world> ⭐",
            ),
            // Unknown and unterminated entities stay untouched when unescaping.
            ("&haha; &apo", "&haha; &apo", "&amp;haha; &amp;apo"),
        ];

        for &(input, want_unescaped, want_escaped) in cases.iter() {
            let unescaped = unescape(input);
            assert_eq!(unescaped, want_unescaped);

            let reescaped = content_escape(&unescaped);
            assert_eq!(reescaped, want_escaped);
        }
    }
}