hml_rs/escape.rs
1//a Documentation
2#![warn(missing_docs)]
3// #![warn(missing_doc_code_examples)]
4/*!
5
6# Escape handling
7
8This module is not ready for use
9
10This module provides escape handling for XML and entity replacement
11
12!*/
13
14//a Imports
15use std::collections::HashMap;
16
/// Result type for unescaping / entity replacement of a string; the
/// error type is [`std::io::Error`]
pub type Result<T> = std::result::Result<T, std::io::Error>;

// Bit masks selecting which optional characters `escape` must escape
//cp ESCAPE_QUOTE
/// Bitmask bit selecting the double quote character `"` for escaping
pub const ESCAPE_QUOTE: usize = 0x01;
//cp ESCAPE_APOS
/// Bitmask bit selecting the apostrophe character `'` for escaping
pub const ESCAPE_APOS: usize = 0x02;
//cp ESCAPE_GT
/// Bitmask bit selecting the greater-than character `>` for escaping
pub const ESCAPE_GT: usize = 0x04;
//cp ESCAPE_LF
/// Bitmask bit selecting the line feed character (`\n`) for escaping
pub const ESCAPE_LF: usize = 0x08;
//cp ESCAPE_CR
/// Bitmask bit selecting the carriage return character (`\r`) for escaping
pub const ESCAPE_CR: usize = 0x10;

//cp ESCAPE_ATTR
/// Bitmask for attribute values: every optional character is selected
pub const ESCAPE_ATTR: usize = ESCAPE_QUOTE | ESCAPE_APOS | ESCAPE_GT | ESCAPE_LF | ESCAPE_CR;

//cp ESCAPE_PCDATA
/// Bitmask for PCDATA: none of the optional characters is selected
pub const ESCAPE_PCDATA: usize = 0;
44
/// Return true if any bit of `esc` is set in the escape bitmask `char_set`
#[inline(always)]
fn do_esc(char_set: usize, esc: usize) -> bool {
    let selected = char_set & esc;
    selected != 0
}
49
50//fp escape_required
51/// Return a Some(string) where string is an unescaped version of the input
52pub fn escape_required(bytes: &[u8], char_set: usize, i: usize, n: usize) -> Option<String> {
53    let mut r = Vec::with_capacity(n);
54    if i > 0 {
55        r.extend_from_slice(&bytes[0..i]);
56    }
57    // for i in i..n {
58    for b in bytes.iter().take(n).skip(i) {
59        if b & 0x80 != 0 {
60            r.push(*b);
61        } else {
62            match b {
63                b'&' => {
64                    r.extend_from_slice(b"&");
65                }
66                b'<' => {
67                    r.extend_from_slice(b"<");
68                }
69                b'\'' if do_esc(char_set, ESCAPE_APOS) => {
70                    r.extend_from_slice(b"'");
71                }
72                b'\"' if do_esc(char_set, ESCAPE_QUOTE) => {
73                    r.extend_from_slice(b""");
74                }
75                b'>' if do_esc(char_set, ESCAPE_GT) => {
76                    r.extend_from_slice(b">");
77                }
78                b'\n' if do_esc(char_set, ESCAPE_LF) => {
79                    r.extend_from_slice(b"
");
80                }
81                b'\r' if do_esc(char_set, ESCAPE_CR) => {
82                    r.extend_from_slice(b"
");
83                }
84                _ => {
85                    r.push(*b);
86                }
87            }
88        }
89    }
90    let string = unsafe { String::from_utf8_unchecked(r) };
91    Some(string)
92}
93
94//fp escape
95/// Return Some(string) if escaping is needed (given char_set), else None
96pub fn escape(s: &str, char_set: usize) -> Option<String> {
97    // Note that n == s.len is the length in bytes, not in utf8 characters
98    let n = s.len();
99    let bytes = s.as_bytes();
100    for i in 0..n {
101        match bytes[i] {
102            b'&' => {
103                return escape_required(bytes, char_set, i, n);
104            }
105            b'<' => {
106                return escape_required(bytes, char_set, i, n);
107            }
108            b'\'' if do_esc(char_set, ESCAPE_APOS) => {
109                return escape_required(bytes, char_set, i, n);
110            }
111            b'\"' if do_esc(char_set, ESCAPE_QUOTE) => {
112                return escape_required(bytes, char_set, i, n);
113            }
114            b'>' if do_esc(char_set, ESCAPE_GT) => {
115                return escape_required(bytes, char_set, i, n);
116            }
117            b'\n' if do_esc(char_set, ESCAPE_LF) => {
118                return escape_required(bytes, char_set, i, n);
119            }
120            b'\r' if do_esc(char_set, ESCAPE_CR) => {
121                return escape_required(bytes, char_set, i, n);
122            }
123            _ => (),
124        }
125    }
126    None
127}
128
//tp Entities
/// A set of named entities and the text each one is replaced with
///
/// The default value has no named entities; numeric character references
/// (`&#..;` and `&#x..;`) are still replaced by
/// [`replace_entities`](Entities::replace_entities).
#[derive(Clone, Debug, Default)]
pub struct Entities<'a> {
    /// Map from entity name (the bytes between `&` and `;`) to replacement text
    map: HashMap<&'a [u8], &'a str>,
}

//ip Entities
impl<'a> Entities<'a> {
    //fp xml
    /// Create a new Entities set for XML entity parsing
    ///
    /// This maps `amp`, `lt`, `gt`, `apos` and `quot` (each in all-lowercase
    /// and all-uppercase form) to the character they stand for.
    pub fn xml() -> Self {
        let mut map: HashMap<&[u8], &str> = HashMap::new();
        let predefined = [
            ("amp", "&"),
            ("AMP", "&"),
            ("lt", "<"),
            ("LT", "<"),
            ("gt", ">"),
            ("GT", ">"),
            ("apos", "'"),
            ("APOS", "'"),
            ("quot", "\""),
            ("QUOT", "\""),
        ];
        for (name, replacement) in predefined {
            map.insert(name.as_bytes(), replacement);
        }
        Self { map }
    }

    //fp parse_char_ref
    /// Parse the body of a character reference - the text between `&`
    /// and `;` - returning the character it denotes
    ///
    /// `#` followed by decimal digits, or `#x` (lowercase `x` only)
    /// followed by hexadecimal digits, is accepted. `None` is returned if
    /// there are no digits, if any other character is present, or if the
    /// value is not a valid Unicode scalar value (above 0x10ffff, or a
    /// surrogate).
    fn parse_char_ref(name: &[u8]) -> Option<char> {
        let (radix, digits) = match name {
            [b'#', b'x', digits @ ..] => (16, digits),
            [b'#', digits @ ..] => (10, digits),
            _ => return None,
        };
        if digits.is_empty() {
            // "&#;" and "&#x;" are not character references
            return None;
        }
        let mut value: u32 = 0;
        for &b in digits {
            let digit = (b as char).to_digit(radix)?;
            value = value * radix + digit;
            // Checking after every digit keeps `value` far below the u32
            // limit, so an overlong reference cannot wrap to a valid value
            if value > 0x10_ffff {
                return None;
            }
        }
        char::from_u32(value)
    }

    //fp find_span
    /// Find the span starting with the given index `i` that is either
    /// from an entity (starting with '&' ending with ';') - which is
    /// then unmapped if possible, or the span until the end of string
    /// or the next '&'.
    ///
    /// `bytes[i]` must exist, and `n` (at most `bytes.len()`) is the
    /// exclusive end of the region examined. The named entity map is only
    /// consulted if `inc_map` is true, and it takes priority over treating
    /// the entity as a numeric character reference.
    ///
    /// The return value is the index of the end of the span, and a
    /// possible replacement string or replacement character - if the
    /// span is an entity it can be mapped to either of these (or an
    /// unknown/bad entity is just a simple span).
    ///
    /// Hence a return value of (n, Some(r), None) indicates that from
    /// `i` to `n` (inclusive to exclusive) is an entity that can be
    /// replaced with the string `r`.
    ///
    /// A return value of (n, None, Some(c)) indicates that from
    /// `i` to `n` (inclusive to exclusive) is an entity that can be
    /// replaced with the character `c`.
    ///
    /// The other possible return value is (n, None, None), indicating
    /// that the span from `i` to `n` requires no replacement. This includes
    /// a '&' that has no terminating ';', for which the span runs to the
    /// end of the region.
    fn find_span(
        &self,
        inc_map: bool,
        bytes: &[u8],
        i: usize,
        n: usize,
    ) -> (usize, Option<&str>, Option<char>) {
        let start = i + 1;
        if bytes[i] != b'&' {
            // Plain text: runs up to (not including) the next '&'
            let end = bytes[start..n]
                .iter()
                .position(|&b| b == b'&')
                .map_or(n, |offset| start + offset);
            return (end, None, None);
        }
        let semicolon = match bytes[start..n].iter().position(|&b| b == b';') {
            Some(offset) => start + offset,
            None => return (n, None, None),
        };
        let name = &bytes[start..semicolon];
        let end = semicolon + 1;
        if inc_map {
            if let Some(&replacement) = self.map.get(name) {
                return (end, Some(replacement), None);
            }
        }
        (end, None, Self::parse_char_ref(name))
    }

    //fp replace_entities
    /// Replace general entity references and `&#..;` character references
    /// in `s`, using the map for the general entities if `inc_map` is true.
    ///
    /// Return None if the string has no replacements required; else Some(new string).
    ///
    /// Entity references that are unknown or malformed (for example `&;`,
    /// `&#;`, or a character reference to a value that is not a Unicode
    /// scalar value) are left in the text unchanged.
    ///
    /// The replacements that are used should *also* be replaced if this is
    /// expanding a general entity use; that is not done here, the
    /// replacement text is inserted as is.
    ///
    /// We don't handle parameter entities here yet ('%thing;')
    ///
    /// However, the map should not be used for entity declaration
    /// contents in XML hence inc_map is provided. However, character
    /// entities `&#..;` are expanded in entity declarations.
    ///
    /// Character entities are *ALSO* expanded when entities are used.
    ///
    /// Another option would be to use two different [Entities] to
    /// handle the two different cases.
    ///
    /// As an example (from the XML specification) the declaration
    ///
    /// ```text
    /// <!ENTITY example "<p>An ampersand (&#38;#38;) may be escaped
    /// numerically (&#38;#38;#38;) or with a general entity
    /// (&amp;amp;).</p>" >
    /// ```
    ///
    /// makes 'example' be
    ///
    /// ```text
    /// <p>An ampersand (&#38;) may be escaped
    /// numerically (&#38;#38;) or with a general entity
    /// (&amp;amp;).</p>
    /// ```
    ///
    /// and a reference in a doc to `&example;` is then replaced with a 'p' element with content
    ///
    /// ```text
    /// An ampersand (&) may be escaped
    /// numerically (&#38;) or with a general entity
    /// (&amp;).
    /// ```
    pub fn replace_entities(&self, inc_map: bool, s: &str) -> Option<String> {
        // Note that n is the length in bytes, not in utf8 characters
        let bytes = s.as_bytes();
        let n = bytes.len();
        // Only allocated once the first replacement is found
        let mut result: Option<String> = None;
        // All of s[..copied] is already accounted for in `result`
        let mut copied = 0;
        let mut i = 0;
        while i < n {
            let (next_i, opt_str, opt_char) = self.find_span(inc_map, bytes, i, n);
            if opt_str.is_some() || opt_char.is_some() {
                let out = result.get_or_insert_with(|| String::with_capacity(n));
                // `i` indexes an ASCII '&' and `next_i` follows an ASCII ';'
                // so both are character boundaries and slicing cannot panic
                out.push_str(&s[copied..i]);
                match (opt_str, opt_char) {
                    (Some(text), _) => out.push_str(text),
                    (None, Some(c)) => out.push(c),
                    (None, None) => {}
                }
                copied = next_i;
            }
            i = next_i;
        }
        if let Some(out) = result.as_mut() {
            out.push_str(&s[copied..]);
        }
        result
    }
}
348
349//a Test
#[cfg(test)]
mod test {
    use super::*;

    /// Assert that the optional string result `r` matches the expectation `e`
    fn check_ok(r: Option<String>, e: Option<&str>) {
        assert_eq!(r.as_deref(), e);
    }

    /// Strings with nothing to escape yield None; otherwise the markup
    /// characters are replaced by entity references
    #[test]
    fn test0() {
        check_ok(escape("fred", ESCAPE_ATTR), None);
        check_ok(escape("banana", ESCAPE_ATTR), None);
        check_ok(
            escape("My < and more", ESCAPE_ATTR),
            Some("My &lt; and more"),
        );
        check_ok(
            escape("My > and less", ESCAPE_ATTR),
            Some("My &gt; and less"),
        );
        check_ok(
            escape("My '\"& etc", ESCAPE_ATTR),
            Some("My &apos;&quot;&amp; etc"),
        );
        check_ok(escape("\u{1f600}", ESCAPE_ATTR), None);
        check_ok(
            escape("\u{1f600} <", ESCAPE_ATTR),
            Some("\u{1f600} &lt;"),
        );
        check_ok(
            escape("\u{1f600} < \u{1f600} ", ESCAPE_ATTR),
            Some("\u{1f600} &lt; \u{1f600} "),
        );
    }

    /// Named entities and numeric character references are replaced;
    /// unknown entities and invalid character values are left alone
    #[test]
    fn test_entities() {
        let e = Entities::xml();
        check_ok(e.replace_entities(true, "fred"), None);
        check_ok(e.replace_entities(true, "&amp;&amp;"), Some("&&"));
        check_ok(e.replace_entities(true, "&lt;&LT;&gt;&GT;"), Some("<<>>"));
        check_ok(e.replace_entities(true, "&blob;&quot;"), Some("&blob;\""));
        // A surrogate is not a valid character, so nothing is replaced
        check_ok(e.replace_entities(true, "&#xd800;"), None);
        check_ok(e.replace_entities(true, "&#x32;"), Some("2"));
        check_ok(e.replace_entities(true, "&#x20;"), Some(" "));
        // 1114112 is 0x110000, one beyond the last Unicode code point
        check_ok(e.replace_entities(true, "&#1114112;"), None);
        check_ok(e.replace_entities(true, "&#50;&#32;"), Some("2 "));
        check_ok(e.replace_entities(true, "&#32;&#50;"), Some(" 2"));
    }
}
396}