hml_rs/escape.rs
1//a Documentation
2#![warn(missing_docs)]
3// #![warn(missing_doc_code_examples)]
/*!

# Escape handling

This module is not ready for use.

This module provides escape handling for XML and entity replacement.

!*/
13
14//a Imports
15use std::collections::HashMap;
16
/// Result type for escaping / entity-replacement operations; the error
/// type is [`std::io::Error`].
pub type Result<T> = std::result::Result<T, std::io::Error>;
19
// Bit mask of the optional escapes that should be applied
//
// `&` and `<` are always escaped by `escape`; each bit below enables
// escaping of one further character.

//cp ESCAPE_QUOTE
/// Bitmask bit to enable escaping of the double quote character `"`
pub const ESCAPE_QUOTE: usize = 1;

//cp ESCAPE_APOS
/// Bitmask bit to enable escaping of the apostrophe character `'`
pub const ESCAPE_APOS: usize = 2;

//cp ESCAPE_GT
/// Bitmask bit to enable escaping of the greater-than character `>`
pub const ESCAPE_GT: usize = 4;

//cp ESCAPE_LF
/// Bitmask bit to enable escaping of the line feed character (`\n`)
pub const ESCAPE_LF: usize = 8;

//cp ESCAPE_CR
/// Bitmask bit to enable escaping of the carriage return character (`\r`)
pub const ESCAPE_CR: usize = 16;

//cp ESCAPE_ATTR
/// Bitmask with every optional escape enabled, for use with attribute values
pub const ESCAPE_ATTR: usize = ESCAPE_QUOTE | ESCAPE_APOS | ESCAPE_GT | ESCAPE_LF | ESCAPE_CR;

//cp ESCAPE_PCDATA
/// Bitmask used for PCDATA - that is, no optional escapes are enabled
pub const ESCAPE_PCDATA: usize = 0;
44
/// Return `true` if any bit of `esc` is set in the bitmask `char_set`.
///
/// `char_set` is the set of enabled escapes and `esc` is the escape bit
/// being queried.
#[inline]
const fn do_esc(char_set: usize, esc: usize) -> bool {
    (char_set & esc) != 0
}
49
50//fp escape_required
51/// Return a Some(string) where string is an unescaped version of the input
52pub fn escape_required(bytes: &[u8], char_set: usize, i: usize, n: usize) -> Option<String> {
53 let mut r = Vec::with_capacity(n);
54 if i > 0 {
55 r.extend_from_slice(&bytes[0..i]);
56 }
57 // for i in i..n {
58 for b in bytes.iter().take(n).skip(i) {
59 if b & 0x80 != 0 {
60 r.push(*b);
61 } else {
62 match b {
63 b'&' => {
64 r.extend_from_slice(b"&");
65 }
66 b'<' => {
67 r.extend_from_slice(b"<");
68 }
69 b'\'' if do_esc(char_set, ESCAPE_APOS) => {
70 r.extend_from_slice(b"'");
71 }
72 b'\"' if do_esc(char_set, ESCAPE_QUOTE) => {
73 r.extend_from_slice(b""");
74 }
75 b'>' if do_esc(char_set, ESCAPE_GT) => {
76 r.extend_from_slice(b">");
77 }
78 b'\n' if do_esc(char_set, ESCAPE_LF) => {
79 r.extend_from_slice(b"
");
80 }
81 b'\r' if do_esc(char_set, ESCAPE_CR) => {
82 r.extend_from_slice(b"
");
83 }
84 _ => {
85 r.push(*b);
86 }
87 }
88 }
89 }
90 let string = unsafe { String::from_utf8_unchecked(r) };
91 Some(string)
92}
93
94//fp escape
95/// Return Some(string) if escaping is needed (given char_set), else None
96pub fn escape(s: &str, char_set: usize) -> Option<String> {
97 // Note that n == s.len is the length in bytes, not in utf8 characters
98 let n = s.len();
99 let bytes = s.as_bytes();
100 for i in 0..n {
101 match bytes[i] {
102 b'&' => {
103 return escape_required(bytes, char_set, i, n);
104 }
105 b'<' => {
106 return escape_required(bytes, char_set, i, n);
107 }
108 b'\'' if do_esc(char_set, ESCAPE_APOS) => {
109 return escape_required(bytes, char_set, i, n);
110 }
111 b'\"' if do_esc(char_set, ESCAPE_QUOTE) => {
112 return escape_required(bytes, char_set, i, n);
113 }
114 b'>' if do_esc(char_set, ESCAPE_GT) => {
115 return escape_required(bytes, char_set, i, n);
116 }
117 b'\n' if do_esc(char_set, ESCAPE_LF) => {
118 return escape_required(bytes, char_set, i, n);
119 }
120 b'\r' if do_esc(char_set, ESCAPE_CR) => {
121 return escape_required(bytes, char_set, i, n);
122 }
123 _ => (),
124 }
125 }
126 None
127}
128
//tp Entities
/// A set of entities that should be replaced, and what each is replaced with
///
/// Entity names are held as byte strings without the leading `&` and
/// trailing `;`.
#[derive(Debug, Default)]
pub struct Entities<'a> {
    /// Map from entity name to its replacement text
    map: HashMap<&'a [u8], &'a str>,
}
135
136//ip Entities
137impl<'a> Entities<'a> {
138 //fp xml
139 /// Create a new Entities set for XML entity parsing
140 pub fn xml() -> Self {
141 let mut map: HashMap<&[u8], &str> = HashMap::new();
142 map.insert(b"amp", "&");
143 map.insert(b"AMP", "&");
144 map.insert(b"lt", "<");
145 map.insert(b"LT", "<");
146 map.insert(b"gt", ">");
147 map.insert(b"GT", ">");
148 map.insert(b"apos", "'");
149 map.insert(b"APOS", "'");
150 map.insert(b"quot", "\"");
151 map.insert(b"QUOT", "\"");
152 Self { map }
153 }
154
155 //fp find_span
156 /// Find the span starting with the given index `i` that is either
157 /// from an entity (starting with '&' ending with ';') - which is
158 /// then unmapped if possible, or the span until the end of string
159 /// or the next entity.
160 ///
161 /// The return value is the index of the end of the span, and a
162 /// possible replacement string or replacement character - if the
163 /// span is an entity it can be mapped to either of these (or an
164 /// unknown/bad entity is just a simple span).
165 ///
166 /// Hence a return value of (n, Some(r), None) indicates that from
167 /// `i` to `n` (inclusive to exclusive) is an entity that can be
168 /// replaced with the string `r`.
169 ///
170 /// A return value of (n, None, Some(c)) indicates that from
171 /// `i` to `n` (inclusive to exclusive) is an entity that can be
172 /// replaced with the character `c`.
173 ///
174 /// The other possible return value is (n, None, None), indicating
175 /// that the span from `i` to `n` contains no entity references
176 fn find_span(
177 &self,
178 inc_map: bool,
179 bytes: &[u8],
180 mut i: usize,
181 n: usize,
182 ) -> (usize, Option<&str>, Option<char>) {
183 if bytes[i] == b'&' {
184 i += 1;
185 let start = i;
186 let mut is_hex = false;
187 let mut is_dec = true;
188 let mut value = 0;
189 while i < n {
190 let b = bytes[i];
191 if b == b';' {
192 if inc_map {
193 if let Some(c) = self.map.get(&bytes[start..i]) {
194 return (i + 1, Some(c), None);
195 }
196 }
197 if is_hex || is_dec {
198 if let Ok(c) = char::try_from(value) {
199 return (i + 1, None, Some(c));
200 }
201 }
202 i += 1;
203 break;
204 }
205 if i == start {
206 if b != b'#' {
207 is_dec = false;
208 }
209 } else if (b'a'..=b'f').contains(&b) || (b'A'..=b'F').contains(&b) {
210 value = (value << 4) | (((b & 0xf) + 9) as u32);
211 is_dec = false;
212 } else if b == b'x' {
213 if i == start + 1 && is_dec {
214 is_hex = true;
215 }
216 is_dec = false;
217 } else if b.is_ascii_digit() {
218 if is_dec {
219 value = (value * 10).wrapping_add((b - b'0') as u32);
220 } else {
221 value = (value << 4) | ((b & 0xf) as u32);
222 }
223 if value > 0x10ffff {
224 is_dec = false;
225 is_hex = false;
226 value = 0;
227 }
228 } else {
229 is_dec = false;
230 is_hex = false;
231 }
232 i += 1;
233 }
234 (i, None, None)
235 } else {
236 i += 1;
237 while i < n {
238 if bytes[i] == b'&' {
239 break;
240 }
241 i += 1;
242 }
243 (i, None, None)
244 }
245 }
246
247 //fp replace_entities
248 /// Replace general entity references and &#..; characters, using the map.
249 ///
250 /// The buffer `bytes` is the source and it has length `n`.
251 ///
252 /// The buffer at `bytes` has the span from 0..d as a valid UTF8 string;
253 /// at `d` there is an entity that ends at `i` which should be replaced with `c`.
254 ///
255 /// From `i` there may be more entities that require replacement.
256 fn replace_entities_required(
257 &self,
258 inc_map: bool,
259 bytes: &[u8],
260 c: &str,
261 d: usize,
262 mut i: usize,
263 n: usize,
264 ) -> Option<String> {
265 let mut r = Vec::with_capacity(n);
266 if d > 0 {
267 r.extend_from_slice(&bytes[0..d]);
268 }
269 r.extend_from_slice(c.as_bytes());
270 while i < n {
271 let (next_i, opt_a, opt_b) = self.find_span(inc_map, bytes, i, n);
272 if let Some(c) = opt_a {
273 r.extend_from_slice(c.as_bytes());
274 } else if let Some(c) = opt_b {
275 let mut buf = [0; 4];
276 let buf = c.encode_utf8(&mut buf).as_bytes();
277 r.extend_from_slice(buf);
278 } else {
279 r.extend_from_slice(&bytes[i..next_i]);
280 }
281 i = next_i;
282 }
283 let string = unsafe { String::from_utf8_unchecked(r) };
284 Some(string)
285 }
286
287 //fp replace_entities
288 /// Replace general entity references and &#..; characters, using the map.
289 ///
290 /// Return None if the string has no replacements required; else Some(new string).
291 ///
292 /// The replacements that are used should *also* be replaced if this is expanding a general entity use.
293 ///
294 /// We don't handle parameter entities here yet ('%thing;')
295 ///
296 /// However, the map should not be used for entity declaration
297 /// contents in XML hence inc_map is provided. However, character
298 /// entities &#..; are expanded in entity declarations.
299 ///
300 /// Character entities are *ALSO* expanded when entities are used.
301 ///
302 /// Another option would be to use two different [Entities] to
303 /// handle the two different cases.
304 ///
305 /// <!ENTITY example "<p>An ampersand (&#38;) may be escaped
306 /// numerically (&#38;#38;) or with a general entity
307 /// (&amp;).</p>" >
308 ///
309 /// makes 'example' be
310 ///
311 /// <p>An ampersand (&) may be escaped
312 /// numerically (&#38;) or with a general entity
313 /// (&amp;).</p>
314 ///
315 /// and a reference in a doc to &example; is then replaced with a 'p' element with content
316 ///
317 /// An ampersand (&) may be escaped
318 /// numerically (&) or with a general entity
319 /// (&).
320 ///
321 pub fn replace_entities(&self, inc_map: bool, s: &str) -> Option<String> {
322 // Note that s.len is the length in bytes, not in utf8 characters
323 let n = s.len();
324 let bytes = s.as_bytes();
325 let mut i = 0;
326 while i < n {
327 // Find next span
328 //
329 let (next_i, opt_a, opt_b) = self.find_span(inc_map, bytes, i, n);
330 if let Some(c) = opt_a {
331 // The return from find_span was(n, Some(c:&str), None): the span up to `n` is
332 // an entity reference to be replaced with `c`
333 return self.replace_entities_required(inc_map, bytes, c, i, next_i, n);
334 } else if let Some(c) = opt_b {
335 // The return from find_span was(n, None, Some(c:char)): the span up to `n` is
336 // an entity reference to be replaced with `c`
337 let mut buf = [0; 4];
338 let buf = c.encode_utf8(&mut buf);
339 return self.replace_entities_required(inc_map, bytes, buf, i, next_i, n);
340 }
341 // The return from find_span was(n, None, None): the span up to `n` has
342 // no entity references
343 i = next_i;
344 }
345 None
346 }
347}
348
//a Test
#[cfg(test)]
mod test {
    use super::*;

    /// Assert that the result `r` matches the expectation `e`, which is
    /// given as a borrowed string for convenience
    fn check_ok(r: Option<String>, e: Option<&str>) {
        assert_eq!(r, e.map(|s| s.into()));
    }

    /// Escaping with every optional escape enabled
    #[test]
    fn test0() {
        check_ok(escape("fred", ESCAPE_ATTR), None);
        check_ok(escape("banana", ESCAPE_ATTR), None);
        check_ok(
            escape("My < and more", ESCAPE_ATTR),
            Some("My &lt; and more"),
        );
        check_ok(
            escape("My > and less", ESCAPE_ATTR),
            Some("My &gt; and less"),
        );
        check_ok(
            escape("My '\"& etc", ESCAPE_ATTR),
            Some("My &apos;&quot;&amp; etc"),
        );
        check_ok(escape("\u{1f600}", ESCAPE_ATTR), None);
        check_ok(
            escape("\u{1f600} <", ESCAPE_ATTR),
            Some("\u{1f600} &lt;"),
        );
        check_ok(
            escape("\u{1f600} < \u{1f600} ", ESCAPE_ATTR),
            Some("\u{1f600} &lt; \u{1f600} "),
        );
    }

    /// Replacement of named entities and of character references
    #[test]
    fn test_entities() {
        let e = Entities::xml();
        check_ok(e.replace_entities(true, "fred"), None);
        check_ok(e.replace_entities(true, "&amp;&amp;"), Some("&&"));
        check_ok(e.replace_entities(true, "&lt;&LT;&gt;&GT;"), Some("<<>>"));
        check_ok(e.replace_entities(true, "&blob;&quot;"), Some("&blob;\""));
        // One more than the largest code point, in decimal
        check_ok(e.replace_entities(true, "&#1114112;"), None);
        check_ok(e.replace_entities(true, "&#50;"), Some("2"));
        check_ok(e.replace_entities(true, "&#32;"), Some(" "));
        // One more than the largest code point, in hexadecimal
        check_ok(e.replace_entities(true, "&#x110000;"), None);
        check_ok(e.replace_entities(true, "&#x32;&#x20;"), Some("2 "));
        check_ok(e.replace_entities(true, "&#x20;&#x32;"), Some(" 2"));
    }
}