Skip to main content

quick_xml/
escape.rs

1//! Manage xml character escapes
2
3use memchr::{memchr, memchr2_iter, memchr3};
4use std::borrow::Cow;
5use std::fmt::{self, Write};
6use std::num::ParseIntError;
7use std::ops::Range;
8use std::slice::Iter;
9
/// Describes why a character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
/// could not be converted into a character.
#[derive(Clone, Debug, PartialEq)]
pub enum ParseCharRefError {
    /// The number carries a sign character (`+` or `-`), which is not allowed.
    UnexpectedSign,
    /// The number could not be parsed, either because it contains non-number
    /// characters or because it overflows. The underlying integer parsing
    /// error is attached.
    InvalidNumber(ParseIntError),
    /// The number was parsed, but it is not a valid unicode codepoint.
    /// The offending number is attached.
    InvalidCodepoint(u32),
    /// The reference expands to a character that is not permitted in XML.
    /// The offending codepoint is attached.
    ///
    /// Currently, only `0x0` character produces this error.
    IllegalCharacter(u32),
}
24
25impl std::fmt::Display for ParseCharRefError {
26    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
27        match self {
28            Self::UnexpectedSign => f.write_str("unexpected number sign"),
29            Self::InvalidNumber(e) => e.fmt(f),
30            Self::InvalidCodepoint(n) => write!(f, "`{}` is not a valid codepoint", n),
31            Self::IllegalCharacter(n) => write!(f, "0x{:x} character is not permitted in XML", n),
32        }
33    }
34}
35
36impl std::error::Error for ParseCharRefError {
37    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
38        match self {
39            Self::InvalidNumber(e) => Some(e),
40            _ => None,
41        }
42    }
43}
44
45/// Error for XML escape / unescape.
46#[derive(Clone, Debug, PartialEq)]
47pub enum EscapeError {
48    /// Referenced entity in unknown to the parser.
49    UnrecognizedEntity(Range<usize>, String),
50    /// Cannot find `;` after `&`
51    UnterminatedEntity(Range<usize>),
52    /// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
53    /// was unsuccessful, not all characters are decimal or hexadecimal numbers.
54    InvalidCharRef(ParseCharRefError),
55    /// Expanded more than maximum possible entities during attribute normalization.
56    ///
57    /// Attribute normalization includes expanding of general entities (`&entity;`)
58    /// which replacement text also could contain entities, which is also must be expanded.
59    /// If more than 128 entities would be expanded, this error is returned.
60    TooManyNestedEntities,
61}
62
63impl std::fmt::Display for EscapeError {
64    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
65        match self {
66            Self::UnrecognizedEntity(rge, res) => {
67                write!(f, "at {:?}: unrecognized entity `{}`", rge, res)
68            }
69            Self::UnterminatedEntity(e) => write!(
70                f,
71                "Error while escaping character at range {:?}: Cannot find ';' after '&'",
72                e
73            ),
74            Self::InvalidCharRef(e) => {
75                write!(f, "invalid character reference: {}", e)
76            }
77            Self::TooManyNestedEntities => {
78                f.write_str("too many nested entities in an attribute value")
79            }
80        }
81    }
82}
83
84impl std::error::Error for EscapeError {
85    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
86        match self {
87            Self::InvalidCharRef(e) => Some(e),
88            _ => None,
89        }
90    }
91}
92
93/// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
94/// with their corresponding xml escaped value.
95///
96/// This function performs following replacements:
97///
98/// | Character | Replacement
99/// |-----------|------------
100/// | `<`       | `&lt;`
101/// | `>`       | `&gt;`
102/// | `&`       | `&amp;`
103/// | `'`       | `&apos;`
104/// | `"`       | `&quot;`
105///
106/// This function performs following replacements:
107///
108/// | Character | Replacement
109/// |-----------|------------
110/// | `<`       | `&lt;`
111/// | `>`       | `&gt;`
112/// | `&`       | `&amp;`
113/// | `'`       | `&apos;`
114/// | `"`       | `&quot;`
115pub fn escape<'a>(raw: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
116    _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
117}
118
119/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`)
120/// with their corresponding xml escaped value.
121///
122/// Should only be used for escaping text content. In XML text content, it is allowed
123/// (though not recommended) to leave the quote special characters `"` and `'` unescaped.
124///
125/// This function performs following replacements:
126///
127/// | Character | Replacement
128/// |-----------|------------
129/// | `<`       | `&lt;`
130/// | `>`       | `&gt;`
131/// | `&`       | `&amp;`
132///
133/// This function performs following replacements:
134///
135/// | Character | Replacement
136/// |-----------|------------
137/// | `<`       | `&lt;`
138/// | `>`       | `&gt;`
139/// | `&`       | `&amp;`
140pub fn partial_escape<'a>(raw: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
141    _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
142}
143
144/// XML standard [requires] that only `<` and `&` was escaped in text content or
145/// attribute value. All other characters not necessary to be escaped, although
146/// for compatibility with SGML they also should be escaped. Practically, escaping
147/// only those characters is enough.
148///
149/// This function performs following replacements:
150///
151/// | Character | Replacement
152/// |-----------|------------
153/// | `<`       | `&lt;`
154/// | `&`       | `&amp;`
155///
156/// [requires]: https://www.w3.org/TR/xml11/#syntax
157pub fn minimal_escape<'a>(raw: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
158    _escape(raw, |ch| matches!(ch, b'<' | b'&'))
159}
160
/// Writes `value[from..to]` to `writer` unchanged and then writes the escaped
/// form of the single byte located at index `to`.
///
/// # Parameters
///
/// - `writer`: destination of the output
/// - `value`: the string that is being escaped
/// - `from`: byte index of the first not yet written byte of `value`
/// - `to`: byte index of the character that should be escaped. The byte at that
///   index must be one of `<`, `>`, `'`, `&`, `"`, `\t`, `\n`, `\r` or a space
///
/// # Errors
///
/// Returns an error if writing to `writer` fails.
///
/// # Panics
///
/// Panics if `from..to` is not a valid range of `value`, if `to` is out of
/// bounds, or if the byte at index `to` is not one of the characters listed above.
pub(crate) fn escape_char<W>(writer: &mut W, value: &str, from: usize, to: usize) -> fmt::Result
where
    W: fmt::Write,
{
    writer.write_str(&value[from..to])?;
    let replacement = match value.as_bytes()[to] {
        b'<' => "&lt;",
        b'>' => "&gt;",
        b'\'' => "&apos;",
        b'&' => "&amp;",
        b'"' => "&quot;",

        // This set of escapes handles characters that should be escaped
        // in elements of xs:lists, because those characters works as
        // delimiters of list elements
        b'\t' => "&#9;",
        b'\n' => "&#10;",
        b'\r' => "&#13;",
        b' ' => "&#32;",
        _ => unreachable!(
            "Only '<', '>', '\\'', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
        ),
    };
    writer.write_str(replacement)
}
184
185/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
186/// `&`, `'`, `"`) with their corresponding xml escaped value.
187fn _escape<'a, F: Fn(u8) -> bool>(raw: impl Into<Cow<'a, str>>, escape_chars: F) -> Cow<'a, str> {
188    let raw = raw.into();
189    let bytes = raw.as_bytes();
190    let mut escaped = None;
191    let mut iter = bytes.iter();
192    let mut pos = 0;
193    while let Some(i) = iter.position(|&b| escape_chars(b)) {
194        if escaped.is_none() {
195            escaped = Some(String::with_capacity(raw.len()));
196        }
197        let escaped = escaped.as_mut().expect("initialized");
198        let new_pos = pos + i;
199        // SAFETY: It should fail only on OOM
200        escape_char(escaped, &raw, pos, new_pos).unwrap();
201        pos = new_pos + 1;
202    }
203
204    if let Some(mut escaped) = escaped {
205        if let Some(raw) = raw.get(pos..) {
206            // SAFETY: It should fail only on OOM
207            escaped.write_str(raw).unwrap();
208        }
209        Cow::Owned(escaped)
210    } else {
211        raw
212    }
213}
214
215/// Unescape an `&str` and replaces all xml escaped characters (`&...;`) into
216/// their corresponding value.
217///
218/// If feature [`escape-html`] is enabled, then recognizes all [HTML5 escapes].
219///
220/// [`escape-html`]: ../index.html#escape-html
221/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
222pub fn unescape(raw: &str) -> Result<Cow<'_, str>, EscapeError> {
223    unescape_with(raw, resolve_predefined_entity)
224}
225
226/// Unescape an `&str` and replaces all xml escaped characters (`&...;`) into
227/// their corresponding value, using a resolver function for custom entities.
228///
229/// If feature [`escape-html`] is enabled, then recognizes all [HTML5 escapes].
230///
231/// Predefined entities will be resolved _after_ trying to resolve with `resolve_entity`,
232/// which allows you to override default behavior which required in some XML dialects.
233///
234/// Character references (`&#hh;`) cannot be overridden, they are resolved before
235/// calling `resolve_entity`.
236///
237/// Note, that entities will not be resolved recursively. In order to satisfy the
238/// XML [requirements] you should unescape nested entities by yourself.
239///
240/// # Example
241///
242/// ```
243/// use quick_xml::escape::resolve_xml_entity;
244/// # use quick_xml::escape::unescape_with;
245/// # use pretty_assertions::assert_eq;
246/// let override_named_entities = |entity: &str| match entity {
247///     // Override standard entities
248///     "lt" => Some("FOO"),
249///     "gt" => Some("BAR"),
250///     // Resolve custom entities
251///     "baz" => Some("&lt;"),
252///     // Delegate other entities to the default implementation
253///     _ => resolve_xml_entity(entity),
254/// };
255///
256/// assert_eq!(
257///     unescape_with("&amp;&lt;test&gt;&baz;", override_named_entities).unwrap(),
258///     "&FOOtestBAR&lt;"
259/// );
260/// ```
261///
262/// [`escape-html`]: ../index.html#escape-html
263/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
264/// [requirements]: https://www.w3.org/TR/xml11/#intern-replacement
265pub fn unescape_with<'input, 'entity, F>(
266    raw: &'input str,
267    mut resolve_entity: F,
268) -> Result<Cow<'input, str>, EscapeError>
269where
270    // the lifetime of the output comes from a capture or is `'static`
271    F: FnMut(&str) -> Option<&'entity str>,
272{
273    let bytes = raw.as_bytes();
274    let mut unescaped = None;
275    let mut last_end = 0;
276    let mut iter = memchr2_iter(b'&', b';', bytes);
277    while let Some(start) = iter.by_ref().find(|p| bytes[*p] == b'&') {
278        match iter.next() {
279            Some(end) if bytes[end] == b';' => {
280                // append valid data
281                if unescaped.is_none() {
282                    unescaped = Some(String::with_capacity(raw.len()));
283                }
284                let unescaped = unescaped.as_mut().expect("initialized");
285                unescaped.push_str(&raw[last_end..start]);
286
287                // search for character correctness
288                let pat = &raw[start + 1..end];
289                if let Some(entity) = pat.strip_prefix('#') {
290                    let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
291                    unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
292                } else if let Some(value) = resolve_entity(pat) {
293                    unescaped.push_str(value);
294                } else {
295                    return Err(EscapeError::UnrecognizedEntity(
296                        start + 1..end,
297                        pat.to_string(),
298                    ));
299                }
300
301                last_end = end + 1;
302            }
303            _ => return Err(EscapeError::UnterminatedEntity(start..raw.len())),
304        }
305    }
306
307    if let Some(mut unescaped) = unescaped {
308        if let Some(raw) = raw.get(last_end..) {
309            unescaped.push_str(raw);
310        }
311        Ok(Cow::Owned(unescaped))
312    } else {
313        Ok(Cow::Borrowed(raw))
314    }
315}
316
317////////////////////////////////////////////////////////////////////////////////////////////////////
318
319// TODO: It would be better to reuse buffer after decoding if possible
320pub(crate) fn normalize_xml11_eols<'input>(text: &'input str) -> Cow<'input, str> {
321    let bytes = text.as_bytes();
322
323    // The following sequences of UTF-8 encoded input should be translated into
324    // a single `\n` (U+000a) character to normalize EOLs:
325    //
326    // |UTF-8   |String|
327    // |--------|------|
328    // |0d 0a   |\r\n  |
329    // |0d c2 85|\r\x85|
330    // |0d      |\r    |
331    // |c2 85   |\x85  |
332    // |e2 80 a8|\u2028|
333    if let Some(i) = memchr3(b'\r', 0xC2, 0xE2, bytes) {
334        // We found a character that requires normalization, so create new normalized
335        // string, put the prefix as is and then put normalized character
336        let mut normalized = String::with_capacity(text.len());
337        // NOTE: unsafe { text.get_unchecked(0..i) } could be used because
338        // we are sure that index within string
339        normalized.push_str(&text[0..i]);
340
341        let mut pos = normalize_xml11_eol_step(&mut normalized, text, i, '\n');
342        while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
343            let index = pos + i;
344            // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
345            // we are sure that index within string
346            normalized.push_str(&text[pos..index]);
347            pos = normalize_xml11_eol_step(&mut normalized, text, index, '\n');
348        }
349        if let Some(rest) = text.get(pos..) {
350            normalized.push_str(rest);
351        }
352        return normalized.into();
353    }
354    Cow::Borrowed(text)
355}
356
/// Performs one step of the XML 1.1 [end-of-line normalization][eof] for the
/// character that starts at byte `index` of `text`.
///
/// All line breaks MUST have been normalized on input to #xA as described
/// in [2.11 End-of-Line Handling][eof], so the rest of the attribute normalization
/// algorithm operates on text normalized in this way.
///
/// To simplify the tasks of applications, the XML processor MUST behave
/// as if it normalized all line breaks in external parsed entities
/// (including the document entity) on input, before parsing, by translating
/// all of the following to a single #xA character (_which attribute normalization
/// routine will replace by #x20 character_):
///
/// 1. the two-character sequence #xD #xA
/// 2. the two-character sequence #xD #x85
/// 3. the single character #x85
/// 4. the single character #x2028
/// 5. any #xD character that is not immediately followed by #xA or #x85.
///
/// The characters #x85 and #x2028 cannot be reliably recognized and translated
/// until an entity's encoding declaration (if present) has been read.
/// Therefore, it is a fatal error to use them within the XML declaration or text declaration.
///
/// Instead of #xA this function appends `ch`, so the caller decides what each
/// line break is replaced with. A single `\n` is replaced by `ch` as well.
/// If the byte at `index` starts any other `c2 xx` or `e2 xx xx` character,
/// that character is copied to `normalized` unchanged.
///
/// Note, that this function cannot be used to normalize HTML values. The text in HTML
/// normally is not normalized in any way; normalization is performed only in limited
/// contexts and [only for] `\r\n` and `\r`.
///
/// # Parameters
///
/// - `normalized`: the string with the result of normalization
/// - `text`: the string to be normalized
/// - `index`: a byte index into `text` of character which is processed right now.
///   It always points to the first byte of character in UTF-8 encoding, which
///   must be one of `\r`, `\n`, `0xC2` or `0xE2`
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// Returns the index of next unprocessed byte in the `text`.
///
/// # Panics
///
/// Panics if `index` is out of bounds or if the byte at `index` is not one of
/// `\r`, `\n`, `0xC2` or `0xE2`.
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_xml11_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
    let input = text.as_bytes();
    match input[index] {
        b'\r' => {
            // `\r` always produces exactly one `ch`, the only question is
            // how many of the following bytes belong to the same line break
            normalized.push(ch);
            match input.get(index + 1).copied() {
                Some(b'\n') => index + 2, // skip \r\n
                Some(0xC2) => {
                    // UTF-8 encoding of #x85 character is [c2 85]. Any other
                    // `c2 xx` character is not a part of the line break
                    // and is copied as is
                    if input.get(index + 2).copied() != Some(0x85) {
                        normalized.push_str(&text[index + 1..index + 3]);
                    }
                    index + 3 // skip \r + UTF-8 encoding of character (c2 xx)
                }
                _ => index + 1, // skip \r
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }
        // Start of UTF-8 encoding of #x85 character (c2 85)
        0xC2 => {
            if input.get(index + 1).copied() == Some(0x85) {
                normalized.push(ch);
            } else {
                normalized.push_str(&text[index..index + 2]);
            }
            index + 2 // skip UTF-8 encoding of character (c2 xx)
        }
        // Start of UTF-8 encoding of #x2028 character (e2 80 a8)
        0xE2 => {
            if input[index + 1..].starts_with(&[0x80, 0xA8]) {
                normalized.push(ch);
            } else {
                normalized.push_str(&text[index..index + 3]);
            }
            index + 3 // skip UTF-8 encoding of character (e2 xx xx)
        }

        x => unreachable!(
            "at {}: expected '\\n', '\\r', '\\xC2', or '\\xE2', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}
452
453////////////////////////////////////////////////////////////////////////////////////////////////////
454
455// TODO: It would be better to reuse buffer after decoding if possible
456pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str> {
457    let bytes = text.as_bytes();
458
459    // The following sequences of UTF-8 encoded input should be translated into
460    // a single `\n` (U+000a) character to normalize EOLs:
461    //
462    // |UTF-8   |String|
463    // |--------|------|
464    // |0d 0a   |\r\n  |
465    // |0d      |\r    |
466    if let Some(i) = memchr(b'\r', bytes) {
467        // We found a character that requires normalization, so create new normalized
468        // string, put the prefix as is and then put normalized character
469        let mut normalized = String::with_capacity(text.len());
470        // NOTE: unsafe { text.get_unchecked(0..i) } could be used because
471        // we are sure that index within string
472        normalized.push_str(&text[0..i]);
473
474        let mut pos = normalize_xml10_eol_step(&mut normalized, text, i, '\n');
475        while let Some(i) = memchr(b'\r', &bytes[pos..]) {
476            let index = pos + i;
477            // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
478            // we are sure that index within string
479            normalized.push_str(&text[pos..index]);
480            pos = normalize_xml10_eol_step(&mut normalized, text, index, '\n');
481        }
482        if let Some(rest) = text.get(pos..) {
483            normalized.push_str(rest);
484        }
485        return normalized.into();
486    }
487    Cow::Borrowed(text)
488}
489
/// Performs one step of the XML 1.0 [end-of-line normalization][eof] for the
/// character at byte `index` of `text`: the sequence `\r\n`, a `\r` that is not
/// followed by `\n`, and a single `\n` are each replaced by one `ch`.
///
/// The same two sequences (`\r\n` and `\r`) are the only ones that are
/// normalized in HTML: the text in HTML normally is not normalized in any way;
/// normalization is performed only in limited contexts and [only for] `\r\n` and `\r`.
///
/// # Parameters
///
/// - `normalized`: the string with the result of normalization
/// - `text`: the string to be normalized
/// - `index`: a byte index into `text` of character which is processed right now.
///   It must point to `\r` or `\n`
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// Returns the index of next unprocessed byte in the `text`.
///
/// # Panics
///
/// Panics if `index` is out of bounds or if the byte at `index` is neither
/// `\r` nor `\n`.
///
/// [eof]: https://www.w3.org/TR/xml/#sec-line-ends
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_xml10_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
    let input = text.as_bytes();
    match input[index] {
        b'\r' => {
            normalized.push(ch);
            if input.get(index + 1).copied() == Some(b'\n') {
                index + 2 // skip \r\n
            } else {
                index + 1 // skip \r
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }

        x => unreachable!(
            "at {}: expected '\\n' or '\\r', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}
523
524////////////////////////////////////////////////////////////////////////////////////////////////////
525
526pub(crate) fn normalize_xml10_attribute_value<'input, 'entity, F>(
527    value: &'input str,
528    depth: usize,
529    resolve_entity: F,
530) -> Result<Cow<'input, str>, EscapeError>
531where
532    // the lifetime of the output comes from a capture or is `'static`
533    F: FnMut(&str) -> Option<&'entity str>,
534{
535    normalize_attribute_value(
536        value,
537        depth,
538        is_xml10_normalization_char,
539        normalize_xml10_eol_step,
540        resolve_entity,
541    )
542}
543
/// Returns `true` if the byte requires processing during normalization of
/// an attribute value in XML 1.0 mode.
const fn is_xml10_normalization_char(b: &u8) -> bool {
    match *b {
        // White space that is replaced by a space character
        b'\t' => true,
        // The following sequences should be translated into a single `\n` (U+000a)
        // character to normalize EOLs:
        //
        // |UTF-8   |String|
        // |--------|------|
        // |0d 0a   |\r\n  |
        // |0d      |\r    |
        b'\r' | b'\n' => true,
        // Start of a character or an entity reference
        b'&' => true,
        _ => false,
    }
}
554
555////////////////////////////////////////////////////////////////////////////////////////////////////
556
557pub(crate) fn normalize_xml11_attribute_value<'input, 'entity, F>(
558    value: &'input str,
559    depth: usize,
560    resolve_entity: F,
561) -> Result<Cow<'input, str>, EscapeError>
562where
563    // the lifetime of the output comes from a capture or is `'static`
564    F: FnMut(&str) -> Option<&'entity str>,
565{
566    normalize_attribute_value(
567        value,
568        depth,
569        is_xml11_normalization_char,
570        normalize_xml11_eol_step,
571        resolve_entity,
572    )
573}
574
/// Returns `true` if the byte requires processing during normalization of
/// an attribute value in XML 1.1 mode. For multi-byte characters only the
/// first byte of the UTF-8 encoding is checked.
const fn is_xml11_normalization_char(b: &u8) -> bool {
    match *b {
        // White space that is replaced by a space character
        b'\t' => true,
        // The following sequences should be translated into a single `\n` (U+000a)
        // character to normalize EOLs:
        //
        // |UTF-8   |String|
        // |--------|------|
        // |0d 0a   |\r\n  |
        // |0d c2 85|\r\x85|
        // |0d      |\r    |
        // |c2 85   |\x85  |
        // |e2 80 a8|\u2028|
        b'\r' | b'\n' | 0xC2 | 0xE2 => true,
        // Start of a character or an entity reference
        b'&' => true,
        _ => false,
    }
}
588
589////////////////////////////////////////////////////////////////////////////////////////////////////
590
591/// Returns the attribute value normalized as per [the XML specification],
592/// using a custom entity resolver.
593///
594/// Do not use this method with HTML attributes.
595///
596/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
597/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
598/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
599/// take precedence.
600///
601/// This will allocate unless the raw attribute value does not require normalization.
602///
603/// # Parameters
604///
605/// - `value`: unnormalized attribute value
606/// - `depth`: maximum number of nested entities that can be expanded. If expansion
607///   chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
608/// - `is_normalization_char`: a function to check if byte is the start byte of character
609///   that should be normalized (UTF-8 encoding is assumed)
610/// - `normalize_eol_step`: a function that performs EOL normalization of a character
611/// - `resolve_entity`: a function to resolve entity. This function could be called
612///   multiple times on the same input and can return different values in each case
613///   for the same input, although it is not recommended
614///
615/// # Lifetimes
616///
617/// - `'input`: lifetime of the unnormalized attribute. If normalization is not required,
618///   the input returned unchanged with the same lifetime
619/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
620///
621/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
622pub fn normalize_attribute_value<'input, 'entity, C, E, F>(
623    value: &'input str,
624    depth: usize,
625    is_normalization_char: C,
626    normalize_eol_step: E,
627    mut resolve_entity: F,
628) -> Result<Cow<'input, str>, EscapeError>
629where
630    C: Fn(&u8) -> bool,
631    E: Fn(&mut String, &str, usize, char) -> usize,
632    // the lifetime of the output comes from a capture or is `'static`
633    F: FnMut(&str) -> Option<&'entity str>,
634{
635    let mut iter = value.as_bytes().iter();
636
637    // If we found the character that requires normalization, create a normalized
638    // version of the attribute, otherwise return the value unchanged
639    if let Some(i) = iter.position(&is_normalization_char) {
640        let mut normalized = String::with_capacity(value.len());
641        let pos = normalize_attr_step(
642            &mut normalized,
643            &mut iter,
644            value,
645            0,
646            i,
647            depth,
648            &is_normalization_char,
649            &normalize_eol_step,
650            &mut resolve_entity,
651        )?;
652
653        normalize_attr_steps(
654            &mut normalized,
655            &mut iter,
656            value,
657            pos,
658            depth,
659            &is_normalization_char,
660            &normalize_eol_step,
661            &mut resolve_entity,
662        )?;
663        return Ok(normalized.into());
664    }
665    Ok(Cow::Borrowed(value))
666}
667
668fn normalize_attr_steps<'entity, C, E, F>(
669    normalized: &mut String,
670    iter: &mut Iter<u8>,
671    input: &str,
672    mut pos: usize,
673    depth: usize,
674    is_normalization_char: &C,
675    normalize_eol_step: &E,
676    resolve_entity: &mut F,
677) -> Result<(), EscapeError>
678where
679    C: Fn(&u8) -> bool,
680    E: Fn(&mut String, &str, usize, char) -> usize,
681    // the lifetime of the output comes from a capture or is `'static`
682    F: FnMut(&str) -> Option<&'entity str>,
683{
684    while let Some(i) = iter.position(is_normalization_char) {
685        pos = normalize_attr_step(
686            normalized,
687            iter,
688            input,
689            pos,
690            pos + i,
691            depth,
692            is_normalization_char,
693            normalize_eol_step,
694            resolve_entity,
695        )?;
696    }
697    if let Some(rest) = input.get(pos..) {
698        normalized.push_str(rest);
699    }
700    Ok(())
701}
702
703/// Performs one step of the [normalization algorithm] (but with recursive part):
704///
705/// 1. For a character reference, append the referenced character
706///    to the normalized value.
707/// 2. For an entity reference, recursively apply this algorithm
708///    to the replacement text of the entity.
709/// 3. For a white space character (#x20, #xD, #xA, #x9), append
710///    a space character (#x20) to the normalized value.
711/// 4. For another character, append the character to the normalized value.
712///
713/// Because [according to the specification], XML parser should parse line-of-end
714/// normalized input, but quick-xml does not do that, this function also performs
715/// normalization of EOL characters. That should be done before expanding entities
716/// and character references, so cannot be processed later.
717///
718/// This function could be used also just to normalize line ends if the iterator
719/// won't be stop on `&` characters.
720///
721/// # Parameters
722///
723/// - `normalized`: Output of the algorithm. Normalized value will be placed here
724/// - `iter`: Iterator over bytes of `input`
725/// - `input`: Original non-normalized value
726/// - `last_pos`: Index of the last byte in `input` that was processed
727/// - `index`: Index of the byte in `input` that should be processed now
728/// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space
729///   so this parameter tracks if we seen the `\r` before processing the current byte
730/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
731/// - `is_normalization_char`: a function to check if byte is the start byte of character
732///   that should be normalized (UTF-8 encoding is assumed)
733/// - `normalize_eol_step`: a function that performs EOL normalization of a character
734/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
735///
736/// # Lifetimes
737///
738/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
739///
740/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
741/// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
742fn normalize_attr_step<'entity, C, E, F>(
743    normalized: &mut String,
744    iter: &mut Iter<u8>,
745    input: &str,
746    last_pos: usize,
747    index: usize,
748    depth: usize,
749    is_normalization_char: &C,
750    normalize_eol_step: &E,
751    resolve_entity: &mut F,
752) -> Result<usize, EscapeError>
753where
754    C: Fn(&u8) -> bool,
755    E: Fn(&mut String, &str, usize, char) -> usize,
756    // the lifetime of the output comes from a capture or is `'static`
757    F: FnMut(&str) -> Option<&'entity str>,
758{
759    if depth == 0 {
760        return Err(EscapeError::TooManyNestedEntities);
761    }
762    // 4. For another character, append the character to the normalized value.
763    normalized.push_str(&input[last_pos..index]);
764
765    match input.as_bytes()[index] {
766        b'&' => {
767            let start = index + 1; // +1 - skip `&`
768            let end = start
769                + match iter.position(|&b| b == b';') {
770                    Some(end) => end,
771                    None => return Err(EscapeError::UnterminatedEntity(index..input.len())),
772                };
773
774            // Content between & and ; - &pat;
775            // Note, that this content have non-normalized EOLs as required by the specification,
776            // but because numbers in any case cannot have spaces inside, this is not the problem.
777            // Normalization of spaces in entity references and checking that they corresponds to
778            // [`Name`] production on conscience `resolve_entity`.
779            //
780            // [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
781            let pat = &input[start..end];
782            // 1. For a character reference, append the referenced character
783            //    to the normalized value.
784            if let Some(entity) = pat.strip_prefix('#') {
785                let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
786                normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
787            } else
788            // Special case: '&amp;' resolves to '&' and if follow this algorithm
789            // without special handling, we got unterminated entity error
790            if pat == "amp" {
791                normalized.push('&');
792            } else
793            // 2. For an entity reference, recursively apply this algorithm
794            //    to the replacement text of the entity.
795            if let Some(value) = resolve_entity(pat) {
796                normalize_attr_steps(
797                    normalized,
798                    &mut value.as_bytes().iter(),
799                    value,
800                    0,
801                    depth.saturating_sub(1),
802                    is_normalization_char,
803                    normalize_eol_step,
804                    resolve_entity,
805                )?;
806            } else {
807                return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string()));
808            }
809            Ok(end + 1) // +1 - skip `;`
810        }
811        // 3. For a white space character (#x20, #xD, #xA, #x9), append
812        //    a space character (#x20) to the normalized value.
813        // Space character (#x20) has no special meaning, so it is handled on step 4
814        b'\t' => {
815            normalized.push(' ');
816            Ok(index + 1) // +1 - skip \t
817        }
818        _ => {
819            let pos = normalize_eol_step(normalized, input, index, ' ');
820            // We should advance iterator because we may skip several characters
821            for _ in 0..pos - index - 1 {
822                iter.next();
823            }
824            Ok(pos)
825        }
826    }
827}
828
829////////////////////////////////////////////////////////////////////////////////////////////////////
830
831/// Resolves predefined XML entities or all HTML5 entities depending on the feature
832/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
833///
834/// Behaves like [`resolve_xml_entity`] if feature is not enabled and as
835/// [`resolve_html5_entity`] if enabled.
836#[inline]
837pub const fn resolve_predefined_entity(entity: &str) -> Option<&'static str> {
838    #[cfg(not(feature = "escape-html"))]
839    {
840        resolve_xml_entity(entity)
841    }
842
843    #[cfg(feature = "escape-html")]
844    {
845        resolve_html5_entity(entity)
846    }
847}
848
/// Maps the name of a predefined XML entity to its replacement text.
/// Returns `None` for any name that is not one of the predefined XML entities.
///
/// The predefined entities are listed in the [specification].
///
/// ```
/// # use quick_xml::escape::resolve_xml_entity;
/// # use pretty_assertions::assert_eq;
/// assert_eq!(resolve_xml_entity("lt"), Some("<"));
/// assert_eq!(resolve_xml_entity("gt"), Some(">"));
/// assert_eq!(resolve_xml_entity("amp"), Some("&"));
/// assert_eq!(resolve_xml_entity("apos"), Some("'"));
/// assert_eq!(resolve_xml_entity("quot"), Some("\""));
///
/// assert_eq!(resolve_xml_entity("foo"), None);
/// ```
///
/// [specification]: https://www.w3.org/TR/xml11/#sec-predefined-ent
pub const fn resolve_xml_entity(entity: &str) -> Option<&'static str> {
    // A `const fn` cannot match on `&str` directly, so the name is compared
    // as a byte slice against byte-string literals instead.
    match entity.as_bytes() {
        b"lt" => Some("<"),
        b"gt" => Some(">"),
        b"amp" => Some("&"),
        b"apos" => Some("'"),
        b"quot" => Some("\""),
        _ => None,
    }
}
879
880/// Resolves all HTML5 entities. For complete list see <https://dev.w3.org/html5/html-author/charref>.
881#[cfg(feature = "escape-html")]
882pub const fn resolve_html5_entity(entity: &str) -> Option<&'static str> {
883    // imported from https://dev.w3.org/html5/html-author/charref
884    // match over strings are not allowed in const functions
885    //TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
886    //TODO: building this function increases compilation time by 10+ seconds (or 5x times)
887    // Maybe this is because of very long match
888    // See https://github.com/tafia/quick-xml/issues/763
889    let s = match entity.as_bytes() {
890        b"Tab" => "\u{09}",
891        b"NewLine" => "\u{0A}",
892        b"excl" => "\u{21}",
893        b"quot" | b"QUOT" => "\u{22}",
894        b"num" => "\u{23}",
895        b"dollar" => "\u{24}",
896        b"percnt" => "\u{25}",
897        b"amp" | b"AMP" => "\u{26}",
898        b"apos" => "\u{27}",
899        b"lpar" => "\u{28}",
900        b"rpar" => "\u{29}",
901        b"ast" | b"midast" => "\u{2A}",
902        b"plus" => "\u{2B}",
903        b"comma" => "\u{2C}",
904        b"period" => "\u{2E}",
905        b"sol" => "\u{2F}",
906        b"colon" => "\u{3A}",
907        b"semi" => "\u{3B}",
908        b"lt" | b"LT" => "\u{3C}",
909        b"equals" => "\u{3D}",
910        b"gt" | b"GT" => "\u{3E}",
911        b"quest" => "\u{3F}",
912        b"commat" => "\u{40}",
913        b"lsqb" | b"lbrack" => "\u{5B}",
914        b"bsol" => "\u{5C}",
915        b"rsqb" | b"rbrack" => "\u{5D}",
916        b"Hat" => "\u{5E}",
917        b"lowbar" => "\u{5F}",
918        b"grave" | b"DiacriticalGrave" => "\u{60}",
919        b"lcub" | b"lbrace" => "\u{7B}",
920        b"verbar" | b"vert" | b"VerticalLine" => "\u{7C}",
921        b"rcub" | b"rbrace" => "\u{7D}",
922        b"nbsp" | b"NonBreakingSpace" => "\u{A0}",
923        b"iexcl" => "\u{A1}",
924        b"cent" => "\u{A2}",
925        b"pound" => "\u{A3}",
926        b"curren" => "\u{A4}",
927        b"yen" => "\u{A5}",
928        b"brvbar" => "\u{A6}",
929        b"sect" => "\u{A7}",
930        b"Dot" | b"die" | b"DoubleDot" | b"uml" => "\u{A8}",
931        b"copy" | b"COPY" => "\u{A9}",
932        b"ordf" => "\u{AA}",
933        b"laquo" => "\u{AB}",
934        b"not" => "\u{AC}",
935        b"shy" => "\u{AD}",
936        b"reg" | b"circledR" | b"REG" => "\u{AE}",
937        b"macr" | b"OverBar" | b"strns" => "\u{AF}",
938        b"deg" => "\u{B0}",
939        b"plusmn" | b"pm" | b"PlusMinus" => "\u{B1}",
940        b"sup2" => "\u{B2}",
941        b"sup3" => "\u{B3}",
942        b"acute" | b"DiacriticalAcute" => "\u{B4}",
943        b"micro" => "\u{B5}",
944        b"para" => "\u{B6}",
945        b"middot" | b"centerdot" | b"CenterDot" => "\u{B7}",
946        b"cedil" | b"Cedilla" => "\u{B8}",
947        b"sup1" => "\u{B9}",
948        b"ordm" => "\u{BA}",
949        b"raquo" => "\u{BB}",
950        b"frac14" => "\u{BC}",
951        b"frac12" | b"half" => "\u{BD}",
952        b"frac34" => "\u{BE}",
953        b"iquest" => "\u{BF}",
954        b"Agrave" => "\u{C0}",
955        b"Aacute" => "\u{C1}",
956        b"Acirc" => "\u{C2}",
957        b"Atilde" => "\u{C3}",
958        b"Auml" => "\u{C4}",
959        b"Aring" => "\u{C5}",
960        b"AElig" => "\u{C6}",
961        b"Ccedil" => "\u{C7}",
962        b"Egrave" => "\u{C8}",
963        b"Eacute" => "\u{C9}",
964        b"Ecirc" => "\u{CA}",
965        b"Euml" => "\u{CB}",
966        b"Igrave" => "\u{CC}",
967        b"Iacute" => "\u{CD}",
968        b"Icirc" => "\u{CE}",
969        b"Iuml" => "\u{CF}",
970        b"ETH" => "\u{D0}",
971        b"Ntilde" => "\u{D1}",
972        b"Ograve" => "\u{D2}",
973        b"Oacute" => "\u{D3}",
974        b"Ocirc" => "\u{D4}",
975        b"Otilde" => "\u{D5}",
976        b"Ouml" => "\u{D6}",
977        b"times" => "\u{D7}",
978        b"Oslash" => "\u{D8}",
979        b"Ugrave" => "\u{D9}",
980        b"Uacute" => "\u{DA}",
981        b"Ucirc" => "\u{DB}",
982        b"Uuml" => "\u{DC}",
983        b"Yacute" => "\u{DD}",
984        b"THORN" => "\u{DE}",
985        b"szlig" => "\u{DF}",
986        b"agrave" => "\u{E0}",
987        b"aacute" => "\u{E1}",
988        b"acirc" => "\u{E2}",
989        b"atilde" => "\u{E3}",
990        b"auml" => "\u{E4}",
991        b"aring" => "\u{E5}",
992        b"aelig" => "\u{E6}",
993        b"ccedil" => "\u{E7}",
994        b"egrave" => "\u{E8}",
995        b"eacute" => "\u{E9}",
996        b"ecirc" => "\u{EA}",
997        b"euml" => "\u{EB}",
998        b"igrave" => "\u{EC}",
999        b"iacute" => "\u{ED}",
1000        b"icirc" => "\u{EE}",
1001        b"iuml" => "\u{EF}",
1002        b"eth" => "\u{F0}",
1003        b"ntilde" => "\u{F1}",
1004        b"ograve" => "\u{F2}",
1005        b"oacute" => "\u{F3}",
1006        b"ocirc" => "\u{F4}",
1007        b"otilde" => "\u{F5}",
1008        b"ouml" => "\u{F6}",
1009        b"divide" | b"div" => "\u{F7}",
1010        b"oslash" => "\u{F8}",
1011        b"ugrave" => "\u{F9}",
1012        b"uacute" => "\u{FA}",
1013        b"ucirc" => "\u{FB}",
1014        b"uuml" => "\u{FC}",
1015        b"yacute" => "\u{FD}",
1016        b"thorn" => "\u{FE}",
1017        b"yuml" => "\u{FF}",
1018        b"Amacr" => "\u{10}",
1019        b"amacr" => "\u{10}",
1020        b"Abreve" => "\u{10}",
1021        b"abreve" => "\u{10}",
1022        b"Aogon" => "\u{10}",
1023        b"aogon" => "\u{10}",
1024        b"Cacute" => "\u{10}",
1025        b"cacute" => "\u{10}",
1026        b"Ccirc" => "\u{10}",
1027        b"ccirc" => "\u{10}",
1028        b"Cdot" => "\u{10}",
1029        b"cdot" => "\u{10}",
1030        b"Ccaron" => "\u{10}",
1031        b"ccaron" => "\u{10}",
1032        b"Dcaron" => "\u{10}",
1033        b"dcaron" => "\u{10}",
1034        b"Dstrok" => "\u{11}",
1035        b"dstrok" => "\u{11}",
1036        b"Emacr" => "\u{11}",
1037        b"emacr" => "\u{11}",
1038        b"Edot" => "\u{11}",
1039        b"edot" => "\u{11}",
1040        b"Eogon" => "\u{11}",
1041        b"eogon" => "\u{11}",
1042        b"Ecaron" => "\u{11}",
1043        b"ecaron" => "\u{11}",
1044        b"Gcirc" => "\u{11}",
1045        b"gcirc" => "\u{11}",
1046        b"Gbreve" => "\u{11}",
1047        b"gbreve" => "\u{11}",
1048        b"Gdot" => "\u{12}",
1049        b"gdot" => "\u{12}",
1050        b"Gcedil" => "\u{12}",
1051        b"Hcirc" => "\u{12}",
1052        b"hcirc" => "\u{12}",
1053        b"Hstrok" => "\u{12}",
1054        b"hstrok" => "\u{12}",
1055        b"Itilde" => "\u{12}",
1056        b"itilde" => "\u{12}",
1057        b"Imacr" => "\u{12}",
1058        b"imacr" => "\u{12}",
1059        b"Iogon" => "\u{12}",
1060        b"iogon" => "\u{12}",
1061        b"Idot" => "\u{13}",
1062        b"imath" | b"inodot" => "\u{13}",
1063        b"IJlig" => "\u{13}",
1064        b"ijlig" => "\u{13}",
1065        b"Jcirc" => "\u{13}",
1066        b"jcirc" => "\u{13}",
1067        b"Kcedil" => "\u{13}",
1068        b"kcedil" => "\u{13}",
1069        b"kgreen" => "\u{13}",
1070        b"Lacute" => "\u{13}",
1071        b"lacute" => "\u{13}",
1072        b"Lcedil" => "\u{13}",
1073        b"lcedil" => "\u{13}",
1074        b"Lcaron" => "\u{13}",
1075        b"lcaron" => "\u{13}",
1076        b"Lmidot" => "\u{13}",
1077        b"lmidot" => "\u{14}",
1078        b"Lstrok" => "\u{14}",
1079        b"lstrok" => "\u{14}",
1080        b"Nacute" => "\u{14}",
1081        b"nacute" => "\u{14}",
1082        b"Ncedil" => "\u{14}",
1083        b"ncedil" => "\u{14}",
1084        b"Ncaron" => "\u{14}",
1085        b"ncaron" => "\u{14}",
1086        b"napos" => "\u{14}",
1087        b"ENG" => "\u{14}",
1088        b"eng" => "\u{14}",
1089        b"Omacr" => "\u{14}",
1090        b"omacr" => "\u{14}",
1091        b"Odblac" => "\u{15}",
1092        b"odblac" => "\u{15}",
1093        b"OElig" => "\u{15}",
1094        b"oelig" => "\u{15}",
1095        b"Racute" => "\u{15}",
1096        b"racute" => "\u{15}",
1097        b"Rcedil" => "\u{15}",
1098        b"rcedil" => "\u{15}",
1099        b"Rcaron" => "\u{15}",
1100        b"rcaron" => "\u{15}",
1101        b"Sacute" => "\u{15}",
1102        b"sacute" => "\u{15}",
1103        b"Scirc" => "\u{15}",
1104        b"scirc" => "\u{15}",
1105        b"Scedil" => "\u{15}",
1106        b"scedil" => "\u{15}",
1107        b"Scaron" => "\u{16}",
1108        b"scaron" => "\u{16}",
1109        b"Tcedil" => "\u{16}",
1110        b"tcedil" => "\u{16}",
1111        b"Tcaron" => "\u{16}",
1112        b"tcaron" => "\u{16}",
1113        b"Tstrok" => "\u{16}",
1114        b"tstrok" => "\u{16}",
1115        b"Utilde" => "\u{16}",
1116        b"utilde" => "\u{16}",
1117        b"Umacr" => "\u{16}",
1118        b"umacr" => "\u{16}",
1119        b"Ubreve" => "\u{16}",
1120        b"ubreve" => "\u{16}",
1121        b"Uring" => "\u{16}",
1122        b"uring" => "\u{16}",
1123        b"Udblac" => "\u{17}",
1124        b"udblac" => "\u{17}",
1125        b"Uogon" => "\u{17}",
1126        b"uogon" => "\u{17}",
1127        b"Wcirc" => "\u{17}",
1128        b"wcirc" => "\u{17}",
1129        b"Ycirc" => "\u{17}",
1130        b"ycirc" => "\u{17}",
1131        b"Yuml" => "\u{17}",
1132        b"Zacute" => "\u{17}",
1133        b"zacute" => "\u{17}",
1134        b"Zdot" => "\u{17}",
1135        b"zdot" => "\u{17}",
1136        b"Zcaron" => "\u{17}",
1137        b"zcaron" => "\u{17}",
1138        b"fnof" => "\u{19}",
1139        b"imped" => "\u{1B}",
1140        b"gacute" => "\u{1F}",
1141        b"jmath" => "\u{23}",
1142        b"circ" => "\u{2C}",
1143        b"caron" | b"Hacek" => "\u{2C}",
1144        b"breve" | b"Breve" => "\u{2D}",
1145        b"dot" | b"DiacriticalDot" => "\u{2D}",
1146        b"ring" => "\u{2D}",
1147        b"ogon" => "\u{2D}",
1148        b"tilde" | b"DiacriticalTilde" => "\u{2D}",
1149        b"dblac" | b"DiacriticalDoubleAcute" => "\u{2D}",
1150        b"DownBreve" => "\u{31}",
1151        b"UnderBar" => "\u{33}",
1152        b"Alpha" => "\u{39}",
1153        b"Beta" => "\u{39}",
1154        b"Gamma" => "\u{39}",
1155        b"Delta" => "\u{39}",
1156        b"Epsilon" => "\u{39}",
1157        b"Zeta" => "\u{39}",
1158        b"Eta" => "\u{39}",
1159        b"Theta" => "\u{39}",
1160        b"Iota" => "\u{39}",
1161        b"Kappa" => "\u{39}",
1162        b"Lambda" => "\u{39}",
1163        b"Mu" => "\u{39}",
1164        b"Nu" => "\u{39}",
1165        b"Xi" => "\u{39}",
1166        b"Omicron" => "\u{39}",
1167        b"Pi" => "\u{3A}",
1168        b"Rho" => "\u{3A}",
1169        b"Sigma" => "\u{3A}",
1170        b"Tau" => "\u{3A}",
1171        b"Upsilon" => "\u{3A}",
1172        b"Phi" => "\u{3A}",
1173        b"Chi" => "\u{3A}",
1174        b"Psi" => "\u{3A}",
1175        b"Omega" => "\u{3A}",
1176        b"alpha" => "\u{3B}",
1177        b"beta" => "\u{3B}",
1178        b"gamma" => "\u{3B}",
1179        b"delta" => "\u{3B}",
1180        b"epsiv" | b"varepsilon" | b"epsilon" => "\u{3B}",
1181        b"zeta" => "\u{3B}",
1182        b"eta" => "\u{3B}",
1183        b"theta" => "\u{3B}",
1184        b"iota" => "\u{3B}",
1185        b"kappa" => "\u{3B}",
1186        b"lambda" => "\u{3B}",
1187        b"mu" => "\u{3B}",
1188        b"nu" => "\u{3B}",
1189        b"xi" => "\u{3B}",
1190        b"omicron" => "\u{3B}",
1191        b"pi" => "\u{3C}",
1192        b"rho" => "\u{3C}",
1193        b"sigmav" | b"varsigma" | b"sigmaf" => "\u{3C}",
1194        b"sigma" => "\u{3C}",
1195        b"tau" => "\u{3C}",
1196        b"upsi" | b"upsilon" => "\u{3C}",
1197        b"phi" | b"phiv" | b"varphi" => "\u{3C}",
1198        b"chi" => "\u{3C}",
1199        b"psi" => "\u{3C}",
1200        b"omega" => "\u{3C}",
1201        b"thetav" | b"vartheta" | b"thetasym" => "\u{3D}",
1202        b"Upsi" | b"upsih" => "\u{3D}",
1203        b"straightphi" => "\u{3D}",
1204        b"piv" | b"varpi" => "\u{3D}",
1205        b"Gammad" => "\u{3D}",
1206        b"gammad" | b"digamma" => "\u{3D}",
1207        b"kappav" | b"varkappa" => "\u{3F}",
1208        b"rhov" | b"varrho" => "\u{3F}",
1209        b"epsi" | b"straightepsilon" => "\u{3F}",
1210        b"bepsi" | b"backepsilon" => "\u{3F}",
1211        b"IOcy" => "\u{40}",
1212        b"DJcy" => "\u{40}",
1213        b"GJcy" => "\u{40}",
1214        b"Jukcy" => "\u{40}",
1215        b"DScy" => "\u{40}",
1216        b"Iukcy" => "\u{40}",
1217        b"YIcy" => "\u{40}",
1218        b"Jsercy" => "\u{40}",
1219        b"LJcy" => "\u{40}",
1220        b"NJcy" => "\u{40}",
1221        b"TSHcy" => "\u{40}",
1222        b"KJcy" => "\u{40}",
1223        b"Ubrcy" => "\u{40}",
1224        b"DZcy" => "\u{40}",
1225        b"Acy" => "\u{41}",
1226        b"Bcy" => "\u{41}",
1227        b"Vcy" => "\u{41}",
1228        b"Gcy" => "\u{41}",
1229        b"Dcy" => "\u{41}",
1230        b"IEcy" => "\u{41}",
1231        b"ZHcy" => "\u{41}",
1232        b"Zcy" => "\u{41}",
1233        b"Icy" => "\u{41}",
1234        b"Jcy" => "\u{41}",
1235        b"Kcy" => "\u{41}",
1236        b"Lcy" => "\u{41}",
1237        b"Mcy" => "\u{41}",
1238        b"Ncy" => "\u{41}",
1239        b"Ocy" => "\u{41}",
1240        b"Pcy" => "\u{41}",
1241        b"Rcy" => "\u{42}",
1242        b"Scy" => "\u{42}",
1243        b"Tcy" => "\u{42}",
1244        b"Ucy" => "\u{42}",
1245        b"Fcy" => "\u{42}",
1246        b"KHcy" => "\u{42}",
1247        b"TScy" => "\u{42}",
1248        b"CHcy" => "\u{42}",
1249        b"SHcy" => "\u{42}",
1250        b"SHCHcy" => "\u{42}",
1251        b"HARDcy" => "\u{42}",
1252        b"Ycy" => "\u{42}",
1253        b"SOFTcy" => "\u{42}",
1254        b"Ecy" => "\u{42}",
1255        b"YUcy" => "\u{42}",
1256        b"YAcy" => "\u{42}",
1257        b"acy" => "\u{43}",
1258        b"bcy" => "\u{43}",
1259        b"vcy" => "\u{43}",
1260        b"gcy" => "\u{43}",
1261        b"dcy" => "\u{43}",
1262        b"iecy" => "\u{43}",
1263        b"zhcy" => "\u{43}",
1264        b"zcy" => "\u{43}",
1265        b"icy" => "\u{43}",
1266        b"jcy" => "\u{43}",
1267        b"kcy" => "\u{43}",
1268        b"lcy" => "\u{43}",
1269        b"mcy" => "\u{43}",
1270        b"ncy" => "\u{43}",
1271        b"ocy" => "\u{43}",
1272        b"pcy" => "\u{43}",
1273        b"rcy" => "\u{44}",
1274        b"scy" => "\u{44}",
1275        b"tcy" => "\u{44}",
1276        b"ucy" => "\u{44}",
1277        b"fcy" => "\u{44}",
1278        b"khcy" => "\u{44}",
1279        b"tscy" => "\u{44}",
1280        b"chcy" => "\u{44}",
1281        b"shcy" => "\u{44}",
1282        b"shchcy" => "\u{44}",
1283        b"hardcy" => "\u{44}",
1284        b"ycy" => "\u{44}",
1285        b"softcy" => "\u{44}",
1286        b"ecy" => "\u{44}",
1287        b"yucy" => "\u{44}",
1288        b"yacy" => "\u{44}",
1289        b"iocy" => "\u{45}",
1290        b"djcy" => "\u{45}",
1291        b"gjcy" => "\u{45}",
1292        b"jukcy" => "\u{45}",
1293        b"dscy" => "\u{45}",
1294        b"iukcy" => "\u{45}",
1295        b"yicy" => "\u{45}",
1296        b"jsercy" => "\u{45}",
1297        b"ljcy" => "\u{45}",
1298        b"njcy" => "\u{45}",
1299        b"tshcy" => "\u{45}",
1300        b"kjcy" => "\u{45}",
1301        b"ubrcy" => "\u{45}",
1302        b"dzcy" => "\u{45}",
1303        b"ensp" => "\u{2002}",
1304        b"emsp" => "\u{2003}",
1305        b"emsp13" => "\u{2004}",
1306        b"emsp14" => "\u{2005}",
1307        b"numsp" => "\u{2007}",
1308        b"puncsp" => "\u{2008}",
1309        b"thinsp" | b"ThinSpace" => "\u{2009}",
1310        b"hairsp" | b"VeryThinSpace" => "\u{200A}",
1311        b"ZeroWidthSpace"
1312        | b"NegativeVeryThinSpace"
1313        | b"NegativeThinSpace"
1314        | b"NegativeMediumSpace"
1315        | b"NegativeThickSpace" => "\u{200B}",
1316        b"zwnj" => "\u{200C}",
1317        b"zwj" => "\u{200D}",
1318        b"lrm" => "\u{200E}",
1319        b"rlm" => "\u{200F}",
1320        b"hyphen" | b"dash" => "\u{2010}",
1321        b"ndash" => "\u{2013}",
1322        b"mdash" => "\u{2014}",
1323        b"horbar" => "\u{2015}",
1324        b"Verbar" | b"Vert" => "\u{2016}",
1325        b"lsquo" | b"OpenCurlyQuote" => "\u{2018}",
1326        b"rsquo" | b"rsquor" | b"CloseCurlyQuote" => "\u{2019}",
1327        b"lsquor" | b"sbquo" => "\u{201A}",
1328        b"ldquo" | b"OpenCurlyDoubleQuote" => "\u{201C}",
1329        b"rdquo" | b"rdquor" | b"CloseCurlyDoubleQuote" => "\u{201D}",
1330        b"ldquor" | b"bdquo" => "\u{201E}",
1331        b"dagger" => "\u{2020}",
1332        b"Dagger" | b"ddagger" => "\u{2021}",
1333        b"bull" | b"bullet" => "\u{2022}",
1334        b"nldr" => "\u{2025}",
1335        b"hellip" | b"mldr" => "\u{2026}",
1336        b"permil" => "\u{2030}",
1337        b"pertenk" => "\u{2031}",
1338        b"prime" => "\u{2032}",
1339        b"Prime" => "\u{2033}",
1340        b"tprime" => "\u{2034}",
1341        b"bprime" | b"backprime" => "\u{2035}",
1342        b"lsaquo" => "\u{2039}",
1343        b"rsaquo" => "\u{203A}",
1344        b"oline" => "\u{203E}",
1345        b"caret" => "\u{2041}",
1346        b"hybull" => "\u{2043}",
1347        b"frasl" => "\u{2044}",
1348        b"bsemi" => "\u{204F}",
1349        b"qprime" => "\u{2057}",
1350        b"MediumSpace" => "\u{205F}",
1351        b"NoBreak" => "\u{2060}",
1352        b"ApplyFunction" | b"af" => "\u{2061}",
1353        b"InvisibleTimes" | b"it" => "\u{2062}",
1354        b"InvisibleComma" | b"ic" => "\u{2063}",
1355        b"euro" => "\u{20AC}",
1356        b"tdot" | b"TripleDot" => "\u{20DB}",
1357        b"DotDot" => "\u{20DC}",
1358        b"Copf" | b"complexes" => "\u{2102}",
1359        b"incare" => "\u{2105}",
1360        b"gscr" => "\u{210A}",
1361        b"hamilt" | b"HilbertSpace" | b"Hscr" => "\u{210B}",
1362        b"Hfr" | b"Poincareplane" => "\u{210C}",
1363        b"quaternions" | b"Hopf" => "\u{210D}",
1364        b"planckh" => "\u{210E}",
1365        b"planck" | b"hbar" | b"plankv" | b"hslash" => "\u{210F}",
1366        b"Iscr" | b"imagline" => "\u{2110}",
1367        b"image" | b"Im" | b"imagpart" | b"Ifr" => "\u{2111}",
1368        b"Lscr" | b"lagran" | b"Laplacetrf" => "\u{2112}",
1369        b"ell" => "\u{2113}",
1370        b"Nopf" | b"naturals" => "\u{2115}",
1371        b"numero" => "\u{2116}",
1372        b"copysr" => "\u{2117}",
1373        b"weierp" | b"wp" => "\u{2118}",
1374        b"Popf" | b"primes" => "\u{2119}",
1375        b"rationals" | b"Qopf" => "\u{211A}",
1376        b"Rscr" | b"realine" => "\u{211B}",
1377        b"real" | b"Re" | b"realpart" | b"Rfr" => "\u{211C}",
1378        b"reals" | b"Ropf" => "\u{211D}",
1379        b"rx" => "\u{211E}",
1380        b"trade" | b"TRADE" => "\u{2122}",
1381        b"integers" | b"Zopf" => "\u{2124}",
1382        b"ohm" => "\u{2126}",
1383        b"mho" => "\u{2127}",
1384        b"Zfr" | b"zeetrf" => "\u{2128}",
1385        b"iiota" => "\u{2129}",
1386        b"angst" => "\u{212B}",
1387        b"bernou" | b"Bernoullis" | b"Bscr" => "\u{212C}",
1388        b"Cfr" | b"Cayleys" => "\u{212D}",
1389        b"escr" => "\u{212F}",
1390        b"Escr" | b"expectation" => "\u{2130}",
1391        b"Fscr" | b"Fouriertrf" => "\u{2131}",
1392        b"phmmat" | b"Mellintrf" | b"Mscr" => "\u{2133}",
1393        b"order" | b"orderof" | b"oscr" => "\u{2134}",
1394        b"alefsym" | b"aleph" => "\u{2135}",
1395        b"beth" => "\u{2136}",
1396        b"gimel" => "\u{2137}",
1397        b"daleth" => "\u{2138}",
1398        b"CapitalDifferentialD" | b"DD" => "\u{2145}",
1399        b"DifferentialD" | b"dd" => "\u{2146}",
1400        b"ExponentialE" | b"exponentiale" | b"ee" => "\u{2147}",
1401        b"ImaginaryI" | b"ii" => "\u{2148}",
1402        b"frac13" => "\u{2153}",
1403        b"frac23" => "\u{2154}",
1404        b"frac15" => "\u{2155}",
1405        b"frac25" => "\u{2156}",
1406        b"frac35" => "\u{2157}",
1407        b"frac45" => "\u{2158}",
1408        b"frac16" => "\u{2159}",
1409        b"frac56" => "\u{215A}",
1410        b"frac18" => "\u{215B}",
1411        b"frac38" => "\u{215C}",
1412        b"frac58" => "\u{215D}",
1413        b"frac78" => "\u{215E}",
1414        b"larr" | b"leftarrow" | b"LeftArrow" | b"slarr" | b"ShortLeftArrow" => "\u{2190}",
1415        b"uarr" | b"uparrow" | b"UpArrow" | b"ShortUpArrow" => "\u{2191}",
1416        b"rarr" | b"rightarrow" | b"RightArrow" | b"srarr" | b"ShortRightArrow" => "\u{2192}",
1417        b"darr" | b"downarrow" | b"DownArrow" | b"ShortDownArrow" => "\u{2193}",
1418        b"harr" | b"leftrightarrow" | b"LeftRightArrow" => "\u{2194}",
1419        b"varr" | b"updownarrow" | b"UpDownArrow" => "\u{2195}",
1420        b"nwarr" | b"UpperLeftArrow" | b"nwarrow" => "\u{2196}",
1421        b"nearr" | b"UpperRightArrow" | b"nearrow" => "\u{2197}",
1422        b"searr" | b"searrow" | b"LowerRightArrow" => "\u{2198}",
1423        b"swarr" | b"swarrow" | b"LowerLeftArrow" => "\u{2199}",
1424        b"nlarr" | b"nleftarrow" => "\u{219A}",
1425        b"nrarr" | b"nrightarrow" => "\u{219B}",
1426        b"rarrw" | b"rightsquigarrow" => "\u{219D}",
1427        b"Larr" | b"twoheadleftarrow" => "\u{219E}",
1428        b"Uarr" => "\u{219F}",
1429        b"Rarr" | b"twoheadrightarrow" => "\u{21A0}",
1430        b"Darr" => "\u{21A1}",
1431        b"larrtl" | b"leftarrowtail" => "\u{21A2}",
1432        b"rarrtl" | b"rightarrowtail" => "\u{21A3}",
1433        b"LeftTeeArrow" | b"mapstoleft" => "\u{21A4}",
1434        b"UpTeeArrow" | b"mapstoup" => "\u{21A5}",
1435        b"map" | b"RightTeeArrow" | b"mapsto" => "\u{21A6}",
1436        b"DownTeeArrow" | b"mapstodown" => "\u{21A7}",
1437        b"larrhk" | b"hookleftarrow" => "\u{21A9}",
1438        b"rarrhk" | b"hookrightarrow" => "\u{21AA}",
1439        b"larrlp" | b"looparrowleft" => "\u{21AB}",
1440        b"rarrlp" | b"looparrowright" => "\u{21AC}",
1441        b"harrw" | b"leftrightsquigarrow" => "\u{21AD}",
1442        b"nharr" | b"nleftrightarrow" => "\u{21AE}",
1443        b"lsh" | b"Lsh" => "\u{21B0}",
1444        b"rsh" | b"Rsh" => "\u{21B1}",
1445        b"ldsh" => "\u{21B2}",
1446        b"rdsh" => "\u{21B3}",
1447        b"crarr" => "\u{21B5}",
1448        b"cularr" | b"curvearrowleft" => "\u{21B6}",
1449        b"curarr" | b"curvearrowright" => "\u{21B7}",
1450        b"olarr" | b"circlearrowleft" => "\u{21BA}",
1451        b"orarr" | b"circlearrowright" => "\u{21BB}",
1452        b"lharu" | b"LeftVector" | b"leftharpoonup" => "\u{21BC}",
1453        b"lhard" | b"leftharpoondown" | b"DownLeftVector" => "\u{21BD}",
1454        b"uharr" | b"upharpoonright" | b"RightUpVector" => "\u{21BE}",
1455        b"uharl" | b"upharpoonleft" | b"LeftUpVector" => "\u{21BF}",
1456        b"rharu" | b"RightVector" | b"rightharpoonup" => "\u{21C0}",
1457        b"rhard" | b"rightharpoondown" | b"DownRightVector" => "\u{21C1}",
1458        b"dharr" | b"RightDownVector" | b"downharpoonright" => "\u{21C2}",
1459        b"dharl" | b"LeftDownVector" | b"downharpoonleft" => "\u{21C3}",
1460        b"rlarr" | b"rightleftarrows" | b"RightArrowLeftArrow" => "\u{21C4}",
1461        b"udarr" | b"UpArrowDownArrow" => "\u{21C5}",
1462        b"lrarr" | b"leftrightarrows" | b"LeftArrowRightArrow" => "\u{21C6}",
1463        b"llarr" | b"leftleftarrows" => "\u{21C7}",
1464        b"uuarr" | b"upuparrows" => "\u{21C8}",
1465        b"rrarr" | b"rightrightarrows" => "\u{21C9}",
1466        b"ddarr" | b"downdownarrows" => "\u{21CA}",
1467        b"lrhar" | b"ReverseEquilibrium" | b"leftrightharpoons" => "\u{21CB}",
1468        b"rlhar" | b"rightleftharpoons" | b"Equilibrium" => "\u{21CC}",
1469        b"nlArr" | b"nLeftarrow" => "\u{21CD}",
1470        b"nhArr" | b"nLeftrightarrow" => "\u{21CE}",
1471        b"nrArr" | b"nRightarrow" => "\u{21CF}",
1472        b"lArr" | b"Leftarrow" | b"DoubleLeftArrow" => "\u{21D0}",
1473        b"uArr" | b"Uparrow" | b"DoubleUpArrow" => "\u{21D1}",
1474        b"rArr" | b"Rightarrow" | b"Implies" | b"DoubleRightArrow" => "\u{21D2}",
1475        b"dArr" | b"Downarrow" | b"DoubleDownArrow" => "\u{21D3}",
1476        b"hArr" | b"Leftrightarrow" | b"DoubleLeftRightArrow" | b"iff" => "\u{21D4}",
1477        b"vArr" | b"Updownarrow" | b"DoubleUpDownArrow" => "\u{21D5}",
1478        b"nwArr" => "\u{21D6}",
1479        b"neArr" => "\u{21D7}",
1480        b"seArr" => "\u{21D8}",
1481        b"swArr" => "\u{21D9}",
1482        b"lAarr" | b"Lleftarrow" => "\u{21DA}",
1483        b"rAarr" | b"Rrightarrow" => "\u{21DB}",
1484        b"zigrarr" => "\u{21DD}",
1485        b"larrb" | b"LeftArrowBar" => "\u{21E4}",
1486        b"rarrb" | b"RightArrowBar" => "\u{21E5}",
1487        b"duarr" | b"DownArrowUpArrow" => "\u{21F5}",
1488        b"loarr" => "\u{21FD}",
1489        b"roarr" => "\u{21FE}",
1490        b"hoarr" => "\u{21FF}",
1491        b"forall" | b"ForAll" => "\u{2200}",
1492        b"comp" | b"complement" => "\u{2201}",
1493        b"part" | b"PartialD" => "\u{2202}",
1494        b"exist" | b"Exists" => "\u{2203}",
1495        b"nexist" | b"NotExists" | b"nexists" => "\u{2204}",
1496        b"empty" | b"emptyset" | b"emptyv" | b"varnothing" => "\u{2205}",
1497        b"nabla" | b"Del" => "\u{2207}",
1498        b"isin" | b"isinv" | b"Element" | b"in" => "\u{2208}",
1499        b"notin" | b"NotElement" | b"notinva" => "\u{2209}",
1500        b"niv" | b"ReverseElement" | b"ni" | b"SuchThat" => "\u{220B}",
1501        b"notni" | b"notniva" | b"NotReverseElement" => "\u{220C}",
1502        b"prod" | b"Product" => "\u{220F}",
1503        b"coprod" | b"Coproduct" => "\u{2210}",
1504        b"sum" | b"Sum" => "\u{2211}",
1505        b"minus" => "\u{2212}",
1506        b"mnplus" | b"mp" | b"MinusPlus" => "\u{2213}",
1507        b"plusdo" | b"dotplus" => "\u{2214}",
1508        b"setmn" | b"setminus" | b"Backslash" | b"ssetmn" | b"smallsetminus" => "\u{2216}",
1509        b"lowast" => "\u{2217}",
1510        b"compfn" | b"SmallCircle" => "\u{2218}",
1511        b"radic" | b"Sqrt" => "\u{221A}",
1512        b"prop" | b"propto" | b"Proportional" | b"vprop" | b"varpropto" => "\u{221D}",
1513        b"infin" => "\u{221E}",
1514        b"angrt" => "\u{221F}",
1515        b"ang" | b"angle" => "\u{2220}",
1516        b"angmsd" | b"measuredangle" => "\u{2221}",
1517        b"angsph" => "\u{2222}",
1518        b"mid" | b"VerticalBar" | b"smid" | b"shortmid" => "\u{2223}",
1519        b"nmid" | b"NotVerticalBar" | b"nsmid" | b"nshortmid" => "\u{2224}",
1520        b"par" | b"parallel" | b"DoubleVerticalBar" | b"spar" | b"shortparallel" => "\u{2225}",
1521        b"npar" | b"nparallel" | b"NotDoubleVerticalBar" | b"nspar" | b"nshortparallel" => {
1522            "\u{2226}"
1523        }
1524        b"and" | b"wedge" => "\u{2227}",
1525        b"or" | b"vee" => "\u{2228}",
1526        b"cap" => "\u{2229}",
1527        b"cup" => "\u{222A}",
1528        b"int" | b"Integral" => "\u{222B}",
1529        b"Int" => "\u{222C}",
1530        b"tint" | b"iiint" => "\u{222D}",
1531        b"conint" | b"oint" | b"ContourIntegral" => "\u{222E}",
1532        b"Conint" | b"DoubleContourIntegral" => "\u{222F}",
1533        b"Cconint" => "\u{2230}",
1534        b"cwint" => "\u{2231}",
1535        b"cwconint" | b"ClockwiseContourIntegral" => "\u{2232}",
1536        b"awconint" | b"CounterClockwiseContourIntegral" => "\u{2233}",
1537        b"there4" | b"therefore" | b"Therefore" => "\u{2234}",
1538        b"becaus" | b"because" | b"Because" => "\u{2235}",
1539        b"ratio" => "\u{2236}",
1540        b"Colon" | b"Proportion" => "\u{2237}",
1541        b"minusd" | b"dotminus" => "\u{2238}",
1542        b"mDDot" => "\u{223A}",
1543        b"homtht" => "\u{223B}",
1544        b"sim" | b"Tilde" | b"thksim" | b"thicksim" => "\u{223C}",
1545        b"bsim" | b"backsim" => "\u{223D}",
1546        b"ac" | b"mstpos" => "\u{223E}",
1547        b"acd" => "\u{223F}",
1548        b"wreath" | b"VerticalTilde" | b"wr" => "\u{2240}",
1549        b"nsim" | b"NotTilde" => "\u{2241}",
1550        b"esim" | b"EqualTilde" | b"eqsim" => "\u{2242}",
1551        b"sime" | b"TildeEqual" | b"simeq" => "\u{2243}",
1552        b"nsime" | b"nsimeq" | b"NotTildeEqual" => "\u{2244}",
1553        b"cong" | b"TildeFullEqual" => "\u{2245}",
1554        b"simne" => "\u{2246}",
1555        b"ncong" | b"NotTildeFullEqual" => "\u{2247}",
1556        b"asymp" | b"ap" | b"TildeTilde" | b"approx" | b"thkap" | b"thickapprox" => "\u{2248}",
1557        b"nap" | b"NotTildeTilde" | b"napprox" => "\u{2249}",
1558        b"ape" | b"approxeq" => "\u{224A}",
1559        b"apid" => "\u{224B}",
1560        b"bcong" | b"backcong" => "\u{224C}",
1561        b"asympeq" | b"CupCap" => "\u{224D}",
1562        b"bump" | b"HumpDownHump" | b"Bumpeq" => "\u{224E}",
1563        b"bumpe" | b"HumpEqual" | b"bumpeq" => "\u{224F}",
1564        b"esdot" | b"DotEqual" | b"doteq" => "\u{2250}",
1565        b"eDot" | b"doteqdot" => "\u{2251}",
1566        b"efDot" | b"fallingdotseq" => "\u{2252}",
1567        b"erDot" | b"risingdotseq" => "\u{2253}",
1568        b"colone" | b"coloneq" | b"Assign" => "\u{2254}",
1569        b"ecolon" | b"eqcolon" => "\u{2255}",
1570        b"ecir" | b"eqcirc" => "\u{2256}",
1571        b"cire" | b"circeq" => "\u{2257}",
1572        b"wedgeq" => "\u{2259}",
1573        b"veeeq" => "\u{225A}",
1574        b"trie" | b"triangleq" => "\u{225C}",
1575        b"equest" | b"questeq" => "\u{225F}",
1576        b"ne" | b"NotEqual" => "\u{2260}",
1577        b"equiv" | b"Congruent" => "\u{2261}",
1578        b"nequiv" | b"NotCongruent" => "\u{2262}",
1579        b"le" | b"leq" => "\u{2264}",
1580        b"ge" | b"GreaterEqual" | b"geq" => "\u{2265}",
1581        b"lE" | b"LessFullEqual" | b"leqq" => "\u{2266}",
1582        b"gE" | b"GreaterFullEqual" | b"geqq" => "\u{2267}",
1583        b"lnE" | b"lneqq" => "\u{2268}",
1584        b"gnE" | b"gneqq" => "\u{2269}",
1585        b"Lt" | b"NestedLessLess" | b"ll" => "\u{226A}",
1586        b"Gt" | b"NestedGreaterGreater" | b"gg" => "\u{226B}",
1587        b"twixt" | b"between" => "\u{226C}",
1588        b"NotCupCap" => "\u{226D}",
1589        b"nlt" | b"NotLess" | b"nless" => "\u{226E}",
1590        b"ngt" | b"NotGreater" | b"ngtr" => "\u{226F}",
1591        b"nle" | b"NotLessEqual" | b"nleq" => "\u{2270}",
1592        b"nge" | b"NotGreaterEqual" | b"ngeq" => "\u{2271}",
1593        b"lsim" | b"LessTilde" | b"lesssim" => "\u{2272}",
1594        b"gsim" | b"gtrsim" | b"GreaterTilde" => "\u{2273}",
1595        b"nlsim" | b"NotLessTilde" => "\u{2274}",
1596        b"ngsim" | b"NotGreaterTilde" => "\u{2275}",
1597        b"lg" | b"lessgtr" | b"LessGreater" => "\u{2276}",
1598        b"gl" | b"gtrless" | b"GreaterLess" => "\u{2277}",
1599        b"ntlg" | b"NotLessGreater" => "\u{2278}",
1600        b"ntgl" | b"NotGreaterLess" => "\u{2279}",
1601        b"pr" | b"Precedes" | b"prec" => "\u{227A}",
1602        b"sc" | b"Succeeds" | b"succ" => "\u{227B}",
1603        b"prcue" | b"PrecedesSlantEqual" | b"preccurlyeq" => "\u{227C}",
1604        b"sccue" | b"SucceedsSlantEqual" | b"succcurlyeq" => "\u{227D}",
1605        b"prsim" | b"precsim" | b"PrecedesTilde" => "\u{227E}",
1606        b"scsim" | b"succsim" | b"SucceedsTilde" => "\u{227F}",
1607        b"npr" | b"nprec" | b"NotPrecedes" => "\u{2280}",
1608        b"nsc" | b"nsucc" | b"NotSucceeds" => "\u{2281}",
1609        b"sub" | b"subset" => "\u{2282}",
1610        b"sup" | b"supset" | b"Superset" => "\u{2283}",
1611        b"nsub" => "\u{2284}",
1612        b"nsup" => "\u{2285}",
1613        b"sube" | b"SubsetEqual" | b"subseteq" => "\u{2286}",
1614        b"supe" | b"supseteq" | b"SupersetEqual" => "\u{2287}",
1615        b"nsube" | b"nsubseteq" | b"NotSubsetEqual" => "\u{2288}",
1616        b"nsupe" | b"nsupseteq" | b"NotSupersetEqual" => "\u{2289}",
1617        b"subne" | b"subsetneq" => "\u{228A}",
1618        b"supne" | b"supsetneq" => "\u{228B}",
1619        b"cupdot" => "\u{228D}",
1620        b"uplus" | b"UnionPlus" => "\u{228E}",
1621        b"sqsub" | b"SquareSubset" | b"sqsubset" => "\u{228F}",
1622        b"sqsup" | b"SquareSuperset" | b"sqsupset" => "\u{2290}",
1623        b"sqsube" | b"SquareSubsetEqual" | b"sqsubseteq" => "\u{2291}",
1624        b"sqsupe" | b"SquareSupersetEqual" | b"sqsupseteq" => "\u{2292}",
1625        b"sqcap" | b"SquareIntersection" => "\u{2293}",
1626        b"sqcup" | b"SquareUnion" => "\u{2294}",
1627        b"oplus" | b"CirclePlus" => "\u{2295}",
1628        b"ominus" | b"CircleMinus" => "\u{2296}",
1629        b"otimes" | b"CircleTimes" => "\u{2297}",
1630        b"osol" => "\u{2298}",
1631        b"odot" | b"CircleDot" => "\u{2299}",
1632        b"ocir" | b"circledcirc" => "\u{229A}",
1633        b"oast" | b"circledast" => "\u{229B}",
1634        b"odash" | b"circleddash" => "\u{229D}",
1635        b"plusb" | b"boxplus" => "\u{229E}",
1636        b"minusb" | b"boxminus" => "\u{229F}",
1637        b"timesb" | b"boxtimes" => "\u{22A0}",
1638        b"sdotb" | b"dotsquare" => "\u{22A1}",
1639        b"vdash" | b"RightTee" => "\u{22A2}",
1640        b"dashv" | b"LeftTee" => "\u{22A3}",
1641        b"top" | b"DownTee" => "\u{22A4}",
1642        b"bottom" | b"bot" | b"perp" | b"UpTee" => "\u{22A5}",
1643        b"models" => "\u{22A7}",
1644        b"vDash" | b"DoubleRightTee" => "\u{22A8}",
1645        b"Vdash" => "\u{22A9}",
1646        b"Vvdash" => "\u{22AA}",
1647        b"VDash" => "\u{22AB}",
1648        b"nvdash" => "\u{22AC}",
1649        b"nvDash" => "\u{22AD}",
1650        b"nVdash" => "\u{22AE}",
1651        b"nVDash" => "\u{22AF}",
1652        b"prurel" => "\u{22B0}",
1653        b"vltri" | b"vartriangleleft" | b"LeftTriangle" => "\u{22B2}",
1654        b"vrtri" | b"vartriangleright" | b"RightTriangle" => "\u{22B3}",
1655        b"ltrie" | b"trianglelefteq" | b"LeftTriangleEqual" => "\u{22B4}",
1656        b"rtrie" | b"trianglerighteq" | b"RightTriangleEqual" => "\u{22B5}",
1657        b"origof" => "\u{22B6}",
1658        b"imof" => "\u{22B7}",
1659        b"mumap" | b"multimap" => "\u{22B8}",
1660        b"hercon" => "\u{22B9}",
1661        b"intcal" | b"intercal" => "\u{22BA}",
1662        b"veebar" => "\u{22BB}",
1663        b"barvee" => "\u{22BD}",
1664        b"angrtvb" => "\u{22BE}",
1665        b"lrtri" => "\u{22BF}",
1666        b"xwedge" | b"Wedge" | b"bigwedge" => "\u{22C0}",
1667        b"xvee" | b"Vee" | b"bigvee" => "\u{22C1}",
1668        b"xcap" | b"Intersection" | b"bigcap" => "\u{22C2}",
1669        b"xcup" | b"Union" | b"bigcup" => "\u{22C3}",
1670        b"diam" | b"diamond" | b"Diamond" => "\u{22C4}",
1671        b"sdot" => "\u{22C5}",
1672        b"sstarf" | b"Star" => "\u{22C6}",
1673        b"divonx" | b"divideontimes" => "\u{22C7}",
1674        b"bowtie" => "\u{22C8}",
1675        b"ltimes" => "\u{22C9}",
1676        b"rtimes" => "\u{22CA}",
1677        b"lthree" | b"leftthreetimes" => "\u{22CB}",
1678        b"rthree" | b"rightthreetimes" => "\u{22CC}",
1679        b"bsime" | b"backsimeq" => "\u{22CD}",
1680        b"cuvee" | b"curlyvee" => "\u{22CE}",
1681        b"cuwed" | b"curlywedge" => "\u{22CF}",
1682        b"Sub" | b"Subset" => "\u{22D0}",
1683        b"Sup" | b"Supset" => "\u{22D1}",
1684        b"Cap" => "\u{22D2}",
1685        b"Cup" => "\u{22D3}",
1686        b"fork" | b"pitchfork" => "\u{22D4}",
1687        b"epar" => "\u{22D5}",
1688        b"ltdot" | b"lessdot" => "\u{22D6}",
1689        b"gtdot" | b"gtrdot" => "\u{22D7}",
1690        b"Ll" => "\u{22D8}",
1691        b"Gg" | b"ggg" => "\u{22D9}",
1692        b"leg" | b"LessEqualGreater" | b"lesseqgtr" => "\u{22DA}",
1693        b"gel" | b"gtreqless" | b"GreaterEqualLess" => "\u{22DB}",
1694        b"cuepr" | b"curlyeqprec" => "\u{22DE}",
1695        b"cuesc" | b"curlyeqsucc" => "\u{22DF}",
1696        b"nprcue" | b"NotPrecedesSlantEqual" => "\u{22E0}",
1697        b"nsccue" | b"NotSucceedsSlantEqual" => "\u{22E1}",
1698        b"nsqsube" | b"NotSquareSubsetEqual" => "\u{22E2}",
1699        b"nsqsupe" | b"NotSquareSupersetEqual" => "\u{22E3}",
1700        b"lnsim" => "\u{22E6}",
1701        b"gnsim" => "\u{22E7}",
1702        b"prnsim" | b"precnsim" => "\u{22E8}",
1703        b"scnsim" | b"succnsim" => "\u{22E9}",
1704        b"nltri" | b"ntriangleleft" | b"NotLeftTriangle" => "\u{22EA}",
1705        b"nrtri" | b"ntriangleright" | b"NotRightTriangle" => "\u{22EB}",
1706        b"nltrie" | b"ntrianglelefteq" | b"NotLeftTriangleEqual" => "\u{22EC}",
1707        b"nrtrie" | b"ntrianglerighteq" | b"NotRightTriangleEqual" => "\u{22ED}",
1708        b"vellip" => "\u{22EE}",
1709        b"ctdot" => "\u{22EF}",
1710        b"utdot" => "\u{22F0}",
1711        b"dtdot" => "\u{22F1}",
1712        b"disin" => "\u{22F2}",
1713        b"isinsv" => "\u{22F3}",
1714        b"isins" => "\u{22F4}",
1715        b"isindot" => "\u{22F5}",
1716        b"notinvc" => "\u{22F6}",
1717        b"notinvb" => "\u{22F7}",
1718        b"isinE" => "\u{22F9}",
1719        b"nisd" => "\u{22FA}",
1720        b"xnis" => "\u{22FB}",
1721        b"nis" => "\u{22FC}",
1722        b"notnivc" => "\u{22FD}",
1723        b"notnivb" => "\u{22FE}",
1724        b"barwed" | b"barwedge" => "\u{2305}",
1725        b"Barwed" | b"doublebarwedge" => "\u{2306}",
1726        b"lceil" | b"LeftCeiling" => "\u{2308}",
1727        b"rceil" | b"RightCeiling" => "\u{2309}",
1728        b"lfloor" | b"LeftFloor" => "\u{230A}",
1729        b"rfloor" | b"RightFloor" => "\u{230B}",
1730        b"drcrop" => "\u{230C}",
1731        b"dlcrop" => "\u{230D}",
1732        b"urcrop" => "\u{230E}",
1733        b"ulcrop" => "\u{230F}",
1734        b"bnot" => "\u{2310}",
1735        b"profline" => "\u{2312}",
1736        b"profsurf" => "\u{2313}",
1737        b"telrec" => "\u{2315}",
1738        b"target" => "\u{2316}",
1739        b"ulcorn" | b"ulcorner" => "\u{231C}",
1740        b"urcorn" | b"urcorner" => "\u{231D}",
1741        b"dlcorn" | b"llcorner" => "\u{231E}",
1742        b"drcorn" | b"lrcorner" => "\u{231F}",
1743        b"frown" | b"sfrown" => "\u{2322}",
1744        b"smile" | b"ssmile" => "\u{2323}",
1745        b"cylcty" => "\u{232D}",
1746        b"profalar" => "\u{232E}",
1747        b"topbot" => "\u{2336}",
1748        b"ovbar" => "\u{233D}",
1749        b"solbar" => "\u{233F}",
1750        b"angzarr" => "\u{237C}",
1751        b"lmoust" | b"lmoustache" => "\u{23B0}",
1752        b"rmoust" | b"rmoustache" => "\u{23B1}",
1753        b"tbrk" | b"OverBracket" => "\u{23B4}",
1754        b"bbrk" | b"UnderBracket" => "\u{23B5}",
1755        b"bbrktbrk" => "\u{23B6}",
1756        b"OverParenthesis" => "\u{23DC}",
1757        b"UnderParenthesis" => "\u{23DD}",
1758        b"OverBrace" => "\u{23DE}",
1759        b"UnderBrace" => "\u{23DF}",
1760        b"trpezium" => "\u{23E2}",
1761        b"elinters" => "\u{23E7}",
1762        b"blank" => "\u{2423}",
1763        b"oS" | b"circledS" => "\u{24C8}",
1764        b"boxh" | b"HorizontalLine" => "\u{2500}",
1765        b"boxv" => "\u{2502}",
1766        b"boxdr" => "\u{250C}",
1767        b"boxdl" => "\u{2510}",
1768        b"boxur" => "\u{2514}",
1769        b"boxul" => "\u{2518}",
1770        b"boxvr" => "\u{251C}",
1771        b"boxvl" => "\u{2524}",
1772        b"boxhd" => "\u{252C}",
1773        b"boxhu" => "\u{2534}",
1774        b"boxvh" => "\u{253C}",
1775        b"boxH" => "\u{2550}",
1776        b"boxV" => "\u{2551}",
1777        b"boxdR" => "\u{2552}",
1778        b"boxDr" => "\u{2553}",
1779        b"boxDR" => "\u{2554}",
1780        b"boxdL" => "\u{2555}",
1781        b"boxDl" => "\u{2556}",
1782        b"boxDL" => "\u{2557}",
1783        b"boxuR" => "\u{2558}",
1784        b"boxUr" => "\u{2559}",
1785        b"boxUR" => "\u{255A}",
1786        b"boxuL" => "\u{255B}",
1787        b"boxUl" => "\u{255C}",
1788        b"boxUL" => "\u{255D}",
1789        b"boxvR" => "\u{255E}",
1790        b"boxVr" => "\u{255F}",
1791        b"boxVR" => "\u{2560}",
1792        b"boxvL" => "\u{2561}",
1793        b"boxVl" => "\u{2562}",
1794        b"boxVL" => "\u{2563}",
1795        b"boxHd" => "\u{2564}",
1796        b"boxhD" => "\u{2565}",
1797        b"boxHD" => "\u{2566}",
1798        b"boxHu" => "\u{2567}",
1799        b"boxhU" => "\u{2568}",
1800        b"boxHU" => "\u{2569}",
1801        b"boxvH" => "\u{256A}",
1802        b"boxVh" => "\u{256B}",
1803        b"boxVH" => "\u{256C}",
1804        b"uhblk" => "\u{2580}",
1805        b"lhblk" => "\u{2584}",
1806        b"block" => "\u{2588}",
1807        b"blk14" => "\u{2591}",
1808        b"blk12" => "\u{2592}",
1809        b"blk34" => "\u{2593}",
1810        b"squ" | b"square" | b"Square" => "\u{25A1}",
1811        b"squf" | b"squarf" | b"blacksquare" | b"FilledVerySmallSquare" => "\u{25AA}",
1812        b"EmptyVerySmallSquare" => "\u{25AB}",
1813        b"rect" => "\u{25AD}",
1814        b"marker" => "\u{25AE}",
1815        b"fltns" => "\u{25B1}",
1816        b"xutri" | b"bigtriangleup" => "\u{25B3}",
1817        b"utrif" | b"blacktriangle" => "\u{25B4}",
1818        b"utri" | b"triangle" => "\u{25B5}",
1819        b"rtrif" | b"blacktriangleright" => "\u{25B8}",
1820        b"rtri" | b"triangleright" => "\u{25B9}",
1821        b"xdtri" | b"bigtriangledown" => "\u{25BD}",
1822        b"dtrif" | b"blacktriangledown" => "\u{25BE}",
1823        b"dtri" | b"triangledown" => "\u{25BF}",
1824        b"ltrif" | b"blacktriangleleft" => "\u{25C2}",
1825        b"ltri" | b"triangleleft" => "\u{25C3}",
1826        b"loz" | b"lozenge" => "\u{25CA}",
1827        b"cir" => "\u{25CB}",
1828        b"tridot" => "\u{25EC}",
1829        b"xcirc" | b"bigcirc" => "\u{25EF}",
1830        b"ultri" => "\u{25F8}",
1831        b"urtri" => "\u{25F9}",
1832        b"lltri" => "\u{25FA}",
1833        b"EmptySmallSquare" => "\u{25FB}",
1834        b"FilledSmallSquare" => "\u{25FC}",
1835        b"starf" | b"bigstar" => "\u{2605}",
1836        b"star" => "\u{2606}",
1837        b"phone" => "\u{260E}",
1838        b"female" => "\u{2640}",
1839        b"male" => "\u{2642}",
1840        b"spades" | b"spadesuit" => "\u{2660}",
1841        b"clubs" | b"clubsuit" => "\u{2663}",
1842        b"hearts" | b"heartsuit" => "\u{2665}",
1843        b"diams" | b"diamondsuit" => "\u{2666}",
1844        b"sung" => "\u{266A}",
1845        b"flat" => "\u{266D}",
1846        b"natur" | b"natural" => "\u{266E}",
1847        b"sharp" => "\u{266F}",
1848        b"check" | b"checkmark" => "\u{2713}",
1849        b"cross" => "\u{2717}",
1850        b"malt" | b"maltese" => "\u{2720}",
1851        b"sext" => "\u{2736}",
1852        b"VerticalSeparator" => "\u{2758}",
1853        b"lbbrk" => "\u{2772}",
1854        b"rbbrk" => "\u{2773}",
1855        b"lobrk" | b"LeftDoubleBracket" => "\u{27E6}",
1856        b"robrk" | b"RightDoubleBracket" => "\u{27E7}",
1857        b"lang" | b"LeftAngleBracket" | b"langle" => "\u{27E8}",
1858        b"rang" | b"RightAngleBracket" | b"rangle" => "\u{27E9}",
1859        b"Lang" => "\u{27EA}",
1860        b"Rang" => "\u{27EB}",
1861        b"loang" => "\u{27EC}",
1862        b"roang" => "\u{27ED}",
1863        b"xlarr" | b"longleftarrow" | b"LongLeftArrow" => "\u{27F5}",
1864        b"xrarr" | b"longrightarrow" | b"LongRightArrow" => "\u{27F6}",
1865        b"xharr" | b"longleftrightarrow" | b"LongLeftRightArrow" => "\u{27F7}",
1866        b"xlArr" | b"Longleftarrow" | b"DoubleLongLeftArrow" => "\u{27F8}",
1867        b"xrArr" | b"Longrightarrow" | b"DoubleLongRightArrow" => "\u{27F9}",
1868        b"xhArr" | b"Longleftrightarrow" | b"DoubleLongLeftRightArrow" => "\u{27FA}",
1869        b"xmap" | b"longmapsto" => "\u{27FC}",
1870        b"dzigrarr" => "\u{27FF}",
1871        b"nvlArr" => "\u{2902}",
1872        b"nvrArr" => "\u{2903}",
1873        b"nvHarr" => "\u{2904}",
1874        b"Map" => "\u{2905}",
1875        b"lbarr" => "\u{290C}",
1876        b"rbarr" | b"bkarow" => "\u{290D}",
1877        b"lBarr" => "\u{290E}",
1878        b"rBarr" | b"dbkarow" => "\u{290F}",
1879        b"RBarr" | b"drbkarow" => "\u{2910}",
1880        b"DDotrahd" => "\u{2911}",
1881        b"UpArrowBar" => "\u{2912}",
1882        b"DownArrowBar" => "\u{2913}",
1883        b"Rarrtl" => "\u{2916}",
1884        b"latail" => "\u{2919}",
1885        b"ratail" => "\u{291A}",
1886        b"lAtail" => "\u{291B}",
1887        b"rAtail" => "\u{291C}",
1888        b"larrfs" => "\u{291D}",
1889        b"rarrfs" => "\u{291E}",
1890        b"larrbfs" => "\u{291F}",
1891        b"rarrbfs" => "\u{2920}",
1892        b"nwarhk" => "\u{2923}",
1893        b"nearhk" => "\u{2924}",
1894        b"searhk" | b"hksearow" => "\u{2925}",
1895        b"swarhk" | b"hkswarow" => "\u{2926}",
1896        b"nwnear" => "\u{2927}",
1897        b"nesear" | b"toea" => "\u{2928}",
1898        b"seswar" | b"tosa" => "\u{2929}",
1899        b"swnwar" => "\u{292A}",
1900        b"rarrc" => "\u{2933}",
1901        b"cudarrr" => "\u{2935}",
1902        b"ldca" => "\u{2936}",
1903        b"rdca" => "\u{2937}",
1904        b"cudarrl" => "\u{2938}",
1905        b"larrpl" => "\u{2939}",
1906        b"curarrm" => "\u{293C}",
1907        b"cularrp" => "\u{293D}",
1908        b"rarrpl" => "\u{2945}",
1909        b"harrcir" => "\u{2948}",
1910        b"Uarrocir" => "\u{2949}",
1911        b"lurdshar" => "\u{294A}",
1912        b"ldrushar" => "\u{294B}",
1913        b"LeftRightVector" => "\u{294E}",
1914        b"RightUpDownVector" => "\u{294F}",
1915        b"DownLeftRightVector" => "\u{2950}",
1916        b"LeftUpDownVector" => "\u{2951}",
1917        b"LeftVectorBar" => "\u{2952}",
1918        b"RightVectorBar" => "\u{2953}",
1919        b"RightUpVectorBar" => "\u{2954}",
1920        b"RightDownVectorBar" => "\u{2955}",
1921        b"DownLeftVectorBar" => "\u{2956}",
1922        b"DownRightVectorBar" => "\u{2957}",
1923        b"LeftUpVectorBar" => "\u{2958}",
1924        b"LeftDownVectorBar" => "\u{2959}",
1925        b"LeftTeeVector" => "\u{295A}",
1926        b"RightTeeVector" => "\u{295B}",
1927        b"RightUpTeeVector" => "\u{295C}",
1928        b"RightDownTeeVector" => "\u{295D}",
1929        b"DownLeftTeeVector" => "\u{295E}",
1930        b"DownRightTeeVector" => "\u{295F}",
1931        b"LeftUpTeeVector" => "\u{2960}",
1932        b"LeftDownTeeVector" => "\u{2961}",
1933        b"lHar" => "\u{2962}",
1934        b"uHar" => "\u{2963}",
1935        b"rHar" => "\u{2964}",
1936        b"dHar" => "\u{2965}",
1937        b"luruhar" => "\u{2966}",
1938        b"ldrdhar" => "\u{2967}",
1939        b"ruluhar" => "\u{2968}",
1940        b"rdldhar" => "\u{2969}",
1941        b"lharul" => "\u{296A}",
1942        b"llhard" => "\u{296B}",
1943        b"rharul" => "\u{296C}",
1944        b"lrhard" => "\u{296D}",
1945        b"udhar" | b"UpEquilibrium" => "\u{296E}",
1946        b"duhar" | b"ReverseUpEquilibrium" => "\u{296F}",
1947        b"RoundImplies" => "\u{2970}",
1948        b"erarr" => "\u{2971}",
1949        b"simrarr" => "\u{2972}",
1950        b"larrsim" => "\u{2973}",
1951        b"rarrsim" => "\u{2974}",
1952        b"rarrap" => "\u{2975}",
1953        b"ltlarr" => "\u{2976}",
1954        b"gtrarr" => "\u{2978}",
1955        b"subrarr" => "\u{2979}",
1956        b"suplarr" => "\u{297B}",
1957        b"lfisht" => "\u{297C}",
1958        b"rfisht" => "\u{297D}",
1959        b"ufisht" => "\u{297E}",
1960        b"dfisht" => "\u{297F}",
1961        b"lopar" => "\u{2985}",
1962        b"ropar" => "\u{2986}",
1963        b"lbrke" => "\u{298B}",
1964        b"rbrke" => "\u{298C}",
1965        b"lbrkslu" => "\u{298D}",
1966        b"rbrksld" => "\u{298E}",
1967        b"lbrksld" => "\u{298F}",
1968        b"rbrkslu" => "\u{2990}",
1969        b"langd" => "\u{2991}",
1970        b"rangd" => "\u{2992}",
1971        b"lparlt" => "\u{2993}",
1972        b"rpargt" => "\u{2994}",
1973        b"gtlPar" => "\u{2995}",
1974        b"ltrPar" => "\u{2996}",
1975        b"vzigzag" => "\u{299A}",
1976        b"vangrt" => "\u{299C}",
1977        b"angrtvbd" => "\u{299D}",
1978        b"ange" => "\u{29A4}",
1979        b"range" => "\u{29A5}",
1980        b"dwangle" => "\u{29A6}",
1981        b"uwangle" => "\u{29A7}",
1982        b"angmsdaa" => "\u{29A8}",
1983        b"angmsdab" => "\u{29A9}",
1984        b"angmsdac" => "\u{29AA}",
1985        b"angmsdad" => "\u{29AB}",
1986        b"angmsdae" => "\u{29AC}",
1987        b"angmsdaf" => "\u{29AD}",
1988        b"angmsdag" => "\u{29AE}",
1989        b"angmsdah" => "\u{29AF}",
1990        b"bemptyv" => "\u{29B0}",
1991        b"demptyv" => "\u{29B1}",
1992        b"cemptyv" => "\u{29B2}",
1993        b"raemptyv" => "\u{29B3}",
1994        b"laemptyv" => "\u{29B4}",
1995        b"ohbar" => "\u{29B5}",
1996        b"omid" => "\u{29B6}",
1997        b"opar" => "\u{29B7}",
1998        b"operp" => "\u{29B9}",
1999        b"olcross" => "\u{29BB}",
2000        b"odsold" => "\u{29BC}",
2001        b"olcir" => "\u{29BE}",
2002        b"ofcir" => "\u{29BF}",
2003        b"olt" => "\u{29C0}",
2004        b"ogt" => "\u{29C1}",
2005        b"cirscir" => "\u{29C2}",
2006        b"cirE" => "\u{29C3}",
2007        b"solb" => "\u{29C4}",
2008        b"bsolb" => "\u{29C5}",
2009        b"boxbox" => "\u{29C9}",
2010        b"trisb" => "\u{29CD}",
2011        b"rtriltri" => "\u{29CE}",
2012        b"LeftTriangleBar" => "\u{29CF}",
2013        b"RightTriangleBar" => "\u{29D0}",
2014        b"race" => "\u{29DA}",
2015        b"iinfin" => "\u{29DC}",
2016        b"infintie" => "\u{29DD}",
2017        b"nvinfin" => "\u{29DE}",
2018        b"eparsl" => "\u{29E3}",
2019        b"smeparsl" => "\u{29E4}",
2020        b"eqvparsl" => "\u{29E5}",
2021        b"lozf" | b"blacklozenge" => "\u{29EB}",
2022        b"RuleDelayed" => "\u{29F4}",
2023        b"dsol" => "\u{29F6}",
2024        b"xodot" | b"bigodot" => "\u{2A00}",
2025        b"xoplus" | b"bigoplus" => "\u{2A01}",
2026        b"xotime" | b"bigotimes" => "\u{2A02}",
2027        b"xuplus" | b"biguplus" => "\u{2A04}",
2028        b"xsqcup" | b"bigsqcup" => "\u{2A06}",
2029        b"qint" | b"iiiint" => "\u{2A0C}",
2030        b"fpartint" => "\u{2A0D}",
2031        b"cirfnint" => "\u{2A10}",
2032        b"awint" => "\u{2A11}",
2033        b"rppolint" => "\u{2A12}",
2034        b"scpolint" => "\u{2A13}",
2035        b"npolint" => "\u{2A14}",
2036        b"pointint" => "\u{2A15}",
2037        b"quatint" => "\u{2A16}",
2038        b"intlarhk" => "\u{2A17}",
2039        b"pluscir" => "\u{2A22}",
2040        b"plusacir" => "\u{2A23}",
2041        b"simplus" => "\u{2A24}",
2042        b"plusdu" => "\u{2A25}",
2043        b"plussim" => "\u{2A26}",
2044        b"plustwo" => "\u{2A27}",
2045        b"mcomma" => "\u{2A29}",
2046        b"minusdu" => "\u{2A2A}",
2047        b"loplus" => "\u{2A2D}",
2048        b"roplus" => "\u{2A2E}",
2049        b"Cross" => "\u{2A2F}",
2050        b"timesd" => "\u{2A30}",
2051        b"timesbar" => "\u{2A31}",
2052        b"smashp" => "\u{2A33}",
2053        b"lotimes" => "\u{2A34}",
2054        b"rotimes" => "\u{2A35}",
2055        b"otimesas" => "\u{2A36}",
2056        b"Otimes" => "\u{2A37}",
2057        b"odiv" => "\u{2A38}",
2058        b"triplus" => "\u{2A39}",
2059        b"triminus" => "\u{2A3A}",
2060        b"tritime" => "\u{2A3B}",
2061        b"iprod" | b"intprod" => "\u{2A3C}",
2062        b"amalg" => "\u{2A3F}",
2063        b"capdot" => "\u{2A40}",
2064        b"ncup" => "\u{2A42}",
2065        b"ncap" => "\u{2A43}",
2066        b"capand" => "\u{2A44}",
2067        b"cupor" => "\u{2A45}",
2068        b"cupcap" => "\u{2A46}",
2069        b"capcup" => "\u{2A47}",
2070        b"cupbrcap" => "\u{2A48}",
2071        b"capbrcup" => "\u{2A49}",
2072        b"cupcup" => "\u{2A4A}",
2073        b"capcap" => "\u{2A4B}",
2074        b"ccups" => "\u{2A4C}",
2075        b"ccaps" => "\u{2A4D}",
2076        b"ccupssm" => "\u{2A50}",
2077        b"And" => "\u{2A53}",
2078        b"Or" => "\u{2A54}",
2079        b"andand" => "\u{2A55}",
2080        b"oror" => "\u{2A56}",
2081        b"orslope" => "\u{2A57}",
2082        b"andslope" => "\u{2A58}",
2083        b"andv" => "\u{2A5A}",
2084        b"orv" => "\u{2A5B}",
2085        b"andd" => "\u{2A5C}",
2086        b"ord" => "\u{2A5D}",
2087        b"wedbar" => "\u{2A5F}",
2088        b"sdote" => "\u{2A66}",
2089        b"simdot" => "\u{2A6A}",
2090        b"congdot" => "\u{2A6D}",
2091        b"easter" => "\u{2A6E}",
2092        b"apacir" => "\u{2A6F}",
2093        b"apE" => "\u{2A70}",
2094        b"eplus" => "\u{2A71}",
2095        b"pluse" => "\u{2A72}",
2096        b"Esim" => "\u{2A73}",
2097        b"Colone" => "\u{2A74}",
2098        b"Equal" => "\u{2A75}",
2099        b"eDDot" | b"ddotseq" => "\u{2A77}",
2100        b"equivDD" => "\u{2A78}",
2101        b"ltcir" => "\u{2A79}",
2102        b"gtcir" => "\u{2A7A}",
2103        b"ltquest" => "\u{2A7B}",
2104        b"gtquest" => "\u{2A7C}",
2105        b"les" | b"LessSlantEqual" | b"leqslant" => "\u{2A7D}",
2106        b"ges" | b"GreaterSlantEqual" | b"geqslant" => "\u{2A7E}",
2107        b"lesdot" => "\u{2A7F}",
2108        b"gesdot" => "\u{2A80}",
2109        b"lesdoto" => "\u{2A81}",
2110        b"gesdoto" => "\u{2A82}",
2111        b"lesdotor" => "\u{2A83}",
2112        b"gesdotol" => "\u{2A84}",
2113        b"lap" | b"lessapprox" => "\u{2A85}",
2114        b"gap" | b"gtrapprox" => "\u{2A86}",
2115        b"lne" | b"lneq" => "\u{2A87}",
2116        b"gne" | b"gneq" => "\u{2A88}",
2117        b"lnap" | b"lnapprox" => "\u{2A89}",
2118        b"gnap" | b"gnapprox" => "\u{2A8A}",
2119        b"lEg" | b"lesseqqgtr" => "\u{2A8B}",
2120        b"gEl" | b"gtreqqless" => "\u{2A8C}",
2121        b"lsime" => "\u{2A8D}",
2122        b"gsime" => "\u{2A8E}",
2123        b"lsimg" => "\u{2A8F}",
2124        b"gsiml" => "\u{2A90}",
2125        b"lgE" => "\u{2A91}",
2126        b"glE" => "\u{2A92}",
2127        b"lesges" => "\u{2A93}",
2128        b"gesles" => "\u{2A94}",
2129        b"els" | b"eqslantless" => "\u{2A95}",
2130        b"egs" | b"eqslantgtr" => "\u{2A96}",
2131        b"elsdot" => "\u{2A97}",
2132        b"egsdot" => "\u{2A98}",
2133        b"el" => "\u{2A99}",
2134        b"eg" => "\u{2A9A}",
2135        b"siml" => "\u{2A9D}",
2136        b"simg" => "\u{2A9E}",
2137        b"simlE" => "\u{2A9F}",
2138        b"simgE" => "\u{2AA0}",
2139        b"LessLess" => "\u{2AA1}",
2140        b"GreaterGreater" => "\u{2AA2}",
2141        b"glj" => "\u{2AA4}",
2142        b"gla" => "\u{2AA5}",
2143        b"ltcc" => "\u{2AA6}",
2144        b"gtcc" => "\u{2AA7}",
2145        b"lescc" => "\u{2AA8}",
2146        b"gescc" => "\u{2AA9}",
2147        b"smt" => "\u{2AAA}",
2148        b"lat" => "\u{2AAB}",
2149        b"smte" => "\u{2AAC}",
2150        b"late" => "\u{2AAD}",
2151        b"bumpE" => "\u{2AAE}",
2152        b"pre" | b"preceq" | b"PrecedesEqual" => "\u{2AAF}",
2153        b"sce" | b"succeq" | b"SucceedsEqual" => "\u{2AB0}",
2154        b"prE" => "\u{2AB3}",
2155        b"scE" => "\u{2AB4}",
2156        b"prnE" | b"precneqq" => "\u{2AB5}",
2157        b"scnE" | b"succneqq" => "\u{2AB6}",
2158        b"prap" | b"precapprox" => "\u{2AB7}",
2159        b"scap" | b"succapprox" => "\u{2AB8}",
2160        b"prnap" | b"precnapprox" => "\u{2AB9}",
2161        b"scnap" | b"succnapprox" => "\u{2ABA}",
2162        b"Pr" => "\u{2ABB}",
2163        b"Sc" => "\u{2ABC}",
2164        b"subdot" => "\u{2ABD}",
2165        b"supdot" => "\u{2ABE}",
2166        b"subplus" => "\u{2ABF}",
2167        b"supplus" => "\u{2AC0}",
2168        b"submult" => "\u{2AC1}",
2169        b"supmult" => "\u{2AC2}",
2170        b"subedot" => "\u{2AC3}",
2171        b"supedot" => "\u{2AC4}",
2172        b"subE" | b"subseteqq" => "\u{2AC5}",
2173        b"supE" | b"supseteqq" => "\u{2AC6}",
2174        b"subsim" => "\u{2AC7}",
2175        b"supsim" => "\u{2AC8}",
2176        b"subnE" | b"subsetneqq" => "\u{2ACB}",
2177        b"supnE" | b"supsetneqq" => "\u{2ACC}",
2178        b"csub" => "\u{2ACF}",
2179        b"csup" => "\u{2AD0}",
2180        b"csube" => "\u{2AD1}",
2181        b"csupe" => "\u{2AD2}",
2182        b"subsup" => "\u{2AD3}",
2183        b"supsub" => "\u{2AD4}",
2184        b"subsub" => "\u{2AD5}",
2185        b"supsup" => "\u{2AD6}",
2186        b"suphsub" => "\u{2AD7}",
2187        b"supdsub" => "\u{2AD8}",
2188        b"forkv" => "\u{2AD9}",
2189        b"topfork" => "\u{2ADA}",
2190        b"mlcp" => "\u{2ADB}",
2191        b"Dashv" | b"DoubleLeftTee" => "\u{2AE4}",
2192        b"Vdashl" => "\u{2AE6}",
2193        b"Barv" => "\u{2AE7}",
2194        b"vBar" => "\u{2AE8}",
2195        b"vBarv" => "\u{2AE9}",
2196        b"Vbar" => "\u{2AEB}",
2197        b"Not" => "\u{2AEC}",
2198        b"bNot" => "\u{2AED}",
2199        b"rnmid" => "\u{2AEE}",
2200        b"cirmid" => "\u{2AEF}",
2201        b"midcir" => "\u{2AF0}",
2202        b"topcir" => "\u{2AF1}",
2203        b"nhpar" => "\u{2AF2}",
2204        b"parsim" => "\u{2AF3}",
2205        b"parsl" => "\u{2AFD}",
2206        b"fflig" => "\u{FB00}",
2207        b"filig" => "\u{FB01}",
2208        b"fllig" => "\u{FB02}",
2209        b"ffilig" => "\u{FB03}",
2210        b"ffllig" => "\u{FB04}",
2211        b"Ascr" => "\u{1D49}",
2212        b"Cscr" => "\u{1D49}",
2213        b"Dscr" => "\u{1D49}",
2214        b"Gscr" => "\u{1D4A}",
2215        b"Jscr" => "\u{1D4A}",
2216        b"Kscr" => "\u{1D4A}",
2217        b"Nscr" => "\u{1D4A}",
2218        b"Oscr" => "\u{1D4A}",
2219        b"Pscr" => "\u{1D4A}",
2220        b"Qscr" => "\u{1D4A}",
2221        b"Sscr" => "\u{1D4A}",
2222        b"Tscr" => "\u{1D4A}",
2223        b"Uscr" => "\u{1D4B}",
2224        b"Vscr" => "\u{1D4B}",
2225        b"Wscr" => "\u{1D4B}",
2226        b"Xscr" => "\u{1D4B}",
2227        b"Yscr" => "\u{1D4B}",
2228        b"Zscr" => "\u{1D4B}",
2229        b"ascr" => "\u{1D4B}",
2230        b"bscr" => "\u{1D4B}",
2231        b"cscr" => "\u{1D4B}",
2232        b"dscr" => "\u{1D4B}",
2233        b"fscr" => "\u{1D4B}",
2234        b"hscr" => "\u{1D4B}",
2235        b"iscr" => "\u{1D4B}",
2236        b"jscr" => "\u{1D4B}",
2237        b"kscr" => "\u{1D4C}",
2238        b"lscr" => "\u{1D4C}",
2239        b"mscr" => "\u{1D4C}",
2240        b"nscr" => "\u{1D4C}",
2241        b"pscr" => "\u{1D4C}",
2242        b"qscr" => "\u{1D4C}",
2243        b"rscr" => "\u{1D4C}",
2244        b"sscr" => "\u{1D4C}",
2245        b"tscr" => "\u{1D4C}",
2246        b"uscr" => "\u{1D4C}",
2247        b"vscr" => "\u{1D4C}",
2248        b"wscr" => "\u{1D4C}",
2249        b"xscr" => "\u{1D4C}",
2250        b"yscr" => "\u{1D4C}",
2251        b"zscr" => "\u{1D4C}",
2252        b"Afr" => "\u{1D50}",
2253        b"Bfr" => "\u{1D50}",
2254        b"Dfr" => "\u{1D50}",
2255        b"Efr" => "\u{1D50}",
2256        b"Ffr" => "\u{1D50}",
2257        b"Gfr" => "\u{1D50}",
2258        b"Jfr" => "\u{1D50}",
2259        b"Kfr" => "\u{1D50}",
2260        b"Lfr" => "\u{1D50}",
2261        b"Mfr" => "\u{1D51}",
2262        b"Nfr" => "\u{1D51}",
2263        b"Ofr" => "\u{1D51}",
2264        b"Pfr" => "\u{1D51}",
2265        b"Qfr" => "\u{1D51}",
2266        b"Sfr" => "\u{1D51}",
2267        b"Tfr" => "\u{1D51}",
2268        b"Ufr" => "\u{1D51}",
2269        b"Vfr" => "\u{1D51}",
2270        b"Wfr" => "\u{1D51}",
2271        b"Xfr" => "\u{1D51}",
2272        b"Yfr" => "\u{1D51}",
2273        b"afr" => "\u{1D51}",
2274        b"bfr" => "\u{1D51}",
2275        b"cfr" => "\u{1D52}",
2276        b"dfr" => "\u{1D52}",
2277        b"efr" => "\u{1D52}",
2278        b"ffr" => "\u{1D52}",
2279        b"gfr" => "\u{1D52}",
2280        b"hfr" => "\u{1D52}",
2281        b"ifr" => "\u{1D52}",
2282        b"jfr" => "\u{1D52}",
2283        b"kfr" => "\u{1D52}",
2284        b"lfr" => "\u{1D52}",
2285        b"mfr" => "\u{1D52}",
2286        b"nfr" => "\u{1D52}",
2287        b"ofr" => "\u{1D52}",
2288        b"pfr" => "\u{1D52}",
2289        b"qfr" => "\u{1D52}",
2290        b"rfr" => "\u{1D52}",
2291        b"sfr" => "\u{1D53}",
2292        b"tfr" => "\u{1D53}",
2293        b"ufr" => "\u{1D53}",
2294        b"vfr" => "\u{1D53}",
2295        b"wfr" => "\u{1D53}",
2296        b"xfr" => "\u{1D53}",
2297        b"yfr" => "\u{1D53}",
2298        b"zfr" => "\u{1D53}",
2299        b"Aopf" => "\u{1D53}",
2300        b"Bopf" => "\u{1D53}",
2301        b"Dopf" => "\u{1D53}",
2302        b"Eopf" => "\u{1D53}",
2303        b"Fopf" => "\u{1D53}",
2304        b"Gopf" => "\u{1D53}",
2305        b"Iopf" => "\u{1D54}",
2306        b"Jopf" => "\u{1D54}",
2307        b"Kopf" => "\u{1D54}",
2308        b"Lopf" => "\u{1D54}",
2309        b"Mopf" => "\u{1D54}",
2310        b"Oopf" => "\u{1D54}",
2311        b"Sopf" => "\u{1D54}",
2312        b"Topf" => "\u{1D54}",
2313        b"Uopf" => "\u{1D54}",
2314        b"Vopf" => "\u{1D54}",
2315        b"Wopf" => "\u{1D54}",
2316        b"Xopf" => "\u{1D54}",
2317        b"Yopf" => "\u{1D55}",
2318        b"aopf" => "\u{1D55}",
2319        b"bopf" => "\u{1D55}",
2320        b"copf" => "\u{1D55}",
2321        b"dopf" => "\u{1D55}",
2322        b"eopf" => "\u{1D55}",
2323        b"fopf" => "\u{1D55}",
2324        b"gopf" => "\u{1D55}",
2325        b"hopf" => "\u{1D55}",
2326        b"iopf" => "\u{1D55}",
2327        b"jopf" => "\u{1D55}",
2328        b"kopf" => "\u{1D55}",
2329        b"lopf" => "\u{1D55}",
2330        b"mopf" => "\u{1D55}",
2331        b"nopf" => "\u{1D55}",
2332        b"oopf" => "\u{1D56}",
2333        b"popf" => "\u{1D56}",
2334        b"qopf" => "\u{1D56}",
2335        b"ropf" => "\u{1D56}",
2336        b"sopf" => "\u{1D56}",
2337        b"topf" => "\u{1D56}",
2338        b"uopf" => "\u{1D56}",
2339        b"vopf" => "\u{1D56}",
2340        b"wopf" => "\u{1D56}",
2341        b"xopf" => "\u{1D56}",
2342        b"yopf" => "\u{1D56}",
2343        b"zopf" => "\u{1D56}",
2344        _ => return None,
2345    };
2346    Some(s)
2347}
2348
2349pub(crate) fn parse_number(num: &str) -> Result<char, ParseCharRefError> {
2350    let code = if let Some(hex) = num.strip_prefix('x') {
2351        from_str_radix(hex, 16)?
2352    } else {
2353        from_str_radix(num, 10)?
2354    };
2355    if code == 0 {
2356        return Err(ParseCharRefError::IllegalCharacter(code));
2357    }
2358    match std::char::from_u32(code) {
2359        Some(c) => Ok(c),
2360        None => Err(ParseCharRefError::InvalidCodepoint(code)),
2361    }
2362}
2363
2364#[inline]
2365fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
2366    match src.as_bytes().first().copied() {
2367        // We should not allow sign numbers, but u32::from_str_radix will accept `+`.
2368        // We also handle `-` to be consistent in returned errors
2369        Some(b'+') | Some(b'-') => Err(ParseCharRefError::UnexpectedSign),
2370        _ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
2371    }
2372}
2373
2374////////////////////////////////////////////////////////////////////////////////////////////////////
2375
2376#[cfg(test)]
2377mod normalization {
2378    use super::*;
2379
2380    mod eol {
2381        use super::*;
2382
2383        mod xml11 {
2384            use super::*;
2385            use pretty_assertions::assert_eq;
2386
2387            #[test]
2388            fn empty() {
2389                assert_eq!(normalize_xml11_eols(""), "");
2390            }
2391
2392            #[test]
2393            fn already_normalized() {
2394                assert_eq!(
2395                    normalize_xml11_eols("\nalready \n\n normalized\n"),
2396                    "\nalready \n\n normalized\n",
2397                );
2398            }
2399
2400            #[test]
2401            fn cr_lf() {
2402                assert_eq!(
2403                    normalize_xml11_eols("\r\nsome\r\n\r\ntext"),
2404                    "\nsome\n\ntext"
2405                );
2406            }
2407
2408            #[test]
2409            fn cr_u0085() {
2410                assert_eq!(
2411                    normalize_xml11_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
2412                    "\nsome\n\ntext",
2413                );
2414            }
2415
2416            #[test]
2417            fn u0085() {
2418                assert_eq!(
2419                    normalize_xml11_eols("\u{0085}some\u{0085}\u{0085}text"),
2420                    "\nsome\n\ntext",
2421                );
2422            }
2423
2424            #[test]
2425            fn u2028() {
2426                assert_eq!(
2427                    normalize_xml11_eols("\u{2028}some\u{2028}\u{2028}text"),
2428                    "\nsome\n\ntext",
2429                );
2430            }
2431
2432            #[test]
2433            fn mixed() {
2434                assert_eq!(
2435                    normalize_xml11_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
2436                    "\n\n\n\n\n\nsome\n\n\ntext",
2437                );
2438            }
2439
2440            #[test]
2441            fn utf8_0xc2() {
2442                // All possible characters encoded in 2 bytes in UTF-8 which first byte is 0xC2 (0b11000010)
2443                // Second byte follows the pattern 10xxxxxx
2444                let first = std::str::from_utf8(&[0b11000010, 0b10000000])
2445                    .unwrap()
2446                    .chars()
2447                    .next()
2448                    .unwrap();
2449                let last = std::str::from_utf8(&[0b11000010, 0b10111111])
2450                    .unwrap()
2451                    .chars()
2452                    .next()
2453                    .unwrap();
2454                let mut utf8 = [0; 2];
2455                for ch in first..=last {
2456                    ch.encode_utf8(&mut utf8);
2457                    let description = format!("UTF-8 [{:02x} {:02x}] = `{}`", utf8[0], utf8[1], ch);
2458                    let input = std::str::from_utf8(&utf8).expect(&description);
2459
2460                    dbg!((input, &description));
2461                    if ch == '\u{0085}' {
2462                        assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
2463                    } else {
2464                        assert_eq!(normalize_xml11_eols(input), input, "{}", description);
2465                    }
2466                }
2467                assert_eq!((first..=last).count(), 64);
2468            }
2469
2470            #[test]
2471            fn utf8_0x0d_0xc2() {
2472                // All possible characters encoded in 2 bytes in UTF-8 which first byte is 0xC2 (0b11000010)
2473                // Second byte follows the pattern 10xxxxxx
2474                let first = std::str::from_utf8(&[0b11000010, 0b10000000])
2475                    .unwrap()
2476                    .chars()
2477                    .next()
2478                    .unwrap();
2479                let last = std::str::from_utf8(&[0b11000010, 0b10111111])
2480                    .unwrap()
2481                    .chars()
2482                    .next()
2483                    .unwrap();
2484                let mut utf8 = [b'\r', 0, 0];
2485                for ch in first..=last {
2486                    ch.encode_utf8(&mut utf8[1..]);
2487                    let description = format!(
2488                        "UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
2489                        utf8[0], utf8[1], utf8[2], ch
2490                    );
2491                    let input = std::str::from_utf8(&utf8).expect(&description);
2492
2493                    dbg!((input, &description));
2494                    if ch == '\u{0085}' {
2495                        assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
2496                    } else {
2497                        // utf8 is copied, because [u8; 3] implements Copy
2498                        let mut expected = utf8;
2499                        expected[0] = b'\n';
2500                        let expected = std::str::from_utf8(&expected).expect(&description);
2501                        assert_eq!(normalize_xml11_eols(input), expected, "{}", description);
2502                    }
2503                }
2504                assert_eq!((first..=last).count(), 64);
2505            }
2506
2507            #[test]
2508            fn utf8_0xe2() {
2509                // All possible characters encoded in 3 bytes in UTF-8 which first byte is 0xE2 (0b11100010)
2510                // Second and third bytes follows the pattern 10xxxxxx
2511                let first = std::str::from_utf8(&[0b11100010, 0b10000000, 0b10000000])
2512                    .unwrap()
2513                    .chars()
2514                    .next()
2515                    .unwrap();
2516                let last = std::str::from_utf8(&[0b11100010, 0b10111111, 0b10111111])
2517                    .unwrap()
2518                    .chars()
2519                    .next()
2520                    .unwrap();
2521                let mut buf = [0; 3];
2522                for ch in first..=last {
2523                    let input = &*ch.encode_utf8(&mut buf);
2524                    let buf = input.as_bytes();
2525                    let description = format!(
2526                        "UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
2527                        buf[0], buf[1], buf[2], ch
2528                    );
2529
2530                    dbg!((input, &description));
2531                    if ch == '\u{2028}' {
2532                        assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
2533                    } else {
2534                        assert_eq!(normalize_xml11_eols(input), input, "{}", description);
2535                    }
2536                }
2537                assert_eq!((first..=last).count(), 4096);
2538            }
2539        }
2540
2541        mod xml10 {
2542            use super::*;
2543            use pretty_assertions::assert_eq;
2544
2545            #[test]
2546            fn empty() {
2547                assert_eq!(normalize_xml10_eols(""), "");
2548            }
2549
2550            #[test]
2551            fn already_normalized() {
2552                assert_eq!(
2553                    normalize_xml10_eols("\nalready \n\n normalized\n"),
2554                    "\nalready \n\n normalized\n",
2555                );
2556            }
2557
2558            #[test]
2559            fn cr_lf() {
2560                assert_eq!(
2561                    normalize_xml10_eols("\r\nsome\r\n\r\ntext"),
2562                    "\nsome\n\ntext"
2563                );
2564            }
2565
2566            #[test]
2567            fn cr_u0085() {
2568                assert_eq!(
2569                    normalize_xml10_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
2570                    "\n\u{0085}some\n\u{0085}\n\u{0085}text",
2571                );
2572            }
2573
2574            #[test]
2575            fn u0085() {
2576                assert_eq!(
2577                    normalize_xml10_eols("\u{0085}some\u{0085}\u{0085}text"),
2578                    "\u{0085}some\u{0085}\u{0085}text",
2579                );
2580            }
2581
2582            #[test]
2583            fn u2028() {
2584                assert_eq!(
2585                    normalize_xml10_eols("\u{2028}some\u{2028}\u{2028}text"),
2586                    "\u{2028}some\u{2028}\u{2028}text",
2587                );
2588            }
2589
2590            #[test]
2591            fn mixed() {
2592                assert_eq!(
2593                    normalize_xml10_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
2594                    "\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text",
2595                );
2596            }
2597        }
2598    }
2599
2600    mod attribute {
2601        use super::*;
2602        use pretty_assertions::assert_eq;
2603
2604        #[test]
2605        fn empty() {
2606            assert_eq!(
2607                normalize_xml10_attribute_value("", 5, |_| { None }),
2608                Ok("".into())
2609            );
2610            assert_eq!(
2611                normalize_xml11_attribute_value("", 5, |_| { None }),
2612                Ok("".into())
2613            );
2614        }
2615
2616        #[test]
2617        fn already_normalized() {
2618            assert_eq!(
2619                normalize_xml10_attribute_value("already normalized", 5, |_| { None }),
2620                Ok("already normalized".into())
2621            );
2622            assert_eq!(
2623                normalize_xml11_attribute_value("already normalized", 5, |_| { None }),
2624                Ok("already normalized".into())
2625            );
2626        }
2627
2628        #[test]
2629        fn only_spaces() {
2630            assert_eq!(
2631                normalize_xml10_attribute_value("   ", 5, |_| { None }),
2632                Ok("   ".into())
2633            );
2634            assert_eq!(
2635                normalize_xml11_attribute_value("   ", 5, |_| { None }),
2636                Ok("   ".into())
2637            );
2638
2639            assert_eq!(
2640                normalize_xml10_attribute_value("\t\t\t", 5, |_| { None }),
2641                Ok("   ".into())
2642            );
2643            assert_eq!(
2644                normalize_xml11_attribute_value("\t\t\t", 5, |_| { None }),
2645                Ok("   ".into())
2646            );
2647
2648            assert_eq!(
2649                normalize_xml10_attribute_value("\r\r\r", 5, |_| { None }),
2650                Ok("   ".into())
2651            );
2652            assert_eq!(
2653                normalize_xml11_attribute_value("\r\r\r", 5, |_| { None }),
2654                Ok("   ".into())
2655            );
2656
2657            assert_eq!(
2658                normalize_xml10_attribute_value("\n\n\n", 5, |_| { None }),
2659                Ok("   ".into())
2660            );
2661            assert_eq!(
2662                normalize_xml11_attribute_value("\n\n\n", 5, |_| { None }),
2663                Ok("   ".into())
2664            );
2665
2666            assert_eq!(
2667                normalize_xml10_attribute_value("\r\n\r\n\r\n", 5, |_| { None }),
2668                Ok("   ".into())
2669            );
2670            assert_eq!(
2671                normalize_xml11_attribute_value("\r\n\r\n\r\n", 5, |_| { None }),
2672                Ok("   ".into())
2673            );
2674
2675            assert_eq!(
2676                normalize_xml10_attribute_value("\t\t\n\n\r\r  ", 5, |_| None),
2677                Ok("        ".into())
2678            );
2679            assert_eq!(
2680                normalize_xml11_attribute_value("\t\t\n\n\r\r  ", 5, |_| None),
2681                Ok("        ".into())
2682            );
2683
2684            assert_eq!(
2685                normalize_xml10_attribute_value("\u{0085}\u{0085}\u{0085}", 5, |_| { None }),
2686                Ok("\u{0085}\u{0085}\u{0085}".into())
2687            );
2688            assert_eq!(
2689                normalize_xml11_attribute_value("\u{0085}\u{0085}\u{0085}", 5, |_| { None }),
2690                Ok("   ".into())
2691            );
2692
2693            assert_eq!(
2694                normalize_xml10_attribute_value("\r\u{0085}\r\u{0085}\r\u{0085}", 5, |_| { None }),
2695                Ok(" \u{0085} \u{0085} \u{0085}".into())
2696            );
2697            assert_eq!(
2698                normalize_xml11_attribute_value("\r\u{0085}\r\u{0085}\r\u{0085}", 5, |_| { None }),
2699                Ok("   ".into())
2700            );
2701
2702            assert_eq!(
2703                normalize_xml10_attribute_value("\u{2028}\u{2028}\u{2028}", 5, |_| { None }),
2704                Ok("\u{2028}\u{2028}\u{2028}".into())
2705            );
2706            assert_eq!(
2707                normalize_xml11_attribute_value("\u{2028}\u{2028}\u{2028}", 5, |_| { None }),
2708                Ok("   ".into())
2709            );
2710        }
2711
2712        #[test]
2713        fn mixed_content_normalization() {
2714            // Text with both whitespace and character references
2715            assert_eq!(
2716                normalize_xml10_attribute_value("hello\t&#32;\nworld", 5, |_| None),
2717                Ok("hello   world".into())
2718            );
2719            assert_eq!(
2720                normalize_xml11_attribute_value("hello\t&#32;\nworld", 5, |_| None),
2721                Ok("hello   world".into())
2722            );
2723
2724            // Whitespace around entities
2725            assert_eq!(
2726                normalize_xml10_attribute_value("text &entity; \n more", 5, |_| {
2727                    Some("replacement")
2728                }),
2729                Ok("text replacement   more".into())
2730            );
2731            assert_eq!(
2732                normalize_xml11_attribute_value("text &entity; \n more", 5, |_| {
2733                    Some("replacement")
2734                }),
2735                Ok("text replacement   more".into())
2736            );
2737
2738            // Complex mix of tabs, newlines, and character references
2739            // \t → space, &#65; → A, \r\n → space, &#66; → B, \t → space
2740            assert_eq!(
2741                normalize_xml10_attribute_value("\t&#65;\r\n&#66;\t", 5, |_| None),
2742                Ok(" A B ".into())
2743            );
2744            assert_eq!(
2745                normalize_xml11_attribute_value("\t&#65;\r\n&#66;\t", 5, |_| None),
2746                Ok(" A B ".into())
2747            );
2748        }
2749
2750        #[test]
2751        fn leading_trailing_whitespace() {
2752            // Leading whitespace preserved but normalized
2753            assert_eq!(
2754                normalize_xml10_attribute_value("  text", 5, |_| None),
2755                Ok("  text".into())
2756            );
2757            assert_eq!(
2758                normalize_xml11_attribute_value("  text", 5, |_| None),
2759                Ok("  text".into())
2760            );
2761
2762            assert_eq!(
2763                normalize_xml10_attribute_value("\t\ttext", 5, |_| None),
2764                Ok("  text".into())
2765            );
2766            assert_eq!(
2767                normalize_xml11_attribute_value("\t\ttext", 5, |_| None),
2768                Ok("  text".into())
2769            );
2770
2771            // Trailing whitespace preserved but normalized
2772            assert_eq!(
2773                normalize_xml10_attribute_value("text  ", 5, |_| None),
2774                Ok("text  ".into())
2775            );
2776            assert_eq!(
2777                normalize_xml11_attribute_value("text  ", 5, |_| None),
2778                Ok("text  ".into())
2779            );
2780
2781            assert_eq!(
2782                normalize_xml10_attribute_value("text\n\n", 5, |_| None),
2783                Ok("text  ".into())
2784            );
2785            assert_eq!(
2786                normalize_xml11_attribute_value("text\n\n", 5, |_| None),
2787                Ok("text  ".into())
2788            );
2789
2790            // Both leading and trailing
2791            assert_eq!(
2792                normalize_xml10_attribute_value("\n\ntext\n\n", 5, |_| None),
2793                Ok("  text  ".into())
2794            );
2795            assert_eq!(
2796                normalize_xml11_attribute_value("\n\ntext\n\n", 5, |_| None),
2797                Ok("  text  ".into())
2798            );
2799        }
2800
2801        #[test]
2802        fn characters() {
2803            assert_eq!(
2804                normalize_xml10_attribute_value("string with &#32; character", 5, |_| { None }),
2805                Ok("string with   character".into())
2806            );
2807            assert_eq!(
2808                normalize_xml10_attribute_value("string with &#x20; character", 5, |_| { None }),
2809                Ok("string with   character".into())
2810            );
2811
2812            assert_eq!(
2813                normalize_xml11_attribute_value("string with &#32; character", 5, |_| { None }),
2814                Ok("string with   character".into())
2815            );
2816            assert_eq!(
2817                normalize_xml11_attribute_value("string with &#x20; character", 5, |_| { None }),
2818                Ok("string with   character".into())
2819            );
2820        }
2821
2822        #[test]
2823        fn character_reference_edge_cases() {
2824            // Invalid hex character references
2825            assert!(matches!(
2826                normalize_xml10_attribute_value("&#xGG;", 5, |_| None),
2827                Err(EscapeError::InvalidCharRef(
2828                    ParseCharRefError::InvalidNumber(_)
2829                ))
2830            ));
2831            assert!(matches!(
2832                normalize_xml11_attribute_value("&#xGG;", 5, |_| None),
2833                Err(EscapeError::InvalidCharRef(
2834                    ParseCharRefError::InvalidNumber(_)
2835                ))
2836            ));
2837
2838            // Invalid decimal character references
2839            assert!(matches!(
2840                normalize_xml10_attribute_value("&#ABC;", 5, |_| None),
2841                Err(EscapeError::InvalidCharRef(
2842                    ParseCharRefError::InvalidNumber(_)
2843                ))
2844            ));
2845
2846            // Out-of-range Unicode (beyond U+10FFFF)
2847            assert_eq!(
2848                normalize_xml10_attribute_value("&#x110000;", 5, |_| None),
2849                Err(EscapeError::InvalidCharRef(
2850                    ParseCharRefError::InvalidCodepoint(0x110000)
2851                ))
2852            );
2853            assert_eq!(
2854                normalize_xml11_attribute_value("&#x110000;", 5, |_| None),
2855                Err(EscapeError::InvalidCharRef(
2856                    ParseCharRefError::InvalidCodepoint(0x110000)
2857                ))
2858            );
2859
2860            // Large decimal value that is not a valid Unicode codepoint
2861            assert_eq!(
2862                normalize_xml10_attribute_value("&#999999999;", 5, |_| None),
2863                Err(EscapeError::InvalidCharRef(
2864                    ParseCharRefError::InvalidCodepoint(999999999)
2865                ))
2866            );
2867
2868            // Non-whitespace character references
2869            assert_eq!(
2870                normalize_xml10_attribute_value("&#65;&#66;&#67;", 5, |_| None),
2871                Ok("ABC".into())
2872            );
2873            assert_eq!(
2874                normalize_xml11_attribute_value("&#65;&#66;&#67;", 5, |_| None),
2875                Ok("ABC".into())
2876            );
2877
2878            // Character references at boundaries
2879            assert_eq!(
2880                normalize_xml10_attribute_value("&#32;text", 5, |_| None),
2881                Ok(" text".into())
2882            );
2883            assert_eq!(
2884                normalize_xml10_attribute_value("text&#32;", 5, |_| None),
2885                Ok("text ".into())
2886            );
2887            assert_eq!(
2888                normalize_xml11_attribute_value("&#32;text", 5, |_| None),
2889                Ok(" text".into())
2890            );
2891            assert_eq!(
2892                normalize_xml11_attribute_value("text&#32;", 5, |_| None),
2893                Ok("text ".into())
2894            );
2895        }
2896
2897        #[test]
2898        fn entities() {
2899            assert_eq!(
2900                normalize_xml10_attribute_value("string with &entity; reference", 5, |_| {
2901                    Some("replacement")
2902                }),
2903                Ok("string with replacement reference".into())
2904            );
2905            assert_eq!(
2906                normalize_xml10_attribute_value("string with &entity-1; reference", 5, |entity| {
2907                    match entity {
2908                        "entity-1" => Some("recursive &entity-2;"),
2909                        "entity-2" => Some("entity&#32;2"),
2910                        _ => None,
2911                    }
2912                }),
2913                Ok("string with recursive entity 2 reference".into())
2914            );
2915            // Special case: '&' should not treated as unterminated reference, but everything '&...' should
2916            assert_eq!(
2917                normalize_xml10_attribute_value(
2918                    "string with &entity;amp; reference",
2919                    5,
2920                    |entity| {
2921                        match entity {
2922                            "entity" => Some("&amp;"),
2923                            "amp" => Some("&"),
2924                            _ => None,
2925                        }
2926                    }
2927                ),
2928                Ok("string with &amp; reference".into())
2929            );
2930
2931            assert_eq!(
2932                normalize_xml11_attribute_value("string with &entity; reference", 5, |_| {
2933                    Some("replacement")
2934                }),
2935                Ok("string with replacement reference".into())
2936            );
2937            assert_eq!(
2938                normalize_xml11_attribute_value("string with &entity-1; reference", 5, |entity| {
2939                    match entity {
2940                        "entity-1" => Some("recursive &entity-2;"),
2941                        "entity-2" => Some("entity&#32;2"),
2942                        _ => None,
2943                    }
2944                }),
2945                Ok("string with recursive entity 2 reference".into())
2946            );
2947            // Special case: '&' should not treated as unterminated reference, but everything '&...' should
2948            assert_eq!(
2949                normalize_xml11_attribute_value(
2950                    "string with &entity;amp; reference",
2951                    5,
2952                    |entity| {
2953                        match entity {
2954                            "entity" => Some("&amp;"),
2955                            "amp" => Some("&"),
2956                            _ => None,
2957                        }
2958                    }
2959                ),
2960                Ok("string with &amp; reference".into())
2961            );
2962        }
2963
2964        #[test]
2965        fn unknown_entity() {
2966            assert_eq!(
2967                normalize_xml10_attribute_value(
2968                    "string with unknown &entity; reference",
2969                    //                   ^     ^ = 21..27
2970                    5,
2971                    |_| None
2972                ),
2973                Err(EscapeError::UnrecognizedEntity(
2974                    21..27,
2975                    "entity".to_string(),
2976                ))
2977            );
2978
2979            assert_eq!(
2980                normalize_xml11_attribute_value(
2981                    "string with unknown &entity; reference",
2982                    //                   ^     ^ = 21..27
2983                    5,
2984                    |_| None
2985                ),
2986                Err(EscapeError::UnrecognizedEntity(
2987                    21..27,
2988                    "entity".to_string(),
2989                ))
2990            );
2991        }
2992
2993        #[test]
2994        fn predefined_entities() {
2995            // Test how predefined XML entities are handled
2996            assert_eq!(
2997                normalize_xml10_attribute_value(
2998                    "&lt;&gt;&quot;&apos;",
2999                    5,
3000                    resolve_predefined_entity
3001                ),
3002                Ok("<>\"'".into())
3003            );
3004            assert_eq!(
3005                normalize_xml11_attribute_value(
3006                    "&lt;&gt;&quot;&apos;",
3007                    5,
3008                    resolve_predefined_entity
3009                ),
3010                Ok("<>\"'".into())
3011            );
3012
3013            // &amp; followed by more entities
3014            assert_eq!(
3015                normalize_xml10_attribute_value("&amp;&lt;", 5, resolve_predefined_entity),
3016                Ok("&<".into())
3017            );
3018            assert_eq!(
3019                normalize_xml11_attribute_value("&amp;&lt;", 5, resolve_predefined_entity),
3020                Ok("&<".into())
3021            );
3022
3023            // Multiple &amp; in sequence
3024            assert_eq!(
3025                normalize_xml10_attribute_value("&amp;&amp;&amp;", 5, resolve_predefined_entity),
3026                Ok("&&&".into())
3027            );
3028        }
3029
3030        #[test]
3031        fn unclosed_entity() {
3032            // Text consists only of an unterminated entity reference - no name
3033            assert_eq!(
3034                normalize_xml10_attribute_value("& ", 5, |_| None),
3035                Err(EscapeError::UnterminatedEntity(0..2))
3036            );
3037            assert_eq!(
3038                normalize_xml11_attribute_value("& ", 5, |_| None),
3039                Err(EscapeError::UnterminatedEntity(0..2))
3040            );
3041
3042            // Text consists only of an unterminated character reference - no value
3043            assert_eq!(
3044                normalize_xml10_attribute_value("&# ", 5, |_| None),
3045                Err(EscapeError::UnterminatedEntity(0..3))
3046            );
3047            assert_eq!(
3048                normalize_xml11_attribute_value("&# ", 5, |_| None),
3049                Err(EscapeError::UnterminatedEntity(0..3))
3050            );
3051
3052            // Text consists only of an unterminated entity reference
3053            assert_eq!(
3054                normalize_xml10_attribute_value("&entity", 5, |_| Some("text")),
3055                Err(EscapeError::UnterminatedEntity(0..7))
3056            );
3057            assert_eq!(
3058                normalize_xml11_attribute_value("&entity", 5, |_| Some("text")),
3059                Err(EscapeError::UnterminatedEntity(0..7))
3060            );
3061
3062            // Unclosed entity reference within text
3063            assert_eq!(
3064                normalize_xml10_attribute_value(
3065                    "string with unclosed &entity reference",
3066                    //                    ^ = 21           ^ = 38
3067                    5,
3068                    |_| Some("replacement")
3069                ),
3070                Err(EscapeError::UnterminatedEntity(21..38))
3071            );
3072            assert_eq!(
3073                normalize_xml11_attribute_value(
3074                    "string with unclosed &entity reference",
3075                    //                    ^ = 21           ^ = 38
3076                    5,
3077                    |_| Some("replacement")
3078                ),
3079                Err(EscapeError::UnterminatedEntity(21..38))
3080            );
3081
3082            // Unclosed character reference within text
3083            assert_eq!(
3084                normalize_xml10_attribute_value(
3085                    "string with unclosed &#32 (character) reference",
3086                    //                    ^ = 21                    ^ = 47
3087                    5,
3088                    |_| None
3089                ),
3090                Err(EscapeError::UnterminatedEntity(21..47))
3091            );
3092            assert_eq!(
3093                normalize_xml11_attribute_value(
3094                    "string with unclosed &#32 (character) reference",
3095                    //                    ^ = 21                    ^ = 47
3096                    5,
3097                    |_| None
3098                ),
3099                Err(EscapeError::UnterminatedEntity(21..47))
3100            );
3101        }
3102
3103        #[test]
3104        fn malformed_entity() {
3105            // `&;` carries no name between `&` and `;`: both normalizers must
3106            // report an unrecognized entity with an empty range and empty name
3107            let nameless = EscapeError::UnrecognizedEntity(1..1, String::new());
3108            let xml10 = normalize_xml10_attribute_value("&;", 5, |_| None);
3109            assert_eq!(xml10, Err(nameless.clone()));
3110            let xml11 = normalize_xml11_attribute_value("&;", 5, |_| None);
3111            assert_eq!(xml11, Err(nameless));
3112
3113            // A name consisting only of digits (without `#`) is reported as an
3114            // unrecognized entity as well, with a range covering the three digits
3115            let digits = EscapeError::UnrecognizedEntity(1..4, String::from("123"));
3116            let xml10 = normalize_xml10_attribute_value("&123;", 5, |_| None);
3117            assert_eq!(xml10, Err(digits.clone()));
3118            let xml11 = normalize_xml11_attribute_value("&123;", 5, |_| None);
3119            assert_eq!(xml11, Err(digits));
3120
3121            // A character reference without any digits must fail number parsing;
3122            // first the decimal form...
3123            let decimal = normalize_xml10_attribute_value("&#;", 5, |_| None);
3124            assert!(matches!(
3125                decimal,
3126                Err(EscapeError::InvalidCharRef(
3127                    ParseCharRefError::InvalidNumber(_)
3128                ))
3129            ));
3130            // ...then the hexadecimal form
3131            let hexadecimal = normalize_xml10_attribute_value("&#x;", 5, |_| None);
3132            assert!(matches!(
3133                hexadecimal,
3134                Err(EscapeError::InvalidCharRef(
3135                    ParseCharRefError::InvalidNumber(_)
3136                ))
3137            ));
3138        }
3139
3140        #[test]
3141        fn recursive_entity() {
3142            // The resolver answers every lookup with text that references
3143            // `&entity;` again, so both normalizers must give up with an error
3144            const INPUT: &str = "&entity; reference";
3145            const REPLACEMENT: &str = "recursive &entity;";
3146
3147            assert_eq!(
3148                normalize_xml10_attribute_value(INPUT, 5, |_| Some(REPLACEMENT)),
3149                Err(EscapeError::TooManyNestedEntities),
3150            );
3151            assert_eq!(
3152                normalize_xml11_attribute_value(INPUT, 5, |_| Some(REPLACEMENT)),
3153                Err(EscapeError::TooManyNestedEntities),
3154            );
3155        }
3156
3157        #[test]
3158        fn recursion_depth() {
3159            // Nesting below the limit.
3160            // Resolving `&e1;` takes 4 entity expansions while the call allows 5,
3161            // so the value is expected to expand completely:
3162            // e1 → e2 → e3 → e4 → text
3163            let below_limit = normalize_xml10_attribute_value("&e1;", 5, |name| {
3164                match name {
3165                    "e1" => Some("&e2;"),
3166                    "e2" => Some("&e3;"),
3167                    "e3" => Some("&e4;"),
3168                    "e4" => Some("text"),
3169                    _ => None,
3170                }
3171            });
3172            assert_eq!(below_limit, Ok("text".into()));
3173
3174            // Nesting exactly at the limit.
3175            // Resolving `&e1;` now takes 5 entity expansions with a limit of 5,
3176            // which is the boundary and is still expected to succeed:
3177            // e1 → e2 → e3 → e4 → e5 → text
3178            let at_limit = normalize_xml10_attribute_value("&e1;", 5, |name| {
3179                match name {
3180                    "e1" => Some("&e2;"),
3181                    "e2" => Some("&e3;"),
3182                    "e3" => Some("&e4;"),
3183                    "e4" => Some("&e5;"),
3184                    "e5" => Some("text"),
3185                    _ => None,
3186                }
3187            });
3188            assert_eq!(at_limit, Ok("text".into()));
3189
3190            // Nesting above the limit.
3191            // Resolving `&e1;` would take 6 entity expansions, one more than the
3192            // limit of 5, so the call is expected to fail:
3193            // e1 → e2 → e3 → e4 → e5 → e6 → text
3194            let above_limit = normalize_xml10_attribute_value("&e1;", 5, |name| {
3195                match name {
3196                    "e1" => Some("&e2;"),
3197                    "e2" => Some("&e3;"),
3198                    "e3" => Some("&e4;"),
3199                    "e4" => Some("&e5;"),
3200                    "e5" => Some("&e6;"),
3201                    "e6" => Some("text"),
3202                    _ => None,
3203                }
3204            });
3205            assert_eq!(above_limit, Err(EscapeError::TooManyNestedEntities));
3206        }
3207    }
3208}