markdown/util/
character_reference.rs

1//! Helpers for character references.
2
3use crate::util::constant::{
4    CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
5    CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
6};
7use alloc::string::String;
8use core::str;
9
10/// Decode named character references.
11///
12/// Turn the name coming from a named character reference (without the `&` or
13/// `;`) into a string.
14/// This looks the given string up at `0` in the tuples of
15/// `CHARACTER_REFERENCES` (or `CHARACTER_REFERENCES_HTML_4`)
16/// and then takes the corresponding value from `1`.
17///
18/// The `html5` boolean is used for named character references, and specifier
19/// whether the 2125 names from HTML 5 or the 252 names from HTML 4 are
20/// supported.
21///
22/// The result is `String` instead of `char` because named character references
23/// can expand into multiple characters.
24///
25/// ## Examples
26///
27/// ```rust ignore
28/// use markdown::util::decode_character_reference::decode_named;
29///
30/// assert_eq!(decode_named("amp", true), "&");
31/// assert_eq!(decode_named("AElig", true), "Æ");
32/// assert_eq!(decode_named("aelig", true), "æ");
33/// ```
34///
35/// ## References
36///
37/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
38/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.31/#entity-and-numeric-character-references)
39pub fn decode_named(value: &str, html5: bool) -> Option<String> {
40    let mut iter = if html5 {
41        CHARACTER_REFERENCES.iter()
42    } else {
43        CHARACTER_REFERENCES_HTML_4.iter()
44    };
45    iter.find(|d| d.0 == value).map(|d| d.1.into())
46}
47
/// Decode numeric character references.
///
/// Turn the number (in string form as either hexadecimal or decimal) coming
/// from a numeric character reference into a string.
/// The base of the string form must be passed as the `radix` parameter, as
/// `10` (decimal) or `16` (hexadecimal).
///
/// This returns a `String` form of the associated character, or the
/// replacement character (U+FFFD) for:
///
/// * C0 control characters (except for HT, LF, FF, and CR)
/// * DEL and C1 control characters
/// * lone surrogates
/// * noncharacters (U+FDD0 through U+FDEF, and the last two code points of
///   every plane, such as U+FFFE and U+FFFF)
/// * numbers above U+10FFFF
///
/// ## Examples
///
/// ```rust ignore
/// use markdown::util::character_reference::decode_numeric;
///
/// assert_eq!(decode_numeric("123", 10), "{");
/// assert_eq!(decode_numeric("9", 16), "\t");
/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
/// assert_eq!(decode_numeric("FFFE", 16), "�"); // Noncharacter.
/// ```
///
/// ## Panics
///
/// This function panics if `value` is not a valid number in the given
/// `radix`, or if the number does not fit in a `u32`.
/// It is expected that figuring out whether a number is allowed is handled in
/// the parser.
/// When `markdown-rs` is used, this function never panics.
///
/// ## References
///
/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.31/#entity-and-numeric-character-references)
pub fn decode_numeric(value: &str, radix: u32) -> String {
    let code = u32::from_str_radix(value, radix)
        .expect("expected a valid number that fits in `u32` for the given radix");

    // Code points that are valid Unicode scalar values (so `char::from_u32`
    // accepts them) but that must not end up in the output.
    let disallowed = matches!(
        code,
        // C0 except for HT, LF, FF, CR.
        0x00..=0x08 | 0x0B | 0x0E..=0x1F
        // Control character (DEL) of C0, and C1 controls.
        | 0x7F..=0x9F
        // Noncharacters: the contiguous range of 32.
        | 0xFDD0..=0xFDEF
    )
    // Noncharacters: the last two code points of every plane
    // (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, …).
    || (code & 0xFFFE) == 0xFFFE;

    if !disallowed {
        // Lone surrogates and out of range numbers are rejected by Rust.
        if let Some(character) = char::from_u32(code) {
            return character.into();
        }
    }

    char::REPLACEMENT_CHARACTER.into()
}
98
99/// Decode a character reference.
100///
101/// This turns the number (in string form as either hexadecimal or decimal) or
102/// name from a character reference into a string.
103///
104/// The marker specifies the format: `#` for hexadecimal, `x` for decimal, and
105/// `&` for named.
106///
107/// The `html5` boolean is used for named character references, and specifier
108/// whether the 2125 names from HTML 5 or the 252 names from HTML 4 are
109/// supported.
110///
111/// ## Panics
112///
113/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
114pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
115    match marker {
116        b'#' => Some(decode_numeric(value, 10)),
117        b'x' => Some(decode_numeric(value, 16)),
118        b'&' => decode_named(value, html5),
119        _ => unreachable!("Unexpected marker `{}`", marker),
120    }
121}
122
123/// Get the maximum size of a value for different kinds of references.
124///
125/// The value is the stuff after the markers, before the `;`.
126///
127/// ## Panics
128///
129/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
130pub fn value_max(marker: u8) -> usize {
131    match marker {
132        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
133        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
134        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
135        _ => unreachable!("Unexpected marker `{}`", marker),
136    }
137}
138
/// Get a predicate that checks whether a byte is allowed in the value of the
/// different kinds of references.
///
/// The value is the part after the markers and before the `;`.
/// `marker` selects the kind: `b'#'` (decimal) allows ASCII digits, `b'x'`
/// (hexadecimal) allows ASCII hex digits, and `b'&'` (named) allows ASCII
/// alphanumerics.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn value_test(marker: u8) -> fn(&u8) -> bool {
    if marker == b'#' {
        u8::is_ascii_digit
    } else if marker == b'x' {
        u8::is_ascii_hexdigit
    } else if marker == b'&' {
        u8::is_ascii_alphanumeric
    } else {
        unreachable!("Unexpected marker `{}`", marker)
    }
}
155
156/// Decode character references in a string.
157///
158/// > 👉 **Note**: this currently only supports the 252 named character
159/// > references from HTML 4, as it’s only used for JSX.
160/// >
161/// > If it’s ever needed to support HTML 5 (which is what normal markdown
162/// > uses), a boolean parameter can be added here.
163pub fn parse(value: &str) -> String {
164    let bytes = value.as_bytes();
165    let mut index = 0;
166    let len = bytes.len();
167    // Grows a bit smaller with each character reference.
168    let mut result = String::with_capacity(value.len());
169    let mut start = 0;
170
171    while index < len {
172        if bytes[index] == b'&' {
173            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
174                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
175                    (b'x', index + 3)
176                } else {
177                    (b'#', index + 2)
178                }
179            } else {
180                (b'&', index + 1)
181            };
182
183            let max = value_max(marker);
184            let test = value_test(marker);
185            let mut value_index = 0;
186            while value_index < max && (value_start + value_index) < len {
187                if !test(&bytes[value_start + value_index]) {
188                    break;
189                }
190                value_index += 1;
191            }
192
193            let value_end = value_start + value_index;
194
195            // Non empty and terminated.
196            if value_index > 0 && bytes[value_end] == b';' {
197                if let Some(decoded) = decode(
198                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
199                    marker,
200                    false,
201                ) {
202                    result.push_str(&value[start..index]);
203                    result.push_str(&decoded);
204                    start = value_end + 1;
205                    index = start;
206                    continue;
207                }
208            }
209        }
210
211        index += 1;
212    }
213
214    result.push_str(&value[start..]);
215
216    result
217}
markdown/util/character_reference.rs

markdown/util/
character_reference.rs