iri_string/parser/
char.rs

1//! Characters.
2
3use crate::spec::Spec;
4
/// Flag bit: the character may appear in `scheme` after its first character.
// `ALPHA / DIGIT / "+" / "-" / "."`
const MASK_SCHEME_CONTINUE: u8 = 0b_0000_0001;

/// Flag bit: the character matches `unreserved`.
// `unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"`
const MASK_UNRESERVED: u8 = 0b_0000_0010;

/// Flag bit: the character matches `gen-delims`.
// `gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"`
const MASK_GEN_DELIMS: u8 = 0b_0000_0100;

/// Flag bit: the character matches `sub-delims`.
// `sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="`
const MASK_SUB_DELIMS: u8 = 0b_0000_1000;

/// Flag bit: the character matches `pchar`, not counting percent-encoded bytes.
// `pchar = unreserved / pct-encoded / sub-delims / ":" / "@"`
const MASK_PCHAR: u8 = 0b_0001_0000;

/// Flag bit: the character may appear in `query` and `fragment`.
// `query = *( pchar / "/" / "?" )`
// `fragment = *( pchar / "/" / "?" )`
const MASK_FRAG_QUERY: u8 = 0b_0010_0000;

/// Flag bit: the character may appear in `userinfo` and in the address part of `IPvFuture`.
// `userinfo = *( unreserved / pct-encoded / sub-delims / ":" )`
const MASK_USERINFO_IPVFUTUREADDR: u8 = 0b_0100_0000;

/// Flag bit: the character matches `pchar` (not counting percent-encoded bytes) or is a slash.
const MASK_PCHAR_SLASH: u8 = 0b_1000_0000;
36
/// ASCII characters' properties.
///
/// The table has one entry per ASCII code (`0x00..=0x7f`), in code order.
/// Each entry is a set of the `MASK_*` flags that hold for that character.
/// Reading an entry's bits from least significant to most significant:
///
/// * bit 0: `MASK_SCHEME_CONTINUE`
/// * bit 1: `MASK_UNRESERVED`
/// * bit 2: `MASK_GEN_DELIMS`
/// * bit 3: `MASK_SUB_DELIMS`
/// * bit 4: `MASK_PCHAR`
/// * bit 5: `MASK_FRAG_QUERY`
/// * bit 6: `MASK_USERINFO_IPVFUTUREADDR`
/// * bit 7: `MASK_PCHAR_SLASH`
///
/// The table has exactly 128 entries, so indexing it with a byte `>= 0x80` panics.
const TABLE: [u8; 128] = [
    0b_0000_0000, // NUL
    0b_0000_0000, // SOH
    0b_0000_0000, // STX
    0b_0000_0000, // ETX
    0b_0000_0000, // EOT
    0b_0000_0000, // ENQ
    0b_0000_0000, // ACK
    0b_0000_0000, // BEL
    0b_0000_0000, // BS
    0b_0000_0000, // HT
    0b_0000_0000, // LF
    0b_0000_0000, // VT
    0b_0000_0000, // FF
    0b_0000_0000, // CR
    0b_0000_0000, // SO
    0b_0000_0000, // SI
    0b_0000_0000, // DLE
    0b_0000_0000, // DC1
    0b_0000_0000, // DC2
    0b_0000_0000, // DC3
    0b_0000_0000, // DC4
    0b_0000_0000, // NAK
    0b_0000_0000, // SYN
    0b_0000_0000, // ETB
    0b_0000_0000, // CAN
    0b_0000_0000, // EM
    0b_0000_0000, // SUB
    0b_0000_0000, // ESC
    0b_0000_0000, // FS
    0b_0000_0000, // GS
    0b_0000_0000, // RS
    0b_0000_0000, // US
    0b_0000_0000, // SPACE
    0b_1111_1000, // !
    0b_0000_0000, // "
    0b_0000_0100, // #
    0b_1111_1000, // $
    0b_0000_0000, // %
    0b_1111_1000, // &
    0b_1111_1000, // '
    0b_1111_1000, // (
    0b_1111_1000, // )
    0b_1111_1000, // *
    0b_1111_1001, // +
    0b_1111_1000, // ,
    0b_1111_0011, // -
    0b_1111_0011, // .
    0b_1010_0100, // /
    0b_1111_0011, // 0
    0b_1111_0011, // 1
    0b_1111_0011, // 2
    0b_1111_0011, // 3
    0b_1111_0011, // 4
    0b_1111_0011, // 5
    0b_1111_0011, // 6
    0b_1111_0011, // 7
    0b_1111_0011, // 8
    0b_1111_0011, // 9
    0b_1111_0100, // :
    0b_1111_1000, // ;
    0b_0000_0000, // <
    0b_1111_1000, // =
    0b_0000_0000, // >
    0b_0010_0100, // ?
    0b_1011_0100, // @
    0b_1111_0011, // A
    0b_1111_0011, // B
    0b_1111_0011, // C
    0b_1111_0011, // D
    0b_1111_0011, // E
    0b_1111_0011, // F
    0b_1111_0011, // G
    0b_1111_0011, // H
    0b_1111_0011, // I
    0b_1111_0011, // J
    0b_1111_0011, // K
    0b_1111_0011, // L
    0b_1111_0011, // M
    0b_1111_0011, // N
    0b_1111_0011, // O
    0b_1111_0011, // P
    0b_1111_0011, // Q
    0b_1111_0011, // R
    0b_1111_0011, // S
    0b_1111_0011, // T
    0b_1111_0011, // U
    0b_1111_0011, // V
    0b_1111_0011, // W
    0b_1111_0011, // X
    0b_1111_0011, // Y
    0b_1111_0011, // Z
    0b_0000_0100, // [
    0b_0000_0000, // \
    0b_0000_0100, // ]
    0b_0000_0000, // ^
    0b_1111_0010, // _
    0b_0000_0000, // `
    0b_1111_0011, // a
    0b_1111_0011, // b
    0b_1111_0011, // c
    0b_1111_0011, // d
    0b_1111_0011, // e
    0b_1111_0011, // f
    0b_1111_0011, // g
    0b_1111_0011, // h
    0b_1111_0011, // i
    0b_1111_0011, // j
    0b_1111_0011, // k
    0b_1111_0011, // l
    0b_1111_0011, // m
    0b_1111_0011, // n
    0b_1111_0011, // o
    0b_1111_0011, // p
    0b_1111_0011, // q
    0b_1111_0011, // r
    0b_1111_0011, // s
    0b_1111_0011, // t
    0b_1111_0011, // u
    0b_1111_0011, // v
    0b_1111_0011, // w
    0b_1111_0011, // x
    0b_1111_0011, // y
    0b_1111_0011, // z
    0b_0000_0000, // {
    0b_0000_0000, // |
    0b_0000_0000, // }
    0b_1111_0010, // ~
    0b_0000_0000, // DEL
];
168
169/// Returns `true` if the given ASCII character is allowed as continue character of `scheme` part.
170#[inline]
171#[must_use]
172pub(crate) const fn is_ascii_scheme_continue(c: u8) -> bool {
173    (TABLE[c as usize] & MASK_SCHEME_CONTINUE) != 0
174}
175
176/// Returns `true` if the given ASCII character matches `unreserved`.
177#[inline]
178#[must_use]
179pub(crate) const fn is_ascii_unreserved(c: u8) -> bool {
180    (TABLE[c as usize] & MASK_UNRESERVED) != 0
181}
182
183/// Returns true if the character is unreserved.
184#[inline]
185#[must_use]
186pub(crate) fn is_unreserved<S: Spec>(c: char) -> bool {
187    if c.is_ascii() {
188        is_ascii_unreserved(c as u8)
189    } else {
190        S::is_nonascii_char_unreserved(c)
191    }
192}
193
194///// Returns `true` if the given ASCII character matches `gen-delims`.
195//#[inline]
196//#[must_use]
197//pub(crate) const fn is_ascii_gen_delims(c: u8) -> bool {
198//    (TABLE[c as usize] & MASK_GEN_DELIMS) != 0
199//}
200
201///// Returns `true` if the given ASCII character matches `sub-delims`.
202//#[inline]
203//#[must_use]
204//pub(crate) const fn is_ascii_sub_delims(c: u8) -> bool {
205//    (TABLE[c as usize] & MASK_SUB_DELIMS) != 0
206//}
207
208///// Returns `true` if the given ASCII character matches `reserved`.
209//#[inline]
210//#[must_use]
211//pub(crate) const fn is_ascii_reserved(c: u8) -> bool {
212//    (TABLE[c as usize] & (MASK_GEN_DELIMS | MASK_SUB_DELIMS)) != 0
213//}
214
215/// Returns `true` if the given ASCII character matches `pchar` modulo `pct-encoded`.
216#[inline]
217#[must_use]
218pub(crate) const fn is_ascii_pchar(c: u8) -> bool {
219    (TABLE[c as usize] & MASK_PCHAR) != 0
220}
221
222/// Returns `true` if the given ASCII character is allowed to appear in `query` and `fragment`.
223#[inline]
224#[must_use]
225pub(crate) const fn is_ascii_frag_query(c: u8) -> bool {
226    (TABLE[c as usize] & MASK_FRAG_QUERY) != 0
227}
228
229/// Returns `true` if the given non-ASCII character is allowed to appear in `iquery`.
230#[inline]
231#[must_use]
232pub(crate) fn is_nonascii_query<S: Spec>(c: char) -> bool {
233    S::is_nonascii_char_unreserved(c) || S::is_nonascii_char_private(c)
234}
235
/// Returns `true` if the given non-ASCII character is allowed to appear in `ifragment`.
///
/// The decision is delegated entirely to `S::is_nonascii_char_unreserved`, so the result
/// depends on the spec `S`. This function does not itself check that `c` is non-ASCII.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_fragment<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c)
}
242
243/// Returns `true` if the given ASCII character is allowed to appear in `userinfo` and `IPvFuture`.
244#[inline]
245#[must_use]
246pub(crate) const fn is_ascii_userinfo_ipvfutureaddr(c: u8) -> bool {
247    (TABLE[c as usize] & MASK_USERINFO_IPVFUTUREADDR) != 0
248}
249
/// Returns `true` if the given non-ASCII character is allowed to appear in `iuserinfo`.
///
/// The decision is delegated entirely to `S::is_nonascii_char_unreserved`, so the result
/// depends on the spec `S`. This function does not itself check that `c` is non-ASCII.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_userinfo<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c)
}
256
257/// Returns `true` if the given ASCII character is allowed to appear in `reg-name`
258#[inline]
259#[must_use]
260pub(crate) const fn is_ascii_regname(c: u8) -> bool {
261    (TABLE[c as usize] & (MASK_UNRESERVED | MASK_SUB_DELIMS)) != 0
262}
263
/// Returns `true` if the given non-ASCII character is allowed to appear in `ireg-name`.
///
/// The decision is delegated entirely to `S::is_nonascii_char_unreserved`, so the result
/// depends on the spec `S`. This function does not itself check that `c` is non-ASCII.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_regname<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c)
}
270
271/// Returns `true` if the given ASCII character matches `pchar` modulo `pct-encoded` or a slash.
272#[inline]
273#[must_use]
274pub(crate) const fn is_ascii_pchar_slash(c: u8) -> bool {
275    (TABLE[c as usize] & MASK_PCHAR_SLASH) != 0
276}
277
/// Tests whether the character matches the `ucschar` rule.
///
/// The code point is split into its plane (upper bits) and its 16-bit offset within that
/// plane, and the accepted offsets are checked per plane.
#[must_use]
pub(crate) fn is_ucschar(c: char) -> bool {
    let code_point = u32::from(c);
    let plane = code_point >> 16;
    let offset = code_point & 0xFFFF;

    match plane {
        // Plane 0: three separate ranges.
        0 => matches!(offset, 0xA0..=0xD7FF | 0xF900..=0xFDCF | 0xFDF0..=0xFFEF),
        // Planes 1 to 13: everything except the last two code points of the plane.
        0x1..=0xD => offset <= 0xFFFD,
        // Plane 14: additionally excludes the first 0x1000 code points.
        0xE => matches!(offset, 0x1000..=0xFFFD),
        // Planes 15 and 16 contain no `ucschar`.
        _ => false,
    }
}
302
/// Returns `true` if the given byte is a UTF-8 continuation byte, i.e. has the bit pattern
/// `0b10xx_xxxx` (`0x80..=0xbf`).
///
/// Such a byte is never the first byte of an encoded character.
#[inline(always)]
#[must_use]
pub(crate) const fn is_utf8_byte_continue(byte: u8) -> bool {
    // Reinterpreting the byte as `i8` turns the range check into a single signed comparison:
    //
    // * `0x00..=0x7f` (ASCII)                 maps to    0..=127,
    // * `0x80..=0xbf` (continuation bytes)    maps to -128..=-65,
    // * `0xc0..=0xff` (lead or invalid bytes) maps to  -64..=-1.
    //
    // `0b1000_0000 as i8` is -128, `0b1011_1111 as i8` is -65, and `0xc0 as i8` is -64,
    // so exactly the continuation bytes are strictly less than -64.
    (byte as i8) < -64
}
317
318/// Returns true if the given ASCII character is `unreserved` or `reserved`.
319#[inline]
320#[must_use]
321pub(crate) const fn is_ascii_unreserved_or_reserved(c: u8) -> bool {
322    (TABLE[c as usize] & (MASK_UNRESERVED | MASK_GEN_DELIMS | MASK_SUB_DELIMS)) != 0
323}