iri_string/parser/char.rs
1//! Characters.
2
3use crate::spec::Spec;
4
/// Flag bit set for characters which may follow the first character of a `scheme`.
// `ALPHA / DIGIT / "+" / "-" / "."`
const MASK_SCHEME_CONTINUE: u8 = 0b_0000_0001;

/// Flag bit set for characters matching the `unreserved` rule.
// `unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"`
const MASK_UNRESERVED: u8 = 0b_0000_0010;

/// Flag bit set for characters matching the `gen-delims` rule.
// `gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"`
const MASK_GEN_DELIMS: u8 = 0b_0000_0100;

/// Flag bit set for characters matching the `sub-delims` rule.
// `sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="`
const MASK_SUB_DELIMS: u8 = 0b_0000_1000;

/// Flag bit set for characters matching the `pchar` rule.
///
/// Percent-encoded triplets are not covered by this flag, since they span multiple characters.
// `pchar = unreserved / pct-encoded / sub-delims / ":" / "@"`
const MASK_PCHAR: u8 = 0b_0001_0000;

/// Flag bit set for characters allowed in the `query` and `fragment` components.
// `query = *( pchar / "/" / "?" )`
// `fragment = *( pchar / "/" / "?" )`
const MASK_FRAG_QUERY: u8 = 0b_0010_0000;

/// Flag bit set for characters allowed in `userinfo` and in the address part of `IPvFuture`.
// `userinfo = *( unreserved / pct-encoded / sub-delims / ":" )`
const MASK_USERINFO_IPVFUTUREADDR: u8 = 0b_0100_0000;

/// Flag bit set for characters which are either a slash or match the `pchar` rule
/// (percent-encoded triplets excluded).
const MASK_PCHAR_SLASH: u8 = 0b_1000_0000;
36
/// Property flags of every ASCII character, indexed by the character code.
///
/// Each entry is the bitwise OR of the `MASK_*` flags that apply to the character.
/// An entry of zero means that the character belongs to none of the classes
/// (control characters, SPACE, `"`, `%`, `<`, `>`, `\`, `^`, `` ` ``, `{`, `|`, `}` and DEL).
const TABLE: [u8; 128] = {
    let mut table = [0_u8; 128];
    // `while` is used since `for` loops cannot be used in constant evaluation.
    let mut c: u8 = 0;
    while c < 128 {
        table[c as usize] = match c {
            // ALPHA / DIGIT / "-" / ".":
            // every flag except `gen-delims` and `sub-delims`.
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' => 0b_1111_0011,
            // "_" / "~": same as above, but not allowed in `scheme`.
            b'_' | b'~' => 0b_1111_0010,
            // "+": the only `sub-delims` character which can also continue a `scheme`.
            b'+' => 0b_1111_1001,
            // The other `sub-delims` characters.
            b'!' | b'$' | b'&' | b'\'' | b'(' | b')' | b'*' | b',' | b';' | b'=' => 0b_1111_1000,
            // ":": `gen-delims` which is also allowed in `pchar` and `userinfo`.
            b':' => 0b_1111_0100,
            // "@": `gen-delims` which is allowed in `pchar` but not in `userinfo`.
            b'@' => 0b_1011_0100,
            // "/": `gen-delims` allowed in `query` / `fragment` and by the "pchar or slash" test.
            b'/' => 0b_1010_0100,
            // "?": `gen-delims` allowed only in `query` / `fragment`.
            b'?' => 0b_0010_0100,
            // "#" / "[" / "]": plain `gen-delims`.
            b'#' | b'[' | b']' => 0b_0000_0100,
            // Everything else has no property.
            _ => 0b_0000_0000,
        };
        c += 1;
    }
    table
};
168
169/// Returns `true` if the given ASCII character is allowed as continue character of `scheme` part.
170#[inline]
171#[must_use]
172pub(crate) const fn is_ascii_scheme_continue(c: u8) -> bool {
173 (TABLE[c as usize] & MASK_SCHEME_CONTINUE) != 0
174}
175
176/// Returns `true` if the given ASCII character matches `unreserved`.
177#[inline]
178#[must_use]
179pub(crate) const fn is_ascii_unreserved(c: u8) -> bool {
180 (TABLE[c as usize] & MASK_UNRESERVED) != 0
181}
182
183/// Returns true if the character is unreserved.
184#[inline]
185#[must_use]
186pub(crate) fn is_unreserved<S: Spec>(c: char) -> bool {
187 if c.is_ascii() {
188 is_ascii_unreserved(c as u8)
189 } else {
190 S::is_nonascii_char_unreserved(c)
191 }
192}
193
194///// Returns `true` if the given ASCII character matches `gen-delims`.
195//#[inline]
196//#[must_use]
197//pub(crate) const fn is_ascii_gen_delims(c: u8) -> bool {
198// (TABLE[c as usize] & MASK_GEN_DELIMS) != 0
199//}
200
201///// Returns `true` if the given ASCII character matches `sub-delims`.
202//#[inline]
203//#[must_use]
204//pub(crate) const fn is_ascii_sub_delims(c: u8) -> bool {
205// (TABLE[c as usize] & MASK_SUB_DELIMS) != 0
206//}
207
208///// Returns `true` if the given ASCII character matches `reserved`.
209//#[inline]
210//#[must_use]
211//pub(crate) const fn is_ascii_reserved(c: u8) -> bool {
212// (TABLE[c as usize] & (MASK_GEN_DELIMS | MASK_SUB_DELIMS)) != 0
213//}
214
215/// Returns `true` if the given ASCII character matches `pchar` modulo `pct-encoded`.
216#[inline]
217#[must_use]
218pub(crate) const fn is_ascii_pchar(c: u8) -> bool {
219 (TABLE[c as usize] & MASK_PCHAR) != 0
220}
221
222/// Returns `true` if the given ASCII character is allowed to appear in `query` and `fragment`.
223#[inline]
224#[must_use]
225pub(crate) const fn is_ascii_frag_query(c: u8) -> bool {
226 (TABLE[c as usize] & MASK_FRAG_QUERY) != 0
227}
228
229/// Returns `true` if the given non-ASCII character is allowed to appear in `iquery`.
230#[inline]
231#[must_use]
232pub(crate) fn is_nonascii_query<S: Spec>(c: char) -> bool {
233 S::is_nonascii_char_unreserved(c) || S::is_nonascii_char_private(c)
234}
235
236/// Returns `true` if the given non-ASCII character is allowed to appear in `ifragment`.
237#[inline]
238#[must_use]
239pub(crate) fn is_nonascii_fragment<S: Spec>(c: char) -> bool {
240 S::is_nonascii_char_unreserved(c)
241}
242
243/// Returns `true` if the given ASCII character is allowed to appear in `userinfo` and `IPvFuture`.
244#[inline]
245#[must_use]
246pub(crate) const fn is_ascii_userinfo_ipvfutureaddr(c: u8) -> bool {
247 (TABLE[c as usize] & MASK_USERINFO_IPVFUTUREADDR) != 0
248}
249
250/// Returns `true` if the given non-ASCII character is allowed to appear in `iuserinfo`.
251#[inline]
252#[must_use]
253pub(crate) fn is_nonascii_userinfo<S: Spec>(c: char) -> bool {
254 S::is_nonascii_char_unreserved(c)
255}
256
257/// Returns `true` if the given ASCII character is allowed to appear in `reg-name`
258#[inline]
259#[must_use]
260pub(crate) const fn is_ascii_regname(c: u8) -> bool {
261 (TABLE[c as usize] & (MASK_UNRESERVED | MASK_SUB_DELIMS)) != 0
262}
263
264/// Returns `true` if the given non-ASCII character is allowed to appear in `ireg-name`.
265#[inline]
266#[must_use]
267pub(crate) fn is_nonascii_regname<S: Spec>(c: char) -> bool {
268 S::is_nonascii_char_unreserved(c)
269}
270
271/// Returns `true` if the given ASCII character matches `pchar` modulo `pct-encoded` or a slash.
272#[inline]
273#[must_use]
274pub(crate) const fn is_ascii_pchar_slash(c: u8) -> bool {
275 (TABLE[c as usize] & MASK_PCHAR_SLASH) != 0
276}
277
/// Tests whether the character `c` matches the `ucschar` rule.
///
/// The accepted code points are:
///
/// * `U+00A0..=U+D7FF`, `U+F900..=U+FDCF` and `U+FDF0..=U+FFEF` in plane 0,
/// * everything but the last two code points (`xFFFE` and `xFFFF`) of planes 1 to 13, and
/// * `U+E1000..=U+EFFFD` in plane 14.
#[must_use]
pub(crate) fn is_ucschar(c: char) -> bool {
    let code_point = u32::from(c);
    // The upper bits select the plane, and the lower 16 bits the position inside it.
    let plane = code_point >> 16;
    let offset = code_point & 0xFFFF;

    match plane {
        0x0 => matches!(offset, 0x00A0..=0xD7FF | 0xF900..=0xFDCF | 0xFDF0..=0xFFEF),
        0x1..=0xD => offset <= 0xFFFD,
        0xE => matches!(offset, 0x1000..=0xFFFD),
        // Planes 15 and 16 contain no `ucschar` characters.
        _ => false,
    }
}
302
/// Tests whether `byte` is a continuation byte of a UTF-8 sequence.
///
/// Returns `true` exactly for the bytes `0x80..=0xbf`.
#[inline(always)]
#[must_use]
pub(crate) fn is_utf8_byte_continue(byte: u8) -> bool {
    // Continuation bytes have the form `0b10xx_xxxx` (i.e. `0x80..=0xbf`), while the first
    // byte of an encoded character never has that form. It is therefore enough to test
    // that the two most significant bits are `10`.
    //
    // This is the same condition as `(byte as i8) < -64`: `0x80 as i8` is -128,
    // `0xbf as i8` is -65, and `0xc0 as i8` is -64.
    (byte & 0b_1100_0000) == 0b_1000_0000
}
317
318/// Returns true if the given ASCII character is `unreserved` or `reserved`.
319#[inline]
320#[must_use]
321pub(crate) const fn is_ascii_unreserved_or_reserved(c: u8) -> bool {
322 (TABLE[c as usize] & (MASK_UNRESERVED | MASK_GEN_DELIMS | MASK_SUB_DELIMS)) != 0
323}