fluent_uri/pct_enc/
table.rs

1//! Byte pattern tables from RFC 3986 and RFC 3987.
2//!
3//! The predefined table constants in this module are documented with
4//! the ABNF notation of [RFC 5234].
5//!
6//! [RFC 5234]: https://datatracker.ietf.org/doc/html/rfc5234
7
8use crate::utf8;
9
// Layout of `Table::table`: slots `0..=255` are indexed by byte value,
// followed by one flag slot each for percent-encoded octets, `ucschar`
// and `iprivate`.
const INDEX_PCT_ENCODED: usize = 256;
const INDEX_UCSCHAR: usize = INDEX_PCT_ENCODED + 1;
const INDEX_IPRIVATE: usize = INDEX_UCSCHAR + 1;
const TABLE_LEN: usize = INDEX_IPRIVATE + 1;
14
/// Returns `true` if `x` matches the `ucschar` ABNF rule from RFC 3987.
const fn is_ucschar(x: u32) -> bool {
    match x {
        0xa0..=0xd7ff | 0xf900..=0xfdcf | 0xfdf0..=0xffef => true,
        // Planes 1 through 13: everything except the last two code points
        // (`xFFFE` and `xFFFF`) of each plane.
        0x1_0000..=0xd_ffff => (x & 0xffff) <= 0xfffd,
        // Plane 14 only contributes `E1000-EFFFD`.
        0xe_1000..=0xe_fffd => true,
        _ => false,
    }
}
20
/// Returns `true` if `x` matches the `iprivate` ABNF rule from RFC 3987:
/// `%xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD`.
///
/// Values above `0x10ffff` are not code points and are always rejected.
const fn is_iprivate(x: u32) -> bool {
    match x {
        0xe000..=0xf8ff => true,
        // Planes 15 and 16: everything except the last two code points
        // (`xFFFE` and `xFFFF`) of each plane. The explicit upper bound keeps
        // out-of-range `u32` values from slipping through the mask check.
        0xf_0000..=0x10_ffff => (x & 0xffff) <= 0xfffd,
        _ => false,
    }
}
24
25/// A table specifying the byte patterns allowed in a string.
26#[derive(Clone, Copy, Debug)]
27pub struct Table {
28    table: [bool; TABLE_LEN],
29}
30
31impl Table {
32    /// Creates a table that only allows the given unencoded bytes.
33    ///
34    /// # Panics
35    ///
36    /// Panics if any of the bytes is not ASCII or equals `b'%'`.
37    #[must_use]
38    pub const fn new(mut bytes: &[u8]) -> Self {
39        let mut table = [false; TABLE_LEN];
40        while let [cur, rem @ ..] = bytes {
41            assert!(
42                cur.is_ascii() && *cur != b'%',
43                "cannot allow non-ASCII byte or %"
44            );
45            table[*cur as usize] = true;
46            bytes = rem;
47        }
48        Self { table }
49    }
50
51    /// Combines two tables into one.
52    ///
53    /// Returns a new table that allows all the byte patterns allowed
54    /// by `self` or by `other`.
55    #[must_use]
56    pub const fn or(mut self, other: &Self) -> Self {
57        let mut i = 0;
58        while i < TABLE_LEN {
59            self.table[i] |= other.table[i];
60            i += 1;
61        }
62        self
63    }
64
65    /// Marks this table as allowing percent-encoded octets.
66    #[must_use]
67    pub const fn or_pct_encoded(mut self) -> Self {
68        self.table[INDEX_PCT_ENCODED] = true;
69        self
70    }
71
72    /// Marks this table as allowing characters matching the [`ucschar`]
73    /// ABNF rule from RFC 3987.
74    ///
75    /// [`ucschar`]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
76    #[must_use]
77    pub const fn or_ucschar(mut self) -> Self {
78        self.table[INDEX_UCSCHAR] = true;
79        self
80    }
81
82    /// Marks this table as allowing characters matching the [`iprivate`]
83    /// ABNF rule from RFC 3987.
84    ///
85    /// [`iprivate`]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
86    #[must_use]
87    pub const fn or_iprivate(mut self) -> Self {
88        self.table[INDEX_IPRIVATE] = true;
89        self
90    }
91
92    /// Subtracts from this table.
93    ///
94    /// Returns a new table that allows all the byte patterns allowed
95    /// by `self` but not allowed by `other`.
96    #[must_use]
97    pub const fn sub(mut self, other: &Self) -> Self {
98        let mut i = 0;
99        while i < TABLE_LEN {
100            self.table[i] &= !other.table[i];
101            i += 1;
102        }
103        self
104    }
105
106    /// Checks whether the table is a subset of another, i.e., `other`
107    /// allows at least all the byte patterns allowed by `self`.
108    #[must_use]
109    pub const fn is_subset(&self, other: &Self) -> bool {
110        let mut i = 0;
111        while i < TABLE_LEN {
112            if self.table[i] & !other.table[i] {
113                return false;
114            }
115            i += 1;
116        }
117        true
118    }
119
120    #[inline]
121    pub(crate) const fn allows_ascii(&self, x: u8) -> bool {
122        self.table[x as usize]
123    }
124
125    #[inline]
126    pub(crate) const fn allows_non_ascii(&self) -> bool {
127        self.table[INDEX_UCSCHAR] | self.table[INDEX_IPRIVATE]
128    }
129
130    pub(crate) const fn allows_code_point(&self, x: u32) -> bool {
131        if x < 128 {
132            self.table[x as usize]
133        } else {
134            (self.table[INDEX_UCSCHAR] && is_ucschar(x))
135                || (self.table[INDEX_IPRIVATE] && is_iprivate(x))
136        }
137    }
138
139    /// Checks whether the given unencoded character is allowed by the table.
140    #[inline]
141    #[must_use]
142    pub const fn allows(&self, ch: char) -> bool {
143        self.allows_code_point(ch as u32)
144    }
145
146    /// Checks whether percent-encoded octets are allowed by the table.
147    #[inline]
148    #[must_use]
149    pub const fn allows_pct_encoded(&self) -> bool {
150        self.table[INDEX_PCT_ENCODED]
151    }
152
153    /// Validates the given string with the table.
154    pub(crate) const fn validate(&self, s: &[u8]) -> bool {
155        let mut i = 0;
156        let allow_pct_encoded = self.allows_pct_encoded();
157        let allow_non_ascii = self.allows_non_ascii();
158
159        while i < s.len() {
160            let x = s[i];
161            if allow_pct_encoded && x == b'%' {
162                if i + 2 >= s.len() {
163                    return false;
164                }
165                let (hi, lo) = (s[i + 1], s[i + 2]);
166
167                if !(HEXDIG.allows_ascii(hi) & HEXDIG.allows_ascii(lo)) {
168                    return false;
169                }
170                i += 3;
171            } else if allow_non_ascii {
172                let (x, len) = utf8::next_code_point(s, i);
173                if !self.allows_code_point(x) {
174                    return false;
175                }
176                i += len;
177            } else {
178                if !self.allows_ascii(x) {
179                    return false;
180                }
181                i += 1;
182            }
183        }
184        true
185    }
186}
187
188const fn new(bytes: &[u8]) -> Table {
189    Table::new(bytes)
190}
191
192// Rules from RFC 3986:
193
194/// `ALPHA = %x41-5A / %x61-7A`
195pub const ALPHA: &Table = &new(b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
196
197/// `DIGIT = %x30-39`
198pub const DIGIT: &Table = &new(b"0123456789");
199
200/// `HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"`
201pub const HEXDIG: &Table = &DIGIT.or(&new(b"ABCDEFabcdef"));
202
203/// `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`
204pub const SCHEME: &Table = &ALPHA.or(DIGIT).or(&new(b"+-."));
205
206/// `userinfo = *( unreserved / pct-encoded / sub-delims / ":" )`
207pub const USERINFO: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b":")).or_pct_encoded();
208
209/// `IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )`
210pub const IPV_FUTURE: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b":"));
211
212/// `reg-name = *( unreserved / pct-encoded / sub-delims )`
213pub const REG_NAME: &Table = &UNRESERVED.or(SUB_DELIMS).or_pct_encoded();
214
215/// `path = *( pchar / "/" )`
216pub const PATH: &Table = &PCHAR.or(&new(b"/"));
217
218/// `segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )`
219pub const SEGMENT_NZ_NC: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b"@")).or_pct_encoded();
220
221/// `pchar = unreserved / pct-encoded / sub-delims / ":" / "@"`
222pub const PCHAR: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b":@")).or_pct_encoded();
223
224/// `query = *( pchar / "/" / "?" )`
225pub const QUERY: &Table = &PCHAR.or(&new(b"/?"));
226
227/// `fragment = *( pchar / "/" / "?" )`
228pub const FRAGMENT: &Table = QUERY;
229
230/// `unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"`
231pub const UNRESERVED: &Table = &ALPHA.or(DIGIT).or(&new(b"-._~"));
232
233/// `reserved = gen-delims / sub-delims`
234pub const RESERVED: &Table = &GEN_DELIMS.or(SUB_DELIMS);
235
236/// `gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"`
237pub const GEN_DELIMS: &Table = &new(b":/?#[]@");
238
239/// `sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
240///             / "*" / "+" / "," / ";" / "="`
241pub const SUB_DELIMS: &Table = &new(b"!$&'()*+,;=");
242
243// Rules from RFC 3987:
244
245pub const IUSERINFO: &Table = &USERINFO.or_ucschar();
246pub const IREG_NAME: &Table = &REG_NAME.or_ucschar();
247pub const IPATH: &Table = &PATH.or_ucschar();
248pub const ISEGMENT_NZ_NC: &Table = &SEGMENT_NZ_NC.or_ucschar();
249pub const IQUERY: &Table = &QUERY.or_ucschar().or_iprivate();
250pub const IFRAGMENT: &Table = &FRAGMENT.or_ucschar();