fluent_uri/encoding/
table.rs

1//! Byte pattern tables from RFC 3986 and RFC 3987.
2//!
3//! The predefined table constants in this module are documented with
4//! the ABNF notation of [RFC 5234].
5//!
6//! [RFC 5234]: https://datatracker.ietf.org/doc/html/rfc5234
7
8use alloc::string::String;
9
// Layout of the backing array: slots `0..=255` are indexed directly by byte
// value, and three flag slots follow them.

/// Slot holding the "percent-encoded octets are allowed" flag.
const INDEX_PCT_ENCODED: usize = 0x100;
/// Slot holding the "`ucschar` characters are allowed" flag.
const INDEX_UCSCHAR: usize = INDEX_PCT_ENCODED + 1;
/// Slot holding the "`iprivate` characters are allowed" flag.
const INDEX_IPRIVATE: usize = INDEX_UCSCHAR + 1;
/// Total number of slots: one per byte value plus the three flags.
const TABLE_LEN: usize = INDEX_IPRIVATE + 1;
14
/// Checks whether `x` lies in one of the `ucschar` ranges of RFC 3987,
/// section 2.2.
pub(crate) const fn is_ucschar(x: u32) -> bool {
    match x {
        // Ranges inside the Basic Multilingual Plane.
        0xa0..=0xd7ff | 0xf900..=0xfdcf | 0xfdf0..=0xffef => true,
        // Planes 1 through 13: everything except the last two code points
        // (xFFFE and xFFFF) of each plane.
        0x1_0000..=0xd_ffff => (x & 0xffff) < 0xfffe,
        // Plane 14, which only starts at U+E1000.
        0xe_1000..=0xe_fffd => true,
        _ => false,
    }
}
20
/// Checks whether `x` lies in one of the `iprivate` ranges of RFC 3987,
/// section 2.2:
///
/// `iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD`
///
/// Every range is bounded explicitly, so integers above `0x10FFFD`
/// (which are not Unicode code points at all) are rejected rather than
/// being accepted by accident when their low 16 bits happen to be small.
pub(crate) const fn is_iprivate(x: u32) -> bool {
    matches!(
        x,
        0xe000..=0xf8ff | 0xf_0000..=0xf_fffd | 0x10_0000..=0x10_fffd
    )
}
24
25/// A table specifying the byte patterns allowed in a string.
26#[derive(Clone, Copy, Debug)]
27pub struct Table {
28    table: [bool; TABLE_LEN],
29}
30
31impl Table {
32    /// Creates a table that only allows the given unencoded bytes.
33    ///
34    /// # Panics
35    ///
36    /// Panics if any of the bytes is not ASCII or equals `b'%'`.
37    #[must_use]
38    pub const fn new(mut bytes: &[u8]) -> Table {
39        let mut table = [false; TABLE_LEN];
40        while let [cur, rem @ ..] = bytes {
41            assert!(
42                cur.is_ascii() && *cur != b'%',
43                "cannot allow non-ASCII byte or %"
44            );
45            table[*cur as usize] = true;
46            bytes = rem;
47        }
48        Table { table }
49    }
50
51    /// Combines two tables into one.
52    ///
53    /// Returns a new table that allows all the byte patterns allowed
54    /// by `self` or by `other`.
55    #[must_use]
56    pub const fn or(mut self, other: &Table) -> Table {
57        let mut i = 0;
58        while i < TABLE_LEN {
59            self.table[i] |= other.table[i];
60            i += 1;
61        }
62        self
63    }
64
65    /// Marks this table as allowing percent-encoded octets.
66    #[must_use]
67    pub const fn or_pct_encoded(mut self) -> Table {
68        self.table[INDEX_PCT_ENCODED] = true;
69        self
70    }
71
72    /// Marks this table as allowing characters matching the [`ucschar`]
73    /// ABNF rule from RFC 3987.
74    ///
75    /// [`ucschar`]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
76    #[must_use]
77    pub const fn or_ucschar(mut self) -> Table {
78        self.table[INDEX_UCSCHAR] = true;
79        self
80    }
81
82    /// Marks this table as allowing characters matching the [`iprivate`]
83    /// ABNF rule from RFC 3987.
84    ///
85    /// [`iprivate`]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
86    #[must_use]
87    pub const fn or_iprivate(mut self) -> Table {
88        self.table[INDEX_IPRIVATE] = true;
89        self
90    }
91
92    /// Subtracts from this table.
93    ///
94    /// Returns a new table that allows all the byte patterns allowed
95    /// by `self` but not allowed by `other`.
96    #[must_use]
97    pub const fn sub(mut self, other: &Table) -> Table {
98        let mut i = 0;
99        while i < TABLE_LEN {
100            self.table[i] &= !other.table[i];
101            i += 1;
102        }
103        self
104    }
105
106    /// Checks whether the table is a subset of another, i.e., `other`
107    /// allows at least all the byte patterns allowed by `self`.
108    #[must_use]
109    pub const fn is_subset(&self, other: &Table) -> bool {
110        let mut i = 0;
111        while i < TABLE_LEN {
112            if self.table[i] & !other.table[i] {
113                return false;
114            }
115            i += 1;
116        }
117        true
118    }
119
120    #[inline]
121    pub(crate) const fn allows_ascii(&self, x: u8) -> bool {
122        self.table[x as usize]
123    }
124
125    #[inline]
126    pub(crate) const fn allows_non_ascii(&self) -> bool {
127        self.table[INDEX_UCSCHAR] | self.table[INDEX_IPRIVATE]
128    }
129
130    pub(crate) const fn allows_code_point(&self, x: u32) -> bool {
131        if x < 128 {
132            self.table[x as usize]
133        } else {
134            (self.table[INDEX_UCSCHAR] && is_ucschar(x))
135                || (self.table[INDEX_IPRIVATE] && is_iprivate(x))
136        }
137    }
138
139    /// Checks whether the given unencoded character is allowed by the table.
140    #[inline]
141    #[must_use]
142    pub const fn allows(&self, ch: char) -> bool {
143        self.allows_code_point(ch as u32)
144    }
145
146    /// Checks whether percent-encoded octets are allowed by the table.
147    #[inline]
148    #[must_use]
149    pub const fn allows_pct_encoded(&self) -> bool {
150        self.table[INDEX_PCT_ENCODED]
151    }
152
153    pub(crate) fn encode(&self, ch: char, buf: &mut String) {
154        if self.allows(ch) {
155            buf.push(ch);
156        } else {
157            for x in ch.encode_utf8(&mut [0; 4]).bytes() {
158                super::encode_byte(x, buf);
159            }
160        }
161    }
162
163    /// Validates the given string with the table.
164    pub(crate) const fn validate(&self, s: &[u8]) -> bool {
165        let mut i = 0;
166        let allow_pct_encoded = self.allows_pct_encoded();
167        let allow_non_ascii = self.allows_non_ascii();
168
169        while i < s.len() {
170            let x = s[i];
171            if allow_pct_encoded && x == b'%' {
172                if i + 2 >= s.len() {
173                    return false;
174                }
175                let (hi, lo) = (s[i + 1], s[i + 2]);
176
177                if !(HEXDIG.allows_ascii(hi) & HEXDIG.allows_ascii(lo)) {
178                    return false;
179                }
180                i += 3;
181            } else if allow_non_ascii {
182                let (x, len) = super::next_code_point(s, i);
183                if !self.allows_code_point(x) {
184                    return false;
185                }
186                i += len;
187            } else {
188                if !self.allows_ascii(x) {
189                    return false;
190                }
191                i += 1;
192            }
193        }
194        true
195    }
196}
197
/// Module-private shorthand for [`Table::new`].
///
/// It only forwards its argument, so it panics under the same conditions:
/// when a byte is not ASCII or equals `b'%'`.
const fn new(bytes: &[u8]) -> Table {
    Table::new(bytes)
}
201
// Rules from RFC 3986:
//
// A table only records which unencoded characters (and whether
// percent-encoded octets) may appear. Repetition counts such as `*` or `1*`
// and fixed leading parts shown in the ABNF are not enforced by the table.

/// `ALPHA = %x41-5A / %x61-7A`
pub const ALPHA: &Table = &new(b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

/// `DIGIT = %x30-39`
pub const DIGIT: &Table = &new(b"0123456789");

/// `HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"`
///
/// Lowercase `a` to `f` are allowed as well, since ABNF string literals
/// are case-insensitive.
pub const HEXDIG: &Table = &DIGIT.or(&new(b"ABCDEFabcdef"));

/// `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`
///
/// The table holds the full character set of the rule; it does not by
/// itself require the first character to be `ALPHA`.
pub const SCHEME: &Table = &ALPHA.or(DIGIT).or(&new(b"+-."));

/// `userinfo = *( unreserved / pct-encoded / sub-delims / ":" )`
pub const USERINFO: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b":")).or_pct_encoded();

/// `IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )`
///
/// The table holds the character set of the final repetition only.
pub const IPV_FUTURE: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b":"));

/// `reg-name = *( unreserved / pct-encoded / sub-delims )`
pub const REG_NAME: &Table = &UNRESERVED.or(SUB_DELIMS).or_pct_encoded();

/// `path = *( pchar / "/" )`
pub const PATH: &Table = &PCHAR.or(&new(b"/"));

/// `segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )`
pub const SEGMENT_NZ_NC: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b"@")).or_pct_encoded();

/// `pchar = unreserved / pct-encoded / sub-delims / ":" / "@"`
pub const PCHAR: &Table = &UNRESERVED.or(SUB_DELIMS).or(&new(b":@")).or_pct_encoded();

/// `query = *( pchar / "/" / "?" )`
pub const QUERY: &Table = &PCHAR.or(&new(b"/?"));

/// `fragment = *( pchar / "/" / "?" )`
///
/// This is the same table as [`QUERY`], as both rules allow the same
/// characters.
pub const FRAGMENT: &Table = QUERY;

/// `unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"`
pub const UNRESERVED: &Table = &ALPHA.or(DIGIT).or(&new(b"-._~"));

/// `reserved = gen-delims / sub-delims`
pub const RESERVED: &Table = &GEN_DELIMS.or(SUB_DELIMS);

/// `gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"`
pub const GEN_DELIMS: &Table = &new(b":/?#[]@");

/// `sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
///             / "*" / "+" / "," / ";" / "="`
pub const SUB_DELIMS: &Table = &new(b"!$&'()*+,;=");
252
// Rules from RFC 3987:
//
// Each table is its RFC 3986 counterpart with `ucschar` characters
// additionally allowed, which is what turns `unreserved` into
// `iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar`.

/// `iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )`
pub const IUSERINFO: &Table = &USERINFO.or_ucschar();
/// `ireg-name = *( iunreserved / pct-encoded / sub-delims )`
pub const IREG_NAME: &Table = &REG_NAME.or_ucschar();
/// `ipath = *( ipchar / "/" )`
pub const IPATH: &Table = &PATH.or_ucschar();
/// `isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims / "@" )`
pub const ISEGMENT_NZ_NC: &Table = &SEGMENT_NZ_NC.or_ucschar();
/// `iquery = *( ipchar / iprivate / "/" / "?" )`
///
/// The only table here that also allows `iprivate` characters.
pub const IQUERY: &Table = &QUERY.or_ucschar().or_iprivate();
/// `ifragment = *( ipchar / "/" / "?" )`
pub const IFRAGMENT: &Table = &FRAGMENT.or_ucschar();