iri_string/parser/trusted.rs
1//! Fast parsers for trusted (already validated) input.
2//!
//! Using these parsers on input that has not been validated leads to wrong results or panics.
4
5pub(crate) mod authority;
6
7use core::cmp::Ordering;
8use core::num::NonZeroUsize;
9
10use crate::components::{RiReferenceComponents, Splitter};
11use crate::format::eq_str_display;
12use crate::normalize::{is_pct_case_normalized, NormalizedAsciiOnlyHost, NormalizednessCheckMode};
13use crate::parser::str::{find_split2, find_split3, find_split4_hole, find_split_hole};
14use crate::spec::Spec;
15use crate::types::RiReferenceStr;
16
17/// Eats a `scheme` and a following colon, and returns the rest and the scheme.
18///
19/// Returns `(rest, scheme)`.
20///
21/// This should be called at the head of an absolute IRIs/URIs.
22#[must_use]
23fn scheme_colon(i: &str) -> (&str, &str) {
24 let (scheme, rest) =
25 find_split_hole(i, b':').expect("[precondition] absolute IRIs must have `scheme` part");
26 (rest, scheme)
27}
28
29/// Eats a `scheme` and a following colon if available, and returns the rest and the scheme.
30///
31/// This should be called at the head of an `IRI-reference` or similar.
32#[must_use]
33fn scheme_colon_opt(i: &str) -> (&str, Option<&str>) {
34 match find_split4_hole(i, b':', b'/', b'?', b'#') {
35 Some((scheme, b':', rest)) => (rest, Some(scheme)),
36 _ => (i, None),
37 }
38}
39
40/// Eats double slash and the following authority if available, and returns the authority.
41///
42/// This should be called at the head of an `IRI-reference`, or at the result of `scheme_colon`.
43#[must_use]
44fn slash_slash_authority_opt(i: &str) -> (&str, Option<&str>) {
45 let s = match i.strip_prefix("//") {
46 Some(rest) => rest,
47 None => return (i, None),
48 };
49 // `i` might match `path-abempty` (which can start with `//`), but it is not
50 // allowed as `relative-part`, so no need to care `path-abempty` rule here.
51 // A slash, question mark, and hash character won't appear in `authority`.
52 match find_split3(s, b'/', b'?', b'#') {
53 Some((authority, rest)) => (rest, Some(authority)),
54 None => ("", Some(s)),
55 }
56}
57
58/// Eats a string until the query, and returns that part (excluding `?` for the query).
59#[must_use]
60fn until_query(i: &str) -> (&str, &str) {
61 // `?` won't appear before the query part.
62 match find_split2(i, b'?', b'#') {
63 Some((before_query, rest)) => (rest, before_query),
64 None => ("", i),
65 }
66}
67
68/// Decomposes query and fragment, if available.
69///
70/// The string must starts with `?`, or `#`, or be empty.
71#[must_use]
72fn decompose_query_and_fragment(i: &str) -> (Option<&str>, Option<&str>) {
73 match i.as_bytes().first().copied() {
74 None => (None, None),
75 Some(b'?') => {
76 let rest = &i[1..];
77 match find_split_hole(rest, b'#') {
78 Some((query, fragment)) => (Some(query), Some(fragment)),
79 None => (Some(rest), None),
80 }
81 }
82 Some(c) => {
83 debug_assert_eq!(c, b'#');
84 (None, Some(&i[1..]))
85 }
86 }
87}
88
89/// Decomposes the given valid `IRI-reference`.
90#[must_use]
91pub(crate) fn decompose_iri_reference<S: Spec>(
92 i: &RiReferenceStr<S>,
93) -> RiReferenceComponents<'_, S> {
94 /// Inner function to avoid unnecessary monomorphizations on `S`.
95 fn decompose(i: &str) -> Splitter {
96 let len = i.len();
97
98 let (i, scheme_end) = {
99 let (i, scheme) = scheme_colon_opt(i);
100 let end = scheme.and_then(|s| NonZeroUsize::new(s.len()));
101 (i, end)
102 };
103 let (i, authority_end) = {
104 // 2: "//".len()
105 let start = len - i.len() + 2;
106 // `authority` does not contain the two slashes of `://'.
107 let (i, authority) = slash_slash_authority_opt(i);
108 let end = authority.and_then(|s| NonZeroUsize::new(start + s.len()));
109 (i, end)
110 };
111 let (i, _path) = until_query(i);
112
113 let (query_start, fragment_start) = {
114 // This could theoretically be zero if `len` is `usize::MAX` and
115 // `i` has neither a query nor a fragment. However, this is
116 // practically impossible.
117 let after_first_prefix = NonZeroUsize::new((len - i.len()).wrapping_add(1));
118
119 let (query, fragment) = decompose_query_and_fragment(i);
120 match (query.is_some(), fragment) {
121 (true, Some(fragment)) => {
122 (after_first_prefix, NonZeroUsize::new(len - fragment.len()))
123 }
124 (true, None) => (after_first_prefix, None),
125 (false, Some(_fragment)) => (None, after_first_prefix),
126 (false, None) => (None, None),
127 }
128 };
129
130 Splitter::new(scheme_end, authority_end, query_start, fragment_start)
131 }
132
133 RiReferenceComponents {
134 iri: i,
135 splitter: decompose(i.as_str()),
136 }
137}
138
139/// Extracts `scheme` part from an IRI reference.
140///
141/// # Precondition
142///
143/// The given string must be a valid IRI reference.
144#[inline]
145#[must_use]
146pub(crate) fn extract_scheme(i: &str) -> Option<&str> {
147 scheme_colon_opt(i).1
148}
149
150/// Extracts `scheme` part from an absolute IRI.
151///
152/// # Precondition
153///
154/// The given string must be a valid absolute IRI.
155#[inline]
156#[must_use]
157pub(crate) fn extract_scheme_absolute(i: &str) -> &str {
158 scheme_colon(i).1
159}
160
161/// Extracts `authority` part from an IRI reference.
162///
163/// # Precondition
164///
165/// The given string must be a valid IRI reference.
166#[inline]
167#[must_use]
168pub(crate) fn extract_authority(i: &str) -> Option<&str> {
169 let (i, _scheme) = scheme_colon_opt(i);
170 slash_slash_authority_opt(i).1
171}
172
173/// Extracts `authority` part from an absolute IRI.
174///
175/// # Precondition
176///
177/// The given string must be a valid absolute IRI.
178#[inline]
179#[must_use]
180pub(crate) fn extract_authority_absolute(i: &str) -> Option<&str> {
181 let (i, _scheme) = scheme_colon(i);
182 slash_slash_authority_opt(i).1
183}
184
185/// Extracts `authority` part from a relative IRI.
186///
187/// # Precondition
188///
189/// The given string must be a valid relative IRI.
190#[inline]
191#[must_use]
192pub(crate) fn extract_authority_relative(i: &str) -> Option<&str> {
193 slash_slash_authority_opt(i).1
194}
195
196/// Extracts `path` part from an IRI reference.
197///
198/// # Precondition
199///
200/// The given string must be a valid IRI reference.
201#[inline]
202#[must_use]
203pub(crate) fn extract_path(i: &str) -> &str {
204 let (i, _scheme) = scheme_colon_opt(i);
205 let (i, _authority) = slash_slash_authority_opt(i);
206 until_query(i).1
207}
208
209/// Extracts `path` part from an absolute IRI.
210///
211/// # Precondition
212///
213/// The given string must be a valid absolute IRI.
214#[inline]
215#[must_use]
216pub(crate) fn extract_path_absolute(i: &str) -> &str {
217 let (i, _scheme) = scheme_colon(i);
218 let (i, _authority) = slash_slash_authority_opt(i);
219 until_query(i).1
220}
221
222/// Extracts `path` part from a relative IRI.
223///
224/// # Precondition
225///
226/// The given string must be a valid relative IRI.
227#[inline]
228#[must_use]
229pub(crate) fn extract_path_relative(i: &str) -> &str {
230 let (i, _authority) = slash_slash_authority_opt(i);
231 until_query(i).1
232}
233
234/// Extracts `query` part from an IRI reference.
235///
236/// # Precondition
237///
238/// The given string must be a valid IRI reference.
239#[inline]
240#[must_use]
241pub(crate) fn extract_query(i: &str) -> Option<&str> {
242 let (i, _before_query) = until_query(i);
243 decompose_query_and_fragment(i).0
244}
245
246/// Extracts `query` part from an `absolute-IRI` string.
247///
248/// # Precondition
249///
250/// The given string must be a valid `absolute-IRI` string.
251#[must_use]
252pub(crate) fn extract_query_absolute_iri(i: &str) -> Option<&str> {
253 let (i, _before_query) = until_query(i);
254 if i.is_empty() {
255 None
256 } else {
257 debug_assert_eq!(
258 i.as_bytes().first(),
259 Some(&b'?'),
260 "`absolute-IRI` string must not have `fragment part"
261 );
262 Some(&i[1..])
263 }
264}
265
266/// Splits an IRI string into the prefix and the fragment part.
267///
268/// A leading `#` character is truncated if the fragment part exists.
269///
270/// # Precondition
271///
272/// The given string must be a valid IRI reference.
273#[inline]
274#[must_use]
275pub(crate) fn split_fragment(iri: &str) -> (&str, Option<&str>) {
276 // It is completely OK to find the first `#` character from valid IRI to get fragment part,
277 // because the spec says that there are no `#` characters before the fragment part.
278 //
279 // > ```
280 // > scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
281 // > ```
282 // >
283 // > --- [RFC 3986, section 3.1. Scheme](https://tools.ietf.org/html/rfc3986#section-3.1)
284 //
285 // > The authority component is preceded by a double slash ("//") and is terminated by the
286 // > next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end
287 // > of the URI.
288 // >
289 // > --- [RFC 3986, section 3.2. Authority](https://tools.ietf.org/html/rfc3986#section-3.2)
290 //
291 // > The path is terminated by the first question mark ("?") or number sign ("#")
292 // > character, or by the end of the URI.
293 // >
294 // > --- [RFC 3986, section 3.3. Path](https://tools.ietf.org/html/rfc3986#section-3.3)
295 //
296 // > The query component is indicated by the first question mark ("?") character and
297 // > terminated by a number sign ("#") character or by the end of the URI.
298 // >
299 // > --- [RFC 3986, section 3.4. Query](https://tools.ietf.org/html/rfc3986#section-3.4)
300 match find_split_hole(iri, b'#') {
301 Some((prefix, fragment)) => (prefix, Some(fragment)),
302 None => (iri, None),
303 }
304}
305
306/// Returns the fragment part of the given IRI.
307///
308/// A leading `#` character of the fragment is truncated.
309#[inline]
310#[must_use]
311pub(crate) fn extract_fragment(iri: &str) -> Option<&str> {
312 split_fragment(iri).1
313}
314
315/// Returns `Ok(_)` if the string is normalized.
316///
317/// If this function returns `true`, normalization input and output will be identical.
318///
319/// In this function, "normalized" means that any of the normalization below
320/// won't change the input on normalization:
321///
322/// * syntax-based normalization,
323/// * case normalization,
324/// * percent-encoding normalization, and
325/// * path segment normalizaiton.
326///
327/// Note that scheme-based normalization is not considered.
328#[must_use]
329pub(crate) fn is_normalized<S: Spec>(i: &str, mode: NormalizednessCheckMode) -> bool {
330 let (i, scheme) = scheme_colon(i);
331 let (after_authority, authority) = slash_slash_authority_opt(i);
332 let (_after_path, path) = until_query(after_authority);
333
334 // Syntax-based normalization: uppercase chars in `scheme` should be
335 // converted to lowercase.
336 if scheme.bytes().any(|b| b.is_ascii_uppercase()) {
337 return false;
338 }
339
340 // Case normalization: ASCII alphabets in US-ASCII only `host` should be
341 // normalized to lowercase.
342 // Case normalization: ASCII alphabets in percent-encoding triplet should be
343 // normalized to uppercase.
344 // Percent-encoding normalization: unresreved characters should be decoded
345 // in `userinfo`, `host`, `path`, `query`, and `fragments`.
346 // Path segment normalization: the path should not have dot segments (`.`
347 // and/or `..`).
348 //
349 // Note that `authority` can have percent-encoded `userinfo`.
350 if let Some(authority) = authority {
351 let authority_components = authority::decompose_authority(authority);
352
353 // Check `host`.
354 let host = authority_components.host();
355 let host_is_normalized = if is_ascii_only_host(host) {
356 eq_str_display(host, &NormalizedAsciiOnlyHost::new(host))
357 } else {
358 // If the host is not ASCII-only, conversion to lowercase is not performed.
359 is_pct_case_normalized::<S>(host)
360 };
361 if !host_is_normalized {
362 return false;
363 }
364
365 // Check pencent encodings in `userinfo`.
366 if let Some(userinfo) = authority_components.userinfo() {
367 if !is_pct_case_normalized::<S>(userinfo) {
368 return false;
369 }
370 }
371 }
372
373 // Check `path`.
374 //
375 // Syntax-based normalization: Dot segments might be removed.
376 // Note that we don't have to care `%2e` and `%2E` since `.` is unreserved
377 // and they will be decoded if not normalized.
378 // Also note that WHATWG serialization will use `/.//` as a path prefix if
379 // the path is absolute and won't modify the path if the path is relative.
380 //
381 // Percent-encoding normalization: unresreved characters should be decoded
382 // in `path`, `query`, and `fragments`.
383 let path_span_no_dot_segments = if authority.is_some() {
384 Some(path)
385 } else {
386 match mode {
387 NormalizednessCheckMode::Default => Some(path.strip_prefix("/.//").unwrap_or(path)),
388 NormalizednessCheckMode::Rfc3986 => Some(path),
389 NormalizednessCheckMode::PreserveAuthoritylessRelativePath => {
390 if path.starts_with('/') {
391 // Absolute.
392 Some(path.strip_prefix("/.//").unwrap_or(path))
393 } else {
394 // Relative. Treat the path as "opaque". No span to check.
395 None
396 }
397 }
398 }
399 };
400 if let Some(path_span_no_dot_segments) = path_span_no_dot_segments {
401 if path_span_no_dot_segments
402 .split('/')
403 .any(|segment| matches!(segment, "." | ".."))
404 {
405 return false;
406 }
407 }
408 is_pct_case_normalized::<S>(after_authority)
409}
410
411/// Decodes two hexdigits into a byte.
412///
413/// # Preconditions
414///
415/// The parameters `upper` and `lower` should be an ASCII hexadecimal digit.
416#[must_use]
417pub(super) fn hexdigits_to_byte([upper, lower]: [u8; 2]) -> u8 {
418 let i_upper = match (upper & 0xf0).cmp(&0x40) {
419 Ordering::Less => upper - b'0',
420 Ordering::Equal => upper - (b'A' - 10),
421 Ordering::Greater => upper - (b'a' - 10),
422 };
423 let i_lower = match (lower & 0xf0).cmp(&0x40) {
424 Ordering::Less => lower - b'0',
425 Ordering::Equal => lower - (b'A' - 10),
426 Ordering::Greater => lower - (b'a' - 10),
427 };
428 (i_upper << 4) + i_lower
429}
430
431/// Converts the first two hexdigit bytes in the buffer into a byte.
432///
433/// # Panics
434///
435/// Panics if the string does not start with two hexdigits.
436#[must_use]
437pub(crate) fn take_xdigits2(s: &str) -> (u8, &str) {
438 let mut bytes = s.bytes();
439 let upper_xdigit = bytes
440 .next()
441 .expect("[validity] at least two bytes should follow the `%` in a valid IRI reference");
442 let lower_xdigit = bytes
443 .next()
444 .expect("[validity] at least two bytes should follow the `%` in a valid IRI reference");
445 let v = hexdigits_to_byte([upper_xdigit, lower_xdigit]);
446 (v, &s[2..])
447}
448
449/// Returns true if the given `host`/`ihost` string consists of only US-ASCII characters.
450///
451/// # Precondition
452///
453/// The given string should be valid `host` or `host ":" port` string.
454#[must_use]
455pub(crate) fn is_ascii_only_host(mut host: &str) -> bool {
456 while let Some((i, c)) = host
457 .char_indices()
458 .find(|(_i, c)| !c.is_ascii() || *c == '%')
459 {
460 if c != '%' {
461 // Non-ASCII character found.
462 debug_assert!(!c.is_ascii());
463 return false;
464 }
465 // Percent-encoded character found.
466 let after_pct = &host[(i + 1)..];
467 let (byte, rest) = take_xdigits2(after_pct);
468 if !byte.is_ascii() {
469 return false;
470 }
471 host = rest;
472 }
473
474 // Neither non-ASCII characters nor percent-encoded characters found.
475 true
476}