Skip to main content

iri_string/parser/
trusted.rs

1//! Fast parsers for trusted (already validated) input.
2//!
3//! Using this in the wrong way will lead to unexpected wrong results.
4
5pub(crate) mod authority;
6
7use core::num::NonZeroUsize;
8use core::ops::{self, RangeFrom};
9
10use crate::components::{RiReferenceComponents, Splitter};
11use crate::format::eq_str_display;
12use crate::normalize::{is_pct_case_normalized, NormalizedAsciiOnlyHost, NormalizednessCheckMode};
13use crate::parser::str::{find_split2, find_split3, find_split4_hole, find_split_hole};
14use crate::spec::Spec;
15use crate::types::RiReferenceStr;
16
17/// Eats a `scheme` and a following colon, and returns the rest and the scheme.
18///
19/// Returns `(rest, scheme)`.
20///
21/// This should be called at the head of an absolute IRIs/URIs.
22#[must_use]
23fn scheme_colon(i: &str) -> (&str, &str) {
24    let (scheme, rest) =
25        find_split_hole(i, b':').expect("[precondition] absolute IRIs must have `scheme` part");
26    (rest, scheme)
27}
28
29/// Eats a `scheme` and a following colon if available, and returns the rest and the scheme.
30///
31/// This should be called at the head of an `IRI-reference` or similar.
32#[must_use]
33fn scheme_colon_opt(i: &str) -> (&str, Option<&str>) {
34    match find_split4_hole(i, b':', b'/', b'?', b'#') {
35        Some((scheme, b':', rest)) => (rest, Some(scheme)),
36        _ => (i, None),
37    }
38}
39
40/// Eats double slash and the following authority if available, and returns the authority.
41///
42/// This should be called at the head of an `IRI-reference`, or at the result of `scheme_colon`.
43#[must_use]
44fn slash_slash_authority_opt(i: &str) -> (&str, Option<&str>) {
45    let s = match i.strip_prefix("//") {
46        Some(rest) => rest,
47        None => return (i, None),
48    };
49    // `i` might match `path-abempty` (which can start with `//`), but it is not
50    // allowed as `relative-part`, so no need to care `path-abempty` rule here.
51    // A slash, question mark, and hash character won't appear in `authority`.
52    match find_split3(s, b'/', b'?', b'#') {
53        Some((authority, rest)) => (rest, Some(authority)),
54        None => ("", Some(s)),
55    }
56}
57
58/// Eats a string until the query, and returns that part (excluding `?` for the query).
59#[must_use]
60fn until_query(i: &str) -> (&str, &str) {
61    // `?` won't appear before the query part.
62    match find_split2(i, b'?', b'#') {
63        Some((before_query, rest)) => (rest, before_query),
64        None => ("", i),
65    }
66}
67
68/// Decomposes query and fragment, if available.
69///
70/// The string must starts with `?`, or `#`, or be empty.
71#[must_use]
72fn decompose_query_and_fragment(i: &str) -> (Option<&str>, Option<&str>) {
73    match i.as_bytes().first().copied() {
74        None => (None, None),
75        Some(b'?') => {
76            let rest = &i[1..];
77            match find_split_hole(rest, b'#') {
78                Some((query, fragment)) => (Some(query), Some(fragment)),
79                None => (Some(rest), None),
80            }
81        }
82        Some(c) => {
83            debug_assert_eq!(c, b'#');
84            (None, Some(&i[1..]))
85        }
86    }
87}
88
89/// Decomposes the given valid `IRI-reference`.
90#[must_use]
91pub(crate) fn decompose_iri_reference<S: Spec>(
92    i: &RiReferenceStr<S>,
93) -> RiReferenceComponents<'_, S> {
94    /// Inner function to avoid unnecessary monomorphizations on `S`.
95    fn decompose(i: &str) -> Splitter {
96        let len = i.len();
97
98        let (i, scheme_end) = {
99            let (i, scheme) = scheme_colon_opt(i);
100            let end = scheme.and_then(|s| NonZeroUsize::new(s.len()));
101            (i, end)
102        };
103        let (i, authority_end) = {
104            // 2: "//".len()
105            let start = len - i.len() + 2;
106            // `authority` does not contain the two slashes of `://'.
107            let (i, authority) = slash_slash_authority_opt(i);
108            let end = authority.and_then(|s| NonZeroUsize::new(start + s.len()));
109            (i, end)
110        };
111        let (i, _path) = until_query(i);
112
113        let (query_start, fragment_start) = {
114            // This could theoretically be zero if `len` is `usize::MAX` and
115            // `i` has neither a query nor a fragment. However, this is
116            // practically impossible.
117            let after_first_prefix = NonZeroUsize::new((len - i.len()).wrapping_add(1));
118
119            let (query, fragment) = decompose_query_and_fragment(i);
120            match (query.is_some(), fragment) {
121                (true, Some(fragment)) => {
122                    (after_first_prefix, NonZeroUsize::new(len - fragment.len()))
123                }
124                (true, None) => (after_first_prefix, None),
125                (false, Some(_fragment)) => (None, after_first_prefix),
126                (false, None) => (None, None),
127            }
128        };
129
130        Splitter::new(scheme_end, authority_end, query_start, fragment_start)
131    }
132
133    RiReferenceComponents {
134        iri: i,
135        splitter: decompose(i.as_str()),
136    }
137}
138
139/// Extracts `scheme` part from an IRI reference.
140///
141/// # Precondition
142///
143/// The given string must be a valid IRI reference.
144#[inline]
145#[must_use]
146pub(crate) fn extract_scheme(i: &str) -> Option<&str> {
147    scheme_colon_opt(i).1
148}
149
150/// Extracts `scheme` part from an absolute IRI.
151///
152/// # Precondition
153///
154/// The given string must be a valid absolute IRI.
155#[inline]
156#[must_use]
157pub(crate) fn extract_scheme_absolute(i: &str) -> &str {
158    scheme_colon(i).1
159}
160
161/// Extracts `authority` part from an IRI reference.
162///
163/// # Precondition
164///
165/// The given string must be a valid IRI reference.
166#[inline]
167#[must_use]
168pub(crate) fn extract_authority(i: &str) -> Option<&str> {
169    let (i, _scheme) = scheme_colon_opt(i);
170    slash_slash_authority_opt(i).1
171}
172
173/// Extracts `authority` part from an absolute IRI.
174///
175/// # Precondition
176///
177/// The given string must be a valid absolute IRI.
178#[inline]
179#[must_use]
180pub(crate) fn extract_authority_absolute(i: &str) -> Option<&str> {
181    let (i, _scheme) = scheme_colon(i);
182    slash_slash_authority_opt(i).1
183}
184
185/// Extracts `authority` part from a relative IRI.
186///
187/// # Precondition
188///
189/// The given string must be a valid relative IRI.
190#[inline]
191#[must_use]
192pub(crate) fn extract_authority_relative(i: &str) -> Option<&str> {
193    slash_slash_authority_opt(i).1
194}
195
/// Returns the `authority` of an IRI reference along with the byte offset
/// at which it starts, or `None` if there is no authority.
///
/// # Precondition
///
/// The given string must be a valid IRI reference.
#[cfg(feature = "alloc")]
#[must_use]
pub(crate) fn extract_authority_and_offset(i: &str) -> Option<(&str, usize)> {
    let (after_scheme, scheme) = scheme_colon_opt(i);
    let (_rest, authority) = slash_slash_authority_opt(after_scheme);
    let authority = authority?;
    let offset = match scheme {
        // The scheme is followed by `"://"` (3 bytes) before the authority.
        Some(scheme) => scheme.len() + 3,
        // Without a scheme only `"//"` (2 bytes) precedes the authority.
        None => 2,
    };
    Some((authority, offset))
}
211
212/// Extracts `path` part from an IRI reference.
213///
214/// # Precondition
215///
216/// The given string must be a valid IRI reference.
217#[inline]
218#[must_use]
219pub(crate) fn extract_path(i: &str) -> &str {
220    let (i, _scheme) = scheme_colon_opt(i);
221    let (i, _authority) = slash_slash_authority_opt(i);
222    until_query(i).1
223}
224
225/// Extracts `path` part from an absolute IRI.
226///
227/// # Precondition
228///
229/// The given string must be a valid absolute IRI.
230#[inline]
231#[must_use]
232pub(crate) fn extract_path_absolute(i: &str) -> &str {
233    let (i, _scheme) = scheme_colon(i);
234    let (i, _authority) = slash_slash_authority_opt(i);
235    until_query(i).1
236}
237
238/// Extracts `path` part from a relative IRI.
239///
240/// # Precondition
241///
242/// The given string must be a valid relative IRI.
243#[inline]
244#[must_use]
245pub(crate) fn extract_path_relative(i: &str) -> &str {
246    let (i, _authority) = slash_slash_authority_opt(i);
247    until_query(i).1
248}
249
250/// Extracts `query` part from an IRI reference.
251///
252/// # Precondition
253///
254/// The given string must be a valid IRI reference.
255#[inline]
256#[must_use]
257pub(crate) fn extract_query(i: &str) -> Option<&str> {
258    let (i, _before_query) = until_query(i);
259    decompose_query_and_fragment(i).0
260}
261
262/// Extracts `query` part from an `absolute-IRI` string.
263///
264/// # Precondition
265///
266/// The given string must be a valid `absolute-IRI` string.
267#[must_use]
268pub(crate) fn extract_query_absolute_iri(i: &str) -> Option<&str> {
269    let (i, _before_query) = until_query(i);
270    if i.is_empty() {
271        None
272    } else {
273        debug_assert_eq!(
274            i.as_bytes().first(),
275            Some(&b'?'),
276            "`absolute-IRI` string must not have `fragment part"
277        );
278        Some(&i[1..])
279    }
280}
281
282/// Splits an IRI string into the prefix and the fragment part.
283///
284/// A leading `#` character is truncated if the fragment part exists.
285///
286/// # Precondition
287///
288/// The given string must be a valid IRI reference.
289#[inline]
290#[must_use]
291pub(crate) fn split_fragment(iri: &str) -> (&str, Option<&str>) {
292    // It is completely OK to find the first `#` character from valid IRI to get fragment part,
293    // because the spec says that there are no `#` characters before the fragment part.
294    //
295    // > ```
296    // > scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
297    // > ```
298    // >
299    // > --- [RFC 3986, section 3.1. Scheme](https://www.rfc-editor.org/rfc/rfc3986.html#section-3.1)
300    //
301    // > The authority component is preceded by a double slash ("//") and is terminated by the
302    // > next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end
303    // > of the URI.
304    // >
305    // > --- [RFC 3986, section 3.2. Authority](https://www.rfc-editor.org/rfc/rfc3986.html#section-3.2)
306    //
307    // > The path is terminated by the first question mark ("?") or number sign ("#")
308    // > character, or by the end of the URI.
309    // >
310    // > --- [RFC 3986, section 3.3. Path](https://www.rfc-editor.org/rfc/rfc3986.html#section-3.3)
311    //
312    // > The query component is indicated by the first question mark ("?") character and
313    // > terminated by a number sign ("#") character or by the end of the URI.
314    // >
315    // > --- [RFC 3986, section 3.4. Query](https://www.rfc-editor.org/rfc/rfc3986.html#section-3.4)
316    match find_split_hole(iri, b'#') {
317        Some((prefix, fragment)) => (prefix, Some(fragment)),
318        None => (iri, None),
319    }
320}
321
322/// Returns the fragment part of the given IRI.
323///
324/// A leading `#` character of the fragment is truncated.
325#[inline]
326#[must_use]
327pub(crate) fn extract_fragment(iri: &str) -> Option<&str> {
328    split_fragment(iri).1
329}
330
331/// Returns `Ok(_)` if the string is normalized.
332///
333/// If this function returns `true`, normalization input and output will be identical.
334///
335/// In this function, "normalized" means that any of the normalization below
336/// won't change the input on normalization:
337///
338/// * syntax-based normalization,
339/// * case normalization,
340/// * percent-encoding normalization, and
341/// * path segment normalizaiton.
342///
343/// Note that scheme-based normalization is not considered.
344#[must_use]
345pub(crate) fn is_normalized<S: Spec>(i: &str, mode: NormalizednessCheckMode) -> bool {
346    let (i, scheme) = scheme_colon(i);
347    let (after_authority, authority) = slash_slash_authority_opt(i);
348    let (_after_path, path) = until_query(after_authority);
349
350    // Syntax-based normalization: uppercase chars in `scheme` should be
351    // converted to lowercase.
352    if scheme.bytes().any(|b| b.is_ascii_uppercase()) {
353        return false;
354    }
355
356    // Case normalization: ASCII alphabets in US-ASCII only `host` should be
357    // normalized to lowercase.
358    // Case normalization: ASCII alphabets in percent-encoding triplet should be
359    // normalized to uppercase.
360    // Percent-encoding normalization: unresreved characters should be decoded
361    // in `userinfo`, `host`, `path`, `query`, and `fragments`.
362    // Path segment normalization: the path should not have dot segments (`.`
363    // and/or `..`).
364    //
365    // Note that `authority` can have percent-encoded `userinfo`.
366    if let Some(authority) = authority {
367        let authority_components = authority::decompose_authority(authority);
368
369        // Check `host`.
370        let host = authority_components.host();
371        let host_is_normalized = if is_ascii_only_host(host) {
372            eq_str_display(host, &NormalizedAsciiOnlyHost::new(host))
373        } else {
374            // If the host is not ASCII-only, conversion to lowercase is not performed.
375            is_pct_case_normalized::<S>(host)
376        };
377        if !host_is_normalized {
378            return false;
379        }
380
381        // Check pencent encodings in `userinfo`.
382        if let Some(userinfo) = authority_components.userinfo() {
383            if !is_pct_case_normalized::<S>(userinfo) {
384                return false;
385            }
386        }
387    }
388
389    // Check `path`.
390    //
391    // Syntax-based normalization: Dot segments might be removed.
392    // Note that we don't have to care `%2e` and `%2E` since `.` is unreserved
393    // and they will be decoded if not normalized.
394    // Also note that WHATWG serialization will use `/.//` as a path prefix if
395    // the path is absolute and won't modify the path if the path is relative.
396    //
397    // Percent-encoding normalization: unresreved characters should be decoded
398    // in `path`, `query`, and `fragments`.
399    let path_span_no_dot_segments = if authority.is_some() {
400        Some(path)
401    } else {
402        match mode {
403            NormalizednessCheckMode::Default => Some(path.strip_prefix("/.//").unwrap_or(path)),
404            NormalizednessCheckMode::Rfc3986 => Some(path),
405            NormalizednessCheckMode::PreserveAuthoritylessRelativePath => {
406                if path.starts_with('/') {
407                    // Absolute.
408                    Some(path.strip_prefix("/.//").unwrap_or(path))
409                } else {
410                    // Relative. Treat the path as "opaque". No span to check.
411                    None
412                }
413            }
414        }
415    };
416    if let Some(path_span_no_dot_segments) = path_span_no_dot_segments {
417        if path_span_no_dot_segments
418            .split('/')
419            .any(|segment| matches!(segment, "." | ".."))
420        {
421            return false;
422        }
423    }
424    is_pct_case_normalized::<S>(after_authority)
425}
426
/// Combines two ASCII hexadecimal digits into the byte they denote.
///
/// `upper` supplies the high nibble and `lower` the low nibble.
///
/// # Preconditions
///
/// Both `upper` and `lower` should be ASCII hexadecimal digits.
#[must_use]
pub(crate) fn hexdigits_to_byte([upper, lower]: [u8; 2]) -> u8 {
    // Digits `'0'..='9'` (0x30..=0x39) already carry their value in the low nibble.
    // Letters `'A'..='F'` (0x41..=0x46) and `'a'..='f'` (0x61..=0x66) carry
    // `value - 9` there, so 9 has to be added back for them.
    let high_adjust: u8 = if upper < 0x40 { 0 } else { 9 << 4 };
    let low_adjust: u8 = if lower < 0x40 { 0 } else { 9 };

    // Shifting left discards the high nibble of the ASCII code.
    let high = (upper << 4) + high_adjust;
    let low = (lower & 0x0f) + low_adjust;
    high + low
}
440
441/// Converts the first two hexdigit bytes in the buffer into a byte.
442///
443/// # Panics
444///
445/// Panics if the string does not start with two hexdigits.
446#[must_use]
447pub(crate) fn take_xdigits2<T>(s: &T) -> (u8, &T)
448where
449    T: ?Sized + AsRef<[u8]> + ops::Index<RangeFrom<usize>, Output = T>,
450{
451    let (upper_xdigit, lower_xdigit) = match s.as_ref() {
452        [upper, lower, ..] => (*upper, *lower),
453        _ => panic!("at least two bytes should follow the `%` in a valid IRI reference"),
454    };
455    let v = hexdigits_to_byte([upper_xdigit, lower_xdigit]);
456    (v, &s[2..])
457}
458
459/// Returns true if the given `host`/`ihost` string consists of only US-ASCII characters.
460///
461/// # Precondition
462///
463/// The given string should be valid `host` or `host ":" port` string.
464#[must_use]
465pub(crate) fn is_ascii_only_host(mut host: &str) -> bool {
466    while let Some(pos) = host.find(|c: char| !c.is_ascii() || c == '%') {
467        if host.as_bytes()[pos] != b'%' {
468            // Non-ASCII character found.
469            return false;
470        }
471        let after_pct = &host[(pos + 1)..];
472        let (byte, rest) = take_xdigits2(after_pct);
473        if !byte.is_ascii() {
474            return false;
475        }
476        host = rest;
477    }
478
479    // Neither non-ASCII characters nor percent-encoded characters found.
480    true
481}