anyxml_uri/
uri.rs

1use std::{
2    borrow::{Borrow, Cow},
3    ops::Deref,
4    path::Path,
5    rc::Rc,
6    str::{from_utf8, from_utf8_unchecked},
7    sync::Arc,
8};
9
10use crate::ParseRIError;
11
/// Borrowed, unsized view of an escaped URI reference.
///
/// This is a `#[repr(transparent)]` wrapper around `str`; the owned counterpart is
/// [`URIString`], which dereferences to this type.
#[derive(Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIStr {
    /// The escaped URI text.
    uri: str,
}
17
18impl URIStr {
19    fn new(s: &str) -> &Self {
20        unsafe {
21            // # Safety
22            // Since `URIStr` is a transparent newtype of `str`,
23            // the bit patterns are exactly the same and have the same features.
24            &*(s as *const str as *const Self)
25        }
26    }
27
28    /// Resolve the relative reference `reference` using `self` as the base URI.
29    ///
30    /// `self` must be convertible to an [absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
31    /// through [fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5) removal
32    /// and normalization.
33    ///
34    /// # Reference
35    /// - [5.1.  Establishing a Base URI](https://datatracker.ietf.org/doc/html/rfc3986#section-5.1)
36    /// - [5.2.  Relative Resolution](https://datatracker.ietf.org/doc/html/rfc3986#section-5.2)
37    pub fn resolve(&self, reference: &Self) -> URIString {
38        use Component::*;
39
40        let base = if self.is_absolute() {
41            Cow::Borrowed(self)
42        } else {
43            let mut base = self.to_owned();
44            base.normalize();
45            if let Some(frag) = base.uri.bytes().position(|b| b == b'#') {
46                base.uri.truncate(frag);
47            }
48            assert!(
49                base.is_absolute(),
50                "'{}' is not absolute",
51                base.as_escaped_str()
52            );
53            Cow::Owned(base)
54        };
55
56        let mut ref_components = reference.components().peekable();
57        if ref_components
58            .next_if(|comp| matches!(comp, Scheme(_)))
59            .is_some()
60        {
61            let mut ret = reference.to_owned();
62            ret.normalize();
63            return ret;
64        }
65
66        if ref_components
67            .next_if(|comp| matches!(comp, Authority { .. }))
68            .is_some()
69        {
70            // has authority
71            let mut ret = URIString {
72                uri: [base.scheme().unwrap(), ":", &reference.uri].concat(),
73            };
74            ret.normalize();
75            return ret;
76        }
77
78        let mut components = base.components().peekable();
79        let mut uri = String::new();
80        if let Some(Scheme(scheme)) = components.next_if(|comp| matches!(comp, Scheme(_))) {
81            uri.push_str(scheme);
82            uri.push(':');
83        }
84        if let Some(Authority {
85            userinfo,
86            host,
87            port,
88        }) = components.next_if(|comp| matches!(comp, Authority { .. }))
89        {
90            uri.push_str("//");
91            if let Some(userinfo) = userinfo {
92                uri.push_str(userinfo);
93                uri.push(':');
94            }
95            uri.push_str(host);
96            if let Some(port) = port {
97                uri.push(':');
98                uri.push_str(port);
99            }
100        }
101
102        if ref_components
103            .next_if(|comp| matches!(comp, RootSegment))
104            .is_some()
105        {
106            uri.push_str(&reference.uri);
107            let mut ret = URIString { uri };
108            ret.normalize();
109            return ret;
110        }
111
112        let mut segments = vec![];
113        let has_root = components
114            .next_if(|comp| matches!(comp, RootSegment))
115            .is_some();
116        let mut has_dot_segment = false;
117        while let Some(Segment(segment)) = components.next_if(|comp| matches!(comp, Segment(_))) {
118            segments.push(segment);
119            has_dot_segment |= segment == "." || segment == "..";
120        }
121        if has_dot_segment {
122            segments = normalize_path_segments(segments.into_iter(), has_root);
123        }
124
125        let mut has_path = false;
126        if let Some(Segment(segment)) = ref_components.next_if(|comp| matches!(comp, Segment(_))) {
127            let mut buf = vec![segment];
128            while let Some(Segment(segment)) =
129                ref_components.next_if(|comp| matches!(comp, Segment(_)))
130            {
131                buf.push(segment);
132            }
133            if buf.len() > 1 || !buf[0].is_empty() {
134                segments.pop();
135                segments.extend(buf);
136                has_path = true;
137            }
138        }
139        build_normalized_path(segments.into_iter(), has_root, &mut uri);
140
141        if let Some(Query(query)) = ref_components.next_if(|comp| matches!(comp, Query(_))) {
142            uri.push('?');
143            uri.push_str(query);
144        } else if !has_path
145            && let Some(Query(query)) = components.next_if(|comp| matches!(comp, Query(_)))
146        {
147            uri.push('?');
148            uri.push_str(query);
149        }
150
151        if let Some(Fragment(fragment)) = ref_components.next() {
152            uri.push('#');
153            uri.push_str(fragment);
154        }
155
156        URIString { uri }
157    }
158
    /// Return the escaped URI string.
    ///
    /// This is the stored text as-is; no unescaping or allocation takes place.
    pub fn as_escaped_str(&self) -> &str {
        &self.uri
    }
163
164    /// Return the unescaped URI string.  \
165    /// If unescaping fails, return `None`.
166    pub fn as_unescaped_str(&self) -> Option<Cow<'_, str>> {
167        unescape(&self.uri).ok()
168    }
169
170    /// # Reference
171    /// [4.3.  Absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
172    pub fn is_absolute(&self) -> bool {
173        self.scheme().is_some() && self.fragment().is_none()
174    }
175
176    /// # Reference
177    /// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
178    pub fn is_relative(&self) -> bool {
179        self.scheme().is_none()
180    }
181
182    /// # Reference
183    /// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
184    pub fn scheme(&self) -> Option<&str> {
185        let pos = self.uri.bytes().position(is_reserved)?;
186        (self.uri.as_bytes()[pos] == b':').then_some(&self.uri[..pos])
187    }
188
189    /// # Reference
190    /// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
191    pub fn authority(&self) -> Option<&str> {
192        let rem = self
193            .uri
194            .strip_prefix("//")
195            .or_else(|| self.uri.split_once("://").map(|p| p.1))?;
196        Some(rem.split_once('/').map(|p| p.0).unwrap_or(rem))
197    }
198
199    /// # Reference
200    /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
201    pub fn userinfo(&self) -> Option<&str> {
202        Some(self.authority()?.split_once('@')?.0)
203    }
204
205    /// # Reference
206    /// [3.2.2.  Host](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2)
207    pub fn host(&self) -> Option<&str> {
208        let mut auth = self.authority()?;
209        if let Some((_userinfo, rem)) = auth.split_once('@') {
210            auth = rem;
211        }
212        if let Some((host, port)) = auth.rsplit_once(':')
213            && port.bytes().all(|b| b.is_ascii_digit())
214        {
215            auth = host;
216        }
217        Some(auth)
218    }
219
220    /// # Reference
221    /// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
222    pub fn port(&self) -> Option<&str> {
223        let (_, port) = self.authority()?.rsplit_once(':')?;
224        port.bytes().all(|b| b.is_ascii_digit()).then_some(port)
225    }
226
227    /// # Reference
228    /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
229    pub fn path(&self) -> &str {
230        let mut path = &self.uri;
231        if let Some(scheme) = self.scheme() {
232            // has scheme
233            path = &path[scheme.len() + 1..];
234        }
235        if let Some(rem) = path.strip_prefix("//") {
236            // has authority
237            let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
238            path = &rem[pos..]
239        }
240
241        path.split_once(['?', '#']).map(|p| p.0).unwrap_or(path)
242    }
243
244    /// # Reference
245    /// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
246    pub fn query(&self) -> Option<&str> {
247        let pos = self.uri.bytes().position(|b| b == b'?' || b == b'#')?;
248        if self.uri.as_bytes()[pos] == b'#' {
249            return None;
250        }
251        let query = &self.uri[pos + 1..];
252        let pos = query.bytes().position(|b| b == b'#').unwrap_or(query.len());
253        Some(&query[..pos])
254    }
255
256    /// # Reference
257    /// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
258    pub fn fragment(&self) -> Option<&str> {
259        let pos = self.uri.bytes().position(|b| b == b'#')?;
260        Some(&self.uri[pos + 1..])
261    }
262
    /// Return an iterator that scans the URI components.
    ///
    /// The iterator borrows the escaped URI text of `self`.
    pub fn components(&self) -> Components<'_> {
        Components::new(&self.uri)
    }
267}
268
269impl ToOwned for URIStr {
270    type Owned = URIString;
271
272    fn to_owned(&self) -> Self::Owned {
273        URIString {
274            uri: self.uri.to_owned(),
275        }
276    }
277}
278
279impl From<&URIStr> for URIString {
280    fn from(value: &URIStr) -> Self {
281        value.to_owned()
282    }
283}
284
/// Identity conversion, so that APIs taking `impl AsRef<URIStr>` accept `&URIStr`.
impl AsRef<URIStr> for URIStr {
    fn as_ref(&self) -> &URIStr {
        self
    }
}
290
291impl Clone for Box<URIStr> {
292    fn clone(&self) -> Self {
293        self.as_ref().into()
294    }
295}
296
297impl std::fmt::Display for URIStr {
298    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
299        write!(
300            f,
301            "{}",
302            self.as_unescaped_str()
303                .as_deref()
304                .unwrap_or(self.as_escaped_str())
305        )
306    }
307}
308
/// Implement `From<&URIStr>` for each listed pointer type constructor (`Box`, `Rc`, ...).
///
/// Each generated impl first builds the pointer over the inner `str`
/// (`$t<str>`) and then reinterprets it as `$t<URIStr>`.
macro_rules! impl_boxed_convertion_uri_str {
    ($( $t:ident ),*) => {
        $(
            impl From<&URIStr> for $t<URIStr> {
                fn from(value: &URIStr) -> Self {
                    // Copy the text into a new `$t<str>` allocation.
                    let boxed: $t<str> = value.uri.into();
                    unsafe {
                        // # Safety
                        // `URIStr` is a `#[repr(transparent)]` newtype of `str`, so the
                        // pointee types have exactly the same layout and `$t<str>` can
                        // be reinterpreted as `$t<URIStr>`.
                        // NOTE(review): this also relies on `$t<str>` and `$t<URIStr>`
                        // themselves having the same representation - confirm for any
                        // pointer type added to the invocation below.
                        std::mem::transmute(boxed)
                    }
                }
            }
        )*
    };
}
impl_boxed_convertion_uri_str!(Box, Rc, Arc);
327
/// Owned, growable URI reference; the owned counterpart of [`URIStr`].
///
/// It dereferences to [`URIStr`], so all read-only accessors are available on it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIString {
    /// Escaped URI string.
    ///
    /// Parts generated from UTF-8 strings can always be converted back
    /// to the original UTF-8 byte sequence.
    /// Similarly, the parts generated from Path can probably be converted back
    /// to the original Path byte sequence.
    ///
    /// As a result of resolving URI references, there may be a mixture of parts generated
    /// from UTF-8 strings and parts generated from Paths, so the whole may not always revert
    /// to a UTF-8 string or Path byte sequence.
    uri: String,
}
343
344impl URIString {
345    /// Parse the string as a URI by escaping all characters not specified as
346    /// [`reserved`](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
347    /// or [`unreserved`](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
348    /// in [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986).
349    ///
350    /// Because certain characters containing `%` are escaped, the result of
351    /// [`URIStr::as_unescaped_str`] is equal to `uri`, but the result of
352    /// [`URIStr::as_escaped_str`] may differ from `uri`.
353    ///
354    /// Since it escapes nearly all characters—including control characters, `%`,
355    /// and non-ASCII characters—it will successfully parse any string that roughly
356    /// follows URI notation.
357    pub fn parse(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
358        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
359            let uri = escape_except(uri, |b| {
360                b.is_ascii() && (is_reserved(b as u8) || is_unreserved(b as u8))
361            });
362            URIString::parse_escaped(&uri)
363        }
364        _parse(uri.as_ref())
365    }
366
367    /// Parse the string as a URI after applying escaping according
368    /// to [XML 1.0 "4.2.2 External Entities"](https://www.w3.org/TR/xml/#sec-external-ent).
369    ///
370    /// Some characters are escaped without escaping the `%` character, so both the result
371    /// of [`URIStr::as_unescaped_str`] and the result of [`URIStr::as_escaped_str`] may
372    /// differ from `uri`.
373    ///
374    /// > System identifiers (and other XML strings meant to be used as URI references) may
375    /// > contain characters that, according to [IETF RFC 3986], must be escaped before a
376    /// > URI can be used to retrieve the referenced resource. The characters to be escaped
377    /// > are the control characters #x0 to #x1F and #x7F (most of which cannot appear in
378    /// > XML), space #x20, the delimiters '<' #x3C, '>' #x3E and '"' #x22, the unwise
379    /// > characters '{' #x7B, '}' #x7D, '|' #x7C, '\' #x5C, '^' #x5E and '`' #x60, as well
380    /// > as all characters above #x7F. Since escaping is not always a fully reversible
381    /// > process, it MUST be performed only when absolutely necessary and as late as
382    /// > possible in a processing chain. In particular, neither the process of converting
383    /// > a relative URI to an absolute one nor the process of passing a URI reference to a
384    /// > process or software component responsible for dereferencing it SHOULD trigger
385    /// > escaping. When escaping does occur, it MUST be performed as follows:
386    /// >
387    /// > 1. Each character to be escaped is represented in UTF-8 [Unicode] as one or more
388    /// >    bytes.
389    /// > 2. The resulting bytes are escaped with the URI escaping mechanism (that is,
390    /// >    converted to % HH, where HH is the hexadecimal notation of the byte value).
391    /// > 3. The original character is replaced by the resulting character sequence.
392    pub fn parse_system_id(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
393        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
394            let uri = escape_except(uri, |b| {
395                // XML 1.0 "4.2.2 External Entities"
396                b.is_ascii()
397                    && !matches!(
398                        b as u8,
399                        0..=0x1F
400                            | 0x20
401                            | 0x22
402                            | 0x3C
403                            | 0x3E
404                            | 0x5C
405                            | 0x5E
406                            | 0x60
407                            | 0x7B..=0x7D
408                            | 0x7F..
409                    )
410            });
411            URIString::parse_escaped(&uri)
412        }
413        _parse(uri.as_ref())
414    }
415
416    /// Parse the string as a URI without performing any escape processing whatsoever.  \
417    /// In other words, `uri` is treated as an escaped string.
418    ///
419    /// Since percent-encoded characters are treated as percent-encoded, the result of
420    /// [`URIStr::as_unescaped_str`] may differ from `uri`. On the other hand, since no
421    /// escaping is performed at all, the result of [`URIStr::as_escaped_str`] is always
422    /// equal to `uri`.
423    fn parse_escaped(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
424        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
425            let mut bytes = uri.as_bytes();
426            parse_uri_reference(&mut bytes)?;
427            if !bytes.is_empty() {
428                Err(ParseRIError::NotTermination)
429            } else {
430                Ok(URIString {
431                    uri: uri.to_owned(),
432                })
433            }
434        }
435        _parse(uri.as_ref())
436    }
437
    /// Build a URI from a file system path.
    ///
    /// The path is first turned into URI-like text by a platform-specific helper
    /// and then handed to [`URIString::parse`], which escapes it. A trailing `/`
    /// is appended when the path names an existing directory (`Path::is_dir`) or
    /// ends with a separator character.
    ///
    /// # Errors
    /// Returns [`ParseRIError::Unsupported`] when the path (or one of its
    /// components) is not valid UTF-8, and any error reported by
    /// [`URIString::parse`].
    ///
    /// # Panics
    /// On targets that are neither `unix` nor `windows` the helper is `todo!()`
    /// and therefore panics.
    ///
    /// # Note
    /// In the current implementation, paths that cannot be converted to UTF-8 strings
    /// cannot be handled.  \
    /// I don't think there will be any problems in most environments, but there may be
    /// some paths that cannot be handled.
    pub fn parse_file_path(path: impl AsRef<Path>) -> Result<Self, ParseRIError> {
        #[cfg(target_family = "unix")]
        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
            let mut path_str = path.to_str().ok_or(ParseRIError::Unsupported)?.to_owned();
            // NOTE(review): a trailing backslash is also treated as a directory
            // marker here, although `\` is an ordinary file name character on
            // Unix - presumably to tolerate Windows-style input; confirm.
            if (path.is_dir() || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")))
                && !path_str.ends_with('/')
            {
                path_str.push('/');
            }
            // Absolute paths become `file://` + `/abs/path`; relative paths stay
            // relative references.
            if path.is_absolute() {
                path_str.insert_str(0, "file://");
            }
            URIString::parse(path_str)
        }
        #[cfg(target_family = "windows")]
        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
            use std::path::{Component::*, Prefix::*};

            let mut path_str = String::new();
            // Set for `\\?\`-style prefixes; under them `/` inside a component is
            // written as `%2F` rather than being copied through.
            let mut verbatim = false;
            for comp in path.components() {
                match comp {
                    Prefix(prefix) => match prefix.kind() {
                        Verbatim(root) => {
                            path_str.push_str("file:///");
                            path_str.push_str(
                                &root
                                    .to_str()
                                    .ok_or(ParseRIError::Unsupported)?
                                    .replace('/', "%2F"),
                            );
                            verbatim = true;
                        }
                        VerbatimUNC(server, root) => {
                            // The server name takes the authority position.
                            path_str.push_str("file://");
                            path_str.push_str(
                                &server
                                    .to_str()
                                    .ok_or(ParseRIError::Unsupported)?
                                    .replace('/', "%2F"),
                            );
                            path_str.push('/');
                            path_str.push_str(
                                &root
                                    .to_str()
                                    .ok_or(ParseRIError::Unsupported)?
                                    .replace('/', "%2F"),
                            );
                            verbatim = true;
                        }
                        VerbatimDisk(letter) => {
                            // Produces `file:<letter>:`; following components are
                            // appended with `/` separators.
                            path_str.push_str("file:");
                            path_str.push(letter as char);
                            path_str.push(':');
                            verbatim = true;
                        }
                        DeviceNS(device) => {
                            path_str.push_str("file:///");
                            path_str.push_str(device.to_str().ok_or(ParseRIError::Unsupported)?);
                        }
                        UNC(server, root) => {
                            // The server name takes the authority position.
                            path_str.push_str("file://");
                            path_str.push_str(server.to_str().ok_or(ParseRIError::Unsupported)?);
                            path_str.push('/');
                            path_str.push_str(root.to_str().ok_or(ParseRIError::Unsupported)?);
                        }
                        Disk(letter) => {
                            path_str.push_str("file:");
                            path_str.push(letter as char);
                            path_str.push(':');
                        }
                    },
                    // The separator is emitted together with the next component.
                    RootDir => {}
                    CurDir => {
                        if !path_str.is_empty() {
                            path_str.push_str("/.");
                        } else {
                            path_str.push_str(".");
                        }
                    }
                    ParentDir => {
                        if !path_str.is_empty() {
                            path_str.push_str("/..");
                        } else {
                            path_str.push_str("..")
                        }
                    }
                    Normal(segment) => {
                        if !path_str.is_empty() {
                            path_str.push('/');
                        }
                        let segment = segment.to_str().ok_or(ParseRIError::Unsupported)?;
                        if verbatim {
                            path_str.push_str(&segment.replace('/', "%2F"));
                        } else {
                            path_str.push_str(segment);
                        }
                    }
                }
            }
            // Keep a trailing separator for directories and for input that ended
            // with one; a trailing `/` only counts outside verbatim paths.
            if (path.is_dir()
                || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")
                    || (!verbatim && path.as_os_str().as_encoded_bytes().ends_with(b"/"))))
                && !path_str.ends_with('/')
            {
                path_str.push('/');
            }
            URIString::parse(path_str)
        }
        #[cfg(all(not(target_family = "unix"), not(target_family = "windows")))]
        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
            // Not implemented for other target families yet.
            todo!()
        }
        _parse_file_path(path.as_ref())
    }
558
559    pub fn into_boxed_uri_str(self) -> Box<URIStr> {
560        Box::from(self.as_ref())
561    }
562
563    /// # Reference
564    /// [6.2.2.  Syntax-Based Normalization](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2).
565    pub fn normalize(&mut self) {
566        use Component::*;
567
568        let mut uri = String::with_capacity(self.uri.len());
569        let mut paths = vec![];
570        let mut query = None;
571        let mut fragment = None;
572        let mut has_root = false;
573        for comp in self.components() {
574            match comp {
575                Scheme(scheme) => {
576                    uri.push_str(&scheme.to_ascii_lowercase());
577                    uri.push(':');
578                }
579                Authority {
580                    userinfo,
581                    host,
582                    port,
583                } => {
584                    uri.push_str("//");
585                    if let Some(userinfo) = userinfo {
586                        uri.push_str(userinfo);
587                        uri.push('@');
588                    }
589                    uri.push_str(host);
590                    if let Some(port) = port {
591                        uri.push(':');
592                        uri.push_str(port);
593                    }
594                }
595                RootSegment => has_root = true,
596                Segment(segment) => paths.push(segment),
597                Query(q) => query = Some(q),
598                Fragment(f) => fragment = Some(f),
599            }
600        }
601        build_normalized_path(paths.into_iter(), has_root, &mut uri);
602        if let Some(query) = query {
603            uri.push('?');
604            uri.push_str(query);
605        }
606        if let Some(fragment) = fragment {
607            uri.push('#');
608            uri.push_str(fragment);
609        }
610        self.uri = uri;
611    }
612}
613
614impl AsRef<URIStr> for URIString {
615    fn as_ref(&self) -> &URIStr {
616        URIStr::new(&self.uri)
617    }
618}
619
620impl Borrow<URIStr> for URIString {
621    fn borrow(&self) -> &URIStr {
622        self.as_ref()
623    }
624}
625
626impl Deref for URIString {
627    type Target = URIStr;
628
629    fn deref(&self) -> &Self::Target {
630        self.as_ref()
631    }
632}
633
634impl std::fmt::Display for URIString {
635    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
636        write!(f, "{}", self.as_ref())
637    }
638}
639
/// Implement `From<URIString>` for each listed target type by delegating to the
/// target's conversion from the borrowed form (`value.as_ref()`).
macro_rules! impl_convertion_uri_string {
    ($( $t:ty ),*) => {
        $(
            impl From<URIString> for $t {
                fn from(value: URIString) -> $t {
                    // Goes through the borrowed view, so `value` itself is dropped
                    // at the end of this function.
                    From::from(value.as_ref())
                }
            }
        )*
    };
}
impl_convertion_uri_string!(Box<URIStr>, Rc<URIStr>, Arc<URIStr>);
652
653fn build_normalized_path<'a>(
654    segments: impl Iterator<Item = &'a str>,
655    has_root: bool,
656    buffer: &mut String,
657) {
658    let segments = normalize_path_segments(segments, has_root);
659    if has_root {
660        buffer.push('/');
661    }
662    for (i, seg) in segments.into_iter().enumerate() {
663        if i > 0 {
664            buffer.push('/');
665        }
666        buffer.push_str(seg);
667    }
668}
669
/// Remove dot segments from a sequence of path segments.
///
/// - `.` is dropped.
/// - `..` removes the previous segment when there is one that is not itself `..`;
///   otherwise it is kept for a path without a root and discarded for a rooted one.
/// - When the input ends with `.` or `..`, an empty segment is appended so that
///   the joined path ends with `/`.
fn normalize_path_segments<'a>(
    segments: impl Iterator<Item = &'a str>,
    has_root: bool,
) -> Vec<&'a str> {
    let mut resolved: Vec<&'a str> = Vec::new();
    let mut ends_with_dot_segment = false;

    for segment in segments {
        ends_with_dot_segment = matches!(segment, "." | "..");
        match segment {
            "." => {}
            ".." => match resolved.last() {
                Some(&previous) if previous != ".." => {
                    resolved.pop();
                }
                // Nothing left to remove: a rooted path cannot climb above its root.
                _ if has_root => {}
                _ => resolved.push(segment),
            },
            _ => resolved.push(segment),
        }
    }

    if ends_with_dot_segment {
        resolved.push("");
    }
    resolved
}
699
700/// # Reference
701/// [4.1.  URI Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.1)
702///
703/// ```text
704/// URI-reference = URI / relative-ref
705/// ```
706fn parse_uri_reference(b: &mut &[u8]) -> Result<(), ParseRIError> {
707    if b.is_empty() || matches!(b[0], b'/' | b'?' | b'#') {
708        // If `b` is an empty string or starts with either '/', '?' or '#',
709        // it is definitely 'relative-ref'.
710        parse_relative_ref(b)
711    } else {
712        // Otherwise, it is necessary to distinguish between `URI` and `relative-ref`
713        // starting with `relative-part` that matches `path-noscheme`.
714
715        if !b[0].is_ascii_alphabetic() {
716            // Since `scheme` begins with at least one `ALPHA`,
717            // if it does not, it is definitely `irelative-ref`.
718            parse_relative_ref(b)
719        } else {
720            // The characters that can be used in `scheme` are very limited,
721            // so it might be quicker to try parsing `scheme` to distinguish between them?
722            // [25] scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
723            if let Some(&c) = b
724                .iter()
725                .find(|&&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
726                && c == b':'
727            {
728                parse_uri(b)
729            } else {
730                parse_relative_ref(b)
731            }
732        }
733    }
734}
735
736/// # Reference
737/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
738///
739/// ```text
740/// URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
741/// ```
742fn parse_uri(b: &mut &[u8]) -> Result<(), ParseRIError> {
743    parse_scheme(b)?;
744    *b = b
745        .strip_prefix(b":")
746        .ok_or(ParseRIError::InvalidSchemeSeparator)?;
747    parse_hier_part(b)?;
748    if let Some(query) = b.strip_prefix(b"?") {
749        *b = query;
750        parse_query(b)?;
751    }
752    if let Some(fragment) = b.strip_prefix(b"#") {
753        *b = fragment;
754        parse_fragment(b)?;
755    }
756    Ok(())
757}
758
759/// # Reference
760/// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
761///
762/// ```text
763/// scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
764/// ```
765fn parse_scheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
766    if b.is_empty() || !b[0].is_ascii_alphabetic() {
767        return Err(ParseRIError::InvalidScheme);
768    }
769    let pos = b
770        .iter()
771        .position(|&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
772        .unwrap_or(b.len());
773    *b = &b[pos..];
774    Ok(())
775}
776
777/// # Reference
778/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
779///
780/// ```text
781/// hier-part   = "//" authority path-abempty
782///             / path-absolute
783///             / path-rootless
784///             / path-empty
785/// ```
786fn parse_hier_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
787    if let Some(rem) = b.strip_prefix(b"/") {
788        // If `b` starts with '/', `b` starts with 'authority' or `path-absolute`,
789
790        if let Some(rem) = rem.strip_prefix(b"/") {
791            // If `b` starts with '//', it should be followed by 'authority'.
792            // This is because 'path-absolute' is followed by exactly one '/' at the beginning
793            // and optionally 'segment-nz', so there cannot be two consecutive '/' characters.
794            *b = rem;
795            parse_authority(b)?;
796            parse_path_abempty(b)
797        } else {
798            // path-absolute = "/" [ segment-nz *( "/" segment ) ]
799            // segment-nz    = 1*pchar
800            parse_path_absolute(b)
801        }
802    } else {
803        // otherwise, `b` starts with 'path-rootless' or 'path-empty'
804        let mut dum = *b;
805        if parse_pchar(&mut dum).is_ok() {
806            // If 'path-rootless' follows, one or more 'pchar' should follow.
807            parse_path_rootless(b)
808        } else {
809            // If not, it is 'path-empty'.
810            // Since 'path-empty' is an empty string,
811            // we can simply return `Ok` without doing anything.
812            Ok(())
813        }
814    }
815}
816
817/// # Reference
818/// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
819///
820/// ```text
821/// authority   = [ userinfo "@" ] host [ ":" port ]
822/// ```
823fn parse_authority(b: &mut &[u8]) -> Result<(), ParseRIError> {
824    if b.starts_with(b"[") {
825        // If `b` starts with '[', it is definitely an `host` that matches `IP-literal`.
826        parse_ip_literal(b)?;
827        if let Some(rem) = b.strip_prefix(b":") {
828            *b = rem;
829            parse_port(b)?;
830        }
831        return Ok(());
832    }
833
834    // If not, it may start with `userinfo`, or it may start with `host`
835    // that matches `IPv4address` or `reg-name`.
836    //
837    // If it is either `IPv4address` or `reg-name`, there is no need to consider `IPv4address`.
838    // This is because `reg-name` includes `IPv4address`. More specifically, since `unreserved`
839    // contains `DIGIT` and `.`, `IPv4address` can be regarded as a specific sequence of `unreserved`.
840    //
841    // `userinfo` and `reg-name` are rules that share characters other than colons.
842    // Therefore, they can be distinguished using the following algorithm.
843    //
844    // 1. Increment the counter as long as it matches `userinfo`.
845    // 2. If the first ":" is encountered, note its position.
846    // 3. Determine the matching rule according to the characters that did not match `userinfo`.
847    //      i.   If it is "@", the string seen so far is `userinfo`.
848    //      ii.  If it is "[" , then an `host` matching "IP-literal" should start there,
849    //           but since there is no "@" immediately before it, it is an error.
850    //      iii. In other cases, if the position of ":" is noted, the string before it is `host`;
851    //                           if not, all strings seen so far are `host`.
852    //
853    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
854    //
855    // reg-name    = *( unreserved / pct-encoded / sub-delims )
856    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
857    //
858    // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
859    let mut colon = usize::MAX;
860    let mut now = 0;
861    let mut t = *b;
862    while !t.is_empty() {
863        let pos = t
864            .iter()
865            .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b'%')
866            .unwrap_or(t.len());
867        t = &t[pos..];
868        now += pos;
869        if let Some(rem) = t.strip_prefix(b":") {
870            now += 1;
871            t = rem;
872            colon = colon.min(now);
873        } else {
874            break;
875        }
876    }
877
878    debug_assert_eq!(now, b.len() - t.len());
879
880    if let Some(rem) = t.strip_prefix(b"@") {
881        *b = rem;
882        parse_host(b)?;
883        if let Some(rem) = b.strip_prefix(b":") {
884            *b = rem;
885            parse_port(b)?;
886        }
887        Ok(())
888    } else if t.starts_with(b"[") {
889        Err(ParseRIError::InvalidAuthority)
890    } else if colon < usize::MAX {
891        *b = &b[colon + 1..];
892        parse_port(b)
893    } else {
894        *b = t;
895        Ok(())
896    }
897}
898
899// This function has no use.
900// /// # Reference
901// /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
902// ///
903// /// ```text
904// /// userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
905// /// ```
906// fn parse_userinfo(b: &mut &[u8]) -> Result<(), ParseRIError> {
907//     todo!()
908// }
909
910/// # Reference
911/// [3.2.2.  Host]
912///
913/// ```text
914/// host        = IP-literal / IPv4address / reg-name
915/// ```
916fn parse_host(b: &mut &[u8]) -> Result<(), ParseRIError> {
917    if b.starts_with(b"[") {
918        parse_ip_literal(b)
919    } else {
920        // Since `IPv4address` is covered by `reg-name`, it does not need to be considered.
921        parse_reg_name(b)
922    }
923}
924
925/// # Reference
926/// [3.2.2.  Host]
927///
928/// ```text
929/// IP-literal  = "[" ( IPv6address / IPvFuture  ) "]"
930/// ```
931fn parse_ip_literal(b: &mut &[u8]) -> Result<(), ParseRIError> {
932    *b = b.strip_prefix(b"[").ok_or(ParseRIError::InvalidIPLiteral)?;
933    if !b.is_empty() && b[0].eq_ignore_ascii_case(&b'v') {
934        parse_ipv_future(b)?;
935    } else {
936        parse_ipv6_address(b)?;
937    }
938    *b = b.strip_prefix(b"]").ok_or(ParseRIError::InvalidIPLiteral)?;
939    Ok(())
940}
941
942/// # Reference
943/// [3.2.2.  Host]
944///
945/// ```text
946/// IPvFuture   = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
947/// ```
948fn parse_ipv_future(b: &mut &[u8]) -> Result<(), ParseRIError> {
949    if b.is_empty() || !b[0].eq_ignore_ascii_case(&b'v') {
950        return Err(ParseRIError::InvalidIPvFuture);
951    }
952    *b = &b[1..];
953    let pos = b
954        .iter()
955        .position(|&b| !b.is_ascii_hexdigit())
956        .unwrap_or(b.len());
957    if !(1..=b.len() - 2).contains(&pos) {
958        return Err(ParseRIError::InvalidIPvFuture);
959    }
960    *b = &b[pos..];
961    *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidIPvFuture)?;
962    let pos = b
963        .iter()
964        .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b':')
965        .unwrap_or(b.len());
966    if pos == 0 {
967        return Err(ParseRIError::InvalidIPvFuture);
968    }
969    *b = &b[pos..];
970    Ok(())
971}
972
973/// # Reference
974/// [3.2.2.  Host]
975///
976/// ```text
977/// IPv6address =                            6( h16 ":" ) ls32
978///             /                       "::" 5( h16 ":" ) ls32
979///             / [               h16 ] "::" 4( h16 ":" ) ls32
980///             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
981///             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
982///             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
983///             / [ *4( h16 ":" ) h16 ] "::"              ls32
984///             / [ *5( h16 ":" ) h16 ] "::"              h16
985///             / [ *6( h16 ":" ) h16 ] "::"
986///  ls32       = ( h16 ":" h16 ) / IPv4address
987///             ; least-significant 32 bits of address
988///  h16        = 1*4HEXDIG
989///             ; 16 bits of address represented in hexadecimal
990/// ```
991fn parse_ipv6_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
992    let mut cnt = 1;
993    let mut omit = false;
994    if let Some(rem) = b.strip_prefix(b":") {
995        *b = rem;
996        omit = true;
997    } else {
998        parse_h16(b)?;
999    }
1000
1001    while cnt + (omit as i32) < 8
1002        && let Some(rem) = b.strip_prefix(b":")
1003    {
1004        *b = rem;
1005        if b.starts_with(b":") {
1006            if omit {
1007                return Err(ParseRIError::InvalidIPv6address);
1008            }
1009            omit = true;
1010            cnt += 1;
1011            continue;
1012        }
1013
1014        // It's not a smart approach, but it'll probably work...
1015        //
1016        // Checking `h16` first will not work because it cannot be distinguished
1017        // from the first octet of the IPv4 address.
1018        //
1019        // Checking the positions where ':' and '.' appear also seems unlikely to work,
1020        // considering cases where such characters appear in the segments of the following paths.
1021        let mut dum = *b;
1022        if parse_ipv4_address(&mut dum).is_ok() {
1023            *b = dum;
1024            // An IPv4 address consumes two hextets.
1025            cnt += 2;
1026            // An IPv4 address only appears at the end.
1027            break;
1028        } else if !b.is_empty() && b[0].is_ascii_hexdigit() {
1029            parse_h16(b)?;
1030        }
1031    }
1032
1033    // If "::" is included, some hextets may be omitted, resulting in fewer than eight.
1034    // Otherwise, exactly eight hextets are required.
1035    if (omit && cnt <= 8) || (!omit && cnt == 8) {
1036        Ok(())
1037    } else {
1038        Err(ParseRIError::InvalidIPv6address)
1039    }
1040}
1041
1042/// # Reference
1043/// [3.2.2.  Host]
1044///
1045/// ```text
1046///  h16        = 1*4HEXDIG
1047///             ; 16 bits of address represented in hexadecimal
1048/// ```
1049fn parse_h16(b: &mut &[u8]) -> Result<(), ParseRIError> {
1050    let pos = b
1051        .iter()
1052        .position(|&b| !b.is_ascii_hexdigit())
1053        .unwrap_or(b.len());
1054    if pos == 0 {
1055        Err(ParseRIError::InvalidH16)
1056    } else {
1057        *b = &b[pos.min(4)..];
1058        Ok(())
1059    }
1060}
1061
1062/// # Reference
1063/// [3.2.2.  Host]
1064///
1065/// ```text
1066/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
1067/// dec-octet   = DIGIT                 ; 0-9
1068///             / %x31-39 DIGIT         ; 10-99
1069///             / "1" 2DIGIT            ; 100-199
1070///             / "2" %x30-34 DIGIT     ; 200-249
1071///             / "25" %x30-35          ; 250-255
1072/// ```
1073fn parse_ipv4_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
1074    parse_dec_octet(b)?;
1075    for _ in 0..3 {
1076        *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidDecOctet)?;
1077        parse_dec_octet(b)?;
1078    }
1079    Ok(())
1080}
1081fn parse_dec_octet(b: &mut &[u8]) -> Result<(), ParseRIError> {
1082    let len = match b {
1083        [b'2', b'5', b'0'..=b'5', ..] => 3,
1084        [b'2', b'0'..=b'4', b'0'..=b'9', ..] => 3,
1085        [b'1', b'0'..=b'9', b'0'..=b'9', ..] => 3,
1086        [b'1'..=b'9', b'0'..=b'9', ..] => 2,
1087        [b'0'..=b'9', ..] => 1,
1088        _ => return Err(ParseRIError::InvalidDecOctet),
1089    };
1090    *b = &b[len..];
1091    Ok(())
1092}
1093
1094/// # Reference
1095/// [3.2.2.  Host]
1096///
1097/// ```text
1098/// reg-name    = *( unreserved / pct-encoded / sub-delims )
1099/// ```
1100fn parse_reg_name(b: &mut &[u8]) -> Result<(), ParseRIError> {
1101    // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1102    // reg-name      = pchar - (":" | "@")
1103    while !b.is_empty() && !matches!(b[0], b':' | b'@') && parse_pchar(b).is_ok() {}
1104    Ok(())
1105}
1106
1107/// # Reference
1108/// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
1109///
1110/// ```text
1111/// port        = *DIGIT
1112/// ```
1113fn parse_port(b: &mut &[u8]) -> Result<(), ParseRIError> {
1114    let pos = b
1115        .iter()
1116        .position(|&b| !b.is_ascii_digit())
1117        .unwrap_or(b.len());
1118    *b = &b[pos..];
1119    Ok(())
1120}
1121
1122/// # Reference
1123/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1124///
1125/// ```text
1126/// path-abempty  = *( "/" segment )
1127/// ```
1128fn parse_path_abempty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1129    while let Some(rem) = b.strip_prefix(b"/") {
1130        *b = rem;
1131        parse_segment(b)?;
1132    }
1133    Ok(())
1134}
1135
1136/// # Reference
1137/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1138///
1139/// ```text
1140/// path-absolute = "/" [ segment-nz *( "/" segment ) ]
1141/// ```
1142fn parse_path_absolute(b: &mut &[u8]) -> Result<(), ParseRIError> {
1143    *b = b
1144        .strip_prefix(b"/")
1145        .ok_or(ParseRIError::InvalidPathAbsolute)?;
1146    if parse_segment_nz(b).is_ok() {
1147        while let Some(rem) = b.strip_prefix(b"/") {
1148            *b = rem;
1149            parse_segment(b)?;
1150        }
1151    }
1152    Ok(())
1153}
1154
1155/// # Reference
1156/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1157///
1158/// ```text
1159/// path-noscheme = segment-nz-nc *( "/" segment )
1160/// ```
1161fn parse_path_noscheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
1162    parse_segment_nz_nc(b)?;
1163    while let Some(rem) = b.strip_prefix(b"/") {
1164        *b = rem;
1165        parse_segment(b)?;
1166    }
1167    Ok(())
1168}
1169
1170/// # Reference
1171/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1172///
1173/// ```text
1174/// path-rootless = segment-nz *( "/" segment )
1175/// ```
1176fn parse_path_rootless(b: &mut &[u8]) -> Result<(), ParseRIError> {
1177    parse_segment_nz(b)?;
1178    while let Some(rem) = b.strip_prefix(b"/") {
1179        *b = rem;
1180        parse_segment(b)?;
1181    }
1182    Ok(())
1183}
1184
1185// This is not necessary because this does nothing.
1186// /// # Reference
1187// /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1188// ///
1189// /// ```text
1190// /// path-empty    = 0<pchar>
1191// /// ```
1192// fn parse_path_empty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1193//     todo!()
1194// }
1195
1196/// # Reference
1197/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1198///
1199/// ```text
1200/// segment       = *pchar
1201/// ```
1202fn parse_segment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1203    while parse_pchar(b).is_ok() {}
1204    Ok(())
1205}
1206
1207/// # Reference
1208/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1209///
1210/// ```text
1211/// segment-nz    = 1*pchar
1212/// ```
1213fn parse_segment_nz(b: &mut &[u8]) -> Result<(), ParseRIError> {
1214    parse_pchar(b)?;
1215    while parse_pchar(b).is_ok() {}
1216    Ok(())
1217}
1218
1219/// # Reference
1220/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1221///
1222/// ```text
1223/// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
1224///                     ; non-zero-length segment without any colon ":"
1225/// ```
1226fn parse_segment_nz_nc(b: &mut &[u8]) -> Result<(), ParseRIError> {
1227    if b.is_empty() || b[0] == b':' || parse_pchar(b).is_err() {
1228        return Err(ParseRIError::InvalidSegmentNzNc);
1229    }
1230    while !b.is_empty() && b[0] != b':' && parse_pchar(b).is_ok() {}
1231    Ok(())
1232}
1233
1234/// # Reference
1235/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1236///
1237/// ```text
1238/// pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1239/// ```
1240fn parse_pchar(b: &mut &[u8]) -> Result<(), ParseRIError> {
1241    if b.is_empty() {
1242        return Err(ParseRIError::InvalidPChar);
1243    }
1244
1245    if is_unreserved(b[0]) || is_sub_delims(b[0]) || matches!(b[0], b':' | b'@') {
1246        *b = &b[1..];
1247        Ok(())
1248    } else if b.len() >= 3 && b[0] == b'%' && b[1].is_ascii_hexdigit() && b[2].is_ascii_hexdigit() {
1249        *b = &b[3..];
1250        Ok(())
1251    } else {
1252        Err(ParseRIError::InvalidPChar)
1253    }
1254}
1255
1256/// # Reference
1257/// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
1258///
1259/// ```text
1260/// query       = *( pchar / "/" / "?" )
1261/// ```
1262fn parse_query(b: &mut &[u8]) -> Result<(), ParseRIError> {
1263    loop {
1264        if let Some(rem) = b.strip_prefix(b"/") {
1265            *b = rem;
1266        } else if let Some(rem) = b.strip_prefix(b"?") {
1267            *b = rem;
1268        } else if parse_pchar(b).is_ok() {
1269            // no op
1270        } else {
1271            break Ok(());
1272        }
1273    }
1274}
1275
1276/// # Reference
1277/// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
1278///
1279/// ```text
1280/// fragment    = *( pchar / "/" / "?" )
1281/// ```
1282fn parse_fragment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1283    loop {
1284        if let Some(rem) = b.strip_prefix(b"/") {
1285            *b = rem;
1286        } else if let Some(rem) = b.strip_prefix(b"?") {
1287            *b = rem;
1288        } else if parse_pchar(b).is_ok() {
1289            // no op
1290        } else {
1291            break Ok(());
1292        }
1293    }
1294}
1295
1296/// # Reference
1297/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1298///
1299/// ```text
1300/// relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
1301/// ```
1302fn parse_relative_ref(b: &mut &[u8]) -> Result<(), ParseRIError> {
1303    parse_relative_part(b)?;
1304    if let Some(query) = b.strip_prefix(b"?") {
1305        *b = query;
1306        parse_query(b)?;
1307    }
1308    if let Some(fragment) = b.strip_prefix(b"#") {
1309        *b = fragment;
1310        parse_fragment(b)?;
1311    }
1312    Ok(())
1313}
1314
1315/// # Reference
1316/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1317///
1318/// ```text
1319/// relative-part = "//" authority path-abempty
1320///               / path-absolute
1321///               / path-noscheme
1322///               / path-empty
1323/// ```
1324fn parse_relative_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
1325    if let Some(rem) = b.strip_prefix(b"/") {
1326        if let Some(rem) = rem.strip_prefix(b"/") {
1327            *b = rem;
1328            parse_authority(b)?;
1329            parse_path_abempty(b)
1330        } else {
1331            parse_path_absolute(b)
1332        }
1333    } else {
1334        let orig = b.len();
1335        let ret = parse_path_noscheme(b);
1336        // If no characters have been consumed, it matches `path-empty` and returns `Ok`.
1337        if orig == b.len() { Ok(()) } else { ret }
1338    }
1339}
1340
1341/// # Reference
1342/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
1343///
1344/// ```text
1345/// reserved    = gen-delims / sub-delims
1346/// ```
1347fn is_reserved(b: u8) -> bool {
1348    is_gen_delims(b) || is_sub_delims(b)
1349}
1350
/// Returns `true` when `b` is one of the general delimiters that separate URI components.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// ```
fn is_gen_delims(b: u8) -> bool {
    b":/?#[]@".contains(&b)
}
1360
/// Returns `true` when `b` is one of the sub-delimiters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
/// ```
fn is_sub_delims(b: u8) -> bool {
    b"!$&'()*+,;=".contains(&b)
}
1373
/// Returns `true` when `b` is an unreserved character: an ASCII letter or digit, or one of
/// `-`, `.`, `_`, `~`.
///
/// # Reference
/// [2.3.  Unreserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
///
/// ```text
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
fn is_unreserved(b: u8) -> bool {
    match b {
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' => true,
        b'-' | b'.' | b'_' | b'~' => true,
        _ => false,
    }
}
1383
/// Percent-encoding lookup table: the three bytes starting at index `3 * n` are the escape
/// sequence `%XX` for the byte value `n`, with `XX` in upper-case hexadecimal.
const LUT_BYTES: [u8; 256 * 3] = {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    // Every entry starts with '%', so pre-fill the table with it and only write the digits.
    let mut table = [b'%'; 256 * 3];
    let mut value = 0;
    // `for` loops are not available in const evaluation, hence the manual counter.
    while value < 256 {
        table[3 * value + 1] = HEX_DIGITS[value >> 4];
        table[3 * value + 2] = HEX_DIGITS[value & 0xF];
        value += 1;
    }
    table
};

/// [`LUT_BYTES`] viewed as a string, so that escape sequences can be appended to a `String`
/// by slicing.
const LUT: &str = unsafe {
    // SAFETY: `LUT_BYTES` consists solely of '%' and ASCII hexadecimal digits, all of which
    // are single-byte UTF-8 sequences, so the table is valid UTF-8.
    from_utf8_unchecked(&LUT_BYTES)
};
1406
1407pub fn escape(s: &str) -> Cow<'_, str> {
1408    escape_except(s, |_| false)
1409}
1410
1411pub fn escape_bytes(b: &[u8]) -> Cow<'_, [u8]> {
1412    escape_bytes_except(b, |_| false)
1413}
1414
1415pub fn escape_except(s: &str, is_except: impl Fn(char) -> bool) -> Cow<'_, str> {
1416    let cap = s
1417        .chars()
1418        .filter_map(|c| (!is_except(c)).then_some(c.len_utf8() * 2))
1419        .sum::<usize>();
1420    if cap == 0 {
1421        return Cow::Borrowed(s);
1422    }
1423    let mut encode = [0; 6];
1424    let mut buf = String::with_capacity(s.len() + cap);
1425    for c in s.chars() {
1426        if is_except(c) {
1427            buf.push(c);
1428        } else {
1429            let encoded = c.encode_utf8(&mut encode);
1430            for b in encoded.bytes() {
1431                let index = b as usize * 3;
1432                buf.push_str(&LUT[index..index + 3]);
1433            }
1434        }
1435    }
1436    Cow::Owned(buf)
1437}
1438
1439pub fn escape_bytes_except(b: &[u8], is_except: impl Fn(u8) -> bool) -> Cow<'_, [u8]> {
1440    let cap = b.iter().copied().filter(|&b| !is_except(b)).count() * 2;
1441    if cap == 0 {
1442        return Cow::Borrowed(b);
1443    }
1444    let mut buf = Vec::with_capacity(b.len() + cap);
1445    for &b in b {
1446        if is_except(b) {
1447            buf.push(b);
1448        } else {
1449            let index = b as usize * 3;
1450            buf.extend_from_slice(&LUT_BYTES[index..index + 3]);
1451        }
1452    }
1453    Cow::Owned(buf)
1454}
1455
/// Error returned by [`unescape`] and [`unescape_bytes`].
pub enum URIUnescapeError {
    /// A `%` was not followed by two hexadecimal digits.
    InvalidEscape,
    /// The decoded bytes are not valid UTF-8 (produced by [`unescape`] when it converts the
    /// decoded bytes back into a string).
    Utf8Error(std::str::Utf8Error),
}
1460
1461impl From<std::str::Utf8Error> for URIUnescapeError {
1462    fn from(value: std::str::Utf8Error) -> Self {
1463        Self::Utf8Error(value)
1464    }
1465}
1466
1467pub fn unescape(s: &str) -> Result<Cow<'_, str>, URIUnescapeError> {
1468    if !s.contains('%') {
1469        return Ok(Cow::Borrowed(s));
1470    }
1471
1472    let mut split = s.split('%');
1473    let mut buf = String::with_capacity(s.len());
1474    buf.push_str(split.next().unwrap());
1475    let mut bytes = vec![];
1476    for chunk in split {
1477        if chunk.len() < 2 {
1478            return Err(URIUnescapeError::InvalidEscape);
1479        }
1480        let byte =
1481            u8::from_str_radix(&chunk[..2], 16).map_err(|_| URIUnescapeError::InvalidEscape)?;
1482        bytes.push(byte);
1483
1484        if chunk.len() > 2 {
1485            buf.push_str(from_utf8(&bytes)?);
1486            buf.push_str(&chunk[2..]);
1487            bytes.clear();
1488        }
1489    }
1490
1491    if !bytes.is_empty() {
1492        buf.push_str(from_utf8(&bytes)?);
1493    }
1494    Ok(Cow::Owned(buf))
1495}
1496
1497pub fn unescape_bytes(b: &[u8]) -> Result<Cow<'_, [u8]>, URIUnescapeError> {
1498    if !b.contains(&b'%') {
1499        return Ok(Cow::Borrowed(b));
1500    }
1501
1502    let mut split = b.split(|&b| b == b'%');
1503    let mut buf = Vec::with_capacity(b.len());
1504    buf.extend_from_slice(split.next().unwrap());
1505
1506    fn hexdigit_to_byte(hex: u8) -> u8 {
1507        if hex.is_ascii_digit() {
1508            hex - b'0'
1509        } else if hex.is_ascii_uppercase() {
1510            hex - b'A' + 10
1511        } else {
1512            hex - b'a' + 10
1513        }
1514    }
1515    for chunk in split {
1516        if chunk.len() < 2 || !chunk[0].is_ascii_hexdigit() || !chunk[1].is_ascii_hexdigit() {
1517            return Err(URIUnescapeError::InvalidEscape);
1518        }
1519        let hi = hexdigit_to_byte(chunk[0]);
1520        let lo = hexdigit_to_byte(chunk[1]);
1521        buf.push((hi << 4) | lo);
1522    }
1523    Ok(Cow::Owned(buf))
1524}
1525
/// Progress of a [`Components`] iterator: each variant names the component that the iterator
/// will look for next.
#[derive(Debug, Clone, Copy)]
enum DecomposeState {
    /// Initial state: look for a `scheme` followed by `:`.
    Scheme,
    /// Look for an authority introduced by `//`.
    Authority,
    /// Look for the `/` that starts an absolute path.
    Root,
    /// Split off path segments one at a time.
    Path,
    /// Look for a query introduced by `?`.
    Query,
    /// Look for a fragment introduced by `#`.
    Fragment,
    /// Everything has been reported; the iterator returns `None`.
    Finish,
}
1536
/// Iterator that splits a URI string into its [`Component`]s from left to right.
pub struct Components<'a> {
    /// Which component is looked for next.
    state: DecomposeState,
    /// The part of the URI that has not been reported yet.
    uri: &'a str,
}
1541
1542impl Components<'_> {
1543    fn new(uri: &str) -> Components<'_> {
1544        Components {
1545            state: DecomposeState::Scheme,
1546            uri,
1547        }
1548    }
1549}
1550
1551impl<'a> Iterator for Components<'a> {
1552    type Item = Component<'a>;
1553
1554    fn next(&mut self) -> Option<Self::Item> {
1555        use DecomposeState::*;
1556        loop {
1557            match self.state {
1558                Scheme => {
1559                    self.state = Authority;
1560                    let mut bytes = self.uri.as_bytes();
1561                    if parse_scheme(&mut bytes).is_ok() && bytes.starts_with(b":") {
1562                        let len = self.uri.len() - bytes.len();
1563                        let (scheme, rem) = self.uri.split_at(len);
1564                        self.uri = &rem[1..];
1565                        break Some(Component::Scheme(scheme));
1566                    }
1567                }
1568                Authority => {
1569                    self.state = Root;
1570                    if let Some(rem) = self.uri.strip_prefix("//") {
1571                        let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
1572                        let (mut authority, rem) = rem.split_at(pos);
1573                        self.uri = rem;
1574                        let mut userinfo = None;
1575                        if let Some((ui, rem)) = authority.split_once('@') {
1576                            userinfo = Some(ui);
1577                            authority = rem;
1578                        }
1579                        let mut port = None;
1580                        if let Some((host, p)) = authority.rsplit_once(':')
1581                            && p.bytes().all(|b| b.is_ascii_digit())
1582                        {
1583                            port = Some(p);
1584                            authority = host;
1585                        }
1586                        break Some(Component::Authority {
1587                            userinfo,
1588                            host: authority,
1589                            port,
1590                        });
1591                    }
1592                }
1593                Root => {
1594                    self.state = Path;
1595                    if let Some(rem) = self.uri.strip_prefix('/') {
1596                        self.uri = rem;
1597                        break Some(Component::RootSegment);
1598                    }
1599                }
1600                Path => {
1601                    let pos = self
1602                        .uri
1603                        .bytes()
1604                        .position(|b| b == b'/' || b == b'?' || b == b'#')
1605                        .unwrap_or(self.uri.len());
1606                    let (segment, rem) = self.uri.split_at(pos);
1607                    if let Some(rem) = rem.strip_prefix('/') {
1608                        self.uri = rem;
1609                    } else {
1610                        self.uri = rem;
1611                        self.state = Query;
1612                    }
1613                    break Some(Component::Segment(segment));
1614                }
1615                Query => {
1616                    self.state = Fragment;
1617                    if let Some(rem) = self.uri.strip_prefix('?') {
1618                        let pos = rem.bytes().position(|b| b == b'#').unwrap_or(rem.len());
1619                        let (query, rem) = rem.split_at(pos);
1620                        self.uri = rem;
1621                        break Some(Component::Query(query));
1622                    }
1623                }
1624                Fragment => {
1625                    debug_assert!(self.uri.is_empty() || self.uri.starts_with('#'));
1626                    self.state = Finish;
1627                    if !self.uri.is_empty() {
1628                        let (_, frag) = self.uri.split_at(1);
1629                        self.uri = "";
1630                        break Some(Component::Fragment(frag));
1631                    }
1632                }
1633                Finish => break None,
1634            }
1635        }
1636    }
1637}
1638
/// One syntactic component of a URI, as produced by the [`Components`] iterator.
///
/// The delimiters that introduce or separate components (`:`, `//`, `@`, `/`, `?`, `#`) are
/// not included in the borrowed text.
pub enum Component<'a> {
    /// The scheme, without the trailing `:`.
    Scheme(&'a str),
    /// The authority that follows `//`.
    Authority {
        /// The text before the first `@`, if there is one.
        userinfo: Option<&'a str>,
        /// The host; an IP literal keeps its brackets.
        host: &'a str,
        /// The text after the last `:`, if it consists only of ASCII digits (it may be
        /// empty).
        port: Option<&'a str>,
    },
    /// The `/` at the start of the path.
    RootSegment,
    /// One path segment (possibly empty), without any `/`.
    Segment(&'a str),
    /// The query, without the leading `?`.
    Query(&'a str),
    /// The fragment, without the leading `#`.
    Fragment(&'a str),
}
anyxml_uri/uri.rs

anyxml_uri/
uri.rs