anyxml_uri/
uri.rs

1use std::{
2    borrow::{Borrow, Cow},
3    ops::Deref,
4    path::Path,
5    rc::Rc,
6    str::{from_utf8, from_utf8_unchecked},
7    sync::Arc,
8};
9
10use crate::ParseRIError;
11
/// A borrowed URI reference, stored in its escaped (percent-encoded) form.
///
/// This is an unsized, `#[repr(transparent)]` wrapper around `str`; its owned
/// counterpart is [`URIString`].
#[derive(Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIStr {
    /// The escaped URI text.
    uri: str,
}
17
18impl URIStr {
19    fn new(s: &str) -> &Self {
20        unsafe {
21            // # Safety
22            // Since `URIStr` is a transparent newtype of `str`,
23            // the bit patterns are exactly the same and have the same features.
24            &*(s as *const str as *const Self)
25        }
26    }
27
28    /// Resolve the relative reference `reference` using `self` as the base URI.
29    ///
30    /// `self` must be convertible to an [absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
31    /// through [fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5) removal
32    /// and normalization.
33    ///
34    /// # Reference
35    /// - [5.1.  Establishing a Base URI](https://datatracker.ietf.org/doc/html/rfc3986#section-5.1)
36    /// - [5.2.  Relative Resolution](https://datatracker.ietf.org/doc/html/rfc3986#section-5.2)
37    pub fn resolve(&self, reference: &Self) -> URIString {
38        use Component::*;
39
40        let base = if self.is_absolute() {
41            Cow::Borrowed(self)
42        } else {
43            let mut base = self.to_owned();
44            base.normalize();
45            if let Some(frag) = base.uri.bytes().position(|b| b == b'#') {
46                base.uri.truncate(frag);
47            }
48            assert!(
49                base.is_absolute(),
50                "'{}' is not absolute",
51                base.as_escaped_str()
52            );
53            Cow::Owned(base)
54        };
55
56        let mut ref_components = reference.components().peekable();
57        if ref_components
58            .next_if(|comp| matches!(comp, Scheme(_)))
59            .is_some()
60        {
61            let mut ret = reference.to_owned();
62            ret.normalize();
63            return ret;
64        }
65
66        if ref_components
67            .next_if(|comp| matches!(comp, Authority { .. }))
68            .is_some()
69        {
70            // has authority
71            let mut ret = URIString {
72                uri: [base.scheme().unwrap(), ":", &reference.uri].concat(),
73            };
74            ret.normalize();
75            return ret;
76        }
77
78        let mut components = base.components().peekable();
79        let mut uri = String::new();
80        if let Some(Scheme(scheme)) = components.next_if(|comp| matches!(comp, Scheme(_))) {
81            uri.push_str(scheme);
82            uri.push(':');
83        }
84        if let Some(Authority {
85            userinfo,
86            host,
87            port,
88        }) = components.next_if(|comp| matches!(comp, Authority { .. }))
89        {
90            uri.push_str("//");
91            if let Some(userinfo) = userinfo {
92                uri.push_str(userinfo);
93                uri.push(':');
94            }
95            uri.push_str(host);
96            if let Some(port) = port {
97                uri.push(':');
98                uri.push_str(port);
99            }
100        }
101
102        if ref_components
103            .next_if(|comp| matches!(comp, RootSegment))
104            .is_some()
105        {
106            uri.push_str(&reference.uri);
107            let mut ret = URIString { uri };
108            ret.normalize();
109            return ret;
110        }
111
112        let mut segments = vec![];
113        let has_root = components
114            .next_if(|comp| matches!(comp, RootSegment))
115            .is_some();
116        let mut has_dot_segment = false;
117        while let Some(Segment(segment)) = components.next_if(|comp| matches!(comp, Segment(_))) {
118            segments.push(segment);
119            has_dot_segment |= segment == "." || segment == "..";
120        }
121        if has_dot_segment {
122            segments = normalize_path_segments(segments.into_iter(), has_root);
123        }
124
125        let mut has_path = false;
126        if let Some(Segment(segment)) = ref_components.next_if(|comp| matches!(comp, Segment(_))) {
127            let mut buf = vec![segment];
128            while let Some(Segment(segment)) =
129                ref_components.next_if(|comp| matches!(comp, Segment(_)))
130            {
131                buf.push(segment);
132            }
133            if buf.len() > 1 || !buf[0].is_empty() {
134                segments.pop();
135                segments.extend(buf);
136                has_path = true;
137            }
138        }
139        build_normalized_path(segments.into_iter(), has_root, &mut uri);
140
141        if let Some(Query(query)) = ref_components.next_if(|comp| matches!(comp, Query(_))) {
142            uri.push('?');
143            uri.push_str(query);
144        } else if !has_path
145            && let Some(Query(query)) = components.next_if(|comp| matches!(comp, Query(_)))
146        {
147            uri.push('?');
148            uri.push_str(query);
149        }
150
151        if let Some(Fragment(fragment)) = ref_components.next() {
152            uri.push('#');
153            uri.push_str(fragment);
154        }
155
156        URIString { uri }
157    }
158
159    /// Return the escaped URI string.
160    pub fn as_escaped_str(&self) -> &str {
161        &self.uri
162    }
163
164    /// Return the unescaped URI string.  \
165    /// If unescaping fails, return `None`.
166    pub fn as_unescaped_str(&self) -> Option<Cow<'_, str>> {
167        unescape(&self.uri).ok()
168    }
169
170    /// # Reference
171    /// [4.3.  Absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
172    pub fn is_absolute(&self) -> bool {
173        self.scheme().is_some() && self.fragment().is_none()
174    }
175
176    /// # Reference
177    /// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
178    pub fn is_relative(&self) -> bool {
179        self.scheme().is_none()
180    }
181
182    /// # Reference
183    /// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
184    pub fn scheme(&self) -> Option<&str> {
185        let pos = self.uri.bytes().position(is_reserved)?;
186        (self.uri.as_bytes()[pos] == b':').then_some(&self.uri[..pos])
187    }
188
189    /// # Reference
190    /// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
191    pub fn authority(&self) -> Option<&str> {
192        let rem = self
193            .uri
194            .strip_prefix("//")
195            .or_else(|| self.uri.split_once("://").map(|p| p.1))?;
196        Some(rem.split_once('/').map(|p| p.0).unwrap_or(rem))
197    }
198
199    /// # Reference
200    /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
201    pub fn userinfo(&self) -> Option<&str> {
202        Some(self.authority()?.split_once('@')?.0)
203    }
204
205    /// # Reference
206    /// [3.2.2.  Host](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2)
207    pub fn host(&self) -> Option<&str> {
208        let mut auth = self.authority()?;
209        if let Some((_userinfo, rem)) = auth.split_once('@') {
210            auth = rem;
211        }
212        if let Some((host, port)) = auth.rsplit_once(':')
213            && port.bytes().all(|b| b.is_ascii_digit())
214        {
215            auth = host;
216        }
217        Some(auth)
218    }
219
220    /// # Reference
221    /// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
222    pub fn port(&self) -> Option<&str> {
223        let (_, port) = self.authority()?.rsplit_once(':')?;
224        port.bytes().all(|b| b.is_ascii_digit()).then_some(port)
225    }
226
227    /// # Reference
228    /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
229    pub fn path(&self) -> &str {
230        let mut path = &self.uri;
231        if let Some(scheme) = self.scheme() {
232            // has scheme
233            path = &path[scheme.len() + 1..];
234        }
235        if let Some(rem) = path.strip_prefix("//") {
236            // has authority
237            let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
238            path = &rem[pos..]
239        }
240
241        path.split_once(['?', '#']).map(|p| p.0).unwrap_or(path)
242    }
243
244    /// # Reference
245    /// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
246    pub fn query(&self) -> Option<&str> {
247        let pos = self.uri.bytes().position(|b| b == b'?' || b == b'#')?;
248        if self.uri.as_bytes()[pos] == b'#' {
249            return None;
250        }
251        let query = &self.uri[pos + 1..];
252        let pos = query.bytes().position(|b| b == b'#').unwrap_or(query.len());
253        Some(&query[..pos])
254    }
255
256    /// # Reference
257    /// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
258    pub fn fragment(&self) -> Option<&str> {
259        let pos = self.uri.bytes().position(|b| b == b'#')?;
260        Some(&self.uri[pos + 1..])
261    }
262
263    /// Return an iterator that scans the URI components.
264    pub fn components(&self) -> Components<'_> {
265        Components::new(&self.uri)
266    }
267}
268
269impl ToOwned for URIStr {
270    type Owned = URIString;
271
272    fn to_owned(&self) -> Self::Owned {
273        URIString {
274            uri: self.uri.to_owned(),
275        }
276    }
277}
278
279impl From<&URIStr> for URIString {
280    fn from(value: &URIStr) -> Self {
281        value.to_owned()
282    }
283}
284
/// Identity conversion, so a `&URIStr` is accepted wherever
/// `impl AsRef<URIStr>` is expected.
impl AsRef<URIStr> for URIStr {
    fn as_ref(&self) -> &URIStr {
        self
    }
}
290
291impl Clone for Box<URIStr> {
292    fn clone(&self) -> Self {
293        self.as_ref().into()
294    }
295}
296
297impl std::fmt::Display for URIStr {
298    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
299        write!(
300            f,
301            "{}",
302            self.as_unescaped_str()
303                .as_deref()
304                .unwrap_or(self.as_escaped_str())
305        )
306    }
307}
308
309macro_rules! impl_boxed_convertion_uri_str {
310    ($( $t:ident ),*) => {
311        $(
312            impl From<&URIStr> for $t<URIStr> {
313                fn from(value: &URIStr) -> Self {
314                    let boxed: $t<str> = value.uri.into();
315                    unsafe {
316                        // # Safety
317                        // Since `URIStr` is a transparent newtype of `str`,
318                        // the bit patterns are exactly the same and have the same features.
319                        std::mem::transmute(boxed)
320                    }
321                }
322            }
323        )*
324    };
325}
326impl_boxed_convertion_uri_str!(Box, Rc, Arc);
327
/// An owned URI reference, stored in its escaped (percent-encoded) form.
///
/// This is the owned counterpart of [`URIStr`] and dereferences to it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIString {
    /// Escaped URI string.
    ///
    /// Parts generated from UTF-8 strings can always be converted back
    /// to the original UTF-8 byte sequence.
    /// Similarly, the parts generated from Path can probably be converted back
    /// to the original Path byte sequence.
    ///
    /// As a result of resolving URI references, there may be a mixture of parts generated
    /// from UTF-8 strings and parts generated from Paths, so the whole may not always revert
    /// to a UTF-8 string or Path byte sequence.
    uri: String,
}
343
344impl URIString {
345    /// Parse the string as a URI by escaping all characters not specified as
346    /// [`reserved`](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
347    /// or [`unreserved`](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
348    /// in [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986).
349    ///
350    /// Because certain characters containing `%` are escaped, the result of
351    /// [`URIStr::as_unescaped_str`] is equal to `uri`, but the result of
352    /// [`URIStr::as_escaped_str`] may differ from `uri`.
353    ///
354    /// Since it escapes nearly all characters—including control characters, `%`,
355    /// and non-ASCII characters—it will successfully parse any string that roughly
356    /// follows URI notation.
357    pub fn parse(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
358        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
359            let uri = escape_except(uri, |b| {
360                b.is_ascii() && (is_reserved(b as u8) || is_unreserved(b as u8))
361            });
362            URIString::parse_escaped(&uri)
363        }
364        _parse(uri.as_ref())
365    }
366
367    /// Parse the string as a URI after applying escaping according
368    /// to [XML 1.0 "4.2.2 External Entities"](https://www.w3.org/TR/xml/#sec-external-ent).
369    ///
370    /// Some characters are escaped without escaping the `%` character, so both the result
371    /// of [`URIStr::as_unescaped_str`] and the result of [`URIStr::as_escaped_str`] may
372    /// differ from `uri`.
373    ///
374    /// > System identifiers (and other XML strings meant to be used as URI references) may
375    /// > contain characters that, according to [IETF RFC 3986], must be escaped before a
376    /// > URI can be used to retrieve the referenced resource. The characters to be escaped
377    /// > are the control characters #x0 to #x1F and #x7F (most of which cannot appear in
378    /// > XML), space #x20, the delimiters '<' #x3C, '>' #x3E and '"' #x22, the unwise
379    /// > characters '{' #x7B, '}' #x7D, '|' #x7C, '\' #x5C, '^' #x5E and '`' #x60, as well
380    /// > as all characters above #x7F. Since escaping is not always a fully reversible
381    /// > process, it MUST be performed only when absolutely necessary and as late as
382    /// > possible in a processing chain. In particular, neither the process of converting
383    /// > a relative URI to an absolute one nor the process of passing a URI reference to a
384    /// > process or software component responsible for dereferencing it SHOULD trigger
385    /// > escaping. When escaping does occur, it MUST be performed as follows:
386    /// >
387    /// > 1. Each character to be escaped is represented in UTF-8 [Unicode] as one or more
388    /// >    bytes.
389    /// > 2. The resulting bytes are escaped with the URI escaping mechanism (that is,
390    /// >    converted to % HH, where HH is the hexadecimal notation of the byte value).
391    /// > 3. The original character is replaced by the resulting character sequence.
392    pub fn parse_system_id(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
393        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
394            let uri = escape_except(uri, |b| {
395                // XML 1.0 "4.2.2 External Entities"
396                b.is_ascii()
397                    && !matches!(
398                        b as u8,
399                        0..=0x1F
400                            | 0x20
401                            | 0x22
402                            | 0x3C
403                            | 0x3E
404                            | 0x5C
405                            | 0x5E
406                            | 0x60
407                            | 0x7B..=0x7D
408                            | 0x7F..
409                    )
410            });
411            URIString::parse_escaped(&uri)
412        }
413        _parse(uri.as_ref())
414    }
415
416    /// Parse the string as a URI without performing any escape processing whatsoever.  \
417    /// In other words, `uri` is treated as an escaped string.
418    ///
419    /// Since percent-encoded characters are treated as percent-encoded, the result of
420    /// [`URIStr::as_unescaped_str`] may differ from `uri`. On the other hand, since no
421    /// escaping is performed at all, the result of [`URIStr::as_escaped_str`] is always
422    /// equal to `uri`.
423    fn parse_escaped(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
424        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
425            let mut bytes = uri.as_bytes();
426            parse_uri_reference(&mut bytes)?;
427            if !bytes.is_empty() {
428                Err(ParseRIError::NotTermination)
429            } else {
430                Ok(URIString {
431                    uri: uri.to_owned(),
432                })
433            }
434        }
435        _parse(uri.as_ref())
436    }
437
438    /// # Note
439    /// In the current implementation, paths that cannot be converted to UTF-8 strings
440    /// cannot be handled.  \
441    /// I don't think there will be any problems in most environments, but there may be
442    /// some paths that cannot be handled.
443    pub fn parse_file_path(path: impl AsRef<Path>) -> Result<Self, ParseRIError> {
444        #[cfg(target_family = "unix")]
445        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
446            let mut path_str = path.to_str().ok_or(ParseRIError::Unsupported)?.to_owned();
447            if (path.is_dir() || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")))
448                && !path_str.ends_with('/')
449            {
450                path_str.push('/');
451            }
452            if path.is_absolute() {
453                path_str.insert_str(0, "file://");
454            }
455            URIString::parse(path_str)
456        }
457        #[cfg(target_family = "windows")]
458        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
459            use std::path::{Component::*, Prefix::*};
460
461            let mut path_str = String::new();
462            let mut verbatim = false;
463            for comp in path.components() {
464                match comp {
465                    Prefix(prefix) => match prefix.kind() {
466                        Verbatim(root) => {
467                            path_str.push_str("file:///");
468                            path_str.push_str(
469                                &root
470                                    .to_str()
471                                    .ok_or(ParseRIError::Unsupported)?
472                                    .replace('/', "%2F"),
473                            );
474                            verbatim = true;
475                        }
476                        VerbatimUNC(server, root) => {
477                            path_str.push_str("file://");
478                            path_str.push_str(
479                                &server
480                                    .to_str()
481                                    .ok_or(ParseRIError::Unsupported)?
482                                    .replace('/', "%2F"),
483                            );
484                            path_str.push('/');
485                            path_str.push_str(
486                                &root
487                                    .to_str()
488                                    .ok_or(ParseRIError::Unsupported)?
489                                    .replace('/', "%2F"),
490                            );
491                            verbatim = true;
492                        }
493                        VerbatimDisk(letter) => {
494                            path_str.push_str("file:");
495                            path_str.push(letter as char);
496                            path_str.push(':');
497                            verbatim = true;
498                        }
499                        DeviceNS(device) => {
500                            path_str.push_str("file:///");
501                            path_str.push_str(device.to_str().ok_or(ParseRIError::Unsupported)?);
502                        }
503                        UNC(server, root) => {
504                            path_str.push_str("file://");
505                            path_str.push_str(server.to_str().ok_or(ParseRIError::Unsupported)?);
506                            path_str.push('/');
507                            path_str.push_str(root.to_str().ok_or(ParseRIError::Unsupported)?);
508                        }
509                        Disk(letter) => {
510                            path_str.push_str("file:");
511                            path_str.push(letter as char);
512                            path_str.push(':');
513                        }
514                    },
515                    RootDir => {}
516                    CurDir => path_str.push_str("/."),
517                    ParentDir => path_str.push_str("/.."),
518                    Normal(segment) => {
519                        path_str.push('/');
520                        let segment = segment.to_str().ok_or(ParseRIError::Unsupported)?;
521                        if verbatim {
522                            path_str.push_str(&segment.replace('/', "%2F"));
523                        } else {
524                            path_str.push_str(segment);
525                        }
526                    }
527                }
528            }
529            if (path.is_dir()
530                || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")
531                    || (!verbatim && path.as_os_str().as_encoded_bytes().ends_with(b"/"))))
532                && !path_str.ends_with('/')
533            {
534                path_str.push('/');
535            }
536            URIString::parse(path_str)
537        }
538        #[cfg(all(not(target_family = "unix"), not(target_family = "windows")))]
539        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
540            todo!()
541        }
542        _parse_file_path(path.as_ref())
543    }
544
545    pub fn into_boxed_uri_str(self) -> Box<URIStr> {
546        Box::from(self.as_ref())
547    }
548
549    /// # Reference
550    /// [6.2.2.  Syntax-Based Normalization](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2).
551    pub fn normalize(&mut self) {
552        use Component::*;
553
554        let mut uri = String::with_capacity(self.uri.len());
555        let mut paths = vec![];
556        let mut query = None;
557        let mut fragment = None;
558        let mut has_root = false;
559        for comp in self.components() {
560            match comp {
561                Scheme(scheme) => {
562                    uri.push_str(&scheme.to_ascii_lowercase());
563                    uri.push(':');
564                }
565                Authority {
566                    userinfo,
567                    host,
568                    port,
569                } => {
570                    uri.push_str("//");
571                    if let Some(userinfo) = userinfo {
572                        uri.push_str(userinfo);
573                        uri.push('@');
574                    }
575                    uri.push_str(host);
576                    if let Some(port) = port {
577                        uri.push(':');
578                        uri.push_str(port);
579                    }
580                }
581                RootSegment => has_root = true,
582                Segment(segment) => paths.push(segment),
583                Query(q) => query = Some(q),
584                Fragment(f) => fragment = Some(f),
585            }
586        }
587        build_normalized_path(paths.into_iter(), has_root, &mut uri);
588        if let Some(query) = query {
589            uri.push('?');
590            uri.push_str(query);
591        }
592        if let Some(fragment) = fragment {
593            uri.push('#');
594            uri.push_str(fragment);
595        }
596        self.uri = uri;
597    }
598}
599
600impl AsRef<URIStr> for URIString {
601    fn as_ref(&self) -> &URIStr {
602        URIStr::new(&self.uri)
603    }
604}
605
606impl Borrow<URIStr> for URIString {
607    fn borrow(&self) -> &URIStr {
608        self.as_ref()
609    }
610}
611
612impl Deref for URIString {
613    type Target = URIStr;
614
615    fn deref(&self) -> &Self::Target {
616        self.as_ref()
617    }
618}
619
620impl std::fmt::Display for URIString {
621    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
622        write!(f, "{}", self.as_ref())
623    }
624}
625
626macro_rules! impl_convertion_uri_string {
627    ($( $t:ty ),*) => {
628        $(
629            impl From<URIString> for $t {
630                fn from(value: URIString) -> $t {
631                    From::from(value.as_ref())
632                }
633            }
634        )*
635    };
636}
637impl_convertion_uri_string!(Box<URIStr>, Rc<URIStr>, Arc<URIStr>);
638
639fn build_normalized_path<'a>(
640    segments: impl Iterator<Item = &'a str>,
641    has_root: bool,
642    buffer: &mut String,
643) {
644    let segments = normalize_path_segments(segments, has_root);
645    if has_root {
646        buffer.push('/');
647    }
648    for (i, seg) in segments.into_iter().enumerate() {
649        if i > 0 {
650            buffer.push('/');
651        }
652        buffer.push_str(seg);
653    }
654}
655
/// Remove the dot segments (`.` and `..`) from a sequence of path segments.
///
/// - `.` is dropped.
/// - `..` removes the previous segment when there is one that is not itself
///   `..`. Otherwise it is kept for a rootless path and dropped for a rooted
///   one.
/// - When the last input segment is `.` or `..`, an empty segment is appended
///   so that the rebuilt path ends with `/`.
fn normalize_path_segments<'a>(
    segments: impl Iterator<Item = &'a str>,
    has_root: bool,
) -> Vec<&'a str> {
    let mut kept: Vec<&'a str> = Vec::new();
    let mut ends_with_dot = false;
    for seg in segments {
        ends_with_dot = matches!(seg, "." | "..");
        match seg {
            "." => {}
            ".." => {
                let can_pop = kept.last().is_some_and(|&prev| prev != "..");
                if can_pop {
                    kept.pop();
                } else if !has_root {
                    kept.push(seg);
                }
            }
            other => kept.push(other),
        }
    }

    if ends_with_dot {
        kept.push("");
    }

    kept
}
685
686/// # Reference
687/// [4.1.  URI Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.1)
688///
689/// ```text
690/// URI-reference = URI / relative-ref
691/// ```
692fn parse_uri_reference(b: &mut &[u8]) -> Result<(), ParseRIError> {
693    if b.is_empty() || matches!(b[0], b'/' | b'?' | b'#') {
694        // If `b` is an empty string or starts with either '/', '?' or '#',
695        // it is definitely 'relative-ref'.
696        parse_relative_ref(b)
697    } else {
698        // Otherwise, it is necessary to distinguish between `URI` and `relative-ref`
699        // starting with `relative-part` that matches `path-noscheme`.
700
701        if !b[0].is_ascii_alphabetic() {
702            // Since `scheme` begins with at least one `ALPHA`,
703            // if it does not, it is definitely `irelative-ref`.
704            parse_relative_ref(b)
705        } else {
706            // The characters that can be used in `scheme` are very limited,
707            // so it might be quicker to try parsing `scheme` to distinguish between them?
708            // [25] scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
709            if let Some(&c) = b
710                .iter()
711                .find(|&&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
712                && c == b':'
713            {
714                parse_uri(b)
715            } else {
716                parse_relative_ref(b)
717            }
718        }
719    }
720}
721
722/// # Reference
723/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
724///
725/// ```text
726/// URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
727/// ```
728fn parse_uri(b: &mut &[u8]) -> Result<(), ParseRIError> {
729    parse_scheme(b)?;
730    *b = b
731        .strip_prefix(b":")
732        .ok_or(ParseRIError::InvalidSchemeSeparator)?;
733    parse_hier_part(b)?;
734    if let Some(query) = b.strip_prefix(b"?") {
735        *b = query;
736        parse_query(b)?;
737    }
738    if let Some(fragment) = b.strip_prefix(b"#") {
739        *b = fragment;
740        parse_fragment(b)?;
741    }
742    Ok(())
743}
744
745/// # Reference
746/// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
747///
748/// ```text
749/// scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
750/// ```
751fn parse_scheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
752    if b.is_empty() || !b[0].is_ascii_alphabetic() {
753        return Err(ParseRIError::InvalidScheme);
754    }
755    let pos = b
756        .iter()
757        .position(|&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
758        .unwrap_or(b.len());
759    *b = &b[pos..];
760    Ok(())
761}
762
763/// # Reference
764/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
765///
766/// ```text
767/// hier-part   = "//" authority path-abempty
768///             / path-absolute
769///             / path-rootless
770///             / path-empty
771/// ```
772fn parse_hier_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
773    if let Some(rem) = b.strip_prefix(b"/") {
774        // If `b` starts with '/', `b` starts with 'authority' or `path-absolute`,
775
776        if let Some(rem) = rem.strip_prefix(b"/") {
777            // If `b` starts with '//', it should be followed by 'authority'.
778            // This is because 'path-absolute' is followed by exactly one '/' at the beginning
779            // and optionally 'segment-nz', so there cannot be two consecutive '/' characters.
780            *b = rem;
781            parse_authority(b)?;
782            parse_path_abempty(b)
783        } else {
784            // path-absolute = "/" [ segment-nz *( "/" segment ) ]
785            // segment-nz    = 1*pchar
786            parse_path_absolute(b)
787        }
788    } else {
789        // otherwise, `b` starts with 'path-rootless' or 'path-empty'
790        let mut dum = *b;
791        if parse_pchar(&mut dum).is_ok() {
792            // If 'path-rootless' follows, one or more 'pchar' should follow.
793            parse_path_rootless(b)
794        } else {
795            // If not, it is 'path-empty'.
796            // Since 'path-empty' is an empty string,
797            // we can simply return `Ok` without doing anything.
798            Ok(())
799        }
800    }
801}
802
803/// # Reference
804/// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
805///
806/// ```text
807/// authority   = [ userinfo "@" ] host [ ":" port ]
808/// ```
809fn parse_authority(b: &mut &[u8]) -> Result<(), ParseRIError> {
810    if b.starts_with(b"[") {
811        // If `b` starts with '[', it is definitely an `host` that matches `IP-literal`.
812        parse_ip_literal(b)?;
813        if let Some(rem) = b.strip_prefix(b":") {
814            *b = rem;
815            parse_port(b)?;
816        }
817        return Ok(());
818    }
819
820    // If not, it may start with `userinfo`, or it may start with `host`
821    // that matches `IPv4address` or `reg-name`.
822    //
823    // If it is either `IPv4address` or `reg-name`, there is no need to consider `IPv4address`.
824    // This is because `reg-name` includes `IPv4address`. More specifically, since `unreserved`
825    // contains `DIGIT` and `.`, `IPv4address` can be regarded as a specific sequence of `unreserved`.
826    //
827    // `userinfo` and `reg-name` are rules that share characters other than colons.
828    // Therefore, they can be distinguished using the following algorithm.
829    //
830    // 1. Increment the counter as long as it matches `userinfo`.
831    // 2. If the first ":" is encountered, note its position.
832    // 3. Determine the matching rule according to the characters that did not match `userinfo`.
833    //      i.   If it is "@", the string seen so far is `userinfo`.
834    //      ii.  If it is "[" , then an `host` matching "IP-literal" should start there,
835    //           but since there is no "@" immediately before it, it is an error.
836    //      iii. In other cases, if the position of ":" is noted, the string before it is `host`;
837    //                           if not, all strings seen so far are `host`.
838    //
839    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
840    //
841    // reg-name    = *( unreserved / pct-encoded / sub-delims )
842    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
843    //
844    // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
845    let mut colon = usize::MAX;
846    let mut now = 0;
847    let mut t = *b;
848    while !t.is_empty() {
849        let pos = t
850            .iter()
851            .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b'%')
852            .unwrap_or(t.len());
853        t = &t[pos..];
854        now += pos;
855        if let Some(rem) = t.strip_prefix(b":") {
856            now += 1;
857            t = rem;
858            colon = colon.min(now);
859        } else {
860            break;
861        }
862    }
863
864    debug_assert_eq!(now, b.len() - t.len());
865
866    if let Some(rem) = t.strip_prefix(b"@") {
867        *b = rem;
868        parse_host(b)?;
869        if let Some(rem) = b.strip_prefix(b":") {
870            *b = rem;
871            parse_port(b)?;
872        }
873        Ok(())
874    } else if t.starts_with(b"[") {
875        Err(ParseRIError::InvalidAuthority)
876    } else if colon < usize::MAX {
877        *b = &b[colon + 1..];
878        parse_port(b)
879    } else {
880        *b = t;
881        Ok(())
882    }
883}
884
// `parse_userinfo` is intentionally left unimplemented: `parse_authority` recognizes `userinfo` inline.
886// /// # Reference
887// /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
888// ///
889// /// ```text
890// /// userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
891// /// ```
892// fn parse_userinfo(b: &mut &[u8]) -> Result<(), ParseRIError> {
893//     todo!()
894// }
895
896/// # Reference
897/// [3.2.2.  Host]
898///
899/// ```text
900/// host        = IP-literal / IPv4address / reg-name
901/// ```
902fn parse_host(b: &mut &[u8]) -> Result<(), ParseRIError> {
903    if b.starts_with(b"[") {
904        parse_ip_literal(b)
905    } else {
906        // Since `IPv4address` is covered by `reg-name`, it does not need to be considered.
907        parse_reg_name(b)
908    }
909}
910
911/// # Reference
912/// [3.2.2.  Host]
913///
914/// ```text
915/// IP-literal  = "[" ( IPv6address / IPvFuture  ) "]"
916/// ```
917fn parse_ip_literal(b: &mut &[u8]) -> Result<(), ParseRIError> {
918    *b = b.strip_prefix(b"[").ok_or(ParseRIError::InvalidIPLiteral)?;
919    if !b.is_empty() && b[0].eq_ignore_ascii_case(&b'v') {
920        parse_ipv_future(b)?;
921    } else {
922        parse_ipv6_address(b)?;
923    }
924    *b = b.strip_prefix(b"]").ok_or(ParseRIError::InvalidIPLiteral)?;
925    Ok(())
926}
927
928/// # Reference
929/// [3.2.2.  Host]
930///
931/// ```text
932/// IPvFuture   = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
933/// ```
934fn parse_ipv_future(b: &mut &[u8]) -> Result<(), ParseRIError> {
935    if b.is_empty() || !b[0].eq_ignore_ascii_case(&b'v') {
936        return Err(ParseRIError::InvalidIPvFuture);
937    }
938    *b = &b[1..];
939    let pos = b
940        .iter()
941        .position(|&b| !b.is_ascii_hexdigit())
942        .unwrap_or(b.len());
943    if !(1..=b.len() - 2).contains(&pos) {
944        return Err(ParseRIError::InvalidIPvFuture);
945    }
946    *b = &b[pos..];
947    *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidIPvFuture)?;
948    let pos = b
949        .iter()
950        .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b':')
951        .unwrap_or(b.len());
952    if pos == 0 {
953        return Err(ParseRIError::InvalidIPvFuture);
954    }
955    *b = &b[pos..];
956    Ok(())
957}
958
959/// # Reference
960/// [3.2.2.  Host]
961///
962/// ```text
963/// IPv6address =                            6( h16 ":" ) ls32
964///             /                       "::" 5( h16 ":" ) ls32
965///             / [               h16 ] "::" 4( h16 ":" ) ls32
966///             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
967///             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
968///             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
969///             / [ *4( h16 ":" ) h16 ] "::"              ls32
970///             / [ *5( h16 ":" ) h16 ] "::"              h16
971///             / [ *6( h16 ":" ) h16 ] "::"
972///  ls32       = ( h16 ":" h16 ) / IPv4address
973///             ; least-significant 32 bits of address
974///  h16        = 1*4HEXDIG
975///             ; 16 bits of address represented in hexadecimal
976/// ```
977fn parse_ipv6_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
978    let mut cnt = 1;
979    let mut omit = false;
980    if let Some(rem) = b.strip_prefix(b":") {
981        *b = rem;
982        omit = true;
983    } else {
984        parse_h16(b)?;
985    }
986
987    while cnt + (omit as i32) < 8
988        && let Some(rem) = b.strip_prefix(b":")
989    {
990        *b = rem;
991        if b.starts_with(b":") {
992            if omit {
993                return Err(ParseRIError::InvalidIPv6address);
994            }
995            omit = true;
996            cnt += 1;
997            continue;
998        }
999
1000        // It's not a smart approach, but it'll probably work...
1001        //
1002        // Checking `h16` first will not work because it cannot be distinguished
1003        // from the first octet of the IPv4 address.
1004        //
1005        // Checking the positions where ':' and '.' appear also seems unlikely to work,
1006        // considering cases where such characters appear in the segments of the following paths.
1007        let mut dum = *b;
1008        if parse_ipv4_address(&mut dum).is_ok() {
1009            *b = dum;
1010            // An IPv4 address consumes two hextets.
1011            cnt += 2;
1012            // An IPv4 address only appears at the end.
1013            break;
1014        } else if !b.is_empty() && b[0].is_ascii_hexdigit() {
1015            parse_h16(b)?;
1016        }
1017    }
1018
1019    // If "::" is included, some hextets may be omitted, resulting in fewer than eight.
1020    // Otherwise, exactly eight hextets are required.
1021    if (omit && cnt <= 8) || (!omit && cnt == 8) {
1022        Ok(())
1023    } else {
1024        Err(ParseRIError::InvalidIPv6address)
1025    }
1026}
1027
1028/// # Reference
1029/// [3.2.2.  Host]
1030///
1031/// ```text
1032///  h16        = 1*4HEXDIG
1033///             ; 16 bits of address represented in hexadecimal
1034/// ```
1035fn parse_h16(b: &mut &[u8]) -> Result<(), ParseRIError> {
1036    let pos = b
1037        .iter()
1038        .position(|&b| !b.is_ascii_hexdigit())
1039        .unwrap_or(b.len());
1040    if pos == 0 {
1041        Err(ParseRIError::InvalidH16)
1042    } else {
1043        *b = &b[pos.min(4)..];
1044        Ok(())
1045    }
1046}
1047
1048/// # Reference
1049/// [3.2.2.  Host]
1050///
1051/// ```text
1052/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
1053/// dec-octet   = DIGIT                 ; 0-9
1054///             / %x31-39 DIGIT         ; 10-99
1055///             / "1" 2DIGIT            ; 100-199
1056///             / "2" %x30-34 DIGIT     ; 200-249
1057///             / "25" %x30-35          ; 250-255
1058/// ```
1059fn parse_ipv4_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
1060    parse_dec_octet(b)?;
1061    for _ in 0..3 {
1062        *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidDecOctet)?;
1063        parse_dec_octet(b)?;
1064    }
1065    Ok(())
1066}
1067fn parse_dec_octet(b: &mut &[u8]) -> Result<(), ParseRIError> {
1068    let len = match b {
1069        [b'2', b'5', b'0'..=b'5', ..] => 3,
1070        [b'2', b'0'..=b'4', b'0'..=b'9', ..] => 3,
1071        [b'1', b'0'..=b'9', b'0'..=b'9', ..] => 3,
1072        [b'1'..=b'9', b'0'..=b'9', ..] => 2,
1073        [b'0'..=b'9', ..] => 1,
1074        _ => return Err(ParseRIError::InvalidDecOctet),
1075    };
1076    *b = &b[len..];
1077    Ok(())
1078}
1079
1080/// # Reference
1081/// [3.2.2.  Host]
1082///
1083/// ```text
1084/// reg-name    = *( unreserved / pct-encoded / sub-delims )
1085/// ```
1086fn parse_reg_name(b: &mut &[u8]) -> Result<(), ParseRIError> {
1087    // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1088    // reg-name      = pchar - (":" | "@")
1089    while !b.is_empty() && !matches!(b[0], b':' | b'@') && parse_pchar(b).is_ok() {}
1090    Ok(())
1091}
1092
1093/// # Reference
1094/// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
1095///
1096/// ```text
1097/// port        = *DIGIT
1098/// ```
1099fn parse_port(b: &mut &[u8]) -> Result<(), ParseRIError> {
1100    let pos = b
1101        .iter()
1102        .position(|&b| !b.is_ascii_digit())
1103        .unwrap_or(b.len());
1104    *b = &b[pos..];
1105    Ok(())
1106}
1107
1108/// # Reference
1109/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1110///
1111/// ```text
1112/// path-abempty  = *( "/" segment )
1113/// ```
1114fn parse_path_abempty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1115    while let Some(rem) = b.strip_prefix(b"/") {
1116        *b = rem;
1117        parse_segment(b)?;
1118    }
1119    Ok(())
1120}
1121
1122/// # Reference
1123/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1124///
1125/// ```text
1126/// path-absolute = "/" [ segment-nz *( "/" segment ) ]
1127/// ```
1128fn parse_path_absolute(b: &mut &[u8]) -> Result<(), ParseRIError> {
1129    *b = b
1130        .strip_prefix(b"/")
1131        .ok_or(ParseRIError::InvalidPathAbsolute)?;
1132    if parse_segment_nz(b).is_ok() {
1133        while let Some(rem) = b.strip_prefix(b"/") {
1134            *b = rem;
1135            parse_segment(b)?;
1136        }
1137    }
1138    Ok(())
1139}
1140
1141/// # Reference
1142/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1143///
1144/// ```text
1145/// path-noscheme = segment-nz-nc *( "/" segment )
1146/// ```
1147fn parse_path_noscheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
1148    parse_segment_nz_nc(b)?;
1149    while let Some(rem) = b.strip_prefix(b"/") {
1150        *b = rem;
1151        parse_segment(b)?;
1152    }
1153    Ok(())
1154}
1155
1156/// # Reference
1157/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1158///
1159/// ```text
1160/// path-rootless = segment-nz *( "/" segment )
1161/// ```
1162fn parse_path_rootless(b: &mut &[u8]) -> Result<(), ParseRIError> {
1163    parse_segment_nz(b)?;
1164    while let Some(rem) = b.strip_prefix(b"/") {
1165        *b = rem;
1166        parse_segment(b)?;
1167    }
1168    Ok(())
1169}
1170
// `parse_path_empty` is intentionally left unimplemented: `path-empty` matches the empty string, so there is nothing to consume.
1172// /// # Reference
1173// /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1174// ///
1175// /// ```text
1176// /// path-empty    = 0<pchar>
1177// /// ```
1178// fn parse_path_empty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1179//     todo!()
1180// }
1181
1182/// # Reference
1183/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1184///
1185/// ```text
1186/// segment       = *pchar
1187/// ```
1188fn parse_segment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1189    while parse_pchar(b).is_ok() {}
1190    Ok(())
1191}
1192
1193/// # Reference
1194/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1195///
1196/// ```text
1197/// segment-nz    = 1*pchar
1198/// ```
1199fn parse_segment_nz(b: &mut &[u8]) -> Result<(), ParseRIError> {
1200    parse_pchar(b)?;
1201    while parse_pchar(b).is_ok() {}
1202    Ok(())
1203}
1204
1205/// # Reference
1206/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1207///
1208/// ```text
1209/// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
1210///                     ; non-zero-length segment without any colon ":"
1211/// ```
1212fn parse_segment_nz_nc(b: &mut &[u8]) -> Result<(), ParseRIError> {
1213    if b.is_empty() || b[0] == b':' || parse_pchar(b).is_err() {
1214        return Err(ParseRIError::InvalidSegmentNzNc);
1215    }
1216    while !b.is_empty() && b[0] != b':' && parse_pchar(b).is_ok() {}
1217    Ok(())
1218}
1219
1220/// # Reference
1221/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1222///
1223/// ```text
1224/// pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1225/// ```
1226fn parse_pchar(b: &mut &[u8]) -> Result<(), ParseRIError> {
1227    if b.is_empty() {
1228        return Err(ParseRIError::InvalidPChar);
1229    }
1230
1231    if is_unreserved(b[0]) || is_sub_delims(b[0]) || matches!(b[0], b':' | b'@') {
1232        *b = &b[1..];
1233        Ok(())
1234    } else if b.len() >= 3 && b[0] == b'%' && b[1].is_ascii_hexdigit() && b[2].is_ascii_hexdigit() {
1235        *b = &b[3..];
1236        Ok(())
1237    } else {
1238        Err(ParseRIError::InvalidPChar)
1239    }
1240}
1241
1242/// # Reference
1243/// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
1244///
1245/// ```text
1246/// query       = *( pchar / "/" / "?" )
1247/// ```
1248fn parse_query(b: &mut &[u8]) -> Result<(), ParseRIError> {
1249    loop {
1250        if let Some(rem) = b.strip_prefix(b"/") {
1251            *b = rem;
1252        } else if let Some(rem) = b.strip_prefix(b"?") {
1253            *b = rem;
1254        } else if parse_pchar(b).is_ok() {
1255            // no op
1256        } else {
1257            break Ok(());
1258        }
1259    }
1260}
1261
1262/// # Reference
1263/// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
1264///
1265/// ```text
1266/// fragment    = *( pchar / "/" / "?" )
1267/// ```
1268fn parse_fragment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1269    loop {
1270        if let Some(rem) = b.strip_prefix(b"/") {
1271            *b = rem;
1272        } else if let Some(rem) = b.strip_prefix(b"?") {
1273            *b = rem;
1274        } else if parse_pchar(b).is_ok() {
1275            // no op
1276        } else {
1277            break Ok(());
1278        }
1279    }
1280}
1281
1282/// # Reference
1283/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1284///
1285/// ```text
1286/// relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
1287/// ```
1288fn parse_relative_ref(b: &mut &[u8]) -> Result<(), ParseRIError> {
1289    parse_relative_part(b)?;
1290    if let Some(query) = b.strip_prefix(b"?") {
1291        *b = query;
1292        parse_query(b)?;
1293    }
1294    if let Some(fragment) = b.strip_prefix(b"#") {
1295        *b = fragment;
1296        parse_fragment(b)?;
1297    }
1298    Ok(())
1299}
1300
1301/// # Reference
1302/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1303///
1304/// ```text
1305/// relative-part = "//" authority path-abempty
1306///               / path-absolute
1307///               / path-noscheme
1308///               / path-empty
1309/// ```
1310fn parse_relative_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
1311    if let Some(rem) = b.strip_prefix(b"/") {
1312        if let Some(rem) = rem.strip_prefix(b"/") {
1313            *b = rem;
1314            parse_authority(b)?;
1315            parse_path_abempty(b)
1316        } else {
1317            parse_path_absolute(b)
1318        }
1319    } else {
1320        let orig = b.len();
1321        let ret = parse_path_noscheme(b);
1322        // If no characters have been consumed, it matches `path-empty` and returns `Ok`.
1323        if orig == b.len() { Ok(()) } else { ret }
1324    }
1325}
1326
1327/// # Reference
1328/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
1329///
1330/// ```text
1331/// reserved    = gen-delims / sub-delims
1332/// ```
1333fn is_reserved(b: u8) -> bool {
1334    is_gen_delims(b) || is_sub_delims(b)
1335}
1336
/// Returns `true` when `b` is one of the `gen-delims` characters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// ```
fn is_gen_delims(b: u8) -> bool {
    const GEN_DELIMS: &[u8] = b":/?#[]@";
    GEN_DELIMS.contains(&b)
}
1346
/// Returns `true` when `b` is one of the `sub-delims` characters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
/// ```
fn is_sub_delims(b: u8) -> bool {
    const SUB_DELIMS: &[u8] = b"!$&'()*+,;=";
    SUB_DELIMS.contains(&b)
}
1359
/// Returns `true` when `b` is an `unreserved` character.
///
/// # Reference
/// [2.3.  Unreserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
///
/// ```text
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
fn is_unreserved(b: u8) -> bool {
    match b {
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' => true,
        b'-' | b'.' | b'_' | b'~' => true,
        _ => false,
    }
}
1369
/// Percent-encoding lookup table built at compile time.
///
/// The three bytes starting at index `3 * n` are `'%'` followed by the two
/// upper-case hex digits of the byte value `n`.
const LUT_BYTES: [u8; 256 * 3] = {
    const HEX_UPPER: [u8; 16] = *b"0123456789ABCDEF";

    // Every third byte is '%'; the two bytes after it are filled in below.
    let mut table = [b'%'; 256 * 3];
    let mut value = 0usize;
    while value < 256 {
        table[3 * value + 1] = HEX_UPPER[value >> 4];
        table[3 * value + 2] = HEX_UPPER[value & 0xF];
        value += 1;
    }
    table
};
1386const LUT: &str = unsafe {
1387    // # Safety
1388    // `LUT_BYTES` contains only '%' and ASCII hex digit characters.
1389    // Therefore, UTF-8 validation won't fail.
1390    from_utf8_unchecked(&LUT_BYTES)
1391};
1392
1393pub fn escape(s: &str) -> Cow<'_, str> {
1394    escape_except(s, |_| false)
1395}
1396
1397pub fn escape_bytes(b: &[u8]) -> Cow<'_, [u8]> {
1398    escape_bytes_except(b, |_| false)
1399}
1400
1401pub fn escape_except(s: &str, is_except: impl Fn(char) -> bool) -> Cow<'_, str> {
1402    let cap = s
1403        .chars()
1404        .filter_map(|c| (!is_except(c)).then_some(c.len_utf8() * 2))
1405        .sum::<usize>();
1406    if cap == 0 {
1407        return Cow::Borrowed(s);
1408    }
1409    let mut encode = [0; 6];
1410    let mut buf = String::with_capacity(s.len() + cap);
1411    for c in s.chars() {
1412        if is_except(c) {
1413            buf.push(c);
1414        } else {
1415            let encoded = c.encode_utf8(&mut encode);
1416            for b in encoded.bytes() {
1417                let index = b as usize * 3;
1418                buf.push_str(&LUT[index..index + 3]);
1419            }
1420        }
1421    }
1422    Cow::Owned(buf)
1423}
1424
1425pub fn escape_bytes_except(b: &[u8], is_except: impl Fn(u8) -> bool) -> Cow<'_, [u8]> {
1426    let cap = b.iter().copied().filter(|&b| !is_except(b)).count() * 2;
1427    if cap == 0 {
1428        return Cow::Borrowed(b);
1429    }
1430    let mut buf = Vec::with_capacity(b.len() + cap);
1431    for &b in b {
1432        if is_except(b) {
1433            buf.push(b);
1434        } else {
1435            let index = b as usize * 3;
1436            buf.extend_from_slice(&LUT_BYTES[index..index + 3]);
1437        }
1438    }
1439    Cow::Owned(buf)
1440}
1441
/// Error returned by [`unescape`] and [`unescape_bytes`].
pub enum URIUnescapeError {
    /// A `'%'` is not followed by two hexadecimal digits.
    InvalidEscape,
    /// The decoded bytes are not valid UTF-8 (only produced by [`unescape`]).
    Utf8Error(std::str::Utf8Error),
}

impl From<std::str::Utf8Error> for URIUnescapeError {
    fn from(value: std::str::Utf8Error) -> Self {
        Self::Utf8Error(value)
    }
}

/// Decodes the two hex digits at the start of `chunk` (the text following a `'%'`).
///
/// Returns `None` when `chunk` is shorter than two bytes or when either of the first
/// two bytes is not an ASCII hex digit.
fn decode_pct_octet(chunk: &[u8]) -> Option<u8> {
    match chunk {
        [hi, lo, ..] => {
            // `to_digit(16)` only accepts ASCII hex digits, so signs such as '+' and
            // non-ASCII bytes are rejected.
            let hi = char::from(*hi).to_digit(16)?;
            let lo = char::from(*lo).to_digit(16)?;
            // Both digits are below 16, so the combined value always fits in a byte.
            Some(((hi << 4) | lo) as u8)
        }
        _ => None,
    }
}

/// Decodes every percent-encoded triplet in `s`.
///
/// Consecutive triplets are decoded as one byte sequence, so a multi-byte UTF-8
/// character may be spread over several triplets. Returns `Cow::Borrowed(s)` when
/// `s` contains no `'%'`.
///
/// # Errors
/// - [`URIUnescapeError::InvalidEscape`] when a `'%'` is not followed by two hex digits.
/// - [`URIUnescapeError::Utf8Error`] when a run of decoded bytes is not valid UTF-8.
pub fn unescape(s: &str) -> Result<Cow<'_, str>, URIUnescapeError> {
    if !s.contains('%') {
        return Ok(Cow::Borrowed(s));
    }

    let mut chunks = s.split('%');
    let mut buf = String::with_capacity(s.len());
    // `split` always yields at least one item: the text in front of the first '%'.
    buf.push_str(chunks.next().unwrap_or_default());

    // Decoded bytes of the current run of adjacent triplets.
    let mut pending: Vec<u8> = Vec::new();
    for chunk in chunks {
        let octet = decode_pct_octet(chunk.as_bytes()).ok_or(URIUnescapeError::InvalidEscape)?;
        pending.push(octet);

        // The two digits just decoded are ASCII, so index 2 is a character boundary.
        let literal = &chunk[2..];
        if !literal.is_empty() {
            buf.push_str(from_utf8(&pending)?);
            buf.push_str(literal);
            pending.clear();
        }
    }

    if !pending.is_empty() {
        buf.push_str(from_utf8(&pending)?);
    }
    Ok(Cow::Owned(buf))
}

/// Decodes every percent-encoded triplet in `b`; all other bytes are copied through
/// unchanged.
///
/// Returns `Cow::Borrowed(b)` when `b` contains no `'%'`.
///
/// # Errors
/// Returns [`URIUnescapeError::InvalidEscape`] when a `'%'` is not followed by two
/// hex digits.
pub fn unescape_bytes(b: &[u8]) -> Result<Cow<'_, [u8]>, URIUnescapeError> {
    if !b.contains(&b'%') {
        return Ok(Cow::Borrowed(b));
    }

    let mut chunks = b.split(|&byte| byte == b'%');
    let mut buf = Vec::with_capacity(b.len());
    // `split` always yields at least one item: the bytes in front of the first '%'.
    buf.extend_from_slice(chunks.next().unwrap_or_default());

    for chunk in chunks {
        let octet = decode_pct_octet(chunk).ok_or(URIUnescapeError::InvalidEscape)?;
        buf.push(octet);
        // Keep the literal bytes that follow the triplet.
        buf.extend_from_slice(&chunk[2..]);
    }
    Ok(Cow::Owned(buf))
}
1511
/// Position of a [`Components`] iterator within the URI, i.e. which component it
/// tries to extract next.
#[derive(Debug, Clone, Copy)]
enum DecomposeState {
    /// Look for a scheme terminated by `':'`.
    Scheme,
    /// Look for an authority introduced by `"//"`.
    Authority,
    /// Look for the leading `'/'` of the path.
    Root,
    /// Emit path segments one at a time.
    Path,
    /// Look for a query introduced by `'?'`.
    Query,
    /// Look for a fragment introduced by `'#'`.
    Fragment,
    /// Everything has been emitted; the iterator is exhausted.
    Finish,
}
1522
/// Iterator that splits a URI string into its [`Component`]s, front to back.
pub struct Components<'a> {
    /// Which component is looked for next.
    state: DecomposeState,
    /// The part of the URI that has not been consumed yet.
    uri: &'a str,
}
1527
1528impl Components<'_> {
1529    fn new(uri: &str) -> Components<'_> {
1530        Components {
1531            state: DecomposeState::Scheme,
1532            uri,
1533        }
1534    }
1535}
1536
1537impl<'a> Iterator for Components<'a> {
1538    type Item = Component<'a>;
1539
1540    fn next(&mut self) -> Option<Self::Item> {
1541        use DecomposeState::*;
1542        loop {
1543            match self.state {
1544                Scheme => {
1545                    self.state = Authority;
1546                    let mut bytes = self.uri.as_bytes();
1547                    if parse_scheme(&mut bytes).is_ok() && bytes.starts_with(b":") {
1548                        let len = self.uri.len() - bytes.len();
1549                        let (scheme, rem) = self.uri.split_at(len);
1550                        self.uri = &rem[1..];
1551                        break Some(Component::Scheme(scheme));
1552                    }
1553                }
1554                Authority => {
1555                    self.state = Root;
1556                    if let Some(rem) = self.uri.strip_prefix("//") {
1557                        let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
1558                        let (mut authority, rem) = rem.split_at(pos);
1559                        self.uri = rem;
1560                        let mut userinfo = None;
1561                        if let Some((ui, rem)) = authority.split_once('@') {
1562                            userinfo = Some(ui);
1563                            authority = rem;
1564                        }
1565                        let mut port = None;
1566                        if let Some((host, p)) = authority.rsplit_once(':')
1567                            && p.bytes().all(|b| b.is_ascii_digit())
1568                        {
1569                            port = Some(p);
1570                            authority = host;
1571                        }
1572                        break Some(Component::Authority {
1573                            userinfo,
1574                            host: authority,
1575                            port,
1576                        });
1577                    }
1578                }
1579                Root => {
1580                    self.state = Path;
1581                    if let Some(rem) = self.uri.strip_prefix('/') {
1582                        self.uri = rem;
1583                        break Some(Component::RootSegment);
1584                    }
1585                }
1586                Path => {
1587                    let pos = self
1588                        .uri
1589                        .bytes()
1590                        .position(|b| b == b'/' || b == b'?' || b == b'#')
1591                        .unwrap_or(self.uri.len());
1592                    let (segment, rem) = self.uri.split_at(pos);
1593                    if let Some(rem) = rem.strip_prefix('/') {
1594                        self.uri = rem;
1595                    } else {
1596                        self.uri = rem;
1597                        self.state = Query;
1598                    }
1599                    break Some(Component::Segment(segment));
1600                }
1601                Query => {
1602                    self.state = Fragment;
1603                    if let Some(rem) = self.uri.strip_prefix('?') {
1604                        let pos = rem.bytes().position(|b| b == b'#').unwrap_or(rem.len());
1605                        let (query, rem) = rem.split_at(pos);
1606                        self.uri = rem;
1607                        break Some(Component::Query(query));
1608                    }
1609                }
1610                Fragment => {
1611                    debug_assert!(self.uri.is_empty() || self.uri.starts_with('#'));
1612                    self.state = Finish;
1613                    if !self.uri.is_empty() {
1614                        let (_, frag) = self.uri.split_at(1);
1615                        self.uri = "";
1616                        break Some(Component::Fragment(frag));
1617                    }
1618                }
1619                Finish => break None,
1620            }
1621        }
1622    }
1623}
1624
/// One syntactic component of a URI, as produced by [`Components`].
///
/// All string slices borrow from the URI being iterated and exclude the delimiter
/// that introduces or terminates the component.
pub enum Component<'a> {
    /// The scheme, without the trailing `':'`.
    Scheme(&'a str),
    /// The authority, without the leading `"//"`.
    Authority {
        /// The text in front of the first `'@'`, if there is one.
        userinfo: Option<&'a str>,
        /// The host part that remains after `userinfo` and `port` are split off.
        host: &'a str,
        /// The all-digit (possibly empty) text behind the last `':'`, if there is one.
        port: Option<&'a str>,
    },
    /// The leading `'/'` of the path.
    RootSegment,
    /// One path segment (possibly empty), without any `'/'`.
    Segment(&'a str),
    /// The query, without the leading `'?'`.
    Query(&'a str),
    /// The fragment, without the leading `'#'`.
    Fragment(&'a str),
}
anyxml_uri/uri.rs

anyxml_uri/
uri.rs