anyxml_uri/
uri.rs

1use std::{
2    borrow::{Borrow, Cow},
3    ops::Deref,
4    path::Path,
5    rc::Rc,
6    str::{from_utf8, from_utf8_unchecked},
7    sync::Arc,
8};
9
10use crate::ParseRIError;
11
/// A borrowed, unsized URI reference.
///
/// This is a `#[repr(transparent)]` wrapper around `str`; [`URIString`] is the owned
/// counterpart (see the `ToOwned` implementation).
#[derive(Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIStr {
    /// The URI text, exposed unchanged through `as_escaped_str`.
    uri: str,
}
17
18impl URIStr {
19    fn new(s: &str) -> &Self {
20        unsafe {
21            // # Safety
22            // Since `URIStr` is a transparent newtype of `str`,
23            // the bit patterns are exactly the same and have the same features.
24            &*(s as *const str as *const Self)
25        }
26    }
27
28    /// Resolve the relative reference `reference` using `self` as the base URI.
29    ///
30    /// `self` must be convertible to an [absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
31    /// through [fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5) removal
32    /// and normalization.
33    ///
34    /// # Reference
35    /// - [5.1.  Establishing a Base URI](https://datatracker.ietf.org/doc/html/rfc3986#section-5.1)
36    /// - [5.2.  Relative Resolution](https://datatracker.ietf.org/doc/html/rfc3986#section-5.2)
37    pub fn resolve(&self, reference: &Self) -> URIString {
38        use Component::*;
39
40        let base = if self.is_absolute() {
41            Cow::Borrowed(self)
42        } else {
43            let mut base = self.to_owned();
44            base.normalize();
45            if let Some(frag) = base.uri.bytes().position(|b| b == b'#') {
46                base.uri.truncate(frag);
47            }
48            assert!(
49                base.is_absolute(),
50                "'{}' is not absolute",
51                base.as_escaped_str()
52            );
53            Cow::Owned(base)
54        };
55
56        let mut ref_components = reference.components().peekable();
57        if ref_components
58            .next_if(|comp| matches!(comp, Scheme(_)))
59            .is_some()
60        {
61            let mut ret = reference.to_owned();
62            ret.normalize();
63            return ret;
64        }
65
66        if ref_components
67            .next_if(|comp| matches!(comp, Authority { .. }))
68            .is_some()
69        {
70            // has authority
71            let mut ret = URIString {
72                uri: [base.scheme().unwrap(), ":", &reference.uri].concat(),
73            };
74            ret.normalize();
75            return ret;
76        }
77
78        let mut components = base.components().peekable();
79        let mut uri = String::new();
80        if let Some(Scheme(scheme)) = components.next_if(|comp| matches!(comp, Scheme(_))) {
81            uri.push_str(scheme);
82            uri.push(':');
83        }
84        if let Some(Authority {
85            userinfo,
86            host,
87            port,
88        }) = components.next_if(|comp| matches!(comp, Authority { .. }))
89        {
90            uri.push_str("//");
91            if let Some(userinfo) = userinfo {
92                uri.push_str(userinfo);
93                uri.push(':');
94            }
95            uri.push_str(host);
96            if let Some(port) = port {
97                uri.push(':');
98                uri.push_str(port);
99            }
100        }
101
102        if ref_components
103            .next_if(|comp| matches!(comp, RootSegment))
104            .is_some()
105        {
106            uri.push_str(&reference.uri);
107            let mut ret = URIString { uri };
108            ret.normalize();
109            return ret;
110        }
111
112        let mut segments = vec![];
113        let has_root = components
114            .next_if(|comp| matches!(comp, RootSegment))
115            .is_some();
116        let mut has_dot_segment = false;
117        while let Some(Segment(segment)) = components.next_if(|comp| matches!(comp, Segment(_))) {
118            segments.push(segment);
119            has_dot_segment |= segment == "." || segment == "..";
120        }
121        if has_dot_segment {
122            segments = normalize_path_segments(segments.into_iter(), has_root);
123        }
124
125        let mut has_path = false;
126        if let Some(Segment(segment)) = ref_components.next_if(|comp| matches!(comp, Segment(_))) {
127            let mut buf = vec![segment];
128            while let Some(Segment(segment)) =
129                ref_components.next_if(|comp| matches!(comp, Segment(_)))
130            {
131                buf.push(segment);
132            }
133            if buf.len() > 1 || !buf[0].is_empty() {
134                segments.pop();
135                segments.extend(buf);
136                has_path = true;
137            }
138        }
139        build_normalized_path(segments.into_iter(), has_root, &mut uri);
140
141        if let Some(Query(query)) = ref_components.next_if(|comp| matches!(comp, Query(_))) {
142            uri.push('?');
143            uri.push_str(query);
144        } else if !has_path
145            && let Some(Query(query)) = components.next_if(|comp| matches!(comp, Query(_)))
146        {
147            uri.push('?');
148            uri.push_str(query);
149        }
150
151        if let Some(Fragment(fragment)) = ref_components.next() {
152            uri.push('#');
153            uri.push_str(fragment);
154        }
155
156        URIString { uri }
157    }
158
159    /// Return the escaped URI string.
160    pub fn as_escaped_str(&self) -> &str {
161        &self.uri
162    }
163
164    /// Return the unescaped URI string.  \
165    /// If unescaping fails, return `None`.
166    pub fn as_unescaped_str(&self) -> Option<Cow<'_, str>> {
167        unescape(&self.uri).ok()
168    }
169
170    /// # Reference
171    /// [4.3.  Absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
172    pub fn is_absolute(&self) -> bool {
173        self.scheme().is_some() && self.fragment().is_none()
174    }
175
176    /// # Reference
177    /// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
178    pub fn is_relative(&self) -> bool {
179        self.scheme().is_none()
180    }
181
182    /// # Reference
183    /// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
184    pub fn scheme(&self) -> Option<&str> {
185        let pos = self.uri.bytes().position(is_reserved)?;
186        (self.uri.as_bytes()[pos] == b':').then_some(&self.uri[..pos])
187    }
188
189    /// # Reference
190    /// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
191    pub fn authority(&self) -> Option<&str> {
192        let rem = self
193            .uri
194            .strip_prefix("//")
195            .or_else(|| self.uri.split_once("://").map(|p| p.1))?;
196        Some(rem.split_once('/').map(|p| p.0).unwrap_or(rem))
197    }
198
199    /// # Reference
200    /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
201    pub fn userinfo(&self) -> Option<&str> {
202        Some(self.authority()?.split_once('@')?.0)
203    }
204
205    /// # Reference
206    /// [3.2.2.  Host](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2)
207    pub fn host(&self) -> Option<&str> {
208        let mut auth = self.authority()?;
209        if let Some((_userinfo, rem)) = auth.split_once('@') {
210            auth = rem;
211        }
212        if let Some((host, port)) = auth.rsplit_once(':')
213            && port.bytes().all(|b| b.is_ascii_digit())
214        {
215            auth = host;
216        }
217        Some(auth)
218    }
219
220    /// # Reference
221    /// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
222    pub fn port(&self) -> Option<&str> {
223        let (_, port) = self.authority()?.rsplit_once(':')?;
224        port.bytes().all(|b| b.is_ascii_digit()).then_some(port)
225    }
226
227    /// # Reference
228    /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
229    pub fn path(&self) -> &str {
230        let mut path = &self.uri;
231        if let Some(scheme) = self.scheme() {
232            // has scheme
233            path = &path[scheme.len() + 1..];
234        }
235        if let Some(rem) = path.strip_prefix("//") {
236            // has authority
237            let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
238            path = &rem[pos..]
239        }
240
241        path.split_once(['?', '#']).map(|p| p.0).unwrap_or(path)
242    }
243
244    /// # Reference
245    /// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
246    pub fn query(&self) -> Option<&str> {
247        let pos = self.uri.bytes().position(|b| b == b'?' || b == b'#')?;
248        if self.uri.as_bytes()[pos] == b'#' {
249            return None;
250        }
251        let query = &self.uri[pos + 1..];
252        let pos = query.bytes().position(|b| b == b'#').unwrap_or(query.len());
253        Some(&query[..pos])
254    }
255
256    /// # Reference
257    /// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
258    pub fn fragment(&self) -> Option<&str> {
259        let pos = self.uri.bytes().position(|b| b == b'#')?;
260        Some(&self.uri[pos + 1..])
261    }
262
263    /// Return an iterator that scans the URI components.
264    pub fn components(&self) -> Components<'_> {
265        Components::new(&self.uri)
266    }
267}
268
269impl ToOwned for URIStr {
270    type Owned = URIString;
271
272    fn to_owned(&self) -> Self::Owned {
273        URIString {
274            uri: self.uri.to_owned(),
275        }
276    }
277}
278
279impl From<&URIStr> for URIString {
280    fn from(value: &URIStr) -> Self {
281        value.to_owned()
282    }
283}
284
285impl AsRef<URIStr> for URIStr {
286    fn as_ref(&self) -> &URIStr {
287        self
288    }
289}
290
291impl Clone for Box<URIStr> {
292    fn clone(&self) -> Self {
293        self.as_ref().into()
294    }
295}
296
297macro_rules! impl_boxed_convertion_uri_str {
298    ($( $t:ident ),*) => {
299        $(
300            impl From<&URIStr> for $t<URIStr> {
301                fn from(value: &URIStr) -> Self {
302                    let boxed: $t<str> = value.uri.into();
303                    unsafe {
304                        // # Safety
305                        // Since `URIStr` is a transparent newtype of `str`,
306                        // the bit patterns are exactly the same and have the same features.
307                        std::mem::transmute(boxed)
308                    }
309                }
310            }
311        )*
312    };
313}
314impl_boxed_convertion_uri_str!(Box, Rc, Arc);
315
/// An owned URI reference.
///
/// This is the owned counterpart of [`URIStr`], to which it dereferences.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIString {
    /// Escaped URI string.
    ///
    /// Parts generated from UTF-8 strings can always be converted back
    /// to the original UTF-8 byte sequence.
    /// Similarly, the parts generated from Path can probably be converted back
    /// to the original Path byte sequence.
    ///
    /// As a result of resolving URI references, there may be a mixture of parts generated
    /// from UTF-8 strings and parts generated from Paths, so the whole may not always revert
    /// to a UTF-8 string or Path byte sequence.
    uri: String,
}
331
332impl URIString {
333    pub fn parse(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
334        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
335            let uri = escape_except(uri, |b| {
336                b.is_ascii() && (is_reserved(b as u8) || is_unreserved(b as u8))
337            });
338            let mut bytes = uri.as_bytes();
339            parse_uri_reference(&mut bytes)?;
340            if !bytes.is_empty() {
341                Err(ParseRIError::NotTermination)
342            } else {
343                Ok(URIString {
344                    uri: uri.into_owned(),
345                })
346            }
347        }
348        _parse(uri.as_ref())
349    }
350
351    /// # Note
352    /// In the current implementation, paths that cannot be converted to UTF-8 strings
353    /// cannot be handled.  \
354    /// I don't think there will be any problems in most environments, but there may be
355    /// some paths that cannot be handled.
356    pub fn parse_file_path(path: impl AsRef<Path>) -> Result<Self, ParseRIError> {
357        #[cfg(target_family = "unix")]
358        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
359            let mut path_str = path.to_str().ok_or(ParseRIError::Unsupported)?.to_owned();
360            if (path.is_dir() || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")))
361                && !path_str.ends_with('/')
362            {
363                path_str.push('/');
364            }
365            if path.is_absolute() {
366                path_str.insert_str(0, "file://");
367            }
368            URIString::parse(path_str)
369        }
370        #[cfg(target_family = "windows")]
371        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
372            use std::path::{Component::*, Prefix::*};
373
374            let mut path_str = String::new();
375            let mut verbatim = false;
376            for comp in path.components() {
377                match comp {
378                    Prefix(prefix) => match prefix.kind() {
379                        Verbatim(root) => {
380                            path_str.push_str("file:///");
381                            path_str.push_str(
382                                &root
383                                    .to_str()
384                                    .ok_or(ParseRIError::Unsupported)?
385                                    .replace('/', "%2F"),
386                            );
387                            verbatim = true;
388                        }
389                        VerbatimUNC(server, root) => {
390                            path_str.push_str("file://");
391                            path_str.push_str(
392                                &server
393                                    .to_str()
394                                    .ok_or(ParseRIError::Unsupported)?
395                                    .replace('/', "%2F"),
396                            );
397                            path_str.push('/');
398                            path_str.push_str(
399                                &root
400                                    .to_str()
401                                    .ok_or(ParseRIError::Unsupported)?
402                                    .replace('/', "%2F"),
403                            );
404                            verbatim = true;
405                        }
406                        VerbatimDisk(letter) => {
407                            path_str.push_str("file:");
408                            path_str.push(letter as char);
409                            path_str.push(':');
410                            verbatim = true;
411                        }
412                        DeviceNS(device) => {
413                            path_str.push_str("file:///");
414                            path_str.push_str(device.to_str().ok_or(ParseRIError::Unsupported)?);
415                        }
416                        UNC(server, root) => {
417                            path_str.push_str("file://");
418                            path_str.push_str(server.to_str().ok_or(ParseRIError::Unsupported)?);
419                            path_str.push('/');
420                            path_str.push_str(root.to_str().ok_or(ParseRIError::Unsupported)?);
421                        }
422                        Disk(letter) => {
423                            path_str.push_str("file:");
424                            path_str.push(letter as char);
425                            path_str.push(':');
426                        }
427                    },
428                    RootDir => {}
429                    CurDir => path_str.push_str("/."),
430                    ParentDir => path_str.push_str("/.."),
431                    Normal(segment) => {
432                        path_str.push('/');
433                        let segment = segment.to_str().ok_or(ParseRIError::Unsupported)?;
434                        if verbatim {
435                            path_str.push_str(&segment.replace('/', "%2F"));
436                        } else {
437                            path_str.push_str(segment);
438                        }
439                    }
440                }
441            }
442            if (path.is_dir()
443                || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")
444                    || (!verbatim && path.as_os_str().as_encoded_bytes().ends_with(b"/"))))
445                && !path_str.ends_with('/')
446            {
447                path_str.push('/');
448            }
449            URIString::parse(path_str)
450        }
451        #[cfg(all(not(target_family = "unix"), not(target_family = "windows")))]
452        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
453            todo!()
454        }
455        _parse_file_path(path.as_ref())
456    }
457
458    pub fn into_boxed_uri_str(self) -> Box<URIStr> {
459        Box::from(self.as_ref())
460    }
461
462    /// # Reference
463    /// [6.2.2.  Syntax-Based Normalization](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2).
464    pub fn normalize(&mut self) {
465        use Component::*;
466
467        let mut uri = String::with_capacity(self.uri.len());
468        let mut paths = vec![];
469        let mut query = None;
470        let mut fragment = None;
471        let mut has_root = false;
472        for comp in self.components() {
473            match comp {
474                Scheme(scheme) => {
475                    uri.push_str(&scheme.to_ascii_lowercase());
476                    uri.push(':');
477                }
478                Authority {
479                    userinfo,
480                    host,
481                    port,
482                } => {
483                    uri.push_str("//");
484                    if let Some(userinfo) = userinfo {
485                        uri.push_str(userinfo);
486                        uri.push('@');
487                    }
488                    uri.push_str(host);
489                    if let Some(port) = port {
490                        uri.push(':');
491                        uri.push_str(port);
492                    }
493                }
494                RootSegment => has_root = true,
495                Segment(segment) => paths.push(segment),
496                Query(q) => query = Some(q),
497                Fragment(f) => fragment = Some(f),
498            }
499        }
500        build_normalized_path(paths.into_iter(), has_root, &mut uri);
501        if let Some(query) = query {
502            uri.push('?');
503            uri.push_str(query);
504        }
505        if let Some(fragment) = fragment {
506            uri.push('#');
507            uri.push_str(fragment);
508        }
509        self.uri = uri;
510    }
511}
512
513impl AsRef<URIStr> for URIString {
514    fn as_ref(&self) -> &URIStr {
515        URIStr::new(&self.uri)
516    }
517}
518
519impl Borrow<URIStr> for URIString {
520    fn borrow(&self) -> &URIStr {
521        self.as_ref()
522    }
523}
524
525impl Deref for URIString {
526    type Target = URIStr;
527
528    fn deref(&self) -> &Self::Target {
529        self.as_ref()
530    }
531}
532
533macro_rules! impl_convertion_uri_string {
534    ($( $t:ty ),*) => {
535        $(
536            impl From<URIString> for $t {
537                fn from(value: URIString) -> $t {
538                    From::from(value.as_ref())
539                }
540            }
541        )*
542    };
543}
544impl_convertion_uri_string!(Box<URIStr>, Rc<URIStr>, Arc<URIStr>);
545
546fn build_normalized_path<'a>(
547    segments: impl Iterator<Item = &'a str>,
548    has_root: bool,
549    buffer: &mut String,
550) {
551    let segments = normalize_path_segments(segments, has_root);
552    if has_root {
553        buffer.push('/');
554    }
555    for (i, seg) in segments.into_iter().enumerate() {
556        if i > 0 {
557            buffer.push('/');
558        }
559        buffer.push_str(seg);
560    }
561}
562
/// Remove `"."` and `".."` segments from a sequence of path segments.
///
/// - `"."` is dropped.
/// - `".."` removes the previously kept segment unless there is none or it is itself
///   `".."`; in that case the `".."` is kept for a relative path (`has_root == false`)
///   and dropped for a rooted one.
/// - If the input ends with a dot segment, an empty segment is appended so that the
///   rebuilt path ends with `'/'`.
fn normalize_path_segments<'a>(
    segments: impl Iterator<Item = &'a str>,
    has_root: bool,
) -> Vec<&'a str> {
    let mut kept: Vec<&'a str> = Vec::new();
    let mut ends_with_dot_segment = false;

    for segment in segments {
        ends_with_dot_segment = matches!(segment, "." | "..");
        match segment {
            "." => {}
            ".." => {
                let can_pop = kept.last().is_some_and(|&last| last != "..");
                if can_pop {
                    kept.pop();
                } else if !has_root {
                    kept.push(segment);
                }
            }
            _ => kept.push(segment),
        }
    }

    if ends_with_dot_segment {
        kept.push("");
    }
    kept
}
592
593/// # Reference
594/// [4.1.  URI Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.1)
595///
596/// ```text
597/// URI-reference = URI / relative-ref
598/// ```
599fn parse_uri_reference(b: &mut &[u8]) -> Result<(), ParseRIError> {
600    if b.is_empty() || matches!(b[0], b'/' | b'?' | b'#') {
601        // If `b` is an empty string or starts with either '/', '?' or '#',
602        // it is definitely 'relative-ref'.
603        parse_relative_ref(b)
604    } else {
605        // Otherwise, it is necessary to distinguish between `URI` and `relative-ref`
606        // starting with `relative-part` that matches `path-noscheme`.
607
608        if !b[0].is_ascii_alphabetic() {
609            // Since `scheme` begins with at least one `ALPHA`,
610            // if it does not, it is definitely `irelative-ref`.
611            parse_relative_ref(b)
612        } else {
613            // The characters that can be used in `scheme` are very limited,
614            // so it might be quicker to try parsing `scheme` to distinguish between them?
615            // [25] scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
616            if let Some(&c) = b
617                .iter()
618                .find(|&&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
619                && c == b':'
620            {
621                parse_uri(b)
622            } else {
623                parse_relative_ref(b)
624            }
625        }
626    }
627}
628
629/// # Reference
630/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
631///
632/// ```text
633/// URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
634/// ```
635fn parse_uri(b: &mut &[u8]) -> Result<(), ParseRIError> {
636    parse_scheme(b)?;
637    *b = b
638        .strip_prefix(b":")
639        .ok_or(ParseRIError::InvalidSchemeSeparator)?;
640    parse_hier_part(b)?;
641    if let Some(query) = b.strip_prefix(b"?") {
642        *b = query;
643        parse_query(b)?;
644    }
645    if let Some(fragment) = b.strip_prefix(b"#") {
646        *b = fragment;
647        parse_fragment(b)?;
648    }
649    Ok(())
650}
651
652/// # Reference
653/// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
654///
655/// ```text
656/// scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
657/// ```
658fn parse_scheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
659    if b.is_empty() || !b[0].is_ascii_alphabetic() {
660        return Err(ParseRIError::InvalidScheme);
661    }
662    let pos = b
663        .iter()
664        .position(|&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
665        .unwrap_or(b.len());
666    *b = &b[pos..];
667    Ok(())
668}
669
670/// # Reference
671/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
672///
673/// ```text
674/// hier-part   = "//" authority path-abempty
675///             / path-absolute
676///             / path-rootless
677///             / path-empty
678/// ```
679fn parse_hier_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
680    if let Some(rem) = b.strip_prefix(b"/") {
681        // If `b` starts with '/', `b` starts with 'authority' or `path-absolute`,
682
683        if let Some(rem) = rem.strip_prefix(b"/") {
684            // If `b` starts with '//', it should be followed by 'authority'.
685            // This is because 'path-absolute' is followed by exactly one '/' at the beginning
686            // and optionally 'segment-nz', so there cannot be two consecutive '/' characters.
687            *b = rem;
688            parse_authority(b)?;
689            parse_path_abempty(b)
690        } else {
691            // path-absolute = "/" [ segment-nz *( "/" segment ) ]
692            // segment-nz    = 1*pchar
693            parse_path_absolute(b)
694        }
695    } else {
696        // otherwise, `b` starts with 'path-rootless' or 'path-empty'
697        let mut dum = *b;
698        if parse_pchar(&mut dum).is_ok() {
699            // If 'path-rootless' follows, one or more 'pchar' should follow.
700            parse_path_rootless(b)
701        } else {
702            // If not, it is 'path-empty'.
703            // Since 'path-empty' is an empty string,
704            // we can simply return `Ok` without doing anything.
705            Ok(())
706        }
707    }
708}
709
710/// # Reference
711/// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
712///
713/// ```text
714/// authority   = [ userinfo "@" ] host [ ":" port ]
715/// ```
716fn parse_authority(b: &mut &[u8]) -> Result<(), ParseRIError> {
717    if b.starts_with(b"[") {
718        // If `b` starts with '[', it is definitely an `host` that matches `IP-literal`.
719        parse_ip_literal(b)?;
720        if let Some(rem) = b.strip_prefix(b":") {
721            *b = rem;
722            parse_port(b)?;
723        }
724        return Ok(());
725    }
726
727    // If not, it may start with `userinfo`, or it may start with `host`
728    // that matches `IPv4address` or `reg-name`.
729    //
730    // If it is either `IPv4address` or `reg-name`, there is no need to consider `IPv4address`.
731    // This is because `reg-name` includes `IPv4address`. More specifically, since `unreserved`
732    // contains `DIGIT` and `.`, `IPv4address` can be regarded as a specific sequence of `unreserved`.
733    //
734    // `userinfo` and `reg-name` are rules that share characters other than colons.
735    // Therefore, they can be distinguished using the following algorithm.
736    //
737    // 1. Increment the counter as long as it matches `userinfo`.
738    // 2. If the first ":" is encountered, note its position.
739    // 3. Determine the matching rule according to the characters that did not match `userinfo`.
740    //      i.   If it is "@", the string seen so far is `userinfo`.
741    //      ii.  If it is "[" , then an `host` matching "IP-literal" should start there,
742    //           but since there is no "@" immediately before it, it is an error.
743    //      iii. In other cases, if the position of ":" is noted, the string before it is `host`;
744    //                           if not, all strings seen so far are `host`.
745    //
746    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
747    //
748    // reg-name    = *( unreserved / pct-encoded / sub-delims )
749    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
750    //
751    // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
752    let mut colon = usize::MAX;
753    let mut now = 0;
754    let mut t = *b;
755    while !t.is_empty() {
756        let pos = t
757            .iter()
758            .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b'%')
759            .unwrap_or(t.len());
760        t = &t[pos..];
761        now += pos;
762        if let Some(rem) = t.strip_prefix(b":") {
763            now += 1;
764            t = rem;
765            colon = colon.min(now);
766        } else {
767            break;
768        }
769    }
770
771    debug_assert_eq!(now, b.len() - t.len());
772
773    if let Some(rem) = t.strip_prefix(b"@") {
774        *b = rem;
775        parse_host(b)?;
776        if let Some(rem) = b.strip_prefix(b":") {
777            *b = rem;
778            parse_port(b)?;
779        }
780        Ok(())
781    } else if t.starts_with(b"[") {
782        Err(ParseRIError::InvalidAuthority)
783    } else if colon < usize::MAX {
784        *b = &b[colon + 1..];
785        parse_port(b)
786    } else {
787        *b = t;
788        Ok(())
789    }
790}
791
792// This function has no use.
793// /// # Reference
794// /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
795// ///
796// /// ```text
797// /// userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
798// /// ```
799// fn parse_userinfo(b: &mut &[u8]) -> Result<(), ParseRIError> {
800//     todo!()
801// }
802
803/// # Reference
804/// [3.2.2.  Host]
805///
806/// ```text
807/// host        = IP-literal / IPv4address / reg-name
808/// ```
809fn parse_host(b: &mut &[u8]) -> Result<(), ParseRIError> {
810    if b.starts_with(b"[") {
811        parse_ip_literal(b)
812    } else {
813        // Since `IPv4address` is covered by `reg-name`, it does not need to be considered.
814        parse_reg_name(b)
815    }
816}
817
818/// # Reference
819/// [3.2.2.  Host]
820///
821/// ```text
822/// IP-literal  = "[" ( IPv6address / IPvFuture  ) "]"
823/// ```
824fn parse_ip_literal(b: &mut &[u8]) -> Result<(), ParseRIError> {
825    *b = b.strip_prefix(b"[").ok_or(ParseRIError::InvalidIPLiteral)?;
826    if !b.is_empty() && b[0].eq_ignore_ascii_case(&b'v') {
827        parse_ipv_future(b)?;
828    } else {
829        parse_ipv6_address(b)?;
830    }
831    *b = b.strip_prefix(b"]").ok_or(ParseRIError::InvalidIPLiteral)?;
832    Ok(())
833}
834
835/// # Reference
836/// [3.2.2.  Host]
837///
838/// ```text
839/// IPvFuture   = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
840/// ```
841fn parse_ipv_future(b: &mut &[u8]) -> Result<(), ParseRIError> {
842    if b.is_empty() || !b[0].eq_ignore_ascii_case(&b'v') {
843        return Err(ParseRIError::InvalidIPvFuture);
844    }
845    *b = &b[1..];
846    let pos = b
847        .iter()
848        .position(|&b| !b.is_ascii_hexdigit())
849        .unwrap_or(b.len());
850    if !(1..=b.len() - 2).contains(&pos) {
851        return Err(ParseRIError::InvalidIPvFuture);
852    }
853    *b = &b[pos..];
854    *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidIPvFuture)?;
855    let pos = b
856        .iter()
857        .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b':')
858        .unwrap_or(b.len());
859    if pos == 0 {
860        return Err(ParseRIError::InvalidIPvFuture);
861    }
862    *b = &b[pos..];
863    Ok(())
864}
865
866/// # Reference
867/// [3.2.2.  Host]
868///
869/// ```text
870/// IPv6address =                            6( h16 ":" ) ls32
871///             /                       "::" 5( h16 ":" ) ls32
872///             / [               h16 ] "::" 4( h16 ":" ) ls32
873///             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
874///             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
875///             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
876///             / [ *4( h16 ":" ) h16 ] "::"              ls32
877///             / [ *5( h16 ":" ) h16 ] "::"              h16
878///             / [ *6( h16 ":" ) h16 ] "::"
879///  ls32       = ( h16 ":" h16 ) / IPv4address
880///             ; least-significant 32 bits of address
881///  h16        = 1*4HEXDIG
882///             ; 16 bits of address represented in hexadecimal
883/// ```
884fn parse_ipv6_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
885    let mut cnt = 1;
886    let mut omit = false;
887    if let Some(rem) = b.strip_prefix(b":") {
888        *b = rem;
889        omit = true;
890    } else {
891        parse_h16(b)?;
892    }
893
894    while cnt + (omit as i32) < 8
895        && let Some(rem) = b.strip_prefix(b":")
896    {
897        *b = rem;
898        if b.starts_with(b":") {
899            if omit {
900                return Err(ParseRIError::InvalidIPv6address);
901            }
902            omit = true;
903            cnt += 1;
904            continue;
905        }
906
907        // It's not a smart approach, but it'll probably work...
908        //
909        // Checking `h16` first will not work because it cannot be distinguished
910        // from the first octet of the IPv4 address.
911        //
912        // Checking the positions where ':' and '.' appear also seems unlikely to work,
913        // considering cases where such characters appear in the segments of the following paths.
914        let mut dum = *b;
915        if parse_ipv4_address(&mut dum).is_ok() {
916            *b = dum;
917            // An IPv4 address consumes two hextets.
918            cnt += 2;
919            // An IPv4 address only appears at the end.
920            break;
921        } else if !b.is_empty() && b[0].is_ascii_hexdigit() {
922            parse_h16(b)?;
923        }
924    }
925
926    // If "::" is included, some hextets may be omitted, resulting in fewer than eight.
927    // Otherwise, exactly eight hextets are required.
928    if (omit && cnt <= 8) || (!omit && cnt == 8) {
929        Ok(())
930    } else {
931        Err(ParseRIError::InvalidIPv6address)
932    }
933}
934
935/// # Reference
936/// [3.2.2.  Host]
937///
938/// ```text
939///  h16        = 1*4HEXDIG
940///             ; 16 bits of address represented in hexadecimal
941/// ```
942fn parse_h16(b: &mut &[u8]) -> Result<(), ParseRIError> {
943    let pos = b
944        .iter()
945        .position(|&b| !b.is_ascii_hexdigit())
946        .unwrap_or(b.len());
947    if pos == 0 {
948        Err(ParseRIError::InvalidH16)
949    } else {
950        *b = &b[pos.min(4)..];
951        Ok(())
952    }
953}
954
955/// # Reference
956/// [3.2.2.  Host]
957///
958/// ```text
959/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
960/// dec-octet   = DIGIT                 ; 0-9
961///             / %x31-39 DIGIT         ; 10-99
962///             / "1" 2DIGIT            ; 100-199
963///             / "2" %x30-34 DIGIT     ; 200-249
964///             / "25" %x30-35          ; 250-255
965/// ```
966fn parse_ipv4_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
967    parse_dec_octet(b)?;
968    for _ in 0..3 {
969        *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidDecOctet)?;
970        parse_dec_octet(b)?;
971    }
972    Ok(())
973}
974fn parse_dec_octet(b: &mut &[u8]) -> Result<(), ParseRIError> {
975    let len = match b {
976        [b'2', b'5', b'0'..=b'5', ..] => 3,
977        [b'2', b'0'..=b'4', b'0'..=b'9', ..] => 3,
978        [b'1', b'0'..=b'9', b'0'..=b'9', ..] => 3,
979        [b'1'..=b'9', b'0'..=b'9', ..] => 2,
980        [b'0'..=b'9', ..] => 1,
981        _ => return Err(ParseRIError::InvalidDecOctet),
982    };
983    *b = &b[len..];
984    Ok(())
985}
986
987/// # Reference
988/// [3.2.2.  Host]
989///
990/// ```text
991/// reg-name    = *( unreserved / pct-encoded / sub-delims )
992/// ```
993fn parse_reg_name(b: &mut &[u8]) -> Result<(), ParseRIError> {
994    // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
995    // reg-name      = pchar - (":" | "@")
996    while !b.is_empty() && !matches!(b[0], b':' | b'@') && parse_pchar(b).is_ok() {}
997    Ok(())
998}
999
1000/// # Reference
1001/// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
1002///
1003/// ```text
1004/// port        = *DIGIT
1005/// ```
1006fn parse_port(b: &mut &[u8]) -> Result<(), ParseRIError> {
1007    let pos = b
1008        .iter()
1009        .position(|&b| !b.is_ascii_digit())
1010        .unwrap_or(b.len());
1011    *b = &b[pos..];
1012    Ok(())
1013}
1014
1015/// # Reference
1016/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1017///
1018/// ```text
1019/// path-abempty  = *( "/" segment )
1020/// ```
1021fn parse_path_abempty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1022    while let Some(rem) = b.strip_prefix(b"/") {
1023        *b = rem;
1024        parse_segment(b)?;
1025    }
1026    Ok(())
1027}
1028
1029/// # Reference
1030/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1031///
1032/// ```text
1033/// path-absolute = "/" [ segment-nz *( "/" segment ) ]
1034/// ```
1035fn parse_path_absolute(b: &mut &[u8]) -> Result<(), ParseRIError> {
1036    *b = b
1037        .strip_prefix(b"/")
1038        .ok_or(ParseRIError::InvalidPathAbsolute)?;
1039    if parse_segment_nz(b).is_ok() {
1040        while let Some(rem) = b.strip_prefix(b"/") {
1041            *b = rem;
1042            parse_segment(b)?;
1043        }
1044    }
1045    Ok(())
1046}
1047
1048/// # Reference
1049/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1050///
1051/// ```text
1052/// path-noscheme = segment-nz-nc *( "/" segment )
1053/// ```
1054fn parse_path_noscheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
1055    parse_segment_nz_nc(b)?;
1056    while let Some(rem) = b.strip_prefix(b"/") {
1057        *b = rem;
1058        parse_segment(b)?;
1059    }
1060    Ok(())
1061}
1062
1063/// # Reference
1064/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1065///
1066/// ```text
1067/// path-rootless = segment-nz *( "/" segment )
1068/// ```
1069fn parse_path_rootless(b: &mut &[u8]) -> Result<(), ParseRIError> {
1070    parse_segment_nz(b)?;
1071    while let Some(rem) = b.strip_prefix(b"/") {
1072        *b = rem;
1073        parse_segment(b)?;
1074    }
1075    Ok(())
1076}
1077
// `path-empty` matches zero characters, so a dedicated parser would consume nothing and is not needed.
1079// /// # Reference
1080// /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1081// ///
1082// /// ```text
1083// /// path-empty    = 0<pchar>
1084// /// ```
1085// fn parse_path_empty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1086//     todo!()
1087// }
1088
1089/// # Reference
1090/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1091///
1092/// ```text
1093/// segment       = *pchar
1094/// ```
1095fn parse_segment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1096    while parse_pchar(b).is_ok() {}
1097    Ok(())
1098}
1099
1100/// # Reference
1101/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1102///
1103/// ```text
1104/// segment-nz    = 1*pchar
1105/// ```
1106fn parse_segment_nz(b: &mut &[u8]) -> Result<(), ParseRIError> {
1107    parse_pchar(b)?;
1108    while parse_pchar(b).is_ok() {}
1109    Ok(())
1110}
1111
1112/// # Reference
1113/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1114///
1115/// ```text
1116/// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
1117///                     ; non-zero-length segment without any colon ":"
1118/// ```
1119fn parse_segment_nz_nc(b: &mut &[u8]) -> Result<(), ParseRIError> {
1120    if b.is_empty() || b[0] == b':' || parse_pchar(b).is_err() {
1121        return Err(ParseRIError::InvalidSegmentNzNc);
1122    }
1123    while !b.is_empty() && b[0] != b':' && parse_pchar(b).is_ok() {}
1124    Ok(())
1125}
1126
1127/// # Reference
1128/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1129///
1130/// ```text
1131/// pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1132/// ```
1133fn parse_pchar(b: &mut &[u8]) -> Result<(), ParseRIError> {
1134    if b.is_empty() {
1135        return Err(ParseRIError::InvalidPChar);
1136    }
1137
1138    if is_unreserved(b[0]) || is_sub_delims(b[0]) || matches!(b[0], b':' | b'@') {
1139        *b = &b[1..];
1140        Ok(())
1141    } else if b.len() >= 3 && b[0] == b'%' && b[1].is_ascii_hexdigit() && b[2].is_ascii_hexdigit() {
1142        *b = &b[3..];
1143        Ok(())
1144    } else {
1145        Err(ParseRIError::InvalidPChar)
1146    }
1147}
1148
1149/// # Reference
1150/// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
1151///
1152/// ```text
1153/// query       = *( pchar / "/" / "?" )
1154/// ```
1155fn parse_query(b: &mut &[u8]) -> Result<(), ParseRIError> {
1156    loop {
1157        if let Some(rem) = b.strip_prefix(b"/") {
1158            *b = rem;
1159        } else if let Some(rem) = b.strip_prefix(b"?") {
1160            *b = rem;
1161        } else if parse_pchar(b).is_ok() {
1162            // no op
1163        } else {
1164            break Ok(());
1165        }
1166    }
1167}
1168
1169/// # Reference
1170/// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
1171///
1172/// ```text
1173/// fragment    = *( pchar / "/" / "?" )
1174/// ```
1175fn parse_fragment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1176    loop {
1177        if let Some(rem) = b.strip_prefix(b"/") {
1178            *b = rem;
1179        } else if let Some(rem) = b.strip_prefix(b"?") {
1180            *b = rem;
1181        } else if parse_pchar(b).is_ok() {
1182            // no op
1183        } else {
1184            break Ok(());
1185        }
1186    }
1187}
1188
1189/// # Reference
1190/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1191///
1192/// ```text
1193/// relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
1194/// ```
1195fn parse_relative_ref(b: &mut &[u8]) -> Result<(), ParseRIError> {
1196    parse_relative_part(b)?;
1197    if let Some(query) = b.strip_prefix(b"?") {
1198        *b = query;
1199        parse_query(b)?;
1200    }
1201    if let Some(fragment) = b.strip_prefix(b"#") {
1202        *b = fragment;
1203        parse_fragment(b)?;
1204    }
1205    Ok(())
1206}
1207
1208/// # Reference
1209/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1210///
1211/// ```text
1212/// relative-part = "//" authority path-abempty
1213///               / path-absolute
1214///               / path-noscheme
1215///               / path-empty
1216/// ```
1217fn parse_relative_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
1218    if let Some(rem) = b.strip_prefix(b"/") {
1219        if let Some(rem) = rem.strip_prefix(b"/") {
1220            *b = rem;
1221            parse_authority(b)?;
1222            parse_path_abempty(b)
1223        } else {
1224            parse_path_absolute(b)
1225        }
1226    } else {
1227        let orig = b.len();
1228        let ret = parse_path_noscheme(b);
1229        // If no characters have been consumed, it matches `path-empty` and returns `Ok`.
1230        if orig == b.len() { Ok(()) } else { ret }
1231    }
1232}
1233
1234/// # Reference
1235/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
1236///
1237/// ```text
1238/// reserved    = gen-delims / sub-delims
1239/// ```
1240fn is_reserved(b: u8) -> bool {
1241    is_gen_delims(b) || is_sub_delims(b)
1242}
1243
/// Returns `true` if `b` is one of the general delimiters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// ```
fn is_gen_delims(b: u8) -> bool {
    b":/?#[]@".contains(&b)
}
1253
/// Returns `true` if `b` is one of the sub-delimiters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
/// ```
fn is_sub_delims(b: u8) -> bool {
    b"!$&'()*+,;=".contains(&b)
}
1266
/// Returns `true` if `b` is an unreserved character: an ASCII letter, an ASCII digit,
/// or one of `-`, `.`, `_`, `~`.
///
/// # Reference
/// [2.3.  Unreserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
///
/// ```text
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
fn is_unreserved(b: u8) -> bool {
    match b {
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' => true,
        b'-' | b'.' | b'_' | b'~' => true,
        _ => false,
    }
}
1276
/// Percent-encoding lookup table: the three bytes starting at index `3 * n` are the
/// escape sequence `%XX` for the byte value `n`, with upper-case hex digits.
const LUT_BYTES: [u8; 256 * 3] = {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    // Every triplet starts with '%', so only the two digit slots need filling in.
    let mut table = [b'%'; 256 * 3];
    let mut value = 0usize;
    while value < 256 {
        table[3 * value + 1] = HEX_DIGITS[value >> 4];
        table[3 * value + 2] = HEX_DIGITS[value & 0xF];
        value += 1;
    }
    table
};
/// [`LUT_BYTES`] viewed as a string slice.
const LUT: &str = unsafe {
    // # Safety
    // `LUT_BYTES` holds nothing but '%' and ASCII hex digits, all of which are
    // single-byte UTF-8 characters, so the bytes form valid UTF-8.
    from_utf8_unchecked(&LUT_BYTES)
};
1299
1300pub fn escape(s: &str) -> Cow<'_, str> {
1301    escape_except(s, |_| false)
1302}
1303
1304pub fn escape_bytes(b: &[u8]) -> Cow<'_, [u8]> {
1305    escape_bytes_except(b, |_| false)
1306}
1307
1308pub fn escape_except(s: &str, is_except: impl Fn(char) -> bool) -> Cow<'_, str> {
1309    let cap = s
1310        .chars()
1311        .filter_map(|c| (!is_except(c)).then_some(c.len_utf8() * 2))
1312        .sum::<usize>();
1313    if cap == 0 {
1314        return Cow::Borrowed(s);
1315    }
1316    let mut encode = [0; 6];
1317    let mut buf = String::with_capacity(s.len() + cap);
1318    for c in s.chars() {
1319        if is_except(c) {
1320            buf.push(c);
1321        } else {
1322            let encoded = c.encode_utf8(&mut encode);
1323            for b in encoded.bytes() {
1324                let index = b as usize * 3;
1325                buf.push_str(&LUT[index..index + 3]);
1326            }
1327        }
1328    }
1329    Cow::Owned(buf)
1330}
1331
1332pub fn escape_bytes_except(b: &[u8], is_except: impl Fn(u8) -> bool) -> Cow<'_, [u8]> {
1333    let cap = b.iter().copied().filter(|&b| !is_except(b)).count() * 2;
1334    if cap == 0 {
1335        return Cow::Borrowed(b);
1336    }
1337    let mut buf = Vec::with_capacity(b.len() + cap);
1338    for &b in b {
1339        if is_except(b) {
1340            buf.push(b);
1341        } else {
1342            let index = b as usize * 3;
1343            buf.extend_from_slice(&LUT_BYTES[index..index + 3]);
1344        }
1345    }
1346    Cow::Owned(buf)
1347}
1348
/// Error returned by [`unescape`] and [`unescape_bytes`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum URIUnescapeError {
    /// A `%` was not followed by two hexadecimal digits.
    InvalidEscape,
    /// The decoded bytes do not form valid UTF-8.
    Utf8Error(std::str::Utf8Error),
}
1353
1354impl From<std::str::Utf8Error> for URIUnescapeError {
1355    fn from(value: std::str::Utf8Error) -> Self {
1356        Self::Utf8Error(value)
1357    }
1358}
1359
1360pub fn unescape(s: &str) -> Result<Cow<'_, str>, URIUnescapeError> {
1361    if !s.contains('%') {
1362        return Ok(Cow::Borrowed(s));
1363    }
1364
1365    let mut split = s.split('%');
1366    let mut buf = String::with_capacity(s.len());
1367    buf.push_str(split.next().unwrap());
1368    let mut bytes = vec![];
1369    for chunk in split {
1370        if chunk.len() < 2 {
1371            return Err(URIUnescapeError::InvalidEscape);
1372        }
1373        let byte =
1374            u8::from_str_radix(&chunk[..2], 16).map_err(|_| URIUnescapeError::InvalidEscape)?;
1375        bytes.push(byte);
1376
1377        if chunk.len() > 2 {
1378            buf.push_str(from_utf8(&bytes)?);
1379            buf.push_str(&chunk[2..]);
1380            bytes.clear();
1381        }
1382    }
1383
1384    if !bytes.is_empty() {
1385        buf.push_str(from_utf8(&bytes)?);
1386    }
1387    Ok(Cow::Owned(buf))
1388}
1389
1390pub fn unescape_bytes(b: &[u8]) -> Result<Cow<'_, [u8]>, URIUnescapeError> {
1391    if !b.contains(&b'%') {
1392        return Ok(Cow::Borrowed(b));
1393    }
1394
1395    let mut split = b.split(|&b| b == b'%');
1396    let mut buf = Vec::with_capacity(b.len());
1397    buf.extend_from_slice(split.next().unwrap());
1398
1399    fn hexdigit_to_byte(hex: u8) -> u8 {
1400        if hex.is_ascii_digit() {
1401            hex - b'0'
1402        } else if hex.is_ascii_uppercase() {
1403            hex - b'A' + 10
1404        } else {
1405            hex - b'a' + 10
1406        }
1407    }
1408    for chunk in split {
1409        if chunk.len() < 2 || !chunk[0].is_ascii_hexdigit() || !chunk[1].is_ascii_hexdigit() {
1410            return Err(URIUnescapeError::InvalidEscape);
1411        }
1412        let hi = hexdigit_to_byte(chunk[0]);
1413        let lo = hexdigit_to_byte(chunk[1]);
1414        buf.push((hi << 4) | lo);
1415    }
1416    Ok(Cow::Owned(buf))
1417}
1418
/// Position of the [`Components`] iterator within the URI it is decomposing.
///
/// The variants are listed in the order in which `Components::next` moves through them.
#[derive(Debug, Clone, Copy)]
enum DecomposeState {
    /// Look for a leading scheme followed by `:`.
    Scheme,
    /// Look for an authority introduced by `//`.
    Authority,
    /// Look for the leading `/` of the path.
    Root,
    /// Emit path segments until no `/` separator follows.
    Path,
    /// Look for a query introduced by `?`.
    Query,
    /// Look for a fragment; whatever remains is expected to be empty or to start with `#`.
    Fragment,
    /// Nothing is left to emit; the iterator returns `None`.
    Finish,
}
1429
/// Iterator that splits a URI string into its [`Component`]s, front to back.
pub struct Components<'a> {
    /// Which kind of component the next call to `next` looks for.
    state: DecomposeState,
    /// The part of the URI that has not been consumed yet.
    uri: &'a str,
}
1434
1435impl Components<'_> {
1436    fn new(uri: &str) -> Components<'_> {
1437        Components {
1438            state: DecomposeState::Scheme,
1439            uri,
1440        }
1441    }
1442}
1443
1444impl<'a> Iterator for Components<'a> {
1445    type Item = Component<'a>;
1446
1447    fn next(&mut self) -> Option<Self::Item> {
1448        use DecomposeState::*;
1449        loop {
1450            match self.state {
1451                Scheme => {
1452                    self.state = Authority;
1453                    let mut bytes = self.uri.as_bytes();
1454                    if parse_scheme(&mut bytes).is_ok() && bytes.starts_with(b":") {
1455                        let len = self.uri.len() - bytes.len();
1456                        let (scheme, rem) = self.uri.split_at(len);
1457                        self.uri = &rem[1..];
1458                        break Some(Component::Scheme(scheme));
1459                    }
1460                }
1461                Authority => {
1462                    self.state = Root;
1463                    if let Some(rem) = self.uri.strip_prefix("//") {
1464                        let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
1465                        let (mut authority, rem) = rem.split_at(pos);
1466                        self.uri = rem;
1467                        let mut userinfo = None;
1468                        if let Some((ui, rem)) = authority.split_once('@') {
1469                            userinfo = Some(ui);
1470                            authority = rem;
1471                        }
1472                        let mut port = None;
1473                        if let Some((host, p)) = authority.rsplit_once(':')
1474                            && p.bytes().all(|b| b.is_ascii_digit())
1475                        {
1476                            port = Some(p);
1477                            authority = host;
1478                        }
1479                        break Some(Component::Authority {
1480                            userinfo,
1481                            host: authority,
1482                            port,
1483                        });
1484                    }
1485                }
1486                Root => {
1487                    self.state = Path;
1488                    if let Some(rem) = self.uri.strip_prefix('/') {
1489                        self.uri = rem;
1490                        break Some(Component::RootSegment);
1491                    }
1492                }
1493                Path => {
1494                    let pos = self
1495                        .uri
1496                        .bytes()
1497                        .position(|b| b == b'/' || b == b'?' || b == b'#')
1498                        .unwrap_or(self.uri.len());
1499                    let (segment, rem) = self.uri.split_at(pos);
1500                    if let Some(rem) = rem.strip_prefix('/') {
1501                        self.uri = rem;
1502                    } else {
1503                        self.uri = rem;
1504                        self.state = Query;
1505                    }
1506                    break Some(Component::Segment(segment));
1507                }
1508                Query => {
1509                    self.state = Fragment;
1510                    if let Some(rem) = self.uri.strip_prefix('?') {
1511                        let pos = rem.bytes().position(|b| b == b'#').unwrap_or(rem.len());
1512                        let (query, rem) = rem.split_at(pos);
1513                        self.uri = rem;
1514                        break Some(Component::Query(query));
1515                    }
1516                }
1517                Fragment => {
1518                    debug_assert!(self.uri.is_empty() || self.uri.starts_with('#'));
1519                    self.state = Finish;
1520                    if !self.uri.is_empty() {
1521                        let (_, frag) = self.uri.split_at(1);
1522                        self.uri = "";
1523                        break Some(Component::Fragment(frag));
1524                    }
1525                }
1526                Finish => break None,
1527            }
1528        }
1529    }
1530}
1531
/// One piece of a URI, as yielded by the [`Components`] iterator.
///
/// All string slices borrow from the URI being decomposed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Component<'a> {
    /// The scheme, without the `:` that terminates it.
    Scheme(&'a str),
    /// The authority that follows `//`, split into its parts.
    Authority {
        /// The text before the first `@`, if the authority contains one.
        userinfo: Option<&'a str>,
        /// What remains of the authority once `userinfo` and `port` are split off.
        host: &'a str,
        /// The text after the last `:` of the host part, if it consists solely of ASCII
        /// digits (it may be empty).
        port: Option<&'a str>,
    },
    /// The `/` with which the path starts.
    RootSegment,
    /// One path segment without its `/` separators; it may be empty.
    Segment(&'a str),
    /// The query, without the leading `?`.
    Query(&'a str),
    /// The fragment, without the leading `#`.
    Fragment(&'a str),
}
anyxml_uri/uri.rs

anyxml_uri/
uri.rs