anyxml_uri/
uri.rs

1use std::{
2    borrow::{Borrow, Cow},
3    ops::Deref,
4    path::Path,
5    rc::Rc,
6    str::{from_utf8, from_utf8_unchecked},
7    sync::Arc,
8};
9
10use crate::ParseRIError;
11
/// A borrowed, unsized URI reference stored in its escaped form.
///
/// This is the borrowed counterpart of [`URIString`] (see the `ToOwned` implementation).
/// The type is a `#[repr(transparent)]` wrapper around `str`, which is what allows
/// a `&str` to be reinterpreted as a `&URIStr` without copying.
#[derive(Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIStr {
    /// The escaped URI text.
    uri: str,
}
17
18impl URIStr {
19    fn new(s: &str) -> &Self {
20        unsafe {
21            // # Safety
22            // Since `URIStr` is a transparent newtype of `str`,
23            // the bit patterns are exactly the same and have the same features.
24            &*(s as *const str as *const Self)
25        }
26    }
27
28    /// Resolve the relative reference `reference` using `self` as the base URI.
29    ///
30    /// `self` must be convertible to an [absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
31    /// through [fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5) removal
32    /// and normalization.
33    ///
34    /// # Reference
35    /// - [5.1.  Establishing a Base URI](https://datatracker.ietf.org/doc/html/rfc3986#section-5.1)
36    /// - [5.2.  Relative Resolution](https://datatracker.ietf.org/doc/html/rfc3986#section-5.2)
37    pub fn resolve(&self, reference: &Self) -> URIString {
38        use Component::*;
39
40        let base = if self.is_absolute() {
41            Cow::Borrowed(self)
42        } else {
43            let mut base = self.to_owned();
44            base.normalize();
45            if let Some(frag) = base.uri.bytes().position(|b| b == b'#') {
46                base.uri.truncate(frag);
47            }
48            assert!(
49                base.is_absolute(),
50                "'{}' is not absolute",
51                base.as_escaped_str()
52            );
53            Cow::Owned(base)
54        };
55
56        let mut ref_components = reference.components().peekable();
57        if ref_components
58            .next_if(|comp| matches!(comp, Scheme(_)))
59            .is_some()
60        {
61            let mut ret = reference.to_owned();
62            ret.normalize();
63            return ret;
64        }
65
66        if ref_components
67            .next_if(|comp| matches!(comp, Authority { .. }))
68            .is_some()
69        {
70            // has authority
71            let mut ret = URIString {
72                uri: [base.scheme().unwrap(), ":", &reference.uri].concat(),
73            };
74            ret.normalize();
75            return ret;
76        }
77
78        let mut components = base.components().peekable();
79        let mut uri = String::new();
80        if let Some(Scheme(scheme)) = components.next_if(|comp| matches!(comp, Scheme(_))) {
81            uri.push_str(scheme);
82            uri.push(':');
83        }
84        if let Some(Authority {
85            userinfo,
86            host,
87            port,
88        }) = components.next_if(|comp| matches!(comp, Authority { .. }))
89        {
90            uri.push_str("//");
91            if let Some(userinfo) = userinfo {
92                uri.push_str(userinfo);
93                uri.push(':');
94            }
95            uri.push_str(host);
96            if let Some(port) = port {
97                uri.push(':');
98                uri.push_str(port);
99            }
100        }
101
102        if ref_components
103            .next_if(|comp| matches!(comp, RootSegment))
104            .is_some()
105        {
106            uri.push_str(&reference.uri);
107            let mut ret = URIString { uri };
108            ret.normalize();
109            return ret;
110        }
111
112        let mut segments = vec![];
113        let has_root = components
114            .next_if(|comp| matches!(comp, RootSegment))
115            .is_some();
116        let mut has_dot_segment = false;
117        while let Some(Segment(segment)) = components.next_if(|comp| matches!(comp, Segment(_))) {
118            segments.push(segment);
119            has_dot_segment |= segment == "." || segment == "..";
120        }
121        if has_dot_segment {
122            segments = normalize_path_segments(segments.into_iter(), has_root);
123        }
124
125        let mut has_path = false;
126        if let Some(Segment(segment)) = ref_components.next_if(|comp| matches!(comp, Segment(_))) {
127            let mut buf = vec![segment];
128            while let Some(Segment(segment)) =
129                ref_components.next_if(|comp| matches!(comp, Segment(_)))
130            {
131                buf.push(segment);
132            }
133            if buf.len() > 1 || !buf[0].is_empty() {
134                segments.pop();
135                segments.extend(buf);
136                has_path = true;
137            }
138        }
139        build_normalized_path(segments.into_iter(), has_root, &mut uri);
140
141        if let Some(Query(query)) = ref_components.next_if(|comp| matches!(comp, Query(_))) {
142            uri.push('?');
143            uri.push_str(query);
144        } else if !has_path
145            && let Some(Query(query)) = components.next_if(|comp| matches!(comp, Query(_)))
146        {
147            uri.push('?');
148            uri.push_str(query);
149        }
150
151        if let Some(Fragment(fragment)) = ref_components.next() {
152            uri.push('#');
153            uri.push_str(fragment);
154        }
155
156        URIString { uri }
157    }
158
159    /// Return the escaped URI string.
160    pub fn as_escaped_str(&self) -> &str {
161        &self.uri
162    }
163
164    /// Return the unescaped URI string.  \
165    /// If unescaping fails, return `None`.
166    pub fn as_unescaped_str(&self) -> Option<Cow<'_, str>> {
167        unescape(&self.uri).ok()
168    }
169
170    /// # Reference
171    /// [4.3.  Absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
172    pub fn is_absolute(&self) -> bool {
173        self.scheme().is_some() && self.fragment().is_none()
174    }
175
176    /// # Reference
177    /// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
178    pub fn is_relative(&self) -> bool {
179        self.scheme().is_none()
180    }
181
182    /// # Reference
183    /// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
184    pub fn scheme(&self) -> Option<&str> {
185        let pos = self.uri.bytes().position(is_reserved)?;
186        (self.uri.as_bytes()[pos] == b':').then_some(&self.uri[..pos])
187    }
188
189    /// # Reference
190    /// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
191    pub fn authority(&self) -> Option<&str> {
192        let rem = self
193            .uri
194            .strip_prefix("//")
195            .or_else(|| self.uri.split_once("://").map(|p| p.1))?;
196        Some(rem.split_once('/').map(|p| p.0).unwrap_or(rem))
197    }
198
199    /// # Reference
200    /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
201    pub fn userinfo(&self) -> Option<&str> {
202        Some(self.authority()?.split_once('@')?.0)
203    }
204
205    /// # Reference
206    /// [3.2.2.  Host](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2)
207    pub fn host(&self) -> Option<&str> {
208        let mut auth = self.authority()?;
209        if let Some((_userinfo, rem)) = auth.split_once('@') {
210            auth = rem;
211        }
212        if let Some((host, port)) = auth.rsplit_once(':')
213            && port.bytes().all(|b| b.is_ascii_digit())
214        {
215            auth = host;
216        }
217        Some(auth)
218    }
219
220    /// # Reference
221    /// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
222    pub fn port(&self) -> Option<&str> {
223        let (_, port) = self.authority()?.rsplit_once(':')?;
224        port.bytes().all(|b| b.is_ascii_digit()).then_some(port)
225    }
226
227    /// # Reference
228    /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
229    pub fn path(&self) -> &str {
230        let mut path = &self.uri;
231        if let Some(scheme) = self.scheme() {
232            // has scheme
233            path = &path[scheme.len() + 1..];
234        }
235        if let Some(rem) = path.strip_prefix("//") {
236            // has authority
237            let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
238            path = &rem[pos..]
239        }
240
241        path.split_once(['?', '#']).map(|p| p.0).unwrap_or(path)
242    }
243
244    /// # Reference
245    /// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
246    pub fn query(&self) -> Option<&str> {
247        let pos = self.uri.bytes().position(|b| b == b'?' || b == b'#')?;
248        if self.uri.as_bytes()[pos] == b'#' {
249            return None;
250        }
251        let query = &self.uri[pos + 1..];
252        let pos = query.bytes().position(|b| b == b'#').unwrap_or(query.len());
253        Some(&query[..pos])
254    }
255
256    /// # Reference
257    /// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
258    pub fn fragment(&self) -> Option<&str> {
259        let pos = self.uri.bytes().position(|b| b == b'#')?;
260        Some(&self.uri[pos + 1..])
261    }
262
263    /// Return an iterator that scans the URI components.
264    pub fn components(&self) -> Components<'_> {
265        Components::new(&self.uri)
266    }
267}
268
269impl ToOwned for URIStr {
270    type Owned = URIString;
271
272    fn to_owned(&self) -> Self::Owned {
273        URIString {
274            uri: self.uri.to_owned(),
275        }
276    }
277}
278
279impl From<&URIStr> for URIString {
280    fn from(value: &URIStr) -> Self {
281        value.to_owned()
282    }
283}
284
285impl Clone for Box<URIStr> {
286    fn clone(&self) -> Self {
287        self.as_ref().into()
288    }
289}
290
// Implements `From<&URIStr>` for each listed smart-pointer type constructor
// (invoked below for `Box`, `Rc` and `Arc`).
//
// The URI text is first converted into `$t<str>` and then reinterpreted as `$t<URIStr>`.
macro_rules! impl_boxed_convertion_uri_str {
    ($( $t:ident ),*) => {
        $(
            impl From<&URIStr> for $t<URIStr> {
                fn from(value: &URIStr) -> Self {
                    // Copy the text into a new `$t<str>` allocation.
                    let boxed: $t<str> = value.uri.into();
                    unsafe {
                        // # Safety
                        // Since `URIStr` is a transparent newtype of `str`,
                        // the bit patterns are exactly the same and have the same features.
                        std::mem::transmute(boxed)
                    }
                }
            }
        )*
    };
}
impl_boxed_convertion_uri_str!(Box, Rc, Arc);
309
/// An owned URI reference stored in its escaped form.
///
/// This is the owned counterpart of [`URIStr`], to which it dereferences.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIString {
    /// Escaped URI string.
    ///
    /// Parts generated from UTF-8 strings can always be converted back
    /// to the original UTF-8 byte sequence.
    /// Similarly, the parts generated from Path can probably be converted back
    /// to the original Path byte sequence.
    ///
    /// As a result of resolving URI references, there may be a mixture of parts generated
    /// from UTF-8 strings and parts generated from Paths, so the whole may not always revert
    /// to a UTF-8 string or Path byte sequence.
    uri: String,
}
325
326impl URIString {
327    pub fn parse(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
328        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
329            let uri = escape_except(uri, |b| {
330                b.is_ascii() && (is_reserved(b as u8) || is_unreserved(b as u8))
331            });
332            let mut bytes = uri.as_bytes();
333            parse_uri_reference(&mut bytes)?;
334            if !bytes.is_empty() {
335                Err(ParseRIError::NotTermination)
336            } else {
337                Ok(URIString {
338                    uri: uri.into_owned(),
339                })
340            }
341        }
342        _parse(uri.as_ref())
343    }
344
    /// Convert a file system path into a URI reference.
    ///
    /// On Unix, an absolute path is prefixed with `file://`, while a relative path
    /// is parsed as a relative reference. On Windows, the path prefix (drive, UNC,
    /// verbatim, device namespace) is translated into a `file:` URI and the remaining
    /// components are joined with `'/'`.
    /// A trailing `'/'` is appended when the path is an existing directory or when
    /// the given path text ends with a separator.
    ///
    /// # Errors
    /// Returns [`ParseRIError::Unsupported`] when the path (or one of its components)
    /// is not valid UTF-8, and otherwise any error produced by [`URIString::parse`].
    ///
    /// # Panics
    /// On targets that are neither Unix nor Windows this function is not implemented
    /// (`todo!()`) and panics.
    ///
    /// # Note
    /// In the current implementation, paths that cannot be converted to UTF-8 strings
    /// cannot be handled.  \
    /// I don't think there will be any problems in most environments, but there may be
    /// some paths that cannot be handled.
    pub fn parse_file_path(path: impl AsRef<Path>) -> Result<Self, ParseRIError> {
        #[cfg(target_family = "unix")]
        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
            let mut path_str = path.to_str().ok_or(ParseRIError::Unsupported)?.to_owned();
            // Mark directories with a trailing '/'.
            // NOTE(review): a trailing backslash is also treated as a directory marker here,
            // although '\\' is an ordinary file-name byte on Unix — confirm this is intended.
            if (path.is_dir() || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")))
                && !path_str.ends_with('/')
            {
                path_str.push('/');
            }
            // "file://" + "/abs/path" yields "file:///abs/path" (empty authority).
            if path.is_absolute() {
                path_str.insert_str(0, "file://");
            }
            URIString::parse(path_str)
        }
        #[cfg(target_family = "windows")]
        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
            use std::path::{Component::*, Prefix::*};

            let mut path_str = String::new();
            // Set for verbatim prefixes; '/' inside such components is then
            // percent-encoded as "%2F" instead of being emitted as a separator.
            let mut verbatim = false;
            for comp in path.components() {
                match comp {
                    Prefix(prefix) => match prefix.kind() {
                        Verbatim(root) => {
                            path_str.push_str("file:///");
                            path_str.push_str(
                                &root
                                    .to_str()
                                    .ok_or(ParseRIError::Unsupported)?
                                    .replace('/', "%2F"),
                            );
                            verbatim = true;
                        }
                        VerbatimUNC(server, root) => {
                            // The server becomes the URI authority, the share the first segment.
                            path_str.push_str("file://");
                            path_str.push_str(
                                &server
                                    .to_str()
                                    .ok_or(ParseRIError::Unsupported)?
                                    .replace('/', "%2F"),
                            );
                            path_str.push('/');
                            path_str.push_str(
                                &root
                                    .to_str()
                                    .ok_or(ParseRIError::Unsupported)?
                                    .replace('/', "%2F"),
                            );
                            verbatim = true;
                        }
                        VerbatimDisk(letter) => {
                            // Produces "file:X:" followed by the remaining components.
                            path_str.push_str("file:");
                            path_str.push(letter as char);
                            path_str.push(':');
                            verbatim = true;
                        }
                        DeviceNS(device) => {
                            path_str.push_str("file:///");
                            path_str.push_str(device.to_str().ok_or(ParseRIError::Unsupported)?);
                        }
                        UNC(server, root) => {
                            // The server becomes the URI authority, the share the first segment.
                            path_str.push_str("file://");
                            path_str.push_str(server.to_str().ok_or(ParseRIError::Unsupported)?);
                            path_str.push('/');
                            path_str.push_str(root.to_str().ok_or(ParseRIError::Unsupported)?);
                        }
                        Disk(letter) => {
                            // Produces "file:X:" followed by the remaining components.
                            path_str.push_str("file:");
                            path_str.push(letter as char);
                            path_str.push(':');
                        }
                    },
                    // The root is implied by the '/' pushed in front of every later component.
                    RootDir => {}
                    CurDir => path_str.push_str("/."),
                    ParentDir => path_str.push_str("/.."),
                    Normal(segment) => {
                        path_str.push('/');
                        let segment = segment.to_str().ok_or(ParseRIError::Unsupported)?;
                        if verbatim {
                            path_str.push_str(&segment.replace('/', "%2F"));
                        } else {
                            path_str.push_str(segment);
                        }
                    }
                }
            }
            // Mark directories with a trailing '/': either an existing directory, or a path
            // whose text ends with a separator ('/' only counts for non-verbatim paths).
            if (path.is_dir()
                || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")
                    || (!verbatim && path.as_os_str().as_encoded_bytes().ends_with(b"/"))))
                && !path_str.ends_with('/')
            {
                path_str.push('/');
            }
            URIString::parse(path_str)
        }
        #[cfg(all(not(target_family = "unix"), not(target_family = "windows")))]
        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
            // Not implemented for other target families yet.
            todo!()
        }
        _parse_file_path(path.as_ref())
    }
451
452    pub fn into_boxed_uri_str(self) -> Box<URIStr> {
453        Box::from(self.as_ref())
454    }
455
456    /// # Reference
457    /// [6.2.2.  Syntax-Based Normalization](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2).
458    pub fn normalize(&mut self) {
459        use Component::*;
460
461        let mut uri = String::with_capacity(self.uri.len());
462        let mut paths = vec![];
463        let mut query = None;
464        let mut fragment = None;
465        let mut has_root = false;
466        for comp in self.components() {
467            match comp {
468                Scheme(scheme) => {
469                    uri.push_str(&scheme.to_ascii_lowercase());
470                    uri.push(':');
471                }
472                Authority {
473                    userinfo,
474                    host,
475                    port,
476                } => {
477                    uri.push_str("//");
478                    if let Some(userinfo) = userinfo {
479                        uri.push_str(userinfo);
480                        uri.push('@');
481                    }
482                    uri.push_str(host);
483                    if let Some(port) = port {
484                        uri.push(':');
485                        uri.push_str(port);
486                    }
487                }
488                RootSegment => has_root = true,
489                Segment(segment) => paths.push(segment),
490                Query(q) => query = Some(q),
491                Fragment(f) => fragment = Some(f),
492            }
493        }
494        build_normalized_path(paths.into_iter(), has_root, &mut uri);
495        if let Some(query) = query {
496            uri.push('?');
497            uri.push_str(query);
498        }
499        if let Some(fragment) = fragment {
500            uri.push('#');
501            uri.push_str(fragment);
502        }
503        self.uri = uri;
504    }
505}
506
507impl AsRef<URIStr> for URIString {
508    fn as_ref(&self) -> &URIStr {
509        URIStr::new(&self.uri)
510    }
511}
512
513impl Borrow<URIStr> for URIString {
514    fn borrow(&self) -> &URIStr {
515        self.as_ref()
516    }
517}
518
519impl Deref for URIString {
520    type Target = URIStr;
521
522    fn deref(&self) -> &Self::Target {
523        self.as_ref()
524    }
525}
526
527macro_rules! impl_convertion_uri_string {
528    ($( $t:ty ),*) => {
529        $(
530            impl From<URIString> for $t {
531                fn from(value: URIString) -> $t {
532                    From::from(value.as_ref())
533                }
534            }
535        )*
536    };
537}
538impl_convertion_uri_string!(Box<URIStr>, Rc<URIStr>, Arc<URIStr>);
539
540fn build_normalized_path<'a>(
541    segments: impl Iterator<Item = &'a str>,
542    has_root: bool,
543    buffer: &mut String,
544) {
545    let segments = normalize_path_segments(segments, has_root);
546    if has_root {
547        buffer.push('/');
548    }
549    for (i, seg) in segments.into_iter().enumerate() {
550        if i > 0 {
551            buffer.push('/');
552        }
553        buffer.push_str(seg);
554    }
555}
556
/// Remove `"."` and `".."` segments from a sequence of path segments.
///
/// - `"."` is dropped.
/// - `".."` removes the previous segment when there is one that is not itself `".."`.
///   Otherwise it is dropped for rooted paths (`has_root`) and kept for
///   rootless ones.
/// - When the last input segment was a dot segment, an empty segment is appended
///   so that the resulting path keeps its trailing `'/'`.
fn normalize_path_segments<'a>(
    segments: impl Iterator<Item = &'a str>,
    has_root: bool,
) -> Vec<&'a str> {
    let mut resolved: Vec<&'a str> = Vec::new();
    let mut ends_with_dot_segment = false;

    for segment in segments {
        match segment {
            "." => ends_with_dot_segment = true,
            ".." => {
                let can_pop = matches!(resolved.last(), Some(&prev) if prev != "..");
                if can_pop {
                    resolved.pop();
                } else if !has_root {
                    // Nothing left to cancel out: keep climbing for relative paths.
                    resolved.push(segment);
                }
                ends_with_dot_segment = true;
            }
            _ => {
                resolved.push(segment);
                ends_with_dot_segment = false;
            }
        }
    }

    if ends_with_dot_segment {
        resolved.push("");
    }
    resolved
}
586
587/// # Reference
588/// [4.1.  URI Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.1)
589///
590/// ```text
591/// URI-reference = URI / relative-ref
592/// ```
593fn parse_uri_reference(b: &mut &[u8]) -> Result<(), ParseRIError> {
594    if b.is_empty() || matches!(b[0], b'/' | b'?' | b'#') {
595        // If `b` is an empty string or starts with either '/', '?' or '#',
596        // it is definitely 'relative-ref'.
597        parse_relative_ref(b)
598    } else {
599        // Otherwise, it is necessary to distinguish between `URI` and `relative-ref`
600        // starting with `relative-part` that matches `path-noscheme`.
601
602        if !b[0].is_ascii_alphabetic() {
603            // Since `scheme` begins with at least one `ALPHA`,
604            // if it does not, it is definitely `irelative-ref`.
605            parse_relative_ref(b)
606        } else {
607            // The characters that can be used in `scheme` are very limited,
608            // so it might be quicker to try parsing `scheme` to distinguish between them?
609            // [25] scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
610            if let Some(&c) = b
611                .iter()
612                .find(|&&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
613                && c == b':'
614            {
615                parse_uri(b)
616            } else {
617                parse_relative_ref(b)
618            }
619        }
620    }
621}
622
623/// # Reference
624/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
625///
626/// ```text
627/// URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
628/// ```
629fn parse_uri(b: &mut &[u8]) -> Result<(), ParseRIError> {
630    parse_scheme(b)?;
631    *b = b
632        .strip_prefix(b":")
633        .ok_or(ParseRIError::InvalidSchemeSeparator)?;
634    parse_hier_part(b)?;
635    if let Some(query) = b.strip_prefix(b"?") {
636        *b = query;
637        parse_query(b)?;
638    }
639    if let Some(fragment) = b.strip_prefix(b"#") {
640        *b = fragment;
641        parse_fragment(b)?;
642    }
643    Ok(())
644}
645
646/// # Reference
647/// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
648///
649/// ```text
650/// scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
651/// ```
652fn parse_scheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
653    if b.is_empty() || !b[0].is_ascii_alphabetic() {
654        return Err(ParseRIError::InvalidScheme);
655    }
656    let pos = b
657        .iter()
658        .position(|&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
659        .unwrap_or(b.len());
660    *b = &b[pos..];
661    Ok(())
662}
663
664/// # Reference
665/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
666///
667/// ```text
668/// hier-part   = "//" authority path-abempty
669///             / path-absolute
670///             / path-rootless
671///             / path-empty
672/// ```
673fn parse_hier_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
674    if let Some(rem) = b.strip_prefix(b"/") {
675        // If `b` starts with '/', `b` starts with 'authority' or `path-absolute`,
676
677        if let Some(rem) = rem.strip_prefix(b"/") {
678            // If `b` starts with '//', it should be followed by 'authority'.
679            // This is because 'path-absolute' is followed by exactly one '/' at the beginning
680            // and optionally 'segment-nz', so there cannot be two consecutive '/' characters.
681            *b = rem;
682            parse_authority(b)?;
683            parse_path_abempty(b)
684        } else {
685            // path-absolute = "/" [ segment-nz *( "/" segment ) ]
686            // segment-nz    = 1*pchar
687            parse_path_absolute(b)
688        }
689    } else {
690        // otherwise, `b` starts with 'path-rootless' or 'path-empty'
691        let mut dum = *b;
692        if parse_pchar(&mut dum).is_ok() {
693            // If 'path-rootless' follows, one or more 'pchar' should follow.
694            parse_path_rootless(b)
695        } else {
696            // If not, it is 'path-empty'.
697            // Since 'path-empty' is an empty string,
698            // we can simply return `Ok` without doing anything.
699            Ok(())
700        }
701    }
702}
703
704/// # Reference
705/// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
706///
707/// ```text
708/// authority   = [ userinfo "@" ] host [ ":" port ]
709/// ```
710fn parse_authority(b: &mut &[u8]) -> Result<(), ParseRIError> {
711    if b.starts_with(b"[") {
712        // If `b` starts with '[', it is definitely an `host` that matches `IP-literal`.
713        parse_ip_literal(b)?;
714        if let Some(rem) = b.strip_prefix(b":") {
715            *b = rem;
716            parse_port(b)?;
717        }
718        return Ok(());
719    }
720
721    // If not, it may start with `userinfo`, or it may start with `host`
722    // that matches `IPv4address` or `reg-name`.
723    //
724    // If it is either `IPv4address` or `reg-name`, there is no need to consider `IPv4address`.
725    // This is because `reg-name` includes `IPv4address`. More specifically, since `unreserved`
726    // contains `DIGIT` and `.`, `IPv4address` can be regarded as a specific sequence of `unreserved`.
727    //
728    // `userinfo` and `reg-name` are rules that share characters other than colons.
729    // Therefore, they can be distinguished using the following algorithm.
730    //
731    // 1. Increment the counter as long as it matches `userinfo`.
732    // 2. If the first ":" is encountered, note its position.
733    // 3. Determine the matching rule according to the characters that did not match `userinfo`.
734    //      i.   If it is "@", the string seen so far is `userinfo`.
735    //      ii.  If it is "[" , then an `host` matching "IP-literal" should start there,
736    //           but since there is no "@" immediately before it, it is an error.
737    //      iii. In other cases, if the position of ":" is noted, the string before it is `host`;
738    //                           if not, all strings seen so far are `host`.
739    //
740    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
741    //
742    // reg-name    = *( unreserved / pct-encoded / sub-delims )
743    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
744    //
745    // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
746    let mut colon = usize::MAX;
747    let mut now = 0;
748    let mut t = *b;
749    while !t.is_empty() {
750        let pos = t
751            .iter()
752            .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b'%')
753            .unwrap_or(t.len());
754        t = &t[pos..];
755        now += pos;
756        if let Some(rem) = t.strip_prefix(b":") {
757            now += 1;
758            t = rem;
759            colon = colon.min(now);
760        } else {
761            break;
762        }
763    }
764
765    debug_assert_eq!(now, b.len() - t.len());
766
767    if let Some(rem) = t.strip_prefix(b"@") {
768        *b = rem;
769        parse_host(b)?;
770        if let Some(rem) = b.strip_prefix(b":") {
771            *b = rem;
772            parse_port(b)?;
773        }
774        Ok(())
775    } else if t.starts_with(b"[") {
776        Err(ParseRIError::InvalidAuthority)
777    } else if colon < usize::MAX {
778        *b = &b[colon + 1..];
779        parse_port(b)
780    } else {
781        *b = t;
782        Ok(())
783    }
784}
785
// This function is not needed: `parse_authority` scans `userinfo` inline.
787// /// # Reference
788// /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
789// ///
790// /// ```text
791// /// userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
792// /// ```
793// fn parse_userinfo(b: &mut &[u8]) -> Result<(), ParseRIError> {
794//     todo!()
795// }
796
797/// # Reference
798/// [3.2.2.  Host]
799///
800/// ```text
801/// host        = IP-literal / IPv4address / reg-name
802/// ```
803fn parse_host(b: &mut &[u8]) -> Result<(), ParseRIError> {
804    if b.starts_with(b"[") {
805        parse_ip_literal(b)
806    } else {
807        // Since `IPv4address` is covered by `reg-name`, it does not need to be considered.
808        parse_reg_name(b)
809    }
810}
811
812/// # Reference
813/// [3.2.2.  Host]
814///
815/// ```text
816/// IP-literal  = "[" ( IPv6address / IPvFuture  ) "]"
817/// ```
818fn parse_ip_literal(b: &mut &[u8]) -> Result<(), ParseRIError> {
819    *b = b.strip_prefix(b"[").ok_or(ParseRIError::InvalidIPLiteral)?;
820    if !b.is_empty() && b[0].eq_ignore_ascii_case(&b'v') {
821        parse_ipv_future(b)?;
822    } else {
823        parse_ipv6_address(b)?;
824    }
825    *b = b.strip_prefix(b"]").ok_or(ParseRIError::InvalidIPLiteral)?;
826    Ok(())
827}
828
829/// # Reference
830/// [3.2.2.  Host]
831///
832/// ```text
833/// IPvFuture   = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
834/// ```
835fn parse_ipv_future(b: &mut &[u8]) -> Result<(), ParseRIError> {
836    if b.is_empty() || !b[0].eq_ignore_ascii_case(&b'v') {
837        return Err(ParseRIError::InvalidIPvFuture);
838    }
839    *b = &b[1..];
840    let pos = b
841        .iter()
842        .position(|&b| !b.is_ascii_hexdigit())
843        .unwrap_or(b.len());
844    if !(1..=b.len() - 2).contains(&pos) {
845        return Err(ParseRIError::InvalidIPvFuture);
846    }
847    *b = &b[pos..];
848    *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidIPvFuture)?;
849    let pos = b
850        .iter()
851        .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b':')
852        .unwrap_or(b.len());
853    if pos == 0 {
854        return Err(ParseRIError::InvalidIPvFuture);
855    }
856    *b = &b[pos..];
857    Ok(())
858}
859
860/// # Reference
861/// [3.2.2.  Host]
862///
863/// ```text
864/// IPv6address =                            6( h16 ":" ) ls32
865///             /                       "::" 5( h16 ":" ) ls32
866///             / [               h16 ] "::" 4( h16 ":" ) ls32
867///             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
868///             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
869///             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
870///             / [ *4( h16 ":" ) h16 ] "::"              ls32
871///             / [ *5( h16 ":" ) h16 ] "::"              h16
872///             / [ *6( h16 ":" ) h16 ] "::"
873///  ls32       = ( h16 ":" h16 ) / IPv4address
874///             ; least-significant 32 bits of address
875///  h16        = 1*4HEXDIG
876///             ; 16 bits of address represented in hexadecimal
877/// ```
878fn parse_ipv6_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
879    let mut cnt = 1;
880    let mut omit = false;
881    if let Some(rem) = b.strip_prefix(b":") {
882        *b = rem;
883        omit = true;
884    } else {
885        parse_h16(b)?;
886    }
887
888    while cnt + (omit as i32) < 8
889        && let Some(rem) = b.strip_prefix(b":")
890    {
891        *b = rem;
892        if b.starts_with(b":") {
893            if omit {
894                return Err(ParseRIError::InvalidIPv6address);
895            }
896            omit = true;
897            cnt += 1;
898            continue;
899        }
900
901        // It's not a smart approach, but it'll probably work...
902        //
903        // Checking `h16` first will not work because it cannot be distinguished
904        // from the first octet of the IPv4 address.
905        //
906        // Checking the positions where ':' and '.' appear also seems unlikely to work,
907        // considering cases where such characters appear in the segments of the following paths.
908        let mut dum = *b;
909        if parse_ipv4_address(&mut dum).is_ok() {
910            *b = dum;
911            // An IPv4 address consumes two hextets.
912            cnt += 2;
913            // An IPv4 address only appears at the end.
914            break;
915        } else if !b.is_empty() && b[0].is_ascii_hexdigit() {
916            parse_h16(b)?;
917        }
918    }
919
920    // If "::" is included, some hextets may be omitted, resulting in fewer than eight.
921    // Otherwise, exactly eight hextets are required.
922    if (omit && cnt <= 8) || (!omit && cnt == 8) {
923        Ok(())
924    } else {
925        Err(ParseRIError::InvalidIPv6address)
926    }
927}
928
929/// # Reference
930/// [3.2.2.  Host]
931///
932/// ```text
933///  h16        = 1*4HEXDIG
934///             ; 16 bits of address represented in hexadecimal
935/// ```
936fn parse_h16(b: &mut &[u8]) -> Result<(), ParseRIError> {
937    let pos = b
938        .iter()
939        .position(|&b| !b.is_ascii_hexdigit())
940        .unwrap_or(b.len());
941    if pos == 0 {
942        Err(ParseRIError::InvalidH16)
943    } else {
944        *b = &b[pos.min(4)..];
945        Ok(())
946    }
947}
948
949/// # Reference
950/// [3.2.2.  Host]
951///
952/// ```text
953/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
954/// dec-octet   = DIGIT                 ; 0-9
955///             / %x31-39 DIGIT         ; 10-99
956///             / "1" 2DIGIT            ; 100-199
957///             / "2" %x30-34 DIGIT     ; 200-249
958///             / "25" %x30-35          ; 250-255
959/// ```
960fn parse_ipv4_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
961    parse_dec_octet(b)?;
962    for _ in 0..3 {
963        *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidDecOctet)?;
964        parse_dec_octet(b)?;
965    }
966    Ok(())
967}
968fn parse_dec_octet(b: &mut &[u8]) -> Result<(), ParseRIError> {
969    let len = match b {
970        [b'2', b'5', b'0'..=b'5', ..] => 3,
971        [b'2', b'0'..=b'4', b'0'..=b'9', ..] => 3,
972        [b'1', b'0'..=b'9', b'0'..=b'9', ..] => 3,
973        [b'1'..=b'9', b'0'..=b'9', ..] => 2,
974        [b'0'..=b'9', ..] => 1,
975        _ => return Err(ParseRIError::InvalidDecOctet),
976    };
977    *b = &b[len..];
978    Ok(())
979}
980
981/// # Reference
982/// [3.2.2.  Host]
983///
984/// ```text
985/// reg-name    = *( unreserved / pct-encoded / sub-delims )
986/// ```
987fn parse_reg_name(b: &mut &[u8]) -> Result<(), ParseRIError> {
988    // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
989    // reg-name      = pchar - (":" | "@")
990    while !b.is_empty() && !matches!(b[0], b':' | b'@') && parse_pchar(b).is_ok() {}
991    Ok(())
992}
993
994/// # Reference
995/// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
996///
997/// ```text
998/// port        = *DIGIT
999/// ```
1000fn parse_port(b: &mut &[u8]) -> Result<(), ParseRIError> {
1001    let pos = b
1002        .iter()
1003        .position(|&b| !b.is_ascii_digit())
1004        .unwrap_or(b.len());
1005    *b = &b[pos..];
1006    Ok(())
1007}
1008
1009/// # Reference
1010/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1011///
1012/// ```text
1013/// path-abempty  = *( "/" segment )
1014/// ```
1015fn parse_path_abempty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1016    while let Some(rem) = b.strip_prefix(b"/") {
1017        *b = rem;
1018        parse_segment(b)?;
1019    }
1020    Ok(())
1021}
1022
1023/// # Reference
1024/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1025///
1026/// ```text
1027/// path-absolute = "/" [ segment-nz *( "/" segment ) ]
1028/// ```
1029fn parse_path_absolute(b: &mut &[u8]) -> Result<(), ParseRIError> {
1030    *b = b
1031        .strip_prefix(b"/")
1032        .ok_or(ParseRIError::InvalidPathAbsolute)?;
1033    if parse_segment_nz(b).is_ok() {
1034        while let Some(rem) = b.strip_prefix(b"/") {
1035            *b = rem;
1036            parse_segment(b)?;
1037        }
1038    }
1039    Ok(())
1040}
1041
1042/// # Reference
1043/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1044///
1045/// ```text
1046/// path-noscheme = segment-nz-nc *( "/" segment )
1047/// ```
1048fn parse_path_noscheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
1049    parse_segment_nz_nc(b)?;
1050    while let Some(rem) = b.strip_prefix(b"/") {
1051        *b = rem;
1052        parse_segment(b)?;
1053    }
1054    Ok(())
1055}
1056
1057/// # Reference
1058/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1059///
1060/// ```text
1061/// path-rootless = segment-nz *( "/" segment )
1062/// ```
1063fn parse_path_rootless(b: &mut &[u8]) -> Result<(), ParseRIError> {
1064    parse_segment_nz(b)?;
1065    while let Some(rem) = b.strip_prefix(b"/") {
1066        *b = rem;
1067        parse_segment(b)?;
1068    }
1069    Ok(())
1070}
1071
// No parser is needed for `path-empty`: it matches zero characters, so there is nothing to consume.
1073// /// # Reference
1074// /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1075// ///
1076// /// ```text
1077// /// path-empty    = 0<pchar>
1078// /// ```
1079// fn parse_path_empty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1080//     todo!()
1081// }
1082
1083/// # Reference
1084/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1085///
1086/// ```text
1087/// segment       = *pchar
1088/// ```
1089fn parse_segment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1090    while parse_pchar(b).is_ok() {}
1091    Ok(())
1092}
1093
1094/// # Reference
1095/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1096///
1097/// ```text
1098/// segment-nz    = 1*pchar
1099/// ```
1100fn parse_segment_nz(b: &mut &[u8]) -> Result<(), ParseRIError> {
1101    parse_pchar(b)?;
1102    while parse_pchar(b).is_ok() {}
1103    Ok(())
1104}
1105
1106/// # Reference
1107/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1108///
1109/// ```text
1110/// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
1111///                     ; non-zero-length segment without any colon ":"
1112/// ```
1113fn parse_segment_nz_nc(b: &mut &[u8]) -> Result<(), ParseRIError> {
1114    if b.is_empty() || b[0] == b':' || parse_pchar(b).is_err() {
1115        return Err(ParseRIError::InvalidSegmentNzNc);
1116    }
1117    while !b.is_empty() && b[0] != b':' && parse_pchar(b).is_ok() {}
1118    Ok(())
1119}
1120
1121/// # Reference
1122/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1123///
1124/// ```text
1125/// pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1126/// ```
1127fn parse_pchar(b: &mut &[u8]) -> Result<(), ParseRIError> {
1128    if b.is_empty() {
1129        return Err(ParseRIError::InvalidPChar);
1130    }
1131
1132    if is_unreserved(b[0]) || is_sub_delims(b[0]) || matches!(b[0], b':' | b'@') {
1133        *b = &b[1..];
1134        Ok(())
1135    } else if b.len() >= 3 && b[0] == b'%' && b[1].is_ascii_hexdigit() && b[2].is_ascii_hexdigit() {
1136        *b = &b[3..];
1137        Ok(())
1138    } else {
1139        Err(ParseRIError::InvalidPChar)
1140    }
1141}
1142
1143/// # Reference
1144/// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
1145///
1146/// ```text
1147/// query       = *( pchar / "/" / "?" )
1148/// ```
1149fn parse_query(b: &mut &[u8]) -> Result<(), ParseRIError> {
1150    loop {
1151        if let Some(rem) = b.strip_prefix(b"/") {
1152            *b = rem;
1153        } else if let Some(rem) = b.strip_prefix(b"?") {
1154            *b = rem;
1155        } else if parse_pchar(b).is_ok() {
1156            // no op
1157        } else {
1158            break Ok(());
1159        }
1160    }
1161}
1162
1163/// # Reference
1164/// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
1165///
1166/// ```text
1167/// fragment    = *( pchar / "/" / "?" )
1168/// ```
1169fn parse_fragment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1170    loop {
1171        if let Some(rem) = b.strip_prefix(b"/") {
1172            *b = rem;
1173        } else if let Some(rem) = b.strip_prefix(b"?") {
1174            *b = rem;
1175        } else if parse_pchar(b).is_ok() {
1176            // no op
1177        } else {
1178            break Ok(());
1179        }
1180    }
1181}
1182
1183/// # Reference
1184/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1185///
1186/// ```text
1187/// relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
1188/// ```
1189fn parse_relative_ref(b: &mut &[u8]) -> Result<(), ParseRIError> {
1190    parse_relative_part(b)?;
1191    if let Some(query) = b.strip_prefix(b"?") {
1192        *b = query;
1193        parse_query(b)?;
1194    }
1195    if let Some(fragment) = b.strip_prefix(b"#") {
1196        *b = fragment;
1197        parse_fragment(b)?;
1198    }
1199    Ok(())
1200}
1201
1202/// # Reference
1203/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1204///
1205/// ```text
1206/// relative-part = "//" authority path-abempty
1207///               / path-absolute
1208///               / path-noscheme
1209///               / path-empty
1210/// ```
1211fn parse_relative_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
1212    if let Some(rem) = b.strip_prefix(b"/") {
1213        if let Some(rem) = rem.strip_prefix(b"/") {
1214            *b = rem;
1215            parse_authority(b)?;
1216            parse_path_abempty(b)
1217        } else {
1218            parse_path_absolute(b)
1219        }
1220    } else {
1221        let orig = b.len();
1222        let ret = parse_path_noscheme(b);
1223        // If no characters have been consumed, it matches `path-empty` and returns `Ok`.
1224        if orig == b.len() { Ok(()) } else { ret }
1225    }
1226}
1227
1228/// # Reference
1229/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
1230///
1231/// ```text
1232/// reserved    = gen-delims / sub-delims
1233/// ```
1234fn is_reserved(b: u8) -> bool {
1235    is_gen_delims(b) || is_sub_delims(b)
1236}
1237
/// Returns `true` if `b` is one of the `gen-delims` characters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// ```
fn is_gen_delims(b: u8) -> bool {
    b":/?#[]@".contains(&b)
}
1247
/// Returns `true` if `b` is one of the `sub-delims` characters.
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
/// ```
fn is_sub_delims(b: u8) -> bool {
    b"!$&'()*+,;=".contains(&b)
}
1260
/// Returns `true` if `b` is an `unreserved` character: an ASCII letter or
/// digit, or one of `-`, `.`, `_`, `~`.
///
/// # Reference
/// [2.3.  Unreserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
///
/// ```text
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
fn is_unreserved(b: u8) -> bool {
    matches!(
        b,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~'
    )
}
1270
/// Percent-encoding lookup table.
///
/// The entry for byte value `i` occupies `LUT_BYTES[3 * i..3 * i + 3]` and
/// holds `'%'` followed by the two uppercase hexadecimal digits of `i`.
const LUT_BYTES: [u8; 256 * 3] = {
    const HEX_UPPER: [u8; 16] = *b"0123456789ABCDEF";

    // Every third byte is '%', so start from a table full of '%' and only
    // fill in the two digits of each entry.
    let mut table = [b'%'; 256 * 3];
    let mut byte = 0;
    while byte < 256 {
        table[3 * byte + 1] = HEX_UPPER[byte >> 4];
        table[3 * byte + 2] = HEX_UPPER[byte & 0xF];
        byte += 1;
    }
    table
};
1287const LUT: &str = unsafe {
1288    // # Safety
1289    // `LUT_BYTES` contains only '%' and ASCII hex digit characters.
1290    // Therefore, UTF-8 validation won't fail.
1291    from_utf8_unchecked(&LUT_BYTES)
1292};
1293
1294pub fn escape(s: &str) -> Cow<'_, str> {
1295    escape_except(s, |_| false)
1296}
1297
1298pub fn escape_bytes(b: &[u8]) -> Cow<'_, [u8]> {
1299    escape_bytes_except(b, |_| false)
1300}
1301
1302pub fn escape_except(s: &str, is_except: impl Fn(char) -> bool) -> Cow<'_, str> {
1303    let cap = s
1304        .chars()
1305        .filter_map(|c| (!is_except(c)).then_some(c.len_utf8() * 2))
1306        .sum::<usize>();
1307    if cap == 0 {
1308        return Cow::Borrowed(s);
1309    }
1310    let mut encode = [0; 6];
1311    let mut buf = String::with_capacity(s.len() + cap);
1312    for c in s.chars() {
1313        if is_except(c) {
1314            buf.push(c);
1315        } else {
1316            let encoded = c.encode_utf8(&mut encode);
1317            for b in encoded.bytes() {
1318                let index = b as usize * 3;
1319                buf.push_str(&LUT[index..index + 3]);
1320            }
1321        }
1322    }
1323    Cow::Owned(buf)
1324}
1325
1326pub fn escape_bytes_except(b: &[u8], is_except: impl Fn(u8) -> bool) -> Cow<'_, [u8]> {
1327    let cap = b.iter().copied().filter(|&b| !is_except(b)).count() * 2;
1328    if cap == 0 {
1329        return Cow::Borrowed(b);
1330    }
1331    let mut buf = Vec::with_capacity(b.len() + cap);
1332    for &b in b {
1333        if is_except(b) {
1334            buf.push(b);
1335        } else {
1336            let index = b as usize * 3;
1337            buf.extend_from_slice(&LUT_BYTES[index..index + 3]);
1338        }
1339    }
1340    Cow::Owned(buf)
1341}
1342
/// Error returned by [`unescape`] and [`unescape_bytes`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum URIUnescapeError {
    /// A `'%'` was not followed by two hexadecimal digits.
    InvalidEscape,
    /// The unescaped bytes are not valid UTF-8 (only produced by [`unescape`]).
    Utf8Error(std::str::Utf8Error),
}

impl std::fmt::Display for URIUnescapeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidEscape => f.write_str("'%' is not followed by two hexadecimal digits"),
            Self::Utf8Error(err) => write!(f, "unescaped bytes are not valid UTF-8: {}", err),
        }
    }
}

impl std::error::Error for URIUnescapeError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::InvalidEscape => None,
            Self::Utf8Error(err) => Some(err),
        }
    }
}

impl From<std::str::Utf8Error> for URIUnescapeError {
    fn from(value: std::str::Utf8Error) -> Self {
        Self::Utf8Error(value)
    }
}

/// Decodes the two hexadecimal digits at the start of `chunk` (the bytes that
/// directly follow a `'%'`) into the byte they encode.
///
/// Returns `None` when `chunk` is shorter than two bytes or either of its
/// first two bytes is not an ASCII hexadecimal digit.
fn decode_percent_pair(chunk: &[u8]) -> Option<u8> {
    fn hex_value(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    match chunk {
        [hi, lo, ..] => Some((hex_value(*hi)? << 4) | hex_value(*lo)?),
        _ => None,
    }
}

/// Replaces every percent-encoded triplet (`%XX`) in `s` with the byte it
/// encodes and returns the result as a string.
///
/// Returns `s` unchanged (borrowed) when it contains no `'%'`.
///
/// # Errors
/// - [`URIUnescapeError::InvalidEscape`] when a `'%'` is not followed by two
///   hexadecimal digits.
/// - [`URIUnescapeError::Utf8Error`] when a run of adjacent escapes does not
///   decode to valid UTF-8.
pub fn unescape(s: &str) -> Result<Cow<'_, str>, URIUnescapeError> {
    if !s.contains('%') {
        return Ok(Cow::Borrowed(s));
    }

    let mut chunks = s.split('%');
    let mut buf = String::with_capacity(s.len());
    // Everything before the first '%' is copied verbatim.
    buf.push_str(chunks.next().unwrap_or(""));

    // Decoded bytes of the current run of adjacent escapes. A multi-byte
    // character is spread over several escapes, so UTF-8 validation has to
    // wait until the run ends.
    let mut pending = Vec::new();
    for chunk in chunks {
        // Work on bytes: slicing the `str` at index 2 would panic when a
        // multi-byte character straddles that index, and `from_str_radix`
        // would accept a leading '+' sign.
        let byte =
            decode_percent_pair(chunk.as_bytes()).ok_or(URIUnescapeError::InvalidEscape)?;
        pending.push(byte);

        if chunk.len() > 2 {
            buf.push_str(from_utf8(&pending)?);
            // The first two bytes are ASCII hex digits, so index 2 is a
            // character boundary.
            buf.push_str(&chunk[2..]);
            pending.clear();
        }
    }

    if !pending.is_empty() {
        buf.push_str(from_utf8(&pending)?);
    }
    Ok(Cow::Owned(buf))
}

/// Replaces every percent-encoded triplet (`%XX`) in `b` with the byte it
/// encodes; all other bytes are copied verbatim.
///
/// Returns `b` unchanged (borrowed) when it contains no `'%'`.
///
/// # Errors
/// Returns [`URIUnescapeError::InvalidEscape`] when a `'%'` is not followed by
/// two hexadecimal digits.
pub fn unescape_bytes(b: &[u8]) -> Result<Cow<'_, [u8]>, URIUnescapeError> {
    if !b.contains(&b'%') {
        return Ok(Cow::Borrowed(b));
    }

    let mut chunks = b.split(|&byte| byte == b'%');
    let mut buf = Vec::with_capacity(b.len());
    // Everything before the first '%' is copied verbatim.
    buf.extend_from_slice(chunks.next().unwrap_or(&[]));

    for chunk in chunks {
        let byte = decode_percent_pair(chunk).ok_or(URIUnescapeError::InvalidEscape)?;
        buf.push(byte);
        // Keep the literal bytes that follow the escape pair.
        buf.extend_from_slice(&chunk[2..]);
    }
    Ok(Cow::Owned(buf))
}
1412
/// The component that [`Components`] will look for next.
///
/// The states are visited in declaration order; `Path` repeats once per path
/// segment.
#[derive(Debug, Clone, Copy)]
enum DecomposeState {
    /// Nothing has been examined yet; a scheme is looked for first.
    Scheme,
    /// An authority (introduced by `"//"`) is looked for.
    Authority,
    /// A leading `'/'` of the path is looked for.
    Root,
    /// Path segments are being produced.
    Path,
    /// A query (introduced by `'?'`) is looked for.
    Query,
    /// A fragment (introduced by `'#'`) is looked for.
    Fragment,
    /// Everything has been produced; the iterator is exhausted.
    Finish,
}
1423
1424pub struct Components<'a> {
1425    state: DecomposeState,
1426    uri: &'a str,
1427}
1428
1429impl Components<'_> {
1430    fn new(uri: &str) -> Components<'_> {
1431        Components {
1432            state: DecomposeState::Scheme,
1433            uri,
1434        }
1435    }
1436}
1437
1438impl<'a> Iterator for Components<'a> {
1439    type Item = Component<'a>;
1440
1441    fn next(&mut self) -> Option<Self::Item> {
1442        use DecomposeState::*;
1443        loop {
1444            match self.state {
1445                Scheme => {
1446                    self.state = Authority;
1447                    let mut bytes = self.uri.as_bytes();
1448                    if parse_scheme(&mut bytes).is_ok() && bytes.starts_with(b":") {
1449                        let len = self.uri.len() - bytes.len();
1450                        let (scheme, rem) = self.uri.split_at(len);
1451                        self.uri = &rem[1..];
1452                        break Some(Component::Scheme(scheme));
1453                    }
1454                }
1455                Authority => {
1456                    self.state = Root;
1457                    if let Some(rem) = self.uri.strip_prefix("//") {
1458                        let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
1459                        let (mut authority, rem) = rem.split_at(pos);
1460                        self.uri = rem;
1461                        let mut userinfo = None;
1462                        if let Some((ui, rem)) = authority.split_once('@') {
1463                            userinfo = Some(ui);
1464                            authority = rem;
1465                        }
1466                        let mut port = None;
1467                        if let Some((host, p)) = authority.rsplit_once(':')
1468                            && p.bytes().all(|b| b.is_ascii_digit())
1469                        {
1470                            port = Some(p);
1471                            authority = host;
1472                        }
1473                        break Some(Component::Authority {
1474                            userinfo,
1475                            host: authority,
1476                            port,
1477                        });
1478                    }
1479                }
1480                Root => {
1481                    self.state = Path;
1482                    if let Some(rem) = self.uri.strip_prefix('/') {
1483                        self.uri = rem;
1484                        break Some(Component::RootSegment);
1485                    }
1486                }
1487                Path => {
1488                    let pos = self
1489                        .uri
1490                        .bytes()
1491                        .position(|b| b == b'/' || b == b'?' || b == b'#')
1492                        .unwrap_or(self.uri.len());
1493                    let (segment, rem) = self.uri.split_at(pos);
1494                    if let Some(rem) = rem.strip_prefix('/') {
1495                        self.uri = rem;
1496                    } else {
1497                        self.uri = rem;
1498                        self.state = Query;
1499                    }
1500                    break Some(Component::Segment(segment));
1501                }
1502                Query => {
1503                    self.state = Fragment;
1504                    if let Some(rem) = self.uri.strip_prefix('?') {
1505                        let pos = rem.bytes().position(|b| b == b'#').unwrap_or(rem.len());
1506                        let (query, rem) = rem.split_at(pos);
1507                        self.uri = rem;
1508                        break Some(Component::Query(query));
1509                    }
1510                }
1511                Fragment => {
1512                    debug_assert!(self.uri.is_empty() || self.uri.starts_with('#'));
1513                    self.state = Finish;
1514                    if !self.uri.is_empty() {
1515                        let (_, frag) = self.uri.split_at(1);
1516                        self.uri = "";
1517                        break Some(Component::Fragment(frag));
1518                    }
1519                }
1520                Finish => break None,
1521            }
1522        }
1523    }
1524}
1525
/// One syntactic component of a URI, as produced by [`Components`].
///
/// All string slices borrow from the URI being decomposed and do not include
/// the delimiters that introduce them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Component<'a> {
    /// The scheme, without the trailing `':'`.
    Scheme(&'a str),
    /// The authority, without the leading `"//"`.
    Authority {
        /// The part before `'@'`, if any.
        userinfo: Option<&'a str>,
        /// The host.
        host: &'a str,
        /// The all-digit part after the last `':'`, if any (may be empty).
        port: Option<&'a str>,
    },
    /// The leading `'/'` of the path.
    RootSegment,
    /// One path segment (may be empty).
    Segment(&'a str),
    /// The query, without the leading `'?'`.
    Query(&'a str),
    /// The fragment, without the leading `'#'`.
    Fragment(&'a str),
}
anyxml_uri/uri.rs

anyxml_uri/
uri.rs