anyxml_uri/
uri.rs

1use std::{
2    borrow::{Borrow, Cow},
3    ops::Deref,
4    path::Path,
5    rc::Rc,
6    str::{from_utf8, from_utf8_unchecked},
7    sync::Arc,
8};
9
10use crate::ParseRIError;
11
/// A subtype of [`str`] that has been validated as a URI.
///
/// The type is `#[repr(transparent)]` over `str`; the reference cast in the private
/// constructor and the smart-pointer conversions in this module rely on that layout.
#[derive(Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIStr {
    /// The URI text in its escaped form (what `as_escaped_str` returns).
    uri: str,
}
18
19impl URIStr {
20    fn new(s: &str) -> &Self {
21        unsafe {
22            // # Safety
23            // Since `URIStr` is a transparent newtype of `str`,
24            // the bit patterns are exactly the same and have the same features.
25            &*(s as *const str as *const Self)
26        }
27    }
28
29    /// Resolve the relative reference `reference` using `self` as the base URI.
30    ///
31    /// `self` must be convertible to an [absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
32    /// through [fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5) removal
33    /// and normalization.
34    ///
35    /// # Reference
36    /// - [5.1.  Establishing a Base URI](https://datatracker.ietf.org/doc/html/rfc3986#section-5.1)
37    /// - [5.2.  Relative Resolution](https://datatracker.ietf.org/doc/html/rfc3986#section-5.2)
38    pub fn resolve(&self, reference: &Self) -> URIString {
39        use Component::*;
40
41        let base = if self.is_absolute() {
42            Cow::Borrowed(self)
43        } else {
44            let mut base = self.to_owned();
45            base.normalize();
46            if let Some(frag) = base.uri.bytes().position(|b| b == b'#') {
47                base.uri.truncate(frag);
48            }
49            assert!(
50                base.is_absolute(),
51                "'{}' is not absolute",
52                base.as_escaped_str()
53            );
54            Cow::Owned(base)
55        };
56
57        let mut ref_components = reference.components().peekable();
58        if ref_components
59            .next_if(|comp| matches!(comp, Scheme(_)))
60            .is_some()
61        {
62            let mut ret = reference.to_owned();
63            ret.normalize();
64            return ret;
65        }
66
67        if ref_components
68            .next_if(|comp| matches!(comp, Authority { .. }))
69            .is_some()
70        {
71            // has authority
72            let mut ret = URIString {
73                uri: [base.scheme().unwrap(), ":", &reference.uri].concat(),
74            };
75            ret.normalize();
76            return ret;
77        }
78
79        let mut components = base.components().peekable();
80        let mut uri = String::new();
81        if let Some(Scheme(scheme)) = components.next_if(|comp| matches!(comp, Scheme(_))) {
82            uri.push_str(scheme);
83            uri.push(':');
84        }
85        if let Some(Authority {
86            userinfo,
87            host,
88            port,
89        }) = components.next_if(|comp| matches!(comp, Authority { .. }))
90        {
91            uri.push_str("//");
92            if let Some(userinfo) = userinfo {
93                uri.push_str(userinfo);
94                uri.push(':');
95            }
96            uri.push_str(host);
97            if let Some(port) = port {
98                uri.push(':');
99                uri.push_str(port);
100            }
101        }
102
103        if ref_components
104            .next_if(|comp| matches!(comp, RootSegment))
105            .is_some()
106        {
107            uri.push_str(&reference.uri);
108            let mut ret = URIString { uri };
109            ret.normalize();
110            return ret;
111        }
112
113        let mut segments = vec![];
114        let has_root = components
115            .next_if(|comp| matches!(comp, RootSegment))
116            .is_some();
117        let mut has_dot_segment = false;
118        while let Some(Segment(segment)) = components.next_if(|comp| matches!(comp, Segment(_))) {
119            segments.push(segment);
120            has_dot_segment |= segment == "." || segment == "..";
121        }
122        if has_dot_segment {
123            segments = normalize_path_segments(segments.into_iter(), has_root);
124        }
125
126        let mut has_path = false;
127        if let Some(Segment(segment)) = ref_components.next_if(|comp| matches!(comp, Segment(_))) {
128            let mut buf = vec![segment];
129            while let Some(Segment(segment)) =
130                ref_components.next_if(|comp| matches!(comp, Segment(_)))
131            {
132                buf.push(segment);
133            }
134            if buf.len() > 1 || !buf[0].is_empty() {
135                segments.pop();
136                segments.extend(buf);
137                has_path = true;
138            }
139        }
140        build_normalized_path(segments.into_iter(), has_root, &mut uri);
141
142        if let Some(Query(query)) = ref_components.next_if(|comp| matches!(comp, Query(_))) {
143            uri.push('?');
144            uri.push_str(query);
145        } else if !has_path
146            && let Some(Query(query)) = components.next_if(|comp| matches!(comp, Query(_)))
147        {
148            uri.push('?');
149            uri.push_str(query);
150        }
151
152        if let Some(Fragment(fragment)) = ref_components.next() {
153            uri.push('#');
154            uri.push_str(fragment);
155        }
156
157        URIString { uri }
158    }
159
160    /// Return the escaped URI string.
161    pub fn as_escaped_str(&self) -> &str {
162        &self.uri
163    }
164
165    /// Return the unescaped URI string.
166    ///
167    /// If unescaping fails, return `None`.
168    pub fn as_unescaped_str(&self) -> Option<Cow<'_, str>> {
169        unescape(&self.uri).ok()
170    }
171
172    /// Check if this URI reference is the absolute URI.
173    ///
174    /// # Reference
175    /// [4.3.  Absolute URI](https://datatracker.ietf.org/doc/html/rfc3986#section-4.3)
176    pub fn is_absolute(&self) -> bool {
177        self.scheme().is_some() && self.fragment().is_none()
178    }
179
180    /// Check if this URI reference is the relative reference.
181    ///
182    /// # Reference
183    /// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
184    pub fn is_relative(&self) -> bool {
185        self.scheme().is_none()
186    }
187
188    /// "scheme" part.
189    ///
190    /// # Reference
191    /// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
192    pub fn scheme(&self) -> Option<&str> {
193        let pos = self.uri.bytes().position(is_reserved)?;
194        (self.uri.as_bytes()[pos] == b':').then_some(&self.uri[..pos])
195    }
196
197    /// "authority" part.
198    ///
199    /// # Reference
200    /// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
201    pub fn authority(&self) -> Option<&str> {
202        let rem = self
203            .uri
204            .strip_prefix("//")
205            .or_else(|| self.uri.split_once("://").map(|p| p.1))?;
206        Some(rem.split_once('/').map(|p| p.0).unwrap_or(rem))
207    }
208
209    /// "userinfo" part.
210    ///
211    /// # Reference
212    /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
213    pub fn userinfo(&self) -> Option<&str> {
214        Some(self.authority()?.split_once('@')?.0)
215    }
216
217    /// "host" part.
218    ///
219    /// # Reference
220    /// [3.2.2.  Host](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2)
221    pub fn host(&self) -> Option<&str> {
222        let mut auth = self.authority()?;
223        if let Some((_userinfo, rem)) = auth.split_once('@') {
224            auth = rem;
225        }
226        if let Some((host, port)) = auth.rsplit_once(':')
227            && port.bytes().all(|b| b.is_ascii_digit())
228        {
229            auth = host;
230        }
231        Some(auth)
232    }
233
234    /// "port" part.
235    ///
236    /// # Reference
237    /// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
238    pub fn port(&self) -> Option<&str> {
239        let (_, port) = self.authority()?.rsplit_once(':')?;
240        port.bytes().all(|b| b.is_ascii_digit()).then_some(port)
241    }
242
243    /// "path" part.
244    ///
245    /// # Reference
246    /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
247    pub fn path(&self) -> &str {
248        let mut path = &self.uri;
249        if let Some(scheme) = self.scheme() {
250            // has scheme
251            path = &path[scheme.len() + 1..];
252        }
253        if let Some(rem) = path.strip_prefix("//") {
254            // has authority
255            let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
256            path = &rem[pos..]
257        }
258
259        path.split_once(['?', '#']).map(|p| p.0).unwrap_or(path)
260    }
261
262    /// "query" part.
263    ///
264    /// # Reference
265    /// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
266    pub fn query(&self) -> Option<&str> {
267        let pos = self.uri.bytes().position(|b| b == b'?' || b == b'#')?;
268        if self.uri.as_bytes()[pos] == b'#' {
269            return None;
270        }
271        let query = &self.uri[pos + 1..];
272        let pos = query.bytes().position(|b| b == b'#').unwrap_or(query.len());
273        Some(&query[..pos])
274    }
275
276    /// "fragment" part.
277    ///
278    /// # Reference
279    /// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
280    pub fn fragment(&self) -> Option<&str> {
281        let pos = self.uri.bytes().position(|b| b == b'#')?;
282        Some(&self.uri[pos + 1..])
283    }
284
285    /// Return an iterator that scans the URI components.
286    pub fn components(&self) -> Components<'_> {
287        Components::new(&self.uri)
288    }
289}
290
291impl ToOwned for URIStr {
292    type Owned = URIString;
293
294    fn to_owned(&self) -> Self::Owned {
295        URIString {
296            uri: self.uri.to_owned(),
297        }
298    }
299}
300
301impl From<&URIStr> for URIString {
302    fn from(value: &URIStr) -> Self {
303        value.to_owned()
304    }
305}
306
307impl AsRef<URIStr> for URIStr {
308    fn as_ref(&self) -> &URIStr {
309        self
310    }
311}
312
313impl Clone for Box<URIStr> {
314    fn clone(&self) -> Self {
315        self.as_ref().into()
316    }
317}
318
319impl std::fmt::Display for URIStr {
320    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
321        write!(
322            f,
323            "{}",
324            self.as_unescaped_str()
325                .as_deref()
326                .unwrap_or(self.as_escaped_str())
327        )
328    }
329}
330
331macro_rules! impl_boxed_convertion_uri_str {
332    ($( $t:ident ),*) => {
333        $(
334            impl From<&URIStr> for $t<URIStr> {
335                fn from(value: &URIStr) -> Self {
336                    let boxed: $t<str> = value.uri.into();
337                    unsafe {
338                        // # Safety
339                        // Since `URIStr` is a transparent newtype of `str`,
340                        // the bit patterns are exactly the same and have the same features.
341                        std::mem::transmute(boxed)
342                    }
343                }
344            }
345        )*
346    };
347}
348impl_boxed_convertion_uri_str!(Box, Rc, Arc);
349
/// A subtype of [`String`] that has been validated as a URI.
///
/// This is the owned counterpart of [`URIStr`]: it dereferences to [`URIStr`], and
/// [`URIStr`] names it as its `ToOwned::Owned` type.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct URIString {
    /// Escaped URI string.
    ///
    /// Parts generated from UTF-8 strings can always be converted back
    /// to the original UTF-8 byte sequence.
    /// Similarly, the parts generated from Path can probably be converted back
    /// to the original Path byte sequence.
    ///
    /// As a result of resolving URI references, there may be a mixture of parts generated
    /// from UTF-8 strings and parts generated from Paths, so the whole may not always revert
    /// to a UTF-8 string or Path byte sequence.
    uri: String,
}
366
367impl URIString {
368    /// Parse the string as a URI by escaping all characters not specified as
369    /// [`reserved`](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
370    /// or [`unreserved`](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
371    /// in [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986).
372    ///
373    /// Because certain characters containing `%` are escaped, the result of
374    /// [`URIStr::as_unescaped_str`] is equal to `uri`, but the result of
375    /// [`URIStr::as_escaped_str`] may differ from `uri`.
376    ///
377    /// Since it escapes nearly all characters—including control characters, `%`,
378    /// and non-ASCII characters—it will successfully parse any string that roughly
379    /// follows URI notation.
380    pub fn parse(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
381        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
382            let uri = escape_except(uri, |b| {
383                b.is_ascii() && (is_reserved(b as u8) || is_unreserved(b as u8))
384            });
385            URIString::parse_escaped(&uri)
386        }
387        _parse(uri.as_ref())
388    }
389
390    /// Parse the string as a URI after applying escaping according
391    /// to [XML 1.0 "4.2.2 External Entities"](https://www.w3.org/TR/xml/#sec-external-ent).
392    ///
393    /// Some characters are escaped without escaping the `%` character, so both the result
394    /// of [`URIStr::as_unescaped_str`] and the result of [`URIStr::as_escaped_str`] may
395    /// differ from `uri`.
396    ///
397    /// > System identifiers (and other XML strings meant to be used as URI references) may
398    /// > contain characters that, according to [IETF RFC 3986], must be escaped before a
399    /// > URI can be used to retrieve the referenced resource. The characters to be escaped
400    /// > are the control characters #x0 to #x1F and #x7F (most of which cannot appear in
401    /// > XML), space #x20, the delimiters '<' #x3C, '>' #x3E and '"' #x22, the unwise
402    /// > characters '{' #x7B, '}' #x7D, '|' #x7C, '\' #x5C, '^' #x5E and '`' #x60, as well
403    /// > as all characters above #x7F. Since escaping is not always a fully reversible
404    /// > process, it MUST be performed only when absolutely necessary and as late as
405    /// > possible in a processing chain. In particular, neither the process of converting
406    /// > a relative URI to an absolute one nor the process of passing a URI reference to a
407    /// > process or software component responsible for dereferencing it SHOULD trigger
408    /// > escaping. When escaping does occur, it MUST be performed as follows:
409    /// >
410    /// > 1. Each character to be escaped is represented in UTF-8 \[Unicode\] as one or more
411    /// >    bytes.
412    /// > 2. The resulting bytes are escaped with the URI escaping mechanism (that is,
413    /// >    converted to % HH, where HH is the hexadecimal notation of the byte value).
414    /// > 3. The original character is replaced by the resulting character sequence.
415    pub fn parse_system_id(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
416        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
417            let uri = escape_except(uri, |b| {
418                // XML 1.0 "4.2.2 External Entities"
419                b.is_ascii()
420                    && !matches!(
421                        b as u8,
422                        0..=0x1F
423                            | 0x20
424                            | 0x22
425                            | 0x3C
426                            | 0x3E
427                            | 0x5C
428                            | 0x5E
429                            | 0x60
430                            | 0x7B..=0x7D
431                            | 0x7F..
432                    )
433            });
434            URIString::parse_escaped(&uri)
435        }
436        _parse(uri.as_ref())
437    }
438
439    /// Parse the string as a URI without performing any escape processing whatsoever.  \
440    /// In other words, `uri` is treated as an escaped string.
441    ///
442    /// Since percent-encoded characters are treated as percent-encoded, the result of
443    /// [`URIStr::as_unescaped_str`] may differ from `uri`. On the other hand, since no
444    /// escaping is performed at all, the result of [`URIStr::as_escaped_str`] is always
445    /// equal to `uri`.
446    fn parse_escaped(uri: impl AsRef<str>) -> Result<Self, ParseRIError> {
447        fn _parse(uri: &str) -> Result<URIString, ParseRIError> {
448            let mut bytes = uri.as_bytes();
449            parse_uri_reference(&mut bytes)?;
450            if !bytes.is_empty() {
451                Err(ParseRIError::NotTermination)
452            } else {
453                Ok(URIString {
454                    uri: uri.to_owned(),
455                })
456            }
457        }
458        _parse(uri.as_ref())
459    }
460
461    /// # Note
462    /// In the current implementation, paths that cannot be converted to UTF-8 strings
463    /// cannot be handled.  \
464    /// I don't think there will be any problems in most environments, but there may be
465    /// some paths that cannot be handled.
466    pub fn parse_file_path(path: impl AsRef<Path>) -> Result<Self, ParseRIError> {
467        #[cfg(target_family = "unix")]
468        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
469            let mut path_str = path.to_str().ok_or(ParseRIError::Unsupported)?.to_owned();
470            if (path.is_dir() || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")))
471                && !path_str.ends_with('/')
472            {
473                path_str.push('/');
474            }
475            if path.is_absolute() {
476                path_str.insert_str(0, "file://");
477            }
478            URIString::parse(path_str)
479        }
480        #[cfg(target_family = "windows")]
481        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
482            use std::path::{Component::*, Prefix::*};
483
484            let mut path_str = String::new();
485            let mut verbatim = false;
486            for comp in path.components() {
487                match comp {
488                    Prefix(prefix) => match prefix.kind() {
489                        Verbatim(root) => {
490                            path_str.push_str("file:///");
491                            path_str.push_str(
492                                &root
493                                    .to_str()
494                                    .ok_or(ParseRIError::Unsupported)?
495                                    .replace('/', "%2F"),
496                            );
497                            verbatim = true;
498                        }
499                        VerbatimUNC(server, root) => {
500                            path_str.push_str("file://");
501                            path_str.push_str(
502                                &server
503                                    .to_str()
504                                    .ok_or(ParseRIError::Unsupported)?
505                                    .replace('/', "%2F"),
506                            );
507                            path_str.push('/');
508                            path_str.push_str(
509                                &root
510                                    .to_str()
511                                    .ok_or(ParseRIError::Unsupported)?
512                                    .replace('/', "%2F"),
513                            );
514                            verbatim = true;
515                        }
516                        VerbatimDisk(letter) => {
517                            path_str.push_str("file:");
518                            path_str.push(letter as char);
519                            path_str.push(':');
520                            verbatim = true;
521                        }
522                        DeviceNS(device) => {
523                            path_str.push_str("file:///");
524                            path_str.push_str(device.to_str().ok_or(ParseRIError::Unsupported)?);
525                        }
526                        UNC(server, root) => {
527                            path_str.push_str("file://");
528                            path_str.push_str(server.to_str().ok_or(ParseRIError::Unsupported)?);
529                            path_str.push('/');
530                            path_str.push_str(root.to_str().ok_or(ParseRIError::Unsupported)?);
531                        }
532                        Disk(letter) => {
533                            path_str.push_str("file:");
534                            path_str.push(letter as char);
535                            path_str.push(':');
536                        }
537                    },
538                    RootDir => {}
539                    CurDir => {
540                        if !path_str.is_empty() {
541                            path_str.push_str("/.");
542                        } else {
543                            path_str.push_str(".");
544                        }
545                    }
546                    ParentDir => {
547                        if !path_str.is_empty() {
548                            path_str.push_str("/..");
549                        } else {
550                            path_str.push_str("..")
551                        }
552                    }
553                    Normal(segment) => {
554                        if !path_str.is_empty() {
555                            path_str.push('/');
556                        }
557                        let segment = segment.to_str().ok_or(ParseRIError::Unsupported)?;
558                        if verbatim {
559                            path_str.push_str(&segment.replace('/', "%2F"));
560                        } else {
561                            path_str.push_str(segment);
562                        }
563                    }
564                }
565            }
566            if (path.is_dir()
567                || (path.as_os_str().as_encoded_bytes().ends_with(b"\\")
568                    || (!verbatim && path.as_os_str().as_encoded_bytes().ends_with(b"/"))))
569                && !path_str.ends_with('/')
570            {
571                path_str.push('/');
572            }
573            URIString::parse(path_str)
574        }
575        #[cfg(all(not(target_family = "unix"), not(target_family = "windows")))]
576        fn _parse_file_path(path: &Path) -> Result<URIString, ParseRIError> {
577            todo!()
578        }
579        _parse_file_path(path.as_ref())
580    }
581
582    /// Convert [`URIString`] to [`Box<URIStr>`].
583    pub fn into_boxed_uri_str(self) -> Box<URIStr> {
584        Box::from(self.as_ref())
585    }
586
587    /// Normalize the URI according to the algorithm specified in RFC 3986.
588    ///
589    /// # Reference
590    /// [6.2.2.  Syntax-Based Normalization](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2).
591    pub fn normalize(&mut self) {
592        use Component::*;
593
594        let mut uri = String::with_capacity(self.uri.len());
595        let mut paths = vec![];
596        let mut query = None;
597        let mut fragment = None;
598        let mut has_root = false;
599        for comp in self.components() {
600            match comp {
601                Scheme(scheme) => {
602                    uri.push_str(&scheme.to_ascii_lowercase());
603                    uri.push(':');
604                }
605                Authority {
606                    userinfo,
607                    host,
608                    port,
609                } => {
610                    uri.push_str("//");
611                    if let Some(userinfo) = userinfo {
612                        uri.push_str(userinfo);
613                        uri.push('@');
614                    }
615                    uri.push_str(host);
616                    if let Some(port) = port {
617                        uri.push(':');
618                        uri.push_str(port);
619                    }
620                }
621                RootSegment => has_root = true,
622                Segment(segment) => paths.push(segment),
623                Query(q) => query = Some(q),
624                Fragment(f) => fragment = Some(f),
625            }
626        }
627        build_normalized_path(paths.into_iter(), has_root, &mut uri);
628        if let Some(query) = query {
629            uri.push('?');
630            uri.push_str(query);
631        }
632        if let Some(fragment) = fragment {
633            uri.push('#');
634            uri.push_str(fragment);
635        }
636        self.uri = uri;
637    }
638}
639
640impl AsRef<URIStr> for URIString {
641    fn as_ref(&self) -> &URIStr {
642        URIStr::new(&self.uri)
643    }
644}
645
646impl Borrow<URIStr> for URIString {
647    fn borrow(&self) -> &URIStr {
648        self.as_ref()
649    }
650}
651
652impl Deref for URIString {
653    type Target = URIStr;
654
655    fn deref(&self) -> &Self::Target {
656        self.as_ref()
657    }
658}
659
660impl std::fmt::Display for URIString {
661    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
662        write!(f, "{}", self.as_ref())
663    }
664}
665
666macro_rules! impl_convertion_uri_string {
667    ($( $t:ty ),*) => {
668        $(
669            impl From<URIString> for $t {
670                fn from(value: URIString) -> $t {
671                    From::from(value.as_ref())
672                }
673            }
674        )*
675    };
676}
677impl_convertion_uri_string!(Box<URIStr>, Rc<URIStr>, Arc<URIStr>);
678
679fn build_normalized_path<'a>(
680    segments: impl Iterator<Item = &'a str>,
681    has_root: bool,
682    buffer: &mut String,
683) {
684    let segments = normalize_path_segments(segments, has_root);
685    if has_root {
686        buffer.push('/');
687    }
688    for (i, seg) in segments.into_iter().enumerate() {
689        if i > 0 {
690            buffer.push('/');
691        }
692        buffer.push_str(seg);
693    }
694}
695
/// Resolve the `.` and `..` segments of a path.
///
/// - `.` is dropped.
/// - `..` removes the previous segment when there is one that is not itself `..`.
///   Otherwise it is kept for a rootless path and dropped for a rooted one (nothing
///   lies above the root).
/// - When the last input segment is `.` or `..`, an empty segment is appended so that
///   the resulting path ends with `/`.
fn normalize_path_segments<'a>(
    segments: impl Iterator<Item = &'a str>,
    has_root: bool,
) -> Vec<&'a str> {
    let mut resolved: Vec<&'a str> = Vec::new();
    let mut ends_with_dot_segment = false;
    for segment in segments {
        ends_with_dot_segment = matches!(segment, "." | "..");
        match segment {
            "." => {}
            ".." => {
                let can_go_up = resolved.last().is_some_and(|&prev| prev != "..");
                if can_go_up {
                    resolved.pop();
                } else if !has_root {
                    resolved.push(segment);
                }
            }
            _ => resolved.push(segment),
        }
    }

    if ends_with_dot_segment {
        resolved.push("");
    }

    resolved
}
725
726/// # Reference
727/// [4.1.  URI Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.1)
728///
729/// ```text
730/// URI-reference = URI / relative-ref
731/// ```
732fn parse_uri_reference(b: &mut &[u8]) -> Result<(), ParseRIError> {
733    if b.is_empty() || matches!(b[0], b'/' | b'?' | b'#') {
734        // If `b` is an empty string or starts with either '/', '?' or '#',
735        // it is definitely 'relative-ref'.
736        parse_relative_ref(b)
737    } else {
738        // Otherwise, it is necessary to distinguish between `URI` and `relative-ref`
739        // starting with `relative-part` that matches `path-noscheme`.
740
741        if !b[0].is_ascii_alphabetic() {
742            // Since `scheme` begins with at least one `ALPHA`,
743            // if it does not, it is definitely `irelative-ref`.
744            parse_relative_ref(b)
745        } else {
746            // The characters that can be used in `scheme` are very limited,
747            // so it might be quicker to try parsing `scheme` to distinguish between them?
748            // [25] scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
749            if let Some(&c) = b
750                .iter()
751                .find(|&&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
752                && c == b':'
753            {
754                parse_uri(b)
755            } else {
756                parse_relative_ref(b)
757            }
758        }
759    }
760}
761
762/// # Reference
763/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
764///
765/// ```text
766/// URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
767/// ```
768fn parse_uri(b: &mut &[u8]) -> Result<(), ParseRIError> {
769    parse_scheme(b)?;
770    *b = b
771        .strip_prefix(b":")
772        .ok_or(ParseRIError::InvalidSchemeSeparator)?;
773    parse_hier_part(b)?;
774    if let Some(query) = b.strip_prefix(b"?") {
775        *b = query;
776        parse_query(b)?;
777    }
778    if let Some(fragment) = b.strip_prefix(b"#") {
779        *b = fragment;
780        parse_fragment(b)?;
781    }
782    Ok(())
783}
784
785/// # Reference
786/// [3.1.  Scheme](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
787///
788/// ```text
789/// scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
790/// ```
791fn parse_scheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
792    if b.is_empty() || !b[0].is_ascii_alphabetic() {
793        return Err(ParseRIError::InvalidScheme);
794    }
795    let pos = b
796        .iter()
797        .position(|&b| !b.is_ascii_alphanumeric() && !matches!(b, b'+' | b'-' | b'.'))
798        .unwrap_or(b.len());
799    *b = &b[pos..];
800    Ok(())
801}
802
803/// # Reference
804/// [3.  Syntax Components](https://datatracker.ietf.org/doc/html/rfc3986#section-3)
805///
806/// ```text
807/// hier-part   = "//" authority path-abempty
808///             / path-absolute
809///             / path-rootless
810///             / path-empty
811/// ```
812fn parse_hier_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
813    if let Some(rem) = b.strip_prefix(b"/") {
814        // If `b` starts with '/', `b` starts with 'authority' or `path-absolute`,
815
816        if let Some(rem) = rem.strip_prefix(b"/") {
817            // If `b` starts with '//', it should be followed by 'authority'.
818            // This is because 'path-absolute' is followed by exactly one '/' at the beginning
819            // and optionally 'segment-nz', so there cannot be two consecutive '/' characters.
820            *b = rem;
821            parse_authority(b)?;
822            parse_path_abempty(b)
823        } else {
824            // path-absolute = "/" [ segment-nz *( "/" segment ) ]
825            // segment-nz    = 1*pchar
826            parse_path_absolute(b)
827        }
828    } else {
829        // otherwise, `b` starts with 'path-rootless' or 'path-empty'
830        let mut dum = *b;
831        if parse_pchar(&mut dum).is_ok() {
832            // If 'path-rootless' follows, one or more 'pchar' should follow.
833            parse_path_rootless(b)
834        } else {
835            // If not, it is 'path-empty'.
836            // Since 'path-empty' is an empty string,
837            // we can simply return `Ok` without doing anything.
838            Ok(())
839        }
840    }
841}
842
843/// # Reference
844/// [3.2.  Authority](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2)
845///
846/// ```text
847/// authority   = [ userinfo "@" ] host [ ":" port ]
848/// ```
849fn parse_authority(b: &mut &[u8]) -> Result<(), ParseRIError> {
850    if b.starts_with(b"[") {
851        // If `b` starts with '[', it is definitely an `host` that matches `IP-literal`.
852        parse_ip_literal(b)?;
853        if let Some(rem) = b.strip_prefix(b":") {
854            *b = rem;
855            parse_port(b)?;
856        }
857        return Ok(());
858    }
859
860    // If not, it may start with `userinfo`, or it may start with `host`
861    // that matches `IPv4address` or `reg-name`.
862    //
863    // If it is either `IPv4address` or `reg-name`, there is no need to consider `IPv4address`.
864    // This is because `reg-name` includes `IPv4address`. More specifically, since `unreserved`
865    // contains `DIGIT` and `.`, `IPv4address` can be regarded as a specific sequence of `unreserved`.
866    //
867    // `userinfo` and `reg-name` are rules that share characters other than colons.
868    // Therefore, they can be distinguished using the following algorithm.
869    //
870    // 1. Increment the counter as long as it matches `userinfo`.
871    // 2. If the first ":" is encountered, note its position.
872    // 3. Determine the matching rule according to the characters that did not match `userinfo`.
873    //      i.   If it is "@", the string seen so far is `userinfo`.
874    //      ii.  If it is "[" , then an `host` matching "IP-literal" should start there,
875    //           but since there is no "@" immediately before it, it is an error.
876    //      iii. In other cases, if the position of ":" is noted, the string before it is `host`;
877    //                           if not, all strings seen so far are `host`.
878    //
879    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
880    //
881    // reg-name    = *( unreserved / pct-encoded / sub-delims )
882    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
883    //
884    // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
885    let mut colon = usize::MAX;
886    let mut now = 0;
887    let mut t = *b;
888    while !t.is_empty() {
889        let pos = t
890            .iter()
891            .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b'%')
892            .unwrap_or(t.len());
893        t = &t[pos..];
894        now += pos;
895        if let Some(rem) = t.strip_prefix(b":") {
896            now += 1;
897            t = rem;
898            colon = colon.min(now);
899        } else {
900            break;
901        }
902    }
903
904    debug_assert_eq!(now, b.len() - t.len());
905
906    if let Some(rem) = t.strip_prefix(b"@") {
907        *b = rem;
908        parse_host(b)?;
909        if let Some(rem) = b.strip_prefix(b":") {
910            *b = rem;
911            parse_port(b)?;
912        }
913        Ok(())
914    } else if t.starts_with(b"[") {
915        Err(ParseRIError::InvalidAuthority)
916    } else if colon < usize::MAX {
917        *b = &b[colon + 1..];
918        parse_port(b)
919    } else {
920        *b = t;
921        Ok(())
922    }
923}
924
// `parse_userinfo` is intentionally left unimplemented: `parse_authority` recognizes `userinfo` inline, so nothing would call it.
926// /// # Reference
927// /// [3.2.1.  User Information](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.1)
928// ///
929// /// ```text
930// /// userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
931// /// ```
932// fn parse_userinfo(b: &mut &[u8]) -> Result<(), ParseRIError> {
933//     todo!()
934// }
935
936/// # Reference
937/// [3.2.2.  Host]
938///
939/// ```text
940/// host        = IP-literal / IPv4address / reg-name
941/// ```
942fn parse_host(b: &mut &[u8]) -> Result<(), ParseRIError> {
943    if b.starts_with(b"[") {
944        parse_ip_literal(b)
945    } else {
946        // Since `IPv4address` is covered by `reg-name`, it does not need to be considered.
947        parse_reg_name(b)
948    }
949}
950
951/// # Reference
952/// [3.2.2.  Host]
953///
954/// ```text
955/// IP-literal  = "[" ( IPv6address / IPvFuture  ) "]"
956/// ```
957fn parse_ip_literal(b: &mut &[u8]) -> Result<(), ParseRIError> {
958    *b = b.strip_prefix(b"[").ok_or(ParseRIError::InvalidIPLiteral)?;
959    if !b.is_empty() && b[0].eq_ignore_ascii_case(&b'v') {
960        parse_ipv_future(b)?;
961    } else {
962        parse_ipv6_address(b)?;
963    }
964    *b = b.strip_prefix(b"]").ok_or(ParseRIError::InvalidIPLiteral)?;
965    Ok(())
966}
967
968/// # Reference
969/// [3.2.2.  Host]
970///
971/// ```text
972/// IPvFuture   = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
973/// ```
974fn parse_ipv_future(b: &mut &[u8]) -> Result<(), ParseRIError> {
975    if b.is_empty() || !b[0].eq_ignore_ascii_case(&b'v') {
976        return Err(ParseRIError::InvalidIPvFuture);
977    }
978    *b = &b[1..];
979    let pos = b
980        .iter()
981        .position(|&b| !b.is_ascii_hexdigit())
982        .unwrap_or(b.len());
983    if !(1..=b.len() - 2).contains(&pos) {
984        return Err(ParseRIError::InvalidIPvFuture);
985    }
986    *b = &b[pos..];
987    *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidIPvFuture)?;
988    let pos = b
989        .iter()
990        .position(|&b| !is_unreserved(b) && !is_sub_delims(b) && b != b':')
991        .unwrap_or(b.len());
992    if pos == 0 {
993        return Err(ParseRIError::InvalidIPvFuture);
994    }
995    *b = &b[pos..];
996    Ok(())
997}
998
999/// # Reference
1000/// [3.2.2.  Host]
1001///
1002/// ```text
1003/// IPv6address =                            6( h16 ":" ) ls32
1004///             /                       "::" 5( h16 ":" ) ls32
1005///             / [               h16 ] "::" 4( h16 ":" ) ls32
1006///             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
1007///             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
1008///             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
1009///             / [ *4( h16 ":" ) h16 ] "::"              ls32
1010///             / [ *5( h16 ":" ) h16 ] "::"              h16
1011///             / [ *6( h16 ":" ) h16 ] "::"
1012///  ls32       = ( h16 ":" h16 ) / IPv4address
1013///             ; least-significant 32 bits of address
1014///  h16        = 1*4HEXDIG
1015///             ; 16 bits of address represented in hexadecimal
1016/// ```
1017fn parse_ipv6_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
1018    let mut cnt = 1;
1019    let mut omit = false;
1020    if let Some(rem) = b.strip_prefix(b":") {
1021        *b = rem;
1022        omit = true;
1023    } else {
1024        parse_h16(b)?;
1025    }
1026
1027    while cnt + (omit as i32) < 8
1028        && let Some(rem) = b.strip_prefix(b":")
1029    {
1030        *b = rem;
1031        if b.starts_with(b":") {
1032            if omit {
1033                return Err(ParseRIError::InvalidIPv6address);
1034            }
1035            omit = true;
1036            cnt += 1;
1037            continue;
1038        }
1039
1040        // It's not a smart approach, but it'll probably work...
1041        //
1042        // Checking `h16` first will not work because it cannot be distinguished
1043        // from the first octet of the IPv4 address.
1044        //
1045        // Checking the positions where ':' and '.' appear also seems unlikely to work,
1046        // considering cases where such characters appear in the segments of the following paths.
1047        let mut dum = *b;
1048        if parse_ipv4_address(&mut dum).is_ok() {
1049            *b = dum;
1050            // An IPv4 address consumes two hextets.
1051            cnt += 2;
1052            // An IPv4 address only appears at the end.
1053            break;
1054        } else if !b.is_empty() && b[0].is_ascii_hexdigit() {
1055            parse_h16(b)?;
1056        }
1057    }
1058
1059    // If "::" is included, some hextets may be omitted, resulting in fewer than eight.
1060    // Otherwise, exactly eight hextets are required.
1061    if (omit && cnt <= 8) || (!omit && cnt == 8) {
1062        Ok(())
1063    } else {
1064        Err(ParseRIError::InvalidIPv6address)
1065    }
1066}
1067
1068/// # Reference
1069/// [3.2.2.  Host]
1070///
1071/// ```text
1072///  h16        = 1*4HEXDIG
1073///             ; 16 bits of address represented in hexadecimal
1074/// ```
1075fn parse_h16(b: &mut &[u8]) -> Result<(), ParseRIError> {
1076    let pos = b
1077        .iter()
1078        .position(|&b| !b.is_ascii_hexdigit())
1079        .unwrap_or(b.len());
1080    if pos == 0 {
1081        Err(ParseRIError::InvalidH16)
1082    } else {
1083        *b = &b[pos.min(4)..];
1084        Ok(())
1085    }
1086}
1087
1088/// # Reference
1089/// [3.2.2.  Host]
1090///
1091/// ```text
1092/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
1093/// dec-octet   = DIGIT                 ; 0-9
1094///             / %x31-39 DIGIT         ; 10-99
1095///             / "1" 2DIGIT            ; 100-199
1096///             / "2" %x30-34 DIGIT     ; 200-249
1097///             / "25" %x30-35          ; 250-255
1098/// ```
1099fn parse_ipv4_address(b: &mut &[u8]) -> Result<(), ParseRIError> {
1100    parse_dec_octet(b)?;
1101    for _ in 0..3 {
1102        *b = b.strip_prefix(b".").ok_or(ParseRIError::InvalidDecOctet)?;
1103        parse_dec_octet(b)?;
1104    }
1105    Ok(())
1106}
1107fn parse_dec_octet(b: &mut &[u8]) -> Result<(), ParseRIError> {
1108    let len = match b {
1109        [b'2', b'5', b'0'..=b'5', ..] => 3,
1110        [b'2', b'0'..=b'4', b'0'..=b'9', ..] => 3,
1111        [b'1', b'0'..=b'9', b'0'..=b'9', ..] => 3,
1112        [b'1'..=b'9', b'0'..=b'9', ..] => 2,
1113        [b'0'..=b'9', ..] => 1,
1114        _ => return Err(ParseRIError::InvalidDecOctet),
1115    };
1116    *b = &b[len..];
1117    Ok(())
1118}
1119
1120/// # Reference
1121/// [3.2.2.  Host]
1122///
1123/// ```text
1124/// reg-name    = *( unreserved / pct-encoded / sub-delims )
1125/// ```
1126fn parse_reg_name(b: &mut &[u8]) -> Result<(), ParseRIError> {
1127    // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1128    // reg-name      = pchar - (":" | "@")
1129    while !b.is_empty() && !matches!(b[0], b':' | b'@') && parse_pchar(b).is_ok() {}
1130    Ok(())
1131}
1132
1133/// # Reference
1134/// [3.2.3.  Port](https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3)
1135///
1136/// ```text
1137/// port        = *DIGIT
1138/// ```
1139fn parse_port(b: &mut &[u8]) -> Result<(), ParseRIError> {
1140    let pos = b
1141        .iter()
1142        .position(|&b| !b.is_ascii_digit())
1143        .unwrap_or(b.len());
1144    *b = &b[pos..];
1145    Ok(())
1146}
1147
1148/// # Reference
1149/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1150///
1151/// ```text
1152/// path-abempty  = *( "/" segment )
1153/// ```
1154fn parse_path_abempty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1155    while let Some(rem) = b.strip_prefix(b"/") {
1156        *b = rem;
1157        parse_segment(b)?;
1158    }
1159    Ok(())
1160}
1161
1162/// # Reference
1163/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1164///
1165/// ```text
1166/// path-absolute = "/" [ segment-nz *( "/" segment ) ]
1167/// ```
1168fn parse_path_absolute(b: &mut &[u8]) -> Result<(), ParseRIError> {
1169    *b = b
1170        .strip_prefix(b"/")
1171        .ok_or(ParseRIError::InvalidPathAbsolute)?;
1172    if parse_segment_nz(b).is_ok() {
1173        while let Some(rem) = b.strip_prefix(b"/") {
1174            *b = rem;
1175            parse_segment(b)?;
1176        }
1177    }
1178    Ok(())
1179}
1180
1181/// # Reference
1182/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1183///
1184/// ```text
1185/// path-noscheme = segment-nz-nc *( "/" segment )
1186/// ```
1187fn parse_path_noscheme(b: &mut &[u8]) -> Result<(), ParseRIError> {
1188    parse_segment_nz_nc(b)?;
1189    while let Some(rem) = b.strip_prefix(b"/") {
1190        *b = rem;
1191        parse_segment(b)?;
1192    }
1193    Ok(())
1194}
1195
1196/// # Reference
1197/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1198///
1199/// ```text
1200/// path-rootless = segment-nz *( "/" segment )
1201/// ```
1202fn parse_path_rootless(b: &mut &[u8]) -> Result<(), ParseRIError> {
1203    parse_segment_nz(b)?;
1204    while let Some(rem) = b.strip_prefix(b"/") {
1205        *b = rem;
1206        parse_segment(b)?;
1207    }
1208    Ok(())
1209}
1210
// `parse_path_empty` is intentionally left unimplemented: `path-empty` matches only the empty string, so there is nothing to parse.
1212// /// # Reference
1213// /// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1214// ///
1215// /// ```text
1216// /// path-empty    = 0<pchar>
1217// /// ```
1218// fn parse_path_empty(b: &mut &[u8]) -> Result<(), ParseRIError> {
1219//     todo!()
1220// }
1221
1222/// # Reference
1223/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1224///
1225/// ```text
1226/// segment       = *pchar
1227/// ```
1228fn parse_segment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1229    while parse_pchar(b).is_ok() {}
1230    Ok(())
1231}
1232
1233/// # Reference
1234/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1235///
1236/// ```text
1237/// segment-nz    = 1*pchar
1238/// ```
1239fn parse_segment_nz(b: &mut &[u8]) -> Result<(), ParseRIError> {
1240    parse_pchar(b)?;
1241    while parse_pchar(b).is_ok() {}
1242    Ok(())
1243}
1244
1245/// # Reference
1246/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1247///
1248/// ```text
1249/// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
1250///                     ; non-zero-length segment without any colon ":"
1251/// ```
1252fn parse_segment_nz_nc(b: &mut &[u8]) -> Result<(), ParseRIError> {
1253    if b.is_empty() || b[0] == b':' || parse_pchar(b).is_err() {
1254        return Err(ParseRIError::InvalidSegmentNzNc);
1255    }
1256    while !b.is_empty() && b[0] != b':' && parse_pchar(b).is_ok() {}
1257    Ok(())
1258}
1259
1260/// # Reference
1261/// [3.3.  Path](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)
1262///
1263/// ```text
1264/// pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1265/// ```
1266fn parse_pchar(b: &mut &[u8]) -> Result<(), ParseRIError> {
1267    if b.is_empty() {
1268        return Err(ParseRIError::InvalidPChar);
1269    }
1270
1271    if is_unreserved(b[0]) || is_sub_delims(b[0]) || matches!(b[0], b':' | b'@') {
1272        *b = &b[1..];
1273        Ok(())
1274    } else if b.len() >= 3 && b[0] == b'%' && b[1].is_ascii_hexdigit() && b[2].is_ascii_hexdigit() {
1275        *b = &b[3..];
1276        Ok(())
1277    } else {
1278        Err(ParseRIError::InvalidPChar)
1279    }
1280}
1281
1282/// # Reference
1283/// [3.4.  Query](https://datatracker.ietf.org/doc/html/rfc3986#section-3.4)
1284///
1285/// ```text
1286/// query       = *( pchar / "/" / "?" )
1287/// ```
1288fn parse_query(b: &mut &[u8]) -> Result<(), ParseRIError> {
1289    loop {
1290        if let Some(rem) = b.strip_prefix(b"/") {
1291            *b = rem;
1292        } else if let Some(rem) = b.strip_prefix(b"?") {
1293            *b = rem;
1294        } else if parse_pchar(b).is_ok() {
1295            // no op
1296        } else {
1297            break Ok(());
1298        }
1299    }
1300}
1301
1302/// # Reference
1303/// [3.5.  Fragment](https://datatracker.ietf.org/doc/html/rfc3986#section-3.5)
1304///
1305/// ```text
1306/// fragment    = *( pchar / "/" / "?" )
1307/// ```
1308fn parse_fragment(b: &mut &[u8]) -> Result<(), ParseRIError> {
1309    loop {
1310        if let Some(rem) = b.strip_prefix(b"/") {
1311            *b = rem;
1312        } else if let Some(rem) = b.strip_prefix(b"?") {
1313            *b = rem;
1314        } else if parse_pchar(b).is_ok() {
1315            // no op
1316        } else {
1317            break Ok(());
1318        }
1319    }
1320}
1321
1322/// # Reference
1323/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1324///
1325/// ```text
1326/// relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
1327/// ```
1328fn parse_relative_ref(b: &mut &[u8]) -> Result<(), ParseRIError> {
1329    parse_relative_part(b)?;
1330    if let Some(query) = b.strip_prefix(b"?") {
1331        *b = query;
1332        parse_query(b)?;
1333    }
1334    if let Some(fragment) = b.strip_prefix(b"#") {
1335        *b = fragment;
1336        parse_fragment(b)?;
1337    }
1338    Ok(())
1339}
1340
1341/// # Reference
1342/// [4.2.  Relative Reference](https://datatracker.ietf.org/doc/html/rfc3986#section-4.2)
1343///
1344/// ```text
1345/// relative-part = "//" authority path-abempty
1346///               / path-absolute
1347///               / path-noscheme
1348///               / path-empty
1349/// ```
1350fn parse_relative_part(b: &mut &[u8]) -> Result<(), ParseRIError> {
1351    if let Some(rem) = b.strip_prefix(b"/") {
1352        if let Some(rem) = rem.strip_prefix(b"/") {
1353            *b = rem;
1354            parse_authority(b)?;
1355            parse_path_abempty(b)
1356        } else {
1357            parse_path_absolute(b)
1358        }
1359    } else {
1360        let orig = b.len();
1361        let ret = parse_path_noscheme(b);
1362        // If no characters have been consumed, it matches `path-empty` and returns `Ok`.
1363        if orig == b.len() { Ok(()) } else { ret }
1364    }
1365}
1366
1367/// # Reference
1368/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
1369///
1370/// ```text
1371/// reserved    = gen-delims / sub-delims
1372/// ```
1373fn is_reserved(b: u8) -> bool {
1374    is_gen_delims(b) || is_sub_delims(b)
1375}
1376
/// Whether `b` is one of the general delimiters (`gen-delims`).
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// ```
fn is_gen_delims(b: u8) -> bool {
    const GEN_DELIMS: &[u8] = b":/?#[]@";
    GEN_DELIMS.contains(&b)
}
1386
/// Whether `b` is one of the sub-component delimiters (`sub-delims`).
///
/// # Reference
/// [2.2.  Reserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2)
///
/// ```text
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
/// ```
fn is_sub_delims(b: u8) -> bool {
    const SUB_DELIMS: &[u8] = b"!$&'()*+,;=";
    SUB_DELIMS.contains(&b)
}
1399
/// Whether `b` is an `unreserved` character: an ASCII letter or digit, or one of the
/// four listed marks.
///
/// # Reference
/// [2.3.  Unreserved Characters](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3)
///
/// ```text
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
fn is_unreserved(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~'
    )
}
1409
/// Percent-encoding lookup table: the three bytes starting at offset `3 * n` spell
/// `%XX`, where `XX` is `n` in upper-case hexadecimal.
const LUT_BYTES: [u8; 256 * 3] = {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    // Every triplet starts with '%', so pre-fill the table with it and overwrite
    // only the two digit positions.
    let mut table = [b'%'; 256 * 3];
    let mut byte = 0usize;
    while byte < 256 {
        table[3 * byte + 1] = HEX_DIGITS[byte >> 4];
        table[3 * byte + 2] = HEX_DIGITS[byte & 0xF];
        byte += 1;
    }
    table
};
/// String view of [`LUT_BYTES`], for appending to a `String` without re-validation.
const LUT: &str = unsafe {
    // SAFETY: `LUT_BYTES` holds nothing but '%' and ASCII hex digits, all of which are
    // single-byte UTF-8 characters, so the table is valid UTF-8.
    from_utf8_unchecked(&LUT_BYTES)
};
1432
1433/// Return a string in which all characters have been percent-encoded.
1434pub fn escape(s: &str) -> Cow<'_, str> {
1435    escape_except(s, |_| false)
1436}
1437
1438/// Return a byte sequence in which all bytes have been percent-encoded.
1439pub fn escape_bytes(b: &[u8]) -> Cow<'_, [u8]> {
1440    escape_bytes_except(b, |_| false)
1441}
1442
1443/// Return a string in which all characters other than those for which `is_except` returns
1444/// `true` are percent-encoded.
1445pub fn escape_except(s: &str, is_except: impl Fn(char) -> bool) -> Cow<'_, str> {
1446    let cap = s
1447        .chars()
1448        .filter_map(|c| (!is_except(c)).then_some(c.len_utf8() * 2))
1449        .sum::<usize>();
1450    if cap == 0 {
1451        return Cow::Borrowed(s);
1452    }
1453    let mut encode = [0; 6];
1454    let mut buf = String::with_capacity(s.len() + cap);
1455    for c in s.chars() {
1456        if is_except(c) {
1457            buf.push(c);
1458        } else {
1459            let encoded = c.encode_utf8(&mut encode);
1460            for b in encoded.bytes() {
1461                let index = b as usize * 3;
1462                buf.push_str(&LUT[index..index + 3]);
1463            }
1464        }
1465    }
1466    Cow::Owned(buf)
1467}
1468
1469/// Return a byte sequence in which all bytes other than those for which `is_except` returns
1470/// `true` are percent-encoded.
1471pub fn escape_bytes_except(b: &[u8], is_except: impl Fn(u8) -> bool) -> Cow<'_, [u8]> {
1472    let cap = b.iter().copied().filter(|&b| !is_except(b)).count() * 2;
1473    if cap == 0 {
1474        return Cow::Borrowed(b);
1475    }
1476    let mut buf = Vec::with_capacity(b.len() + cap);
1477    for &b in b {
1478        if is_except(b) {
1479            buf.push(b);
1480        } else {
1481            let index = b as usize * 3;
1482            buf.extend_from_slice(&LUT_BYTES[index..index + 3]);
1483        }
1484    }
1485    Cow::Owned(buf)
1486}
1487
/// Reasons why decoding a percent-encoded string can fail.
pub enum URIUnescapeError {
    /// A "%" was not followed by two hexadecimal digits.
    InvalidEscape,
    /// The decoded bytes are not valid UTF-8.
    Utf8Error(std::str::Utf8Error),
}

/// Lets `?` turn a UTF-8 validation failure into a [`URIUnescapeError`].
impl From<std::str::Utf8Error> for URIUnescapeError {
    fn from(err: std::str::Utf8Error) -> Self {
        URIUnescapeError::Utf8Error(err)
    }
}
1499
1500/// Return a string with all percent-encoded characters decoded.
1501///
1502/// Processing assumes that all percent-encoded byte sequences are UTF-8 byte sequences.
1503///
1504/// If a byte sequence that cannot be decoded is encountered, return [`Err`].
1505pub fn unescape(s: &str) -> Result<Cow<'_, str>, URIUnescapeError> {
1506    if !s.contains('%') {
1507        return Ok(Cow::Borrowed(s));
1508    }
1509
1510    let mut split = s.split('%');
1511    let mut buf = String::with_capacity(s.len());
1512    buf.push_str(split.next().unwrap());
1513    let mut bytes = vec![];
1514    for chunk in split {
1515        if chunk.len() < 2 {
1516            return Err(URIUnescapeError::InvalidEscape);
1517        }
1518        let byte =
1519            u8::from_str_radix(&chunk[..2], 16).map_err(|_| URIUnescapeError::InvalidEscape)?;
1520        bytes.push(byte);
1521
1522        if chunk.len() > 2 {
1523            buf.push_str(from_utf8(&bytes)?);
1524            buf.push_str(&chunk[2..]);
1525            bytes.clear();
1526        }
1527    }
1528
1529    if !bytes.is_empty() {
1530        buf.push_str(from_utf8(&bytes)?);
1531    }
1532    Ok(Cow::Owned(buf))
1533}
1534
1535/// Return a byte sequence with all percent-encoded bytes decoded.
1536///
1537/// Even if the decoded byte sequence does not form a valid string in any encoding, it is not
1538/// an error. However, if the byte sequence is not in the correct format for a percent-encoded
1539/// string, return [`Err`].
1540pub fn unescape_bytes(b: &[u8]) -> Result<Cow<'_, [u8]>, URIUnescapeError> {
1541    if !b.contains(&b'%') {
1542        return Ok(Cow::Borrowed(b));
1543    }
1544
1545    let mut split = b.split(|&b| b == b'%');
1546    let mut buf = Vec::with_capacity(b.len());
1547    buf.extend_from_slice(split.next().unwrap());
1548
1549    fn hexdigit_to_byte(hex: u8) -> u8 {
1550        if hex.is_ascii_digit() {
1551            hex - b'0'
1552        } else if hex.is_ascii_uppercase() {
1553            hex - b'A' + 10
1554        } else {
1555            hex - b'a' + 10
1556        }
1557    }
1558    for chunk in split {
1559        if chunk.len() < 2 || !chunk[0].is_ascii_hexdigit() || !chunk[1].is_ascii_hexdigit() {
1560            return Err(URIUnescapeError::InvalidEscape);
1561        }
1562        let hi = hexdigit_to_byte(chunk[0]);
1563        let lo = hexdigit_to_byte(chunk[1]);
1564        buf.push((hi << 4) | lo);
1565    }
1566    Ok(Cow::Owned(buf))
1567}
1568
/// The part of the URI that [`Components`] will look for next.
#[derive(Clone, Copy, Debug)]
enum DecomposeState {
    /// Nothing has been examined yet; a scheme may follow.
    Scheme,
    /// An authority introduced by "//" may follow.
    Authority,
    /// A leading "/" of the path may follow.
    Root,
    /// Path segments follow.
    Path,
    /// A query introduced by "?" may follow.
    Query,
    /// A fragment introduced by "#" may follow.
    Fragment,
    /// Every component has been produced.
    Finish,
}
1579
1580/// Iterator for URI components.
1581pub struct Components<'a> {
1582    state: DecomposeState,
1583    uri: &'a str,
1584}
1585
1586impl Components<'_> {
1587    fn new(uri: &str) -> Components<'_> {
1588        Components {
1589            state: DecomposeState::Scheme,
1590            uri,
1591        }
1592    }
1593}
1594
1595impl<'a> Iterator for Components<'a> {
1596    type Item = Component<'a>;
1597
1598    fn next(&mut self) -> Option<Self::Item> {
1599        use DecomposeState::*;
1600        loop {
1601            match self.state {
1602                Scheme => {
1603                    self.state = Authority;
1604                    let mut bytes = self.uri.as_bytes();
1605                    if parse_scheme(&mut bytes).is_ok() && bytes.starts_with(b":") {
1606                        let len = self.uri.len() - bytes.len();
1607                        let (scheme, rem) = self.uri.split_at(len);
1608                        self.uri = &rem[1..];
1609                        break Some(Component::Scheme(scheme));
1610                    }
1611                }
1612                Authority => {
1613                    self.state = Root;
1614                    if let Some(rem) = self.uri.strip_prefix("//") {
1615                        let pos = rem.bytes().position(|b| b == b'/').unwrap_or(rem.len());
1616                        let (mut authority, rem) = rem.split_at(pos);
1617                        self.uri = rem;
1618                        let mut userinfo = None;
1619                        if let Some((ui, rem)) = authority.split_once('@') {
1620                            userinfo = Some(ui);
1621                            authority = rem;
1622                        }
1623                        let mut port = None;
1624                        if let Some((host, p)) = authority.rsplit_once(':')
1625                            && p.bytes().all(|b| b.is_ascii_digit())
1626                        {
1627                            port = Some(p);
1628                            authority = host;
1629                        }
1630                        break Some(Component::Authority {
1631                            userinfo,
1632                            host: authority,
1633                            port,
1634                        });
1635                    }
1636                }
1637                Root => {
1638                    self.state = Path;
1639                    if let Some(rem) = self.uri.strip_prefix('/') {
1640                        self.uri = rem;
1641                        break Some(Component::RootSegment);
1642                    }
1643                }
1644                Path => {
1645                    let pos = self
1646                        .uri
1647                        .bytes()
1648                        .position(|b| b == b'/' || b == b'?' || b == b'#')
1649                        .unwrap_or(self.uri.len());
1650                    let (segment, rem) = self.uri.split_at(pos);
1651                    if let Some(rem) = rem.strip_prefix('/') {
1652                        self.uri = rem;
1653                    } else {
1654                        self.uri = rem;
1655                        self.state = Query;
1656                    }
1657                    break Some(Component::Segment(segment));
1658                }
1659                Query => {
1660                    self.state = Fragment;
1661                    if let Some(rem) = self.uri.strip_prefix('?') {
1662                        let pos = rem.bytes().position(|b| b == b'#').unwrap_or(rem.len());
1663                        let (query, rem) = rem.split_at(pos);
1664                        self.uri = rem;
1665                        break Some(Component::Query(query));
1666                    }
1667                }
1668                Fragment => {
1669                    debug_assert!(self.uri.is_empty() || self.uri.starts_with('#'));
1670                    self.state = Finish;
1671                    if !self.uri.is_empty() {
1672                        let (_, frag) = self.uri.split_at(1);
1673                        self.uri = "";
1674                        break Some(Component::Fragment(frag));
1675                    }
1676                }
1677                Finish => break None,
1678            }
1679        }
1680    }
1681}
1682
/// One piece of a URI, as produced by [`Components`].
///
/// All string slices borrow from the URI being decomposed.
pub enum Component<'a> {
    /// The scheme, without the trailing ":".
    Scheme(&'a str),
    /// The authority, without the leading "//".
    Authority {
        /// The part before "@", if there is one.
        userinfo: Option<&'a str>,
        /// The host; a bracketed IP literal keeps its brackets.
        host: &'a str,
        /// The digits after the last ":", if that tail consists of digits only.
        /// It may be empty.
        port: Option<&'a str>,
    },
    /// The "/" that starts an absolute path.
    RootSegment,
    /// One path segment, without any "/". It may be empty.
    Segment(&'a str),
    /// The query, without the leading "?".
    Query(&'a str),
    /// The fragment, without the leading "#".
    Fragment(&'a str),
}
anyxml_uri/uri.rs

anyxml_uri/
uri.rs