fluent_uri/
parse.rs

1use crate::{
2    imp::{AuthMeta, Constraints, HostMeta, Meta},
3    pct_enc::{table::*, Table, OCTET_TABLE_LO},
4    utf8,
5};
6use core::{
7    num::NonZeroUsize,
8    ops::{Deref, DerefMut},
9    str,
10};
11
/// The specific reason a [`ParseError`] was raised.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseErrorKind {
    /// A percent-encoded octet is malformed: the "%" is not followed
    /// by two hexadecimal digits (either because the characters are not
    /// hexadecimal or because the input ends too early).
    ///
    /// The error index is that of the percent character "%".
    InvalidPctEncodedOctet,
    /// A character was found that the URI/IRI syntax does not allow
    /// at that position.
    ///
    /// The error index is that of the first byte of the character.
    UnexpectedChar,
    /// An IPv6 address is malformed.
    ///
    /// The error index is that of the first byte of the address.
    InvalidIpv6Addr,
}
28
29/// An error occurred when parsing a URI/IRI (reference).
30#[derive(Clone, Copy, Debug, Eq, PartialEq)]
31pub struct ParseError {
32    pub(crate) index: usize,
33    pub(crate) kind: ParseErrorKind,
34}
35
impl ParseError {
    /// Returns the byte index into the parsed input at which the error occurred.
    ///
    /// What the index points at depends on the cause of the error; see the
    /// documentation of each [`ParseErrorKind`] variant.
    #[must_use]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Returns the detailed cause of the error.
    #[must_use]
    pub fn kind(&self) -> ParseErrorKind {
        self.kind
    }
}
49
// Implements the crate-level `Error` trait for `ParseError`.
// Only compiled when the `impl-error` feature is enabled.
#[cfg(feature = "impl-error")]
impl crate::Error for ParseError {}
52
/// Result type used by the parsing routines in this module;
/// the error is always a [`ParseError`].
type Result<T> = core::result::Result<T, crate::parse::ParseError>;
54
/// Returns immediately with an error.
///
/// `$index` is an expression giving the error index and `$kind` is the
/// identifier of a `ParseErrorKind` variant.
///
/// The expansion is a `return Err(..)` expression, so the macro can only be
/// used inside a function whose error type is `ParseError`. All paths in the
/// expansion are fully qualified, so the call site needs no imports.
macro_rules! err {
    ($index:expr, $kind:ident) => {
        return Err(crate::parse::ParseError {
            index: $index,
            kind: crate::parse::ParseErrorKind::$kind,
        })
    };
}
64
65pub(crate) fn parse(bytes: &[u8], constraints: Constraints) -> Result<Meta> {
66    let mut parser = Parser {
67        constraints,
68        reader: Reader::new(bytes),
69        out: Meta::default(),
70    };
71    parser.parse_from_scheme()?;
72    Ok(parser.out)
73}
74
/// URI/IRI parser.
///
/// # Invariants
///
/// `pos <= len`, `pos` is non-decreasing and on the boundary of a UTF-8 code point.
///
/// # Preconditions and guarantees
///
/// Before parsing, ensure that `pos == 0`, `out` is default initialized
/// and `bytes` is valid UTF-8.
///
/// Start and finish parsing by calling `parse_from_scheme`.
/// The following are guaranteed when parsing succeeds:
///
/// - All output indexes are within bounds, correctly ordered
///   and on the boundary of a UTF-8 code point.
/// - All URI/IRI components defined by output indexes are validated.
struct Parser<'a> {
    // Parsing options; this module reads `ascii_only` and `scheme_required`.
    constraints: Constraints,
    // Cursor over the input bytes. `Parser` dereferences to it, so reader
    // methods and fields (`pos`, `bytes`) are reachable directly on the parser.
    reader: Reader<'a>,
    // Output metadata, filled in as parsing progresses.
    out: Meta,
}
97
// A forward-only cursor over the input bytes.
struct Reader<'a> {
    // The whole input being parsed.
    bytes: &'a [u8],
    // Index of the next unread byte in `bytes`.
    pos: usize,
}
102
// Lets a `Parser` be used wherever a shared `Reader` is expected, so the
// reader's fields and `&self` methods can be accessed directly on the parser.
impl<'a> Deref for Parser<'a> {
    type Target = Reader<'a>;

    fn deref(&self) -> &Self::Target {
        &self.reader
    }
}
110
// Mutable counterpart of the `Deref` impl: exposes the reader's `&mut self`
// methods (`read`, `skip`, ...) directly on the parser.
impl DerefMut for Parser<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.reader
    }
}
116
// Selects how `Parser::parse_from_path` reads and validates the path.
enum PathKind {
    // Path that follows "scheme:" when no "//" comes next; read with the
    // path table and no additional checks.
    General,
    // Path that follows an authority; it must be empty or start with '/'.
    AbEmpty,
    // Path of a reference without a scheme. Any leading characters already
    // consumed while looking for a scheme belong to the path, so the path
    // starts at index 0; its first segment must not contain ':'.
    ContinuedNoScheme,
}
122
// One lexical piece of an IPv6 address, as produced by `Reader::read_v6_segment`.
enum Seg {
    // *1":" 1*4HEXDIG
    //
    // Fields: the value of the hexadecimal group, and whether a ":" preceded it.
    Normal(u16, bool),
    // "::"
    Ellipsis,
    // *1":" 1*4HEXDIG "."
    //
    // Field: whether a ":" preceded the digits. The digits themselves are NOT
    // consumed, so an IPv4 address can be read from the current position.
    MaybeV4(bool),
    // ":"
    //
    // A colon that is followed by neither a hexadecimal digit nor another ":"
    // (or that is the last byte of the input).
    SingleColon,
}
133
134impl<'a> Reader<'a> {
135    fn new(bytes: &'a [u8]) -> Self {
136        Reader { bytes, pos: 0 }
137    }
138
    // Returns the total length of the input in bytes (not the remaining length).
    fn len(&self) -> usize {
        self.bytes.len()
    }
142
143    fn has_remaining(&self) -> bool {
144        self.pos < self.len()
145    }
146
147    fn peek(&self, i: usize) -> Option<u8> {
148        self.bytes.get(self.pos + i).copied()
149    }
150
    // Advances the read position by `n` bytes.
    //
    // Any call to this method must keep the invariants.
    // (That is: `pos <= len` and `pos` stays on a UTF-8 code point boundary.)
    fn skip(&mut self, n: usize) {
        // INVARIANT: `pos` is non-decreasing.
        self.pos += n;
        // The upper bound is only checked in debug builds; callers are
        // responsible for never skipping past the end of the input.
        debug_assert!(self.pos <= self.len());
    }
157
158    // Returns `true` iff any byte is read.
159    fn read(&mut self, table: &Table) -> Result<bool> {
160        let start = self.pos;
161        self._read(table, |_, _| {})?;
162        Ok(self.pos > start)
163    }
164
165    fn _read(&mut self, table: &Table, mut f: impl FnMut(usize, u32)) -> Result<()> {
166        let mut i = self.pos;
167        let allow_pct_encoded = table.allows_pct_encoded();
168        let allow_non_ascii = table.allows_non_ascii();
169
170        while i < self.len() {
171            let x = self.bytes[i];
172            if allow_pct_encoded && x == b'%' {
173                let [hi, lo, ..] = self.bytes[i + 1..] else {
174                    err!(i, InvalidPctEncodedOctet);
175                };
176                if !(HEXDIG.allows_ascii(hi) & HEXDIG.allows_ascii(lo)) {
177                    err!(i, InvalidPctEncodedOctet);
178                }
179                i += 3;
180            } else if allow_non_ascii {
181                let (x, len) = utf8::next_code_point(self.bytes, i);
182                if !table.allows_code_point(x) {
183                    break;
184                }
185                f(i, x);
186                i += len;
187            } else {
188                if !table.allows_ascii(x) {
189                    break;
190                }
191                f(i, x as u32);
192                i += 1;
193            }
194        }
195
196        // INVARIANT: `i` is non-decreasing.
197        self.pos = i;
198        Ok(())
199    }
200
201    fn read_str(&mut self, s: &str) -> bool {
202        if self.bytes[self.pos..].starts_with(s.as_bytes()) {
203            // INVARIANT: The remaining bytes start with `s` so it's fine to skip `s.len()`.
204            self.skip(s.len());
205            true
206        } else {
207            false
208        }
209    }
210
211    fn read_v6(&mut self) -> Option<[u16; 8]> {
212        let mut segs = [0; 8];
213        let mut ellipsis_i = 8;
214
215        let mut i = 0;
216        while i < 8 {
217            match self.read_v6_segment() {
218                Some(Seg::Normal(seg, colon)) => {
219                    if colon == (i == 0 || i == ellipsis_i) {
220                        // Leading colon, triple colons, or no colon.
221                        return None;
222                    }
223                    segs[i] = seg;
224                    i += 1;
225                }
226                Some(Seg::Ellipsis) => {
227                    if ellipsis_i != 8 {
228                        // Multiple ellipses.
229                        return None;
230                    }
231                    ellipsis_i = i;
232                }
233                Some(Seg::MaybeV4(colon)) => {
234                    if i > 6 || colon == (i == ellipsis_i) {
235                        // Not enough space, triple colons, or no colon.
236                        return None;
237                    }
238                    let octets = self.read_v4()?.to_be_bytes();
239                    segs[i] = u16::from_be_bytes([octets[0], octets[1]]);
240                    segs[i + 1] = u16::from_be_bytes([octets[2], octets[3]]);
241                    i += 2;
242                    break;
243                }
244                Some(Seg::SingleColon) => return None,
245                None => break,
246            }
247        }
248
249        if ellipsis_i == 8 {
250            // No ellipsis.
251            if i != 8 {
252                // Too short.
253                return None;
254            }
255        } else if i == 8 {
256            // Eliding nothing.
257            return None;
258        } else {
259            // Shift the segments after the ellipsis to the right.
260            for j in (ellipsis_i..i).rev() {
261                segs[8 - (i - j)] = segs[j];
262                segs[j] = 0;
263            }
264        }
265
266        Some(segs)
267    }
268
269    fn read_v6_segment(&mut self) -> Option<Seg> {
270        let colon = self.read_str(":");
271        if !self.has_remaining() {
272            return colon.then_some(Seg::SingleColon);
273        }
274
275        let first = self.peek(0).unwrap();
276        let mut x = match OCTET_TABLE_LO[first as usize] {
277            v if v < 128 => v as u16,
278            _ => {
279                return colon.then(|| {
280                    if first == b':' {
281                        // INVARIANT: Skipping ":" is fine.
282                        self.skip(1);
283                        Seg::Ellipsis
284                    } else {
285                        Seg::SingleColon
286                    }
287                });
288            }
289        };
290        let mut i = 1;
291
292        while i < 4 {
293            let Some(b) = self.peek(i) else {
294                // INVARIANT: Skipping `i` hexadecimal digits is fine.
295                self.skip(i);
296                return None;
297            };
298            match OCTET_TABLE_LO[b as usize] {
299                v if v < 128 => {
300                    x = (x << 4) | v as u16;
301                    i += 1;
302                    continue;
303                }
304                _ if b == b'.' => return Some(Seg::MaybeV4(colon)),
305                _ => break,
306            }
307        }
308        // INVARIANT: Skipping `i` hexadecimal digits is fine.
309        self.skip(i);
310        Some(Seg::Normal(x, colon))
311    }
312
313    fn read_v4(&mut self) -> Option<u32> {
314        let mut addr = self.read_v4_octet()? << 24;
315        for i in (0..3).rev() {
316            if !self.read_str(".") {
317                return None;
318            }
319            addr |= self.read_v4_octet()? << (i * 8);
320        }
321        Some(addr)
322    }
323
324    fn read_v4_octet(&mut self) -> Option<u32> {
325        let mut res = self.peek_digit(0)?;
326        if res == 0 {
327            // INVARIANT: Skipping "0" is fine.
328            self.skip(1);
329            return Some(0);
330        }
331
332        for i in 1..3 {
333            let Some(x) = self.peek_digit(i) else {
334                // INVARIANT: Skipping `i` digits is fine.
335                self.skip(i);
336                return Some(res);
337            };
338            res = res * 10 + x;
339        }
340        // INVARIANT: Skipping 3 digits is fine.
341        self.skip(3);
342
343        u8::try_from(res).is_ok().then_some(res)
344    }
345
346    fn peek_digit(&self, i: usize) -> Option<u32> {
347        self.peek(i).and_then(|x| (x as char).to_digit(10))
348    }
349
350    fn read_port(&mut self) {
351        if self.read_str(":") {
352            let mut i = 0;
353            while self.peek_digit(i).is_some() {
354                i += 1;
355            }
356            // INVARIANT: Skipping `i` digits is fine.
357            self.skip(i);
358        }
359    }
360
361    fn read_ip_literal(&mut self) -> Result<Option<HostMeta>> {
362        if !self.read_str("[") {
363            return Ok(None);
364        }
365
366        let start = self.pos;
367
368        let meta = if let Some(_addr) = self.read_v6() {
369            HostMeta::Ipv6(
370                #[cfg(feature = "net")]
371                _addr.into(),
372            )
373        } else if self.pos == start {
374            self.read_ipv_future()?;
375            HostMeta::IpvFuture
376        } else {
377            err!(start, InvalidIpv6Addr);
378        };
379
380        if !self.read_str("]") {
381            err!(self.pos, UnexpectedChar);
382        }
383        Ok(Some(meta))
384    }
385
386    fn read_ipv_future(&mut self) -> Result<()> {
387        if let Some(b'v' | b'V') = self.peek(0) {
388            // INVARIANT: Skipping "v" or "V" is fine.
389            self.skip(1);
390            if self.read(HEXDIG)? && self.read_str(".") && self.read(IPV_FUTURE)? {
391                return Ok(());
392            }
393        }
394        err!(self.pos, UnexpectedChar);
395    }
396}
397
398pub(crate) fn parse_v4_or_reg_name(bytes: &[u8]) -> HostMeta {
399    let mut reader = Reader::new(bytes);
400    match reader.read_v4() {
401        Some(_addr) if !reader.has_remaining() => HostMeta::Ipv4(
402            #[cfg(feature = "net")]
403            _addr.into(),
404        ),
405        _ => HostMeta::RegName,
406    }
407}
408
/// Parses `bytes` as an IPv6 address and returns its eight 16-bit segments.
///
/// Only compiled when the `alloc` feature is enabled and `net` is not.
///
/// # Panics
///
/// Panics if `Reader::read_v6` rejects the input.
///
/// NOTE(review): `Reader::read_v6_segment` returns `None` when the input ends
/// right after a segment of fewer than four hexadecimal digits, so such a
/// final segment is dropped unless `bytes` continues past the address
/// (e.g. includes the closing "]"). Presumably callers pass such a slice;
/// confirm against the call sites.
#[cfg(all(feature = "alloc", not(feature = "net")))]
pub(crate) fn parse_v6(bytes: &[u8]) -> [u16; 8] {
    Reader::new(bytes).read_v6().unwrap()
}
413
414impl Parser<'_> {
415    fn select<T>(&self, for_uri: T, for_iri: T) -> T {
416        if self.constraints.ascii_only {
417            for_uri
418        } else {
419            for_iri
420        }
421    }
422
423    fn read_v4_or_reg_name(&mut self) -> Result<HostMeta> {
424        let reg_name_table = self.select(REG_NAME, IREG_NAME);
425        Ok(match (self.read_v4(), self.read(reg_name_table)?) {
426            (Some(_addr), false) => HostMeta::Ipv4(
427                #[cfg(feature = "net")]
428                _addr.into(),
429            ),
430            _ => HostMeta::RegName,
431        })
432    }
433
434    fn read_host(&mut self) -> Result<HostMeta> {
435        match self.read_ip_literal()? {
436            Some(host) => Ok(host),
437            None => self.read_v4_or_reg_name(),
438        }
439    }
440
441    fn parse_from_scheme(&mut self) -> Result<()> {
442        self.read(SCHEME)?;
443
444        if self.peek(0) == Some(b':') {
445            // Scheme starts with a letter.
446            if self.pos > 0 && self.bytes[0].is_ascii_alphabetic() {
447                self.out.scheme_end = NonZeroUsize::new(self.pos);
448            } else {
449                err!(0, UnexpectedChar);
450            }
451
452            // INVARIANT: Skipping ":" is fine.
453            self.skip(1);
454            return if self.read_str("//") {
455                self.parse_from_authority()
456            } else {
457                self.parse_from_path(PathKind::General)
458            };
459        } else if self.constraints.scheme_required {
460            err!(self.pos, UnexpectedChar);
461        } else if self.pos == 0 {
462            // Nothing read.
463            if self.read_str("//") {
464                return self.parse_from_authority();
465            }
466        }
467        // Scheme chars are valid for path.
468        self.parse_from_path(PathKind::ContinuedNoScheme)
469    }
470
471    fn parse_from_authority(&mut self) -> Result<()> {
472        let host;
473
474        let mut colon_cnt = 0;
475        let mut colon_i = 0;
476
477        let auth_start = self.pos;
478
479        let userinfo_table = self.select(USERINFO, IUSERINFO);
480        // `userinfo_table` contains userinfo, registered name, ':', and port.
481        self._read(userinfo_table, |i, x| {
482            if x == ':' as u32 {
483                colon_cnt += 1;
484                colon_i = i;
485            }
486        })?;
487
488        if self.peek(0) == Some(b'@') {
489            // Userinfo present.
490            // INVARIANT: Skipping "@" is fine.
491            self.skip(1);
492
493            let host_start = self.pos;
494            let meta = self.read_host()?;
495            host = (host_start, self.pos, meta);
496
497            self.read_port();
498        } else if self.pos == auth_start {
499            // Nothing read. We're now at the start of an IP literal or the path.
500            if let Some(meta) = self.read_ip_literal()? {
501                host = (auth_start, self.pos, meta);
502                self.read_port();
503            } else {
504                // Empty authority.
505                host = (self.pos, self.pos, HostMeta::RegName);
506            }
507        } else {
508            // The whole authority read. Try to parse the host and port.
509            let host_end = match colon_cnt {
510                // All host.
511                0 => self.pos,
512                // Host and port.
513                1 => {
514                    for i in colon_i + 1..self.pos {
515                        if !self.bytes[i].is_ascii_digit() {
516                            err!(i, UnexpectedChar);
517                        }
518                    }
519                    colon_i
520                }
521                // Multiple colons.
522                _ => err!(colon_i, UnexpectedChar),
523            };
524
525            let meta = parse_v4_or_reg_name(&self.bytes[auth_start..host_end]);
526            host = (auth_start, host_end, meta);
527        }
528
529        self.out.auth_meta = Some(AuthMeta {
530            host_bounds: (host.0, host.1),
531            host_meta: host.2,
532        });
533        self.parse_from_path(PathKind::AbEmpty)
534    }
535
536    fn parse_from_path(&mut self, kind: PathKind) -> Result<()> {
537        let path_table = self.select(PATH, IPATH);
538        self.out.path_bounds = match kind {
539            PathKind::General => {
540                let start = self.pos;
541                self.read(path_table)?;
542                (start, self.pos)
543            }
544            PathKind::AbEmpty => {
545                let start = self.pos;
546                // Either empty or starting with '/'.
547                if self.read(path_table)? && self.bytes[start] != b'/' {
548                    err!(start, UnexpectedChar);
549                }
550                (start, self.pos)
551            }
552            PathKind::ContinuedNoScheme => {
553                let segment_table = self.select(SEGMENT_NZ_NC, ISEGMENT_NZ_NC);
554                self.read(segment_table)?;
555
556                if self.peek(0) == Some(b':') {
557                    // In a relative reference, the first path
558                    // segment cannot contain a colon character.
559                    err!(self.pos, UnexpectedChar);
560                }
561
562                self.read(path_table)?;
563                (0, self.pos)
564            }
565        };
566
567        if self.read_str("?") {
568            let query_table = self.select(QUERY, IQUERY);
569            self.read(query_table)?;
570            self.out.query_end = NonZeroUsize::new(self.pos);
571        }
572
573        if self.read_str("#") {
574            let fragment_table = self.select(FRAGMENT, IFRAGMENT);
575            self.read(fragment_table)?;
576        }
577
578        if self.has_remaining() {
579            err!(self.pos, UnexpectedChar);
580        }
581        Ok(())
582    }
583}