url/
parser.rs

1// Copyright 2013-2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use std::error::Error;
10use std::fmt::{self, Formatter, Write};
11use std::str;
12
13use crate::host::{Host, HostInternal};
14use crate::Url;
15use form_urlencoded::EncodingOverride;
16use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17
/// Bytes percent-encoded in a fragment: everything in `CONTROLS` plus
/// space, `"`, `<`, `>` and `` ` ``.
///
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Bytes percent-encoded in a path: the `FRAGMENT` set plus `#`, `?`, `{` and `}`.
///
/// https://url.spec.whatwg.org/#path-percent-encode-set
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Bytes percent-encoded in the username/password: the `PATH` set plus
/// `/`, `:`, `;`, `=`, `@`, `[`, `\`, `]`, `^` and `|`.
///
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// Bytes percent-encoded in a single path segment: the `PATH` set plus `/`
/// and `%`, so that neither a separator nor a literal percent sign survives
/// unescaped inside one segment.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

// The backslash (\) character is treated as a path separator in special URLs
// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

// Bytes percent-encoded in a query string: `CONTROLS` plus space, `"`, `#`,
// `<` and `>`. `SPECIAL_QUERY` additionally escapes the single quote (`'`).
// https://url.spec.whatwg.org/#query-state
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');
46
/// Shorthand for a `Result` whose error type is [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48
// Generates the `ParseError` enum from a list of `Variant => "message",`
// pairs: one field-less variant per entry, plus a `Display` impl that writes
// the paired message for each variant.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// This enum is marked `#[non_exhaustive]`: more variants may be added
        /// in the future, so matches on it need a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            // Writes the static description paired with the variant.
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                match *self {
                    $(
                        ParseError::$name => fmt.write_str($description),
                    )+
                }
            }
        }
    }
}
74
// `ParseError` carries no source error; the default `Error` methods suffice.
impl Error for ParseError {}

// Defines `ParseError`: each line is a variant and the message its `Display`
// impl writes.
simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidScionAddress => "invalid Scion address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    Overflow => "URLs more than 4 GB are not supported",
}
90
91impl From<::idna::Errors> for ParseError {
92    fn from(_: ::idna::Errors) -> ParseError {
93        ParseError::IdnaError
94    }
95}
96
// Generates the `SyntaxViolation` enum from a list of `Variant => "message",`
// pairs: one field-less variant per entry, plus a `description()` method that
// returns the paired message.
macro_rules! syntax_violation_enum {
    ($($name: ident => $description: expr,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This enum is marked `#[non_exhaustive]`: more variants may be added
        /// in the future, so matches on it need a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                $name,
            )+
        }

        impl SyntaxViolation {
            /// Returns the static, human-readable description of this violation.
            pub fn description(&self) -> &'static str {
                match *self {
                    $(
                        SyntaxViolation::$name => $description,
                    )+
                }
            }
        }
    }
}
122
// Defines `SyntaxViolation`: each entry is a variant and the text returned by
// its `description()` method.
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
139
140impl fmt::Display for SyntaxViolation {
141    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
142        fmt::Display::fmt(self.description(), f)
143    }
144}
145
/// Coarse classification of a URL scheme, as far as the parser cares.
///
/// Schemes are matched exactly (case-sensitively), so callers are expected to
/// pass an already-lowercased scheme.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// One of `http`, `https`, `ws`, `wss` or `ftp`.
    SpecialNotFile,
    /// Any other scheme.
    NotSpecial,
}

impl SchemeType {
    /// Returns `true` for every variant except [`SchemeType::NotSpecial`]
    /// (that is, for `file` and the other special schemes).
    pub fn is_special(&self) -> bool {
        !matches!(*self, SchemeType::NotSpecial)
    }

    /// Returns `true` only for [`SchemeType::File`].
    pub fn is_file(&self) -> bool {
        matches!(*self, SchemeType::File)
    }
}

/// Classifies a scheme given as anything string-like (`&str`, `String`,
/// `&String`, ...). The scheme must not include the trailing `:`.
impl<T: AsRef<str>> From<T> for SchemeType {
    fn from(s: T) -> Self {
        match s.as_ref() {
            "http" | "https" | "ws" | "wss" | "ftp" => SchemeType::SpecialNotFile,
            "file" => SchemeType::File,
            _ => SchemeType::NotSpecial,
        }
    }
}
172
/// Returns the default port for `scheme`, or `None` when the scheme has no
/// default port known to this function.
///
/// The scheme is matched exactly (case-sensitively): `http`/`ws` map to 80,
/// `https`/`wss` to 443 and `ftp` to 21.
pub fn default_port(scheme: &str) -> Option<u16> {
    let port = match scheme {
        "http" | "ws" => 80,
        "https" | "wss" => 443,
        "ftp" => 21,
        _ => return None,
    };
    Some(port)
}
181
182#[derive(Clone, Debug)]
183pub struct Input<'i> {
184    chars: str::Chars<'i>,
185}
186
187impl<'i> Input<'i> {
188    pub fn new_no_trim(input: &'i str) -> Self {
189        Input {
190            chars: input.chars(),
191        }
192    }
193
194    pub fn new_trim_tab_and_newlines(
195        original_input: &'i str,
196        vfn: Option<&dyn Fn(SyntaxViolation)>,
197    ) -> Self {
198        let input = original_input.trim_matches(ascii_tab_or_new_line);
199        if let Some(vfn) = vfn {
200            if input.len() < original_input.len() {
201                vfn(SyntaxViolation::C0SpaceIgnored)
202            }
203            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
204                vfn(SyntaxViolation::TabOrNewlineIgnored)
205            }
206        }
207        Input {
208            chars: input.chars(),
209        }
210    }
211
212    pub fn new_trim_c0_control_and_space(
213        original_input: &'i str,
214        vfn: Option<&dyn Fn(SyntaxViolation)>,
215    ) -> Self {
216        let input = original_input.trim_matches(c0_control_or_space);
217        if let Some(vfn) = vfn {
218            if input.len() < original_input.len() {
219                vfn(SyntaxViolation::C0SpaceIgnored)
220            }
221            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
222                vfn(SyntaxViolation::TabOrNewlineIgnored)
223            }
224        }
225        Input {
226            chars: input.chars(),
227        }
228    }
229
230    #[inline]
231    pub fn is_empty(&self) -> bool {
232        self.clone().next().is_none()
233    }
234
235    #[inline]
236    fn starts_with<P: Pattern>(&self, p: P) -> bool {
237        p.split_prefix(&mut self.clone())
238    }
239
240    #[inline]
241    pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
242        let mut remaining = self.clone();
243        if p.split_prefix(&mut remaining) {
244            Some(remaining)
245        } else {
246            None
247        }
248    }
249
250    #[inline]
251    fn split_first(&self) -> (Option<char>, Self) {
252        let mut remaining = self.clone();
253        (remaining.next(), remaining)
254    }
255
256    #[inline]
257    fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
258        let mut count = 0;
259        let mut remaining = self.clone();
260        loop {
261            let mut input = remaining.clone();
262            if matches!(input.next(), Some(c) if f(c)) {
263                remaining = input;
264                count += 1;
265            } else {
266                return (count, remaining);
267            }
268        }
269    }
270
271    #[inline]
272    fn next_utf8(&mut self) -> Option<(char, &'i str)> {
273        loop {
274            let utf8 = self.chars.as_str();
275            match self.chars.next() {
276                Some(c) => {
277                    if !matches!(c, '\t' | '\n' | '\r') {
278                        return Some((c, &utf8[..c.len_utf8()]));
279                    }
280                }
281                None => return None,
282            }
283        }
284    }
285}
286
287pub trait Pattern {
288    fn split_prefix(self, input: &mut Input) -> bool;
289}
290
291impl Pattern for char {
292    fn split_prefix(self, input: &mut Input) -> bool {
293        input.next() == Some(self)
294    }
295}
296
297impl<'a> Pattern for &'a str {
298    fn split_prefix(self, input: &mut Input) -> bool {
299        for c in self.chars() {
300            if input.next() != Some(c) {
301                return false;
302            }
303        }
304        true
305    }
306}
307
308impl<F: FnMut(char) -> bool> Pattern for F {
309    fn split_prefix(self, input: &mut Input) -> bool {
310        input.next().map_or(false, self)
311    }
312}
313
314impl<'i> Iterator for Input<'i> {
315    type Item = char;
316    fn next(&mut self) -> Option<char> {
317        self.chars
318            .by_ref()
319            .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
320    }
321}
322
/// State carried through one parse: the output buffer plus the options that
/// influence parsing.
pub struct Parser<'a> {
    /// Buffer the parsing methods append the serialized URL to.
    pub serialization: String,
    /// Base URL used to resolve input that has no scheme of its own, if any.
    pub base_url: Option<&'a Url>,
    /// Optional encoding override; presumably applied when encoding the
    /// query string (its use is outside this part of the file).
    pub query_encoding_override: EncodingOverride<'a>,
    /// Callback that receives non-fatal syntax violations; `None` disables
    /// violation reporting.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// Which kind of caller is driving the parser (see [`Context`]).
    pub context: Context,
}
330
/// Identifies who is driving the [`Parser`], so shared parsing routines can
/// adjust their behaviour (for example, `parse_scheme` accepts input without a
/// trailing `:` only in the `Setter` context).
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum Context {
    /// Parsing a complete URL string.
    UrlParser,
    /// Parsing a value on behalf of a setter (as built by `Parser::for_setter`).
    Setter,
    /// Parsing on behalf of a path-segment setter.
    PathSegmentSetter,
}
337
338impl<'a> Parser<'a> {
339    fn log_violation(&self, v: SyntaxViolation) {
340        if let Some(f) = self.violation_fn {
341            f(v)
342        }
343    }
344
345    fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
346        if let Some(f) = self.violation_fn {
347            if test() {
348                f(v)
349            }
350        }
351    }
352
353    pub fn for_setter(serialization: String) -> Parser<'a> {
354        Parser {
355            serialization,
356            base_url: None,
357            query_encoding_override: None,
358            violation_fn: None,
359            context: Context::Setter,
360        }
361    }
362
363    /// https://url.spec.whatwg.org/#concept-basic-url-parser
364    pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
365        let input = Input::new_trim_c0_control_and_space(input, self.violation_fn);
366        if let Ok(remaining) = self.parse_scheme(input.clone()) {
367            return self.parse_with_scheme(remaining);
368        }
369
370        // No-scheme state
371        if let Some(base_url) = self.base_url {
372            if input.starts_with('#') {
373                self.fragment_only(base_url, input)
374            } else if base_url.cannot_be_a_base() {
375                Err(ParseError::RelativeUrlWithCannotBeABaseBase)
376            } else {
377                let scheme_type = SchemeType::from(base_url.scheme());
378                if scheme_type.is_file() {
379                    self.parse_file(input, scheme_type, Some(base_url))
380                } else {
381                    self.parse_relative(input, scheme_type, base_url)
382                }
383            }
384        } else {
385            Err(ParseError::RelativeUrlWithoutBase)
386        }
387    }
388
389    pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
390        if input.is_empty() || !input.starts_with(ascii_alpha) {
391            return Err(());
392        }
393        debug_assert!(self.serialization.is_empty());
394        while let Some(c) = input.next() {
395            match c {
396                'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
397                    self.serialization.push(c.to_ascii_lowercase())
398                }
399                ':' => return Ok(input),
400                _ => {
401                    self.serialization.clear();
402                    return Err(());
403                }
404            }
405        }
406        // EOF before ':'
407        if self.context == Context::Setter {
408            Ok(input)
409        } else {
410            self.serialization.clear();
411            Err(())
412        }
413    }
414
415    fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
416        use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
417        let scheme_end = to_u32(self.serialization.len())?;
418        let scheme_type = SchemeType::from(&self.serialization);
419        self.serialization.push(':');
420        match scheme_type {
421            SchemeType::File => {
422                self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
423                let base_file_url = self.base_url.and_then(|base| {
424                    if base.scheme() == "file" {
425                        Some(base)
426                    } else {
427                        None
428                    }
429                });
430                self.serialization.clear();
431                self.parse_file(input, scheme_type, base_file_url)
432            }
433            SchemeType::SpecialNotFile => {
434                // special relative or authority state
435                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
436                if let Some(base_url) = self.base_url {
437                    if slashes_count < 2
438                        && base_url.scheme() == &self.serialization[..scheme_end as usize]
439                    {
440                        // "Cannot-be-a-base" URLs only happen with "not special" schemes.
441                        debug_assert!(!base_url.cannot_be_a_base());
442                        self.serialization.clear();
443                        return self.parse_relative(input, scheme_type, base_url);
444                    }
445                }
446                // special authority slashes state
447                self.log_violation_if(ExpectedDoubleSlash, || {
448                    input
449                        .clone()
450                        .take_while(|&c| matches!(c, '/' | '\\'))
451                        .collect::<String>()
452                        != "//"
453                });
454                self.after_double_slash(remaining, scheme_type, scheme_end)
455            }
456            SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
457        }
458    }
459
460    /// Scheme other than file, http, https, ws, ws, ftp.
461    fn parse_non_special(
462        mut self,
463        input: Input<'_>,
464        scheme_type: SchemeType,
465        scheme_end: u32,
466    ) -> ParseResult<Url> {
467        // path or authority state (
468        if let Some(input) = input.split_prefix("//") {
469            return self.after_double_slash(input, scheme_type, scheme_end);
470        }
471        // Anarchist URL (no authority)
472        let path_start = to_u32(self.serialization.len())?;
473        let username_end = path_start;
474        let host_start = path_start;
475        let host_end = path_start;
476        let host = HostInternal::None;
477        let port = None;
478        let remaining = if let Some(input) = input.split_prefix('/') {
479            self.serialization.push('/');
480            self.parse_path(scheme_type, &mut false, path_start as usize, input)
481        } else {
482            self.parse_cannot_be_a_base_path(input)
483        };
484        self.with_query_and_fragment(
485            scheme_type,
486            scheme_end,
487            username_end,
488            host_start,
489            host_end,
490            host,
491            port,
492            path_start,
493            remaining,
494        )
495    }
496
    /// Parses a `file:` URL (or input resolved against a `file:` base).
    ///
    /// Expects `self.serialization` to be empty; the `file:` prefix is written
    /// here. `base_file_url`, when present, is the base URL to resolve against.
    ///
    /// Overview of the branches:
    /// - two leading slashes/backslashes: a host follows (`file://host/...`);
    /// - one leading slash/backslash: an absolute path, possibly inheriting
    ///   the base's drive letter or host;
    /// - otherwise, with a base: empty input, `?query`, `#fragment` or a
    ///   relative path are resolved against the base;
    /// - otherwise, without a base: the input is parsed as a path under
    ///   `file:///`.
    fn parse_file(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_file_url: Option<&Url>,
    ) -> ParseResult<Url> {
        use crate::SyntaxViolation::Backslash;
        // file state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        if matches!(first_char, Some('/') | Some('\\')) {
            self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
            // file slash state
            let (next_char, input_after_next_char) = input_after_first_char.split_first();
            if matches!(next_char, Some('/') | Some('\\')) {
                self.log_violation_if(Backslash, || next_char == Some('\\'));
                // file host state
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len() as u32;
                // NOTE(review): despite its name, `path_start` here is a flag
                // returned by `parse_file_host` that selects between the two
                // path-parsing calls below — confirm its exact meaning there.
                let (path_start, mut host, remaining) =
                    self.parse_file_host(input_after_next_char)?;
                let mut host_end = to_u32(self.serialization.len())?;
                // Passed by `&mut` to the path parser, which can evidently
                // reset it (see the `!has_host` check below).
                let mut has_host = !matches!(host, HostInternal::None);
                let remaining = if path_start {
                    self.parse_path_start(SchemeType::File, &mut has_host, remaining)
                } else {
                    let path_start = self.serialization.len();
                    self.serialization.push('/');
                    self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
                };

                // For file URLs that have a host and whose path starts
                // with the windows drive letter we just remove the host.
                if !has_host {
                    self.serialization
                        .drain(host_start as usize..host_end as usize);
                    host_end = host_start;
                    host = HostInternal::None;
                }
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
                // No userinfo or port in a file URL: the username ends where
                // the host starts and the path starts where the host ends.
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            } else {
                // Exactly one leading slash: an absolute path with no host in
                // the input itself.
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len();
                let mut host_end = host_start;
                let mut host = HostInternal::None;
                // Unless the input brings its own drive letter, inherit the
                // base's drive letter if it has one, else the base's host.
                if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                    if let Some(base_url) = base_file_url {
                        let first_segment = base_url.path_segments().unwrap().next().unwrap();
                        if is_normalized_windows_drive_letter(first_segment) {
                            self.serialization.push('/');
                            self.serialization.push_str(first_segment);
                        } else if let Some(host_str) = base_url.host_str() {
                            self.serialization.push_str(host_str);
                            host_end = self.serialization.len();
                            host = base_url.host;
                        }
                    }
                }
                // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
                // (In this branch `first_char` is always `/` or `\`, so the
                // path parser receives `input` including that first slash.)
                let parse_path_input = if let Some(c) = first_char {
                    if c == '/' || c == '\\' || c == '?' || c == '#' {
                        input
                    } else {
                        input_after_first_char
                    }
                } else {
                    input_after_first_char
                };

                let remaining =
                    self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

                let host_start = host_start as u32;

                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

                let host_end = host_end as u32;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            }
        }
        // From here on the input does not start with a slash or backslash.
        if let Some(base_url) = base_file_url {
            match first_char {
                None => {
                    // Copy everything except the fragment
                    let before_fragment = match base_url.fragment_start {
                        Some(i) => &base_url.serialization[..i as usize],
                        None => &*base_url.serialization,
                    };
                    self.serialization.push_str(before_fragment);
                    // All offsets are inherited from the base; only the
                    // fragment is dropped.
                    Ok(Url {
                        serialization: self.serialization,
                        fragment_start: None,
                        ..*base_url
                    })
                }
                Some('?') => {
                    // Copy everything up to the query string
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    // `input` still begins with the '?', so the query parser
                    // sees the whole "?query#fragment" tail.
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                    Ok(Url {
                        serialization: self.serialization,
                        query_start,
                        fragment_start,
                        ..*base_url
                    })
                }
                Some('#') => self.fragment_only(base_url, input),
                _ => {
                    if !starts_with_windows_drive_letter_segment(&input) {
                        // Relative path: start from the base up to its query,
                        // shorten the base path, then parse the input as a
                        // path on top of what is left.
                        let before_query = match (base_url.query_start, base_url.fragment_start) {
                            (None, None) => &*base_url.serialization,
                            (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                        };
                        self.serialization.push_str(before_query);
                        self.shorten_path(SchemeType::File, base_url.path_start as usize);
                        let remaining = self.parse_path(
                            SchemeType::File,
                            &mut true,
                            base_url.path_start as usize,
                            input,
                        );
                        self.with_query_and_fragment(
                            SchemeType::File,
                            base_url.scheme_end,
                            base_url.username_end,
                            base_url.host_start,
                            base_url.host_end,
                            base_url.host,
                            base_url.port,
                            base_url.path_start,
                            remaining,
                        )
                    } else {
                        // The input starts with a Windows drive letter: the
                        // base is ignored and the input becomes the whole
                        // path of a host-less file URL.
                        self.serialization.push_str("file:///");
                        let scheme_end = "file".len() as u32;
                        let path_start = "file://".len();
                        let remaining =
                            self.parse_path(SchemeType::File, &mut false, path_start, input);
                        let (query_start, fragment_start) =
                            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                        let path_start = path_start as u32;
                        Ok(Url {
                            serialization: self.serialization,
                            scheme_end,
                            username_end: path_start,
                            host_start: path_start,
                            host_end: path_start,
                            host: HostInternal::None,
                            port: None,
                            path_start,
                            query_start,
                            fragment_start,
                        })
                    }
                }
            }
        } else {
            // No usable base: the input is the path of a host-less file URL.
            // The path starts at the third slash of "file:///".
            self.serialization.push_str("file:///");
            let scheme_end = "file".len() as u32;
            let path_start = "file://".len();
            let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
            let path_start = path_start as u32;
            Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: path_start,
                host_start: path_start,
                host_end: path_start,
                host: HostInternal::None,
                port: None,
                path_start,
                query_start,
                fragment_start,
            })
        }
    }
707
    /// Resolves scheme-less `input` against `base_url` (non-`file` case).
    ///
    /// Expects `self.serialization` to be empty. Depending on the first
    /// character of the input, the result reuses progressively less of the
    /// base: empty input keeps everything but the fragment, `?` keeps
    /// everything up to the query, `#` replaces only the fragment, one leading
    /// slash replaces the path, two or more leading slashes replace the
    /// authority as well, and anything else is a path relative to the base's
    /// path.
    fn parse_relative(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_url: &Url,
    ) -> ParseResult<Url> {
        // relative state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        match first_char {
            None => {
                // Copy everything except the fragment
                let before_fragment = match base_url.fragment_start {
                    Some(i) => &base_url.serialization[..i as usize],
                    None => &*base_url.serialization,
                };
                self.serialization.push_str(before_fragment);
                // All offsets are inherited from the base; only the fragment
                // is dropped.
                Ok(Url {
                    serialization: self.serialization,
                    fragment_start: None,
                    ..*base_url
                })
            }
            Some('?') => {
                // Copy everything up to the query string
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                // `input` still begins with the '?', so the query parser sees
                // the whole "?query#fragment" tail.
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                Ok(Url {
                    serialization: self.serialization,
                    query_start,
                    fragment_start,
                    ..*base_url
                })
            }
            Some('#') => self.fragment_only(base_url, input),
            Some('/') | Some('\\') => {
                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
                if slashes_count >= 2 {
                    // Scheme-relative input: keep only the base's scheme and
                    // parse a fresh authority.
                    self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
                        input
                            .clone()
                            .take_while(|&c| matches!(c, '/' | '\\'))
                            .collect::<String>()
                            != "//"
                    });
                    let scheme_end = base_url.scheme_end;
                    debug_assert!(base_url.byte_at(scheme_end) == b':');
                    self.serialization
                        .push_str(base_url.slice(..scheme_end + 1));
                    // If the input starts with a literal "//", consume exactly
                    // those two characters; otherwise (backslashes involved)
                    // skip the whole run of leading slashes/backslashes.
                    if let Some(after_prefix) = input.split_prefix("//") {
                        return self.after_double_slash(after_prefix, scheme_type, scheme_end);
                    }
                    return self.after_double_slash(remaining, scheme_type, scheme_end);
                }
                // Single leading slash: keep the base up to its path and
                // parse the input as a new absolute path.
                let path_start = base_url.path_start;
                self.serialization.push_str(base_url.slice(..path_start));
                self.serialization.push('/');
                let remaining = self.parse_path(
                    scheme_type,
                    &mut true,
                    path_start as usize,
                    input_after_first_char,
                );
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
            _ => {
                // Path relative to the base's path: start from the base up to
                // its query, drop the last path segment, then append.
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                // FIXME spec says just "remove last entry", not the "pop" algorithm
                self.pop_path(scheme_type, base_url.path_start as usize);
                // A special url always has a path.
                // A path always starts with '/'
                if self.serialization.len() == base_url.path_start as usize
                    && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
                {
                    self.serialization.push('/');
                }
                // NOTE(review): this arm is only reached when the first
                // character is not '/', so the `Some('/')` case below looks
                // unreachable — confirm before relying on it.
                let remaining = match input.split_first() {
                    (Some('/'), remaining) => self.parse_path(
                        scheme_type,
                        &mut true,
                        base_url.path_start as usize,
                        remaining,
                    ),
                    _ => {
                        self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
                    }
                };
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
        }
    }
828
829    fn after_double_slash(
830        mut self,
831        input: Input<'_>,
832        scheme_type: SchemeType,
833        scheme_end: u32,
834    ) -> ParseResult<Url> {
835        self.serialization.push('/');
836        self.serialization.push('/');
837        // authority state
838        let before_authority = self.serialization.len();
839        let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
840        let has_authority = before_authority != self.serialization.len();
841        // host state
842        let host_start = to_u32(self.serialization.len())?;
843        let (host_end, host, port, remaining) =
844            self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
845        if host == HostInternal::None && has_authority {
846            return Err(ParseError::EmptyHost);
847        }
848        // path state
849        let path_start = to_u32(self.serialization.len())?;
850        let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
851        self.with_query_and_fragment(
852            scheme_type,
853            scheme_end,
854            username_end,
855            host_start,
856            host_end,
857            host,
858            port,
859            path_start,
860            remaining,
861        )
862    }
863
864    /// Return (username_end, remaining)
865    fn parse_userinfo<'i>(
866        &mut self,
867        mut input: Input<'i>,
868        scheme_type: SchemeType,
869    ) -> ParseResult<(u32, Input<'i>)> {
870        let mut last_at = None;
871        let mut remaining = input.clone();
872        let mut char_count = 0;
873        while let Some(c) = remaining.next() {
874            match c {
875                '@' => {
876                    if last_at.is_some() {
877                        self.log_violation(SyntaxViolation::UnencodedAtSign)
878                    } else {
879                        self.log_violation(SyntaxViolation::EmbeddedCredentials)
880                    }
881                    last_at = Some((char_count, remaining.clone()))
882                }
883                '/' | '?' | '#' => break,
884                '\\' if scheme_type.is_special() => break,
885                _ => (),
886            }
887            char_count += 1;
888        }
889        let (mut userinfo_char_count, remaining) = match last_at {
890            None => return Ok((to_u32(self.serialization.len())?, input)),
891            Some((0, remaining)) => {
892                // Otherwise, if one of the following is true
893                // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
894                // url is special and c is U+005C (\)
895                // If @ flag is set and buffer is the empty string, validation error, return failure.
896                if let (Some(c), _) = remaining.split_first() {
897                    if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
898                        return Err(ParseError::EmptyHost);
899                    }
900                }
901                return Ok((to_u32(self.serialization.len())?, remaining));
902            }
903            Some(x) => x,
904        };
905
906        let mut username_end = None;
907        let mut has_password = false;
908        let mut has_username = false;
909        while userinfo_char_count > 0 {
910            let (c, utf8_c) = input.next_utf8().unwrap();
911            userinfo_char_count -= 1;
912            if c == ':' && username_end.is_none() {
913                // Start parsing password
914                username_end = Some(to_u32(self.serialization.len())?);
915                // We don't add a colon if the password is empty
916                if userinfo_char_count > 0 {
917                    self.serialization.push(':');
918                    has_password = true;
919                }
920            } else {
921                if !has_password {
922                    has_username = true;
923                }
924                self.check_url_code_point(c, &input);
925                self.serialization
926                    .extend(utf8_percent_encode(utf8_c, USERINFO));
927            }
928        }
929        let username_end = match username_end {
930            Some(i) => i,
931            None => to_u32(self.serialization.len())?,
932        };
933        if has_username || has_password {
934            self.serialization.push('@');
935        }
936        Ok((username_end, remaining))
937    }
938
939    fn parse_host_and_port<'i>(
940        &mut self,
941        input: Input<'i>,
942        scheme_end: u32,
943        scheme_type: SchemeType,
944    ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
945        let (host, remaining) = Parser::parse_host(input, scheme_type)?;
946        write!(&mut self.serialization, "{}", host).unwrap();
947        let host_end = to_u32(self.serialization.len())?;
948        if let Host::Domain(h) = &host {
949            if h.is_empty() {
950                // Port with an empty host
951                if remaining.starts_with(":") {
952                    return Err(ParseError::EmptyHost);
953                }
954                if scheme_type.is_special() {
955                    return Err(ParseError::EmptyHost);
956                }
957            }
958        };
959
960        let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
961            let scheme = || default_port(&self.serialization[..scheme_end as usize]);
962            Parser::parse_port(remaining, scheme, self.context)?
963        } else {
964            (None, remaining)
965        };
966        if let Some(port) = port {
967            write!(&mut self.serialization, ":{}", port).unwrap()
968        }
969        Ok((host_end, host.into(), port, remaining))
970    }
971
972    pub fn parse_host(
973        mut input: Input<'_>,
974        scheme_type: SchemeType,
975    ) -> ParseResult<(Host<String>, Input<'_>)> {
976        if scheme_type.is_file() {
977            return Parser::get_file_host(input);
978        }
979        // Undo the Input abstraction here to avoid allocating in the common case
980        // where the host part of the input does not contain any tab or newline
981        let input_str = input.chars.as_str();
982        let mut inside_square_brackets = false;
983        let mut has_ignored_chars = false;
984        let mut non_ignored_chars = 0;
985        let mut bytes = 0;
986        for c in input_str.chars() {
987            match c {
988                ':' if !inside_square_brackets => break,
989                '\\' if scheme_type.is_special() => break,
990                '/' | '?' | '#' => break,
991                '\t' | '\n' | '\r' => {
992                    has_ignored_chars = true;
993                }
994                '[' => {
995                    inside_square_brackets = true;
996                    non_ignored_chars += 1
997                }
998                ']' => {
999                    inside_square_brackets = false;
1000                    non_ignored_chars += 1
1001                }
1002                _ => non_ignored_chars += 1,
1003            }
1004            bytes += c.len_utf8();
1005        }
1006        let replaced: String;
1007        let host_str;
1008        {
1009            let host_input = input.by_ref().take(non_ignored_chars);
1010            if has_ignored_chars {
1011                replaced = host_input.collect();
1012                host_str = &*replaced
1013            } else {
1014                for _ in host_input {}
1015                host_str = &input_str[..bytes]
1016            }
1017        }
1018        if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
1019            return Err(ParseError::EmptyHost);
1020        }
1021        if !scheme_type.is_special() {
1022            let host = Host::parse_opaque(host_str)?;
1023            return Ok((host, input));
1024        }
1025        let host = Host::parse(host_str)?;
1026        Ok((host, input))
1027    }
1028
1029    fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1030        let (_, host_str, remaining) = Parser::file_host(input)?;
1031        let host = match Host::parse(&host_str)? {
1032            Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1033            host => host,
1034        };
1035        Ok((host, remaining))
1036    }
1037
1038    fn parse_file_host<'i>(
1039        &mut self,
1040        input: Input<'i>,
1041    ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1042        let has_host;
1043        let (_, host_str, remaining) = Parser::file_host(input)?;
1044        let host = if host_str.is_empty() {
1045            has_host = false;
1046            HostInternal::None
1047        } else {
1048            match Host::parse(&host_str)? {
1049                Host::Domain(ref d) if d == "localhost" => {
1050                    has_host = false;
1051                    HostInternal::None
1052                }
1053                host => {
1054                    write!(&mut self.serialization, "{}", host).unwrap();
1055                    has_host = true;
1056                    host.into()
1057                }
1058            }
1059        };
1060        Ok((has_host, host, remaining))
1061    }
1062
1063    pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1064        // Undo the Input abstraction here to avoid allocating in the common case
1065        // where the host part of the input does not contain any tab or newline
1066        let input_str = input.chars.as_str();
1067        let mut has_ignored_chars = false;
1068        let mut non_ignored_chars = 0;
1069        let mut bytes = 0;
1070        for c in input_str.chars() {
1071            match c {
1072                '/' | '\\' | '?' | '#' => break,
1073                '\t' | '\n' | '\r' => has_ignored_chars = true,
1074                _ => non_ignored_chars += 1,
1075            }
1076            bytes += c.len_utf8();
1077        }
1078        let replaced: String;
1079        let host_str;
1080        let mut remaining = input.clone();
1081        {
1082            let host_input = remaining.by_ref().take(non_ignored_chars);
1083            if has_ignored_chars {
1084                replaced = host_input.collect();
1085                host_str = &*replaced
1086            } else {
1087                for _ in host_input {}
1088                host_str = &input_str[..bytes]
1089            }
1090        }
1091        if is_windows_drive_letter(host_str) {
1092            return Ok((false, "".to_string(), input));
1093        }
1094        Ok((true, host_str.to_string(), remaining))
1095    }
1096
1097    pub fn parse_port<P>(
1098        mut input: Input<'_>,
1099        default_port: P,
1100        context: Context,
1101    ) -> ParseResult<(Option<u16>, Input<'_>)>
1102    where
1103        P: Fn() -> Option<u16>,
1104    {
1105        let mut port: u32 = 0;
1106        let mut has_any_digit = false;
1107        while let (Some(c), remaining) = input.split_first() {
1108            if let Some(digit) = c.to_digit(10) {
1109                port = port * 10 + digit;
1110                if port > ::std::u16::MAX as u32 {
1111                    return Err(ParseError::InvalidPort);
1112                }
1113                has_any_digit = true;
1114            } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1115                return Err(ParseError::InvalidPort);
1116            } else {
1117                break;
1118            }
1119            input = remaining;
1120        }
1121        let mut opt_port = Some(port as u16);
1122        if !has_any_digit || opt_port == default_port() {
1123            opt_port = None;
1124        }
1125        Ok((opt_port, input))
1126    }
1127
1128    pub fn parse_path_start<'i>(
1129        &mut self,
1130        scheme_type: SchemeType,
1131        has_host: &mut bool,
1132        input: Input<'i>,
1133    ) -> Input<'i> {
1134        let path_start = self.serialization.len();
1135        let (maybe_c, remaining) = input.split_first();
1136        // If url is special, then:
1137        if scheme_type.is_special() {
1138            if maybe_c == Some('\\') {
1139                // If c is U+005C (\), validation error.
1140                self.log_violation(SyntaxViolation::Backslash);
1141            }
1142            // A special URL always has a non-empty path.
1143            if !self.serialization.ends_with('/') {
1144                self.serialization.push('/');
1145                // We have already made sure the forward slash is present.
1146                if maybe_c == Some('/') || maybe_c == Some('\\') {
1147                    return self.parse_path(scheme_type, has_host, path_start, remaining);
1148                }
1149            }
1150            return self.parse_path(scheme_type, has_host, path_start, input);
1151        } else if maybe_c == Some('?') || maybe_c == Some('#') {
1152            // Otherwise, if state override is not given and c is U+003F (?),
1153            // set url’s query to the empty string and state to query state.
1154            // Otherwise, if state override is not given and c is U+0023 (#),
1155            // set url’s fragment to the empty string and state to fragment state.
1156            // The query and path states will be handled by the caller.
1157            return input;
1158        }
1159
1160        if maybe_c.is_some() && maybe_c != Some('/') {
1161            self.serialization.push('/');
1162        }
1163        // Otherwise, if c is not the EOF code point:
1164        self.parse_path(scheme_type, has_host, path_start, input)
1165    }
1166
    /// Parse path segments from `input`, appending the percent-encoded path
    /// to `self.serialization` and resolving `.` and `..` segments in place.
    ///
    /// * `path_start` is the byte offset in `self.serialization` where the
    ///   path begins.
    /// * `has_host` is set to `false` when, for a file URL, the first path
    ///   segment is a Windows drive letter.
    ///
    /// Returns the input left after the path. In the `UrlParser` context
    /// parsing stops in front of a `?` or `#`, which is left in the returned
    /// input.
    pub fn parse_path<'i>(
        &mut self,
        scheme_type: SchemeType,
        has_host: &mut bool,
        path_start: usize,
        mut input: Input<'i>,
    ) -> Input<'i> {
        // Relative path state
        // Each iteration of the outer loop handles one path segment.
        loop {
            let mut segment_start = self.serialization.len();
            let mut ends_with_slash = false;
            // Inner loop: copy one segment's characters into the
            // serialization, stopping at a separator or the end of input.
            loop {
                let input_before_c = input.clone();
                let (c, utf8_c) = if let Some(x) = input.next_utf8() {
                    x
                } else {
                    break;
                };
                match c {
                    // In the PathSegmentSetter context separators are not
                    // special; they fall through to the encoding arm below.
                    '/' if self.context != Context::PathSegmentSetter => {
                        self.serialization.push(c);
                        ends_with_slash = true;
                        break;
                    }
                    '\\' if self.context != Context::PathSegmentSetter
                        && scheme_type.is_special() =>
                    {
                        // A backslash in a special URL is written as '/'.
                        self.log_violation(SyntaxViolation::Backslash);
                        self.serialization.push('/');
                        ends_with_slash = true;
                        break;
                    }
                    '?' | '#' if self.context == Context::UrlParser => {
                        // Rewind so the returned input still starts at the
                        // '?' or '#'.
                        input = input_before_c;
                        break;
                    }
                    _ => {
                        self.check_url_code_point(c, &input);
                        // For file URLs: when everything written after the
                        // path's first byte is exactly a normalized drive
                        // letter, insert a '/' so the character about to be
                        // written starts a new segment.
                        if scheme_type.is_file()
                            && self.serialization.len() > path_start
                            && is_normalized_windows_drive_letter(
                                &self.serialization[path_start + 1..],
                            )
                        {
                            self.serialization.push('/');
                            segment_start += 1;
                        }
                        // The percent-encode set depends on the context and,
                        // for segment setters, on whether the scheme is
                        // special.
                        if self.context == Context::PathSegmentSetter {
                            if scheme_type.is_special() {
                                self.serialization
                                    .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
                            } else {
                                self.serialization
                                    .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
                            }
                        } else {
                            self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
                        }
                    }
                }
            }
            // The segment just written, without its trailing '/' (if any).
            let segment_before_slash = if ends_with_slash {
                &self.serialization[segment_start..self.serialization.len() - 1]
            } else {
                &self.serialization[segment_start..self.serialization.len()]
            };
            match segment_before_slash {
                // If buffer is a double-dot path segment, shorten url’s path,
                ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
                | ".%2E" => {
                    debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
                    // Drop the ".." segment itself, then the slash in front
                    // of it when that is allowed, so that shorten_path
                    // removes the previous segment.
                    self.serialization.truncate(segment_start);
                    if self.serialization.ends_with('/')
                        && Parser::last_slash_can_be_removed(&self.serialization, path_start)
                    {
                        self.serialization.pop();
                    }
                    self.shorten_path(scheme_type, path_start);

                    // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
                    if ends_with_slash && !self.serialization.ends_with('/') {
                        self.serialization.push('/');
                    }
                }
                // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
                // nor url is special and c is U+005C (\), append the empty string to url’s path.
                "." | "%2e" | "%2E" => {
                    // Remove the "." segment and make sure the path still
                    // ends with a slash.
                    self.serialization.truncate(segment_start);
                    if !self.serialization.ends_with('/') {
                        self.serialization.push('/');
                    }
                }
                _ => {
                    // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
                    if scheme_type.is_file()
                        && segment_start == path_start + 1
                        && is_windows_drive_letter(segment_before_slash)
                    {
                        // Replace the second code point in buffer with U+003A (:).
                        if let Some(c) = segment_before_slash.chars().next() {
                            // Rewrite the segment as "<letter>:" and restore
                            // the trailing slash if there was one.
                            self.serialization.truncate(segment_start);
                            self.serialization.push(c);
                            self.serialization.push(':');
                            if ends_with_slash {
                                self.serialization.push('/');
                            }
                        }
                        // If url’s host is neither the empty string nor null,
                        // validation error, set url’s host to the empty string.
                        if *has_host {
                            self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
                            *has_host = false; // FIXME account for this in callers
                        }
                    }
                }
            }
            // A segment that did not end with a slash is the last one.
            if !ends_with_slash {
                break;
            }
        }
        if scheme_type.is_file() {
            // while url’s path’s size is greater than 1
            // and url’s path[0] is the empty string,
            // validation error, remove the first item from url’s path.
            //FIXME: log violation
            // Implemented by collapsing all leading slashes of the path
            // into a single one.
            let path = self.serialization.split_off(path_start);
            self.serialization.push('/');
            self.serialization.push_str(path.trim_start_matches('/'));
        }

        input
    }
1299
1300    fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1301        let url_before_segment = &serialization[..serialization.len() - 1];
1302        if let Some(segment_before_start) = url_before_segment.rfind('/') {
1303            // Do not remove the root slash
1304            segment_before_start >= path_start
1305                // Or a windows drive letter slash
1306                && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1307        } else {
1308            false
1309        }
1310    }
1311
1312    /// https://url.spec.whatwg.org/#shorten-a-urls-path
1313    fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1314        // If path is empty, then return.
1315        if self.serialization.len() == path_start {
1316            return;
1317        }
1318        // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1319        if scheme_type.is_file()
1320            && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1321        {
1322            return;
1323        }
1324        // Remove path’s last item.
1325        self.pop_path(scheme_type, path_start);
1326    }
1327
1328    /// https://url.spec.whatwg.org/#pop-a-urls-path
1329    fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1330        if self.serialization.len() > path_start {
1331            let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1332            // + 1 since rfind returns the position before the slash.
1333            let segment_start = path_start + slash_position + 1;
1334            // Don’t pop a Windows drive letter
1335            if !(scheme_type.is_file()
1336                && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1337            {
1338                self.serialization.truncate(segment_start);
1339            }
1340        }
1341    }
1342
1343    pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1344        loop {
1345            let input_before_c = input.clone();
1346            match input.next_utf8() {
1347                Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1348                    return input_before_c
1349                }
1350                Some((c, utf8_c)) => {
1351                    self.check_url_code_point(c, &input);
1352                    self.serialization
1353                        .extend(utf8_percent_encode(utf8_c, CONTROLS));
1354                }
1355                None => return input,
1356            }
1357        }
1358    }
1359
    /// Finish parsing: adjust the `/.` path prefix if needed, parse the
    /// query and fragment from `remaining`, and assemble the `Url`.
    ///
    /// The offset arguments are positions in `self.serialization` and are
    /// stored in the returned `Url` as given, except `path_start`, which is
    /// shifted by two when a `/.` prefix is inserted or removed.
    ///
    /// Panics if the serialization ends up with `://` right after the scheme
    /// in either of the two cases handled below.
    #[allow(clippy::too_many_arguments)]
    fn with_query_and_fragment(
        mut self,
        scheme_type: SchemeType,
        scheme_end: u32,
        username_end: u32,
        host_start: u32,
        host_end: u32,
        host: HostInternal,
        port: Option<u16>,
        mut path_start: u32,
        remaining: Input<'_>,
    ) -> ParseResult<Url> {
        // Special case for anarchist URL's with a leading empty path segment
        // This prevents web+demo:/.//not-a-host/ or web+demo:/path/..//not-a-host/,
        // when parsed and then serialized, from ending up as web+demo://not-a-host/
        // (they end up as web+demo:/.//not-a-host/).
        //
        // If url’s host is null, url does not have an opaque path,
        // url’s path’s size is greater than 1, and url’s path[0] is the empty string,
        // then append U+002F (/) followed by U+002E (.) to output.
        //
        // Both indices are captured before `path_start` is adjusted below.
        let scheme_end_as_usize = scheme_end as usize;
        let path_start_as_usize = path_start as usize;
        // The path starts right after the scheme's ':' (nothing in between).
        if path_start_as_usize == scheme_end_as_usize + 1 {
            // Anarchist URL
            if self.serialization[path_start_as_usize..].starts_with("//") {
                // Case 1: The base URL did not have an empty path segment, but the resulting one does
                // Insert the "/." prefix
                self.serialization.insert_str(path_start_as_usize, "/.");
                // The path itself now begins after the inserted prefix.
                path_start += 2;
            }
            assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
        } else if path_start_as_usize == scheme_end_as_usize + 3
            && &self.serialization[scheme_end_as_usize..path_start_as_usize] == ":/."
        {
            // Anarchist URL with leading empty path segment
            // The base URL has a "/." between the host and the path
            assert_eq!(self.serialization.as_bytes()[path_start_as_usize], b'/');
            // `get` is used because the path may be just one byte long.
            if self
                .serialization
                .as_bytes()
                .get(path_start_as_usize + 1)
                .copied()
                != Some(b'/')
            {
                // Case 2: The base URL had an empty path segment, but the resulting one does not
                // Remove the "/." prefix
                self.serialization
                    .replace_range(scheme_end_as_usize..path_start_as_usize, ":");
                // Two bytes ("/.") were removed in front of the path.
                path_start -= 2;
            }
            assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
        }

        // Query and fragment are appended after the (possibly adjusted) path.
        let (query_start, fragment_start) =
            self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
        Ok(Url {
            serialization: self.serialization,
            scheme_end,
            username_end,
            host_start,
            host_end,
            host,
            port,
            path_start,
            query_start,
            fragment_start,
        })
    }
1429
1430    /// Return (query_start, fragment_start)
1431    fn parse_query_and_fragment(
1432        &mut self,
1433        scheme_type: SchemeType,
1434        scheme_end: u32,
1435        mut input: Input<'_>,
1436    ) -> ParseResult<(Option<u32>, Option<u32>)> {
1437        let mut query_start = None;
1438        match input.next() {
1439            Some('#') => {}
1440            Some('?') => {
1441                query_start = Some(to_u32(self.serialization.len())?);
1442                self.serialization.push('?');
1443                let remaining = self.parse_query(scheme_type, scheme_end, input);
1444                if let Some(remaining) = remaining {
1445                    input = remaining
1446                } else {
1447                    return Ok((query_start, None));
1448                }
1449            }
1450            None => return Ok((None, None)),
1451            _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1452        }
1453
1454        let fragment_start = to_u32(self.serialization.len())?;
1455        self.serialization.push('#');
1456        self.parse_fragment(input);
1457        Ok((query_start, Some(fragment_start)))
1458    }
1459
1460    pub fn parse_query<'i>(
1461        &mut self,
1462        scheme_type: SchemeType,
1463        scheme_end: u32,
1464        mut input: Input<'i>,
1465    ) -> Option<Input<'i>> {
1466        let len = input.chars.as_str().len();
1467        let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1468        let mut remaining = None;
1469        while let Some(c) = input.next() {
1470            if c == '#' && self.context == Context::UrlParser {
1471                remaining = Some(input);
1472                break;
1473            } else {
1474                self.check_url_code_point(c, &input);
1475                query.push(c);
1476            }
1477        }
1478
1479        let encoding = match &self.serialization[..scheme_end as usize] {
1480            "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1481            _ => None,
1482        };
1483        let query_bytes = if let Some(o) = encoding {
1484            o(&query)
1485        } else {
1486            query.as_bytes().into()
1487        };
1488        let set = if scheme_type.is_special() {
1489            SPECIAL_QUERY
1490        } else {
1491            QUERY
1492        };
1493        self.serialization.extend(percent_encode(&query_bytes, set));
1494        remaining
1495    }
1496
1497    fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1498        let before_fragment = match base_url.fragment_start {
1499            Some(i) => base_url.slice(..i),
1500            None => &*base_url.serialization,
1501        };
1502        debug_assert!(self.serialization.is_empty());
1503        self.serialization
1504            .reserve(before_fragment.len() + input.chars.as_str().len());
1505        self.serialization.push_str(before_fragment);
1506        self.serialization.push('#');
1507        let next = input.next();
1508        debug_assert!(next == Some('#'));
1509        self.parse_fragment(input);
1510        Ok(Url {
1511            serialization: self.serialization,
1512            fragment_start: Some(to_u32(before_fragment.len())?),
1513            ..*base_url
1514        })
1515    }
1516
1517    pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1518        while let Some((c, utf8_c)) = input.next_utf8() {
1519            if c == '\0' {
1520                self.log_violation(SyntaxViolation::NullInFragment)
1521            } else {
1522                self.check_url_code_point(c, &input);
1523            }
1524            self.serialization
1525                .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1526        }
1527    }
1528
1529    fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1530        if let Some(vfn) = self.violation_fn {
1531            if c == '%' {
1532                let mut input = input.clone();
1533                if !matches!((input.next(), input.next()), (Some(a), Some(b))
1534                             if a.is_ascii_hexdigit() && b.is_ascii_hexdigit())
1535                {
1536                    vfn(SyntaxViolation::PercentDecode)
1537                }
1538            } else if !is_url_code_point(c) {
1539                vfn(SyntaxViolation::NonUrlCodePoint)
1540            }
1541        }
1542    }
1543}
1544
// Returns whether `c` is a URL code point as defined by
// https://url.spec.whatwg.org/#url-code-points: ASCII alphanumerics, a fixed
// set of ASCII punctuation, and U+00A0 to U+10FFFD excluding surrogates and
// noncharacters.
//
// Non URL code points:
// U+0000 to U+0020 (space)
// " # % < > [ \ ] ^ ` { | }
// U+007F to U+009F
// surrogates
// U+FDD0 to U+FDEF
// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
//
// Plane 14 starts at U+E0000 like every other plane. U+E0000 to U+E0FFF are
// not noncharacters, so they are URL code points and must not be reported as
// `NonUrlCodePoint` violations.
#[inline]
fn is_url_code_point(c: char) -> bool {
    matches!(c,
        'a'..='z' |
        'A'..='Z' |
        '0'..='9' |
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' |
        '.' | '/' | ':' | ';' | '=' | '?' | '@' | '_' | '~' |
        '\u{A0}'..='\u{D7FF}' | '\u{E000}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' |
        '\u{10000}'..='\u{1FFFD}' | '\u{20000}'..='\u{2FFFD}' |
        '\u{30000}'..='\u{3FFFD}' | '\u{40000}'..='\u{4FFFD}' |
        '\u{50000}'..='\u{5FFFD}' | '\u{60000}'..='\u{6FFFD}' |
        '\u{70000}'..='\u{7FFFD}' | '\u{80000}'..='\u{8FFFD}' |
        '\u{90000}'..='\u{9FFFD}' | '\u{A0000}'..='\u{AFFFD}' |
        '\u{B0000}'..='\u{BFFFD}' | '\u{C0000}'..='\u{CFFFD}' |
        '\u{D0000}'..='\u{DFFFD}' | '\u{E0000}'..='\u{EFFFD}' |
        '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}')
}
1570
/// https://url.spec.whatwg.org/#c0-controls-and-space
///
/// True for every code point from U+0000 up to and including U+0020 (space).
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..='\u{20}')
}
1576
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
///
/// True for U+0009 (tab), U+000A (line feed) and U+000D (carriage return).
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ['\t', '\n', '\r'].contains(&ch)
}
1582
/// https://url.spec.whatwg.org/#ascii-alpha
///
/// True for the ASCII letters `a`-`z` and `A`-`Z` only.
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    ch.is_ascii_lowercase() || ch.is_ascii_uppercase()
}
1588
1589#[inline]
1590pub fn to_u32(i: usize) -> ParseResult<u32> {
1591    if i <= ::std::u32::MAX as usize {
1592        Ok(i as u32)
1593    } else {
1594        Err(ParseError::Overflow)
1595    }
1596}
1597
1598fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1599    is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1600}
1601
1602/// Whether the scheme is file:, the path has a single segment, and that segment
1603/// is a Windows drive letter
1604#[inline]
1605pub fn is_windows_drive_letter(segment: &str) -> bool {
1606    segment.len() == 2 && starts_with_windows_drive_letter(segment)
1607}
1608
1609/// Whether path starts with a root slash
1610/// and a windows drive letter eg: "/c:" or "/a:/"
1611fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1612    if let Some(c) = s.as_bytes().first() {
1613        matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1614    } else {
1615        false
1616    }
1617}
1618
1619fn starts_with_windows_drive_letter(s: &str) -> bool {
1620    s.len() >= 2
1621        && ascii_alpha(s.as_bytes()[0] as char)
1622        && matches!(s.as_bytes()[1], b':' | b'|')
1623        && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#'))
1624}
1625
1626/// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
1627fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1628    let mut input = input.clone();
1629    match (input.next(), input.next(), input.next()) {
1630        // its first two code points are a Windows drive letter
1631        // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1632        (Some(a), Some(b), Some(c))
1633            if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1634        {
1635            true
1636        }
1637        // its first two code points are a Windows drive letter
1638        // its length is 2
1639        (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1640        _ => false,
1641    }
1642}