gix_url/
parse.rs

1use std::convert::Infallible;
2
3use bstr::{BStr, BString, ByteSlice};
4use percent_encoding::percent_decode_str;
5
6use crate::Scheme;
7
8/// The error returned by [parse()](crate::parse()).
9#[derive(Debug, thiserror::Error)]
10#[allow(missing_docs)]
11pub enum Error {
12    #[error("{} \"{url}\" is not valid UTF-8", kind.as_str())]
13    Utf8 {
14        url: BString,
15        kind: UrlKind,
16        source: std::str::Utf8Error,
17    },
18    #[error("{} {url:?} can not be parsed as valid URL", kind.as_str())]
19    Url {
20        url: String,
21        kind: UrlKind,
22        source: crate::simple_url::UrlParseError,
23    },
24
25    #[error("The host portion of the following URL is too long ({} bytes, {len} bytes total): {truncated_url:?}", truncated_url.len())]
26    TooLong { truncated_url: BString, len: usize },
27    #[error("{} \"{url}\" does not specify a path to a repository", kind.as_str())]
28    MissingRepositoryPath { url: BString, kind: UrlKind },
29    #[error("URL {url:?} is relative which is not allowed in this context")]
30    RelativeUrl { url: String },
31}
32
33impl From<Infallible> for Error {
34    fn from(_: Infallible) -> Self {
35        unreachable!("Cannot actually happen, but it seems there can't be a blanket impl for this")
36    }
37}
38
/// The kind of input that was being parsed, used to name the input in error messages.
#[derive(Debug, Clone, Copy)]
pub enum UrlKind {
    /// The input was handled as a URL.
    Url,
    /// The input was handled as an SCP-like target.
    Scp,
    /// The input was handled as a local path.
    Local,
}

impl UrlKind {
    /// Returns the human-readable name of this kind as it appears in error messages.
    fn as_str(&self) -> &'static str {
        match *self {
            Self::Url => "URL",
            Self::Scp => "SCP-like target",
            Self::Local => "local path",
        }
    }
}
59
/// The syntactic form detected for an input, as determined by `find_scheme()`.
pub(crate) enum InputScheme {
    /// The input contains `://`.
    Url {
        /// Byte offset of the first `://` within the input.
        protocol_end: usize,
    },
    /// The input contains a `:` that separates an SCP-like host from its path.
    Scp {
        /// Byte offset of the first `:` within the input.
        colon: usize,
    },
    /// Anything else; the input is taken to be a local path.
    Local,
}
65
66pub(crate) fn find_scheme(input: &BStr) -> InputScheme {
67    // TODO: url's may only contain `:/`, we should additionally check if the characters used for
68    //       protocol are all valid
69    if let Some(protocol_end) = input.find("://") {
70        return InputScheme::Url { protocol_end };
71    }
72
73    if let Some(colon) = input.find_byte(b':') {
74        // allow user to select files containing a `:` by passing them as absolute or relative path
75        // this is behavior explicitly mentioned by the scp and git manuals
76        let explicitly_local = &input[..colon].contains(&b'/');
77        let dos_driver_letter = cfg!(windows) && input[..colon].len() == 1;
78
79        if !explicitly_local && !dos_driver_letter {
80            return InputScheme::Scp { colon };
81        }
82    }
83
84    InputScheme::Local
85}
86
87pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error> {
88    const MAX_LEN: usize = 1024;
89    let bytes_to_path = input[protocol_end + "://".len()..]
90        .iter()
91        .filter(|b| !b.is_ascii_whitespace())
92        .skip_while(|b| **b == b'/' || **b == b'\\')
93        .position(|b| *b == b'/')
94        .unwrap_or(input.len() - protocol_end);
95    if bytes_to_path > MAX_LEN || protocol_end > MAX_LEN {
96        return Err(Error::TooLong {
97            truncated_url: input[..(protocol_end + "://".len() + MAX_LEN).min(input.len())].into(),
98            len: input.len(),
99        });
100    }
101    let (input, url) = input_to_utf8_and_url(input, UrlKind::Url)?;
102    let scheme = Scheme::from(url.scheme.as_str());
103
104    if matches!(scheme, Scheme::Git | Scheme::Ssh) && url.path.is_empty() {
105        return Err(Error::MissingRepositoryPath {
106            url: input.into(),
107            kind: UrlKind::Url,
108        });
109    }
110
111    // Normalize empty path to "/" for http/https URLs only
112    let path = if url.path.is_empty() && matches!(scheme, Scheme::Http | Scheme::Https) {
113        "/".into()
114    } else {
115        url.path.into()
116    };
117
118    Ok(crate::Url {
119        serialize_alternative_form: false,
120        scheme,
121        user: url_user(&url, UrlKind::Url)?,
122        password: url
123            .password
124            .map(|s| percent_decoded_utf8(s, UrlKind::Url))
125            .transpose()?,
126        host: url.host,
127        port: url.port,
128        path,
129    })
130}
131
132fn percent_decoded_utf8(s: &str, kind: UrlKind) -> Result<String, Error> {
133    Ok(percent_decode_str(s)
134        .decode_utf8()
135        .map_err(|err| Error::Utf8 {
136            url: s.into(),
137            kind,
138            source: err,
139        })?
140        .into_owned())
141}
142
143pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
144    let input = input_to_utf8(input, UrlKind::Scp)?;
145
146    // TODO: this incorrectly splits at IPv6 addresses, check for `[]` before splitting
147    let (host, path) = input.split_at(colon);
148    debug_assert_eq!(path.get(..1), Some(":"), "{path} should start with :");
149    let path = &path[1..];
150
151    if path.is_empty() {
152        return Err(Error::MissingRepositoryPath {
153            url: input.to_owned().into(),
154            kind: UrlKind::Scp,
155        });
156    }
157
158    // The path returned by the parsed url often has the wrong number of leading `/` characters but
159    // should never differ in any other way (ssh URLs should not contain a query or fragment part).
160    // To avoid the various off-by-one errors caused by the `/` characters, we keep using the path
161    // determined above and can therefore skip parsing it here as well.
162    let url_string = format!("ssh://{host}");
163    let url = crate::simple_url::ParsedUrl::parse(&url_string).map_err(|source| Error::Url {
164        url: input.to_owned(),
165        kind: UrlKind::Scp,
166        source,
167    })?;
168
169    Ok(crate::Url {
170        serialize_alternative_form: true,
171        scheme: Scheme::from(url.scheme.as_str()),
172        user: url_user(&url, UrlKind::Scp)?,
173        password: url
174            .password
175            .map(|s| percent_decoded_utf8(s, UrlKind::Scp))
176            .transpose()?,
177        host: url.host,
178        port: url.port,
179        path: path.into(),
180    })
181}
182
183fn url_user(url: &crate::simple_url::ParsedUrl<'_>, kind: UrlKind) -> Result<Option<String>, Error> {
184    if url.username.is_empty() && url.password.is_none() {
185        Ok(None)
186    } else {
187        Ok(Some(percent_decoded_utf8(url.username, kind)?))
188    }
189}
190
191pub(crate) fn file_url(input: &BStr, protocol_colon: usize) -> Result<crate::Url, Error> {
192    let input = input_to_utf8(input, UrlKind::Url)?;
193    let input_after_protocol = &input[protocol_colon + "://".len()..];
194
195    let Some(first_slash) = input_after_protocol
196        .find('/')
197        .or_else(|| cfg!(windows).then(|| input_after_protocol.find('\\')).flatten())
198    else {
199        return Err(Error::MissingRepositoryPath {
200            url: input.to_owned().into(),
201            kind: UrlKind::Url,
202        });
203    };
204
205    // We cannot use the url crate to parse host and path because it special cases Windows
206    // driver letters. With the url crate an input of `file://x:/path/to/git` is parsed as empty
207    // host and with `x:/path/to/git` as path. This behavior is wrong for Git which only follows
208    // that rule on Windows and parses `x:` as host on Unix platforms. Additionally, the url crate
209    // does not account for Windows special UNC path support.
210
211    // TODO: implement UNC path special case
212    let windows_special_path = if cfg!(windows) {
213        // Inputs created via url::Url::from_file_path contain an additional `/` between the
214        // protocol and the absolute path. Make sure we ignore that first slash character to avoid
215        // producing invalid paths.
216        let input_after_protocol = if first_slash == 0 {
217            &input_after_protocol[1..]
218        } else {
219            input_after_protocol
220        };
221        // parse `file://x:/path/to/git` as explained above
222        if input_after_protocol.chars().nth(1) == Some(':') {
223            Some(input_after_protocol)
224        } else {
225            None
226        }
227    } else {
228        None
229    };
230
231    let host = if windows_special_path.is_some() || first_slash == 0 {
232        // `file:///path/to/git` or a windows special case was triggered
233        None
234    } else {
235        // `file://host/path/to/git`
236        Some(&input_after_protocol[..first_slash])
237    };
238
239    // default behavior on Unix platforms and if no Windows special case was triggered
240    let path = windows_special_path.unwrap_or(&input_after_protocol[first_slash..]);
241
242    Ok(crate::Url {
243        serialize_alternative_form: false,
244        host: host.map(Into::into),
245        ..local(path.into())?
246    })
247}
248
249pub(crate) fn local(input: &BStr) -> Result<crate::Url, Error> {
250    if input.is_empty() {
251        return Err(Error::MissingRepositoryPath {
252            url: input.to_owned(),
253            kind: UrlKind::Local,
254        });
255    }
256
257    Ok(crate::Url {
258        serialize_alternative_form: true,
259        scheme: Scheme::File,
260        password: None,
261        user: None,
262        host: None,
263        port: None,
264        path: input.to_owned(),
265    })
266}
267
268fn input_to_utf8(input: &BStr, kind: UrlKind) -> Result<&str, Error> {
269    std::str::from_utf8(input).map_err(|source| Error::Utf8 {
270        url: input.to_owned(),
271        kind,
272        source,
273    })
274}
275
276fn input_to_utf8_and_url(input: &BStr, kind: UrlKind) -> Result<(&str, crate::simple_url::ParsedUrl<'_>), Error> {
277    let input = input_to_utf8(input, kind)?;
278    crate::simple_url::ParsedUrl::parse(input)
279        .map(|url| (input, url))
280        .map_err(|source| {
281            // If the parser rejected it as RelativeUrlWithoutBase, map to Error::RelativeUrl
282            // to match the expected error type for malformed URLs like "invalid:://"
283            match source {
284                crate::simple_url::UrlParseError::RelativeUrlWithoutBase => {
285                    Error::RelativeUrl { url: input.to_owned() }
286                }
287                _ => Error::Url {
288                    url: input.to_owned(),
289                    kind,
290                    source,
291                },
292            }
293        })
294}