gix_url/
parse.rs

1use std::convert::Infallible;
2
3use bstr::{BStr, BString, ByteSlice};
4
5use crate::Scheme;
6
/// The error returned by [parse()](crate::parse()).
///
/// Variants carrying a `kind` use it to name the flavor of input (URL, SCP-like target or local
/// path) in their message.
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
    /// The input bytes are not valid UTF-8; `source` is the decoding error and `url` the
    /// offending input.
    #[error("{} \"{url}\" is not valid UTF-8", kind.as_str())]
    Utf8 {
        url: BString,
        kind: UrlKind,
        source: std::str::Utf8Error,
    },
    /// The URL parser rejected the input for a reason other than it being relative; `source`
    /// carries the parser's own error.
    #[error("{} {url:?} can not be parsed as valid URL", kind.as_str())]
    Url {
        url: String,
        kind: UrlKind,
        source: crate::simple_url::UrlParseError,
    },

    /// The scheme, or the portion between `://` and the path, exceeds the length limit enforced
    /// when parsing URLs. `truncated_url` is a shortened copy of the input, `len` is the length
    /// of the complete input.
    ///
    /// NOTE(review): the first number printed by the message is the length of `truncated_url`,
    /// not the length of the host itself.
    #[error("The host portion of the following URL is too long ({} bytes, {len} bytes total): {truncated_url:?}", truncated_url.len())]
    TooLong { truncated_url: BString, len: usize },
    /// The input does not contain a (non-empty) path to a repository.
    #[error("{} \"{url}\" does not specify a path to a repository", kind.as_str())]
    MissingRepositoryPath { url: BString, kind: UrlKind },
    /// The URL parser classified the input as a relative URL that has no base to resolve against.
    #[error("URL {url:?} is relative which is not allowed in this context")]
    RelativeUrl { url: String },
}
31
32impl From<Infallible> for Error {
33    fn from(_: Infallible) -> Self {
34        unreachable!("Cannot actually happen, but it seems there can't be a blanket impl for this")
35    }
36}
37
/// The flavor of input that was being parsed, used to word error messages.
#[derive(Debug, Clone, Copy)]
pub enum UrlKind {
    /// An input that is handled as a URL.
    Url,
    /// An input that is handled as an SCP-like target.
    Scp,
    /// An input that is handled as a local path.
    Local,
}

impl UrlKind {
    /// Returns the human-readable name of this kind as it appears in error messages.
    fn as_str(&self) -> &'static str {
        match *self {
            Self::Url => "URL",
            Self::Scp => "SCP-like target",
            Self::Local => "local path",
        }
    }
}
58
/// The coarse classification of an input as produced by `find_scheme()`.
pub(crate) enum InputScheme {
    /// The input contains `://`; `protocol_end` is the byte offset at which `://` begins.
    Url { protocol_end: usize },
    /// The input is SCP-like; `colon` is the byte offset of the `:` separating host and path.
    Scp { colon: usize },
    /// The input is neither of the above and is treated as a local path.
    Local,
}
64
65pub(crate) fn find_scheme(input: &BStr) -> InputScheme {
66    // TODO: url's may only contain `:/`, we should additionally check if the characters used for
67    //       protocol are all valid
68    if let Some(protocol_end) = input.find("://") {
69        return InputScheme::Url { protocol_end };
70    }
71
72    // Find colon, but skip over IPv6 brackets if present
73    let colon = if input.starts_with(b"[") {
74        // IPv6 address, find the closing bracket first
75        if let Some(bracket_end) = input.find_byte(b']') {
76            // Look for colon after the bracket
77            input[bracket_end + 1..]
78                .find_byte(b':')
79                .map(|pos| bracket_end + 1 + pos)
80        } else {
81            // No closing bracket, treat as regular search
82            input.find_byte(b':')
83        }
84    } else {
85        input.find_byte(b':')
86    };
87
88    if let Some(colon) = colon {
89        // allow user to select files containing a `:` by passing them as absolute or relative path
90        // this is behavior explicitly mentioned by the scp and git manuals
91        let explicitly_local = &input[..colon].contains(&b'/');
92        let dos_driver_letter = cfg!(windows) && input[..colon].len() == 1;
93
94        if !explicitly_local && !dos_driver_letter {
95            return InputScheme::Scp { colon };
96        }
97    }
98
99    InputScheme::Local
100}
101
102pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error> {
103    const MAX_LEN: usize = 1024;
104    let bytes_to_path = input[protocol_end + "://".len()..]
105        .iter()
106        .filter(|b| !b.is_ascii_whitespace())
107        .skip_while(|b| **b == b'/' || **b == b'\\')
108        .position(|b| *b == b'/')
109        .unwrap_or(input.len() - protocol_end);
110    if bytes_to_path > MAX_LEN || protocol_end > MAX_LEN {
111        return Err(Error::TooLong {
112            truncated_url: input[..(protocol_end + "://".len() + MAX_LEN).min(input.len())].into(),
113            len: input.len(),
114        });
115    }
116    let (input, url) = input_to_utf8_and_url(input, UrlKind::Url)?;
117    let scheme = Scheme::from(url.scheme.as_str());
118
119    if matches!(scheme, Scheme::Git | Scheme::Ssh) && url.path.is_empty() {
120        return Err(Error::MissingRepositoryPath {
121            url: input.into(),
122            kind: UrlKind::Url,
123        });
124    }
125
126    // Normalize empty path to "/" for http/https URLs only
127    let path: BString = if url.path.is_empty() && matches!(scheme, Scheme::Http | Scheme::Https) {
128        "/".into()
129    } else if matches!(scheme, Scheme::Ssh | Scheme::Git) && url.path.starts_with("/~") {
130        // For SSH and Git protocols, strip leading '/' from paths starting with '~'
131        // e.g., "ssh://host/~repo" -> path is "~repo", not "/~repo"
132        url.path[1..].into()
133    } else {
134        url.path.into()
135    };
136
137    let user = if url.username.is_empty() && url.password.is_none() {
138        None
139    } else {
140        Some(url.username)
141    };
142    let password = url.password;
143    let port = url.port;
144
145    // For SSH URLs, strip brackets from IPv6 addresses
146    let host = if scheme == Scheme::Ssh {
147        url.host.map(|mut h| {
148            // Bracketed IPv6 forms
149            if let Some(h2) = h.strip_prefix('[') {
150                if let Some(inner) = h2.strip_suffix("]:") {
151                    // "[::1]:" → "::1"
152                    h = inner.to_string();
153                } else if let Some(inner) = h2.strip_suffix(']') {
154                    // "[::1]" → "::1"
155                    h = inner.to_string();
156                }
157            } else {
158                // Non-bracketed host: strip a single trailing colon
159                let colon_count = h.chars().filter(|&c| c == ':').take(2).count();
160                if colon_count == 1 {
161                    if let Some(inner) = h.strip_suffix(':') {
162                        h = inner.to_string();
163                    }
164                }
165            }
166            h
167        })
168    } else {
169        url.host
170    };
171    Ok(crate::Url {
172        serialize_alternative_form: false,
173        scheme,
174        user,
175        password,
176        host,
177        port,
178        path,
179    })
180}
181
182pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
183    let input = input_to_utf8(input, UrlKind::Scp)?;
184
185    // TODO: this incorrectly splits at IPv6 addresses, check for `[]` before splitting
186    let (host, path) = input.split_at(colon);
187    debug_assert_eq!(path.get(..1), Some(":"), "{path} should start with :");
188    let path = &path[1..];
189
190    if path.is_empty() {
191        return Err(Error::MissingRepositoryPath {
192            url: input.to_owned().into(),
193            kind: UrlKind::Scp,
194        });
195    }
196
197    // The path returned by the parsed url often has the wrong number of leading `/` characters but
198    // should never differ in any other way (ssh URLs should not contain a query or fragment part).
199    // To avoid the various off-by-one errors caused by the `/` characters, we keep using the path
200    // determined above and can therefore skip parsing it here as well.
201    let url_string = format!("ssh://{host}");
202    let url = crate::simple_url::ParsedUrl::parse(&url_string).map_err(|source| Error::Url {
203        url: input.to_owned(),
204        kind: UrlKind::Scp,
205        source,
206    })?;
207
208    // For SCP-like SSH URLs, strip leading '/' from paths starting with '/~'
209    // e.g., "user@host:/~repo" -> path is "~repo", not "/~repo"
210    let path = if path.starts_with("/~") { &path[1..] } else { path };
211
212    let user = if url.username.is_empty() && url.password.is_none() {
213        None
214    } else {
215        Some(url.username)
216    };
217    let password = url.password;
218    let port = url.port;
219
220    // For SCP-like SSH URLs, strip brackets from IPv6 addresses
221    let host = url.host.map(|h| {
222        if let Some(h) = h.strip_prefix("[").and_then(|h| h.strip_suffix("]")) {
223            h.to_string()
224        } else {
225            h
226        }
227    });
228
229    Ok(crate::Url {
230        serialize_alternative_form: true,
231        scheme: Scheme::from(url.scheme.as_str()),
232        user,
233        password,
234        host,
235        port,
236        path: path.into(),
237    })
238}
239
240pub(crate) fn file_url(input: &BStr, protocol_colon: usize) -> Result<crate::Url, Error> {
241    let input = input_to_utf8(input, UrlKind::Url)?;
242    let input_after_protocol = &input[protocol_colon + "://".len()..];
243
244    let Some(first_slash) = input_after_protocol
245        .find('/')
246        .or_else(|| cfg!(windows).then(|| input_after_protocol.find('\\')).flatten())
247    else {
248        return Err(Error::MissingRepositoryPath {
249            url: input.to_owned().into(),
250            kind: UrlKind::Url,
251        });
252    };
253
254    // We cannot use the url crate to parse host and path because it special cases Windows
255    // driver letters. With the url crate an input of `file://x:/path/to/git` is parsed as empty
256    // host and with `x:/path/to/git` as path. This behavior is wrong for Git which only follows
257    // that rule on Windows and parses `x:` as host on Unix platforms. Additionally, the url crate
258    // does not account for Windows special UNC path support.
259
260    // TODO: implement UNC path special case
261    let windows_special_path = if cfg!(windows) {
262        // Inputs created via url::Url::from_file_path contain an additional `/` between the
263        // protocol and the absolute path. Make sure we ignore that first slash character to avoid
264        // producing invalid paths.
265        let input_after_protocol = if first_slash == 0 {
266            &input_after_protocol[1..]
267        } else {
268            input_after_protocol
269        };
270        // parse `file://x:/path/to/git` as explained above
271        if input_after_protocol.chars().nth(1) == Some(':') {
272            Some(input_after_protocol)
273        } else {
274            None
275        }
276    } else {
277        None
278    };
279
280    let host = if windows_special_path.is_some() || first_slash == 0 {
281        // `file:///path/to/git` or a windows special case was triggered
282        None
283    } else {
284        // `file://host/path/to/git`
285        Some(&input_after_protocol[..first_slash])
286    };
287
288    // default behavior on Unix platforms and if no Windows special case was triggered
289    let path = windows_special_path.unwrap_or(&input_after_protocol[first_slash..]);
290
291    Ok(crate::Url {
292        serialize_alternative_form: false,
293        host: host.map(Into::into),
294        ..local(path.into())?
295    })
296}
297
298pub(crate) fn local(input: &BStr) -> Result<crate::Url, Error> {
299    if input.is_empty() {
300        return Err(Error::MissingRepositoryPath {
301            url: input.to_owned(),
302            kind: UrlKind::Local,
303        });
304    }
305
306    Ok(crate::Url {
307        serialize_alternative_form: true,
308        scheme: Scheme::File,
309        password: None,
310        user: None,
311        host: None,
312        port: None,
313        path: input.to_owned(),
314    })
315}
316
317fn input_to_utf8(input: &BStr, kind: UrlKind) -> Result<&str, Error> {
318    std::str::from_utf8(input).map_err(|source| Error::Utf8 {
319        url: input.to_owned(),
320        kind,
321        source,
322    })
323}
324
325fn input_to_utf8_and_url(input: &BStr, kind: UrlKind) -> Result<(&str, crate::simple_url::ParsedUrl), Error> {
326    let input = input_to_utf8(input, kind)?;
327    crate::simple_url::ParsedUrl::parse(input)
328        .map(|url| (input, url))
329        .map_err(|source| {
330            // If the parser rejected it as RelativeUrlWithoutBase, map to Error::RelativeUrl
331            // to match the expected error type for malformed URLs like "invalid:://"
332            match source {
333                crate::simple_url::UrlParseError::RelativeUrlWithoutBase => {
334                    Error::RelativeUrl { url: input.to_owned() }
335                }
336                _ => Error::Url {
337                    url: input.to_owned(),
338                    kind,
339                    source,
340                },
341            }
342        })
343}