gix_url/
parse.rs

1use std::convert::Infallible;
2
3use bstr::{BStr, BString, ByteSlice};
4use percent_encoding::percent_decode_str;
5
6use crate::Scheme;
7
/// The error returned by [parse()](crate::parse()).
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
    /// Some text was not valid UTF-8.
    ///
    /// `url` holds the offending text: either the whole input, or only the percent-encoded
    /// component (user name or password) whose decoded bytes were not UTF-8. `kind` selects the
    /// label used in the message.
    #[error("{} \"{url}\" is not valid UTF-8", kind.as_str())]
    Utf8 {
        url: BString,
        kind: UrlKind,
        source: std::str::Utf8Error,
    },
    /// The parser in `crate::simple_url` rejected the input for a reason other than it being a
    /// relative URL; the parser's own error is kept as `source`.
    #[error("{} {url:?} can not be parsed as valid URL", kind.as_str())]
    Url {
        url: String,
        kind: UrlKind,
        source: crate::simple_url::UrlParseError,
    },

    /// The scheme, or the part counted between `://` and the path, exceeded the length limit.
    ///
    /// `truncated_url` is only a prefix of the input, while `len` is the length of the complete
    /// input in bytes. Note that the first number in the message is the length of that prefix.
    #[error("The host portion of the following URL is too long ({} bytes, {len} bytes total): {truncated_url:?}", truncated_url.len())]
    TooLong { truncated_url: BString, len: usize },
    /// The input names no repository path, e.g. the path is empty or the input itself is empty.
    #[error("{} \"{url}\" does not specify a path to a repository", kind.as_str())]
    MissingRepositoryPath { url: BString, kind: UrlKind },
    /// The URL parser reported the input as a relative URL without a base.
    #[error("URL {url:?} is relative which is not allowed in this context")]
    RelativeUrl { url: String },
}
32
33impl From<Infallible> for Error {
34    fn from(_: Infallible) -> Self {
35        unreachable!("Cannot actually happen, but it seems there can't be a blanket impl for this")
36    }
37}
38
/// The kind of input an [`Error`] refers to; it selects the label used in error messages.
#[derive(Debug, Clone, Copy)]
pub enum UrlKind {
    /// A URL with an explicit scheme, labelled `URL` in error messages.
    Url,
    /// An SCP-like target such as `host:path`, labelled `SCP-like target` in error messages.
    Scp,
    /// A path on the local file system, labelled `local path` in error messages.
    Local,
}
49
50impl UrlKind {
51    fn as_str(&self) -> &'static str {
52        match self {
53            UrlKind::Url => "URL",
54            UrlKind::Scp => "SCP-like target",
55            UrlKind::Local => "local path",
56        }
57    }
58}
59
/// The syntactic form of an input as determined by `find_scheme()`.
pub(crate) enum InputScheme {
    /// The input contains `://`; `protocol_end` is the byte offset at which `://` starts.
    Url { protocol_end: usize },
    /// The input looks like an SCP-like target; `colon` is the byte offset of the `:` that
    /// separates the host part from the path.
    Scp { colon: usize },
    /// The input is neither of the above and is treated as a local path.
    Local,
}
65
66pub(crate) fn find_scheme(input: &BStr) -> InputScheme {
67    // TODO: url's may only contain `:/`, we should additionally check if the characters used for
68    //       protocol are all valid
69    if let Some(protocol_end) = input.find("://") {
70        return InputScheme::Url { protocol_end };
71    }
72
73    // Find colon, but skip over IPv6 brackets if present
74    let colon = if input.starts_with(b"[") {
75        // IPv6 address, find the closing bracket first
76        if let Some(bracket_end) = input.find_byte(b']') {
77            // Look for colon after the bracket
78            input[bracket_end + 1..]
79                .find_byte(b':')
80                .map(|pos| bracket_end + 1 + pos)
81        } else {
82            // No closing bracket, treat as regular search
83            input.find_byte(b':')
84        }
85    } else {
86        input.find_byte(b':')
87    };
88
89    if let Some(colon) = colon {
90        // allow user to select files containing a `:` by passing them as absolute or relative path
91        // this is behavior explicitly mentioned by the scp and git manuals
92        let explicitly_local = &input[..colon].contains(&b'/');
93        let dos_driver_letter = cfg!(windows) && input[..colon].len() == 1;
94
95        if !explicitly_local && !dos_driver_letter {
96            return InputScheme::Scp { colon };
97        }
98    }
99
100    InputScheme::Local
101}
102
103pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error> {
104    const MAX_LEN: usize = 1024;
105    let bytes_to_path = input[protocol_end + "://".len()..]
106        .iter()
107        .filter(|b| !b.is_ascii_whitespace())
108        .skip_while(|b| **b == b'/' || **b == b'\\')
109        .position(|b| *b == b'/')
110        .unwrap_or(input.len() - protocol_end);
111    if bytes_to_path > MAX_LEN || protocol_end > MAX_LEN {
112        return Err(Error::TooLong {
113            truncated_url: input[..(protocol_end + "://".len() + MAX_LEN).min(input.len())].into(),
114            len: input.len(),
115        });
116    }
117    let (input, url) = input_to_utf8_and_url(input, UrlKind::Url)?;
118    let scheme = Scheme::from(url.scheme.as_str());
119
120    if matches!(scheme, Scheme::Git | Scheme::Ssh) && url.path.is_empty() {
121        return Err(Error::MissingRepositoryPath {
122            url: input.into(),
123            kind: UrlKind::Url,
124        });
125    }
126
127    // Normalize empty path to "/" for http/https URLs only
128    let path = if url.path.is_empty() && matches!(scheme, Scheme::Http | Scheme::Https) {
129        "/".into()
130    } else if matches!(scheme, Scheme::Ssh | Scheme::Git) && url.path.starts_with("/~") {
131        // For SSH and Git protocols, strip leading '/' from paths starting with '~'
132        // e.g., "ssh://host/~repo" -> path is "~repo", not "/~repo"
133        url.path[1..].into()
134    } else {
135        url.path.into()
136    };
137
138    let user = url_user(&url, UrlKind::Url)?;
139    let password = url
140        .password
141        .map(|s| percent_decoded_utf8(s, UrlKind::Url))
142        .transpose()?;
143    let port = url.port;
144
145    // For SSH URLs, strip brackets from IPv6 addresses
146    let host = if scheme == Scheme::Ssh {
147        url.host.map(|mut h| {
148            // Bracketed IPv6 forms
149            if let Some(h2) = h.strip_prefix('[') {
150                if let Some(inner) = h2.strip_suffix("]:") {
151                    // "[::1]:" → "::1"
152                    h = inner.to_string();
153                } else if let Some(inner) = h2.strip_suffix(']') {
154                    // "[::1]" → "::1"
155                    h = inner.to_string();
156                }
157            } else {
158                // Non-bracketed host: strip a single trailing colon
159                let colon_count = h.chars().filter(|&c| c == ':').take(2).count();
160                if colon_count == 1 {
161                    if let Some(inner) = h.strip_suffix(':') {
162                        h = inner.to_string();
163                    }
164                }
165            }
166            h
167        })
168    } else {
169        url.host
170    };
171    Ok(crate::Url {
172        serialize_alternative_form: false,
173        scheme,
174        user,
175        password,
176        host,
177        port,
178        path,
179    })
180}
181
182fn percent_decoded_utf8(s: &str, kind: UrlKind) -> Result<String, Error> {
183    Ok(percent_decode_str(s)
184        .decode_utf8()
185        .map_err(|err| Error::Utf8 {
186            url: s.into(),
187            kind,
188            source: err,
189        })?
190        .into_owned())
191}
192
193pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
194    let input = input_to_utf8(input, UrlKind::Scp)?;
195
196    // TODO: this incorrectly splits at IPv6 addresses, check for `[]` before splitting
197    let (host, path) = input.split_at(colon);
198    debug_assert_eq!(path.get(..1), Some(":"), "{path} should start with :");
199    let path = &path[1..];
200
201    if path.is_empty() {
202        return Err(Error::MissingRepositoryPath {
203            url: input.to_owned().into(),
204            kind: UrlKind::Scp,
205        });
206    }
207
208    // The path returned by the parsed url often has the wrong number of leading `/` characters but
209    // should never differ in any other way (ssh URLs should not contain a query or fragment part).
210    // To avoid the various off-by-one errors caused by the `/` characters, we keep using the path
211    // determined above and can therefore skip parsing it here as well.
212    let url_string = format!("ssh://{host}");
213    let url = crate::simple_url::ParsedUrl::parse(&url_string).map_err(|source| Error::Url {
214        url: input.to_owned(),
215        kind: UrlKind::Scp,
216        source,
217    })?;
218
219    // For SCP-like SSH URLs, strip leading '/' from paths starting with '/~'
220    // e.g., "user@host:/~repo" -> path is "~repo", not "/~repo"
221    let path = if path.starts_with("/~") { &path[1..] } else { path };
222
223    let user = url_user(&url, UrlKind::Scp)?;
224    let password = url
225        .password
226        .map(|s| percent_decoded_utf8(s, UrlKind::Scp))
227        .transpose()?;
228    let port = url.port;
229
230    // For SCP-like SSH URLs, strip brackets from IPv6 addresses
231    let host = url.host.map(|h| {
232        if let Some(h) = h.strip_prefix("[").and_then(|h| h.strip_suffix("]")) {
233            h.to_string()
234        } else {
235            h
236        }
237    });
238
239    Ok(crate::Url {
240        serialize_alternative_form: true,
241        scheme: Scheme::from(url.scheme.as_str()),
242        user,
243        password,
244        host,
245        port,
246        path: path.into(),
247    })
248}
249
250fn url_user(url: &crate::simple_url::ParsedUrl<'_>, kind: UrlKind) -> Result<Option<String>, Error> {
251    if url.username.is_empty() && url.password.is_none() {
252        Ok(None)
253    } else {
254        Ok(Some(percent_decoded_utf8(url.username, kind)?))
255    }
256}
257
258pub(crate) fn file_url(input: &BStr, protocol_colon: usize) -> Result<crate::Url, Error> {
259    let input = input_to_utf8(input, UrlKind::Url)?;
260    let input_after_protocol = &input[protocol_colon + "://".len()..];
261
262    let Some(first_slash) = input_after_protocol
263        .find('/')
264        .or_else(|| cfg!(windows).then(|| input_after_protocol.find('\\')).flatten())
265    else {
266        return Err(Error::MissingRepositoryPath {
267            url: input.to_owned().into(),
268            kind: UrlKind::Url,
269        });
270    };
271
272    // We cannot use the url crate to parse host and path because it special cases Windows
273    // driver letters. With the url crate an input of `file://x:/path/to/git` is parsed as empty
274    // host and with `x:/path/to/git` as path. This behavior is wrong for Git which only follows
275    // that rule on Windows and parses `x:` as host on Unix platforms. Additionally, the url crate
276    // does not account for Windows special UNC path support.
277
278    // TODO: implement UNC path special case
279    let windows_special_path = if cfg!(windows) {
280        // Inputs created via url::Url::from_file_path contain an additional `/` between the
281        // protocol and the absolute path. Make sure we ignore that first slash character to avoid
282        // producing invalid paths.
283        let input_after_protocol = if first_slash == 0 {
284            &input_after_protocol[1..]
285        } else {
286            input_after_protocol
287        };
288        // parse `file://x:/path/to/git` as explained above
289        if input_after_protocol.chars().nth(1) == Some(':') {
290            Some(input_after_protocol)
291        } else {
292            None
293        }
294    } else {
295        None
296    };
297
298    let host = if windows_special_path.is_some() || first_slash == 0 {
299        // `file:///path/to/git` or a windows special case was triggered
300        None
301    } else {
302        // `file://host/path/to/git`
303        Some(&input_after_protocol[..first_slash])
304    };
305
306    // default behavior on Unix platforms and if no Windows special case was triggered
307    let path = windows_special_path.unwrap_or(&input_after_protocol[first_slash..]);
308
309    Ok(crate::Url {
310        serialize_alternative_form: false,
311        host: host.map(Into::into),
312        ..local(path.into())?
313    })
314}
315
316pub(crate) fn local(input: &BStr) -> Result<crate::Url, Error> {
317    if input.is_empty() {
318        return Err(Error::MissingRepositoryPath {
319            url: input.to_owned(),
320            kind: UrlKind::Local,
321        });
322    }
323
324    Ok(crate::Url {
325        serialize_alternative_form: true,
326        scheme: Scheme::File,
327        password: None,
328        user: None,
329        host: None,
330        port: None,
331        path: input.to_owned(),
332    })
333}
334
335fn input_to_utf8(input: &BStr, kind: UrlKind) -> Result<&str, Error> {
336    std::str::from_utf8(input).map_err(|source| Error::Utf8 {
337        url: input.to_owned(),
338        kind,
339        source,
340    })
341}
342
343fn input_to_utf8_and_url(input: &BStr, kind: UrlKind) -> Result<(&str, crate::simple_url::ParsedUrl<'_>), Error> {
344    let input = input_to_utf8(input, kind)?;
345    crate::simple_url::ParsedUrl::parse(input)
346        .map(|url| (input, url))
347        .map_err(|source| {
348            // If the parser rejected it as RelativeUrlWithoutBase, map to Error::RelativeUrl
349            // to match the expected error type for malformed URLs like "invalid:://"
350            match source {
351                crate::simple_url::UrlParseError::RelativeUrlWithoutBase => {
352                    Error::RelativeUrl { url: input.to_owned() }
353                }
354                _ => Error::Url {
355                    url: input.to_owned(),
356                    kind,
357                    source,
358                },
359            }
360        })
361}