Skip to main content

gix_url/
lib.rs

1//! A library implementing a URL for use in git with access to its special capabilities.
2//! ## Feature Flags
3#![cfg_attr(
4    all(doc, feature = "document-features"),
5    doc = ::document_features::document_features!()
6)]
7#![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg))]
8#![deny(rust_2018_idioms, missing_docs)]
9#![forbid(unsafe_code)]
10
11use std::{borrow::Cow, path::PathBuf};
12
13use bstr::{BStr, BString};
14
15///
16pub mod expand_path;
17
18mod scheme;
19pub use scheme::Scheme;
20mod impls;
21
22///
23pub mod parse;
24
25/// Minimal URL parser to replace the `url` crate dependency
26mod simple_url;
27
28/// Parse the given `bytes` as a [git url](Url).
29///
30/// # Note
31///
32/// We cannot and should never have to deal with UTF-16 encoded windows strings, so bytes input is acceptable.
33/// For file-paths, we don't expect UTF8 encoding either.
34pub fn parse(input: &BStr) -> Result<Url, parse::Error> {
35    use parse::InputScheme;
36    match parse::find_scheme(input) {
37        InputScheme::Local => parse::local(input),
38        InputScheme::Url { protocol_end } if input[..protocol_end].eq_ignore_ascii_case(b"file") => {
39            parse::file_url(input, protocol_end)
40        }
41        InputScheme::Url { protocol_end } => parse::url(input, protocol_end),
42        InputScheme::Scp { colon } => parse::scp(input, colon),
43    }
44}
45
46/// Expand `path` for the given `user`, which can be obtained by [`parse()`], resolving the home directories
47/// of `user` automatically.
48///
49/// If more precise control of the resolution mechanism is needed, then use the [expand_path::with()] function.
50pub fn expand_path(user: Option<&expand_path::ForUser>, path: &BStr) -> Result<PathBuf, expand_path::Error> {
51    expand_path::with(user, path, |user| match user {
52        expand_path::ForUser::Current => gix_path::env::home_dir(),
53        expand_path::ForUser::Name(user) => {
54            gix_path::env::home_dir().and_then(|home| home.parent().map(|home_dirs| home_dirs.join(user.to_string())))
55        }
56    })
57}
58
/// Tells whether a portion of a URL is *syntactically* safe to hand to a command-line program as an argument.
///
/// Several parts of a URL may begin with `-`. When such a part is passed to a command-line application,
/// for instance an SSH client, it is interpreted as an option instead of the non-option argument the
/// developer had in mind. As URLs are not always trusted and are often composed or influenced by an
/// attacker, this is a security risk.
/// See <https://secure.phabricator.com/T12961> for details.
///
/// # Security Warning
///
/// Only known *syntactic* risk is expressed by this type. Other risks are not covered, for example passing
/// a personal access token as a username rather than a password in an application that logs usernames.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArgumentSafety<'a> {
    /// May be safe. Nothing is there to be passed, hence nothing dangerous can be passed.
    Absent,
    /// May be safe. The argument doesn't start with a `-`, so it won't be mistaken for an option.
    Usable(&'a str),
    /// Dangerous! Starts with `-` and could be treated as an option. Only use the value in error messages.
    Dangerous(&'a str),
}
79
/// A URL with support for specialized git related capabilities.
///
/// Additionally, there is support for [deserialization](Url::from_bytes()) and [serialization](Url::to_bstring()).
///
/// # Mutability Warning
///
/// Due to the mutability of this type, it's possible that the URL serializes to something invalid
/// when fields are modified directly. URLs should always be parsed to this type from string or byte
/// parameters, but never be accepted as an instance of this type and then reconstructed, to maintain
/// validity guarantees.
///
/// # Serialization
///
/// This type does not implement `Into<String>`, `From<Url> for String` because URLs
/// can contain non-UTF-8 sequences in the path component when parsed from raw bytes.
/// Use [to_bstring()](Url::to_bstring()) for lossless serialization, or use the [`Display`](std::fmt::Display)
/// trait for a UTF-8 representation that redacts passwords for safe logging.
///
/// When the `serde` feature is enabled, this type implements `serde::Serialize` and `serde::Deserialize`,
/// which will serialize *all* fields, including the password.
///
/// # Security Warning
///
/// URLs may contain passwords and using standard [formatting](std::fmt::Display) will redact
/// such password, whereas [lossless serialization](Url::to_bstring()) will contain all parts of the
/// URL.
/// **Beware that some URLs still print secrets if they use them outside of the designated password fields.**
///
/// Also note that URLs that fail to parse are typically stored in [the resulting error](parse::Error) type
/// and printed in full using its display implementation.
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Url {
    /// The URL scheme.
    pub scheme: Scheme,
    /// The user to impersonate on the remote.
    ///
    /// Stored in decoded form: percent-encoded characters are decoded during parsing.
    /// Re-encoded during canonical serialization, but written as-is in alternative form.
    ///
    /// Serialization fails if a user is set while [`host`](Self::host) is `None`.
    pub user: Option<String>,
    /// The password associated with a user.
    ///
    /// Stored in decoded form: percent-encoded characters are decoded during parsing.
    /// Re-encoded during canonical serialization. The alternative form cannot express a password,
    /// which is why the canonical form is written whenever a password is set, even if
    /// [`serialize_alternative_form`](Self::serialize_alternative_form) is `true`.
    ///
    /// The password is only written if a user is set as well.
    pub password: Option<String>,
    /// The host to which to connect. Localhost is implied if `None`.
    ///
    /// IPv6 addresses are stored *without* brackets for SSH schemes, but *with* brackets for other schemes.
    /// During serialization, a host that contains `:` and does not start with `[` is enclosed in brackets:
    /// always in alternative form, and in canonical form only if a port is set as well.
    pub host: Option<String>,
    /// When serializing, prefer the alternative form because the URL was parsed from it.
    ///
    /// Alternative forms include SCP-like syntax (`user@host:path`) and bare file paths.
    /// The alternative form is only written for the `file` and `ssh` schemes, and only if neither
    /// a password nor a port is set as these cannot be expressed. In all other cases the canonical
    /// form is written even if this field is `true`.
    pub serialize_alternative_form: bool,
    /// The port to use when connecting to a host. If `None`, standard ports depending on `scheme` will be used.
    pub port: Option<u16>,
    /// The path portion of the URL, usually the location of the git repository.
    ///
    /// Paths are stored in decoded form: percent-encoded characters are decoded during parsing
    /// (e.g., `%20` becomes a space in this field).
    /// Serialization writes the bytes of this field as they are, without percent-encoding them.
    ///
    /// Path normalization during parsing:
    /// - SSH/Git schemes: Leading `/~` is stripped (e.g., `/~repo` becomes `~repo`)
    /// - SSH/Git schemes: Empty paths are rejected as errors
    /// - HTTP/HTTPS schemes: Empty paths are normalized to `/`
    ///
    /// During canonical serialization, SSH/Git URLs prepend `/` to paths not starting with `/`.
    ///
    /// # Security Warning
    ///
    /// URLs allow paths to start with `-` which makes it possible to mask command-line arguments as path which then leads to
    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
    ///
    /// If this value is ever going to be passed to a command-line application, call [Self::path_argument_safe()] instead.
    pub path: BString,
}
157
158/// Instantiation
159impl Url {
160    /// Create a new instance from the given parts, including a password, which will be validated by parsing them back.
161    pub fn from_parts(
162        scheme: Scheme,
163        user: Option<String>,
164        password: Option<String>,
165        host: Option<String>,
166        port: Option<u16>,
167        path: BString,
168        serialize_alternative_form: bool,
169    ) -> Result<Self, parse::Error> {
170        parse(
171            Url {
172                scheme,
173                user,
174                password,
175                host,
176                port,
177                path,
178                serialize_alternative_form,
179            }
180            .to_bstring()
181            .as_ref(),
182        )
183    }
184}
185
186/// Modification
187impl Url {
188    /// Set the given `user`, or unset it with `None`. Return the previous value.
189    pub fn set_user(&mut self, user: Option<String>) -> Option<String> {
190        let prev = self.user.take();
191        self.user = user;
192        prev
193    }
194
195    /// Set the given `password`, or unset it with `None`. Return the previous value.
196    pub fn set_password(&mut self, password: Option<String>) -> Option<String> {
197        let prev = self.password.take();
198        self.password = password;
199        prev
200    }
201}
202
203/// Builder
204impl Url {
205    /// Enable alternate serialization for this url, e.g. `file:///path` becomes `/path`.
206    ///
207    /// This is automatically set correctly for parsed URLs, but can be set here for urls
208    /// created by constructor.
209    pub fn serialize_alternate_form(mut self, use_alternate_form: bool) -> Self {
210        self.serialize_alternative_form = use_alternate_form;
211        self
212    }
213
214    /// Turn a file url like `file://relative` into `file:///root/relative`, hence it assures the url's path component is absolute,
215    /// using `current_dir` if needed to achieve that.
216    pub fn canonicalize(&mut self, current_dir: &std::path::Path) -> Result<(), gix_path::realpath::Error> {
217        if self.scheme == Scheme::File {
218            let path = gix_path::from_bstr(Cow::Borrowed(self.path.as_ref()));
219            let abs_path = gix_path::realpath_opts(path.as_ref(), current_dir, gix_path::realpath::MAX_SYMLINKS)?;
220            self.path = gix_path::into_bstr(abs_path).into_owned();
221        }
222        Ok(())
223    }
224}
225
226/// Access
227impl Url {
228    /// Return the username mentioned in the URL, if present.
229    ///
230    /// # Security Warning
231    ///
232    /// URLs allow usernames to start with `-` which makes it possible to mask command-line arguments as username which then leads to
233    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
234    ///
235    /// If this value is ever going to be passed to a command-line application, call [Self::user_argument_safe()] instead.
236    pub fn user(&self) -> Option<&str> {
237        self.user.as_deref()
238    }
239
240    /// Classify the username of this URL by whether it is safe to pass as a command-line argument.
241    ///
242    /// Use this method instead of [Self::user()] if the host is going to be passed to a command-line application.
243    /// If the unsafe and absent cases need not be distinguished, [Self::user_argument_safe()] may also be used.
244    pub fn user_as_argument(&self) -> ArgumentSafety<'_> {
245        match self.user() {
246            Some(user) if looks_like_command_line_option(user.as_bytes()) => ArgumentSafety::Dangerous(user),
247            Some(user) => ArgumentSafety::Usable(user),
248            None => ArgumentSafety::Absent,
249        }
250    }
251
252    /// Return the username of this URL if present *and* if it can't be mistaken for a command-line argument.
253    ///
254    /// Use this method or [Self::user_as_argument()] instead of [Self::user()] if the host is going to be
255    /// passed to a command-line application. Prefer [Self::user_as_argument()] unless the unsafe and absent
256    /// cases need not be distinguished from each other.
257    pub fn user_argument_safe(&self) -> Option<&str> {
258        match self.user_as_argument() {
259            ArgumentSafety::Usable(user) => Some(user),
260            _ => None,
261        }
262    }
263
264    /// Return the password mentioned in the url, if present.
265    pub fn password(&self) -> Option<&str> {
266        self.password.as_deref()
267    }
268
269    /// Return the host mentioned in the URL, if present.
270    ///
271    /// # Security Warning
272    ///
273    /// URLs allow hosts to start with `-` which makes it possible to mask command-line arguments as host which then leads to
274    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
275    ///
276    /// If this value is ever going to be passed to a command-line application, call [Self::host_as_argument()]
277    /// or [Self::host_argument_safe()] instead.
278    pub fn host(&self) -> Option<&str> {
279        self.host.as_deref()
280    }
281
282    /// Classify the host of this URL by whether it is safe to pass as a command-line argument.
283    ///
284    /// Use this method instead of [Self::host()] if the host is going to be passed to a command-line application.
285    /// If the unsafe and absent cases need not be distinguished, [Self::host_argument_safe()] may also be used.
286    pub fn host_as_argument(&self) -> ArgumentSafety<'_> {
287        match self.host() {
288            Some(host) if looks_like_command_line_option(host.as_bytes()) => ArgumentSafety::Dangerous(host),
289            Some(host) => ArgumentSafety::Usable(host),
290            None => ArgumentSafety::Absent,
291        }
292    }
293
294    /// Return the host of this URL if present *and* if it can't be mistaken for a command-line argument.
295    ///
296    /// Use this method or [Self::host_as_argument()] instead of [Self::host()] if the host is going to be
297    /// passed to a command-line application. Prefer [Self::host_as_argument()] unless the unsafe and absent
298    /// cases need not be distinguished from each other.
299    pub fn host_argument_safe(&self) -> Option<&str> {
300        match self.host_as_argument() {
301            ArgumentSafety::Usable(host) => Some(host),
302            _ => None,
303        }
304    }
305
306    /// Return the path of this URL *if* it can't be mistaken for a command-line argument.
307    /// Note that it always begins with a slash, which is ignored for this comparison.
308    ///
309    /// Use this method instead of accessing [Self::path] directly if the path is going to be passed to a
310    /// command-line application, unless it is certain that the leading `/` will always be included.
311    pub fn path_argument_safe(&self) -> Option<&BStr> {
312        self.path
313            .get(1..)
314            .and_then(|truncated| (!looks_like_command_line_option(truncated)).then_some(self.path.as_ref()))
315    }
316
317    /// Return true if the path portion of the URL is `/`.
318    pub fn path_is_root(&self) -> bool {
319        self.path == "/"
320    }
321
322    /// Return the actual or default port for use according to the URL scheme.
323    /// Note that there may be no default port either.
324    pub fn port_or_default(&self) -> Option<u16> {
325        self.port.or_else(|| {
326            use Scheme::*;
327            Some(match self.scheme {
328                Http => 80,
329                Https => 443,
330                Ssh => 22,
331                Git => 9418,
332                File | Ext(_) => return None,
333            })
334        })
335    }
336}
337
/// Return `true` if the first byte of `b` is `-`, which is what makes a
/// command-line application treat an argument as an option.
fn looks_like_command_line_option(b: &[u8]) -> bool {
    matches!(b, [b'-', ..])
}
341
342/// Transformation
343impl Url {
344    /// Turn a file URL like `file://relative` into `file:///root/relative`, hence it assures the URL's path component is absolute, using
345    /// `current_dir` if necessary.
346    pub fn canonicalized(&self, current_dir: &std::path::Path) -> Result<Self, gix_path::realpath::Error> {
347        let mut res = self.clone();
348        res.canonicalize(current_dir)?;
349        Ok(res)
350    }
351}
352
353/// Serialization
354impl Url {
355    /// Write this URL losslessly to `out`, ready to be parsed again.
356    pub fn write_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
357        // Since alternative form doesn't employ any escape syntax, password and
358        // port number cannot be encoded.
359        if self.serialize_alternative_form
360            && (self.scheme == Scheme::File || self.scheme == Scheme::Ssh)
361            && self.password.is_none()
362            && self.port.is_none()
363        {
364            self.write_alternative_form_to(out)
365        } else {
366            self.write_canonical_form_to(out)
367        }
368    }
369
370    fn write_canonical_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
371        fn percent_encode(s: &str) -> Cow<'_, str> {
372            /// Characters that must be percent-encoded in the userinfo component of a URL.
373            ///
374            /// According to RFC 3986, userinfo can contain:
375            /// - unreserved characters: `A-Z a-z 0-9 - . _ ~`
376            /// - percent-encoded characters
377            /// - sub-delims: `! $ & ' ( ) * + , ; =`
378            /// - `:`
379            ///
380            /// This encode-set encodes everything else, particularly `@` (userinfo delimiter),
381            /// `/` `?` `#` (path/query/fragment delimiters), and various other special characters.
382            const USERINFO_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
383                .add(b' ')
384                .add(b'"')
385                .add(b'#')
386                .add(b'%')
387                .add(b'/')
388                .add(b'<')
389                .add(b'>')
390                .add(b'?')
391                .add(b'@')
392                .add(b'[')
393                .add(b'\\')
394                .add(b']')
395                .add(b'^')
396                .add(b'`')
397                .add(b'{')
398                .add(b'|')
399                .add(b'}');
400            percent_encoding::utf8_percent_encode(s, USERINFO_ENCODE_SET).into()
401        }
402
403        out.write_all(self.scheme.as_str().as_bytes())?;
404        out.write_all(b"://")?;
405
406        let needs_brackets = self.port.is_some() && self.host_needs_brackets();
407
408        match (&self.user, &self.host) {
409            (Some(user), Some(host)) => {
410                out.write_all(percent_encode(user).as_bytes())?;
411                if let Some(password) = &self.password {
412                    out.write_all(b":")?;
413                    out.write_all(percent_encode(password).as_bytes())?;
414                }
415                out.write_all(b"@")?;
416                if needs_brackets {
417                    out.write_all(b"[")?;
418                }
419                out.write_all(host.as_bytes())?;
420                if needs_brackets {
421                    out.write_all(b"]")?;
422                }
423            }
424            (None, Some(host)) => {
425                if needs_brackets {
426                    out.write_all(b"[")?;
427                }
428                out.write_all(host.as_bytes())?;
429                if needs_brackets {
430                    out.write_all(b"]")?;
431                }
432            }
433            (None, None) => {}
434            (Some(_user), None) => {
435                return Err(std::io::Error::other(
436                    "Invalid URL structure: user specified without host",
437                ));
438            }
439        }
440        if let Some(port) = &self.port {
441            write!(out, ":{port}")?;
442        }
443        // For SSH and Git URLs, add leading '/' if path doesn't start with '/'
444        // This handles paths like "~repo" which serialize as "/~repo" in URL form
445        if matches!(self.scheme, Scheme::Ssh | Scheme::Git) && !self.path.starts_with(b"/") {
446            out.write_all(b"/")?;
447        }
448        out.write_all(&self.path)?;
449        Ok(())
450    }
451
452    fn host_needs_brackets(&self) -> bool {
453        fn is_ipv6(h: &str) -> bool {
454            h.contains(':') && !h.starts_with('[')
455        }
456        self.host.as_ref().is_some_and(|h| is_ipv6(h))
457    }
458
459    fn write_alternative_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
460        let needs_brackets = self.host_needs_brackets();
461
462        match (&self.user, &self.host) {
463            (Some(user), Some(host)) => {
464                out.write_all(user.as_bytes())?;
465                out.write_all(b"@")?;
466                if needs_brackets {
467                    out.write_all(b"[")?;
468                }
469                out.write_all(host.as_bytes())?;
470                if needs_brackets {
471                    out.write_all(b"]")?;
472                }
473            }
474            (None, Some(host)) => {
475                if needs_brackets {
476                    out.write_all(b"[")?;
477                }
478                out.write_all(host.as_bytes())?;
479                if needs_brackets {
480                    out.write_all(b"]")?;
481                }
482            }
483            (None, None) => {}
484            (Some(_user), None) => {
485                return Err(std::io::Error::other(
486                    "Invalid URL structure: user specified without host",
487                ));
488            }
489        }
490        if self.scheme == Scheme::Ssh {
491            out.write_all(b":")?;
492        }
493        out.write_all(&self.path)?;
494        Ok(())
495    }
496
497    /// Transform ourselves into a binary string, losslessly, or fail if the URL is malformed due to host or user parts being incorrect.
498    pub fn to_bstring(&self) -> BString {
499        let mut buf = Vec::with_capacity(
500            (5 + 3)
501                + self.user.as_ref().map(String::len).unwrap_or_default()
502                + 1
503                + self.host.as_ref().map(String::len).unwrap_or_default()
504                + self.port.map(|_| 5).unwrap_or_default()
505                + self.path.len(),
506        );
507        self.write_to(&mut buf).expect("io cannot fail in memory");
508        buf.into()
509    }
510}
511
512/// Deserialization
513impl Url {
514    /// Parse a URL from `bytes`.
515    pub fn from_bytes(bytes: &BStr) -> Result<Self, parse::Error> {
516        parse(bytes)
517    }
518}
519
520/// This module contains extensions to the [Url] struct which are only intended to be used
521/// for testing code. Do not use this module in production! For all intents and purposes, the APIs of
522/// all functions and types exposed by this module are considered unstable and are allowed to break
523/// even in patch releases!
524#[doc(hidden)]
525pub mod testing {
526    use bstr::BString;
527
528    use crate::{Scheme, Url};
529
530    /// Additional functions for [Url] which are only intended to be used for tests.
531    pub trait TestUrlExtension {
532        /// Create a new instance from the given parts without validating them.
533        ///
534        /// This function is primarily intended for testing purposes. For production code please
535        /// consider using [Url::from_parts] instead!
536        fn from_parts_unchecked(
537            scheme: Scheme,
538            user: Option<String>,
539            password: Option<String>,
540            host: Option<String>,
541            port: Option<u16>,
542            path: BString,
543            serialize_alternative_form: bool,
544        ) -> Url {
545            Url {
546                scheme,
547                user,
548                password,
549                host,
550                port,
551                path,
552                serialize_alternative_form,
553            }
554        }
555    }
556
557    impl TestUrlExtension for Url {}
558}