gix_url/
lib.rs

1//! A library implementing a URL for use in git with access to its special capabilities.
2//! ## Feature Flags
3#![cfg_attr(
4    all(doc, feature = "document-features"),
5    doc = ::document_features::document_features!()
6)]
7#![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg))]
8#![deny(rust_2018_idioms, missing_docs)]
9#![forbid(unsafe_code)]
10
11use std::{borrow::Cow, path::PathBuf};
12
13use bstr::{BStr, BString};
14
15///
16pub mod expand_path;
17
18mod scheme;
19pub use scheme::Scheme;
20mod impls;
21
22///
23pub mod parse;
24
25/// Parse the given `bytes` as a [git url](Url).
26///
27/// # Note
28///
29/// We cannot and should never have to deal with UTF-16 encoded windows strings, so bytes input is acceptable.
30/// For file-paths, we don't expect UTF8 encoding either.
31pub fn parse(input: &BStr) -> Result<Url, parse::Error> {
32    use parse::InputScheme;
33    match parse::find_scheme(input) {
34        InputScheme::Local => parse::local(input),
35        InputScheme::Url { protocol_end } if input[..protocol_end].eq_ignore_ascii_case(b"file") => {
36            parse::file_url(input, protocol_end)
37        }
38        InputScheme::Url { protocol_end } => parse::url(input, protocol_end),
39        InputScheme::Scp { colon } => parse::scp(input, colon),
40    }
41}
42
43/// Expand `path` for the given `user`, which can be obtained by [`parse()`], resolving the home directories
44/// of `user` automatically.
45///
46/// If more precise control of the resolution mechanism is needed, then use the [expand_path::with()] function.
47pub fn expand_path(user: Option<&expand_path::ForUser>, path: &BStr) -> Result<PathBuf, expand_path::Error> {
48    expand_path::with(user, path, |user| match user {
49        expand_path::ForUser::Current => gix_path::env::home_dir(),
50        expand_path::ForUser::Name(user) => {
51            gix_path::env::home_dir().and_then(|home| home.parent().map(|home_dirs| home_dirs.join(user.to_string())))
52        }
53    })
54}
55
/// Classification of a portion of a URL by whether it is *syntactically* safe to pass as an argument to a command-line program.
///
/// Various parts of URLs can be specified to begin with `-`. If they are used as options to a command-line application
/// such as an SSH client, they will be treated as options rather than as non-option arguments as the developer intended.
/// This is a security risk, because URLs are not always trusted and can often be composed or influenced by an attacker.
/// See <https://secure.phabricator.com/T12961> for details.
///
/// The lifetime `'a` is the one of the string slice that was classified.
///
/// # Security Warning
///
/// This type only expresses known *syntactic* risk. It does not cover other risks, such as passing a personal access
/// token as a username rather than a password in an application that logs usernames.
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum ArgumentSafety<'a> {
    /// May be safe. There is nothing to pass, so there is nothing dangerous.
    Absent,
    /// May be safe. The argument does not begin with a `-` and so will not be confused as an option.
    Usable(&'a str),
    /// Dangerous! Begins with `-` and could be treated as an option. Use the value in error messages only,
    /// and never pass it to a command-line application.
    Dangerous(&'a str),
}
76
/// A URL with support for specialized git related capabilities.
///
/// Additionally, there is support for [deserialization](Url::from_bytes()) and [serialization](Url::to_bstring()).
///
/// # Mutability Warning
///
/// Due to the mutability of this type, it's possible that the URL serializes to something invalid
/// when fields are modified directly. URLs should always be parsed to this type from string or byte
/// parameters, but never be accepted as an instance of this type and then reconstructed, to maintain
/// validity guarantees.
///
/// # Serialization
///
/// This type implements neither `Into<String>` nor `From<Url> for String` because URLs
/// can contain non-UTF-8 sequences in the path component when parsed from raw bytes.
/// Use [to_bstring()](Url::to_bstring()) for lossless serialization, or use the [`Display`](std::fmt::Display)
/// trait for a UTF-8 representation that redacts passwords for safe logging.
///
/// When the `serde` feature is enabled, this type implements `serde::Serialize` and `serde::Deserialize`,
/// which will serialize *all* fields, including the password.
///
/// # Security Warning
///
/// URLs may contain passwords and using standard [formatting](std::fmt::Display) will redact
/// such password, whereas [lossless serialization](Url::to_bstring()) will contain all parts of the
/// URL.
/// **Beware that some URLs still print secrets if they use them outside of the designated password fields.**
///
/// Also note that URLs that fail to parse are typically stored in [the resulting error](parse::Error) type
/// and printed in full using its display implementation.
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Url {
    /// The URL scheme.
    pub scheme: Scheme,
    /// The user to impersonate on the remote.
    pub user: Option<String>,
    /// The password associated with a user.
    pub password: Option<String>,
    /// The host to which to connect. Localhost is implied if `None`.
    pub host: Option<String>,
    /// If `true`, prefer the alternative form when serializing, e.g. `/path` instead of `file:///path`,
    /// typically because the URL was parsed from such a form.
    ///
    /// The alternative form is only used for `file` and `ssh` URLs that have neither a password nor a port.
    pub serialize_alternative_form: bool,
    /// The port to use when connecting to a host. If `None`, standard ports depending on `scheme` will be used.
    pub port: Option<u16>,
    /// The path portion of the URL, usually the location of the git repository.
    ///
    /// # Security Warning
    ///
    /// URLs allow paths to start with `-` which makes it possible to mask command-line arguments as path which then leads to
    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
    ///
    /// If this value is ever going to be passed to a command-line application, call [Self::path_argument_safe()] instead.
    pub path: BString,
}
132
133/// Instantiation
134impl Url {
135    /// Create a new instance from the given parts, including a password, which will be validated by parsing them back.
136    pub fn from_parts(
137        scheme: Scheme,
138        user: Option<String>,
139        password: Option<String>,
140        host: Option<String>,
141        port: Option<u16>,
142        path: BString,
143        serialize_alternative_form: bool,
144    ) -> Result<Self, parse::Error> {
145        parse(
146            Url {
147                scheme,
148                user,
149                password,
150                host,
151                port,
152                path,
153                serialize_alternative_form,
154            }
155            .to_bstring()
156            .as_ref(),
157        )
158    }
159}
160
161/// Modification
162impl Url {
163    /// Set the given `user`, or unset it with `None`. Return the previous value.
164    pub fn set_user(&mut self, user: Option<String>) -> Option<String> {
165        let prev = self.user.take();
166        self.user = user;
167        prev
168    }
169
170    /// Set the given `password`, or unset it with `None`. Return the previous value.
171    pub fn set_password(&mut self, password: Option<String>) -> Option<String> {
172        let prev = self.password.take();
173        self.password = password;
174        prev
175    }
176}
177
178/// Builder
179impl Url {
180    /// Enable alternate serialization for this url, e.g. `file:///path` becomes `/path`.
181    ///
182    /// This is automatically set correctly for parsed URLs, but can be set here for urls
183    /// created by constructor.
184    pub fn serialize_alternate_form(mut self, use_alternate_form: bool) -> Self {
185        self.serialize_alternative_form = use_alternate_form;
186        self
187    }
188
189    /// Turn a file url like `file://relative` into `file:///root/relative`, hence it assures the url's path component is absolute,
190    /// using `current_dir` if needed to achieve that.
191    pub fn canonicalize(&mut self, current_dir: &std::path::Path) -> Result<(), gix_path::realpath::Error> {
192        if self.scheme == Scheme::File {
193            let path = gix_path::from_bstr(Cow::Borrowed(self.path.as_ref()));
194            let abs_path = gix_path::realpath_opts(path.as_ref(), current_dir, gix_path::realpath::MAX_SYMLINKS)?;
195            self.path = gix_path::into_bstr(abs_path).into_owned();
196        }
197        Ok(())
198    }
199}
200
201/// Access
202impl Url {
203    /// Return the username mentioned in the URL, if present.
204    ///
205    /// # Security Warning
206    ///
207    /// URLs allow usernames to start with `-` which makes it possible to mask command-line arguments as username which then leads to
208    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
209    ///
210    /// If this value is ever going to be passed to a command-line application, call [Self::user_argument_safe()] instead.
211    pub fn user(&self) -> Option<&str> {
212        self.user.as_deref()
213    }
214
215    /// Classify the username of this URL by whether it is safe to pass as a command-line argument.
216    ///
217    /// Use this method instead of [Self::user()] if the host is going to be passed to a command-line application.
218    /// If the unsafe and absent cases need not be distinguished, [Self::user_argument_safe()] may also be used.
219    pub fn user_as_argument(&self) -> ArgumentSafety<'_> {
220        match self.user() {
221            Some(user) if looks_like_command_line_option(user.as_bytes()) => ArgumentSafety::Dangerous(user),
222            Some(user) => ArgumentSafety::Usable(user),
223            None => ArgumentSafety::Absent,
224        }
225    }
226
227    /// Return the username of this URL if present *and* if it can't be mistaken for a command-line argument.
228    ///
229    /// Use this method or [Self::user_as_argument()] instead of [Self::user()] if the host is going to be
230    /// passed to a command-line application. Prefer [Self::user_as_argument()] unless the unsafe and absent
231    /// cases need not be distinguished from each other.
232    pub fn user_argument_safe(&self) -> Option<&str> {
233        match self.user_as_argument() {
234            ArgumentSafety::Usable(user) => Some(user),
235            _ => None,
236        }
237    }
238
239    /// Return the password mentioned in the url, if present.
240    pub fn password(&self) -> Option<&str> {
241        self.password.as_deref()
242    }
243
244    /// Return the host mentioned in the URL, if present.
245    ///
246    /// # Security Warning
247    ///
248    /// URLs allow hosts to start with `-` which makes it possible to mask command-line arguments as host which then leads to
249    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
250    ///
251    /// If this value is ever going to be passed to a command-line application, call [Self::host_as_argument()]
252    /// or [Self::host_argument_safe()] instead.
253    pub fn host(&self) -> Option<&str> {
254        self.host.as_deref()
255    }
256
257    /// Classify the host of this URL by whether it is safe to pass as a command-line argument.
258    ///
259    /// Use this method instead of [Self::host()] if the host is going to be passed to a command-line application.
260    /// If the unsafe and absent cases need not be distinguished, [Self::host_argument_safe()] may also be used.
261    pub fn host_as_argument(&self) -> ArgumentSafety<'_> {
262        match self.host() {
263            Some(host) if looks_like_command_line_option(host.as_bytes()) => ArgumentSafety::Dangerous(host),
264            Some(host) => ArgumentSafety::Usable(host),
265            None => ArgumentSafety::Absent,
266        }
267    }
268
269    /// Return the host of this URL if present *and* if it can't be mistaken for a command-line argument.
270    ///
271    /// Use this method or [Self::host_as_argument()] instead of [Self::host()] if the host is going to be
272    /// passed to a command-line application. Prefer [Self::host_as_argument()] unless the unsafe and absent
273    /// cases need not be distinguished from each other.
274    pub fn host_argument_safe(&self) -> Option<&str> {
275        match self.host_as_argument() {
276            ArgumentSafety::Usable(host) => Some(host),
277            _ => None,
278        }
279    }
280
281    /// Return the path of this URL *if* it can't be mistaken for a command-line argument.
282    /// Note that it always begins with a slash, which is ignored for this comparison.
283    ///
284    /// Use this method instead of accessing [Self::path] directly if the path is going to be passed to a
285    /// command-line application, unless it is certain that the leading `/` will always be included.
286    pub fn path_argument_safe(&self) -> Option<&BStr> {
287        self.path
288            .get(1..)
289            .and_then(|truncated| (!looks_like_command_line_option(truncated)).then_some(self.path.as_ref()))
290    }
291
292    /// Return true if the path portion of the URL is `/`.
293    pub fn path_is_root(&self) -> bool {
294        self.path == "/"
295    }
296
297    /// Return the actual or default port for use according to the URL scheme.
298    /// Note that there may be no default port either.
299    pub fn port_or_default(&self) -> Option<u16> {
300        self.port.or_else(|| {
301            use Scheme::*;
302            Some(match self.scheme {
303                Http => 80,
304                Https => 443,
305                Ssh => 22,
306                Git => 9418,
307                File | Ext(_) => return None,
308            })
309        })
310    }
311}
312
/// Return `true` if `b` starts with `-`, which is what makes command-line applications treat a value as option.
/// Empty input is never considered an option.
fn looks_like_command_line_option(b: &[u8]) -> bool {
    matches!(b, [b'-', ..])
}
316
317/// Transformation
318impl Url {
319    /// Turn a file URL like `file://relative` into `file:///root/relative`, hence it assures the URL's path component is absolute, using
320    /// `current_dir` if necessary.
321    pub fn canonicalized(&self, current_dir: &std::path::Path) -> Result<Self, gix_path::realpath::Error> {
322        let mut res = self.clone();
323        res.canonicalize(current_dir)?;
324        Ok(res)
325    }
326}
327
328fn percent_encode(s: &str) -> Cow<'_, str> {
329    percent_encoding::utf8_percent_encode(s, percent_encoding::NON_ALPHANUMERIC).into()
330}
331
332/// Serialization
333impl Url {
334    /// Write this URL losslessly to `out`, ready to be parsed again.
335    pub fn write_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
336        // Since alternative form doesn't employ any escape syntax, password and
337        // port number cannot be encoded.
338        if self.serialize_alternative_form
339            && (self.scheme == Scheme::File || self.scheme == Scheme::Ssh)
340            && self.password.is_none()
341            && self.port.is_none()
342        {
343            self.write_alternative_form_to(out)
344        } else {
345            self.write_canonical_form_to(out)
346        }
347    }
348
349    fn write_canonical_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
350        out.write_all(self.scheme.as_str().as_bytes())?;
351        out.write_all(b"://")?;
352        match (&self.user, &self.host) {
353            (Some(user), Some(host)) => {
354                out.write_all(percent_encode(user).as_bytes())?;
355                if let Some(password) = &self.password {
356                    out.write_all(b":")?;
357                    out.write_all(percent_encode(password).as_bytes())?;
358                }
359                out.write_all(b"@")?;
360                out.write_all(host.as_bytes())?;
361            }
362            (None, Some(host)) => {
363                out.write_all(host.as_bytes())?;
364            }
365            (None, None) => {}
366            (Some(_user), None) => {
367                return Err(std::io::Error::other(
368                    "Invalid URL structure: user specified without host",
369                ));
370            }
371        }
372        if let Some(port) = &self.port {
373            write!(out, ":{port}")?;
374        }
375        out.write_all(&self.path)?;
376        Ok(())
377    }
378
379    fn write_alternative_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
380        match (&self.user, &self.host) {
381            (Some(user), Some(host)) => {
382                out.write_all(user.as_bytes())?;
383                assert!(
384                    self.password.is_none(),
385                    "BUG: cannot serialize password in alternative form"
386                );
387                out.write_all(b"@")?;
388                out.write_all(host.as_bytes())?;
389            }
390            (None, Some(host)) => {
391                out.write_all(host.as_bytes())?;
392            }
393            (None, None) => {}
394            (Some(_user), None) => {
395                return Err(std::io::Error::other(
396                    "Invalid URL structure: user specified without host",
397                ));
398            }
399        }
400        assert!(self.port.is_none(), "BUG: cannot serialize port in alternative form");
401        if self.scheme == Scheme::Ssh {
402            out.write_all(b":")?;
403        }
404        out.write_all(&self.path)?;
405        Ok(())
406    }
407
408    /// Transform ourselves into a binary string, losslessly, or fail if the URL is malformed due to host or user parts being incorrect.
409    pub fn to_bstring(&self) -> BString {
410        let mut buf = Vec::with_capacity(
411            (5 + 3)
412                + self.user.as_ref().map(String::len).unwrap_or_default()
413                + 1
414                + self.host.as_ref().map(String::len).unwrap_or_default()
415                + self.port.map(|_| 5).unwrap_or_default()
416                + self.path.len(),
417        );
418        self.write_to(&mut buf).expect("io cannot fail in memory");
419        buf.into()
420    }
421}
422
423/// Deserialization
424impl Url {
425    /// Parse a URL from `bytes`.
426    pub fn from_bytes(bytes: &BStr) -> Result<Self, parse::Error> {
427        parse(bytes)
428    }
429}
430
431/// This module contains extensions to the [Url] struct which are only intended to be used
432/// for testing code. Do not use this module in production! For all intents and purposes, the APIs of
433/// all functions and types exposed by this module are considered unstable and are allowed to break
434/// even in patch releases!
435#[doc(hidden)]
436pub mod testing {
437    use bstr::BString;
438
439    use crate::{Scheme, Url};
440
441    /// Additional functions for [Url] which are only intended to be used for tests.
442    pub trait TestUrlExtension {
443        /// Create a new instance from the given parts without validating them.
444        ///
445        /// This function is primarily intended for testing purposes. For production code please
446        /// consider using [Url::from_parts] instead!
447        fn from_parts_unchecked(
448            scheme: Scheme,
449            user: Option<String>,
450            password: Option<String>,
451            host: Option<String>,
452            port: Option<u16>,
453            path: BString,
454            serialize_alternative_form: bool,
455        ) -> Url {
456            Url {
457                scheme,
458                user,
459                password,
460                host,
461                port,
462                path,
463                serialize_alternative_form,
464            }
465        }
466    }
467
468    impl TestUrlExtension for Url {}
469}