Skip to main content

gix_url/
lib.rs

1//! A library implementing a URL for use in git with access to its special capabilities.
2//!
3//! ## Examples
4//!
5//! ```
6//! let mut url = gix_url::parse("ssh://git@example.com/gitoxide".into()).unwrap();
7//! assert_eq!(url.user(), Some("git"));
8//! assert_eq!(url.host(), Some("example.com"));
9//! assert_eq!(url.to_bstring(), "ssh://git@example.com/gitoxide");
10//!
11//! assert_eq!(url.set_user(Some("byron".into())), Some("git".into()));
12//! assert_eq!(url.user_argument_safe(), Some("byron"));
13//! assert_eq!(url.to_bstring(), "ssh://byron@example.com/gitoxide");
14//!
15//! let suspicious = gix_url::parse("ssh://-Fconfig@host/repo".into()).unwrap();
16//! assert_eq!(suspicious.user_argument_safe(), None, "The user isn't returned as it looks like an argument");
17//! ```
18//! ## Feature Flags
19#![cfg_attr(
20    all(doc, feature = "document-features"),
21    doc = ::document_features::document_features!()
22)]
23#![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg))]
24#![deny(rust_2018_idioms, missing_docs)]
25#![forbid(unsafe_code)]
26
27use std::{borrow::Cow, path::PathBuf};
28
29use bstr::{BStr, BString};
30
31///
32pub mod expand_path;
33
34mod scheme;
35pub use scheme::Scheme;
36mod impls;
37
38///
39pub mod parse;
40
41/// Minimal URL parser to replace the `url` crate dependency
42mod simple_url;
43
44/// Parse the given `bytes` as a [git url](Url).
45///
46/// # Note
47///
48/// We cannot and should never have to deal with UTF-16 encoded windows strings, so bytes input is acceptable.
49/// For file-paths, we don't expect UTF8 encoding either.
50pub fn parse(input: &BStr) -> Result<Url, parse::Error> {
51    use parse::InputScheme;
52    match parse::find_scheme(input) {
53        InputScheme::Local => parse::local(input),
54        InputScheme::Url { protocol_end } if input[..protocol_end].eq_ignore_ascii_case(b"file") => {
55            parse::file_url(input, protocol_end)
56        }
57        InputScheme::Url { protocol_end } => parse::url(input, protocol_end),
58        InputScheme::Scp { colon } => parse::scp(input, colon),
59    }
60}
61
62/// Expand `path` for the given `user`, which can be obtained by [`parse()`], resolving the home directories
63/// of `user` automatically.
64///
65/// If more precise control of the resolution mechanism is needed, then use the [expand_path::with()] function.
66pub fn expand_path(user: Option<&expand_path::ForUser>, path: &BStr) -> Result<PathBuf, expand_path::Error> {
67    expand_path::with(user, path, |user| match user {
68        expand_path::ForUser::Current => gix_path::env::home_dir(),
69        expand_path::ForUser::Name(user) => {
70            gix_path::env::home_dir().and_then(|home| home.parent().map(|home_dirs| home_dirs.join(user.to_string())))
71        }
72    })
73}
74
/// Classification of a portion of a URL by whether it is *syntactically* safe to pass as an argument to a command-line program.
///
/// Various parts of URLs can be specified to begin with `-`. If they are used as options to a command-line application
/// such as an SSH client, they will be treated as options rather than as non-option arguments as the developer intended.
/// This is a security risk, because URLs are not always trusted and can often be composed or influenced by an attacker.
/// See <https://secure.phabricator.com/T12961> for details.
///
/// Values of this type are produced by [`Url::user_as_argument()`] and [`Url::host_as_argument()`].
///
/// # Security Warning
///
/// This type only expresses known *syntactic* risk. It does not cover other risks, such as passing a personal access
/// token as a username rather than a password in an application that logs usernames.
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum ArgumentSafety<'a> {
    /// May be safe. There is nothing to pass, so there is nothing dangerous.
    Absent,
    /// May be safe. The contained argument does not begin with a `-` and so will not be confused as an option.
    Usable(&'a str),
    /// Dangerous! The contained value begins with `-` and could be treated as an option.
    /// Use the value in error messages only.
    Dangerous(&'a str),
}
95
/// A URL with support for specialized git related capabilities.
///
/// Additionally, there is support for [deserialization](Url::from_bytes()) and [serialization](Url::to_bstring()).
///
/// # Mutability Warning
///
/// Due to the mutability of this type, it's possible that the URL serializes to something invalid
/// when fields are modified directly. URLs should always be parsed to this type from string or byte
/// parameters, but never be accepted as an instance of this type and then reconstructed, to maintain
/// validity guarantees.
///
/// # Serialization
///
/// This type does not implement `Into<String>`, `From<Url> for String` because URLs
/// can contain non-UTF-8 sequences in the path component when parsed from raw bytes.
/// Use [to_bstring()](Url::to_bstring()) for lossless serialization, or use the [`Display`](std::fmt::Display)
/// trait for a UTF-8 representation that redacts passwords for safe logging.
///
/// When the `serde` feature is enabled, this type implements `serde::Serialize` and `serde::Deserialize`,
/// which will serialize *all* fields, including the password.
///
/// # Security Warning
///
/// URLs may contain passwords and using standard [formatting](std::fmt::Display) will redact
/// such password, whereas [lossless serialization](Url::to_bstring()) will contain all parts of the
/// URL.
/// **Beware that some URLs still print secrets if they use them outside of the designated password fields.**
///
/// Also note that URLs that fail to parse are typically stored in [the resulting error](parse::Error) type
/// and printed in full using its display implementation.
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Url {
    /// The URL scheme.
    pub scheme: Scheme,
    /// The user to impersonate on the remote.
    ///
    /// Stored in decoded form: percent-encoded characters are decoded during parsing.
    /// Re-encoded during canonical serialization, but written as-is in alternative form.
    ///
    /// A user without a [`host`](Self::host) cannot be serialized: [`Url::write_to()`] fails in that case.
    pub user: Option<String>,
    /// The password associated with a user.
    ///
    /// Stored in decoded form: percent-encoded characters are decoded during parsing.
    /// Re-encoded during canonical serialization. The alternative form cannot represent a password,
    /// which is why [`Url::write_to()`] uses the canonical form whenever a password is set.
    /// It is only written if both a user and a host are present.
    pub password: Option<String>,
    /// The host to which to connect. Localhost is implied if `None`.
    ///
    /// IPv6 addresses are stored *without* brackets for SSH schemes, but *with* brackets for other schemes.
    /// Brackets are automatically added during serialization when needed (e.g., when a port is specified with an IPv6 host).
    pub host: Option<String>,
    /// When serializing, use the alternative forms as it was parsed as such.
    ///
    /// Alternative forms include SCP-like syntax (`user@host:path`) and bare file paths.
    /// The flag is only honored by [`Url::write_to()`] for the `file` and `ssh` schemes, and only if neither
    /// a password nor a port is set as these cannot be represented. Otherwise the canonical form is written.
    pub serialize_alternative_form: bool,
    /// The port to use when connecting to a host. If `None`, standard ports depending on `scheme` will be used.
    ///
    /// See [`Url::port_or_default()`] for the defaults known per scheme.
    pub port: Option<u16>,
    /// The path portion of the URL, usually the location of the git repository.
    ///
    /// Paths are stored in decoded form: percent-encoded characters are decoded during parsing
    /// and re-encoded during canonical serialization of HTTP/HTTPS URLs (e.g., `%20` becomes a space in this field).
    /// For all other schemes the path is written as-is.
    ///
    /// Path normalization during parsing:
    /// - SSH/Git schemes: Leading `/~` is stripped (e.g., `/~repo` becomes `~repo`)
    /// - SSH/Git schemes: Empty paths are rejected as errors
    /// - HTTP/HTTPS schemes: Empty paths are normalized to `/`
    ///
    /// During serialization, SSH/Git URLs prepend `/` to paths not starting with `/`.
    ///
    /// # Security Warning
    ///
    /// URLs allow paths to start with `-` which makes it possible to mask command-line arguments as path which then leads to
    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
    ///
    /// If this value is ever going to be passed to a command-line application, call [Self::path_argument_safe()] instead.
    pub path: BString,
}
173
174/// Instantiation
175impl Url {
176    /// Create a new instance from the given parts, including a password, which will be validated by parsing them back.
177    pub fn from_parts(
178        scheme: Scheme,
179        user: Option<String>,
180        password: Option<String>,
181        host: Option<String>,
182        port: Option<u16>,
183        path: BString,
184        serialize_alternative_form: bool,
185    ) -> Result<Self, parse::Error> {
186        parse(
187            Url {
188                scheme,
189                user,
190                password,
191                host,
192                port,
193                path,
194                serialize_alternative_form,
195            }
196            .to_bstring()
197            .as_ref(),
198        )
199    }
200}
201
202/// Modification
203impl Url {
204    /// Set the given `user`, or unset it with `None`. Return the previous value.
205    pub fn set_user(&mut self, user: Option<String>) -> Option<String> {
206        let prev = self.user.take();
207        self.user = user;
208        prev
209    }
210
211    /// Set the given `password`, or unset it with `None`. Return the previous value.
212    pub fn set_password(&mut self, password: Option<String>) -> Option<String> {
213        let prev = self.password.take();
214        self.password = password;
215        prev
216    }
217}
218
219/// Builder
220impl Url {
221    /// Enable alternate serialization for this url, e.g. `file:///path` becomes `/path`.
222    ///
223    /// This is automatically set correctly for parsed URLs, but can be set here for urls
224    /// created by constructor.
225    pub fn serialize_alternate_form(mut self, use_alternate_form: bool) -> Self {
226        self.serialize_alternative_form = use_alternate_form;
227        self
228    }
229
230    /// Turn a file url like `file://relative` into `file:///root/relative`, hence it assures the url's path component is absolute,
231    /// using `current_dir` if needed to achieve that.
232    pub fn canonicalize(&mut self, current_dir: &std::path::Path) -> Result<(), gix_path::realpath::Error> {
233        if self.scheme == Scheme::File {
234            let path = gix_path::from_bstr(Cow::Borrowed(self.path.as_ref()));
235            let abs_path = gix_path::realpath_opts(path.as_ref(), current_dir, gix_path::realpath::MAX_SYMLINKS)?;
236            self.path = gix_path::into_bstr(abs_path).into_owned();
237        }
238        Ok(())
239    }
240}
241
242/// Access
243impl Url {
244    /// Return the username mentioned in the URL, if present.
245    ///
246    /// # Security Warning
247    ///
248    /// URLs allow usernames to start with `-` which makes it possible to mask command-line arguments as username which then leads to
249    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
250    ///
251    /// If this value is ever going to be passed to a command-line application, call [Self::user_argument_safe()] instead.
252    pub fn user(&self) -> Option<&str> {
253        self.user.as_deref()
254    }
255
256    /// Classify the username of this URL by whether it is safe to pass as a command-line argument.
257    ///
258    /// Use this method instead of [Self::user()] if the host is going to be passed to a command-line application.
259    /// If the unsafe and absent cases need not be distinguished, [Self::user_argument_safe()] may also be used.
260    pub fn user_as_argument(&self) -> ArgumentSafety<'_> {
261        match self.user() {
262            Some(user) if looks_like_command_line_option(user.as_bytes()) => ArgumentSafety::Dangerous(user),
263            Some(user) => ArgumentSafety::Usable(user),
264            None => ArgumentSafety::Absent,
265        }
266    }
267
268    /// Return the username of this URL if present *and* if it can't be mistaken for a command-line argument.
269    ///
270    /// Use this method or [Self::user_as_argument()] instead of [Self::user()] if the host is going to be
271    /// passed to a command-line application. Prefer [Self::user_as_argument()] unless the unsafe and absent
272    /// cases need not be distinguished from each other.
273    pub fn user_argument_safe(&self) -> Option<&str> {
274        match self.user_as_argument() {
275            ArgumentSafety::Usable(user) => Some(user),
276            _ => None,
277        }
278    }
279
280    /// Return the password mentioned in the url, if present.
281    pub fn password(&self) -> Option<&str> {
282        self.password.as_deref()
283    }
284
285    /// Return the host mentioned in the URL, if present.
286    ///
287    /// # Security Warning
288    ///
289    /// URLs allow hosts to start with `-` which makes it possible to mask command-line arguments as host which then leads to
290    /// the invocation of programs from an attacker controlled URL. See <https://secure.phabricator.com/T12961> for details.
291    ///
292    /// If this value is ever going to be passed to a command-line application, call [Self::host_as_argument()]
293    /// or [Self::host_argument_safe()] instead.
294    pub fn host(&self) -> Option<&str> {
295        self.host.as_deref()
296    }
297
298    /// Classify the host of this URL by whether it is safe to pass as a command-line argument.
299    ///
300    /// Use this method instead of [Self::host()] if the host is going to be passed to a command-line application.
301    /// If the unsafe and absent cases need not be distinguished, [Self::host_argument_safe()] may also be used.
302    pub fn host_as_argument(&self) -> ArgumentSafety<'_> {
303        match self.host() {
304            Some(host) if looks_like_command_line_option(host.as_bytes()) => ArgumentSafety::Dangerous(host),
305            Some(host) => ArgumentSafety::Usable(host),
306            None => ArgumentSafety::Absent,
307        }
308    }
309
310    /// Return the host of this URL if present *and* if it can't be mistaken for a command-line argument.
311    ///
312    /// Use this method or [Self::host_as_argument()] instead of [Self::host()] if the host is going to be
313    /// passed to a command-line application. Prefer [Self::host_as_argument()] unless the unsafe and absent
314    /// cases need not be distinguished from each other.
315    pub fn host_argument_safe(&self) -> Option<&str> {
316        match self.host_as_argument() {
317            ArgumentSafety::Usable(host) => Some(host),
318            _ => None,
319        }
320    }
321
322    /// Return the path of this URL *if* it can't be mistaken for a command-line argument.
323    /// Note that it always begins with a slash, which is ignored for this comparison.
324    ///
325    /// Use this method instead of accessing [Self::path] directly if the path is going to be passed to a
326    /// command-line application, unless it is certain that the leading `/` will always be included.
327    pub fn path_argument_safe(&self) -> Option<&BStr> {
328        self.path
329            .get(1..)
330            .and_then(|truncated| (!looks_like_command_line_option(truncated)).then_some(self.path.as_ref()))
331    }
332
333    /// Return true if the path portion of the URL is `/`.
334    pub fn path_is_root(&self) -> bool {
335        self.path == "/"
336    }
337
338    /// Return the actual or default port for use according to the URL scheme.
339    /// Note that there may be no default port either.
340    pub fn port_or_default(&self) -> Option<u16> {
341        self.port.or_else(|| {
342            use Scheme::*;
343            Some(match self.scheme {
344                Http => 80,
345                Https => 443,
346                Ssh => 22,
347                Git => 9418,
348                File | Ext(_) => return None,
349            })
350        })
351    }
352}
353
/// Return `true` if `b` begins with `-`, which is what command-line programs typically treat as an option.
fn looks_like_command_line_option(b: &[u8]) -> bool {
    matches!(b, [b'-', ..])
}
357
358/// Transformation
359impl Url {
360    /// Turn a file URL like `file://relative` into `file:///root/relative`, hence it assures the URL's path component is absolute, using
361    /// `current_dir` if necessary.
362    pub fn canonicalized(&self, current_dir: &std::path::Path) -> Result<Self, gix_path::realpath::Error> {
363        let mut res = self.clone();
364        res.canonicalize(current_dir)?;
365        Ok(res)
366    }
367}
368
369/// Serialization
370impl Url {
371    /// Write this URL losslessly to `out`, ready to be parsed again.
372    pub fn write_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
373        // Since alternative form doesn't employ any escape syntax, password and
374        // port number cannot be encoded.
375        if self.serialize_alternative_form
376            && (self.scheme == Scheme::File || self.scheme == Scheme::Ssh)
377            && self.password.is_none()
378            && self.port.is_none()
379        {
380            self.write_alternative_form_to(out)
381        } else {
382            self.write_canonical_form_to(out)
383        }
384    }
385
386    fn write_canonical_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
387        fn percent_encode(s: &str) -> Cow<'_, str> {
388            /// Characters that must be percent-encoded in the userinfo component of a URL.
389            ///
390            /// According to RFC 3986, userinfo can contain:
391            /// - unreserved characters: `A-Z a-z 0-9 - . _ ~`
392            /// - percent-encoded characters
393            /// - sub-delims: `! $ & ' ( ) * + , ; =`
394            /// - `:`
395            ///
396            /// This encode-set encodes everything else, particularly `@` (userinfo delimiter),
397            /// `/` `?` `#` (path/query/fragment delimiters), and various other special characters.
398            const USERINFO_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
399                .add(b' ')
400                .add(b'"')
401                .add(b'#')
402                .add(b'%')
403                .add(b'/')
404                .add(b'<')
405                .add(b'>')
406                .add(b'?')
407                .add(b'@')
408                .add(b'[')
409                .add(b'\\')
410                .add(b']')
411                .add(b'^')
412                .add(b'`')
413                .add(b'{')
414                .add(b'|')
415                .add(b'}');
416            percent_encoding::utf8_percent_encode(s, USERINFO_ENCODE_SET).into()
417        }
418
419        out.write_all(self.scheme.as_str().as_bytes())?;
420        out.write_all(b"://")?;
421
422        let needs_brackets = self.port.is_some() && self.host_needs_brackets();
423
424        match (&self.user, &self.host) {
425            (Some(user), Some(host)) => {
426                out.write_all(percent_encode(user).as_bytes())?;
427                if let Some(password) = &self.password {
428                    out.write_all(b":")?;
429                    out.write_all(percent_encode(password).as_bytes())?;
430                }
431                out.write_all(b"@")?;
432                if needs_brackets {
433                    out.write_all(b"[")?;
434                }
435                out.write_all(host.as_bytes())?;
436                if needs_brackets {
437                    out.write_all(b"]")?;
438                }
439            }
440            (None, Some(host)) => {
441                if needs_brackets {
442                    out.write_all(b"[")?;
443                }
444                out.write_all(host.as_bytes())?;
445                if needs_brackets {
446                    out.write_all(b"]")?;
447                }
448            }
449            (None, None) => {}
450            (Some(_user), None) => {
451                return Err(std::io::Error::other(
452                    "Invalid URL structure: user specified without host",
453                ));
454            }
455        }
456        if let Some(port) = &self.port {
457            write!(out, ":{port}")?;
458        }
459        // For SSH and Git URLs, add leading '/' if path doesn't start with '/'
460        // This handles paths like "~repo" which serialize as "/~repo" in URL form
461        if matches!(self.scheme, Scheme::Ssh | Scheme::Git) && !self.path.starts_with(b"/") {
462            out.write_all(b"/")?;
463        }
464        if matches!(self.scheme, Scheme::Http | Scheme::Https) {
465            // We intentionally do not encode '?' and '#': ParsedUrl keeps them in `path`,
466            // and encoding would change routed endpoints for already parsed URLs.
467            const PATH_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
468                .add(b' ')
469                .add(b'"')
470                .add(b'%')
471                .add(b'<')
472                .add(b'>')
473                .add(b'`')
474                .add(b'{')
475                .add(b'}');
476            write!(
477                out,
478                "{}",
479                percent_encoding::percent_encode(self.path.as_ref(), PATH_ENCODE_SET)
480            )?;
481        } else {
482            out.write_all(&self.path)?;
483        }
484        Ok(())
485    }
486
487    fn host_needs_brackets(&self) -> bool {
488        fn is_ipv6(h: &str) -> bool {
489            h.contains(':') && !h.starts_with('[')
490        }
491        self.host.as_ref().is_some_and(|h| is_ipv6(h))
492    }
493
494    fn write_alternative_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
495        let needs_brackets = self.host_needs_brackets();
496
497        match (&self.user, &self.host) {
498            (Some(user), Some(host)) => {
499                out.write_all(user.as_bytes())?;
500                out.write_all(b"@")?;
501                if needs_brackets {
502                    out.write_all(b"[")?;
503                }
504                out.write_all(host.as_bytes())?;
505                if needs_brackets {
506                    out.write_all(b"]")?;
507                }
508            }
509            (None, Some(host)) => {
510                if needs_brackets {
511                    out.write_all(b"[")?;
512                }
513                out.write_all(host.as_bytes())?;
514                if needs_brackets {
515                    out.write_all(b"]")?;
516                }
517            }
518            (None, None) => {}
519            (Some(_user), None) => {
520                return Err(std::io::Error::other(
521                    "Invalid URL structure: user specified without host",
522                ));
523            }
524        }
525        if self.scheme == Scheme::Ssh {
526            out.write_all(b":")?;
527        }
528        out.write_all(&self.path)?;
529        Ok(())
530    }
531
532    /// Transform ourselves into a binary string, losslessly, or fail if the URL is malformed due to host or user parts being incorrect.
533    pub fn to_bstring(&self) -> BString {
534        let mut buf = Vec::with_capacity(
535            (5 + 3)
536                + self.user.as_ref().map(String::len).unwrap_or_default()
537                + 1
538                + self.host.as_ref().map(String::len).unwrap_or_default()
539                + self.port.map(|_| 5).unwrap_or_default()
540                + self.path.len(),
541        );
542        self.write_to(&mut buf).expect("io cannot fail in memory");
543        buf.into()
544    }
545}
546
547/// Deserialization
548impl Url {
549    /// Parse a URL from `bytes`.
550    pub fn from_bytes(bytes: &BStr) -> Result<Self, parse::Error> {
551        parse(bytes)
552    }
553}
554
555/// This module contains extensions to the [Url] struct which are only intended to be used
556/// for testing code. Do not use this module in production! For all intents and purposes, the APIs of
557/// all functions and types exposed by this module are considered unstable and are allowed to break
558/// even in patch releases!
559#[doc(hidden)]
560pub mod testing {
561    use bstr::BString;
562
563    use crate::{Scheme, Url};
564
565    /// Additional functions for [Url] which are only intended to be used for tests.
566    pub trait TestUrlExtension {
567        /// Create a new instance from the given parts without validating them.
568        ///
569        /// This function is primarily intended for testing purposes. For production code please
570        /// consider using [Url::from_parts] instead!
571        fn from_parts_unchecked(
572            scheme: Scheme,
573            user: Option<String>,
574            password: Option<String>,
575            host: Option<String>,
576            port: Option<u16>,
577            path: BString,
578            serialize_alternative_form: bool,
579        ) -> Url {
580            Url {
581                scheme,
582                user,
583                password,
584                host,
585                port,
586                path,
587                serialize_alternative_form,
588            }
589        }
590    }
591
592    impl TestUrlExtension for Url {}
593}