git_url_parse/
lib.rs

1use std::fmt;
2use std::str::FromStr;
3use strum::{Display, EnumString, VariantNames};
4use thiserror::Error;
5use url::Url;
6
7#[cfg(feature = "tracing")]
8use tracing::debug;
9
/// Supported uri schemes for parsing
///
/// String conversions (`FromStr`, `Display`, variant-name listing) are derived
/// through `strum`. Variant names are serialized in kebab-case unless a variant
/// carries its own `serialize` override.
#[derive(Debug, PartialEq, Eq, EnumString, VariantNames, Clone, Display, Copy)]
#[strum(serialize_all = "kebab_case")]
pub enum Scheme {
    /// Represents `file://` url scheme
    File,
    /// Represents `ftp://` url scheme
    Ftp,
    /// Represents `ftps://` url scheme
    Ftps,
    /// Represents `git://` url scheme
    Git,
    /// Represents `git+ssh://` url scheme
    // Explicit override: kebab-case alone would not produce the `+` separator.
    #[strum(serialize = "git+ssh")]
    GitSsh,
    /// Represents `http://` url scheme
    Http,
    /// Represents `https://` url scheme
    Https,
    /// Represents `ssh://` url scheme
    Ssh,
    /// Represents No url scheme
    Unspecified,
}
34
/// GitUrl represents an input url that is a url used by git.
///
/// During parsing the url is first normalized and then handed to the `url` crate,
/// which performs the majority of the parsing effort. Some extra handling on top
/// exposes metadata used by many git hosting services.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct GitUrl {
    /// The fully qualified domain name (FQDN) or IP of the repo
    pub host: Option<String>,
    /// The name of the repo
    pub name: String,
    /// The owner/account/project name
    pub owner: Option<String>,
    /// The organization name. Supported by Azure DevOps
    pub organization: Option<String>,
    /// The full name of the repo, formatted as "owner/name"
    /// (prefixed with the organization when one is detected)
    pub fullname: String,
    /// The git url scheme
    pub scheme: Scheme,
    /// The authentication user
    pub user: Option<String>,
    /// The oauth token (could appear in the https urls)
    pub token: Option<String>,
    /// The non-conventional port where git service is hosted
    pub port: Option<u16>,
    /// The path to repo w/ respect to user + hostname
    pub path: String,
    /// Indicate if url uses the .git suffix
    pub git_suffix: bool,
    /// Indicate if url explicitly uses its scheme
    pub scheme_prefix: bool,
}
66
67/// Build the printable GitUrl from its components
68impl fmt::Display for GitUrl {
69    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
70        let scheme_prefix = match self.scheme_prefix {
71            true => format!("{}://", self.scheme),
72            false => String::new(),
73        };
74
75        let auth_info = match self.scheme {
76            Scheme::Ssh | Scheme::Git | Scheme::GitSsh => {
77                if let Some(user) = &self.user {
78                    format!("{}@", user)
79                } else {
80                    String::new()
81                }
82            }
83            Scheme::Http | Scheme::Https => match (&self.user, &self.token) {
84                (Some(user), Some(token)) => format!("{}:{}@", user, token),
85                (Some(user), None) => format!("{}@", user),
86                (None, Some(token)) => format!("{}@", token),
87                (None, None) => String::new(),
88            },
89            _ => String::new(),
90        };
91
92        let host = match &self.host {
93            Some(host) => host.to_string(),
94            None => String::new(),
95        };
96
97        let port = match &self.port {
98            Some(p) => format!(":{}", p),
99            None => String::new(),
100        };
101
102        let path = match &self.scheme {
103            Scheme::Ssh => {
104                if self.port.is_some() {
105                    format!("/{}", &self.path)
106                } else {
107                    format!(":{}", &self.path)
108                }
109            }
110            _ => (&self.path).to_string(),
111        };
112
113        let git_url_str = format!("{}{}{}{}{}", scheme_prefix, auth_info, host, port, path);
114
115        write!(f, "{}", git_url_str)
116    }
117}
118
119impl Default for GitUrl {
120    fn default() -> Self {
121        GitUrl {
122            host: None,
123            name: "".to_string(),
124            owner: None,
125            organization: None,
126            fullname: "".to_string(),
127            scheme: Scheme::Unspecified,
128            user: None,
129            token: None,
130            port: None,
131            path: "".to_string(),
132            git_suffix: false,
133            scheme_prefix: false,
134        }
135    }
136}
137
138impl FromStr for GitUrl {
139    type Err = GitUrlParseError;
140
141    fn from_str(s: &str) -> Result<Self, Self::Err> {
142        GitUrl::parse(s)
143    }
144}
145
146impl GitUrl {
147    /// Returns `GitUrl` after removing `user` and `token` values
148    /// Intended use-case is for non-destructive printing GitUrl excluding any embedded auth info
149    pub fn trim_auth(&self) -> GitUrl {
150        let mut new_giturl = self.clone();
151        new_giturl.user = None;
152        new_giturl.token = None;
153        new_giturl
154    }
155
156    /// Returns a `Result<GitUrl>` after normalizing and parsing `url` for metadata
157    pub fn parse(url: &str) -> Result<GitUrl, GitUrlParseError> {
158        // Normalize the url so we can use Url crate to process ssh urls
159        let normalized = if let Ok(url) = normalize_url(url) {
160            url
161        } else {
162            return Err(GitUrlParseError::UrlNormalizeFailed);
163        };
164
165        // Some pre-processing for paths
166        let scheme = if let Ok(scheme) = Scheme::from_str(normalized.scheme()) {
167            scheme
168        } else {
169            return Err(GitUrlParseError::UnsupportedScheme(
170                normalized.scheme().to_string(),
171            ));
172        };
173
174        // Normalized ssh urls can always have their first '/' removed
175        let urlpath = match &scheme {
176            Scheme::Ssh => {
177                // At the moment, we're relying on url::Url's parse() behavior to not duplicate
178                // the leading '/' when we normalize
179                normalized.path()[1..].to_string()
180            }
181            _ => normalized.path().to_string(),
182        };
183
184        let git_suffix_check = &urlpath.ends_with(".git");
185
186        // Parse through path for name,owner,organization
187        // Support organizations for Azure Devops
188        #[cfg(feature = "tracing")]
189        debug!("The urlpath: {:?}", &urlpath);
190
191        // Most git services use the path for metadata in the same way, so we're going to separate
192        // the metadata
193        // ex. github.com/accountname/reponame
194        // owner = accountname
195        // name = reponame
196        //
197        // organizations are going to be supported on a per-host basis
198        let splitpath = &urlpath.rsplit_terminator('/').collect::<Vec<&str>>();
199
200        #[cfg(feature = "tracing")]
201        debug!("rsplit results for metadata: {:?}", splitpath);
202
203        let name = splitpath[0].trim_end_matches(".git").to_string();
204
205        // TODO:  I think here is where we want to update the url pattern identification step.. I want to be able to have a hint that the user can pass
206
207        let (owner, organization, fullname) = match &scheme {
208            // We're not going to assume anything about metadata from a filepath
209            Scheme::File => (None::<String>, None::<String>, name.clone()),
210            _ => {
211                let mut fullname: Vec<&str> = Vec::new();
212
213                // TODO: Add support for parsing out orgs from these urls
214                let hosts_w_organization_in_path = vec!["dev.azure.com", "ssh.dev.azure.com"];
215                //vec!["dev.azure.com", "ssh.dev.azure.com", "visualstudio.com"];
216
217                let host_str = if let Some(host) = normalized.host_str() {
218                    host
219                } else {
220                    return Err(GitUrlParseError::UnsupportedUrlHostFormat);
221                };
222
223                match hosts_w_organization_in_path.contains(&host_str) {
224                    true => {
225                        #[cfg(feature = "tracing")]
226                        debug!("Found a git provider with an org");
227
228                        // The path differs between git:// and https:// schemes
229
230                        match &scheme {
231                            // Example: "git@ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName",
232                            Scheme::Ssh => {
233                                // Organization
234                                fullname.push(splitpath[2]);
235                                // Project/Owner name
236                                fullname.push(splitpath[1]);
237                                // Repo name
238                                fullname.push(splitpath[0]);
239
240                                (
241                                    Some(splitpath[1].to_string()),
242                                    Some(splitpath[2].to_string()),
243                                    fullname.join("/"),
244                                )
245                            }
246                            // Example: "https://CompanyName@dev.azure.com/CompanyName/ProjectName/_git/RepoName",
247                            Scheme::Https => {
248                                // Organization
249                                fullname.push(splitpath[3]);
250                                // Project/Owner name
251                                fullname.push(splitpath[2]);
252                                // Repo name
253                                fullname.push(splitpath[0]);
254
255                                (
256                                    Some(splitpath[2].to_string()),
257                                    Some(splitpath[3].to_string()),
258                                    fullname.join("/"),
259                                )
260                            }
261
262                            // TODO: I'm not sure if I want to support throwing this error long-term
263                            _ => return Err(GitUrlParseError::UnexpectedScheme),
264                        }
265                    }
266                    false => {
267                        if !url.starts_with("ssh") && splitpath.len() < 2 {
268                            return Err(GitUrlParseError::UnexpectedFormat);
269                        }
270
271                        let position = match splitpath.len() {
272                            0 => return Err(GitUrlParseError::UnexpectedFormat),
273                            1 => 0,
274                            _ => 1,
275                        };
276
277                        // push owner
278                        fullname.push(splitpath[position]);
279                        // push name
280                        fullname.push(name.as_str());
281
282                        (
283                            Some(splitpath[position].to_string()),
284                            None::<String>,
285                            fullname.join("/"),
286                        )
287                    }
288                }
289            }
290        };
291
292        let final_host = match scheme {
293            Scheme::File => None,
294            _ => normalized.host_str().map(|h| h.to_string()),
295        };
296
297        let final_path = match scheme {
298            Scheme::File => {
299                if let Some(host) = normalized.host_str() {
300                    format!("{}{}", host, urlpath)
301                } else {
302                    urlpath
303                }
304            }
305            _ => urlpath,
306        };
307
308        Ok(GitUrl {
309            host: final_host,
310            name,
311            owner,
312            organization,
313            fullname,
314            scheme,
315            user: match normalized.username().to_string().len() {
316                0 => None,
317                _ => Some(normalized.username().to_string()),
318            },
319            token: normalized.password().map(|p| p.to_string()),
320            port: normalized.port(),
321            path: final_path,
322            git_suffix: *git_suffix_check,
323            scheme_prefix: url.contains("://") || url.starts_with("git:"),
324        })
325    }
326}
327
328/// `normalize_ssh_url` takes in an ssh url that separates the login info
329/// from the path into with a `:` and replaces it with `/`.
330///
331/// Prepends `ssh://` to url
332///
333/// Supports absolute and relative paths
334fn normalize_ssh_url(url: &str) -> Result<Url, GitUrlParseError> {
335    let u = url.split(':').collect::<Vec<&str>>();
336
337    match u.len() {
338        2 => {
339            #[cfg(feature = "tracing")]
340            debug!("Normalizing ssh url: {:?}", u);
341            normalize_url(&format!("ssh://{}/{}", u[0], u[1]))
342        }
343        3 => {
344            #[cfg(feature = "tracing")]
345            debug!("Normalizing ssh url with ports: {:?}", u);
346            normalize_url(&format!("ssh://{}:{}/{}", u[0], u[1], u[2]))
347        }
348        _default => Err(GitUrlParseError::UnsupportedSshUrlFormat),
349    }
350}
351
352/// `normalize_file_path` takes in a filepath and uses `Url::from_file_path()` to parse
353///
354/// Prepends `file://` to url
355#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
356fn normalize_file_path(filepath: &str) -> Result<Url, GitUrlParseError> {
357    let fp = Url::from_file_path(filepath);
358
359    match fp {
360        Ok(path) => Ok(path),
361        Err(_e) => {
362            if let Ok(file_url) = normalize_url(&format!("file://{}", filepath)) {
363                Ok(file_url)
364            } else {
365                return Err(GitUrlParseError::FileUrlNormalizeFailedSchemeAdded);
366            }
367        }
368    }
369}
370
/// Fallback `normalize_file_path` for `wasm32` targets on which the primary,
/// `Url::from_file_path()`-based implementation is not compiled.
///
/// The `not(any(..))` clause keeps this definition from colliding with the
/// primary one on targets that satisfy both conditions (e.g. `wasm32` + `wasi`).
///
/// Instead of panicking on a filepath input, the path is prefixed with `file://`
/// and sent through `normalize_url`; a failure is reported as
/// `FileUrlNormalizeFailedSchemeAdded`.
#[cfg(all(
    target_arch = "wasm32",
    not(any(unix, windows, target_os = "redox", target_os = "wasi"))
))]
fn normalize_file_path(filepath: &str) -> Result<Url, GitUrlParseError> {
    normalize_url(&format!("file://{}", filepath))
        .map_err(|_| GitUrlParseError::FileUrlNormalizeFailedSchemeAdded)
}
375
376/// `normalize_url` takes in url as `&str` and takes an opinionated approach to identify
377/// `ssh://` or `file://` urls that require more information to be added so that
378/// they can be parsed more effectively by `url::Url::parse()`
379pub fn normalize_url(url: &str) -> Result<Url, GitUrlParseError> {
380    #[cfg(feature = "tracing")]
381    debug!("Processing: {:?}", &url);
382
383    // TODO: Should this be extended to check for any whitespace?
384    // Error if there are null bytes within the url
385    // https://github.com/tjtelan/git-url-parse-rs/issues/16
386    if url.contains('\0') {
387        return Err(GitUrlParseError::FoundNullBytes);
388    }
389
390    // We're going to remove any trailing slash before running through Url::parse
391    let trim_url = url.trim_end_matches('/');
392
393    // TODO: Remove support for this form when I go to next major version.
394    // I forget what it supports, and it isn't obvious after searching for examples
395    // normalize short git url notation: git:host/path
396    let url_to_parse = if trim_url.starts_with("git:") && !trim_url.starts_with("git://") {
397        trim_url.replace("git:", "git://")
398    } else {
399        trim_url.to_string()
400    };
401
402    let url_parse = Url::parse(&url_to_parse);
403
404    Ok(match url_parse {
405        Ok(u) => {
406            match Scheme::from_str(u.scheme()) {
407                Ok(_p) => u,
408                Err(_e) => {
409                    // Catch case when an ssh url is given w/o a user
410                    #[cfg(feature = "tracing")]
411                    debug!("Scheme parse fail. Assuming a userless ssh url");
412                    if let Ok(ssh_url) = normalize_ssh_url(trim_url) {
413                        ssh_url
414                    } else {
415                        return Err(GitUrlParseError::SshUrlNormalizeFailedNoScheme);
416                    }
417                }
418            }
419        }
420
421        // If we're here, we're only looking for Scheme::Ssh or Scheme::File
422        // TODO: Add test for this
423        Err(url::ParseError::RelativeUrlWithoutBase) => {
424            // Assuming we have found Scheme::Ssh if we can find an "@" before ":"
425            // Otherwise we have Scheme::File
426            //let re = Regex::new(r"^\S+(@)\S+(:).*$").with_context(|| {
427            //    "Failed to build ssh git url regex for testing against url".to_string()
428            //})?;
429
430            match is_ssh_url(trim_url) {
431                true => {
432                    #[cfg(feature = "tracing")]
433                    debug!("Scheme::SSH match for normalization");
434                    normalize_ssh_url(trim_url)?
435                }
436                false => {
437                    #[cfg(feature = "tracing")]
438                    debug!("Scheme::File match for normalization");
439                    normalize_file_path(trim_url)?
440                }
441            }
442        }
443        Err(err) => {
444            return Err(GitUrlParseError::from(err));
445        }
446    })
447}
448
// Classifies `url` as an scp-style ssh url.
//
// Valid ssh urls for cloning have a username, but we don't require one for
// classification or parsing purposes. A path, introduced by `:`, is required.
//
// With an `@` present:
//   - the first `:` must not come before the first `@`
//   - the url must contain exactly one `@`, or start with `@`
// Without an `@`, when split on `:`:
//   - exactly two pieces (`domain:path`) qualifies
//   - more pieces qualify only if the first or the second piece is empty
fn is_ssh_url(url: &str) -> bool {
    // No `:` means no path separator, so this cannot be an ssh url
    let colon_pos = match url.find(':') {
        Some(pos) => pos,
        None => return false,
    };

    if let Some(at_pos) = url.find('@') {
        // if we have a username, expect it before the path
        if colon_pos < at_pos {
            return false;
        }

        return url.matches('@').count() == 1 || url.starts_with('@');
    }

    // No username: look for the domain:path pattern
    // FIXME: I am not sure how to validate a url with a port
    let pieces: Vec<&str> = url.split(':').collect();
    match pieces.as_slice() {
        [_, _] => true,
        [first, second, ..] => first.is_empty() || second.is_empty(),
        // Not reachable: a url containing `:` always splits into two or more pieces
        _ => false,
    }
}
490
/// Errors produced while normalizing or parsing a git url.
#[derive(Error, Debug, PartialEq, Eq)]
pub enum GitUrlParseError {
    /// Wraps a `url::ParseError`; convertible from it via `From`
    #[error("Error from Url crate")]
    UrlParseError(#[from] url::ParseError),

    /// Returned by `GitUrl::parse` when `normalize_url` fails for any reason
    #[error("Url normalization into url::Url failed")]
    UrlNormalizeFailed,

    /// Returned by `normalize_url` when the parsed scheme is unknown and the
    /// retry as an ssh url fails
    #[error("No url scheme was found, then failed to normalize as ssh url.")]
    SshUrlNormalizeFailedNoScheme,

    // NOTE(review): not constructed anywhere in the code visible here
    #[error("No url scheme was found, then failed to normalize as ssh url after adding 'ssh://'")]
    SshUrlNormalizeFailedSchemeAdded,

    // NOTE(review): not constructed anywhere in the code visible here
    #[error("Failed to normalize as ssh url after adding 'ssh://'")]
    SshUrlNormalizeFailedSchemeAddedWithPorts,

    // NOTE(review): not constructed anywhere in the code visible here
    #[error("No url scheme was found, then failed to normalize as file url.")]
    FileUrlNormalizeFailedNoScheme,

    /// Returned by `normalize_file_path` when the path still fails to parse
    /// after `file://` has been prepended
    #[error(
        "No url scheme was found, then failed to normalize as file url after adding 'file://'"
    )]
    FileUrlNormalizeFailedSchemeAdded,

    /// Returned by `GitUrl::parse` when the url path does not contain the
    /// segments needed to extract repo metadata
    #[error("Git Url not in expected format")]
    UnexpectedFormat,

    /// Returned by `GitUrl::parse` when a host that carries an organization in
    /// its path is used with a scheme other than ssh or https
    // FIXME: Keep an eye on this error for removal
    #[error("Git Url for host using unexpected scheme")]
    UnexpectedScheme,

    /// Returned by `GitUrl::parse` when the normalized url's scheme is not a
    /// `Scheme` variant; carries the offending scheme text
    #[error("Scheme unsupported: {0}")]
    UnsupportedScheme(String),
    /// Returned by `GitUrl::parse` when a non-file url has no host
    #[error("Host from Url cannot be str or does not exist")]
    UnsupportedUrlHostFormat,
    /// Returned by `normalize_ssh_url` when the input does not split on `:`
    /// into exactly two or three pieces
    #[error("Git Url not in expected format for SSH")]
    UnsupportedSshUrlFormat,

    /// Returned by `normalize_url` when the input contains a `\0` byte
    #[error("Found null bytes within input url before parsing")]
    FoundNullBytes,
}