git_url_parse/
lib.rs

1use std::fmt;
2use std::str::FromStr;
3use strum::{Display, EnumString, VariantNames};
4use thiserror::Error;
5use url::Url;
6
7#[cfg(feature = "tracing")]
8use tracing::debug;
9
/// Supported uri schemes for parsing
///
/// String conversion in both directions comes from the `strum` derives
/// (`EnumString` for parsing, `Display` for printing). Variant names are
/// serialized in kebab-case unless a variant overrides its serialization,
/// as `GitSsh` does with `git+ssh`.
#[derive(Debug, PartialEq, Eq, EnumString, VariantNames, Clone, Display, Copy)]
#[strum(serialize_all = "kebab_case")]
pub enum Scheme {
    /// Represents `file://` url scheme
    File,
    /// Represents `ftp://` url scheme
    Ftp,
    /// Represents `ftps://` url scheme
    Ftps,
    /// Represents `git://` url scheme
    Git,
    /// Represents `git+ssh://` url scheme
    #[strum(serialize = "git+ssh")]
    GitSsh,
    /// Represents `http://` url scheme
    Http,
    /// Represents `https://` url scheme
    Https,
    /// Represents `ssh://` url scheme
    Ssh,
    /// Represents no url scheme; this is the scheme of `GitUrl::default()`
    Unspecified,
}
34
/// GitUrl represents an input url that is a url used by git
///
/// Internally during parsing the url is sanitized and uses the `url` crate to perform
/// the majority of the parsing effort, with some extra handling to expose
/// metadata used by many git hosting services
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct GitUrl {
    /// The fully qualified domain name (FQDN) or IP of the repo
    pub host: Option<String>,
    /// The name of the repo
    pub name: String,
    /// The owner/account/project name
    pub owner: Option<String>,
    /// The organization name. Supported by Azure DevOps
    pub organization: Option<String>,
    /// The full name of the repo, formatted as "owner/name"
    /// (or "organization/owner/name" for hosts that carry an organization in the path)
    pub fullname: String,
    /// The git url scheme
    pub scheme: Scheme,
    /// The authentication user
    pub user: Option<String>,
    /// The oauth token (could appear in the https urls)
    pub token: Option<String>,
    /// The non-conventional port where git service is hosted
    pub port: Option<u16>,
    /// The path to repo w/ respect to user + hostname
    pub path: String,
    /// Indicate if url uses the .git suffix
    pub git_suffix: bool,
    /// Indicate if url explicitly uses its scheme
    pub scheme_prefix: bool,
}
66
67/// Build the printable GitUrl from its components
68impl fmt::Display for GitUrl {
69    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
70        let scheme_prefix = match self.scheme_prefix {
71            true => format!("{}://", self.scheme),
72            false => String::new(),
73        };
74
75        let auth_info = match self.scheme {
76            Scheme::Ssh | Scheme::Git | Scheme::GitSsh => {
77                if let Some(user) = &self.user {
78                    format!("{}@", user)
79                } else {
80                    String::new()
81                }
82            }
83            Scheme::Http | Scheme::Https => match (&self.user, &self.token) {
84                (Some(user), Some(token)) => format!("{}:{}@", user, token),
85                (Some(user), None) => format!("{}@", user),
86                (None, Some(token)) => format!("{}@", token),
87                (None, None) => String::new(),
88            },
89            _ => String::new(),
90        };
91
92        let host = match &self.host {
93            Some(host) => host.to_string(),
94            None => String::new(),
95        };
96
97        let port = match &self.port {
98            Some(p) => format!(":{}", p),
99            None => String::new(),
100        };
101
102        let path = match &self.scheme {
103            Scheme::Ssh => {
104                if self.port.is_some() {
105                    format!("/{}", &self.path)
106                } else {
107                    format!(":{}", &self.path)
108                }
109            }
110            _ => self.path.to_string(),
111        };
112
113        let git_url_str = format!("{}{}{}{}{}", scheme_prefix, auth_info, host, port, path);
114
115        write!(f, "{}", git_url_str)
116    }
117}
118
119impl Default for GitUrl {
120    fn default() -> Self {
121        GitUrl {
122            host: None,
123            name: "".to_string(),
124            owner: None,
125            organization: None,
126            fullname: "".to_string(),
127            scheme: Scheme::Unspecified,
128            user: None,
129            token: None,
130            port: None,
131            path: "".to_string(),
132            git_suffix: false,
133            scheme_prefix: false,
134        }
135    }
136}
137
138impl FromStr for GitUrl {
139    type Err = GitUrlParseError;
140
141    fn from_str(s: &str) -> Result<Self, Self::Err> {
142        GitUrl::parse(s)
143    }
144}
145
146impl GitUrl {
147    /// Returns `GitUrl` after removing `user` and `token` values
148    /// Intended use-case is for non-destructive printing GitUrl excluding any embedded auth info
149    pub fn trim_auth(&self) -> GitUrl {
150        let mut new_giturl = self.clone();
151        new_giturl.user = None;
152        new_giturl.token = None;
153        new_giturl
154    }
155
156    /// Returns a `Result<GitUrl>` after normalizing and parsing `url` for metadata
157    pub fn parse(url: &str) -> Result<GitUrl, GitUrlParseError> {
158        // Normalize the url so we can use Url crate to process ssh urls
159        let normalized = normalize_url(url)?;
160
161        // Some pre-processing for paths
162        let scheme = if let Ok(scheme) = Scheme::from_str(normalized.scheme()) {
163            scheme
164        } else {
165            return Err(GitUrlParseError::UnsupportedScheme(
166                normalized.scheme().to_string(),
167            ));
168        };
169        if normalized.path().is_empty() {
170            return Err(GitUrlParseError::EmptyPath);
171        }
172
173        // Normalized ssh urls can always have their first '/' removed
174        let urlpath = match &scheme {
175            Scheme::Ssh => {
176                // At the moment, we're relying on url::Url's parse() behavior to not duplicate
177                // the leading '/' when we normalize
178                normalized.path()[1..].to_string()
179            }
180            _ => normalized.path().to_string(),
181        };
182
183        let git_suffix_check = &urlpath.ends_with(".git");
184
185        // Parse through path for name,owner,organization
186        // Support organizations for Azure Devops
187        #[cfg(feature = "tracing")]
188        debug!("The urlpath: {:?}", &urlpath);
189
190        // Most git services use the path for metadata in the same way, so we're going to separate
191        // the metadata
192        // ex. github.com/accountname/reponame
193        // owner = accountname
194        // name = reponame
195        //
196        // organizations are going to be supported on a per-host basis
197        let splitpath = &urlpath.rsplit_terminator('/').collect::<Vec<&str>>();
198
199        #[cfg(feature = "tracing")]
200        debug!("rsplit results for metadata: {:?}", splitpath);
201
202        let name = splitpath[0].trim_end_matches(".git").to_string();
203
204        // TODO:  I think here is where we want to update the url pattern identification step.. I want to be able to have a hint that the user can pass
205
206        let (owner, organization, fullname) = match &scheme {
207            // We're not going to assume anything about metadata from a filepath
208            Scheme::File => (None::<String>, None::<String>, name.clone()),
209            _ => {
210                let mut fullname: Vec<&str> = Vec::new();
211
212                // TODO: Add support for parsing out orgs from these urls
213                let hosts_w_organization_in_path = ["dev.azure.com", "ssh.dev.azure.com"];
214                //vec!["dev.azure.com", "ssh.dev.azure.com", "visualstudio.com"];
215
216                let host_str = if let Some(host) = normalized.host_str() {
217                    host
218                } else {
219                    return Err(GitUrlParseError::UnsupportedUrlHostFormat);
220                };
221
222                match hosts_w_organization_in_path.contains(&host_str) {
223                    true => {
224                        #[cfg(feature = "tracing")]
225                        debug!("Found a git provider with an org");
226
227                        // The path differs between git:// and https:// schemes
228
229                        match &scheme {
230                            // Example: "git@ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName",
231                            Scheme::Ssh => {
232                                // Organization
233                                fullname.push(splitpath[2]);
234                                // Project/Owner name
235                                fullname.push(splitpath[1]);
236                                // Repo name
237                                fullname.push(splitpath[0]);
238
239                                (
240                                    Some(splitpath[1].to_string()),
241                                    Some(splitpath[2].to_string()),
242                                    fullname.join("/"),
243                                )
244                            }
245                            // Example: "https://CompanyName@dev.azure.com/CompanyName/ProjectName/_git/RepoName",
246                            Scheme::Https => {
247                                // Organization
248                                fullname.push(splitpath[3]);
249                                // Project/Owner name
250                                fullname.push(splitpath[2]);
251                                // Repo name
252                                fullname.push(splitpath[0]);
253
254                                (
255                                    Some(splitpath[2].to_string()),
256                                    Some(splitpath[3].to_string()),
257                                    fullname.join("/"),
258                                )
259                            }
260
261                            // TODO: I'm not sure if I want to support throwing this error long-term
262                            _ => return Err(GitUrlParseError::UnexpectedScheme),
263                        }
264                    }
265                    false => {
266                        if !url.starts_with("ssh") && splitpath.len() < 2 {
267                            return Err(GitUrlParseError::UnexpectedFormat);
268                        }
269
270                        let position = match splitpath.len() {
271                            0 => return Err(GitUrlParseError::UnexpectedFormat),
272                            1 => 0,
273                            _ => 1,
274                        };
275
276                        // push owner
277                        fullname.push(splitpath[position]);
278                        // push name
279                        fullname.push(name.as_str());
280
281                        (
282                            Some(splitpath[position].to_string()),
283                            None::<String>,
284                            fullname.join("/"),
285                        )
286                    }
287                }
288            }
289        };
290
291        let final_host = match scheme {
292            Scheme::File => None,
293            _ => normalized.host_str().map(|h| h.to_string()),
294        };
295
296        let final_path = match scheme {
297            Scheme::File => {
298                if let Some(host) = normalized.host_str() {
299                    format!("{}{}", host, urlpath)
300                } else {
301                    urlpath
302                }
303            }
304            _ => urlpath,
305        };
306
307        Ok(GitUrl {
308            host: final_host,
309            name,
310            owner,
311            organization,
312            fullname,
313            scheme,
314            user: match normalized.username().to_string().len() {
315                0 => None,
316                _ => Some(normalized.username().to_string()),
317            },
318            token: normalized.password().map(|p| p.to_string()),
319            port: normalized.port(),
320            path: final_path,
321            git_suffix: *git_suffix_check,
322            scheme_prefix: url.contains("://") || url.starts_with("git:"),
323        })
324    }
325}
326
327/// `normalize_ssh_url` takes in an ssh url that separates the login info
328/// from the path into with a `:` and replaces it with `/`.
329///
330/// Prepends `ssh://` to url
331///
332/// Supports absolute and relative paths
333fn normalize_ssh_url(url: &str) -> Result<Url, GitUrlParseError> {
334    let u = url.split(':').collect::<Vec<&str>>();
335
336    match u.len() {
337        2 => {
338            #[cfg(feature = "tracing")]
339            debug!("Normalizing ssh url: {:?}", u);
340            normalize_url(&format!("ssh://{}/{}", u[0], u[1]))
341        }
342        3 => {
343            #[cfg(feature = "tracing")]
344            debug!("Normalizing ssh url with ports: {:?}", u);
345            normalize_url(&format!("ssh://{}:{}/{}", u[0], u[1], u[2]))
346        }
347        _default => Err(GitUrlParseError::UnsupportedSshUrlFormat),
348    }
349}
350
351/// `normalize_file_path` takes in a filepath and uses `Url::from_file_path()` to parse
352///
353/// Prepends `file://` to url
354#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
355fn normalize_file_path(filepath: &str) -> Result<Url, GitUrlParseError> {
356    let fp = Url::from_file_path(filepath);
357
358    match fp {
359        Ok(path) => Ok(path),
360        Err(_e) => {
361            if let Ok(file_url) = normalize_url(&format!("file://{}", filepath)) {
362                Ok(file_url)
363            } else {
364                Err(GitUrlParseError::FileUrlNormalizeFailedSchemeAdded)
365            }
366        }
367    }
368}
369
/// Fallback `normalize_file_path` for targets not covered by the
/// `Url::from_file_path()`-based implementation
///
/// Prepends `file://` to `filepath` and runs it through `normalize_url`; a
/// failure there is reported as `FileUrlNormalizeFailedSchemeAdded`.
///
/// The cfg is the exact complement of the other implementation's gate, so
/// exactly one definition exists on every target (gating on
/// `target_arch = "wasm32"` would overlap with `target_os = "wasi"`). This
/// path is reachable from `normalize_url` for any non-ssh relative input, so
/// it must return a `Result` rather than panic.
#[cfg(not(any(unix, windows, target_os = "redox", target_os = "wasi")))]
fn normalize_file_path(filepath: &str) -> Result<Url, GitUrlParseError> {
    normalize_url(&format!("file://{}", filepath))
        .map_err(|_| GitUrlParseError::FileUrlNormalizeFailedSchemeAdded)
}
374
375/// `normalize_url` takes in url as `&str` and takes an opinionated approach to identify
376/// `ssh://` or `file://` urls that require more information to be added so that
377/// they can be parsed more effectively by `url::Url::parse()`
378pub fn normalize_url(url: &str) -> Result<Url, GitUrlParseError> {
379    #[cfg(feature = "tracing")]
380    debug!("Processing: {:?}", &url);
381
382    // TODO: Should this be extended to check for any whitespace?
383    // Error if there are null bytes within the url
384    // https://github.com/tjtelan/git-url-parse-rs/issues/16
385    if url.contains('\0') {
386        return Err(GitUrlParseError::FoundNullBytes);
387    }
388
389    // We're going to remove any trailing slash before running through Url::parse
390    let trim_url = url.trim_end_matches('/');
391
392    // TODO: Remove support for this form when I go to next major version.
393    // I forget what it supports, and it isn't obvious after searching for examples
394    // normalize short git url notation: git:host/path
395    let url_to_parse = if trim_url.starts_with("git:") && !trim_url.starts_with("git://") {
396        trim_url.replace("git:", "git://")
397    } else {
398        trim_url.to_string()
399    };
400
401    let url_parse = Url::parse(&url_to_parse);
402
403    Ok(match url_parse {
404        Ok(u) => {
405            match Scheme::from_str(u.scheme()) {
406                Ok(_p) => u,
407                Err(_e) => {
408                    // Catch case when an ssh url is given w/o a user
409                    #[cfg(feature = "tracing")]
410                    debug!("Scheme parse fail. Assuming a userless ssh url");
411                    if let Ok(ssh_url) = normalize_ssh_url(trim_url) {
412                        ssh_url
413                    } else {
414                        return Err(GitUrlParseError::SshUrlNormalizeFailedNoScheme);
415                    }
416                }
417            }
418        }
419
420        // If we're here, we're only looking for Scheme::Ssh or Scheme::File
421        // TODO: Add test for this
422        Err(url::ParseError::RelativeUrlWithoutBase) => {
423            // Assuming we have found Scheme::Ssh if we can find an "@" before ":"
424            // Otherwise we have Scheme::File
425            //let re = Regex::new(r"^\S+(@)\S+(:).*$").with_context(|| {
426            //    "Failed to build ssh git url regex for testing against url".to_string()
427            //})?;
428
429            match is_ssh_url(trim_url) {
430                true => {
431                    #[cfg(feature = "tracing")]
432                    debug!("Scheme::SSH match for normalization");
433                    normalize_ssh_url(trim_url)?
434                }
435                false => {
436                    #[cfg(feature = "tracing")]
437                    debug!("Scheme::File match for normalization");
438                    normalize_file_path(trim_url)?
439                }
440            }
441        }
442        Err(err) => {
443            return Err(GitUrlParseError::from(err));
444        }
445    })
446}
447
// Classifies `url` as an scp-like ssh url (`[user@]host:path`).
//
// Valid ssh urls for cloning have a username, but we don't require it for
// classification or parsing purposes. However a path must be specified with a `:`
//
// Returns `true` when:
// - the url contains `@`, the first `:` does not come before the first `@`, and
//   the url either has exactly one `@` or starts with `@`
// - the url has no `@` and is exactly `host:path` with a non-empty host that
//   contains no `/` and a non-empty path
fn is_ssh_url(url: &str) -> bool {
    // if we do not have a path
    let colon_pos = match url.find(':') {
        Some(pos) => pos,
        None => return false,
    };

    // if we have a username, expect it before the path (Are usernames with colons valid?)
    if let Some(at_pos) = url.find('@') {
        if colon_pos < at_pos {
            return false;
        }

        return url.split('@').count() == 2 || url.starts_with('@');
    }

    // It's an ssh url if we have a domain:path pattern. A `/` before the colon
    // means the colon is part of a local path (same rule git applies to its
    // scp-like syntax), so that stays classified as a file path.
    //
    // FIXME: a `host:port:path` form without a user is not recognized here
    match url.split(':').collect::<Vec<&str>>().as_slice() {
        [host, path] => !host.is_empty() && !host.contains('/') && !path.is_empty(),
        _ => false,
    }
}
481
/// Errors returned while normalizing or parsing a git url
///
/// The user-facing message of each variant comes from its `#[error(...)]` attribute.
#[derive(Error, Debug, PartialEq, Eq)]
pub enum GitUrlParseError {
    /// Wraps a parse error reported by the `url` crate
    #[error("Error from Url crate: {0}")]
    UrlParseError(#[from] url::ParseError),

    /// The url parsed with a scheme that is not supported, and re-reading it
    /// as a userless ssh url failed as well
    #[error("No url scheme was found, then failed to normalize as ssh url.")]
    SshUrlNormalizeFailedNoScheme,

    /// Ssh normalization failed after `ssh://` was prepended
    #[error("No url scheme was found, then failed to normalize as ssh url after adding 'ssh://'")]
    SshUrlNormalizeFailedSchemeAdded,

    /// Ssh normalization of a url with a port failed after `ssh://` was prepended
    #[error("Failed to normalize as ssh url after adding 'ssh://'")]
    SshUrlNormalizeFailedSchemeAddedWithPorts,

    /// File url normalization failed for a url without a scheme
    #[error("No url scheme was found, then failed to normalize as file url.")]
    FileUrlNormalizeFailedNoScheme,

    /// A file path could not be parsed even after `file://` was prepended
    #[error(
        "No url scheme was found, then failed to normalize as file url after adding 'file://'"
    )]
    FileUrlNormalizeFailedSchemeAdded,

    /// The url path does not have the shape needed to derive repo metadata
    #[error("Git Url not in expected format")]
    UnexpectedFormat,

    // FIXME: Keep an eye on this error for removal
    /// A host that carries an organization in its path was used with a scheme
    /// other than `ssh` or `https`
    #[error("Git Url for host using unexpected scheme")]
    UnexpectedScheme,

    /// The normalized url uses a scheme that is not a `Scheme` variant; carries that scheme
    #[error("Scheme unsupported: {0}")]
    UnsupportedScheme(String),
    /// A url with a non-`file` scheme has no host
    #[error("Host from Url cannot be str or does not exist")]
    UnsupportedUrlHostFormat,
    /// An ssh url did not split into two or three `:`-separated parts
    #[error("Git Url not in expected format for SSH")]
    UnsupportedSshUrlFormat,
    /// The normalized url has an empty path
    #[error("Normalized URL has no path")]
    EmptyPath,

    /// The input url contains a null byte
    #[error("Found null bytes within input url before parsing")]
    FoundNullBytes,
}