git-url-parse 0.6.0

A parser for urls used by git
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
//! # GitUrl url spec parser
//!
//! Internal structs with RFC 3986 parsing logic for Git urls
//!

use getset::Getters;
#[cfg(feature = "log")]
use log::debug;
use nom::Finish;
use nom::branch::alt;
use nom::bytes::complete::{tag, take_while};
use nom::character::complete::alpha1;
use nom::combinator::{map_opt, peek, recognize, verify};
use nom::error::context;
use nom::multi::{many0, many1};
use nom::sequence::{pair, preceded, separated_pair, terminated};
use nom::{IResult, Parser, combinator::opt};

/// Top-level struct for RFC 3986 spec parser
///
/// Holds the two pieces produced by `UrlSpecParser::parse`: an optional scheme and the
/// hier-part (authority followed by path). Read-only getters for both fields are derived
/// through `Getters` with `get = "pub"`.
#[derive(Debug, Default, Clone, Getters)]
#[getset(get = "pub")]
pub(crate) struct UrlSpecParser {
    /// RFC 3986 scheme, stored without its trailing `://` (or `:` for the short `git:`
    /// form). `None` when no scheme was recognized at the start of the input
    pub(crate) scheme: Option<String>,
    /// RFC 3986 hier-part: everything parsed after the scheme (authority, then path)
    pub(crate) hier_part: UrlHierPart,
}

impl UrlSpecParser {
    /// https://datatracker.ietf.org/doc/html/rfc3986
    /// Based on rfc3986, but does not strictly cover the spec
    /// * No support for:
    ///     * query, fragment, percent-encoding, and much of the edges for path support
    ///     * many forms of ip representations like ipv6, hexdigits
    /// * Added support for:
    ///     * parsing ssh git urls which use ":" as a delimiter between the authority and path
    ///     * parsing userinfo into user:token (but its officially deprecated, per #section-3.2.1)
    ///     * some limited support for windows/linux filepaths
    pub(crate) fn parse(input: &str) -> IResult<&str, Self> {
        let (input, scheme) = Self::parse_scheme.parse(input).finish().unwrap_or_default();
        let (input, heir_part) = Self::parse_hier_part(input).finish().unwrap_or_default();

        let parsed = UrlSpecParser {
            scheme,
            hier_part: heir_part,
        };

        Ok((input, parsed))
    }

    /// RFC 3986 scheme
    ///
    /// Returns the scheme name (without its separator) and the input that follows it.
    /// Three outcomes are possible:
    /// * `short_git_scheme_check` accepts the input: the `git:` prefix is consumed and
    ///   `Some("git")` is returned.
    /// * the input starts with `ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )` followed by
    ///   `://`: that name is returned and the `://` is consumed as well.
    /// * otherwise: `None` is returned and the input is handed back untouched.
    fn parse_scheme(input: &str) -> IResult<&str, Option<String>> {
        #[cfg(feature = "log")]
        {
            debug!("Looking ahead before parsing for scheme");
        }

        // Characters allowed in a scheme after its first (alphabetic) character
        let scheme_tail = |c: char| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.');

        // Check if we have scheme 'git:' without the '//' for normalizing to 'git://'
        if Self::short_git_scheme_check(input) {
            // return early if we are normalizing 'git:' (short git)
            if let Ok((rest, short_scheme)) = Self::short_git_scheme_parser().parse(input) {
                return Ok((rest, short_scheme.map(str::to_owned)));
            }
        }

        // Non-consuming look-ahead: only commit to scheme parsing when "<scheme>://" is present
        let lookahead = context(
            "scheme validate",
            peek(pair(
                pair(alpha1, take_while(scheme_tail)),
                tag::<&str, &str, nom::error::Error<&str>>("://"),
            )),
        )
        .parse(input);

        if lookahead.is_err() {
            #[cfg(feature = "log")]
            {
                debug!("Look ahead check for scheme failed");
            }
            return Ok((input, None));
        }

        #[cfg(feature = "log")]
        {
            debug!("Look ahead check passed, parsing for scheme");
        }

        // Must start with alpha character, then alpha/digit/+/-/.
        let (input, scheme) = context(
            "Scheme parse",
            opt(verify(
                terminated(
                    recognize(pair(alpha1, take_while(scheme_tail))),
                    // Not part of spec. Consuming the "://" here keeps the scheme optional
                    // without the hier-part parser having to know about it
                    tag("://"),
                ),
                |name: &str| !name.is_empty(),
            )),
        )
        .parse(input)?;

        #[cfg(feature = "log")]
        {
            debug!("{input:?}");
            debug!("{scheme:?}");
        }

        Ok((input, scheme.map(str::to_owned)))
    }

    /// RFC 3986 hier-part
    ///
    /// Parses the authority first, then a non-empty path, and returns both as a
    /// `UrlHierPart` together with the unconsumed input. Fails when the authority parser
    /// fails or when no non-empty path can be recognized.
    // https://datatracker.ietf.org/doc/html/rfc3986#section-3.2
    // RFC 3986 makes the leading "//" part of hier-part. Only common internet protocols and
    // file paths are supported here (no other "baseless" forms), so the "//" is consumed
    // together with the scheme in `parse_scheme` instead; that also keeps git user-service
    // urls working.
    fn parse_hier_part(input: &str) -> IResult<&str, UrlHierPart> {
        #[cfg(feature = "log")]
        {
            debug!("Parsing for heir-part");
        }

        let (after_authority, authority) = Self::parse_authority(input)?;

        // First alternative that succeeds wins.
        // NOTE(review): `path_rootless_parser` accepts ':' (it is a pchar) and is built only
        // from parsers that can match nothing, so the ssh alternative looks unreachable —
        // confirm before relying on it.
        let any_path = alt((
            Self::path_abempty_parser(),
            Self::path_rootless_parser(),
            Self::path_ssh_parser(),
        ));

        let (rest, path) = context(
            "Top of path parsers",
            // An empty match is not a usable path
            verify(any_path, |candidate: &str| !candidate.is_empty()),
        )
        .parse(after_authority)?;

        let hier_part = UrlHierPart {
            authority,
            path: path.to_owned(),
        };

        #[cfg(feature = "log")]
        {
            debug!("{:?}", rest);
            debug!("{:?}", hier_part);
        }

        Ok((rest, hier_part))
    }

    /// RFC 3986 authority
    ///
    /// Parses, in order, the optional userinfo, the optional host and the optional port.
    ///
    /// When the text after the userinfo looks like a windows path (reg-name characters
    /// followed by `:\`), no host or port is parsed and a default `UrlAuthority` is
    /// returned. Note that this default also discards whatever userinfo was just parsed.
    fn parse_authority(input: &str) -> IResult<&str, UrlAuthority> {
        #[cfg(feature = "log")]
        {
            debug!("Parsing for Authority");
        }

        // Optional: username / token
        let (input, userinfo) = Self::parse_userinfo(input)?;

        #[cfg(feature = "log")]
        {
            debug!("Looking ahead for windows-style path vs host");
        }

        // Non-consuming look-ahead for something like `c:\`, which must not become a host
        let looks_like_windows_path = context(
            "Host check for windows path",
            peek(preceded(
                take_while(|c| reg_name_uri_chars(c) && c != '\\'),
                tag::<&str, &str, nom::error::Error<&str>>(":\\"),
            )),
        )
        .parse(input)
        .is_ok();

        if looks_like_windows_path {
            #[cfg(feature = "log")]
            {
                debug!(
                    "Host check failed. Found potential windows-style path while looking for host"
                );
            }

            return Ok((input, UrlAuthority::default()));
        }

        #[cfg(feature = "log")]
        {
            debug!("Parsing for host");
        }

        let (input, host) = context(
            "Host parser",
            opt(verify(
                recognize(take_while(|c: char| reg_name_uri_chars(c))),
                // A host must begin with an alphanumeric character, which also rules out
                // the empty match
                |candidate: &str| candidate.starts_with(char::is_alphanumeric),
            )),
        )
        .parse(input)?;

        #[cfg(feature = "log")]
        {
            debug!("host found: {host:?}");
        }

        // Optional: port
        let (input, port) = Self::parse_port(input)?;

        let authority = UrlAuthority {
            userinfo,
            host: host.map(str::to_owned),
            port,
        };

        #[cfg(feature = "log")]
        {
            debug!("{input:?}");
            debug!("{authority:?}");
        }

        Ok((input, authority))
    }

    /// RFC 3986 userinfo
    fn parse_userinfo(authority_input: &str) -> IResult<&str, UrlUserInfo> {
        // Peek for username@
        #[cfg(feature = "log")]
        {
            debug!("Checking for for Userinfo");
        }

        let mut check = context(
            "Userinfo validation",
            peek(pair(
                take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c) || c == ':'),
                tag::<&str, &str, nom::error::Error<&str>>("@"),
            )),
        );

        if check.parse(authority_input).is_err() {
            #[cfg(feature = "log")]
            {
                debug!("Userinfo check failed");
            }
            return Ok((authority_input, UrlUserInfo::default()));
        }

        // Userinfo
        let (authority_input, userinfo) = context(
            "Userinfo parser",
            opt(verify(
                recognize(take_while(|c: char| {
                    unreserved_uri_chars(c) || subdelims_uri_chars(c) || c == ':'
                })),
                |s: &str| !s.is_empty(),
            )),
        )
        .parse(authority_input)?;

        let (authority_input, _) = if userinfo.is_some() {
            #[cfg(feature = "log")]
            {
                debug!("Userinfo found. Parsing for '@'");
            }

            context("Userinfo '@' parser", tag("@")).parse(authority_input)?
        } else {
            // No change to input, but let the compiler be happy
            (authority_input, authority_input)
        };

        // Break down userinfo into user and token
        let (user, token) = if let Some(userinfo) = userinfo {
            if userinfo.contains(":") {
                #[cfg(feature = "log")]
                {
                    debug!("Continue break down userinfo into user:token");
                }
                let (_, (user, token)) = context(
                    "Userinfo with colon parser",
                    separated_pair(
                        verify(
                            take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c)),
                            |s: &str| !s.is_empty(),
                        ),
                        tag(":"),
                        verify(
                            take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c)),
                            |s: &str| !s.is_empty(),
                        ),
                    ),
                )
                .parse(userinfo)?;
                (Some(user), Some(token))
            } else {
                (Some(userinfo), None)
            }
        } else {
            (None, None)
        };

        let userinfo = UrlUserInfo {
            user: user.map(|u| u.to_string()),
            token: token.map(|u| u.to_string()),
        };

        #[cfg(feature = "log")]
        {
            debug!("{authority_input:?}");
            debug!("{userinfo:?}");
        }

        Ok((authority_input, userinfo))
    }

    /// RFC 3986 port
    ///
    /// Recognizes an optional `:<port>` at the start of `authority_input`.
    ///
    /// The whole segment after the `:` is taken first and only then parsed as a `u16`. If
    /// the segment is empty or is not a valid `u16` (for example the `owner` in
    /// `host:owner/repo`), nothing is consumed and `None` is returned, so the `:` stays
    /// available to the path parsers.
    fn parse_port(authority_input: &str) -> IResult<&str, Option<u16>> {
        #[cfg(feature = "log")]
        {
            debug!("Parsing port");
        }

        // We need to pull the full value of what's in the segment THEN parse for numbers
        let (input, port) = context(
            "Port parser",
            opt(map_opt(
                verify(
                    preceded(
                        tag(":"),
                        take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c)),
                    ),
                    |segment: &str| !segment.is_empty(),
                ),
                |segment: &str| segment.parse::<u16>().ok(),
            )),
        )
        .parse(authority_input)?;

        #[cfg(feature = "log")]
        {
            // Log what is left after the port, matching the other parsers in this impl
            debug!("{input:?}");
            debug!("{port:?}");
        }

        Ok((input, port))
    }

    /// RFC 3986 path-abempty, restricted to the non-empty case
    ///
    /// Builds a parser that recognizes one or more `"/" segment` groups and yields the
    /// matched text. Unlike the RFC production it does not match the empty string, because
    /// at least one `/` is required.
    ///
    /// The debug line is logged when the parser is built, not when it is run.
    fn path_abempty_parser<'url>() -> impl Parser<
        &'url str,
        Output = &'url str,
        Error = nom::error::Error<&'url str>,
    > {
        #[cfg(feature = "log")]
        {
            debug!("parsing abempty path");
        }

        // One or more groups, each starting with '/'; the segment after it may be empty
        context(
            "Path parser (abempty)",
            recognize(many1(pair(
                tag("/"),
                take_while(|c: char| pchar_uri_chars(c)),
            ))),
        )
    }

    /// Not part of RFC 3986 - ssh-based url path
    ///
    /// Builds a parser that recognizes `":" segment 1*( "/" segment )`, i.e. the
    /// `:owner/repo` part of `host:owner/repo`, and yields the matched text including the
    /// leading `:`.
    ///
    /// The debug line is logged when the parser is built, not when it is run.
    fn path_ssh_parser<'url>() -> impl Parser<
        &'url str,
        Output = &'url str,
        Error = nom::error::Error<&'url str>,
    > {
        #[cfg(feature = "log")]
        {
            debug!("Parsing ssh path");
        }

        context(
            "Path parser (ssh)",
            recognize((
                tag(":"),
                take_while(|c: char| pchar_uri_chars(c)),
                // At least one '/' is required after the first segment
                many1(pair(tag("/"), take_while(|c: char| pchar_uri_chars(c)))),
            )),
        )
    }

    /// RFC 3986 path-rootless
    ///
    /// Builds a parser that recognizes `segment *( "/" segment )` and yields the matched
    /// text. Both the first segment and the repetition may match nothing, so this parser
    /// can succeed with an empty string; callers that need a real path must check for it.
    ///
    /// The debug line is logged when the parser is built, not when it is run.
    fn path_rootless_parser<'url>() -> impl Parser<
        &'url str,
        Output = &'url str,
        Error = nom::error::Error<&'url str>,
    > {
        #[cfg(feature = "log")]
        {
            debug!("Parsing rootless path");
        }

        context(
            "Path parser (rootless)",
            recognize(pair(
                take_while(|c: char| pchar_uri_chars(c)),
                many0(pair(tag("/"), take_while(|c: char| pchar_uri_chars(c)))),
            )),
        )
    }

    /// Consuming parser for `git:` (short git) as scheme for normalizing
    ///
    /// Builds a parser that consumes a leading `git:` and yields `Some("git")`. Because it
    /// is wrapped in `opt`, input without that prefix yields `None` with nothing consumed
    /// rather than an error.
    ///
    /// The debug line is logged when the parser is built, not when it is run.
    fn short_git_scheme_parser<'url>() -> impl Parser<
        &'url str,
        Output = Option<&'url str>,
        Error = nom::error::Error<&'url str>,
    > {
        #[cfg(feature = "log")]
        {
            debug!("Parsing short git scheme");
        }

        context(
            "short git scheme parse",
            opt(terminated(
                tag::<&str, &str, nom::error::Error<&str>>("git"),
                // The ':' is consumed but not part of the returned scheme
                tag::<&str, &str, nom::error::Error<&str>>(":"),
            )),
        )
    }

    /// Non-consuming check for `git:` (short git) as scheme for normalizing
    ///
    /// Returns `true` only when `input` starts with `git:` and that prefix is not
    /// immediately followed by `//`.
    ///
    /// A full `git://host/path` url is deliberately rejected here. Treating it as short
    /// git would strip only `git:` and leave `//host/path` for the hier-part, where the
    /// host would end up inside the path instead of the authority.
    fn short_git_scheme_check(input: &str) -> bool {
        input
            .strip_prefix("git:")
            .is_some_and(|rest| !rest.starts_with("//"))
    }
}

/// RFC 3986 userinfo
///
/// The text in front of `@` in the authority, split into a user and an optional
/// (non-spec) token. Both fields are `None` when the url carries no userinfo.
#[derive(Debug, Default, Clone, Getters)]
#[getset(get = "pub")]
pub(crate) struct UrlUserInfo {
    /// RFC 3986 Userinfo: the text before the `:`, or the whole userinfo when it has no `:`
    pub(crate) user: Option<String>,
    /// Non-spec, deprecated: the text after the `:` in `user:token@`
    pub(crate) token: Option<String>,
}

/// RFC 3986 authority
///
/// The `[userinfo@]host[:port]` part of a url. Every component is optional; the
/// `Default` value has no userinfo, no host and no port.
#[derive(Debug, Default, Clone, Getters)]
#[getset(get = "pub")]
pub(crate) struct UrlAuthority {
    /// RFC 3986 Username, non-spec token
    pub(crate) userinfo: UrlUserInfo,
    /// RFC 3986 Host (reg-name form only); `None` when no host was recognized
    pub(crate) host: Option<String>,
    /// RFC 3986 Port; `None` when absent or when the text after `:` is not a valid `u16`
    pub(crate) port: Option<u16>,
}

/// RFC 3986 hier-part
///
/// The authority followed by the path, i.e. everything this parser handles after the
/// scheme.
#[derive(Debug, Default, Clone, Getters)]
#[getset(get = "pub")]
pub(crate) struct UrlHierPart {
    /// RFC 3986 authority
    pub(crate) authority: UrlAuthority,
    /// Path text exactly as recognized by the path parsers. For `host:owner/repo` style
    /// urls the leading `:` is part of this string
    pub(crate) path: String,
}

/// RFC 3986 pchar
///
/// `true` for `:` and `@` plus everything accepted as unreserved or sub-delims.
/// pct-encoded sequences are not implemented.
pub(crate) fn pchar_uri_chars(c: char) -> bool {
    matches!(c, ':' | '@') || unreserved_uri_chars(c) || subdelims_uri_chars(c)
}

/// RFC 3986 reg-name
///
/// `true` for every character accepted as sub-delims or unreserved, i.e. the characters
/// of `*( unreserved / pct-encoded / sub-delims )` with pct-encoded not implemented.
pub(crate) fn reg_name_uri_chars(c: char) -> bool {
    subdelims_uri_chars(c) || unreserved_uri_chars(c)
}

/// RFC 3986 unreserved
///
/// `true` for `-`, `.`, `_`, `~` and any alphanumeric character. The alphanumeric test
/// is `char::is_alphanumeric`, so non-ASCII letters and digits are accepted as well,
/// which is wider than the RFC's `ALPHA / DIGIT`.
pub(crate) fn unreserved_uri_chars(c: char) -> bool {
    matches!(c, '-' | '.' | '_' | '~') || c.is_alphanumeric()
}

/// RFC 3986 sub-delims (mostly)
///
/// `true` for the RFC's sub-delims `! $ & ' ( ) * + , ; =` and, outside the spec, for
/// the backslash so that windows paths can be parsed.
pub(crate) fn subdelims_uri_chars(c: char) -> bool {
    matches!(
        c,
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
            // Not part of spec, but used for windows paths
            | '\\'
    )
}