Skip to main content

mkit_git_bridge/
gitparse.rs

1//! Tolerant parsers for ARBITRARY git object bytes
2//! (SPEC-GIT-IMPORT §2, §3).
3//!
4//! This is the import direction's untrusted-input boundary, and its
5//! contract is the opposite of [`crate::reconstruct`]'s: reconstruct
6//! is strict by design (it proves bridge shape and MUST stay that
7//! way); these parsers accept everything git itself accepts —
8//! multi-line continuation headers (`gpgsig`, `mergetag`), unknown
9//! headers, the `encoding` header, historic malformed person lines —
10//! and either parse faithfully or refuse loudly. They never crash on
11//! malformed input and never silently alter bytes (fuzzed; see
12//! FUZZ.md).
13//!
14//! Parsers stop at structure: policy (mode normalization vs fork-mode
15//! refusal, name legality, timestamp range) lives in the import
16//! driver, which maps [`GitParseError`] / parsed values onto
17//! [`crate::error::Refusal`].
18
19use crate::gitobj::{Sha1Id, sha1_from_hex};
20use std::fmt;
21
/// Upper bound, in bytes, on a commit/tag header block (the bytes that
/// precede the blank separator line). Genuine gpgsig/mergetag blocks
/// run to a few KiB, so 10 MiB only ever rejects pathological input —
/// and does so before any allocation amplification.
pub const MAX_HEADER_BLOCK: usize = 10 << 20;
26
/// Structural parse failure (not policy — see module docs).
///
/// Each payload is a short static description of what went wrong; the
/// [`fmt::Display`] impl prefixes it with the failure category.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GitParseError {
    /// The object's byte layout is broken: no `\n\n` header/message
    /// separator, a header block over the cap, a misplaced fold line,
    /// a header line without a value, or a malformed tree entry.
    Malformed(&'static str),
    /// A required header (`tree`, `object`, `type`, `tag`, …) is
    /// missing or appears more than once.
    Header(&'static str),
    /// A hash-valued header is not 40 hex chars (either letter case).
    BadId(&'static str),
    /// A person line has no parseable timestamp where one is required.
    PersonTimestamp,
}

impl fmt::Display for GitParseError {
    /// Renders `<category>: <detail>`; the timestamp variant carries no
    /// detail and prints a fixed sentence.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (category, detail) = match *self {
            Self::PersonTimestamp => {
                return f.write_str("person line has no parseable timestamp");
            }
            Self::Malformed(detail) => ("malformed git object", detail),
            Self::Header(detail) => ("git header", detail),
            Self::BadId(detail) => ("git id field", detail),
        };
        write!(f, "{category}: {detail}")
    }
}

impl std::error::Error for GitParseError {}
53
/// One parsed git person line (`author` / `committer` / `tagger`).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Person {
    /// SPEC-GIT-IMPORT §3.2 identity payload, copied verbatim from the
    /// line: everything through the closing `>` of the LAST `<...>`
    /// group, or whatever the bracket-less rule leaves.
    pub identity: Vec<u8>,
    /// Seconds since the epoch. Signed (`i64`) because git timestamps
    /// are signed; rejecting negative values is the import driver's
    /// job, not the parser's.
    pub timestamp: i64,
    /// The timezone suffix exactly as written (e.g. `+0200`), if any.
    pub timezone: Option<Vec<u8>>,
}
67
68/// Parse a person-line VALUE (the bytes after `author ` etc.).
69///
70/// Tolerances, per SPEC-GIT-IMPORT §3.2:
71/// - identity = bytes through the closing `>` of the last `<...>`
72///   group, verbatim (interior malformations preserved);
73/// - bracket-less lines: identity = remainder with one trailing
74///   `␣<decimal>␣[+-]NNNN` match stripped;
75/// - timestamp = first whitespace-separated decimal (optionally
76///   `-`-signed) after the identity; missing → error (commits/tags
77///   need one);
78/// - timezone = the following token when it looks like `[+-]NNNN`.
79pub fn parse_person(value: &[u8]) -> Result<Person, GitParseError> {
80    // Identity boundary: closing '>' of the LAST '<...>' group.
81    let identity_end = value
82        .iter()
83        .rposition(|&b| b == b'>')
84        .filter(|&gt| value[..gt].contains(&b'<'))
85        .map(|gt| gt + 1);
86
87    if let Some(end) = identity_end {
88        let identity = value[..end].to_vec();
89        let rest = &value[end..];
90        let (timestamp, timezone) = parse_ts_tz(rest).ok_or(GitParseError::PersonTimestamp)?;
91        return Ok(Person {
92            identity,
93            timestamp,
94            timezone,
95        });
96    }
97
98    // Bracket-less rule: strip one trailing `␣secs␣[+-]NNNN` match.
99    if let Some((cut, timestamp, timezone)) = trailing_ts_tz(value) {
100        return Ok(Person {
101            identity: value[..cut].to_vec(),
102            timestamp,
103            timezone: Some(timezone),
104        });
105    }
106    Err(GitParseError::PersonTimestamp)
107}
108
109/// Parse ` <secs> [<tz>]` after the identity. Returns the timestamp
110/// and optional timezone token.
111fn parse_ts_tz(rest: &[u8]) -> Option<(i64, Option<Vec<u8>>)> {
112    let mut tokens = rest.split(|&b| b == b' ').filter(|t| !t.is_empty());
113    let ts_tok = tokens.next()?;
114    let ts = parse_i64(ts_tok)?;
115    let tz = tokens.next().filter(|t| is_tz(t)).map(<[u8]>::to_vec);
116    Some((ts, tz))
117}
118
119/// Match one trailing `␣<decimal>␣[+-]NNNN` group. Returns
120/// (identity-end, timestamp, timezone).
121fn trailing_ts_tz(value: &[u8]) -> Option<(usize, i64, Vec<u8>)> {
122    let last_sp = value.iter().rposition(|&b| b == b' ')?;
123    let tz = &value[last_sp + 1..];
124    if !is_tz(tz) {
125        return None;
126    }
127    let prev_sp = value[..last_sp].iter().rposition(|&b| b == b' ')?;
128    let secs = &value[prev_sp + 1..last_sp];
129    let ts = parse_i64(secs)?;
130    Some((prev_sp, ts, tz.to_vec()))
131}
132
/// `true` when `t` is exactly a sign (`+` or `-`) followed by four
/// ASCII digits, e.g. `+0200`.
fn is_tz(t: &[u8]) -> bool {
    match t {
        [b'+' | b'-', digits @ ..] => digits.len() == 4 && digits.iter().all(u8::is_ascii_digit),
        _ => false,
    }
}
136
/// Parse a git timestamp token: a bare decimal with an optional `-`.
///
/// Returns `None` for non-UTF-8 bytes, anything `i64` parsing rejects
/// (empty, non-digits, out of range), and a leading `+` — Rust's
/// integer parser would accept that sign, but git timestamps never
/// carry it.
fn parse_i64(t: &[u8]) -> Option<i64> {
    match t.first() {
        Some(&b'+') => None,
        _ => std::str::from_utf8(t).ok()?.parse::<i64>().ok(),
    }
}
146
/// Header lines after continuation folding: one `(key, value-bytes)`
/// pair per logical header, in the order they appear in the block.
/// Continuation lines (leading space) are re-joined to the previous
/// value with `\n` and their fold marker (the leading space) dropped,
/// so a folded value round-trips to the original block minus the fold
/// markers — faithful for carrying, never re-serialized by the
/// importer.
type Headers = Vec<(Vec<u8>, Vec<u8>)>;
152
153/// Split an arbitrary git commit/tag body into folded headers + the
154/// verbatim message bytes.
155fn split_headers(body: &[u8]) -> Result<(Headers, &[u8]), GitParseError> {
156    let sep = body
157        .windows(2)
158        .position(|w| w == b"\n\n")
159        .ok_or(GitParseError::Malformed("no header/message separator"))?;
160    if sep + 1 > MAX_HEADER_BLOCK {
161        return Err(GitParseError::Malformed("header block over cap"));
162    }
163    let (head, message) = (&body[..sep], &body[sep + 2..]);
164    let mut headers: Headers = Vec::new();
165    for line in head.split(|&b| b == b'\n') {
166        if let Some(cont) = line.strip_prefix(b" ") {
167            // Continuation: belongs to the previous header.
168            match headers.last_mut() {
169                Some((_, v)) => {
170                    v.push(b'\n');
171                    v.extend_from_slice(cont);
172                }
173                None => return Err(GitParseError::Malformed("leading continuation line")),
174            }
175            continue;
176        }
177        let sp = line
178            .iter()
179            .position(|&b| b == b' ')
180            .ok_or(GitParseError::Malformed("header line without value"))?;
181        headers.push((line[..sp].to_vec(), line[sp + 1..].to_vec()));
182    }
183    Ok((headers, message))
184}
185
186fn one(headers: &Headers, key: &[u8], what: &'static str) -> Result<Vec<u8>, GitParseError> {
187    let mut found = None;
188    for (k, v) in headers {
189        if k == key {
190            if found.is_some() {
191                return Err(GitParseError::Header(what));
192            }
193            found = Some(v.clone());
194        }
195    }
196    found.ok_or(GitParseError::Header(what))
197}
198
199fn id_of(value: &[u8], what: &'static str) -> Result<Sha1Id, GitParseError> {
200    std::str::from_utf8(value)
201        .ok()
202        .map(str::to_ascii_lowercase)
203        .as_deref()
204        .and_then(sha1_from_hex)
205        .ok_or(GitParseError::BadId(what))
206}
207
208/// A parsed (arbitrary) git commit.
209#[derive(Debug, Clone, PartialEq, Eq)]
210pub struct GitCommit {
211    pub tree: Sha1Id,
212    pub parents: Vec<Sha1Id>,
213    pub author: Person,
214    pub committer: Person,
215    /// Verbatim message bytes (may be any encoding).
216    pub message: Vec<u8>,
217    /// `true` when a `gpgsig`/`gpgsig-sha256` header was present
218    /// (carried via retained raw bytes, surfaced for UX/provenance).
219    pub has_gpgsig: bool,
220}
221
222/// Parse arbitrary git commit body bytes (after the object header).
223pub fn parse_commit(body: &[u8]) -> Result<GitCommit, GitParseError> {
224    let (headers, message) = split_headers(body)?;
225    let tree = id_of(
226        &one(&headers, b"tree", "tree missing or duplicated")?,
227        "tree",
228    )?;
229    let mut parents = Vec::new();
230    for (k, v) in &headers {
231        if k == b"parent" {
232            parents.push(id_of(v, "parent")?);
233        }
234    }
235    let author = parse_person(&one(&headers, b"author", "author missing or duplicated")?)?;
236    let committer = parse_person(&one(
237        &headers,
238        b"committer",
239        "committer missing or duplicated",
240    )?)?;
241    let has_gpgsig = headers
242        .iter()
243        .any(|(k, _)| k == b"gpgsig" || k == b"gpgsig-sha256");
244    Ok(GitCommit {
245        tree,
246        parents,
247        author,
248        committer,
249        message: message.to_vec(),
250        has_gpgsig,
251    })
252}
253
254/// A parsed (arbitrary) git annotated tag.
255#[derive(Debug, Clone, PartialEq, Eq)]
256pub struct GitTag {
257    pub object: Sha1Id,
258    /// The `type` header value (`commit`, `tree`, `blob`, `tag`).
259    pub target_type: Vec<u8>,
260    pub name: Vec<u8>,
261    /// `None` for historic tagger-less tags (git v0.99 era).
262    pub tagger: Option<Person>,
263    pub message: Vec<u8>,
264    /// `true` when the message carries a PGP signature block.
265    pub has_signature: bool,
266}
267
268/// Parse arbitrary git tag body bytes.
269pub fn parse_tag(body: &[u8]) -> Result<GitTag, GitParseError> {
270    let (headers, message) = split_headers(body)?;
271    let object = id_of(
272        &one(&headers, b"object", "object missing or duplicated")?,
273        "object",
274    )?;
275    let target_type = one(&headers, b"type", "type missing or duplicated")?;
276    let name = one(&headers, b"tag", "tag name missing or duplicated")?;
277    let tagger = match headers.iter().find(|(k, _)| k == b"tagger") {
278        Some((_, v)) => Some(parse_person(v)?),
279        None => None,
280    };
281    let has_signature = message
282        .windows(b"-----BEGIN PGP SIGNATURE-----".len())
283        .any(|w| w == b"-----BEGIN PGP SIGNATURE-----");
284    Ok(GitTag {
285        object,
286        target_type,
287        name,
288        tagger,
289        message: message.to_vec(),
290        has_signature,
291    })
292}
293
294/// One raw git tree entry: mode string verbatim, name bytes, child id.
295#[derive(Debug, Clone, PartialEq, Eq)]
296pub struct GitTreeEntry {
297    pub mode: Vec<u8>,
298    pub name: Vec<u8>,
299    pub id: Sha1Id,
300}
301
302/// Parse arbitrary git tree body bytes. Purely structural — mode
303/// policy (canonical/normalize/refuse) is the driver's.
304pub fn parse_tree(body: &[u8]) -> Result<Vec<GitTreeEntry>, GitParseError> {
305    let mut entries = Vec::new();
306    let mut rest = body;
307    while !rest.is_empty() {
308        let sp = rest
309            .iter()
310            .position(|&b| b == b' ')
311            .ok_or(GitParseError::Malformed(
312                "tree entry missing mode terminator",
313            ))?;
314        let mode = rest[..sp].to_vec();
315        if mode.is_empty() || mode.len() > 7 || !mode.iter().all(u8::is_ascii_digit) {
316            return Err(GitParseError::Malformed("tree entry mode not octal"));
317        }
318        rest = &rest[sp + 1..];
319        let nul = rest
320            .iter()
321            .position(|&b| b == 0)
322            .ok_or(GitParseError::Malformed("tree entry missing NUL"))?;
323        let name = rest[..nul].to_vec();
324        if name.is_empty() {
325            return Err(GitParseError::Malformed("tree entry with empty name"));
326        }
327        rest = &rest[nul + 1..];
328        if rest.len() < 20 {
329            return Err(GitParseError::Malformed("tree entry truncated id"));
330        }
331        let mut id = [0u8; 20];
332        id.copy_from_slice(&rest[..20]);
333        rest = &rest[20..];
334        entries.push(GitTreeEntry { mode, name, id });
335    }
336    Ok(entries)
337}
338
339/// SPEC-GIT-IMPORT §3.3 mode policy outcome.
340#[derive(Debug, Clone, Copy, PartialEq, Eq)]
341pub enum ModeMapping {
342    /// A canonical git mode.
343    Canonical(mkit_core::object::EntryMode),
344    /// A historic spelling, normalized to its canonical equivalent
345    /// (declared-lossy; refused in fork-mode state dirs).
346    Normalized(mkit_core::object::EntryMode),
347    /// Submodule gitlink — always refused.
348    Gitlink,
349    /// Not a mode the mapping covers.
350    Unknown,
351}
352
353/// Classify a git tree-entry mode string per the pinned §3.3 table.
354#[must_use]
355pub fn map_mode(mode: &[u8]) -> ModeMapping {
356    use mkit_core::object::EntryMode;
357    match mode {
358        b"100644" => ModeMapping::Canonical(EntryMode::Blob),
359        b"40000" => ModeMapping::Canonical(EntryMode::Tree),
360        b"120000" => ModeMapping::Canonical(EntryMode::Symlink),
361        b"100755" => ModeMapping::Canonical(EntryMode::Executable),
362        b"100664" | b"100640" | b"100600" => ModeMapping::Normalized(EntryMode::Blob),
363        b"040000" => ModeMapping::Normalized(EntryMode::Tree),
364        b"160000" => ModeMapping::Gitlink,
365        _ => ModeMapping::Unknown,
366    }
367}
368
// Unit tests for the tolerant parsers. Fixtures are byte literals on
// purpose: leading spaces and raw non-UTF-8 bytes are load-bearing.
#[cfg(test)]
mod tests {
    use super::*;
    use mkit_core::object::EntryMode;

    // Well-formed person line: identity, timestamp and timezone split.
    #[test]
    fn person_plain() {
        let p = parse_person(b"Alice Example <alice@example.com> 1700000000 +0200").unwrap();
        assert_eq!(p.identity, b"Alice Example <alice@example.com>");
        assert_eq!(p.timestamp, 1_700_000_000);
        assert_eq!(p.timezone.as_deref(), Some(b"+0200".as_slice()));
    }

    // Malformed identities are carried byte-for-byte, not repaired.
    #[test]
    fn person_malformations_preserved_verbatim() {
        // Doubled space, no space before '<', nested '<' in name: the
        // last '>' rule slices verbatim.
        let p = parse_person(b"Weird  Name<a@b> 5 +0000").unwrap();
        assert_eq!(p.identity, b"Weird  Name<a@b>");
        let p = parse_person(b"A <b> C <d@e> 5 +0000").unwrap();
        assert_eq!(p.identity, b"A <b> C <d@e>");
    }

    // A `-`-signed timestamp is parsed, not rejected.
    #[test]
    fn person_negative_timestamp_parses() {
        // Policy (refusal) is the driver's; the parser is faithful.
        let p = parse_person(b"Old Soul <o@s> -86400 +0000").unwrap();
        assert_eq!(p.timestamp, -86400);
    }

    // Lines without any `<...>` group use the trailing-match rule.
    #[test]
    fn person_bracketless_rules() {
        let p = parse_person(b"Just A Name 1700000000 +0000").unwrap();
        assert_eq!(p.identity, b"Just A Name");
        assert_eq!(p.timestamp, 1_700_000_000);
        // No trailing pattern → no timestamp → error.
        assert_eq!(
            parse_person(b"no timestamp here"),
            Err(GitParseError::PersonTimestamp)
        );
    }

    // A folded multi-line `gpgsig` header must not disturb the other
    // headers or the message (which itself contains a blank line).
    #[test]
    fn commit_with_gpgsig_continuation() {
        // Built line-by-line: Rust string-literal continuations would
        // eat the load-bearing leading spaces of the fold lines.
        let lines: &[&[u8]] = &[
            b"tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904",
            b"parent ce013625030ba8dba906f756967f9e9ca394464a",
            b"author A <a@x> 1700000000 +0000",
            b"committer B <b@x> 1700000001 -0500",
            b"gpgsig -----BEGIN SSH SIGNATURE-----",
            b" U1NIU0lHbGluZTI=",
            b" -----END SSH SIGNATURE-----",
            b"",
            b"msg body",
            b"",
            b"with blank line",
        ];
        let mut body = lines.join(&b"\n"[..]);
        body.push(b'\n');
        let c = parse_commit(&body).unwrap();
        assert_eq!(c.parents.len(), 1);
        assert!(c.has_gpgsig);
        assert_eq!(c.author.identity, b"A <a@x>");
        assert_eq!(c.committer.timestamp, 1_700_000_001);
        assert_eq!(c.message, b"msg body\n\nwith blank line\n");
    }

    // First body has no `tree` header; second has it twice.
    #[test]
    fn commit_rejects_missing_or_duplicate_required() {
        assert!(parse_commit(b"author A <a@x> 5 +0000\ncommitter A <a@x> 5 +0000\n\nx").is_err());
        let dup = b"tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
author A <a@x> 5 +0000\ncommitter A <a@x> 5 +0000\n\nx";
        assert!(parse_commit(dup).is_err());
    }

    // Unrecognised headers are skipped, and the message bytes (here a
    // non-UTF-8 0xE9) come back untouched.
    #[test]
    fn commit_tolerates_unknown_and_encoding_headers() {
        let body = b"tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
author A <a@x> 5 +0000\n\
committer A <a@x> 5 +0000\n\
encoding ISO-8859-1\n\
x-custom whatever\n\
\n\
Ren\xe9\n";
        let c = parse_commit(body).unwrap();
        assert_eq!(c.message, b"Ren\xe9\n");
    }

    // The `tagger` header is optional.
    #[test]
    fn tag_with_and_without_tagger() {
        let body = b"object ce013625030ba8dba906f756967f9e9ca394464a\n\
type commit\ntag v1.0.0\ntagger T <t@x> 5 +0000\n\nrelease\n";
        let t = parse_tag(body).unwrap();
        assert_eq!(t.name, b"v1.0.0");
        assert!(t.tagger.is_some());
        // git v0.99-era tagger-less tag.
        let body = b"object ce013625030ba8dba906f756967f9e9ca394464a\n\
type commit\ntag old\n\nancient\n";
        let t = parse_tag(body).unwrap();
        assert!(t.tagger.is_none());
    }

    // Three entries built by hand (mode, space, name, NUL, 20 id
    // bytes), then each mode run through the policy table.
    #[test]
    fn tree_parses_and_modes_classify() {
        let mut body = Vec::new();
        for (mode, name) in [
            (&b"100644"[..], &b"a.txt"[..]),
            (b"040000", b"olddir"),
            (b"160000", b"sub"),
        ] {
            body.extend_from_slice(mode);
            body.push(b' ');
            body.extend_from_slice(name);
            body.push(0);
            body.extend_from_slice(&[7u8; 20]);
        }
        let entries = parse_tree(&body).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(
            map_mode(&entries[0].mode),
            ModeMapping::Canonical(EntryMode::Blob)
        );
        assert_eq!(
            map_mode(&entries[1].mode),
            ModeMapping::Normalized(EntryMode::Tree)
        );
        assert_eq!(map_mode(&entries[2].mode), ModeMapping::Gitlink);
        assert_eq!(map_mode(b"777777"), ModeMapping::Unknown);
    }

    // Smoke test: results are discarded; the only requirement is that
    // none of the parsers panics on these inputs.
    #[test]
    fn parsers_never_panic_on_junk() {
        for junk in [
            &b""[..],
            b"\n\n",
            b" leading continuation\n\nx",
            b"tree short\n\nx",
            b"\x00\xff\xfe",
        ] {
            let _ = parse_commit(junk);
            let _ = parse_tag(junk);
            let _ = parse_tree(junk);
            let _ = parse_person(junk);
        }
    }
}