Skip to main content

grit_lib/
objects.rs

1//! Git object model: object IDs, kinds, and in-memory representations.
2//!
3//! # Object ID
4//!
//! [`ObjectId`] is a SHA-1 (20-byte) or SHA-256 (32-byte) digest.  It
//! implements `Display` as lowercase hex, `FromStr` from a 40- or 64-character
//! hex string, and the standard ordering traits so it can be used as a map key.
8//!
9//! # Object Kind
10//!
11//! [`ObjectKind`] represents the four Git object types: blob, tree, commit,
12//! and tag.  The raw header byte-slice is parsed with [`ObjectKind::from_bytes`].
13//!
14//! # Parsed objects
15//!
16//! [`Object`] bundles a kind and its raw (decompressed, header-stripped) byte
17//! content.  Higher-level parsed forms (e.g. [`TreeEntry`], [`CommitData`])
18//! live in this module and are produced by fallible `TryFrom<&Object>`
19//! conversions.
20
21use std::fmt;
22use std::str::FromStr;
23
24use crate::commit_encoding;
25use crate::error::{Error, Result};
26
/// Hash function used to compute Git object ids.
///
/// Two algorithms are supported: the original SHA-1 (20-byte digest, 40 hex
/// characters) and SHA-256 (32-byte digest, 64 hex characters). A repository
/// records its choice in `extensions.objectformat`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub enum HashAlgo {
    /// SHA-1, producing 20-byte digests. This is the default algorithm.
    #[default]
    Sha1,
    /// SHA-256, producing 32-byte digests.
    Sha256,
}

impl HashAlgo {
    /// Number of raw digest bytes: 20 for SHA-1, 32 for SHA-256.
    #[must_use]
    pub const fn len(self) -> usize {
        match self {
            Self::Sha256 => 32,
            Self::Sha1 => 20,
        }
    }

    /// Number of hex characters in an encoded digest: 40 for SHA-1, 64 for
    /// SHA-256 (two characters per raw byte).
    #[must_use]
    pub const fn hex_len(self) -> usize {
        2 * self.len()
    }

    /// Lowercase algorithm name, as it appears in `extensions.objectformat`.
    #[must_use]
    pub const fn name(self) -> &'static str {
        match self {
            Self::Sha256 => "sha256",
            Self::Sha1 => "sha1",
        }
    }

    /// The `oid_version` byte written in `.idx` / multi-pack-index headers:
    /// 1 for SHA-1, 2 for SHA-256.
    #[must_use]
    pub const fn oid_version(self) -> u8 {
        match self {
            Self::Sha256 => 2,
            Self::Sha1 => 1,
        }
    }

    /// Look up an algorithm by its `extensions.objectformat` name.
    ///
    /// Surrounding whitespace is ignored; the comparison is case-sensitive.
    /// Returns `None` for any other value.
    #[must_use]
    pub fn from_name(name: &str) -> Option<Self> {
        let wanted = name.trim();
        [Self::Sha1, Self::Sha256]
            .iter()
            .copied()
            .find(|algo| algo.name() == wanted)
    }

    /// Infer the algorithm from a raw digest length in bytes.
    ///
    /// Returns `None` unless `len` is exactly 20 or 32.
    #[must_use]
    pub const fn from_len(len: usize) -> Option<Self> {
        if len == Self::Sha1.len() {
            Some(Self::Sha1)
        } else if len == Self::Sha256.len() {
            Some(Self::Sha256)
        } else {
            None
        }
    }
}
96
/// Size of the backing buffer of an [`ObjectId`]: the longest supported raw
/// digest, which is SHA-256 at 32 bytes.
const MAX_OID_LEN: usize = 32;

/// Identifier of a Git object: either a 20-byte SHA-1 or a 32-byte SHA-256
/// digest.
///
/// The digest occupies the first `len` bytes of a fixed 32-byte buffer. The
/// constructors zero-fill the buffer before copying the digest in, so the
/// unused tail is always zero and the derived `Eq`/`Ord`/`Hash` stay
/// meaningful. The hash algorithm is not stored; it is derived from `len`.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct ObjectId {
    /// Digest bytes, zero-padded up to `MAX_OID_LEN`.
    bytes: [u8; MAX_OID_LEN],
    /// Number of meaningful bytes at the start of `bytes` (20 or 32).
    len: u8,
}
110
111impl ObjectId {
112    /// The all-zero SHA-1 object id (Git's "null" OID).
113    ///
114    /// Used for index placeholders such as intent-to-add entries and for
115    /// special cases in plumbing output. For an algorithm-specific null OID
116    /// (e.g. 64 zeros in a SHA-256 repo) use [`ObjectId::null`].
117    #[must_use]
118    pub const fn zero() -> Self {
119        Self {
120            bytes: [0u8; MAX_OID_LEN],
121            len: 20,
122        }
123    }
124
125    /// The all-zero ("null") object id for a given hash algorithm.
126    #[must_use]
127    pub const fn null(algo: HashAlgo) -> Self {
128        Self {
129            bytes: [0u8; MAX_OID_LEN],
130            len: algo.len() as u8,
131        }
132    }
133
134    /// Construct from a raw digest slice (20 bytes for SHA-1, 32 for SHA-256).
135    ///
136    /// # Errors
137    ///
138    /// Returns [`Error::InvalidObjectId`] when `bytes` is not a recognised
139    /// digest length.
140    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
141        if HashAlgo::from_len(bytes.len()).is_none() {
142            return Err(Error::InvalidObjectId(hex::encode(bytes)));
143        }
144        let mut buf = [0u8; MAX_OID_LEN];
145        buf[..bytes.len()].copy_from_slice(bytes);
146        Ok(Self {
147            bytes: buf,
148            len: bytes.len() as u8,
149        })
150    }
151
152    /// Raw digest bytes (20 or 32 bytes depending on the hash algorithm).
153    #[must_use]
154    pub fn as_bytes(&self) -> &[u8] {
155        &self.bytes[..self.len as usize]
156    }
157
158    /// The hash algorithm this OID belongs to.
159    #[must_use]
160    pub fn algo(&self) -> HashAlgo {
161        HashAlgo::from_len(self.len as usize).unwrap_or(HashAlgo::Sha1)
162    }
163
164    /// Check if this is the null (all-zero) object ID.
165    #[must_use]
166    pub fn is_zero(&self) -> bool {
167        self.as_bytes().iter().all(|&b| b == 0)
168    }
169
170    /// Lowercase hex representation (40 or 64 characters).
171    #[must_use]
172    pub fn to_hex(&self) -> String {
173        hex::encode(self.as_bytes())
174    }
175
176    /// The two-character directory prefix used by the loose object store.
177    ///
178    /// Returns the first two hex chars (e.g. `"ab"` for `"ab3f…"`).
179    #[must_use]
180    pub fn loose_prefix(&self) -> String {
181        hex::encode(&self.bytes[..1])
182    }
183
184    /// Parse an object ID from a hex string (40 chars for SHA-1, 64 for
185    /// SHA-256).
186    ///
187    /// # Errors
188    ///
189    /// Returns [`Error::InvalidObjectId`] if the string is not a valid hex OID.
190    pub fn from_hex(s: &str) -> Result<Self> {
191        s.parse()
192    }
193
194    /// The suffix used as the filename inside the loose prefix dir (the digest
195    /// minus its first byte: 38 hex chars for SHA-1, 62 for SHA-256).
196    #[must_use]
197    pub fn loose_suffix(&self) -> String {
198        hex::encode(&self.bytes[1..self.len as usize])
199    }
200
201    /// Whether `s` is a full-length hex OID for a supported hash algorithm
202    /// (40 hex chars for SHA-1 or 64 for SHA-256), case-insensitive.
203    #[must_use]
204    pub fn is_full_hex(s: &str) -> bool {
205        (s.len() == HashAlgo::Sha1.hex_len() || s.len() == HashAlgo::Sha256.hex_len())
206            && s.bytes().all(|b| b.is_ascii_hexdigit())
207    }
208
209    /// Whether `len` is a valid full hex-OID length (40 or 64).
210    #[must_use]
211    pub const fn is_hex_len(len: usize) -> bool {
212        len == HashAlgo::Sha1.hex_len() || len == HashAlgo::Sha256.hex_len()
213    }
214
215    /// Whether `len` is a valid loose-object filename suffix length, i.e. a full
216    /// hex OID minus its first byte (38 for SHA-1, 62 for SHA-256).
217    #[must_use]
218    pub const fn is_loose_suffix_len(len: usize) -> bool {
219        len == HashAlgo::Sha1.hex_len() - 2 || len == HashAlgo::Sha256.hex_len() - 2
220    }
221}
222
223impl fmt::Display for ObjectId {
224    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
225        f.write_str(&self.to_hex())
226    }
227}
228
229impl fmt::Debug for ObjectId {
230    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
231        write!(f, "ObjectId({})", self.to_hex())
232    }
233}
234
235impl FromStr for ObjectId {
236    type Err = Error;
237
238    fn from_str(s: &str) -> Result<Self> {
239        if s.len() != HashAlgo::Sha1.hex_len() && s.len() != HashAlgo::Sha256.hex_len() {
240            return Err(Error::InvalidObjectId(s.to_owned()));
241        }
242        let bytes = hex::decode(s).map_err(|_| Error::InvalidObjectId(s.to_owned()))?;
243        Self::from_bytes(&bytes)
244    }
245}
246
/// The kind of a Git object: blob, tree, commit, or tag.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum ObjectKind {
    /// File content snapshot.
    Blob,
    /// Directory listing.
    Tree,
    /// Snapshot with metadata and parent links.
    Commit,
    /// Annotated tag.
    Tag,
}
259
260impl ObjectKind {
261    /// Parse from the ASCII keyword used in Git object headers.
262    ///
263    /// # Errors
264    ///
265    /// Returns [`Error::UnknownObjectType`] for unrecognised strings.
266    pub fn from_bytes(b: &[u8]) -> Result<Self> {
267        match b {
268            b"blob" => Ok(Self::Blob),
269            b"tree" => Ok(Self::Tree),
270            b"commit" => Ok(Self::Commit),
271            b"tag" => Ok(Self::Tag),
272            other => Err(Error::UnknownObjectType(
273                String::from_utf8_lossy(other).into_owned(),
274            )),
275        }
276    }
277
278    /// Parse the `type` field on an annotated tag object (Git `type_from_string_gently` rules).
279    ///
280    /// The tag header line is `type <typename>\n` where `typename` must match a known object type
281    /// keyword **exactly** (no extra characters, no strict prefix of a longer keyword).
282    #[must_use]
283    pub fn from_tag_type_field(line: &[u8]) -> Option<Self> {
284        fn keyword_matches(canonical: &[u8], field: &[u8]) -> bool {
285            if field.is_empty() {
286                return false;
287            }
288            for (i, &bc) in field.iter().enumerate() {
289                let sc = canonical.get(i).copied().unwrap_or(0);
290                if sc != bc {
291                    return false;
292                }
293            }
294            canonical.get(field.len()).copied().unwrap_or(0) == 0
295        }
296
297        const NAMES: &[(ObjectKind, &[u8])] = &[
298            (ObjectKind::Blob, b"blob"),
299            (ObjectKind::Tree, b"tree"),
300            (ObjectKind::Commit, b"commit"),
301            (ObjectKind::Tag, b"tag"),
302        ];
303        for &(kind, name) in NAMES {
304            if keyword_matches(name, line) {
305                return Some(kind);
306            }
307        }
308        None
309    }
310
311    /// The ASCII keyword for this kind (used in object headers).
312    #[must_use]
313    pub fn as_str(&self) -> &'static str {
314        match self {
315            Self::Blob => "blob",
316            Self::Tree => "tree",
317            Self::Commit => "commit",
318            Self::Tag => "tag",
319        }
320    }
321}
322
323impl fmt::Display for ObjectKind {
324    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
325        f.write_str(self.as_str())
326    }
327}
328
329impl FromStr for ObjectKind {
330    type Err = Error;
331
332    fn from_str(s: &str) -> Result<Self> {
333        Self::from_bytes(s.as_bytes())
334    }
335}
336
337/// A decompressed, header-stripped Git object.
338#[derive(Debug, Clone)]
339pub struct Object {
340    /// The type of this object.
341    pub kind: ObjectKind,
342    /// Raw byte content (everything after the NUL in the header).
343    pub data: Vec<u8>,
344}
345
346impl Object {
347    /// Construct a new object from its kind and raw data.
348    #[must_use]
349    pub fn new(kind: ObjectKind, data: Vec<u8>) -> Self {
350        Self { kind, data }
351    }
352
353    /// Serialize to the canonical Git object format: `"<kind> <size>\0<data>"`.
354    #[must_use]
355    pub fn to_store_bytes(&self) -> Vec<u8> {
356        let header = format!("{} {}\0", self.kind, self.data.len());
357        let mut out = Vec::with_capacity(header.len() + self.data.len());
358        out.extend_from_slice(header.as_bytes());
359        out.extend_from_slice(&self.data);
360        out
361    }
362}
363
364/// A single entry in a Git tree object.
365#[derive(Debug, Clone, PartialEq, Eq)]
366pub struct TreeEntry {
367    /// Unix file mode (e.g. `0o100644` for a regular file, `0o040000` for a tree).
368    pub mode: u32,
369    /// Entry name (file or directory name only, no path separators).
370    pub name: Vec<u8>,
371    /// The object ID of the blob or sub-tree.
372    pub oid: ObjectId,
373}
374
375impl TreeEntry {
376    /// Format the mode as Git does: no leading zero, minimal digits.
377    ///
378    /// Git uses `"40000"` for trees (not `"040000"`), and `"100644"` for blobs.
379    #[must_use]
380    pub fn mode_str(&self) -> String {
381        // Git omits the leading zero for tree mode
382        if self.mode == 0o040000 {
383            "40000".to_owned()
384        } else {
385            format!("{:o}", self.mode)
386        }
387    }
388}
389
390/// Parse the raw data of a tree object into its entries.
391///
392/// # Format
393///
394/// Each entry is `"<mode> <name>\0<20-byte-sha1>"` concatenated with no
395/// separator between entries.
396///
397/// # Errors
398///
399/// Returns [`Error::CorruptObject`] if the data is malformed.
400pub fn parse_tree(data: &[u8]) -> Result<Vec<TreeEntry>> {
401    // A tree blob does not record its OID width; it is fixed by the repo's
402    // hash algorithm. Rather than thread `HashAlgo` through ~140 call sites,
403    // infer the width: a well-formed tree only parses cleanly (consuming the
404    // whole buffer) with the correct OID length. Try SHA-1 (20) first, then
405    // SHA-256 (32).
406    match parse_tree_with_oid_len(data, HashAlgo::Sha1.len()) {
407        Ok(entries) => Ok(entries),
408        Err(sha1_err) => parse_tree_with_oid_len(data, HashAlgo::Sha256.len()).map_err(|_| sha1_err),
409    }
410}
411
412/// Parse a tree blob assuming a fixed raw OID width (`oid_len` bytes).
413///
414/// Returns an error if the data does not parse cleanly into whole entries of
415/// that width (used by [`parse_tree`] to detect the hash algorithm).
416pub fn parse_tree_with_oid_len(data: &[u8], oid_len: usize) -> Result<Vec<TreeEntry>> {
417    let mut entries = Vec::new();
418    let mut pos = 0;
419
420    while pos < data.len() {
421        // Find the space separating mode from name
422        let sp = data[pos..]
423            .iter()
424            .position(|&b| b == b' ')
425            .ok_or_else(|| Error::CorruptObject("tree entry missing space".to_owned()))?;
426        let mode_bytes = &data[pos..pos + sp];
427        let mode = std::str::from_utf8(mode_bytes)
428            .ok()
429            .and_then(|s| u32::from_str_radix(s, 8).ok())
430            .ok_or_else(|| {
431                Error::CorruptObject(format!(
432                    "invalid tree mode: {}",
433                    String::from_utf8_lossy(mode_bytes)
434                ))
435            })?;
436        pos += sp + 1;
437
438        // Find the NUL separating name from the raw OID
439        let nul = data[pos..]
440            .iter()
441            .position(|&b| b == 0)
442            .ok_or_else(|| Error::CorruptObject("tree entry missing NUL".to_owned()))?;
443        let name = data[pos..pos + nul].to_vec();
444        pos += nul + 1;
445
446        if pos + oid_len > data.len() {
447            return Err(Error::CorruptObject("tree entry truncated SHA".to_owned()));
448        }
449        let oid = ObjectId::from_bytes(&data[pos..pos + oid_len])?;
450        pos += oid_len;
451
452        entries.push(TreeEntry { mode, name, oid });
453    }
454
455    Ok(entries)
456}
457
458/// Build the raw bytes of a tree object from a slice of entries.
459///
460/// Entries **must** already be sorted in Git tree order (see [`tree_entry_cmp`])
461/// before calling this function.
462#[must_use]
463pub fn serialize_tree(entries: &[TreeEntry]) -> Vec<u8> {
464    let mut out = Vec::new();
465    for e in entries {
466        out.extend_from_slice(e.mode_str().as_bytes());
467        out.push(b' ');
468        out.extend_from_slice(&e.name);
469        out.push(0);
470        out.extend_from_slice(e.oid.as_bytes());
471    }
472    out
473}
474
/// Ordering of two tree entries as Git sorts them.
///
/// Entries compare byte-wise as if a sub-tree's name had a trailing `/`
/// appended, so a directory `foo` sorts after a file `foo-bar` but before
/// `fooz`. This mirrors `base_name_compare` in Git's `tree.c`.
///
/// # Parameters
///
/// - `a_name`: name bytes of the first entry
/// - `a_is_tree`: whether the first entry is a sub-tree (`mode == 0o040000`)
/// - `b_name`: name bytes of the second entry
/// - `b_is_tree`: whether the second entry is a sub-tree
#[must_use]
pub fn tree_entry_cmp(
    a_name: &[u8],
    a_is_tree: bool,
    b_name: &[u8],
    b_is_tree: bool,
) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let shared = a_name.len().min(b_name.len());
    match a_name[..shared].cmp(&b_name[..shared]) {
        Ordering::Equal => {}
        decided => return decided,
    }

    // The common prefix is equal: the byte right after it decides. A name
    // that ends here contributes `/` when it is a sub-tree, otherwise NUL.
    let byte_after_prefix = |name: &[u8], is_tree: bool| -> u8 {
        match name.get(shared) {
            Some(&byte) => byte,
            None if is_tree => b'/',
            None => 0,
        }
    };
    byte_after_prefix(a_name, a_is_tree).cmp(&byte_after_prefix(b_name, b_is_tree))
}
507
508/// Parsed representation of a commit object.
509#[derive(Debug, Clone)]
510pub struct CommitData {
511    /// The tree this commit points to.
512    pub tree: ObjectId,
513    /// Parent commit IDs (zero or more).
514    pub parents: Vec<ObjectId>,
515    /// Author field decoded to Unicode (using `encoding` when present, else UTF-8).
516    pub author: String,
517    /// Committer field decoded to Unicode.
518    pub committer: String,
519    /// Exact `author` header payload bytes as stored in the object (after `author `).
520    ///
521    /// Empty means treat [`Self::author`] as UTF-8 when serializing (new commits).
522    pub author_raw: Vec<u8>,
523    /// Exact `committer` header payload bytes as stored in the object.
524    pub committer_raw: Vec<u8>,
525    /// Optional encoding override (e.g. `"UTF-8"`).
526    pub encoding: Option<String>,
527    /// Commit message (everything after the blank line).
528    pub message: String,
529    /// Optional raw message bytes for non-UTF-8 commit messages.
530    /// When set, `serialize_commit` uses these bytes instead of `message`.
531    #[doc = "Optional raw message bytes for non-UTF-8 messages."]
532    pub raw_message: Option<Vec<u8>>,
533}
534
535/// Parse the raw data of a commit object.
536///
537/// # Errors
538///
539/// Returns [`Error::CorruptObject`] if required headers are missing.
540pub fn parse_commit(data: &[u8]) -> Result<CommitData> {
541    // Header lines are mostly ASCII; author/committer payloads may match the `encoding` header.
542    // Continuation lines (leading SP) append to the previous header for author/committer, or are
543    // skipped for multiline headers Git allows (`gpgsig`, `mergetag`, …).
544    #[derive(Clone, Copy)]
545    enum Continuation {
546        Author,
547        Committer,
548        Multiline,
549        Ignore,
550    }
551
552    let mut pos = 0usize;
553    let mut tree = None;
554    let mut parents = Vec::new();
555    let mut author_raw: Option<Vec<u8>> = None;
556    let mut committer_raw: Option<Vec<u8>> = None;
557    let mut encoding: Option<String> = None;
558    let mut cont = Continuation::Ignore;
559
560    while pos < data.len() {
561        let line_start = pos;
562        let mut line_end = pos;
563        while line_end < data.len() && data[line_end] != b'\n' {
564            line_end += 1;
565        }
566        let line = &data[line_start..line_end];
567        let after_nl = line_end.saturating_add(1);
568        if line.is_empty() {
569            let body = data.get(after_nl..).unwrap_or_default();
570            let message = commit_encoding::decode_bytes(encoding.as_deref(), body);
571            // Preserve the exact message tail: Git allows commits whose log ends without a
572            // final newline (`commit-tree` from a file). `serialize_commit` appends `\n` when
573            // only `message` is set, so keep raw bytes when the body is not LF-terminated.
574            let has_non_utf8_encoding = encoding.as_deref().is_some_and(|label| {
575                !label.eq_ignore_ascii_case("utf-8") && !label.eq_ignore_ascii_case("utf8")
576            });
577            let raw_message = if body.is_empty() {
578                None
579            } else if has_non_utf8_encoding
580                || std::str::from_utf8(body).is_err()
581                || !body.ends_with(b"\n")
582            {
583                Some(body.to_vec())
584            } else {
585                None
586            };
587            let author_bytes = author_raw
588                .ok_or_else(|| Error::CorruptObject("commit missing author header".to_owned()))?;
589            let committer_bytes = committer_raw.ok_or_else(|| {
590                Error::CorruptObject("commit missing committer header".to_owned())
591            })?;
592            let author = commit_encoding::decode_bytes(encoding.as_deref(), &author_bytes);
593            let committer = commit_encoding::decode_bytes(encoding.as_deref(), &committer_bytes);
594            return Ok(CommitData {
595                tree: tree
596                    .ok_or_else(|| Error::CorruptObject("commit missing tree header".to_owned()))?,
597                parents,
598                author,
599                committer,
600                author_raw: author_bytes,
601                committer_raw: committer_bytes,
602                encoding,
603                message,
604                raw_message,
605            });
606        }
607
608        if line.first() == Some(&b' ') {
609            let rest = line.get(1..).unwrap_or_default();
610            match cont {
611                Continuation::Author => {
612                    let a = author_raw.as_mut().ok_or_else(|| {
613                        Error::CorruptObject("orphan header continuation".to_owned())
614                    })?;
615                    a.extend_from_slice(rest);
616                }
617                Continuation::Committer => {
618                    let c = committer_raw.as_mut().ok_or_else(|| {
619                        Error::CorruptObject("orphan header continuation".to_owned())
620                    })?;
621                    c.extend_from_slice(rest);
622                }
623                Continuation::Multiline | Continuation::Ignore => {}
624            }
625            pos = after_nl;
626            continue;
627        }
628
629        let key_end = line
630            .iter()
631            .position(|&b| b == b' ')
632            .ok_or_else(|| Error::CorruptObject("malformed commit header line".to_owned()))?;
633        let key = &line[..key_end];
634        let rest = line.get(key_end + 1..).unwrap_or_default();
635
636        match key {
637            b"tree" => {
638                let line_str = std::str::from_utf8(rest).map_err(|_| {
639                    Error::CorruptObject("commit tree line is not valid UTF-8".to_owned())
640                })?;
641                tree = Some(line_str.trim().parse::<ObjectId>()?);
642                cont = Continuation::Ignore;
643            }
644            b"parent" => {
645                let line_str = std::str::from_utf8(rest).map_err(|_| {
646                    Error::CorruptObject("commit parent line is not valid UTF-8".to_owned())
647                })?;
648                parents.push(line_str.trim().parse::<ObjectId>()?);
649                cont = Continuation::Ignore;
650            }
651            b"author" => {
652                author_raw = Some(rest.to_vec());
653                cont = Continuation::Author;
654            }
655            b"committer" => {
656                committer_raw = Some(rest.to_vec());
657                cont = Continuation::Committer;
658            }
659            b"encoding" => {
660                let line_str = std::str::from_utf8(rest).map_err(|_| {
661                    Error::CorruptObject("commit encoding line is not valid UTF-8".to_owned())
662                })?;
663                encoding = Some(line_str.to_owned());
664                cont = Continuation::Ignore;
665            }
666            _ => {
667                cont = Continuation::Multiline;
668            }
669        }
670        pos = after_nl;
671    }
672
673    Err(Error::CorruptObject(
674        "commit missing blank line before message".to_owned(),
675    ))
676}
677
/// Look up a header value in the header block of a tag object.
///
/// Lines are scanned from the start of `data` up to the first empty line.
/// For the first line that begins with `prefix`, the rest of that line is
/// returned, lossily decoded as UTF-8 and trimmed of surrounding whitespace.
/// Returns `None` when no header line starts with `prefix`.
#[must_use]
pub fn tag_header_field(data: &[u8], prefix: &[u8]) -> Option<String> {
    data.split(|&byte| byte == b'\n')
        // An empty segment is the blank line before the body or the end of
        // the data; the scan stops there in both cases.
        .take_while(|line| !line.is_empty())
        .find_map(|line| line.strip_prefix(prefix))
        .map(|value| String::from_utf8_lossy(value).trim().to_owned())
}
700
701/// OID from the first `object <hex>` line in the tag header block, if hex parses.
702#[must_use]
703pub fn tag_object_line_oid(data: &[u8]) -> Option<ObjectId> {
704    let s = tag_header_field(data, b"object ")?;
705    s.parse().ok()
706}
707
708/// Parsed representation of an annotated tag object.
709#[derive(Debug, Clone)]
710pub struct TagData {
711    /// The object this tag points to.
712    pub object: ObjectId,
713    /// The type of the tagged object (e.g. `"commit"`).
714    pub object_type: String,
715    /// The short tag name (without `refs/tags/` prefix).
716    pub tag: String,
717    /// The tagger identity and timestamp (raw Git format).
718    pub tagger: Option<String>,
719    /// The tag message (everything after the blank line).
720    pub message: String,
721}
722
723/// Parse the raw data of a tag object.
724///
725/// # Errors
726///
727/// Returns [`Error::CorruptObject`] if required headers are missing or malformed.
728pub fn parse_tag(data: &[u8]) -> Result<TagData> {
729    let text = std::str::from_utf8(data)
730        .map_err(|_| Error::CorruptObject("tag is not valid UTF-8".to_owned()))?;
731
732    let mut object = None;
733    let mut object_type = None;
734    let mut tag_name = None;
735    let mut tagger = None;
736    let mut message = String::new();
737    let mut in_message = false;
738
739    for line in text.split('\n') {
740        if in_message {
741            message.push_str(line);
742            message.push('\n');
743            continue;
744        }
745        if line.is_empty() {
746            in_message = true;
747            continue;
748        }
749        if let Some(rest) = line.strip_prefix("object ") {
750            object = Some(rest.trim().parse::<ObjectId>()?);
751        } else if let Some(rest) = line.strip_prefix("type ") {
752            let typ = rest.trim();
753            if ObjectKind::from_tag_type_field(typ.as_bytes()).is_none() {
754                return Err(Error::CorruptObject(format!(
755                    "invalid 'type' value in tag: {typ}"
756                )));
757            }
758            object_type = Some(typ.to_owned());
759        } else if let Some(rest) = line.strip_prefix("tag ") {
760            tag_name = Some(rest.trim().to_owned());
761        } else if let Some(rest) = line.strip_prefix("tagger ") {
762            tagger = Some(rest.to_owned());
763        }
764    }
765
766    // Strip one trailing newline that split adds
767    if message.ends_with('\n') {
768        message.pop();
769    }
770
771    Ok(TagData {
772        object: object
773            .ok_or_else(|| Error::CorruptObject("tag missing object header".to_owned()))?,
774        object_type: object_type
775            .ok_or_else(|| Error::CorruptObject("tag missing type header".to_owned()))?,
776        tag: tag_name.ok_or_else(|| Error::CorruptObject("tag missing tag header".to_owned()))?,
777        tagger,
778        message,
779    })
780}
781
782/// Serialize a [`TagData`] into the raw bytes suitable for storage as a tag object.
783///
784/// The caller is responsible for supplying a correctly-formatted `tagger` string
785/// (including timestamp and timezone) when present.
786#[must_use]
787pub fn serialize_tag(t: &TagData) -> Vec<u8> {
788    let mut out = String::new();
789    out.push_str(&format!("object {}\n", t.object));
790    out.push_str(&format!("type {}\n", t.object_type));
791    out.push_str(&format!("tag {}\n", t.tag));
792    if let Some(ref tagger) = t.tagger {
793        out.push_str(&format!("tagger {tagger}\n"));
794    }
795    out.push('\n');
796    // Only add message if non-empty (don't add extra blank line for empty message)
797    let msg = t.message.trim_end_matches('\n');
798    if !msg.is_empty() {
799        out.push_str(msg);
800        out.push('\n');
801    }
802    out.into_bytes()
803}
804
805/// Serialize a [`CommitData`] into the raw bytes suitable for storage.
806///
807/// The caller is responsible for supplying a correctly-formatted `author` and
808/// `committer` string (including timestamp and timezone).
809///
810/// The message body is written exactly as given: `git commit` and `git commit-tree -m`
811/// supply a trailing LF; `git commit-tree` reading from stdin or `-F` does not add one.
812#[must_use]
813pub fn serialize_commit(c: &CommitData) -> Vec<u8> {
814    let mut out = Vec::new();
815    out.extend_from_slice(format!("tree {}\n", c.tree).as_bytes());
816    for p in &c.parents {
817        out.extend_from_slice(format!("parent {p}\n").as_bytes());
818    }
819    out.extend_from_slice(b"author ");
820    if c.author_raw.is_empty() {
821        out.extend_from_slice(c.author.as_bytes());
822    } else {
823        out.extend_from_slice(&c.author_raw);
824    }
825    out.push(b'\n');
826    out.extend_from_slice(b"committer ");
827    if c.committer_raw.is_empty() {
828        out.extend_from_slice(c.committer.as_bytes());
829    } else {
830        out.extend_from_slice(&c.committer_raw);
831    }
832    out.push(b'\n');
833    if let Some(enc) = &c.encoding {
834        out.extend_from_slice(format!("encoding {enc}\n").as_bytes());
835    }
836    out.push(b'\n');
837    if let Some(raw) = &c.raw_message {
838        out.extend_from_slice(raw);
839    } else if !c.message.is_empty() {
840        out.extend_from_slice(c.message.as_bytes());
841    }
842    out
843}
844
#[cfg(test)]
mod commit_parse_tests {
    use super::*;

    /// A signed commit: the `gpgsig` header spans several continuation lines,
    /// which the parser must skip without disturbing the other fields.
    #[test]
    fn parse_commit_skips_multiline_gpgsig_continuation() {
        const TREE_HEX: &str = "4b825dc642cb6eb9a060e54bf8d69288fbee4904";
        let lines = [
            "tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904",
            "author A U Thor <author@example.com> 1 +0000",
            "committer C O Mitter <committer@example.com> 1 +0000",
            "gpgsig -----BEGIN PGP SIGNATURE-----",
            " abcdef",
            " -----END PGP SIGNATURE-----",
            "",
            "msg",
        ];
        let mut raw = lines.join("\n");
        raw.push('\n');

        let commit = parse_commit(raw.as_bytes()).expect("parse signed commit");
        assert_eq!(commit.tree.to_hex(), TREE_HEX);
        assert_eq!(commit.message, "msg\n");
    }
}