Skip to main content

grit_lib/
objects.rs

1//! Git object model: object IDs, kinds, and in-memory representations.
2//!
3//! # Object ID
4//!
5//! [`ObjectId`] is a 20-byte SHA-1 digest.  It implements `Display` as
6//! lowercase hex, `FromStr` from a 40-character hex string, and the standard
7//! ordering traits so it can be used as a map key.
8//!
9//! # Object Kind
10//!
11//! [`ObjectKind`] represents the four Git object types: blob, tree, commit,
12//! and tag.  The raw header byte-slice is parsed with [`ObjectKind::from_bytes`].
13//!
14//! # Parsed objects
15//!
//! [`Object`] bundles a kind and its raw (decompressed, header-stripped) byte
//! content.  Higher-level parsed forms (e.g. [`TreeEntry`], [`CommitData`],
//! [`TagData`]) live in this module and are produced by the fallible parsers
//! [`parse_tree`], [`parse_commit`], and [`parse_tag`].
20
21use std::fmt;
22use std::str::FromStr;
23
24use crate::commit_encoding;
25use crate::error::{Error, Result};
26
27/// A 20-byte SHA-1 object identifier.
28#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
29pub struct ObjectId([u8; 20]);
30
31impl ObjectId {
32    /// The all-zero object id (Git's "null" OID).
33    ///
34    /// Used for index placeholders such as intent-to-add entries and for
35    /// special cases in plumbing output.
36    #[must_use]
37    pub const fn zero() -> Self {
38        Self([0u8; 20])
39    }
40
41    /// Construct from a 20-byte slice.
42    ///
43    /// # Errors
44    ///
45    /// Returns [`Error::InvalidObjectId`] when `bytes` is not exactly 20 bytes.
46    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
47        let arr: [u8; 20] = bytes
48            .try_into()
49            .map_err(|_| Error::InvalidObjectId(hex::encode(bytes)))?;
50        Ok(Self(arr))
51    }
52
53    /// Raw 20-byte digest.
54    #[must_use]
55    pub fn as_bytes(&self) -> &[u8; 20] {
56        &self.0
57    }
58
59    /// Check if this is the null (all-zero) object ID.
60    #[must_use]
61    pub fn is_zero(&self) -> bool {
62        self.0 == [0u8; 20]
63    }
64
65    /// Lowercase hex representation (40 characters).
66    #[must_use]
67    pub fn to_hex(&self) -> String {
68        hex::encode(self.0)
69    }
70
71    /// The two-character directory prefix used by the loose object store.
72    ///
73    /// Returns the first two hex chars (e.g. `"ab"` for `"ab3f…"`).
74    #[must_use]
75    pub fn loose_prefix(&self) -> String {
76        hex::encode(&self.0[..1])
77    }
78
79    /// Parse an object ID from a hex string.
80    ///
81    /// # Errors
82    ///
83    /// Returns [`Error::InvalidObjectId`] if the string is not a valid
84    /// 40-character hex OID.
85    pub fn from_hex(s: &str) -> Result<Self> {
86        s.parse()
87    }
88
89    /// The 38-character suffix used as the filename inside the loose prefix dir.
90    #[must_use]
91    pub fn loose_suffix(&self) -> String {
92        hex::encode(&self.0[1..])
93    }
94}
95
96impl fmt::Display for ObjectId {
97    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
98        f.write_str(&self.to_hex())
99    }
100}
101
102impl fmt::Debug for ObjectId {
103    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104        write!(f, "ObjectId({})", self.to_hex())
105    }
106}
107
108impl FromStr for ObjectId {
109    type Err = Error;
110
111    fn from_str(s: &str) -> Result<Self> {
112        if s.len() != 40 {
113            return Err(Error::InvalidObjectId(s.to_owned()));
114        }
115        let bytes = hex::decode(s).map_err(|_| Error::InvalidObjectId(s.to_owned()))?;
116        Self::from_bytes(&bytes)
117    }
118}
119
120/// The four Git object types.
121#[derive(Debug, Clone, Copy, PartialEq, Eq)]
122pub enum ObjectKind {
123    /// A raw file snapshot.
124    Blob,
125    /// A directory listing.
126    Tree,
127    /// A snapshot with metadata and parentage.
128    Commit,
129    /// An annotated tag.
130    Tag,
131}
132
133impl ObjectKind {
134    /// Parse from the ASCII keyword used in Git object headers.
135    ///
136    /// # Errors
137    ///
138    /// Returns [`Error::UnknownObjectType`] for unrecognised strings.
139    pub fn from_bytes(b: &[u8]) -> Result<Self> {
140        match b {
141            b"blob" => Ok(Self::Blob),
142            b"tree" => Ok(Self::Tree),
143            b"commit" => Ok(Self::Commit),
144            b"tag" => Ok(Self::Tag),
145            other => Err(Error::UnknownObjectType(
146                String::from_utf8_lossy(other).into_owned(),
147            )),
148        }
149    }
150
151    /// The ASCII keyword for this kind (used in object headers).
152    #[must_use]
153    pub fn as_str(&self) -> &'static str {
154        match self {
155            Self::Blob => "blob",
156            Self::Tree => "tree",
157            Self::Commit => "commit",
158            Self::Tag => "tag",
159        }
160    }
161}
162
163impl fmt::Display for ObjectKind {
164    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
165        f.write_str(self.as_str())
166    }
167}
168
169impl FromStr for ObjectKind {
170    type Err = Error;
171
172    fn from_str(s: &str) -> Result<Self> {
173        Self::from_bytes(s.as_bytes())
174    }
175}
176
177/// A decompressed, header-stripped Git object.
178#[derive(Debug, Clone)]
179pub struct Object {
180    /// The type of this object.
181    pub kind: ObjectKind,
182    /// Raw byte content (everything after the NUL in the header).
183    pub data: Vec<u8>,
184}
185
186impl Object {
187    /// Construct a new object from its kind and raw data.
188    #[must_use]
189    pub fn new(kind: ObjectKind, data: Vec<u8>) -> Self {
190        Self { kind, data }
191    }
192
193    /// Serialize to the canonical Git object format: `"<kind> <size>\0<data>"`.
194    #[must_use]
195    pub fn to_store_bytes(&self) -> Vec<u8> {
196        let header = format!("{} {}\0", self.kind, self.data.len());
197        let mut out = Vec::with_capacity(header.len() + self.data.len());
198        out.extend_from_slice(header.as_bytes());
199        out.extend_from_slice(&self.data);
200        out
201    }
202}
203
204/// A single entry in a Git tree object.
205#[derive(Debug, Clone, PartialEq, Eq)]
206pub struct TreeEntry {
207    /// Unix file mode (e.g. `0o100644` for a regular file, `0o040000` for a tree).
208    pub mode: u32,
209    /// Entry name (file or directory name only, no path separators).
210    pub name: Vec<u8>,
211    /// The object ID of the blob or sub-tree.
212    pub oid: ObjectId,
213}
214
215impl TreeEntry {
216    /// Format the mode as Git does: no leading zero, minimal digits.
217    ///
218    /// Git uses `"40000"` for trees (not `"040000"`), and `"100644"` for blobs.
219    #[must_use]
220    pub fn mode_str(&self) -> String {
221        // Git omits the leading zero for tree mode
222        if self.mode == 0o040000 {
223            "40000".to_owned()
224        } else {
225            format!("{:o}", self.mode)
226        }
227    }
228}
229
230/// Parse the raw data of a tree object into its entries.
231///
232/// # Format
233///
234/// Each entry is `"<mode> <name>\0<20-byte-sha1>"` concatenated with no
235/// separator between entries.
236///
237/// # Errors
238///
239/// Returns [`Error::CorruptObject`] if the data is malformed.
240pub fn parse_tree(data: &[u8]) -> Result<Vec<TreeEntry>> {
241    let mut entries = Vec::new();
242    let mut pos = 0;
243
244    while pos < data.len() {
245        // Find the space separating mode from name
246        let sp = data[pos..]
247            .iter()
248            .position(|&b| b == b' ')
249            .ok_or_else(|| Error::CorruptObject("tree entry missing space".to_owned()))?;
250        let mode_bytes = &data[pos..pos + sp];
251        let mode = std::str::from_utf8(mode_bytes)
252            .ok()
253            .and_then(|s| u32::from_str_radix(s, 8).ok())
254            .ok_or_else(|| {
255                Error::CorruptObject(format!(
256                    "invalid tree mode: {}",
257                    String::from_utf8_lossy(mode_bytes)
258                ))
259            })?;
260        pos += sp + 1;
261
262        // Find the NUL separating name from the 20-byte SHA
263        let nul = data[pos..]
264            .iter()
265            .position(|&b| b == 0)
266            .ok_or_else(|| Error::CorruptObject("tree entry missing NUL".to_owned()))?;
267        let name = data[pos..pos + nul].to_vec();
268        pos += nul + 1;
269
270        if pos + 20 > data.len() {
271            return Err(Error::CorruptObject("tree entry truncated SHA".to_owned()));
272        }
273        let oid = ObjectId::from_bytes(&data[pos..pos + 20])?;
274        pos += 20;
275
276        entries.push(TreeEntry { mode, name, oid });
277    }
278
279    Ok(entries)
280}
281
282/// Build the raw bytes of a tree object from a slice of entries.
283///
284/// Entries **must** already be sorted in Git tree order (see [`tree_entry_cmp`])
285/// before calling this function.
286#[must_use]
287pub fn serialize_tree(entries: &[TreeEntry]) -> Vec<u8> {
288    let mut out = Vec::new();
289    for e in entries {
290        out.extend_from_slice(e.mode_str().as_bytes());
291        out.push(b' ');
292        out.extend_from_slice(&e.name);
293        out.push(0);
294        out.extend_from_slice(e.oid.as_bytes());
295    }
296    out
297}
298
/// Orders two tree entries the way Git sorts a tree object.
///
/// Names are compared byte-wise over their common length.  If that prefix is
/// equal, the next byte of each name decides; a name that has run out
/// contributes `'/'` when its entry is a sub-tree and NUL otherwise.  Hence a
/// directory `foo` sorts after a file `foo-bar` but before `fooz`, matching
/// `base_name_compare` in Git's `tree.c`.
///
/// # Parameters
///
/// - `a_name`: name bytes of the first entry
/// - `a_is_tree`: whether the first entry is a sub-tree (`mode == 0o040000`)
/// - `b_name`: name bytes of the second entry
/// - `b_is_tree`: whether the second entry is a sub-tree
#[must_use]
pub fn tree_entry_cmp(
    a_name: &[u8],
    a_is_tree: bool,
    b_name: &[u8],
    b_is_tree: bool,
) -> std::cmp::Ordering {
    let shared = a_name.len().min(b_name.len());
    let (a_head, a_tail) = a_name.split_at(shared);
    let (b_head, b_tail) = b_name.split_at(shared);

    a_head.cmp(b_head).then_with(|| {
        // Only the single byte after the common prefix takes part in the
        // tie-break; an exhausted name is represented by its trailer.
        let a_next = a_tail
            .first()
            .copied()
            .unwrap_or(if a_is_tree { b'/' } else { 0 });
        let b_next = b_tail
            .first()
            .copied()
            .unwrap_or(if b_is_tree { b'/' } else { 0 });
        a_next.cmp(&b_next)
    })
}
331
332/// Parsed representation of a commit object.
333#[derive(Debug, Clone)]
334pub struct CommitData {
335    /// The tree this commit points to.
336    pub tree: ObjectId,
337    /// Parent commit IDs (zero or more).
338    pub parents: Vec<ObjectId>,
339    /// Author field decoded to Unicode (using `encoding` when present, else UTF-8).
340    pub author: String,
341    /// Committer field decoded to Unicode.
342    pub committer: String,
343    /// Exact `author` header payload bytes as stored in the object (after `author `).
344    ///
345    /// Empty means treat [`Self::author`] as UTF-8 when serializing (new commits).
346    pub author_raw: Vec<u8>,
347    /// Exact `committer` header payload bytes as stored in the object.
348    pub committer_raw: Vec<u8>,
349    /// Optional encoding override (e.g. `"UTF-8"`).
350    pub encoding: Option<String>,
351    /// Commit message (everything after the blank line).
352    pub message: String,
353    /// Optional raw message bytes for non-UTF-8 commit messages.
354    /// When set, `serialize_commit` uses these bytes instead of `message`.
355    #[doc = "Optional raw message bytes for non-UTF-8 messages."]
356    pub raw_message: Option<Vec<u8>>,
357}
358
359/// Parse the raw data of a commit object.
360///
361/// # Errors
362///
363/// Returns [`Error::CorruptObject`] if required headers are missing.
364pub fn parse_commit(data: &[u8]) -> Result<CommitData> {
365    // Header lines are mostly ASCII; author/committer payloads may match the `encoding` header.
366    // Continuation lines (leading SP) append to the previous header; we only retain author/committer.
367    #[derive(Clone, Copy)]
368    enum Continuation {
369        Author,
370        Committer,
371        Ignore,
372    }
373
374    let mut pos = 0usize;
375    let mut tree = None;
376    let mut parents = Vec::new();
377    let mut author_raw: Option<Vec<u8>> = None;
378    let mut committer_raw: Option<Vec<u8>> = None;
379    let mut encoding = None;
380    let mut cont = Continuation::Ignore;
381
382    while pos < data.len() {
383        let line_start = pos;
384        let mut line_end = pos;
385        while line_end < data.len() && data[line_end] != b'\n' {
386            line_end += 1;
387        }
388        let line = &data[line_start..line_end];
389        let after_nl = line_end.saturating_add(1);
390        if line.is_empty() {
391            let body = data.get(after_nl..).unwrap_or_default();
392            let message = commit_encoding::decode_bytes(encoding.as_deref(), body);
393            // Preserve the exact message tail: Git allows commits whose log ends without a
394            // final newline (`commit-tree` from a file). `serialize_commit` appends `\n` when
395            // only `message` is set, so keep raw bytes when the body is not LF-terminated.
396            let raw_message = if body.is_empty() {
397                None
398            } else if std::str::from_utf8(body).is_err() || !body.ends_with(b"\n") {
399                Some(body.to_vec())
400            } else {
401                None
402            };
403            let author_bytes = author_raw
404                .ok_or_else(|| Error::CorruptObject("commit missing author header".to_owned()))?;
405            let committer_bytes = committer_raw.ok_or_else(|| {
406                Error::CorruptObject("commit missing committer header".to_owned())
407            })?;
408            let author = commit_encoding::decode_bytes(encoding.as_deref(), &author_bytes);
409            let committer = commit_encoding::decode_bytes(encoding.as_deref(), &committer_bytes);
410            return Ok(CommitData {
411                tree: tree
412                    .ok_or_else(|| Error::CorruptObject("commit missing tree header".to_owned()))?,
413                parents,
414                author,
415                committer,
416                author_raw: author_bytes,
417                committer_raw: committer_bytes,
418                encoding,
419                message,
420                raw_message,
421            });
422        }
423
424        if line.first() == Some(&b' ') {
425            let rest = line.get(1..).unwrap_or_default();
426            match cont {
427                Continuation::Author => {
428                    let a = author_raw.as_mut().ok_or_else(|| {
429                        Error::CorruptObject("orphan header continuation".to_owned())
430                    })?;
431                    a.extend_from_slice(rest);
432                }
433                Continuation::Committer => {
434                    let c = committer_raw.as_mut().ok_or_else(|| {
435                        Error::CorruptObject("orphan header continuation".to_owned())
436                    })?;
437                    c.extend_from_slice(rest);
438                }
439                Continuation::Ignore => {}
440            }
441            pos = after_nl;
442            continue;
443        }
444
445        let key_end = line
446            .iter()
447            .position(|&b| b == b' ')
448            .ok_or_else(|| Error::CorruptObject("malformed commit header line".to_owned()))?;
449        let key = &line[..key_end];
450        let rest = line.get(key_end + 1..).unwrap_or_default();
451
452        match key {
453            b"tree" => {
454                let line_str = std::str::from_utf8(rest).map_err(|_| {
455                    Error::CorruptObject("commit tree line is not valid UTF-8".to_owned())
456                })?;
457                tree = Some(line_str.trim().parse::<ObjectId>()?);
458                cont = Continuation::Ignore;
459            }
460            b"parent" => {
461                let line_str = std::str::from_utf8(rest).map_err(|_| {
462                    Error::CorruptObject("commit parent line is not valid UTF-8".to_owned())
463                })?;
464                parents.push(line_str.trim().parse::<ObjectId>()?);
465                cont = Continuation::Ignore;
466            }
467            b"author" => {
468                author_raw = Some(rest.to_vec());
469                cont = Continuation::Author;
470            }
471            b"committer" => {
472                committer_raw = Some(rest.to_vec());
473                cont = Continuation::Committer;
474            }
475            b"encoding" => {
476                let line_str = std::str::from_utf8(rest).map_err(|_| {
477                    Error::CorruptObject("commit encoding line is not valid UTF-8".to_owned())
478                })?;
479                encoding = Some(line_str.to_owned());
480                cont = Continuation::Ignore;
481            }
482            _ => {
483                cont = Continuation::Ignore;
484            }
485        }
486        pos = after_nl;
487    }
488
489    Err(Error::CorruptObject(
490        "commit missing blank line before message".to_owned(),
491    ))
492}
493
494/// Parsed representation of an annotated tag object.
495#[derive(Debug, Clone)]
496pub struct TagData {
497    /// The object this tag points to.
498    pub object: ObjectId,
499    /// The type of the tagged object (e.g. `"commit"`).
500    pub object_type: String,
501    /// The short tag name (without `refs/tags/` prefix).
502    pub tag: String,
503    /// The tagger identity and timestamp (raw Git format).
504    pub tagger: Option<String>,
505    /// The tag message (everything after the blank line).
506    pub message: String,
507}
508
509/// Parse the raw data of a tag object.
510///
511/// # Errors
512///
513/// Returns [`Error::CorruptObject`] if required headers are missing or malformed.
514pub fn parse_tag(data: &[u8]) -> Result<TagData> {
515    let text = std::str::from_utf8(data)
516        .map_err(|_| Error::CorruptObject("tag is not valid UTF-8".to_owned()))?;
517
518    let mut object = None;
519    let mut object_type = None;
520    let mut tag_name = None;
521    let mut tagger = None;
522    let mut message = String::new();
523    let mut in_message = false;
524
525    for line in text.split('\n') {
526        if in_message {
527            message.push_str(line);
528            message.push('\n');
529            continue;
530        }
531        if line.is_empty() {
532            in_message = true;
533            continue;
534        }
535        if let Some(rest) = line.strip_prefix("object ") {
536            object = Some(rest.trim().parse::<ObjectId>()?);
537        } else if let Some(rest) = line.strip_prefix("type ") {
538            object_type = Some(rest.trim().to_owned());
539        } else if let Some(rest) = line.strip_prefix("tag ") {
540            tag_name = Some(rest.trim().to_owned());
541        } else if let Some(rest) = line.strip_prefix("tagger ") {
542            tagger = Some(rest.to_owned());
543        }
544    }
545
546    // Strip one trailing newline that split adds
547    if message.ends_with('\n') {
548        message.pop();
549    }
550
551    Ok(TagData {
552        object: object
553            .ok_or_else(|| Error::CorruptObject("tag missing object header".to_owned()))?,
554        object_type: object_type
555            .ok_or_else(|| Error::CorruptObject("tag missing type header".to_owned()))?,
556        tag: tag_name.ok_or_else(|| Error::CorruptObject("tag missing tag header".to_owned()))?,
557        tagger,
558        message,
559    })
560}
561
562/// Serialize a [`TagData`] into the raw bytes suitable for storage as a tag object.
563///
564/// The caller is responsible for supplying a correctly-formatted `tagger` string
565/// (including timestamp and timezone) when present.
566#[must_use]
567pub fn serialize_tag(t: &TagData) -> Vec<u8> {
568    let mut out = String::new();
569    out.push_str(&format!("object {}\n", t.object));
570    out.push_str(&format!("type {}\n", t.object_type));
571    out.push_str(&format!("tag {}\n", t.tag));
572    if let Some(ref tagger) = t.tagger {
573        out.push_str(&format!("tagger {tagger}\n"));
574    }
575    out.push('\n');
576    // Only add message if non-empty (don't add extra blank line for empty message)
577    let msg = t.message.trim_end_matches('\n');
578    if !msg.is_empty() {
579        out.push_str(msg);
580        out.push('\n');
581    }
582    out.into_bytes()
583}
584
585/// Serialize a [`CommitData`] into the raw bytes suitable for storage.
586///
587/// The caller is responsible for supplying a correctly-formatted `author` and
588/// `committer` string (including timestamp and timezone).
589///
590/// The message body is written exactly as given: `git commit` and `git commit-tree -m`
591/// supply a trailing LF; `git commit-tree` reading from stdin or `-F` does not add one.
592#[must_use]
593pub fn serialize_commit(c: &CommitData) -> Vec<u8> {
594    let mut out = Vec::new();
595    out.extend_from_slice(format!("tree {}\n", c.tree).as_bytes());
596    for p in &c.parents {
597        out.extend_from_slice(format!("parent {p}\n").as_bytes());
598    }
599    out.extend_from_slice(b"author ");
600    if c.author_raw.is_empty() {
601        out.extend_from_slice(c.author.as_bytes());
602    } else {
603        out.extend_from_slice(&c.author_raw);
604    }
605    out.push(b'\n');
606    out.extend_from_slice(b"committer ");
607    if c.committer_raw.is_empty() {
608        out.extend_from_slice(c.committer.as_bytes());
609    } else {
610        out.extend_from_slice(&c.committer_raw);
611    }
612    out.push(b'\n');
613    if let Some(enc) = &c.encoding {
614        out.extend_from_slice(format!("encoding {enc}\n").as_bytes());
615    }
616    out.push(b'\n');
617    if let Some(raw) = &c.raw_message {
618        out.extend_from_slice(raw);
619    } else if !c.message.is_empty() {
620        out.extend_from_slice(c.message.as_bytes());
621    }
622    out
623}