Skip to main content

grit_lib/
objects.rs

//! Git object model: object IDs, kinds, and in-memory representations.
//!
//! # Object ID
//!
//! [`ObjectId`] is a 20-byte SHA-1 digest.  It implements `Display` as
//! lowercase hex, `FromStr` from a 40-character hex string, and the standard
//! ordering traits so it can be used as a map key.
//!
//! # Object Kind
//!
//! [`ObjectKind`] represents the four Git object types: blob, tree, commit,
//! and tag.  The type keyword of an object header is parsed with
//! [`ObjectKind::from_bytes`].
//!
//! # Parsed objects
//!
//! [`Object`] bundles a kind and its raw (decompressed, header-stripped) byte
//! content.  Higher-level parsed forms (e.g. [`TreeEntry`], [`CommitData`],
//! [`TagData`]) live in this module and are produced by fallible parsing
//! functions such as [`parse_tree`], [`parse_commit`] and [`parse_tag`].
20
21use std::fmt;
22use std::str::FromStr;
23
24use crate::error::{Error, Result};
25
/// Identifier of a Git object: the raw 20 bytes of its SHA-1 digest.
///
/// Equality, ordering and hashing are all derived from the digest bytes, so
/// the type can be used directly as a key in ordered or hashed maps.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct ObjectId([u8; 20]);
29
30impl ObjectId {
31    /// Construct from a 20-byte slice.
32    ///
33    /// # Errors
34    ///
35    /// Returns [`Error::InvalidObjectId`] when `bytes` is not exactly 20 bytes.
36    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
37        let arr: [u8; 20] = bytes
38            .try_into()
39            .map_err(|_| Error::InvalidObjectId(hex::encode(bytes)))?;
40        Ok(Self(arr))
41    }
42
43    /// Raw 20-byte digest.
44    #[must_use]
45    pub fn as_bytes(&self) -> &[u8; 20] {
46        &self.0
47    }
48
49    /// Check if this is the null (all-zero) object ID.
50    #[must_use]
51    pub fn is_zero(&self) -> bool {
52        self.0 == [0u8; 20]
53    }
54
55    /// Lowercase hex representation (40 characters).
56    #[must_use]
57    pub fn to_hex(&self) -> String {
58        hex::encode(self.0)
59    }
60
61    /// The two-character directory prefix used by the loose object store.
62    ///
63    /// Returns the first two hex chars (e.g. `"ab"` for `"ab3f…"`).
64    #[must_use]
65    pub fn loose_prefix(&self) -> String {
66        hex::encode(&self.0[..1])
67    }
68
69    /// Parse an object ID from a hex string.
70    ///
71    /// # Errors
72    ///
73    /// Returns [`Error::InvalidObjectId`] if the string is not a valid
74    /// 40-character hex OID.
75    pub fn from_hex(s: &str) -> Result<Self> {
76        s.parse()
77    }
78
79    /// The 38-character suffix used as the filename inside the loose prefix dir.
80    #[must_use]
81    pub fn loose_suffix(&self) -> String {
82        hex::encode(&self.0[1..])
83    }
84}
85
86impl fmt::Display for ObjectId {
87    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
88        f.write_str(&self.to_hex())
89    }
90}
91
92impl fmt::Debug for ObjectId {
93    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94        write!(f, "ObjectId({})", self.to_hex())
95    }
96}
97
98impl FromStr for ObjectId {
99    type Err = Error;
100
101    fn from_str(s: &str) -> Result<Self> {
102        if s.len() != 40 {
103            return Err(Error::InvalidObjectId(s.to_owned()));
104        }
105        let bytes = hex::decode(s).map_err(|_| Error::InvalidObjectId(s.to_owned()))?;
106        Self::from_bytes(&bytes)
107    }
108}
109
/// The type of a Git object: one of blob, tree, commit or tag.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ObjectKind {
    /// File content snapshot.
    Blob,
    /// Directory listing.
    Tree,
    /// Snapshot together with its metadata and parent links.
    Commit,
    /// Annotated tag.
    Tag,
}
122
123impl ObjectKind {
124    /// Parse from the ASCII keyword used in Git object headers.
125    ///
126    /// # Errors
127    ///
128    /// Returns [`Error::UnknownObjectType`] for unrecognised strings.
129    pub fn from_bytes(b: &[u8]) -> Result<Self> {
130        match b {
131            b"blob" => Ok(Self::Blob),
132            b"tree" => Ok(Self::Tree),
133            b"commit" => Ok(Self::Commit),
134            b"tag" => Ok(Self::Tag),
135            other => Err(Error::UnknownObjectType(
136                String::from_utf8_lossy(other).into_owned(),
137            )),
138        }
139    }
140
141    /// The ASCII keyword for this kind (used in object headers).
142    #[must_use]
143    pub fn as_str(&self) -> &'static str {
144        match self {
145            Self::Blob => "blob",
146            Self::Tree => "tree",
147            Self::Commit => "commit",
148            Self::Tag => "tag",
149        }
150    }
151}
152
153impl fmt::Display for ObjectKind {
154    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
155        f.write_str(self.as_str())
156    }
157}
158
159impl FromStr for ObjectKind {
160    type Err = Error;
161
162    fn from_str(s: &str) -> Result<Self> {
163        Self::from_bytes(s.as_bytes())
164    }
165}
166
167/// A decompressed, header-stripped Git object.
168#[derive(Debug, Clone)]
169pub struct Object {
170    /// The type of this object.
171    pub kind: ObjectKind,
172    /// Raw byte content (everything after the NUL in the header).
173    pub data: Vec<u8>,
174}
175
176impl Object {
177    /// Construct a new object from its kind and raw data.
178    #[must_use]
179    pub fn new(kind: ObjectKind, data: Vec<u8>) -> Self {
180        Self { kind, data }
181    }
182
183    /// Serialize to the canonical Git object format: `"<kind> <size>\0<data>"`.
184    #[must_use]
185    pub fn to_store_bytes(&self) -> Vec<u8> {
186        let header = format!("{} {}\0", self.kind, self.data.len());
187        let mut out = Vec::with_capacity(header.len() + self.data.len());
188        out.extend_from_slice(header.as_bytes());
189        out.extend_from_slice(&self.data);
190        out
191    }
192}
193
194/// A single entry in a Git tree object.
195#[derive(Debug, Clone, PartialEq, Eq)]
196pub struct TreeEntry {
197    /// Unix file mode (e.g. `0o100644` for a regular file, `0o040000` for a tree).
198    pub mode: u32,
199    /// Entry name (file or directory name only, no path separators).
200    pub name: Vec<u8>,
201    /// The object ID of the blob or sub-tree.
202    pub oid: ObjectId,
203}
204
205impl TreeEntry {
206    /// Format the mode as Git does: no leading zero, minimal digits.
207    ///
208    /// Git uses `"40000"` for trees (not `"040000"`), and `"100644"` for blobs.
209    #[must_use]
210    pub fn mode_str(&self) -> String {
211        // Git omits the leading zero for tree mode
212        if self.mode == 0o040000 {
213            "40000".to_owned()
214        } else {
215            format!("{:o}", self.mode)
216        }
217    }
218}
219
220/// Parse the raw data of a tree object into its entries.
221///
222/// # Format
223///
224/// Each entry is `"<mode> <name>\0<20-byte-sha1>"` concatenated with no
225/// separator between entries.
226///
227/// # Errors
228///
229/// Returns [`Error::CorruptObject`] if the data is malformed.
230pub fn parse_tree(data: &[u8]) -> Result<Vec<TreeEntry>> {
231    let mut entries = Vec::new();
232    let mut pos = 0;
233
234    while pos < data.len() {
235        // Find the space separating mode from name
236        let sp = data[pos..]
237            .iter()
238            .position(|&b| b == b' ')
239            .ok_or_else(|| Error::CorruptObject("tree entry missing space".to_owned()))?;
240        let mode_bytes = &data[pos..pos + sp];
241        let mode = std::str::from_utf8(mode_bytes)
242            .ok()
243            .and_then(|s| u32::from_str_radix(s, 8).ok())
244            .ok_or_else(|| {
245                Error::CorruptObject(format!(
246                    "invalid tree mode: {}",
247                    String::from_utf8_lossy(mode_bytes)
248                ))
249            })?;
250        pos += sp + 1;
251
252        // Find the NUL separating name from the 20-byte SHA
253        let nul = data[pos..]
254            .iter()
255            .position(|&b| b == 0)
256            .ok_or_else(|| Error::CorruptObject("tree entry missing NUL".to_owned()))?;
257        let name = data[pos..pos + nul].to_vec();
258        pos += nul + 1;
259
260        if pos + 20 > data.len() {
261            return Err(Error::CorruptObject("tree entry truncated SHA".to_owned()));
262        }
263        let oid = ObjectId::from_bytes(&data[pos..pos + 20])?;
264        pos += 20;
265
266        entries.push(TreeEntry { mode, name, oid });
267    }
268
269    Ok(entries)
270}
271
272/// Build the raw bytes of a tree object from a slice of entries.
273///
274/// Entries **must** already be sorted in Git tree order (see [`tree_entry_cmp`])
275/// before calling this function.
276#[must_use]
277pub fn serialize_tree(entries: &[TreeEntry]) -> Vec<u8> {
278    let mut out = Vec::new();
279    for e in entries {
280        out.extend_from_slice(e.mode_str().as_bytes());
281        out.push(b' ');
282        out.extend_from_slice(&e.name);
283        out.push(0);
284        out.extend_from_slice(e.oid.as_bytes());
285    }
286    out
287}
288
/// Comparator implementing Git's tree-entry ordering.
///
/// Names are compared byte-by-byte, with a sub-tree treated as if its name
/// ended in `/` and any other entry as if it ended in NUL.  A directory `foo`
/// therefore sorts after a file `foo-bar` but before `fooz`.  This mirrors
/// `base_name_compare` in Git's `tree.c`.
///
/// # Parameters
///
/// - `a_name`: name bytes of the first entry
/// - `a_is_tree`: whether the first entry is a sub-tree (`mode == 0o040000`)
/// - `b_name`: name bytes of the second entry
/// - `b_is_tree`: whether the second entry is a sub-tree
#[must_use]
pub fn tree_entry_cmp(
    a_name: &[u8],
    a_is_tree: bool,
    b_name: &[u8],
    b_is_tree: bool,
) -> std::cmp::Ordering {
    // Byte that stands in for "end of name" when a name runs out first.
    fn terminator(is_tree: bool) -> u8 {
        if is_tree {
            b'/'
        } else {
            0
        }
    }

    let shared = a_name.len().min(b_name.len());
    a_name[..shared].cmp(&b_name[..shared]).then_with(|| {
        // Common prefix is identical: only the byte right after it decides.
        let a_next = a_name
            .get(shared)
            .copied()
            .unwrap_or_else(|| terminator(a_is_tree));
        let b_next = b_name
            .get(shared)
            .copied()
            .unwrap_or_else(|| terminator(b_is_tree));
        a_next.cmp(&b_next)
    })
}
321
322/// Parsed representation of a commit object.
323#[derive(Debug, Clone)]
324pub struct CommitData {
325    /// The tree this commit points to.
326    pub tree: ObjectId,
327    /// Parent commit IDs (zero or more).
328    pub parents: Vec<ObjectId>,
329    /// Author field (raw string as Git stores it).
330    pub author: String,
331    /// Committer field (raw string as Git stores it).
332    pub committer: String,
333    /// Optional encoding override (e.g. `"UTF-8"`).
334    pub encoding: Option<String>,
335    /// Commit message (everything after the blank line).
336    pub message: String,
337    /// Optional raw message bytes for non-UTF-8 commit messages.
338    /// When set, `serialize_commit` uses these bytes instead of `message`.
339    #[doc = "Optional raw message bytes for non-UTF-8 messages."]
340    pub raw_message: Option<Vec<u8>>,
341}
342
343/// Parse the raw data of a commit object.
344///
345/// # Errors
346///
347/// Returns [`Error::CorruptObject`] if required headers are missing.
348pub fn parse_commit(data: &[u8]) -> Result<CommitData> {
349    // Use lossy UTF-8 conversion so commits with non-UTF-8 encoded
350    // messages (e.g. iso-8859-7 with an `encoding` header) can still
351    // be parsed.  The header fields are always ASCII-safe.
352    let text = String::from_utf8_lossy(data);
353
354    let mut tree = None;
355    let mut parents = Vec::new();
356    let mut author = None;
357    let mut committer = None;
358    let mut encoding = None;
359    let mut message = String::new();
360    let mut in_message = false;
361
362    for line in text.split('\n') {
363        if in_message {
364            message.push_str(line);
365            message.push('\n');
366            continue;
367        }
368        if line.is_empty() {
369            in_message = true;
370            continue;
371        }
372        if let Some(rest) = line.strip_prefix("tree ") {
373            tree = Some(rest.trim().parse::<ObjectId>()?);
374        } else if let Some(rest) = line.strip_prefix("parent ") {
375            parents.push(rest.trim().parse::<ObjectId>()?);
376        } else if let Some(rest) = line.strip_prefix("author ") {
377            author = Some(rest.to_owned());
378        } else if let Some(rest) = line.strip_prefix("committer ") {
379            committer = Some(rest.to_owned());
380        } else if let Some(rest) = line.strip_prefix("encoding ") {
381            encoding = Some(rest.to_owned());
382        }
383    }
384
385    // Strip one trailing newline that split adds
386    if message.ends_with('\n') {
387        message.pop();
388    }
389
390    Ok(CommitData {
391        tree: tree.ok_or_else(|| Error::CorruptObject("commit missing tree header".to_owned()))?,
392        parents,
393        author: author
394            .ok_or_else(|| Error::CorruptObject("commit missing author header".to_owned()))?,
395        committer: committer
396            .ok_or_else(|| Error::CorruptObject("commit missing committer header".to_owned()))?,
397        encoding,
398        message,
399        raw_message: None,
400    })
401}
402
403/// Parsed representation of an annotated tag object.
404#[derive(Debug, Clone)]
405pub struct TagData {
406    /// The object this tag points to.
407    pub object: ObjectId,
408    /// The type of the tagged object (e.g. `"commit"`).
409    pub object_type: String,
410    /// The short tag name (without `refs/tags/` prefix).
411    pub tag: String,
412    /// The tagger identity and timestamp (raw Git format).
413    pub tagger: Option<String>,
414    /// The tag message (everything after the blank line).
415    pub message: String,
416}
417
418/// Parse the raw data of a tag object.
419///
420/// # Errors
421///
422/// Returns [`Error::CorruptObject`] if required headers are missing or malformed.
423pub fn parse_tag(data: &[u8]) -> Result<TagData> {
424    let text = std::str::from_utf8(data)
425        .map_err(|_| Error::CorruptObject("tag is not valid UTF-8".to_owned()))?;
426
427    let mut object = None;
428    let mut object_type = None;
429    let mut tag_name = None;
430    let mut tagger = None;
431    let mut message = String::new();
432    let mut in_message = false;
433
434    for line in text.split('\n') {
435        if in_message {
436            message.push_str(line);
437            message.push('\n');
438            continue;
439        }
440        if line.is_empty() {
441            in_message = true;
442            continue;
443        }
444        if let Some(rest) = line.strip_prefix("object ") {
445            object = Some(rest.trim().parse::<ObjectId>()?);
446        } else if let Some(rest) = line.strip_prefix("type ") {
447            object_type = Some(rest.trim().to_owned());
448        } else if let Some(rest) = line.strip_prefix("tag ") {
449            tag_name = Some(rest.trim().to_owned());
450        } else if let Some(rest) = line.strip_prefix("tagger ") {
451            tagger = Some(rest.to_owned());
452        }
453    }
454
455    // Strip one trailing newline that split adds
456    if message.ends_with('\n') {
457        message.pop();
458    }
459
460    Ok(TagData {
461        object: object
462            .ok_or_else(|| Error::CorruptObject("tag missing object header".to_owned()))?,
463        object_type: object_type
464            .ok_or_else(|| Error::CorruptObject("tag missing type header".to_owned()))?,
465        tag: tag_name.ok_or_else(|| Error::CorruptObject("tag missing tag header".to_owned()))?,
466        tagger,
467        message,
468    })
469}
470
471/// Serialize a [`TagData`] into the raw bytes suitable for storage as a tag object.
472///
473/// The caller is responsible for supplying a correctly-formatted `tagger` string
474/// (including timestamp and timezone) when present.
475#[must_use]
476pub fn serialize_tag(t: &TagData) -> Vec<u8> {
477    let mut out = String::new();
478    out.push_str(&format!("object {}\n", t.object));
479    out.push_str(&format!("type {}\n", t.object_type));
480    out.push_str(&format!("tag {}\n", t.tag));
481    if let Some(ref tagger) = t.tagger {
482        out.push_str(&format!("tagger {tagger}\n"));
483    }
484    out.push('\n');
485    // Only add message if non-empty (don't add extra blank line for empty message)
486    let msg = t.message.trim_end_matches('\n');
487    if !msg.is_empty() {
488        out.push_str(msg);
489        out.push('\n');
490    }
491    out.into_bytes()
492}
493
494/// Serialize a [`CommitData`] into the raw bytes suitable for storage.
495///
496/// The caller is responsible for supplying a correctly-formatted `author` and
497/// `committer` string (including timestamp and timezone).
498#[must_use]
499pub fn serialize_commit(c: &CommitData) -> Vec<u8> {
500    let mut out = Vec::new();
501    out.extend_from_slice(format!("tree {}\n", c.tree).as_bytes());
502    for p in &c.parents {
503        out.extend_from_slice(format!("parent {p}\n").as_bytes());
504    }
505    out.extend_from_slice(format!("author {}\n", c.author).as_bytes());
506    out.extend_from_slice(format!("committer {}\n", c.committer).as_bytes());
507    if let Some(enc) = &c.encoding {
508        out.extend_from_slice(format!("encoding {enc}\n").as_bytes());
509    }
510    out.push(b'\n');
511    // Use raw_message bytes if available (for non-UTF-8 commit messages),
512    // otherwise fall back to the UTF-8 message field.
513    // Callers are responsible for trailing newlines (commit-tree preserves
514    // stdin exactly; other callers use ensure_trailing_newline).
515    if let Some(raw) = &c.raw_message {
516        out.extend_from_slice(raw);
517    } else {
518        out.extend_from_slice(c.message.as_bytes());
519    }
520    out
521}