Skip to main content

grit_lib/
objects.rs

1//! Git object model: object IDs, kinds, and in-memory representations.
2//!
3//! # Object ID
4//!
5//! [`ObjectId`] is a 20-byte SHA-1 digest.  It implements `Display` as
6//! lowercase hex, `FromStr` from a 40-character hex string, and the standard
7//! ordering traits so it can be used as a map key.
8//!
9//! # Object Kind
10//!
11//! [`ObjectKind`] represents the four Git object types: blob, tree, commit,
12//! and tag.  The raw header byte-slice is parsed with [`ObjectKind::from_bytes`].
13//!
14//! # Parsed objects
15//!
//! [`Object`] bundles a kind and its raw (decompressed, header-stripped) byte
//! content.  Higher-level parsed forms (e.g. [`TreeEntry`], [`CommitData`],
//! [`TagData`]) live in this module and are produced by fallible parsers such
//! as [`parse_tree`], [`parse_commit`], and [`parse_tag`].
20
21use std::fmt;
22use std::str::FromStr;
23
24use crate::error::{Error, Result};
25
/// Identifier of a Git object: the 20 raw bytes of its SHA-1 digest.
///
/// Equality, ordering, and hashing are all derived from the raw digest bytes,
/// so the type can be used as a key in both ordered and hashed maps.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct ObjectId([u8; 20]);
29
30impl ObjectId {
31    /// Construct from a 20-byte slice.
32    ///
33    /// # Errors
34    ///
35    /// Returns [`Error::InvalidObjectId`] when `bytes` is not exactly 20 bytes.
36    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
37        let arr: [u8; 20] = bytes
38            .try_into()
39            .map_err(|_| Error::InvalidObjectId(hex::encode(bytes)))?;
40        Ok(Self(arr))
41    }
42
43    /// Raw 20-byte digest.
44    #[must_use]
45    pub fn as_bytes(&self) -> &[u8; 20] {
46        &self.0
47    }
48
49    /// Lowercase hex representation (40 characters).
50    #[must_use]
51    pub fn to_hex(&self) -> String {
52        hex::encode(self.0)
53    }
54
55    /// The two-character directory prefix used by the loose object store.
56    ///
57    /// Returns the first two hex chars (e.g. `"ab"` for `"ab3f…"`).
58    #[must_use]
59    pub fn loose_prefix(&self) -> String {
60        hex::encode(&self.0[..1])
61    }
62
63    /// Parse an object ID from a hex string.
64    ///
65    /// # Errors
66    ///
67    /// Returns [`Error::InvalidObjectId`] if the string is not a valid
68    /// 40-character hex OID.
69    pub fn from_hex(s: &str) -> Result<Self> {
70        s.parse()
71    }
72
73    /// The 38-character suffix used as the filename inside the loose prefix dir.
74    #[must_use]
75    pub fn loose_suffix(&self) -> String {
76        hex::encode(&self.0[1..])
77    }
78}
79
80impl fmt::Display for ObjectId {
81    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82        f.write_str(&self.to_hex())
83    }
84}
85
86impl fmt::Debug for ObjectId {
87    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
88        write!(f, "ObjectId({})", self.to_hex())
89    }
90}
91
92impl FromStr for ObjectId {
93    type Err = Error;
94
95    fn from_str(s: &str) -> Result<Self> {
96        if s.len() != 40 {
97            return Err(Error::InvalidObjectId(s.to_owned()));
98        }
99        let bytes = hex::decode(s).map_err(|_| Error::InvalidObjectId(s.to_owned()))?;
100        Self::from_bytes(&bytes)
101    }
102}
103
/// The type tag of a Git object; there are exactly four.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ObjectKind {
    /// File contents.
    Blob,
    /// A directory listing.
    Tree,
    /// A snapshot together with its metadata and parent links.
    Commit,
    /// An annotated tag.
    Tag,
}
116
117impl ObjectKind {
118    /// Parse from the ASCII keyword used in Git object headers.
119    ///
120    /// # Errors
121    ///
122    /// Returns [`Error::UnknownObjectType`] for unrecognised strings.
123    pub fn from_bytes(b: &[u8]) -> Result<Self> {
124        match b {
125            b"blob" => Ok(Self::Blob),
126            b"tree" => Ok(Self::Tree),
127            b"commit" => Ok(Self::Commit),
128            b"tag" => Ok(Self::Tag),
129            other => Err(Error::UnknownObjectType(
130                String::from_utf8_lossy(other).into_owned(),
131            )),
132        }
133    }
134
135    /// The ASCII keyword for this kind (used in object headers).
136    #[must_use]
137    pub fn as_str(&self) -> &'static str {
138        match self {
139            Self::Blob => "blob",
140            Self::Tree => "tree",
141            Self::Commit => "commit",
142            Self::Tag => "tag",
143        }
144    }
145}
146
147impl fmt::Display for ObjectKind {
148    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
149        f.write_str(self.as_str())
150    }
151}
152
153impl FromStr for ObjectKind {
154    type Err = Error;
155
156    fn from_str(s: &str) -> Result<Self> {
157        Self::from_bytes(s.as_bytes())
158    }
159}
160
161/// A decompressed, header-stripped Git object.
162#[derive(Debug, Clone)]
163pub struct Object {
164    /// The type of this object.
165    pub kind: ObjectKind,
166    /// Raw byte content (everything after the NUL in the header).
167    pub data: Vec<u8>,
168}
169
170impl Object {
171    /// Construct a new object from its kind and raw data.
172    #[must_use]
173    pub fn new(kind: ObjectKind, data: Vec<u8>) -> Self {
174        Self { kind, data }
175    }
176
177    /// Serialize to the canonical Git object format: `"<kind> <size>\0<data>"`.
178    #[must_use]
179    pub fn to_store_bytes(&self) -> Vec<u8> {
180        let header = format!("{} {}\0", self.kind, self.data.len());
181        let mut out = Vec::with_capacity(header.len() + self.data.len());
182        out.extend_from_slice(header.as_bytes());
183        out.extend_from_slice(&self.data);
184        out
185    }
186}
187
188/// A single entry in a Git tree object.
189#[derive(Debug, Clone, PartialEq, Eq)]
190pub struct TreeEntry {
191    /// Unix file mode (e.g. `0o100644` for a regular file, `0o040000` for a tree).
192    pub mode: u32,
193    /// Entry name (file or directory name only, no path separators).
194    pub name: Vec<u8>,
195    /// The object ID of the blob or sub-tree.
196    pub oid: ObjectId,
197}
198
199impl TreeEntry {
200    /// Format the mode as Git does: no leading zero, minimal digits.
201    ///
202    /// Git uses `"40000"` for trees (not `"040000"`), and `"100644"` for blobs.
203    #[must_use]
204    pub fn mode_str(&self) -> String {
205        // Git omits the leading zero for tree mode
206        if self.mode == 0o040000 {
207            "40000".to_owned()
208        } else {
209            format!("{:o}", self.mode)
210        }
211    }
212}
213
214/// Parse the raw data of a tree object into its entries.
215///
216/// # Format
217///
218/// Each entry is `"<mode> <name>\0<20-byte-sha1>"` concatenated with no
219/// separator between entries.
220///
221/// # Errors
222///
223/// Returns [`Error::CorruptObject`] if the data is malformed.
224pub fn parse_tree(data: &[u8]) -> Result<Vec<TreeEntry>> {
225    let mut entries = Vec::new();
226    let mut pos = 0;
227
228    while pos < data.len() {
229        // Find the space separating mode from name
230        let sp = data[pos..]
231            .iter()
232            .position(|&b| b == b' ')
233            .ok_or_else(|| Error::CorruptObject("tree entry missing space".to_owned()))?;
234        let mode_bytes = &data[pos..pos + sp];
235        let mode = std::str::from_utf8(mode_bytes)
236            .ok()
237            .and_then(|s| u32::from_str_radix(s, 8).ok())
238            .ok_or_else(|| {
239                Error::CorruptObject(format!(
240                    "invalid tree mode: {}",
241                    String::from_utf8_lossy(mode_bytes)
242                ))
243            })?;
244        pos += sp + 1;
245
246        // Find the NUL separating name from the 20-byte SHA
247        let nul = data[pos..]
248            .iter()
249            .position(|&b| b == 0)
250            .ok_or_else(|| Error::CorruptObject("tree entry missing NUL".to_owned()))?;
251        let name = data[pos..pos + nul].to_vec();
252        pos += nul + 1;
253
254        if pos + 20 > data.len() {
255            return Err(Error::CorruptObject("tree entry truncated SHA".to_owned()));
256        }
257        let oid = ObjectId::from_bytes(&data[pos..pos + 20])?;
258        pos += 20;
259
260        entries.push(TreeEntry { mode, name, oid });
261    }
262
263    Ok(entries)
264}
265
266/// Build the raw bytes of a tree object from a slice of entries.
267///
268/// Entries **must** already be sorted in Git tree order (see [`tree_entry_cmp`])
269/// before calling this function.
270#[must_use]
271pub fn serialize_tree(entries: &[TreeEntry]) -> Vec<u8> {
272    let mut out = Vec::new();
273    for e in entries {
274        out.extend_from_slice(e.mode_str().as_bytes());
275        out.push(b' ');
276        out.extend_from_slice(&e.name);
277        out.push(0);
278        out.extend_from_slice(e.oid.as_bytes());
279    }
280    out
281}
282
/// Ordering used for entries inside a Git tree.
///
/// Names are compared byte-by-byte as if every sub-tree name had a trailing
/// `/` appended, so a directory `foo` sorts after a file `foo-bar` but before
/// `fooz`.  This matches `base_name_compare` in `tree.c`.
///
/// # Parameters
///
/// - `a_name`: name bytes of the first entry
/// - `a_is_tree`: whether the first entry is a sub-tree (`mode == 0o040000`)
/// - `b_name`: name bytes of the second entry
/// - `b_is_tree`: whether the second entry is a sub-tree
#[must_use]
pub fn tree_entry_cmp(
    a_name: &[u8],
    a_is_tree: bool,
    b_name: &[u8],
    b_is_tree: bool,
) -> std::cmp::Ordering {
    let shared = a_name.len().min(b_name.len());
    let (a_head, a_tail) = a_name.split_at(shared);
    let (b_head, b_tail) = b_name.split_at(shared);

    // The byte that follows the common-length prefix; a name that has run out
    // contributes its virtual terminator: '/' for a sub-tree, NUL otherwise.
    let next_byte = |tail: &[u8], is_tree: bool| -> u8 {
        let terminator = if is_tree { b'/' } else { 0u8 };
        tail.first().copied().unwrap_or(terminator)
    };

    a_head
        .cmp(b_head)
        .then_with(|| next_byte(a_tail, a_is_tree).cmp(&next_byte(b_tail, b_is_tree)))
}
315
316/// Parsed representation of a commit object.
317#[derive(Debug, Clone)]
318pub struct CommitData {
319    /// The tree this commit points to.
320    pub tree: ObjectId,
321    /// Parent commit IDs (zero or more).
322    pub parents: Vec<ObjectId>,
323    /// Author field (raw string as Git stores it).
324    pub author: String,
325    /// Committer field (raw string as Git stores it).
326    pub committer: String,
327    /// Optional encoding override (e.g. `"UTF-8"`).
328    pub encoding: Option<String>,
329    /// Commit message (everything after the blank line).
330    pub message: String,
331}
332
333/// Parse the raw data of a commit object.
334///
335/// # Errors
336///
337/// Returns [`Error::CorruptObject`] if required headers are missing.
338pub fn parse_commit(data: &[u8]) -> Result<CommitData> {
339    let text = std::str::from_utf8(data)
340        .map_err(|_| Error::CorruptObject("commit is not valid UTF-8".to_owned()))?;
341
342    let mut tree = None;
343    let mut parents = Vec::new();
344    let mut author = None;
345    let mut committer = None;
346    let mut encoding = None;
347    let mut message = String::new();
348    let mut in_message = false;
349
350    for line in text.split('\n') {
351        if in_message {
352            message.push_str(line);
353            message.push('\n');
354            continue;
355        }
356        if line.is_empty() {
357            in_message = true;
358            continue;
359        }
360        if let Some(rest) = line.strip_prefix("tree ") {
361            tree = Some(rest.trim().parse::<ObjectId>()?);
362        } else if let Some(rest) = line.strip_prefix("parent ") {
363            parents.push(rest.trim().parse::<ObjectId>()?);
364        } else if let Some(rest) = line.strip_prefix("author ") {
365            author = Some(rest.to_owned());
366        } else if let Some(rest) = line.strip_prefix("committer ") {
367            committer = Some(rest.to_owned());
368        } else if let Some(rest) = line.strip_prefix("encoding ") {
369            encoding = Some(rest.to_owned());
370        }
371    }
372
373    // Strip one trailing newline that split adds
374    if message.ends_with('\n') {
375        message.pop();
376    }
377
378    Ok(CommitData {
379        tree: tree.ok_or_else(|| Error::CorruptObject("commit missing tree header".to_owned()))?,
380        parents,
381        author: author
382            .ok_or_else(|| Error::CorruptObject("commit missing author header".to_owned()))?,
383        committer: committer
384            .ok_or_else(|| Error::CorruptObject("commit missing committer header".to_owned()))?,
385        encoding,
386        message,
387    })
388}
389
390/// Parsed representation of an annotated tag object.
391#[derive(Debug, Clone)]
392pub struct TagData {
393    /// The object this tag points to.
394    pub object: ObjectId,
395    /// The type of the tagged object (e.g. `"commit"`).
396    pub object_type: String,
397    /// The short tag name (without `refs/tags/` prefix).
398    pub tag: String,
399    /// The tagger identity and timestamp (raw Git format).
400    pub tagger: Option<String>,
401    /// The tag message (everything after the blank line).
402    pub message: String,
403}
404
405/// Parse the raw data of a tag object.
406///
407/// # Errors
408///
409/// Returns [`Error::CorruptObject`] if required headers are missing or malformed.
410pub fn parse_tag(data: &[u8]) -> Result<TagData> {
411    let text = std::str::from_utf8(data)
412        .map_err(|_| Error::CorruptObject("tag is not valid UTF-8".to_owned()))?;
413
414    let mut object = None;
415    let mut object_type = None;
416    let mut tag_name = None;
417    let mut tagger = None;
418    let mut message = String::new();
419    let mut in_message = false;
420
421    for line in text.split('\n') {
422        if in_message {
423            message.push_str(line);
424            message.push('\n');
425            continue;
426        }
427        if line.is_empty() {
428            in_message = true;
429            continue;
430        }
431        if let Some(rest) = line.strip_prefix("object ") {
432            object = Some(rest.trim().parse::<ObjectId>()?);
433        } else if let Some(rest) = line.strip_prefix("type ") {
434            object_type = Some(rest.trim().to_owned());
435        } else if let Some(rest) = line.strip_prefix("tag ") {
436            tag_name = Some(rest.trim().to_owned());
437        } else if let Some(rest) = line.strip_prefix("tagger ") {
438            tagger = Some(rest.to_owned());
439        }
440    }
441
442    // Strip one trailing newline that split adds
443    if message.ends_with('\n') {
444        message.pop();
445    }
446
447    Ok(TagData {
448        object: object
449            .ok_or_else(|| Error::CorruptObject("tag missing object header".to_owned()))?,
450        object_type: object_type
451            .ok_or_else(|| Error::CorruptObject("tag missing type header".to_owned()))?,
452        tag: tag_name.ok_or_else(|| Error::CorruptObject("tag missing tag header".to_owned()))?,
453        tagger,
454        message,
455    })
456}
457
458/// Serialize a [`TagData`] into the raw bytes suitable for storage as a tag object.
459///
460/// The caller is responsible for supplying a correctly-formatted `tagger` string
461/// (including timestamp and timezone) when present.
462#[must_use]
463pub fn serialize_tag(t: &TagData) -> Vec<u8> {
464    let mut out = String::new();
465    out.push_str(&format!("object {}\n", t.object));
466    out.push_str(&format!("type {}\n", t.object_type));
467    out.push_str(&format!("tag {}\n", t.tag));
468    if let Some(ref tagger) = t.tagger {
469        out.push_str(&format!("tagger {tagger}\n"));
470    }
471    out.push('\n');
472    out.push_str(&t.message);
473    if !t.message.is_empty() && !t.message.ends_with('\n') {
474        out.push('\n');
475    }
476    out.into_bytes()
477}
478
479/// Serialize a [`CommitData`] into the raw bytes suitable for storage.
480///
481/// The caller is responsible for supplying a correctly-formatted `author` and
482/// `committer` string (including timestamp and timezone).
483#[must_use]
484pub fn serialize_commit(c: &CommitData) -> Vec<u8> {
485    let mut out = String::new();
486    out.push_str(&format!("tree {}\n", c.tree));
487    for p in &c.parents {
488        out.push_str(&format!("parent {p}\n"));
489    }
490    out.push_str(&format!("author {}\n", c.author));
491    out.push_str(&format!("committer {}\n", c.committer));
492    if let Some(enc) = &c.encoding {
493        out.push_str(&format!("encoding {enc}\n"));
494    }
495    out.push('\n');
496    out.push_str(&c.message);
497    if !c.message.ends_with('\n') {
498        out.push('\n');
499    }
500    out.into_bytes()
501}