Skip to main content

mkit_git_bridge/
gitobj.rs

1//! git object encoding: `SHA1("<type> <len>\0" || body)` ids and
2//! zlib loose-object storage (SPEC-GIT-BRIDGE §2).
3
4use crate::error::BridgeError;
5use flate2::Compression;
6use flate2::write::ZlibEncoder;
7use sha1::{Digest, Sha1};
8use std::io::Write as _;
9use std::path::{Path, PathBuf};
10
/// Binary git object id: a 20-byte array (rendered as 40 hex characters
/// by [`sha1_hex`]).
pub type Sha1Id = [u8; 20];
13
/// Kinds of git object the bridge can store.
///
/// Each variant corresponds to one type name in the
/// `"<type> <len>\0"` object header.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum GitType {
    /// Header type name `blob`.
    Blob,
    /// Header type name `tree`.
    Tree,
    /// Header type name `commit`.
    Commit,
    /// Header type name `tag`.
    Tag,
}
22
23impl GitType {
24    /// The ASCII type name used in the object header.
25    #[must_use]
26    pub fn name(self) -> &'static str {
27        match self {
28            Self::Blob => "blob",
29            Self::Tree => "tree",
30            Self::Commit => "commit",
31            Self::Tag => "tag",
32        }
33    }
34
35    /// Parse a git object-header type name.
36    #[must_use]
37    pub fn from_name(name: &[u8]) -> Option<Self> {
38        Some(match name {
39            b"blob" => Self::Blob,
40            b"tree" => Self::Tree,
41            b"commit" => Self::Commit,
42            b"tag" => Self::Tag,
43            _ => return None,
44        })
45    }
46}
47
48/// An encoded git object: type + body (the bytes after the
49/// `"<type> <len>\0"` header).
50#[derive(Debug, Clone, PartialEq, Eq)]
51pub struct GitObject {
52    pub gtype: GitType,
53    pub body: Vec<u8>,
54}
55
56impl GitObject {
57    /// The full header+body byte string the id is computed over.
58    #[must_use]
59    pub fn raw(&self) -> Vec<u8> {
60        let mut out = Vec::with_capacity(self.gtype.name().len() + 12 + self.body.len());
61        out.extend_from_slice(self.gtype.name().as_bytes());
62        out.push(b' ');
63        out.extend_from_slice(self.body.len().to_string().as_bytes());
64        out.push(0);
65        out.extend_from_slice(&self.body);
66        out
67    }
68
69    /// git object id: SHA-1 of [`Self::raw`].
70    #[must_use]
71    pub fn id(&self) -> Sha1Id {
72        let mut h = Sha1::new();
73        h.update(self.gtype.name().as_bytes());
74        h.update(b" ");
75        h.update(self.body.len().to_string().as_bytes());
76        h.update([0u8]);
77        h.update(&self.body);
78        h.finalize().into()
79    }
80
81    /// Loose-object path under a `.git` (or bare repo) directory.
82    #[must_use]
83    pub fn loose_path(git_dir: &Path, id: &Sha1Id) -> PathBuf {
84        let hex = sha1_hex(id);
85        git_dir.join("objects").join(&hex[..2]).join(&hex[2..])
86    }
87
88    /// Write this object loose into `git_dir/objects/`, returning its
89    /// id. Idempotent: an existing object file is left untouched
90    /// (same bytes by content addressing). The write is
91    /// temp-file + rename so a crash never leaves a torn object.
92    pub fn write_loose(&self, git_dir: &Path) -> Result<Sha1Id, BridgeError> {
93        let id = self.id();
94        let path = Self::loose_path(git_dir, &id);
95        if path.exists() {
96            return Ok(id);
97        }
98        let dir = path
99            .parent()
100            .ok_or_else(|| BridgeError::Source("loose path has no parent".into()))?;
101        std::fs::create_dir_all(dir)?;
102        let mut enc = ZlibEncoder::new(Vec::new(), Compression::default());
103        enc.write_all(&self.raw())?;
104        let compressed = enc.finish()?;
105        // Unique per process so concurrent writers never share a tmp
106        // path; content addressing makes the rename race benign.
107        let tmp = dir.join(format!(".tmp-{}-{}", std::process::id(), sha1_hex(&id)));
108        std::fs::write(&tmp, &compressed)?;
109        match std::fs::rename(&tmp, &path) {
110            Ok(()) => Ok(id),
111            Err(e) => {
112                let _ = std::fs::remove_file(&tmp);
113                // Lost a race to another writer: same content, fine.
114                if path.exists() { Ok(id) } else { Err(e.into()) }
115            }
116        }
117    }
118}
119
120impl GitObject {
121    /// Parse `"<type> <len>\0<body>"` bytes (the inverse of
122    /// [`Self::raw`]; what a zlib-decompressed loose object contains).
123    #[must_use]
124    pub fn parse_raw(raw: &[u8]) -> Option<Self> {
125        let sp = raw.iter().position(|&b| b == b' ')?;
126        let gtype = GitType::from_name(&raw[..sp])?;
127        let nul = raw.iter().position(|&b| b == 0)?;
128        let len: usize = std::str::from_utf8(&raw[sp + 1..nul]).ok()?.parse().ok()?;
129        let body = raw.get(nul + 1..)?;
130        (body.len() == len).then(|| Self {
131            gtype,
132            body: body.to_vec(),
133        })
134    }
135
136    /// Read and parse a loose object from a git objects dir,
137    /// verifying the bytes hash back to the requested id (also
138    /// rejects non-canonical headers, since [`Self::id`] re-renders
139    /// the canonical form).
140    pub fn read_loose(git_dir: &Path, id: &Sha1Id) -> Result<Self, BridgeError> {
141        let compressed = std::fs::read(Self::loose_path(git_dir, id))?;
142        let mut dec = flate2::read::ZlibDecoder::new(&compressed[..]);
143        let mut raw = Vec::new();
144        std::io::Read::read_to_end(&mut dec, &mut raw)?;
145        let obj = Self::parse_raw(&raw)
146            .ok_or_else(|| BridgeError::NotBridgeObject("malformed loose object header".into()))?;
147        if obj.id() != *id {
148            return Err(BridgeError::Integrity(format!(
149                "loose object {} hashes to {}",
150                sha1_hex(id),
151                sha1_hex(&obj.id())
152            )));
153        }
154        Ok(obj)
155    }
156}
157
158/// Lowercase hex of a 20-byte git id.
159#[must_use]
160pub fn sha1_hex(id: &Sha1Id) -> String {
161    let mut s = String::with_capacity(40);
162    for b in id {
163        use std::fmt::Write as _;
164        let _ = write!(s, "{b:02x}");
165    }
166    s
167}
168
169/// Strict inverse of [`sha1_hex`] (lowercase only).
170#[must_use]
171pub fn sha1_from_hex(s: &str) -> Option<Sha1Id> {
172    let bytes = s.as_bytes();
173    if bytes.len() != 40 {
174        return None;
175    }
176    let mut out = [0u8; 20];
177    for (i, pair) in bytes.chunks(2).enumerate() {
178        let hi = hex_val(pair[0])?;
179        let lo = hex_val(pair[1])?;
180        out[i] = (hi << 4) | lo;
181    }
182    Some(out)
183}
184
/// Numeric value (0–15) of one lowercase hexadecimal digit.
///
/// Accepts only `0`–`9` and `a`–`f`; every other byte, including the
/// uppercase digits `A`–`F`, yields `None`.
pub(crate) fn hex_val(b: u8) -> Option<u8> {
    if b.is_ascii_digit() {
        Some(b - b'0')
    } else if (b'a'..=b'f').contains(&b) {
        Some(b - b'a' + 10)
    } else {
        None
    }
}
192
/// Render arbitrary bytes as lowercase hexadecimal, two characters per
/// byte (used for the 64-byte signature and 32-byte hash header
/// values).
#[must_use]
pub fn bytes_hex(data: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut hex = String::with_capacity(data.len() * 2);
    for &byte in data {
        // High nibble first, then low nibble.
        hex.push(char::from(DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
204
205/// Strict lowercase-hex decode of an exact expected length.
206#[must_use]
207pub fn bytes_from_hex(s: &str, expect_len: usize) -> Option<Vec<u8>> {
208    let bytes = s.as_bytes();
209    if bytes.len() != expect_len * 2 {
210        return None;
211    }
212    let mut out = Vec::with_capacity(expect_len);
213    for pair in bytes.chunks(2) {
214        out.push((hex_val(pair[0])? << 4) | hex_val(pair[1])?);
215    }
216    Some(out)
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an object of the given type around `body`.
    fn object(gtype: GitType, body: &[u8]) -> GitObject {
        GitObject {
            gtype,
            body: body.to_vec(),
        }
    }

    /// Hex id of an object of the given type around `body`.
    fn hex_id(gtype: GitType, body: &[u8]) -> String {
        sha1_hex(&object(gtype, body).id())
    }

    /// Zlib-inflate the contents of a stored loose object file.
    fn inflate(compressed: &[u8]) -> Vec<u8> {
        let mut decoder = flate2::read::ZlibDecoder::new(compressed);
        let mut inflated = Vec::new();
        std::io::Read::read_to_end(&mut decoder, &mut inflated).unwrap();
        inflated
    }

    #[test]
    fn empty_blob_id_matches_git() {
        // Reference value: `git hash-object -t blob /dev/null`.
        assert_eq!(
            hex_id(GitType::Blob, b""),
            "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
        );
    }

    #[test]
    fn empty_tree_id_matches_git() {
        assert_eq!(
            hex_id(GitType::Tree, b""),
            "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
        );
    }

    #[test]
    fn hello_blob_id_matches_git() {
        // Reference value: `printf 'hello\n' | git hash-object --stdin`.
        assert_eq!(
            hex_id(GitType::Blob, b"hello\n"),
            "ce013625030ba8dba906f756967f9e9ca394464a"
        );
    }

    #[test]
    fn loose_write_round_trips() {
        let repo = tempfile::tempdir().unwrap();
        let blob = object(GitType::Blob, b"abc");

        let id = blob.write_loose(repo.path()).unwrap();
        let stored = GitObject::loose_path(repo.path(), &id);
        assert!(stored.exists());

        // Writing the same object again is a no-op that yields the same id.
        assert_eq!(blob.write_loose(repo.path()).unwrap(), id);

        // The stored file inflates back to header+body.
        let raw = inflate(&std::fs::read(stored).unwrap());
        assert_eq!(raw, blob.raw());
    }

    #[test]
    fn hex_round_trips() {
        let id: Sha1Id = [0xAB; 20];
        let decoded = sha1_from_hex(&sha1_hex(&id)).unwrap();
        assert_eq!(decoded, id);
        assert!(sha1_from_hex("AB").is_none());
        assert!(bytes_from_hex("0aff", 2).is_some());
        assert!(bytes_from_hex("0AFF", 2).is_none(), "uppercase rejected");
    }
}