Skip to main content

grit_lib/
unpack_objects.rs

1//! `unpack-objects`: unpack a pack stream into loose objects.
2//!
3//! Reads a pack-format byte stream, validates the trailing checksum, and
4//! writes each object as a loose file in the object database.  Delta objects
5//! (both `OFS_DELTA` and `REF_DELTA`) are resolved against already-unpacked
6//! objects or objects already present in the ODB.
7
8use std::collections::HashMap;
9use std::io::Read;
10
11use flate2::read::ZlibDecoder;
12use sha1::{Digest, Sha1};
13
14use crate::error::{Error, Result};
15use crate::objects::{ObjectId, ObjectKind};
16use crate::odb::Odb;
17
/// Options controlling `unpack-objects` behaviour.
///
/// The [`Default`] value has every flag cleared: objects are written and
/// nothing is suppressed.  The type is cheap to clone and can be compared,
/// which makes it convenient to store in configuration and to assert on in
/// tests.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct UnpackOptions {
    /// Validate and decompress objects but do not write them to the ODB.
    pub dry_run: bool,
    /// Suppress informational output.
    ///
    /// NOTE(review): no code visible in this file reads this flag; it is
    /// presumably consumed by the caller that prints progress — confirm.
    pub quiet: bool,
}
26
27/// A delta that could not yet be resolved because its base was not yet known.
28struct PendingDelta {
29    /// Byte offset of this object in the pack stream (used to anchor
30    /// `OFS_DELTA` back-references from later objects).
31    offset: usize,
32    /// For `REF_DELTA`: SHA-1 of the base object.
33    base_oid: Option<ObjectId>,
34    /// For `OFS_DELTA`: absolute byte offset of the base object.
35    base_offset: Option<usize>,
36    /// Decompressed delta data.
37    delta_data: Vec<u8>,
38}
39
40/// Unpack a pack stream from `reader` into `odb`.
41///
42/// Reads the complete pack from `reader`, validates the trailing SHA-1
43/// checksum, unpacks all objects (including full delta-chain resolution), and —
44/// unless [`UnpackOptions::dry_run`] is set — writes each object to `odb`.
45///
46/// Returns the total number of objects processed.
47///
48/// # Errors
49///
50/// - [`Error::CorruptObject`] — invalid pack format, checksum mismatch, or
51///   unresolvable delta chains.
52/// - [`Error::Io`] — I/O failure reading from `reader`.
53/// - [`Error::Zlib`] — decompression failure.
54pub fn unpack_objects(reader: &mut dyn Read, odb: &Odb, opts: &UnpackOptions) -> Result<usize> {
55    let mut raw = Vec::new();
56    reader.read_to_end(&mut raw).map_err(Error::Io)?;
57
58    let mut rd = PackReader::new(raw);
59
60    // Validate magic and version.
61    let sig = rd.read_exact(4)?;
62    if sig != b"PACK" {
63        return Err(Error::CorruptObject(
64            "not a pack stream: invalid signature".to_owned(),
65        ));
66    }
67    let version = rd.read_u32_be()?;
68    if version != 2 && version != 3 {
69        return Err(Error::CorruptObject(format!(
70            "unsupported pack version {version}"
71        )));
72    }
73    let nr_objects = rd.read_u32_be()? as usize;
74
75    // Maps used for delta resolution.
76    // pack-stream offset → (kind, decompressed-data) for objects resolved in
77    // this pass (needed to service OFS_DELTA back-references).
78    let mut by_offset: HashMap<usize, (ObjectKind, Vec<u8>)> = HashMap::new();
79    // ObjectId → (kind, data) for in-pack objects (REF_DELTA resolution).
80    let mut by_oid: HashMap<ObjectId, (ObjectKind, Vec<u8>)> = HashMap::new();
81
82    let mut pending: Vec<PendingDelta> = Vec::new();
83    let mut count = 0usize;
84
85    for _ in 0..nr_objects {
86        let obj_offset = rd.pos;
87        let (type_code, size) = rd.read_type_size()?;
88
89        match type_code {
90            1..=4 => {
91                let kind = type_code_to_kind(type_code)?;
92                let data = rd.decompress(size)?;
93                let oid = write_or_hash(kind, &data, odb, opts.dry_run)?;
94                by_offset.insert(obj_offset, (kind, data.clone()));
95                by_oid.insert(oid, (kind, data));
96                count += 1;
97            }
98            6 => {
99                // OFS_DELTA: base at a negative encoded offset from this object.
100                let neg = rd.read_ofs_neg_offset()?;
101                let base_offset = obj_offset.checked_sub(neg).ok_or_else(|| {
102                    Error::CorruptObject("ofs-delta base offset underflow".to_owned())
103                })?;
104                let delta_data = rd.decompress(size)?;
105                pending.push(PendingDelta {
106                    offset: obj_offset,
107                    base_oid: None,
108                    base_offset: Some(base_offset),
109                    delta_data,
110                });
111            }
112            7 => {
113                // REF_DELTA: base identified by its SHA-1.
114                let base_bytes = rd.read_exact(20)?;
115                let base_oid = ObjectId::from_bytes(base_bytes)?;
116                let delta_data = rd.decompress(size)?;
117                pending.push(PendingDelta {
118                    offset: obj_offset,
119                    base_oid: Some(base_oid),
120                    base_offset: None,
121                    delta_data,
122                });
123            }
124            other => {
125                return Err(Error::CorruptObject(format!(
126                    "unknown packed-object type {other}"
127                )))
128            }
129        }
130    }
131
132    // Verify the pack trailing checksum: SHA-1 over all bytes consumed so far.
133    let consumed = rd.pos;
134    {
135        let mut hasher = Sha1::new();
136        hasher.update(&rd.data[..consumed]);
137        let digest = hasher.finalize();
138        let trailing = rd.read_exact(20)?;
139        if digest.as_slice() != trailing {
140            return Err(Error::CorruptObject(
141                "pack trailing checksum mismatch".to_owned(),
142            ));
143        }
144    }
145
146    // Resolve pending deltas iteratively.  Each pass resolves all deltas whose
147    // base is now known; repeat until none remain or we stall (corrupt pack).
148    let mut remaining = pending;
149    loop {
150        if remaining.is_empty() {
151            break;
152        }
153        let before = remaining.len();
154        let mut still_pending: Vec<PendingDelta> = Vec::new();
155
156        for delta in remaining {
157            let base = if let Some(base_off) = delta.base_offset {
158                by_offset.get(&base_off).cloned()
159            } else if let Some(ref base_id) = delta.base_oid {
160                if let Some(entry) = by_oid.get(base_id) {
161                    Some(entry.clone())
162                } else if !opts.dry_run {
163                    odb.read(base_id).ok().map(|obj| (obj.kind, obj.data))
164                } else {
165                    None
166                }
167            } else {
168                None
169            };
170
171            if let Some((base_kind, base_data)) = base {
172                let result = apply_delta(&base_data, &delta.delta_data)?;
173                let oid = write_or_hash(base_kind, &result, odb, opts.dry_run)?;
174                by_offset.insert(delta.offset, (base_kind, result.clone()));
175                by_oid.insert(oid, (base_kind, result));
176                count += 1;
177            } else {
178                still_pending.push(delta);
179            }
180        }
181
182        remaining = still_pending;
183        if remaining.len() == before {
184            return Err(Error::CorruptObject(format!(
185                "{} delta(s) could not be resolved",
186                remaining.len()
187            )));
188        }
189    }
190
191    Ok(count)
192}
193
194/// Either write `data` as a loose object (if `!dry_run`) or just compute its
195/// [`ObjectId`] without touching the filesystem.
196fn write_or_hash(kind: ObjectKind, data: &[u8], odb: &Odb, dry_run: bool) -> Result<ObjectId> {
197    if dry_run {
198        Ok(Odb::hash_object_data(kind, data))
199    } else {
200        odb.write(kind, data)
201    }
202}
203
204/// Convert a pack object type code to an [`ObjectKind`].
205fn type_code_to_kind(code: u8) -> Result<ObjectKind> {
206    match code {
207        1 => Ok(ObjectKind::Commit),
208        2 => Ok(ObjectKind::Tree),
209        3 => Ok(ObjectKind::Blob),
210        4 => Ok(ObjectKind::Tag),
211        _ => Err(Error::CorruptObject(format!(
212            "type code {code} is not a regular object type"
213        ))),
214    }
215}
216
217/// Low-level cursor over a buffered pack byte stream.
218struct PackReader {
219    data: Vec<u8>,
220    pos: usize,
221}
222
223impl PackReader {
224    fn new(data: Vec<u8>) -> Self {
225        Self { data, pos: 0 }
226    }
227
228    /// Read exactly `n` bytes and advance the cursor, returning a slice into
229    /// the internal buffer.
230    fn read_exact(&mut self, n: usize) -> Result<&[u8]> {
231        if self.pos + n > self.data.len() {
232            return Err(Error::CorruptObject(format!(
233                "pack stream truncated: need {n} bytes at offset {}",
234                self.pos
235            )));
236        }
237        let slice = &self.data[self.pos..self.pos + n];
238        self.pos += n;
239        Ok(slice)
240    }
241
242    /// Read a single byte and advance the cursor.
243    fn read_byte(&mut self) -> Result<u8> {
244        if self.pos >= self.data.len() {
245            return Err(Error::CorruptObject(
246                "unexpected end of pack stream".to_owned(),
247            ));
248        }
249        let b = self.data[self.pos];
250        self.pos += 1;
251        Ok(b)
252    }
253
254    /// Read a big-endian `u32`.
255    fn read_u32_be(&mut self) -> Result<u32> {
256        let bytes = self.read_exact(4)?;
257        Ok(u32::from_be_bytes(bytes.try_into().map_err(|_| {
258            Error::CorruptObject("u32 read failed".to_owned())
259        })?))
260    }
261
262    /// Read the packed-object type + size header (variable-length big-endian
263    /// encoding with the type in bits 4-6 of the first byte).
264    ///
265    /// Returns `(type_code, uncompressed_size)`.
266    fn read_type_size(&mut self) -> Result<(u8, usize)> {
267        let c = self.read_byte()?;
268        let type_code = (c >> 4) & 0x7;
269        let mut size = (c & 0x0f) as usize;
270        let mut shift = 4u32;
271        let mut cur = c;
272        while cur & 0x80 != 0 {
273            cur = self.read_byte()?;
274            size |= ((cur & 0x7f) as usize) << shift;
275            shift += 7;
276        }
277        Ok((type_code, size))
278    }
279
280    /// Read an `OFS_DELTA` negative-offset value.
281    ///
282    /// The encoding uses a big-endian variable-length integer with a +1 bias
283    /// on each continuation byte, yielding values ≥ 1.
284    fn read_ofs_neg_offset(&mut self) -> Result<usize> {
285        let mut c = self.read_byte()?;
286        let mut value = (c & 0x7f) as usize;
287        while c & 0x80 != 0 {
288            c = self.read_byte()?;
289            value = (value + 1) << 7 | (c & 0x7f) as usize;
290        }
291        Ok(value)
292    }
293
294    /// Decompress zlib-compressed data starting at the current cursor position.
295    ///
296    /// Advances the cursor by exactly the number of compressed bytes consumed.
297    /// Returns an error if the decompressed length differs from `expected_size`.
298    fn decompress(&mut self, expected_size: usize) -> Result<Vec<u8>> {
299        let slice = &self.data[self.pos..];
300        let mut decoder = ZlibDecoder::new(slice);
301        let mut out = Vec::with_capacity(expected_size);
302        decoder
303            .read_to_end(&mut out)
304            .map_err(|e| Error::Zlib(e.to_string()))?;
305        if out.len() != expected_size {
306            return Err(Error::CorruptObject(format!(
307                "decompressed {} bytes but expected {}",
308                out.len(),
309                expected_size
310            )));
311        }
312        self.pos += decoder.total_in() as usize;
313        Ok(out)
314    }
315}
316
317/// Apply a git "patch delta" to `base`, producing the patched result.
318///
319/// The delta binary format is:
320/// 1. Source size: variable-length little-endian integer (must equal
321///    `base.len()`).
322/// 2. Destination size: variable-length little-endian integer.
323/// 3. A sequence of COPY (MSB set) and INSERT (MSB clear) instructions.
324///
325/// # Errors
326///
327/// Returns [`Error::CorruptObject`] if the delta is malformed, the source-size
328/// field does not match `base.len()`, or the result length does not match the
329/// declared destination size.
330pub fn apply_delta(base: &[u8], delta: &[u8]) -> Result<Vec<u8>> {
331    let mut pos = 0usize;
332
333    let src_size = read_delta_varint(delta, &mut pos)?;
334    if src_size != base.len() {
335        return Err(Error::CorruptObject(format!(
336            "delta source size {src_size} != base size {}",
337            base.len()
338        )));
339    }
340    let dest_size = read_delta_varint(delta, &mut pos)?;
341    let mut result = Vec::with_capacity(dest_size);
342
343    while pos < delta.len() {
344        let cmd = delta[pos];
345        pos += 1;
346        if cmd == 0 {
347            return Err(Error::CorruptObject(
348                "reserved opcode 0 in delta stream".to_owned(),
349            ));
350        }
351        if cmd & 0x80 != 0 {
352            // COPY instruction: up to 4 offset bytes (bits 0-3) and up to 3
353            // size bytes (bits 4-6) are present, each controlled by a flag bit.
354            let mut offset = 0usize;
355            let mut size = 0usize;
356
357            macro_rules! maybe_read_byte {
358                ($flag:expr, $shift:expr, $target:expr) => {
359                    if cmd & $flag != 0 {
360                        let b = *delta.get(pos).ok_or_else(|| {
361                            Error::CorruptObject("truncated delta COPY operand".to_owned())
362                        })?;
363                        pos += 1;
364                        $target |= (b as usize) << $shift;
365                    }
366                };
367            }
368
369            maybe_read_byte!(0x01, 0, offset);
370            maybe_read_byte!(0x02, 8, offset);
371            maybe_read_byte!(0x04, 16, offset);
372            maybe_read_byte!(0x08, 24, offset);
373            maybe_read_byte!(0x10, 0, size);
374            maybe_read_byte!(0x20, 8, size);
375            maybe_read_byte!(0x40, 16, size);
376
377            if size == 0 {
378                size = 0x10000;
379            }
380
381            let end = offset.checked_add(size).ok_or_else(|| {
382                Error::CorruptObject("delta COPY range overflows usize".to_owned())
383            })?;
384            let chunk = base.get(offset..end).ok_or_else(|| {
385                Error::CorruptObject(format!(
386                    "delta COPY [{offset},{end}) out of range (base is {} bytes)",
387                    base.len()
388                ))
389            })?;
390            result.extend_from_slice(chunk);
391        } else {
392            // INSERT instruction: copy the next `cmd` literal bytes verbatim.
393            let n = cmd as usize;
394            let chunk = delta
395                .get(pos..pos + n)
396                .ok_or_else(|| Error::CorruptObject("truncated delta INSERT data".to_owned()))?;
397            result.extend_from_slice(chunk);
398            pos += n;
399        }
400    }
401
402    if result.len() != dest_size {
403        return Err(Error::CorruptObject(format!(
404            "delta produced {} bytes but expected {dest_size}",
405            result.len()
406        )));
407    }
408
409    Ok(result)
410}
411
412/// Read a variable-length little-endian integer from `data` starting at `*pos`.
413///
414/// Advances `*pos` past the consumed bytes.
415fn read_delta_varint(data: &[u8], pos: &mut usize) -> Result<usize> {
416    let mut value = 0usize;
417    let mut shift = 0u32;
418    loop {
419        let b = *data
420            .get(*pos)
421            .ok_or_else(|| Error::CorruptObject("truncated delta varint".to_owned()))?;
422        *pos += 1;
423        value |= ((b & 0x7f) as usize) << shift;
424        shift += 7;
425        if b & 0x80 == 0 {
426            break;
427        }
428    }
429    Ok(value)
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `body` with an `Odb` opened on a freshly created `objects`
    /// directory inside a temporary directory.  The directory is removed
    /// once `body` has returned.
    fn with_temp_odb<T>(body: impl FnOnce(&Odb) -> T) -> T {
        let tmp = tempfile::TempDir::new().unwrap();
        let objects_dir = tmp.path().join("objects");
        std::fs::create_dir_all(&objects_dir).unwrap();
        let odb = Odb::new(&objects_dir);
        body(&odb)
    }

    /// Build a minimal version-2 pack holding `objects` as non-delta entries
    /// and return its raw bytes, trailing SHA-1 included.
    fn make_pack(objects: &[(ObjectKind, &[u8])]) -> Vec<u8> {
        use flate2::write::ZlibEncoder;
        use std::io::Write;

        // Header: magic, version, object count.
        let mut pack = Vec::new();
        pack.extend_from_slice(b"PACK");
        pack.extend_from_slice(&2u32.to_be_bytes());
        pack.extend_from_slice(&(objects.len() as u32).to_be_bytes());

        for (kind, data) in objects {
            let type_code: u8 = match kind {
                ObjectKind::Commit => 1,
                ObjectKind::Tree => 2,
                ObjectKind::Blob => 3,
                ObjectKind::Tag => 4,
            };

            // Type+size header: the low four size bits share the first byte
            // with the type, the rest follows in 7-bit groups.  Every byte
            // except the last carries the continuation bit.
            let mut byte = ((type_code & 0x7) << 4) | (data.len() & 0x0f) as u8;
            let mut rest = data.len() >> 4;
            while rest > 0 {
                pack.push(byte | 0x80);
                byte = (rest & 0x7f) as u8;
                rest >>= 7;
            }
            pack.push(byte);

            // zlib-compressed payload.
            let mut enc = ZlibEncoder::new(Vec::new(), flate2::Compression::default());
            enc.write_all(data).unwrap();
            pack.extend_from_slice(&enc.finish().unwrap());
        }

        // Trailer: SHA-1 of everything written so far.
        let mut hasher = Sha1::new();
        hasher.update(&pack);
        let digest = hasher.finalize();
        pack.extend_from_slice(digest.as_slice());
        pack
    }

    #[test]
    fn test_apply_delta_simple() {
        // src_size = 5, dest_size = 11,
        // COPY (0x91 = offset byte 0 + size byte 0 present) offset 0 size 5,
        // INSERT 6 literal bytes.
        let mut delta = vec![5u8, 11, 0x91, 0, 5, 6];
        delta.extend_from_slice(b" world");

        let result = apply_delta(b"hello", &delta).unwrap();
        assert_eq!(result, b"hello world");
    }

    #[test]
    fn test_apply_delta_insert_only() {
        // src_size = 0, dest_size = 5, INSERT 5 literal bytes.
        let mut delta = vec![0u8, 5, 5];
        delta.extend_from_slice(b"hello");

        let result = apply_delta(b"", &delta).unwrap();
        assert_eq!(result, b"hello");
    }

    #[test]
    fn test_apply_delta_copy_only() {
        // src_size = 6, dest_size = 3, COPY base[2..5].
        let delta = [6u8, 3, 0x91, 2, 3];

        let result = apply_delta(b"abcdef", &delta).unwrap();
        assert_eq!(result, b"cde");
    }

    #[test]
    fn test_apply_delta_size_zero_means_65536() {
        // A COPY whose size bytes are all absent/zero copies 0x10000 bytes.
        let base = vec![0xABu8; 65536];

        // 65536 as a three-byte little-endian varint.
        let size_varint = [0x80u8, 0x80, 0x04];
        let mut delta = Vec::new();
        delta.extend_from_slice(&size_varint); // src_size
        delta.extend_from_slice(&size_varint); // dest_size
        delta.push(0x80u8); // COPY with no operand bytes: offset 0, size 0

        let result = apply_delta(&base, &delta).unwrap();
        assert_eq!(result.len(), 65536);
        assert!(result.iter().all(|&b| b == 0xAB));
    }

    #[test]
    fn test_unpack_objects_blobs() {
        with_temp_odb(|odb| {
            let pack = make_pack(&[
                (ObjectKind::Blob, b"hello\n"),
                (ObjectKind::Blob, b"world\n"),
            ]);

            let opts = UnpackOptions::default();
            let count = unpack_objects(&mut pack.as_slice(), odb, &opts).unwrap();
            assert_eq!(count, 2);

            // Both blobs must be readable under their expected ids.
            let oid1 = Odb::hash_object_data(ObjectKind::Blob, b"hello\n");
            let oid2 = Odb::hash_object_data(ObjectKind::Blob, b"world\n");
            assert_eq!(odb.read(&oid1).unwrap().data, b"hello\n");
            assert_eq!(odb.read(&oid2).unwrap().data, b"world\n");
        });
    }

    #[test]
    fn test_unpack_objects_dry_run_writes_nothing() {
        with_temp_odb(|odb| {
            let pack = make_pack(&[(ObjectKind::Blob, b"test content")]);

            let opts = UnpackOptions {
                dry_run: true,
                quiet: true,
            };
            let count = unpack_objects(&mut pack.as_slice(), odb, &opts).unwrap();
            assert_eq!(count, 1);

            // The object is counted but must not exist in the ODB.
            let oid = Odb::hash_object_data(ObjectKind::Blob, b"test content");
            assert!(!odb.exists(&oid));
        });
    }

    #[test]
    fn test_unpack_objects_bad_signature() {
        with_temp_odb(|odb| {
            let mut bad = b"NOPE\x00\x00\x00\x02\x00\x00\x00\x00".to_vec();
            bad.extend_from_slice(&[0u8; 20]);

            let opts = UnpackOptions::default();
            let err = unpack_objects(&mut bad.as_slice(), odb, &opts).unwrap_err();
            assert!(err.to_string().contains("invalid signature"));
        });
    }

    #[test]
    fn test_unpack_objects_checksum_mismatch() {
        with_temp_odb(|odb| {
            let mut pack = make_pack(&[(ObjectKind::Blob, b"data")]);
            // Flip every bit of the last checksum byte.
            let n = pack.len();
            pack[n - 1] ^= 0xFF;

            let opts = UnpackOptions::default();
            let err = unpack_objects(&mut pack.as_slice(), odb, &opts).unwrap_err();
            assert!(err.to_string().contains("checksum"));
        });
    }

    #[test]
    fn test_apply_delta_source_size_mismatch() {
        // Declared src_size (3) disagrees with the two-byte base.
        let delta = [3u8, 2u8, 2u8, b'h', b'i'];
        let err = apply_delta(b"hi", &delta).unwrap_err();
        assert!(err.to_string().contains("source size"));
    }
}
650}