Skip to main content

grit_lib/
unpack_objects.rs

1//! `unpack-objects`: unpack a pack stream into loose objects.
2//!
3//! Reads a pack-format byte stream, validates the trailing checksum, and
4//! writes each object as a loose file in the object database.  Delta objects
5//! (both `OFS_DELTA` and `REF_DELTA`) are resolved against already-unpacked
6//! objects or objects already present in the ODB.
7//!
8//! Large blobs are written to the ODB and dropped from the in-memory maps so
9//! cloning multi-gigabyte repositories does not require holding the full pack
10//! in RAM (streaming read + bounded retention).
11
12use std::borrow::Cow;
13use std::collections::HashMap;
14use std::io::{self, Read};
15
16use flate2::read::ZlibDecoder;
17use sha1::{Digest, Sha1};
18
19use crate::error::{Error, Result};
20use crate::objects::{parse_commit, parse_tag, parse_tree, Object, ObjectId, ObjectKind};
21use crate::odb::Odb;
22
/// Options controlling `unpack-objects` behaviour.
#[derive(Debug, Default)]
pub struct UnpackOptions {
    /// Validate and decompress objects but do not write them to the ODB.
    ///
    /// Note: with `dry_run` set, thin-pack `REF_DELTA` bases are not looked up
    /// in the ODB either (see the resolution loop in [`unpack_objects`]), so a
    /// thin pack may fail to resolve during a dry run.
    pub dry_run: bool,
    /// Suppress informational output.
    // NOTE(review): not read anywhere in this file — presumably consumed by a
    // CLI caller or reserved for future progress output; confirm before removal.
    pub quiet: bool,
    /// Reject packs whose commits/trees/tags reference missing objects.
    pub strict: bool,
}
33
/// A delta that could not yet be resolved because its base was not yet known.
///
/// Exactly one of `base_oid` / `base_offset` is `Some`, depending on whether
/// the record was a `REF_DELTA` (7) or an `OFS_DELTA` (6).
struct PendingDelta {
    /// Byte offset of this object in the pack stream (used to anchor
    /// `OFS_DELTA` back-references from later objects).
    offset: usize,
    /// For `REF_DELTA`: SHA-1 of the base object.
    base_oid: Option<ObjectId>,
    /// For `OFS_DELTA`: absolute byte offset of the base object.
    base_offset: Option<usize>,
    /// Decompressed delta data.
    delta_data: Vec<u8>,
}
46
/// Unpack a pack stream from `reader` into `odb`.
///
/// Reads the complete pack from `reader`, validates the trailing SHA-1
/// checksum, unpacks all objects (including full delta-chain resolution), and —
/// unless [`UnpackOptions::dry_run`] is set — writes each object to `odb`.
///
/// Returns the total number of objects processed.
///
/// # Errors
///
/// - [`Error::CorruptObject`] — invalid pack format, checksum mismatch, or
///   unresolvable delta chains.
/// - [`Error::Io`] — I/O failure reading from `reader`.
/// - [`Error::Zlib`] — decompression failure.
pub fn unpack_objects(reader: &mut dyn Read, odb: &Odb, opts: &UnpackOptions) -> Result<usize> {
    /// Blobs larger than this stay on disk only (after write) so huge packs do
    /// not retain every blob in RAM. Smaller objects are kept for delta bases
    /// and `--strict` graph walks without extra ODB reads.
    const MAX_RETAIN_BYTES: usize = 1024 * 1024;

    let mut rd = StreamingPackReader::new(reader);

    // Validate magic and version.
    let sig = rd.read_exact_n(4)?;
    if sig != b"PACK" {
        return Err(Error::CorruptObject(
            "not a pack stream: invalid signature".to_owned(),
        ));
    }
    let version = rd.read_u32_be()?;
    if version != 2 && version != 3 {
        return Err(Error::CorruptObject(format!(
            "unsupported pack version {version}"
        )));
    }
    let nr_objects = rd.read_u32_be()? as usize;

    // pack-stream offset → resolved object (see [`PackedObjectEntry`]).
    let mut by_offset: HashMap<usize, PackedObjectEntry> = HashMap::new();
    // ObjectId → in-pack object for REF_DELTA resolution and strict checks.
    let mut by_oid: HashMap<ObjectId, PackedObjectEntry> = HashMap::new();

    let mut pending: Vec<PendingDelta> = Vec::new();
    let mut count = 0usize;

    // First pass: one streaming sweep over the pack. Non-delta objects are
    // written (or hashed) immediately; deltas are buffered in `pending`
    // because their base may appear later in the stream or outside the pack.
    for _ in 0..nr_objects {
        // Record the offset *before* the header so later OFS_DELTA
        // back-references can find this entry by position.
        let obj_offset = rd.stream_pos();
        let (type_code, size) = rd.read_type_size()?;

        match type_code {
            1..=4 => {
                // Regular object: commit / tree / blob / tag.
                let kind = type_code_to_kind(type_code)?;
                let data = rd.decompress(size)?;
                let oid = write_or_hash(kind, &data, odb, opts.dry_run)?;
                let entry = packed_entry_after_write(kind, data, oid, odb, opts, MAX_RETAIN_BYTES);
                by_offset.insert(obj_offset, entry.clone());
                by_oid.insert(oid, entry);
                count += 1;
            }
            6 => {
                // OFS_DELTA: base at a negative encoded offset from this object.
                let neg = rd.read_ofs_neg_offset()?;
                let base_offset = obj_offset.checked_sub(neg).ok_or_else(|| {
                    Error::CorruptObject("ofs-delta base offset underflow".to_owned())
                })?;
                let delta_data = rd.decompress(size)?;
                pending.push(PendingDelta {
                    offset: obj_offset,
                    base_oid: None,
                    base_offset: Some(base_offset),
                    delta_data,
                });
            }
            7 => {
                // REF_DELTA: base identified by its SHA-1.
                let base_bytes = rd.read_exact_n(20)?;
                let base_oid = ObjectId::from_bytes(&base_bytes)?;
                let delta_data = rd.decompress(size)?;
                pending.push(PendingDelta {
                    offset: obj_offset,
                    base_oid: Some(base_oid),
                    base_offset: None,
                    delta_data,
                });
            }
            other => {
                // Type 5 is reserved; anything else means a corrupt header.
                return Err(Error::CorruptObject(format!(
                    "unknown packed-object type {other}"
                )))
            }
        }
    }

    // Trailing pack checksum (SHA-1 of all preceding bytes); not included in the hash.
    let digest = rd.finalize_hasher();
    let trailing = rd.read_trailer_20()?;
    if digest.as_slice() != trailing {
        return Err(Error::CorruptObject(
            "pack trailing checksum mismatch".to_owned(),
        ));
    }

    // Resolve pending deltas iteratively.  Each pass resolves all deltas whose
    // base is now known; repeat until none remain or we stall (corrupt pack).
    let mut remaining = pending;
    loop {
        if remaining.is_empty() {
            break;
        }
        let before = remaining.len();
        let mut still_pending: Vec<PendingDelta> = Vec::new();

        for delta in remaining {
            // Locate the base: by pack offset (OFS_DELTA), by OID within the
            // pack (REF_DELTA), or — for thin packs — from the ODB itself.
            let base_res: Option<Result<(ObjectKind, Cow<'_, [u8]>)>> =
                if let Some(base_off) = delta.base_offset {
                    by_offset
                        .get(&base_off)
                        .map(|e| entry_object_bytes(e, odb).map(|d| (e.kind(), d)))
                } else if let Some(ref base_id) = delta.base_oid {
                    if let Some(e) = by_oid.get(base_id) {
                        Some(entry_object_bytes(e, odb).map(|d| (e.kind(), d)))
                    } else if !opts.dry_run {
                        // Thin-pack base from the ODB; a read failure is
                        // treated as "base not available" rather than fatal.
                        // NOTE(review): in dry-run mode the ODB is never
                        // consulted, so thin packs fail to resolve even when
                        // the base exists on disk — confirm this is intended.
                        odb.read(base_id)
                            .ok()
                            .map(|obj| Ok((obj.kind, Cow::Owned(obj.data))))
                    } else {
                        None
                    }
                } else {
                    None
                };

            match base_res {
                Some(Ok((base_kind, base_data))) => {
                    // A delta result always has the same kind as its base.
                    let result = apply_delta(base_data.as_ref(), &delta.delta_data)?;
                    let oid = write_or_hash(base_kind, &result, odb, opts.dry_run)?;
                    let new_entry = packed_entry_after_write(
                        base_kind,
                        result,
                        oid,
                        odb,
                        opts,
                        MAX_RETAIN_BYTES,
                    );
                    by_offset.insert(delta.offset, new_entry.clone());
                    by_oid.insert(oid, new_entry);
                    count += 1;
                }
                Some(Err(e)) => return Err(e),
                None => still_pending.push(delta),
            }
        }

        remaining = still_pending;
        // No progress over a full pass means the remaining bases can never
        // appear: the pack (or one of its delta chains) is corrupt.
        if remaining.len() == before {
            return Err(Error::CorruptObject(format!(
                "{} delta(s) could not be resolved",
                remaining.len()
            )));
        }
    }

    if opts.strict {
        strict_verify_packed_references_map(Some(odb), &by_oid)?;
    }

    Ok(count)
}
215
/// Resolved non-delta object: either full bytes in memory or a large blob on disk.
#[derive(Debug, Clone)]
enum PackedObjectEntry {
    /// Object bytes retained in RAM (small objects and all non-blobs).
    InMemory { kind: ObjectKind, data: Vec<u8> },
    /// A blob already written to the ODB; re-read on demand via its id.
    /// Only ever constructed for blobs (see [`packed_entry_after_write`]).
    BlobOnDisk { oid: ObjectId },
}
222
223impl PackedObjectEntry {
224    fn kind(&self) -> ObjectKind {
225        match self {
226            PackedObjectEntry::InMemory { kind, .. } => *kind,
227            PackedObjectEntry::BlobOnDisk { .. } => ObjectKind::Blob,
228        }
229    }
230}
231
232fn packed_entry_after_write(
233    kind: ObjectKind,
234    data: Vec<u8>,
235    oid: ObjectId,
236    _odb: &Odb,
237    opts: &UnpackOptions,
238    max_retain: usize,
239) -> PackedObjectEntry {
240    if !opts.dry_run && kind == ObjectKind::Blob && data.len() > max_retain {
241        PackedObjectEntry::BlobOnDisk { oid }
242    } else {
243        PackedObjectEntry::InMemory { kind, data }
244    }
245}
246
247fn entry_object_bytes<'a>(entry: &'a PackedObjectEntry, odb: &Odb) -> Result<Cow<'a, [u8]>> {
248    match entry {
249        PackedObjectEntry::InMemory { data, .. } => Ok(Cow::Borrowed(data.as_slice())),
250        PackedObjectEntry::BlobOnDisk { oid } => Ok(Cow::Owned(odb.read(oid)?.data)),
251    }
252}
253
254fn strict_verify_packed_references_map(
255    odb: Option<&Odb>,
256    pack: &HashMap<ObjectId, PackedObjectEntry>,
257) -> Result<()> {
258    for entry in pack.values() {
259        match entry {
260            PackedObjectEntry::BlobOnDisk { .. } => {}
261            PackedObjectEntry::InMemory { kind, data } => match kind {
262                ObjectKind::Tree => {
263                    for e in parse_tree(data)? {
264                        if !strict_ref_resolves_map(&e.oid, pack, odb) {
265                            return Err(Error::CorruptObject(format!(
266                                "strict: missing object {} referenced by tree",
267                                e.oid.to_hex()
268                            )));
269                        }
270                    }
271                }
272                ObjectKind::Commit => {
273                    let c = parse_commit(data)?;
274                    if !strict_ref_resolves_map(&c.tree, pack, odb) {
275                        return Err(Error::CorruptObject(format!(
276                            "strict: missing tree {} referenced by commit",
277                            c.tree.to_hex()
278                        )));
279                    }
280                    for p in &c.parents {
281                        if !strict_ref_resolves_map(p, pack, odb) {
282                            return Err(Error::CorruptObject(format!(
283                                "strict: missing parent {} referenced by commit",
284                                p.to_hex()
285                            )));
286                        }
287                    }
288                }
289                ObjectKind::Tag => {
290                    let t = parse_tag(data)?;
291                    if !strict_ref_resolves_map(&t.object, pack, odb) {
292                        return Err(Error::CorruptObject(format!(
293                            "strict: missing object {} referenced by tag",
294                            t.object.to_hex()
295                        )));
296                    }
297                }
298                ObjectKind::Blob => {}
299            },
300        }
301    }
302    Ok(())
303}
304
305fn strict_ref_resolves_map(
306    oid: &ObjectId,
307    pack: &HashMap<ObjectId, PackedObjectEntry>,
308    odb: Option<&Odb>,
309) -> bool {
310    pack.contains_key(oid) || odb.is_some_and(|o| o.exists(oid))
311}
312
313fn strict_ref_resolves(
314    oid: &ObjectId,
315    pack: &std::collections::HashMap<ObjectId, (ObjectKind, Vec<u8>)>,
316    odb: Option<&Odb>,
317) -> bool {
318    pack.contains_key(oid) || odb.is_some_and(|o| o.exists(oid))
319}
320
321/// Verifies that references from commits, trees, and tags resolve to objects present in `pack`
322/// or, when `odb` is [`Some`], to loose objects in that database.
323///
324/// Use [`None`] for `odb` when indexing or unpacking in a context with no repository (Git allows
325/// `index-pack --strict` outside a work tree when the pack is self-contained).
326pub fn strict_verify_packed_references(
327    odb: Option<&Odb>,
328    pack: &HashMap<ObjectId, (ObjectKind, Vec<u8>)>,
329) -> Result<()> {
330    for (kind, data) in pack.values() {
331        match kind {
332            ObjectKind::Tree => {
333                for e in parse_tree(data)? {
334                    if !strict_ref_resolves(&e.oid, pack, odb) {
335                        return Err(Error::CorruptObject(format!(
336                            "strict: missing object {} referenced by tree",
337                            e.oid.to_hex()
338                        )));
339                    }
340                }
341            }
342            ObjectKind::Commit => {
343                let c = parse_commit(data)?;
344                if !strict_ref_resolves(&c.tree, pack, odb) {
345                    return Err(Error::CorruptObject(format!(
346                        "strict: missing tree {} referenced by commit",
347                        c.tree.to_hex()
348                    )));
349                }
350                for p in &c.parents {
351                    if !strict_ref_resolves(p, pack, odb) {
352                        return Err(Error::CorruptObject(format!(
353                            "strict: missing parent {} referenced by commit",
354                            p.to_hex()
355                        )));
356                    }
357                }
358            }
359            ObjectKind::Tag => {
360                let t = parse_tag(data)?;
361                if !strict_ref_resolves(&t.object, pack, odb) {
362                    return Err(Error::CorruptObject(format!(
363                        "strict: missing object {} referenced by tag",
364                        t.object.to_hex()
365                    )));
366                }
367            }
368            ObjectKind::Blob => {}
369        }
370    }
371    Ok(())
372}
373
374/// Parse a pack byte stream and return every resolved object (after delta resolution) keyed by OID.
375///
376/// Does not write to any object database. Used for receive-pack connectivity checks before
377/// applying a push to the permanent ODB.
378///
379/// Thin-pack bases may be resolved from `odb` when they are not present in the pack.
380pub fn pack_bytes_to_object_map(data: &[u8], odb: &Odb) -> Result<HashMap<ObjectId, Object>> {
381    let rd = PackReader::new(data.to_vec());
382    build_pack_object_map(rd, odb)
383}
384
/// Core of [`pack_bytes_to_object_map`]: parse the whole in-memory pack,
/// resolve all deltas (thin-pack bases via `odb`), and return OID → object.
fn build_pack_object_map(mut rd: PackReader, odb: &Odb) -> Result<HashMap<ObjectId, Object>> {
    // Header: 4-byte "PACK" magic, version (2 or 3), then the object count.
    let sig = rd.read_exact(4)?;
    if sig != b"PACK" {
        return Err(Error::CorruptObject(
            "not a pack stream: invalid signature".to_owned(),
        ));
    }
    let version = rd.read_u32_be()?;
    if version != 2 && version != 3 {
        return Err(Error::CorruptObject(format!(
            "unsupported pack version {version}"
        )));
    }
    let nr_objects = rd.read_u32_be()? as usize;

    // pack-stream offset → resolved (kind, bytes) for OFS_DELTA bases.
    let mut by_offset: HashMap<usize, (ObjectKind, Vec<u8>)> = HashMap::new();
    // OID → resolved (kind, bytes); REF_DELTA bases and the final result map.
    let mut by_oid: HashMap<ObjectId, (ObjectKind, Vec<u8>)> = HashMap::new();
    let mut pending: Vec<PendingDelta> = Vec::new();

    // Look up a REF_DELTA base: first among objects already resolved from this
    // pack, then (thin-pack case) in the object database.
    fn base_from_pack_or_odb(
        by_oid: &HashMap<ObjectId, (ObjectKind, Vec<u8>)>,
        odb: &Odb,
        id: &ObjectId,
    ) -> Option<(ObjectKind, Vec<u8>)> {
        if let Some(e) = by_oid.get(id) {
            return Some(e.clone());
        }
        odb.read(id).ok().map(|o| (o.kind, o.data))
    }

    // First pass: inflate every record; buffer deltas whose base may come later.
    for _ in 0..nr_objects {
        // Offset recorded before the header so OFS_DELTA back-references work.
        let obj_offset = rd.pos;
        let (type_code, size) = rd.read_type_size()?;

        match type_code {
            1..=4 => {
                // Regular object: commit / tree / blob / tag.
                let kind = type_code_to_kind(type_code)?;
                let data = rd.decompress(size)?;
                let oid = Odb::hash_object_data(kind, &data);
                by_offset.insert(obj_offset, (kind, data.clone()));
                by_oid.insert(oid, (kind, data));
            }
            6 => {
                // OFS_DELTA: base at a negative encoded offset from this object.
                let neg = rd.read_ofs_neg_offset()?;
                let base_offset = obj_offset.checked_sub(neg).ok_or_else(|| {
                    Error::CorruptObject("ofs-delta base offset underflow".to_owned())
                })?;
                let delta_data = rd.decompress(size)?;
                pending.push(PendingDelta {
                    offset: obj_offset,
                    base_oid: None,
                    base_offset: Some(base_offset),
                    delta_data,
                });
            }
            7 => {
                // REF_DELTA: base identified by its SHA-1.
                let base_bytes = rd.read_exact(20)?;
                let base_oid = ObjectId::from_bytes(base_bytes)?;
                let delta_data = rd.decompress(size)?;
                pending.push(PendingDelta {
                    offset: obj_offset,
                    base_oid: Some(base_oid),
                    base_offset: None,
                    delta_data,
                });
            }
            other => {
                // Type 5 is reserved; anything else means a corrupt header.
                return Err(Error::CorruptObject(format!(
                    "unknown packed-object type {other}"
                )))
            }
        }
    }

    // Verify the trailing SHA-1: hash exactly the bytes consumed so far (the
    // object section); the 20-byte trailer itself is not part of the digest.
    let consumed = rd.pos;
    {
        let mut hasher = Sha1::new();
        hasher.update(&rd.data[..consumed]);
        let digest = hasher.finalize();
        let trailing = rd.read_exact(20)?;
        if digest.as_slice() != trailing {
            return Err(Error::CorruptObject(
                "pack trailing checksum mismatch".to_owned(),
            ));
        }
    }

    // Resolve deltas in passes; each pass may unlock bases for the next.
    let mut remaining = pending;
    loop {
        if remaining.is_empty() {
            break;
        }
        let before = remaining.len();
        let mut still_pending: Vec<PendingDelta> = Vec::new();

        for delta in remaining {
            // OFS_DELTA bases come only from this pack; REF_DELTA bases may
            // also be found in the ODB (thin packs).
            let base = if let Some(base_off) = delta.base_offset {
                by_offset.get(&base_off).cloned()
            } else if let Some(ref base_id) = delta.base_oid {
                base_from_pack_or_odb(&by_oid, odb, base_id)
            } else {
                None
            };

            if let Some((base_kind, base_data)) = base {
                // A delta result has the same kind as its base.
                let result = apply_delta(&base_data, &delta.delta_data)?;
                let oid = Odb::hash_object_data(base_kind, &result);
                by_offset.insert(delta.offset, (base_kind, result.clone()));
                by_oid.insert(oid, (base_kind, result));
            } else {
                still_pending.push(delta);
            }
        }

        remaining = still_pending;
        // A pass with no progress means the remaining bases can never appear.
        if remaining.len() == before {
            return Err(Error::CorruptObject(format!(
                "{} delta(s) could not be resolved",
                remaining.len()
            )));
        }
    }

    // Wrap the raw (kind, bytes) pairs into Object values for the caller.
    Ok(by_oid
        .into_iter()
        .map(|(oid, (kind, data))| (oid, Object::new(kind, data)))
        .collect())
}
513
514/// Either write `data` as a loose object (if `!dry_run`) or just compute its
515/// [`ObjectId`] without touching the filesystem.
516fn write_or_hash(kind: ObjectKind, data: &[u8], odb: &Odb, dry_run: bool) -> Result<ObjectId> {
517    if dry_run {
518        Ok(Odb::hash_object_data(kind, data))
519    } else {
520        odb.write(kind, data)
521    }
522}
523
524/// Convert a pack object type code to an [`ObjectKind`].
525fn type_code_to_kind(code: u8) -> Result<ObjectKind> {
526    match code {
527        1 => Ok(ObjectKind::Commit),
528        2 => Ok(ObjectKind::Tree),
529        3 => Ok(ObjectKind::Blob),
530        4 => Ok(ObjectKind::Tag),
531        _ => Err(Error::CorruptObject(format!(
532            "type code {code} is not a regular object type"
533        ))),
534    }
535}
536
/// Low-level cursor over a buffered pack byte stream (in-memory pack parsing).
struct PackReader {
    /// The complete pack bytes, including the 20-byte trailing checksum.
    data: Vec<u8>,
    /// Current read position within `data`.
    pos: usize,
}
542
543impl PackReader {
544    fn new(data: Vec<u8>) -> Self {
545        Self { data, pos: 0 }
546    }
547
548    /// Read exactly `n` bytes and advance the cursor, returning a slice into
549    /// the internal buffer.
550    fn read_exact(&mut self, n: usize) -> Result<&[u8]> {
551        if self.pos + n > self.data.len() {
552            return Err(Error::CorruptObject(format!(
553                "pack stream truncated: need {n} bytes at offset {}",
554                self.pos
555            )));
556        }
557        let slice = &self.data[self.pos..self.pos + n];
558        self.pos += n;
559        Ok(slice)
560    }
561
562    /// Read a single byte and advance the cursor.
563    fn read_byte(&mut self) -> Result<u8> {
564        if self.pos >= self.data.len() {
565            return Err(Error::CorruptObject(
566                "unexpected end of pack stream".to_owned(),
567            ));
568        }
569        let b = self.data[self.pos];
570        self.pos += 1;
571        Ok(b)
572    }
573
574    /// Read a big-endian `u32`.
575    fn read_u32_be(&mut self) -> Result<u32> {
576        let bytes = self.read_exact(4)?;
577        Ok(u32::from_be_bytes(bytes.try_into().map_err(|_| {
578            Error::CorruptObject("u32 read failed".to_owned())
579        })?))
580    }
581
582    /// Read the packed-object type + size header (variable-length big-endian
583    /// encoding with the type in bits 4-6 of the first byte).
584    ///
585    /// Returns `(type_code, uncompressed_size)`.
586    fn read_type_size(&mut self) -> Result<(u8, usize)> {
587        let c = self.read_byte()?;
588        let type_code = (c >> 4) & 0x7;
589        let mut size = (c & 0x0f) as usize;
590        let mut shift = 4u32;
591        let mut cur = c;
592        while cur & 0x80 != 0 {
593            cur = self.read_byte()?;
594            size |= ((cur & 0x7f) as usize) << shift;
595            shift += 7;
596        }
597        Ok((type_code, size))
598    }
599
600    /// Read an `OFS_DELTA` negative-offset value.
601    ///
602    /// The encoding uses a big-endian variable-length integer with a +1 bias
603    /// on each continuation byte, yielding values ≥ 1.
604    fn read_ofs_neg_offset(&mut self) -> Result<usize> {
605        let mut c = self.read_byte()?;
606        let mut value = (c & 0x7f) as usize;
607        while c & 0x80 != 0 {
608            c = self.read_byte()?;
609            value = (value + 1) << 7 | (c & 0x7f) as usize;
610        }
611        Ok(value)
612    }
613
614    /// Decompress zlib-compressed data starting at the current cursor position.
615    ///
616    /// Advances the cursor by exactly the number of compressed bytes consumed.
617    /// Returns an error if the decompressed length differs from `expected_size`.
618    fn decompress(&mut self, expected_size: usize) -> Result<Vec<u8>> {
619        let slice = &self.data[self.pos..];
620        let mut decoder = ZlibDecoder::new(slice);
621        let mut out = Vec::with_capacity(expected_size);
622        decoder
623            .read_to_end(&mut out)
624            .map_err(|e| Error::Zlib(e.to_string()))?;
625        if out.len() != expected_size {
626            return Err(Error::CorruptObject(format!(
627                "decompressed {} bytes but expected {}",
628                out.len(),
629                expected_size
630            )));
631        }
632        self.pos += decoder.total_in() as usize;
633        Ok(out)
634    }
635}
636
637fn io_to_corrupt_eof(e: io::Error, stream_pos: usize, context: &str) -> Error {
638    if e.kind() == io::ErrorKind::UnexpectedEof {
639        Error::CorruptObject(format!(
640            "pack stream truncated ({context}) at offset {stream_pos}"
641        ))
642    } else {
643        Error::Io(e)
644    }
645}
646
/// Streaming cursor over a pack file: hashes body bytes incrementally (no full-buffer read).
///
/// Raw pack bytes are either consumed as object headers (via [`Self::read_byte`]) or as zlib
/// payloads.  Zlib decoders may read ahead; overflow bytes stay in [`Self::pending`] so the next
/// object header or zlib stream starts at the correct offset.
struct StreamingPackReader<'a> {
    /// Underlying byte source (network stream, file, …).
    inner: &'a mut dyn Read,
    /// Running SHA-1 of every pack byte consumed so far (trailer excluded).
    pack_hasher: Sha1,
    /// Number of pack bytes consumed so far; doubles as the object offset.
    stream_pos: usize,
    /// Compressed (or other) bytes read ahead from `inner` but not yet consumed by the current
    /// parsing step.  Bytes here are fed to `pack_hasher` only at the moment they are consumed
    /// (in `read_from_source` / `decompress`), never when first buffered — which is why the
    /// trailer read can take bytes from here without polluting the checksum.
    pending: Vec<u8>,
}
660
661impl<'a> StreamingPackReader<'a> {
662    fn new(inner: &'a mut dyn Read) -> Self {
663        Self {
664            inner,
665            pack_hasher: Sha1::new(),
666            stream_pos: 0,
667            pending: Vec::new(),
668        }
669    }
670
671    fn stream_pos(&self) -> usize {
672        self.stream_pos
673    }
674
675    /// Read pack-body bytes (hashed). Used for headers and non-zlib payload reads only.
676    fn read_from_source(&mut self, buf: &mut [u8]) -> Result<usize> {
677        let n = if !self.pending.is_empty() {
678            let take = buf.len().min(self.pending.len());
679            buf[..take].copy_from_slice(&self.pending[..take]);
680            self.pending.drain(..take);
681            take
682        } else {
683            self.inner.read(buf).map_err(Error::Io)?
684        };
685        if n > 0 {
686            self.pack_hasher.update(&buf[..n]);
687            self.stream_pos += n;
688        }
689        Ok(n)
690    }
691
692    fn read_byte(&mut self) -> Result<u8> {
693        let mut b = [0u8; 1];
694        let n = self.read_from_source(&mut b)?;
695        if n == 0 {
696            return Err(Error::CorruptObject(format!(
697                "pack stream truncated (read byte) at offset {}",
698                self.stream_pos
699            )));
700        }
701        Ok(b[0])
702    }
703
704    fn read_exact_n(&mut self, n: usize) -> Result<Vec<u8>> {
705        let mut v = vec![0u8; n];
706        let mut got = 0usize;
707        while got < n {
708            let m = self.read_from_source(&mut v[got..n])?;
709            if m == 0 {
710                return Err(Error::CorruptObject(format!(
711                    "pack stream truncated (read exact) at offset {}",
712                    self.stream_pos
713                )));
714            }
715            got += m;
716        }
717        Ok(v)
718    }
719
720    fn read_u32_be(&mut self) -> Result<u32> {
721        let mut b = [0u8; 4];
722        let mut got = 0usize;
723        while got < 4 {
724            let m = self.read_from_source(&mut b[got..4])?;
725            if m == 0 {
726                return Err(Error::CorruptObject(format!(
727                    "pack stream truncated (read u32) at offset {}",
728                    self.stream_pos
729                )));
730            }
731            got += m;
732        }
733        Ok(u32::from_be_bytes(b))
734    }
735
736    fn read_type_size(&mut self) -> Result<(u8, usize)> {
737        let c = self.read_byte()?;
738        let type_code = (c >> 4) & 0x7;
739        let mut size = (c & 0x0f) as usize;
740        let mut shift = 4u32;
741        let mut cur = c;
742        while cur & 0x80 != 0 {
743            cur = self.read_byte()?;
744            size |= ((cur & 0x7f) as usize) << shift;
745            shift += 7;
746        }
747        Ok((type_code, size))
748    }
749
750    fn read_ofs_neg_offset(&mut self) -> Result<usize> {
751        let mut c = self.read_byte()?;
752        let mut value = (c & 0x7f) as usize;
753        while c & 0x80 != 0 {
754            c = self.read_byte()?;
755            value = (value + 1) << 7 | (c & 0x7f) as usize;
756        }
757        Ok(value)
758    }
759
760    /// Pull zlib-compressed bytes until one object inflates to `expected_size` bytes.
761    ///
762    /// Bytes read from `inner` into `pending` are not hashed until we know how many belong to the
763    /// zlib stream (`total_in()`). Lookahead past the zlib end (including the 20-byte pack
764    /// trailer) must never be fed to the pack checksum.
765    fn decompress(&mut self, expected_size: usize) -> Result<Vec<u8>> {
766        const CHUNK: usize = 64 * 1024;
767        let mut scratch = [0u8; CHUNK];
768        let mut out = vec![0u8; expected_size];
769        loop {
770            let mut cursor = std::io::Cursor::new(self.pending.as_slice());
771            let mut z = ZlibDecoder::new(&mut cursor);
772            match z.read_exact(&mut out) {
773                Ok(()) => {
774                    let consumed = z.total_in() as usize;
775                    if consumed > self.pending.len() {
776                        return Err(Error::CorruptObject(
777                            "zlib total_in exceeds pending buffer".to_owned(),
778                        ));
779                    }
780                    self.pack_hasher.update(&self.pending[..consumed]);
781                    self.stream_pos += consumed;
782                    self.pending.drain(..consumed);
783                    return Ok(out);
784                }
785                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => {
786                    let n = self.inner.read(&mut scratch).map_err(Error::Io)?;
787                    if n == 0 {
788                        return Err(Error::CorruptObject(format!(
789                            "pack stream truncated (zlib) at offset {}",
790                            self.stream_pos
791                        )));
792                    }
793                    self.pending.extend_from_slice(&scratch[..n]);
794                }
795                Err(e) => return Err(Error::Zlib(e.to_string())),
796            }
797        }
798    }
799
800    /// SHA-1 over all pack bytes read so far (objects only; trailer not yet read).
801    fn finalize_hasher(
802        &self,
803    ) -> sha1::digest::generic_array::GenericArray<u8, sha1::digest::consts::U20> {
804        self.pack_hasher.clone().finalize()
805    }
806
807    /// Trailing pack checksum; not included in [`Self::finalize_hasher`].
808    fn read_trailer_20(&mut self) -> Result<[u8; 20]> {
809        let mut b = [0u8; 20];
810        if self.pending.len() >= 20 {
811            b.copy_from_slice(&self.pending[..20]);
812            self.pending.drain(..20);
813            self.stream_pos += 20;
814            return Ok(b);
815        }
816        let tail = self.pending.len();
817        if tail > 0 {
818            b[..tail].copy_from_slice(&self.pending[..]);
819            self.pending.clear();
820        }
821        self.inner
822            .read_exact(&mut b[tail..])
823            .map_err(|e| io_to_corrupt_eof(e, self.stream_pos, "trailer"))?;
824        self.stream_pos += 20;
825        Ok(b)
826    }
827}
828
829/// Apply a git "patch delta" to `base`, producing the patched result.
830///
831/// The delta binary format is:
832/// 1. Source size: variable-length little-endian integer (must equal
833///    `base.len()`).
834/// 2. Destination size: variable-length little-endian integer.
835/// 3. A sequence of COPY (MSB set) and INSERT (MSB clear) instructions.
836///
837/// # Errors
838///
839/// Returns [`Error::CorruptObject`] if the delta is malformed, the source-size
840/// field does not match `base.len()`, or the result length does not match the
841/// declared destination size.
842pub fn apply_delta(base: &[u8], delta: &[u8]) -> Result<Vec<u8>> {
843    let mut pos = 0usize;
844
845    let src_size = read_delta_varint(delta, &mut pos)?;
846    if src_size != base.len() {
847        return Err(Error::CorruptObject(format!(
848            "delta source size {src_size} != base size {}",
849            base.len()
850        )));
851    }
852    let dest_size = read_delta_varint(delta, &mut pos)?;
853    let mut result = Vec::with_capacity(dest_size);
854
855    while pos < delta.len() {
856        let cmd = delta[pos];
857        pos += 1;
858        if cmd == 0 {
859            return Err(Error::CorruptObject(
860                "reserved opcode 0 in delta stream".to_owned(),
861            ));
862        }
863        if cmd & 0x80 != 0 {
864            // COPY instruction: up to 4 offset bytes (bits 0-3) and up to 3
865            // size bytes (bits 4-6) are present, each controlled by a flag bit.
866            let mut offset = 0usize;
867            let mut size = 0usize;
868
869            macro_rules! maybe_read_byte {
870                ($flag:expr, $shift:expr, $target:expr) => {
871                    if cmd & $flag != 0 {
872                        let b = *delta.get(pos).ok_or_else(|| {
873                            Error::CorruptObject("truncated delta COPY operand".to_owned())
874                        })?;
875                        pos += 1;
876                        $target |= (b as usize) << $shift;
877                    }
878                };
879            }
880
881            maybe_read_byte!(0x01, 0, offset);
882            maybe_read_byte!(0x02, 8, offset);
883            maybe_read_byte!(0x04, 16, offset);
884            maybe_read_byte!(0x08, 24, offset);
885            maybe_read_byte!(0x10, 0, size);
886            maybe_read_byte!(0x20, 8, size);
887            maybe_read_byte!(0x40, 16, size);
888
889            if size == 0 {
890                size = 0x10000;
891            }
892
893            let end = offset.checked_add(size).ok_or_else(|| {
894                Error::CorruptObject("delta COPY range overflows usize".to_owned())
895            })?;
896            let chunk = base.get(offset..end).ok_or_else(|| {
897                Error::CorruptObject(format!(
898                    "delta COPY [{offset},{end}) out of range (base is {} bytes)",
899                    base.len()
900                ))
901            })?;
902            result.extend_from_slice(chunk);
903        } else {
904            // INSERT instruction: copy the next `cmd` literal bytes verbatim.
905            let n = cmd as usize;
906            let chunk = delta
907                .get(pos..pos + n)
908                .ok_or_else(|| Error::CorruptObject("truncated delta INSERT data".to_owned()))?;
909            result.extend_from_slice(chunk);
910            pos += n;
911        }
912    }
913
914    if result.len() != dest_size {
915        return Err(Error::CorruptObject(format!(
916            "delta produced {} bytes but expected {dest_size}",
917            result.len()
918        )));
919    }
920
921    Ok(result)
922}
923
924/// Read a variable-length little-endian integer from `data` starting at `*pos`.
925///
926/// Advances `*pos` past the consumed bytes.
927fn read_delta_varint(data: &[u8], pos: &mut usize) -> Result<usize> {
928    let mut value = 0usize;
929    let mut shift = 0u32;
930    loop {
931        let b = *data
932            .get(*pos)
933            .ok_or_else(|| Error::CorruptObject("truncated delta varint".to_owned()))?;
934        *pos += 1;
935        value |= ((b & 0x7f) as usize) << shift;
936        shift += 7;
937        if b & 0x80 == 0 {
938            break;
939        }
940    }
941    Ok(value)
942}
943
944#[cfg(test)]
945mod tests {
946    use super::*;
947
948    // Helper: build a minimal pack from a list of (kind, data) pairs.
949    // Returns the raw pack bytes.
950    fn make_pack(objects: &[(ObjectKind, &[u8])]) -> Vec<u8> {
951        use flate2::write::ZlibEncoder;
952        use std::io::Write;
953
954        let mut entries: Vec<Vec<u8>> = Vec::new();
955        for (kind, data) in objects {
956            let type_code: u8 = match kind {
957                ObjectKind::Commit => 1,
958                ObjectKind::Tree => 2,
959                ObjectKind::Blob => 3,
960                ObjectKind::Tag => 4,
961            };
962            // Encode type+size header.
963            let mut header = Vec::new();
964            let mut size = data.len();
965            let first = ((type_code & 0x7) << 4) | (size & 0x0f) as u8;
966            size >>= 4;
967            if size > 0 {
968                header.push(first | 0x80);
969                while size > 0 {
970                    let b = (size & 0x7f) as u8;
971                    size >>= 7;
972                    header.push(if size > 0 { b | 0x80 } else { b });
973                }
974            } else {
975                header.push(first);
976            }
977            // zlib-compress data.
978            let mut enc = ZlibEncoder::new(Vec::new(), flate2::Compression::default());
979            enc.write_all(data).unwrap();
980            let compressed = enc.finish().unwrap();
981            let mut entry = header;
982            entry.extend_from_slice(&compressed);
983            entries.push(entry);
984        }
985
986        // Assemble: PACK + version(2) + count + entries + SHA-1.
987        let mut pack = Vec::new();
988        pack.extend_from_slice(b"PACK");
989        pack.extend_from_slice(&2u32.to_be_bytes());
990        pack.extend_from_slice(&(objects.len() as u32).to_be_bytes());
991        for entry in &entries {
992            pack.extend_from_slice(entry);
993        }
994        let mut hasher = Sha1::new();
995        hasher.update(&pack);
996        let digest = hasher.finalize();
997        pack.extend_from_slice(digest.as_slice());
998        pack
999    }
1000
1001    #[test]
1002    fn test_apply_delta_simple() {
1003        // Build a trivial delta: insert "hello world".
1004        let base = b"hello";
1005        let mut delta = Vec::new();
1006        // src_size = 5
1007        delta.push(5u8);
1008        // dest_size = 11
1009        delta.push(11u8);
1010        // COPY instruction: copy base[0..5]
1011        // cmd = 0x80 | 0x01 (offset present, byte 0) | 0x10 (size byte 0)
1012        delta.push(0x80 | 0x01 | 0x10); // 0x91
1013        delta.push(0u8); // offset = 0
1014        delta.push(5u8); // size = 5
1015                         // INSERT " world" (6 bytes)
1016        delta.push(6u8);
1017        delta.extend_from_slice(b" world");
1018
1019        let result = apply_delta(base, &delta).unwrap();
1020        assert_eq!(result, b"hello world");
1021    }
1022
1023    #[test]
1024    fn test_apply_delta_insert_only() {
1025        let base = b"";
1026        let mut delta = Vec::new();
1027        delta.push(0u8); // src_size = 0
1028        delta.push(5u8); // dest_size = 5
1029        delta.push(5u8); // INSERT 5 bytes
1030        delta.extend_from_slice(b"hello");
1031
1032        let result = apply_delta(base, &delta).unwrap();
1033        assert_eq!(result, b"hello");
1034    }
1035
1036    #[test]
1037    fn test_apply_delta_copy_only() {
1038        let base = b"abcdef";
1039        let mut delta = Vec::new();
1040        delta.push(6u8); // src_size = 6
1041        delta.push(3u8); // dest_size = 3
1042                         // COPY base[2..5]: offset=2, size=3
1043                         // cmd = 0x80 | 0x01 | 0x10
1044        delta.push(0x91u8);
1045        delta.push(2u8); // offset = 2
1046        delta.push(3u8); // size = 3
1047
1048        let result = apply_delta(base, &delta).unwrap();
1049        assert_eq!(result, b"cde");
1050    }
1051
1052    #[test]
1053    fn test_apply_delta_size_zero_means_65536() {
1054        // A COPY with size bytes all zero means 0x10000 = 65536.
1055        let base = vec![0xABu8; 65536];
1056        let mut delta = Vec::new();
1057        // src_size = 65536, encoded as 3 bytes little-endian varint
1058        delta.push(0x80 | (65536 & 0x7f) as u8); // 0
1059        delta.push(0x80 | ((65536 >> 7) & 0x7f) as u8); // 0x80
1060        delta.push(((65536 >> 14) & 0x7f) as u8); // 4
1061                                                  // dest_size = 65536, same
1062        delta.push(0x80 | (65536 & 0x7f) as u8);
1063        delta.push(0x80 | ((65536 >> 7) & 0x7f) as u8);
1064        delta.push(((65536 >> 14) & 0x7f) as u8);
1065        // COPY: offset=0 (no offset bytes), size=0 (no size bytes) → means 0x10000
1066        // cmd = 0x80 (no offset/size bytes present at all → offset=0, size=0→65536)
1067        delta.push(0x80u8);
1068
1069        let result = apply_delta(&base, &delta).unwrap();
1070        assert_eq!(result.len(), 65536);
1071        assert!(result.iter().all(|&b| b == 0xAB));
1072    }
1073
1074    #[test]
1075    fn test_unpack_objects_blobs() {
1076        use tempfile::TempDir;
1077        let tmp = TempDir::new().unwrap();
1078        let objects_dir = tmp.path().join("objects");
1079        std::fs::create_dir_all(&objects_dir).unwrap();
1080        let odb = Odb::new(&objects_dir);
1081
1082        let pack = make_pack(&[
1083            (ObjectKind::Blob, b"hello\n"),
1084            (ObjectKind::Blob, b"world\n"),
1085        ]);
1086
1087        let opts = UnpackOptions::default();
1088        let count = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap();
1089        assert_eq!(count, 2);
1090
1091        // Verify both blobs can be read back.
1092        let oid1 = Odb::hash_object_data(ObjectKind::Blob, b"hello\n");
1093        let oid2 = Odb::hash_object_data(ObjectKind::Blob, b"world\n");
1094        let obj1 = odb.read(&oid1).unwrap();
1095        let obj2 = odb.read(&oid2).unwrap();
1096        assert_eq!(obj1.data, b"hello\n");
1097        assert_eq!(obj2.data, b"world\n");
1098    }
1099
1100    #[test]
1101    fn test_unpack_objects_dry_run_writes_nothing() {
1102        use tempfile::TempDir;
1103        let tmp = TempDir::new().unwrap();
1104        let objects_dir = tmp.path().join("objects");
1105        std::fs::create_dir_all(&objects_dir).unwrap();
1106        let odb = Odb::new(&objects_dir);
1107
1108        let pack = make_pack(&[(ObjectKind::Blob, b"test content")]);
1109
1110        let opts = UnpackOptions {
1111            dry_run: true,
1112            quiet: true,
1113            strict: false,
1114        };
1115        let count = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap();
1116        assert_eq!(count, 1);
1117
1118        // Nothing should be written.
1119        let oid = Odb::hash_object_data(ObjectKind::Blob, b"test content");
1120        assert!(!odb.exists(&oid));
1121    }
1122
1123    #[test]
1124    fn test_unpack_objects_bad_signature() {
1125        use tempfile::TempDir;
1126        let tmp = TempDir::new().unwrap();
1127        let objects_dir = tmp.path().join("objects");
1128        std::fs::create_dir_all(&objects_dir).unwrap();
1129        let odb = Odb::new(&objects_dir);
1130
1131        let mut bad = b"NOPE\x00\x00\x00\x02\x00\x00\x00\x00".to_vec();
1132        bad.extend_from_slice(&[0u8; 20]);
1133        let opts = UnpackOptions::default();
1134        let err = unpack_objects(&mut bad.as_slice(), &odb, &opts).unwrap_err();
1135        assert!(err.to_string().contains("invalid signature"));
1136    }
1137
1138    #[test]
1139    fn test_unpack_objects_checksum_mismatch() {
1140        use tempfile::TempDir;
1141        let tmp = TempDir::new().unwrap();
1142        let objects_dir = tmp.path().join("objects");
1143        std::fs::create_dir_all(&objects_dir).unwrap();
1144        let odb = Odb::new(&objects_dir);
1145
1146        let mut pack = make_pack(&[(ObjectKind::Blob, b"data")]);
1147        // Corrupt the trailing checksum.
1148        let n = pack.len();
1149        pack[n - 1] ^= 0xFF;
1150
1151        let opts = UnpackOptions::default();
1152        let err = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap_err();
1153        assert!(err.to_string().contains("checksum"));
1154    }
1155
1156    #[test]
1157    fn test_apply_delta_source_size_mismatch() {
1158        let base = b"hi";
1159        let delta = [3u8, 2u8, 2u8, b'h', b'i']; // src_size=3 != base.len()=2
1160        let err = apply_delta(base, &delta).unwrap_err();
1161        assert!(err.to_string().contains("source size"));
1162    }
1163}