Skip to main content

grit_lib/
unpack_objects.rs

1//! `unpack-objects`: unpack a pack stream into loose objects.
2//!
3//! Reads a pack-format byte stream, validates the trailing checksum, and
4//! writes each object as a loose file in the object database.  Delta objects
5//! (both `OFS_DELTA` and `REF_DELTA`) are resolved against already-unpacked
6//! objects or objects already present in the ODB.
7//!
8//! Large blobs are written to the ODB and dropped from the in-memory maps so
9//! cloning multi-gigabyte repositories does not require holding the full pack
10//! in RAM (streaming read + bounded retention).
11
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{self, Read};
15
16use flate2::read::ZlibDecoder;
17use flate2::{Decompress, FlushDecompress, Status};
18use sha1::{Digest, Sha1};
19use sha2::{Digest as Sha256Digest, Sha256};
20
21use crate::error::{Error, Result};
22use crate::gitmodules;
23use crate::index::MODE_GITLINK;
24use crate::objects::{parse_commit, parse_tag, parse_tree, HashAlgo, Object, ObjectId, ObjectKind};
25use crate::odb::Odb;
26
27/// Incremental pack checksum hasher matching the repository hash algorithm.
28#[derive(Clone)]
29enum PackHasher {
30    Sha1(Sha1),
31    Sha256(Sha256),
32}
33
34impl PackHasher {
35    fn new(algo: HashAlgo) -> Self {
36        match algo {
37            HashAlgo::Sha1 => Self::Sha1(Sha1::new()),
38            HashAlgo::Sha256 => Self::Sha256(Sha256::new()),
39        }
40    }
41
42    fn update(&mut self, data: &[u8]) {
43        match self {
44            Self::Sha1(h) => Digest::update(h, data),
45            Self::Sha256(h) => Sha256Digest::update(h, data),
46        }
47    }
48
49    fn finalize(self) -> Vec<u8> {
50        match self {
51            Self::Sha1(h) => h.finalize().to_vec(),
52            Self::Sha256(h) => h.finalize().to_vec(),
53        }
54    }
55
56    fn len(&self) -> usize {
57        match self {
58            Self::Sha1(_) => 20,
59            Self::Sha256(_) => 32,
60        }
61    }
62}
63
64/// Compute an object id for `data` of the given `kind` using `algo`, without an
65/// `Odb` in scope (Git store form: `"<kind> <len>\0<data>"`).
66fn hash_object_with(algo: HashAlgo, kind: ObjectKind, data: &[u8]) -> ObjectId {
67    let header = format!("{kind} {}\0", data.len());
68    let mut h = PackHasher::new(algo);
69    h.update(header.as_bytes());
70    h.update(data);
71    ObjectId::from_bytes(&h.finalize()).expect("digest is a valid OID width")
72}
73
74/// Options controlling `unpack-objects` behaviour.
75#[derive(Debug, Default)]
76pub struct UnpackOptions {
77    /// Validate and decompress objects but do not write them to the ODB.
78    pub dry_run: bool,
79    /// Suppress informational output.
80    pub quiet: bool,
81    /// Reject packs whose commits/trees/tags reference missing objects.
82    pub strict: bool,
83    /// Object IDs that strict connectivity may treat as promised by a configured promisor remote.
84    pub allowed_missing: HashSet<ObjectId>,
85    /// Whether strict connectivity may tolerate references to missing objects in a promisor repo.
86    pub allow_promisor_missing_references: bool,
87    /// Maximum number of raw pack bytes that may be consumed (including the 20-byte trailer).
88    ///
89    /// Matches Git's `unpack-objects --max-input-size` / `receive.maxInputSize`: counts every
90    /// byte read from the pack stream after crossing the limit. `None` means no limit.
91    pub max_input_bytes: Option<u64>,
92    /// Commit OIDs that are shallow boundaries (grafts): their parents are intentionally absent and
93    /// must not be required during the `--strict` connectivity walk.
94    ///
95    /// Mirrors `unpack-objects --shallow-file <file>` in upstream `receive-pack`, where the shallow
96    /// file lists the commits whose parent objects were deliberately not transferred.
97    pub shallow_boundaries: HashSet<ObjectId>,
98}
99
100/// A delta that could not yet be resolved because its base was not yet known.
101struct PendingDelta {
102    /// Byte offset of this object in the pack stream (used to anchor
103    /// `OFS_DELTA` back-references from later objects).
104    offset: usize,
105    /// For `REF_DELTA`: SHA-1 of the base object.
106    base_oid: Option<ObjectId>,
107    /// For `OFS_DELTA`: absolute byte offset of the base object.
108    base_offset: Option<usize>,
109    /// Decompressed delta data.
110    delta_data: Vec<u8>,
111}
112
113/// Unpack a pack stream from `reader` into `odb`.
114///
115/// Reads the complete pack from `reader`, validates the trailing SHA-1
116/// checksum, unpacks all objects (including full delta-chain resolution), and —
117/// unless [`UnpackOptions::dry_run`] is set — writes each object to `odb`.
118///
119/// Returns the total number of objects processed.
120///
121/// # Errors
122///
123/// - [`Error::CorruptObject`] — invalid pack format, checksum mismatch, or
124///   unresolvable delta chains.
125/// - [`Error::Io`] — I/O failure reading from `reader`.
126/// - [`Error::Zlib`] — decompression failure.
127pub fn unpack_objects(reader: &mut dyn Read, odb: &Odb, opts: &UnpackOptions) -> Result<usize> {
128    /// Blobs larger than this stay on disk only (after write) so huge packs do
129    /// not retain every blob in RAM. Smaller objects are kept for delta bases
130    /// and `--strict` graph walks without extra ODB reads.
131    const MAX_RETAIN_BYTES: usize = 1024 * 1024;
132
133    let algo = odb.hash_algo();
134    let mut rd = StreamingPackReader::new(reader, opts.max_input_bytes, algo);
135
136    // Validate magic and version.
137    let sig = rd.read_exact_n(4)?;
138    if sig != b"PACK" {
139        return Err(Error::CorruptObject(
140            "not a pack stream: invalid signature".to_owned(),
141        ));
142    }
143    let version = rd.read_u32_be()?;
144    if version != 2 && version != 3 {
145        return Err(Error::CorruptObject(format!(
146            "unsupported pack version {version}"
147        )));
148    }
149    let nr_objects = rd.read_u32_be()? as usize;
150
151    // pack-stream offset → resolved object (see [`PackedObjectEntry`]).
152    let mut by_offset: HashMap<usize, PackedObjectEntry> = HashMap::new();
153    // ObjectId → in-pack object for REF_DELTA resolution and strict checks.
154    let mut by_oid: HashMap<ObjectId, PackedObjectEntry> = HashMap::new();
155
156    let mut pending: Vec<PendingDelta> = Vec::new();
157    let mut count = 0usize;
158
159    for _ in 0..nr_objects {
160        let obj_offset = rd.stream_pos();
161        let (type_code, size) = rd.read_type_size()?;
162
163        match type_code {
164            1..=4 => {
165                let kind = type_code_to_kind(type_code)?;
166                let data = rd.decompress(size)?;
167                let oid = write_or_hash(kind, &data, odb, opts.dry_run)?;
168                let entry = packed_entry_after_write(kind, data, oid, odb, opts, MAX_RETAIN_BYTES);
169                by_offset.insert(obj_offset, entry.clone());
170                by_oid.insert(oid, entry);
171                count += 1;
172            }
173            6 => {
174                // OFS_DELTA: base at a negative encoded offset from this object.
175                let neg = rd.read_ofs_neg_offset()?;
176                let base_offset = obj_offset.checked_sub(neg).ok_or_else(|| {
177                    Error::CorruptObject("ofs-delta base offset underflow".to_owned())
178                })?;
179                let delta_data = rd.decompress(size)?;
180                pending.push(PendingDelta {
181                    offset: obj_offset,
182                    base_oid: None,
183                    base_offset: Some(base_offset),
184                    delta_data,
185                });
186            }
187            7 => {
188                // REF_DELTA: base identified by its object id (hash-width bytes).
189                let base_bytes = rd.read_exact_n(algo.len())?;
190                let base_oid = ObjectId::from_bytes(&base_bytes)?;
191                let delta_data = rd.decompress(size)?;
192                pending.push(PendingDelta {
193                    offset: obj_offset,
194                    base_oid: Some(base_oid),
195                    base_offset: None,
196                    delta_data,
197                });
198            }
199            other => {
200                return Err(Error::CorruptObject(format!(
201                    "unknown packed-object type {other}"
202                )))
203            }
204        }
205    }
206
207    // Trailing pack checksum (hash of all preceding bytes); not included in the hash.
208    let digest = rd.finalize_hasher();
209    let trailing = rd.read_trailer()?;
210    if digest != trailing {
211        return Err(Error::CorruptObject(
212            "pack trailing checksum mismatch".to_owned(),
213        ));
214    }
215
216    // Resolve pending deltas iteratively.  Each pass resolves all deltas whose
217    // base is now known; repeat until none remain or we stall (corrupt pack).
218    let mut remaining = pending;
219    loop {
220        if remaining.is_empty() {
221            break;
222        }
223        let before = remaining.len();
224        let mut still_pending: Vec<PendingDelta> = Vec::new();
225
226        for delta in remaining {
227            let base_res: Option<Result<(ObjectKind, Cow<'_, [u8]>)>> =
228                if let Some(base_off) = delta.base_offset {
229                    by_offset
230                        .get(&base_off)
231                        .map(|e| entry_object_bytes(e, odb).map(|d| (e.kind(), d)))
232                } else if let Some(ref base_id) = delta.base_oid {
233                    if let Some(e) = by_oid.get(base_id) {
234                        Some(entry_object_bytes(e, odb).map(|d| (e.kind(), d)))
235                    } else if !opts.dry_run {
236                        odb.read(base_id)
237                            .ok()
238                            .map(|obj| Ok((obj.kind, Cow::Owned(obj.data))))
239                    } else {
240                        None
241                    }
242                } else {
243                    None
244                };
245
246            match base_res {
247                Some(Ok((base_kind, base_data))) => {
248                    let result = apply_delta(base_data.as_ref(), &delta.delta_data)?;
249                    let oid = write_or_hash(base_kind, &result, odb, opts.dry_run)?;
250                    let new_entry = packed_entry_after_write(
251                        base_kind,
252                        result,
253                        oid,
254                        odb,
255                        opts,
256                        MAX_RETAIN_BYTES,
257                    );
258                    by_offset.insert(delta.offset, new_entry.clone());
259                    by_oid.insert(oid, new_entry);
260                    count += 1;
261                }
262                Some(Err(e)) => return Err(e),
263                None => still_pending.push(delta),
264            }
265        }
266
267        remaining = still_pending;
268        if remaining.len() == before {
269            return Err(Error::CorruptObject(format!(
270                "{} delta(s) could not be resolved",
271                remaining.len()
272            )));
273        }
274    }
275
276    if opts.strict {
277        let mut dot_fsck_map: HashMap<ObjectId, (ObjectKind, Vec<u8>)> =
278            HashMap::with_capacity(by_oid.len());
279        for (oid, entry) in &by_oid {
280            let kind = entry.kind();
281            let data = match entry {
282                PackedObjectEntry::InMemory { data, .. } => data.clone(),
283                PackedObjectEntry::BlobOnDisk { oid: blob_oid } => odb.read(blob_oid)?.data,
284            };
285            dot_fsck_map.insert(*oid, (kind, data));
286        }
287        gitmodules::verify_packed_dot_special(&dot_fsck_map)?;
288        strict_verify_packed_references_map(
289            Some(odb),
290            &by_oid,
291            &opts.allowed_missing,
292            opts.allow_promisor_missing_references,
293            &opts.shallow_boundaries,
294        )?;
295    }
296
297    Ok(count)
298}
299
300/// Resolved non-delta object: either full bytes in memory or a large blob on disk.
301#[derive(Debug, Clone)]
302enum PackedObjectEntry {
303    InMemory { kind: ObjectKind, data: Vec<u8> },
304    BlobOnDisk { oid: ObjectId },
305}
306
307impl PackedObjectEntry {
308    fn kind(&self) -> ObjectKind {
309        match self {
310            PackedObjectEntry::InMemory { kind, .. } => *kind,
311            PackedObjectEntry::BlobOnDisk { .. } => ObjectKind::Blob,
312        }
313    }
314}
315
316fn packed_entry_after_write(
317    kind: ObjectKind,
318    data: Vec<u8>,
319    oid: ObjectId,
320    _odb: &Odb,
321    opts: &UnpackOptions,
322    max_retain: usize,
323) -> PackedObjectEntry {
324    if !opts.dry_run && kind == ObjectKind::Blob && data.len() > max_retain {
325        PackedObjectEntry::BlobOnDisk { oid }
326    } else {
327        PackedObjectEntry::InMemory { kind, data }
328    }
329}
330
331fn entry_object_bytes<'a>(entry: &'a PackedObjectEntry, odb: &Odb) -> Result<Cow<'a, [u8]>> {
332    match entry {
333        PackedObjectEntry::InMemory { data, .. } => Ok(Cow::Borrowed(data.as_slice())),
334        PackedObjectEntry::BlobOnDisk { oid } => Ok(Cow::Owned(odb.read(oid)?.data)),
335    }
336}
337
338fn strict_verify_packed_references_map(
339    odb: Option<&Odb>,
340    pack: &HashMap<ObjectId, PackedObjectEntry>,
341    allowed_missing: &HashSet<ObjectId>,
342    allow_promisor_missing_references: bool,
343    shallow_boundaries: &HashSet<ObjectId>,
344) -> Result<()> {
345    for (oid, entry) in pack {
346        match entry {
347            PackedObjectEntry::BlobOnDisk { .. } => {}
348            PackedObjectEntry::InMemory { kind, data } => match kind {
349                ObjectKind::Tree => {
350                    for e in parse_tree(data)? {
351                        // Gitlink (submodule) entries point at commits that live
352                        // in the submodule repository, not the superproject's
353                        // pack/ODB. Skip them in the connectivity walk, matching
354                        // upstream git (git/fsck.c:374 `if (S_ISGITLINK) continue;`).
355                        if e.mode == MODE_GITLINK {
356                            continue;
357                        }
358                        if !strict_ref_resolves_map(
359                            &e.oid,
360                            pack,
361                            odb,
362                            allowed_missing,
363                            allow_promisor_missing_references,
364                        ) {
365                            return Err(Error::CorruptObject(format!(
366                                "strict: missing object {} referenced by tree",
367                                e.oid.to_hex()
368                            )));
369                        }
370                    }
371                }
372                ObjectKind::Commit => {
373                    let c = parse_commit(data)?;
374                    if !strict_ref_resolves_map(
375                        &c.tree,
376                        pack,
377                        odb,
378                        allowed_missing,
379                        allow_promisor_missing_references,
380                    ) {
381                        return Err(Error::CorruptObject(format!(
382                            "strict: missing tree {} referenced by commit",
383                            c.tree.to_hex()
384                        )));
385                    }
386                    // A commit recorded as a shallow boundary (graft) has its parents intentionally
387                    // absent — skip parent connectivity for it, matching unpack-objects run with a
388                    // `--shallow-file` listing this commit.
389                    if shallow_boundaries.contains(oid) {
390                        continue;
391                    }
392                    for p in &c.parents {
393                        if !strict_ref_resolves_map(
394                            p,
395                            pack,
396                            odb,
397                            allowed_missing,
398                            allow_promisor_missing_references,
399                        ) {
400                            return Err(Error::CorruptObject(format!(
401                                "strict: missing parent {} referenced by commit",
402                                p.to_hex()
403                            )));
404                        }
405                    }
406                }
407                ObjectKind::Tag => {
408                    let t = parse_tag(data)?;
409                    if !strict_ref_resolves_map(
410                        &t.object,
411                        pack,
412                        odb,
413                        allowed_missing,
414                        allow_promisor_missing_references,
415                    ) {
416                        return Err(Error::CorruptObject(format!(
417                            "strict: missing object {} referenced by tag",
418                            t.object.to_hex()
419                        )));
420                    }
421                }
422                ObjectKind::Blob => {}
423            },
424        }
425    }
426    Ok(())
427}
428
429fn strict_ref_resolves_map(
430    oid: &ObjectId,
431    pack: &HashMap<ObjectId, PackedObjectEntry>,
432    odb: Option<&Odb>,
433    allowed_missing: &HashSet<ObjectId>,
434    allow_promisor_missing_references: bool,
435) -> bool {
436    pack.contains_key(oid)
437        || allowed_missing.contains(oid)
438        || odb.is_some_and(|o| o.exists(oid))
439        || allow_promisor_missing_references
440}
441
442fn strict_ref_resolves(
443    oid: &ObjectId,
444    pack: &std::collections::HashMap<ObjectId, (ObjectKind, Vec<u8>)>,
445    odb: Option<&Odb>,
446) -> bool {
447    pack.contains_key(oid) || odb.is_some_and(|o| o.exists(oid))
448}
449
450/// Verifies that references from commits, trees, and tags resolve to objects present in `pack`
451/// or, when `odb` is [`Some`], to loose objects in that database.
452///
453/// Use [`None`] for `odb` when indexing or unpacking in a context with no repository (Git allows
454/// `index-pack --strict` outside a work tree when the pack is self-contained).
455pub fn strict_verify_packed_references(
456    odb: Option<&Odb>,
457    pack: &HashMap<ObjectId, (ObjectKind, Vec<u8>)>,
458) -> Result<()> {
459    for (kind, data) in pack.values() {
460        match kind {
461            ObjectKind::Tree => {
462                for e in parse_tree(data)? {
463                    // Gitlink (submodule) entries point at commits that live in
464                    // the submodule repository, not this pack/ODB. Skip them in
465                    // the connectivity walk, matching upstream git
466                    // (git/fsck.c:374 `if (S_ISGITLINK) continue;`).
467                    if e.mode == MODE_GITLINK {
468                        continue;
469                    }
470                    if !strict_ref_resolves(&e.oid, pack, odb) {
471                        return Err(Error::CorruptObject(format!(
472                            "strict: missing object {} referenced by tree",
473                            e.oid.to_hex()
474                        )));
475                    }
476                }
477            }
478            ObjectKind::Commit => {
479                let c = parse_commit(data)?;
480                if !strict_ref_resolves(&c.tree, pack, odb) {
481                    return Err(Error::CorruptObject(format!(
482                        "strict: missing tree {} referenced by commit",
483                        c.tree.to_hex()
484                    )));
485                }
486                for p in &c.parents {
487                    if !strict_ref_resolves(p, pack, odb) {
488                        return Err(Error::CorruptObject(format!(
489                            "strict: missing parent {} referenced by commit",
490                            p.to_hex()
491                        )));
492                    }
493                }
494            }
495            ObjectKind::Tag => {
496                let t = parse_tag(data)?;
497                if !strict_ref_resolves(&t.object, pack, odb) {
498                    return Err(Error::CorruptObject(format!(
499                        "strict: missing object {} referenced by tag",
500                        t.object.to_hex()
501                    )));
502                }
503            }
504            ObjectKind::Blob => {}
505        }
506    }
507    Ok(())
508}
509
510/// Whether `data` is a *thin* pack — i.e. it contains a `ref-delta` (type 7) whose base object is
511/// not itself present in the pack. `git pack-objects --thin` produces such packs; a receiver that
512/// rejects thin packs (`receive-pack --reject-thin-pack-for-testing`) uses this to refuse them.
513///
514/// Conservative: any parse error makes this return `false` (treat as non-thin) so a malformed pack
515/// is handled by the normal ingestion path rather than mislabeled.
516pub fn pack_is_thin(data: &[u8], algo: HashAlgo) -> bool {
517    pack_is_thin_inner(data, algo).unwrap_or(false)
518}
519
520fn pack_is_thin_inner(data: &[u8], algo: HashAlgo) -> Result<bool> {
521    let mut rd = PackReader::new(data.to_vec());
522    if rd.read_exact(4)? != b"PACK" {
523        return Ok(false);
524    }
525    let _version = rd.read_u32_be()?;
526    let nr_objects = rd.read_u32_be()? as usize;
527
528    let mut in_pack: HashSet<ObjectId> = HashSet::new();
529    let mut ref_delta_bases: Vec<ObjectId> = Vec::new();
530    for _ in 0..nr_objects {
531        let obj_offset = rd.pos;
532        let (type_code, size) = rd.read_type_size()?;
533        match type_code {
534            1..=4 => {
535                let kind = type_code_to_kind(type_code)?;
536                let obj_data = rd.decompress(size)?;
537                in_pack.insert(hash_object_with(algo, kind, &obj_data));
538            }
539            6 => {
540                // ofs-delta: base is always in-pack (referenced by relative offset).
541                let _neg = rd.read_ofs_neg_offset()?;
542                let _ = obj_offset;
543                let _ = rd.decompress(size)?;
544            }
545            7 => {
546                let base_bytes = rd.read_exact(algo.len())?;
547                ref_delta_bases.push(ObjectId::from_bytes(base_bytes)?);
548                let _ = rd.decompress(size)?;
549            }
550            _ => return Ok(false),
551        }
552    }
553    // Thin iff any ref-delta points at a base that is not packed alongside it.
554    Ok(ref_delta_bases.iter().any(|b| !in_pack.contains(b)))
555}
556
557/// Parse a pack byte stream and return every resolved object (after delta resolution) keyed by OID.
558///
559/// Does not write to any object database. Used for receive-pack connectivity checks before
560/// applying a push to the permanent ODB.
561///
562/// Thin-pack bases may be resolved from `odb` when they are not present in the pack.
563pub fn pack_bytes_to_object_map(data: &[u8], odb: &Odb) -> Result<HashMap<ObjectId, Object>> {
564    let rd = PackReader::new(data.to_vec());
565    build_pack_object_map(rd, odb)
566}
567
568fn build_pack_object_map(mut rd: PackReader, odb: &Odb) -> Result<HashMap<ObjectId, Object>> {
569    let algo = odb.hash_algo();
570    let sig = rd.read_exact(4)?;
571    if sig != b"PACK" {
572        return Err(Error::CorruptObject(
573            "not a pack stream: invalid signature".to_owned(),
574        ));
575    }
576    let version = rd.read_u32_be()?;
577    if version != 2 && version != 3 {
578        return Err(Error::CorruptObject(format!(
579            "unsupported pack version {version}"
580        )));
581    }
582    let nr_objects = rd.read_u32_be()? as usize;
583
584    let mut by_offset: HashMap<usize, (ObjectKind, Vec<u8>)> = HashMap::new();
585    let mut by_oid: HashMap<ObjectId, (ObjectKind, Vec<u8>)> = HashMap::new();
586    let mut pending: Vec<PendingDelta> = Vec::new();
587
588    fn base_from_pack_or_odb(
589        by_oid: &HashMap<ObjectId, (ObjectKind, Vec<u8>)>,
590        odb: &Odb,
591        id: &ObjectId,
592    ) -> Option<(ObjectKind, Vec<u8>)> {
593        if let Some(e) = by_oid.get(id) {
594            return Some(e.clone());
595        }
596        odb.read(id).ok().map(|o| (o.kind, o.data))
597    }
598
599    for _ in 0..nr_objects {
600        let obj_offset = rd.pos;
601        let (type_code, size) = rd.read_type_size()?;
602
603        match type_code {
604            1..=4 => {
605                let kind = type_code_to_kind(type_code)?;
606                let data = rd.decompress(size)?;
607                let oid = odb.hash(kind, &data);
608                by_offset.insert(obj_offset, (kind, data.clone()));
609                by_oid.insert(oid, (kind, data));
610            }
611            6 => {
612                let neg = rd.read_ofs_neg_offset()?;
613                let base_offset = obj_offset.checked_sub(neg).ok_or_else(|| {
614                    Error::CorruptObject("ofs-delta base offset underflow".to_owned())
615                })?;
616                let delta_data = rd.decompress(size)?;
617                pending.push(PendingDelta {
618                    offset: obj_offset,
619                    base_oid: None,
620                    base_offset: Some(base_offset),
621                    delta_data,
622                });
623            }
624            7 => {
625                let base_bytes = rd.read_exact(algo.len())?;
626                let base_oid = ObjectId::from_bytes(base_bytes)?;
627                let delta_data = rd.decompress(size)?;
628                pending.push(PendingDelta {
629                    offset: obj_offset,
630                    base_oid: Some(base_oid),
631                    base_offset: None,
632                    delta_data,
633                });
634            }
635            other => {
636                return Err(Error::CorruptObject(format!(
637                    "unknown packed-object type {other}"
638                )))
639            }
640        }
641    }
642
643    let consumed = rd.pos;
644    {
645        let mut hasher = PackHasher::new(algo);
646        hasher.update(&rd.data[..consumed]);
647        let digest = hasher.finalize();
648        let trailing = rd.read_exact(algo.len())?;
649        if digest.as_slice() != trailing {
650            return Err(Error::CorruptObject(
651                "pack trailing checksum mismatch".to_owned(),
652            ));
653        }
654    }
655
656    let mut remaining = pending;
657    loop {
658        if remaining.is_empty() {
659            break;
660        }
661        let before = remaining.len();
662        let mut still_pending: Vec<PendingDelta> = Vec::new();
663
664        for delta in remaining {
665            let base = if let Some(base_off) = delta.base_offset {
666                by_offset.get(&base_off).cloned()
667            } else if let Some(ref base_id) = delta.base_oid {
668                base_from_pack_or_odb(&by_oid, odb, base_id)
669            } else {
670                None
671            };
672
673            if let Some((base_kind, base_data)) = base {
674                let result = apply_delta(&base_data, &delta.delta_data)?;
675                let oid = odb.hash(base_kind, &result);
676                by_offset.insert(delta.offset, (base_kind, result.clone()));
677                by_oid.insert(oid, (base_kind, result));
678            } else {
679                still_pending.push(delta);
680            }
681        }
682
683        remaining = still_pending;
684        if remaining.len() == before {
685            return Err(Error::CorruptObject(format!(
686                "{} delta(s) could not be resolved",
687                remaining.len()
688            )));
689        }
690    }
691
692    Ok(by_oid
693        .into_iter()
694        .map(|(oid, (kind, data))| (oid, Object::new(kind, data)))
695        .collect())
696}
697
698/// Either write `data` as a loose object (if `!dry_run`) or just compute its
699/// [`ObjectId`] without touching the filesystem.
700fn write_or_hash(kind: ObjectKind, data: &[u8], odb: &Odb, dry_run: bool) -> Result<ObjectId> {
701    if dry_run {
702        Ok(odb.hash(kind, data))
703    } else {
704        // Always materialize into this ODB: objects reachable only via alternates must still be
705        // written locally (matches git unpack-objects; t5519-push-alternates).
706        odb.write_local(kind, data)
707    }
708}
709
710/// Convert a pack object type code to an [`ObjectKind`].
711fn type_code_to_kind(code: u8) -> Result<ObjectKind> {
712    match code {
713        1 => Ok(ObjectKind::Commit),
714        2 => Ok(ObjectKind::Tree),
715        3 => Ok(ObjectKind::Blob),
716        4 => Ok(ObjectKind::Tag),
717        _ => Err(Error::CorruptObject(format!(
718            "type code {code} is not a regular object type"
719        ))),
720    }
721}
722
723/// Low-level cursor over a buffered pack byte stream (in-memory pack parsing).
724struct PackReader {
725    data: Vec<u8>,
726    pos: usize,
727}
728
729impl PackReader {
730    fn new(data: Vec<u8>) -> Self {
731        Self { data, pos: 0 }
732    }
733
734    /// Read exactly `n` bytes and advance the cursor, returning a slice into
735    /// the internal buffer.
736    fn read_exact(&mut self, n: usize) -> Result<&[u8]> {
737        if self.pos + n > self.data.len() {
738            return Err(Error::CorruptObject(format!(
739                "pack stream truncated: need {n} bytes at offset {}",
740                self.pos
741            )));
742        }
743        let slice = &self.data[self.pos..self.pos + n];
744        self.pos += n;
745        Ok(slice)
746    }
747
748    /// Read a single byte and advance the cursor.
749    fn read_byte(&mut self) -> Result<u8> {
750        if self.pos >= self.data.len() {
751            return Err(Error::CorruptObject(
752                "unexpected end of pack stream".to_owned(),
753            ));
754        }
755        let b = self.data[self.pos];
756        self.pos += 1;
757        Ok(b)
758    }
759
760    /// Read a big-endian `u32`.
761    fn read_u32_be(&mut self) -> Result<u32> {
762        let bytes = self.read_exact(4)?;
763        Ok(u32::from_be_bytes(bytes.try_into().map_err(|_| {
764            Error::CorruptObject("u32 read failed".to_owned())
765        })?))
766    }
767
768    /// Read the packed-object type + size header (variable-length big-endian
769    /// encoding with the type in bits 4-6 of the first byte).
770    ///
771    /// Returns `(type_code, uncompressed_size)`.
772    fn read_type_size(&mut self) -> Result<(u8, usize)> {
773        let c = self.read_byte()?;
774        let type_code = (c >> 4) & 0x7;
775        let mut size = (c & 0x0f) as usize;
776        let mut shift = 4u32;
777        let mut cur = c;
778        while cur & 0x80 != 0 {
779            cur = self.read_byte()?;
780            size |= ((cur & 0x7f) as usize) << shift;
781            shift += 7;
782        }
783        Ok((type_code, size))
784    }
785
786    /// Read an `OFS_DELTA` negative-offset value.
787    ///
788    /// The encoding uses a big-endian variable-length integer with a +1 bias
789    /// on each continuation byte, yielding values ≥ 1.
790    fn read_ofs_neg_offset(&mut self) -> Result<usize> {
791        let mut c = self.read_byte()?;
792        let mut value = (c & 0x7f) as usize;
793        while c & 0x80 != 0 {
794            c = self.read_byte()?;
795            value = (value + 1) << 7 | (c & 0x7f) as usize;
796        }
797        Ok(value)
798    }
799
800    /// Decompress zlib-compressed data starting at the current cursor position.
801    ///
802    /// Advances the cursor by exactly the number of compressed bytes consumed.
803    /// Returns an error if the decompressed length differs from `expected_size`.
804    fn decompress(&mut self, expected_size: usize) -> Result<Vec<u8>> {
805        let slice = &self.data[self.pos..];
806        let mut decoder = ZlibDecoder::new(slice);
807        let mut out = Vec::with_capacity(expected_size);
808        decoder
809            .read_to_end(&mut out)
810            .map_err(|e| Error::Zlib(e.to_string()))?;
811        if out.len() != expected_size {
812            return Err(Error::CorruptObject(format!(
813                "decompressed {} bytes but expected {}",
814                out.len(),
815                expected_size
816            )));
817        }
818        self.pos += decoder.total_in() as usize;
819        Ok(out)
820    }
821}
822
823fn io_to_corrupt_eof(e: io::Error, stream_pos: usize, context: &str) -> Error {
824    if e.kind() == io::ErrorKind::UnexpectedEof {
825        Error::CorruptObject(format!(
826            "pack stream truncated ({context}) at offset {stream_pos}"
827        ))
828    } else {
829        Error::Io(e)
830    }
831}
832
/// Streaming cursor over a pack file: hashes body bytes incrementally (no full-buffer read).
///
/// Raw pack bytes are either consumed as object headers (via [`Self::read_byte`]) or as zlib
/// payloads.  Zlib decoders may read ahead; overflow bytes stay in [`Self::pending`] so the next
/// object header or zlib stream starts at the correct offset.
struct StreamingPackReader<'a> {
    /// Underlying byte source the pack is read from.
    inner: &'a mut dyn Read,
    /// Running pack checksum; fed only with bytes that have actually been consumed.
    pack_hasher: PackHasher,
    /// Number of pack bytes consumed so far (headers, zlib payloads and, once read, the trailer).
    stream_pos: usize,
    /// Optional cap on `stream_pos`; exceeding it is reported by `enforce_max_input`.
    max_input_bytes: Option<u64>,
    /// Lookahead: bytes already read from `inner` but not yet consumed by a parsing step.
    ///
    /// These bytes are NOT hashed yet; they are fed to `pack_hasher` (and counted in
    /// `stream_pos`) only when a header read or zlib stream actually consumes them.
    pending: Vec<u8>,
}
847
848impl<'a> StreamingPackReader<'a> {
849    fn new(inner: &'a mut dyn Read, max_input_bytes: Option<u64>, algo: HashAlgo) -> Self {
850        Self {
851            inner,
852            pack_hasher: PackHasher::new(algo),
853            stream_pos: 0,
854            max_input_bytes,
855            pending: Vec::new(),
856        }
857    }
858
    /// Current value of the consumed-byte counter (`self.stream_pos`).
    fn stream_pos(&self) -> usize {
        self.stream_pos
    }
862
863    fn enforce_max_input(&self) -> Result<()> {
864        if let Some(limit) = self.max_input_bytes {
865            let pos = u64::try_from(self.stream_pos)
866                .map_err(|_| Error::CorruptObject("pack stream position overflow".to_owned()))?;
867            if pos > limit {
868                return Err(Error::CorruptObject(
869                    "pack exceeds maximum allowed size".to_owned(),
870                ));
871            }
872        }
873        Ok(())
874    }
875
876    /// Read pack-body bytes (hashed). Used for headers and non-zlib payload reads only.
877    fn read_from_source(&mut self, buf: &mut [u8]) -> Result<usize> {
878        let n = if !self.pending.is_empty() {
879            let take = buf.len().min(self.pending.len());
880            buf[..take].copy_from_slice(&self.pending[..take]);
881            self.pending.drain(..take);
882            take
883        } else {
884            self.inner.read(buf).map_err(Error::Io)?
885        };
886        if n > 0 {
887            self.pack_hasher.update(&buf[..n]);
888            self.stream_pos += n;
889            self.enforce_max_input()?;
890        }
891        Ok(n)
892    }
893
894    fn read_byte(&mut self) -> Result<u8> {
895        let mut b = [0u8; 1];
896        let n = self.read_from_source(&mut b)?;
897        if n == 0 {
898            return Err(Error::CorruptObject(format!(
899                "pack stream truncated (read byte) at offset {}",
900                self.stream_pos
901            )));
902        }
903        Ok(b[0])
904    }
905
906    fn read_exact_n(&mut self, n: usize) -> Result<Vec<u8>> {
907        let mut v = vec![0u8; n];
908        let mut got = 0usize;
909        while got < n {
910            let m = self.read_from_source(&mut v[got..n])?;
911            if m == 0 {
912                return Err(Error::CorruptObject(format!(
913                    "pack stream truncated (read exact) at offset {}",
914                    self.stream_pos
915                )));
916            }
917            got += m;
918        }
919        Ok(v)
920    }
921
922    fn read_u32_be(&mut self) -> Result<u32> {
923        let mut b = [0u8; 4];
924        let mut got = 0usize;
925        while got < 4 {
926            let m = self.read_from_source(&mut b[got..4])?;
927            if m == 0 {
928                return Err(Error::CorruptObject(format!(
929                    "pack stream truncated (read u32) at offset {}",
930                    self.stream_pos
931                )));
932            }
933            got += m;
934        }
935        Ok(u32::from_be_bytes(b))
936    }
937
938    fn read_type_size(&mut self) -> Result<(u8, usize)> {
939        let c = self.read_byte()?;
940        let type_code = (c >> 4) & 0x7;
941        let mut size = (c & 0x0f) as usize;
942        let mut shift = 4u32;
943        let mut cur = c;
944        while cur & 0x80 != 0 {
945            cur = self.read_byte()?;
946            size |= ((cur & 0x7f) as usize) << shift;
947            shift += 7;
948        }
949        Ok((type_code, size))
950    }
951
952    fn read_ofs_neg_offset(&mut self) -> Result<usize> {
953        let mut c = self.read_byte()?;
954        let mut value = (c & 0x7f) as usize;
955        while c & 0x80 != 0 {
956            c = self.read_byte()?;
957            value = (value + 1) << 7 | (c & 0x7f) as usize;
958        }
959        Ok(value)
960    }
961
962    /// Pull zlib-compressed bytes until one object inflates to `expected_size` bytes.
963    ///
964    /// Bytes read from `inner` into `pending` are not hashed until we know how many belong to the
965    /// zlib stream (`total_in()`). Lookahead past the zlib end (including the 20-byte pack
966    /// trailer) must never be fed to the pack checksum.
967    ///
968    /// When the pack arrives in small chunks (e.g. side-band-64k from `upload-pack`), `flate2` may
969    /// return an error before the full deflate stream is in `pending`. Retry after reading more
970    /// from `inner` (same idea as [`PackReader::decompress`], which sees the whole zlib at once).
971    fn decompress(&mut self, expected_size: usize) -> Result<Vec<u8>> {
972        // `Read::read_exact` into an empty buffer returns `Ok` immediately without touching the
973        // decoder, so a 0-byte packed object would leave the zlib header in `pending` and desync
974        // the pack stream (bundle / clone unpack). Always run the zlib decoder once.
975        if expected_size == 0 {
976            const CHUNK: usize = 64 * 1024;
977            let mut scratch = [0u8; CHUNK];
978            loop {
979                let mut cursor = std::io::Cursor::new(self.pending.as_slice());
980                let mut z = ZlibDecoder::new(&mut cursor);
981                let mut sink = [0u8; 1];
982                match z.read(&mut sink) {
983                    Ok(0) => {
984                        let consumed = z.total_in() as usize;
985                        if consumed > self.pending.len() {
986                            return Err(Error::CorruptObject(
987                                "zlib total_in exceeds pending buffer".to_owned(),
988                            ));
989                        }
990                        if consumed == 0 {
991                            let n = self.inner.read(&mut scratch).map_err(Error::Io)?;
992                            if n == 0 {
993                                return Err(Error::CorruptObject(format!(
994                                    "pack stream truncated (zlib) at offset {}",
995                                    self.stream_pos
996                                )));
997                            }
998                            self.pending.extend_from_slice(&scratch[..n]);
999                            continue;
1000                        }
1001                        self.pack_hasher.update(&self.pending[..consumed]);
1002                        self.stream_pos += consumed;
1003                        self.pending.drain(..consumed);
1004                        self.enforce_max_input()?;
1005                        return Ok(Vec::new());
1006                    }
1007                    Ok(_) => {
1008                        return Err(Error::CorruptObject(
1009                            "0-byte packed object inflated to non-empty output".to_owned(),
1010                        ));
1011                    }
1012                    Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => {
1013                        let n = self.inner.read(&mut scratch).map_err(Error::Io)?;
1014                        if n == 0 {
1015                            return Err(Error::CorruptObject(format!(
1016                                "pack stream truncated (zlib) at offset {}",
1017                                self.stream_pos
1018                            )));
1019                        }
1020                        self.pending.extend_from_slice(&scratch[..n]);
1021                    }
1022                    Err(e) => return Err(Error::Zlib(e.to_string())),
1023                }
1024            }
1025        }
1026
1027        const CHUNK: usize = 64 * 1024;
1028        let mut scratch = [0u8; CHUNK];
1029
1030        let mut out = vec![0u8; expected_size];
1031        let mut z = Decompress::new(true);
1032        let mut out_pos = 0usize;
1033        let mut eof = false;
1034        loop {
1035            if self.pending.is_empty() && !eof {
1036                let n = self.inner.read(&mut scratch).map_err(Error::Io)?;
1037                if n == 0 {
1038                    eof = true;
1039                } else {
1040                    self.pending.extend_from_slice(&scratch[..n]);
1041                }
1042            }
1043
1044            let flush = if eof && self.pending.is_empty() {
1045                FlushDecompress::Finish
1046            } else {
1047                FlushDecompress::None
1048            };
1049
1050            let before_in = z.total_in();
1051            let before_out = z.total_out();
1052            let status = z
1053                .decompress(self.pending.as_slice(), &mut out[out_pos..], flush)
1054                .map_err(|e| Error::Zlib(e.to_string()))?;
1055            let consumed = (z.total_in() - before_in) as usize;
1056            if consumed > self.pending.len() {
1057                return Err(Error::CorruptObject(
1058                    "zlib consumed more than pending buffer".to_owned(),
1059                ));
1060            }
1061            self.pack_hasher.update(&self.pending[..consumed]);
1062            self.stream_pos += consumed;
1063            self.pending.drain(..consumed);
1064            self.enforce_max_input()?;
1065            out_pos += (z.total_out() - before_out) as usize;
1066
1067            match status {
1068                Status::StreamEnd => {
1069                    if out_pos != expected_size {
1070                        return Err(Error::CorruptObject(format!(
1071                            "decompressed size mismatch: got {out_pos}, want {expected_size}"
1072                        )));
1073                    }
1074                    return Ok(out);
1075                }
1076                Status::Ok | Status::BufError => {
1077                    if consumed == 0 && !eof {
1078                        let n = self.inner.read(&mut scratch).map_err(Error::Io)?;
1079                        if n == 0 {
1080                            eof = true;
1081                        } else {
1082                            self.pending.extend_from_slice(&scratch[..n]);
1083                        }
1084                    } else if eof && self.pending.is_empty() && out_pos != expected_size {
1085                        return Err(Error::CorruptObject(format!(
1086                            "pack stream truncated (zlib) at offset {}",
1087                            self.stream_pos
1088                        )));
1089                    }
1090                }
1091            }
1092        }
1093    }
1094
1095    /// Hash over all pack bytes read so far (objects only; trailer not yet read).
1096    fn finalize_hasher(&self) -> Vec<u8> {
1097        self.pack_hasher.clone().finalize()
1098    }
1099
1100    /// Trailing pack checksum (hash-width bytes); not included in [`Self::finalize_hasher`].
1101    fn read_trailer(&mut self) -> Result<Vec<u8>> {
1102        let hash_len = self.pack_hasher.len();
1103        let mut b = vec![0u8; hash_len];
1104        if self.pending.len() >= hash_len {
1105            b.copy_from_slice(&self.pending[..hash_len]);
1106            self.pending.drain(..hash_len);
1107            self.stream_pos += hash_len;
1108            self.enforce_max_input()?;
1109            return Ok(b);
1110        }
1111        let tail = self.pending.len();
1112        if tail > 0 {
1113            b[..tail].copy_from_slice(&self.pending[..]);
1114            self.pending.clear();
1115        }
1116        self.inner
1117            .read_exact(&mut b[tail..])
1118            .map_err(|e| io_to_corrupt_eof(e, self.stream_pos, "trailer"))?;
1119        self.stream_pos += hash_len;
1120        self.enforce_max_input()?;
1121        Ok(b)
1122    }
1123}
1124
1125/// Apply a git "patch delta" to `base`, producing the patched result.
1126///
1127/// The delta binary format is:
1128/// 1. Source size: variable-length little-endian integer (must equal
1129///    `base.len()`).
1130/// 2. Destination size: variable-length little-endian integer.
1131/// 3. A sequence of COPY (MSB set) and INSERT (MSB clear) instructions.
1132///
1133/// # Errors
1134///
1135/// Returns [`Error::CorruptObject`] if the delta is malformed, the source-size
1136/// field does not match `base.len()`, or the result length does not match the
1137/// declared destination size.
1138pub fn apply_delta(base: &[u8], delta: &[u8]) -> Result<Vec<u8>> {
1139    let mut pos = 0usize;
1140
1141    let src_size = read_delta_varint(delta, &mut pos)?;
1142    if src_size != base.len() {
1143        return Err(Error::CorruptObject(format!(
1144            "delta source size {src_size} != base size {}",
1145            base.len()
1146        )));
1147    }
1148    let dest_size = read_delta_varint(delta, &mut pos)?;
1149    let mut result = Vec::with_capacity(dest_size);
1150
1151    while pos < delta.len() {
1152        let cmd = delta[pos];
1153        pos += 1;
1154        if cmd == 0 {
1155            return Err(Error::CorruptObject(
1156                "reserved opcode 0 in delta stream".to_owned(),
1157            ));
1158        }
1159        if cmd & 0x80 != 0 {
1160            // COPY instruction: up to 4 offset bytes (bits 0-3) and up to 3
1161            // size bytes (bits 4-6) are present, each controlled by a flag bit.
1162            let mut offset = 0usize;
1163            let mut size = 0usize;
1164
1165            macro_rules! maybe_read_byte {
1166                ($flag:expr, $shift:expr, $target:expr) => {
1167                    if cmd & $flag != 0 {
1168                        let b = *delta.get(pos).ok_or_else(|| {
1169                            Error::CorruptObject("truncated delta COPY operand".to_owned())
1170                        })?;
1171                        pos += 1;
1172                        $target |= (b as usize) << $shift;
1173                    }
1174                };
1175            }
1176
1177            maybe_read_byte!(0x01, 0, offset);
1178            maybe_read_byte!(0x02, 8, offset);
1179            maybe_read_byte!(0x04, 16, offset);
1180            maybe_read_byte!(0x08, 24, offset);
1181            maybe_read_byte!(0x10, 0, size);
1182            maybe_read_byte!(0x20, 8, size);
1183            maybe_read_byte!(0x40, 16, size);
1184
1185            if size == 0 {
1186                size = 0x10000;
1187            }
1188
1189            let end = offset.checked_add(size).ok_or_else(|| {
1190                Error::CorruptObject("delta COPY range overflows usize".to_owned())
1191            })?;
1192            let chunk = base.get(offset..end).ok_or_else(|| {
1193                Error::CorruptObject(format!(
1194                    "delta COPY [{offset},{end}) out of range (base is {} bytes)",
1195                    base.len()
1196                ))
1197            })?;
1198            result.extend_from_slice(chunk);
1199        } else {
1200            // INSERT instruction: copy the next `cmd` literal bytes verbatim.
1201            let n = cmd as usize;
1202            let chunk = delta
1203                .get(pos..pos + n)
1204                .ok_or_else(|| Error::CorruptObject("truncated delta INSERT data".to_owned()))?;
1205            result.extend_from_slice(chunk);
1206            pos += n;
1207        }
1208    }
1209
1210    if result.len() != dest_size {
1211        return Err(Error::CorruptObject(format!(
1212            "delta produced {} bytes but expected {dest_size}",
1213            result.len()
1214        )));
1215    }
1216
1217    Ok(result)
1218}
1219
1220/// Read a variable-length little-endian integer from `data` starting at `*pos`.
1221///
1222/// Advances `*pos` past the consumed bytes.
1223fn read_delta_varint(data: &[u8], pos: &mut usize) -> Result<usize> {
1224    let mut value = 0usize;
1225    let mut shift = 0u32;
1226    loop {
1227        let b = *data
1228            .get(*pos)
1229            .ok_or_else(|| Error::CorruptObject("truncated delta varint".to_owned()))?;
1230        *pos += 1;
1231        value |= ((b & 0x7f) as usize) << shift;
1232        shift += 7;
1233        if b & 0x80 == 0 {
1234            break;
1235        }
1236    }
1237    Ok(value)
1238}
1239
1240#[cfg(test)]
1241mod tests {
1242    use super::*;
1243
1244    // Helper: build a minimal pack from a list of (kind, data) pairs.
1245    // Returns the raw pack bytes.
1246    fn make_pack(objects: &[(ObjectKind, &[u8])]) -> Vec<u8> {
1247        use flate2::write::ZlibEncoder;
1248        use std::io::Write;
1249
1250        let mut entries: Vec<Vec<u8>> = Vec::new();
1251        for (kind, data) in objects {
1252            let type_code: u8 = match kind {
1253                ObjectKind::Commit => 1,
1254                ObjectKind::Tree => 2,
1255                ObjectKind::Blob => 3,
1256                ObjectKind::Tag => 4,
1257            };
1258            // Encode type+size header.
1259            let mut header = Vec::new();
1260            let mut size = data.len();
1261            let first = ((type_code & 0x7) << 4) | (size & 0x0f) as u8;
1262            size >>= 4;
1263            if size > 0 {
1264                header.push(first | 0x80);
1265                while size > 0 {
1266                    let b = (size & 0x7f) as u8;
1267                    size >>= 7;
1268                    header.push(if size > 0 { b | 0x80 } else { b });
1269                }
1270            } else {
1271                header.push(first);
1272            }
1273            // zlib-compress data.
1274            let mut enc = ZlibEncoder::new(Vec::new(), flate2::Compression::default());
1275            enc.write_all(data).unwrap();
1276            let compressed = enc.finish().unwrap();
1277            let mut entry = header;
1278            entry.extend_from_slice(&compressed);
1279            entries.push(entry);
1280        }
1281
1282        // Assemble: PACK + version(2) + count + entries + SHA-1.
1283        let mut pack = Vec::new();
1284        pack.extend_from_slice(b"PACK");
1285        pack.extend_from_slice(&2u32.to_be_bytes());
1286        pack.extend_from_slice(&(objects.len() as u32).to_be_bytes());
1287        for entry in &entries {
1288            pack.extend_from_slice(entry);
1289        }
1290        let mut hasher = Sha1::new();
1291        hasher.update(&pack);
1292        let digest = hasher.finalize();
1293        pack.extend_from_slice(digest.as_slice());
1294        pack
1295    }
1296
1297    #[test]
1298    fn test_apply_delta_simple() {
1299        // Build a trivial delta: insert "hello world".
1300        let base = b"hello";
1301        let mut delta = Vec::new();
1302        // src_size = 5
1303        delta.push(5u8);
1304        // dest_size = 11
1305        delta.push(11u8);
1306        // COPY instruction: copy base[0..5]
1307        // cmd = 0x80 | 0x01 (offset present, byte 0) | 0x10 (size byte 0)
1308        delta.push(0x80 | 0x01 | 0x10); // 0x91
1309        delta.push(0u8); // offset = 0
1310        delta.push(5u8); // size = 5
1311                         // INSERT " world" (6 bytes)
1312        delta.push(6u8);
1313        delta.extend_from_slice(b" world");
1314
1315        let result = apply_delta(base, &delta).unwrap();
1316        assert_eq!(result, b"hello world");
1317    }
1318
1319    #[test]
1320    fn test_apply_delta_insert_only() {
1321        let base = b"";
1322        let mut delta = Vec::new();
1323        delta.push(0u8); // src_size = 0
1324        delta.push(5u8); // dest_size = 5
1325        delta.push(5u8); // INSERT 5 bytes
1326        delta.extend_from_slice(b"hello");
1327
1328        let result = apply_delta(base, &delta).unwrap();
1329        assert_eq!(result, b"hello");
1330    }
1331
1332    #[test]
1333    fn test_apply_delta_copy_only() {
1334        let base = b"abcdef";
1335        let mut delta = Vec::new();
1336        delta.push(6u8); // src_size = 6
1337        delta.push(3u8); // dest_size = 3
1338                         // COPY base[2..5]: offset=2, size=3
1339                         // cmd = 0x80 | 0x01 | 0x10
1340        delta.push(0x91u8);
1341        delta.push(2u8); // offset = 2
1342        delta.push(3u8); // size = 3
1343
1344        let result = apply_delta(base, &delta).unwrap();
1345        assert_eq!(result, b"cde");
1346    }
1347
1348    #[test]
1349    fn test_apply_delta_size_zero_means_65536() {
1350        // A COPY with size bytes all zero means 0x10000 = 65536.
1351        let base = vec![0xABu8; 65536];
1352        let mut delta = Vec::new();
1353        // src_size = 65536, encoded as 3 bytes little-endian varint
1354        delta.push(0x80 | (65536 & 0x7f) as u8); // 0
1355        delta.push(0x80 | ((65536 >> 7) & 0x7f) as u8); // 0x80
1356        delta.push(((65536 >> 14) & 0x7f) as u8); // 4
1357                                                  // dest_size = 65536, same
1358        delta.push(0x80 | (65536 & 0x7f) as u8);
1359        delta.push(0x80 | ((65536 >> 7) & 0x7f) as u8);
1360        delta.push(((65536 >> 14) & 0x7f) as u8);
1361        // COPY: offset=0 (no offset bytes), size=0 (no size bytes) → means 0x10000
1362        // cmd = 0x80 (no offset/size bytes present at all → offset=0, size=0→65536)
1363        delta.push(0x80u8);
1364
1365        let result = apply_delta(&base, &delta).unwrap();
1366        assert_eq!(result.len(), 65536);
1367        assert!(result.iter().all(|&b| b == 0xAB));
1368    }
1369
1370    #[test]
1371    fn test_unpack_objects_blobs() {
1372        use tempfile::TempDir;
1373        let tmp = TempDir::new().unwrap();
1374        let objects_dir = tmp.path().join("objects");
1375        std::fs::create_dir_all(&objects_dir).unwrap();
1376        let odb = Odb::new(&objects_dir);
1377
1378        let pack = make_pack(&[
1379            (ObjectKind::Blob, b"hello\n"),
1380            (ObjectKind::Blob, b"world\n"),
1381        ]);
1382
1383        let opts = UnpackOptions::default();
1384        let count = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap();
1385        assert_eq!(count, 2);
1386
1387        // Verify both blobs can be read back.
1388        let oid1 = Odb::hash_object_data(ObjectKind::Blob, b"hello\n");
1389        let oid2 = Odb::hash_object_data(ObjectKind::Blob, b"world\n");
1390        let obj1 = odb.read(&oid1).unwrap();
1391        let obj2 = odb.read(&oid2).unwrap();
1392        assert_eq!(obj1.data, b"hello\n");
1393        assert_eq!(obj2.data, b"world\n");
1394    }
1395
1396    #[test]
1397    fn test_unpack_objects_empty_tree() {
1398        use tempfile::TempDir;
1399        let tmp = TempDir::new().unwrap();
1400        let objects_dir = tmp.path().join("objects");
1401        std::fs::create_dir_all(&objects_dir).unwrap();
1402        let odb = Odb::new(&objects_dir);
1403
1404        let pack = make_pack(&[(ObjectKind::Tree, b"")]);
1405        let opts = UnpackOptions::default();
1406        assert_eq!(
1407            unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap(),
1408            1
1409        );
1410        let oid = Odb::hash_object_data(ObjectKind::Tree, b"");
1411        assert!(odb.exists(&oid));
1412        let loose = objects_dir
1413            .join(oid.loose_prefix())
1414            .join(oid.loose_suffix());
1415        assert!(
1416            loose.is_file(),
1417            "empty tree must be materialized as a loose object during unpack"
1418        );
1419    }
1420
1421    #[test]
1422    fn test_strict_skips_gitlink_tree_entries() {
1423        use crate::index::{MODE_GITLINK, MODE_REGULAR};
1424        use crate::objects::{serialize_tree, TreeEntry};
1425
1426        // A submodule commit oid that is NOT in the pack/ODB (lives in the
1427        // submodule repository, like a 160000 gitlink target on push).
1428        let submodule_oid = ObjectId::from_hex(&"7f".repeat(20)).unwrap();
1429
1430        // Superproject tree referencing the submodule via a gitlink entry.
1431        let tree_data = serialize_tree(&[TreeEntry {
1432            mode: MODE_GITLINK,
1433            name: b"sub".to_vec(),
1434            oid: submodule_oid,
1435        }]);
1436        let tree_oid = Odb::hash_object_data(ObjectKind::Tree, &tree_data);
1437
1438        // Strict connectivity must NOT flag the gitlink target as missing,
1439        // matching upstream git (git/fsck.c skips S_ISGITLINK entries).
1440        let mut pack = HashMap::new();
1441        pack.insert(tree_oid, (ObjectKind::Tree, tree_data.clone()));
1442        assert!(strict_verify_packed_references(None, &pack).is_ok());
1443
1444        // Regression guard: a non-gitlink (regular file) entry pointing at an
1445        // absent blob must still be reported as a strict connectivity error.
1446        let bad_tree = serialize_tree(&[TreeEntry {
1447            mode: MODE_REGULAR,
1448            name: b"file".to_vec(),
1449            oid: ObjectId::from_hex(&"ab".repeat(20)).unwrap(),
1450        }]);
1451        let bad_oid = Odb::hash_object_data(ObjectKind::Tree, &bad_tree);
1452        let mut bad_pack = HashMap::new();
1453        bad_pack.insert(bad_oid, (ObjectKind::Tree, bad_tree));
1454        assert!(matches!(
1455            strict_verify_packed_references(None, &bad_pack),
1456            Err(Error::CorruptObject(_))
1457        ));
1458    }
1459
1460    /// `Read` that returns at most `max_len` bytes per call (simulates side-band chunking).
1461    struct ChunkedReader<'a> {
1462        data: &'a [u8],
1463        pos: usize,
1464        max_len: usize,
1465    }
1466
1467    impl io::Read for ChunkedReader<'_> {
1468        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
1469            if self.pos >= self.data.len() {
1470                return Ok(0);
1471            }
1472            let take = (self.data.len() - self.pos)
1473                .min(self.max_len)
1474                .min(buf.len());
1475            buf[..take].copy_from_slice(&self.data[self.pos..self.pos + take]);
1476            self.pos += take;
1477            Ok(take)
1478        }
1479    }
1480
1481    #[test]
1482    fn test_unpack_objects_chunked_read_matches_full_buffer() {
1483        use tempfile::TempDir;
1484        let pack = make_pack(&[(ObjectKind::Blob, b"chunked-stream")]);
1485        let opts = UnpackOptions::default();
1486        let oid = Odb::hash_object_data(ObjectKind::Blob, b"chunked-stream");
1487
1488        let tmp = TempDir::new().unwrap();
1489        let objects_dir = tmp.path().join("objects");
1490        std::fs::create_dir_all(&objects_dir).unwrap();
1491        let odb = Odb::new(&objects_dir);
1492        assert_eq!(
1493            unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap(),
1494            1
1495        );
1496        assert!(odb.exists(&oid));
1497
1498        let tmp2 = TempDir::new().unwrap();
1499        let objects_dir2 = tmp2.path().join("objects");
1500        std::fs::create_dir_all(&objects_dir2).unwrap();
1501        let odb2 = Odb::new(&objects_dir2);
1502        let mut chunked = ChunkedReader {
1503            data: pack.as_slice(),
1504            pos: 0,
1505            max_len: 8,
1506        };
1507        assert_eq!(unpack_objects(&mut chunked, &odb2, &opts).unwrap(), 1);
1508        assert!(odb2.exists(&oid));
1509    }
1510
1511    #[test]
1512    fn test_unpack_objects_dry_run_writes_nothing() {
1513        use tempfile::TempDir;
1514        let tmp = TempDir::new().unwrap();
1515        let objects_dir = tmp.path().join("objects");
1516        std::fs::create_dir_all(&objects_dir).unwrap();
1517        let odb = Odb::new(&objects_dir);
1518
1519        let pack = make_pack(&[(ObjectKind::Blob, b"test content")]);
1520
1521        let opts = UnpackOptions {
1522            dry_run: true,
1523            quiet: true,
1524            strict: false,
1525            allowed_missing: Default::default(),
1526            allow_promisor_missing_references: false,
1527            max_input_bytes: None,
1528            ..Default::default()
1529        };
1530        let count = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap();
1531        assert_eq!(count, 1);
1532
1533        // Nothing should be written.
1534        let oid = Odb::hash_object_data(ObjectKind::Blob, b"test content");
1535        assert!(!odb.exists(&oid));
1536    }
1537
1538    #[test]
1539    fn test_unpack_objects_bad_signature() {
1540        use tempfile::TempDir;
1541        let tmp = TempDir::new().unwrap();
1542        let objects_dir = tmp.path().join("objects");
1543        std::fs::create_dir_all(&objects_dir).unwrap();
1544        let odb = Odb::new(&objects_dir);
1545
1546        let mut bad = b"NOPE\x00\x00\x00\x02\x00\x00\x00\x00".to_vec();
1547        bad.extend_from_slice(&[0u8; 20]);
1548        let opts = UnpackOptions::default();
1549        let err = unpack_objects(&mut bad.as_slice(), &odb, &opts).unwrap_err();
1550        assert!(err.to_string().contains("invalid signature"));
1551    }
1552
1553    #[test]
1554    fn test_unpack_objects_checksum_mismatch() {
1555        use tempfile::TempDir;
1556        let tmp = TempDir::new().unwrap();
1557        let objects_dir = tmp.path().join("objects");
1558        std::fs::create_dir_all(&objects_dir).unwrap();
1559        let odb = Odb::new(&objects_dir);
1560
1561        let mut pack = make_pack(&[(ObjectKind::Blob, b"data")]);
1562        // Corrupt the trailing checksum.
1563        let n = pack.len();
1564        pack[n - 1] ^= 0xFF;
1565
1566        let opts = UnpackOptions::default();
1567        let err = unpack_objects(&mut pack.as_slice(), &odb, &opts).unwrap_err();
1568        assert!(err.to_string().contains("checksum"));
1569    }
1570
1571    #[test]
1572    fn test_apply_delta_source_size_mismatch() {
1573        let base = b"hi";
1574        let delta = [3u8, 2u8, 2u8, b'h', b'i']; // src_size=3 != base.len()=2
1575        let err = apply_delta(base, &delta).unwrap_err();
1576        assert!(err.to_string().contains("source size"));
1577    }
1578}