Skip to main content

git_remote_object_store/packchain/
read.rs

1//! Direct file access against a packchain remote (issue #65).
2//!
3//! [`read_blob`] is the differentiated value-add of the packchain
4//! engine: a caller fetches a single file at a ref's tip without
5//! cloning, materialising a working tree, or invoking git. The
6//! lookup walks the on-bucket artefacts the Phase 2 push wrote:
7//!
8//! 1. `chain.json` to verify the ref exists.
9//! 2. `path-index.json` to resolve `path` → blob SHA at tip.
10//! 3. Each segment's `.idx` (newest-first) to locate the blob's pack
11//!    entry.
12//! 4. A ranged GET against the matching `.pack` to fetch the entry
13//!    bytes, zlib-decompressed (and delta-applied, if applicable).
14//!
15//! The pack-index parses are amortised across calls via
16//! [`PackIndexCache`], a byte-bounded LRU keyed by
17//! `(prefix, content-sha)`. Single-shot callers can pass
18//! `&PackIndexCache::default()` and let the cache GC at drop.
19//!
20//! ## Delta resolution
21//!
22//! Pack entries may be deltas against a base elsewhere in the chain.
23//! `OFS_DELTA` resolves within the same pack via a relative back-offset;
24//! `REF_DELTA` resolves to a SHA which may live in any pack in the
25//! chain. The walker recurses, capped at [`MAX_DELTA_DEPTH`] (matching
26//! git's own limit) so a corrupted chain with a delta cycle aborts
27//! cleanly instead of looping forever.
28//!
29//! ## What this module does *not* do
30//!
31//! - **No on-disk cache**: indices live in memory only. CI agents that
32//!   want cross-process amortisation should layer their own.
33//! - **No directory listings**: [`read_blob`] is single-file. The
34//!   nested-tree shape supports listing cleanly, but it's a separate
35//!   API and out of scope for issue #65.
36
37use std::collections::{BTreeMap, HashMap, VecDeque};
38use std::sync::{Arc, Mutex};
39
40use bytes::Bytes;
41use gix_pack::data::entry::Header as EntryHeader;
42use tracing::{debug, warn};
43
44use crate::git::RefName;
45use crate::object_store::{ObjectStore, ObjectStoreError};
46use crate::remote::Remote;
47use crate::url::StorageEngine;
48
49use super::PackchainError;
50use super::keys::{pack_idx_key, pack_key};
51use super::manifest::{load_chain, load_path_index};
52use super::retry::{
53    PACK_MISSING_MAX_RETRIES, PACK_MISSING_RETRY_BACKOFFS, chain_references_pack_key,
54};
55use super::schema::{ChainManifest, ChainSegment, PathNode, Sha40};
56
57/// Hard cap on delta-chain depth, matching git's own
58/// `pack.deltaCacheLimit`-adjacent recursion limit. A correctly built
59/// chain won't approach this; tripping the cap means the pack is
60/// corrupted (a cycle) or pathologically deep, and either way
61/// stopping is the right call.
62pub const MAX_DELTA_DEPTH: u32 = 50;
63
64/// Default in-memory budget for [`PackIndexCache`] (64 MiB), matching
65/// the cap the issue #65 plan calls out. Covers a chain of dozens of
66/// large packs without thrashing.
67pub const DEFAULT_CACHE_CAPACITY_BYTES: u64 = 64 * 1024 * 1024;
68
69/// Upper safety bound on the bytes a single ranged GET may request,
70/// covering both the terminal-entry path in [`fetch_entry_bytes`] and
71/// the fallback widening in [`inflate_with_retry`]. Past this point
72/// we surface a typed error rather than pulling unbounded bytes — a
73/// single multi-GiB blob in a code repo is overwhelmingly likely to
74/// be a misuse (git-LFS material) rather than a legitimate
75/// `read_blob` target.
76const MAX_RANGE_BYTES: u64 = 1024 * 1024 * 1024;
77
78/// Hard cap on a single decompressed pack object (1 GiB), enforced
79/// against attacker-controlled values from the pack entry header
80/// (`decompressed_size`) and the delta dst-size header. A malicious
81/// bucket can craft these to claim huge sizes; without a cap, we
82/// would `vec![0u8; n]` or `Vec::with_capacity(n)` for that many
83/// bytes and either panic or thrash. 1 GiB matches [`MAX_RANGE_BYTES`]
84/// and exceeds any realistic source-tree blob; LFS material lives in
85/// the LFS path, not [`read_blob`].
86const MAX_DECOMPRESSED_BYTES: u64 = 1024 * 1024 * 1024;
87
88/// Maximum number of times the fallback range may expand before the
89/// reader gives up with [`PackchainError::MalformedPackEntry`]. Each
90/// expansion doubles the range, so 6 retries cover up to ~1 GiB.
91const MAX_RANGE_EXPANSIONS: u32 = 6;
92
93/// In-process LRU cache of decoded pack indices keyed by
94/// `(prefix, content-sha)`.
95///
96/// Capacity is bounded by **byte size**, not entry count: a single 1 GB
97/// pack carries an .idx file of multiple MiB, so an entry-count cap
98/// would either over- or under-budget for realistic chains. Eviction
99/// is least-recently-used.
100///
101/// The cache is `Send + Sync` and shareable across [`read_blob`] calls.
102/// Multiple concurrent calls block briefly on the inner mutex during
103/// lookup / insert; the inflate / range-GET work happens outside the
104/// lock so contention stays bounded.
105///
106/// ## LRU bookkeeping cost
107///
108/// `get` and `insert` walk the order [`VecDeque`] via `iter().position`
109/// to move the touched key to the back — **O(n) in the cache size**.
110/// For typical packchain workloads (single-digit indices in flight),
111/// the constant factor dominates and this is faster than a true O(1)
112/// linked-list LRU. If a workload starts seeing hundreds of cached
113/// indices, this should be revisited (e.g. swap to the `lru` crate or
114/// hand-roll a `HashMap` + intrusive doubly-linked list). The simple
115/// shape is intentional for now.
116///
117/// # Example
118///
119/// ```no_run
120/// # #[tokio::main] async fn main() -> Result<(), Box<dyn std::error::Error>> {
121/// use git_remote_object_store::{packchain::PackIndexCache, Remote};
122///
123/// let remote = Remote::connect("s3+https://bucket/repo?engine=packchain").await?;
124/// let cache = PackIndexCache::default();
125/// let bytes = git_remote_object_store::packchain::read_blob(
126///     &remote,
127///     "refs/heads/main",
128///     "src/main.rs",
129///     &cache,
130/// ).await?;
131/// println!("{}", String::from_utf8_lossy(&bytes));
132/// # Ok(())
133/// # }
134/// ```
135pub struct PackIndexCache {
136    inner: Mutex<CacheInner>,
137    capacity_bytes: u64,
138}
139
140struct CacheInner {
141    /// Owned indices keyed by `(prefix, content-sha)`.
142    ///
143    /// `Arc` lets [`read_blob`] hold a long-lived reference to the
144    /// index while the cache lock is dropped, so the inflate /
145    /// range-GET work below doesn't block sibling cache lookups.
146    map: HashMap<CacheKey, Arc<CachedIndex>>,
147    /// LRU order — front is least-recently-used, back is most-recent.
148    order: VecDeque<CacheKey>,
149    total_bytes: u64,
150}
151
152type CacheKey = (String, Sha40);
153
154struct CachedIndex {
155    /// Parsed .idx file owning its bytes (in-memory parse via
156    /// [`gix_pack::index::File::from_data`]).
157    file: gix_pack::index::File<Vec<u8>>,
158    /// Pre-sorted ascending pack offsets. Used to derive the
159    /// next-offset upper bound for a ranged GET against the matching
160    /// pack file. Computed once at insert.
161    sorted_offsets: Vec<u64>,
162    /// Approximate resident byte count (the .idx body plus the offsets
163    /// vector). Used for the LRU byte-budget bookkeeping.
164    bytes: u64,
165}
166
167impl PackIndexCache {
168    /// Construct a cache with the requested byte budget.
169    ///
170    /// `capacity_bytes` of zero disables caching (every lookup misses).
171    /// Use [`Self::default`] for the standard 64 MiB budget.
172    #[must_use]
173    pub fn new(capacity_bytes: u64) -> Self {
174        Self {
175            inner: Mutex::new(CacheInner {
176                map: HashMap::new(),
177                order: VecDeque::new(),
178                total_bytes: 0,
179            }),
180            capacity_bytes,
181        }
182    }
183
184    /// Total resident bytes accounted for by the cache.
185    ///
186    /// # Panics
187    ///
188    /// Panics only if a previous holder of the inner mutex panicked
189    /// while mutating cache state — an invariant violation that would
190    /// be unsafe to silently recover from.
191    #[must_use]
192    pub fn resident_bytes(&self) -> u64 {
193        self.lock().total_bytes
194    }
195
196    /// Number of cached entries.
197    ///
198    /// # Panics
199    ///
200    /// See [`Self::resident_bytes`].
201    #[must_use]
202    pub fn len(&self) -> usize {
203        self.lock().map.len()
204    }
205
206    /// Whether the cache currently holds zero entries.
207    #[must_use]
208    pub fn is_empty(&self) -> bool {
209        self.len() == 0
210    }
211
212    fn lock(&self) -> std::sync::MutexGuard<'_, CacheInner> {
213        self.inner.lock().expect("cache mutex poisoned")
214    }
215
216    fn get(&self, key: &CacheKey) -> Option<Arc<CachedIndex>> {
217        let mut inner = self.lock();
218        let entry = inner.map.get(key).cloned()?;
219        // Move to most-recently-used position.
220        remove_from_order(&mut inner.order, key);
221        inner.order.push_back(key.clone());
222        Some(entry)
223    }
224
225    fn insert(&self, key: CacheKey, value: Arc<CachedIndex>) {
226        let mut inner = self.lock();
227        let bytes = value.bytes;
228        // Replace existing entry's accounting if present.
229        if let Some(prev) = inner.map.remove(&key) {
230            inner.total_bytes = inner.total_bytes.saturating_sub(prev.bytes);
231            remove_from_order(&mut inner.order, &key);
232        }
233        // If a single entry exceeds the budget, refuse to cache it
234        // (otherwise we'd evict everything and still overshoot).
235        if bytes > self.capacity_bytes {
236            return;
237        }
238        // Evict oldest until the new entry fits.
239        while inner.total_bytes + bytes > self.capacity_bytes {
240            let Some(oldest) = inner.order.pop_front() else {
241                break;
242            };
243            if let Some(removed) = inner.map.remove(&oldest) {
244                inner.total_bytes = inner.total_bytes.saturating_sub(removed.bytes);
245            }
246        }
247        inner.total_bytes += bytes;
248        inner.order.push_back(key.clone());
249        inner.map.insert(key, value);
250    }
251}
252
253fn remove_from_order(order: &mut VecDeque<CacheKey>, key: &CacheKey) {
254    if let Some(pos) = order.iter().position(|k| k == key) {
255        order.remove(pos);
256    }
257}
258
259impl Default for PackIndexCache {
260    fn default() -> Self {
261        Self::new(DEFAULT_CACHE_CAPACITY_BYTES)
262    }
263}
264
265impl std::fmt::Debug for PackIndexCache {
266    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
267        // Custom impl avoids deadlock-prone `Mutex` Debug while still
268        // exposing operationally interesting state: the static budget,
269        // current accounting, and current entry count. The `inner`
270        // field is *not* surfaced by design — it's an implementation
271        // detail and would print the entire cache contents.
272        f.debug_struct("PackIndexCache")
273            .field("capacity_bytes", &self.capacity_bytes)
274            .field("resident_bytes", &self.resident_bytes())
275            .field("entries", &self.len())
276            .finish_non_exhaustive()
277    }
278}
279
280/// Read the contents of `path` at `ref_name`'s tip from a packchain
281/// remote.
282///
283/// Walks `chain.json` + `path-index.json` to resolve `path` → blob
284/// SHA, then consults each segment's `.idx` newest-first for the
285/// blob's pack entry. The matching entry's bytes are fetched via a
286/// ranged GET, zlib-decompressed, and (when the entry is a delta)
287/// recursively resolved against its base. The entry's eventual blob
288/// payload is returned as an owned [`Bytes`].
289///
290/// `cache` amortises pack-index parsing across calls within the same
291/// process. Long-running consumers (CI agents, build systems) should
292/// keep one [`PackIndexCache`] for the lifetime of the process so the
293/// per-call cost is one or two API calls plus a zlib inflate; one-shot
294/// callers can pass `&PackIndexCache::default()` and discard.
295///
296/// # Errors
297///
298/// - [`PackchainError::WrongEngine`] when the remote's engine is not
299///   [`StorageEngine::Packchain`].
300/// - [`PackchainError::ChainAbsent`] when the branch is unknown to
301///   the bucket.
302/// - [`PackchainError::PathIndexAbsent`] when `chain.json` exists but
303///   `path-index.json` does not (a partially crashed first push).
304/// - [`PackchainError::TransientChainPathIndexMismatch`] when both
305///   exist but `path_index.tip != chain.tip` — a brief window during
306///   a concurrent push or compact where `chain.json` has been
307///   overwritten ahead of `path-index.json`. Retry-shaped: a
308///   subsequent read converges once the writer finishes.
309/// - [`PackchainError::MalformedPath`] for `..` segments, leading
310///   `/`, empty path, or empty segments (consecutive slashes).
311/// - [`PackchainError::PathNotFound`] when the path does not exist
312///   in the resolved tree.
313/// - [`PackchainError::PathNotABlob`] when the path resolves to a
314///   directory rather than a file.
315/// - [`PackchainError::BlobNotInChain`] when the path-index named a
316///   blob SHA absent from every pack referenced by `chain.json`.
317/// - [`PackchainError::DeltaTooDeep`] / [`PackchainError::MalformedDelta`]
318///   / [`PackchainError::MalformedPackEntry`] / [`PackchainError::Decompress`]
319///   for pack-corruption shapes.
320/// - [`PackchainError::PackMissing`], [`PackchainError::Store`], or
321///   [`PackchainError::Io`] for transport / I/O failures.
322/// - [`PackchainError::ConcurrentGcRetriesExhausted`] when a
323///   vigorous concurrent `manage gc sweep` kept deleting packs the
324///   reader had just discovered, exhausting the internal retry
325///   schedule. Callers should re-invoke `read_blob` after the
326///   compaction settles.
327pub async fn read_blob(
328    remote: &Remote,
329    ref_name: &str,
330    path: &str,
331    cache: &PackIndexCache,
332) -> Result<Bytes, PackchainError> {
333    if remote.engine() != StorageEngine::Packchain {
334        return Err(PackchainError::WrongEngine {
335            found: remote.engine(),
336        });
337    }
338
339    let segments = parse_path(path)?;
340    let remote_ref = RefName::new(ref_name).map_err(|_| PackchainError::InvalidRefName {
341        name: ref_name.to_owned(),
342    })?;
343    // `Remote::prefix()` returns `""` for bucket-root remotes; the
344    // engine-wide convention (post-#103) is `None` for "no prefix"
345    // and `Some(non-empty)` otherwise. Normalise here so cache keys
346    // and any downstream `Some(p) if !p.is_empty()` matches agree
347    // with the rest of the codebase.
348    let prefix_opt = (!remote.prefix().is_empty()).then(|| remote.prefix());
349
350    let chain = load_chain(remote.store(), prefix_opt, &remote_ref)
351        .await?
352        .ok_or_else(|| PackchainError::ChainAbsent {
353            ref_name: ref_name.to_owned(),
354        })?;
355
356    let path_index = load_path_index(remote.store(), prefix_opt, &remote_ref)
357        .await?
358        .ok_or_else(|| PackchainError::PathIndexAbsent {
359            ref_name: ref_name.to_owned(),
360        })?;
361
362    // Writers PUT chain.json before path-index.json (see the engine
363    // module doc on the linearisation point and issue #114). A crash
364    // or in-flight push between the two PUTs leaves a stale
365    // `path_index.tip` paired with a fresh `chain.tip`; resolving a
366    // path through the stale path-index would yield a blob SHA that
367    // names a different file than the caller intended (or one absent
368    // from the new chain entirely). Refuse to do that — surface the
369    // mismatch as a typed transient error so the caller can retry,
370    // and so `BlobNotInChain` keeps its honest "bucket corruption"
371    // meaning rather than masking a race window.
372    if path_index.tip != chain.tip {
373        return Err(PackchainError::TransientChainPathIndexMismatch {
374            ref_name: ref_name.to_owned(),
375            chain_tip: chain.tip.as_str().to_owned(),
376            path_index_tip: path_index.tip.as_str().to_owned(),
377        });
378    }
379
380    let blob_sha = walk_path(&path_index.tree, &segments, ref_name, path)?;
381
382    debug!(
383        ref_name = %ref_name,
384        path = %path,
385        blob = %blob_sha.as_str(),
386        segments = chain.segments.len(),
387        "read_blob: resolved path to blob, scanning chain"
388    );
389
390    let blob_oid = sha40_to_object_id(&blob_sha);
391    let result = read_with_pack_missing_retries(
392        remote.store(),
393        prefix_opt,
394        &remote_ref,
395        ref_name,
396        chain,
397        &blob_oid,
398        cache,
399    )
400    .await;
401    let blob_not_in_chain = || PackchainError::BlobNotInChain {
402        sha: blob_sha.as_str().to_owned(),
403        path: path.to_owned(),
404    };
405    match result {
406        Ok(ResolvedObject {
407            payload,
408            kind: ObjectKind::Blob,
409        }) => Ok(Bytes::from(payload)),
410        // path-index pointed at a non-blob — bucket inconsistency.
411        Ok(_) => Err(blob_not_in_chain()),
412        // Inner walker returns BlobNotInChain with an empty path field
413        // (it doesn't know the caller's path); replace with one that
414        // carries the caller's path for diagnostic clarity. Inner
415        // BlobNotInChain values for *other* shas (delta-base lookups)
416        // pass through unchanged.
417        Err(PackchainError::BlobNotInChain { sha, .. }) if sha == blob_sha.as_str() => {
418            Err(blob_not_in_chain())
419        }
420        Err(e) => Err(e),
421    }
422}
423
424/// Pack-read driver for [`read_blob`]. Walks the supplied chain for
425/// `blob_oid`; on a [`PackchainError::PackMissing`] caused by a
426/// concurrent `manage gc sweep` (detected by reloading `chain.json`
427/// and observing that the failing key is no longer referenced), waits
428/// a backoff and retries against the fresh chain. After
429/// [`PACK_MISSING_MAX_RETRIES`] retries gives up with
430/// [`PackchainError::ConcurrentGcRetriesExhausted`] (issue #136).
431///
432/// Path-index is **not** reloaded across retries — the original
433/// `blob_oid` represents the snapshot the caller asked about, and
434/// compaction preserves blob content addressing so the same SHA
435/// resolves in any compatible chain. A new push that overwrote
436/// path-index between the initial load and the retry would resolve
437/// to a different blob SHA, but the read is still well-defined at
438/// the snapshot point (the path-as-of-the-initial-load) and that is
439/// the right semantics for a stateless point-in-time read.
440///
441/// Non-`PackMissing` errors (parse, decompress, transport, etc.)
442/// pass through immediately — they are not retryable in this
443/// fashion and would otherwise wait through the backoff schedule
444/// for no reason. `BlobNotInChain` is also a passthrough so the
445/// caller can attach the user-facing `path` field.
446async fn read_with_pack_missing_retries(
447    store: &dyn ObjectStore,
448    prefix: Option<&str>,
449    remote_ref: &RefName,
450    ref_name: &str,
451    initial_chain: ChainManifest,
452    blob_oid: &gix_hash::ObjectId,
453    cache: &PackIndexCache,
454) -> Result<ResolvedObject, PackchainError> {
455    let mut current_chain = initial_chain;
456    let mut attempt: u32 = 0;
457    loop {
458        let mut depth = 0u32;
459        let result = read_object_from_chain(
460            store,
461            prefix,
462            &current_chain.segments,
463            blob_oid,
464            cache,
465            &mut depth,
466        )
467        .await;
468        let missing_key = match result {
469            Ok(resolved) => return Ok(resolved),
470            Err(PackchainError::PackMissing { key }) => key,
471            Err(e) => return Err(e),
472        };
473        // PackMissing — distinguish concurrent GC (retryable) from
474        // genuine bucket inconsistency (data loss → fail fast).
475        let reloaded = load_chain(store, prefix, remote_ref)
476            .await?
477            .ok_or_else(|| PackchainError::ChainAbsent {
478                ref_name: ref_name.to_owned(),
479            })?;
480        if chain_references_pack_key(&reloaded, prefix, &missing_key)? {
481            // The reloaded chain still names this pack — the bucket
482            // is genuinely missing data the chain still references.
483            // Surface the original PackMissing without retrying.
484            return Err(PackchainError::PackMissing { key: missing_key });
485        }
486        if attempt >= PACK_MISSING_MAX_RETRIES {
487            warn!(
488                ref_name = %ref_name,
489                last_missing_key = %missing_key,
490                attempts = attempt,
491                "read_blob: exhausted pack-missing retries against concurrent GC"
492            );
493            return Err(PackchainError::ConcurrentGcRetriesExhausted {
494                last_missing_key: missing_key,
495                attempts: attempt,
496            });
497        }
498        debug!(
499            ref_name = %ref_name,
500            missing_key = %missing_key,
501            attempt = attempt,
502            "read_blob: PackMissing on chain no longer references the pack — retrying after GC race"
503        );
504        tokio::time::sleep(PACK_MISSING_RETRY_BACKOFFS[attempt as usize]).await;
505        attempt += 1;
506        current_chain = reloaded;
507    }
508}
509
510/// Decoded pack object — the kind discriminates blobs from other
511/// types so [`read_blob`] can refuse to return a tree as a "blob".
512#[derive(Debug)]
513struct ResolvedObject {
514    payload: Vec<u8>,
515    kind: ObjectKind,
516}
517
518#[derive(Debug, Clone, Copy, PartialEq, Eq)]
519enum ObjectKind {
520    Blob,
521    Commit,
522    Tree,
523    Tag,
524}
525
526/// Validate `path` and split it on `/`.
527///
528/// Rejects shapes that don't map to git tree semantics: empty paths,
529/// `/`-prefixed (absolute), `..` segments, and empty segments
530/// (consecutive slashes / trailing slashes).
531fn parse_path(path: &str) -> Result<Vec<&str>, PackchainError> {
532    if path.is_empty() {
533        return Err(PackchainError::MalformedPath {
534            path: path.to_owned(),
535            reason: "empty path",
536        });
537    }
538    if path.starts_with('/') {
539        return Err(PackchainError::MalformedPath {
540            path: path.to_owned(),
541            reason: "absolute paths are not allowed",
542        });
543    }
544    let segments: Vec<&str> = path.split('/').collect();
545    for seg in &segments {
546        if seg.is_empty() {
547            return Err(PackchainError::MalformedPath {
548                path: path.to_owned(),
549                reason: "empty segment (consecutive or trailing slash)",
550            });
551        }
552        if *seg == ".." {
553            return Err(PackchainError::MalformedPath {
554                path: path.to_owned(),
555                reason: "`..` segments are not allowed",
556            });
557        }
558        if *seg == "." {
559            return Err(PackchainError::MalformedPath {
560                path: path.to_owned(),
561                reason: "`.` segments are not allowed",
562            });
563        }
564    }
565    Ok(segments)
566}
567
568/// Walk the nested path-index tree following `segments`. Returns the
569/// terminal blob's SHA on success.
570fn walk_path(
571    root: &BTreeMap<String, PathNode>,
572    segments: &[&str],
573    ref_name: &str,
574    path: &str,
575) -> Result<Sha40, PackchainError> {
576    let path_not_found = || PackchainError::PathNotFound {
577        ref_name: ref_name.to_owned(),
578        path: path.to_owned(),
579    };
580    // Splitting up front asserts the invariant `parse_path` guarantees
581    // (segments is non-empty) and lets the rest of the function be a
582    // straight walk-then-leaf-check with no unreachable fallthrough.
583    let (last_seg, prefix_segs) = segments
584        .split_last()
585        .expect("parse_path guarantees at least one segment");
586    let mut current = root;
587    for seg in prefix_segs {
588        // A mid-path blob (`a/file.txt/extra`) and a missing key both
589        // mean the caller's path doesn't resolve in this tree.
590        let Some(PathNode::Tree(children)) = current.get(*seg) else {
591            return Err(path_not_found());
592        };
593        current = children;
594    }
595    match current.get(*last_seg) {
596        Some(PathNode::Blob(sha)) => Ok(sha.clone()),
597        Some(PathNode::Tree(_)) => Err(PackchainError::PathNotABlob {
598            path: path.to_owned(),
599        }),
600        None => Err(path_not_found()),
601    }
602}
603
604fn sha40_to_object_id(sha: &Sha40) -> gix_hash::ObjectId {
605    // Sha40 invariant: exactly 40 lowercase hex characters. The
606    // gix_hash parser accepts that shape unconditionally, so the
607    // unwrap-via-expect is documenting the invariant rather than
608    // introducing a panic site (see .claude/rules/rust.md).
609    gix_hash::ObjectId::from_hex(sha.as_str().as_bytes())
610        .expect("Sha40 is always 40 lowercase hex by construction")
611}
612
613/// Locate `target_oid` in the chain (newest-first) and decode its
614/// pack entry, applying delta resolution as needed.
615async fn read_object_from_chain(
616    store: &dyn ObjectStore,
617    prefix: Option<&str>,
618    segments: &[ChainSegment],
619    target_oid: &gix_hash::ObjectId,
620    cache: &PackIndexCache,
621    depth: &mut u32,
622) -> Result<ResolvedObject, PackchainError> {
623    // Note: the delta-depth guard lives in `decode_entry`, the single
624    // chokepoint every recursive resolution path traverses. Putting it
625    // here would miss the OFS_DELTA branch, which recurses through
626    // `decode_entry` directly without coming back via this function
627    // (issue #83).
628    for segment in segments {
629        let content_sha = super::keys::segment_pack_sha(segment)?;
630        let idx = load_index(store, prefix, &content_sha, cache).await?;
631        let Some(entry_index) = idx.file.lookup(target_oid) else {
632            continue;
633        };
634        let pack_offset = idx.file.pack_offset_at_index(entry_index);
635        let bytes = fetch_entry_bytes(store, prefix, &content_sha, pack_offset, &idx).await?;
636        let resolved = Box::pin(decode_entry(
637            store,
638            prefix,
639            segments,
640            &content_sha,
641            pack_offset,
642            &bytes,
643            cache,
644            depth,
645        ))
646        .await?;
647        return Ok(resolved);
648    }
649    Err(PackchainError::BlobNotInChain {
650        // `gix_hash::ObjectId: Display` already produces 40-lowercase-hex
651        // (`Display` → `to_hex()` → `HexDisplay`).
652        sha: target_oid.to_string(),
653        path: String::new(),
654    })
655}
656
657async fn load_index(
658    store: &dyn ObjectStore,
659    prefix: Option<&str>,
660    content_sha: &Sha40,
661    cache: &PackIndexCache,
662) -> Result<Arc<CachedIndex>, PackchainError> {
663    let key = (prefix.unwrap_or("").to_owned(), content_sha.clone());
664    if let Some(hit) = cache.get(&key) {
665        return Ok(hit);
666    }
667
668    let idx_key = pack_idx_key(prefix, content_sha);
669    let idx_bytes = match store.get_bytes(&idx_key).await {
670        Ok(b) => b,
671        Err(ObjectStoreError::NotFound(_)) => {
672            return Err(PackchainError::PackMissing { key: idx_key });
673        }
674        Err(e) => return Err(PackchainError::Store(e)),
675    };
676
677    let owned: Vec<u8> = idx_bytes.to_vec();
678    let owned_len = owned.len() as u64;
679    let path = std::path::PathBuf::from(idx_key);
680    let file =
681        gix_pack::index::File::from_data(owned, path, gix_hash::Kind::Sha1).map_err(|e| {
682            PackchainError::MalformedPackEntry {
683                offset: 0,
684                reason: format!("idx parse: {e}"),
685            }
686        })?;
687    let sorted_offsets = file.sorted_offsets();
688    let offsets_bytes = (sorted_offsets.len() as u64).saturating_mul(8);
689    let cached = Arc::new(CachedIndex {
690        file,
691        sorted_offsets,
692        bytes: owned_len.saturating_add(offsets_bytes),
693    });
694    cache.insert(key, Arc::clone(&cached));
695    Ok(cached)
696}
697
698/// Range-GET the pack bytes for the entry starting at `pack_offset`.
699///
700/// Bounds are derived from `idx.sorted_offsets`: the next-greater
701/// offset is the entry's end. When `pack_offset` is the highest
702/// recorded offset (the last entry), the actual entry end is the
703/// trailer position — which we don't know without an extra round
704/// trip. Strategy:
705///
706/// 1. If `next_offset` is known, range-GET `[pack_offset, next_offset)`.
707/// 2. Otherwise, `HEAD` the pack to learn its length, then range-GET
708///    `[pack_offset, pack_len)`. The HEAD round-trip only fires for
709///    the very last entry in a pack — every other entry's bound is
710///    already in `sorted_offsets`. Both branches enforce
711///    [`MAX_RANGE_BYTES`]; a terminal-entry tail above the cap is
712///    rejected as [`PackchainError::MalformedPackEntry`] rather than
713///    pulled in full.
714async fn fetch_entry_bytes(
715    store: &dyn ObjectStore,
716    prefix: Option<&str>,
717    content_sha: &Sha40,
718    pack_offset: u64,
719    idx: &CachedIndex,
720) -> Result<Bytes, PackchainError> {
721    let pack = pack_key(prefix, content_sha);
722    let next_offset = idx
723        .sorted_offsets
724        .iter()
725        .copied()
726        .find(|&o| o > pack_offset);
727    let end = if let Some(end) = next_offset {
728        end
729    } else {
730        // Last entry in the pack — learn pack length via HEAD so the
731        // range can be bounded the same way as non-terminal entries.
732        let meta = match store.head(&pack).await {
733            Ok(m) => m,
734            Err(ObjectStoreError::NotFound(_)) => {
735                return Err(PackchainError::PackMissing { key: pack });
736            }
737            Err(e) => return Err(PackchainError::Store(e)),
738        };
739        if pack_offset >= meta.size {
740            return Err(PackchainError::MalformedPackEntry {
741                offset: pack_offset,
742                reason: "entry offset beyond pack EOF".to_owned(),
743            });
744        }
745        meta.size
746    };
747    let span = end.saturating_sub(pack_offset);
748    if span > MAX_RANGE_BYTES {
749        return Err(PackchainError::MalformedPackEntry {
750            offset: pack_offset,
751            reason: format!("entry range {span} bytes exceeds {MAX_RANGE_BYTES}-byte cap"),
752        });
753    }
754    match store.get_bytes_range(&pack, pack_offset..end).await {
755        Ok(b) => Ok(b),
756        Err(ObjectStoreError::NotFound(_)) => Err(PackchainError::PackMissing { key: pack }),
757        Err(e) => Err(PackchainError::Store(e)),
758    }
759}
760
761#[allow(clippy::too_many_arguments)]
762async fn decode_entry(
763    store: &dyn ObjectStore,
764    prefix: Option<&str>,
765    chain: &[ChainSegment],
766    content_sha: &Sha40,
767    pack_offset: u64,
768    raw: &[u8],
769    cache: &PackIndexCache,
770    depth: &mut u32,
771) -> Result<ResolvedObject, PackchainError> {
772    // Single chokepoint for the delta-depth budget: every recursive
773    // delta resolution path — REF_DELTA via `read_object_from_chain`
774    // and OFS_DELTA via the direct recursion below — re-enters here.
775    // Guarding only `read_object_from_chain` (the previous shape) let
776    // a pure-OFS_DELTA chain stack-overflow because OFS_DELTA never
777    // re-routed through it (issue #83).
778    if *depth > MAX_DELTA_DEPTH {
779        return Err(PackchainError::DeltaTooDeep {
780            max: MAX_DELTA_DEPTH,
781        });
782    }
783    *depth += 1;
784
785    let entry =
786        gix_pack::data::Entry::from_bytes(raw, pack_offset, gix_hash::Kind::Sha1.len_in_bytes())
787            .map_err(|e| PackchainError::MalformedPackEntry {
788                offset: pack_offset,
789                reason: e.to_string(),
790            })?;
791
792    // `data_offset` is absolute (pack_offset + header_size). Convert
793    // to an index into our locally-fetched buffer. Both casts must
794    // succeed: header_size is the number of bytes the entry header
795    // consumed (always tiny), and decompressed_size came from the
796    // entry header itself (capped by the pack format at u32-ish).
797    let header_size: usize = usize::try_from(entry.data_offset - pack_offset).map_err(|_| {
798        PackchainError::MalformedPackEntry {
799            offset: pack_offset,
800            reason: "entry header size exceeds usize".to_owned(),
801        }
802    })?;
803    // Reject pack-header-driven sizes above the hard cap *before*
804    // converting to `usize` and allocating. A malicious bucket can
805    // claim arbitrary `decompressed_size`; without this guard we'd
806    // `vec![0u8; n]` for that many bytes in `inflate_to`.
807    if entry.decompressed_size > MAX_DECOMPRESSED_BYTES {
808        return Err(PackchainError::MalformedPackEntry {
809            offset: pack_offset,
810            reason: format!(
811                "decompressed object size {} exceeds {}-byte cap",
812                entry.decompressed_size, MAX_DECOMPRESSED_BYTES
813            ),
814        });
815    }
816    let decompressed_size: usize = usize::try_from(entry.decompressed_size).map_err(|_| {
817        PackchainError::MalformedPackEntry {
818            offset: pack_offset,
819            reason: "decompressed object size exceeds usize".to_owned(),
820        }
821    })?;
822
823    let inflated = inflate_with_retry(
824        store,
825        prefix,
826        content_sha,
827        pack_offset,
828        raw,
829        header_size,
830        decompressed_size,
831    )
832    .await?;
833
834    match entry.header {
835        EntryHeader::Blob => Ok(ResolvedObject {
836            payload: inflated,
837            kind: ObjectKind::Blob,
838        }),
839        EntryHeader::Commit => Ok(ResolvedObject {
840            payload: inflated,
841            kind: ObjectKind::Commit,
842        }),
843        EntryHeader::Tree => Ok(ResolvedObject {
844            payload: inflated,
845            kind: ObjectKind::Tree,
846        }),
847        EntryHeader::Tag => Ok(ResolvedObject {
848            payload: inflated,
849            kind: ObjectKind::Tag,
850        }),
851        EntryHeader::OfsDelta { base_distance } => {
852            let base_offset = pack_offset.checked_sub(base_distance).ok_or(
853                PackchainError::MalformedPackEntry {
854                    offset: pack_offset,
855                    reason: "ofs-delta base distance underflows pack offset".to_owned(),
856                },
857            )?;
858            let idx = load_index(store, prefix, content_sha, cache).await?;
859            let base_bytes =
860                fetch_entry_bytes(store, prefix, content_sha, base_offset, &idx).await?;
861            let base = Box::pin(decode_entry(
862                store,
863                prefix,
864                chain,
865                content_sha,
866                base_offset,
867                &base_bytes,
868                cache,
869                depth,
870            ))
871            .await?;
872            apply_delta(&base, &inflated)
873        }
874        EntryHeader::RefDelta { base_id } => {
875            let base = Box::pin(read_object_from_chain(
876                store, prefix, chain, &base_id, cache, depth,
877            ))
878            .await?;
879            apply_delta(&base, &inflated)
880        }
881    }
882}
883
884/// Inflate the entry's compressed payload, widening the range and
885/// retrying when the locally-fetched buffer is short of the zlib
886/// stream end. Only fires for the very last entry in a pack — every
887/// other entry's range is bounded by [`CachedIndex::sorted_offsets`].
888async fn inflate_with_retry(
889    store: &dyn ObjectStore,
890    prefix: Option<&str>,
891    content_sha: &Sha40,
892    pack_offset: u64,
893    raw: &[u8],
894    header_size: usize,
895    decompressed_size: usize,
896) -> Result<Vec<u8>, PackchainError> {
897    // Own the wider buffer as `Bytes` when a retry has fetched more.
898    // `Bytes` is Arc-backed so storing it (vs `Vec<u8>`) avoids the
899    // `.to_vec()` copy on every retry. The `&buf[header_size..]`
900    // re-borrow below auto-derefs through `Bytes`'s `Deref<Target=[u8]>`
901    // — we're not constructing a `Bytes::slice`, just indexing into
902    // the existing buffer.
903    let mut current_buffer: Option<Bytes> = None;
904    let mut current_end = pack_offset.saturating_add(raw.len() as u64);
905    let mut expansions = 0u32;
906    loop {
907        let compressed: &[u8] = match &current_buffer {
908            Some(buf) => &buf[header_size..],
909            None => &raw[header_size..],
910        };
911        match inflate_to(compressed, decompressed_size) {
912            Ok(v) => return Ok(v),
913            Err(InflateOutcome::NeedMoreInput) => {
914                if expansions >= MAX_RANGE_EXPANSIONS {
915                    return Err(PackchainError::MalformedPackEntry {
916                        offset: pack_offset,
917                        reason: "ran out of compressed bytes after maximum range expansion"
918                            .to_owned(),
919                    });
920                }
921                let next_size = ((current_end - pack_offset) * 2).min(MAX_RANGE_BYTES);
922                if next_size <= current_end - pack_offset {
923                    return Err(PackchainError::MalformedPackEntry {
924                        offset: pack_offset,
925                        reason: "range expansion hit safety cap".to_owned(),
926                    });
927                }
928                let new_end = pack_offset + next_size;
929                let pack = pack_key(prefix, content_sha);
930                let bytes = match store.get_bytes_range(&pack, pack_offset..new_end).await {
931                    Ok(b) => b,
932                    Err(ObjectStoreError::NotFound(_)) => {
933                        return Err(PackchainError::PackMissing { key: pack });
934                    }
935                    Err(ObjectStoreError::RangeNotSatisfiable { .. }) => {
936                        return Err(PackchainError::MalformedPackEntry {
937                            offset: pack_offset,
938                            reason: "zlib stream truncated at pack EOF".to_owned(),
939                        });
940                    }
941                    Err(e) => return Err(PackchainError::Store(e)),
942                };
943                current_buffer = Some(bytes);
944                current_end = new_end;
945                expansions += 1;
946            }
947            Err(InflateOutcome::Failed) => {
948                return Err(PackchainError::Decompress {
949                    offset: pack_offset,
950                });
951            }
952        }
953    }
954}
955
956/// One-shot zlib inflate into a buffer of the announced decompressed
957/// size. `gix_features::zlib::Inflate` handles the actual decode; the
958/// outer return distinguishes "need more input" (caller can widen the
959/// range) from "stream is broken".
960fn inflate_to(input: &[u8], announced_size: usize) -> Result<Vec<u8>, InflateOutcome> {
961    use gix::features::zlib::{FlushDecompress, Status};
962
963    let mut state = gix::features::zlib::Decompress::new();
964    let mut out = vec![0u8; announced_size];
965    match state.decompress(input, &mut out, FlushDecompress::Finish) {
966        Ok(Status::StreamEnd) => {
967            let produced =
968                usize::try_from(state.total_out()).map_err(|_| InflateOutcome::Failed)?;
969            if produced != announced_size {
970                return Err(InflateOutcome::Failed);
971            }
972            Ok(out)
973        }
974        Ok(Status::Ok | Status::BufError) => Err(InflateOutcome::NeedMoreInput),
975        Err(_) => Err(InflateOutcome::Failed),
976    }
977}
978
979enum InflateOutcome {
980    NeedMoreInput,
981    Failed,
982}
983
984/// Apply a git pack-format delta to `base`, returning the
985/// reconstituted object with the same kind as `base`.
986fn apply_delta(base: &ResolvedObject, delta: &[u8]) -> Result<ResolvedObject, PackchainError> {
987    let mut cursor = 0usize;
988    let (src_size, n) = read_size_varint(delta, cursor).ok_or(PackchainError::MalformedDelta {
989        reason: "truncated source size header",
990    })?;
991    cursor += n;
992    let (dst_size, n) = read_size_varint(delta, cursor).ok_or(PackchainError::MalformedDelta {
993        reason: "truncated destination size header",
994    })?;
995    cursor += n;
996    if src_size != base.payload.len() as u64 {
997        return Err(PackchainError::MalformedDelta {
998            reason: "delta source size does not match base object size",
999        });
1000    }
1001    // Cap dst-size before allocating: it comes from the delta's
1002    // varint header (attacker-controlled in a malicious bucket).
1003    // Without this guard, `Vec::with_capacity(huge)` would panic
1004    // or thrash. Same cap as the entry-header path uses.
1005    if dst_size > MAX_DECOMPRESSED_BYTES {
1006        return Err(PackchainError::MalformedDelta {
1007            reason: "delta destination size exceeds 1 GiB cap",
1008        });
1009    }
1010    let dst_size_usize = usize::try_from(dst_size).map_err(|_| PackchainError::MalformedDelta {
1011        reason: "delta destination size exceeds usize",
1012    })?;
1013    let mut out = Vec::with_capacity(dst_size_usize);
1014    while cursor < delta.len() {
1015        let op = delta[cursor];
1016        cursor += 1;
1017        if op & 0x80 != 0 {
1018            apply_delta_copy_op(op, delta, &mut cursor, &base.payload, &mut out)?;
1019        } else if op == 0 {
1020            return Err(PackchainError::MalformedDelta {
1021                reason: "reserved zero opcode",
1022            });
1023        } else {
1024            apply_delta_insert_op(op, delta, &mut cursor, &mut out)?;
1025        }
1026        // Bound per-op growth so a malicious delta cannot grow `out`
1027        // without limit between the dst_size header check and the
1028        // post-loop equality check. Mirrors git's `patch-delta.c`
1029        // `size -= cp_size` invariant (any op that would push past
1030        // the announced destination size is rejected immediately).
1031        if out.len() > dst_size_usize {
1032            return Err(PackchainError::MalformedDelta {
1033                reason: "produced object exceeds announced destination size",
1034            });
1035        }
1036    }
1037    if out.len() as u64 != dst_size {
1038        return Err(PackchainError::MalformedDelta {
1039            reason: "produced object does not match announced destination size",
1040        });
1041    }
1042    Ok(ResolvedObject {
1043        payload: out,
1044        kind: base.kind,
1045    })
1046}
1047
1048/// Decode a packed-bitfield operand: for each set bit in `bitmask`,
1049/// consume the next byte of `delta` and OR it in at `bit_index * 8`.
1050fn read_packed_operand(
1051    delta: &[u8],
1052    cursor: &mut usize,
1053    bitmask: u8,
1054    bits: u8,
1055    truncated_reason: &'static str,
1056) -> Result<u32, PackchainError> {
1057    let mut value = 0u32;
1058    for shift in 0..bits {
1059        if bitmask & (1 << shift) != 0 {
1060            let byte = *delta.get(*cursor).ok_or(PackchainError::MalformedDelta {
1061                reason: truncated_reason,
1062            })?;
1063            value |= u32::from(byte) << (u32::from(shift) * 8);
1064            *cursor += 1;
1065        }
1066    }
1067    Ok(value)
1068}
1069
1070/// Git's documented default copy size when the delta's size operand is
1071/// zero (`pack-format.txt`: "if the size is zero, it is assumed to be
1072/// 0x10000").
1073const GIT_DELTA_DEFAULT_COPY_SIZE: u32 = 0x1_0000;
1074
1075/// Handle a copy-from-base opcode (high bit set). Low 4 bits of `op`
1076/// signal which offset bytes follow; the next 3 bits signal which size
1077/// bytes follow. A zero size means git's documented default
1078/// ([`GIT_DELTA_DEFAULT_COPY_SIZE`]).
1079fn apply_delta_copy_op(
1080    op: u8,
1081    delta: &[u8],
1082    cursor: &mut usize,
1083    base: &[u8],
1084    out: &mut Vec<u8>,
1085) -> Result<(), PackchainError> {
1086    let copy_offset = read_packed_operand(delta, cursor, op, 4, "truncated delta copy offset")?;
1087    let mut copy_size =
1088        read_packed_operand(delta, cursor, op >> 4, 3, "truncated delta copy size")?;
1089    if copy_size == 0 {
1090        copy_size = GIT_DELTA_DEFAULT_COPY_SIZE;
1091    }
1092    let start = copy_offset as usize;
1093    let end = start
1094        .checked_add(copy_size as usize)
1095        .ok_or(PackchainError::MalformedDelta {
1096            reason: "copy span overflow",
1097        })?;
1098    if end > base.len() {
1099        return Err(PackchainError::MalformedDelta {
1100            reason: "copy span exceeds base object",
1101        });
1102    }
1103    out.extend_from_slice(&base[start..end]);
1104    Ok(())
1105}
1106
1107/// Handle an insert opcode. Low 7 bits of `op` are the literal length;
1108/// that many bytes follow and are copied verbatim into `out`.
1109fn apply_delta_insert_op(
1110    op: u8,
1111    delta: &[u8],
1112    cursor: &mut usize,
1113    out: &mut Vec<u8>,
1114) -> Result<(), PackchainError> {
1115    let len = op as usize;
1116    let end = cursor
1117        .checked_add(len)
1118        .ok_or(PackchainError::MalformedDelta {
1119            reason: "insert span overflow",
1120        })?;
1121    if end > delta.len() {
1122        return Err(PackchainError::MalformedDelta {
1123            reason: "insert span exceeds delta payload",
1124        });
1125    }
1126    out.extend_from_slice(&delta[*cursor..end]);
1127    *cursor = end;
1128    Ok(())
1129}
1130
1131/// Read the variable-length size encoding used at the head of a delta
1132/// payload (LEB128-ish: 7 bits per byte, MSB = continuation).
1133fn read_size_varint(data: &[u8], mut cursor: usize) -> Option<(u64, usize)> {
1134    let start = cursor;
1135    let mut value: u64 = 0;
1136    let mut shift = 0u32;
1137    loop {
1138        let byte = *data.get(cursor)?;
1139        cursor += 1;
1140        value |= u64::from(byte & 0x7f).checked_shl(shift)?;
1141        if byte & 0x80 == 0 {
1142            return Some((value, cursor - start));
1143        }
1144        shift += 7;
1145        if shift >= 64 {
1146            return None;
1147        }
1148    }
1149}
1150
1151#[cfg(test)]
1152mod tests {
1153    use super::*;
1154
1155    fn sha40(s: &str) -> Sha40 {
1156        Sha40::try_new(s).expect("test fixture sha is valid")
1157    }
1158
1159    #[test]
1160    fn parse_path_rejects_empty() {
1161        let err = parse_path("").unwrap_err();
1162        assert!(matches!(err, PackchainError::MalformedPath { .. }));
1163    }
1164
1165    #[test]
1166    fn parse_path_rejects_absolute() {
1167        let err = parse_path("/etc/passwd").unwrap_err();
1168        let PackchainError::MalformedPath { reason, .. } = err else {
1169            panic!("expected MalformedPath");
1170        };
1171        assert!(reason.contains("absolute"));
1172    }
1173
1174    #[test]
1175    fn parse_path_rejects_dotdot() {
1176        let err = parse_path("src/../etc").unwrap_err();
1177        assert!(matches!(err, PackchainError::MalformedPath { .. }));
1178    }
1179
1180    #[test]
1181    fn parse_path_rejects_dot() {
1182        let err = parse_path("./src").unwrap_err();
1183        assert!(matches!(err, PackchainError::MalformedPath { .. }));
1184    }
1185
1186    #[test]
1187    fn parse_path_rejects_double_slash() {
1188        let err = parse_path("src//main.rs").unwrap_err();
1189        assert!(matches!(err, PackchainError::MalformedPath { .. }));
1190    }
1191
1192    #[test]
1193    fn parse_path_rejects_trailing_slash() {
1194        let err = parse_path("src/main.rs/").unwrap_err();
1195        assert!(matches!(err, PackchainError::MalformedPath { .. }));
1196    }
1197
1198    #[test]
1199    fn parse_path_accepts_nested() {
1200        let segs = parse_path("src/lib/mod.rs").unwrap();
1201        assert_eq!(segs, vec!["src", "lib", "mod.rs"]);
1202    }
1203
1204    #[test]
1205    fn parse_path_accepts_single_segment() {
1206        let segs = parse_path("Cargo.toml").unwrap();
1207        assert_eq!(segs, vec!["Cargo.toml"]);
1208    }
1209
1210    const SHA_A: &str = "0123456789abcdef0123456789abcdef01234567";
1211    const SHA_B: &str = "fedcba9876543210fedcba9876543210fedcba98";
1212    const SHA_C: &str = "1111111111111111111111111111111111111111";
1213
1214    #[test]
1215    fn walk_path_finds_top_level_blob() {
1216        let mut tree = BTreeMap::new();
1217        tree.insert("Cargo.toml".to_owned(), PathNode::Blob(sha40(SHA_A)));
1218        let segs = parse_path("Cargo.toml").unwrap();
1219        let result = walk_path(&tree, &segs, "refs/heads/main", "Cargo.toml").unwrap();
1220        assert_eq!(result.as_str(), SHA_A);
1221    }
1222
1223    #[test]
1224    fn walk_path_descends_subtree() {
1225        let mut subtree = BTreeMap::new();
1226        subtree.insert("main.rs".to_owned(), PathNode::Blob(sha40(SHA_A)));
1227        let mut tree = BTreeMap::new();
1228        tree.insert("src".to_owned(), PathNode::Tree(subtree));
1229        let segs = parse_path("src/main.rs").unwrap();
1230        let result = walk_path(&tree, &segs, "refs/heads/main", "src/main.rs").unwrap();
1231        assert_eq!(result.as_str(), SHA_A);
1232    }
1233
1234    #[test]
1235    fn walk_path_missing_returns_path_not_found() {
1236        let mut tree = BTreeMap::new();
1237        tree.insert("Cargo.toml".to_owned(), PathNode::Blob(sha40(SHA_A)));
1238        let segs = parse_path("missing.txt").unwrap();
1239        let err = walk_path(&tree, &segs, "refs/heads/main", "missing.txt").unwrap_err();
1240        assert!(matches!(err, PackchainError::PathNotFound { .. }));
1241    }
1242
1243    #[test]
1244    fn walk_path_directory_returns_path_not_a_blob() {
1245        let mut subtree = BTreeMap::new();
1246        subtree.insert("main.rs".to_owned(), PathNode::Blob(sha40(SHA_A)));
1247        let mut tree = BTreeMap::new();
1248        tree.insert("src".to_owned(), PathNode::Tree(subtree));
1249        let segs = parse_path("src").unwrap();
1250        let err = walk_path(&tree, &segs, "refs/heads/main", "src").unwrap_err();
1251        assert!(matches!(err, PackchainError::PathNotABlob { .. }));
1252    }
1253
1254    #[test]
1255    fn walk_path_through_blob_returns_not_found() {
1256        let mut tree = BTreeMap::new();
1257        tree.insert("Cargo.toml".to_owned(), PathNode::Blob(sha40(SHA_A)));
1258        let segs = parse_path("Cargo.toml/extra").unwrap();
1259        let err = walk_path(&tree, &segs, "refs/heads/main", "Cargo.toml/extra").unwrap_err();
1260        assert!(matches!(err, PackchainError::PathNotFound { .. }));
1261    }
1262
1263    #[test]
1264    fn read_size_varint_single_byte() {
1265        let (v, n) = read_size_varint(&[0x05], 0).unwrap();
1266        assert_eq!(v, 5);
1267        assert_eq!(n, 1);
1268    }
1269
1270    #[test]
1271    fn read_size_varint_multi_byte() {
1272        // 0x83 = 0b10000011 → low 7 bits 3, continuation set.
1273        // 0x02 = 0b00000010 → low 7 bits 2, no continuation.
1274        // Decoded: 3 | (2 << 7) = 3 | 256 = 259.
1275        let (v, n) = read_size_varint(&[0x83, 0x02], 0).unwrap();
1276        assert_eq!(v, 259);
1277        assert_eq!(n, 2);
1278    }
1279
1280    #[test]
1281    fn read_size_varint_truncated() {
1282        // Continuation bit set on last available byte.
1283        assert!(read_size_varint(&[0x80], 0).is_none());
1284    }
1285
1286    #[test]
1287    fn cache_default_starts_empty() {
1288        // Capacity (`capacity_bytes`) is not part of the public API
1289        // surface, so this test only covers what is observable: a
1290        // freshly-defaulted cache has zero entries and zero resident
1291        // bytes. The 64 MiB default value itself is checked by the
1292        // single-entry budget check in `cache_default_rejects_oversize_entry`.
1293        let cache = PackIndexCache::default();
1294        assert_eq!(cache.len(), 0);
1295        assert!(cache.is_empty());
1296        assert_eq!(cache.resident_bytes(), 0);
1297    }
1298
1299    /// Pin the default capacity (`DEFAULT_CACHE_CAPACITY_BYTES`) by
1300    /// observing the boundary the public API exposes: an entry one
1301    /// byte over the documented 64 MiB cap is silently rejected, an
1302    /// entry exactly at the cap is accepted. A regression that
1303    /// changed the default to a different power of two would flip
1304    /// one of these two assertions.
1305    #[test]
1306    fn cache_default_enforces_64mib_capacity() {
1307        let cache = PackIndexCache::default();
1308        // Just over: rejected.
1309        cache.insert(
1310            ("p".into(), sha40(SHA_A)),
1311            Arc::new(make_dummy_index(DEFAULT_CACHE_CAPACITY_BYTES + 1)),
1312        );
1313        assert_eq!(cache.len(), 0, "entry over 64 MiB must be rejected");
1314        // Exactly at: accepted.
1315        cache.insert(
1316            ("p".into(), sha40(SHA_B)),
1317            Arc::new(make_dummy_index(DEFAULT_CACHE_CAPACITY_BYTES)),
1318        );
1319        assert_eq!(cache.len(), 1, "entry at 64 MiB must be accepted");
1320    }
1321
1322    #[test]
1323    fn cache_explicit_capacity_zero_disables_caching() {
1324        let cache = PackIndexCache::new(0);
1325        // Inserting any non-empty entry must be a no-op (single-entry
1326        // budget check).
1327        let dummy = make_dummy_index(1_024);
1328        cache.insert(("p".into(), sha40(SHA_A)), Arc::new(dummy));
1329        assert_eq!(cache.len(), 0);
1330    }
1331
1332    #[test]
1333    fn cache_evicts_lru_when_over_capacity() {
1334        let cache = PackIndexCache::new(3_000);
1335        cache.insert(
1336            ("p".into(), sha40(SHA_A)),
1337            Arc::new(make_dummy_index(1_000)),
1338        );
1339        cache.insert(
1340            ("p".into(), sha40(SHA_B)),
1341            Arc::new(make_dummy_index(1_000)),
1342        );
1343        cache.insert(
1344            ("p".into(), sha40(SHA_C)),
1345            Arc::new(make_dummy_index(1_000)),
1346        );
1347        assert_eq!(cache.len(), 3);
1348        assert_eq!(cache.resident_bytes(), 3_000);
1349
1350        // Touch SHA_A so SHA_B becomes LRU. Then insert a fourth entry
1351        // that pushes us over capacity — SHA_B must be evicted.
1352        let _ = cache.get(&("p".into(), sha40(SHA_A)));
1353        cache.insert(
1354            (
1355                "p".into(),
1356                sha40("dddddddddddddddddddddddddddddddddddddddd"),
1357            ),
1358            Arc::new(make_dummy_index(1_000)),
1359        );
1360        assert_eq!(cache.len(), 3);
1361        assert!(cache.get(&("p".into(), sha40(SHA_A))).is_some());
1362        assert!(cache.get(&("p".into(), sha40(SHA_B))).is_none());
1363    }
1364
1365    #[test]
1366    fn cache_repeated_inserts_replace_accounting() {
1367        let cache = PackIndexCache::new(10_000);
1368        let key: CacheKey = ("p".into(), sha40(SHA_A));
1369        cache.insert(key.clone(), Arc::new(make_dummy_index(1_000)));
1370        cache.insert(key.clone(), Arc::new(make_dummy_index(2_500)));
1371        assert_eq!(cache.len(), 1);
1372        assert_eq!(cache.resident_bytes(), 2_500);
1373    }
1374
1375    /// Construct a [`CachedIndex`] without a real .idx file, only for
1376    /// exercising the LRU bookkeeping. The `file` field is left
1377    /// uninitialised by parsing a minimal hand-crafted v2 idx; this is
1378    /// not used by the cache-mechanics tests.
1379    fn make_dummy_index(bytes: u64) -> CachedIndex {
1380        // A minimal v2 idx that gix_pack accepts: signature, version,
1381        // 256 fan-out entries (all zero — zero objects), and a 20-byte
1382        // pack-trailer + 20-byte idx-trailer at the end.
1383        let mut data = Vec::with_capacity(8 + 256 * 4 + 40);
1384        data.extend_from_slice(b"\xfftOc"); // V2 signature
1385        data.extend_from_slice(&2u32.to_be_bytes()); // version 2
1386        for _ in 0..256 {
1387            data.extend_from_slice(&0u32.to_be_bytes()); // fan-out: 0 objects under each leading byte
1388        }
1389        data.extend_from_slice(&[0u8; 20]); // pack trailer placeholder
1390        data.extend_from_slice(&[0u8; 20]); // idx trailer placeholder
1391        let file = gix_pack::index::File::from_data(
1392            data,
1393            std::path::PathBuf::from("dummy.idx"),
1394            gix_hash::Kind::Sha1,
1395        )
1396        .expect("hand-crafted minimal v2 idx parses");
1397        CachedIndex {
1398            file,
1399            sorted_offsets: Vec::new(),
1400            bytes,
1401        }
1402    }
1403
1404    #[test]
1405    fn sha40_to_object_id_roundtrips() {
1406        let sha = sha40(SHA_A);
1407        let oid = sha40_to_object_id(&sha);
1408        assert_eq!(oid.to_string(), SHA_A);
1409    }
1410
1411    // --- apply_delta -------------------------------------------------------
1412    //
1413    // Hand-craft delta payloads (per the git delta format) and verify
1414    // [`apply_delta`] reconstructs the right output. Without this,
1415    // OFS_DELTA / REF_DELTA paths in [`decode_entry`] are not exercised
1416    // by any test — the integration suite uses small text files that
1417    // gix-pack does not delta-encode.
1418
1419    fn base_blob(payload: &[u8]) -> ResolvedObject {
1420        ResolvedObject {
1421            payload: payload.to_vec(),
1422            kind: ObjectKind::Blob,
1423        }
1424    }
1425
1426    /// Encode a single varint per the delta header format (LEB128-ish:
1427    /// 7 bits per byte, MSB = continuation).
1428    fn varint(mut value: u64) -> Vec<u8> {
1429        let mut out = Vec::new();
1430        loop {
1431            let byte = (value & 0x7f) as u8;
1432            value >>= 7;
1433            if value == 0 {
1434                out.push(byte);
1435                return out;
1436            }
1437            out.push(byte | 0x80);
1438        }
1439    }
1440
1441    #[test]
1442    fn apply_delta_insert_only_round_trips() {
1443        // Empty base, delta is pure-insert. Reconstructed payload
1444        // must be byte-equal to the literal data the insert opcode
1445        // carries.
1446        let base = base_blob(b"");
1447        let literal = b"Hello, packchain!";
1448        let mut delta = Vec::new();
1449        delta.extend_from_slice(&varint(0)); // src_size
1450        delta.extend_from_slice(&varint(literal.len() as u64)); // dst_size
1451        // Insert opcode: low 7 bits = literal length. The literal is
1452        // 17 bytes here, so the cast to u8 is the desired narrow.
1453        delta.push(u8::try_from(literal.len()).expect("test literal fits in 7 bits"));
1454        delta.extend_from_slice(literal);
1455        let out = apply_delta(&base, &delta).expect("insert-only delta applies");
1456        assert_eq!(out.payload, literal);
1457        assert_eq!(out.kind, ObjectKind::Blob);
1458    }
1459
1460    #[test]
1461    fn apply_delta_copy_only_round_trips() {
1462        // Copy first 5 bytes from a 10-byte base.
1463        let base = base_blob(b"abcdefghij");
1464        let mut delta = Vec::new();
1465        delta.extend_from_slice(&varint(10)); // src_size
1466        delta.extend_from_slice(&varint(5)); // dst_size
1467        // Copy opcode: MSB=1; bit0 set (1 byte of offset follows);
1468        // bit4 set (1 byte of size follows).
1469        delta.push(0b1001_0001);
1470        delta.push(0); // offset = 0
1471        delta.push(5); // size = 5
1472        let out = apply_delta(&base, &delta).expect("copy-only delta applies");
1473        assert_eq!(out.payload, b"abcde");
1474    }
1475
1476    #[test]
1477    fn apply_delta_mixed_copy_and_insert_round_trips() {
1478        // Reconstruct "HELLO world" by copying "HELLO" from the base
1479        // and inserting " world".
1480        let base = base_blob(b"HELLO!?");
1481        let mut delta = Vec::new();
1482        delta.extend_from_slice(&varint(7)); // src_size
1483        delta.extend_from_slice(&varint(11)); // dst_size: "HELLO world"
1484        // Copy 5 bytes from offset 0.
1485        delta.push(0b1001_0001);
1486        delta.push(0);
1487        delta.push(5);
1488        // Insert 6 literal bytes.
1489        let literal = b" world";
1490        delta.push(u8::try_from(literal.len()).expect("test literal fits in 7 bits"));
1491        delta.extend_from_slice(literal);
1492        let out = apply_delta(&base, &delta).expect("mixed delta applies");
1493        assert_eq!(out.payload, b"HELLO world");
1494    }
1495
1496    #[test]
1497    fn apply_delta_preserves_base_kind() {
1498        // A delta against a Tree base must produce a Tree result —
1499        // delta application doesn't change object kind. Confirms the
1500        // `kind: base.kind` line at the bottom of `apply_delta`.
1501        let base = ResolvedObject {
1502            payload: b"x".to_vec(),
1503            kind: ObjectKind::Tree,
1504        };
1505        let mut delta = Vec::new();
1506        delta.extend_from_slice(&varint(1));
1507        delta.extend_from_slice(&varint(1));
1508        delta.push(0b1001_0001);
1509        delta.push(0);
1510        delta.push(1);
1511        let out = apply_delta(&base, &delta).expect("kind-preserving delta applies");
1512        assert_eq!(out.kind, ObjectKind::Tree);
1513    }
1514
1515    #[test]
1516    fn apply_delta_rejects_source_size_mismatch() {
1517        // Delta claims source size 99, base is 1 byte. Must reject
1518        // before producing output.
1519        let base = base_blob(b"x");
1520        let mut delta = Vec::new();
1521        delta.extend_from_slice(&varint(99));
1522        delta.extend_from_slice(&varint(1));
1523        delta.push(1);
1524        delta.push(b'y');
1525        let err = apply_delta(&base, &delta).expect_err("size mismatch must fail");
1526        assert!(
1527            matches!(err, PackchainError::MalformedDelta { reason } if reason.contains("source size")),
1528            "expected MalformedDelta source-size mismatch, got {err:?}",
1529        );
1530    }
1531
1532    #[test]
1533    fn apply_delta_rejects_copy_past_base_end() {
1534        // Copy opcode asks for bytes [3..8) from a 4-byte base. Bounds
1535        // check must fire.
1536        let base = base_blob(b"abcd");
1537        let mut delta = Vec::new();
1538        delta.extend_from_slice(&varint(4));
1539        delta.extend_from_slice(&varint(5));
1540        delta.push(0b1001_0001);
1541        delta.push(3); // offset = 3
1542        delta.push(5); // size = 5 → end = 8 > 4
1543        let err = apply_delta(&base, &delta).expect_err("out-of-range copy must fail");
1544        assert!(
1545            matches!(err, PackchainError::MalformedDelta { reason } if reason.contains("copy span")),
1546            "expected MalformedDelta copy-span error, got {err:?}",
1547        );
1548    }
1549
1550    #[test]
1551    fn apply_delta_rejects_dst_size_over_cap() {
1552        // dst_size header above MAX_DECOMPRESSED_BYTES must reject
1553        // before allocating.
1554        let base = base_blob(b"");
1555        let mut delta = Vec::new();
1556        delta.extend_from_slice(&varint(0));
1557        delta.extend_from_slice(&varint(MAX_DECOMPRESSED_BYTES + 1));
1558        let err = apply_delta(&base, &delta).expect_err("oversize dst must fail");
1559        assert!(
1560            matches!(err, PackchainError::MalformedDelta { reason } if reason.contains("1 GiB cap")),
1561            "expected MalformedDelta cap error, got {err:?}",
1562        );
1563    }
1564
1565    #[test]
1566    fn apply_delta_rejects_reserved_zero_opcode() {
1567        // 0x00 is reserved per the git delta format.
1568        let base = base_blob(b"");
1569        let mut delta = Vec::new();
1570        delta.extend_from_slice(&varint(0));
1571        delta.extend_from_slice(&varint(0));
1572        delta.push(0); // reserved opcode
1573        let err = apply_delta(&base, &delta).expect_err("reserved opcode must fail");
1574        assert!(
1575            matches!(err, PackchainError::MalformedDelta { reason } if reason.contains("zero opcode")),
1576            "expected MalformedDelta reserved-opcode error, got {err:?}",
1577        );
1578    }
1579
1580    #[test]
1581    fn apply_delta_copy_size_zero_substitutes_default() {
1582        // Copy opcode with NO size operand bytes (high bits 4..6 of op
1583        // all zero) — the decoded size is zero, which per git's spec
1584        // must be substituted with `GIT_DELTA_DEFAULT_COPY_SIZE`
1585        // (0x10000). We can't easily verify the produced length without
1586        // a >=64 KiB base, so use a small base and check the substitution
1587        // fires by observing the bounds-check failure: with the default
1588        // substituted, the span (offset 0, size 0x10000) overshoots the
1589        // 1-byte base and triggers "copy span exceeds base object". If
1590        // the substitution were skipped, the span would be empty and
1591        // the post-loop "destination size" check would fire instead.
1592        let base = base_blob(b"x");
1593        let mut delta = Vec::new();
1594        delta.extend_from_slice(&varint(1));
1595        delta.extend_from_slice(&varint(2)); // dst_size irrelevant; copy errors first
1596        // Copy opcode: MSB=1, bit0 set (1 byte of offset follows), all
1597        // size bits (4..6) cleared.
1598        delta.push(0b1000_0001);
1599        delta.push(0); // offset = 0
1600        let err = apply_delta(&base, &delta)
1601            .expect_err("default-size substitution must fail bounds check");
1602        assert!(
1603            matches!(&err, PackchainError::MalformedDelta { reason } if reason.contains("copy span exceeds base")),
1604            "expected copy-span-exceeds-base (proves default size was substituted), got {err:?}",
1605        );
1606    }
1607
1608    #[test]
1609    fn apply_delta_rejects_dst_size_undershoot() {
1610        // delta finishes (no more opcodes) but produced output is
1611        // shorter than the announced dst_size. The post-loop check
1612        // must catch this.
1613        let base = base_blob(b"abcdef");
1614        let mut delta = Vec::new();
1615        delta.extend_from_slice(&varint(6));
1616        delta.extend_from_slice(&varint(10)); // claim 10
1617        // ... but only emit 3 bytes via copy.
1618        delta.push(0b1001_0001);
1619        delta.push(0);
1620        delta.push(3);
1621        let err = apply_delta(&base, &delta).expect_err("undershoot must fail");
1622        assert!(
1623            matches!(err, PackchainError::MalformedDelta { reason } if reason.contains("destination size")),
1624            "expected MalformedDelta undershoot error, got {err:?}",
1625        );
1626    }
1627
1628    #[test]
1629    fn apply_delta_rejects_overshoot() {
1630        // Delta announces dst_size=4 but emits 8 bytes via a single
1631        // copy op. Without the per-op bound, `out` would grow past
1632        // the announced size and only get caught by the post-loop
1633        // equality check — a malicious delta with a multi-TiB total
1634        // could OOM the helper before reaching that point. The
1635        // per-op bound must reject as soon as `out.len()` exceeds
1636        // `dst_size_usize`.
1637        let base = base_blob(b"abcdefgh");
1638        let mut delta = Vec::new();
1639        delta.extend_from_slice(&varint(8)); // src_size
1640        delta.extend_from_slice(&varint(4)); // dst_size — under-claim
1641        // Copy 8 bytes from offset 0 (overshoots dst_size).
1642        delta.push(0b1001_0001);
1643        delta.push(0);
1644        delta.push(8);
1645        let err = apply_delta(&base, &delta).expect_err("overshoot must fail");
1646        assert!(
1647            matches!(
1648                err,
1649                PackchainError::MalformedDelta {
1650                    reason: "produced object exceeds announced destination size"
1651                }
1652            ),
1653            "expected MalformedDelta overshoot error, got {err:?}",
1654        );
1655    }
1656
1657    #[test]
1658    fn apply_delta_overshoot_check_fires_after_single_default_size_copy() {
1659        // Aggressive variant: small base (1 byte, repeated 16 times so
1660        // copy size 0x1_0000 stays in-bounds against the base), and a
1661        // copy opcode with size operand bits cleared so it falls back
1662        // to GIT_DELTA_DEFAULT_COPY_SIZE (0x1_0000 = 64 KiB). A single
1663        // op therefore emits 64 KiB, which must trip the per-op bound
1664        // when dst_size is set to 4. This proves the check fires
1665        // after the FIRST op, not just at end-of-loop — a chain of
1666        // such ops in a real attack would otherwise blow through
1667        // memory before the post-loop check ever ran.
1668        // Heap allocation avoids large_stack_arrays clippy lint (16 KiB cap).
1669        let base_payload = vec![b'x'; 0x1_0000];
1670        let base = base_blob(&base_payload);
1671        let mut delta = Vec::new();
1672        delta.extend_from_slice(&varint(0x1_0000)); // src_size matches base
1673        delta.extend_from_slice(&varint(4)); // dst_size — tiny
1674        // Copy opcode: MSB=1, bit0 set (1 byte of offset follows),
1675        // size bits (4..6) cleared so default 0x1_0000 substitutes.
1676        delta.push(0b1000_0001);
1677        delta.push(0); // offset = 0
1678        let err = apply_delta(&base, &delta).expect_err("default-size overshoot must fail");
1679        assert!(
1680            matches!(
1681                err,
1682                PackchainError::MalformedDelta {
1683                    reason: "produced object exceeds announced destination size"
1684                }
1685            ),
1686            "expected MalformedDelta overshoot error after first op, got {err:?}",
1687        );
1688    }
1689
1690    #[test]
1691    fn apply_delta_exact_match_does_not_trip_overshoot_check() {
1692        // Boundary: a delta that exactly fills dst_size must succeed.
1693        // The per-op bound rejects only `>`, never `==`, so an exact
1694        // match flows through to the post-loop equality check.
1695        let base = base_blob(b"abcd");
1696        let mut delta = Vec::new();
1697        delta.extend_from_slice(&varint(4));
1698        delta.extend_from_slice(&varint(4));
1699        delta.push(0b1001_0001);
1700        delta.push(0);
1701        delta.push(4);
1702        let out = apply_delta(&base, &delta).expect("exact-match delta applies");
1703        assert_eq!(out.payload, b"abcd");
1704    }
1705
1706    // --- delta-depth guard (issue #83) -------------------------------------
1707    //
1708    // The fix moves the depth guard from `read_object_from_chain` into
1709    // `decode_entry`, the single chokepoint every recursive resolution
1710    // path traverses. These tests exercise both the boundary and the
1711    // OFS_DELTA bypass that the old shape allowed.
1712    //
1713    // The synthesised packs use bounded payloads (small literal byte
1714    // strings), so the `as usize` / `as u8` casts cannot truncate at
1715    // runtime. Suppressing the lints here keeps the test setup direct;
1716    // production code paths use `try_from`.
1717
1718    use crate::object_store::mock::MockStore;
1719    use flate2::Compression;
1720    use flate2::write::ZlibEncoder;
1721    use std::io::Write;
1722
1723    /// Encode a pack-entry header per gix-pack's canonical encoding:
1724    /// 4-bit type tag + 4-bit low size, then 7-bit continuation bytes
1725    /// for the upper bits of `size`. This is the inverse of
1726    /// `gix_pack::data::entry::decode::parse_header_info`.
1727    #[allow(clippy::cast_possible_truncation)]
1728    fn encode_pack_entry_header(type_id: u8, mut size: u64) -> Vec<u8> {
1729        let mut out = Vec::new();
1730        let low4 = (size & 0x0f) as u8;
1731        size >>= 4;
1732        let mut byte = (type_id << 4) | low4;
1733        if size != 0 {
1734            byte |= 0x80;
1735        }
1736        out.push(byte);
1737        while size != 0 {
1738            let mut next = (size & 0x7f) as u8;
1739            size >>= 7;
1740            if size != 0 {
1741                next |= 0x80;
1742            }
1743            out.push(next);
1744        }
1745        out
1746    }
1747
1748    /// Encode an `OFS_DELTA` `base_distance` per the gix-pack
1749    /// `parse_leb64` shape (offset-LEB128, with implicit `+1` between
1750    /// continuation bytes).
1751    #[allow(clippy::cast_possible_truncation)]
1752    fn encode_ofs_delta_distance(distance: u64) -> Vec<u8> {
1753        // The decoder's invariant: `value = ((((b0 & 0x7f) + 1) << 7) |
1754        // (b1 & 0x7f) + 1) << 7) | ...` — i.e. each continuation step
1755        // adds one before shifting. Build the byte sequence by
1756        // repeatedly subtracting one and shifting right, so the decoder
1757        // reconstructs the original distance.
1758        let mut bytes = Vec::new();
1759        let mut v = distance;
1760        bytes.push((v & 0x7f) as u8);
1761        v >>= 7;
1762        while v != 0 {
1763            v -= 1;
1764            bytes.push(((v & 0x7f) as u8) | 0x80);
1765            v >>= 7;
1766        }
1767        bytes.reverse();
1768        bytes
1769    }
1770
1771    fn zlib_compress(data: &[u8]) -> Vec<u8> {
1772        let mut e = ZlibEncoder::new(Vec::new(), Compression::default());
1773        e.write_all(data).expect("zlib encode");
1774        e.finish().expect("zlib finish")
1775    }
1776
1777    /// Build a delta payload with the canonical varint header and a
1778    /// single-insert opcode that copies `payload` literally. The result
1779    /// reconstructs to `payload` regardless of the base content (the
1780    /// source-size check still runs against the base, so callers must
1781    /// pass `base_size` matching their base).
1782    #[allow(clippy::cast_possible_truncation)]
1783    fn make_insert_delta(base_size: u64, payload: &[u8]) -> Vec<u8> {
1784        let mut d = Vec::new();
1785        // src_size and dst_size as size-varint (low-bit-first, MSB
1786        // continuation), per `read_size_varint`.
1787        let put_varint = |mut v: u64, buf: &mut Vec<u8>| loop {
1788            let byte = (v & 0x7f) as u8;
1789            v >>= 7;
1790            if v == 0 {
1791                buf.push(byte);
1792                return;
1793            }
1794            buf.push(byte | 0x80);
1795        };
1796        put_varint(base_size, &mut d);
1797        put_varint(payload.len() as u64, &mut d);
1798        // Insert opcode: low 7 bits are length. Tests use small literals.
1799        assert!(payload.len() < 0x80, "test literal too long for one insert");
1800        d.push(payload.len() as u8);
1801        d.extend_from_slice(payload);
1802        d
1803    }
1804
1805    /// Append a complete pack entry (header + zlib-compressed payload)
1806    /// to `pack` and record the entry's start offset in `offsets`.
1807    #[allow(clippy::cast_possible_truncation)]
1808    fn push_pack_entry(
1809        pack: &mut Vec<u8>,
1810        offsets: &mut Vec<u64>,
1811        type_id: u8,
1812        ofs_delta_distance: Option<u64>,
1813        decompressed_payload: &[u8],
1814    ) {
1815        let start = pack.len() as u64;
1816        offsets.push(start);
1817        pack.extend(encode_pack_entry_header(
1818            type_id,
1819            decompressed_payload.len() as u64,
1820        ));
1821        if let Some(d) = ofs_delta_distance {
1822            pack.extend(encode_ofs_delta_distance(d));
1823        }
1824        pack.extend(zlib_compress(decompressed_payload));
1825    }
1826
1827    /// Wire up a `PackIndexCache` with a hand-rolled `CachedIndex`
1828    /// whose only contract with the test is `sorted_offsets`. The
1829    /// stored idx file is a zero-entry v2 stub; tests call
1830    /// `decode_entry` directly so the file-side lookup is never used.
1831    fn install_cached_index(
1832        cache: &PackIndexCache,
1833        prefix: &str,
1834        content_sha: &Sha40,
1835        offsets: Vec<u64>,
1836    ) {
1837        let cached = CachedIndex {
1838            file: minimal_v2_idx(),
1839            sorted_offsets: offsets,
1840            bytes: 1_024,
1841        };
1842        cache.insert((prefix.to_owned(), content_sha.clone()), Arc::new(cached));
1843    }
1844
1845    fn minimal_v2_idx() -> gix_pack::index::File<Vec<u8>> {
1846        let mut data = Vec::with_capacity(8 + 256 * 4 + 40);
1847        data.extend_from_slice(b"\xfftOc");
1848        data.extend_from_slice(&2u32.to_be_bytes());
1849        for _ in 0..256 {
1850            data.extend_from_slice(&0u32.to_be_bytes());
1851        }
1852        data.extend_from_slice(&[0u8; 20]);
1853        data.extend_from_slice(&[0u8; 20]);
1854        gix_pack::index::File::from_data(
1855            data,
1856            std::path::PathBuf::from("dummy.idx"),
1857            gix_hash::Kind::Sha1,
1858        )
1859        .expect("hand-crafted minimal v2 idx parses")
1860    }
1861
1862    /// `decode_entry` is the single chokepoint for the depth budget:
1863    /// invoking it with `*depth > MAX_DELTA_DEPTH` must fail before
1864    /// any decode work happens. This is what catches a recursive
1865    /// caller (`REF_DELTA` via `read_object_from_chain`, or `OFS_DELTA`
1866    /// directly) blowing past the cap.
1867    #[tokio::test]
1868    async fn decode_entry_rejects_when_depth_already_over_cap() {
1869        let store = MockStore::new();
1870        let cache = PackIndexCache::default();
1871        let chain: Vec<ChainSegment> = Vec::new();
1872        let content_sha = sha40(SHA_A);
1873        // Any well-formed Blob entry — depth check fires first, so
1874        // contents are immaterial.
1875        let mut pack = Vec::new();
1876        let mut offsets = Vec::new();
1877        push_pack_entry(&mut pack, &mut offsets, 3 /* BLOB */, None, b"x");
1878
1879        let mut depth = MAX_DELTA_DEPTH + 1;
1880        let err = decode_entry(
1881            &store,
1882            None,
1883            &chain,
1884            &content_sha,
1885            offsets[0],
1886            &pack[usize::try_from(offsets[0]).unwrap()..],
1887            &cache,
1888            &mut depth,
1889        )
1890        .await
1891        .expect_err("over-cap depth must fail");
1892        assert!(
1893            matches!(err, PackchainError::DeltaTooDeep { max } if max == MAX_DELTA_DEPTH),
1894            "expected DeltaTooDeep, got {err:?}",
1895        );
1896    }
1897
1898    /// At exactly `*depth == MAX_DELTA_DEPTH` and a non-delta entry,
1899    /// `decode_entry` must succeed: a non-delta base reached at the
1900    /// boundary is the deepest legal point in the chain. This is the
1901    /// off-by-one boundary opposite the failing case above.
1902    #[tokio::test]
1903    async fn decode_entry_at_cap_with_non_delta_base_succeeds() {
1904        let store = MockStore::new();
1905        let cache = PackIndexCache::default();
1906        let chain: Vec<ChainSegment> = Vec::new();
1907        let content_sha = sha40(SHA_A);
1908        let mut pack = Vec::new();
1909        let mut offsets = Vec::new();
1910        push_pack_entry(
1911            &mut pack,
1912            &mut offsets,
1913            3, /* BLOB */
1914            None,
1915            b"deepest-base",
1916        );
1917
1918        let mut depth = MAX_DELTA_DEPTH;
1919        let resolved = decode_entry(
1920            &store,
1921            None,
1922            &chain,
1923            &content_sha,
1924            offsets[0],
1925            &pack[usize::try_from(offsets[0]).unwrap()..],
1926            &cache,
1927            &mut depth,
1928        )
1929        .await
1930        .expect("blob at MAX boundary must decode");
1931        assert_eq!(resolved.payload, b"deepest-base");
1932        assert_eq!(resolved.kind, ObjectKind::Blob);
1933    }
1934
1935    /// Pre-fix regression: a pure-`OFS_DELTA` chain bypassed the depth
1936    /// guard because the recursive call hopped through `decode_entry`
1937    /// directly without re-entering `read_object_from_chain`. With the
1938    /// guard moved into `decode_entry`, a 2-entry pack (a base blob +
1939    /// one `OFS_DELTA` pointing at it) entered with
1940    /// `depth = MAX_DELTA_DEPTH` must fail on the recursive call to the
1941    /// base, even though the outer entry is itself a single layer.
1942    ///
1943    /// Synthesised in-memory pack — does NOT actually approach a real
1944    /// stack-overflow depth, so this test would behave identically (and
1945    /// pass) on the unfixed code's `REF_DELTA` path. It catches the
1946    /// `OFS_DELTA` bypass specifically.
1947    #[tokio::test]
1948    async fn ofs_delta_recursion_consumes_depth_budget() {
1949        let store = MockStore::new();
1950        let cache = PackIndexCache::default();
1951        let chain: Vec<ChainSegment> = Vec::new();
1952        let content_sha = sha40(SHA_A);
1953
1954        let base_payload = b"base-blob";
1955        let mut pack = Vec::new();
1956        let mut offsets = Vec::new();
1957        // Entry 0: BLOB base.
1958        push_pack_entry(&mut pack, &mut offsets, 3, None, base_payload);
1959        // Entry 1: OFS_DELTA pointing back to entry 0. The encoded
1960        // distance per gix-pack is `entry_offset - base_offset`; entry
1961        // 1's start is `pack.len()` before the push, so capture it
1962        // explicitly.
1963        let delta = make_insert_delta(base_payload.len() as u64, b"reconstructed");
1964        let entry1_start = pack.len() as u64;
1965        let distance = entry1_start - offsets[0];
1966        push_pack_entry(&mut pack, &mut offsets, 6, Some(distance), &delta);
1967
1968        // Plant the pack body in the store under the canonical key so
1969        // `fetch_entry_bytes` can range-GET the base. The cache is
1970        // pre-populated with the offsets so no .idx round-trip is
1971        // needed.
1972        store.insert(pack_key(None, &content_sha), Bytes::from(pack.clone()));
1973        install_cached_index(&cache, "", &content_sha, offsets.clone());
1974
1975        // Enter at exactly MAX_DELTA_DEPTH so the OFS_DELTA pass bumps
1976        // the budget to MAX+1 and the recursive base decode trips the
1977        // guard.
1978        let mut depth = MAX_DELTA_DEPTH;
1979        let err = decode_entry(
1980            &store,
1981            None,
1982            &chain,
1983            &content_sha,
1984            offsets[1],
1985            &pack[usize::try_from(offsets[1]).unwrap()..],
1986            &cache,
1987            &mut depth,
1988        )
1989        .await
1990        .expect_err("OFS_DELTA recursion must trip the depth guard");
1991        assert!(
1992            matches!(err, PackchainError::DeltaTooDeep { max } if max == MAX_DELTA_DEPTH),
1993            "expected DeltaTooDeep from OFS_DELTA recursion, got {err:?}",
1994        );
1995    }
1996
1997    // --- terminal-entry size cap (issue #115) ------------------------------
1998    //
1999    // `fetch_entry_bytes` previously fell back to `get_bytes(&pack)` for
2000    // the last entry in a pack — unbounded by `MAX_RANGE_BYTES`. The fix
2001    // routes the terminal entry through a `HEAD` + ranged GET, enforcing
2002    // the same cap as non-terminal entries.
2003
2004    /// Delegates everything to an inner `MockStore` except `head`, which
2005    /// returns a synthetic `size` chosen by the test. Lets us exercise
2006    /// the `> MAX_RANGE_BYTES` branch without actually allocating a
2007    /// multi-GiB body.
2008    struct FakeSizeStore {
2009        inner: MockStore,
2010        fake_size: u64,
2011    }
2012
2013    #[async_trait::async_trait]
2014    impl ObjectStore for FakeSizeStore {
2015        async fn list(
2016            &self,
2017            prefix: &str,
2018        ) -> Result<Vec<crate::object_store::ObjectMeta>, ObjectStoreError> {
2019            self.inner.list(prefix).await
2020        }
2021        async fn get_to_file(
2022            &self,
2023            key: &str,
2024            dest: &std::path::Path,
2025            opts: crate::object_store::GetOpts,
2026        ) -> Result<(), ObjectStoreError> {
2027            self.inner.get_to_file(key, dest, opts).await
2028        }
2029        async fn get_bytes(&self, key: &str) -> Result<Bytes, ObjectStoreError> {
2030            self.inner.get_bytes(key).await
2031        }
2032        async fn get_bytes_range(
2033            &self,
2034            key: &str,
2035            range: std::ops::Range<u64>,
2036        ) -> Result<Bytes, ObjectStoreError> {
2037            self.inner.get_bytes_range(key, range).await
2038        }
2039        async fn put_bytes(
2040            &self,
2041            key: &str,
2042            body: Bytes,
2043            opts: crate::object_store::PutOpts,
2044        ) -> Result<(), ObjectStoreError> {
2045            self.inner.put_bytes(key, body, opts).await
2046        }
2047        async fn put_if_absent(&self, key: &str, body: Bytes) -> Result<bool, ObjectStoreError> {
2048            self.inner.put_if_absent(key, body).await
2049        }
2050        async fn head(
2051            &self,
2052            key: &str,
2053        ) -> Result<crate::object_store::ObjectMeta, ObjectStoreError> {
2054            // Forward NotFound from the inner store, but report
2055            // `fake_size` on success regardless of the real body length.
2056            let meta = self.inner.head(key).await?;
2057            Ok(crate::object_store::ObjectMeta {
2058                size: self.fake_size,
2059                ..meta
2060            })
2061        }
2062        async fn copy(&self, src: &str, dst: &str) -> Result<(), ObjectStoreError> {
2063            self.inner.copy(src, dst).await
2064        }
2065        async fn delete(&self, key: &str) -> Result<(), ObjectStoreError> {
2066            self.inner.delete(key).await
2067        }
2068    }
2069
2070    /// Happy path: the last entry's span fits under the cap, so
2071    /// `fetch_entry_bytes` issues a single bounded ranged GET and
2072    /// returns the tail of the pack starting at `pack_offset`.
2073    #[tokio::test]
2074    async fn fetch_entry_bytes_terminal_entry_under_cap_succeeds() {
2075        let store = MockStore::new();
2076        let cache = PackIndexCache::default();
2077        let content_sha = sha40(SHA_A);
2078
2079        // Plant a tiny pack body. The entry shape doesn't matter — we
2080        // only assert on the bytes `fetch_entry_bytes` returns.
2081        let body: &[u8] = b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09";
2082        store.insert(pack_key(None, &content_sha), Bytes::from(body.to_vec()));
2083        // Pretend the only entry starts at offset 2 — sorted_offsets
2084        // contains it and nothing greater, so `next_offset` is `None`
2085        // and the terminal-entry branch fires.
2086        install_cached_index(&cache, "", &content_sha, vec![2]);
2087        let idx = cache
2088            .get(&(String::new(), content_sha.clone()))
2089            .expect("cache hit");
2090
2091        let got = fetch_entry_bytes(&store, None, &content_sha, 2, &idx)
2092            .await
2093            .expect("terminal entry under cap must succeed");
2094        assert_eq!(got.as_ref(), &body[2..]);
2095    }
2096
2097    /// Security regression for issue #115: when the implied terminal
2098    /// range exceeds `MAX_RANGE_BYTES`, the fetcher must fail with a
2099    /// typed error rather than allocate a multi-GiB buffer.
2100    #[tokio::test]
2101    async fn fetch_entry_bytes_terminal_entry_over_cap_rejected() {
2102        let inner = MockStore::new();
2103        let cache = PackIndexCache::default();
2104        let content_sha = sha40(SHA_A);
2105
2106        // The wrapper's `head` will report `MAX_RANGE_BYTES + 1`, so the
2107        // implied span `(end - pack_offset)` with `pack_offset = 0`
2108        // exceeds the cap. The body is a stub — `get_bytes_range` must
2109        // never be called.
2110        inner.insert(pack_key(None, &content_sha), Bytes::from_static(b"stub"));
2111        install_cached_index(&cache, "", &content_sha, vec![0]);
2112        let idx = cache
2113            .get(&(String::new(), content_sha.clone()))
2114            .expect("cache hit");
2115
2116        let store = FakeSizeStore {
2117            inner,
2118            fake_size: MAX_RANGE_BYTES + 1,
2119        };
2120
2121        let err = fetch_entry_bytes(&store, None, &content_sha, 0, &idx)
2122            .await
2123            .expect_err("terminal entry above cap must be rejected");
2124        assert!(
2125            matches!(
2126                err,
2127                PackchainError::MalformedPackEntry { offset: 0, ref reason }
2128                    if reason.contains("exceeds") && reason.contains("cap")
2129            ),
2130            "expected MalformedPackEntry size-cap error, got {err:?}",
2131        );
2132    }
2133
2134    /// `pack_offset >= pack_len` on the terminal branch is an
2135    /// out-of-bounds index. The fetcher must report it as
2136    /// `MalformedPackEntry` rather than issue a zero-length range GET.
2137    #[tokio::test]
2138    async fn fetch_entry_bytes_terminal_entry_offset_past_eof_rejected() {
2139        let store = MockStore::new();
2140        let cache = PackIndexCache::default();
2141        let content_sha = sha40(SHA_A);
2142
2143        store.insert(pack_key(None, &content_sha), Bytes::from_static(b"abc"));
2144        // `sorted_offsets` contains an offset at/past the body length,
2145        // so `next_offset` is `None` and the terminal branch fires.
2146        install_cached_index(&cache, "", &content_sha, vec![100]);
2147        let idx = cache
2148            .get(&(String::new(), content_sha.clone()))
2149            .expect("cache hit");
2150
2151        let err = fetch_entry_bytes(&store, None, &content_sha, 100, &idx)
2152            .await
2153            .expect_err("offset beyond EOF must be rejected");
2154        assert!(
2155            matches!(
2156                err,
2157                PackchainError::MalformedPackEntry { offset: 100, ref reason }
2158                    if reason.contains("beyond pack EOF")
2159            ),
2160            "expected MalformedPackEntry EOF error, got {err:?}",
2161        );
2162    }
2163
2164    /// Below the cap, the same `OFS_DELTA` shape decodes cleanly. This
2165    /// pins the positive case so the boundary test above is meaningful
2166    /// (without it, a regression that always returned `DeltaTooDeep`
2167    /// would still pass the over-cap assertion).
2168    #[tokio::test]
2169    async fn ofs_delta_below_cap_decodes() {
2170        let store = MockStore::new();
2171        let cache = PackIndexCache::default();
2172        let chain: Vec<ChainSegment> = Vec::new();
2173        let content_sha = sha40(SHA_A);
2174
2175        let base_payload = b"base";
2176        let mut pack = Vec::new();
2177        let mut offsets = Vec::new();
2178        push_pack_entry(&mut pack, &mut offsets, 3, None, base_payload);
2179        let delta = make_insert_delta(base_payload.len() as u64, b"hi");
2180        let entry1_start = pack.len() as u64;
2181        let distance = entry1_start - offsets[0];
2182        push_pack_entry(&mut pack, &mut offsets, 6, Some(distance), &delta);
2183
2184        store.insert(pack_key(None, &content_sha), Bytes::from(pack.clone()));
2185        install_cached_index(&cache, "", &content_sha, offsets.clone());
2186
2187        let mut depth = 0u32;
2188        let resolved = decode_entry(
2189            &store,
2190            None,
2191            &chain,
2192            &content_sha,
2193            offsets[1],
2194            &pack[usize::try_from(offsets[1]).unwrap()..],
2195            &cache,
2196            &mut depth,
2197        )
2198        .await
2199        .expect("OFS_DELTA decodes below cap");
2200        assert_eq!(resolved.payload, b"hi");
2201        assert_eq!(resolved.kind, ObjectKind::Blob);
2202    }
2203
2204    // --- concurrent-GC retry loop (issue #136) -----------------------------
2205    //
2206    // `read_blob` must transparently retry `PackMissing` failures that
2207    // were caused by a concurrent `manage gc sweep` deleting compacted-
2208    // away packs. The retry reloads `chain.json` and inspects whether
2209    // the missing key is still referenced (data loss → fail fast) or
2210    // gone (GC → retry).
2211
2212    use crate::packchain::keys::chain_key;
2213    use crate::packchain::schema::ChainManifest;
2214    use std::sync::atomic::{AtomicUsize, Ordering};
2215
2216    fn make_chain_with(tip_hex: &str, pack_sha_hex: &str) -> ChainManifest {
2217        ChainManifest {
2218            v: ChainManifest::SCHEMA_VERSION,
2219            tip: sha40(tip_hex),
2220            full_at: sha40(tip_hex),
2221            segments: vec![ChainSegment {
2222                sha: sha40(tip_hex),
2223                parent_sha: None,
2224                pack: format!("packs/{pack_sha_hex}.pack"),
2225                bytes: 1_024,
2226            }],
2227        }
2228    }
2229
2230    #[test]
2231    fn chain_references_pack_key_matches_pack_and_idx_keys() {
2232        let chain = make_chain_with(SHA_A, SHA_B);
2233        // Both `.pack` and `.idx` belonging to the same content-sha
2234        // are considered referenced.
2235        assert!(chain_references_pack_key(&chain, None, &format!("packs/{SHA_B}.pack")).unwrap());
2236        assert!(chain_references_pack_key(&chain, None, &format!("packs/{SHA_B}.idx")).unwrap());
2237    }
2238
2239    #[test]
2240    fn chain_references_pack_key_returns_false_for_unreferenced_pack() {
2241        let chain = make_chain_with(SHA_A, SHA_B);
2242        assert!(!chain_references_pack_key(&chain, None, &format!("packs/{SHA_C}.pack")).unwrap());
2243        assert!(!chain_references_pack_key(&chain, None, &format!("packs/{SHA_C}.idx")).unwrap());
2244    }
2245
2246    #[test]
2247    fn chain_references_pack_key_respects_prefix() {
2248        let chain = make_chain_with(SHA_A, SHA_B);
2249        // With a prefix, the membership check is against the
2250        // prefix-joined key — an un-prefixed key for the same
2251        // content-sha is *not* a match (the join differs).
2252        assert!(
2253            chain_references_pack_key(&chain, Some("repo"), &format!("repo/packs/{SHA_B}.pack"))
2254                .unwrap()
2255        );
2256        assert!(
2257            !chain_references_pack_key(&chain, Some("repo"), &format!("packs/{SHA_B}.pack"))
2258                .unwrap()
2259        );
2260    }
2261
2262    #[test]
2263    fn chain_references_pack_key_returns_false_for_malformed_missing_key() {
2264        // Commit 8fbe693's refactor short-circuits on a missing_key
2265        // that fails to parse as `[<prefix>/]packs/<sha>.{pack,idx}` —
2266        // the function returns Ok(false) without iterating segments.
2267        // This pins that behavior so a future change that re-routes
2268        // malformed keys through the per-segment path (or that
2269        // surfaces a parse error to the caller) fails this test.
2270        let chain = make_chain_with(SHA_A, SHA_B);
2271        // No `packs/` segment.
2272        assert!(!chain_references_pack_key(&chain, None, "weird/key").unwrap());
2273        // `packs/` present but non-hex stem — fails Sha40 validation.
2274        assert!(!chain_references_pack_key(&chain, None, "packs/not-a-sha.pack").unwrap());
2275        // Wrong extension.
2276        assert!(!chain_references_pack_key(&chain, None, &format!("packs/{SHA_B}.bin")).unwrap());
2277        // Empty key.
2278        assert!(!chain_references_pack_key(&chain, None, "").unwrap());
2279    }
2280
2281    /// `ObjectStore` wrapper that serves a list of `chain.json` bodies
2282    /// in order: call 0 returns `bodies[0]`, call 1 returns `bodies[1]`,
2283    /// and so on. The last entry is repeated for any further calls.
2284    /// All other keys go through to the inner [`MockStore`].
2285    ///
2286    /// Lets a test simulate compact+sweep happening *between* the
2287    /// reader's chain reloads: the reader observes a new chain (a
2288    /// different segment set) on each retry without the test needing
2289    /// to race a real concurrent task.
2290    struct EvolvingChainStore {
2291        inner: MockStore,
2292        chain_key: String,
2293        bodies: Vec<Bytes>,
2294        calls: AtomicUsize,
2295        /// Counts `get_bytes` calls whose key ends in
2296        /// `/path-index.json`. Used by the issue-#136 contract test
2297        /// (`read_with_pack_missing_retries_does_not_reload_path_index`)
2298        /// to prove the retry path never re-reads path-index — only
2299        /// `chain.json` is reloaded.
2300        path_index_calls: AtomicUsize,
2301    }
2302
2303    impl EvolvingChainStore {
2304        fn new(inner: MockStore, chain_key: String, bodies: Vec<Bytes>) -> Self {
2305            assert!(!bodies.is_empty(), "must supply at least one chain body");
2306            Self {
2307                inner,
2308                chain_key,
2309                bodies,
2310                calls: AtomicUsize::new(0),
2311                path_index_calls: AtomicUsize::new(0),
2312            }
2313        }
2314
2315        fn chain_calls(&self) -> usize {
2316            self.calls.load(Ordering::SeqCst)
2317        }
2318
2319        fn path_index_calls(&self) -> usize {
2320            self.path_index_calls.load(Ordering::SeqCst)
2321        }
2322    }
2323
2324    // Note: `put_path` is intentionally omitted from the forward list
2325    // to preserve the original behavior (trait default → `Self::put_bytes`
2326    // → `inner.put_bytes`), which bypasses `MockStore::put_path`. The
2327    // test never invokes `put_path` on this decorator, so the bypass is
2328    // a no-op in practice but kept for byte-for-byte parity.
2329    crate::delegate_to_inner_impl! {
2330        impl ObjectStore for EvolvingChainStore {
2331            forward: list, get_to_file, get_bytes_range,
2332                     put_bytes, put_if_absent,
2333                     head, copy, delete;
2334
2335            async fn get_bytes(&self, key: &str) -> Result<Bytes, ObjectStoreError> {
2336                if key == self.chain_key {
2337                    let idx = self.calls.fetch_add(1, Ordering::SeqCst);
2338                    let pick = idx.min(self.bodies.len() - 1);
2339                    return Ok(self.bodies[pick].clone());
2340                }
2341                if key.ends_with("/path-index.json") {
2342                    self.path_index_calls.fetch_add(1, Ordering::SeqCst);
2343                }
2344                self.inner.get_bytes(key).await
2345            }
2346        }
2347    }
2348
2349    /// Build a real v2 `.idx` carrying a single object: `target_sha` at
2350    /// pack offset `pack_offset`. `gix_pack`'s parser validates the
2351    /// fanout and table sizes, so this must match the format exactly.
2352    fn build_one_object_v2_idx(target_sha: &Sha40, pack_offset: u32) -> Vec<u8> {
2353        // Decode the 40-hex sha into 20 raw bytes. `gix_hash::ObjectId`
2354        // already does this for us — re-use it rather than hand-rolling
2355        // a hex parser.
2356        let oid = sha40_to_object_id(target_sha);
2357        let sha_bytes = oid.as_bytes();
2358        let first_byte = sha_bytes[0];
2359
2360        let mut data = Vec::with_capacity(8 + 256 * 4 + 20 + 4 + 4 + 20 + 20);
2361        // V2 magic + version.
2362        data.extend_from_slice(b"\xfftOc");
2363        data.extend_from_slice(&2u32.to_be_bytes());
2364        // Fanout: cumulative count of objects with sha[0] <= i. With a
2365        // single object whose first byte is `first_byte`, every entry
2366        // from `first_byte` onwards is 1. `u8::try_from(i)` cannot fail
2367        // for `i in 0u16..256`.
2368        for i in 0u16..256 {
2369            let count = u32::from(u8::try_from(i).expect("0..256 fits in u8") >= first_byte);
2370            data.extend_from_slice(&count.to_be_bytes());
2371        }
2372        // Names table (20 bytes per sha).
2373        data.extend_from_slice(sha_bytes);
2374        // CRC32 table (one u32; value is unused by `lookup`).
2375        data.extend_from_slice(&0u32.to_be_bytes());
2376        // Offset table (one u32; MSB clear → 32-bit absolute offset).
2377        data.extend_from_slice(&pack_offset.to_be_bytes());
2378        // Pack trailer (20-byte sha placeholder) + idx trailer
2379        // (20-byte sha placeholder). The parser doesn't validate the
2380        // content of either against the body for `from_data`.
2381        data.extend_from_slice(&[0u8; 20]);
2382        data.extend_from_slice(&[0u8; 20]);
2383        data
2384    }
2385
2386    /// Convenience: turn the `tip_hex` and `pack_sha_hex` strings into
2387    /// a serialised chain.json `Bytes` body the wrapper can return.
2388    fn chain_json_bytes(tip_hex: &str, pack_sha_hex: &str) -> Bytes {
2389        let json = make_chain_with(tip_hex, pack_sha_hex)
2390            .to_json_pretty()
2391            .expect("chain serialise");
2392        Bytes::from(json)
2393    }
2394
2395    /// GC-race retry succeeds: the initial chain points at pack P1
2396    /// (absent from the store — simulating "already deleted by a
2397    /// concurrent sweep"); the reloaded chain points at pack P2 which
2398    /// is fully present (real one-object `.idx` + zlib-compressed
2399    /// blob entry). The retry must find the blob via P2 and return
2400    /// the payload.
2401    #[tokio::test]
2402    async fn read_with_pack_missing_retries_succeeds_after_chain_reload() {
2403        let inner = MockStore::new();
2404        let cache = PackIndexCache::default();
2405
2406        let p1_sha = sha40(SHA_A);
2407        let p2_sha = sha40(SHA_B);
2408        let blob_payload = b"recovered blob";
2409        // The blob's git OID — `gix_hash::ObjectId` for "hello blob"
2410        // shape. We don't compute the real git sha here; we register
2411        // the same sha in both the .idx names table and in
2412        // `target_oid` so `gix_pack::index::File::lookup` returns the
2413        // single object.
2414        let blob_oid_sha = sha40(SHA_C);
2415        let blob_oid = sha40_to_object_id(&blob_oid_sha);
2416
2417        // Build a P2 pack containing one Blob entry at offset 0.
2418        let mut pack = Vec::new();
2419        let mut offsets = Vec::new();
2420        push_pack_entry(
2421            &mut pack,
2422            &mut offsets,
2423            3, /* BLOB */
2424            None,
2425            blob_payload,
2426        );
2427        inner.insert(pack_key(None, &p2_sha), Bytes::from(pack.clone()));
2428
2429        // Build the corresponding one-object v2 .idx for P2 and plant
2430        // it under the canonical idx key. `read_object_from_chain`
2431        // will load it via `load_index`.
2432        let idx_bytes = build_one_object_v2_idx(&blob_oid_sha, 0);
2433        inner.insert(pack_idx_key(None, &p2_sha), Bytes::from(idx_bytes));
2434
2435        // The reader enters the helper carrying chain v1 (refs P1,
2436        // absent from the store). When it reloads `chain.json` after
2437        // the first PackMissing, the wrapper serves v2 (refs P2 with
2438        // a working idx + pack), simulating a compact+sweep that
2439        // happened between the reader's two loads.
2440        let chain_key = chain_key(None, "refs/heads/main");
2441        let v1 = chain_json_bytes(SHA_A, p1_sha.as_str());
2442        let v2 = chain_json_bytes(SHA_A, p2_sha.as_str());
2443        let store = EvolvingChainStore::new(inner, chain_key, vec![v2]);
2444
2445        let initial = ChainManifest::from_json_bytes(&v1).expect("chain v1 parses");
2446        let remote_ref = RefName::new("refs/heads/main").expect("ref name valid");
2447
2448        let resolved = read_with_pack_missing_retries(
2449            &store,
2450            None,
2451            &remote_ref,
2452            "refs/heads/main",
2453            initial,
2454            &blob_oid,
2455            &cache,
2456        )
2457        .await
2458        .expect("retry must succeed after chain reload");
2459        assert_eq!(resolved.payload, blob_payload);
2460        assert_eq!(resolved.kind, ObjectKind::Blob);
2461        // Exactly one chain reload was needed — the first read against
2462        // the initial in-memory chain, then one reload that swapped to
2463        // v2 referencing P2.
2464        assert_eq!(
2465            store.chain_calls(),
2466            1,
2467            "exactly one chain reload should have fired"
2468        );
2469        // Issue #136 contract: the retry path only reloads `chain.json`.
2470        // Path-index is loaded once by `read_blob` before this helper is
2471        // entered (the original `blob_oid` represents the snapshot the
2472        // caller asked about), and compaction preserves blob content
2473        // addressing — so reloading path-index on retry would silently
2474        // re-resolve the path against a newer tip's tree, which is the
2475        // wrong semantics for a stateless point-in-time read.
2476        // `read_with_pack_missing_retries` itself never touches
2477        // path-index; pin that with a counter assertion.
2478        assert_eq!(
2479            store.path_index_calls(),
2480            0,
2481            "retry path must not reload path-index.json",
2482        );
2483    }
2484
2485    /// Companion to `succeeds_after_chain_reload`: pins the
2486    /// "path-index is NOT reloaded across retries" half of the
2487    /// `read_with_pack_missing_retries` contract documented at issue
2488    /// #136. A regression that re-resolved the blob OID via a fresh
2489    /// path-index load between retries would resolve to a different
2490    /// blob SHA on a force-pushed tree and surface as a stale-read
2491    /// bug. We arm a single pack-missing retry, count `get_bytes`
2492    /// calls against `path-index.json` through the `EvolvingChainStore`
2493    /// counter, and assert the count stays at zero.
2494    #[tokio::test]
2495    async fn read_with_pack_missing_retries_does_not_reload_path_index() {
2496        let inner = MockStore::new();
2497        let cache = PackIndexCache::default();
2498
2499        let p1_sha = sha40(SHA_A);
2500        let p2_sha = sha40(SHA_B);
2501        let blob_payload = b"recovered blob";
2502        let blob_oid_sha = sha40(SHA_C);
2503        let blob_oid = sha40_to_object_id(&blob_oid_sha);
2504
2505        // Build the P2 pack + idx so the retry's read finds the blob.
2506        let mut pack = Vec::new();
2507        let mut offsets = Vec::new();
2508        push_pack_entry(
2509            &mut pack,
2510            &mut offsets,
2511            3, /* BLOB */
2512            None,
2513            blob_payload,
2514        );
2515        inner.insert(pack_key(None, &p2_sha), Bytes::from(pack));
2516        let idx_bytes = build_one_object_v2_idx(&blob_oid_sha, 0);
2517        inner.insert(pack_idx_key(None, &p2_sha), Bytes::from(idx_bytes));
2518
2519        // Pre-write a sentinel `path-index.json` body so any spurious
2520        // load would consume it (and bump the counter) rather than
2521        // silently 404 and slip past the assertion.
2522        inner.insert("refs/heads/main/path-index.json", Bytes::from_static(b"{}"));
2523
2524        // Initial chain refs P1 (absent), reloaded chain refs P2 (present).
2525        let chain_key = chain_key(None, "refs/heads/main");
2526        let v1 = chain_json_bytes(SHA_A, p1_sha.as_str());
2527        let v2 = chain_json_bytes(SHA_A, p2_sha.as_str());
2528        let store = EvolvingChainStore::new(inner, chain_key, vec![v2]);
2529
2530        let initial = ChainManifest::from_json_bytes(&v1).expect("chain v1 parses");
2531        let remote_ref = RefName::new("refs/heads/main").expect("ref name valid");
2532
2533        let resolved = read_with_pack_missing_retries(
2534            &store,
2535            None,
2536            &remote_ref,
2537            "refs/heads/main",
2538            initial,
2539            &blob_oid,
2540            &cache,
2541        )
2542        .await
2543        .expect("retry must succeed");
2544        assert_eq!(resolved.payload, blob_payload);
2545        // The retry fired (chain reload count == 1).
2546        assert_eq!(store.chain_calls(), 1);
2547        // Critical contract: path-index.json is never re-loaded by the
2548        // retry path. `read_with_pack_missing_retries` operates on the
2549        // already-resolved `blob_oid`; reloading path-index would
2550        // re-resolve the path against a possibly-newer tree.
2551        assert_eq!(
2552            store.path_index_calls(),
2553            0,
2554            "retry path read path-index.json {} times; must be zero",
2555            store.path_index_calls(),
2556        );
2557    }
2558
2559    /// `PackMissing` where the reload still references the same pack
2560    /// means genuine data loss — the bucket is missing a pack
2561    /// `chain.json` still names. The reader must fail fast with
2562    /// `PackMissing` rather than retry forever or upgrade to the
2563    /// concurrent-GC retry-exhausted variant.
2564    #[tokio::test]
2565    async fn read_with_pack_missing_retries_fails_fast_when_chain_still_references_missing_pack() {
2566        let inner = MockStore::new();
2567        let cache = PackIndexCache::default();
2568
2569        let p1_sha = sha40(SHA_A);
2570        let blob_oid = sha40_to_object_id(&sha40(SHA_C));
2571
2572        // Initial and reloaded chain both reference the same missing
2573        // pack — the "data loss" scenario, distinct from GC.
2574        let chain_key = chain_key(None, "refs/heads/main");
2575        let body = chain_json_bytes(SHA_A, p1_sha.as_str());
2576        let store = EvolvingChainStore::new(inner, chain_key, vec![body.clone()]);
2577        let initial = ChainManifest::from_json_bytes(&body).expect("chain parses");
2578        let remote_ref = RefName::new("refs/heads/main").expect("ref name valid");
2579
2580        let err = read_with_pack_missing_retries(
2581            &store,
2582            None,
2583            &remote_ref,
2584            "refs/heads/main",
2585            initial,
2586            &blob_oid,
2587            &cache,
2588        )
2589        .await
2590        .expect_err("missing pack still in chain must fail fast");
2591        match err {
2592            PackchainError::PackMissing { key } => {
2593                assert!(
2594                    key.contains(&format!("packs/{SHA_A}")),
2595                    "PackMissing key should name the missing pack, got {key}",
2596                );
2597            }
2598            other => panic!("expected fail-fast PackMissing, got {other:?}"),
2599        }
2600        // Exactly one chain reload was issued (to verify the missing
2601        // key was still referenced) — no further reloads or sleeps.
2602        assert_eq!(store.chain_calls(), 1);
2603    }
2604
2605    /// Retries are exhausted when each reload shows a *different*
2606    /// missing pack — i.e. compact+sweep keeps outpacing the reader.
2607    /// After `PACK_MISSING_MAX_RETRIES` retries the call surfaces
2608    /// `ConcurrentGcRetriesExhausted` with the last observed key.
2609    #[tokio::test(start_paused = true)]
2610    async fn read_with_pack_missing_retries_surfaces_exhausted_after_max_retries() {
2611        // `start_paused` lets the tokio runtime auto-advance time
2612        // through the backoff sleeps so the test doesn't pay the
2613        // wall-clock 2.6 s worst case.
2614        let inner = MockStore::new();
2615        let cache = PackIndexCache::default();
2616
2617        let blob_oid = sha40_to_object_id(&sha40(SHA_C));
2618
2619        // Five distinct chain versions, each referencing a different
2620        // missing pack. The initial in-memory chain is v1; the
2621        // wrapper serves v2..v5 on successive reloads. The reader's
2622        // walk is:
2623        //   - attempt 0 with v1 → PackMissing(P0); reload → v2 (refs P1)
2624        //   - attempt 1 with v2 → PackMissing(P1); reload → v3 (refs P2)
2625        //   - attempt 2 with v3 → PackMissing(P2); reload → v4 (refs P3)
2626        //   - attempt 3 with v4 → PackMissing(P3); reload → v5 (refs P4)
2627        //     → attempt >= MAX → exhausted with key for P3.
2628        let pack_shas = [
2629            "0000000000000000000000000000000000000000",
2630            "1111111111111111111111111111111111111111",
2631            "2222222222222222222222222222222222222222",
2632            "3333333333333333333333333333333333333333",
2633            "4444444444444444444444444444444444444444",
2634        ];
2635        let chain_key = chain_key(None, "refs/heads/main");
2636        let v1 = chain_json_bytes(SHA_A, pack_shas[0]);
2637        let reload_bodies: Vec<Bytes> = pack_shas[1..]
2638            .iter()
2639            .map(|sha| chain_json_bytes(SHA_A, sha))
2640            .collect();
2641        let initial = ChainManifest::from_json_bytes(&v1).expect("chain v1 parses");
2642        let store = EvolvingChainStore::new(inner, chain_key, reload_bodies);
2643        let remote_ref = RefName::new("refs/heads/main").expect("ref name valid");
2644
2645        let err = read_with_pack_missing_retries(
2646            &store,
2647            None,
2648            &remote_ref,
2649            "refs/heads/main",
2650            initial,
2651            &blob_oid,
2652            &cache,
2653        )
2654        .await
2655        .expect_err("exhausted retries must error");
2656        match err {
2657            PackchainError::ConcurrentGcRetriesExhausted {
2658                last_missing_key,
2659                attempts,
2660            } => {
2661                // The last attempt was against v4 referencing P3
2662                // (the fourth pack in our table). `attempts` records
2663                // retries beyond the initial attempt: 3.
2664                assert_eq!(attempts, PACK_MISSING_MAX_RETRIES);
2665                assert!(
2666                    last_missing_key.contains(pack_shas[3]),
2667                    "last missing key should name pack[3], got {last_missing_key}"
2668                );
2669            }
2670            other => panic!("expected ConcurrentGcRetriesExhausted, got {other:?}"),
2671        }
2672        // Exactly MAX_RETRIES + 1 reloads: one verification per
2673        // attempted read. Each reload showed the missing pack absent
2674        // from the freshly-loaded chain so the loop kept retrying
2675        // (or, on the final reload, recorded "exhausted").
2676        assert_eq!(
2677            store.chain_calls(),
2678            usize::try_from(PACK_MISSING_MAX_RETRIES + 1).unwrap()
2679        );
2680    }
2681
2682    /// Non-PackMissing errors are not retried — they pass through
2683    /// directly. Otherwise a transport error or a malformed pack would
2684    /// wait through the backoff schedule for no reason.
2685    #[tokio::test]
2686    async fn read_with_pack_missing_retries_does_not_retry_on_non_pack_missing_errors() {
2687        let inner = MockStore::new();
2688        let cache = PackIndexCache::default();
2689
2690        // Plant a malformed `.idx` for the chain's pack. `load_index`
2691        // will parse-fail with `MalformedPackEntry`, *not*
2692        // `PackMissing`, so the retry path must not fire.
2693        let p1_sha = sha40(SHA_A);
2694        inner.insert(
2695            pack_idx_key(None, &p1_sha),
2696            Bytes::from_static(b"not a real idx"),
2697        );
2698
2699        let chain_key = chain_key(None, "refs/heads/main");
2700        let body = chain_json_bytes(SHA_A, p1_sha.as_str());
2701        let store = EvolvingChainStore::new(inner, chain_key, vec![body.clone()]);
2702        let initial = ChainManifest::from_json_bytes(&body).expect("chain parses");
2703        let remote_ref = RefName::new("refs/heads/main").expect("ref name valid");
2704        let blob_oid = sha40_to_object_id(&sha40(SHA_C));
2705
2706        let err = read_with_pack_missing_retries(
2707            &store,
2708            None,
2709            &remote_ref,
2710            "refs/heads/main",
2711            initial,
2712            &blob_oid,
2713            &cache,
2714        )
2715        .await
2716        .expect_err("malformed idx must surface immediately");
2717        assert!(
2718            matches!(err, PackchainError::MalformedPackEntry { .. }),
2719            "expected MalformedPackEntry passthrough, got {err:?}"
2720        );
2721        // No chain reloads — the error path did not enter the retry
2722        // branch at all.
2723        assert_eq!(store.chain_calls(), 0);
2724    }
2725
2726    /// Regression for #136: a chain reload that itself errors (transport
2727    /// failure on `chain.json`) must surface as `PackchainError::Store`
2728    /// — NOT be converted into `ConcurrentGcRetriesExhausted` and NOT
2729    /// swallowed back into the original `PackMissing`. The retry loop
2730    /// uses `?` on `load_chain`, so a network fault during reload
2731    /// short-circuits to the wrapped store error.
2732    ///
2733    /// Setup: initial in-memory chain refs P1; the store has no pack
2734    /// for P1 (so the first read fails with `PackMissing`), and a
2735    /// one-shot `NetworkOnGetBytes` fault armed on the chain key
2736    /// fires when the retry loop calls `load_chain`.
2737    #[tokio::test]
2738    async fn read_with_pack_missing_retries_surfaces_chain_reload_error() {
2739        use crate::object_store::mock::Fault;
2740
2741        let store = MockStore::new();
2742        let cache = PackIndexCache::default();
2743
2744        let p1_sha = sha40(SHA_A);
2745        let blob_oid = sha40_to_object_id(&sha40(SHA_C));
2746
2747        // Initial chain refs P1, which is absent from the store. The
2748        // first pack-read will surface PackMissing, sending the loop
2749        // into its reload branch.
2750        let chain_key_str = chain_key(None, "refs/heads/main");
2751        let body = chain_json_bytes(SHA_A, p1_sha.as_str());
2752        let initial = ChainManifest::from_json_bytes(&body).expect("chain v1 parses");
2753        let remote_ref = RefName::new("refs/heads/main").expect("ref name valid");
2754
2755        // Arm a network fault on the chain key. `load_chain` calls
2756        // `store.get_bytes(chain_key)`; the fault fires there and the
2757        // wrapped error must propagate out of the retry helper.
2758        store.arm(Fault::NetworkOnGetBytes { key: chain_key_str });
2759
2760        let err = read_with_pack_missing_retries(
2761            &store,
2762            None,
2763            &remote_ref,
2764            "refs/heads/main",
2765            initial,
2766            &blob_oid,
2767            &cache,
2768        )
2769        .await
2770        .expect_err("chain reload failure must surface as an error");
2771
2772        assert!(
2773            matches!(err, PackchainError::Store(_)),
2774            "expected PackchainError::Store wrapping the chain-reload transport error; \
2775             a regression that swallowed the reload error would yield \
2776             ConcurrentGcRetriesExhausted or the original PackMissing instead. got {err:?}"
2777        );
2778        // Fault was consumed exactly once by the single reload attempt.
2779        assert_eq!(
2780            store.pending_faults(),
2781            0,
2782            "armed chain-reload fault must have fired exactly once"
2783        );
2784    }
2785}