Skip to main content

git_remote_object_store/packchain/
mod.rs

1//! Incremental pack-chain storage engine (issue #52).
2//!
3//! Push (#63) writes incremental packs keyed by content SHA, a
4//! newest-first [`schema::ChainManifest`], a nested
5//! [`schema::PathIndex`] of repo paths to blob SHAs, and a baseline
6//! bundle on the first / force push so a fresh clone short-circuits
7//! through `bundle-uri`. Fetch (#64), direct file access (#65,
8//! `read_blob` library API), compaction (#67), and GC (#66) are all
9//! implemented in sibling modules. Push artefacts on the bucket:
10//!
11//! ```text
12//! <prefix>/FORMAT                                "packchain"
13//! <prefix>/HEAD                                  "refs/heads/main"
14//! <prefix>/refs/heads/<branch>/LOCK#.lock        held during write, released after
15//! <prefix>/refs/heads/<branch>/chain.json        newest-first manifest (THE commit point)
16//! <prefix>/refs/heads/<branch>/path-index.json   nested tree → blob SHA map
17//! <prefix>/refs/heads/<branch>/<tip>.bundle      baseline (first / force push only)
18//! <prefix>/packs/<content-sha>.pack              incremental pack
19//! <prefix>/packs/<content-sha>.idx               pack index
20//! ```
21//!
22//! Once the push lands, fetch resolves shallow / full clones via
23//! sequential pack install (`fetch.rs`), `read_blob` reads single
24//! blobs via the path-index without rehydrating the chain
//! (`read.rs`), and the `manage compact` / `manage gc` subcommands
//! collapse the chain and reap orphans, respectively (`compact.rs`, `gc.rs`).
27//!
28//! ## Linearization point
29//!
30//! `chain.json` is the commit point: pack/idx/baseline upload
31//! pre-lock, then under the per-ref lock the push writes
32//! FORMAT → HEAD → chain.json → path-index.json. Anything that
33//! crashed before the chain.json PUT leaves orphan keys
34//! (pack/idx/baseline at content-SHA or tip-SHA names) which
35//! `manage gc` reaps. Anything written after chain.json
36//! (path-index.json overwrite, force-push baseline cleanup) is
37//! post-commit and may be retried by re-running the push or compact.
38//!
39//! ## chain.json → path-index.json ordering and the reader contract
40//!
41//! Writing `path-index.json` LAST means a crash between the
42//! `chain.json` PUT and the `path-index.json` PUT leaves the bucket
43//! with a fresh chain alongside a stale path-index whose `tip` field
44//! still names the prior chain.tip. The reader detects this with a
45//! single tip-equality check (`path_index.tip == chain.tip`) and
46//! surfaces it as
47//! [`PackchainError::TransientChainPathIndexMismatch`] — a typed,
48//! retry-shaped error — rather than silently returning the wrong
49//! blob bytes or failing with the confusing
50//! [`PackchainError::BlobNotInChain`] that the old (path-index-first)
51//! ordering produced (issue #114).
52//!
53//! The reverse ordering (path-index before chain.json) is rejected
54//! because it lets a stale chain coexist with a fresh path-index
55//! whose blob SHAs are NOT yet in any chain pack, surfacing as
56//! `BlobNotInChain` — indistinguishable from genuine corruption.
57//!
58//! ## Lost-race orphan packs
59//!
60//! Packs upload BEFORE the per-ref lock is acquired so the lock-hold
61//! window stays bounded by chain.json + path-index PUT latency. When
62//! two pushers race they both upload their packs pre-lock; the loser
63//! sees `stale chain` after re-reading `chain.json` under the lock
64//! and returns without committing, leaving its pack as an
65//! unreferenced orphan that `manage gc` sweeps. The orphan-bandwidth
66//! cost is the deliberate trade-off for keeping the lock window
67//! short — an in-lock-upload alternative would block sibling pushers
68//! for the full duration of a multi-GiB upload.
69
70pub(crate) mod audit;
71pub(crate) mod compact;
72pub(crate) mod fetch;
73pub mod gc;
74pub(crate) mod git;
75pub(crate) mod keys;
76pub(crate) mod list;
77pub(crate) mod manifest;
78pub(crate) mod pack;
79pub(crate) mod push;
80// `pub` (parity with `gc`) so `git_remote_object_store::packchain::read`
81// is reachable for rustdoc discovery. The convenience re-exports
82// below (`packchain::PackIndexCache`, `packchain::read_blob`) remain
83// the canonical short paths.
84pub mod read;
85pub(crate) mod retry;
86pub(crate) mod schema;
87
88pub use read::{PackIndexCache, read_blob};
89
/// Errors surfaced by the packchain engine. `pub` because the
/// [`crate::protocol::push::PushError::Packchain`] variant — which is
/// public — wraps it; making this `pub(crate)` would leak a private
/// type through a public API. The packchain engine itself stays
/// `pub(crate)` (see `pub(crate) mod push` etc.); only `gc` and `read`
/// are `pub` for rustdoc / direct-access reachability.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// template; variants marked `#[from]` also get a `From` conversion so
/// `?` can lift the underlying error directly.
#[derive(Debug, thiserror::Error)]
pub enum PackchainError {
    /// On-bucket schema declares a version this build cannot read. The
    /// `expected` field is the version this build writes; `found` is
    /// the value parsed from the JSON. Lets a future v=2 reader refuse
    /// v=1 clients (and vice versa) cleanly.
    #[error("packchain schema version {found} unsupported (this build reads v{expected})")]
    UnsupportedSchemaVersion {
        /// Version found in the parsed JSON.
        found: u32,
        /// Version this build expects.
        expected: u32,
    },

    /// A field that should hold a 40-lowercase-hex SHA contained
    /// something else. Validation runs on every [`schema::Sha40`]
    /// deserialise so a malformed `chain.json` or `path-index.json`
    /// cannot leak past the parser into the rest of the engine.
    #[error("invalid 40-hex sha `{found}`: must be 40 lowercase hex characters")]
    InvalidSha {
        /// The rejected string.
        ///
        /// NOTE(review): the `#[error]` template above interpolates
        /// this value in full, so any truncation has to happen wherever
        /// the message is emitted on the wire — confirm against the
        /// caller.
        found: String,
    },

    /// Underlying `serde_json` parse error (malformed JSON, missing
    /// fields, type mismatches that aren't caught by [`schema::Sha40`]'s
    /// validator).
    #[error("packchain schema parse error: {0}")]
    ParseJson(#[from] serde_json::Error),

    /// Tree entry filename was not valid UTF-8. Git allows arbitrary
    /// bytes in tree entry names, but the on-bucket JSON layer cannot
    /// represent non-UTF-8 keys without a lossy encoding (banned by
    /// `.claude/rules/rust.md`). Carries the offending bytes verbatim
    /// for diagnostics; only the rendered message goes through
    /// `String::from_utf8_lossy`.
    #[error("invalid path: {} (not valid UTF-8)", String::from_utf8_lossy(bytes))]
    InvalidPath {
        /// The offending bytes from the tree entry's filename.
        bytes: Vec<u8>,
    },

    /// Underlying gix / git error from tree-walking, ref lookups, or
    /// other git-side operations.
    #[error("packchain git error: {0}")]
    Git(#[from] crate::git::GitError),

    /// Local repository is shallow (a `.git/shallow` file exists) and
    /// the rev-walk from the local tip crosses a shallow boundary, so
    /// a complete pack cannot be produced. Pushing from a shallow
    /// clone would leave the server with permanently incomplete
    /// history; better to refuse loudly than to corrupt the remote.
    #[error("cannot push from a shallow clone: rev-walk crosses a shallow boundary")]
    ShallowPushRejected,

    /// `chain.json` is missing for the requested ref. Either the
    /// branch was never pushed under the packchain engine or it was
    /// deleted server-side. Distinct from
    /// [`Self::Store`]`(NotFound)` so the wire-line is explicit
    /// about which artefact is missing.
    #[error("chain.json absent for {ref_name}; the branch is unknown to the bucket")]
    ChainAbsent {
        /// The ref name the fetch asked about.
        ref_name: String,
    },

    /// `chain.json` references a pack that is not present on the
    /// bucket. Pinning this as a typed error so `doctor` can flag it
    /// specifically rather than the operator having to disambiguate a
    /// generic `NotFound` from a transient failure. Issue #64 calls
    /// this out as a regression case to surface loudly rather than
    /// silently zero-byte-fetch.
    #[error("packchain: chain.json references missing pack at {key}")]
    PackMissing {
        /// Bucket-relative pack key recorded in `chain.json`.
        key: String,
    },

    /// Baseline bundle (the `<full_at>.bundle` artefact) is missing.
    /// Surfaces during a clone where the chain walk reached the root
    /// segment but the baseline that should be alongside it is gone.
    #[error("packchain: baseline bundle missing at {key}")]
    BaselineMissing {
        /// Bucket key of the missing `<full_at>.bundle`.
        key: String,
    },

    /// Pack content SHA could not be derived (file shorter than the
    /// 32-byte minimum PACK header + trailer, or an I/O error reading
    /// the trailer). The payload is a rendered description of the
    /// failure and is appended to the message.
    #[error("pack content SHA unavailable: {0}")]
    PackTrailer(String),

    /// `gix_pack::data::output::count::objects` or `FromEntriesIter`
    /// failed during pack emission. The payload is a rendered
    /// description of the failure and is appended to the message.
    #[error("pack build error: {0}")]
    PackBuild(String),

    /// `gix_pack::Bundle::write_to_directory` failed during the
    /// post-pack `.idx` derivation pass. The underlying error is held
    /// behind a `Box`.
    #[error("pack index write error: {0}")]
    PackIndexWrite(Box<gix_pack::bundle::write::Error>),

    /// Underlying object-store transport / auth error.
    #[error("packchain object-store error: {0}")]
    Store(#[from] crate::object_store::ObjectStoreError),

    /// Local I/O failure (tempdir, file read, file persist).
    #[error("packchain I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// [`read::read_blob`] was called against a remote whose resolved
    /// engine is not [`crate::url::StorageEngine::Packchain`]. Surfaces
    /// before any artefact lookup so callers see a typed mismatch
    /// instead of a misleading `chain.json` not-found.
    #[error(
        "read_blob requires the packchain engine; this remote uses `{found}` — \
         check the URL's `?engine=` parameter or the bucket's `FORMAT` key"
    )]
    WrongEngine {
        /// Engine the remote actually resolved to.
        found: crate::url::StorageEngine,
    },

    /// `path-index.json` is missing for the requested ref. Distinct
    /// from [`Self::ChainAbsent`] so an operator sees which artefact is
    /// gone — chain.json being present without path-index indicates a
    /// crashed-mid-push state `manage gc` will reconcile.
    #[error("path-index.json absent for {ref_name}; the branch's path map is unavailable")]
    PathIndexAbsent {
        /// The ref name [`read::read_blob`] was asked about.
        ref_name: String,
    },

    /// Caller passed a `path` that does not exist in this commit's tree.
    #[error("path `{path}` not found in {ref_name}")]
    PathNotFound {
        /// Ref the lookup ran against.
        ref_name: String,
        /// The path the caller asked for, returned verbatim.
        path: String,
    },

    /// Caller passed a malformed path: empty, absolute (`/`-prefixed),
    /// containing a `..` segment, or containing empty segments
    /// (consecutive slashes). These shapes don't map to git tree
    /// semantics; reject before walking.
    #[error("malformed path `{path}`: {reason}")]
    MalformedPath {
        /// The rejected path, returned verbatim.
        path: String,
        /// Human-readable reason (`"empty"`, `"absolute"`, a note that
        /// the path contains a `..` segment, etc.).
        reason: &'static str,
    },

    /// Path resolved to a tree node, not a blob — the caller asked for
    /// a directory, not a file. Distinct from [`Self::PathNotFound`] so
    /// the caller can distinguish "wrong shape" from "missing".
    #[error("path `{path}` resolves to a directory, not a file")]
    PathNotABlob {
        /// The path the caller asked for.
        path: String,
    },

    /// Blob SHA recorded in `path-index.json` was not present in any
    /// pack referenced by `chain.json`. Indicates a corrupted bucket
    /// (path-index points at a blob the chain doesn't carry); typed
    /// distinctly so `doctor` can flag it specifically.
    #[error("blob {sha} for path `{path}` not present in any chain pack")]
    BlobNotInChain {
        /// The blob SHA the path-index named.
        sha: String,
        /// The path the caller asked for.
        path: String,
    },

    /// Pack entry header could not be decoded (truncated bytes, unknown
    /// type id, non-canonical size encoding, etc.).
    #[error("malformed pack entry at offset {offset}: {reason}")]
    MalformedPackEntry {
        /// Pack-relative offset of the entry that failed to decode.
        offset: u64,
        /// Human-readable reason from the underlying decoder.
        reason: String,
    },

    /// Zlib stream embedded in a pack entry could not be inflated.
    #[error("zlib decompression failure for entry at offset {offset}")]
    Decompress {
        /// Pack-relative offset of the entry whose payload failed.
        offset: u64,
    },

    /// Delta resolution exceeded [`read::MAX_DELTA_DEPTH`]. Mirrors
    /// git's own depth cap — most legitimate chains stay well under it,
    /// so a deep chain is almost certainly a corrupted pack with a
    /// delta cycle.
    #[error("pack delta chain exceeds maximum depth ({max})")]
    DeltaTooDeep {
        /// The depth limit (always [`read::MAX_DELTA_DEPTH`]).
        max: u32,
    },

    /// Delta payload could not be applied (truncated instructions,
    /// out-of-range copy span, source size mismatch).
    #[error("malformed delta payload: {reason}")]
    MalformedDelta {
        /// Human-readable reason from the delta decoder.
        reason: &'static str,
    },

    /// `read_blob` was given a ref name that fails `gix-validate`'s
    /// reference-name rules (empty, control characters, `..`, etc.).
    #[error("invalid ref name `{name}`")]
    InvalidRefName {
        /// The ref name the caller passed.
        name: String,
    },

    /// Tree closure walk encountered a cycle: a tree references itself
    /// directly or transitively via an ancestor on the current descent.
    /// Content-addressing makes cycles impossible in a healthy ODB, so
    /// this surfaces a corrupted or adversarial repository rather than
    /// looping unbounded and exhausting the call stack.
    #[error("tree {oid} forms a cycle in the path-index walk")]
    TreeCycle {
        /// The tree OID whose presence in the ancestor set was detected.
        oid: String,
    },

    /// Reader observed `chain.json` and `path-index.json` with
    /// mismatched tips — a transient state during the brief window
    /// where a push or compact has committed the new `chain.json` but
    /// not yet overwritten `path-index.json` (issue #114). The reader
    /// refuses to resolve a path against an out-of-sync path-index
    /// because the resolved blob SHA may name a different file than
    /// the caller intended; instead it surfaces this typed error so
    /// the caller can retry. Subsequent reads converge once the writer
    /// finishes the path-index PUT.
    #[error(
        "transient chain/path-index mismatch for {ref_name}: \
         chain.tip = {chain_tip}, path_index.tip = {path_index_tip}; retry"
    )]
    TransientChainPathIndexMismatch {
        /// The ref the lookup ran against.
        ref_name: String,
        /// Tip recorded in `chain.json` at read time.
        chain_tip: String,
        /// Tip recorded in `path-index.json` at read time.
        path_index_tip: String,
    },

    /// [`read::read_blob`] retried [`Self::PackMissing`] failures the
    /// configured number of times and gave up. Each retry reloaded
    /// `chain.json` and observed that the failing pack key was no
    /// longer referenced — consistent with a concurrent
    /// `manage gc sweep` deleting compacted-away packs — but a fresh
    /// `PackMissing` showed up on the new chain anyway, suggesting a
    /// vigorous compact+sweep cycle that kept outpacing the reader.
    /// Distinct from [`Self::PackMissing`] so callers can treat it as
    /// "retry the whole `read_blob` call later" rather than as a
    /// permanent bucket inconsistency (issue #136).
    #[error(
        "read_blob exhausted {attempts} retries against concurrent GC: \
         last missing pack `{last_missing_key}`; retry the call"
    )]
    ConcurrentGcRetriesExhausted {
        /// The key whose final `PackMissing` ended the retry loop.
        last_missing_key: String,
        /// Number of retry attempts that were made (excluding the
        /// initial attempt). Pinned in the error so logs and tests can
        /// assert on it.
        attempts: u32,
    },
}
371
372impl From<gix_pack::bundle::write::Error> for PackchainError {
373    fn from(value: gix_pack::bundle::write::Error) -> Self {
374        Self::PackIndexWrite(Box::new(value))
375    }
376}
377
378impl From<tokio::task::JoinError> for PackchainError {
379    fn from(value: tokio::task::JoinError) -> Self {
380        // `JoinError` carries either a panic payload or a cancellation
381        // signal; flatten both through the Io variant since neither
382        // matches a more specific PackchainError category.
383        Self::Io(std::io::Error::other(value.to_string()))
384    }
385}
386
#[cfg(test)]
mod tests {
    use super::PackchainError;

    /// The schema-version error must name both the version found on
    /// the bucket and the version this build reads.
    #[test]
    fn unsupported_schema_version_renders_both_versions() {
        let rendered = PackchainError::UnsupportedSchemaVersion {
            found: 2,
            expected: 1,
        }
        .to_string();
        let wanted = "packchain schema version 2 unsupported (this build reads v1)";
        assert_eq!(rendered, wanted);
    }

    /// The client-facing wire-line wording is also asserted by
    /// shellspec, so the key phrase is pinned here as well.
    #[test]
    fn shallow_push_rejected_includes_actionable_wording() {
        let msg = PackchainError::ShallowPushRejected.to_string();
        let mentions_shallow_clone = msg.contains("shallow clone");
        assert!(
            mentions_shallow_clone,
            "shallow rejection wording must mention shallow clone: {msg}",
        );
    }
}