git_remote_object_store/packchain/mod.rs
1//! Incremental pack-chain storage engine (issue #52).
2//!
3//! Push (#63) writes incremental packs keyed by content SHA, a
4//! newest-first [`schema::ChainManifest`], a nested
5//! [`schema::PathIndex`] of repo paths to blob SHAs, and a baseline
6//! bundle on the first / force push so a fresh clone short-circuits
7//! through `bundle-uri`. Fetch (#64), direct file access (#65,
8//! `read_blob` library API), compaction (#67), and GC (#66) are all
9//! implemented in sibling modules. Push artefacts on the bucket:
10//!
11//! ```text
12//! <prefix>/FORMAT "packchain"
13//! <prefix>/HEAD "refs/heads/main"
14//! <prefix>/refs/heads/<branch>/LOCK#.lock held during write, released after
15//! <prefix>/refs/heads/<branch>/chain.json newest-first manifest (THE commit point)
16//! <prefix>/refs/heads/<branch>/path-index.json nested tree → blob SHA map
17//! <prefix>/refs/heads/<branch>/<tip>.bundle baseline (first / force push only)
18//! <prefix>/packs/<content-sha>.pack incremental pack
19//! <prefix>/packs/<content-sha>.idx pack index
20//! ```
21//!
22//! Once the push lands, fetch resolves shallow / full clones via
23//! sequential pack install (`fetch.rs`), `read_blob` reads single
24//! blobs via the path-index without rehydrating the chain
//! (`read.rs`), and the `manage compact` / `manage gc` subcommands
//! collapse the chain and reap orphans, respectively (`compact.rs`, `gc.rs`).
27//!
28//! ## Linearization point
29//!
30//! `chain.json` is the commit point: pack/idx/baseline upload
31//! pre-lock, then under the per-ref lock the push writes
//! FORMAT → HEAD → chain.json → path-index.json. A push that
//! crashes before the chain.json PUT leaves orphan keys
//! (pack/idx/baseline at content-SHA or tip-SHA names) which
//! `manage gc` reaps. Anything written after chain.json
//! (path-index.json overwrite, force-push baseline cleanup) is
//! post-commit and may be retried by re-running the push or compact.
38//!
39//! ## chain.json → path-index.json ordering and the reader contract
40//!
41//! Writing `path-index.json` LAST means a crash between the
42//! `chain.json` PUT and the `path-index.json` PUT leaves the bucket
43//! with a fresh chain alongside a stale path-index whose `tip` field
44//! still names the prior chain.tip. The reader detects this with a
45//! single tip-equality check (`path_index.tip == chain.tip`) and
46//! surfaces it as
47//! [`PackchainError::TransientChainPathIndexMismatch`] — a typed,
48//! retry-shaped error — rather than silently returning the wrong
49//! blob bytes or failing with the confusing
50//! [`PackchainError::BlobNotInChain`] that the old (path-index-first)
51//! ordering produced (issue #114).
52//!
53//! The reverse ordering (path-index before chain.json) is rejected
54//! because it lets a stale chain coexist with a fresh path-index
55//! whose blob SHAs are NOT yet in any chain pack, surfacing as
56//! `BlobNotInChain` — indistinguishable from genuine corruption.
57//!
58//! ## Lost-race orphan packs
59//!
60//! Packs upload BEFORE the per-ref lock is acquired so the lock-hold
61//! window stays bounded by chain.json + path-index PUT latency. When
62//! two pushers race they both upload their packs pre-lock; the loser
63//! sees `stale chain` after re-reading `chain.json` under the lock
64//! and returns without committing, leaving its pack as an
65//! unreferenced orphan that `manage gc` sweeps. The orphan-bandwidth
66//! cost is the deliberate trade-off for keeping the lock window
67//! short — an in-lock-upload alternative would block sibling pushers
68//! for the full duration of a multi-GiB upload.
69
70pub(crate) mod audit;
71pub(crate) mod compact;
72pub(crate) mod fetch;
73pub mod gc;
74pub(crate) mod git;
75pub(crate) mod keys;
76pub(crate) mod list;
77pub(crate) mod manifest;
78pub(crate) mod pack;
79pub(crate) mod push;
80// `pub` (parity with `gc`) so `git_remote_object_store::packchain::read`
81// is reachable for rustdoc discovery. The convenience re-exports
82// below (`packchain::PackIndexCache`, `packchain::read_blob`) remain
83// the canonical short paths.
84pub mod read;
85pub(crate) mod retry;
86pub(crate) mod schema;
87
88pub use read::{PackIndexCache, read_blob};
89
90/// Errors surfaced by the packchain engine. `pub` because the
91/// [`crate::protocol::push::PushError::Packchain`] variant — which is
92/// public — wraps it; making this `pub(crate)` would leak a private
93/// type through a public API. The packchain engine itself stays
94/// `pub(crate)` (see `pub(crate) mod push` etc.); only `gc` and `read`
95/// are `pub` for rustdoc / direct-access reachability.
96#[derive(Debug, thiserror::Error)]
97pub enum PackchainError {
98 /// On-bucket schema declares a version this build cannot read. The
99 /// `expected` field is the version this build writes; `found` is
100 /// the value parsed from the JSON. Lets a future v=2 reader refuse
101 /// v=1 clients (and vice versa) cleanly.
102 #[error("packchain schema version {found} unsupported (this build reads v{expected})")]
103 UnsupportedSchemaVersion {
104 /// Version found in the parsed JSON.
105 found: u32,
106 /// Version this build expects.
107 expected: u32,
108 },
109
110 /// A field that should hold a 40-lowercase-hex SHA contained
111 /// something else. Validation runs on every [`schema::Sha40`]
112 /// deserialise so a malformed `chain.json` or `path-index.json`
113 /// cannot leak past the parser into the rest of the engine.
114 #[error("invalid 40-hex sha `{found}`: must be 40 lowercase hex characters")]
115 InvalidSha {
116 /// The rejected string (truncated by `Display`'s default
117 /// formatter at the wire level).
118 found: String,
119 },
120
121 /// Underlying `serde_json` parse error (malformed JSON, missing
122 /// fields, type mismatches that aren't caught by [`schema::Sha40`]'s
123 /// validator).
124 #[error("packchain schema parse error: {0}")]
125 ParseJson(#[from] serde_json::Error),
126
127 /// Tree entry filename was not valid UTF-8. Git allows arbitrary
128 /// bytes in tree entry names, but the on-bucket JSON layer cannot
129 /// represent non-UTF-8 keys without a lossy encoding (banned by
130 /// `.claude/rules/rust.md`). Carries the offending bytes verbatim
131 /// for diagnostics.
132 #[error("invalid path: {} (not valid UTF-8)", String::from_utf8_lossy(bytes))]
133 InvalidPath {
134 /// The offending bytes from the tree entry's filename.
135 bytes: Vec<u8>,
136 },
137
138 /// Underlying gix / git error from tree-walking, ref lookups, or
139 /// other git-side operations.
140 #[error("packchain git error: {0}")]
141 Git(#[from] crate::git::GitError),
142
143 /// Local repository is shallow (a `.git/shallow` file exists) and
144 /// the rev-walk from the local tip crosses a shallow boundary, so
145 /// a complete pack cannot be produced. Pushing from a shallow
146 /// clone would leave the server with permanently incomplete
147 /// history; better to refuse loudly than to corrupt the remote.
148 #[error("cannot push from a shallow clone: rev-walk crosses a shallow boundary")]
149 ShallowPushRejected,
150
151 /// `chain.json` is missing for the requested ref. Either the
152 /// branch was never pushed under the packchain engine or it was
153 /// deleted server-side. Distinct from
154 /// [`Self::Store`]`(NotFound)` so the wire-line is explicit
155 /// about which artefact is missing.
156 #[error("chain.json absent for {ref_name}; the branch is unknown to the bucket")]
157 ChainAbsent {
158 /// The ref name the fetch asked about.
159 ref_name: String,
160 },
161
162 /// `chain.json` references a pack that is not present on the
163 /// bucket. Pinning this as a typed error so `doctor` can flag it
164 /// specifically rather than the operator having to disambiguate a
165 /// generic `NotFound` from a transient failure. Issue #64 calls
166 /// this out as a regression case to surface loudly rather than
167 /// silently zero-byte-fetch.
168 #[error("packchain: chain.json references missing pack at {key}")]
169 PackMissing {
170 /// Bucket-relative pack key recorded in `chain.json`.
171 key: String,
172 },
173
174 /// Baseline bundle (the `<full_at>.bundle` artefact) is missing.
175 /// Surfaces during a clone where the chain walk reached the root
176 /// segment but the baseline that should be alongside it is gone.
177 #[error("packchain: baseline bundle missing at {key}")]
178 BaselineMissing {
179 /// Bucket key of the missing `<full_at>.bundle`.
180 key: String,
181 },
182
183 /// Pack content SHA could not be derived (file shorter than the
184 /// 32-byte minimum PACK header + trailer, or an I/O error reading
185 /// the trailer).
186 #[error("pack content SHA unavailable: {0}")]
187 PackTrailer(String),
188
189 /// `gix_pack::data::output::count::objects` or `FromEntriesIter`
190 /// failed during pack emission.
191 #[error("pack build error: {0}")]
192 PackBuild(String),
193
194 /// `gix_pack::Bundle::write_to_directory` failed during the
195 /// post-pack `.idx` derivation pass.
196 #[error("pack index write error: {0}")]
197 PackIndexWrite(Box<gix_pack::bundle::write::Error>),
198
199 /// Underlying object-store transport / auth error.
200 #[error("packchain object-store error: {0}")]
201 Store(#[from] crate::object_store::ObjectStoreError),
202
203 /// Local I/O failure (tempdir, file read, file persist).
204 #[error("packchain I/O error: {0}")]
205 Io(#[from] std::io::Error),
206
207 /// [`read::read_blob`] was called against a remote whose resolved
208 /// engine is not [`crate::url::StorageEngine::Packchain`]. Surfaces
209 /// before any artefact lookup so callers see a typed mismatch
210 /// instead of a misleading `chain.json` not-found.
211 #[error(
212 "read_blob requires the packchain engine; this remote uses `{found}` — \
213 check the URL's `?engine=` parameter or the bucket's `FORMAT` key"
214 )]
215 WrongEngine {
216 /// Engine the remote actually resolved to.
217 found: crate::url::StorageEngine,
218 },
219
220 /// `path-index.json` is missing for the requested ref. Distinct
221 /// from [`Self::ChainAbsent`] so an operator sees which artefact is
222 /// gone — chain.json being present without path-index indicates a
223 /// crashed-mid-push state `manage gc` will reconcile.
224 #[error("path-index.json absent for {ref_name}; the branch's path map is unavailable")]
225 PathIndexAbsent {
226 /// The ref name [`read::read_blob`] was asked about.
227 ref_name: String,
228 },
229
230 /// Caller passed a `path` that does not exist in this commit's tree.
231 #[error("path `{path}` not found in {ref_name}")]
232 PathNotFound {
233 /// Ref the lookup ran against.
234 ref_name: String,
235 /// The path the caller asked for, returned verbatim.
236 path: String,
237 },
238
239 /// Caller passed a malformed path: empty, absolute (`/`-prefixed),
240 /// containing a `..` segment, or containing empty segments
241 /// (consecutive slashes). These shapes don't map to git tree
242 /// semantics; reject before walking.
243 #[error("malformed path `{path}`: {reason}")]
244 MalformedPath {
245 /// The rejected path, returned verbatim.
246 path: String,
247 /// Human-readable reason (`"empty"`, `"absolute"`, `"contains ..\""`, etc.).
248 reason: &'static str,
249 },
250
251 /// Path resolved to a tree node, not a blob — the caller asked for
252 /// a directory, not a file. Distinct from [`Self::PathNotFound`] so
253 /// the caller can distinguish "wrong shape" from "missing".
254 #[error("path `{path}` resolves to a directory, not a file")]
255 PathNotABlob {
256 /// The path the caller asked for.
257 path: String,
258 },
259
260 /// Blob SHA recorded in `path-index.json` was not present in any
261 /// pack referenced by `chain.json`. Indicates a corrupted bucket
262 /// (path-index points at a blob the chain doesn't carry); typed
263 /// distinctly so `doctor` can flag it specifically.
264 #[error("blob {sha} for path `{path}` not present in any chain pack")]
265 BlobNotInChain {
266 /// The blob SHA the path-index named.
267 sha: String,
268 /// The path the caller asked for.
269 path: String,
270 },
271
272 /// Pack entry header could not be decoded (truncated bytes, unknown
273 /// type id, non-canonical size encoding, etc.).
274 #[error("malformed pack entry at offset {offset}: {reason}")]
275 MalformedPackEntry {
276 /// Pack-relative offset of the entry that failed to decode.
277 offset: u64,
278 /// Human-readable reason from the underlying decoder.
279 reason: String,
280 },
281
282 /// Zlib stream embedded in a pack entry could not be inflated.
283 #[error("zlib decompression failure for entry at offset {offset}")]
284 Decompress {
285 /// Pack-relative offset of the entry whose payload failed.
286 offset: u64,
287 },
288
289 /// Delta resolution exceeded [`read::MAX_DELTA_DEPTH`]. Mirrors
290 /// git's own depth cap — most legitimate chains stay well under it,
291 /// so a deep chain is almost certainly a corrupted pack with a
292 /// delta cycle.
293 #[error("pack delta chain exceeds maximum depth ({max})")]
294 DeltaTooDeep {
295 /// The depth limit (always [`read::MAX_DELTA_DEPTH`]).
296 max: u32,
297 },
298
299 /// Delta payload could not be applied (truncated instructions,
300 /// out-of-range copy span, source size mismatch).
301 #[error("malformed delta payload: {reason}")]
302 MalformedDelta {
303 /// Human-readable reason from the delta decoder.
304 reason: &'static str,
305 },
306
307 /// `read_blob` was given a ref name that fails `gix-validate`'s
308 /// reference-name rules (empty, control characters, `..`, etc.).
309 #[error("invalid ref name `{name}`")]
310 InvalidRefName {
311 /// The ref name the caller passed.
312 name: String,
313 },
314
315 /// Tree closure walk encountered a cycle: a tree references itself
316 /// directly or transitively via an ancestor on the current descent.
317 /// Content-addressing makes cycles impossible in a healthy ODB, so
318 /// this surfaces a corrupted or adversarial repository rather than
319 /// looping unbounded and exhausting the call stack.
320 #[error("tree {oid} forms a cycle in the path-index walk")]
321 TreeCycle {
322 /// The tree OID whose presence in the ancestor set was detected.
323 oid: String,
324 },
325
326 /// Reader observed `chain.json` and `path-index.json` with
327 /// mismatched tips — a transient state during the brief window
328 /// where a push or compact has committed the new `chain.json` but
329 /// not yet overwritten `path-index.json` (issue #114). The reader
330 /// refuses to resolve a path against an out-of-sync path-index
331 /// because the resolved blob SHA may name a different file than
332 /// the caller intended; instead it surfaces this typed error so
333 /// the caller can retry. Subsequent reads converge once the writer
334 /// finishes the path-index PUT.
335 #[error(
336 "transient chain/path-index mismatch for {ref_name}: \
337 chain.tip = {chain_tip}, path_index.tip = {path_index_tip}; retry"
338 )]
339 TransientChainPathIndexMismatch {
340 /// The ref the lookup ran against.
341 ref_name: String,
342 /// Tip recorded in `chain.json` at read time.
343 chain_tip: String,
344 /// Tip recorded in `path-index.json` at read time.
345 path_index_tip: String,
346 },
347
348 /// [`read::read_blob`] retried [`Self::PackMissing`] failures the
349 /// configured number of times and gave up. Each retry reloaded
350 /// `chain.json` and observed that the failing pack key was no
351 /// longer referenced — consistent with a concurrent
352 /// `manage gc sweep` deleting compacted-away packs — but a fresh
353 /// `PackMissing` showed up on the new chain anyway, suggesting a
354 /// vigorous compact+sweep cycle that kept outpacing the reader.
355 /// Distinct from [`Self::PackMissing`] so callers can treat it as
356 /// "retry the whole `read_blob` call later" rather than as a
357 /// permanent bucket inconsistency (issue #136).
358 #[error(
359 "read_blob exhausted {attempts} retries against concurrent GC: \
360 last missing pack `{last_missing_key}`; retry the call"
361 )]
362 ConcurrentGcRetriesExhausted {
363 /// The key whose final `PackMissing` ended the retry loop.
364 last_missing_key: String,
365 /// Number of retry attempts that were made (excluding the
366 /// initial attempt). Pinned in the error so logs and tests can
367 /// assert on it.
368 attempts: u32,
369 },
370}
371
372impl From<gix_pack::bundle::write::Error> for PackchainError {
373 fn from(value: gix_pack::bundle::write::Error) -> Self {
374 Self::PackIndexWrite(Box::new(value))
375 }
376}
377
378impl From<tokio::task::JoinError> for PackchainError {
379 fn from(value: tokio::task::JoinError) -> Self {
380 // `JoinError` carries either a panic payload or a cancellation
381 // signal; flatten both through the Io variant since neither
382 // matches a more specific PackchainError category.
383 Self::Io(std::io::Error::other(value.to_string()))
384 }
385}
386
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the found and the expected schema version must appear in
    /// the rendered message.
    #[test]
    fn unsupported_schema_version_renders_both_versions() {
        let rendered = PackchainError::UnsupportedSchemaVersion {
            found: 2,
            expected: 1,
        }
        .to_string();

        assert_eq!(
            rendered,
            "packchain schema version 2 unsupported (this build reads v1)"
        );
    }

    /// The client-facing wire-line wording has to stay stable for the
    /// shellspec assertions, so the key phrase is pinned here as well.
    #[test]
    fn shallow_push_rejected_includes_actionable_wording() {
        let msg = PackchainError::ShallowPushRejected.to_string();
        let mentions_shallow_clone = msg.contains("shallow clone");

        assert!(
            mentions_shallow_clone,
            "shallow rejection wording must mention shallow clone: {msg}",
        );
    }
}