Skip to main content

cli/bridge/
git_reconstruct.rs

1// SPDX-License-Identifier: Apache-2.0
2#![deny(clippy::cast_possible_truncation)]
3
//! Byte-exact git commit object serialization from a Heddle [`State`] (#566).
//!
//! Reconstructs the exact bytes `git cat-file commit <sha>` prints from the
//! de-lossy fidelity fields #565 captured, so that re-framing (§0) and
//! SHA-1-hashing the result reproduces the *original* commit's object id. This
//! is the consumer that makes #565's fields load-bearing and the step that lets
//! the git mirror be eliminated (#568): a commit can be rebuilt from Heddle
//! state alone — no stored git object.
//!
//! The wire format is specced byte-for-byte in
//! `.heddleco-orchestrator/briefs/spike-566-serializer-format.md`; the `§N`
//! references below point into it. Tag-object reconstruction
//! (`reconstruct_tag_bytes`) is deferred to #575, where annotated tags become
//! first-class content-addressed objects; lightweight tags need no object (just
//! a ref at the commit).
19
20use objects::{
21    object::{Principal, State},
22    store::{ObjectStore, StoreError},
23};
24use repo::Repository as HeddleRepository;
25use sley::{
26    GitObjectType, ObjectFormat, ObjectId, Repository as SleyRepository,
27    plumbing::sley_object::EncodedObject,
28};
29
30use crate::bridge::{
31    git_core::{GitBridge, GitBridgeError, GitResult, SyncMapping, git_err},
32    git_export::export_tree,
33};
34
/// Wrap `content` in git's object framing (spike §0):
/// `<kind> <ascii-decimal-len>\0<content>`.
///
/// A git object's id is the SHA-1 of THIS framed buffer, never of the bare
/// content (`git cat-file` prints the content with the framing stripped).
/// `<len>` is `content.len()` in bytes — i.e. measured after all
/// folding/newlines are already in place — rendered in decimal with no leading
/// zeros.
///
/// Returns a freshly allocated buffer; `content` itself is copied unchanged.
pub fn frame_git_object(kind: &str, content: &[u8]) -> Vec<u8> {
    // The whole header (`<kind> <len>\0`) is ASCII apart from whatever `kind`
    // carries, so it can be rendered in one shot and appended as bytes.
    let header = format!("{kind} {}\0", content.len());

    let mut framed = Vec::with_capacity(header.len() + content.len());
    framed.extend_from_slice(header.as_bytes());
    framed.extend_from_slice(content);
    framed
}
49
50/// The git object id (SHA-1) of a commit whose reconstructed content bytes are
51/// `content`: frame per §0, then hash. Equals the original commit SHA exactly
52/// when `content` is byte-identical to the original object.
53pub fn commit_object_id(content: &[u8]) -> ObjectId {
54    sley::plumbing::sley_core::object_id_for_bytes(ObjectFormat::Sha1, "commit", content)
55        .expect("SHA-1 commit object id over in-memory bytes cannot fail")
56}
57
58/// Reconstruct the byte-exact git commit object **content** (the bytes
59/// `git cat-file commit` prints, WITHOUT the §0 framing) for `state`.
60///
61/// `repo` is any writable sley repo: the git tree OID is resolved by re-exporting
62/// `state.tree` through [`export_tree`] (git trees are content-addressed, so the
63/// resulting OID is independent of which repo it is written into — the round-trip
64/// fidelity gate proves this path reproduces the original tree SHA). Parent OIDs
65/// come from the import `mapping` (`ChangeId` → original git OID), in
66/// `state.parents` order — order is part of a commit's identity (§1.2).
67pub fn reconstruct_commit_bytes(
68    heddle_repo: &HeddleRepository,
69    repo: &SleyRepository,
70    mapping: &SyncMapping,
71    state: &State,
72) -> GitResult<Vec<u8>> {
73    let tree_oid = export_tree(heddle_repo, repo, &state.tree)?;
74    let parent_oids = state
75        .parents
76        .iter()
77        .map(|parent| {
78            mapping
79                .get_git(parent)
80                .ok_or(GitBridgeError::StateNotFound(*parent))
81        })
82        .collect::<GitResult<Vec<_>>>()?;
83    build_commit_content(state, &tree_oid, &parent_oids)
84}
85
86/// Frame + write a reconstructed commit object's `content` bytes into `repo`'s
87/// object database, returning its git OID — the SHA-1 of the framed object (§0),
88/// equal to the original commit's id exactly when `content` is byte-identical to
89/// the original.
90///
91/// This is the write side of export-from-state (#567): export regenerates each
92/// commit object from Heddle state and writes it here, rather than relying on the
93/// git mirror still holding the verbatim imported bytes — the dependency #568
94/// removes. Idempotent: sley's object writer hashes first and no-ops when the
95/// object already exists, so re-writing a commit the mirror already carries (the
96/// common case today) costs nothing.
97pub fn write_commit_object(repo: &SleyRepository, content: &[u8]) -> GitResult<ObjectId> {
98    repo.write_object(EncodedObject::new(GitObjectType::Commit, content.to_vec()))
99        .map_err(git_err)
100}
101
102/// Assemble the commit content bytes from already-resolved OIDs. Pure (no repo,
103/// no mapping) so the byte layout — header order, actor lines, header folding,
104/// verbatim message — is unit-testable in isolation (§1/§2/§5/§6).
105fn build_commit_content(
106    state: &State,
107    tree_oid: &ObjectId,
108    parent_oids: &[ObjectId],
109) -> GitResult<Vec<u8>> {
110    let mut out = Vec::new();
111
112    // `tree` is always first, exactly once (§1.1).
113    out.extend_from_slice(b"tree ");
114    out.extend_from_slice(tree_oid.to_string().as_bytes());
115    out.push(b'\n');
116
117    // `parent` lines follow, zero or more, in recorded order (§1.2).
118    for parent in parent_oids {
119        out.extend_from_slice(b"parent ");
120        out.extend_from_slice(parent.to_string().as_bytes());
121        out.push(b'\n');
122    }
123
124    // `author` then `committer` (§1.3/§5). Author time/tz come from the #565
125    // `authored_at` + `authored_tz_offset` (with `created_at` as the native-commit
126    // fallback); committer identity/time/tz from the distinct `committer`
127    // Principal (author fallback when absent) + `created_at` + `committer_tz_offset`
128    // — NOT a hardcoded `+0000`.
129    let author_seconds = state.authored_at.unwrap_or(state.created_at).timestamp();
130    write_actor_line(
131        &mut out,
132        b"author",
133        &state.attribution.principal,
134        author_seconds,
135        state.authored_tz_offset,
136    )?;
137    let committer = state
138        .committer
139        .as_ref()
140        .unwrap_or(&state.attribution.principal);
141    write_actor_line(
142        &mut out,
143        b"committer",
144        committer,
145        state.created_at.timestamp(),
146        state.committer_tz_offset,
147    )?;
148
149    // Extension headers (`encoding`/`gpgsig`/`mergetag`/unknown) at their captured
150    // ordinal, multi-line values re-folded (§1.4/§2). The ordered `Vec` is the
151    // source of truth — gpgsig and mergetag are just entries here, never
152    // special-cased; when both are present git emits mergetag before gpgsig and
153    // the captured order already encodes that.
154    for (name, value) in &state.extra_headers {
155        out.extend_from_slice(name);
156        out.push(b' ');
157        append_folded(&mut out, value);
158        out.push(b'\n');
159    }
160
161    // Exactly one blank line separates headers from the body (§1.5) — always
162    // present, even for an empty message.
163    out.push(b'\n');
164
165    // Message bytes verbatim: no trim, no appended newline (§6). An empty message
166    // contributes zero bytes; a message without a trailing newline ends mid-line.
167    if let Some(message) = &state.raw_message {
168        out.extend_from_slice(message);
169    }
170
171    Ok(out)
172}
173
174/// `<label> <name> <<email>> <unix-seconds> <±HHMM>\n` (§5).
175fn write_actor_line(
176    out: &mut Vec<u8>,
177    label: &[u8],
178    who: &Principal,
179    seconds: i64,
180    tz_offset_secs: i32,
181) -> GitResult<()> {
182    let seconds = checked_actor_timestamp(label, seconds, tz_offset_secs)?;
183    out.extend_from_slice(label);
184    out.push(b' ');
185    out.extend_from_slice(who.name.as_bytes());
186    out.extend_from_slice(b" <");
187    out.extend_from_slice(who.email.as_bytes());
188    out.extend_from_slice(b"> ");
189    out.extend_from_slice(seconds.to_string().as_bytes());
190    out.push(b' ');
191    out.extend_from_slice(format_tz_offset(tz_offset_secs).as_bytes());
192    out.push(b'\n');
193    Ok(())
194}
195
196fn checked_actor_timestamp(label: &[u8], seconds: i64, tz_offset_secs: i32) -> GitResult<i64> {
197    // Git serializes UTC seconds plus a timezone offset. Validate the local
198    // seconds implied by that pair so malformed fidelity data cannot overflow
199    // reconstruct-time timestamp arithmetic.
200    seconds
201        .checked_add(i64::from(tz_offset_secs))
202        .map(|_| seconds)
203        .ok_or_else(|| {
204            let label = String::from_utf8_lossy(label);
205            GitBridgeError::Store(StoreError::InvalidObject(format!(
206                "{label} timestamp {seconds} with timezone offset {tz_offset_secs} overflows i64"
207            )))
208        })
209}
210
/// Render a timezone offset as git's `±HHMM` (§5).
///
/// `offset_secs` is in **seconds** east of UTC (#565's `i32` unit). The sign is
/// always printed; zero renders as `+0000` (git never emits `-0000` for a real
/// commit); odd offsets such as `-0830` / `+1245` survive verbatim. Any
/// sub-minute remainder is dropped by the integer division.
fn format_tz_offset(offset_secs: i32) -> String {
    // `unsigned_abs` keeps `i32::MIN` well-defined, unlike `abs`.
    let total_minutes = offset_secs.unsigned_abs() / 60;
    let (hours, minutes) = (total_minutes / 60, total_minutes % 60);

    if offset_secs < 0 {
        format!("-{hours:02}{minutes:02}")
    } else {
        format!("+{hours:02}{minutes:02}")
    }
}
220
/// Append a stored (unfolded) extension-header value to `out` in its folded
/// wire form (§2): every internal `\n` is written as `\n ` — a newline followed
/// by one continuation space.
///
/// A blank line inside the value therefore becomes a line holding exactly one
/// space, never a truly empty line, which git would read as the header/body
/// separator. This is the exact inverse of the unfold performed by
/// `objects::object::parse_commit_extension_headers`.
///
/// No trailing newline is added; the caller terminates the header line.
fn append_folded(out: &mut Vec<u8>, value: &[u8]) {
    let mut lines = value.split(|&byte| byte == b'\n');

    // `split` always yields at least one (possibly empty) segment; the first
    // one continues the header's own line and so gets no continuation prefix.
    if let Some(first_line) = lines.next() {
        out.extend_from_slice(first_line);
    }
    for continuation in lines {
        out.extend_from_slice(b"\n ");
        out.extend_from_slice(continuation);
    }
}
238
239impl GitBridge<'_> {
240    /// Open (initializing if necessary) a writable sley repo suitable for
241    /// reconstruction's tree-OID resolution. Any writable odb works — git trees
242    /// are content-addressed — so the bridge's own mirror is reused.
243    pub fn reconstruction_repo(&mut self) -> GitResult<SleyRepository> {
244        self.init_mirror()?;
245        self.open_git_repo()
246    }
247
248    /// Reconstruct the byte-exact commit content for `state` against `repo` (see
249    /// [`reconstruct_commit_bytes`]), using the bridge's import-built mapping for
250    /// parent OIDs.
251    pub fn reconstruct_commit_bytes(
252        &self,
253        repo: &SleyRepository,
254        state: &State,
255    ) -> GitResult<Vec<u8>> {
256        reconstruct_commit_bytes(self.heddle_repo, repo, &self.mapping, state)
257    }
258
259    /// Reconstruct `state`'s commit object from Heddle state and WRITE it into
260    /// `repo`'s object database, returning its git OID (see [`write_commit_object`]).
261    /// The export's commit-minting step (#567): the object is regenerated from
262    /// state, so it lands at the original SHA without the mirror needing to hold
263    /// the verbatim bytes.
264    pub fn reconstruct_and_write_commit(
265        &self,
266        repo: &SleyRepository,
267        state: &State,
268    ) -> GitResult<ObjectId> {
269        let content = self.reconstruct_commit_bytes(repo, state)?;
270        write_commit_object(repo, &content)
271    }
272
273    /// Reconstruct the commit currently mapped to the git object `sha` (40-hex),
274    /// or `None` if no Heddle state maps to it. Convenience for callers keyed by
275    /// the original git OID — e.g. the #566 conformance gate, which compares the
276    /// reconstruction of each original commit against its captured golden bytes.
277    pub fn reconstruct_commit_for_git_sha(
278        &self,
279        repo: &SleyRepository,
280        sha: &str,
281    ) -> GitResult<Option<Vec<u8>>> {
282        let oid = ObjectId::from_hex(ObjectFormat::Sha1, sha).map_err(git_err)?;
283        let Some(change_id) = self.mapping.get_heddle(oid) else {
284            return Ok(None);
285        };
286        let Some(state) = self.heddle_repo.store().get_state(&change_id)? else {
287            return Ok(None);
288        };
289        Ok(Some(reconstruct_commit_bytes(
290            self.heddle_repo,
291            repo,
292            &self.mapping,
293            &state,
294        )?))
295    }
296
297    /// Reconstruct the commit mapped to git object `sha` and WRITE it into `repo`,
298    /// returning the written OID (or `None` if no Heddle state maps to `sha`).
299    /// Combines [`Self::reconstruct_commit_for_git_sha`] with the odb write so the
300    /// #567 export-from-state path is exercisable against an arbitrary repo —
301    /// notably a FRESH one that never received the verbatim imported bytes, which
302    /// is how the export gate proves the object is regenerated from state, not
303    /// copied from the mirror.
304    pub fn reconstruct_and_write_commit_for_git_sha(
305        &self,
306        repo: &SleyRepository,
307        sha: &str,
308    ) -> GitResult<Option<ObjectId>> {
309        let oid = ObjectId::from_hex(ObjectFormat::Sha1, sha).map_err(git_err)?;
310        let Some(change_id) = self.mapping.get_heddle(oid) else {
311            return Ok(None);
312        };
313        let Some(state) = self.heddle_repo.store().get_state(&change_id)? else {
314            return Ok(None);
315        };
316        Ok(Some(self.reconstruct_and_write_commit(repo, &state)?))
317    }
318}
319
#[cfg(test)]
mod tests {
    use objects::object::parse_commit_extension_headers;

    use super::*;

    /// Offsets are stored in seconds and must render as signed `HHMM`.
    #[test]
    fn tz_offset_renders_sign_hours_minutes() {
        let cases = [
            (0, "+0000"),
            (2 * 3600, "+0200"),
            (-8 * 3600, "-0800"),
            // Odd, sub-hour offsets survive verbatim (§5).
            (-(8 * 3600 + 30 * 60), "-0830"),
            (12 * 3600 + 45 * 60, "+1245"),
            (5 * 3600 + 30 * 60, "+0530"),
        ];

        for &(offset_secs, expected) in &cases {
            assert_eq!(format_tz_offset(offset_secs), expected);
        }
    }

    /// Framing is `<kind> <len>\0<content>`, including for empty content.
    #[test]
    fn frame_prepends_kind_len_nul() {
        let framed_abc = frame_git_object("commit", b"abc");
        assert_eq!(framed_abc, b"commit 3\0abc");

        let framed_empty = frame_git_object("commit", b"");
        assert_eq!(framed_empty, b"commit 0\0");
    }

    /// Folding a value and re-parsing the resulting header block must give the
    /// original unfolded value back.
    #[test]
    fn fold_then_unfold_round_trips() {
        // A gpgsig-shaped value: a leading line, an internal blank line (the
        // armor's empty line), then body lines and the END marker — stored
        // unfolded, with no trailing newline (§2).
        let value: &[u8] =
            b"-----BEGIN PGP SIGNATURE-----\n\niHUEsigbytes\nmoresig\n-----END PGP SIGNATURE-----";

        // Fold the way the serializer writes the wire.
        let mut folded = b"gpgsig ".to_vec();
        append_folded(&mut folded, value);
        folded.push(b'\n');

        // The internal blank line must fold to a line that is exactly one space,
        // never an empty line (which would terminate the header block).
        let has_single_space_line = folded.windows(3).any(|window| window == b"\n \n");
        assert!(has_single_space_line);

        // Re-parsing a minimal commit header block carrying this folded header
        // must recover the original unfolded value byte-for-byte.
        let zero_oid = [b'0'; 40];
        let parts: [&[u8]; 7] = [
            b"tree ",
            &zero_oid,
            b"\n",
            b"author A <a@x> 1 +0000\n",
            b"committer A <a@x> 1 +0000\n",
            &folded,
            b"\nbody\n",
        ];
        let content = parts.concat();

        let headers = parse_commit_extension_headers(&content);
        assert_eq!(headers.len(), 1);
        assert_eq!(headers[0].0, b"gpgsig");
        assert_eq!(headers[0].1, value);
    }

    /// An overflowing timestamp/offset pair is rejected before any byte is
    /// written to the output buffer.
    #[test]
    fn write_actor_line_rejects_overflowing_timestamp_offset_arithmetic() {
        let principal = Principal::new("A", "a@example.com");
        let mut out = Vec::new();

        let outcome = write_actor_line(&mut out, b"author", &principal, i64::MAX, 1);
        let error = outcome.expect_err("timestamp plus timezone offset must not overflow");

        assert!(
            matches!(&error, GitBridgeError::Store(StoreError::InvalidObject(message)) if message.contains("overflows i64")),
            "expected InvalidObject overflow error, got: {error:?}",
        );
        assert!(
            out.is_empty(),
            "failed actor line must not emit partial bytes"
        );
    }

    /// A valid timestamp is serialized untouched, with the offset as `±HHMM`.
    #[test]
    fn write_actor_line_valid_timestamp_is_unchanged() {
        let principal = Principal::new("A", "a@example.com");
        let mut out = Vec::new();

        let outcome = write_actor_line(&mut out, b"author", &principal, 1_700_000_000, -8 * 3600);
        outcome.expect("valid timestamp should serialize");

        assert_eq!(out, b"author A <a@example.com> 1700000000 -0800\n");
    }
}