cli/bridge/git_reconstruct.rs
1// SPDX-License-Identifier: Apache-2.0
2#![deny(clippy::cast_possible_truncation)]
3
4//! Byte-exact git commit object serialization from a Heddle [`State`] (#566).
5//!
6//! Reconstructs the exact bytes `git cat-file commit <sha>` prints from the
7//! de-lossy fidelity fields #565 captured, so that re-framing (§0) and
8//! SHA-1-hashing the result reproduces the *original* commit's object id. This
9//! is the consumer that makes #565's fields load-bearing and the step that lets
10//! the git mirror be eliminated (#568): a commit can be rebuilt from Heddle
11//! state alone — no stored git object.
12//!
13//! The wire format is specced byte-for-byte in
14//! `.heddleco-orchestrator/briefs/spike-566-serializer-format.md`; the `§N`
15//! references below point into it. Tag-object reconstruction
16//! (`reconstruct_tag_bytes`) is deferred to #575, where annotated tags become
17//! first-class content-addressed objects; lightweight tags need no object (just
18//! a ref at the commit).
19
20use objects::{
21 object::{Principal, State},
22 store::{ObjectStore, StoreError},
23};
24use repo::Repository as HeddleRepository;
25use sley::{
26 GitObjectType, ObjectFormat, ObjectId, Repository as SleyRepository,
27 plumbing::sley_object::EncodedObject,
28};
29
30use crate::bridge::{
31 git_core::{GitBridge, GitBridgeError, GitResult, SyncMapping, git_err},
32 git_export::export_tree,
33};
34
/// Wrap `content` in git's hashing frame (spike §0):
/// `<kind> <ascii-decimal-len>\0<content>`.
///
/// The object id is the SHA-1 of this framed buffer, not of the bare content
/// that `git cat-file` prints. `<len>` is `content.len()` rendered in decimal
/// with no leading zeros, measured after all folding/newlines are in place.
pub fn frame_git_object(kind: &str, content: &[u8]) -> Vec<u8> {
    // Header is `<kind> <len>` terminated by a single NUL byte.
    let header = format!("{kind} {}\0", content.len());

    let mut framed = Vec::with_capacity(header.len() + content.len());
    framed.extend_from_slice(header.as_bytes());
    framed.extend_from_slice(content);
    framed
}
49
50/// The git object id (SHA-1) of a commit whose reconstructed content bytes are
51/// `content`: frame per §0, then hash. Equals the original commit SHA exactly
52/// when `content` is byte-identical to the original object.
53pub fn commit_object_id(content: &[u8]) -> ObjectId {
54 sley::plumbing::sley_core::object_id_for_bytes(ObjectFormat::Sha1, "commit", content)
55 .expect("SHA-1 commit object id over in-memory bytes cannot fail")
56}
57
58/// Reconstruct the byte-exact git commit object **content** (the bytes
59/// `git cat-file commit` prints, WITHOUT the §0 framing) for `state`.
60///
61/// `repo` is any writable sley repo: the git tree OID is resolved by re-exporting
62/// `state.tree` through [`export_tree`] (git trees are content-addressed, so the
63/// resulting OID is independent of which repo it is written into — the round-trip
64/// fidelity gate proves this path reproduces the original tree SHA). Parent OIDs
65/// come from the import `mapping` (`ChangeId` → original git OID), in
66/// `state.parents` order — order is part of a commit's identity (§1.2).
67pub fn reconstruct_commit_bytes(
68 heddle_repo: &HeddleRepository,
69 repo: &SleyRepository,
70 mapping: &SyncMapping,
71 state: &State,
72) -> GitResult<Vec<u8>> {
73 let tree_oid = export_tree(heddle_repo, repo, &state.tree)?;
74 let parent_oids = state
75 .parents
76 .iter()
77 .map(|parent| {
78 mapping
79 .get_git(parent)
80 .ok_or(GitBridgeError::StateNotFound(*parent))
81 })
82 .collect::<GitResult<Vec<_>>>()?;
83 build_commit_content(state, &tree_oid, &parent_oids)
84}
85
86/// Frame + write a reconstructed commit object's `content` bytes into `repo`'s
87/// object database, returning its git OID — the SHA-1 of the framed object (§0),
88/// equal to the original commit's id exactly when `content` is byte-identical to
89/// the original.
90///
91/// This is the write side of export-from-state (#567): export regenerates each
92/// commit object from Heddle state and writes it here, rather than relying on the
93/// git mirror still holding the verbatim imported bytes — the dependency #568
94/// removes. Idempotent: sley's object writer hashes first and no-ops when the
95/// object already exists, so re-writing a commit the mirror already carries (the
96/// common case today) costs nothing.
97pub fn write_commit_object(repo: &SleyRepository, content: &[u8]) -> GitResult<ObjectId> {
98 repo.write_object(EncodedObject::new(GitObjectType::Commit, content.to_vec()))
99 .map_err(git_err)
100}
101
102/// Assemble the commit content bytes from already-resolved OIDs. Pure (no repo,
103/// no mapping) so the byte layout — header order, actor lines, header folding,
104/// verbatim message — is unit-testable in isolation (§1/§2/§5/§6).
105fn build_commit_content(
106 state: &State,
107 tree_oid: &ObjectId,
108 parent_oids: &[ObjectId],
109) -> GitResult<Vec<u8>> {
110 let mut out = Vec::new();
111
112 // `tree` is always first, exactly once (§1.1).
113 out.extend_from_slice(b"tree ");
114 out.extend_from_slice(tree_oid.to_string().as_bytes());
115 out.push(b'\n');
116
117 // `parent` lines follow, zero or more, in recorded order (§1.2).
118 for parent in parent_oids {
119 out.extend_from_slice(b"parent ");
120 out.extend_from_slice(parent.to_string().as_bytes());
121 out.push(b'\n');
122 }
123
124 // `author` then `committer` (§1.3/§5). Author time/tz come from the #565
125 // `authored_at` + `authored_tz_offset` (with `created_at` as the native-commit
126 // fallback); committer identity/time/tz from the distinct `committer`
127 // Principal (author fallback when absent) + `created_at` + `committer_tz_offset`
128 // — NOT a hardcoded `+0000`.
129 let author_seconds = state.authored_at.unwrap_or(state.created_at).timestamp();
130 write_actor_line(
131 &mut out,
132 b"author",
133 &state.attribution.principal,
134 author_seconds,
135 state.authored_tz_offset,
136 )?;
137 let committer = state
138 .committer
139 .as_ref()
140 .unwrap_or(&state.attribution.principal);
141 write_actor_line(
142 &mut out,
143 b"committer",
144 committer,
145 state.created_at.timestamp(),
146 state.committer_tz_offset,
147 )?;
148
149 // Extension headers (`encoding`/`gpgsig`/`mergetag`/unknown) at their captured
150 // ordinal, multi-line values re-folded (§1.4/§2). The ordered `Vec` is the
151 // source of truth — gpgsig and mergetag are just entries here, never
152 // special-cased; when both are present git emits mergetag before gpgsig and
153 // the captured order already encodes that.
154 for (name, value) in &state.extra_headers {
155 out.extend_from_slice(name);
156 out.push(b' ');
157 append_folded(&mut out, value);
158 out.push(b'\n');
159 }
160
161 // Exactly one blank line separates headers from the body (§1.5) — always
162 // present, even for an empty message.
163 out.push(b'\n');
164
165 // Message bytes verbatim: no trim, no appended newline (§6). An empty message
166 // contributes zero bytes; a message without a trailing newline ends mid-line.
167 if let Some(message) = &state.raw_message {
168 out.extend_from_slice(message);
169 }
170
171 Ok(out)
172}
173
174/// `<label> <name> <<email>> <unix-seconds> <±HHMM>\n` (§5).
175fn write_actor_line(
176 out: &mut Vec<u8>,
177 label: &[u8],
178 who: &Principal,
179 seconds: i64,
180 tz_offset_secs: i32,
181) -> GitResult<()> {
182 let seconds = checked_actor_timestamp(label, seconds, tz_offset_secs)?;
183 out.extend_from_slice(label);
184 out.push(b' ');
185 out.extend_from_slice(who.name.as_bytes());
186 out.extend_from_slice(b" <");
187 out.extend_from_slice(who.email.as_bytes());
188 out.extend_from_slice(b"> ");
189 out.extend_from_slice(seconds.to_string().as_bytes());
190 out.push(b' ');
191 out.extend_from_slice(format_tz_offset(tz_offset_secs).as_bytes());
192 out.push(b'\n');
193 Ok(())
194}
195
196fn checked_actor_timestamp(label: &[u8], seconds: i64, tz_offset_secs: i32) -> GitResult<i64> {
197 // Git serializes UTC seconds plus a timezone offset. Validate the local
198 // seconds implied by that pair so malformed fidelity data cannot overflow
199 // reconstruct-time timestamp arithmetic.
200 seconds
201 .checked_add(i64::from(tz_offset_secs))
202 .map(|_| seconds)
203 .ok_or_else(|| {
204 let label = String::from_utf8_lossy(label);
205 GitBridgeError::Store(StoreError::InvalidObject(format!(
206 "{label} timestamp {seconds} with timezone offset {tz_offset_secs} overflows i64"
207 )))
208 })
209}
210
/// Format a timezone offset as git's `±HHMM` (§5).
///
/// `offset_secs` is in **seconds** east of UTC (#565's `i32` unit). A sign is
/// always emitted and zero renders as `+0000`; odd offsets such as `-0830` or
/// `+1245` come through unchanged. Any sub-minute remainder is discarded, and
/// the sign is taken from `offset_secs` itself.
fn format_tz_offset(offset_secs: i32) -> String {
    // `unsigned_abs` keeps `i32::MIN` well-defined.
    let total_minutes = offset_secs.unsigned_abs() / 60;
    let (hours, minutes) = (total_minutes / 60, total_minutes % 60);
    let sign = if offset_secs.is_negative() { '-' } else { '+' };
    format!("{sign}{hours:02}{minutes:02}")
}
220
/// Append a stored (unfolded) extension-header value to `out` in its folded
/// wire form (§2).
///
/// Every `\n` inside `value` is written as `\n` followed by one continuation
/// space. An internal blank line therefore becomes a line holding exactly one
/// space, never a truly empty line, which git would read as the header/body
/// separator. Documented as the exact inverse of the unfold in
/// `objects::object::parse_commit_extension_headers`.
fn append_folded(out: &mut Vec<u8>, value: &[u8]) {
    let mut segments = value.split(|&byte| byte == b'\n');

    // The first segment continues the header line it is appended to.
    if let Some(head) = segments.next() {
        out.extend_from_slice(head);
    }
    // Every later segment starts a continuation line.
    for continuation in segments {
        out.extend_from_slice(b"\n ");
        out.extend_from_slice(continuation);
    }
}
238
239impl GitBridge<'_> {
240 /// Open (initializing if necessary) a writable sley repo suitable for
241 /// reconstruction's tree-OID resolution. Any writable odb works — git trees
242 /// are content-addressed — so the bridge's own mirror is reused.
243 pub fn reconstruction_repo(&mut self) -> GitResult<SleyRepository> {
244 self.init_mirror()?;
245 self.open_git_repo()
246 }
247
248 /// Reconstruct the byte-exact commit content for `state` against `repo` (see
249 /// [`reconstruct_commit_bytes`]), using the bridge's import-built mapping for
250 /// parent OIDs.
251 pub fn reconstruct_commit_bytes(
252 &self,
253 repo: &SleyRepository,
254 state: &State,
255 ) -> GitResult<Vec<u8>> {
256 reconstruct_commit_bytes(self.heddle_repo, repo, &self.mapping, state)
257 }
258
259 /// Reconstruct `state`'s commit object from Heddle state and WRITE it into
260 /// `repo`'s object database, returning its git OID (see [`write_commit_object`]).
261 /// The export's commit-minting step (#567): the object is regenerated from
262 /// state, so it lands at the original SHA without the mirror needing to hold
263 /// the verbatim bytes.
264 pub fn reconstruct_and_write_commit(
265 &self,
266 repo: &SleyRepository,
267 state: &State,
268 ) -> GitResult<ObjectId> {
269 let content = self.reconstruct_commit_bytes(repo, state)?;
270 write_commit_object(repo, &content)
271 }
272
273 /// Reconstruct the commit currently mapped to the git object `sha` (40-hex),
274 /// or `None` if no Heddle state maps to it. Convenience for callers keyed by
275 /// the original git OID — e.g. the #566 conformance gate, which compares the
276 /// reconstruction of each original commit against its captured golden bytes.
277 pub fn reconstruct_commit_for_git_sha(
278 &self,
279 repo: &SleyRepository,
280 sha: &str,
281 ) -> GitResult<Option<Vec<u8>>> {
282 let oid = ObjectId::from_hex(ObjectFormat::Sha1, sha).map_err(git_err)?;
283 let Some(change_id) = self.mapping.get_heddle(oid) else {
284 return Ok(None);
285 };
286 let Some(state) = self.heddle_repo.store().get_state(&change_id)? else {
287 return Ok(None);
288 };
289 Ok(Some(reconstruct_commit_bytes(
290 self.heddle_repo,
291 repo,
292 &self.mapping,
293 &state,
294 )?))
295 }
296
297 /// Reconstruct the commit mapped to git object `sha` and WRITE it into `repo`,
298 /// returning the written OID (or `None` if no Heddle state maps to `sha`).
299 /// Combines [`Self::reconstruct_commit_for_git_sha`] with the odb write so the
300 /// #567 export-from-state path is exercisable against an arbitrary repo —
301 /// notably a FRESH one that never received the verbatim imported bytes, which
302 /// is how the export gate proves the object is regenerated from state, not
303 /// copied from the mirror.
304 pub fn reconstruct_and_write_commit_for_git_sha(
305 &self,
306 repo: &SleyRepository,
307 sha: &str,
308 ) -> GitResult<Option<ObjectId>> {
309 let oid = ObjectId::from_hex(ObjectFormat::Sha1, sha).map_err(git_err)?;
310 let Some(change_id) = self.mapping.get_heddle(oid) else {
311 return Ok(None);
312 };
313 let Some(state) = self.heddle_repo.store().get_state(&change_id)? else {
314 return Ok(None);
315 };
316 Ok(Some(self.reconstruct_and_write_commit(repo, &state)?))
317 }
318}
319
#[cfg(test)]
mod tests {
    use objects::object::parse_commit_extension_headers;

    use super::*;

    /// Whole-hour, zero, and odd sub-hour offsets all render as `±HHMM` (§5).
    #[test]
    fn tz_offset_renders_sign_hours_minutes() {
        let cases: [(i32, &str); 6] = [
            (0, "+0000"),
            (2 * 3600, "+0200"),
            (-8 * 3600, "-0800"),
            // Odd, sub-hour offsets survive verbatim (§5).
            (-(8 * 3600 + 30 * 60), "-0830"),
            (12 * 3600 + 45 * 60, "+1245"),
            (5 * 3600 + 30 * 60, "+0530"),
        ];
        for (offset_secs, expected) in cases {
            assert_eq!(format_tz_offset(offset_secs), expected);
        }
    }

    /// Framing is `<kind> <len>\0<content>`, including for empty content.
    #[test]
    fn frame_prepends_kind_len_nul() {
        let framed_abc = frame_git_object("commit", b"abc");
        assert_eq!(framed_abc, b"commit 3\0abc");

        let framed_empty = frame_git_object("commit", b"");
        assert_eq!(framed_empty, b"commit 0\0");
    }

    /// Folding a value and re-parsing the resulting header block recovers the
    /// stored value byte-for-byte.
    #[test]
    fn fold_then_unfold_round_trips() {
        // A gpgsig-shaped value, stored unfolded with no trailing newline
        // (§2): a leading line, the armor's internal blank line, body lines,
        // and the END marker.
        let value: &[u8] =
            b"-----BEGIN PGP SIGNATURE-----\n\niHUEsigbytes\nmoresig\n-----END PGP SIGNATURE-----";

        // Produce the wire form exactly as the serializer does.
        let folded = {
            let mut wire = b"gpgsig ".to_vec();
            append_folded(&mut wire, value);
            wire.push(b'\n');
            wire
        };

        // The internal blank line must become a line holding one space; an
        // empty line would terminate the header block.
        assert!(folded.windows(3).any(|w| w == b"\n \n"));

        // Embed the folded header in a minimal commit and parse it back.
        let parts: [&[u8]; 6] = [
            b"tree ",
            &[b'0'; 40],
            b"\n",
            b"author A <a@x> 1 +0000\ncommitter A <a@x> 1 +0000\n",
            &folded,
            b"\nbody\n",
        ];
        let content = parts.concat();

        let headers = parse_commit_extension_headers(&content);
        assert_eq!(headers.len(), 1);
        assert_eq!(headers[0].0, b"gpgsig");
        assert_eq!(headers[0].1, value);
    }

    /// An overflowing timestamp/offset pair is rejected before any bytes are
    /// written.
    #[test]
    fn write_actor_line_rejects_overflowing_timestamp_offset_arithmetic() {
        let mut out = Vec::new();
        let principal = Principal::new("A", "a@example.com");

        let outcome = write_actor_line(&mut out, b"author", &principal, i64::MAX, 1);
        let error = outcome.expect_err("timestamp plus timezone offset must not overflow");

        assert!(
            matches!(&error, GitBridgeError::Store(StoreError::InvalidObject(message)) if message.contains("overflows i64")),
            "expected InvalidObject overflow error, got: {error:?}",
        );
        assert!(
            out.is_empty(),
            "failed actor line must not emit partial bytes"
        );
    }

    /// A valid timestamp is serialized as given, with the offset as `±HHMM`.
    #[test]
    fn write_actor_line_valid_timestamp_is_unchanged() {
        let mut out = Vec::new();
        let principal = Principal::new("A", "a@example.com");

        let outcome = write_actor_line(&mut out, b"author", &principal, 1_700_000_000, -8 * 3600);
        outcome.expect("valid timestamp should serialize");

        assert_eq!(out, b"author A <a@example.com> 1700000000 -0800\n");
    }
}