Skip to main content

objects/object/
state_core.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Core state type and its leaf value types (Status, StateSignature,
3//! SignatureStatus, Verification).
4
5use std::collections::BTreeMap;
6
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9
10use super::{Attribution, ChangeId, ContentHash, Principal};
11
12// ── Status ──────────────────────────────────────────────────────────
13
/// Lifecycle status of a state.
///
/// `Draft` is the `Default` variant. Each variant maps to a single byte via
/// [`Status::to_byte`] / [`Status::from_byte`] (`Draft` = 0, `Published` = 1).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum Status {
    /// The default status; encoded as byte `0`.
    #[default]
    Draft,
    /// Encoded as byte `1`.
    Published,
}
21
22impl Status {
23    pub fn to_byte(&self) -> u8 {
24        match self {
25            Status::Draft => 0,
26            Status::Published => 1,
27        }
28    }
29
30    pub fn from_byte(b: u8) -> Option<Self> {
31        match b {
32            0 => Some(Status::Draft),
33            1 => Some(Status::Published),
34            _ => None,
35        }
36    }
37}
38
39// ── StateSignature ──────────────────────────────────────────────────
40
/// Signature information for a state.
///
/// All three parts are stored as plain strings; this type does not itself
/// validate or decode them.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct StateSignature {
    /// Identifier of the signing algorithm.
    pub algorithm: String,
    /// Public key of the signer, as a string (encoding is not fixed here).
    pub public_key: String,
    /// The signature itself, as a string (encoding is not fixed here).
    pub signature: String,
}
48
49impl StateSignature {
50    pub fn algorithm(&self) -> &str {
51        &self.algorithm
52    }
53}
54
/// Signature verification result.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignatureStatus {
    Valid,
    Invalid,
    Unsigned,
}

impl SignatureStatus {
    /// `true` only for [`SignatureStatus::Valid`].
    pub fn is_valid(self) -> bool {
        matches!(self, Self::Valid)
    }

    /// `true` only for [`SignatureStatus::Unsigned`].
    pub fn is_unsigned(self) -> bool {
        matches!(self, Self::Unsigned)
    }
}
72
73// ── Verification ────────────────────────────────────────────────────
74
/// Verification information for a state.
///
/// Every built-in metric is optional; [`Verification::is_empty`] reports
/// whether nothing at all has been recorded. `PartialEq` is derived but `Eq`
/// is not, because the coverage fields are `f32`.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct Verification {
    /// Whether the tests passed, when known.
    pub tests_passed: Option<bool>,
    /// Number of failed tests, when known.
    pub tests_failed: Option<u32>,
    /// Coverage percentage, when known (scale is not fixed by this type).
    pub coverage_pct: Option<f32>,
    /// Coverage change, when known (baseline is not fixed by this type).
    pub coverage_delta: Option<f32>,
    /// Number of lint warnings, when known.
    pub lint_warnings: Option<u32>,
    /// Free-form extra entries. Defaults to empty when absent from the
    /// serialized form. A `BTreeMap` iterates in key order, so hashing the
    /// entries is deterministic.
    #[serde(default)]
    pub custom: BTreeMap<String, serde_json::Value>,
}
86
87impl Verification {
88    pub fn new() -> Self {
89        Self::default()
90    }
91
92    pub fn with_tests_passed(mut self, passed: bool) -> Self {
93        self.tests_passed = Some(passed);
94        self
95    }
96
97    pub fn with_tests_failed(mut self, failed: u32) -> Self {
98        self.tests_failed = Some(failed);
99        self
100    }
101
102    pub fn is_empty(&self) -> bool {
103        self.tests_passed.is_none()
104            && self.tests_failed.is_none()
105            && self.coverage_pct.is_none()
106            && self.coverage_delta.is_none()
107            && self.lint_warnings.is_none()
108            && self.custom.is_empty()
109    }
110
111    pub(crate) fn hash_len(&self) -> usize {
112        let mut len = 0;
113        len += 1 + self.tests_passed.map(|_| 1).unwrap_or(0);
114        len += 1 + self.tests_failed.map(|_| 4).unwrap_or(0);
115        len += 1 + self.coverage_pct.map(|_| 4).unwrap_or(0);
116        len += 1 + self.coverage_delta.map(|_| 4).unwrap_or(0);
117        len += 1 + self.lint_warnings.map(|_| 4).unwrap_or(0);
118        len += 4;
119        for (key, value) in &self.custom {
120            let value_bytes = serde_json::to_vec(value).unwrap_or_default();
121            len += 4 + key.len();
122            len += 4 + value_bytes.len();
123        }
124        len
125    }
126
127    pub(crate) fn update_hasher(&self, hasher: &mut blake3::Hasher) {
128        let tests_passed = self.tests_passed.map(u8::from);
129        write_optional_u8(hasher, tests_passed);
130        write_optional_u32(hasher, self.tests_failed);
131        write_optional_f32(hasher, self.coverage_pct);
132        write_optional_f32(hasher, self.coverage_delta);
133        write_optional_u32(hasher, self.lint_warnings);
134        let custom_len = self.custom.len() as u32;
135        hasher.update(&custom_len.to_le_bytes());
136        for (key, value) in &self.custom {
137            let key_bytes = key.as_bytes();
138            let value_bytes = serde_json::to_vec(value).unwrap_or_default();
139            hasher.update(&(key_bytes.len() as u32).to_le_bytes());
140            hasher.update(key_bytes);
141            hasher.update(&(value_bytes.len() as u32).to_le_bytes());
142            hasher.update(&value_bytes);
143        }
144    }
145}
146
147fn write_optional_u8(hasher: &mut blake3::Hasher, value: Option<u8>) {
148    match value {
149        Some(v) => {
150            hasher.update(&[1]);
151            hasher.update(&[v]);
152        }
153        None => {
154            hasher.update(&[0]);
155        }
156    }
157}
158
159fn write_optional_u32(hasher: &mut blake3::Hasher, value: Option<u32>) {
160    match value {
161        Some(v) => {
162            hasher.update(&[1]);
163            hasher.update(&v.to_le_bytes());
164        }
165        None => {
166            hasher.update(&[0]);
167        }
168    }
169}
170
171fn write_optional_f32(hasher: &mut blake3::Hasher, value: Option<f32>) {
172    match value {
173        Some(v) => {
174            hasher.update(&[1]);
175            hasher.update(&v.to_le_bytes());
176        }
177        None => {
178            hasher.update(&[0]);
179        }
180    }
181}
182
183// ── State ───────────────────────────────────────────────────────────
184
/// A state is an immutable snapshot with rich metadata.
///
/// On-disk encoding is rmp-serde's positional struct format (a fixed-length
/// tuple). This is sensitive to field order: inserting a field in the middle
/// of the tuple breaks every pre-existing on-disk state. The invariant we
/// keep going forward is:
///
/// > **New optional fields are added at the tail of the struct, below
/// > `status`, with `#[serde(default)]`.** Mid-struct inserts are
/// > forbidden. rmp-serde's positional deserializer tolerates missing
/// > trailing fields when they have a `Default` impl, so tail-only growth
/// > is forward-compatible automatically.
///
/// Required (non-optional) fields — `change_id`, `tree`, `parents`,
/// `attribution`, `created_at`, `status` — must never move. Optional fields
/// may be reordered only among themselves, and only at the tail.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct State {
    /// Identifier of this state object. Not itself fed into the content
    /// hash; the hash covers `logical_change_id` instead.
    pub change_id: ChangeId,
    /// Memoized result of `compute_hash`, filled lazily by `hash()`. Never
    /// serialized (`#[serde(skip)]`); builders that change a hashed field
    /// reset it to `None`.
    #[serde(skip)]
    content_hash: Option<ContentHash>,
    /// Content hash of the snapshot's tree. Part of the state hash.
    pub tree: ContentHash,
    /// Parent states, in order. Empty for a root, more than one for a merge.
    pub parents: Vec<ChangeId>,
    /// Who authored the state (principal, plus an optional agent).
    pub attribution: Attribution,
    /// Optional intent text. Part of the state hash.
    pub intent: Option<String>,
    /// Optional confidence; `with_confidence` clamps it to `0.0..=1.0`.
    pub confidence: Option<f32>,
    /// Committer time (see the `authored_at` docs below). Hashed at whole-
    /// second resolution.
    pub created_at: DateTime<Utc>,
    /// Optional verification record. Part of the state hash.
    pub verification: Option<Verification>,
    /// Optional authoring signature. `with_signature` does not invalidate the
    /// cached hash, and the core hash routine does not read this field.
    pub signature: Option<StateSignature>,
    /// Lifecycle status. Hashed as a single byte via `Status::to_byte`.
    pub status: Status,
    // --- tail-only optional fields below. Add new fields here, never above. ---
    /// Optional provenance hash. Part of the state hash.
    #[serde(default)]
    pub provenance: Option<ContentHash>,
    /// Logical change this state belongs to. `logical_change_id()` falls back
    /// to `change_id` when this is `None`. Part of the state hash.
    #[serde(default)]
    pub logical_change_id: Option<ChangeId>,
    /// Optional context tree root for code annotations.
    #[serde(default)]
    pub context: Option<ContentHash>,
    /// Authoring timestamp for this state, when distinct from
    /// `created_at`.
    ///
    /// `created_at` is the *committer* time — when the state object
    /// came into being in its current form. `authored_at` is the
    /// *author* time — when someone actually wrote the change — which
    /// survives `git rebase`, cherry-pick, squash-merge, and `git
    /// commit --amend`. The ingest-backed `bridge git import` path fills
    /// this from the git author time; native heddle commits leave it
    /// `None` and blame falls back to `created_at`.
    ///
    /// **Part of the state hash (#564 de-lossy step 1).** Author time
    /// is part of a git commit's identity: two commits that differ
    /// *only* by author timestamp are distinct git objects, so folding
    /// it into the hash keeps them from dedup-colliding to one State in
    /// the content-addressed store. `None` hashes as a single absence
    /// byte, so native commits are unaffected beyond the format bump.
    #[serde(default)]
    pub authored_at: Option<DateTime<Utc>>,
    /// Content hash of the state's [`RiskSignalBlob`](crate::object::RiskSignalBlob),
    /// when present. Computed and persisted whenever risk signals fire on a
    /// state. `None` for states from before W1 and for states where no
    /// signals fired.
    ///
    /// **Not part of the state hash**: `with_risk_signals` leaves the cached
    /// hash untouched, and neither the core hash routine nor the hashed-length
    /// calculations read this field. Legacy states without this field
    /// deserialize as `None`.
    #[serde(default)]
    pub risk_signals: Option<ContentHash>,
    /// Content hash of the state's [`ReviewSignaturesBlob`](crate::object::ReviewSignaturesBlob),
    /// when reviewers have signed off (read / agent-preview / agent-co-review).
    /// Not part of the state hash (see `with_review_signatures`).
    #[serde(default)]
    pub review_signatures: Option<ContentHash>,
    /// Content hash of the state's [`DiscussionsBlob`](crate::object::DiscussionsBlob),
    /// when discussions are anchored to this state. Not part of the state
    /// hash (see `with_discussions`).
    #[serde(default)]
    pub discussions: Option<ContentHash>,
    /// Content hash of the state's [`StructuredConflict`](crate::object::StructuredConflict),
    /// when this state captures an unresolved merge conflict as data. Not part
    /// of the state hash (see `with_structured_conflicts`).
    #[serde(default)]
    pub structured_conflicts: Option<ContentHash>,
    // --- git-fidelity fields (#564 de-lossy step 1, #565) ---
    //
    // These preserve the parts of an imported git commit that Heddle's
    // model used to drop, so a commit can be byte-reconstructed later
    // (#566/#567) and the git mirror can be eliminated (#568). UNLIKE the
    // W1 tail fields above, these ARE part of the content hash (see
    // `update_hash`): two git-distinct commits that differ only in
    // committer, timezone, verbatim message, gpgsig, or extra headers must
    // hash differently so they can't dedup-collide in the content-addressed
    // store. They are still tail-append + `#[serde(default)]` so legacy
    // on-disk states keep deserializing.
    /// The git committer identity, when distinct from the author
    /// ([`Attribution::principal`]). Git records both an author (who wrote
    /// the change) and a committer (who created this commit object); for
    /// rebased / cherry-picked / amended commits the two differ. `None`
    /// for native heddle commits and for legacy imports from before #565.
    #[serde(default)]
    pub committer: Option<Principal>,
    /// Timezone offset (seconds east of UTC) of the *author* timestamp
    /// ([`State::authored_at`] / `created_at` fallback). Git stores the
    /// author's local offset (e.g. `+0000`, `-0700`); Heddle used to
    /// discard it. `0` for native commits and legacy imports.
    #[serde(default)]
    pub authored_tz_offset: i32,
    /// Timezone offset (seconds east of UTC) of the *committer* timestamp
    /// (`created_at`). `0` for native commits and legacy imports.
    #[serde(default)]
    pub committer_tz_offset: i32,
    /// The verbatim git commit message body (everything after the header
    /// block), preserved exactly so reconstruction is byte-stable. Distinct
    /// from `intent`, which is the trimmed first line surfaced in the UI.
    /// `None` for native commits and legacy imports.
    ///
    /// Stored as raw bytes, NOT a `String`: a commit with a non-UTF8
    /// `encoding` (latin-1, shift-jis, …) carries message bytes that are not
    /// valid UTF-8 (e.g. `0xe9` for latin-1 `é`); a `String` could not
    /// round-trip them byte-identically. (non-UTF8 author/committer identity
    /// *names* are not yet byte-preserved — `Principal` is still `String`; see
    /// #564.)
    #[serde(default)]
    pub raw_message: Option<Vec<u8>>,
    /// The SINGLE canonical "this state's content is NOT byte-faithful to the
    /// original git object" marker (#567). Set to `true` by lossy import
    /// population paths whenever an unrepresentable tree entry was dropped or
    /// converted during import, so the rebuilt tree (hence commit) no longer
    /// hashes to the original SHA. The git-export fidelity guard reads this one
    /// flag to decide whether reconstruct-from-state is safe, instead of
    /// enumerating import surfaces. `false` for native heddle commits and for
    /// lossless imports.
    ///
    /// Provenance metadata, NOT part of the content hash: a lossy import always
    /// drops/converts tree entries, so its tree — and therefore the rest of the
    /// hashed identity — already differs from a lossless import of the same
    /// source; folding the flag in would add nothing but break every existing
    /// content hash.
    #[serde(default)]
    pub git_lossy: bool,
    /// Every git commit header beyond the ones Heddle models natively
    /// (tree/parents/author/committer), in their original order. ORDER IS
    /// LOAD-BEARING for #566 byte-exactness — this is a `Vec`, never a map.
    /// Empty for native commits and legacy imports.
    ///
    /// `gpgsig` is just one of these headers and is kept INLINE at its
    /// captured ordinal (not split into a separate field): when a commit's
    /// extension headers are in non-canonical order — e.g. `x-custom`, then
    /// `gpgsig`, then `mergetag` — splitting gpgsig out would lose its
    /// position and break byte-identical reconstruction. The serialization
    /// source of truth for the signature is its position here (spike §3).
    ///
    /// Both the header name and value are raw bytes (`Vec<u8>`), NOT
    /// `String`s: extra-header VALUES (a `mergetag` payload is a full tag
    /// object; custom headers; gpgsig armor) can be non-UTF8, so a
    /// `String` would force a lossy `to_string()` that destroys those bytes.
    /// Names are ASCII by git's spec but are bytes too so the whole tuple is
    /// byte-exact and no conversion sneaks in.
    #[serde(default)]
    pub extra_headers: Vec<(Vec<u8>, Vec<u8>)>,
}
342
343impl State {
    /// Create a new state. Shorthand for [`State::new_snapshot`], to which it
    /// forwards all three arguments unchanged.
    pub fn new(tree: ContentHash, parents: Vec<ChangeId>, attribution: Attribution) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
347
348    pub fn new_snapshot(
349        tree: ContentHash,
350        parents: Vec<ChangeId>,
351        attribution: Attribution,
352    ) -> Self {
353        let change_id = ChangeId::generate();
354        Self::new_with_logical_change_id(tree, parents, attribution, change_id)
355    }
356
    /// Create a state for a merge. Currently identical to
    /// [`State::new_snapshot`]: the merge-ness comes solely from passing more
    /// than one entry in `parents`.
    pub fn new_merge(tree: ContentHash, parents: Vec<ChangeId>, attribution: Attribution) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
360
    /// Create a state that belongs to an existing logical change.
    ///
    /// The new state gets a freshly generated `change_id`, while its
    /// `logical_change_id` is set to the caller-supplied `logical_change_id`.
    pub fn new_refresh_of(
        tree: ContentHash,
        parents: Vec<ChangeId>,
        attribution: Attribution,
        logical_change_id: ChangeId,
    ) -> Self {
        Self::new_with_logical_change_id(tree, parents, attribution, logical_change_id)
    }
369
    /// Create a state for a fork. Currently identical to
    /// [`State::new_snapshot`]: it does not inherit a logical change id from
    /// any other state.
    pub fn new_fork_of(
        tree: ContentHash,
        parents: Vec<ChangeId>,
        attribution: Attribution,
    ) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
377
    /// Create a state for a collapse. Currently identical to
    /// [`State::new_snapshot`]: it does not inherit a logical change id from
    /// any other state.
    pub fn new_collapse_of(
        tree: ContentHash,
        parents: Vec<ChangeId>,
        attribution: Attribution,
    ) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
385
386    fn new_with_logical_change_id(
387        tree: ContentHash,
388        parents: Vec<ChangeId>,
389        attribution: Attribution,
390        logical_change_id: ChangeId,
391    ) -> Self {
392        Self {
393            change_id: ChangeId::generate(),
394            logical_change_id: Some(logical_change_id),
395            content_hash: None,
396            tree,
397            parents,
398            attribution,
399            intent: None,
400            confidence: None,
401            created_at: Utc::now(),
402            verification: None,
403            signature: None,
404            provenance: None,
405            context: None,
406            authored_at: None,
407            risk_signals: None,
408            review_signatures: None,
409            discussions: None,
410            structured_conflicts: None,
411            committer: None,
412            authored_tz_offset: 0,
413            committer_tz_offset: 0,
414            raw_message: None,
415            git_lossy: false,
416            extra_headers: Vec::new(),
417            status: Status::Draft,
418        }
419    }
420
421    pub fn with_intent(mut self, intent: impl Into<String>) -> Self {
422        self.intent = Some(intent.into());
423        self.content_hash = None;
424        self
425    }
426
427    pub fn with_confidence(mut self, confidence: f32) -> Self {
428        self.confidence = Some(confidence.clamp(0.0, 1.0));
429        self.content_hash = None;
430        self
431    }
432
433    pub fn with_verification(mut self, verification: Verification) -> Self {
434        self.verification = Some(verification);
435        self.content_hash = None;
436        self
437    }
438
439    pub fn with_signature(mut self, signature: StateSignature) -> Self {
440        self.signature = Some(signature);
441        self
442    }
443
444    pub fn with_provenance(mut self, provenance: ContentHash) -> Self {
445        self.provenance = Some(provenance);
446        self.content_hash = None;
447        self
448    }
449
450    /// Set the context tree root.
451    pub fn with_context(mut self, context: ContentHash) -> Self {
452        self.context = Some(context);
453        self.content_hash = None;
454        self
455    }
456
457    /// Attach a [`RiskSignalBlob`](crate::object::RiskSignalBlob) hash.
458    /// Render-time tick budgeting (selecting which signals to surface) is a
459    /// view over this stored data, not part of storage itself.
460    ///
461    /// **Not part of the state hash.** Risk signals are derived data computed
462    /// *about* a state from the diff against its parent; including them in
463    /// identity would make the same logical state hash differently depending
464    /// on which signals fired. That breaks every "is this the same state?"
465    /// check in the system. See `authored_at` for the same pattern.
466    pub fn with_risk_signals(mut self, risk_signals: ContentHash) -> Self {
467        self.risk_signals = Some(risk_signals);
468        self
469    }
470
471    /// Attach a [`ReviewSignaturesBlob`](crate::object::ReviewSignaturesBlob)
472    /// hash. The state's authoring [`StateSignature`] is unaffected; review
473    /// signatures live alongside it and accumulate over time.
474    ///
475    /// **Not part of the state hash.** Review signatures accumulate
476    /// post-capture; including them in identity would mean every signature
477    /// re-keys the state. See `authored_at` for the same pattern.
478    pub fn with_review_signatures(mut self, review_signatures: ContentHash) -> Self {
479        self.review_signatures = Some(review_signatures);
480        self
481    }
482
483    /// Attach a [`DiscussionsBlob`](crate::object::DiscussionsBlob) hash.
484    ///
485    /// **Not part of the state hash.** Discussions evolve independently of
486    /// the state they're anchored to — appending a turn must not change the
487    /// state's identity. See `authored_at` for the same pattern.
488    pub fn with_discussions(mut self, discussions: ContentHash) -> Self {
489        self.discussions = Some(discussions);
490        self
491    }
492
493    /// Attach a [`StructuredConflict`](crate::object::StructuredConflict) hash.
494    ///
495    /// **Not part of the state hash.** Conflict objects describe the merge's
496    /// disagreement; the state's tree and parents already encode what's being
497    /// merged. See `authored_at` for the same pattern.
498    pub fn with_structured_conflicts(mut self, structured_conflicts: ContentHash) -> Self {
499        self.structured_conflicts = Some(structured_conflicts);
500        self
501    }
502
503    /// Record the authoring timestamp separately from `created_at`.
504    /// Used by the git-ingest importer to preserve the distinction
505    /// between "when the change was originally written" (authored)
506    /// and "when this commit object came into being" (committer time,
507    /// stored in `created_at` so re-imports stay deterministic).
508    /// Native heddle commits leave this `None`; blame display then
509    /// falls back to `created_at`.
510    ///
511    /// **Part of the state hash (#564 de-lossy step 1)** — see the
512    /// `authored_at` field docs and `update_hash`.
513    pub fn with_authored_at(mut self, timestamp: DateTime<Utc>) -> Self {
514        self.authored_at = Some(timestamp);
515        self.content_hash = None;
516        self
517    }
518
519    /// Record the git committer identity (distinct from the author).
520    ///
521    /// **Part of the state hash** — see the `committer` field docs and
522    /// `update_hash`. #564 de-lossy step 1.
523    pub fn with_committer(mut self, committer: Principal) -> Self {
524        self.committer = Some(committer);
525        self.content_hash = None;
526        self
527    }
528
529    /// Record the author/committer timezone offsets (seconds east of UTC).
530    /// **Part of the state hash.** #564 de-lossy step 1.
531    pub fn with_tz_offsets(mut self, authored: i32, committer: i32) -> Self {
532        self.authored_tz_offset = authored;
533        self.committer_tz_offset = committer;
534        self.content_hash = None;
535        self
536    }
537
538    /// Record the verbatim git commit message body, as raw bytes (so a
539    /// non-UTF8 message round-trips byte-identically; see the `raw_message`
540    /// field docs). **Part of the state hash.** #564 de-lossy step 1.
541    pub fn with_raw_message(mut self, raw_message: impl AsRef<[u8]>) -> Self {
542        self.raw_message = Some(raw_message.as_ref().to_vec());
543        self.content_hash = None;
544        self
545    }
546
547    /// Mark this state's content as NOT byte-faithful to the original git
548    /// object — set by the `--lossy` import/ingest paths when a tree entry was
549    /// dropped or converted. The git-export fidelity guard reads this single
550    /// signal to skip reconstruct-from-state (#567). Not part of the content
551    /// hash (see the `git_lossy` field docs).
552    pub fn with_git_lossy(mut self, git_lossy: bool) -> Self {
553        self.git_lossy = git_lossy;
554        self.content_hash = None;
555        self
556    }
557
558    /// Record the ordered remaining git commit headers as raw bytes. ORDER
559    /// IS LOAD-BEARING (#566). **Part of the state hash.** #564 de-lossy
560    /// step 1.
561    pub fn with_extra_headers(mut self, extra_headers: Vec<(Vec<u8>, Vec<u8>)>) -> Self {
562        self.extra_headers = extra_headers;
563        self.content_hash = None;
564        self
565    }
566
567    pub fn with_status(mut self, status: Status) -> Self {
568        self.status = status;
569        self.content_hash = None;
570        self
571    }
572
573    pub fn with_change_id(mut self, change_id: ChangeId) -> Self {
574        let previous_change_id = self.change_id;
575        self.change_id = change_id;
576        if self.logical_change_id == Some(previous_change_id) || self.logical_change_id.is_none() {
577            self.logical_change_id = Some(change_id);
578            self.content_hash = None;
579        }
580        self
581    }
582
583    pub fn with_logical_change_id(mut self, logical_change_id: ChangeId) -> Self {
584        self.logical_change_id = Some(logical_change_id);
585        self.content_hash = None;
586        self
587    }
588
589    pub fn logical_change_id(&self) -> ChangeId {
590        self.logical_change_id.unwrap_or(self.change_id)
591    }
592
593    pub fn with_timestamp(mut self, timestamp: DateTime<Utc>) -> Self {
594        self.created_at = timestamp;
595        self.content_hash = None;
596        self
597    }
598
599    pub fn compute_hash(&self) -> ContentHash {
600        let content_len = self.hash_len();
601        ContentHash::compute_typed_with_len("state", content_len, |hasher| {
602            self.update_hash(hasher);
603        })
604    }
605
606    /// The pre-#565 content hash: the hash a state had BEFORE the git-fidelity
607    /// fields were folded into identity (the format bump in #565). It omits the
608    /// trailing fidelity block from both the hashed bytes AND the content-length
609    /// prefix, exactly as the old code did — so for a state signed before the
610    /// bump, this reproduces the hash its `StateSignature` was actually made
611    /// over.
612    ///
613    /// The #570 fidelity backfill verifies an existing signature against this
614    /// (in addition to the current `compute_hash`) before re-signing: a legacy
615    /// signature was made over THIS hash, not the post-bump one, so checking
616    /// only the new hash would wrongly reject a valid legacy signature as
617    /// unreproducible. #565 only *appended* the fidelity block to `hash_len` /
618    /// `update_hash`, so stopping before it is a faithful pre-bump hash.
619    pub fn compute_hash_pre_fidelity(&self) -> ContentHash {
620        let content_len = self.hash_len_core();
621        ContentHash::compute_typed_with_len("state", content_len, |hasher| {
622            self.update_hash_core(hasher);
623        })
624    }
625
626    pub fn hash(&mut self) -> ContentHash {
627        if self.content_hash.is_none() {
628            self.content_hash = Some(self.compute_hash());
629        }
630        self.content_hash.expect("hash was just computed above")
631    }
632
633    pub fn is_root(&self) -> bool {
634        self.parents.is_empty()
635    }
636
637    pub fn is_merge(&self) -> bool {
638        self.parents.len() > 1
639    }
640
    /// `true` when the attribution carries agent information
    /// (`attribution.agent` is `Some`).
    pub fn is_agent_authored(&self) -> bool {
        self.attribution.agent.is_some()
    }
644
    /// The first entry of `parents`, or `None` for a root state.
    pub fn first_parent(&self) -> Option<&ChangeId> {
        self.parents.first()
    }
648
649    fn hash_len(&self) -> u64 {
650        self.hash_len_core() + self.hash_len_fidelity()
651    }
652
653    /// Hashed length of the pre-#565 fields (everything through the status
654    /// byte). Mirrors [`Self::update_hash_core`]. Split out so the pre-bump
655    /// hash ([`Self::compute_hash_pre_fidelity`]) can be reproduced exactly.
656    fn hash_len_core(&self) -> u64 {
657        let principal = &self.attribution.principal;
658        let mut len = 0u64;
659
660        len += 1;
661        if self.logical_change_id.is_some() {
662            len += 16;
663        }
664
665        len += self.tree.as_bytes().len() as u64;
666        len += 4;
667        len += (self.parents.len() * 16) as u64;
668
669        len += principal.name.len() as u64 + 1;
670        len += principal.email.len() as u64 + 1;
671
672        len += 1;
673        if let Some(agent) = &self.attribution.agent {
674            len += agent.provider.len() as u64 + 1;
675            len += agent.model.len() as u64 + 1;
676
677            len += 1;
678            if let Some(session_id) = &agent.session_id {
679                len += session_id.len() as u64 + 1;
680            }
681
682            len += 1;
683            if let Some(policy_id) = &agent.policy_id {
684                len += policy_id.len() as u64 + 1;
685            }
686        }
687
688        len += 1;
689        if let Some(intent) = &self.intent {
690            len += intent.len() as u64 + 1;
691        }
692
693        len += 1;
694        if self.confidence.is_some() {
695            len += 4;
696        }
697
698        len += 8;
699
700        len += 1;
701        if let Some(verification) = &self.verification {
702            len += verification.hash_len() as u64;
703        }
704
705        len += 1;
706        if self.provenance.is_some() {
707            len += 32;
708        }
709
710        len += 1;
711        if self.context.is_some() {
712            len += 32;
713        }
714
715        len += 1;
716
717        len
718    }
719
720    /// Hashed length of the appended git-fidelity block (#565). Mirrors
721    /// [`Self::update_hash_fidelity`] byte-for-byte. Kept separate from
722    /// [`Self::hash_len_core`] so the pre-bump hash can omit it exactly.
723    fn hash_len_fidelity(&self) -> u64 {
724        let mut len = 0u64;
725
726        // git-fidelity fields (#564 step 1). Must mirror `update_hash`
727        // byte-for-byte. committer: 1 tag byte + (name+NUL, email+NUL).
728        len += 1;
729        if let Some(committer) = &self.committer {
730            len += committer.name.len() as u64 + 1;
731            len += committer.email.len() as u64 + 1;
732        }
733        // both tz offsets: i32 LE, always present.
734        len += 4;
735        len += 4;
736        // authored_at (author time): 1 tag byte + (i64 LE when Some).
737        len += 1;
738        if self.authored_at.is_some() {
739            len += 8;
740        }
741        // raw_message: optional-bytes framing (1 tag + u32 len + bytes) — a
742        // length prefix, not NUL-termination, since the message can contain
743        // NUL bytes (it's byte-typed for non-UTF8 fidelity).
744        len += 1;
745        if let Some(raw_message) = &self.raw_message {
746            len += 4 + raw_message.len() as u64;
747        }
748        // extra_headers (gpgsig rides inline here at its captured position):
749        // u32 count, then per pair u32 key_len+key, u32 val_len+val.
750        len += 4;
751        for (key, value) in &self.extra_headers {
752            len += 4 + key.len() as u64;
753            len += 4 + value.len() as u64;
754        }
755
756        len
757    }
758
    /// Feed the full hashed representation into `hasher`: the core fields
    /// first, then the git-fidelity block. The order is part of the hash
    /// format and must match the order assumed by `hash_len`.
    fn update_hash(&self, hasher: &mut blake3::Hasher) {
        self.update_hash_core(hasher);
        self.update_hash_fidelity(hasher);
    }
763
    /// Hash the pre-#565 fields (everything through the status byte). Mirrors
    /// [`Self::hash_len_core`]. The pre-bump hash
    /// ([`Self::compute_hash_pre_fidelity`]) is exactly this with no fidelity
    /// block appended.
    ///
    /// The write order below IS the hash format: reordering, adding, or
    /// removing a write changes every state hash, and any such change must be
    /// reflected in `hash_len_core`. `change_id` and `signature` are not
    /// written here.
    fn update_hash_core(&self, hasher: &mut blake3::Hasher) {
        let principal = &self.attribution.principal;

        // logical_change_id: presence tag, then the id bytes when set.
        if let Some(logical_change_id) = self.logical_change_id {
            hasher.update(&[1]);
            hasher.update(logical_change_id.as_bytes());
        } else {
            hasher.update(&[0]);
        }

        // tree hash, then u32 LE parent count followed by each parent id.
        hasher.update(self.tree.as_bytes());
        hasher.update(&(self.parents.len() as u32).to_le_bytes());
        for parent in &self.parents {
            hasher.update(parent.as_bytes());
        }

        // principal name and email, each NUL-terminated.
        hasher.update(principal.name.as_bytes());
        hasher.update(&[0]);
        hasher.update(principal.email.as_bytes());
        hasher.update(&[0]);

        // agent: presence tag, NUL-terminated provider and model, then three
        // optional strings. All three must be counted in `hash_len_core`.
        if let Some(agent) = &self.attribution.agent {
            hasher.update(&[1]);
            hasher.update(agent.provider.as_bytes());
            hasher.update(&[0]);
            hasher.update(agent.model.as_bytes());
            hasher.update(&[0]);
            write_optional_string(hasher, &agent.session_id);
            write_optional_string(hasher, &agent.segment_id);
            write_optional_string(hasher, &agent.policy_id);
        } else {
            hasher.update(&[0]);
        }

        write_optional_string(hasher, &self.intent);

        // confidence: presence tag, then the f32's little-endian bytes.
        if let Some(confidence) = self.confidence {
            hasher.update(&[1]);
            hasher.update(&confidence.to_le_bytes());
        } else {
            hasher.update(&[0]);
        }

        // created_at at whole-second resolution (i64 LE); sub-second
        // precision does not affect the hash.
        hasher.update(&self.created_at.timestamp().to_le_bytes());

        // verification: presence tag, then its own framed encoding.
        if let Some(verification) = &self.verification {
            hasher.update(&[1]);
            verification.update_hasher(hasher);
        } else {
            hasher.update(&[0]);
        }

        // provenance: presence tag, then the hash bytes.
        if let Some(provenance) = self.provenance {
            hasher.update(&[1]);
            hasher.update(provenance.as_bytes());
        } else {
            hasher.update(&[0]);
        }

        // context: presence tag, then the hash bytes.
        if let Some(context) = self.context {
            hasher.update(&[1]);
            hasher.update(context.as_bytes());
        } else {
            hasher.update(&[0]);
        }

        // status: a single byte, always present; the last core field.
        hasher.update(&[self.status.to_byte()]);
    }
836
837    /// Hash the appended git-fidelity block (#565). Mirrors
838    /// [`Self::hash_len_fidelity`]. Kept separate from
839    /// [`Self::update_hash_core`] so a pre-bump hash can omit it exactly.
840    ///
841    /// git-fidelity fields (#564 de-lossy step 1, #565) are DELIBERATELY part
842    /// of the content hash — the opposite of the W1 tail fields. Two git
843    /// commits that differ only in committer, author/committer time, timezone,
844    /// verbatim message, or extra headers (gpgsig included) are distinct git
845    /// objects; folding these into identity prevents them from dedup-colliding
846    /// to one State in the content-addressed store. This re-hashes every
847    /// pre-#565 state (a real format bump; acceptable pre-0.3). Keep this in
848    /// sync with `hash_len_fidelity`.
849    fn update_hash_fidelity(&self, hasher: &mut blake3::Hasher) {
850        if let Some(committer) = &self.committer {
851            hasher.update(&[1]);
852            hasher.update(committer.name.as_bytes());
853            hasher.update(&[0]);
854            hasher.update(committer.email.as_bytes());
855            hasher.update(&[0]);
856        } else {
857            hasher.update(&[0]);
858        }
859
860        hasher.update(&self.authored_tz_offset.to_le_bytes());
861        hasher.update(&self.committer_tz_offset.to_le_bytes());
862
863        // Author time (#564): committer time is hashed above as created_at;
864        // author time is the other half of a git commit's temporal identity.
865        if let Some(authored_at) = self.authored_at {
866            hasher.update(&[1]);
867            hasher.update(&authored_at.timestamp().to_le_bytes());
868        } else {
869            hasher.update(&[0]);
870        }
871
872        write_optional_bytes(hasher, &self.raw_message);
873
874        // extra_headers (gpgsig is one of these, kept inline at its position).
875        hasher.update(&(self.extra_headers.len() as u32).to_le_bytes());
876        for (key, value) in &self.extra_headers {
877            hasher.update(&(key.len() as u32).to_le_bytes());
878            hasher.update(key);
879            hasher.update(&(value.len() as u32).to_le_bytes());
880            hasher.update(value);
881        }
882    }
883}
884
885/// Length-prefixed optional-bytes framing for the hash: `[1] + u32-LE len +
886/// bytes` when `Some`, a single `[0]` when `None`. Unlike
887/// [`write_optional_string`]'s NUL-terminated framing this is binary-safe —
888/// `raw_message` can contain NUL bytes, so a length prefix (not a terminator)
889/// is required to keep the hash unambiguous.
890fn write_optional_bytes(hasher: &mut blake3::Hasher, value: &Option<Vec<u8>>) {
891    match value {
892        Some(bytes) => {
893            hasher.update(&[1]);
894            hasher.update(&(bytes.len() as u32).to_le_bytes());
895            hasher.update(bytes);
896        }
897        None => {
898            hasher.update(&[0]);
899        }
900    }
901}
902
903fn write_optional_string(hasher: &mut blake3::Hasher, value: &Option<String>) {
904    match value {
905        Some(value) => {
906            hasher.update(&[1]);
907            hasher.update(value.as_bytes());
908            hasher.update(&[0]);
909        }
910        None => {
911            hasher.update(&[0]);
912        }
913    }
914}
915
/// Parse the *extension* headers from a raw git commit object's content bytes
/// (the bytes `git cat-file commit <sha>` prints — i.e. gix's `Commit::data`),
/// in their exact on-the-wire order, ready to store in [`State::extra_headers`].
///
/// A commit's header block runs from the start of the content up to the first
/// blank line (the header/body separator). Its leading headers are always, in
/// fixed order, `tree`, zero-or-more `parent`, `author`, `committer`; Heddle
/// models those natively. Every header **after** `committer` is an extension
/// header (`encoding`, `gpgsig`, `mergetag`, or any unknown/future name) and is
/// returned here as a `(name, value)` byte pair at its real position.
///
/// **This is the single source of truth for extension-header order and bytes.**
/// Both git import paths (the CLI bridge and the ingest walker) build
/// `extra_headers` from it. The alternative — stitching the vec back together
/// from a decoder's *typed* accessors (gix surfaces `encoding`, and historically
/// `gpgsig`, as fields *outside* its `extra_headers`) — silently reorders the
/// headers git happens to model as typed fields, which breaks #566 byte-exact
/// reconstruction. So we never consult those typed accessors for position; the
/// raw header block is authoritative. (#564 de-lossy step 1 — close-the-class.)
///
/// Folded continuation lines (a value line beginning with a single space
/// `0x20`, used by `gpgsig`/`mergetag`) are **unfolded**: each continuation
/// contributes a `\n` plus the line with exactly one leading space stripped, so
/// the stored value holds the value's real internal newlines with no trailing
/// newline. The serializer (#566) re-folds by mapping every `\n` back to `\n `
/// (spike §2). A "blank" line inside an armored value is ` \n` on the wire (one
/// space), so it unfolds to an empty segment — never confused with the
/// header/body separator, which is a truly empty line.
///
/// Empty input, and header-only content that merely ends with a newline, yield
/// no phantom header: a zero-length line produced by splitting on `\n` is a
/// line-terminator artifact, not a header, and is skipped.
///
/// # Returns
///
/// The ordered `(name, unfolded value)` pairs of every header after
/// `committer`. If no `committer` header is present, every header whose name
/// is not one of `tree`/`parent`/`author`/`committer` is returned instead.
pub fn parse_commit_extension_headers(commit_content: &[u8]) -> Vec<(Vec<u8>, Vec<u8>)> {
    // The header block ends at the first *empty* line. Folded "blank" lines
    // inside an armored value are ` \n` (a single space), never empty, so the
    // first `\n\n` reliably marks the header/body boundary.
    let header_block = match find_subslice(commit_content, b"\n\n") {
        Some(idx) => &commit_content[..idx],
        // No separator (malformed / header-only) — treat all of it as headers.
        None => commit_content,
    };

    // Collect every logical header (name, unfolded value) in order; the
    // extension headers are the ones after the `committer` line.
    let mut headers: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
    for line in header_block.split(|&b| b == b'\n') {
        // The block contains no `\n\n`, so a zero-length line can only be a
        // split artifact: the whole input was empty, or separator-less content
        // ended with (or began with) a newline. Recording it would fabricate
        // an extension header with an empty name and value, so skip it. No
        // bytes are lost — the newline is only a line terminator.
        if line.is_empty() {
            continue;
        }
        if let Some((&b' ', continuation)) = line.split_first() {
            // Continuation of the current header value: restore the newline
            // that folding replaced and strip exactly one leading space.
            if let Some((_, value)) = headers.last_mut() {
                value.push(b'\n');
                value.extend_from_slice(continuation);
            }
            // A continuation with no preceding header is malformed git; skip it
            // rather than panic.
            continue;
        }
        // New header: `name<SP>value`. A non-empty header line with no space is
        // degenerate (git never emits one in this region) — record it with an
        // empty value so no bytes are silently dropped.
        let (name, value) = match line.iter().position(|&b| b == b' ') {
            Some(sp) => (line[..sp].to_vec(), line[sp + 1..].to_vec()),
            None => (line.to_vec(), Vec::new()),
        };
        headers.push((name, value));
    }

    // Extension headers are everything strictly after `committer`. git always
    // emits exactly one committer line ahead of the extension headers; if it is
    // somehow absent, fall back to excluding the four core names so nothing is
    // silently dropped or mis-captured.
    match headers.iter().position(|(name, _)| name == b"committer") {
        Some(idx) => headers.split_off(idx + 1),
        None => headers
            .into_iter()
            .filter(|(name, _)| {
                !matches!(
                    name.as_slice(),
                    b"tree" | b"parent" | b"author" | b"committer"
                )
            })
            .collect(),
    }
}

/// Index of the first occurrence of `needle` in `haystack`, or `None`.
///
/// An empty `needle`, or one longer than `haystack`, never matches; the guard
/// also keeps `windows` from being called with a zero window size, which
/// panics.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() || needle.len() > haystack.len() {
        return None;
    }
    haystack.windows(needle.len()).position(|w| w == needle)
}
1004
#[cfg(test)]
mod tests {
    use super::*;
    use crate::object::Principal;

    /// Fixture: an attribution for principal Alice, built with
    /// `Attribution::human`.
    fn sample_attribution() -> Attribution {
        Attribution::human(Principal::new("Alice", "alice@example.com"))
    }

    /// A state from `State::new_snapshot` has its `logical_change_id` field
    /// set, and that logical id differs from the state's own `change_id`.
    #[test]
    fn new_snapshot_sets_fresh_logical_identity() {
        let state =
            State::new_snapshot(ContentHash::compute(b"tree"), vec![], sample_attribution());
        let logical_change_id = state
            .logical_change_id
            .expect("snapshot should set logical identity");
        assert_ne!(state.logical_change_id(), state.change_id);
        assert_eq!(state.logical_change_id(), logical_change_id);
    }

    /// `State::new_refresh_of` keeps the logical change id it is handed, while
    /// the new state's own `change_id` is a different value.
    #[test]
    fn new_refresh_preserves_explicit_logical_identity() {
        let logical_change_id = ChangeId::from_bytes([7; 16]);
        let state = State::new_refresh_of(
            ContentHash::compute(b"tree"),
            vec![],
            sample_attribution(),
            logical_change_id,
        );
        assert_eq!(state.logical_change_id(), logical_change_id);
        assert_ne!(state.change_id, logical_change_id);
    }

    /// A two-parent `State::new_merge` sets a logical change id distinct from
    /// its `change_id` and reports `is_merge()`.
    #[test]
    fn new_merge_uses_fresh_logical_identity() {
        let state = State::new_merge(
            ContentHash::compute(b"tree"),
            vec![ChangeId::from_bytes([1; 16]), ChangeId::from_bytes([2; 16])],
            sample_attribution(),
        );
        let logical_change_id = state
            .logical_change_id
            .expect("merge should set logical identity");
        assert_ne!(state.logical_change_id(), state.change_id);
        assert_eq!(state.logical_change_id(), logical_change_id);
        assert!(state.is_merge());
    }

    /// With the logical id first aligned to the change id, `with_change_id`
    /// must move the reported logical id to the replacement, change the hash,
    /// and leave the cached hash equal to a fresh `compute_hash`.
    #[test]
    fn with_change_id_invalidates_cached_hash_when_logical_identity_changes() {
        let mut state =
            State::new_snapshot(ContentHash::compute(b"tree"), vec![], sample_attribution());
        let previous_change_id = state.change_id;
        state = state.with_logical_change_id(previous_change_id);
        let original_hash = state.hash();
        let replacement = ChangeId::from_bytes([9; 16]);

        let mut updated = state.with_change_id(replacement);

        assert_eq!(updated.logical_change_id(), replacement);
        assert_ne!(updated.hash(), original_hash);
        assert_eq!(updated.hash(), updated.compute_hash());
    }

    /// Two states built with the same tree, logical change id and timestamp,
    /// whose agents differ only in the segment id (`seg-1` vs `seg-2`), must
    /// hash differently.
    #[test]
    fn agent_segment_is_part_of_state_hash() {
        let principal = Principal::new("Alice", "alice@example.com");
        let attribution_a = Attribution::with_agent(
            principal.clone(),
            crate::object::Agent::new("openai", "gpt-5").with_session("sess-1", "seg-1"),
        );
        let attribution_b = Attribution::with_agent(
            principal,
            crate::object::Agent::new("openai", "gpt-5").with_session("sess-1", "seg-2"),
        );
        let tree = ContentHash::compute(b"tree");
        let timestamp = Utc::now();
        let logical_change_id = ChangeId::from_bytes([3; 16]);
        let state_a = State::new_snapshot(tree, vec![], attribution_a)
            .with_logical_change_id(logical_change_id)
            .with_timestamp(timestamp);
        let state_b = State::new_snapshot(tree, vec![], attribution_b)
            .with_logical_change_id(logical_change_id)
            .with_timestamp(timestamp);

        assert_ne!(state_a.compute_hash(), state_b.compute_hash());
    }

    /// Fixture: a parentless snapshot state over a fixed tree hash with the
    /// sample attribution.
    fn sample_state() -> State {
        State::new_snapshot(ContentHash::compute(b"tree"), vec![], sample_attribution())
    }

    /// Shared check for the `with_*` builder tests: record the hash, apply
    /// `mutate`, then require that the hash changed and that `hash()` agrees
    /// with a fresh `compute_hash()` on the mutated state.
    fn assert_mutator_invalidates_cached_hash(
        mut state: State,
        mutate: impl FnOnce(State) -> State,
    ) {
        let original_hash = state.hash();
        let mut updated = mutate(state);
        assert_ne!(updated.hash(), original_hash);
        assert_eq!(updated.hash(), updated.compute_hash());
    }

    /// `with_intent` must change the hash and keep it recomputable.
    #[test]
    fn with_intent_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |state| {
            state.with_intent("capture intent")
        });
    }

    /// `with_confidence` must change the hash and keep it recomputable.
    #[test]
    fn with_confidence_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |state| state.with_confidence(0.9));
    }

    /// `with_verification` must change the hash and keep it recomputable.
    #[test]
    fn with_verification_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |state| {
            state.with_verification(Verification::new().with_tests_passed(true))
        });
    }

    /// `with_status` must change the hash and keep it recomputable.
    #[test]
    fn with_status_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |state| {
            state.with_status(Status::Published)
        });
    }

    /// `with_timestamp` must change the hash and keep it recomputable. The
    /// replacement is one second ahead of "now" because the hash covers
    /// `created_at` at whole-second (`timestamp()`) resolution.
    #[test]
    fn with_timestamp_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |state| {
            state.with_timestamp(Utc::now() + chrono::Duration::seconds(1))
        });
    }

    /// Locks the contract that W1 tail-append fields (risk_signals,
    /// review_signatures, discussions, structured_conflicts) are NOT
    /// part of the state hash. Adding them to identity would mean the
    /// same logical state hashes differently depending on what signals
    /// fired, what review signatures arrived, or whether a discussion
    /// was anchored — which would break every "same state?" check in
    /// the system. Their persistence is independent of identity.
    #[test]
    fn w1_tail_fields_are_not_part_of_state_hash() {
        let mut bare = sample_state();
        let bare_hash = bare.hash();

        let mut decorated = sample_state()
            .with_change_id(bare.change_id)
            .with_logical_change_id(bare.logical_change_id())
            .with_risk_signals(ContentHash::compute(b"risk-signals-blob"))
            .with_review_signatures(ContentHash::compute(b"review-signatures-blob"))
            .with_discussions(ContentHash::compute(b"discussions-blob"))
            .with_structured_conflicts(ContentHash::compute(b"conflicts-blob"));
        // `created_at` is hashed; align it so only the tail fields differ.
        decorated.created_at = bare.created_at;

        assert_eq!(
            decorated.hash(),
            bare_hash,
            "W1 tail fields must not affect the state hash"
        );
    }

    /// The inverse of `w1_tail_fields_are_not_part_of_state_hash`: the
    /// git-fidelity fields (#564 step 1) MUST be part of the hash so two
    /// git-distinct commits can't dedup-collide. Each field, set in
    /// isolation, must move the hash.
    #[test]
    fn fidelity_fields_are_part_of_state_hash() {
        let base = sample_state();
        let base_hash = base.compute_hash();

        // Committer is checked on its own, outside the closure table below.
        let with_committer = sample_state()
            .with_change_id(base.change_id)
            .with_logical_change_id(base.logical_change_id());
        let mut with_committer =
            with_committer.with_committer(Principal::new("Carol", "carol@example.com"));
        // `created_at` is hashed; align it so only the committer differs.
        with_committer.created_at = base.created_at;
        assert_ne!(
            with_committer.hash(),
            base_hash,
            "committer must affect the state hash"
        );

        // One mutator per remaining fidelity field, each applied to a freshly
        // seeded state that otherwise matches `base`.
        for mutate in [
            |s: State| s.with_tz_offsets(3600, -7200),
            |s: State| s.with_authored_at(Utc::now() + chrono::Duration::seconds(1)),
            |s: State| s.with_raw_message("verbatim body\n"),
            // gpgsig now rides inline in extra_headers at its captured position.
            |s: State| {
                s.with_extra_headers(vec![(
                    b"gpgsig".to_vec(),
                    b"-----BEGIN PGP SIGNATURE-----\n".to_vec(),
                )])
            },
            |s: State| s.with_extra_headers(vec![(b"mergetag".to_vec(), b"x".to_vec())]),
        ] {
            let seeded = sample_state()
                .with_change_id(base.change_id)
                .with_logical_change_id(base.logical_change_id());
            let mut decorated = mutate(seeded);
            decorated.created_at = base.created_at;
            assert_ne!(
                decorated.hash(),
                base_hash,
                "fidelity field must affect the state hash"
            );
        }
    }

    /// Pins `compute_hash_pre_fidelity` to a fixed hex digest for a fully
    /// populated fixture, and requires that digest to differ from the current
    /// `compute_hash` of the same state.
    #[test]
    fn pre_fidelity_hash_matches_legacy_golden_vector() {
        let state = State::new_snapshot(
            ContentHash::compute(b"issue-633-tree"),
            vec![ChangeId::from_bytes([0x11; 16])],
            Attribution::with_agent(
                Principal::new("Legacy Author", "legacy@example.com"),
                crate::object::Agent::new("openai", "gpt-5")
                    .with_session("session-633", "segment-001")
                    .with_policy("policy-legacy"),
            ),
        )
        .with_logical_change_id(ChangeId::from_bytes([0x63; 16]))
        .with_intent("freeze pre-565 hash")
        .with_confidence(0.875)
        .with_timestamp(DateTime::from_timestamp(1_700_000_000, 0).expect("valid timestamp"))
        .with_committer(Principal::new("Legacy Committer", "committer@example.com"))
        .with_tz_offsets(3600, -18000)
        .with_authored_at(DateTime::from_timestamp(1_699_999_000, 0).expect("valid timestamp"))
        .with_raw_message(b"legacy commit message\n")
        .with_extra_headers(vec![(b"encoding".to_vec(), b"UTF-8".to_vec())])
        .with_status(Status::Published);

        let legacy_hash = state.compute_hash_pre_fidelity();
        // Golden vector for the pre-#565 state hash format. Legacy
        // StateSignature verification depends on `compute_hash_pre_fidelity`
        // staying byte-identical to that old format; if `hash_len_core` and
        // `update_hash_core` drift, real pre-#565 signatures become
        // unverifiable even though round-trip tests can still pass.
        assert_eq!(
            legacy_hash.to_hex(),
            "b89e1b40e681a1bf88679db7cfcacdafb1f370bc40ed5d50760dae1d4ab49dab",
        );
        assert_ne!(
            legacy_hash,
            state.compute_hash(),
            "fixture must distinguish the pre-#565 legacy path from the current hash",
        );
    }

    /// extra_headers order is load-bearing (#566): the same pairs in a
    /// different order must hash differently.
    #[test]
    fn extra_headers_order_affects_hash() {
        let base = sample_state();
        let one = sample_state()
            .with_change_id(base.change_id)
            .with_logical_change_id(base.logical_change_id());
        let mut one = one.with_extra_headers(vec![
            (b"a".to_vec(), b"1".to_vec()),
            (b"b".to_vec(), b"2".to_vec()),
        ]);
        one.created_at = base.created_at;

        let two = sample_state()
            .with_change_id(base.change_id)
            .with_logical_change_id(base.logical_change_id());
        let mut two = two.with_extra_headers(vec![
            (b"b".to_vec(), b"2".to_vec()),
            (b"a".to_vec(), b"1".to_vec()),
        ]);
        two.created_at = base.created_at;

        assert_ne!(one.hash(), two.hash());
    }

    /// The fidelity fields set together produce a stable, recomputable
    /// hash (guards against a `hash_len`/`update_hash` divergence making
    /// the cached hash differ from a fresh `compute_hash`).
    #[test]
    fn fidelity_fields_hash_is_stable() {
        let mut state = sample_state()
            .with_committer(Principal::new("Dave", "dave@example.com"))
            .with_tz_offsets(3600, 0)
            .with_authored_at(Utc::now())
            .with_raw_message("body\n")
            .with_extra_headers(vec![
                (b"gpgsig".to_vec(), b"sig".to_vec()),
                (b"k".to_vec(), b"v".to_vec()),
            ]);
        assert_eq!(state.hash(), state.compute_hash());
    }

    /// A non-UTF8 git message body (latin-1 `café` = `caf\xe9`) must be
    /// stored byte-identically. `raw_message` is `Vec<u8>`, not `String`,
    /// precisely so these bytes survive; the hash stays stable/recomputable
    /// over the raw bytes (length-prefixed framing, NUL-safe). #564 step 1.
    #[test]
    fn non_utf8_raw_message_is_byte_preserved() {
        let raw = b"caf\xe9\n".to_vec();
        assert!(
            String::from_utf8(raw.clone()).is_err(),
            "test fixture must be invalid UTF-8 to be meaningful"
        );
        let mut state = sample_state().with_raw_message(&raw);
        assert_eq!(
            state.raw_message.as_deref(),
            Some(raw.as_slice()),
            "raw bytes preserved verbatim"
        );
        // rmp serialize → deserialize (the store's on-disk codec) keeps the
        // bytes intact, and the hash recomputes identically afterwards.
        let bytes = rmp_serde::to_vec(&state).expect("serialize state");
        let back: State = rmp_serde::from_slice(&bytes).expect("deserialize state");
        assert_eq!(back.raw_message.as_deref(), Some(raw.as_slice()));
        let mut back = back;
        assert_eq!(state.hash(), back.hash());
        assert_eq!(back.hash(), back.compute_hash());
    }

    /// A NUL byte inside the message must not be swallowed/truncated by the
    /// hash framing — length-prefixed `raw_message` is what makes this safe,
    /// where the old NUL-terminated string framing would have been ambiguous.
    #[test]
    fn raw_message_with_nul_byte_changes_hash() {
        let base = sample_state();
        let with_nul = sample_state()
            .with_change_id(base.change_id)
            .with_logical_change_id(base.logical_change_id());
        // The two messages differ only in the byte AFTER the embedded NUL.
        let mut a = with_nul.with_raw_message(b"a\x00b");
        a.created_at = base.created_at;

        let other = sample_state()
            .with_change_id(base.change_id)
            .with_logical_change_id(base.logical_change_id());
        let mut b = other.with_raw_message(b"a\x00c");
        b.created_at = base.created_at;

        assert_ne!(a.hash(), b.hash());
    }

    /// Close-the-class conformance: extension headers are captured from the
    /// raw commit header block in their EXACT on-the-wire order, regardless of
    /// which ones a decoder would surface as typed fields. A commit whose
    /// optional headers are in non-canonical order — `x-custom`, then a folded
    /// `gpgsig`, then `encoding`, then a folded `mergetag` — must reproduce that
    /// exact ordered `(name, value)` byte sequence. This fails if any header is
    /// reordered, prepended, appended, or dropped. #564 de-lossy step 1.
    #[test]
    fn parse_extension_headers_preserves_noncanonical_wire_order() {
        // A folded `mergetag` value carries a full tag object, which itself has
        // an internal blank line between the tag headers and the tag message —
        // on the wire that blank line is folded to a single space (` `), NEVER
        // an empty line, so it must not be mistaken for the header/body split.
        // Built line-by-line (NOT a `\`-continued literal, which would eat the
        // load-bearing leading space on each folded continuation line).
        let lines: &[&[u8]] = &[
            b"tree 1111111111111111111111111111111111111111",
            b"parent 2222222222222222222222222222222222222222",
            b"author Alice <alice@example.com> 1700000000 +0000",
            b"committer Bob <bob@example.com> 1700000100 +0000",
            b"x-custom custom value",
            b"gpgsig -----BEGIN PGP SIGNATURE-----",
            b" sig-line-1",
            b" -----END PGP SIGNATURE-----",
            b"encoding ISO-8859-1",
            b"mergetag object 3333333333333333333333333333333333333333",
            b" type commit",
            b" tag sidetag",
            b" tagger Carol <carol@example.com> 1700000050 +0000",
            b" ", // folded blank line inside the tag object (one space)
            b" signed side tag",
            b"", // the real header/body separator (empty line)
            b"the commit message",
            b"",
        ];
        let content = lines.join(&b'\n');

        let headers = parse_commit_extension_headers(&content);

        let expected: Vec<(Vec<u8>, Vec<u8>)> = vec![
            (b"x-custom".to_vec(), b"custom value".to_vec()),
            (
                b"gpgsig".to_vec(),
                // Unfolded: internal newlines restored, NO trailing newline (the
                // serializer re-folds each `\n` to `\n `, spike §2).
                b"-----BEGIN PGP SIGNATURE-----\nsig-line-1\n-----END PGP SIGNATURE-----"
                    .to_vec(),
            ),
            (b"encoding".to_vec(), b"ISO-8859-1".to_vec()),
            (
                b"mergetag".to_vec(),
                // The folded ` \n` blank line unfolds to an empty segment, so the
                // tag object's header/message split survives as a real `\n\n`.
                b"object 3333333333333333333333333333333333333333\ntype commit\ntag sidetag\ntagger Carol <carol@example.com> 1700000050 +0000\n\nsigned side tag".to_vec(),
            ),
        ];

        assert_eq!(headers, expected);
    }

    /// A commit with no extension headers (the common case) yields an empty
    /// vec — `tree`/`parent`/`author`/`committer` are modelled natively and
    /// never leak into `extra_headers`.
    #[test]
    fn parse_extension_headers_empty_when_only_core_headers() {
        let content: &[u8] = b"\
tree 1111111111111111111111111111111111111111\n\
author Alice <alice@example.com> 1700000000 +0000\n\
committer Bob <bob@example.com> 1700000100 +0000\n\
\n\
just a message\n";
        assert!(parse_commit_extension_headers(content).is_empty());
    }
}