Skip to main content

objects/object/
state_core.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Core state type and its leaf value types (Status, StateSignature,
3//! SignatureStatus, Verification).
4
5use std::collections::BTreeMap;
6
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9
10use super::{Attribution, ChangeId, ContentHash, Principal};
11
12// ── Status ──────────────────────────────────────────────────────────
13
/// Lifecycle status of a state.
///
/// `Draft` is the `Default` variant. Each variant has a one-byte encoding,
/// see [`Status::to_byte`] and [`Status::from_byte`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum Status {
    /// The default status; encoded as byte `0`.
    #[default]
    Draft,
    /// Encoded as byte `1`.
    Published,
}
21
22impl Status {
23    pub fn to_byte(&self) -> u8 {
24        match self {
25            Status::Draft => 0,
26            Status::Published => 1,
27        }
28    }
29
30    pub fn from_byte(b: u8) -> Option<Self> {
31        match b {
32            0 => Some(Status::Draft),
33            1 => Some(Status::Published),
34            _ => None,
35        }
36    }
37}
38
39// ── StateSignature ──────────────────────────────────────────────────
40
/// Signature information for a state.
///
/// All three parts are stored as plain strings.
// NOTE(review): the textual encoding of `public_key` and `signature` (hex,
// base64, ...) is not visible in this file — confirm against the signer.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct StateSignature {
    /// Name of the signature algorithm.
    pub algorithm: String,
    /// Public key associated with the signature.
    pub public_key: String,
    /// The signature value itself.
    pub signature: String,
}
48
49impl StateSignature {
50    pub fn algorithm(&self) -> &str {
51        &self.algorithm
52    }
53}
54
/// Signature verification result.
///
/// Unlike [`StateSignature`], this type does not derive the serde traits.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignatureStatus {
    /// Reported by [`SignatureStatus::is_valid`].
    Valid,
    /// Neither valid nor unsigned.
    Invalid,
    /// Reported by [`SignatureStatus::is_unsigned`].
    Unsigned,
}
62
63impl SignatureStatus {
64    pub fn is_valid(self) -> bool {
65        self == SignatureStatus::Valid
66    }
67
68    pub fn is_unsigned(self) -> bool {
69        self == SignatureStatus::Unsigned
70    }
71}
72
73// ── Verification ────────────────────────────────────────────────────
74
/// Verification information for a state.
///
/// Every scalar field is optional; `Default` yields a record with all of them
/// unset and an empty `custom` map (see [`Verification::is_empty`]).
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct Verification {
    /// Whether the tests passed, when known.
    pub tests_passed: Option<bool>,
    /// Number of failed tests, when known.
    pub tests_failed: Option<u32>,
    /// Coverage percentage, when known.
    // NOTE(review): scale (0-1 vs 0-100) is not established in this file.
    pub coverage_pct: Option<f32>,
    /// Change in coverage, when known.
    pub coverage_delta: Option<f32>,
    /// Number of lint warnings, when known.
    pub lint_warnings: Option<u32>,
    /// Free-form extra results keyed by name. A `BTreeMap`, so iteration (and
    /// therefore hashing in `update_hasher`) follows sorted key order.
    /// `#[serde(default)]` lets input that lacks this field deserialize to an
    /// empty map.
    #[serde(default)]
    pub custom: BTreeMap<String, serde_json::Value>,
}
86
87impl Verification {
88    pub fn new() -> Self {
89        Self::default()
90    }
91
92    pub fn with_tests_passed(mut self, passed: bool) -> Self {
93        self.tests_passed = Some(passed);
94        self
95    }
96
97    pub fn with_tests_failed(mut self, failed: u32) -> Self {
98        self.tests_failed = Some(failed);
99        self
100    }
101
102    pub fn is_empty(&self) -> bool {
103        self.tests_passed.is_none()
104            && self.tests_failed.is_none()
105            && self.coverage_pct.is_none()
106            && self.coverage_delta.is_none()
107            && self.lint_warnings.is_none()
108            && self.custom.is_empty()
109    }
110
111    pub(crate) fn hash_len(&self) -> usize {
112        let mut len = 0;
113        len += 1 + self.tests_passed.map(|_| 1).unwrap_or(0);
114        len += 1 + self.tests_failed.map(|_| 4).unwrap_or(0);
115        len += 1 + self.coverage_pct.map(|_| 4).unwrap_or(0);
116        len += 1 + self.coverage_delta.map(|_| 4).unwrap_or(0);
117        len += 1 + self.lint_warnings.map(|_| 4).unwrap_or(0);
118        len += 4;
119        for (key, value) in &self.custom {
120            let value_bytes = serde_json::to_vec(value).unwrap_or_default();
121            len += 4 + key.len();
122            len += 4 + value_bytes.len();
123        }
124        len
125    }
126
127    pub(crate) fn update_hasher(&self, hasher: &mut blake3::Hasher) {
128        let tests_passed = self.tests_passed.map(u8::from);
129        write_optional_u8(hasher, tests_passed);
130        write_optional_u32(hasher, self.tests_failed);
131        write_optional_f32(hasher, self.coverage_pct);
132        write_optional_f32(hasher, self.coverage_delta);
133        write_optional_u32(hasher, self.lint_warnings);
134        let custom_len = self.custom.len() as u32;
135        hasher.update(&custom_len.to_le_bytes());
136        for (key, value) in &self.custom {
137            let key_bytes = key.as_bytes();
138            let value_bytes = serde_json::to_vec(value).unwrap_or_default();
139            hasher.update(&(key_bytes.len() as u32).to_le_bytes());
140            hasher.update(key_bytes);
141            hasher.update(&(value_bytes.len() as u32).to_le_bytes());
142            hasher.update(&value_bytes);
143        }
144    }
145}
146
147fn write_optional_u8(hasher: &mut blake3::Hasher, value: Option<u8>) {
148    match value {
149        Some(v) => {
150            hasher.update(&[1]);
151            hasher.update(&[v]);
152        }
153        None => {
154            hasher.update(&[0]);
155        }
156    }
157}
158
159fn write_optional_u32(hasher: &mut blake3::Hasher, value: Option<u32>) {
160    match value {
161        Some(v) => {
162            hasher.update(&[1]);
163            hasher.update(&v.to_le_bytes());
164        }
165        None => {
166            hasher.update(&[0]);
167        }
168    }
169}
170
171fn write_optional_f32(hasher: &mut blake3::Hasher, value: Option<f32>) {
172    match value {
173        Some(v) => {
174            hasher.update(&[1]);
175            hasher.update(&v.to_le_bytes());
176        }
177        None => {
178            hasher.update(&[0]);
179        }
180    }
181}
182
183// ── State ───────────────────────────────────────────────────────────
184
/// A state is an immutable snapshot with rich metadata.
///
/// On-disk encoding is rmp-serde's positional struct format (a fixed-length
/// tuple). This is sensitive to field order: inserting a field in the middle
/// of the tuple breaks every pre-existing on-disk state. The invariant we
/// keep going forward is:
///
/// > **New optional fields are added at the tail of the struct, below
/// > `status`, with `#[serde(default)]`.** Mid-struct inserts are
/// > forbidden. rmp-serde's positional deserializer tolerates missing
/// > trailing fields when they have a `Default` impl, so tail-only growth
/// > is forward-compatible automatically.
///
/// Required (non-optional) fields — `change_id`, `tree`, `parents`,
/// `attribution`, `created_at`, `status` — must never move. Optional fields
/// may be reordered only among themselves, and only at the tail.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct State {
    /// Identifier of this state object.
    pub change_id: ChangeId,
    /// Cached result of `compute_hash`. Never serialized (`#[serde(skip)]`);
    /// builders that touch hashed fields reset it to `None`.
    #[serde(skip)]
    content_hash: Option<ContentHash>,
    /// Content hash of the tree this state snapshots.
    pub tree: ContentHash,
    /// Parent states: empty for a root, more than one for a merge.
    pub parents: Vec<ChangeId>,
    /// Who authored the state (principal, plus optional agent details).
    pub attribution: Attribution,
    /// Optional intent text.
    pub intent: Option<String>,
    /// Optional confidence; `with_confidence` clamps it to `0.0..=1.0`.
    pub confidence: Option<f32>,
    /// Creation (committer) time. Only whole seconds enter the hash.
    pub created_at: DateTime<Utc>,
    /// Optional verification results.
    pub verification: Option<Verification>,
    /// Optional authoring signature.
    pub signature: Option<StateSignature>,
    /// Lifecycle status.
    pub status: Status,
    // --- tail-only optional fields below. Add new fields here, never above. ---
    /// Optional provenance hash.
    #[serde(default)]
    pub provenance: Option<ContentHash>,
    /// Logical change this state belongs to; the `logical_change_id()`
    /// accessor falls back to `change_id` when this is `None`.
    #[serde(default)]
    pub logical_change_id: Option<ChangeId>,
    /// Optional context tree root for code annotations.
    #[serde(default)]
    pub context: Option<ContentHash>,
    /// Authoring timestamp for this state, when distinct from
    /// `created_at`.
    ///
    /// `created_at` is the *committer* time — when the state object
    /// came into being in its current form. `authored_at` is the
    /// *author* time — when someone actually wrote the change — which
    /// survives `git rebase`, cherry-pick, squash-merge, and `git
    /// commit --amend`. The ingest-backed `bridge git import` path fills
    /// this from the git author time; native heddle commits leave it
    /// `None` and blame falls back to `created_at`.
    ///
    /// **Part of the state hash (#564 de-lossy step 1).** Author time
    /// is part of a git commit's identity: two commits that differ
    /// *only* by author timestamp are distinct git objects, so folding
    /// it into the hash keeps them from dedup-colliding to one State in
    /// the content-addressed store. `None` hashes as a single absence
    /// byte, so native commits are unaffected beyond the format bump.
    #[serde(default)]
    pub authored_at: Option<DateTime<Utc>>,
    /// Content hash of the state's [`RiskSignalBlob`](crate::object::RiskSignalBlob),
    /// when present. Computed and persisted whenever risk signals fire on a
    /// state. `None` for states from before W1 and for states where no
    /// signals fired.
    ///
    /// **Not part of the state hash:** neither `hash_len_core` nor
    /// `hash_len_fidelity` accounts for this field, so setting it never
    /// changes a state's identity. Legacy states without this field
    /// deserialize as `None`.
    #[serde(default)]
    pub risk_signals: Option<ContentHash>,
    /// Content hash of the state's [`ReviewSignaturesBlob`](crate::object::ReviewSignaturesBlob),
    /// when reviewers have signed off (read / agent-preview / agent-co-review).
    #[serde(default)]
    pub review_signatures: Option<ContentHash>,
    /// Content hash of the state's [`DiscussionsBlob`](crate::object::DiscussionsBlob),
    /// when discussions are anchored to this state.
    #[serde(default)]
    pub discussions: Option<ContentHash>,
    /// Content hash of the state's [`StructuredConflict`](crate::object::StructuredConflict),
    /// when this state captures an unresolved merge conflict as data.
    #[serde(default)]
    pub structured_conflicts: Option<ContentHash>,
    // --- git-fidelity fields (#564 de-lossy step 1, #565) ---
    //
    // These preserve the parts of an imported git commit that Heddle's
    // model used to drop, so a commit can be byte-reconstructed later
    // (#566/#567) and the git mirror can be eliminated (#568). UNLIKE the
    // W1 tail fields above (`risk_signals`, `review_signatures`,
    // `discussions`, `structured_conflicts`), these ARE part of the content
    // hash (see `update_hash`), with the documented exception of
    // `git_lossy`: two git-distinct commits that differ only in
    // committer, timezone, verbatim message, gpgsig, or extra headers must
    // hash differently so they can't dedup-collide in the content-addressed
    // store. They are still tail-append + `#[serde(default)]` so legacy
    // on-disk states keep deserializing.
    /// The git committer identity, when distinct from the author
    /// ([`Attribution::principal`]). Git records both an author (who wrote
    /// the change) and a committer (who created this commit object); for
    /// rebased / cherry-picked / amended commits the two differ. `None`
    /// for native heddle commits and for legacy imports from before #565.
    #[serde(default)]
    pub committer: Option<Principal>,
    /// Timezone offset (seconds east of UTC) of the *author* timestamp
    /// ([`State::authored_at`] / `created_at` fallback). Git stores the
    /// author's local offset (e.g. `+0000`, `-0700`); Heddle used to
    /// discard it. `0` for native commits and legacy imports.
    #[serde(default)]
    pub authored_tz_offset: i32,
    /// Timezone offset (seconds east of UTC) of the *committer* timestamp
    /// (`created_at`). `0` for native commits and legacy imports.
    #[serde(default)]
    pub committer_tz_offset: i32,
    /// The verbatim git commit message body (everything after the header
    /// block), preserved exactly so reconstruction is byte-stable. Distinct
    /// from `intent`, which is the trimmed first line surfaced in the UI.
    /// `None` for native commits and legacy imports.
    ///
    /// Stored as raw bytes, NOT a `String`: a commit with a non-UTF8
    /// `encoding` (latin-1, shift-jis, …) carries message bytes that are not
    /// valid UTF-8 (e.g. `0xe9` for latin-1 `é`); a `String` could not
    /// round-trip them byte-identically. (Non-UTF8 author/committer identity
    /// *names* are not yet byte-preserved — `Principal` is still `String`; see
    /// #564.)
    #[serde(default)]
    pub raw_message: Option<Vec<u8>>,
    /// The SINGLE canonical "this state's content is NOT byte-faithful to the
    /// original git object" marker (#567). Set to `true` by lossy import
    /// population paths whenever an unrepresentable tree entry was dropped or
    /// converted during import, so the rebuilt tree (hence commit) no longer
    /// hashes to the original SHA. The git-export fidelity guard reads this one
    /// flag to decide whether reconstruct-from-state is safe, instead of
    /// enumerating import surfaces. `false` for native heddle commits and for
    /// lossless imports.
    ///
    /// Provenance metadata, NOT part of the content hash: a lossy import always
    /// drops/converts tree entries, so its tree — and therefore the rest of the
    /// hashed identity — already differs from a lossless import of the same
    /// source; folding the flag in would add nothing but break every existing
    /// content hash.
    #[serde(default)]
    pub git_lossy: bool,
    /// Every git commit header beyond the ones Heddle models natively
    /// (tree/parents/author/committer), in their original order. ORDER IS
    /// LOAD-BEARING for #566 byte-exactness — this is a `Vec`, never a map.
    /// Empty for native commits and legacy imports.
    ///
    /// `gpgsig` is just one of these headers and is kept INLINE at its
    /// captured ordinal (not split into a separate field): when a commit's
    /// extension headers are in non-canonical order — e.g. `x-custom`, then
    /// `gpgsig`, then `mergetag` — splitting gpgsig out would lose its
    /// position and break byte-identical reconstruction. The serialization
    /// source of truth for the signature is its position here (spike §3).
    ///
    /// Both the header name and value are raw bytes (`Vec<u8>`), NOT
    /// `String`s: extra-header VALUES (a `mergetag` payload is a full tag
    /// object; custom headers; gpgsig armor) can be non-UTF8, so a
    /// `String` would force a lossy `to_string()` that destroys those bytes.
    /// Names are ASCII by git's spec but are bytes too so the whole tuple is
    /// byte-exact and no conversion sneaks in.
    #[serde(default)]
    pub extra_headers: Vec<(Vec<u8>, Vec<u8>)>,
}
342
343impl State {
    /// Create a new state; a direct alias for [`Self::new_snapshot`].
    pub fn new(tree: ContentHash, parents: Vec<ChangeId>, attribution: Attribution) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
347
348    pub fn new_snapshot(
349        tree: ContentHash,
350        parents: Vec<ChangeId>,
351        attribution: Attribution,
352    ) -> Self {
353        let change_id = ChangeId::generate();
354        Self::new_with_logical_change_id(tree, parents, attribution, change_id)
355    }
356
    /// Create a state for a merge. Currently identical to
    /// [`Self::new_snapshot`]; a merge is recognised by having more than one
    /// parent (see [`Self::is_merge`]).
    pub fn new_merge(tree: ContentHash, parents: Vec<ChangeId>, attribution: Attribution) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
360
    /// Create a state that belongs to an existing logical change.
    ///
    /// The new state gets a freshly generated `change_id` (minted by
    /// `new_with_logical_change_id`) while `logical_change_id` is set to the
    /// caller-supplied value.
    pub fn new_refresh_of(
        tree: ContentHash,
        parents: Vec<ChangeId>,
        attribution: Attribution,
        logical_change_id: ChangeId,
    ) -> Self {
        Self::new_with_logical_change_id(tree, parents, attribution, logical_change_id)
    }
369
    /// Create a state for a fork. Currently identical to
    /// [`Self::new_snapshot`]: the fork starts its own logical change.
    pub fn new_fork_of(
        tree: ContentHash,
        parents: Vec<ChangeId>,
        attribution: Attribution,
    ) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
377
    /// Create a state for a collapse. Currently identical to
    /// [`Self::new_snapshot`]: the collapsed state starts its own logical
    /// change.
    pub fn new_collapse_of(
        tree: ContentHash,
        parents: Vec<ChangeId>,
        attribution: Attribution,
    ) -> Self {
        Self::new_snapshot(tree, parents, attribution)
    }
385
386    fn new_with_logical_change_id(
387        tree: ContentHash,
388        parents: Vec<ChangeId>,
389        attribution: Attribution,
390        logical_change_id: ChangeId,
391    ) -> Self {
392        Self {
393            change_id: ChangeId::generate(),
394            logical_change_id: Some(logical_change_id),
395            content_hash: None,
396            tree,
397            parents,
398            attribution,
399            intent: None,
400            confidence: None,
401            created_at: Utc::now(),
402            verification: None,
403            signature: None,
404            provenance: None,
405            context: None,
406            authored_at: None,
407            risk_signals: None,
408            review_signatures: None,
409            discussions: None,
410            structured_conflicts: None,
411            committer: None,
412            authored_tz_offset: 0,
413            committer_tz_offset: 0,
414            raw_message: None,
415            git_lossy: false,
416            extra_headers: Vec::new(),
417            status: Status::Draft,
418        }
419    }
420
421    pub fn with_intent(mut self, intent: impl Into<String>) -> Self {
422        self.intent = Some(intent.into());
423        self.content_hash = None;
424        self
425    }
426
427    pub fn with_confidence(mut self, confidence: f32) -> Self {
428        self.confidence = Some(confidence.clamp(0.0, 1.0));
429        self.content_hash = None;
430        self
431    }
432
433    pub fn with_verification(mut self, verification: Verification) -> Self {
434        self.verification = Some(verification);
435        self.content_hash = None;
436        self
437    }
438
439    pub fn with_signature(mut self, signature: StateSignature) -> Self {
440        self.signature = Some(signature);
441        self
442    }
443
444    pub fn with_provenance(mut self, provenance: ContentHash) -> Self {
445        self.provenance = Some(provenance);
446        self.content_hash = None;
447        self
448    }
449
450    /// Set the context tree root.
451    pub fn with_context(mut self, context: ContentHash) -> Self {
452        self.context = Some(context);
453        self.content_hash = None;
454        self
455    }
456
457    /// Attach a [`RiskSignalBlob`](crate::object::RiskSignalBlob) hash.
458    /// Render-time tick budgeting (selecting which signals to surface) is a
459    /// view over this stored data, not part of storage itself.
460    ///
461    /// **Not part of the state hash.** Risk signals are derived data computed
462    /// *about* a state from the diff against its parent; including them in
463    /// identity would make the same logical state hash differently depending
464    /// on which signals fired. That breaks every "is this the same state?"
465    /// check in the system. See `authored_at` for the same pattern.
466    pub fn with_risk_signals(mut self, risk_signals: ContentHash) -> Self {
467        self.risk_signals = Some(risk_signals);
468        self
469    }
470
471    /// Attach a [`ReviewSignaturesBlob`](crate::object::ReviewSignaturesBlob)
472    /// hash. The state's authoring [`StateSignature`] is unaffected; review
473    /// signatures live alongside it and accumulate over time.
474    ///
475    /// **Not part of the state hash.** Review signatures accumulate
476    /// post-capture; including them in identity would mean every signature
477    /// re-keys the state. See `authored_at` for the same pattern.
478    pub fn with_review_signatures(mut self, review_signatures: ContentHash) -> Self {
479        self.review_signatures = Some(review_signatures);
480        self
481    }
482
483    /// Attach a [`DiscussionsBlob`](crate::object::DiscussionsBlob) hash.
484    ///
485    /// **Not part of the state hash.** Discussions evolve independently of
486    /// the state they're anchored to — appending a turn must not change the
487    /// state's identity. See `authored_at` for the same pattern.
488    pub fn with_discussions(mut self, discussions: ContentHash) -> Self {
489        self.discussions = Some(discussions);
490        self
491    }
492
493    /// Attach a [`StructuredConflict`](crate::object::StructuredConflict) hash.
494    ///
495    /// **Not part of the state hash.** Conflict objects describe the merge's
496    /// disagreement; the state's tree and parents already encode what's being
497    /// merged. See `authored_at` for the same pattern.
498    pub fn with_structured_conflicts(mut self, structured_conflicts: ContentHash) -> Self {
499        self.structured_conflicts = Some(structured_conflicts);
500        self
501    }
502
503    /// Record the authoring timestamp separately from `created_at`.
504    /// Used by the git-ingest importer to preserve the distinction
505    /// between "when the change was originally written" (authored)
506    /// and "when this commit object came into being" (committer time,
507    /// stored in `created_at` so re-imports stay deterministic).
508    /// Native heddle commits leave this `None`; blame display then
509    /// falls back to `created_at`.
510    ///
511    /// **Part of the state hash (#564 de-lossy step 1)** — see the
512    /// `authored_at` field docs and `update_hash`.
513    pub fn with_authored_at(mut self, timestamp: DateTime<Utc>) -> Self {
514        self.authored_at = Some(timestamp);
515        self.content_hash = None;
516        self
517    }
518
519    /// Record the git committer identity (distinct from the author).
520    ///
521    /// **Part of the state hash** — see the `committer` field docs and
522    /// `update_hash`. #564 de-lossy step 1.
523    pub fn with_committer(mut self, committer: Principal) -> Self {
524        self.committer = Some(committer);
525        self.content_hash = None;
526        self
527    }
528
529    /// Record the author/committer timezone offsets (seconds east of UTC).
530    /// **Part of the state hash.** #564 de-lossy step 1.
531    pub fn with_tz_offsets(mut self, authored: i32, committer: i32) -> Self {
532        self.authored_tz_offset = authored;
533        self.committer_tz_offset = committer;
534        self.content_hash = None;
535        self
536    }
537
538    /// Record the verbatim git commit message body, as raw bytes (so a
539    /// non-UTF8 message round-trips byte-identically; see the `raw_message`
540    /// field docs). **Part of the state hash.** #564 de-lossy step 1.
541    pub fn with_raw_message(mut self, raw_message: impl AsRef<[u8]>) -> Self {
542        self.raw_message = Some(raw_message.as_ref().to_vec());
543        self.content_hash = None;
544        self
545    }
546
547    /// Mark this state's content as NOT byte-faithful to the original git
548    /// object — set by the `--lossy` import/ingest paths when a tree entry was
549    /// dropped or converted. The git-export fidelity guard reads this single
550    /// signal to skip reconstruct-from-state (#567). Not part of the content
551    /// hash (see the `git_lossy` field docs).
552    pub fn with_git_lossy(mut self, git_lossy: bool) -> Self {
553        self.git_lossy = git_lossy;
554        self.content_hash = None;
555        self
556    }
557
558    /// Record the ordered remaining git commit headers as raw bytes. ORDER
559    /// IS LOAD-BEARING (#566). **Part of the state hash.** #564 de-lossy
560    /// step 1.
561    pub fn with_extra_headers(mut self, extra_headers: Vec<(Vec<u8>, Vec<u8>)>) -> Self {
562        self.extra_headers = extra_headers;
563        self.content_hash = None;
564        self
565    }
566
567    pub fn with_status(mut self, status: Status) -> Self {
568        self.status = status;
569        self.content_hash = None;
570        self
571    }
572
573    pub fn with_change_id(mut self, change_id: ChangeId) -> Self {
574        let previous_change_id = self.change_id;
575        self.change_id = change_id;
576        if self.logical_change_id == Some(previous_change_id) || self.logical_change_id.is_none() {
577            self.logical_change_id = Some(change_id);
578            self.content_hash = None;
579        }
580        self
581    }
582
583    pub fn with_logical_change_id(mut self, logical_change_id: ChangeId) -> Self {
584        self.logical_change_id = Some(logical_change_id);
585        self.content_hash = None;
586        self
587    }
588
589    pub fn logical_change_id(&self) -> ChangeId {
590        self.logical_change_id.unwrap_or(self.change_id)
591    }
592
593    pub fn with_timestamp(mut self, timestamp: DateTime<Utc>) -> Self {
594        self.created_at = timestamp;
595        self.content_hash = None;
596        self
597    }
598
599    pub fn compute_hash(&self) -> ContentHash {
600        let content_len = self.hash_len();
601        ContentHash::compute_typed_with_len("state", content_len, |hasher| {
602            self.update_hash(hasher);
603        })
604    }
605
606    /// Migration-only hash for states signed before #565 folded git-fidelity
607    /// fields into identity. Runtime callers should use [`Self::compute_hash`].
608    ///
609    /// Kept public only because repository migrations live in another crate.
610    /// Do not use for new compatibility checks; the deletion-wave migration is
611    /// the owner of this legacy recipe.
612    #[doc(hidden)]
613    pub fn compute_hash_for_legacy_signature_migration(&self) -> ContentHash {
614        let content_len = self.hash_len_core();
615        ContentHash::compute_typed_with_len("state", content_len, |hasher| {
616            self.update_hash_core(hasher);
617        })
618    }
619
620    pub fn hash(&mut self) -> ContentHash {
621        if self.content_hash.is_none() {
622            self.content_hash = Some(self.compute_hash());
623        }
624        self.content_hash.expect("hash was just computed above")
625    }
626
627    pub fn is_root(&self) -> bool {
628        self.parents.is_empty()
629    }
630
631    pub fn is_merge(&self) -> bool {
632        self.parents.len() > 1
633    }
634
635    pub fn is_agent_authored(&self) -> bool {
636        self.attribution.agent.is_some()
637    }
638
639    pub fn first_parent(&self) -> Option<&ChangeId> {
640        self.parents.first()
641    }
642
643    fn hash_len(&self) -> u64 {
644        self.hash_len_core() + self.hash_len_fidelity()
645    }
646
647    /// Hashed length of the pre-#565 fields (everything through the status
648    /// byte). Mirrors [`Self::update_hash_core`]. Split out so the legacy
649    /// signature migration can reproduce the pre-#565 hash exactly.
650    fn hash_len_core(&self) -> u64 {
651        let principal = &self.attribution.principal;
652        let mut len = 0u64;
653
654        len += 1;
655        if self.logical_change_id.is_some() {
656            len += 16;
657        }
658
659        len += self.tree.as_bytes().len() as u64;
660        len += 4;
661        len += (self.parents.len() * 16) as u64;
662
663        len += principal.name.len() as u64 + 1;
664        len += principal.email.len() as u64 + 1;
665
666        len += 1;
667        if let Some(agent) = &self.attribution.agent {
668            len += agent.provider.len() as u64 + 1;
669            len += agent.model.len() as u64 + 1;
670
671            len += 1;
672            if let Some(session_id) = &agent.session_id {
673                len += session_id.len() as u64 + 1;
674            }
675
676            len += 1;
677            if let Some(policy_id) = &agent.policy_id {
678                len += policy_id.len() as u64 + 1;
679            }
680        }
681
682        len += 1;
683        if let Some(intent) = &self.intent {
684            len += intent.len() as u64 + 1;
685        }
686
687        len += 1;
688        if self.confidence.is_some() {
689            len += 4;
690        }
691
692        len += 8;
693
694        len += 1;
695        if let Some(verification) = &self.verification {
696            len += verification.hash_len() as u64;
697        }
698
699        len += 1;
700        if self.provenance.is_some() {
701            len += 32;
702        }
703
704        len += 1;
705        if self.context.is_some() {
706            len += 32;
707        }
708
709        len += 1;
710
711        len
712    }
713
714    /// Hashed length of the appended git-fidelity block (#565). Mirrors
715    /// [`Self::update_hash_fidelity`] byte-for-byte. Kept separate from
716    /// [`Self::hash_len_core`] so the migration-only pre-bump hash can omit it
717    /// exactly.
718    fn hash_len_fidelity(&self) -> u64 {
719        let mut len = 0u64;
720
721        // git-fidelity fields (#564 step 1). Must mirror `update_hash`
722        // byte-for-byte. committer: 1 tag byte + (name+NUL, email+NUL).
723        len += 1;
724        if let Some(committer) = &self.committer {
725            len += committer.name.len() as u64 + 1;
726            len += committer.email.len() as u64 + 1;
727        }
728        // both tz offsets: i32 LE, always present.
729        len += 4;
730        len += 4;
731        // authored_at (author time): 1 tag byte + (i64 LE when Some).
732        len += 1;
733        if self.authored_at.is_some() {
734            len += 8;
735        }
736        // raw_message: optional-bytes framing (1 tag + u32 len + bytes) — a
737        // length prefix, not NUL-termination, since the message can contain
738        // NUL bytes (it's byte-typed for non-UTF8 fidelity).
739        len += 1;
740        if let Some(raw_message) = &self.raw_message {
741            len += 4 + raw_message.len() as u64;
742        }
743        // extra_headers (gpgsig rides inline here at its captured position):
744        // u32 count, then per pair u32 key_len+key, u32 val_len+val.
745        len += 4;
746        for (key, value) in &self.extra_headers {
747            len += 4 + key.len() as u64;
748            len += 4 + value.len() as u64;
749        }
750
751        len
752    }
753
    /// Feed the full hashed representation into `hasher`: the core block
    /// first, then the git-fidelity block. The order is part of the hash
    /// format; `hash_len` sums the two matching lengths.
    fn update_hash(&self, hasher: &mut blake3::Hasher) {
        self.update_hash_core(hasher);
        self.update_hash_fidelity(hasher);
    }
758
    /// Hash the pre-#565 fields (everything through the status byte). Mirrors
    /// [`Self::hash_len_core`]. The migration-only pre-bump hash is exactly
    /// this with no fidelity block appended.
    ///
    /// Field order and framing here ARE the hash format: optional values are
    /// prefixed with a `1`/`0` presence byte, strings are followed by a NUL
    /// byte, and integers/floats are little-endian. `change_id`, `signature`
    /// and the attached-blob hashes are not written.
    fn update_hash_core(&self, hasher: &mut blake3::Hasher) {
        let principal = &self.attribution.principal;

        // logical_change_id: presence byte + id bytes.
        if let Some(logical_change_id) = self.logical_change_id {
            hasher.update(&[1]);
            hasher.update(logical_change_id.as_bytes());
        } else {
            hasher.update(&[0]);
        }

        // tree hash, then u32 parent count followed by each parent id.
        hasher.update(self.tree.as_bytes());
        hasher.update(&(self.parents.len() as u32).to_le_bytes());
        for parent in &self.parents {
            hasher.update(parent.as_bytes());
        }

        // principal name and email, each NUL-terminated.
        hasher.update(principal.name.as_bytes());
        hasher.update(&[0]);
        hasher.update(principal.email.as_bytes());
        hasher.update(&[0]);

        // agent: presence byte, provider, model, then three optional strings
        // (session_id, segment_id, policy_id) in that order.
        if let Some(agent) = &self.attribution.agent {
            hasher.update(&[1]);
            hasher.update(agent.provider.as_bytes());
            hasher.update(&[0]);
            hasher.update(agent.model.as_bytes());
            hasher.update(&[0]);
            write_optional_string(hasher, &agent.session_id);
            write_optional_string(hasher, &agent.segment_id);
            write_optional_string(hasher, &agent.policy_id);
        } else {
            hasher.update(&[0]);
        }

        write_optional_string(hasher, &self.intent);

        // confidence: optional f32.
        if let Some(confidence) = self.confidence {
            hasher.update(&[1]);
            hasher.update(&confidence.to_le_bytes());
        } else {
            hasher.update(&[0]);
        }

        // created_at: whole seconds only; sub-second precision is not hashed.
        hasher.update(&self.created_at.timestamp().to_le_bytes());

        // verification: presence byte + the record's own framing.
        if let Some(verification) = &self.verification {
            hasher.update(&[1]);
            verification.update_hasher(hasher);
        } else {
            hasher.update(&[0]);
        }

        // provenance: optional content hash.
        if let Some(provenance) = self.provenance {
            hasher.update(&[1]);
            hasher.update(provenance.as_bytes());
        } else {
            hasher.update(&[0]);
        }

        // context: optional content hash.
        if let Some(context) = self.context {
            hasher.update(&[1]);
            hasher.update(context.as_bytes());
        } else {
            hasher.update(&[0]);
        }

        // status: single byte, always last in the core block.
        hasher.update(&[self.status.to_byte()]);
    }
830
831    /// Hash the appended git-fidelity block (#565). Mirrors
832    /// [`Self::hash_len_fidelity`]. Kept separate from
833    /// [`Self::update_hash_core`] so the migration-only pre-bump hash can omit
834    /// it exactly.
835    ///
836    /// git-fidelity fields (#564 de-lossy step 1, #565) are DELIBERATELY part
837    /// of the content hash — the opposite of the W1 tail fields. Two git
838    /// commits that differ only in committer, author/committer time, timezone,
839    /// verbatim message, or extra headers (gpgsig included) are distinct git
840    /// objects; folding these into identity prevents them from dedup-colliding
841    /// to one State in the content-addressed store. This re-hashes every
842    /// pre-#565 state (a real format bump; acceptable pre-0.3). Keep this in
843    /// sync with `hash_len_fidelity`.
844    fn update_hash_fidelity(&self, hasher: &mut blake3::Hasher) {
845        if let Some(committer) = &self.committer {
846            hasher.update(&[1]);
847            hasher.update(committer.name.as_bytes());
848            hasher.update(&[0]);
849            hasher.update(committer.email.as_bytes());
850            hasher.update(&[0]);
851        } else {
852            hasher.update(&[0]);
853        }
854
855        hasher.update(&self.authored_tz_offset.to_le_bytes());
856        hasher.update(&self.committer_tz_offset.to_le_bytes());
857
858        // Author time (#564): committer time is hashed above as created_at;
859        // author time is the other half of a git commit's temporal identity.
860        if let Some(authored_at) = self.authored_at {
861            hasher.update(&[1]);
862            hasher.update(&authored_at.timestamp().to_le_bytes());
863        } else {
864            hasher.update(&[0]);
865        }
866
867        write_optional_bytes(hasher, &self.raw_message);
868
869        // extra_headers (gpgsig is one of these, kept inline at its position).
870        hasher.update(&(self.extra_headers.len() as u32).to_le_bytes());
871        for (key, value) in &self.extra_headers {
872            hasher.update(&(key.len() as u32).to_le_bytes());
873            hasher.update(key);
874            hasher.update(&(value.len() as u32).to_le_bytes());
875            hasher.update(value);
876        }
877    }
878}
879
880/// Length-prefixed optional-bytes framing for the hash: `[1] + u32-LE len +
881/// bytes` when `Some`, a single `[0]` when `None`. Unlike
882/// [`write_optional_string`]'s NUL-terminated framing this is binary-safe —
883/// `raw_message` can contain NUL bytes, so a length prefix (not a terminator)
884/// is required to keep the hash unambiguous.
885fn write_optional_bytes(hasher: &mut blake3::Hasher, value: &Option<Vec<u8>>) {
886    match value {
887        Some(bytes) => {
888            hasher.update(&[1]);
889            hasher.update(&(bytes.len() as u32).to_le_bytes());
890            hasher.update(bytes);
891        }
892        None => {
893            hasher.update(&[0]);
894        }
895    }
896}
897
898fn write_optional_string(hasher: &mut blake3::Hasher, value: &Option<String>) {
899    match value {
900        Some(value) => {
901            hasher.update(&[1]);
902            hasher.update(value.as_bytes());
903            hasher.update(&[0]);
904        }
905        None => {
906            hasher.update(&[0]);
907        }
908    }
909}
910
/// Parse the *extension* headers from a raw git commit object's content bytes
/// (the bytes `git cat-file commit <sha>` prints — i.e. gix's `Commit::data`),
/// in their exact on-the-wire order, ready to store in [`State::extra_headers`].
///
/// A commit's header block runs from the start of the content up to the first
/// blank line (the header/body separator). Its leading headers are always, in
/// fixed order, `tree`, zero-or-more `parent`, `author`, `committer`; Heddle
/// models those natively. Every header **after** `committer` is an extension
/// header (`encoding`, `gpgsig`, `mergetag`, or any unknown/future name) and is
/// returned here as a `(name, value)` byte pair at its real position.
///
/// **This is the single source of truth for extension-header order and bytes.**
/// Both git import paths (the CLI bridge and the ingest walker) build
/// `extra_headers` from it. The alternative — stitching the vec back together
/// from a decoder's *typed* accessors (gix surfaces `encoding`, and historically
/// `gpgsig`, as fields *outside* its `extra_headers`) — silently reorders the
/// headers git happens to model as typed fields, which breaks #566 byte-exact
/// reconstruction. So the typed accessors are never consulted for position; the
/// raw header block is authoritative. (#564 de-lossy step 1 — close-the-class.)
///
/// Folded continuation lines (a value line beginning with a single space
/// `0x20`, used by `gpgsig`/`mergetag`) are **unfolded**: each continuation
/// contributes a `\n` plus the line with exactly one leading space stripped, so
/// the stored value holds the value's real internal newlines with no trailing
/// newline. The serializer (#566) re-folds by mapping every `\n` back to `\n `
/// (spike §2). A "blank" line inside an armored value is ` \n` on the wire (one
/// space), so it unfolds to an empty segment — never confused with the
/// header/body separator, which is a truly empty line.
///
/// Malformed input is tolerated rather than rejected:
/// - no `\n\n` separator: the whole content is treated as the header block;
/// - a truly empty line in that block (trailing newline of header-only
///   content, a leading newline, or empty input) is skipped — it carries no
///   bytes and must not surface as a header with an empty name;
/// - a continuation line with no preceding header is skipped;
/// - a header line without a space is kept with an empty value;
/// - if no `committer` header exists, every header except the four core names
///   is returned.
pub fn parse_commit_extension_headers(commit_content: &[u8]) -> Vec<(Vec<u8>, Vec<u8>)> {
    // The header block ends at the first *empty* line. Folded "blank" lines
    // inside an armored value are ` \n` (a single space), never empty, so the
    // first `\n\n` reliably marks the header/body boundary.
    let header_block = match find_subslice(commit_content, b"\n\n") {
        Some(separator_at) => &commit_content[..separator_at],
        // No separator (malformed / header-only) — treat all of it as headers.
        None => commit_content,
    };

    // Collect every logical header (name, unfolded value) in order; the
    // extension headers are the ones after the `committer` line.
    let mut headers: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
    for line in header_block.split(|&byte| byte == b'\n') {
        // `split` yields an empty segment for a trailing/leading newline and
        // for empty input. That is not a header: recording it would invent a
        // `("", "")` extension header that was never in the commit.
        if line.is_empty() {
            continue;
        }
        if let Some((&b' ', folded)) = line.split_first() {
            // Continuation of the current header value: restore the newline
            // that folding replaced and strip exactly one leading space.
            if let Some((_, value)) = headers.last_mut() {
                value.push(b'\n');
                value.extend_from_slice(folded);
            }
            // A continuation with no preceding header is malformed git; skip it
            // rather than panic.
            continue;
        }
        // New header: `name<SP>value`. A header line with no space is degenerate
        // (git never emits one in this region) — record it with an empty value
        // so no bytes are silently dropped.
        let (name, value) = match line.iter().position(|&byte| byte == b' ') {
            Some(space_at) => (line[..space_at].to_vec(), line[space_at + 1..].to_vec()),
            None => (line.to_vec(), Vec::new()),
        };
        headers.push((name, value));
    }

    // Extension headers are everything strictly after `committer`. git always
    // emits exactly one committer line ahead of the extension headers; if it is
    // somehow absent, fall back to excluding the four core names so nothing is
    // silently dropped or mis-captured.
    match headers.iter().position(|(name, _)| name == b"committer") {
        Some(idx) => headers.split_off(idx + 1),
        None => headers
            .into_iter()
            .filter(|(name, _)| {
                !matches!(
                    name.as_slice(),
                    b"tree" | b"parent" | b"author" | b"committer"
                )
            })
            .collect(),
    }
}

/// Index of the first occurrence of `needle` in `haystack`, or `None`.
///
/// An empty `needle` never matches (and `windows(0)` would panic, so the
/// guard is required); a `needle` longer than `haystack` cannot match.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() || needle.len() > haystack.len() {
        return None;
    }
    haystack.windows(needle.len()).position(|w| w == needle)
}
999
#[cfg(test)]
mod tests {
    use super::*;
    use crate::object::Principal;

    /// Human-only attribution for the fixed principal "Alice".
    fn sample_attribution() -> Attribution {
        let alice = Principal::new("Alice", "alice@example.com");
        Attribution::human(alice)
    }

    /// Parentless snapshot over the fixed `b"tree"` content hash.
    fn sample_state() -> State {
        let tree = ContentHash::compute(b"tree");
        State::new_snapshot(tree, vec![], sample_attribution())
    }

    /// A fresh sample state carrying `reference`'s change id and logical
    /// change id. Callers still copy `created_at` over themselves, after
    /// applying the builder under test, before comparing hashes.
    fn state_aligned_with(reference: &State) -> State {
        sample_state()
            .with_change_id(reference.change_id)
            .with_logical_change_id(reference.logical_change_id())
    }

    /// Caches the hash of `state`, applies `mutate`, then checks that the
    /// hash moved and that the cached value equals a fresh `compute_hash`.
    fn assert_mutator_invalidates_cached_hash(
        mut state: State,
        mutate: impl FnOnce(State) -> State,
    ) {
        let hash_before = state.hash();
        let mut mutated = mutate(state);
        assert_ne!(mutated.hash(), hash_before);
        assert_eq!(mutated.hash(), mutated.compute_hash());
    }

    #[test]
    fn new_snapshot_sets_fresh_logical_identity() {
        let snapshot = sample_state();
        let stored_logical_id = snapshot
            .logical_change_id
            .expect("snapshot should set logical identity");
        assert_ne!(snapshot.logical_change_id(), snapshot.change_id);
        assert_eq!(snapshot.logical_change_id(), stored_logical_id);
    }

    #[test]
    fn new_refresh_preserves_explicit_logical_identity() {
        let explicit_id = ChangeId::from_bytes([7; 16]);
        let refreshed = State::new_refresh_of(
            ContentHash::compute(b"tree"),
            vec![],
            sample_attribution(),
            explicit_id,
        );
        assert_eq!(refreshed.logical_change_id(), explicit_id);
        assert_ne!(refreshed.change_id, explicit_id);
    }

    #[test]
    fn new_merge_uses_fresh_logical_identity() {
        let parents = vec![ChangeId::from_bytes([1; 16]), ChangeId::from_bytes([2; 16])];
        let merged = State::new_merge(ContentHash::compute(b"tree"), parents, sample_attribution());
        let stored_logical_id = merged
            .logical_change_id
            .expect("merge should set logical identity");
        assert_ne!(merged.logical_change_id(), merged.change_id);
        assert_eq!(merged.logical_change_id(), stored_logical_id);
        assert!(merged.is_merge());
    }

    #[test]
    fn with_change_id_invalidates_cached_hash_when_logical_identity_changes() {
        let seed = sample_state();
        let own_change_id = seed.change_id;
        let mut state = seed.with_logical_change_id(own_change_id);
        let hash_before = state.hash();
        let replacement = ChangeId::from_bytes([9; 16]);

        let mut replaced = state.with_change_id(replacement);

        assert_eq!(replaced.logical_change_id(), replacement);
        assert_ne!(replaced.hash(), hash_before);
        assert_eq!(replaced.hash(), replaced.compute_hash());
    }

    #[test]
    fn agent_segment_is_part_of_state_hash() {
        let principal = Principal::new("Alice", "alice@example.com");
        let first_segment = Attribution::with_agent(
            principal.clone(),
            crate::object::Agent::new("openai", "gpt-5").with_session("sess-1", "seg-1"),
        );
        let second_segment = Attribution::with_agent(
            principal,
            crate::object::Agent::new("openai", "gpt-5").with_session("sess-1", "seg-2"),
        );
        let tree = ContentHash::compute(b"tree");
        let timestamp = Utc::now();
        let logical_change_id = ChangeId::from_bytes([3; 16]);
        // Everything except the agent's segment id is shared by both states.
        let build = |attribution: Attribution| {
            State::new_snapshot(tree, vec![], attribution)
                .with_logical_change_id(logical_change_id)
                .with_timestamp(timestamp)
        };
        let state_a = build(first_segment);
        let state_b = build(second_segment);

        assert_ne!(state_a.compute_hash(), state_b.compute_hash());
    }

    #[test]
    fn with_intent_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |s| s.with_intent("capture intent"));
    }

    #[test]
    fn with_confidence_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |s| s.with_confidence(0.9));
    }

    #[test]
    fn with_verification_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |s| {
            let verification = Verification::new().with_tests_passed(true);
            s.with_verification(verification)
        });
    }

    #[test]
    fn with_status_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |s| {
            s.with_status(Status::Published)
        });
    }

    #[test]
    fn with_timestamp_invalidates_cached_hash() {
        assert_mutator_invalidates_cached_hash(sample_state(), |s| {
            s.with_timestamp(Utc::now() + chrono::Duration::seconds(1))
        });
    }

    /// Pins the contract that the W1 tail-append fields (risk_signals,
    /// review_signatures, discussions, structured_conflicts) stay OUT of
    /// the state hash. If they were part of identity, the same logical
    /// state would hash differently depending on which signals fired,
    /// which review signatures arrived, or whether a discussion was
    /// anchored — breaking every "same state?" check in the system. Their
    /// persistence is independent of identity.
    #[test]
    fn w1_tail_fields_are_not_part_of_state_hash() {
        let mut bare = sample_state();
        let bare_hash = bare.hash();

        let mut decorated = state_aligned_with(&bare)
            .with_risk_signals(ContentHash::compute(b"risk-signals-blob"))
            .with_review_signatures(ContentHash::compute(b"review-signatures-blob"))
            .with_discussions(ContentHash::compute(b"discussions-blob"))
            .with_structured_conflicts(ContentHash::compute(b"conflicts-blob"));
        decorated.created_at = bare.created_at;

        assert_eq!(
            decorated.hash(),
            bare_hash,
            "W1 tail fields must not affect the state hash"
        );
    }

    /// Counterpart of `w1_tail_fields_are_not_part_of_state_hash`: the
    /// git-fidelity fields (#564 step 1) MUST feed the hash so that two
    /// git-distinct commits cannot dedup-collide. Each field is set on its
    /// own and must move the hash.
    #[test]
    fn fidelity_fields_are_part_of_state_hash() {
        let base = sample_state();
        let base_hash = base.compute_hash();

        let mut committed = state_aligned_with(&base)
            .with_committer(Principal::new("Carol", "carol@example.com"));
        committed.created_at = base.created_at;
        assert_ne!(
            committed.hash(),
            base_hash,
            "committer must affect the state hash"
        );

        let mutators: [fn(State) -> State; 5] = [
            |s: State| s.with_tz_offsets(3600, -7200),
            |s: State| s.with_authored_at(Utc::now() + chrono::Duration::seconds(1)),
            |s: State| s.with_raw_message("verbatim body\n"),
            // gpgsig rides inline in extra_headers at its captured position.
            |s: State| {
                s.with_extra_headers(vec![(
                    b"gpgsig".to_vec(),
                    b"-----BEGIN PGP SIGNATURE-----\n".to_vec(),
                )])
            },
            |s: State| s.with_extra_headers(vec![(b"mergetag".to_vec(), b"x".to_vec())]),
        ];
        for mutate in mutators {
            let mut decorated = mutate(state_aligned_with(&base));
            decorated.created_at = base.created_at;
            assert_ne!(
                decorated.hash(),
                base_hash,
                "fidelity field must affect the state hash"
            );
        }
    }

    #[test]
    fn legacy_signature_migration_hash_matches_golden_vector() {
        let committed_at = DateTime::from_timestamp(1_700_000_000, 0).expect("valid timestamp");
        let authored_at = DateTime::from_timestamp(1_699_999_000, 0).expect("valid timestamp");
        let attribution = Attribution::with_agent(
            Principal::new("Legacy Author", "legacy@example.com"),
            crate::object::Agent::new("openai", "gpt-5")
                .with_session("session-633", "segment-001")
                .with_policy("policy-legacy"),
        );
        let state = State::new_snapshot(
            ContentHash::compute(b"issue-633-tree"),
            vec![ChangeId::from_bytes([0x11; 16])],
            attribution,
        )
        .with_logical_change_id(ChangeId::from_bytes([0x63; 16]))
        .with_intent("freeze pre-565 hash")
        .with_confidence(0.875)
        .with_timestamp(committed_at)
        .with_committer(Principal::new("Legacy Committer", "committer@example.com"))
        .with_tz_offsets(3600, -18000)
        .with_authored_at(authored_at)
        .with_raw_message(b"legacy commit message\n")
        .with_extra_headers(vec![(b"encoding".to_vec(), b"UTF-8".to_vec())])
        .with_status(Status::Published);

        let legacy_hash = state.compute_hash_for_legacy_signature_migration();
        // Golden vector for the pre-#565 state hash format. Legacy
        // StateSignature migration relies on this recipe staying
        // byte-identical to the old format: if `hash_len_core` and
        // `update_hash_core` drift apart, real pre-#565 signatures stop
        // verifying even though round-trip tests can still pass.
        assert_eq!(
            legacy_hash.to_hex(),
            "b89e1b40e681a1bf88679db7cfcacdafb1f370bc40ed5d50760dae1d4ab49dab",
        );
        assert_ne!(
            legacy_hash,
            state.compute_hash(),
            "fixture must distinguish the pre-#565 legacy path from the current hash",
        );
    }

    /// extra_headers order is load-bearing (#566): the same pairs in a
    /// different order must hash differently.
    #[test]
    fn extra_headers_order_affects_hash() {
        let base = sample_state();

        let mut forward = state_aligned_with(&base).with_extra_headers(vec![
            (b"a".to_vec(), b"1".to_vec()),
            (b"b".to_vec(), b"2".to_vec()),
        ]);
        forward.created_at = base.created_at;

        let mut reversed = state_aligned_with(&base).with_extra_headers(vec![
            (b"b".to_vec(), b"2".to_vec()),
            (b"a".to_vec(), b"1".to_vec()),
        ]);
        reversed.created_at = base.created_at;

        assert_ne!(forward.hash(), reversed.hash());
    }

    /// With all fidelity fields set at once the hash is stable and
    /// recomputable (guards against a `hash_len`/`update_hash` divergence
    /// making the cached hash differ from a fresh `compute_hash`).
    #[test]
    fn fidelity_fields_hash_is_stable() {
        let extra_headers = vec![
            (b"gpgsig".to_vec(), b"sig".to_vec()),
            (b"k".to_vec(), b"v".to_vec()),
        ];
        let mut state = sample_state()
            .with_committer(Principal::new("Dave", "dave@example.com"))
            .with_tz_offsets(3600, 0)
            .with_authored_at(Utc::now())
            .with_raw_message("body\n")
            .with_extra_headers(extra_headers);
        assert_eq!(state.hash(), state.compute_hash());
    }

    /// A non-UTF8 git message body (latin-1 `café` = `caf\xe9`) must be
    /// stored byte-identically. `raw_message` is `Vec<u8>`, not `String`,
    /// precisely so such bytes survive; the hash stays stable and
    /// recomputable over the raw bytes (length-prefixed framing, NUL-safe).
    /// #564 step 1.
    #[test]
    fn non_utf8_raw_message_is_byte_preserved() {
        let raw = b"caf\xe9\n".to_vec();
        assert!(
            String::from_utf8(raw.clone()).is_err(),
            "test fixture must be invalid UTF-8 to be meaningful"
        );
        let mut state = sample_state().with_raw_message(&raw);
        assert_eq!(
            state.raw_message.as_deref(),
            Some(raw.as_slice()),
            "raw bytes preserved verbatim"
        );
        // An rmp serialize → deserialize round trip (the store's on-disk
        // codec) keeps the bytes intact, and the hash recomputes
        // identically afterwards.
        let encoded = rmp_serde::to_vec(&state).expect("serialize state");
        let mut decoded: State = rmp_serde::from_slice(&encoded).expect("deserialize state");
        assert_eq!(decoded.raw_message.as_deref(), Some(raw.as_slice()));
        assert_eq!(state.hash(), decoded.hash());
        assert_eq!(decoded.hash(), decoded.compute_hash());
    }

    /// A NUL byte inside the message must not be swallowed or truncated by
    /// the hash framing — the length-prefixed `raw_message` framing is what
    /// makes this safe, where the old NUL-terminated string framing would
    /// have been ambiguous.
    #[test]
    fn raw_message_with_nul_byte_changes_hash() {
        let base = sample_state();

        let mut ends_in_b = state_aligned_with(&base).with_raw_message(b"a\x00b");
        ends_in_b.created_at = base.created_at;

        let mut ends_in_c = state_aligned_with(&base).with_raw_message(b"a\x00c");
        ends_in_c.created_at = base.created_at;

        assert_ne!(ends_in_b.hash(), ends_in_c.hash());
    }

    /// Close-the-class conformance: extension headers are captured from the
    /// raw commit header block in their EXACT on-the-wire order, regardless
    /// of which ones a decoder would surface as typed fields. A commit whose
    /// optional headers are in non-canonical order — `x-custom`, then a
    /// folded `gpgsig`, then `encoding`, then a folded `mergetag` — must
    /// reproduce that exact ordered `(name, value)` byte sequence. This
    /// fails if any header is reordered, prepended, appended, or dropped.
    /// #564 de-lossy step 1.
    #[test]
    fn parse_extension_headers_preserves_noncanonical_wire_order() {
        // The folded `mergetag` value carries a full tag object, which has
        // an internal blank line between the tag headers and the tag
        // message. On the wire that blank line is folded to a single space
        // (` `), NEVER an empty line, so it must not be mistaken for the
        // header/body split. The fixture is built line-by-line (NOT as a
        // `\`-continued literal, which would eat the load-bearing leading
        // space on each folded continuation line).
        let wire_lines: &[&[u8]] = &[
            b"tree 1111111111111111111111111111111111111111",
            b"parent 2222222222222222222222222222222222222222",
            b"author Alice <alice@example.com> 1700000000 +0000",
            b"committer Bob <bob@example.com> 1700000100 +0000",
            b"x-custom custom value",
            b"gpgsig -----BEGIN PGP SIGNATURE-----",
            b" sig-line-1",
            b" -----END PGP SIGNATURE-----",
            b"encoding ISO-8859-1",
            b"mergetag object 3333333333333333333333333333333333333333",
            b" type commit",
            b" tag sidetag",
            b" tagger Carol <carol@example.com> 1700000050 +0000",
            b" ", // folded blank line inside the tag object (one space)
            b" signed side tag",
            b"", // the real header/body separator (empty line)
            b"the commit message",
            b"",
        ];
        let content = wire_lines.join(&b'\n');

        let parsed = parse_commit_extension_headers(&content);

        // gpgsig unfolds with its internal newlines restored and NO trailing
        // newline (the serializer re-folds each `\n` to `\n `, spike §2).
        let gpgsig_value: Vec<u8> =
            b"-----BEGIN PGP SIGNATURE-----\nsig-line-1\n-----END PGP SIGNATURE-----".to_vec();
        // The folded ` \n` blank line unfolds to an empty segment, so the
        // tag object's header/message split survives as a real `\n\n`.
        let mergetag_value: Vec<u8> = b"object 3333333333333333333333333333333333333333\ntype commit\ntag sidetag\ntagger Carol <carol@example.com> 1700000050 +0000\n\nsigned side tag".to_vec();
        let expected: Vec<(Vec<u8>, Vec<u8>)> = vec![
            (b"x-custom".to_vec(), b"custom value".to_vec()),
            (b"gpgsig".to_vec(), gpgsig_value),
            (b"encoding".to_vec(), b"ISO-8859-1".to_vec()),
            (b"mergetag".to_vec(), mergetag_value),
        ];

        assert_eq!(parsed, expected);
    }

    /// A commit with no extension headers (the common case) yields an empty
    /// vec — `tree`/`parent`/`author`/`committer` are modelled natively and
    /// never leak into `extra_headers`.
    #[test]
    fn parse_extension_headers_empty_when_only_core_headers() {
        // No line here starts with a space, so a `\`-continued literal is
        // safe: the whitespace it swallows is only source indentation.
        let content: &[u8] = b"\
            tree 1111111111111111111111111111111111111111\n\
            author Alice <alice@example.com> 1700000000 +0000\n\
            committer Bob <bob@example.com> 1700000100 +0000\n\
            \n\
            just a message\n";
        let parsed = parse_commit_extension_headers(content);
        assert!(parsed.is_empty());
    }
}