Skip to main content

cortex_ledger/
hash.rs

1//! Domain-tagged, length-prefixed BLAKE3 hash chain (T-1.B.1 + T-1.B.6).
2//!
3//! ## Framing
4//!
5//! Two hashes participate in the chain:
6//!
7//! 1. `payload_hash` — `blake3(canonical_payload_bytes)`. The canonical
8//!    encoding of a `serde_json::Value` is the ordered, no-whitespace form
9//!    produced by [`canonical_payload_bytes`]. Object keys are sorted
10//!    lexicographically; arrays preserve order; numbers, strings, and
11//!    booleans are emitted in their canonical JSON form. This makes the
12//!    hash **stable across re-serialization**: a `Value` parsed from JSON
13//!    and re-serialized via the canonical encoder produces the same bytes
14//!    regardless of the original key order or whitespace.
15//!
16//! 2. `event_hash` — domain-tagged, length-prefixed framing:
17//!
18//!    ```text
//!    event_hash = blake3(
//!        DOMAIN_TAG_EVENT_HASH                  // 1 byte: 0x01
//!     || prev_event_hash.len() as u64 (LE)      // 8 bytes
//!     || prev_event_hash bytes                  // 64 bytes of ASCII hex (or 0 if genesis)
//!     || payload_hash.len() as u64 (LE)         // 8 bytes
//!     || payload_hash bytes                     // 64 bytes of ASCII hex
//!    )
26//!    ```
27//!
28//! ## Why length-prefix + domain tag (T-1.B.6 — THREATS T-EV-5)
29//!
30//! Without length prefixes, two distinct `(prev, payload)` splits could
31//! concatenate to the same byte string and collide. Example: `prev = "AB"`,
32//! `payload = "CD"` vs `prev = "ABC"`, `payload = "D"` both yield `"ABCD"`.
33//! Length prefixes make the boundary unambiguous.
34//!
35//! Without a domain tag, an `event_hash` byte string could be
36//! reinterpreted as some other domain's hash input (e.g. an `audit_hash`)
37//! and collide cross-domain. The 1-byte `DOMAIN_TAG_EVENT_HASH = 0x01`
38//! reserves a domain-separated input space; future domains take other
39//! tags (0x02 audit, 0x03 trace seal, …).
40//!
41//! The `framing_resists_boundary_confusion` proptest (≥200 cases) asserts
42//! that no two distinct `(prev, payload)` pairs produce the same
43//! `event_hash`.
44
45use cortex_core::Event;
46
/// First byte fed into every `event_hash` computation: `0x01`.
///
/// The tag separates `event_hash` inputs from the inputs of any other hash
/// domain (audit rows, trace seals, …). Every other domain MUST pick a
/// different byte; sharing a tag re-opens cross-domain chain collisions.
pub const DOMAIN_TAG_EVENT_HASH: u8 = 0x01;

/// Number of characters in a hex-encoded BLAKE3 digest.
///
/// A digest is 32 bytes and each byte is written as two hex characters.
pub const HEX_HASH_LEN: usize = 32 * 2;
55
56/// Canonical, deterministic JSON encoding of a payload `Value`.
57///
58/// Object keys are sorted lexicographically (recursively). Arrays preserve
59/// element order. No whitespace. This is the input to [`payload_hash`].
60///
61/// **Invariant:** for any `Value v`, `canonical_payload_bytes(&v)` is equal
62/// to `canonical_payload_bytes(&serde_json::from_slice(&canonical_payload_bytes(&v)).unwrap())`.
63/// In other words, re-serializing through the canonical encoder is a
64/// fixed point.
65#[must_use]
66pub fn canonical_payload_bytes(value: &serde_json::Value) -> Vec<u8> {
67    let mut out = Vec::with_capacity(64);
68    encode_canonical(value, &mut out);
69    out
70}
71
72/// BLAKE3 hash of the canonical payload bytes, hex-encoded.
73#[must_use]
74pub fn payload_hash(value: &serde_json::Value) -> String {
75    let bytes = canonical_payload_bytes(value);
76    blake3::hash(&bytes).to_hex().to_string()
77}
78
79/// Compute `event_hash` for an event with the given previous hash and
80/// payload hash, both as hex strings.
81///
82/// `prev_event_hash` is `None` for the genesis event of a chain; the framing
83/// emits a zero-length prefix and an empty `prev` field in that case (the
84/// length prefix prevents collision between genesis and "empty prev" cases).
85#[must_use]
86pub fn event_hash(prev_event_hash: Option<&str>, payload_hash_hex: &str) -> String {
87    let prev_bytes = prev_event_hash.map(str::as_bytes).unwrap_or(&[]);
88    let payload_bytes = payload_hash_hex.as_bytes();
89
90    let mut hasher = blake3::Hasher::new();
91    hasher.update(&[DOMAIN_TAG_EVENT_HASH]);
92    hasher.update(&(prev_bytes.len() as u64).to_le_bytes());
93    hasher.update(prev_bytes);
94    hasher.update(&(payload_bytes.len() as u64).to_le_bytes());
95    hasher.update(payload_bytes);
96    hasher.finalize().to_hex().to_string()
97}
98
99/// Recompute `payload_hash` and `event_hash` for an event in-place.
100///
101/// Used by [`crate::jsonl::JsonlLog::append`] to seal an event before
102/// persisting it. The `prev_event_hash` field on `event` is taken as the
103/// authoritative previous-hash input (callers MUST set it correctly before
104/// invoking this).
105pub fn seal(event: &mut Event) {
106    event.payload_hash = payload_hash(&event.payload);
107    event.event_hash = event_hash(event.prev_event_hash.as_deref(), &event.payload_hash);
108}
109
110// ---------------------------------------------------------------------------
111// Canonical JSON encoder (sorted keys, no whitespace).
112// ---------------------------------------------------------------------------
113
114fn encode_canonical(v: &serde_json::Value, out: &mut Vec<u8>) {
115    match v {
116        serde_json::Value::Null => out.extend_from_slice(b"null"),
117        serde_json::Value::Bool(true) => out.extend_from_slice(b"true"),
118        serde_json::Value::Bool(false) => out.extend_from_slice(b"false"),
119        serde_json::Value::Number(n) => {
120            // serde_json::Number's Display already produces a canonical
121            // numeric form (no leading zeros, minimal exponent for floats).
122            // We accept that as canonical here; if exact-numeric collision
123            // resistance is needed later, switch to a stricter encoder
124            // (e.g. RFC 8785 JCS) and bump SCHEMA_VERSION.
125            out.extend_from_slice(n.to_string().as_bytes());
126        }
127        serde_json::Value::String(s) => {
128            // Reuse serde_json's string escaper to ensure correct \uXXXX
129            // and \" handling; this is identical across re-serializations
130            // because serde_json's string output is deterministic.
131            let s = serde_json::to_string(s).expect("string encode");
132            out.extend_from_slice(s.as_bytes());
133        }
134        serde_json::Value::Array(items) => {
135            out.push(b'[');
136            for (i, item) in items.iter().enumerate() {
137                if i > 0 {
138                    out.push(b',');
139                }
140                encode_canonical(item, out);
141            }
142            out.push(b']');
143        }
144        serde_json::Value::Object(map) => {
145            // Sort keys lexicographically by their UTF-8 byte order. JSON
146            // object keys are unordered by spec; sorting is what makes this
147            // canonical and re-serialization-stable.
148            let mut keys: Vec<&String> = map.keys().collect();
149            keys.sort();
150            out.push(b'{');
151            for (i, k) in keys.iter().enumerate() {
152                if i > 0 {
153                    out.push(b',');
154                }
155                let key_str = serde_json::to_string(k).expect("key encode");
156                out.extend_from_slice(key_str.as_bytes());
157                out.push(b':');
158                encode_canonical(&map[*k], out);
159            }
160            out.push(b'}');
161        }
162    }
163}
164
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::TimeZone;
    use cortex_core::{Event, EventSource, EventType, SCHEMA_VERSION};
    use proptest::prelude::*;

    /// Build an unsealed event around `payload`: fixed id and timestamps, no
    /// previous hash, and empty `payload_hash` / `event_hash` fields.
    fn fixture_event(payload: serde_json::Value) -> Event {
        Event {
            id: "evt_01ARZ3NDEKTSV4RRFFQ69G5FAV".parse().unwrap(),
            schema_version: SCHEMA_VERSION,
            observed_at: chrono::Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 0).unwrap(),
            recorded_at: chrono::Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 1).unwrap(),
            source: EventSource::User,
            event_type: EventType::UserMessage,
            trace_id: None,
            session_id: None,
            domain_tags: vec![],
            payload,
            payload_hash: String::new(),
            prev_event_hash: None,
            event_hash: String::new(),
        }
    }

    /// T-1.B.1 acceptance: hash chain stable across re-serialization.
    ///
    /// We compute `payload_hash` and `event_hash` for an event, serialize the
    /// event to JSON, deserialize, and reseal. The hashes MUST match because
    /// the canonical encoder is order-independent and the framing is byte-
    /// stable.
    #[test]
    fn hash_chain_stable_across_reserialization() {
        let payload = serde_json::json!({
            "z": 1,
            "a": "two",
            "m": [3, 4, {"y": "v", "x": "u"}],
            "n": null,
            "b": true,
        });

        let mut e1 = fixture_event(payload);
        seal(&mut e1);
        let h1_payload = e1.payload_hash.clone();
        let h1_event = e1.event_hash.clone();

        // Round-trip the entire event through serde_json (which may emit
        // object keys in a different order).
        let serialized = serde_json::to_string(&e1).unwrap();
        let mut e2: Event = serde_json::from_str(&serialized).unwrap();
        // Clear the hashes to force a re-seal; the result must match.
        e2.payload_hash.clear();
        e2.event_hash.clear();
        seal(&mut e2);

        assert_eq!(e2.payload_hash, h1_payload, "payload_hash drifted");
        assert_eq!(e2.event_hash, h1_event, "event_hash drifted");

        // Same payload written with a different textual key order: the
        // payload hash must not change.
        let scrambled = serde_json::json!({
            "b": true,
            "n": null,
            "m": [3, 4, {"x": "u", "y": "v"}],
            "a": "two",
            "z": 1,
        });
        assert_eq!(payload_hash(&e1.payload), payload_hash(&scrambled));

        // Pin the exact canonical form (sorted keys, no whitespace) so the
        // check above cannot pass merely because both sides share the same
        // non-canonical ordering.
        let canonical = canonical_payload_bytes(&scrambled);
        assert_eq!(
            std::str::from_utf8(&canonical).unwrap(),
            r#"{"a":"two","b":true,"m":[3,4,{"x":"u","y":"v"}],"n":null,"z":1}"#,
        );
    }

    /// Pins current behavior: a genesis event (`prev = None`) and an event
    /// with `prev = Some("")` frame the same zero-length `prev` field, so
    /// they produce the same `event_hash`. If the two ever need to be
    /// distinguished (e.g. with an extra tag byte), this test must be changed
    /// deliberately rather than by accident.
    #[test]
    fn genesis_and_empty_prev_hash_identically() {
        let p = payload_hash(&serde_json::json!({"x": 1}));
        let h_none = event_hash(None, &p);
        let h_empty = event_hash(Some(""), &p);
        assert_eq!(
            h_none, h_empty,
            "genesis vs empty prev currently equivalent"
        );
    }

    #[test]
    fn payload_hash_is_deterministic() {
        let p = serde_json::json!({"x": 1, "y": 2});
        let h = payload_hash(&p);
        assert_eq!(h, payload_hash(&p));
        // Both hash kinds are hex-encoded 32-byte digests.
        assert_eq!(h.len(), HEX_HASH_LEN);
        assert_eq!(event_hash(None, &h).len(), HEX_HASH_LEN);
    }

    // T-1.B.6 acceptance: `framing_resists_boundary_confusion` proptest with
    // ≥200 cases finds zero collisions.
    //
    // For any two distinct `(prev, payload)` pairs of arbitrary lengths, the
    // resulting `event_hash` strings must differ. The length-prefix framing
    // is what guarantees this: without it, e.g. ("AB", "CD") and ("ABC", "D")
    // would collide on a naive `prev || payload` concatenation.
    proptest! {
        #![proptest_config(ProptestConfig {
            cases: 256,
            ..ProptestConfig::default()
        })]

        #[test]
        fn framing_resists_boundary_confusion(
            a_prev in ".{0,40}",
            a_payload in ".{1,40}",
            b_prev in ".{0,40}",
            b_payload in ".{1,40}",
        ) {
            // Normalize: empty `prev` string == None (genesis). Both encode
            // the same byte slice (0-len) for `prev`, so they are
            // canonically the same input — see
            // `genesis_and_empty_prev_hash_identically`.
            let a_prev_opt = if a_prev.is_empty() { None } else { Some(a_prev.as_str()) };
            let b_prev_opt = if b_prev.is_empty() { None } else { Some(b_prev.as_str()) };

            // Skip cases where both sides frame identical input bytes;
            // those are expected to hash equal and say nothing about
            // boundary confusion.
            let a_norm: (&[u8], &str) =
                (a_prev_opt.map(str::as_bytes).unwrap_or(&[]), &a_payload);
            let b_norm: (&[u8], &str) =
                (b_prev_opt.map(str::as_bytes).unwrap_or(&[]), &b_payload);
            prop_assume!(a_norm != b_norm);

            let ha = event_hash(a_prev_opt, &a_payload);
            let hb = event_hash(b_prev_opt, &b_payload);

            prop_assert_ne!(ha, hb);
        }
    }

    /// Direct boundary-confusion regression: the classic ("AB","CD") vs
    /// ("ABC","D") case. With length prefixes, these hashes MUST differ.
    #[test]
    fn boundary_confusion_regression() {
        let h1 = event_hash(Some("AB"), "CD");
        let h2 = event_hash(Some("ABC"), "D");
        assert_ne!(
            h1, h2,
            "naive concatenation would collide; length-prefix framing must prevent this"
        );
    }

    /// The domain tag must be part of the hashed input: hashing the same
    /// length-prefixed body without the leading tag byte has to give a
    /// different digest.
    #[test]
    fn domain_tag_prevents_cross_domain_collision() {
        let prev = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
        let p = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210";
        let with_tag = event_hash(Some(prev), p);

        // Compute the "no-tag" version manually for comparison.
        let mut hasher = blake3::Hasher::new();
        hasher.update(&(prev.len() as u64).to_le_bytes());
        hasher.update(prev.as_bytes());
        hasher.update(&(p.len() as u64).to_le_bytes());
        hasher.update(p.as_bytes());
        let no_tag = hasher.finalize().to_hex().to_string();

        assert_ne!(with_tag, no_tag, "domain tag must change hash output");
    }
}