cortex_ledger/hash.rs
1//! Domain-tagged, length-prefixed BLAKE3 hash chain (T-1.B.1 + T-1.B.6).
2//!
3//! ## Framing
4//!
5//! Two hashes participate in the chain:
6//!
7//! 1. `payload_hash` — `blake3(canonical_payload_bytes)`. The canonical
8//! encoding of a `serde_json::Value` is the ordered, no-whitespace form
9//! produced by [`canonical_payload_bytes`]. Object keys are sorted
10//! lexicographically; arrays preserve order; numbers, strings, and
11//! booleans are emitted in their canonical JSON form. This makes the
12//! hash **stable across re-serialization**: a `Value` parsed from JSON
13//! and re-serialized via the canonical encoder produces the same bytes
14//! regardless of the original key order or whitespace.
15//!
16//! 2. `event_hash` — domain-tagged, length-prefixed framing:
17//!
18//! ```text
19//! event_hash = blake3(
20//! DOMAIN_TAG_EVENT_HASH // 1 byte: 0x01
21//! || prev_event_hash.len() as u64 (LE) // 8 bytes
//!     || prev_event_hash bytes             // 64 bytes of ASCII hex (or 0 if genesis)
//!     || payload_hash.len() as u64 (LE)    // 8 bytes
//!     || payload_hash bytes                // 64 bytes of ASCII hex
25//! )
26//! ```
27//!
28//! ## Why length-prefix + domain tag (T-1.B.6 — THREATS T-EV-5)
29//!
30//! Without length prefixes, two distinct `(prev, payload)` splits could
31//! concatenate to the same byte string and collide. Example: `prev = "AB"`,
32//! `payload = "CD"` vs `prev = "ABC"`, `payload = "D"` both yield `"ABCD"`.
33//! Length prefixes make the boundary unambiguous.
34//!
35//! Without a domain tag, an `event_hash` byte string could be
36//! reinterpreted as some other domain's hash input (e.g. an `audit_hash`)
37//! and collide cross-domain. The 1-byte `DOMAIN_TAG_EVENT_HASH = 0x01`
38//! reserves a domain-separated input space; future domains take other
39//! tags (0x02 audit, 0x03 trace seal, …).
40//!
41//! The `framing_resists_boundary_confusion` proptest (≥200 cases) asserts
42//! that no two distinct `(prev, payload)` pairs produce the same
43//! `event_hash`.
44
45use cortex_core::Event;
46
/// Domain-separation byte for the `event_hash` framing. Reserved value: 0x01.
///
/// Every other hash domain (audit row hashing, trace seal hashing, …) MUST
/// claim its own tag byte. Sharing a tag between domains is a
/// chain-collision vulnerability.
pub const DOMAIN_TAG_EVENT_HASH: u8 = 0x01;

/// Number of characters in a hex-encoded BLAKE3 hash.
///
/// A BLAKE3 hash is 32 bytes and each byte renders as two hex characters,
/// which gives 64.
pub const HEX_HASH_LEN: usize = 32 * 2;
55
56/// Canonical, deterministic JSON encoding of a payload `Value`.
57///
58/// Object keys are sorted lexicographically (recursively). Arrays preserve
59/// element order. No whitespace. This is the input to [`payload_hash`].
60///
61/// **Invariant:** for any `Value v`, `canonical_payload_bytes(&v)` is equal
62/// to `canonical_payload_bytes(&serde_json::from_slice(&canonical_payload_bytes(&v)).unwrap())`.
63/// In other words, re-serializing through the canonical encoder is a
64/// fixed point.
65#[must_use]
66pub fn canonical_payload_bytes(value: &serde_json::Value) -> Vec<u8> {
67 let mut out = Vec::with_capacity(64);
68 encode_canonical(value, &mut out);
69 out
70}
71
72/// BLAKE3 hash of the canonical payload bytes, hex-encoded.
73#[must_use]
74pub fn payload_hash(value: &serde_json::Value) -> String {
75 let bytes = canonical_payload_bytes(value);
76 blake3::hash(&bytes).to_hex().to_string()
77}
78
79/// Compute `event_hash` for an event with the given previous hash and
80/// payload hash, both as hex strings.
81///
82/// `prev_event_hash` is `None` for the genesis event of a chain; the framing
83/// emits a zero-length prefix and an empty `prev` field in that case (the
84/// length prefix prevents collision between genesis and "empty prev" cases).
85#[must_use]
86pub fn event_hash(prev_event_hash: Option<&str>, payload_hash_hex: &str) -> String {
87 let prev_bytes = prev_event_hash.map(str::as_bytes).unwrap_or(&[]);
88 let payload_bytes = payload_hash_hex.as_bytes();
89
90 let mut hasher = blake3::Hasher::new();
91 hasher.update(&[DOMAIN_TAG_EVENT_HASH]);
92 hasher.update(&(prev_bytes.len() as u64).to_le_bytes());
93 hasher.update(prev_bytes);
94 hasher.update(&(payload_bytes.len() as u64).to_le_bytes());
95 hasher.update(payload_bytes);
96 hasher.finalize().to_hex().to_string()
97}
98
99/// Recompute `payload_hash` and `event_hash` for an event in-place.
100///
101/// Used by [`crate::jsonl::JsonlLog::append`] to seal an event before
102/// persisting it. The `prev_event_hash` field on `event` is taken as the
103/// authoritative previous-hash input (callers MUST set it correctly before
104/// invoking this).
105pub fn seal(event: &mut Event) {
106 event.payload_hash = payload_hash(&event.payload);
107 event.event_hash = event_hash(event.prev_event_hash.as_deref(), &event.payload_hash);
108}
109
110// ---------------------------------------------------------------------------
111// Canonical JSON encoder (sorted keys, no whitespace).
112// ---------------------------------------------------------------------------
113
114fn encode_canonical(v: &serde_json::Value, out: &mut Vec<u8>) {
115 match v {
116 serde_json::Value::Null => out.extend_from_slice(b"null"),
117 serde_json::Value::Bool(true) => out.extend_from_slice(b"true"),
118 serde_json::Value::Bool(false) => out.extend_from_slice(b"false"),
119 serde_json::Value::Number(n) => {
120 // serde_json::Number's Display already produces a canonical
121 // numeric form (no leading zeros, minimal exponent for floats).
122 // We accept that as canonical here; if exact-numeric collision
123 // resistance is needed later, switch to a stricter encoder
124 // (e.g. RFC 8785 JCS) and bump SCHEMA_VERSION.
125 out.extend_from_slice(n.to_string().as_bytes());
126 }
127 serde_json::Value::String(s) => {
128 // Reuse serde_json's string escaper to ensure correct \uXXXX
129 // and \" handling; this is identical across re-serializations
130 // because serde_json's string output is deterministic.
131 let s = serde_json::to_string(s).expect("string encode");
132 out.extend_from_slice(s.as_bytes());
133 }
134 serde_json::Value::Array(items) => {
135 out.push(b'[');
136 for (i, item) in items.iter().enumerate() {
137 if i > 0 {
138 out.push(b',');
139 }
140 encode_canonical(item, out);
141 }
142 out.push(b']');
143 }
144 serde_json::Value::Object(map) => {
145 // Sort keys lexicographically by their UTF-8 byte order. JSON
146 // object keys are unordered by spec; sorting is what makes this
147 // canonical and re-serialization-stable.
148 let mut keys: Vec<&String> = map.keys().collect();
149 keys.sort();
150 out.push(b'{');
151 for (i, k) in keys.iter().enumerate() {
152 if i > 0 {
153 out.push(b',');
154 }
155 let key_str = serde_json::to_string(k).expect("key encode");
156 out.extend_from_slice(key_str.as_bytes());
157 out.push(b':');
158 encode_canonical(&map[*k], out);
159 }
160 out.push(b'}');
161 }
162 }
163}
164
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::TimeZone;
    use cortex_core::{Event, EventSource, EventType, SCHEMA_VERSION};
    use proptest::prelude::*;

    /// Build an unsealed event around `payload`.
    ///
    /// All metadata is fixed, `prev_event_hash` is `None` (genesis), and both
    /// hash fields start out empty so that `seal` has to fill them in.
    fn fixture_event(payload: serde_json::Value) -> Event {
        Event {
            id: "evt_01ARZ3NDEKTSV4RRFFQ69G5FAV".parse().unwrap(),
            schema_version: SCHEMA_VERSION,
            observed_at: chrono::Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 0).unwrap(),
            recorded_at: chrono::Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 1).unwrap(),
            source: EventSource::User,
            event_type: EventType::UserMessage,
            trace_id: None,
            session_id: None,
            domain_tags: vec![],
            payload,
            payload_hash: String::new(),
            prev_event_hash: None,
            event_hash: String::new(),
        }
    }

    /// T-1.B.1 acceptance: hash chain stable across re-serialization.
    ///
    /// We compute `payload_hash` and `event_hash` for an event, serialize the
    /// event to JSON, deserialize, and reseal. The hashes MUST match because
    /// the canonical encoder is order-independent and the framing is
    /// byte-stable.
    #[test]
    fn hash_chain_stable_across_reserialization() {
        let payload = serde_json::json!({
            "z": 1,
            "a": "two",
            "m": [3, 4, {"y": "v", "x": "u"}],
            "n": null,
            "b": true,
        });

        let mut e1 = fixture_event(payload);
        seal(&mut e1);
        let h1_payload = e1.payload_hash.clone();
        let h1_event = e1.event_hash.clone();

        // Both sealed hashes are hex-encoded BLAKE3 digests.
        assert_eq!(h1_payload.len(), HEX_HASH_LEN);
        assert_eq!(h1_event.len(), HEX_HASH_LEN);

        // Round-trip the entire event through serde_json (which may emit
        // object keys in a different order).
        let serialized = serde_json::to_string(&e1).unwrap();
        let mut e2: Event = serde_json::from_str(&serialized).unwrap();
        // Clear the hashes to force a re-seal; the result must match.
        e2.payload_hash.clear();
        e2.event_hash.clear();
        seal(&mut e2);

        assert_eq!(e2.payload_hash, h1_payload, "payload_hash drifted");
        assert_eq!(e2.event_hash, h1_event, "event_hash drifted");

        // Stronger check: write the same payload with a different textual
        // key order and confirm that both the canonical bytes and the
        // resulting hash are identical.
        let scrambled = serde_json::json!({
            "b": true,
            "n": null,
            "m": [3, 4, {"x": "u", "y": "v"}],
            "a": "two",
            "z": 1,
        });
        assert_eq!(
            canonical_payload_bytes(&e1.payload),
            canonical_payload_bytes(&scrambled)
        );
        assert_eq!(payload_hash(&e1.payload), payload_hash(&scrambled));
    }

    /// Pins the exact canonical byte layout (sorted keys, no whitespace,
    /// array order preserved) and the documented fixed-point invariant:
    /// re-encoding the parsed canonical bytes reproduces them.
    #[test]
    fn canonical_bytes_are_sorted_compact_and_a_fixed_point() {
        let value = serde_json::json!({
            "b": 1,
            "a": [true, null, "x"],
        });

        let bytes = canonical_payload_bytes(&value);
        assert_eq!(bytes, br#"{"a":[true,null,"x"],"b":1}"#.to_vec());

        let reparsed: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(canonical_payload_bytes(&reparsed), bytes);
    }

    #[test]
    fn genesis_and_empty_prev_string_hash_identically() {
        // A genesis event (prev=None) and an event with prev="" are
        // semantically different, but the framing cannot tell them apart:
        // both encode a zero length prefix followed by no `prev` bytes, so
        // the hashed input is byte-for-byte the same and the hashes are
        // equal. This test pins that behavior so a future change (e.g.
        // distinguishing the two with an extra tag byte) is a deliberate
        // decision, not an accident.
        let p = payload_hash(&serde_json::json!({"x": 1}));
        let h_none = event_hash(None, &p);
        let h_empty = event_hash(Some(""), &p);
        assert_eq!(
            h_none, h_empty,
            "genesis vs empty prev currently equivalent"
        );
    }

    #[test]
    fn payload_hash_is_deterministic() {
        let p = serde_json::json!({"x": 1, "y": 2});
        assert_eq!(payload_hash(&p), payload_hash(&p));
    }

    // T-1.B.6 acceptance: `framing_resists_boundary_confusion` proptest with
    // ≥200 cases finds zero collisions.
    //
    // For any two distinct `(prev, payload)` pairs of arbitrary lengths, the
    // resulting `event_hash` strings must differ. The length-prefix framing
    // is what guarantees this: without it, e.g. ("AB", "CD") and ("ABC", "D")
    // would collide on a naive `prev || payload` concatenation.
    proptest! {
        #![proptest_config(ProptestConfig {
            cases: 256,
            ..ProptestConfig::default()
        })]

        #[test]
        fn framing_resists_boundary_confusion(
            a_prev in ".{0,40}",
            a_payload in ".{1,40}",
            b_prev in ".{0,40}",
            b_payload in ".{1,40}",
        ) {
            // Normalize: empty `prev` string == None (genesis). Both encode
            // the same byte slice (0-len) for `prev`, so they are
            // canonically the same input — see
            // `genesis_and_empty_prev_string_hash_identically`.
            let a_prev_opt = if a_prev.is_empty() { None } else { Some(a_prev.as_str()) };
            let b_prev_opt = if b_prev.is_empty() { None } else { Some(b_prev.as_str()) };

            // Compare canonical inputs: discard cases where the framed input
            // bytes are identical, because equal inputs must hash equally
            // and would make the inequality assertion fail spuriously.
            let a_norm: (&[u8], &str) = (a_prev_opt.map(str::as_bytes).unwrap_or(&[]), &a_payload);
            let b_norm: (&[u8], &str) = (b_prev_opt.map(str::as_bytes).unwrap_or(&[]), &b_payload);
            prop_assume!(a_norm != b_norm);

            let ha = event_hash(a_prev_opt, &a_payload);
            let hb = event_hash(b_prev_opt, &b_payload);

            prop_assert_ne!(ha, hb);
        }
    }

    /// Direct boundary-confusion regression: the classic ("AB","CD") vs
    /// ("ABC","D") case. With length prefixes, these hashes MUST differ.
    #[test]
    fn boundary_confusion_regression() {
        let h1 = event_hash(Some("AB"), "CD");
        let h2 = event_hash(Some("ABC"), "D");
        assert_ne!(
            h1, h2,
            "naive concatenation would collide; length-prefix framing must prevent this"
        );
    }

    #[test]
    fn domain_tag_prevents_cross_domain_collision() {
        // An event_hash with DOMAIN_TAG_EVENT_HASH must not collide with the
        // BLAKE3 hash of just the framed body. We verify by computing both
        // and asserting inequality on a fixed input.
        let prev = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
        let p = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210";
        let with_tag = event_hash(Some(prev), p);

        // Compute the "no-tag" version manually for comparison: the same
        // length-prefixed fields, minus the leading domain tag byte.
        let mut hasher = blake3::Hasher::new();
        hasher.update(&(prev.len() as u64).to_le_bytes());
        hasher.update(prev.as_bytes());
        hasher.update(&(p.len() as u64).to_le_bytes());
        hasher.update(p.as_bytes());
        let no_tag = hasher.finalize().to_hex().to_string();

        assert_ne!(with_tag, no_tag, "domain tag must change hash output");
    }
}