// crtx-ledger 0.1.1
//
// Append-only event log, hash chain, trace assembly, and audit records.
// Module documentation follows.
//! Domain-tagged, length-prefixed BLAKE3 hash chain (T-1.B.1 + T-1.B.6).
//!
//! ## Framing
//!
//! Two hashes participate in the chain:
//!
//! 1. `payload_hash` — `blake3(canonical_payload_bytes)`. The canonical
//!    encoding of a `serde_json::Value` is the ordered, no-whitespace form
//!    produced by [`canonical_payload_bytes`]. Object keys are sorted
//!    lexicographically; arrays preserve order; numbers, strings, and
//!    booleans are emitted in their canonical JSON form. This makes the
//!    hash **stable across re-serialization**: a `Value` parsed from JSON
//!    and re-serialized via the canonical encoder produces the same bytes
//!    regardless of the original key order or whitespace.
//!
//! 2. `event_hash` — domain-tagged, length-prefixed framing. Both hashes
//!    enter the frame as their ASCII **hex** encodings (64 bytes each, see
//!    `HEX_HASH_LEN`), not as raw 32-byte digests:
//!
//!    ```text
//!    event_hash = blake3(
//!        DOMAIN_TAG_EVENT_HASH                  // 1 byte: 0x01
//!     || prev_event_hash.len() as u64 (LE)      // 8 bytes
//!     || prev_event_hash hex bytes              // 64 bytes (or 0 if genesis)
//!     || payload_hash.len() as u64 (LE)         // 8 bytes
//!     || payload_hash hex bytes                 // 64 bytes
//!    )
//!    ```
//!
//! ## Why length-prefix + domain tag (T-1.B.6 — THREATS T-EV-5)
//!
//! Without length prefixes, two distinct `(prev, payload)` splits could
//! concatenate to the same byte string and collide. Example: `prev = "AB"`,
//! `payload = "CD"` vs `prev = "ABC"`, `payload = "D"` both yield `"ABCD"`.
//! Length prefixes make the boundary unambiguous.
//!
//! Without a domain tag, an `event_hash` byte string could be
//! reinterpreted as some other domain's hash input (e.g. an `audit_hash`)
//! and collide cross-domain. The 1-byte `DOMAIN_TAG_EVENT_HASH = 0x01`
//! reserves a domain-separated input space; future domains take other
//! tags (0x02 audit, 0x03 trace seal, …).
//!
//! The `framing_resists_boundary_confusion` proptest (≥200 cases) asserts
//! that no two distinct `(prev, payload)` pairs produce the same
//! `event_hash`.

use cortex_core::Event;

/// Domain tag for `event_hash` framing. Reserved: 0x01.
///
/// Emitted as the first byte of the framed input in [`event_hash`], giving
/// event hashes a domain-separated input space. Other domains (audit row
/// hashing, trace seal hashing, …) MUST take distinct tag bytes. Re-using a
/// tag is a chain-collision vulnerability.
pub const DOMAIN_TAG_EVENT_HASH: u8 = 0x01;

/// Length-in-bytes of a hex-encoded BLAKE3 hash (32 bytes → 64 hex chars).
///
/// `payload_hash` and `event_hash` are produced as lowercase hex strings of
/// this length (see [`payload_hash`] / [`event_hash`]); the framing hashes
/// those hex bytes, not the raw digests.
pub const HEX_HASH_LEN: usize = 64;

/// Canonical, deterministic JSON encoding of a payload `Value`.
///
/// Recursively sorts object keys lexicographically; arrays keep their
/// element order; no whitespace is emitted. This byte string is the input
/// to [`payload_hash`].
///
/// **Invariant (fixed point):** parsing the canonical bytes back into a
/// `Value` and re-encoding yields the identical byte string, so the hash is
/// stable across re-serialization regardless of original key order or
/// whitespace.
#[must_use]
pub fn canonical_payload_bytes(value: &serde_json::Value) -> Vec<u8> {
    // Small starting capacity; typical payloads fit without reallocating.
    let mut encoded = Vec::with_capacity(64);
    encode_canonical(value, &mut encoded);
    encoded
}

/// BLAKE3 hash of the canonical payload bytes, hex-encoded (lowercase,
/// `HEX_HASH_LEN` characters).
#[must_use]
pub fn payload_hash(value: &serde_json::Value) -> String {
    blake3::hash(&canonical_payload_bytes(value))
        .to_hex()
        .to_string()
}

/// Compute `event_hash` for an event with the given previous hash and
/// payload hash, both as hex strings.
///
/// `prev_event_hash` is `None` for the genesis event of a chain; the framing
/// then emits a zero-length prefix and no `prev` bytes. Note that `None` and
/// `Some("")` frame identically (both encode a 0-length `prev`) and so yield
/// the **same** hash — this equivalence is deliberate and pinned by a unit
/// test. The length prefixes guard against *boundary* confusion between
/// `prev` and `payload` (THREATS T-EV-5), not against genesis-vs-empty-string
/// confusion.
#[must_use]
pub fn event_hash(prev_event_hash: Option<&str>, payload_hash_hex: &str) -> String {
    // Genesis (None) → empty slice, which frames as a 0-length `prev`.
    let prev_bytes = prev_event_hash.map(str::as_bytes).unwrap_or(&[]);
    let payload_bytes = payload_hash_hex.as_bytes();

    // Frame: tag || len(prev) LE u64 || prev || len(payload) LE u64 || payload.
    let mut hasher = blake3::Hasher::new();
    hasher.update(&[DOMAIN_TAG_EVENT_HASH]);
    hasher.update(&(prev_bytes.len() as u64).to_le_bytes());
    hasher.update(prev_bytes);
    hasher.update(&(payload_bytes.len() as u64).to_le_bytes());
    hasher.update(payload_bytes);
    hasher.finalize().to_hex().to_string()
}

/// Recompute `payload_hash` and `event_hash` for an event in-place.
///
/// Used by [`crate::jsonl::JsonlLog::append`] to seal an event before
/// persisting it. The `prev_event_hash` field on `event` is taken as the
/// authoritative previous-hash input (callers MUST set it correctly before
/// invoking this).
pub fn seal(event: &mut Event) {
    let ph = payload_hash(&event.payload);
    let eh = event_hash(event.prev_event_hash.as_deref(), &ph);
    event.payload_hash = ph;
    event.event_hash = eh;
}

// ---------------------------------------------------------------------------
// Canonical JSON encoder (sorted keys, no whitespace).
// ---------------------------------------------------------------------------

fn encode_canonical(v: &serde_json::Value, out: &mut Vec<u8>) {
    match v {
        serde_json::Value::Null => out.extend_from_slice(b"null"),
        serde_json::Value::Bool(true) => out.extend_from_slice(b"true"),
        serde_json::Value::Bool(false) => out.extend_from_slice(b"false"),
        serde_json::Value::Number(n) => {
            // serde_json::Number's Display already produces a canonical
            // numeric form (no leading zeros, minimal exponent for floats).
            // We accept that as canonical here; if exact-numeric collision
            // resistance is needed later, switch to a stricter encoder
            // (e.g. RFC 8785 JCS) and bump SCHEMA_VERSION.
            out.extend_from_slice(n.to_string().as_bytes());
        }
        serde_json::Value::String(s) => {
            // Reuse serde_json's string escaper to ensure correct \uXXXX
            // and \" handling; this is identical across re-serializations
            // because serde_json's string output is deterministic.
            let s = serde_json::to_string(s).expect("string encode");
            out.extend_from_slice(s.as_bytes());
        }
        serde_json::Value::Array(items) => {
            out.push(b'[');
            for (i, item) in items.iter().enumerate() {
                if i > 0 {
                    out.push(b',');
                }
                encode_canonical(item, out);
            }
            out.push(b']');
        }
        serde_json::Value::Object(map) => {
            // Sort keys lexicographically by their UTF-8 byte order. JSON
            // object keys are unordered by spec; sorting is what makes this
            // canonical and re-serialization-stable.
            let mut keys: Vec<&String> = map.keys().collect();
            keys.sort();
            out.push(b'{');
            for (i, k) in keys.iter().enumerate() {
                if i > 0 {
                    out.push(b',');
                }
                let key_str = serde_json::to_string(k).expect("key encode");
                out.extend_from_slice(key_str.as_bytes());
                out.push(b':');
                encode_canonical(&map[*k], out);
            }
            out.push(b'}');
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use chrono::TimeZone;
    use cortex_core::{Event, EventSource, EventType, SCHEMA_VERSION};
    use proptest::prelude::*;

    /// Build a minimal valid `Event` around `payload` with empty hash
    /// fields, ready to be sealed.
    fn fixture_event(payload: serde_json::Value) -> Event {
        Event {
            id: "evt_01ARZ3NDEKTSV4RRFFQ69G5FAV".parse().unwrap(),
            schema_version: SCHEMA_VERSION,
            observed_at: chrono::Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 0).unwrap(),
            recorded_at: chrono::Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 1).unwrap(),
            source: EventSource::User,
            event_type: EventType::UserMessage,
            trace_id: None,
            session_id: None,
            domain_tags: vec![],
            payload,
            payload_hash: String::new(),
            prev_event_hash: None,
            event_hash: String::new(),
        }
    }

    /// T-1.B.1 acceptance: hash chain stable across re-serialization.
    ///
    /// We compute `payload_hash` and `event_hash` for an event, serialize the
    /// event to JSON, deserialize, and reseal. The hashes MUST match because
    /// the canonical encoder is order-independent and the framing is byte-
    /// stable.
    #[test]
    fn hash_chain_stable_across_reserialization() {
        let payload = serde_json::json!({
            "z": 1,
            "a": "two",
            "m": [3, 4, {"y": "v", "x": "u"}],
            "n": null,
            "b": true,
        });

        let mut e1 = fixture_event(payload);
        seal(&mut e1);
        let h1_payload = e1.payload_hash.clone();
        let h1_event = e1.event_hash.clone();

        // Round-trip the entire event through serde_json (which may emit
        // object keys in a different order).
        let serialized = serde_json::to_string(&e1).unwrap();
        let mut e2: Event = serde_json::from_str(&serialized).unwrap();
        // Clear the hashes to force a re-seal; the result must match.
        e2.payload_hash.clear();
        e2.event_hash.clear();
        seal(&mut e2);

        assert_eq!(e2.payload_hash, h1_payload, "payload_hash drifted");
        assert_eq!(e2.event_hash, h1_event, "event_hash drifted");

        // Stronger check: re-serialize the payload through a different
        // textual key order and confirm the canonical bytes are identical.
        let scrambled = serde_json::json!({
            "b": true,
            "n": null,
            "m": [3, 4, {"x": "u", "y": "v"}],
            "a": "two",
            "z": 1,
        });
        assert_eq!(payload_hash(&e1.payload), payload_hash(&scrambled));
    }

    /// Renamed from `genesis_event_has_distinct_hash_from_empty_prev_string`:
    /// the old name said "distinct" while the assertion pins *equality*.
    #[test]
    fn genesis_and_empty_prev_hash_identically() {
        // A genesis event (prev=None) and an event with prev="" are
        // semantically different but the framing does not distinguish them:
        // with the length prefix, both encode the same byte slice (0-len)
        // for `prev`, so they SHOULD collide on event_hash because they
        // really are the same framed input. This test pins that behavior so
        // a future change (e.g. distinguishing the two with an extra tag
        // byte) is a deliberate decision, not an accident.
        let p = payload_hash(&serde_json::json!({"x": 1}));
        let h_none = event_hash(None, &p);
        let h_empty = event_hash(Some(""), &p);
        assert_eq!(
            h_none, h_empty,
            "genesis vs empty prev currently equivalent"
        );
    }

    #[test]
    fn payload_hash_is_deterministic() {
        let p = serde_json::json!({"x": 1, "y": 2});
        assert_eq!(payload_hash(&p), payload_hash(&p));
    }

    // T-1.B.6 acceptance: `framing_resists_boundary_confusion` proptest with
    // ≥200 cases finds zero collisions.
    //
    // For any two distinct `(prev, payload)` pairs of arbitrary lengths, the
    // resulting `event_hash` strings must differ. The length-prefix framing
    // is what guarantees this: without it, e.g. ("AB", "CD") and ("ABC", "D")
    // would collide on a naive `prev || payload` concatenation.
    proptest! {
        #![proptest_config(ProptestConfig {
            cases: 256,
            ..ProptestConfig::default()
        })]

        #[test]
        fn framing_resists_boundary_confusion(
            a_prev in ".{0,40}",
            a_payload in ".{1,40}",
            b_prev in ".{0,40}",
            b_payload in ".{1,40}",
        ) {
            // Normalize: empty `prev` string == None (genesis). Both encode
            // the same byte slice (0-len) for `prev`, so they are
            // canonically the same input — see
            // `genesis_and_empty_prev_hash_identically`.
            let a_prev_opt = if a_prev.is_empty() { None } else { Some(a_prev.as_str()) };
            let b_prev_opt = if b_prev.is_empty() { None } else { Some(b_prev.as_str()) };

            // Compare canonical inputs: skip cases where the framed input
            // bytes are identical (otherwise the test is trivially false).
            let a_norm: (&[u8], &str) = (a_prev_opt.map(str::as_bytes).unwrap_or(&[]), &a_payload);
            let b_norm: (&[u8], &str) = (b_prev_opt.map(str::as_bytes).unwrap_or(&[]), &b_payload);
            prop_assume!(a_norm != b_norm);

            let ha = event_hash(a_prev_opt, &a_payload);
            let hb = event_hash(b_prev_opt, &b_payload);

            prop_assert_ne!(ha, hb);
        }
    }

    /// Direct boundary-confusion regression: the classic ("AB","CD") vs
    /// ("ABC","D") case. With length prefixes, these hashes MUST differ.
    #[test]
    fn boundary_confusion_regression() {
        let h1 = event_hash(Some("AB"), "CD");
        let h2 = event_hash(Some("ABC"), "D");
        assert_ne!(
            h1, h2,
            "naive concatenation would collide; length-prefix framing must prevent this"
        );
    }

    #[test]
    fn domain_tag_prevents_cross_domain_collision() {
        // An event_hash with DOMAIN_TAG_EVENT_HASH must not collide with the
        // BLAKE3 hash of just the framed body. We verify by computing both
        // and asserting inequality on a fixed input.
        let prev = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
        let p = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210";
        let with_tag = event_hash(Some(prev), p);

        // Compute "no-tag" version manually for comparison.
        let mut hasher = blake3::Hasher::new();
        hasher.update(&(prev.len() as u64).to_le_bytes());
        hasher.update(prev.as_bytes());
        hasher.update(&(p.len() as u64).to_le_bytes());
        hasher.update(p.as_bytes());
        let no_tag = hasher.finalize().to_hex().to_string();

        assert_ne!(with_tag, no_tag, "domain tag must change hash output");
    }
}
}