// ethos_core/c14n.rs
/*
 * Copyright 2026 The Ethos maintainers
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
//! c14n v1 — the single canonical JSON serialization (determinism contract §2).
//! No other crate hand-rolls output JSON (invariant 2).
//!
//! Properties (tested below): UTF-8, no whitespace, keys sorted by code point,
//! minimal escaping, integers only (floats are a hard error), idempotent.
22
23use serde_json::Value;
24use sha2::{Digest, Sha256};
25
26pub use crate::geom::MAX_SAFE_INT;
27
/// c14n failure: a float or out-of-range number in a canonical value.
///
/// Returned by the canonical serializer when a JSON number cannot be written
/// as an integer within the safe range. Compares by message, so tests can match
/// on the exact failure.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct C14nError {
    /// Deterministic message.
    pub message: String,
}
34
35impl core::fmt::Display for C14nError {
36    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
37        f.write_str(&self.message)
38    }
39}
40impl std::error::Error for C14nError {}
41
42fn err(message: &str) -> C14nError {
43    C14nError {
44        message: message.to_string(),
45    }
46}
47
48/// Serialize a JSON value to canonical bytes.
49///
50/// Object keys are sorted **explicitly** at write time (String `Ord` = Unicode code
51/// point order — exactly the contract sort). We deliberately do NOT rely on
52/// `serde_json::Map`'s iteration order: Cargo feature unification means any crate in the
53/// final graph enabling `serde_json/preserve_order` would silently switch the map to
54/// insertion order — an instant fingerprint break. Explicit sorting makes c14n correct
55/// under either map flavor. Duplicate keys cannot be represented in `Value` and thus
56/// never reach c14n; our own model can never produce them.
57pub fn c14n_bytes(value: &Value) -> Result<Vec<u8>, C14nError> {
58    let mut out = Vec::with_capacity(256);
59    write_value(value, &mut out)?;
60    Ok(out)
61}
62
63fn write_value(value: &Value, out: &mut Vec<u8>) -> Result<(), C14nError> {
64    match value {
65        Value::Null => out.extend_from_slice(b"null"),
66        Value::Bool(true) => out.extend_from_slice(b"true"),
67        Value::Bool(false) => out.extend_from_slice(b"false"),
68        Value::Number(n) => {
69            if let Some(i) = n.as_i64() {
70                // unsigned_abs: `i64::MIN.abs()` would overflow (panic in debug, wrap in
71                // release and slip past the range check) — P2 reviewer finding.
72                if i.unsigned_abs() > MAX_SAFE_INT as u64 {
73                    return Err(err("integer exceeds 2^53-1 in canonical value"));
74                }
75                out.extend_from_slice(i.to_string().as_bytes());
76            } else if let Some(u) = n.as_u64() {
77                if u > MAX_SAFE_INT as u64 {
78                    return Err(err("integer exceeds 2^53-1 in canonical value"));
79                }
80                out.extend_from_slice(u.to_string().as_bytes());
81            } else {
82                return Err(err("non-integer number in canonical value"));
83            }
84        }
85        Value::String(s) => write_string(s, out),
86        Value::Array(items) => {
87            out.push(b'[');
88            for (i, item) in items.iter().enumerate() {
89                if i > 0 {
90                    out.push(b',');
91                }
92                write_value(item, out)?;
93            }
94            out.push(b']');
95        }
96        Value::Object(map) => {
97            out.push(b'{');
98            // Explicit code-point sort — never trust the map's own iteration order
99            // (see c14n_bytes docs: serde_json/preserve_order is an additive feature).
100            let mut entries: Vec<(&String, &Value)> = map.iter().collect();
101            entries.sort_unstable_by(|a, b| a.0.cmp(b.0));
102            for (i, (k, v)) in entries.into_iter().enumerate() {
103                if i > 0 {
104                    out.push(b',');
105                }
106                write_string(k, out);
107                out.push(b':');
108                write_value(v, out)?;
109            }
110            out.push(b'}');
111        }
112    }
113    Ok(())
114}
115
/// Append `s` to `out` as a double-quoted JSON string with minimal escaping.
///
/// Escaped: `"` and `\`, the five control characters that have a short form
/// (`\b`, `\t`, `\n`, `\f`, `\r`), and every other code point below U+0020 as a
/// lowercase `\u00xx`. Everything else — including U+007F and all non-ASCII
/// text — is copied through as raw UTF-8.
fn write_string(s: &str, out: &mut Vec<u8>) {
    const HEX: &[u8; 16] = b"0123456789abcdef";

    let bytes = s.as_bytes();
    // Lower bound: the payload plus the two surrounding quotes.
    out.reserve(bytes.len() + 2);
    out.push(b'"');

    // Every byte that needs escaping is ASCII, and the bytes of a multi-byte UTF-8
    // sequence are all >= 0x80, so scanning bytes can never split a code point.
    // Unescaped runs are copied in bulk instead of one char at a time.
    let mut run_start = 0;
    for (i, &byte) in bytes.iter().enumerate() {
        let hex_escape: [u8; 6];
        let escape: &[u8] = match byte {
            b'"' => b"\\\"",
            b'\\' => b"\\\\",
            0x08 => b"\\b",
            b'\t' => b"\\t",
            b'\n' => b"\\n",
            0x0C => b"\\f",
            b'\r' => b"\\r",
            // Remaining C0 controls have no short form: emit \u00xx (lowercase hex).
            0x00..=0x1F => {
                hex_escape = [
                    b'\\',
                    b'u',
                    b'0',
                    b'0',
                    HEX[usize::from(byte >> 4)],
                    HEX[usize::from(byte & 0x0F)],
                ];
                &hex_escape
            }
            _ => continue,
        };
        out.extend_from_slice(&bytes[run_start..i]);
        out.extend_from_slice(escape);
        run_start = i + 1;
    }
    out.extend_from_slice(&bytes[run_start..]);
    out.push(b'"');
}
138
139/// Lowercase hex sha256 over the c14n bytes of `value`.
140pub fn sha256_hex(value: &Value) -> Result<String, C14nError> {
141    let bytes = c14n_bytes(value)?;
142    Ok(hex(&Sha256::digest(&bytes)))
143}
144
145/// Lowercase hex sha256 over raw bytes (e.g. source PDF bytes).
146pub fn sha256_hex_bytes(bytes: &[u8]) -> String {
147    hex(&Sha256::digest(bytes))
148}
149
/// Render `digest` as lowercase hexadecimal, two characters per byte.
///
/// Uses a nibble lookup table rather than `write!`, so there is no formatting
/// machinery and no discarded `fmt::Result`.
fn hex(digest: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut s = String::with_capacity(digest.len() * 2);
    for &byte in digest {
        // High nibble first, then low nibble.
        s.push(char::from(DIGITS[usize::from(byte >> 4)]));
        s.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    s
}
158
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;
    use serde_json::json;

    /// Canonical bytes of `v` as a `String`; panics if c14n fails.
    fn c14n_str(v: &Value) -> String {
        String::from_utf8(c14n_bytes(v).unwrap()).unwrap()
    }

    // --- contract test vectors (determinism contract §10; cross-checked vs Python ref) ---

    #[test]
    fn vector_v1_empty_object() {
        let v = json!({});
        assert_eq!(c14n_str(&v), "{}");
        assert_eq!(
            sha256_hex(&v).unwrap(),
            "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a"
        );
    }

    #[test]
    fn vector_v2_key_order() {
        let v = json!({"b": 2, "a": 1, "_": 0, "Z": -3});
        assert_eq!(c14n_str(&v), r#"{"Z":-3,"_":0,"a":1,"b":2}"#);
        assert_eq!(
            sha256_hex(&v).unwrap(),
            "9e8c5fa78b63297991b5b7b45bd334ccc61bd1058c5cd8ca6ee0451f78cd6cc1"
        );
    }

    #[test]
    fn vector_v3_strings_and_ints() {
        let v = json!({
            "text": "líne1\nl\"ine2\tend — \u{1F4A1}",
            "n_zero": 0, "n_neg": -42, "arr": [3, 1, 2], "flag": true, "nothing": null
        });
        assert_eq!(
            c14n_str(&v),
            "{\"arr\":[3,1,2],\"flag\":true,\"n_neg\":-42,\"n_zero\":0,\"nothing\":null,\"text\":\"líne1\\nl\\\"ine2\\tend — \u{1F4A1}\"}"
        );
        assert_eq!(
            sha256_hex(&v).unwrap(),
            "86b355efaa571cac1ddb71d422a9971e6042c55ec5369305cce095f2c181426e"
        );
    }

    #[test]
    fn vector_v3b_controls_and_backslash() {
        let v = json!({"bel": "\u{0007}", "backslash": "a\\b"});
        assert_eq!(
            c14n_str(&v),
            "{\"backslash\":\"a\\\\b\",\"bel\":\"\\u0007\"}"
        );
        assert_eq!(
            sha256_hex(&v).unwrap(),
            "a1cc2b96cfaf4e1d27ca13e7c2e56faadf76bd027d233fce5a57124e36ea6dfd"
        );
    }

    #[test]
    fn vector_v4_fingerprint_manifest() {
        // the fingerprint manifest of schemas/examples/document.example.json — its
        // embedded hashes are real, so this vector ties c14n, the example, and the
        // Python reference implementation together
        let v = json!({
            "config_sha256": "68cc61753d299917cc7773f069c18aca31c8ac68f43736a94cb57eee05144084",
            "payload_sha256": "dad47d0ac4ab90f60691eb884c4c7e58d38ef7b87ef3df4bf602cd6087c9c757",
            "profile_id": "ethos-deterministic-v1",
            "profile_sha256": "d6145b9210845db39ad592ea549788432b52a649778c9947f5b2d91173e38070",
            "schema_version": "1.0.0",
            "source_fingerprint": "sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef"
        });
        assert_eq!(
            sha256_hex(&v).unwrap(),
            "b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3"
        );
    }

    #[test]
    fn profile_artifact_hash_is_pinned() {
        let raw = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/../../profiles/ethos-deterministic-v1.json"
        ));
        let v: Value = serde_json::from_str(raw).unwrap();
        assert_eq!(
            sha256_hex(&v).unwrap(),
            "d6145b9210845db39ad592ea549788432b52a649778c9947f5b2d91173e38070",
            "profile artifact changed without a version bump (contract §10)"
        );
    }

    #[test]
    fn floats_are_rejected() {
        assert!(c14n_bytes(&json!({"x": 1.5})).is_err());
        assert!(c14n_bytes(&json!([0.1])).is_err());
        // 2^53 boundary
        assert!(c14n_bytes(&json!(MAX_SAFE_INT)).is_ok());
        assert!(c14n_bytes(&json!(MAX_SAFE_INT + 1)).is_err());
        assert!(c14n_bytes(&json!(-MAX_SAFE_INT)).is_ok());
        assert!(c14n_bytes(&json!(-MAX_SAFE_INT - 1)).is_err());
    }

    #[test]
    fn i64_min_is_an_error_not_a_panic() {
        // regression (P2): i64::MIN.abs() overflows; unsigned_abs must catch it cleanly
        assert!(c14n_bytes(&json!(i64::MIN)).is_err());
        assert!(c14n_bytes(&json!({"n": i64::MIN})).is_err());
        assert!(c14n_bytes(&json!(u64::MAX)).is_err());
    }

    // --- property tests -----------------------------------------------------------

    /// Strategy producing arbitrary values that c14n must accept: null, bools,
    /// safe-range integers, strings, and nested arrays/objects of those.
    fn arb_canonical_value() -> impl Strategy<Value = Value> {
        let leaf = prop_oneof![
            Just(Value::Null),
            any::<bool>().prop_map(Value::from),
            (-MAX_SAFE_INT..=MAX_SAFE_INT).prop_map(Value::from),
            "\\PC*".prop_map(Value::from),
        ];
        leaf.prop_recursive(4, 32, 8, |inner| {
            prop_oneof![
                proptest::collection::vec(inner.clone(), 0..6).prop_map(Value::Array),
                proptest::collection::btree_map("\\PC*", inner, 0..6)
                    .prop_map(|m| { Value::Object(m.into_iter().collect()) }),
            ]
        })
    }

    proptest! {
        /// c14n(parse(c14n(v))) == c14n(v) — the §11 idempotence gate.
        #[test]
        fn c14n_is_idempotent(v in arb_canonical_value()) {
            let once = c14n_bytes(&v).unwrap();
            let reparsed: Value = serde_json::from_slice(&once).unwrap();
            let twice = c14n_bytes(&reparsed).unwrap();
            prop_assert_eq!(once, twice);
        }

        /// c14n output is always valid JSON that parses back to an equal Value.
        #[test]
        fn c14n_round_trips_value(v in arb_canonical_value()) {
            let bytes = c14n_bytes(&v).unwrap();
            let reparsed: Value = serde_json::from_slice(&bytes).unwrap();
            prop_assert_eq!(v, reparsed);
        }

        /// Key order in the emitted bytes is strictly ascending (scan top-level keys).
        ///
        /// The keys are read straight from the output text rather than from a
        /// re-parsed `Value`: with the default BTreeMap-backed `serde_json::Map`, a
        /// re-parsed map iterates in sorted order no matter what order the bytes
        /// carried, which would make the check vacuous.
        #[test]
        fn object_keys_sorted(m in proptest::collection::btree_map("[a-z]{1,8}", 0i64..100, 0..8)) {
            let expected_len = m.len();
            // Insert in descending order so that an insertion-ordered map
            // (serde_json/preserve_order) hands c14n unsorted keys.
            let v = Value::Object(
                m.into_iter().rev().map(|(k, n)| (k, Value::from(n))).collect(),
            );
            let text = c14n_str(&v);
            // Keys are [a-z]+ and values are bare integers, so the only quotes in the
            // output delimit keys: the odd-indexed pieces of a split on '"' are the keys.
            let keys: Vec<&str> = text.split('"').skip(1).step_by(2).collect();
            prop_assert_eq!(keys.len(), expected_len);
            prop_assert!(
                keys.windows(2).all(|w| w[0] < w[1]),
                "keys out of order in {}",
                text
            );
        }
    }
}