Skip to main content

shadow_core/agentlog/
canonical.rs

1//! Canonical JSON serialization per SPEC §5.
2//!
3//! Given a [`serde_json::Value`], produce the canonical byte sequence that
4//! is the input to SHA-256 content addressing (§6). Rules:
5//!
6//! 1. Object keys sorted by NFC-normalized UTF-8 bytes (§5.1).
7//! 2. No whitespace between tokens (§5.1).
8//! 3. Strings NFC-normalized before emission (§5.2).
9//! 4. Numbers in shortest round-trip decimal form (§5.3).
10//! 5. `-0` normalizes to `0` (§5.3).
11
12use serde_json::Value;
13use unicode_normalization::UnicodeNormalization;
14
15/// Serialize a [`serde_json::Value`] to canonical bytes (SPEC §5).
16///
17/// Infallible: `serde_json::Value` cannot hold `NaN` / `±Infinity`, so the
18/// only way a number is un-representable has already been ruled out before
19/// this function is called.
20pub fn to_bytes(value: &Value) -> Vec<u8> {
21    let mut out = Vec::new();
22    write_value(&mut out, value);
23    out
24}
25
26fn write_value(out: &mut Vec<u8>, value: &Value) {
27    match value {
28        Value::Null => out.extend_from_slice(b"null"),
29        Value::Bool(true) => out.extend_from_slice(b"true"),
30        Value::Bool(false) => out.extend_from_slice(b"false"),
31        Value::Number(n) => write_number(out, n),
32        Value::String(s) => write_string(out, s),
33        Value::Array(arr) => {
34            out.push(b'[');
35            for (i, v) in arr.iter().enumerate() {
36                if i > 0 {
37                    out.push(b',');
38                }
39                write_value(out, v);
40            }
41            out.push(b']');
42        }
43        Value::Object(map) => {
44            out.push(b'{');
45            // NFC-normalize keys first so equivalent forms collapse, then
46            // sort by the normalized UTF-8 bytes.
47            let mut entries: Vec<(String, &Value)> = map
48                .iter()
49                .map(|(k, v)| (k.nfc().collect::<String>(), v))
50                .collect();
51            entries.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
52            for (i, (k, v)) in entries.iter().enumerate() {
53                if i > 0 {
54                    out.push(b',');
55                }
56                write_string(out, k);
57                out.push(b':');
58                write_value(out, v);
59            }
60            out.push(b'}');
61        }
62    }
63}
64
65fn write_string(out: &mut Vec<u8>, s: &str) {
66    out.push(b'"');
67    let normalized: String = s.nfc().collect();
68    for c in normalized.chars() {
69        match c {
70            '"' => out.extend_from_slice(b"\\\""),
71            '\\' => out.extend_from_slice(b"\\\\"),
72            '\n' => out.extend_from_slice(b"\\n"),
73            '\r' => out.extend_from_slice(b"\\r"),
74            '\t' => out.extend_from_slice(b"\\t"),
75            '\u{08}' => out.extend_from_slice(b"\\b"),
76            '\u{0c}' => out.extend_from_slice(b"\\f"),
77            c if (c as u32) < 0x20 => {
78                // Non-shorthand control char → lowercase \u00XX escape.
79                let code = c as u32;
80                let buf = [
81                    b'\\',
82                    b'u',
83                    b'0',
84                    b'0',
85                    hex_nibble((code >> 4) as u8),
86                    hex_nibble((code & 0xF) as u8),
87                ];
88                out.extend_from_slice(&buf);
89            }
90            c => {
91                let mut buf = [0u8; 4];
92                out.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
93            }
94        }
95    }
96    out.push(b'"');
97}
98
/// Map a 4-bit value (`0..=15`) to its lowercase ASCII hex digit.
///
/// Callers must pass a value below 16; this is checked only in debug
/// builds.
fn hex_nibble(n: u8) -> u8 {
    debug_assert!(n < 16);
    if n < 10 {
        b'0' + n
    } else {
        b'a' + (n - 10)
    }
}
106
107fn write_number(out: &mut Vec<u8>, n: &serde_json::Number) {
108    // Prefer integer forms when available — serde_json tracks whether the
109    // number was parsed as an integer.
110    if let Some(i) = n.as_i64() {
111        out.extend_from_slice(i.to_string().as_bytes());
112        return;
113    }
114    if let Some(u) = n.as_u64() {
115        out.extend_from_slice(u.to_string().as_bytes());
116        return;
117    }
118    if let Some(f) = n.as_f64() {
119        // SPEC §5.3: -0 → 0.
120        if f == 0.0 {
121            out.push(b'0');
122            return;
123        }
124        // Rust's default `{}` for f64 is the shortest round-trip form:
125        // 1.0 -> "1", 1.5 -> "1.5", 1e20 -> "100000000000000000000".
126        // JSON uses lowercase `e` for scientific notation; Rust does too,
127        // so no conversion needed. `is_finite()` is a guard even though
128        // serde_json::Number can't hold NaN/Infinity by construction.
129        if f.is_finite() {
130            let s = format!("{f}");
131            out.extend_from_slice(s.as_bytes());
132            return;
133        }
134    }
135    // If we got here, the number is in an unexpected state. Emit "null" so
136    // canonical output remains syntactically valid JSON. Callers won't hit
137    // this because serde_json::Number is closed over i64/u64/f64-finite.
138    out.extend_from_slice(b"null");
139}
140
/// Unit tests pinning the canonical form rules of SPEC §5.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn sorts_object_keys() {
        assert_eq!(
            to_bytes(&json!({"b": 2, "a": 1})),
            br#"{"a":1,"b":2}"#.to_vec()
        );
    }

    #[test]
    fn recursive_sorting() {
        assert_eq!(
            to_bytes(&json!({"b": {"z": 1, "y": 2}, "a": 1})),
            br#"{"a":1,"b":{"y":2,"z":1}}"#.to_vec()
        );
    }

    #[test]
    fn no_whitespace_in_arrays() {
        assert_eq!(
            to_bytes(&json!({"a": [1, 2, 3]})),
            br#"{"a":[1,2,3]}"#.to_vec()
        );
    }

    #[test]
    fn booleans_and_null() {
        assert_eq!(to_bytes(&json!(true)), b"true".to_vec());
        assert_eq!(to_bytes(&json!(false)), b"false".to_vec());
        assert_eq!(to_bytes(&json!(null)), b"null".to_vec());
    }

    #[test]
    fn integer_numbers() {
        assert_eq!(to_bytes(&json!(42)), b"42".to_vec());
        assert_eq!(to_bytes(&json!(-17)), b"-17".to_vec());
        assert_eq!(to_bytes(&json!(0)), b"0".to_vec());
        // Values above i64::MAX must take the u64 path, not the float path.
        assert_eq!(
            to_bytes(&json!(u64::MAX)),
            b"18446744073709551615".to_vec()
        );
    }

    #[test]
    fn float_that_is_an_integer_emits_as_integer() {
        // `1.00` parses as f64 1.0 — must emit as "1", not "1.0".
        let v: Value = serde_json::from_str("1.00").unwrap();
        assert_eq!(to_bytes(&v), b"1".to_vec());
    }

    #[test]
    fn fractional_float() {
        assert_eq!(to_bytes(&json!(1.5)), b"1.5".to_vec());
        assert_eq!(to_bytes(&json!(0.1)), b"0.1".to_vec());
    }

    #[test]
    fn negative_zero_normalizes_to_zero() {
        let v: Value = serde_json::from_str("-0.0").unwrap();
        assert_eq!(to_bytes(&v), b"0".to_vec());
    }

    #[test]
    fn string_mandatory_escapes() {
        assert_eq!(
            to_bytes(&json!({"x": "a\"b\\c"})),
            br#"{"x":"a\"b\\c"}"#.to_vec()
        );
    }

    #[test]
    fn string_control_chars_use_shorthand_when_available() {
        assert_eq!(to_bytes(&json!("\n")), br#""\n""#.to_vec());
        assert_eq!(to_bytes(&json!("\t")), br#""\t""#.to_vec());
        assert_eq!(to_bytes(&json!("\r")), br#""\r""#.to_vec());
        // Backspace and form feed also have dedicated shorthands.
        assert_eq!(to_bytes(&json!("\u{08}")), br#""\b""#.to_vec());
        assert_eq!(to_bytes(&json!("\u{0c}")), br#""\f""#.to_vec());
    }

    #[test]
    fn string_other_control_chars_use_u00xx() {
        // U+0001 has no shorthand; must emit the lowercase \u0001 escape.
        assert_eq!(to_bytes(&json!("\u{01}")), br#""\u0001""#.to_vec());
        // U+001F is the highest control-char that must be escaped.
        assert_eq!(to_bytes(&json!("\u{1f}")), br#""\u001f""#.to_vec());
    }

    #[test]
    fn non_ascii_emitted_literally() {
        // SPEC §5.2: non-ASCII emitted as literal UTF-8, not \uXXXX.
        // U+00E9 = é = bytes c3 a9.
        let out = to_bytes(&json!("\u{00e9}"));
        assert_eq!(out, &[b'"', 0xc3, 0xa9, b'"']);
    }

    #[test]
    fn utf8_nfc_collapses_equivalent_forms() {
        // "é" precomposed (U+00E9) vs decomposed (U+0065 U+0301) MUST
        // serialize to the same bytes after NFC.
        let decomposed = "e\u{0301}";
        let precomposed = "\u{00e9}";
        assert_eq!(to_bytes(&json!(decomposed)), to_bytes(&json!(precomposed)));
    }

    #[test]
    fn utf8_nfc_applied_to_object_keys() {
        // An object with both the decomposed and precomposed form as keys
        // has TWO distinct keys in serde_json::Value, but after NFC they
        // collide. Policy: emit both entries; neither is dropped.
        // (The §5.1 "unique keys" rule is a producer requirement, not
        // something canonical serialization enforces after-the-fact.)
        // The keys are spelled with explicit escapes so the test does not
        // depend on how this source file itself is normalized.
        let v = json!({ "\u{00e9}": 1, "e\u{0301}": 2 });
        let out = to_bytes(&v);
        let s = std::str::from_utf8(&out).unwrap();
        let normalized_key = "\"\u{00e9}\":";

        // NFC is applied before emission: the output opens with the
        // precomposed key and no combining mark survives anywhere.
        assert!(s.starts_with(&format!("{{{normalized_key}")));
        assert!(!s.contains('\u{0301}'));
        // Both entries survive, each under the normalized key, and both
        // values are still present. The relative order of the two is not
        // asserted here.
        assert_eq!(s.matches(normalized_key).count(), 2);
        assert!(s.contains(&format!("{normalized_key}1")));
        assert!(s.contains(&format!("{normalized_key}2")));
        assert!(s.ends_with('}'));
    }

    #[test]
    fn idempotent_roundtrip() {
        let v = json!({"b": 2, "a": {"d": 3, "c": 4}, "arr": [{"y": 1, "x": 2}]});
        let once = to_bytes(&v);
        let reparsed: Value = serde_json::from_slice(&once).unwrap();
        let twice = to_bytes(&reparsed);
        assert_eq!(once, twice);
    }

    #[test]
    fn spec_5_6_known_vector_canonical_bytes() {
        // SPEC §5.6 Conformance test case: {"hello":"world"} canonical bytes.
        let payload = json!({"hello": "world"});
        assert_eq!(to_bytes(&payload), br#"{"hello":"world"}"#.to_vec());
    }
}