acdp_jcs/lib.rs
1//! JSON Canonicalization Scheme (JCS) — RFC 8785.
2//!
3//! Implemented inline to avoid an external dependency and to guarantee
4//! correct handling of all edge cases, especially:
//! - Object key sorting (RFC 8785 §3.2.3 UTF-16 code-unit order; all
//!   ACDP keys are ASCII, where this coincides with byte/`str` order)
7//! - No whitespace
8//! - Negative zero (`-0.0`) MUST become `0` (the most common bug)
9//! - Non-ASCII characters emitted as-is, not `\uXXXX`-escaped
10
11use std::io::Write;
12
13use acdp_primitives::AcdpError;
14use serde::Serialize;
15
/// Hard recursion ceiling for the JCS walker. Far above any real ACDP
/// body (metadata depth is capped at 8) and above serde_json's default
/// 128-level parse limit, so a value that parsed off the wire can never
/// hit it — the wire/golden-vector form is unchanged. The cap only
/// guards against stack overflow from a pathologically deep
/// programmatically-built `Value` (defense-in-depth, RFC-ACDP P1-3).
///
/// The walker starts at depth 0 for the root value and rejects a value
/// only when its depth is strictly greater than this constant.
const MAX_JCS_DEPTH: usize = 256;
23
24/// Canonicalize any serializable value to JCS bytes.
25///
26/// The returned bytes are the canonical UTF-8 JSON representation.
27pub fn canonicalize<T: Serialize>(value: &T) -> Result<Vec<u8>, AcdpError> {
28 let v = serde_json::to_value(value).map_err(|e| AcdpError::Canonicalization(e.to_string()))?;
29 try_canonicalize_value(&v)
30}
31
32/// Canonicalize a pre-parsed `serde_json::Value`, returning an error if
33/// nesting exceeds the internal recursion ceiling (`MAX_JCS_DEPTH`).
34/// Prefer this on any path that may canonicalize untrusted /
35/// programmatically-built input.
36pub fn try_canonicalize_value(value: &serde_json::Value) -> Result<Vec<u8>, AcdpError> {
37 let mut out = Vec::with_capacity(256);
38 write_value(value, &mut out, 0)?;
39 Ok(out)
40}
41
42/// Canonicalize a pre-parsed `serde_json::Value`.
43///
44/// Infallible back-compat wrapper. Panics only on input nested past the
45/// internal recursion ceiling (`MAX_JCS_DEPTH`, unreachable from parsed
46/// wire data); callers handling untrusted input should use
47/// [`try_canonicalize_value`].
48pub fn canonicalize_value(value: &serde_json::Value) -> Vec<u8> {
49 try_canonicalize_value(value)
50 .expect("JCS canonicalization exceeded depth limit; use try_canonicalize_value")
51}
52
53fn write_value(v: &serde_json::Value, out: &mut Vec<u8>, depth: usize) -> Result<(), AcdpError> {
54 if depth > MAX_JCS_DEPTH {
55 return Err(AcdpError::Canonicalization(format!(
56 "JSON nesting depth exceeds {MAX_JCS_DEPTH}"
57 )));
58 }
59 match v {
60 serde_json::Value::Null => out.extend_from_slice(b"null"),
61 serde_json::Value::Bool(true) => out.extend_from_slice(b"true"),
62 serde_json::Value::Bool(false) => out.extend_from_slice(b"false"),
63 serde_json::Value::Number(n) => write_number(n, out),
64 serde_json::Value::String(s) => write_string(s, out),
65 serde_json::Value::Array(arr) => {
66 out.push(b'[');
67 for (i, elem) in arr.iter().enumerate() {
68 if i > 0 {
69 out.push(b',');
70 }
71 write_value(elem, out, depth + 1)?;
72 }
73 out.push(b']');
74 }
75 serde_json::Value::Object(map) => {
76 // Sort keys in RFC 8785 §3.2.1 UTF-16 code-unit order. ACDP
77 // keys are ASCII, where Rust's `str` (byte/scalar) ordering
78 // coincides with UTF-16 code-unit ordering.
79 let mut keys: Vec<&String> = map.keys().collect();
80 keys.sort();
81 out.push(b'{');
82 for (i, key) in keys.iter().enumerate() {
83 if i > 0 {
84 out.push(b',');
85 }
86 write_string(key, out);
87 out.push(b':');
88 write_value(&map[key.as_str()], out, depth + 1)?;
89 }
90 out.push(b'}');
91 }
92 }
93 Ok(())
94}
95
96fn write_number(n: &serde_json::Number, out: &mut Vec<u8>) {
97 // Integer `Number`s (i64 / u64) are already canonical — serde_json prints
98 // the exact digits with no decimal point and no exponent, exactly what
99 // RFC 8785 requires. Only floats need the ECMAScript reformatting below.
100 if n.is_i64() || n.is_u64() {
101 out.extend_from_slice(n.to_string().as_bytes());
102 return;
103 }
104
105 // Float path. `as_f64` is `Some` for any non-integer `Number`; the `None`
106 // arm is unreachable but kept total rather than panicking.
107 let Some(f) = n.as_f64() else {
108 out.extend_from_slice(n.to_string().as_bytes());
109 return;
110 };
111
112 // RFC 8785 §3.2.2.3: both negative and positive zero serialize as "0".
113 if f == 0.0 {
114 out.push(b'0');
115 return;
116 }
117
118 // JSON cannot represent NaN or Infinity. `serde_json::Number::from_f64`
119 // rejects these and this crate does not enable `arbitrary_precision`, so a
120 // non-finite `Number` cannot be built through the safe API — unreachable on
121 // parsed input. Refuse it loudly in debug/test builds; the `null` fallback
122 // is a release-only last resort so canonicalization stays total (emitting
123 // `null` would corrupt the hash preimage). Producers with custom numeric
124 // paths MUST reject non-finite floats *before* canonicalization.
125 debug_assert!(
126 f.is_finite(),
127 "non-finite f64 reached JCS canonicalization ({f}); reject \
128 non-finite numbers before hashing (RFC 8785 §3.2.2.3)"
129 );
130 if !f.is_finite() {
131 out.extend_from_slice(b"null");
132 return;
133 }
134
135 out.extend_from_slice(ecma_number_string(f).as_bytes());
136}
137
/// Render a finite, non-zero `f64` the way ECMAScript `Number::toString`
/// does, which is the format RFC 8785 §3.2.2.3 mandates.
///
/// The output uses the shortest digit string that round-trips, laid out as:
/// - plain decimal for magnitudes in `[1e-6, 1e21)`, never with a
///   trailing `.0`;
/// - exponential otherwise, with an always-signed exponent and no zero
///   padding (`1e+21`, `1.23e+25`, `1e-7`).
///
/// Rust's `{:e}` formatter already yields the shortest round-tripping
/// mantissa as `d.ddde±EE`; this function only extracts the digits and the
/// decimal exponent and re-lays them out in the band ECMAScript chooses.
///
/// # Panics
///
/// Panics if `f` is NaN or infinite, because `{:e}` then emits no exponent.
fn ecma_number_string(f: f64) -> String {
    // e.g. "1.23e25", "5e-324", "1e21", "1.0000005e6".
    let scientific = format!("{:e}", f.abs());
    let (mantissa, exponent) = scientific
        .split_once('e')
        .expect("{:e} always emits 'e'");
    let decimal_exp: i32 = exponent.parse().expect("{:e} exponent is an integer");

    // Significant digits only: drop the decimal point and trailing zeros.
    let mut sig = mantissa.replace('.', "");
    let kept = sig.trim_end_matches('0').len();
    sig.truncate(kept);
    if sig.is_empty() {
        sig.push('0');
    }

    let sig_len = sig.len() as i32; // count of significant digits (ES "k")
    let point = decimal_exp + 1; // value = digits × 10^(point − k) (ES "n")

    let mut text = String::with_capacity(sig.len() + 8);
    if f.is_sign_negative() {
        text.push('-');
    }

    if !(-5..=21).contains(&point) {
        // Exponential form: first digit, optional fraction, signed exponent.
        let (lead, rest) = sig.split_at(1);
        text.push_str(lead);
        if !rest.is_empty() {
            text.push('.');
            text.push_str(rest);
        }
        let exp10 = point - 1;
        text.push('e');
        text.push(exp_sign(exp10));
        text.push_str(&exp10.abs().to_string());
    } else if point <= 0 {
        // Leading "0." then (−point) zeros then the digits.
        text.push_str("0.");
        text.push_str(&"0".repeat((-point) as usize));
        text.push_str(&sig);
    } else if point >= sig_len {
        // Integer-valued: all digits then (point − k) trailing zeros.
        text.push_str(&sig);
        text.push_str(&"0".repeat((point - sig_len) as usize));
    } else {
        // Decimal point falls inside the digit run.
        let (whole, frac) = sig.split_at(point as usize);
        text.push_str(whole);
        text.push('.');
        text.push_str(frac);
    }

    text
}

/// Sign character for an ECMAScript exponent: `'-'` when `e` is negative,
/// `'+'` otherwise. RFC 8785 requires the sign to always be present
/// (`1e+21`, `1e-7`).
fn exp_sign(e: i32) -> char {
    if e < 0 {
        '-'
    } else {
        '+'
    }
}
198
/// Append `s` to `out` as a canonical JSON string literal (RFC 8785 §3.2.2.2).
///
/// Escaping rules:
/// - `"` and `\` are written as `\"` and `\\`;
/// - U+0008, U+0009, U+000A, U+000C and U+000D use the short forms
///   `\b`, `\t`, `\n`, `\f` and `\r`;
/// - every other control character below U+0020 is written as `\u00xx`
///   with lowercase hex digits;
/// - everything else, including all non-ASCII text, is copied through as
///   its UTF-8 bytes.
fn write_string(s: &str, out: &mut Vec<u8>) {
    let bytes = s.as_bytes();
    out.reserve(bytes.len() + 2);
    out.push(b'"');

    // Every byte that needs escaping is ASCII, and UTF-8 continuation and
    // lead bytes are all >= 0x80, so scanning bytes is safe and lets runs
    // of unescaped text be copied in one call.
    let mut run_start = 0;
    for (i, &byte) in bytes.iter().enumerate() {
        let short_escape: Option<&str> = match byte {
            b'"' => Some("\\\""),
            b'\\' => Some("\\\\"),
            0x08 => Some("\\b"),
            b'\t' => Some("\\t"),
            b'\n' => Some("\\n"),
            0x0c => Some("\\f"),
            b'\r' => Some("\\r"),
            // Remaining control characters have no short form.
            0x00..=0x1f => None,
            _ => continue,
        };

        // Flush the pending unescaped run, then emit the escape.
        out.extend_from_slice(&bytes[run_start..i]);
        match short_escape {
            Some(escape) => out.extend_from_slice(escape.as_bytes()),
            None => write!(out, "\\u{:04x}", byte).expect("writing to a Vec<u8> cannot fail"),
        }
        run_start = i + 1;
    }

    out.extend_from_slice(&bytes[run_start..]);
    out.push(b'"');
}
222
223// ── Tests ─────────────────────────────────────────────────────────────────────
224
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Lowercase hex SHA-256 of `bytes`, as used by the golden vectors.
    fn sha256_hex(bytes: &[u8]) -> String {
        use sha2::{Digest, Sha256};
        hex::encode(Sha256::digest(bytes))
    }

    /// Build `levels` arrays nested inside each other around a `null`.
    fn nested_arrays(levels: usize) -> serde_json::Value {
        (0..levels).fold(serde_json::Value::Null, |inner, _| {
            serde_json::Value::Array(vec![inner])
        })
    }

    #[test]
    fn sorts_keys() {
        let v = json!({"z": 1, "a": 2, "m": 3});
        let out = canonicalize_value(&v);
        assert_eq!(out, b"{\"a\":2,\"m\":3,\"z\":1}");
    }

    #[test]
    fn negative_zero_becomes_zero() {
        // The critical RFC 8785 edge case
        let v = json!({"values": [42, -7, 0, 1.1, 1.5, -0.0_f64]});
        let out = canonicalize_value(&v);
        let s = std::str::from_utf8(&out).unwrap();
        // -0.0 must become 0
        assert!(!s.contains("-0"), "found '-0' in: {s}");
        // Pin the exact bytes so a regression elsewhere in the array is
        // caught too, not only a stray "-0".
        assert_eq!(s, r#"{"values":[42,-7,0,1.1,1.5,0]}"#);
    }

    #[test]
    fn unicode_as_is() {
        let v = json!({"title": "café"});
        let out = canonicalize_value(&v);
        assert_eq!(out, "{\"title\":\"café\"}".as_bytes());
    }

    #[test]
    fn escapes_quotes_backslashes_and_controls() {
        let v = json!("a\"b\\c\nd\re\tf\u{01}");
        let s = String::from_utf8(canonicalize_value(&v)).unwrap();
        assert_eq!(s, r#""a\"b\\c\nd\re\tf\u0001""#);
    }

    #[test]
    fn empty_vs_absent() {
        let with_tags = json!({"tags": [], "v": 1});
        let without = json!({"v": 1});
        let h1 = sha256_hex(&canonicalize_value(&with_tags));
        let h2 = sha256_hex(&canonicalize_value(&without));
        assert_ne!(h1, h2, "empty array and absent field must hash differently");
    }

    #[test]
    fn minimal_body_golden_hash() {
        // Reproduces can-001 vector from schemas/conformance/can-001-jcs-vector.json
        let body = json!({
            "agent_id": "did:agent:test",
            "contributors": [],
            "data_refs": [],
            "supersedes": null,
            "title": "Minimal",
            "type": "data_snapshot",
            "version": 1
        });
        let h = sha256_hex(&canonicalize_value(&body));
        assert_eq!(
            h,
            "5f8d88d6758cfd43be875d49edc9eaa494de8ec645bf7de6c592b15bbb1e2e3c"
        );
    }

    #[test]
    fn depth_ceiling_is_reported_as_error() {
        // The root is walked at depth 0, so `MAX_JCS_DEPTH` nested arrays put
        // the innermost value exactly at the ceiling (accepted); one more
        // level pushes it past the ceiling (rejected, not a stack overflow).
        assert!(try_canonicalize_value(&nested_arrays(MAX_JCS_DEPTH)).is_ok());
        assert!(try_canonicalize_value(&nested_arrays(MAX_JCS_DEPTH + 1)).is_err());
    }

    // ── RFC 8785 numeric serialization vectors (Appendix B subset) ──────
    //
    // RFC 8785 §3.2.2.3 / Appendix B pin the serialization of JSON
    // numbers. ACDP wire bodies mostly carry *integers* (version numbers,
    // counts) and the occasional plain decimal, so those cases are pinned
    // first, together with the negative-zero rule that is the most common
    // JCS bug. The forms where ECMAScript `Number::toString` diverges from
    // serde_json's shortest-float Display (exponential bands such as
    // `1e21`, integer-valued floats such as `1.0`) are handled by
    // `write_number` and covered by `rfc8785_ecmascript_float_bands` and
    // `rfc8785_all_zeros_normalize` below.

    /// Helper: canonicalize a single JSON number token (parsed from
    /// text, so integers stay integers) and return the emitted string.
    fn canon_number(json_token: &str) -> String {
        let v: serde_json::Value = serde_json::from_str(json_token).unwrap();
        String::from_utf8(canonicalize_value(&v)).unwrap()
    }

    #[test]
    fn rfc8785_integer_vectors() {
        // Integers serialize with no decimal point, no leading zeros,
        // no plus sign — exactly their canonical decimal form.
        for (input, expected) in [
            ("0", "0"),
            ("-0", "0"), // negative-zero *integer* normalizes to "0"
            ("1", "1"),
            ("-1", "-1"),
            ("100", "100"),
            ("9007199254740992", "9007199254740992"), // 2^53
            ("9007199254740993", "9007199254740993"), // 2^53 + 1 (exact as i64)
            ("18446744073709551615", "18446744073709551615"), // u64::MAX
            ("-9223372036854775808", "-9223372036854775808"), // i64::MIN
        ] {
            assert_eq!(canon_number(input), expected, "input={input}");
        }
    }

    #[test]
    fn rfc8785_negative_zero_float_becomes_zero() {
        // RFC 8785 §3.2.2.3: -0.0 MUST serialize as "0".
        assert_eq!(canon_number("-0.0"), "0");
        // And nested inside a structure (the realistic case). The other
        // entry is a plain integer so this test isolates the negative-zero
        // rule; positive `0.0` is pinned by `rfc8785_all_zeros_normalize`.
        let v = json!({"a": [-0.0_f64, 1], "b": -0.0_f64});
        let s = String::from_utf8(canonicalize_value(&v)).unwrap();
        assert_eq!(s, r#"{"a":[0,1],"b":0}"#);
    }

    #[test]
    fn rfc8785_plain_decimal_vectors() {
        // Plain decimals whose shortest representation is unambiguous and
        // identical under ES6 and serde_json's Display.
        for (input, expected) in [
            ("0.1", "0.1"),
            ("1.5", "1.5"),
            ("-2.5", "-2.5"),
            ("123.456", "123.456"),
        ] {
            assert_eq!(canon_number(input), expected, "input={input}");
        }
    }

    #[test]
    fn rfc8785_numeric_serialization_is_idempotent() {
        // Re-canonicalizing the emitted form reproduces it byte-for-byte
        // (no drift across a parse → serialize round trip).
        for token in ["0", "-0", "42", "9007199254740993", "0.1", "-2.5", "-0.0"] {
            let once = canon_number(token);
            let twice = canon_number(&once);
            assert_eq!(once, twice, "token={token}");
        }
    }

    /// RFC 8785 §3.2.2.3 float serialization — the `can-011` numeric
    /// bands, now that ECMAScript `Number::toString` is implemented in
    /// `write_number`. These canonical tokens are fixed by the algorithm,
    /// so they hold regardless of the spec fixture's own SHA-256 values.
    #[test]
    fn rfc8785_ecmascript_float_bands() {
        for (token, expected) in [
            // Large-magnitude exponential (≥ 1e21).
            ("1e21", "1e+21"),
            ("1e22", "1e+22"),
            ("1.23e25", "1.23e+25"),
            ("1e100", "1e+100"),
            // Small-magnitude exponential (< 1e-6).
            ("1e-7", "1e-7"),
            ("1e-10", "1e-10"),
            ("5e-9", "5e-9"),
            ("1e-20", "1e-20"),
            // Decimal band [1e-6, 1e21).
            ("1e-6", "0.000001"),
            ("0.1", "0.1"),
            ("1000000.5", "1000000.5"),
            ("12345.6789", "12345.6789"),
            // Integer-valued floats normalize like integers (no trailing .0).
            ("1.0", "1"),
            ("100.0", "100"),
            // IEEE 754 magnitude extremes.
            ("1.7976931348623157e308", "1.7976931348623157e+308"),
            ("5e-324", "5e-324"),
        ] {
            assert_eq!(canon_number(token), expected, "token={token}");
        }
    }

    /// Positive and negative zero — including the float and exponential
    /// spellings — all canonicalize to "0" (RFC 8785 §3.2.2.3).
    #[test]
    fn rfc8785_all_zeros_normalize() {
        for token in ["0", "-0", "0.0", "-0.0", "0e0", "-0.0e10"] {
            assert_eq!(canon_number(token), "0", "token={token}");
        }
    }
}