Skip to main content

acdp_jcs/
lib.rs

1//! JSON Canonicalization Scheme (JCS) — RFC 8785.
2//!
3//! Implemented inline to avoid an external dependency and to guarantee
4//! correct handling of all edge cases, especially:
//!   - Object key sorting (RFC 8785 §3.2.3 UTF-16 code-unit order; all
//!     ACDP keys are ASCII, where this coincides with byte/`str` order)
7//!   - No whitespace
8//!   - Negative zero (`-0.0`) MUST become `0`  (the most common bug)
9//!   - Non-ASCII characters emitted as-is, not `\uXXXX`-escaped
10
11use std::io::Write;
12
13use acdp_primitives::AcdpError;
14use serde::Serialize;
15
/// Hard recursion ceiling for the JCS walker. Far above any real ACDP
/// body (metadata depth is capped at 8) and above serde_json's default
/// 128-level parse limit, so a value that parsed off the wire can never
/// hit it — the wire/golden-vector form is unchanged. The cap only
/// guards against stack overflow from a pathologically deep
/// programmatically-built `Value` (defense-in-depth, RFC-ACDP P1-3).
///
/// `write_value` starts the root at depth 0 and fails only once
/// `depth > MAX_JCS_DEPTH`, so a value sitting exactly `MAX_JCS_DEPTH`
/// containers below the root is still accepted.
const MAX_JCS_DEPTH: usize = 256;
23
24/// Canonicalize any serializable value to JCS bytes.
25///
26/// The returned bytes are the canonical UTF-8 JSON representation.
27pub fn canonicalize<T: Serialize>(value: &T) -> Result<Vec<u8>, AcdpError> {
28    let v = serde_json::to_value(value).map_err(|e| AcdpError::Canonicalization(e.to_string()))?;
29    try_canonicalize_value(&v)
30}
31
32/// Canonicalize a pre-parsed `serde_json::Value`, returning an error if
33/// nesting exceeds the internal recursion ceiling (`MAX_JCS_DEPTH`).
34/// Prefer this on any path that may canonicalize untrusted /
35/// programmatically-built input.
36pub fn try_canonicalize_value(value: &serde_json::Value) -> Result<Vec<u8>, AcdpError> {
37    let mut out = Vec::with_capacity(256);
38    write_value(value, &mut out, 0)?;
39    Ok(out)
40}
41
42/// Canonicalize a pre-parsed `serde_json::Value`.
43///
44/// Infallible back-compat wrapper. Panics only on input nested past the
45/// internal recursion ceiling (`MAX_JCS_DEPTH`, unreachable from parsed
46/// wire data); callers handling untrusted input should use
47/// [`try_canonicalize_value`].
48pub fn canonicalize_value(value: &serde_json::Value) -> Vec<u8> {
49    try_canonicalize_value(value)
50        .expect("JCS canonicalization exceeded depth limit; use try_canonicalize_value")
51}
52
53fn write_value(v: &serde_json::Value, out: &mut Vec<u8>, depth: usize) -> Result<(), AcdpError> {
54    if depth > MAX_JCS_DEPTH {
55        return Err(AcdpError::Canonicalization(format!(
56            "JSON nesting depth exceeds {MAX_JCS_DEPTH}"
57        )));
58    }
59    match v {
60        serde_json::Value::Null => out.extend_from_slice(b"null"),
61        serde_json::Value::Bool(true) => out.extend_from_slice(b"true"),
62        serde_json::Value::Bool(false) => out.extend_from_slice(b"false"),
63        serde_json::Value::Number(n) => write_number(n, out),
64        serde_json::Value::String(s) => write_string(s, out),
65        serde_json::Value::Array(arr) => {
66            out.push(b'[');
67            for (i, elem) in arr.iter().enumerate() {
68                if i > 0 {
69                    out.push(b',');
70                }
71                write_value(elem, out, depth + 1)?;
72            }
73            out.push(b']');
74        }
75        serde_json::Value::Object(map) => {
76            // Sort keys in RFC 8785 §3.2.1 UTF-16 code-unit order. ACDP
77            // keys are ASCII, where Rust's `str` (byte/scalar) ordering
78            // coincides with UTF-16 code-unit ordering.
79            let mut keys: Vec<&String> = map.keys().collect();
80            keys.sort();
81            out.push(b'{');
82            for (i, key) in keys.iter().enumerate() {
83                if i > 0 {
84                    out.push(b',');
85                }
86                write_string(key, out);
87                out.push(b':');
88                write_value(&map[key.as_str()], out, depth + 1)?;
89            }
90            out.push(b'}');
91        }
92    }
93    Ok(())
94}
95
96fn write_number(n: &serde_json::Number, out: &mut Vec<u8>) {
97    // Integer `Number`s (i64 / u64) are already canonical — serde_json prints
98    // the exact digits with no decimal point and no exponent, exactly what
99    // RFC 8785 requires. Only floats need the ECMAScript reformatting below.
100    if n.is_i64() || n.is_u64() {
101        out.extend_from_slice(n.to_string().as_bytes());
102        return;
103    }
104
105    // Float path. `as_f64` is `Some` for any non-integer `Number`; the `None`
106    // arm is unreachable but kept total rather than panicking.
107    let Some(f) = n.as_f64() else {
108        out.extend_from_slice(n.to_string().as_bytes());
109        return;
110    };
111
112    // RFC 8785 §3.2.2.3: both negative and positive zero serialize as "0".
113    if f == 0.0 {
114        out.push(b'0');
115        return;
116    }
117
118    // JSON cannot represent NaN or Infinity. `serde_json::Number::from_f64`
119    // rejects these and this crate does not enable `arbitrary_precision`, so a
120    // non-finite `Number` cannot be built through the safe API — unreachable on
121    // parsed input. Refuse it loudly in debug/test builds; the `null` fallback
122    // is a release-only last resort so canonicalization stays total (emitting
123    // `null` would corrupt the hash preimage). Producers with custom numeric
124    // paths MUST reject non-finite floats *before* canonicalization.
125    debug_assert!(
126        f.is_finite(),
127        "non-finite f64 reached JCS canonicalization ({f}); reject \
128         non-finite numbers before hashing (RFC 8785 §3.2.2.3)"
129    );
130    if !f.is_finite() {
131        out.extend_from_slice(b"null");
132        return;
133    }
134
135    out.extend_from_slice(ecma_number_string(f).as_bytes());
136}
137
/// Render a finite, non-zero `f64` the way ECMAScript `Number::toString`
/// does, which is the format RFC 8785 §3.2.2.3 mandates.
///
/// The shortest round-tripping significant digits are taken from Rust's
/// `{:e}` formatter (`d.ddde±EE`) and then laid out per the ES6 bands:
/// plain decimal for magnitudes in `[1e-6, 1e21)`, exponential otherwise.
/// The exponent always carries an explicit sign and no zero padding, and
/// the mantissa never ends in `.0`.
///
/// # Panics
///
/// Panics if `{:e}` output has no `e` separator, as happens for NaN and
/// infinities; callers must filter those out first.
fn ecma_number_string(f: f64) -> String {
    // Scientific form of the magnitude, e.g. "1.23e25", "5e-324", "1e21".
    let sci = format!("{:e}", f.abs());
    let (mantissa, exp) = sci.split_once('e').expect("{:e} always emits 'e'");
    let e10: i32 = exp.parse().expect("{:e} exponent is an integer");

    // Significant digits with the decimal point and trailing zeros removed.
    let mut digits: String = mantissa.chars().filter(|c| *c != '.').collect();
    digits.truncate(digits.trim_end_matches('0').len());
    if digits.is_empty() {
        digits.push('0');
    }

    let count = digits.len() as i32; // number of significant digits
    let point = e10 + 1; // value = digits × 10^(point − count)

    let mut text = String::with_capacity(digits.len() + 8);
    if f.is_sign_negative() {
        text.push('-');
    }

    match point {
        // Integer-valued: every digit, then zero-fill up to the point.
        p if (count..=21).contains(&p) => {
            text.push_str(&digits);
            text.push_str(&"0".repeat((p - count) as usize));
        }
        // The decimal point lands inside the digit run (point < count).
        1..=21 => {
            let (whole, frac) = digits.split_at(point as usize);
            text.push_str(whole);
            text.push('.');
            text.push_str(frac);
        }
        // Small fraction: "0.", then −point zeros, then the digits.
        -5..=0 => {
            text.push_str("0.");
            text.push_str(&"0".repeat((-point) as usize));
            text.push_str(&digits);
        }
        // Exponential form: `d[.ddd]e±X`, fraction only when it exists.
        _ => {
            let (lead, rest) = digits.split_at(1);
            text.push_str(lead);
            if !rest.is_empty() {
                text.push('.');
                text.push_str(rest);
            }
            let exponent = point - 1;
            text.push('e');
            text.push(exp_sign(exponent));
            text.push_str(&exponent.abs().to_string());
        }
    }

    text
}

/// Sign character for an ECMAScript exponent: `'-'` when `e` is negative,
/// `'+'` otherwise. RFC 8785 output always spells the sign out
/// (`1e+21`, `1e-7`).
fn exp_sign(e: i32) -> char {
    if e < 0 {
        '-'
    } else {
        '+'
    }
}
198
/// Append `s` to `out` as a JSON string literal, escaped exactly as
/// RFC 8785 §3.2.2.2 prescribes:
///
/// - `"` and `\` become `\"` and `\\`;
/// - U+0008, U+0009, U+000A, U+000C and U+000D use the short forms
///   `\b`, `\t`, `\n`, `\f` and `\r`;
/// - every other control character below U+0020 becomes `\u00xx` with
///   lowercase hex digits;
/// - everything else, including DEL and all non-ASCII text, is copied
///   through unchanged as UTF-8.
fn write_string(s: &str, out: &mut Vec<u8>) {
    let bytes = s.as_bytes();
    out.reserve(bytes.len() + 2);
    out.push(b'"');

    // Every byte that needs escaping is ASCII, and ASCII bytes never occur
    // inside a multi-byte UTF-8 sequence, so scanning bytes is safe and lets
    // unescaped runs be copied in bulk.
    let mut run_start = 0;
    for (i, &b) in bytes.iter().enumerate() {
        let short_escape: &[u8] = match b {
            b'"' => b"\\\"",
            b'\\' => b"\\\\",
            0x08 => b"\\b",
            b'\t' => b"\\t",
            b'\n' => b"\\n",
            0x0c => b"\\f",
            b'\r' => b"\\r",
            0x00..=0x1f => {
                // Remaining control characters have no short form.
                out.extend_from_slice(&bytes[run_start..i]);
                write!(out, "\\u{b:04x}").expect("writing to a Vec<u8> cannot fail");
                run_start = i + 1;
                continue;
            }
            _ => continue,
        };
        out.extend_from_slice(&bytes[run_start..i]);
        out.extend_from_slice(short_escape);
        run_start = i + 1;
    }
    out.extend_from_slice(&bytes[run_start..]);

    out.push(b'"');
}
222
223// ── Tests ─────────────────────────────────────────────────────────────────────
224
// Unit tests for the canonicalizer: key ordering, string passthrough,
// hash-level golden vectors, and RFC 8785 number formatting.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Object members are emitted in sorted key order with no whitespace.
    #[test]
    fn sorts_keys() {
        let v = json!({"z": 1, "a": 2, "m": 3});
        let out = canonicalize_value(&v);
        assert_eq!(out, b"{\"a\":2,\"m\":3,\"z\":1}");
    }

    #[test]
    fn negative_zero_becomes_zero() {
        // The critical RFC 8785 edge case
        let v = json!({"values": [42, -7, 0, 1.1, 1.5, -0.0_f64]});
        let out = canonicalize_value(&v);
        let s = std::str::from_utf8(&out).unwrap();
        // -0.0 must become 0
        assert!(!s.contains("-0"), "found '-0' in: {s}");
    }

    // Non-ASCII text is written as raw UTF-8, not `\uXXXX` escapes.
    #[test]
    fn unicode_as_is() {
        let v = json!({"title": "café"});
        let out = canonicalize_value(&v);
        assert_eq!(out, "{\"title\":\"café\"}".as_bytes());
    }

    // An explicitly empty array is part of the canonical form, so it must
    // not hash the same as a body that omits the field.
    #[test]
    fn empty_vs_absent() {
        let with_tags = json!({"tags": [], "v": 1});
        let without = json!({"v": 1});
        let h1 = {
            use sha2::{Digest, Sha256};
            hex::encode(Sha256::digest(canonicalize_value(&with_tags)))
        };
        let h2 = {
            use sha2::{Digest, Sha256};
            hex::encode(Sha256::digest(canonicalize_value(&without)))
        };
        assert_ne!(h1, h2, "empty array and absent field must hash differently");
    }

    #[test]
    fn minimal_body_golden_hash() {
        // Reproduces can-001 vector from schemas/conformance/can-001-jcs-vector.json
        let body = json!({
            "agent_id": "did:agent:test",
            "contributors": [],
            "data_refs": [],
            "supersedes": null,
            "title": "Minimal",
            "type": "data_snapshot",
            "version": 1
        });
        use sha2::{Digest, Sha256};
        let h = hex::encode(Sha256::digest(canonicalize_value(&body)));
        assert_eq!(
            h,
            "5f8d88d6758cfd43be875d49edc9eaa494de8ec645bf7de6c592b15bbb1e2e3c"
        );
    }

    // ── RFC 8785 numeric serialization vectors (Appendix B subset) ──────
    //
    // RFC 8785 §3.2.2.3 / Appendix B pin the serialization of JSON
    // numbers. The tests below first pin the forms described as typical
    // for ACDP wire bodies — integers (version numbers, counts) and
    // plain decimals — plus the negative-zero rule that is the most
    // common JCS bug. The exponential and integer-valued-float forms
    // (e.g. `1e21`, `1.0`), where ECMAScript `Number::toString` output
    // diverges from serde_json's shortest-float Display, are handled by
    // `write_number` and covered by `rfc8785_ecmascript_float_bands`
    // below.

    /// Helper: canonicalize a single JSON number token (parsed from
    /// text, so integers stay integers) and return the emitted string.
    fn canon_number(json_token: &str) -> String {
        let v: serde_json::Value = serde_json::from_str(json_token).unwrap();
        String::from_utf8(canonicalize_value(&v)).unwrap()
    }

    #[test]
    fn rfc8785_integer_vectors() {
        // Integers serialize with no decimal point, no leading zeros,
        // no plus sign — exactly their canonical decimal form.
        for (input, expected) in [
            ("0", "0"),
            ("-0", "0"), // integer-spelled negative zero normalizes to "0"
            ("1", "1"),
            ("-1", "-1"),
            ("100", "100"),
            ("9007199254740992", "9007199254740992"), // 2^53
            ("9007199254740993", "9007199254740993"), // 2^53 + 1 (exact as i64)
            ("18446744073709551615", "18446744073709551615"), // u64::MAX
            ("-9223372036854775808", "-9223372036854775808"), // i64::MIN
        ] {
            assert_eq!(canon_number(input), expected, "input={input}");
        }
    }

    #[test]
    fn rfc8785_negative_zero_float_becomes_zero() {
        // RFC 8785 §3.2.2.3: -0.0 MUST serialize as "0".
        assert_eq!(canon_number("-0.0"), "0");
        // And nested inside a structure (the realistic case). The only
        // other entry is a plain integer, so the expected output
        // exercises nothing but the negative-zero rule.
        let v = json!({"a": [-0.0_f64, 1], "b": -0.0_f64});
        let s = String::from_utf8(canonicalize_value(&v)).unwrap();
        assert_eq!(s, r#"{"a":[0,1],"b":0}"#);
    }

    #[test]
    fn rfc8785_plain_decimal_vectors() {
        // Plain decimals whose shortest representation is unambiguous and
        // identical under ES6 and serde_json's Display.
        for (input, expected) in [
            ("0.1", "0.1"),
            ("1.5", "1.5"),
            ("-2.5", "-2.5"),
            ("123.456", "123.456"),
        ] {
            assert_eq!(canon_number(input), expected, "input={input}");
        }
    }

    #[test]
    fn rfc8785_numeric_serialization_is_idempotent() {
        // Re-canonicalizing the emitted form reproduces it byte-for-byte
        // (no drift across a parse → serialize round trip).
        for token in ["0", "-0", "42", "9007199254740993", "0.1", "-2.5", "-0.0"] {
            let once = canon_number(token);
            let twice = canon_number(&once);
            assert_eq!(once, twice, "token={token}");
        }
    }

    /// RFC 8785 §3.2.2.3 float serialization — the `can-011` numeric
    /// bands, now that ECMAScript `Number::toString` is implemented in
    /// `write_number`. These canonical tokens are fixed by the algorithm,
    /// so they hold regardless of the spec fixture's own SHA-256 values.
    #[test]
    fn rfc8785_ecmascript_float_bands() {
        for (token, expected) in [
            // Large-magnitude exponential (≥ 1e21).
            ("1e21", "1e+21"),
            ("1e22", "1e+22"),
            ("1.23e25", "1.23e+25"),
            ("1e100", "1e+100"),
            // Small-magnitude exponential (< 1e-6).
            ("1e-7", "1e-7"),
            ("1e-10", "1e-10"),
            ("5e-9", "5e-9"),
            ("1e-20", "1e-20"),
            // Decimal band [1e-6, 1e21).
            ("1e-6", "0.000001"),
            ("0.1", "0.1"),
            ("1000000.5", "1000000.5"),
            ("12345.6789", "12345.6789"),
            // Integer-valued floats normalize like integers (no trailing .0).
            ("1.0", "1"),
            ("100.0", "100"),
            // IEEE 754 magnitude extremes.
            ("1.7976931348623157e308", "1.7976931348623157e+308"),
            ("5e-324", "5e-324"),
        ] {
            assert_eq!(canon_number(token), expected, "token={token}");
        }
    }

    /// Positive and negative zero — including the float and exponential
    /// spellings — all canonicalize to "0" (RFC 8785 §3.2.2.3).
    #[test]
    fn rfc8785_all_zeros_normalize() {
        for token in ["0", "-0", "0.0", "-0.0", "0e0", "-0.0e10"] {
            assert_eq!(canon_number(token), "0", "token={token}");
        }
    }
}