//! Deterministic canonicalization for JSON and URL-encoded payloads.
//!
//! This module ensures byte-identical output across all platforms and implementations.

use serde_json::Value;
use unicode_normalization::UnicodeNormalization;

use crate::errors::{AshError, AshErrorCode};
10/// Canonicalize a JSON string to deterministic form.
11///
12/// # Canonicalization Rules
13///
14/// 1. **Minified**: No whitespace between elements
15/// 2. **Key Ordering**: Object keys sorted lexicographically (ascending)
16/// 3. **Array Order**: Preserved (arrays are ordered)
17/// 4. **Unicode**: NFC normalization applied to all strings
18/// 5. **Numbers**:
19///    - No scientific notation
20///    - No trailing zeros after decimal
21///    - `-0` becomes `0`
22/// 6. **Unsupported Values**: `NaN`, `Infinity` cause rejection
23///
24/// # Example
25///
26/// ```rust
27/// use ash_core::canonicalize_json;
28///
29/// let input = r#"{ "z": 1, "a": { "c": 3, "b": 2 } }"#;
30/// let output = canonicalize_json(input).unwrap();
31/// assert_eq!(output, r#"{"a":{"b":2,"c":3},"z":1}"#);
32/// ```
33///
34/// # Errors
35///
36/// Returns `AshError` with `CanonicalizationFailed` if:
37/// - Input is not valid JSON
38/// - JSON contains unsupported values (NaN, Infinity)
39pub fn canonicalize_json(input: &str) -> Result<String, AshError> {
40    // Parse JSON
41    let value: Value = serde_json::from_str(input).map_err(|e| {
42        AshError::new(
43            AshErrorCode::CanonicalizationFailed,
44            format!("Invalid JSON: {}", e),
45        )
46    })?;
47
48    // Canonicalize recursively
49    let canonical = canonicalize_value(&value)?;
50
51    // Serialize to minified JSON
52    serde_json::to_string(&canonical).map_err(|e| {
53        AshError::new(
54            AshErrorCode::CanonicalizationFailed,
55            format!("Failed to serialize: {}", e),
56        )
57    })
58}
59
60/// Recursively canonicalize a JSON value.
61fn canonicalize_value(value: &Value) -> Result<Value, AshError> {
62    match value {
63        Value::Null => Ok(Value::Null),
64        Value::Bool(b) => Ok(Value::Bool(*b)),
65        Value::Number(n) => canonicalize_number(n),
66        Value::String(s) => Ok(Value::String(canonicalize_string(s))),
67        Value::Array(arr) => {
68            let canonical: Result<Vec<Value>, AshError> =
69                arr.iter().map(canonicalize_value).collect();
70            Ok(Value::Array(canonical?))
71        }
72        Value::Object(obj) => {
73            // Sort keys lexicographically
74            let mut sorted: Vec<(&String, &Value)> = obj.iter().collect();
75            sorted.sort_by(|a, b| a.0.cmp(b.0));
76
77            let mut canonical = serde_json::Map::new();
78            for (key, val) in sorted {
79                let canonical_key = canonicalize_string(key);
80                let canonical_val = canonicalize_value(val)?;
81                canonical.insert(canonical_key, canonical_val);
82            }
83            Ok(Value::Object(canonical))
84        }
85    }
86}
87
88/// Canonicalize a number value.
89fn canonicalize_number(n: &serde_json::Number) -> Result<Value, AshError> {
90    // Check for special values that shouldn't exist in valid JSON
91    // but handle edge cases
92
93    if let Some(i) = n.as_i64() {
94        // Handle -0 case (though rare in integers)
95        if i == 0 {
96            return Ok(Value::Number(serde_json::Number::from(0)));
97        }
98        return Ok(Value::Number(serde_json::Number::from(i)));
99    }
100
101    if let Some(u) = n.as_u64() {
102        return Ok(Value::Number(serde_json::Number::from(u)));
103    }
104
105    if let Some(f) = n.as_f64() {
106        // Check for NaN and Infinity
107        if f.is_nan() {
108            return Err(AshError::new(
109                AshErrorCode::CanonicalizationFailed,
110                "NaN is not supported in ASH canonicalization",
111            ));
112        }
113        if f.is_infinite() {
114            return Err(AshError::new(
115                AshErrorCode::CanonicalizationFailed,
116                "Infinity is not supported in ASH canonicalization",
117            ));
118        }
119
120        // Handle -0
121        let f = if f == 0.0 && f.is_sign_negative() {
122            0.0
123        } else {
124            f
125        };
126
127        // Convert back to Number
128        serde_json::Number::from_f64(f)
129            .map(Value::Number)
130            .ok_or_else(|| {
131                AshError::new(
132                    AshErrorCode::CanonicalizationFailed,
133                    "Failed to canonicalize number",
134                )
135            })
136    } else {
137        Err(AshError::new(
138            AshErrorCode::CanonicalizationFailed,
139            "Unsupported number format",
140        ))
141    }
142}
143
144/// Canonicalize a string with Unicode NFC normalization.
145fn canonicalize_string(s: &str) -> String {
146    s.nfc().collect()
147}
148
149/// Canonicalize URL-encoded form data.
150///
151/// # Canonicalization Rules
152///
153/// 1. Parse key=value pairs (split on `&`, then on first `=`)
154/// 2. Percent-decode all values
155/// 3. Apply Unicode NFC normalization
156/// 4. Sort pairs by key lexicographically
157/// 5. For duplicate keys, preserve value order
158/// 6. Re-encode with percent encoding
159///
160/// # Example
161///
162/// ```rust
163/// use ash_core::canonicalize_urlencoded;
164///
165/// let input = "z=3&a=1&a=2&b=hello%20world";
166/// let output = canonicalize_urlencoded(input).unwrap();
167/// assert_eq!(output, "a=1&a=2&b=hello%20world&z=3");
168/// ```
169pub fn canonicalize_urlencoded(input: &str) -> Result<String, AshError> {
170    if input.is_empty() {
171        return Ok(String::new());
172    }
173
174    // Parse pairs
175    let mut pairs: Vec<(String, String)> = Vec::new();
176
177    for part in input.split('&') {
178        if part.is_empty() {
179            continue;
180        }
181
182        let (key, value) = match part.find('=') {
183            Some(pos) => (&part[..pos], &part[pos + 1..]),
184            None => (part, ""),
185        };
186
187        // Percent-decode
188        let decoded_key = percent_decode(key)?;
189        let decoded_value = percent_decode(value)?;
190
191        // NFC normalize
192        let normalized_key: String = decoded_key.nfc().collect();
193        let normalized_value: String = decoded_value.nfc().collect();
194
195        pairs.push((normalized_key, normalized_value));
196    }
197
198    // Sort by key (stable sort preserves order of duplicate keys)
199    pairs.sort_by(|a, b| a.0.cmp(&b.0));
200
201    // Re-encode and join
202    let encoded: Vec<String> = pairs
203        .into_iter()
204        .map(|(k, v)| format!("{}={}", percent_encode(&k), percent_encode(&v)))
205        .collect();
206
207    Ok(encoded.join("&"))
208}
209
210/// Percent-decode a string.
211fn percent_decode(input: &str) -> Result<String, AshError> {
212    let mut result = String::with_capacity(input.len());
213    let mut chars = input.chars().peekable();
214
215    while let Some(ch) = chars.next() {
216        if ch == '%' {
217            // Read two hex digits
218            let hex: String = chars.by_ref().take(2).collect();
219            if hex.len() != 2 {
220                return Err(AshError::new(
221                    AshErrorCode::CanonicalizationFailed,
222                    "Invalid percent encoding",
223                ));
224            }
225            let byte = u8::from_str_radix(&hex, 16).map_err(|_| {
226                AshError::new(
227                    AshErrorCode::CanonicalizationFailed,
228                    "Invalid percent encoding hex",
229                )
230            })?;
231            result.push(byte as char);
232        } else if ch == '+' {
233            // Plus is space in form data
234            result.push(' ');
235        } else {
236            result.push(ch);
237        }
238    }
239
240    Ok(result)
241}
242
243/// Canonicalize a URL query string according to ASH specification.
244///
245/// # Canonicalization Rules (9 MUST rules)
246///
247/// 1. MUST parse query string after `?` (or use full string if no `?`)
248/// 2. MUST split on `&` to get key=value pairs
249/// 3. MUST handle keys without values (treat as empty string)
250/// 4. MUST percent-decode all keys and values
251/// 5. MUST apply Unicode NFC normalization
252/// 6. MUST sort pairs by key lexicographically (byte order)
253/// 7. MUST preserve order of duplicate keys
254/// 8. MUST re-encode with uppercase hex (%XX)
255/// 9. MUST join with `&` separator
256///
257/// # Example
258///
259/// ```rust
260/// use ash_core::canonicalize_query;
261///
262/// let input = "z=3&a=1&b=hello%20world";
263/// let output = canonicalize_query(input).unwrap();
264/// assert_eq!(output, "a=1&b=hello%20world&z=3");
265///
266/// // With leading ?
267/// let input2 = "?z=3&a=1";
268/// let output2 = canonicalize_query(input2).unwrap();
269/// assert_eq!(output2, "a=1&z=3");
270/// ```
271pub fn canonicalize_query(input: &str) -> Result<String, AshError> {
272    // Rule 1: Remove leading ? if present
273    let query = input.strip_prefix('?').unwrap_or(input);
274
275    if query.is_empty() {
276        return Ok(String::new());
277    }
278
279    // Rule 2 & 3: Parse pairs
280    let mut pairs: Vec<(String, String)> = Vec::new();
281
282    for part in query.split('&') {
283        if part.is_empty() {
284            continue;
285        }
286
287        let (key, value) = match part.find('=') {
288            Some(pos) => (&part[..pos], &part[pos + 1..]),
289            None => (part, ""), // Rule 3: keys without values
290        };
291
292        // Rule 4: Percent-decode
293        let decoded_key = percent_decode(key)?;
294        let decoded_value = percent_decode(value)?;
295
296        // Rule 5: NFC normalize
297        let normalized_key: String = decoded_key.nfc().collect();
298        let normalized_value: String = decoded_value.nfc().collect();
299
300        pairs.push((normalized_key, normalized_value));
301    }
302
303    // Rule 6 & 7: Sort by key (stable sort preserves order of duplicate keys)
304    pairs.sort_by(|a, b| a.0.cmp(&b.0));
305
306    // Rule 8 & 9: Re-encode with uppercase hex and join
307    let encoded: Vec<String> = pairs
308        .into_iter()
309        .map(|(k, v)| {
310            format!(
311                "{}={}",
312                percent_encode_uppercase(&k),
313                percent_encode_uppercase(&v)
314            )
315        })
316        .collect();
317
318    Ok(encoded.join("&"))
319}
320
/// Percent-encode a string with uppercase hex digits.
///
/// RFC 3986 unreserved characters (`A-Z a-z 0-9 - _ . ~`) pass through
/// verbatim; every other character is emitted as the uppercase `%XX`
/// encoding of each of its UTF-8 bytes (so a space becomes `%20`).
fn percent_encode_uppercase(input: &str) -> String {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut result = String::with_capacity(input.len() * 3);
    let mut utf8 = [0u8; 4]; // stack scratch for per-char UTF-8 encoding

    for ch in input.chars() {
        match ch {
            'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '.' | '~' => {
                result.push(ch);
            }
            _ => {
                // Encode into a stack buffer and index a hex table; the
                // previous `ch.to_string()` + `format!("{:02X}")` pair
                // allocated twice per encoded character. Space (0x20)
                // naturally yields "%20" here, so it needs no special arm.
                for &byte in ch.encode_utf8(&mut utf8).as_bytes() {
                    result.push('%');
                    result.push(HEX_UPPER[usize::from(byte >> 4)] as char);
                    result.push(HEX_UPPER[usize::from(byte & 0x0F)] as char);
                }
            }
        }
    }

    result
}

/// Percent-encode a string for URL form data.
///
/// RFC 3986 unreserved characters (`A-Z a-z 0-9 - _ . ~`) pass through
/// verbatim; every other character is emitted as the uppercase `%XX`
/// encoding of each of its UTF-8 bytes. Spaces become `%20` (not `+`)
/// for cross-implementation determinism.
fn percent_encode(input: &str) -> String {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut result = String::with_capacity(input.len() * 3);
    let mut utf8 = [0u8; 4]; // stack scratch for per-char UTF-8 encoding

    for ch in input.chars() {
        match ch {
            'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '.' | '~' => {
                result.push(ch);
            }
            _ => {
                // Encode into a stack buffer and index a hex table; the
                // previous `ch.to_string()` + `format!("{:02X}")` pair
                // allocated twice per encoded character. Space (0x20)
                // naturally yields "%20" here, so it needs no special arm.
                for &byte in ch.encode_utf8(&mut utf8).as_bytes() {
                    result.push('%');
                    result.push(HEX_UPPER[usize::from(byte >> 4)] as char);
                    result.push(HEX_UPPER[usize::from(byte & 0x0F)] as char);
                }
            }
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    // JSON Canonicalization Tests

    #[test]
    fn test_canonicalize_json_simple_object() {
        let input = r#"{"z":1,"a":2}"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"a":2,"z":1}"#);
    }

    #[test]
    fn test_canonicalize_json_nested_object() {
        let input = r#"{"b":{"d":4,"c":3},"a":1}"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"a":1,"b":{"c":3,"d":4}}"#);
    }

    #[test]
    fn test_canonicalize_json_with_whitespace() {
        let input = r#"{ "z" : 1 , "a" : 2 }"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"a":2,"z":1}"#);
    }

    #[test]
    fn test_canonicalize_json_array_preserved() {
        let input = r#"{"arr":[3,1,2]}"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"arr":[3,1,2]}"#);
    }

    #[test]
    fn test_canonicalize_json_null() {
        let input = r#"{"a":null}"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"a":null}"#);
    }

    #[test]
    fn test_canonicalize_json_boolean() {
        let input = r#"{"b":true,"a":false}"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"a":false,"b":true}"#);
    }

    #[test]
    fn test_canonicalize_json_empty_object() {
        assert_eq!(canonicalize_json(r#"{}"#).unwrap(), r#"{}"#);
    }

    #[test]
    fn test_canonicalize_json_empty_array() {
        assert_eq!(canonicalize_json(r#"[]"#).unwrap(), r#"[]"#);
    }

    #[test]
    fn test_canonicalize_json_unicode() {
        // Pre-composed Unicode passes through unchanged.
        let input = r#"{"name":"café"}"#;
        let output = canonicalize_json(input).unwrap();
        assert_eq!(output, r#"{"name":"café"}"#);
    }

    #[test]
    fn test_canonicalize_json_invalid() {
        let input = r#"{"a":}"#;
        assert!(canonicalize_json(input).is_err());
    }

    // URL-Encoded Canonicalization Tests

    #[test]
    fn test_canonicalize_urlencoded_simple() {
        let output = canonicalize_urlencoded("z=3&a=1&b=2").unwrap();
        assert_eq!(output, "a=1&b=2&z=3");
    }

    #[test]
    fn test_canonicalize_urlencoded_duplicate_keys() {
        // Duplicate keys preserve value order after sorting by key.
        let output = canonicalize_urlencoded("a=2&a=1&b=3").unwrap();
        assert_eq!(output, "a=2&a=1&b=3");
    }

    #[test]
    fn test_canonicalize_urlencoded_encoded_space() {
        let output = canonicalize_urlencoded("a=hello%20world").unwrap();
        assert_eq!(output, "a=hello%20world");
    }

    #[test]
    fn test_canonicalize_urlencoded_plus_space() {
        // '+' decodes to a space, which is re-encoded as %20.
        let output = canonicalize_urlencoded("a=hello+world").unwrap();
        assert_eq!(output, "a=hello%20world");
    }

    #[test]
    fn test_canonicalize_urlencoded_empty() {
        assert_eq!(canonicalize_urlencoded("").unwrap(), "");
    }

    #[test]
    fn test_canonicalize_urlencoded_no_value() {
        // A bare key becomes "key=".
        let output = canonicalize_urlencoded("a&b=2").unwrap();
        assert_eq!(output, "a=&b=2");
    }

    #[test]
    fn test_canonicalize_urlencoded_invalid_percent() {
        // Truncated and non-hex escapes are rejected.
        assert!(canonicalize_urlencoded("a=%2").is_err());
        assert!(canonicalize_urlencoded("a=%zz").is_err());
    }

    // Query String Canonicalization Tests (previously uncovered)

    #[test]
    fn test_canonicalize_query_simple() {
        let output = canonicalize_query("z=3&a=1&b=2").unwrap();
        assert_eq!(output, "a=1&b=2&z=3");
    }

    #[test]
    fn test_canonicalize_query_strips_leading_question_mark() {
        assert_eq!(canonicalize_query("?z=3&a=1").unwrap(), "a=1&z=3");
    }

    #[test]
    fn test_canonicalize_query_empty() {
        assert_eq!(canonicalize_query("").unwrap(), "");
        assert_eq!(canonicalize_query("?").unwrap(), "");
    }

    #[test]
    fn test_canonicalize_query_no_value() {
        assert_eq!(canonicalize_query("a&b=2").unwrap(), "a=&b=2");
    }

    #[test]
    fn test_canonicalize_query_duplicate_keys_preserve_order() {
        assert_eq!(canonicalize_query("a=2&a=1&b=3").unwrap(), "a=2&a=1&b=3");
    }

    #[test]
    fn test_canonicalize_query_uppercase_hex() {
        // Rule 8: lowercase input hex is re-emitted uppercase.
        assert_eq!(canonicalize_query("a=%2f").unwrap(), "a=%2F");
    }
}