Skip to main content

refget_digest/
lib.rs

//! Digest computation for GA4GH refget: SHA-512 truncated to 24 bytes and
//! base64url-encoded (`sha512t24u`), plus RFC 8785 JSON Canonicalization Scheme (JCS).
3
4use base64::Engine;
5use base64::engine::general_purpose::URL_SAFE_NO_PAD;
6use sha2::{Digest, Sha512};
7
8/// Compute the GA4GH sha512t24u digest of the given data.
9///
10/// This truncates the SHA-512 hash to 24 bytes and encodes it as base64url
11/// without padding, producing a 32-character string.
12pub fn sha512t24u(data: &[u8]) -> String {
13    let hash = Sha512::digest(data);
14    let truncated = &hash[..24];
15    URL_SAFE_NO_PAD.encode(truncated)
16}
17
18/// Canonicalize a JSON value according to RFC 8785 (JCS).
19///
20/// This produces a deterministic byte representation suitable for hashing.
21/// Key ordering is lexicographic by Unicode code point, numbers use the
22/// shortest representation, and no whitespace is added.
23pub fn jcs_canonicalize(value: &serde_json::Value) -> Vec<u8> {
24    let mut buf = Vec::new();
25    write_canonical(value, &mut buf);
26    buf
27}
28
29/// Canonicalize then compute sha512t24u of a JSON value.
30pub fn digest_json(value: &serde_json::Value) -> String {
31    let canonical = jcs_canonicalize(value);
32    sha512t24u(&canonical)
33}
34
35fn write_canonical(value: &serde_json::Value, buf: &mut Vec<u8>) {
36    match value {
37        serde_json::Value::Null => buf.extend_from_slice(b"null"),
38        serde_json::Value::Bool(b) => {
39            if *b {
40                buf.extend_from_slice(b"true");
41            } else {
42                buf.extend_from_slice(b"false");
43            }
44        }
45        serde_json::Value::Number(n) => {
46            // RFC 8785: use the shortest representation.
47            // serde_json's Display for Number already produces the right format
48            // for integers. For floats, we need to ensure no trailing zeros.
49            let s = n.to_string();
50            buf.extend_from_slice(s.as_bytes());
51        }
52        serde_json::Value::String(s) => {
53            write_canonical_string(s, buf);
54        }
55        serde_json::Value::Array(arr) => {
56            buf.push(b'[');
57            for (i, item) in arr.iter().enumerate() {
58                if i > 0 {
59                    buf.push(b',');
60                }
61                write_canonical(item, buf);
62            }
63            buf.push(b']');
64        }
65        serde_json::Value::Object(obj) => {
66            // RFC 8785: keys sorted by UTF-16 code units. For ASCII-only keys
67            // (common in refget), this is equivalent to lexicographic byte order.
68            // For full correctness, we sort by UTF-16 encoding.
69            let mut keys: Vec<&String> = obj.keys().collect();
70            keys.sort_by(|a, b| cmp_utf16(a, b));
71
72            buf.push(b'{');
73            for (i, key) in keys.iter().enumerate() {
74                if i > 0 {
75                    buf.push(b',');
76                }
77                write_canonical_string(key, buf);
78                buf.push(b':');
79                write_canonical(&obj[*key], buf);
80            }
81            buf.push(b'}');
82        }
83    }
84}
85
/// Order two strings by their UTF-16 code unit sequences, which is the key
/// ordering RFC 8785 requires.
///
/// Code units are compared pairwise from the start; the first difference
/// decides the result. If one sequence is a prefix of the other, the shorter
/// one sorts first.
fn cmp_utf16(a: &str, b: &str) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let mut left_units = a.encode_utf16();
    let mut right_units = b.encode_utf16();
    loop {
        match (left_units.next(), right_units.next()) {
            (None, None) => return Ordering::Equal,
            (None, Some(_)) => return Ordering::Less,
            (Some(_), None) => return Ordering::Greater,
            (Some(left), Some(right)) => match left.cmp(&right) {
                Ordering::Equal => {}
                decided => return decided,
            },
        }
    }
}
90
/// Append `s` to `buf` as a double-quoted, JSON-escaped string.
///
/// The quote and backslash characters and the control characters that have a
/// two-character escape (`\b`, `\f`, `\n`, `\r`, `\t`) use that short form.
/// Any other character below U+0020 is written as `\u00xx` with lowercase hex
/// digits. Everything else is copied through as raw UTF-8.
fn write_canonical_string(s: &str, buf: &mut Vec<u8>) {
    buf.push(b'"');
    for ch in s.chars() {
        let short_escape: &[u8] = match ch {
            '"' => b"\\\"",
            '\\' => b"\\\\",
            '\u{08}' => b"\\b",
            '\u{0C}' => b"\\f",
            '\n' => b"\\n",
            '\r' => b"\\r",
            '\t' => b"\\t",
            '\0'..='\u{1F}' => {
                // Remaining control characters have no short form.
                let escaped = format!("\\u{:04x}", u32::from(ch));
                buf.extend_from_slice(escaped.as_bytes());
                continue;
            }
            _ => {
                let mut scratch = [0u8; 4];
                buf.extend_from_slice(ch.encode_utf8(&mut scratch).as_bytes());
                continue;
            }
        };
        buf.extend_from_slice(short_escape);
    }
    buf.push(b'"');
}
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse a JSON fixture, panicking if the test input itself is malformed.
    fn parse(text: &str) -> serde_json::Value {
        serde_json::from_str(text).expect("test fixture must be valid JSON")
    }

    /// Canonicalize `value` and return the canonical bytes as text.
    fn canonical(value: &serde_json::Value) -> String {
        String::from_utf8(jcs_canonicalize(value)).expect("canonical JSON must be valid UTF-8")
    }

    /// True when every character belongs to the base64url alphabet (no padding).
    fn is_base64url(text: &str) -> bool {
        text.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
    }

    #[test]
    fn test_sha512t24u_empty() {
        // SHA-512 of the empty string, truncated to 24 bytes, base64url-encoded.
        // The leading 24 bytes of SHA-512("") are cf83e135 7eefb8bd f1542850
        // d66d8007 d620e405 0b5715dc.
        let result = sha512t24u(b"");
        assert_eq!(result.len(), 32);
        assert_eq!(result, "z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc");
    }

    #[test]
    fn test_sha512t24u_spec_vector() {
        // GA4GH canonical test vector: ACGT → aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2
        let result = sha512t24u(b"ACGT");
        assert_eq!(result, "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2");
    }

    #[test]
    fn test_sha512t24u_known_value() {
        let digest = sha512t24u(b"ACGT");
        assert_eq!(digest.len(), 32);
        // Verify it's valid base64url
        assert!(is_base64url(&digest));
    }

    #[test]
    fn test_jcs_canonicalize_object_key_order() {
        let json = parse(r#"{"b":2,"a":1}"#);
        assert_eq!(canonical(&json), r#"{"a":1,"b":2}"#);
    }

    #[test]
    fn test_jcs_canonicalize_nested() {
        let json = parse(r#"{"z":{"b":2,"a":1},"a":"hello"}"#);
        assert_eq!(canonical(&json), r#"{"a":"hello","z":{"a":1,"b":2}}"#);
    }

    #[test]
    fn test_jcs_canonicalize_array() {
        // Array element order is significant and must be preserved.
        let json = parse(r#"[3,1,2]"#);
        assert_eq!(canonical(&json), "[3,1,2]");
    }

    #[test]
    fn test_jcs_string_escaping() {
        let json = serde_json::Value::String("hello\nworld".to_string());
        assert_eq!(canonical(&json), r#""hello\nworld""#);
    }

    #[test]
    fn test_digest_json() {
        let json = parse(r#"{"a":1}"#);
        let digest = digest_json(&json);
        assert_eq!(digest.len(), 32);
        // Should be deterministic
        assert_eq!(digest, digest_json(&json));
    }

    #[test]
    fn test_jcs_primitives() {
        assert_eq!(canonical(&serde_json::Value::Null), "null");
        assert_eq!(canonical(&serde_json::Value::Bool(true)), "true");
        assert_eq!(canonical(&serde_json::Value::Bool(false)), "false");
    }

    #[test]
    fn test_sha512t24u_large_input() {
        // 1 MB of repeated bytes should still produce exactly 32 chars
        let data = vec![0xABu8; 1_000_000];
        let result = sha512t24u(&data);
        assert_eq!(result.len(), 32);
        assert!(is_base64url(&result));
        // Determinism
        assert_eq!(result, sha512t24u(&data));
    }

    #[test]
    fn test_jcs_empty_object_and_array() {
        assert_eq!(canonical(&parse("{}")), "{}");
        assert_eq!(canonical(&parse("[]")), "[]");
    }

    #[test]
    fn test_jcs_numbers() {
        assert_eq!(canonical(&parse("0")), "0");
        assert_eq!(canonical(&parse("-1")), "-1");
        assert_eq!(canonical(&parse("1.5")), "1.5");
        assert_eq!(canonical(&parse("9007199254740992")), "9007199254740992");
    }

    #[test]
    fn test_jcs_string_all_escape_sequences() {
        // Build a string containing: quote, backslash, \b, \f, \n, \r, \t, and control char U+0001
        let input = "\"\\\x08\x0C\n\r\t\x01";
        let val = serde_json::Value::String(input.to_string());
        assert_eq!(canonical(&val), r#""\"\\\b\f\n\r\t\u0001""#);
    }

    #[test]
    fn test_jcs_unicode_emoji() {
        let val = serde_json::Value::String("\u{1F600}".to_string()); // grinning face emoji
        // Emoji should be passed through as raw UTF-8, not escaped
        assert_eq!(canonical(&val), "\"\u{1F600}\"");
    }

    #[test]
    fn test_jcs_deeply_nested() {
        let json = parse(r#"{"a":{"b":{"c":{"d":"deep"}}}}"#);
        assert_eq!(canonical(&json), r#"{"a":{"b":{"c":{"d":"deep"}}}}"#);
    }

    #[test]
    fn test_cmp_utf16_via_key_ordering() {
        // Pick two keys whose UTF-16 order differs from their code point (and
        // UTF-8 byte) order:
        //   * U+FB33 (Hebrew Letter Dalet with Dagesh) is the single UTF-16 unit 0xFB33.
        //   * U+1F600 (emoji) is the surrogate pair 0xD83D 0xDE00.
        // By code point U+FB33 < U+1F600, but 0xD83D < 0xFB33, so in UTF-16
        // order the emoji key must come first.
        let json = parse(r#"{"\uFB33":1,"\uD83D\uDE00":2}"#);
        assert_eq!(canonical(&json), "{\"\u{1F600}\":2,\"\u{FB33}\":1}");
    }

    #[test]
    fn test_digest_json_different_objects() {
        let a = parse(r#"{"key":"value1"}"#);
        let b = parse(r#"{"key":"value2"}"#);
        let digest_a = digest_json(&a);
        let digest_b = digest_json(&b);
        assert_ne!(digest_a, digest_b);
        assert_eq!(digest_a.len(), 32);
        assert_eq!(digest_b.len(), 32);
    }
}