// json_digest/digest.rs

1use super::*;
2
3use tiny_keccak::Hasher;
4use unicode_normalization::UnicodeNormalization;
5
6/// Returns an [NFKD normalized] unicode representation of the input
7///
8/// [NFKD normalized]: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
9pub fn normalize_unicode(s: &str) -> String {
10    s.nfkd().collect()
11}
12
13/// Multibase-encoded hash of the provided bytes used in many places around this crate.
14///
15/// We use SHA3_256 (not Keccak) with Base-64 URL encoding.
16pub fn default_hasher(content: &[u8]) -> String {
17    // TODO we might want to use sha3 crate instead of tiny_keccak
18    let mut hasher = tiny_keccak::Sha3::v256();
19    let mut hash_output = [0u8; 32];
20    hasher.update(content);
21    hasher.finalize(&mut hash_output);
22    multibase::encode(multibase::Base::Base64Url, &hash_output)
23}
24
25/// Multibase-encoded hash of the utf8 representation of the provided string,
26/// prefixed with "cj". Character 'j' marks that a JSON value was hashed and
27/// 'c' stands for content hash.
28fn hash_str(content: &str) -> String {
29    format!("cj{}", default_hasher(content.as_bytes()))
30}
31
32/// Constructs the deterministic string representation of the provided JSON value.
33///
34/// The same JSON document can be represented in multiple ways depending on
35/// property ordering and string encoding. This canonical JSON format lists
36/// property names in ascending order by their utf8-encoded byte arrays and
37/// uses the [NFKD normalized] unicode representation of all strings
38/// (both property names and string values).
39///
40/// The function will return error if a provided JSON object has properties
41/// with the same name in different unicode normalizations.
42/// Note that creating `Value` arguments, `serde_json` accepts objects
43/// having properties with exactly the same name and keeps only the last value,
44/// ignoring all previous ones.
45///
46/// [NFKD normalized]: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
47pub fn canonical_json(data: &serde_json::Value) -> Result<String> {
48    match data {
49        serde_json::Value::Array(arr) => {
50            let mut canonical_json_items = Vec::new();
51            for item in arr {
52                canonical_json_items.push(canonical_json(item)?);
53            }
54            Ok(format!("[{}]", canonical_json_items.join(",")))
55        }
56
57        serde_json::Value::Object(obj) => {
58            let mut canonical_json_entries = Vec::new();
59            let mut keys: Vec<_> = obj.keys().collect();
60            keys.sort();
61            for key in keys {
62                ensure!(
63                    *key == normalize_unicode(key),
64                    "Data for canonical JSON serialization must contain field names normalized with Unicode NFKD"
65                );
66
67                let value = obj.get(key).expect("serde_json keys() impl error");
68                let canonical_key = canonical_json(&serde_json::Value::String(key.to_owned()))?;
69                let entry = format!("{}:{}", canonical_key, canonical_json(value)?);
70                canonical_json_entries.push(entry);
71            }
72            // NOTE: braces are escaped as double braces in Rust
73            Ok(format!("{{{}}}", canonical_json_entries.join(",")))
74        }
75
76        _ => {
77            let data_str = serde_json::to_string(data).expect("serde_json implementation error");
78            Ok(normalize_unicode(&data_str))
79        }
80    }
81}
82
83/// Replace JSON (sub)tree(s) with their multibase-encoded [Merkle-root hash] strings.
84///
85/// Argument `keep_paths` can be created using function [`split_alternatives`] as needed.
86///
87/// [`split_alternatives`]: ../json_path/fn.split_alternatives.html
88/// [Merkle-root hash]: https://en.wikipedia.org/wiki/Merkle_tree
89pub fn mask_json_subtree<'a, 'b>(
90    data: &'a serde_json::Value, keep_paths: impl AsRef<[&'b str]>,
91) -> Result<serde_json::Value> {
92    match data {
93        // NOTE path expressions are not (yet?) supported for arrays
94        serde_json::Value::Array(arr) => {
95            let mut canonical_json_items = Vec::new();
96            for item in arr {
97                let digested_item = mask_json_subtree(item, vec![])?;
98                canonical_json_items.push(serde_json::to_string(&digested_item)?);
99            }
100            let flattened_array = format!("[{}]", canonical_json_items.join(","));
101            //println!("Flattened array {} to {}", serde_json::to_string(&data)?, flattened_array);
102            let content_hash = hash_str(&flattened_array);
103            Ok(serde_json::Value::String(content_hash))
104        }
105
106        serde_json::Value::Object(obj) => {
107            // Build { head => vec![tails] } map
108            let mut keep_head_tails = HashMap::new();
109            for path in keep_paths.as_ref() {
110                let (head, tail_opt) = json_path::split_head_tail(path)?;
111                let tails = keep_head_tails.entry(head.to_owned()).or_insert_with(Vec::new);
112                if let Some(tail) = tail_opt {
113                    tails.push(tail);
114                }
115            }
116
117            let mut mask_root = true;
118            let mut canonical_json_entries = Vec::new();
119            let mut keys: Vec<_> = obj.keys().collect();
120            keys.sort();
121            for key in keys {
122                ensure!(
123                    *key == normalize_unicode(key),
124                    "Data to be digested must contain field names normalized with Unicode NFKD"
125                );
126
127                let value = obj.get(key).expect("serde_json keys() impl error");
128                if let Some(tails) = keep_head_tails.get(key) {
129                    // Found object key present in keep_paths option, skip masking current branch of tree
130                    mask_root = false;
131                    if tails.is_empty() {
132                        // This is the exact Json path to keep open, do not mask anything
133                        canonical_json_entries.push((key, value.to_owned()));
134                    } else {
135                        // This is a partial match for a Json path to keep open, recurse to mask it partially
136                        let partial_value = mask_json_subtree(value, tails)?;
137                        canonical_json_entries.push((key, partial_value));
138                    }
139                } else {
140                    // This path does not match any paths, mask it fully
141                    let fully_masked_value = mask_json_subtree(value, vec![])?;
142                    canonical_json_entries.push((key, fully_masked_value));
143                };
144            }
145
146            if mask_root {
147                let canonical_entry_strs = canonical_json_entries
148                    .iter()
149                    .filter_map(|(key, val)| {
150                        let canonical_key =
151                            canonical_json(&serde_json::Value::String((*key).to_string())).ok()?;
152                        Some(format!("{}:{}", canonical_key, serde_json::to_string(val).ok()?))
153                    })
154                    .collect::<Vec<_>>();
155                ensure!(
156                    canonical_entry_strs.len() == canonical_json_entries.len(),
157                    "Implementation error: failed to serialize JSON node entries"
158                );
159
160                // NOTE: braces are escaped as double brace in Rust
161                let flattened_object = format!("{{{}}}", canonical_entry_strs.join(","));
162
163                let content_hash = hash_str(&flattened_object);
164                Ok(serde_json::Value::String(content_hash))
165            } else {
166                let mut properties = serde_json::Map::new();
167                for (key, value) in canonical_json_entries {
168                    properties.insert(key.to_owned(), value);
169                }
170                Ok(serde_json::Value::Object(properties))
171            }
172        }
173
174        _ => Ok(data.clone()),
175    }
176}
177
178/// Convenience function to transform a JSON value into a (partially) masked JSON value.
179/// Only subtrees matching the provided JSON path pattern will be kept,
180/// all other subtrees will be masked.
181///
182/// Nearly equivalent to `mask_json_subtree(&json_value, split_alternatives(keep_paths_str))`,
183/// but always returns a string, not a `serde_json::Value`.
184pub fn selective_digest_json(
185    json_value: &serde_json::Value, keep_paths_str: &str,
186) -> Result<String> {
187    let keep_paths_vec = json_path::split_alternatives(keep_paths_str);
188    let digest_json = match &json_value {
189        serde_json::Value::Object(_obj) => mask_json_subtree(json_value, keep_paths_vec),
190        serde_json::Value::Array(_arr) => mask_json_subtree(json_value, keep_paths_vec),
191        serde_json::Value::String(_s) => Ok(json_value.to_owned()),
192        _ => bail!("Json digest is currently implemented only for composite types"),
193    }?;
194    match digest_json {
195        serde_json::Value::String(digest) => Ok(digest),
196        // TODO probably a serde_json::to_string() would be enough and faster
197        serde_json::Value::Object(_) => canonical_json(&digest_json),
198        _ => bail!("Implementation error: digest should always return a string or object"),
199    }
200}
201
202/// Convenience function calling [`selective_digest_json`] with arbitrary serializable types.
203///
204/// [`selective_digest_json`]: ./fn.selective_digest_json.html
205pub fn selective_digest_data<T: serde::Serialize>(
206    data: &T, keep_paths_str: &str,
207) -> Result<String> {
208    let json_value = serde_json::to_value(&data)?;
209    selective_digest_json(&json_value, keep_paths_str)
210}
211
212/// Convenience function calling [`selective_digest_json`] with a JSON string.
213///
214/// [`selective_digest_json`]: ./fn.selective_digest_json.html
215pub fn selective_digest_json_str(json_str: &str, keep_paths_str: &str) -> Result<String> {
216    ensure!(
217        json_str == normalize_unicode(json_str),
218        "Json string to be digested must be normalized with Unicode NFKD"
219    );
220
221    let json_value: serde_json::Value = serde_json::from_str(json_str)?;
222    selective_digest_json(&json_value, keep_paths_str)
223}
224
225const KEEP_NOTHING: &str = "";
226
227/// Convenience function for serializable types to mask the whole JSON tree into a digest, keep nothing.
228pub fn digest_data<T: serde::Serialize>(data: &T) -> Result<String> {
229    selective_digest_data(data, KEEP_NOTHING)
230}
231
232/// Convenience function for JSON strings to mask the whole JSON tree into a digest, keep nothing.
233pub fn digest_json_str(json_str: &str) -> Result<String> {
234    selective_digest_json_str(json_str, KEEP_NOTHING)
235}
236
#[cfg(test)]
mod tests {
    use super::*;
    use hex::FromHex;
    use serde::{Deserialize, Serialize};

    // Flat struct; field declaration order (b before a) deliberately differs from
    // the alphabetical key order canonical JSON emits.
    #[derive(Clone, Debug, Deserialize, Serialize)]
    struct TestData {
        b: u32,
        a: u32,
    }

    // Generic two-field wrapper used to build nested ("composite") test documents.
    #[derive(Clone, Debug, Deserialize, Serialize)]
    struct CompositeTestData<T> {
        z: Option<T>,
        y: Option<T>,
    }

    // Digesting must accept only NFKD-normalized keys and reject NFC input.
    #[test]
    fn reject_non_nfkd() -> Result<()> {
        // Same visible word in two normalizations, built from raw utf8 bytes:
        // NFC uses precomposed 'á' (c3 a1), NFKD uses 'a' + combining acute (61 cc 81).
        let key_nfc = String::from_utf8(Vec::from_hex("c3a16c6f6d")?)?;
        let key_nfkd = String::from_utf8(Vec::from_hex("61cc816c6f6d")?)?;
        assert_eq!(key_nfc, "álom");
        assert_eq!(key_nfkd, "álom");

        let str_nfc = format!("{{\"{}\": 1}}", key_nfc);
        let str_nfkd = format!("{{\"{}\": 1}}", key_nfkd);
        // Only the NFKD variant digests successfully; the NFC variant errors.
        assert_eq!(digest_json_str(&str_nfkd)?, "cjuRab8yOeLzxmFY_fEMC79cW5z9XyihRhaGnTSvMabrA8");
        assert!(digest_json_str(&str_nfc).is_err());

        // Same behavior when starting from already-parsed serde_json values.
        let json_value_nfc: serde_json::Value = serde_json::from_str(&str_nfc)?;
        let json_value_nfkd: serde_json::Value = serde_json::from_str(&str_nfkd)?;
        assert_eq!(
            selective_digest_json(&json_value_nfkd, "")?,
            "cjuRab8yOeLzxmFY_fEMC79cW5z9XyihRhaGnTSvMabrA8"
        );
        assert!(selective_digest_json(&json_value_nfc, "").is_err());
        Ok(())
    }

    // A value that is already a digest string must digest to itself
    // (strings are passed through unchanged by selective_digest_json).
    #[test]
    fn digest_string_is_idempotent() {
        let content_id = &r#""cjuzC-XxgzNMwYXtw8aMIAeS2Xjlw1hlSNKTvVtUwPuyYo""#;
        let digest_id = digest_data(content_id).unwrap();
        assert_eq!(content_id, &digest_id);
    }

    // Merkle property: digesting a composite equals digesting the same structure
    // with subtrees pre-replaced by their digest strings.
    #[test]
    fn test_json_digest() -> Result<()> {
        let test_obj = TestData { b: 1, a: 2 };
        {
            // Baseline digest of the flat object.
            let digested = digest_data(&test_obj)?;
            assert_eq!(digested, "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU");
        }
        {
            // Array of two identical objects.
            let digested = digest_data(&[&test_obj, &test_obj])?;
            assert_eq!(digested, "cjuGkDpb1HL7F8xFKDFVj3felfKZzjrJy92-108uuPixNw");
        }
        {
            // Mixing the object with its own digest string gives the same result
            // as the array of two objects above.
            let digested =
                digest_data(&(&test_obj, "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU"))?;
            assert_eq!(digested, "cjuGkDpb1HL7F8xFKDFVj3felfKZzjrJy92-108uuPixNw");
        }
        {
            // Both elements replaced by digests: still the same array digest.
            let digested = digest_data(&[
                "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU",
                "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU",
            ])?;
            assert_eq!(digested, "cjuGkDpb1HL7F8xFKDFVj3felfKZzjrJy92-108uuPixNw");
        }
        {
            // Composite with object subtrees...
            let x = &test_obj;
            let comp = CompositeTestData { z: Some(x.clone()), y: Some(x.clone()) };
            let digested = digest_data(&comp)?;
            assert_eq!(digested, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            // ...matches the composite with the subtrees' digest strings.
            let comp = CompositeTestData {
                z: Some("cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU".to_owned()),
                y: Some("cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU".to_owned()),
            };
            let digested = digest_data(&comp)?;
            assert_eq!(digested, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        Ok(())
    }

    // Keep-path patterns leave matching subtrees open while masking the rest,
    // and re-digesting the partially masked document restores the full digest.
    #[test]
    fn test_selective_digesting() -> Result<()> {
        let test_obj = TestData { b: 1, a: 2 };
        let x = &test_obj;
        let composite = CompositeTestData { z: Some(x.clone()), y: Some(x.clone()) };
        let double_complex =
            CompositeTestData { z: Some(composite.clone()), y: Some(composite.clone()) };
        let triple_complex =
            CompositeTestData { z: Some(double_complex.clone()), y: Some(double_complex.clone()) };
        {
            // Empty pattern: everything masked, equals the plain digest.
            let fully_digested = selective_digest_data(&composite, "")?;
            assert_eq!(fully_digested, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            // Keep .y open, .z becomes its digest; re-digesting recovers the original digest.
            let keep_y = selective_digest_data(&composite, ".y")?;
            assert_eq!(
                keep_y,
                r#"{"y":{"a":2,"b":1},"z":"cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU"}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_y)?;
            assert_eq!(digest_data(&val)?, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            // Symmetric case: keep .z open instead.
            let keep_z = selective_digest_data(&composite, ".z")?;
            assert_eq!(
                keep_z,
                r#"{"y":"cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU","z":{"a":2,"b":1}}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_z)?;
            assert_eq!(digest_data(&val)?, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            let digest = digest_data(&double_complex)?;
            assert_eq!(digest, "cjuQLebyl_BJipFLibhWiStDBqK5J4JZq15ehUqybfTTKA");
        }
        {
            // Nested pattern: only the .y.z subtree stays open, siblings are masked.
            let keep_yz = selective_digest_data(&double_complex, ".y.z")?;
            assert_eq!(
                keep_yz,
                r#"{"y":{"y":"cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU","z":{"a":2,"b":1}},"z":"cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ"}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_yz)?;
            assert_eq!(digest_data(&val)?, "cjuQLebyl_BJipFLibhWiStDBqK5J4JZq15ehUqybfTTKA");
        }
        {
            let digest = digest_data(&triple_complex)?;
            assert_eq!(digest, "cjuik140L3w7LCi6z1eHt7Qgwr2X65-iy8HA6zqrlUdmVk");
        }
        {
            // Alternatives (comma-separated, whitespace tolerated): keep .y.y and .z.z open.
            let keep_yz = selective_digest_data(&triple_complex, ".y.y , .z.z")?;
            assert_eq!(
                keep_yz,
                r#"{"y":{"y":{"y":{"a":2,"b":1},"z":{"a":2,"b":1}},"z":"cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ"},"z":{"y":"cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ","z":{"y":{"a":2,"b":1},"z":{"a":2,"b":1}}}}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_yz)?;
            assert_eq!(digest_data(&val)?, "cjuik140L3w7LCi6z1eHt7Qgwr2X65-iy8HA6zqrlUdmVk");
        }
        Ok(())
    }
}