1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
use super::*;

use tiny_keccak::Hasher;
use unicode_normalization::UnicodeNormalization;

/// Converts the input text to its [NFKD normalized] Unicode form.
///
/// The result is always a freshly allocated `String`, even when the input
/// is already normalized.
///
/// [NFKD normalized]: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
pub fn normalize_unicode(s: &str) -> String {
    let decomposed = s.nfkd();
    decomposed.collect::<String>()
}

/// Hashes the provided bytes and returns the digest as a multibase string.
/// This is the hash used in many places around this crate.
///
/// The digest is SHA3_256 (the standardized variant, not the original Keccak)
/// and it is encoded with the Base-64 URL alphabet of multibase.
pub fn default_hasher(content: &[u8]) -> String {
    // TODO we might want to use sha3 crate instead of tiny_keccak
    let mut digest = [0u8; 32];
    let mut sha3 = tiny_keccak::Sha3::v256();
    sha3.update(content);
    sha3.finalize(&mut digest);
    multibase::encode(multibase::Base::Base64Url, &digest)
}

/// Content hash of a string: the `default_hasher` digest of its utf8 bytes,
/// prefixed with "cj".
///
/// In the prefix 'c' stands for content hash and 'j' marks that a JSON value
/// was hashed.
fn hash_str(content: &str) -> String {
    let digest = default_hasher(content.as_bytes());
    format!("cj{}", digest)
}

/// Constructs the deterministic string representation of the provided JSON value.
///
/// The same JSON document can be represented in multiple ways depending on
/// property ordering and string encoding. This canonical JSON format lists
/// property names in ascending order by their utf8-encoded byte arrays and
/// uses the [NFKD normalized] unicode representation of all strings
/// (both property names and string values). No whitespace is emitted between
/// tokens.
///
/// String values are normalized *before* they are JSON-encoded. Some
/// compatibility characters decompose into JSON syntax characters (e.g.
/// U+FF02 FULLWIDTH QUOTATION MARK becomes `"`), so normalizing the already
/// encoded text could produce unescaped quotes or backslashes, i.e. invalid
/// or ambiguous JSON.
///
/// Note that creating `Value` arguments, `serde_json` accepts objects
/// having properties with exactly the same name and keeps only the last value,
/// ignoring all previous ones.
///
/// # Errors
///
/// Returns an error if any object in the tree has a property name that is not
/// NFKD normalized (this also rules out names that differ only in their
/// unicode normalization), or if `serde_json` fails to serialize a value.
///
/// [NFKD normalized]: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
pub fn canonical_json(data: &serde_json::Value) -> Result<String> {
    match data {
        serde_json::Value::Array(arr) => {
            let items = arr.iter().map(canonical_json).collect::<Result<Vec<_>>>()?;
            Ok(format!("[{}]", items.join(",")))
        }

        serde_json::Value::Object(obj) => {
            // Sort explicitly: the iteration order of the map is not relied upon.
            let mut properties: Vec<_> = obj.iter().collect();
            properties.sort_by(|(left, _), (right, _)| left.cmp(right));

            let mut entries = Vec::with_capacity(properties.len());
            for (key, value) in properties {
                ensure!(
                    *key == normalize_unicode(key),
                    "Data for canonical JSON serialization must contain field names normalized with Unicode NFKD"
                );

                // The key was just verified to be normalized, only JSON-encoding is left to do
                let canonical_key = serde_json::to_string(key)?;
                entries.push(format!("{}:{}", canonical_key, canonical_json(value)?));
            }
            // NOTE: braces are escaped as double braces in Rust
            Ok(format!("{{{}}}", entries.join(",")))
        }

        serde_json::Value::String(text) => {
            // Normalize first, encode second, so that characters produced by the
            // normalization get escaped properly
            Ok(serde_json::to_string(&normalize_unicode(text))?)
        }

        _ => {
            // Remaining scalars: null, booleans and numbers
            let data_str = serde_json::to_string(data)?;
            Ok(normalize_unicode(&data_str))
        }
    }
}

/// Replace JSON (sub)tree(s) with their multibase-encoded [Merkle-root hash] strings.
///
/// Argument `keep_paths` can be created using function [`split_alternatives`] as needed.
/// Each path is split into its first segment (head) and the optional rest (tail):
///
/// - an object property whose name matches no head is masked completely,
/// - a property requested by a path consisting of the head only is kept untouched,
///   even if other, longer paths start with the same head,
/// - a property requested only through longer paths is masked recursively using the tails.
///
/// An object having no property selected by `keep_paths` collapses into a single
/// content hash string. Arrays always collapse into a content hash string,
/// `keep_paths` is ignored for them. Any other value is returned unchanged.
///
/// # Errors
///
/// Returns an error if a path cannot be split, if an object has a property name
/// that is not NFKD normalized, or if `serde_json` fails to serialize a node.
///
/// [`split_alternatives`]: ../json_path/fn.split_alternatives.html
/// [Merkle-root hash]: https://en.wikipedia.org/wiki/Merkle_tree
pub fn mask_json_subtree<'a, 'b>(
    data: &'a serde_json::Value, keep_paths: impl AsRef<[&'b str]>,
) -> Result<serde_json::Value> {
    match data {
        // NOTE path expressions are not (yet?) supported for arrays
        serde_json::Value::Array(arr) => {
            let mut digested_items = Vec::with_capacity(arr.len());
            for item in arr {
                let digested_item = mask_json_subtree(item, vec![])?;
                digested_items.push(serde_json::to_string(&digested_item)?);
            }
            let flattened_array = format!("[{}]", digested_items.join(","));
            Ok(serde_json::Value::String(hash_str(&flattened_array)))
        }

        serde_json::Value::Object(obj) => {
            // Build { head => tails } map. `None` marks a head that was requested
            // without a tail, thus its whole subtree must be kept open.
            let mut keep_head_tails = HashMap::new();
            for path in keep_paths.as_ref() {
                let (head, tail_opt) = json_path::split_head_tail(path)?;
                let tails =
                    keep_head_tails.entry(head.to_owned()).or_insert_with(|| Some(Vec::new()));
                match tail_opt {
                    // An exact match overrides all partial matches of the same head
                    None => *tails = None,
                    Some(tail) => {
                        if let Some(tails) = tails {
                            tails.push(tail);
                        }
                    }
                }
            }

            // Sort explicitly: the iteration order of the map is not relied upon.
            let mut properties: Vec<_> = obj.iter().collect();
            properties.sort_by(|(left, _), (right, _)| left.cmp(right));

            let mut mask_root = true;
            let mut masked_entries = Vec::with_capacity(properties.len());
            for (key, value) in properties {
                ensure!(
                    *key == normalize_unicode(key),
                    "Data to be digested must contain field names normalized with Unicode NFKD"
                );

                let masked_value = match keep_head_tails.get(key) {
                    // This is the exact Json path to keep open, do not mask anything
                    Some(None) => {
                        mask_root = false;
                        value.clone()
                    }
                    // This is a partial match for a Json path to keep open, recurse to mask it partially
                    Some(Some(tails)) => {
                        mask_root = false;
                        mask_json_subtree(value, tails)?
                    }
                    // This path does not match any paths, mask it fully
                    None => mask_json_subtree(value, vec![])?,
                };
                masked_entries.push((key, masked_value));
            }

            if mask_root {
                // Nothing is kept open below this node, collapse it into its content hash
                let mut entry_strs = Vec::with_capacity(masked_entries.len());
                for (key, masked_value) in &masked_entries {
                    // Keys were verified to be normalized above, only JSON-encoding is left to do
                    let canonical_key = serde_json::to_string(key)?;
                    let entry =
                        format!("{}:{}", canonical_key, serde_json::to_string(masked_value)?);
                    entry_strs.push(entry);
                }

                // NOTE: braces are escaped as double brace in Rust
                let flattened_object = format!("{{{}}}", entry_strs.join(","));
                Ok(serde_json::Value::String(hash_str(&flattened_object)))
            } else {
                let mut properties = serde_json::Map::new();
                for (key, masked_value) in masked_entries {
                    properties.insert(key.to_owned(), masked_value);
                }
                Ok(serde_json::Value::Object(properties))
            }
        }

        _ => Ok(data.clone()),
    }
}

/// Convenience function to transform a JSON value into a (partially) masked JSON value.
/// Only subtrees matching the provided JSON path pattern will be kept,
/// all other subtrees will be masked.
///
/// Nearly equivalent to `mask_json_subtree(&json_value, split_alternatives(keep_paths_str))`,
/// but always returns a string, not a `serde_json::Value`.
pub fn selective_digest_json(
    json_value: &serde_json::Value, keep_paths_str: &str,
) -> Result<String> {
    let keep_paths_vec = json_path::split_alternatives(keep_paths_str);
    let digest_json = match &json_value {
        serde_json::Value::Object(_obj) => mask_json_subtree(json_value, keep_paths_vec),
        serde_json::Value::Array(_arr) => mask_json_subtree(json_value, keep_paths_vec),
        serde_json::Value::String(_s) => Ok(json_value.to_owned()),
        _ => bail!("Json digest is currently implemented only for composite types"),
    }?;
    match digest_json {
        serde_json::Value::String(digest) => Ok(digest),
        // TODO probably a serde_json::to_string() would be enough and faster
        serde_json::Value::Object(_) => canonical_json(&digest_json),
        _ => bail!("Implementation error: digest should always return a string or object"),
    }
}

/// Convenience function for arbitrary serializable types: converts `data` into a
/// JSON value, then hands it over to [`selective_digest_json`].
///
/// [`selective_digest_json`]: ./fn.selective_digest_json.html
pub fn selective_digest_data<T: serde::Serialize>(
    data: &T, keep_paths_str: &str,
) -> Result<String> {
    let as_json = serde_json::to_value(data)?;
    selective_digest_json(&as_json, keep_paths_str)
}

/// Convenience function calling [`selective_digest_json`] with a JSON string.
///
/// [`selective_digest_json`]: ./fn.selective_digest_json.html
pub fn selective_digest_json_str(json_str: &str, keep_paths_str: &str) -> Result<String> {
    ensure!(
        json_str == normalize_unicode(json_str),
        "Json string to be digested must be normalized with Unicode NFKD"
    );

    let json_value: serde_json::Value = serde_json::from_str(json_str)?;
    selective_digest_json(&json_value, keep_paths_str)
}

/// Keep-paths expression used by the `digest_*` helpers to keep nothing open,
/// so that the whole JSON tree gets masked.
const KEEP_NOTHING: &str = "";

/// Convenience function for serializable types to mask the whole JSON tree into a digest, keep nothing.
///
/// Calls `selective_digest_data` with the `KEEP_NOTHING` path expression and
/// returns its result, including any error, unchanged.
pub fn digest_data<T: serde::Serialize>(data: &T) -> Result<String> {
    selective_digest_data(data, KEEP_NOTHING)
}

/// Convenience function for JSON strings to mask the whole JSON tree into a digest, keep nothing.
///
/// Calls `selective_digest_json_str` with the `KEEP_NOTHING` path expression,
/// so the input string must be NFKD normalized and must be valid JSON.
pub fn digest_json_str(json_str: &str) -> Result<String> {
    selective_digest_json_str(json_str, KEEP_NOTHING)
}

// Unit tests pinning the exact digests and partially masked outputs of this module.
#[cfg(test)]
mod tests {
    use super::*;
    use hex::FromHex;
    use serde::{Deserialize, Serialize};

    /// Flat fixture with two numeric fields, declared in the order `b`, `a`.
    #[derive(Clone, Debug, Deserialize, Serialize)]
    struct TestData {
        b: u32,
        a: u32,
    }

    /// Nestable fixture with two optional fields, declared in the order `z`, `y`.
    #[derive(Clone, Debug, Deserialize, Serialize)]
    struct CompositeTestData<T> {
        z: Option<T>,
        y: Option<T>,
    }

    /// Property names in NFC form must be rejected, while the same name in NFKD form is digested.
    #[test]
    fn reject_non_nfkd() -> Result<()> {
        // The same word built from raw utf8 bytes: precomposed (NFC) vs. decomposed (NFKD) accent
        let key_nfc = String::from_utf8(Vec::from_hex("c3a16c6f6d")?)?;
        let key_nfkd = String::from_utf8(Vec::from_hex("61cc816c6f6d")?)?;
        assert_eq!(key_nfc, "álom");
        assert_eq!(key_nfkd, "álom");

        // Entry point taking a JSON string
        let str_nfc = format!("{{\"{}\": 1}}", key_nfc);
        let str_nfkd = format!("{{\"{}\": 1}}", key_nfkd);
        assert_eq!(digest_json_str(&str_nfkd)?, "cjuRab8yOeLzxmFY_fEMC79cW5z9XyihRhaGnTSvMabrA8");
        assert!(digest_json_str(&str_nfc).is_err());

        // Entry point taking an already parsed JSON value
        let json_value_nfc: serde_json::Value = serde_json::from_str(&str_nfc)?;
        let json_value_nfkd: serde_json::Value = serde_json::from_str(&str_nfkd)?;
        assert_eq!(
            selective_digest_json(&json_value_nfkd, "")?,
            "cjuRab8yOeLzxmFY_fEMC79cW5z9XyihRhaGnTSvMabrA8"
        );
        assert!(selective_digest_json(&json_value_nfc, "").is_err());
        Ok(())
    }

    /// Digesting a plain string returns the very same string.
    #[test]
    fn digest_string_is_idempotent() {
        let content_id = &r#""cjuzC-XxgzNMwYXtw8aMIAeS2Xjlw1hlSNKTvVtUwPuyYo""#;
        let digest_id = digest_data(content_id).unwrap();
        assert_eq!(content_id, &digest_id);
    }

    /// Full digests of flat, array, tuple and nested data. Replacing an object with
    /// its own digest string inside a container must not change the container's digest.
    #[test]
    fn test_json_digest() -> Result<()> {
        let test_obj = TestData { b: 1, a: 2 };
        {
            let digested = digest_data(&test_obj)?;
            assert_eq!(digested, "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU");
        }
        {
            // Array of two objects
            let digested = digest_data(&[&test_obj, &test_obj])?;
            assert_eq!(digested, "cjuGkDpb1HL7F8xFKDFVj3felfKZzjrJy92-108uuPixNw");
        }
        {
            // One object and one digest of that object
            let digested =
                digest_data(&(&test_obj, "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU"))?;
            assert_eq!(digested, "cjuGkDpb1HL7F8xFKDFVj3felfKZzjrJy92-108uuPixNw");
        }
        {
            // Both objects replaced with their digest
            let digested = digest_data(&[
                "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU",
                "cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU",
            ])?;
            assert_eq!(digested, "cjuGkDpb1HL7F8xFKDFVj3felfKZzjrJy92-108uuPixNw");
        }
        {
            // Object nested into an object
            let x = &test_obj;
            let comp = CompositeTestData { z: Some(x.clone()), y: Some(x.clone()) };
            let digested = digest_data(&comp)?;
            assert_eq!(digested, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            // Same nesting with the inner objects replaced with their digest
            let comp = CompositeTestData {
                z: Some("cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU".to_owned()),
                y: Some("cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU".to_owned()),
            };
            let digested = digest_data(&comp)?;
            assert_eq!(digested, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        Ok(())
    }

    /// Partially masked outputs for several keep-path expressions. Each partially
    /// masked document must digest to the same value as the original document.
    #[test]
    fn test_selective_digesting() -> Result<()> {
        let test_obj = TestData { b: 1, a: 2 };
        let x = &test_obj;
        // Fixtures nested two, three and four levels deep
        let composite = CompositeTestData { z: Some(x.clone()), y: Some(x.clone()) };
        let double_complex =
            CompositeTestData { z: Some(composite.clone()), y: Some(composite.clone()) };
        let triple_complex =
            CompositeTestData { z: Some(double_complex.clone()), y: Some(double_complex.clone()) };
        {
            // Empty keep-paths expression masks everything
            let fully_digested = selective_digest_data(&composite, "")?;
            assert_eq!(fully_digested, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            let keep_y = selective_digest_data(&composite, ".y")?;
            assert_eq!(
                keep_y,
                r#"{"y":{"a":2,"b":1},"z":"cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU"}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_y)?;
            assert_eq!(digest_data(&val)?, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            let keep_z = selective_digest_data(&composite, ".z")?;
            assert_eq!(
                keep_z,
                r#"{"y":"cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU","z":{"a":2,"b":1}}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_z)?;
            assert_eq!(digest_data(&val)?, "cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ");
        }
        {
            let digest = digest_data(&double_complex)?;
            assert_eq!(digest, "cjuQLebyl_BJipFLibhWiStDBqK5J4JZq15ehUqybfTTKA");
        }
        {
            // Keep path pointing two levels deep
            let keep_yz = selective_digest_data(&double_complex, ".y.z")?;
            assert_eq!(
                keep_yz,
                r#"{"y":{"y":"cjumTq1s6Tn6xkXolxHj4LmAo7DAb-zoPLhEa1BvpovAFU","z":{"a":2,"b":1}},"z":"cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ"}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_yz)?;
            assert_eq!(digest_data(&val)?, "cjuQLebyl_BJipFLibhWiStDBqK5J4JZq15ehUqybfTTKA");
        }
        {
            let digest = digest_data(&triple_complex)?;
            assert_eq!(digest, "cjuik140L3w7LCi6z1eHt7Qgwr2X65-iy8HA6zqrlUdmVk");
        }
        {
            // Two alternative keep paths separated by a comma
            let keep_yz = selective_digest_data(&triple_complex, ".y.y , .z.z")?;
            assert_eq!(
                keep_yz,
                r#"{"y":{"y":{"y":{"a":2,"b":1},"z":{"a":2,"b":1}},"z":"cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ"},"z":{"y":"cjubdcpA0FfHhD8yEpDzZ8vS5sm7yxkrX_wAJgmke2bWRQ","z":{"y":{"a":2,"b":1},"z":{"a":2,"b":1}}}}"#
            );
            let val: serde_json::Value = serde_json::from_str(&keep_yz)?;
            assert_eq!(digest_data(&val)?, "cjuik140L3w7LCi6z1eHt7Qgwr2X65-iy8HA6zqrlUdmVk");
        }
        Ok(())
    }
}