Skip to main content

triblespace_core/import/
json_tree.rs

//! Lossless JSON importer that preserves structure and ordering.
//!
//! Every JSON value becomes a node tagged with a kind. Objects and arrays are
//! expressed via explicit entry entities that record field names or indices.
//! Entity ids are content-addressed so identical subtrees deduplicate across
//! imports.
7
8use std::marker::PhantomData;
9
10use anybytes::{Bytes, View};
11use digest::Digest;
12use winnow::stream::Stream;
13
14use crate::blob::schemas::longstring::LongString;
15use crate::blob::Blob;
16use crate::blob::ToBlob;
17use crate::id::{ExclusiveId, Id, RawId, ID_LEN};
18use crate::import::ImportAttribute;
19use crate::macros::{entity, id_hex};
20use crate::metadata;
21use crate::metadata::{ConstDescribe, Describe};
22use crate::repo::BlobStore;
23use crate::trible::Fragment;
24use crate::trible::TribleSet;
25use crate::value::schemas::boolean::Boolean;
26use crate::value::schemas::genid::GenId;
27use crate::value::schemas::hash::{Blake3, Handle, HashProtocol};
28use crate::value::schemas::iu256::U256BE;
29use crate::value::Value;
30use triblespace_core_macros::attributes;
31
32use crate::import::json::{
33    parse_number_common, parse_string_common, parse_unicode_escape, EncodeError, JsonImportError,
34};
35
36type ParsedString = View<str>;
37
38attributes! {
39    /// Node kind tag (one of the `kind_*` constants).
40    "D78B9D5A96029FDBBB327E377418AF51" as pub kind: GenId;
41    /// String content stored as a LongString blob.
42    "40BC51924FD5D2058A48D1FA6073F871" as pub string: Handle<Blake3, LongString>;
43    /// Raw decimal number string (preserves precision).
44    "428E02672FFD0D010D95AE641ADE1730" as pub number_raw: Handle<Blake3, LongString>;
45    /// Boolean value.
46    "6F43FC771207574BF4CC58D3080C313C" as pub boolean: Boolean;
47    /// Parent entity of an object field entry.
48    "97A4ACD83EC9EA29EE7E487BB058C437" as pub field_parent: GenId;
49    /// Field name stored as a LongString blob.
50    "2B9FCF2A60C9B05FADDA9F022762B822" as pub field_name: Handle<Blake3, LongString>;
51    /// Ordinal position of a field within its parent object.
52    "38C7B1CDEA580DE70A520B2C8CBC4F14" as pub field_index: U256BE;
53    /// Value entity referenced by an object field entry.
54    "6E6CA175F925B6AA0844D357B409F15A" as pub field_value: GenId;
55    /// Parent entity of an array entry.
56    "B49E6499D0A2CF5DD9A1E72D9D047747" as pub array_parent: GenId;
57    /// Zero-based index of an array element.
58    "D5DA41A093BD0DE490925126D1150B57" as pub array_index: U256BE;
59    /// Value entity referenced by an array entry.
60    "33535F41827B476B1EC0CACECE9BEED0" as pub array_value: GenId;
61}
62
63/// JSON object node.
64#[allow(non_upper_case_globals)]
65pub const kind_object: Id = id_hex!("64D8981414502BF750387C617F1F9D09");
66/// JSON array node.
67#[allow(non_upper_case_globals)]
68pub const kind_array: Id = id_hex!("5DC7096A184E658C8E16C54EB207C386");
69/// JSON string node.
70#[allow(non_upper_case_globals)]
71pub const kind_string: Id = id_hex!("58A5EAC244801C5E26AD9178C784781A");
72/// JSON number node.
73#[allow(non_upper_case_globals)]
74pub const kind_number: Id = id_hex!("711555ADF72B9499E6A7F68E0BD3B4B8");
75/// JSON boolean node.
76#[allow(non_upper_case_globals)]
77pub const kind_bool: Id = id_hex!("7D3079C5E20658B6CA5F54771B5D0D30");
78/// JSON null node.
79#[allow(non_upper_case_globals)]
80pub const kind_null: Id = id_hex!("FC1DCF98A3A8418D6090EBD367CFFD7A");
81/// Object field entry.
82#[allow(non_upper_case_globals)]
83pub const kind_field: Id = id_hex!("890FC1F34B9FAD18F93E6EDF1B69A1A2");
84/// Array entry.
85#[allow(non_upper_case_globals)]
86pub const kind_array_entry: Id = id_hex!("EB325EABEA8C35DE7E5D700A5EF9207B");
87
88/// Returns a [`Fragment`] describing the lossless JSON tree schema —
89/// all node kinds, attribute definitions, and value/blob schema metadata.
90pub fn build_json_tree_metadata<B>(blobs: &mut B) -> Result<Fragment, B::PutError>
91where
92    B: BlobStore<Blake3>,
93{
94    let mut metadata = Fragment::default();
95    let name = |value: &'static str| {
96        Bytes::from_source(value)
97            .view::<str>()
98            .expect("static JSON attribute names are valid UTF-8")
99    };
100
101    metadata += <GenId as ConstDescribe>::describe(blobs)?;
102    metadata += <Boolean as ConstDescribe>::describe(blobs)?;
103    metadata += <U256BE as ConstDescribe>::describe(blobs)?;
104    metadata += <Handle<Blake3, LongString> as ConstDescribe>::describe(blobs)?;
105
106    metadata +=
107        ImportAttribute::<GenId>::from_raw(kind.raw(), Some(name("json.kind"))).describe(blobs)?;
108    metadata += ImportAttribute::<Handle<Blake3, LongString>>::from_raw(
109        string.raw(),
110        Some(name("json.string")),
111    )
112    .describe(blobs)?;
113    metadata += ImportAttribute::<Handle<Blake3, LongString>>::from_raw(
114        number_raw.raw(),
115        Some(name("json.number_raw")),
116    )
117    .describe(blobs)?;
118    metadata += ImportAttribute::<Boolean>::from_raw(boolean.raw(), Some(name("json.boolean")))
119        .describe(blobs)?;
120    metadata +=
121        ImportAttribute::<GenId>::from_raw(field_parent.raw(), Some(name("json.field_parent")))
122            .describe(blobs)?;
123    metadata += ImportAttribute::<Handle<Blake3, LongString>>::from_raw(
124        field_name.raw(),
125        Some(name("json.field_name")),
126    )
127    .describe(blobs)?;
128    metadata +=
129        ImportAttribute::<U256BE>::from_raw(field_index.raw(), Some(name("json.field_index")))
130            .describe(blobs)?;
131    metadata +=
132        ImportAttribute::<GenId>::from_raw(field_value.raw(), Some(name("json.field_value")))
133            .describe(blobs)?;
134    metadata +=
135        ImportAttribute::<GenId>::from_raw(array_parent.raw(), Some(name("json.array_parent")))
136            .describe(blobs)?;
137    metadata +=
138        ImportAttribute::<U256BE>::from_raw(array_index.raw(), Some(name("json.array_index")))
139            .describe(blobs)?;
140    metadata +=
141        ImportAttribute::<GenId>::from_raw(array_value.raw(), Some(name("json.array_value")))
142            .describe(blobs)?;
143
144    metadata += describe_kind(blobs, kind_object, "json.kind.object", "JSON object node.")?;
145    metadata += describe_kind(blobs, kind_array, "json.kind.array", "JSON array node.")?;
146    metadata += describe_kind(blobs, kind_string, "json.kind.string", "JSON string node.")?;
147    metadata += describe_kind(blobs, kind_number, "json.kind.number", "JSON number node.")?;
148    metadata += describe_kind(blobs, kind_bool, "json.kind.bool", "JSON boolean node.")?;
149    metadata += describe_kind(blobs, kind_null, "json.kind.null", "JSON null node.")?;
150    metadata += describe_kind(
151        blobs,
152        kind_field,
153        "json.kind.field",
154        "JSON object field entry.",
155    )?;
156    metadata += describe_kind(
157        blobs,
158        kind_array_entry,
159        "json.kind.array_entry",
160        "JSON array entry.",
161    )?;
162
163    Ok(metadata)
164}
165
166fn describe_kind<B>(
167    blobs: &mut B,
168    kind_id: Id,
169    name: &str,
170    description: &str,
171) -> Result<Fragment, B::PutError>
172where
173    B: BlobStore<Blake3>,
174{
175    let name_handle = blobs.put(name.to_owned())?;
176
177    let tribles = entity! { ExclusiveId::force_ref(&kind_id) @
178        metadata::name: name_handle,
179        metadata::description: blobs.put(description.to_owned())?,
180    };
181    Ok(tribles)
182}
183
184#[derive(Clone)]
185struct FieldEntry {
186    name: View<str>,
187    name_handle: Value<Handle<Blake3, LongString>>,
188    index: u64,
189    value: Id,
190}
191
192#[derive(Clone)]
193struct ArrayEntry {
194    index: u64,
195    value: Id,
196}
197
198/// Lossless JSON importer that preserves ordering and encodes explicit entry nodes.
199///
200/// This importer encodes JSON values as an explicit node/entry graph (a JSON AST),
201/// using content-addressed ids so identical subtrees deduplicate across imports.
202pub struct JsonTreeImporter<'a, Store, Hasher = Blake3>
203where
204    Store: BlobStore<Blake3>,
205    Hasher: HashProtocol,
206{
207    store: &'a mut Store,
208    id_salt: Option<[u8; 32]>,
209    _hasher: PhantomData<Hasher>,
210}
211
212impl<'a, Store, Hasher> JsonTreeImporter<'a, Store, Hasher>
213where
214    Store: BlobStore<Blake3>,
215    Hasher: HashProtocol,
216{
217    /// Creates a new lossless importer backed by `store`. Pass an optional
218    /// 32-byte salt to namespace the content-addressed entity ids.
219    pub fn new(store: &'a mut Store, id_salt: Option<[u8; 32]>) -> Self {
220        Self {
221            store,
222            id_salt,
223            _hasher: PhantomData,
224        }
225    }
226
227    /// Imports a JSON string. Convenience wrapper around [`import_blob`](Self::import_blob).
228    pub fn import_str(&mut self, input: &str) -> Result<Fragment, JsonImportError> {
229        self.import_blob(input.to_owned().to_blob())
230    }
231
232    /// Imports a JSON document from a [`LongString`] blob, returning a
233    /// [`Fragment`] rooted at the document's top-level node.
234    pub fn import_blob(&mut self, blob: Blob<LongString>) -> Result<Fragment, JsonImportError> {
235        let mut data = TribleSet::new();
236        let mut bytes = blob.bytes.clone();
237        self.skip_ws(&mut bytes);
238        let root = self.parse_value(&mut bytes, &mut data)?;
239        self.skip_ws(&mut bytes);
240        if bytes.peek_token().is_some() {
241            return Err(JsonImportError::Syntax("trailing tokens".into()));
242        }
243        Ok(Fragment::rooted(root, data))
244    }
245
246    /// Returns schema metadata for the lossless JSON tree format.
247    /// Delegates to [`build_json_tree_metadata`].
248    pub fn metadata(&mut self) -> Result<Fragment, Store::PutError> {
249        build_json_tree_metadata(self.store)
250    }
251
252    fn parse_value(
253        &mut self,
254        bytes: &mut Bytes,
255        data: &mut TribleSet,
256    ) -> Result<Id, JsonImportError> {
257        match bytes.peek_token() {
258            Some(b'n') => {
259                self.consume_literal(bytes, b"null")?;
260                let id = self.hash_tagged(b"null", &[]);
261                *data += entity! { ExclusiveId::force_ref(&id) @
262                    kind: kind_null,
263                };
264                Ok(id)
265            }
266            Some(b't') => {
267                self.consume_literal(bytes, b"true")?;
268                let id = self.hash_tagged(b"bool", &[b"true"]);
269                *data += entity! { ExclusiveId::force_ref(&id) @
270                    kind: kind_bool,
271                    boolean: true,
272                };
273                Ok(id)
274            }
275            Some(b'f') => {
276                self.consume_literal(bytes, b"false")?;
277                let id = self.hash_tagged(b"bool", &[b"false"]);
278                *data += entity! { ExclusiveId::force_ref(&id) @
279                    kind: kind_bool,
280                    boolean: false,
281                };
282                Ok(id)
283            }
284            Some(b'"') => {
285                let text = self.parse_string(bytes)?;
286                let id = self.hash_tagged(b"string", &[text.as_ref().as_bytes()]);
287                let handle = self
288                    .store
289                    .put(text)
290                    .map_err(|err| JsonImportError::EncodeString {
291                        field: "string".to_string(),
292                        source: EncodeError::from_error(err),
293                    })?;
294                *data += entity! { ExclusiveId::force_ref(&id) @
295                    kind: kind_string,
296                    string: handle,
297                };
298                Ok(id)
299            }
300            Some(b'{') => self.parse_object(bytes, data),
301            Some(b'[') => self.parse_array(bytes, data),
302            _ => {
303                let number = self.parse_number(bytes)?;
304                let number_view = number
305                    .view::<str>()
306                    .map_err(|_| JsonImportError::Syntax("invalid number".into()))?;
307                let id = self.hash_tagged(b"number", &[number_view.as_ref().as_bytes()]);
308                let handle =
309                    self.store
310                        .put(number_view)
311                        .map_err(|err| JsonImportError::EncodeNumber {
312                            field: "number".to_string(),
313                            source: EncodeError::from_error(err),
314                        })?;
315                *data += entity! { ExclusiveId::force_ref(&id) @
316                    kind: kind_number,
317                    number_raw: handle,
318                };
319                Ok(id)
320            }
321        }
322    }
323
324    fn parse_object(
325        &mut self,
326        bytes: &mut Bytes,
327        data: &mut TribleSet,
328    ) -> Result<Id, JsonImportError> {
329        self.consume_byte(bytes, b'{')?;
330        self.skip_ws(bytes);
331
332        let mut fields: Vec<FieldEntry> = Vec::new();
333        if bytes.peek_token() == Some(b'}') {
334            self.consume_byte(bytes, b'}')?;
335        } else {
336            let mut index: u64 = 0;
337            loop {
338                let name = self.parse_string(bytes)?;
339                self.skip_ws(bytes);
340                self.consume_byte(bytes, b':')?;
341                self.skip_ws(bytes);
342                let value = self.parse_value(bytes, data)?;
343                let name_handle =
344                    self.store
345                        .put(name.clone())
346                        .map_err(|err| JsonImportError::EncodeString {
347                            field: "field".to_string(),
348                            source: EncodeError::from_error(err),
349                        })?;
350                fields.push(FieldEntry {
351                    name,
352                    name_handle,
353                    index,
354                    value,
355                });
356                index = index.saturating_add(1);
357
358                self.skip_ws(bytes);
359                match bytes.peek_token() {
360                    Some(b',') => {
361                        self.consume_byte(bytes, b',')?;
362                        self.skip_ws(bytes);
363                    }
364                    Some(b'}') => {
365                        self.consume_byte(bytes, b'}')?;
366                        break;
367                    }
368                    _ => return Err(JsonImportError::Syntax("unexpected token".into())),
369                }
370            }
371        }
372
373        let object_id = self.hash_object(&fields);
374        *data += entity! { ExclusiveId::force_ref(&object_id) @
375            kind: kind_object,
376        };
377
378        for field in fields {
379            let entry_id = self.hash_field_entry(&object_id, &field);
380            *data += entity! { ExclusiveId::force_ref(&entry_id) @
381                kind: kind_field,
382                field_parent: object_id,
383                field_name: field.name_handle,
384                field_index: field.index,
385                field_value: field.value,
386            };
387        }
388
389        Ok(object_id)
390    }
391
392    fn parse_array(
393        &mut self,
394        bytes: &mut Bytes,
395        data: &mut TribleSet,
396    ) -> Result<Id, JsonImportError> {
397        self.consume_byte(bytes, b'[')?;
398        self.skip_ws(bytes);
399
400        let mut entries: Vec<ArrayEntry> = Vec::new();
401        if bytes.peek_token() == Some(b']') {
402            self.consume_byte(bytes, b']')?;
403        } else {
404            let mut index: u64 = 0;
405            loop {
406                let value = self.parse_value(bytes, data)?;
407                entries.push(ArrayEntry { index, value });
408                index = index.saturating_add(1);
409
410                self.skip_ws(bytes);
411                match bytes.peek_token() {
412                    Some(b',') => {
413                        self.consume_byte(bytes, b',')?;
414                        self.skip_ws(bytes);
415                    }
416                    Some(b']') => {
417                        self.consume_byte(bytes, b']')?;
418                        break;
419                    }
420                    _ => return Err(JsonImportError::Syntax("unexpected token".into())),
421                }
422            }
423        }
424
425        let array_id = self.hash_array(&entries);
426        *data += entity! { ExclusiveId::force_ref(&array_id) @
427            kind: kind_array,
428        };
429
430        for entry in entries {
431            let entry_id = self.hash_array_entry(&array_id, &entry);
432            *data += entity! { ExclusiveId::force_ref(&entry_id) @
433                kind: kind_array_entry,
434                array_parent: array_id,
435                array_index: entry.index,
436                array_value: entry.value,
437            };
438        }
439
440        Ok(array_id)
441    }
442
443    fn hash_object(&self, fields: &[FieldEntry]) -> Id {
444        let mut hasher = self.seeded_hasher();
445        hash_chunk(&mut hasher, b"object");
446        for field in fields {
447            let index_bytes = field.index.to_be_bytes();
448            hash_chunk(&mut hasher, field.name.as_ref().as_bytes());
449            hash_chunk(&mut hasher, &index_bytes);
450            hash_chunk(&mut hasher, field.value.as_ref());
451        }
452        self.finish_hash(hasher)
453    }
454
455    fn hash_array(&self, entries: &[ArrayEntry]) -> Id {
456        let mut hasher = self.seeded_hasher();
457        hash_chunk(&mut hasher, b"array");
458        for entry in entries {
459            let index_bytes = entry.index.to_be_bytes();
460            hash_chunk(&mut hasher, &index_bytes);
461            hash_chunk(&mut hasher, entry.value.as_ref());
462        }
463        self.finish_hash(hasher)
464    }
465
466    fn hash_field_entry(&self, parent: &Id, entry: &FieldEntry) -> Id {
467        let mut hasher = self.seeded_hasher();
468        hash_chunk(&mut hasher, b"field");
469        let index_bytes = entry.index.to_be_bytes();
470        hash_chunk(&mut hasher, parent.as_ref());
471        hash_chunk(&mut hasher, entry.name.as_ref().as_bytes());
472        hash_chunk(&mut hasher, &index_bytes);
473        hash_chunk(&mut hasher, entry.value.as_ref());
474        self.finish_hash(hasher)
475    }
476
477    fn hash_array_entry(&self, parent: &Id, entry: &ArrayEntry) -> Id {
478        let mut hasher = self.seeded_hasher();
479        hash_chunk(&mut hasher, b"array_entry");
480        let index_bytes = entry.index.to_be_bytes();
481        hash_chunk(&mut hasher, parent.as_ref());
482        hash_chunk(&mut hasher, &index_bytes);
483        hash_chunk(&mut hasher, entry.value.as_ref());
484        self.finish_hash(hasher)
485    }
486
487    fn hash_tagged(&self, tag: &[u8], parts: &[&[u8]]) -> Id {
488        let mut hasher = self.seeded_hasher();
489        hash_chunk(&mut hasher, tag);
490        for part in parts {
491            hash_chunk(&mut hasher, part);
492        }
493        self.finish_hash(hasher)
494    }
495
496    fn seeded_hasher(&self) -> Hasher {
497        let mut hasher = Hasher::new();
498        if let Some(salt) = self.id_salt {
499            hasher.update(salt.as_ref());
500        }
501        hasher
502    }
503
504    fn finish_hash(&self, hasher: Hasher) -> Id {
505        let digest = hasher.finalize();
506        id_from_digest(digest.as_ref())
507    }
508
509    fn skip_ws(&self, bytes: &mut Bytes) {
510        while matches!(bytes.peek_token(), Some(b) if b.is_ascii_whitespace()) {
511            bytes.pop_front();
512        }
513    }
514
515    fn consume_byte(&self, bytes: &mut Bytes, expected: u8) -> Result<(), JsonImportError> {
516        match bytes.pop_front() {
517            Some(b) if b == expected => Ok(()),
518            _ => Err(JsonImportError::Syntax("unexpected token".into())),
519        }
520    }
521
522    fn consume_literal(&self, bytes: &mut Bytes, literal: &[u8]) -> Result<(), JsonImportError> {
523        for expected in literal {
524            self.consume_byte(bytes, *expected)?;
525        }
526        Ok(())
527    }
528
529    fn parse_string(&self, bytes: &mut Bytes) -> Result<ParsedString, JsonImportError> {
530        let raw = parse_string_common(bytes, &mut parse_unicode_escape)?;
531        raw.view::<str>()
532            .map_err(|_| JsonImportError::Syntax("invalid utf-8".into()))
533    }
534
535    fn parse_number(&self, bytes: &mut Bytes) -> Result<Bytes, JsonImportError> {
536        parse_number_common(bytes)
537    }
538}
539
540fn hash_chunk<H: Digest>(hasher: &mut H, bytes: &[u8]) {
541    let len = (bytes.len() as u64).to_be_bytes();
542    hasher.update(len);
543    hasher.update(bytes);
544}
545
546fn id_from_digest(digest: &[u8]) -> Id {
547    let mut raw: RawId = [0u8; ID_LEN];
548    raw.copy_from_slice(&digest[digest.len() - ID_LEN..]);
549    if raw == [0; ID_LEN] {
550        raw[0] = 1;
551    }
552    Id::new(raw).unwrap_or_else(|| unsafe { Id::force(raw) })
553}
554
555#[cfg(test)]
556mod tests {
557    use super::{kind_array_entry, JsonTreeImporter};
558    use crate::blob::MemoryBlobStore;
559    use crate::blob::ToBlob;
560    use crate::id::Id;
561    use crate::macros::{find, pattern};
562    use crate::value::schemas::hash::Blake3;
563
564    #[test]
565    fn lossless_ids_are_content_based() {
566        let input = r#"{ "a": [1, 2] }"#;
567        let mut blobs = MemoryBlobStore::<Blake3>::new();
568        let mut importer = JsonTreeImporter::<_, Blake3>::new(&mut blobs, None);
569        let root = importer
570            .import_blob(input.to_blob())
571            .unwrap()
572            .root()
573            .expect("import_blob returns a rooted fragment");
574        drop(importer);
575        let mut other = JsonTreeImporter::<_, Blake3>::new(&mut blobs, None);
576        let other_root = other
577            .import_blob(input.to_blob())
578            .unwrap()
579            .root()
580            .expect("import_blob returns a rooted fragment");
581        assert_eq!(root, other_root);
582    }
583
584    #[test]
585    fn lossless_preserves_array_order() {
586        let input = r#"[1, 2]"#;
587        let mut blobs = MemoryBlobStore::<Blake3>::new();
588        let mut importer = JsonTreeImporter::<_, Blake3>::new(&mut blobs, None);
589        let fragment = importer.import_blob(input.to_blob()).unwrap();
590        let root = fragment
591            .root()
592            .expect("import_blob returns a rooted fragment");
593        let catalog = fragment.facts();
594        let mut entries = find!(
595            (index: ethnum::U256, value: Id),
596            pattern!(catalog, [{
597                _?entry @
598                super::kind: kind_array_entry,
599                super::array_parent: root,
600                super::array_index: ?index,
601                super::array_value: ?value,
602            }])
603        )
604        .collect::<Vec<_>>();
605        entries.sort_by_key(|(index, _)| *index);
606        assert_eq!(entries.len(), 2);
607        assert_eq!(entries[0].0, ethnum::U256::new(0));
608        assert_eq!(entries[1].0, ethnum::U256::new(1));
609    }
610}