Skip to main content

mnem_core/prolly/
tree.rs

1//! On-wire Prolly tree chunks (leaf and internal) + streaming builder.
2//!
3//! Per SPEC §4.3, a Prolly tree is a Merkle DAG of [`TreeChunk`] objects.
4//! A leaf carries a sorted `(ProllyKey, Cid)` list; an internal node
5//! carries separator keys and child-chunk CIDs. Both share the `_kind =
6//! "tree"` discriminator; the `leaf: bool` field picks the variant.
7//!
8//! The builder in this module consumes a **sorted** iterator of
9//! `(key, value)` pairs, writes every chunk it emits to a
10//! [`Blockstore`], and returns the root chunk's CID. M5.2 covers
11//! construction; lookup, cursor, and diff land in M5.3.
12
13use std::collections::BTreeMap;
14
15use ipld_core::ipld::Ipld;
16use serde::{Deserialize, Deserializer, Serialize, Serializer};
17
18use crate::codec::{from_canonical_bytes, hash_to_cid};
19use crate::error::Error;
20use crate::id::Cid;
21use crate::prolly::chunker::Chunker;
22use crate::prolly::constants::ProllyKey;
23use crate::store::Blockstore;
24
25// ---------------- Shapes ----------------
26
27/// On-wire content of a Prolly leaf chunk.
28#[derive(Clone, Debug, PartialEq, Eq)]
29pub struct Leaf {
30    /// Sorted (key, value) entries.
31    pub entries: Vec<(ProllyKey, Cid)>,
32    /// Forward-compat extension map per SPEC §3.2.
33    pub extra: BTreeMap<String, Ipld>,
34}
35
36impl Leaf {
37    /// Construct a leaf with no extension fields.
38    #[must_use]
39    pub const fn new(entries: Vec<(ProllyKey, Cid)>) -> Self {
40        Self {
41            entries,
42            extra: BTreeMap::new(),
43        }
44    }
45}
46
47/// On-wire content of a Prolly internal (non-leaf) chunk.
48#[derive(Clone, Debug, PartialEq, Eq)]
49pub struct Internal {
50    /// `children.len() - 1` separator keys, strictly ascending.
51    /// `boundaries[i]` is the inclusive lower bound of child `i + 1`.
52    pub boundaries: Vec<ProllyKey>,
53    /// `boundaries.len() + 1` child-chunk CIDs in order.
54    pub children: Vec<Cid>,
55    /// Forward-compat extension map per SPEC §3.2.
56    pub extra: BTreeMap<String, Ipld>,
57}
58
59impl Internal {
60    /// Construct an internal chunk with no extension fields.
61    ///
62    /// # Panics
63    ///
64    /// In debug builds: `boundaries.len() + 1 != children.len()`.
65    #[must_use]
66    pub fn new(boundaries: Vec<ProllyKey>, children: Vec<Cid>) -> Self {
67        debug_assert_eq!(boundaries.len() + 1, children.len());
68        Self {
69            boundaries,
70            children,
71            extra: BTreeMap::new(),
72        }
73    }
74}
75
76/// A Prolly tree chunk on the wire - either a leaf or an internal node.
77#[derive(Clone, Debug, PartialEq, Eq)]
78pub enum TreeChunk {
79    /// A leaf chunk carrying `(key, value)` entries.
80    Leaf(Leaf),
81    /// An internal chunk carrying separator keys and child pointers.
82    Internal(Internal),
83}
84
85impl TreeChunk {
86    /// The `_kind` discriminator on the wire. `"tree"` for both variants.
87    pub const KIND: &'static str = "tree";
88
89    /// `true` iff this is a leaf chunk.
90    #[must_use]
91    pub const fn is_leaf(&self) -> bool {
92        matches!(self, Self::Leaf(_))
93    }
94
95    /// Number of entries (leaf) or children (internal).
96    #[must_use]
97    pub const fn len(&self) -> usize {
98        match self {
99            Self::Leaf(l) => l.entries.len(),
100            Self::Internal(i) => i.children.len(),
101        }
102    }
103
104    /// Whether the chunk carries zero entries/children.
105    #[must_use]
106    pub const fn is_empty(&self) -> bool {
107        self.len() == 0
108    }
109}
110
111// ---------------- Serde ----------------
112
113#[derive(Serialize, Deserialize)]
114struct TreeWire {
115    #[serde(rename = "_kind")]
116    kind: String,
117    leaf: bool,
118    #[serde(default, skip_serializing_if = "Option::is_none")]
119    entries: Option<Vec<(ProllyKey, Cid)>>,
120    #[serde(default, skip_serializing_if = "Option::is_none")]
121    boundaries: Option<Vec<ProllyKey>>,
122    #[serde(default, skip_serializing_if = "Option::is_none")]
123    children: Option<Vec<Cid>>,
124    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
125    extra: BTreeMap<String, Ipld>,
126}
127
128impl Serialize for TreeChunk {
129    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
130        let wire = match self {
131            Self::Leaf(l) => TreeWire {
132                kind: Self::KIND.into(),
133                leaf: true,
134                entries: Some(l.entries.clone()),
135                boundaries: None,
136                children: None,
137                extra: l.extra.clone(),
138            },
139            Self::Internal(i) => TreeWire {
140                kind: Self::KIND.into(),
141                leaf: false,
142                entries: None,
143                boundaries: Some(i.boundaries.clone()),
144                children: Some(i.children.clone()),
145                extra: i.extra.clone(),
146            },
147        };
148        wire.serialize(serializer)
149    }
150}
151
152impl<'de> Deserialize<'de> for TreeChunk {
153    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
154        use serde::de::Error as _;
155        let w = TreeWire::deserialize(deserializer)?;
156        if w.kind != Self::KIND {
157            return Err(D::Error::custom(format!(
158                "expected _kind='tree', got '{}'",
159                w.kind
160            )));
161        }
162        if w.leaf {
163            let entries = w
164                .entries
165                .ok_or_else(|| D::Error::custom("leaf chunk missing 'entries'"))?;
166            if w.boundaries.is_some() || w.children.is_some() {
167                return Err(D::Error::custom(
168                    "leaf chunk must not carry 'boundaries' or 'children'",
169                ));
170            }
171            Ok(Self::Leaf(Leaf {
172                entries,
173                extra: w.extra,
174            }))
175        } else {
176            let boundaries = w
177                .boundaries
178                .ok_or_else(|| D::Error::custom("internal chunk missing 'boundaries'"))?;
179            let children = w
180                .children
181                .ok_or_else(|| D::Error::custom("internal chunk missing 'children'"))?;
182            if w.entries.is_some() {
183                return Err(D::Error::custom("internal chunk must not carry 'entries'"));
184            }
185            if boundaries.len() + 1 != children.len() {
186                return Err(D::Error::custom(format!(
187                    "internal chunk: boundaries.len()={} must equal children.len()-1 ({}-1)",
188                    boundaries.len(),
189                    children.len()
190                )));
191            }
192            Ok(Self::Internal(Internal {
193                boundaries,
194                children,
195                extra: w.extra,
196            }))
197        }
198    }
199}
200
201// ---------------- Read helper ----------------
202
203/// Decode a [`TreeChunk`] from canonical DAG-CBOR bytes.
204///
205/// # Errors
206///
207/// Returns a codec error if the bytes are malformed or the decoded
208/// chunk violates SPEC §4.3 invariants (`_kind != "tree"`, mismatched
209/// `boundaries` / `children` lengths, etc.).
210pub fn load_chunk(bytes: &[u8]) -> Result<TreeChunk, Error> {
211    Ok(from_canonical_bytes(bytes)?)
212}
213
214/// Fetch a chunk from a blockstore by CID and decode it.
215///
216/// Combines [`Blockstore::get`] and [`load_chunk`] with explicit
217/// `NotFound` error mapping - the standard "load a tree chunk" pattern.
218///
219/// # Errors
220///
221/// Returns [`crate::error::StoreError::NotFound`] if the CID is absent
222/// from the store, plus any underlying store or codec error.
223pub fn load_tree_chunk<B: Blockstore + ?Sized>(store: &B, cid: &Cid) -> Result<TreeChunk, Error> {
224    let bytes = store
225        .get(cid)?
226        .ok_or_else(|| crate::error::StoreError::NotFound { cid: cid.clone() })?;
227    load_chunk(&bytes)
228}
229
230// ---------------- Builder ----------------
231
232/// Build a Prolly tree from a sorted iterator of `(key, value)` pairs.
233///
234/// Writes every chunk to `blockstore` and returns the root chunk's CID.
235/// Input MUST be sorted ascending by key; the builder does not sort.
236///
237/// Empty input produces a single empty leaf chunk whose CID is returned.
238///
239/// # Errors
240///
241/// Propagates blockstore and codec errors; otherwise infallible given
242/// sorted input.
243pub fn build_tree<B, I>(blockstore: &B, entries: I) -> Result<Cid, Error>
244where
245    B: Blockstore + ?Sized,
246    I: IntoIterator<Item = (ProllyKey, Cid)>,
247{
248    let level_zero = build_leaf_level(blockstore, entries)?;
249    let mut current = level_zero;
250    while current.len() > 1 {
251        current = build_internal_level(blockstore, current)?;
252    }
253    // `build_leaf_level` always produces at least one chunk (empty-leaf
254    // sentinel for empty input), so `current` has exactly 1 element here.
255    Ok(current
256        .into_iter()
257        .next()
258        .expect("build_leaf_level emits >= 1 chunk")
259        .1)
260}
261
262fn build_leaf_level<B, I>(blockstore: &B, entries: I) -> Result<Vec<(ProllyKey, Cid)>, Error>
263where
264    B: Blockstore + ?Sized,
265    I: IntoIterator<Item = (ProllyKey, Cid)>,
266{
267    let mut chunker = Chunker::new();
268    let mut cur: Vec<(ProllyKey, Cid)> = Vec::with_capacity(128);
269    let mut out: Vec<(ProllyKey, Cid)> = Vec::new();
270
271    for (k, v) in entries {
272        chunker.push_key(&k);
273        cur.push((k, v));
274        if chunker.should_split_at(cur.len()) {
275            emit_leaf(blockstore, &mut cur, &mut out)?;
276            chunker.reset();
277        }
278    }
279
280    if !cur.is_empty() {
281        emit_leaf(blockstore, &mut cur, &mut out)?;
282    }
283
284    if out.is_empty() {
285        // Empty-input sentinel: one empty leaf chunk.
286        let chunk = TreeChunk::Leaf(Leaf::new(Vec::new()));
287        let (bytes, cid) = hash_to_cid(&chunk)?;
288        // safety: cid computed above via hash_to_cid
289        blockstore.put_trusted(cid.clone(), bytes)?;
290        out.push((ProllyKey::default(), cid));
291    }
292
293    Ok(out)
294}
295
296fn emit_leaf<B: Blockstore + ?Sized>(
297    blockstore: &B,
298    cur: &mut Vec<(ProllyKey, Cid)>,
299    out: &mut Vec<(ProllyKey, Cid)>,
300) -> Result<(), Error> {
301    let first = cur[0].0;
302    let chunk = TreeChunk::Leaf(Leaf::new(std::mem::take(cur)));
303    let (bytes, cid) = hash_to_cid(&chunk)?;
304    // safety: cid computed above via hash_to_cid
305    blockstore.put_trusted(cid.clone(), bytes)?;
306    out.push((first, cid));
307    Ok(())
308}
309
310fn build_internal_level<B: Blockstore + ?Sized>(
311    blockstore: &B,
312    children: Vec<(ProllyKey, Cid)>,
313) -> Result<Vec<(ProllyKey, Cid)>, Error> {
314    let mut chunker = Chunker::new();
315    let mut out: Vec<(ProllyKey, Cid)> = Vec::new();
316    let mut cur_children: Vec<Cid> = Vec::new();
317    let mut cur_boundaries: Vec<ProllyKey> = Vec::new();
318    let mut cur_first: Option<ProllyKey> = None;
319
320    for (first_key, child_cid) in children {
321        chunker.push_key(&first_key);
322        if cur_first.is_none() {
323            cur_first = Some(first_key);
324        } else {
325            // Separator between the previous child and this one.
326            cur_boundaries.push(first_key);
327        }
328        cur_children.push(child_cid);
329
330        if chunker.should_split_at(cur_children.len()) {
331            emit_internal(
332                blockstore,
333                &mut cur_boundaries,
334                &mut cur_children,
335                &mut cur_first,
336                &mut out,
337            )?;
338            chunker.reset();
339        }
340    }
341
342    if !cur_children.is_empty() {
343        emit_internal(
344            blockstore,
345            &mut cur_boundaries,
346            &mut cur_children,
347            &mut cur_first,
348            &mut out,
349        )?;
350    }
351
352    Ok(out)
353}
354
355fn emit_internal<B: Blockstore + ?Sized>(
356    blockstore: &B,
357    cur_boundaries: &mut Vec<ProllyKey>,
358    cur_children: &mut Vec<Cid>,
359    cur_first: &mut Option<ProllyKey>,
360    out: &mut Vec<(ProllyKey, Cid)>,
361) -> Result<(), Error> {
362    debug_assert_eq!(cur_boundaries.len() + 1, cur_children.len());
363    let chunk = TreeChunk::Internal(Internal::new(
364        std::mem::take(cur_boundaries),
365        std::mem::take(cur_children),
366    ));
367    let (bytes, cid) = hash_to_cid(&chunk)?;
368    // safety: cid computed above via hash_to_cid
369    blockstore.put_trusted(cid.clone(), bytes)?;
370    out.push((cur_first.take().expect("first key set above"), cid));
371    Ok(())
372}
373
374#[cfg(test)]
375mod tests {
376    use super::*;
377    use crate::codec::{from_canonical_bytes, to_canonical_bytes};
378    use crate::id::{CODEC_RAW, Multihash};
379    use crate::store::MemoryBlockstore;
380
381    fn dummy_cid(n: u32) -> Cid {
382        Cid::new(CODEC_RAW, Multihash::sha2_256(&n.to_be_bytes()))
383    }
384
385    fn keyed(i: u32) -> ProllyKey {
386        let mut k = [0u8; 16];
387        k[12..16].copy_from_slice(&i.to_be_bytes());
388        ProllyKey(k)
389    }
390
391    #[test]
392    fn leaf_round_trip() {
393        let leaf = TreeChunk::Leaf(Leaf::new(vec![
394            (keyed(1), dummy_cid(1)),
395            (keyed(2), dummy_cid(2)),
396        ]));
397        let bytes = to_canonical_bytes(&leaf).unwrap();
398        let decoded: TreeChunk = from_canonical_bytes(&bytes).unwrap();
399        assert_eq!(leaf, decoded);
400        let bytes2 = to_canonical_bytes(&decoded).unwrap();
401        assert_eq!(bytes, bytes2);
402    }
403
404    #[test]
405    fn internal_round_trip() {
406        let internal = TreeChunk::Internal(Internal::new(
407            vec![keyed(10), keyed(20)],
408            vec![dummy_cid(100), dummy_cid(200), dummy_cid(300)],
409        ));
410        let bytes = to_canonical_bytes(&internal).unwrap();
411        let decoded: TreeChunk = from_canonical_bytes(&bytes).unwrap();
412        assert_eq!(internal, decoded);
413    }
414
415    #[test]
416    fn wrong_kind_rejected() {
417        // Craft a tree-shaped wire with wrong _kind.
418        let w = TreeWire {
419            kind: "node".into(),
420            leaf: true,
421            entries: Some(vec![]),
422            boundaries: None,
423            children: None,
424            extra: BTreeMap::new(),
425        };
426        let bytes = serde_ipld_dagcbor::to_vec(&w).unwrap();
427        let err = serde_ipld_dagcbor::from_slice::<TreeChunk>(&bytes).unwrap_err();
428        assert!(err.to_string().contains("_kind"));
429    }
430
431    #[test]
432    fn internal_rejects_inconsistent_lengths() {
433        let w = TreeWire {
434            kind: "tree".into(),
435            leaf: false,
436            entries: None,
437            boundaries: Some(vec![keyed(10), keyed(20)]),
438            children: Some(vec![dummy_cid(1)]),
439            extra: BTreeMap::new(),
440        };
441        let bytes = serde_ipld_dagcbor::to_vec(&w).unwrap();
442        let err = serde_ipld_dagcbor::from_slice::<TreeChunk>(&bytes).unwrap_err();
443        assert!(err.to_string().contains("boundaries.len()"));
444    }
445
446    #[test]
447    fn build_empty_produces_single_empty_leaf() {
448        let store = MemoryBlockstore::new();
449        let root = build_tree(&store, std::iter::empty()).unwrap();
450        assert_eq!(store.len(), 1);
451        let bytes = store.get(&root).unwrap().unwrap();
452        let chunk: TreeChunk = from_canonical_bytes(&bytes).unwrap();
453        match chunk {
454            TreeChunk::Leaf(l) => assert!(l.entries.is_empty()),
455            TreeChunk::Internal(_) => panic!("empty tree should be a leaf"),
456        }
457    }
458
459    #[test]
460    fn build_single_entry_is_leaf() {
461        let store = MemoryBlockstore::new();
462        let root = build_tree(&store, [(keyed(1), dummy_cid(1))]).unwrap();
463        let bytes = store.get(&root).unwrap().unwrap();
464        let chunk: TreeChunk = from_canonical_bytes(&bytes).unwrap();
465        assert!(chunk.is_leaf());
466        assert_eq!(chunk.len(), 1);
467    }
468
469    #[test]
470    fn build_many_entries_produces_internal_root() {
471        let store = MemoryBlockstore::new();
472        let entries: Vec<_> = (0..1000u32).map(|i| (keyed(i), dummy_cid(i))).collect();
473        let root = build_tree(&store, entries).unwrap();
474        let bytes = store.get(&root).unwrap().unwrap();
475        let chunk: TreeChunk = from_canonical_bytes(&bytes).unwrap();
476        assert!(
477            !chunk.is_leaf(),
478            "1000 entries should require an internal root"
479        );
480    }
481
482    #[test]
483    fn build_is_deterministic_across_builds() {
484        let entries: Vec<_> = (0..1_000u32).map(|i| (keyed(i), dummy_cid(i))).collect();
485        let s1 = MemoryBlockstore::new();
486        let r1 = build_tree(&s1, entries.clone()).unwrap();
487        let s2 = MemoryBlockstore::new();
488        let r2 = build_tree(&s2, entries).unwrap();
489        assert_eq!(r1, r2);
490        assert_eq!(s1.len(), s2.len(), "chunk count must match");
491    }
492
493    #[test]
494    fn internal_children_are_reachable_from_root() {
495        let store = MemoryBlockstore::new();
496        let entries: Vec<_> = (0..500u32).map(|i| (keyed(i), dummy_cid(i))).collect();
497        let root = build_tree(&store, entries).unwrap();
498
499        // Walk: root must exist; every child must exist.
500        fn walk(cid: &Cid, store: &MemoryBlockstore, leaves: &mut usize, internals: &mut usize) {
501            let bytes = store.get(cid).unwrap().unwrap();
502            let chunk: TreeChunk = from_canonical_bytes(&bytes).unwrap();
503            match chunk {
504                TreeChunk::Leaf(_) => *leaves += 1,
505                TreeChunk::Internal(i) => {
506                    *internals += 1;
507                    for c in &i.children {
508                        walk(c, store, leaves, internals);
509                    }
510                }
511            }
512        }
513
514        let mut leaves = 0;
515        let mut internals = 0;
516        walk(&root, &store, &mut leaves, &mut internals);
517        assert!(leaves >= 1);
518        assert!(
519            internals >= 1,
520            "500 entries should yield at least one internal chunk"
521        );
522        assert_eq!(leaves + internals, store.len(), "no unreachable chunks");
523    }
524}