Skip to main content

mnem_core/objects/
index_set.rs

1//! Index objects (Phase 2 secondary indexes).
2//!
3//! An [`IndexSet`] lives alongside a [`Commit`](super::Commit) and
4//! carries pointers to helper Prolly trees that make common agent
5//! queries fast:
6//!
7//! - **`nodes_by_label`** - for each node label, a Prolly tree keyed by
8//!   `NodeId` (16 bytes) whose values are node CIDs. Turns
9//!   "all Person nodes" from O(n) into a label-scoped cursor iteration.
10//! - **`nodes_by_prop`** - for each (label, `prop_name`), a Prolly tree
11//!   keyed by `blake3(canonical_ipld(value))[..16]` whose values are
12//!   node CIDs. Turns "node where name='Alice'" into O(log n) point
13//!   lookup. Single-valued: duplicate (label, prop, value) tuples
14//!   collide at the key level; last-write-wins (use the
15//!   `resolve_or_create_node` helper on Transaction to avoid
16//!   creating duplicates).
17//! - **`outgoing`** - Prolly tree keyed by **source** `NodeId` whose
18//!   values are CIDs of [`AdjacencyBucket`] objects - a small sorted
19//!   list of `(edge_label, edge_cid)` pairs for that node. Turns
20//!   "outgoing edges of X" into O(log n) + one bucket read. Previously
21//!   named `adjacency`; the on-wire field alias `adjacency` is still
22//!   accepted on decode for repos written before the incoming-index
23//!   addition.
24//! - **`incoming`** - Prolly tree keyed by **destination** `NodeId`
25//!   whose values are CIDs of [`IncomingAdjacencyBucket`] objects - a
26//!   small sorted list of `(etype, src, edge_cid)` triples for that
27//!   node. Turns "who points at X" into O(log n) + one bucket read.
28//!   Added alongside the incoming-index feature; mandates `IndexSet`
29//!   rebuild on format-bump commits.
30//!
31//! All indexes are derived from the node + edge Prolly trees owned by
32//! the Commit. They are entirely optional; a commit with
33//! `indexes = None` still functions (queries fall back to full scan).
34
35use std::collections::BTreeMap;
36
37use ipld_core::ipld::Ipld;
38use serde::{Deserialize, Deserializer, Serialize, Serializer};
39
40use crate::id::{Cid, NodeId};
41
42/// Top-level secondary-index aggregator (SPEC §4.8, added in mnem/0.2,
43/// extended with `incoming` in mnem/0.3).
44#[derive(Clone, Debug, Default, PartialEq, Eq)]
45pub struct IndexSet {
46    /// Map from node label (`ntype`) to the root CID of a Prolly tree
47    /// keyed by `NodeId` (16 bytes) with values = node CIDs.
48    pub nodes_by_label: BTreeMap<String, Cid>,
49
50    /// Two-level map `label -> prop_name -> Cid`. Each leaf CID is the
51    /// root of a Prolly tree keyed by
52    /// `blake3(canonical_ipld(value))[..16]` with values = node CIDs.
53    pub nodes_by_prop: BTreeMap<String, BTreeMap<String, Cid>>,
54
55    /// Outgoing adjacency index. Prolly tree keyed by **source**
56    /// `NodeId` whose values are CIDs of [`AdjacencyBucket`] objects.
57    /// `None` on a repo without edges.
58    ///
59    /// Historically serialised under the field name `adjacency`; on
60    /// decode both `outgoing` and the legacy `adjacency` are accepted.
61    /// New writes always emit `outgoing`.
62    pub outgoing: Option<Cid>,
63
64    /// Incoming adjacency index. Prolly tree keyed by **destination**
65    /// `NodeId` whose values are CIDs of [`IncomingAdjacencyBucket`]
66    /// objects. `None` on a repo without edges OR on a repo whose
67    /// `IndexSet` was built by a pre-0.3 implementation (in which case
68    /// callers gracefully degrade to "no back-edges known"; see
69    /// SPEC §4.8).
70    pub incoming: Option<Cid>,
71
72    /// Forward-compat extension map (SPEC §3.2).
73    pub extra: BTreeMap<String, Ipld>,
74}
75
76impl IndexSet {
77    /// The `_kind` discriminator on the wire.
78    pub const KIND: &'static str = "index_set";
79}
80
81/// Per-source-node bucket of outgoing edges. Stored as a standalone
82/// object so the adjacency Prolly tree can cheaply reference a list
83/// without inlining it into every leaf.
84#[derive(Clone, Debug, Default, PartialEq, Eq)]
85pub struct AdjacencyBucket {
86    /// Outgoing edges, sorted lexicographically by `(label, edge_cid)`
87    /// for byte-stable canonical form.
88    pub edges: Vec<AdjacencyEntry>,
89    /// Forward-compat extension map.
90    pub extra: BTreeMap<String, Ipld>,
91}
92
93impl AdjacencyBucket {
94    /// The `_kind` discriminator on the wire.
95    pub const KIND: &'static str = "adjacency_bucket";
96}
97
98/// One outgoing-edge record inside an [`AdjacencyBucket`].
99///
100/// The bucket is keyed by `src` `NodeId` in the outer Prolly tree, so
101/// the source is known from context; only the edge label and edge CID
102/// live in each entry. The CID resolves to an `Edge` object, from
103/// which `dst` and `edge_id` are recovered.
104#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
105pub struct AdjacencyEntry {
106    /// Edge label (`etype`).
107    pub label: String,
108    /// Content-addressed edge CID.
109    pub edge: Cid,
110}
111
112/// Per-destination-node bucket of incoming edges. Stored as a
113/// standalone object so the incoming-adjacency Prolly tree can cheaply
114/// reference a list without inlining it into every leaf.
115///
116/// Structurally distinct from [`AdjacencyBucket`] because each entry
117/// must carry the **source** `NodeId` (the bucket is keyed by `dst`,
118/// so the outer key tells you nothing about `src`). Callers answering
119/// "who points at me?" read the bucket and walk `entries` without any
120/// `Edge` decode work (the `src` is already here).
121#[derive(Clone, Debug, Default, PartialEq, Eq)]
122pub struct IncomingAdjacencyBucket {
123    /// Incoming edges, sorted lexicographically by
124    /// `(label, src, edge_cid)` for byte-stable canonical form.
125    pub edges: Vec<IncomingAdjacencyEntry>,
126    /// Forward-compat extension map.
127    pub extra: BTreeMap<String, Ipld>,
128}
129
130impl IncomingAdjacencyBucket {
131    /// The `_kind` discriminator on the wire.
132    pub const KIND: &'static str = "incoming_adjacency_bucket";
133}
134
135/// One incoming-edge record inside an [`IncomingAdjacencyBucket`].
136///
137/// Unlike [`AdjacencyEntry`] (which can omit `src` because it lives in
138/// the outer Prolly key), this entry carries the source `NodeId`
139/// explicitly: the outer key is `dst`, so without `src` here callers
140/// would have to decode the Edge object just to answer "who pointed
141/// at me?".
142#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
143pub struct IncomingAdjacencyEntry {
144    /// Edge label (`etype`).
145    pub label: String,
146    /// Source `NodeId` - the node that owns the out-edge pointing at
147    /// this bucket's dst.
148    pub src: NodeId,
149    /// Content-addressed edge CID. Resolving it gives the full `Edge`
150    /// object (with `edge_id`, props, etc.).
151    pub edge: Cid,
152}
153
154// ---------------- Serde: IndexSet ----------------
155
156/// On-wire shape for `IndexSet`.
157///
158/// Both the new `outgoing` field and the legacy `adjacency` alias are
159/// accepted on decode. `adjacency` is retained so older repos (written
160/// before the incoming-adjacency feature) still round-trip through
161/// `Deserialize`; new writes always emit `outgoing`.
162#[derive(Serialize, Deserialize)]
163struct IndexSetWire {
164    #[serde(rename = "_kind")]
165    kind: String,
166    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
167    nodes_by_label: BTreeMap<String, Cid>,
168    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
169    nodes_by_prop: BTreeMap<String, BTreeMap<String, Cid>>,
170    // Accept the legacy field name on decode; new writes emit `outgoing`.
171    #[serde(default, skip_serializing_if = "Option::is_none", alias = "adjacency")]
172    outgoing: Option<Cid>,
173    #[serde(default, skip_serializing_if = "Option::is_none")]
174    incoming: Option<Cid>,
175    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
176    extra: BTreeMap<String, Ipld>,
177}
178
179impl Serialize for IndexSet {
180    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
181        IndexSetWire {
182            kind: Self::KIND.into(),
183            nodes_by_label: self.nodes_by_label.clone(),
184            nodes_by_prop: self.nodes_by_prop.clone(),
185            outgoing: self.outgoing.clone(),
186            incoming: self.incoming.clone(),
187            extra: self.extra.clone(),
188        }
189        .serialize(serializer)
190    }
191}
192
193impl<'de> Deserialize<'de> for IndexSet {
194    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
195        let w = IndexSetWire::deserialize(deserializer)?;
196        if w.kind != Self::KIND {
197            return Err(serde::de::Error::custom(format!(
198                "expected _kind='{}', got '{}'",
199                Self::KIND,
200                w.kind
201            )));
202        }
203        Ok(Self {
204            nodes_by_label: w.nodes_by_label,
205            nodes_by_prop: w.nodes_by_prop,
206            outgoing: w.outgoing,
207            incoming: w.incoming,
208            extra: w.extra,
209        })
210    }
211}
212
213// ---------------- Serde: AdjacencyBucket ----------------
214
215#[derive(Serialize, Deserialize)]
216struct AdjacencyBucketWire {
217    #[serde(rename = "_kind")]
218    kind: String,
219    edges: Vec<AdjacencyEntry>,
220    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
221    extra: BTreeMap<String, Ipld>,
222}
223
224impl Serialize for AdjacencyBucket {
225    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
226        AdjacencyBucketWire {
227            kind: Self::KIND.into(),
228            edges: self.edges.clone(),
229            extra: self.extra.clone(),
230        }
231        .serialize(serializer)
232    }
233}
234
235impl<'de> Deserialize<'de> for AdjacencyBucket {
236    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
237        let w = AdjacencyBucketWire::deserialize(deserializer)?;
238        if w.kind != Self::KIND {
239            return Err(serde::de::Error::custom(format!(
240                "expected _kind='{}', got '{}'",
241                Self::KIND,
242                w.kind
243            )));
244        }
245        Ok(Self {
246            edges: w.edges,
247            extra: w.extra,
248        })
249    }
250}
251
252// ---------------- Serde: IncomingAdjacencyBucket ----------------
253
254#[derive(Serialize, Deserialize)]
255struct IncomingAdjacencyBucketWire {
256    #[serde(rename = "_kind")]
257    kind: String,
258    edges: Vec<IncomingAdjacencyEntry>,
259    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
260    extra: BTreeMap<String, Ipld>,
261}
262
263impl Serialize for IncomingAdjacencyBucket {
264    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
265        IncomingAdjacencyBucketWire {
266            kind: Self::KIND.into(),
267            edges: self.edges.clone(),
268            extra: self.extra.clone(),
269        }
270        .serialize(serializer)
271    }
272}
273
274impl<'de> Deserialize<'de> for IncomingAdjacencyBucket {
275    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
276        let w = IncomingAdjacencyBucketWire::deserialize(deserializer)?;
277        if w.kind != Self::KIND {
278            return Err(serde::de::Error::custom(format!(
279                "expected _kind='{}', got '{}'",
280                Self::KIND,
281                w.kind
282            )));
283        }
284        Ok(Self {
285            edges: w.edges,
286            extra: w.extra,
287        })
288    }
289}
290
291#[cfg(test)]
292mod tests {
293    use super::*;
294    use crate::codec::{from_canonical_bytes, to_canonical_bytes};
295    use crate::id::{CODEC_RAW, Multihash};
296
297    fn raw(n: u32) -> Cid {
298        Cid::new(CODEC_RAW, Multihash::sha2_256(&n.to_be_bytes()))
299    }
300
301    #[test]
302    fn index_set_round_trip() {
303        let mut set = IndexSet::default();
304        set.nodes_by_label.insert("Person".into(), raw(1));
305        set.nodes_by_label.insert("Document".into(), raw(2));
306        let mut person_props = BTreeMap::new();
307        person_props.insert("name".into(), raw(3));
308        set.nodes_by_prop.insert("Person".into(), person_props);
309        set.outgoing = Some(raw(4));
310        set.incoming = Some(raw(5));
311
312        let bytes = to_canonical_bytes(&set).unwrap();
313        let decoded: IndexSet = from_canonical_bytes(&bytes).unwrap();
314        assert_eq!(set, decoded);
315    }
316
317    #[test]
318    fn empty_index_set_round_trips() {
319        let set = IndexSet::default();
320        let bytes = to_canonical_bytes(&set).unwrap();
321        let decoded: IndexSet = from_canonical_bytes(&bytes).unwrap();
322        assert_eq!(set, decoded);
323    }
324
325    #[test]
326    fn index_set_decodes_legacy_adjacency_alias() {
327        // Older repos (pre-incoming-adjacency) wrote `adjacency: Cid`.
328        // New code reads that into `outgoing`, and `incoming` stays
329        // `None` (older repos had no back-index).
330        #[derive(Serialize)]
331        struct LegacyWire {
332            #[serde(rename = "_kind")]
333            kind: String,
334            adjacency: Cid,
335        }
336        let legacy = LegacyWire {
337            kind: "index_set".into(),
338            adjacency: raw(42),
339        };
340        let bytes = to_canonical_bytes(&legacy).unwrap();
341        let decoded: IndexSet = from_canonical_bytes(&bytes).unwrap();
342        assert_eq!(decoded.outgoing, Some(raw(42)));
343        assert!(decoded.incoming.is_none());
344    }
345
346    #[test]
347    fn adjacency_bucket_round_trip() {
348        let b = AdjacencyBucket {
349            edges: vec![
350                AdjacencyEntry {
351                    label: "knows".into(),
352                    edge: raw(10),
353                },
354                AdjacencyEntry {
355                    label: "works_at".into(),
356                    edge: raw(11),
357                },
358            ],
359            extra: BTreeMap::new(),
360        };
361        let bytes = to_canonical_bytes(&b).unwrap();
362        let decoded: AdjacencyBucket = from_canonical_bytes(&bytes).unwrap();
363        assert_eq!(b, decoded);
364    }
365
366    #[test]
367    fn incoming_adjacency_bucket_round_trip() {
368        let b = IncomingAdjacencyBucket {
369            edges: vec![
370                IncomingAdjacencyEntry {
371                    label: "knows".into(),
372                    src: NodeId::from_bytes_raw([1u8; 16]),
373                    edge: raw(10),
374                },
375                IncomingAdjacencyEntry {
376                    label: "works_at".into(),
377                    src: NodeId::from_bytes_raw([2u8; 16]),
378                    edge: raw(11),
379                },
380            ],
381            extra: BTreeMap::new(),
382        };
383        let bytes = to_canonical_bytes(&b).unwrap();
384        let decoded: IncomingAdjacencyBucket = from_canonical_bytes(&bytes).unwrap();
385        assert_eq!(b, decoded);
386    }
387
388    #[test]
389    fn wrong_kind_rejected() {
390        let wire = AdjacencyBucketWire {
391            kind: "not_adjacency".into(),
392            edges: vec![],
393            extra: BTreeMap::new(),
394        };
395        let bytes = serde_ipld_dagcbor::to_vec(&wire).unwrap();
396        let err = serde_ipld_dagcbor::from_slice::<AdjacencyBucket>(&bytes).unwrap_err();
397        assert!(err.to_string().contains("_kind"));
398    }
399
400    #[test]
401    fn incoming_bucket_wrong_kind_rejected() {
402        let wire = IncomingAdjacencyBucketWire {
403            kind: "not_incoming".into(),
404            edges: vec![],
405            extra: BTreeMap::new(),
406        };
407        let bytes = serde_ipld_dagcbor::to_vec(&wire).unwrap();
408        let err = serde_ipld_dagcbor::from_slice::<IncomingAdjacencyBucket>(&bytes).unwrap_err();
409        assert!(err.to_string().contains("_kind"));
410    }
411}