Skip to main content

mnem_core/objects/
node.rs

//! The [`Node`] object and its embedding substructure.
//!
//! Per SPEC §4.1:
//!
//! ```text
//! Node: {
//!   _kind:   "node",
//!   id:      NodeId (16 bytes),
//!   ntype:   string,
//!   summary: string (optional),
//!   props:   map<string, Ipld>,
//!   content: bytes (optional),
//! }
//! ```
//!
//! Dense vector embeddings live in the per-commit sidecar
//! (`Commit.embeddings` Prolly tree, keyed by NodeCid). Keeping them
//! out of the canonical Node bytes prevents nondeterministic dense
//! producers (e.g. ORT thread-count drift) from leaking into
//! `NodeCid` and breaking federated dedup.
//!
//! Legacy DAG-CBOR carrying an explicit `embed` map round-trips
//! losslessly: the wire struct no longer declares that field, so the
//! `extra` flatten sink absorbs it on decode and re-emits it
//! byte-identically on encode. Existing NodeCids therefore stay stable
//! for repos written before this change.
26
27use std::collections::BTreeMap;
28
29use bytes::Bytes;
30use ipld_core::ipld::Ipld;
31use serde::{Deserialize, Deserializer, Serialize, Serializer};
32
33use crate::error::ObjectError;
34use crate::id::NodeId;
35use crate::sparse::SparseEmbed;
36
37// ---------------- Dtype + Embedding ----------------
38
39/// Numeric element type for an [`Embedding`] vector (SPEC §4.1).
40#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
41#[serde(rename_all = "lowercase")]
42#[derive(Default)]
43pub enum Dtype {
44    /// IEEE 754 half precision - 2 bytes per element.
45    F16,
46    /// IEEE 754 single precision - 4 bytes per element. Default.
47    #[default]
48    F32,
49    /// IEEE 754 double precision - 8 bytes per element.
50    F64,
51    /// Signed 8-bit integer (quantized embeddings) - 1 byte per element.
52    I8,
53}
54
55impl Dtype {
56    /// Bytes per vector element.
57    #[must_use]
58    pub const fn byte_width(self) -> usize {
59        match self {
60            Self::F16 => 2,
61            Self::F32 => 4,
62            Self::F64 => 8,
63            Self::I8 => 1,
64        }
65    }
66}
67
68/// A dense vector embedding produced by a named model.
69///
70/// Embeddings live in the per-commit sidecar (`Commit.embeddings`
71/// Prolly tree, keyed by NodeCid) rather than inline on `Node`. This
72/// keeps dense bytes out of the canonical Node hash so nondeterministic
73/// producers (e.g. ORT thread-count drift) cannot perturb `NodeCid`.
74#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
75pub struct Embedding {
76    /// Free-form model identifier (`"text-embedding-3-small"`,
77    /// `"nomic-embed-text-v1.5"`, etc.).
78    pub model: String,
79    /// Element type of the vector. Defaults to `f32` on encode when absent.
80    #[serde(default)]
81    pub dtype: Dtype,
82    /// Vector dimension.
83    pub dim: u32,
84    /// Vector bytes. Length MUST equal `dim * dtype.byte_width()` per
85    /// SPEC §4.1; validate with [`Embedding::validate`] after decoding.
86    pub vector: Bytes,
87}
88
89impl Embedding {
90    /// Validate the `vector.len() == dim × byte_width` invariant.
91    ///
92    /// # Errors
93    ///
94    /// Returns [`ObjectError::EmbeddingSizeMismatch`] if the invariant is
95    /// violated.
96    pub const fn validate(&self) -> Result<(), ObjectError> {
97        let expected = (self.dim as usize) * self.dtype.byte_width();
98        if self.vector.len() == expected {
99            Ok(())
100        } else {
101            Err(ObjectError::EmbeddingSizeMismatch {
102                expected,
103                got: self.vector.len(),
104            })
105        }
106    }
107}
108
109// ---------------- Node ----------------
110
111/// A graph vertex.
112///
113/// See SPEC §4.1 and [the module docs](super). Construct via [`Node::new`]
114/// and add properties with the `with_*` fluent helpers.
115///
116/// Node is `PartialEq` but not `Eq`: `sparse_embed` carries `Vec<f32>`
117/// whose values cannot be `Eq` (NaN). Use CID equality via
118/// `hash_to_cid` when you need a canonical identity check; field-wise
119/// `==` comparison still works for non-NaN data.
120#[derive(Clone, Debug, PartialEq)]
121pub struct Node {
122    /// Stable node identity. Survives content edits; edges reference this.
123    pub id: NodeId,
124    /// Free-form node-type label (`"Person"`, `"mnem:Class"`, …).
125    pub ntype: String,
126    /// Optional short natural-language summary. Intended as the
127    /// token-cheap representation of this node for LLM-facing retrieval:
128    /// the field agents read when assembling context under a token
129    /// budget. Distinct from `props` (structured) and `content`
130    /// (opaque payload).
131    pub summary: Option<String>,
132    /// Property map. Values are any DAG-CBOR value, including `Link`s.
133    pub props: BTreeMap<String, Ipld>,
134    /// Optional opaque payload (a document body, a file, …).
135    pub content: Option<Bytes>,
136    /// Optional learned-sparse embedding . Produced
137    /// by a `SparseEncoder` adapter (`OpenSearch` neural-sparse-doc-v3-
138    /// distill, BGE-M3-sparse, etc.) and indexed by
139    /// `crate::index::sparse::SparseInvertedIndex::build_from_repo`.
140    ///
141    /// Additive: existing nodes with `sparse_embed = None` keep
142    /// byte-identical CIDs because the wire serializer omits the field
143    /// via `skip_serializing_if = "Option::is_none"`.
144    pub sparse_embed: Option<SparseEmbed>,
145    /// Optional contextualized-chunk prefix . An
146    /// LLM-generated one-sentence placement cue ("This paragraph is
147    /// from Section 3 of a legal contract between Alice and Bob's
148    /// employer...") stored alongside the node. The ingest pipeline
149    /// prepends it to `summary` before embedding so the dense + sparse
150    /// lanes capture positional and relational context the chunk
151    /// alone would lose.
152    ///
153    /// Anthropic's 2024 Contextual Retrieval paper reports -49% to
154    /// -67% retrieval-failure reduction when this prefix is present;
155    /// mnem stores it on the node so the render path can surface it
156    /// back to the agent for faithful source attribution.
157    ///
158    /// Additive: existing nodes with `context_sentence = None` keep
159    /// byte-identical CIDs (same `skip_serializing_if = "Option::is_none"`
160    /// pattern as `sparse_embed`).
161    pub context_sentence: Option<String>,
162    /// Forward-compat extension map per SPEC §3.2 - holds fields this
163    /// version doesn't recognize and preserves them on re-encode so signed
164    /// Nodes remain verifiable across version upgrades.
165    pub extra: BTreeMap<String, Ipld>,
166}
167
168impl Node {
169    /// The `_kind` discriminator for nodes. `"node"` on the wire.
170    pub const KIND: &'static str = "node";
171
172    /// Default `ntype` value used when a caller wants to ingest a node
173    /// without choosing a category. Applied by the HTTP bulk/single
174    /// handlers when the caller omits `label` or sends an empty string.
175    /// Direct Rust callers of [`Node::new`] still pass ntype explicitly;
176    /// [`Node::new_default`] is the zero-arg convenience.
177    pub const DEFAULT_NTYPE: &'static str = "Node";
178
179    /// Construct a Node with no summary, no props, no content.
180    #[must_use]
181    pub fn new(id: NodeId, ntype: impl Into<String>) -> Self {
182        Self {
183            id,
184            ntype: ntype.into(),
185            summary: None,
186            props: BTreeMap::new(),
187            content: None,
188            sparse_embed: None,
189            context_sentence: None,
190            extra: BTreeMap::new(),
191        }
192    }
193
194    /// Construct a Node with the project default `ntype = "Node"`.
195    /// Convenience for callers that don't want to categorise on write;
196    /// equivalent to `Node::new(id, Node::DEFAULT_NTYPE)`.
197    #[must_use]
198    pub fn new_default(id: NodeId) -> Self {
199        Self::new(id, Self::DEFAULT_NTYPE)
200    }
201
202    /// Attach a short summary. Returns `self` for chaining.
203    #[must_use]
204    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
205        self.summary = Some(summary.into());
206        self
207    }
208
209    /// Attach a property. Returns `self` for chaining.
210    #[must_use]
211    pub fn with_prop(mut self, key: impl Into<String>, value: impl Into<Ipld>) -> Self {
212        self.props.insert(key.into(), value.into());
213        self
214    }
215
216    /// Attach opaque content.
217    #[must_use]
218    pub fn with_content(mut self, content: Bytes) -> Self {
219        self.content = Some(content);
220        self
221    }
222
223    /// Attach a learned-sparse embedding. Consumed by the sparse lane in
224    /// `Retriever` via `crate::index::sparse::SparseInvertedIndex`.
225    #[must_use]
226    pub fn with_sparse_embed(mut self, sparse_embed: SparseEmbed) -> Self {
227        self.sparse_embed = Some(sparse_embed);
228        self
229    }
230
231    /// Attach an LLM-generated contextualized-chunk prefix .
232    /// The render path prepends this to the summary so the agent sees
233    /// where this chunk sits in its source document.
234    ///
235    /// Typical callers run this at ingest time via a `TextGenerator`
236    /// from `mnem-llm-providers` with a prompt like:
237    ///
238    /// > "Give a single sentence that situates the following chunk
239    /// > within its source so a retrieval model can understand where
240    /// > it came from. Chunk: `{summary}` Document context: `{doc_title}`"
241    #[must_use]
242    pub fn with_context_sentence(mut self, context: impl Into<String>) -> Self {
243        self.context_sentence = Some(context.into());
244        self
245    }
246
247    // ---------------- Typed-property accessors ----------------
248    //
249    // Agent code usually wants `"name" -> "Alice"` as `&str`, not the
250    // raw `Option<&Ipld>` every call-site has to pattern-match. The
251    // helpers below provide the common scalar extractions without
252    // adding a new dependency or a bespoke value type.
253
254    /// Get a property as `&str`. Returns `None` if absent or not a string.
255    #[must_use]
256    pub fn get_str(&self, key: &str) -> Option<&str> {
257        match self.props.get(key)? {
258            Ipld::String(s) => Some(s.as_str()),
259            _ => None,
260        }
261    }
262
263    /// Get a property as `i128`. Returns `None` if absent or not an integer.
264    #[must_use]
265    pub fn get_int(&self, key: &str) -> Option<i128> {
266        match self.props.get(key)? {
267            Ipld::Integer(n) => Some(*n),
268            _ => None,
269        }
270    }
271
272    /// Get a property as `bool`. Returns `None` if absent or not a bool.
273    #[must_use]
274    pub fn get_bool(&self, key: &str) -> Option<bool> {
275        match self.props.get(key)? {
276            Ipld::Bool(b) => Some(*b),
277            _ => None,
278        }
279    }
280
281    /// Get a property as `f64`. Returns `None` if absent or not a float.
282    #[must_use]
283    pub fn get_float(&self, key: &str) -> Option<f64> {
284        match self.props.get(key)? {
285            Ipld::Float(f) => Some(*f),
286            _ => None,
287        }
288    }
289
290    /// Get a property as a byte slice. Returns `None` if absent or not bytes.
291    #[must_use]
292    pub fn get_bytes(&self, key: &str) -> Option<&[u8]> {
293        match self.props.get(key)? {
294            Ipld::Bytes(b) => Some(b.as_slice()),
295            _ => None,
296        }
297    }
298}
299
300// ---------------- Node serde (hand-rolled to enforce _kind) ----------------
301
302// The on-wire shape is a DAG-CBOR map with a `_kind` string field. We
303// serialize via `NodeWire` (an internal helper that is structurally
304// identical to Node plus the `_kind` field) and validate `_kind` on
305// deserialize. This keeps the public `Node` struct ergonomic while
306// enforcing the discriminator at the codec boundary.
307
308#[derive(Serialize, Deserialize)]
309struct NodeWire {
310    #[serde(rename = "_kind")]
311    kind: String,
312    id: NodeId,
313    ntype: String,
314    #[serde(default, skip_serializing_if = "Option::is_none")]
315    summary: Option<String>,
316    props: BTreeMap<String, Ipld>,
317    #[serde(default, skip_serializing_if = "Option::is_none")]
318    content: Option<Bytes>,
319    #[serde(default, skip_serializing_if = "Option::is_none")]
320    sparse_embed: Option<SparseEmbed>,
321    #[serde(default, skip_serializing_if = "Option::is_none")]
322    context_sentence: Option<String>,
323    // Forward-compat sink. Absorbs unknown keys including legacy `embed`
324    // maps written before dense vectors moved to the sidecar; the
325    // flatten round-trip keeps NodeCid byte-stable for those repos.
326    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
327    extra: BTreeMap<String, Ipld>,
328}
329
330impl Serialize for Node {
331    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
332        NodeWire {
333            kind: Self::KIND.into(),
334            id: self.id,
335            ntype: self.ntype.clone(),
336            summary: self.summary.clone(),
337            props: self.props.clone(),
338            content: self.content.clone(),
339            sparse_embed: self.sparse_embed.clone(),
340            context_sentence: self.context_sentence.clone(),
341            extra: self.extra.clone(),
342        }
343        .serialize(serializer)
344    }
345}
346
347impl<'de> Deserialize<'de> for Node {
348    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
349        let wire = NodeWire::deserialize(deserializer)?;
350        if wire.kind != Self::KIND {
351            return Err(serde::de::Error::custom(format!(
352                "expected _kind='{}', got '{}'",
353                Self::KIND,
354                wire.kind
355            )));
356        }
357        Ok(Self {
358            id: wire.id,
359            ntype: wire.ntype,
360            summary: wire.summary,
361            props: wire.props,
362            content: wire.content,
363            sparse_embed: wire.sparse_embed,
364            context_sentence: wire.context_sentence,
365            extra: wire.extra,
366        })
367    }
368}
369
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codec::{from_canonical_bytes, hash_to_cid, to_canonical_bytes};

    /// Build a `NodeId` whose 16 bytes all equal `fill`.
    fn id_filled(fill: u8) -> NodeId {
        NodeId::from_bytes_raw([fill; 16])
    }

    /// Build a `NodeWire` that carries only the mandatory fields.
    fn bare_wire(kind: &str, id: NodeId, ntype: &str) -> NodeWire {
        NodeWire {
            kind: kind.into(),
            id,
            ntype: ntype.into(),
            summary: None,
            props: BTreeMap::new(),
            content: None,
            sparse_embed: None,
            context_sentence: None,
            extra: BTreeMap::new(),
        }
    }

    /// Report whether `needle` occurs as a contiguous run in `haystack`.
    /// `needle` must be non-empty.
    fn wire_contains(haystack: &[u8], needle: &[u8]) -> bool {
        haystack
            .windows(needle.len())
            .any(|window| window == needle)
    }

    /// Fixture: a `Person` node with one string and one integer property.
    fn alice() -> Node {
        let person = Node::new(id_filled(1), "Person");
        person
            .with_prop("name", Ipld::String("Alice".into()))
            .with_prop("age", Ipld::Integer(30))
    }

    #[test]
    fn node_round_trip_byte_identity() {
        let node = alice();
        let encoded = to_canonical_bytes(&node).expect("encode");
        let round_tripped: Node = from_canonical_bytes(&encoded).expect("decode");
        assert_eq!(node, round_tripped);

        let re_encoded = to_canonical_bytes(&round_tripped).expect("re-encode");
        assert_eq!(encoded, re_encoded);
    }

    #[test]
    fn node_cid_is_deterministic() {
        let first = alice();
        let second = alice();
        let (_, first_cid) = hash_to_cid(&first).expect("hash");
        let (_, second_cid) = hash_to_cid(&second).expect("hash");
        assert_eq!(first_cid, second_cid);
    }

    #[test]
    fn new_default_uses_default_ntype() {
        let node = Node::new_default(id_filled(7));
        assert_eq!(node.ntype, Node::DEFAULT_NTYPE);
        assert_eq!(node.ntype, "Node");
    }

    #[test]
    fn new_default_and_explicit_new_match_when_ntype_equal() {
        // CID stability: `Node::new_default(id)` has to encode to the
        // same bytes as `Node::new(id, Node::DEFAULT_NTYPE)`.
        let id = id_filled(9);
        let via_default = Node::new_default(id);
        let via_explicit = Node::new(id, Node::DEFAULT_NTYPE);
        let (_, cid_default) = hash_to_cid(&via_default).expect("hash default");
        let (_, cid_explicit) = hash_to_cid(&via_explicit).expect("hash explicit");
        assert_eq!(cid_default, cid_explicit);
    }

    #[test]
    fn node_kind_rejection() {
        // A map whose `_kind` is "edge" must not decode as a Node.
        let foreign = bare_wire("edge", id_filled(1), "x");
        let encoded = serde_ipld_dagcbor::to_vec(&foreign).expect("encode wire");
        let err = serde_ipld_dagcbor::from_slice::<Node>(&encoded).unwrap_err();
        assert!(
            err.to_string().contains("_kind"),
            "expected _kind rejection, got: {err}"
        );
    }

    #[test]
    fn node_extra_fields_round_trip() {
        // A wire value carrying one key the current schema does not know.
        let mut wire = bare_wire("node", id_filled(2), "Future");
        wire.extra.insert(
            "x-future-field".into(),
            Ipld::String("value-from-v99".into()),
        );
        let bytes_in = serde_ipld_dagcbor::to_vec(&wire).expect("encode");

        // Decoding as Node routes the unknown key into `extra`.
        let node: Node = serde_ipld_dagcbor::from_slice(&bytes_in).expect("decode");
        assert_eq!(
            node.extra.get("x-future-field"),
            Some(&Ipld::String("value-from-v99".into())),
        );

        // Encoding the Node again must reproduce the input bytes.
        let bytes_out = to_canonical_bytes(&node).expect("re-encode");
        assert_eq!(bytes_in, bytes_out);
    }

    #[test]
    fn legacy_embed_field_round_trips_through_extra() {
        // Under the prior schema the Node map carried an explicit
        // `embed` sub-map. The field has since been removed, so the wire
        // decoder no longer recognises `embed` and the serde(flatten)
        // `extra` sink absorbs the key. Re-encoding emits it unchanged:
        // the bytes are identical and the NodeCid stays stable across
        // the reader transition.
        //
        // The legacy bytes are synthesised by encoding a separate wire
        // struct that still declares `embed`, so the test does not
        // depend on a baked binary fixture.
        #[derive(Serialize)]
        struct LegacyNodeWire {
            #[serde(rename = "_kind")]
            kind: String,
            id: NodeId,
            ntype: String,
            #[serde(skip_serializing_if = "Option::is_none")]
            summary: Option<String>,
            props: BTreeMap<String, Ipld>,
            #[serde(skip_serializing_if = "Option::is_none")]
            content: Option<Bytes>,
            embed: Embedding,
        }

        let embed = Embedding {
            model: "openai:text-embedding-3-small".into(),
            dtype: Dtype::F32,
            dim: 2,
            vector: Bytes::from(vec![
                0x00, 0x00, 0x80, 0x3f, // 1.0_f32 LE
                0x00, 0x00, 0x00, 0x00, // 0.0_f32 LE
            ]),
        };
        let legacy = LegacyNodeWire {
            kind: "node".into(),
            id: id_filled(42),
            ntype: "Doc".into(),
            summary: None,
            props: BTreeMap::new(),
            content: None,
            embed,
        };
        let bytes_in = serde_ipld_dagcbor::to_vec(&legacy).expect("encode legacy");

        // The current wire struct does not declare `embed`, so decoding
        // as Node must send it to the flatten sink.
        let node: Node = serde_ipld_dagcbor::from_slice(&bytes_in).expect("decode legacy");
        assert!(
            node.extra.contains_key("embed"),
            "legacy embed must land in extra"
        );

        // Encoding again yields the identical byte sequence.
        let bytes_out = to_canonical_bytes(&node).expect("re-encode");
        assert_eq!(bytes_in, bytes_out, "legacy bytes must round-trip exactly");

        // NodeCid is stable across the reader transition: hashing the
        // bytes re-encoded through the current reader path must give the
        // same Cid as hashing the legacy v0.1.0 bytes directly. This
        // equality is the load-bearing federated-dedup invariant.
        let (bytes_from_node, cid_from_node) = hash_to_cid(&node).expect("hash node");
        assert_eq!(
            bytes_in.as_slice(),
            bytes_from_node.as_ref(),
            "a future version re-encode must match legacy bytes byte-for-byte"
        );
        let legacy_digest = crate::id::Multihash::sha2_256(&bytes_in);
        let cid_via_legacy_bytes = crate::id::Cid::new(crate::id::CODEC_DAG_CBOR, legacy_digest);
        assert_eq!(
            cid_from_node, cid_via_legacy_bytes,
            "NodeCid via a future version reader must equal NodeCid via legacy bytes"
        );
    }

    #[test]
    fn node_round_trip_with_summary() {
        let summarized = Node::new(id_filled(3), "Person")
            .with_summary("Alice, 30, based in Berlin.")
            .with_prop("name", Ipld::String("Alice".into()));
        let encoded = to_canonical_bytes(&summarized).expect("encode");
        let round_tripped: Node = from_canonical_bytes(&encoded).expect("decode");
        assert_eq!(
            round_tripped.summary.as_deref(),
            Some("Alice, 30, based in Berlin.")
        );
        assert_eq!(summarized, round_tripped);

        // The summary is part of the hashed content, so dropping it from
        // an otherwise equal node has to change the CID.
        let unsummarized =
            Node::new(id_filled(3), "Person").with_prop("name", Ipld::String("Alice".into()));
        let (_, cid_with) = hash_to_cid(&summarized).expect("hash");
        let (_, cid_without) = hash_to_cid(&unsummarized).expect("hash");
        assert_ne!(cid_with, cid_without);
    }

    #[test]
    fn node_sparse_embed_round_trips() {
        let sparse =
            crate::sparse::SparseEmbed::new(vec![1, 5, 9], vec![0.5, 0.2, 0.1], "test-vocab")
                .unwrap();
        let node = Node::new(id_filled(6), "Doc").with_sparse_embed(sparse.clone());
        let encoded = to_canonical_bytes(&node).expect("encode");
        let round_tripped: Node = from_canonical_bytes(&encoded).expect("decode");
        assert_eq!(round_tripped.sparse_embed.as_ref(), Some(&sparse));

        // Encoding is deterministic: a second pass gives the same bytes.
        let re_encoded = to_canonical_bytes(&round_tripped).expect("re-encode");
        assert_eq!(encoded, re_encoded);
    }

    #[test]
    fn node_context_sentence_round_trips() {
        let cue = "This paragraph is from Section 3 of the 2024 lease.";
        let node = Node::new(id_filled(9), "Paragraph")
            .with_summary("The tenant shall maintain the premises...")
            .with_context_sentence(cue);
        let encoded = to_canonical_bytes(&node).expect("encode");
        let round_tripped: Node = from_canonical_bytes(&encoded).expect("decode");
        assert_eq!(round_tripped.context_sentence.as_deref(), Some(cue));

        let re_encoded = to_canonical_bytes(&round_tripped).expect("re-encode");
        assert_eq!(encoded, re_encoded);
    }

    #[test]
    fn node_context_sentence_absent_not_emitted() {
        // CID stability, as for sparse_embed: when context_sentence is
        // unset its key must be missing from the encoded bytes.
        let node = Node::new(id_filled(10), "Plain");
        let encoded = to_canonical_bytes(&node).expect("encode");
        assert!(
            !wire_contains(&encoded, b"context_sentence"),
            "absent context_sentence should not appear on the wire"
        );
    }

    #[test]
    fn node_context_sentence_participates_in_cid() {
        let plain = Node::new(id_filled(11), "P").with_summary("x");
        let cued = plain.clone().with_context_sentence("cue");
        let (_, plain_cid) = hash_to_cid(&plain).unwrap();
        let (_, cued_cid) = hash_to_cid(&cued).unwrap();
        assert_ne!(
            plain_cid, cued_cid,
            "context_sentence must participate in the CID"
        );
    }

    #[test]
    fn node_sparse_embed_absent_not_emitted() {
        // When sparse_embed is unset the key "sparse_embed" must be
        // missing from the encoded bytes. That is what keeps CIDs from
        // before the schema change stable for nodes that never populate
        // the field.
        let node = Node::new(id_filled(7), "Thing");
        let encoded = to_canonical_bytes(&node).expect("encode");
        assert!(
            !wire_contains(&encoded, b"sparse_embed"),
            "absent sparse_embed should not appear on the wire"
        );
    }

    #[test]
    fn node_sparse_embed_participates_in_cid() {
        // sparse_embed is content-hash-bearing: two nodes that differ
        // only in it must hash to different CIDs.
        let sparse = crate::sparse::SparseEmbed::new(vec![1], vec![1.0], "v").unwrap();
        let with_sparse = Node::new(id_filled(8), "Doc").with_sparse_embed(sparse);
        let without_sparse = Node::new(id_filled(8), "Doc");
        let (_, cid_with) = hash_to_cid(&with_sparse).unwrap();
        let (_, cid_without) = hash_to_cid(&without_sparse).unwrap();
        assert_ne!(cid_with, cid_without);
    }

    #[test]
    fn node_summary_absent_not_emitted() {
        // No `summary: null` entry may be written for an unset summary;
        // skip_serializing_if keeps the CID of pre-summary nodes stable.
        let node = Node::new(id_filled(4), "Thing");
        let encoded = to_canonical_bytes(&node).expect("encode");
        assert!(
            !wire_contains(&encoded, b"summary"),
            "absent summary should not appear on the wire"
        );
    }

    #[test]
    fn embedding_validate_ok_and_err() {
        // Four f32 elements need 16 bytes; only the buffer length varies.
        let four_f32 = |byte_len: usize| Embedding {
            model: "m".into(),
            dtype: Dtype::F32,
            dim: 4,
            vector: Bytes::from(vec![0u8; byte_len]),
        };

        four_f32(16).validate().unwrap();

        match four_f32(10).validate().unwrap_err() {
            ObjectError::EmbeddingSizeMismatch { expected, got } => {
                assert_eq!(expected, 16);
                assert_eq!(got, 10);
            }
            e => panic!("wrong variant: {e:?}"),
        }
    }
}