Skip to main content

mnem_core/objects/
commit.rs

1//! Commit object (SPEC §4.4).
2//!
3//! A commit is a versioned snapshot of the graph. It points at three
4//! Prolly-tree roots (nodes, edges, schema) and carries provenance
5//! metadata - author, agent, task, timestamp - plus an optional
6//! Ed25519 signature.
7
8use std::collections::BTreeMap;
9
10use bytes::Bytes;
11use ipld_core::ipld::Ipld;
12use serde::{Deserialize, Deserializer, Serialize, Serializer};
13
14use crate::id::{ChangeId, Cid};
15
/// Cryptographic signature on a [`Commit`] or [`crate::objects::Operation`].
///
/// Per SPEC §9.1, the signature is computed over the canonical DAG-CBOR
/// encoding of the containing object with the `signature` field absent.
///
/// This type models the wire shape only: the derives below add no
/// validation, so neither `algo` nor the byte lengths documented on the
/// fields are checked here. Verification helpers are planned for M12.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct Signature {
    /// Algorithm identifier. MUST be `"ed25519"` for mnem/0.1.
    pub algo: String,
    /// Signer's public key. 32 bytes for Ed25519.
    pub public_key: Bytes,
    /// Signature bytes. 64 bytes for Ed25519.
    pub sig: Bytes,
}
30
/// A versioned snapshot of the graph.
///
/// `Serialize` / `Deserialize` are implemented by hand further down in
/// this file (via a private wire struct) rather than derived, so that
/// the `_kind` discriminator can be emitted on encode and checked on
/// decode.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Commit {
    /// Stable change identity (survives rewrite / rebase / amend).
    pub change_id: ChangeId,
    /// Parent commits (empty = root, ≥2 = merge).
    pub parents: Vec<Cid>,
    /// Root of the node Prolly tree.
    pub nodes: Cid,
    /// Root of the edge Prolly tree.
    pub edges: Cid,
    /// Root of the schema Prolly tree.
    pub schema: Cid,
    /// Optional `DeltaSet` link (reserved; not emitted in mnem/0.1).
    /// Not part of the `content_cid` payload.
    pub delta: Option<Cid>,
    /// Optional secondary-index root ([`crate::objects::IndexSet`],
    /// SPEC §4.8). Agents that only need Prolly-lookup by stable id
    /// can ignore this; query paths (label / property / adjacency)
    /// use it when present. Unlike the sidecar roots below, this root
    /// IS part of the `content_cid` payload when present.
    pub indexes: Option<Cid>,
    /// Optional embedding-sidecar Prolly root. Tree keyed by 32-byte
    /// `NodeCid` digest, value = [`crate::objects::EmbeddingBucket`].
    /// Lifts dense embedding vectors out of `Node` canonical bytes so
    /// the Node CID stays byte-stable across ORT thread counts (f32
    /// reduction ordering is non-deterministic; vectors drift by the
    /// LSB across thread counts). `None` on commits that carry no
    /// embed-bearing nodes.
    ///
    /// **Intentionally excluded from `content_cid`.** Content CID is
    /// the deterministic "what graph is this" digest; including the
    /// embeddings root would re-couple it to ORT thread count and
    /// undo the determinism guarantee. Two machines re-deriving the
    /// same source text on different cores produce the same
    /// `content_cid`, just with per-machine drift in
    /// `commit.embeddings`.
    pub embeddings: Option<Cid>,
    /// Optional sparse-embedding sidecar Prolly root. Tree keyed by
    /// 16-byte truncated blake3 of `NodeCid` wire form, value =
    /// [`crate::objects::SparseBucket`]. Lifts learned-sparse
    /// embedding vectors out of `Node` canonical bytes so the Node CID
    /// stays byte-stable across encoder versions and vocabulary updates.
    /// `None` on commits that carry no sparse-embedding-bearing nodes.
    ///
    /// **Intentionally excluded from `content_cid`.** Same invariant as
    /// `embeddings`: the `content_cid` must not be coupled to derived
    /// bytes. Two machines indexing the same source text with different
    /// sparse encoder versions share the same `content_cid` even when
    /// their sparse sidecar CIDs differ.
    pub sparse: Option<Cid>,
    /// Free-form author identifier.
    pub author: String,
    /// AI agent identifier (when the commit was machine-generated).
    pub agent_id: Option<String>,
    /// Task / tool-call identifier for provenance.
    pub task_id: Option<String>,
    /// Microseconds since Unix epoch.
    pub time: u64,
    /// UTF-8 commit message. May be empty.
    pub message: String,
    /// Optional cryptographic signature.
    pub signature: Option<Signature>,
    /// Forward-compat extension map (SPEC §3.2). On the wire these
    /// entries are flattened into the commit's top-level map rather
    /// than nested under an `extra` key.
    pub extra: BTreeMap<String, Ipld>,
}
95
96impl Commit {
97    /// The `_kind` discriminator on the wire.
98    pub const KIND: &'static str = "commit";
99
100    /// Build a commit with the required fields, empty optionals / parents / extras.
101    #[must_use]
102    pub fn new(
103        change_id: ChangeId,
104        nodes: Cid,
105        edges: Cid,
106        schema: Cid,
107        author: impl Into<String>,
108        time: u64,
109        message: impl Into<String>,
110    ) -> Self {
111        Self {
112            change_id,
113            parents: Vec::new(),
114            nodes,
115            edges,
116            schema,
117            delta: None,
118            indexes: None,
119            embeddings: None,
120            sparse: None,
121            author: author.into(),
122            agent_id: None,
123            task_id: None,
124            time,
125            message: message.into(),
126            signature: None,
127            extra: BTreeMap::new(),
128        }
129    }
130
131    /// Append a parent commit. Returns `self` for chaining.
132    #[must_use]
133    pub fn with_parent(mut self, parent: Cid) -> Self {
134        self.parents.push(parent);
135        self
136    }
137
138    /// Attach an agent identifier.
139    #[must_use]
140    pub fn with_agent(mut self, agent_id: impl Into<String>) -> Self {
141        self.agent_id = Some(agent_id.into());
142        self
143    }
144
145    /// Attach a task identifier.
146    #[must_use]
147    pub fn with_task(mut self, task_id: impl Into<String>) -> Self {
148        self.task_id = Some(task_id.into());
149        self
150    }
151
152    /// A deterministic CID over only the data-DAG portion of the commit --
153    /// the three Prolly tree roots (nodes, edges, schema) and the optional
154    /// indexes root. Excludes time, change_id, author, message, agent_id,
155    /// task_id, signature, extra, and **parents**.
156    ///
157    /// Two ingest runs against byte-identical input on different machines
158    /// (or at different times) MUST produce the same `content_cid`. The
159    /// standard `commit_cid` continues to embed wall-clock + UUIDv7
160    /// metadata for audit-trail purposes; that CID is intentionally
161    /// time-varying.
162    ///
163    /// Parents are excluded because they are `commit_cid`s of ancestor
164    /// commits, which embed timestamps and are therefore time-varying.
165    /// Including them would make `content_cid` non-deterministic across
166    /// independent repos even when the graph data is identical.
167    ///
168    /// # Errors
169    /// Propagates encoding failures from
170    /// [`crate::codec::dagcbor::hash_to_cid`].
171    ///
172    /// # Migration note
173    /// Wire format is unchanged: `content_cid` is computed from
174    /// existing fields, so older blockstores stay readable. A
175    /// follow-up may persist `content_cid` alongside `commit_cid` in
176    /// the operation log for cheap lookup.
177    /// schema_version 2: parents removed from payload (audit fix P0-1).
178    pub fn content_cid(&self) -> Result<Cid, crate::error::CodecError> {
179        let payload = ContentCidPayload {
180            schema_version: 2,
181            nodes: self.nodes.clone(),
182            edges: self.edges.clone(),
183            schema: self.schema.clone(),
184            indexes: self.indexes.clone(),
185        };
186        let (_bytes, cid) = crate::codec::dagcbor::hash_to_cid(&payload)?;
187        Ok(cid)
188    }
189}
190
191/// Stable wire shape for `Commit::content_cid()`. The struct is
192/// intentionally NOT exposed publicly: `content_cid` is purely a
193/// derived value, and the on-disk Commit format does not change.
194/// Schema version 2 (post audit-2026-04-25 P0-1): parents removed so
195/// that content_cid is deterministic across independent repos with the
196/// same graph data. Any future layout change MUST bump `schema_version`
197/// so that two versions of the codebase agree on whether they compare equal.
198#[derive(Serialize)]
199struct ContentCidPayload {
200    schema_version: u8,
201    nodes: Cid,
202    edges: Cid,
203    schema: Cid,
204    #[serde(skip_serializing_if = "Option::is_none")]
205    indexes: Option<Cid>,
206}
207
208// ---------------- Serde ----------------
209
/// Private serde mirror of [`Commit`], used by its hand-written
/// `Serialize` / `Deserialize` impls.
///
/// Differences from `Commit`:
/// - carries the `_kind` discriminator as an explicit field;
/// - every `Option` field is omitted on encode when `None` and defaults
///   to `None` when the key is missing on decode;
/// - `extra` is flattened into the top-level map, and is skipped on
///   encode when empty.
#[derive(Serialize, Deserialize)]
struct CommitWire {
    /// Object-type tag; serialized under the key `_kind`.
    #[serde(rename = "_kind")]
    kind: String,
    change_id: ChangeId,
    parents: Vec<Cid>,
    nodes: Cid,
    edges: Cid,
    schema: Cid,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    delta: Option<Cid>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    indexes: Option<Cid>,
    /// `skip_serializing_if` keeps absence-on-encode so commits without
    /// an embedding sidecar round-trip byte-identically.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    embeddings: Option<Cid>,
    /// `skip_serializing_if` keeps absence-on-encode so commits without
    /// a sparse sidecar round-trip byte-identically.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    sparse: Option<Cid>,
    author: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    agent_id: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    task_id: Option<String>,
    time: u64,
    message: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    signature: Option<Signature>,
    /// Forward-compat extension entries, flattened into the top-level
    /// map rather than nested under an `extra` key.
    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
    extra: BTreeMap<String, Ipld>,
}
243
244impl Serialize for Commit {
245    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
246        CommitWire {
247            kind: Self::KIND.into(),
248            change_id: self.change_id,
249            parents: self.parents.clone(),
250            nodes: self.nodes.clone(),
251            edges: self.edges.clone(),
252            schema: self.schema.clone(),
253            delta: self.delta.clone(),
254            indexes: self.indexes.clone(),
255            embeddings: self.embeddings.clone(),
256            sparse: self.sparse.clone(),
257            author: self.author.clone(),
258            agent_id: self.agent_id.clone(),
259            task_id: self.task_id.clone(),
260            time: self.time,
261            message: self.message.clone(),
262            signature: self.signature.clone(),
263            extra: self.extra.clone(),
264        }
265        .serialize(serializer)
266    }
267}
268
269impl<'de> Deserialize<'de> for Commit {
270    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
271        let w = CommitWire::deserialize(deserializer)?;
272        if w.kind != Self::KIND {
273            return Err(serde::de::Error::custom(format!(
274                "expected _kind='{}', got '{}'",
275                Self::KIND,
276                w.kind
277            )));
278        }
279        Ok(Self {
280            change_id: w.change_id,
281            parents: w.parents,
282            nodes: w.nodes,
283            edges: w.edges,
284            schema: w.schema,
285            delta: w.delta,
286            indexes: w.indexes,
287            embeddings: w.embeddings,
288            sparse: w.sparse,
289            author: w.author,
290            agent_id: w.agent_id,
291            task_id: w.task_id,
292            time: w.time,
293            message: w.message,
294            signature: w.signature,
295            extra: w.extra,
296        })
297    }
298}
299
300#[cfg(test)]
301mod tests {
302    use super::*;
303    use crate::codec::{from_canonical_bytes, to_canonical_bytes};
304    use crate::id::{CODEC_RAW, Multihash};
305
306    fn raw(n: u32) -> Cid {
307        Cid::new(CODEC_RAW, Multihash::sha2_256(&n.to_be_bytes()))
308    }
309
310    fn sample() -> Commit {
311        Commit::new(
312            ChangeId::from_bytes_raw([1u8; 16]),
313            raw(1),
314            raw(2),
315            raw(3),
316            "alice@example.org",
317            1_700_000_000_000_000,
318            "init",
319        )
320        .with_agent("agent:claude")
321        .with_task("task:001")
322    }
323
324    #[test]
325    fn commit_round_trip_byte_identity() {
326        let original = sample();
327        let bytes = to_canonical_bytes(&original).unwrap();
328        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
329        assert_eq!(original, decoded);
330        let bytes2 = to_canonical_bytes(&decoded).unwrap();
331        assert_eq!(bytes, bytes2);
332    }
333
334    /// two commits with byte-identical data
335    /// roots but different timestamps, change_ids, authors, and
336    /// messages MUST share `content_cid` while their `commit_cid`
337    /// differs.
338    #[test]
339    fn content_cid_is_stable_across_metadata() {
340        let mut a = Commit::new(
341            ChangeId::from_bytes_raw([1u8; 16]),
342            raw(10),
343            raw(20),
344            raw(30),
345            "alice@example.org",
346            1_700_000_000_000_000,
347            "init",
348        );
349        a.indexes = Some(raw(40));
350
351        let mut b = Commit::new(
352            // Different change_id (UUIDv7 typically embeds a timestamp).
353            ChangeId::from_bytes_raw([2u8; 16]),
354            // SAME data roots:
355            raw(10),
356            raw(20),
357            raw(30),
358            // Different author + time + message:
359            "bob@example.org",
360            1_777_000_000_000_000,
361            "different message entirely",
362        );
363        b.indexes = Some(raw(40));
364
365        assert_eq!(
366            a.content_cid().unwrap(),
367            b.content_cid().unwrap(),
368            "content_cid must ignore metadata (time, change_id, author, message)"
369        );
370
371        let (a_bytes, a_commit_cid) = crate::codec::dagcbor::hash_to_cid(&a).unwrap();
372        let (b_bytes, b_commit_cid) = crate::codec::dagcbor::hash_to_cid(&b).unwrap();
373        let _ = (a_bytes, b_bytes);
374        assert_ne!(
375            a_commit_cid, b_commit_cid,
376            "commit_cid SHOULD differ when metadata differs (audit-trail invariant)"
377        );
378    }
379
380    /// content_cid MUST change when any data root changes.
381    #[test]
382    fn content_cid_distinguishes_data_roots() {
383        let a = Commit::new(
384            ChangeId::from_bytes_raw([1u8; 16]),
385            raw(10),
386            raw(20),
387            raw(30),
388            "alice",
389            1,
390            "msg",
391        );
392        let b = Commit::new(
393            ChangeId::from_bytes_raw([1u8; 16]),
394            raw(11), // different nodes root
395            raw(20),
396            raw(30),
397            "alice",
398            1,
399            "msg",
400        );
401        assert_ne!(a.content_cid().unwrap(), b.content_cid().unwrap());
402    }
403
404    /// Load-bearing invariant: two commits with byte-identical data
405    /// roots but DIFFERENT `embeddings` sidecar Cids MUST share
406    /// `content_cid`. If this fails, a future change re-coupled
407    /// `ContentCidPayload` to the embedding sidecar - exactly the
408    /// architectural error this design exists to prevent. Federated
409    /// dedup (two machines indexing the same source produce the same
410    /// content_cid) would silently break.
411    #[test]
412    fn content_cid_ignores_embeddings_field() {
413        let mut a = sample();
414        a.embeddings = Some(raw(100));
415        let mut b = sample();
416        b.embeddings = Some(raw(200)); // different embedding sidecar
417        assert_eq!(
418            a.content_cid().unwrap(),
419            b.content_cid().unwrap(),
420            "content_cid MUST ignore the embeddings sidecar - that is the G16 contract"
421        );
422
423        // Also: a commit with `embeddings = None` and a commit with
424        // `embeddings = Some(_)` must share the same content_cid when
425        // every other data root matches.
426        let mut c = sample();
427        c.embeddings = None;
428        let mut d = sample();
429        d.embeddings = Some(raw(300));
430        assert_eq!(
431            c.content_cid().unwrap(),
432            d.content_cid().unwrap(),
433            "absence of embeddings must not change content_cid either"
434        );
435    }
436
437    /// Load-bearing invariant: two commits with byte-identical data
438    /// roots but DIFFERENT `sparse` sidecar Cids MUST share
439    /// `content_cid`. If this fails, a future change re-coupled
440    /// `ContentCidPayload` to the sparse sidecar - exactly the
441    /// architectural error this design exists to prevent.
442    #[test]
443    fn content_cid_ignores_sparse_field() {
444        let mut a = sample();
445        a.sparse = Some(raw(100));
446        let mut b = sample();
447        b.sparse = Some(raw(200)); // different sparse sidecar
448        assert_eq!(
449            a.content_cid().unwrap(),
450            b.content_cid().unwrap(),
451            "content_cid MUST ignore the sparse sidecar - that is the G17 contract"
452        );
453
454        // Also: a commit with `sparse = None` and a commit with
455        // `sparse = Some(_)` must share the same content_cid when
456        // every other data root matches.
457        let mut c = sample();
458        c.sparse = None;
459        let mut d = sample();
460        d.sparse = Some(raw(300));
461        assert_eq!(
462            c.content_cid().unwrap(),
463            d.content_cid().unwrap(),
464            "absence of sparse must not change content_cid either"
465        );
466    }
467
468    /// `Commit.sparse: Some(cid)` survives encode to decode to
469    /// re-encode byte-identically. Pins the wire-form contract for
470    /// the new G17 field.
471    #[test]
472    fn commit_with_sparse_some_round_trips() {
473        let mut original = sample();
474        original.sparse = Some(raw(42));
475        let bytes = to_canonical_bytes(&original).unwrap();
476        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
477        assert_eq!(decoded.sparse, Some(raw(42)));
478        let bytes2 = to_canonical_bytes(&decoded).unwrap();
479        assert_eq!(
480            bytes, bytes2,
481            "round-trip must be byte-identical - wire form is contract-bound"
482        );
483    }
484
485    /// Backwards-compat: a CBOR commit written without the
486    /// `sparse` key must decode cleanly with `sparse = None`
487    /// and re-encode byte-identically.
488    #[test]
489    fn commit_legacy_no_sparse_key_round_trips() {
490        let original = sample();
491        assert_eq!(original.sparse, None);
492        let bytes = to_canonical_bytes(&original).unwrap();
493
494        // Verify the wire form does NOT contain the `sparse` key.
495        assert!(
496            !bytes.windows(b"sparse".len()).any(|w| w == b"sparse"),
497            "wire form must omit the `sparse` key when None"
498        );
499
500        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
501        assert_eq!(decoded.sparse, None);
502
503        let bytes2 = to_canonical_bytes(&decoded).unwrap();
504        assert_eq!(bytes, bytes2, "legacy CBOR must re-encode byte-identically");
505    }
506
507    /// `Commit.embeddings: Some(cid)` survives encode to decode to
508    /// re-encode byte-identically. Pins the wire-form contract for
509    /// the new G16 field.
510    #[test]
511    fn commit_with_embeddings_some_round_trips() {
512        let mut original = sample();
513        original.embeddings = Some(raw(42));
514        let bytes = to_canonical_bytes(&original).unwrap();
515        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
516        assert_eq!(original, decoded);
517        assert_eq!(decoded.embeddings, Some(raw(42)));
518        let bytes2 = to_canonical_bytes(&decoded).unwrap();
519        assert_eq!(
520            bytes, bytes2,
521            "round-trip must be byte-identical - wire form is contract-bound"
522        );
523    }
524
525    /// Backwards-compat: a CBOR commit written without the
526    /// `embeddings` key must decode cleanly with `embeddings = None`
527    /// and re-encode byte-identically. The wire emitter omits the
528    /// key when `None`, so legacy bytes round-trip.
529    #[test]
530    fn commit_legacy_no_embeddings_key_round_trips() {
531        // Construct a commit with `embeddings = None` (wire form
532        // omits the key entirely under `skip_serializing_if`).
533        let original = sample();
534        assert_eq!(original.embeddings, None);
535        let bytes = to_canonical_bytes(&original).unwrap();
536
537        // Verify the wire form does NOT contain the `embeddings` key.
538        // The literal byte string "embeddings" cannot appear by chance
539        // in a Cid digest, so this is a robust negative probe.
540        assert!(
541            !bytes
542                .windows(b"embeddings".len())
543                .any(|w| w == b"embeddings"),
544            "wire form must omit the `embeddings` key when None"
545        );
546
547        // Decode back; field defaults to `None`.
548        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
549        assert_eq!(decoded.embeddings, None);
550        assert_eq!(decoded, original);
551
552        // Re-encode; bytes must match exactly.
553        let bytes2 = to_canonical_bytes(&decoded).unwrap();
554        assert_eq!(bytes, bytes2, "legacy CBOR must re-encode byte-identically");
555    }
556
557    #[test]
558    fn commit_kind_rejection() {
559        let wire = CommitWire {
560            kind: "node".into(),
561            change_id: ChangeId::from_bytes_raw([1u8; 16]),
562            parents: vec![],
563            nodes: raw(1),
564            edges: raw(2),
565            schema: raw(3),
566            delta: None,
567            indexes: None,
568            embeddings: None,
569            sparse: None,
570            author: "x".into(),
571            agent_id: None,
572            task_id: None,
573            time: 0,
574            message: String::new(),
575            signature: None,
576            extra: BTreeMap::new(),
577        };
578        let bytes = serde_ipld_dagcbor::to_vec(&wire).unwrap();
579        let err = serde_ipld_dagcbor::from_slice::<Commit>(&bytes).unwrap_err();
580        assert!(err.to_string().contains("_kind"));
581    }
582
583    #[test]
584    fn commit_with_parents_round_trip() {
585        let c = sample().with_parent(raw(100)).with_parent(raw(101));
586        let bytes = to_canonical_bytes(&c).unwrap();
587        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
588        assert_eq!(c, decoded);
589        assert_eq!(decoded.parents.len(), 2);
590    }
591
592    #[test]
593    fn commit_with_signature_round_trip() {
594        let mut c = sample();
595        c.signature = Some(Signature {
596            algo: "ed25519".into(),
597            public_key: Bytes::from(vec![0xAAu8; 32]),
598            sig: Bytes::from(vec![0xBBu8; 64]),
599        });
600        let bytes = to_canonical_bytes(&c).unwrap();
601        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
602        assert_eq!(c, decoded);
603        assert_eq!(decoded.signature.as_ref().unwrap().algo, "ed25519");
604    }
605
606    #[test]
607    fn commit_extra_fields_preserved() {
608        let mut c = sample();
609        c.extra
610            .insert("x-future-field".into(), Ipld::String("v9".into()));
611        let bytes = to_canonical_bytes(&c).unwrap();
612        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
613        assert_eq!(c, decoded);
614        let bytes2 = to_canonical_bytes(&decoded).unwrap();
615        assert_eq!(bytes, bytes2);
616    }
617}