// mnem_core/objects/commit.rs

//! Commit object (SPEC §4.4).
//!
//! A commit is a versioned snapshot of the graph. It points at three
//! Prolly-tree roots (nodes, edges, schema) and carries provenance
//! metadata - author, agent, task, timestamp - plus an optional
//! Ed25519 signature.
7
8use std::collections::BTreeMap;
9
10use bytes::Bytes;
11use ipld_core::ipld::Ipld;
12use serde::{Deserialize, Deserializer, Serialize, Serializer};
13
14use crate::id::{ChangeId, Cid};
15
16/// Cryptographic signature on a [`Commit`] or [`crate::objects::Operation`].
17///
18/// Per SPEC §9.1, the signature is computed over the canonical DAG-CBOR
19/// encoding of the containing object with the `signature` field absent.
20/// M12 will add verification helpers; for now we model the shape only.
21#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
22pub struct Signature {
23    /// Algorithm identifier. MUST be `"ed25519"` for mnem/0.1.
24    pub algo: String,
25    /// Signer's public key. 32 bytes for Ed25519.
26    pub public_key: Bytes,
27    /// Signature bytes. 64 bytes for Ed25519.
28    pub sig: Bytes,
29}
30
31/// A versioned snapshot of the graph.
32#[derive(Clone, Debug, PartialEq, Eq)]
33pub struct Commit {
34    /// Stable change identity (survives rewrite / rebase / amend).
35    pub change_id: ChangeId,
36    /// Parent commits (empty = root, ≥2 = merge).
37    pub parents: Vec<Cid>,
38    /// Root of the node Prolly tree.
39    pub nodes: Cid,
40    /// Root of the edge Prolly tree.
41    pub edges: Cid,
42    /// Root of the schema Prolly tree.
43    pub schema: Cid,
44    /// Optional `DeltaSet` link (reserved; not emitted in mnem/0.1).
45    pub delta: Option<Cid>,
46    /// Optional secondary-index root ([`crate::objects::IndexSet`],
47    /// SPEC §4.8). Agents that only need Prolly-lookup by stable id
48    /// can ignore this; query paths (label / property / adjacency)
49    /// use it when present.
50    pub indexes: Option<Cid>,
51    /// Optional embedding-sidecar Prolly root. Tree keyed by 32-byte
52    /// `NodeCid` digest, value = [`crate::objects::EmbeddingBucket`].
53    /// Lifts dense embedding vectors out of `Node` canonical bytes so
54    /// the Node CID stays byte-stable across ORT thread counts (f32
55    /// reduction ordering is non-deterministic; vectors drift by the
56    /// LSB across thread counts). `None` on commits that carry no
57    /// embed-bearing nodes.
58    ///
59    /// **Intentionally excluded from `content_cid`.** Content CID is
60    /// the deterministic "what graph is this" digest; including the
61    /// embeddings root would re-couple it to ORT thread count and
62    /// undo the determinism guarantee. Two machines re-deriving the
63    /// same source text on different cores produce the same
64    /// `content_cid`, just with per-machine drift in
65    /// `commit.embeddings`.
66    pub embeddings: Option<Cid>,
67    /// Free-form author identifier.
68    pub author: String,
69    /// AI agent identifier (when the commit was machine-generated).
70    pub agent_id: Option<String>,
71    /// Task / tool-call identifier for provenance.
72    pub task_id: Option<String>,
73    /// Microseconds since Unix epoch.
74    pub time: u64,
75    /// UTF-8 commit message. May be empty.
76    pub message: String,
77    /// Optional cryptographic signature.
78    pub signature: Option<Signature>,
79    /// Forward-compat extension map (SPEC §3.2).
80    pub extra: BTreeMap<String, Ipld>,
81}
82
83impl Commit {
84    /// The `_kind` discriminator on the wire.
85    pub const KIND: &'static str = "commit";
86
87    /// Build a commit with the required fields, empty optionals / parents / extras.
88    #[must_use]
89    pub fn new(
90        change_id: ChangeId,
91        nodes: Cid,
92        edges: Cid,
93        schema: Cid,
94        author: impl Into<String>,
95        time: u64,
96        message: impl Into<String>,
97    ) -> Self {
98        Self {
99            change_id,
100            parents: Vec::new(),
101            nodes,
102            edges,
103            schema,
104            delta: None,
105            indexes: None,
106            embeddings: None,
107            author: author.into(),
108            agent_id: None,
109            task_id: None,
110            time,
111            message: message.into(),
112            signature: None,
113            extra: BTreeMap::new(),
114        }
115    }
116
117    /// Append a parent commit. Returns `self` for chaining.
118    #[must_use]
119    pub fn with_parent(mut self, parent: Cid) -> Self {
120        self.parents.push(parent);
121        self
122    }
123
124    /// Attach an agent identifier.
125    #[must_use]
126    pub fn with_agent(mut self, agent_id: impl Into<String>) -> Self {
127        self.agent_id = Some(agent_id.into());
128        self
129    }
130
131    /// Attach a task identifier.
132    #[must_use]
133    pub fn with_task(mut self, task_id: impl Into<String>) -> Self {
134        self.task_id = Some(task_id.into());
135        self
136    }
137
138    /// A deterministic CID over only the data-DAG portion of the commit --
139    /// the three Prolly tree roots (nodes, edges, schema) and the optional
140    /// indexes root. Excludes time, change_id, author, message, agent_id,
141    /// task_id, signature, extra, and **parents**.
142    ///
143    /// Two ingest runs against byte-identical input on different machines
144    /// (or at different times) MUST produce the same `content_cid`. The
145    /// standard `commit_cid` continues to embed wall-clock + UUIDv7
146    /// metadata for audit-trail purposes; that CID is intentionally
147    /// time-varying.
148    ///
149    /// Parents are excluded because they are `commit_cid`s of ancestor
150    /// commits, which embed timestamps and are therefore time-varying.
151    /// Including them would make `content_cid` non-deterministic across
152    /// independent repos even when the graph data is identical.
153    ///
154    /// # Errors
155    /// Propagates encoding failures from
156    /// [`crate::codec::dagcbor::hash_to_cid`].
157    ///
158    /// # Migration note
159    /// Wire format is unchanged: `content_cid` is computed from
160    /// existing fields, so older blockstores stay readable. A
161    /// follow-up may persist `content_cid` alongside `commit_cid` in
162    /// the operation log for cheap lookup.
163    /// schema_version 2: parents removed from payload (audit fix P0-1).
164    pub fn content_cid(&self) -> Result<Cid, crate::error::CodecError> {
165        let payload = ContentCidPayload {
166            schema_version: 2,
167            nodes: self.nodes.clone(),
168            edges: self.edges.clone(),
169            schema: self.schema.clone(),
170            indexes: self.indexes.clone(),
171        };
172        let (_bytes, cid) = crate::codec::dagcbor::hash_to_cid(&payload)?;
173        Ok(cid)
174    }
175}
176
177/// Stable wire shape for `Commit::content_cid()`. The struct is
178/// intentionally NOT exposed publicly: `content_cid` is purely a
179/// derived value, and the on-disk Commit format does not change.
180/// Schema version 2 (post audit-2026-04-25 P0-1): parents removed so
181/// that content_cid is deterministic across independent repos with the
182/// same graph data. Any future layout change MUST bump `schema_version`
183/// so that two versions of the codebase agree on whether they compare equal.
184#[derive(Serialize)]
185struct ContentCidPayload {
186    schema_version: u8,
187    nodes: Cid,
188    edges: Cid,
189    schema: Cid,
190    #[serde(skip_serializing_if = "Option::is_none")]
191    indexes: Option<Cid>,
192}
193
194// ---------------- Serde ----------------
195
196#[derive(Serialize, Deserialize)]
197struct CommitWire {
198    #[serde(rename = "_kind")]
199    kind: String,
200    change_id: ChangeId,
201    parents: Vec<Cid>,
202    nodes: Cid,
203    edges: Cid,
204    schema: Cid,
205    #[serde(default, skip_serializing_if = "Option::is_none")]
206    delta: Option<Cid>,
207    #[serde(default, skip_serializing_if = "Option::is_none")]
208    indexes: Option<Cid>,
209    /// `skip_serializing_if` keeps absence-on-encode so commits without
210    /// an embedding sidecar round-trip byte-identically.
211    #[serde(default, skip_serializing_if = "Option::is_none")]
212    embeddings: Option<Cid>,
213    author: String,
214    #[serde(default, skip_serializing_if = "Option::is_none")]
215    agent_id: Option<String>,
216    #[serde(default, skip_serializing_if = "Option::is_none")]
217    task_id: Option<String>,
218    time: u64,
219    message: String,
220    #[serde(default, skip_serializing_if = "Option::is_none")]
221    signature: Option<Signature>,
222    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
223    extra: BTreeMap<String, Ipld>,
224}
225
226impl Serialize for Commit {
227    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
228        CommitWire {
229            kind: Self::KIND.into(),
230            change_id: self.change_id,
231            parents: self.parents.clone(),
232            nodes: self.nodes.clone(),
233            edges: self.edges.clone(),
234            schema: self.schema.clone(),
235            delta: self.delta.clone(),
236            indexes: self.indexes.clone(),
237            embeddings: self.embeddings.clone(),
238            author: self.author.clone(),
239            agent_id: self.agent_id.clone(),
240            task_id: self.task_id.clone(),
241            time: self.time,
242            message: self.message.clone(),
243            signature: self.signature.clone(),
244            extra: self.extra.clone(),
245        }
246        .serialize(serializer)
247    }
248}
249
250impl<'de> Deserialize<'de> for Commit {
251    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
252        let w = CommitWire::deserialize(deserializer)?;
253        if w.kind != Self::KIND {
254            return Err(serde::de::Error::custom(format!(
255                "expected _kind='{}', got '{}'",
256                Self::KIND,
257                w.kind
258            )));
259        }
260        Ok(Self {
261            change_id: w.change_id,
262            parents: w.parents,
263            nodes: w.nodes,
264            edges: w.edges,
265            schema: w.schema,
266            delta: w.delta,
267            indexes: w.indexes,
268            embeddings: w.embeddings,
269            author: w.author,
270            agent_id: w.agent_id,
271            task_id: w.task_id,
272            time: w.time,
273            message: w.message,
274            signature: w.signature,
275            extra: w.extra,
276        })
277    }
278}
279
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codec::{from_canonical_bytes, to_canonical_bytes};
    use crate::id::{CODEC_RAW, Multihash};

    /// Distinct, deterministic raw-codec CID derived from `n`.
    fn raw(n: u32) -> Cid {
        Cid::new(CODEC_RAW, Multihash::sha2_256(&n.to_be_bytes()))
    }

    /// Baseline commit used by most tests: no parents, no optional
    /// roots, agent and task identifiers set.
    fn sample() -> Commit {
        Commit::new(
            ChangeId::from_bytes_raw([1u8; 16]),
            raw(1),
            raw(2),
            raw(3),
            "alice@example.org",
            1_700_000_000_000_000,
            "init",
        )
        .with_agent("agent:claude")
        .with_task("task:001")
    }

    /// Encode -> decode -> re-encode must reproduce both the value and
    /// the exact bytes.
    #[test]
    fn commit_round_trip_byte_identity() {
        let original = sample();
        let bytes = to_canonical_bytes(&original).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(original, decoded);
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(bytes, bytes2);
    }

    /// Two commits with byte-identical data roots but different
    /// timestamps, change_ids, authors, and messages MUST share
    /// `content_cid` while their `commit_cid` differs.
    #[test]
    fn content_cid_is_stable_across_metadata() {
        let mut a = Commit::new(
            ChangeId::from_bytes_raw([1u8; 16]),
            raw(10),
            raw(20),
            raw(30),
            "alice@example.org",
            1_700_000_000_000_000,
            "init",
        );
        a.indexes = Some(raw(40));

        let mut b = Commit::new(
            // Different change_id (UUIDv7 typically embeds a timestamp).
            ChangeId::from_bytes_raw([2u8; 16]),
            // SAME data roots:
            raw(10),
            raw(20),
            raw(30),
            // Different author + time + message:
            "bob@example.org",
            1_777_000_000_000_000,
            "different message entirely",
        );
        b.indexes = Some(raw(40));

        assert_eq!(
            a.content_cid().unwrap(),
            b.content_cid().unwrap(),
            "content_cid must ignore metadata (time, change_id, author, message)"
        );

        // Only the CIDs matter here; the encoded bytes are discarded.
        let (_, a_commit_cid) = crate::codec::dagcbor::hash_to_cid(&a).unwrap();
        let (_, b_commit_cid) = crate::codec::dagcbor::hash_to_cid(&b).unwrap();
        assert_ne!(
            a_commit_cid, b_commit_cid,
            "commit_cid SHOULD differ when metadata differs (audit-trail invariant)"
        );
    }

    /// content_cid MUST change when any data root changes. Every field
    /// that feeds the digest (nodes, edges, schema, indexes) is varied
    /// in turn, including dropping `indexes` to `None`.
    #[test]
    fn content_cid_distinguishes_data_roots() {
        let mut base = Commit::new(
            ChangeId::from_bytes_raw([1u8; 16]),
            raw(10),
            raw(20),
            raw(30),
            "alice",
            1,
            "msg",
        );
        base.indexes = Some(raw(40));
        let base_cid = base.content_cid().unwrap();

        let mut nodes_changed = base.clone();
        nodes_changed.nodes = raw(11);
        let mut edges_changed = base.clone();
        edges_changed.edges = raw(21);
        let mut schema_changed = base.clone();
        schema_changed.schema = raw(31);
        let mut indexes_changed = base.clone();
        indexes_changed.indexes = Some(raw(41));
        let mut indexes_removed = base.clone();
        indexes_removed.indexes = None;

        let variants = [
            ("nodes root", nodes_changed),
            ("edges root", edges_changed),
            ("schema root", schema_changed),
            ("indexes root", indexes_changed),
            ("indexes presence", indexes_removed),
        ];
        for (what, variant) in &variants {
            assert_ne!(
                base_cid,
                variant.content_cid().unwrap(),
                "content_cid must change when the {} changes",
                what
            );
        }
    }

    /// Regression guard for the schema_version 2 payload (audit fix
    /// P0-1): parents are `commit_cid`s of ancestors and therefore
    /// time-varying, so they MUST NOT influence `content_cid`.
    #[test]
    fn content_cid_ignores_parents() {
        let root = sample();
        let child = sample().with_parent(raw(100));
        let merge = sample().with_parent(raw(100)).with_parent(raw(101));

        let expected = root.content_cid().unwrap();
        assert_eq!(
            expected,
            child.content_cid().unwrap(),
            "adding a parent must not change content_cid"
        );
        assert_eq!(
            expected,
            merge.content_cid().unwrap(),
            "a merge commit over the same data must share content_cid"
        );
    }

    /// Load-bearing invariant: two commits with byte-identical data
    /// roots but DIFFERENT `embeddings` sidecar Cids MUST share
    /// `content_cid`. If this fails, a future change re-coupled
    /// `ContentCidPayload` to the embedding sidecar - exactly the
    /// architectural error this design exists to prevent. Federated
    /// dedup (two machines indexing the same source produce the same
    /// content_cid) would silently break.
    #[test]
    fn content_cid_ignores_embeddings_field() {
        let mut a = sample();
        a.embeddings = Some(raw(100));
        let mut b = sample();
        b.embeddings = Some(raw(200)); // different embedding sidecar
        assert_eq!(
            a.content_cid().unwrap(),
            b.content_cid().unwrap(),
            "content_cid MUST ignore the embeddings sidecar - that is the G16 contract"
        );

        // Also: a commit with `embeddings = None` and a commit with
        // `embeddings = Some(_)` must share the same content_cid when
        // every other data root matches.
        let mut c = sample();
        c.embeddings = None;
        let mut d = sample();
        d.embeddings = Some(raw(300));
        assert_eq!(
            c.content_cid().unwrap(),
            d.content_cid().unwrap(),
            "absence of embeddings must not change content_cid either"
        );
    }

    /// `Commit.embeddings: Some(cid)` survives encode -> decode ->
    /// re-encode byte-identically. Pins the wire-form contract for
    /// the new G16 field.
    #[test]
    fn commit_with_embeddings_some_round_trips() {
        let mut original = sample();
        original.embeddings = Some(raw(42));
        let bytes = to_canonical_bytes(&original).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(original, decoded);
        assert_eq!(decoded.embeddings, Some(raw(42)));
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(
            bytes, bytes2,
            "round-trip must be byte-identical - wire form is contract-bound"
        );
    }

    /// Backwards-compat: a CBOR commit written without the
    /// `embeddings` key must decode cleanly with `embeddings = None`
    /// and re-encode byte-identically. The wire emitter omits the
    /// key when `None`, so legacy bytes round-trip.
    #[test]
    fn commit_legacy_no_embeddings_key_round_trips() {
        // Construct a commit with `embeddings = None` (wire form
        // omits the key entirely under `skip_serializing_if`).
        let original = sample();
        assert_eq!(original.embeddings, None);
        let bytes = to_canonical_bytes(&original).unwrap();

        // Verify the wire form does NOT contain the `embeddings` key.
        // The literal byte string "embeddings" cannot appear by chance
        // in a Cid digest, so this is a robust negative probe.
        assert!(
            !bytes
                .windows(b"embeddings".len())
                .any(|w| w == b"embeddings"),
            "wire form must omit the `embeddings` key when None"
        );

        // Decode back; field defaults to `None`.
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(decoded.embeddings, None);
        assert_eq!(decoded, original);

        // Re-encode; bytes must match exactly.
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(bytes, bytes2, "legacy CBOR must re-encode byte-identically");
    }

    /// Decoding an object whose `_kind` is not "commit" must fail with
    /// an error that mentions `_kind`.
    #[test]
    fn commit_kind_rejection() {
        let wire = CommitWire {
            kind: "node".into(),
            change_id: ChangeId::from_bytes_raw([1u8; 16]),
            parents: vec![],
            nodes: raw(1),
            edges: raw(2),
            schema: raw(3),
            delta: None,
            indexes: None,
            embeddings: None,
            author: "x".into(),
            agent_id: None,
            task_id: None,
            time: 0,
            message: String::new(),
            signature: None,
            extra: BTreeMap::new(),
        };
        let bytes = serde_ipld_dagcbor::to_vec(&wire).unwrap();
        let err = serde_ipld_dagcbor::from_slice::<Commit>(&bytes).unwrap_err();
        assert!(err.to_string().contains("_kind"));
    }

    /// Parent links survive a canonical round trip.
    #[test]
    fn commit_with_parents_round_trip() {
        let c = sample().with_parent(raw(100)).with_parent(raw(101));
        let bytes = to_canonical_bytes(&c).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(c, decoded);
        assert_eq!(decoded.parents.len(), 2);
    }

    /// An attached signature survives a canonical round trip.
    #[test]
    fn commit_with_signature_round_trip() {
        let mut c = sample();
        c.signature = Some(Signature {
            algo: "ed25519".into(),
            public_key: Bytes::from(vec![0xAAu8; 32]),
            sig: Bytes::from(vec![0xBBu8; 64]),
        });
        let bytes = to_canonical_bytes(&c).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(c, decoded);
        assert_eq!(decoded.signature.as_ref().unwrap().algo, "ed25519");
    }

    /// Unknown top-level keys land in `extra` and are re-emitted
    /// byte-identically.
    #[test]
    fn commit_extra_fields_preserved() {
        let mut c = sample();
        c.extra
            .insert("x-future-field".into(), Ipld::String("v9".into()));
        let bytes = to_canonical_bytes(&c).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(c, decoded);
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(bytes, bytes2);
    }
}