Skip to main content

mnem_core/objects/
commit.rs

//! Commit object (SPEC §4.4).
//!
//! A commit is a versioned snapshot of the graph. It points at three
//! Prolly-tree roots (nodes, edges, schema) and carries provenance
//! metadata - author, agent, task, timestamp - plus an optional
//! Ed25519 signature.
7
8use std::collections::BTreeMap;
9
10use bytes::Bytes;
11use ipld_core::ipld::Ipld;
12use serde::{Deserialize, Deserializer, Serialize, Serializer};
13
14use crate::id::{ChangeId, Cid};
15
16/// Cryptographic signature on a [`Commit`] or [`crate::objects::Operation`].
17///
18/// Per SPEC §9.1, the signature is computed over the canonical DAG-CBOR
19/// encoding of the containing object with the `signature` field absent.
20/// M12 will add verification helpers; for now we model the shape only.
21#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
22pub struct Signature {
23    /// Algorithm identifier. MUST be `"ed25519"` for mnem/0.1.
24    pub algo: String,
25    /// Signer's public key. 32 bytes for Ed25519.
26    pub public_key: Bytes,
27    /// Signature bytes. 64 bytes for Ed25519.
28    pub sig: Bytes,
29}
30
31/// A versioned snapshot of the graph.
32#[derive(Clone, Debug, PartialEq, Eq)]
33pub struct Commit {
34    /// Stable change identity (survives rewrite / rebase / amend).
35    pub change_id: ChangeId,
36    /// Parent commits (empty = root, ≥2 = merge).
37    pub parents: Vec<Cid>,
38    /// Root of the node Prolly tree.
39    pub nodes: Cid,
40    /// Root of the edge Prolly tree.
41    pub edges: Cid,
42    /// Root of the schema Prolly tree.
43    pub schema: Cid,
44    /// Optional `DeltaSet` link (reserved; not emitted in mnem/0.1).
45    pub delta: Option<Cid>,
46    /// Optional secondary-index root ([`crate::objects::IndexSet`],
47    /// SPEC §4.8). Agents that only need Prolly-lookup by stable id
48    /// can ignore this; query paths (label / property / adjacency)
49    /// use it when present.
50    pub indexes: Option<Cid>,
51    /// Optional embedding-sidecar Prolly root. Tree keyed by 32-byte
52    /// `NodeCid` digest, value = [`crate::objects::EmbeddingBucket`].
53    /// Lifts dense embedding vectors out of `Node` canonical bytes so
54    /// the Node CID stays byte-stable across ORT thread counts (f32
55    /// reduction ordering is non-deterministic; vectors drift by the
56    /// LSB across thread counts). `None` on commits that carry no
57    /// embed-bearing nodes.
58    ///
59    /// **Intentionally excluded from `content_cid`.** Content CID is
60    /// the deterministic "what graph is this" digest; including the
61    /// embeddings root would re-couple it to ORT thread count and
62    /// undo the determinism guarantee. Two machines re-deriving the
63    /// same source text on different cores produce the same
64    /// `content_cid`, just with per-machine drift in
65    /// `commit.embeddings`.
66    pub embeddings: Option<Cid>,
67    /// Free-form author identifier.
68    pub author: String,
69    /// AI agent identifier (when the commit was machine-generated).
70    pub agent_id: Option<String>,
71    /// Task / tool-call identifier for provenance.
72    pub task_id: Option<String>,
73    /// Microseconds since Unix epoch.
74    pub time: u64,
75    /// UTF-8 commit message. May be empty.
76    pub message: String,
77    /// Optional cryptographic signature.
78    pub signature: Option<Signature>,
79    /// Forward-compat extension map (SPEC §3.2).
80    pub extra: BTreeMap<String, Ipld>,
81}
82
83impl Commit {
84    /// The `_kind` discriminator on the wire.
85    pub const KIND: &'static str = "commit";
86
87    /// Build a commit with the required fields, empty optionals / parents / extras.
88    #[must_use]
89    pub fn new(
90        change_id: ChangeId,
91        nodes: Cid,
92        edges: Cid,
93        schema: Cid,
94        author: impl Into<String>,
95        time: u64,
96        message: impl Into<String>,
97    ) -> Self {
98        Self {
99            change_id,
100            parents: Vec::new(),
101            nodes,
102            edges,
103            schema,
104            delta: None,
105            indexes: None,
106            embeddings: None,
107            author: author.into(),
108            agent_id: None,
109            task_id: None,
110            time,
111            message: message.into(),
112            signature: None,
113            extra: BTreeMap::new(),
114        }
115    }
116
117    /// Append a parent commit. Returns `self` for chaining.
118    #[must_use]
119    pub fn with_parent(mut self, parent: Cid) -> Self {
120        self.parents.push(parent);
121        self
122    }
123
124    /// Attach an agent identifier.
125    #[must_use]
126    pub fn with_agent(mut self, agent_id: impl Into<String>) -> Self {
127        self.agent_id = Some(agent_id.into());
128        self
129    }
130
131    /// Attach a task identifier.
132    #[must_use]
133    pub fn with_task(mut self, task_id: impl Into<String>) -> Self {
134        self.task_id = Some(task_id.into());
135        self
136    }
137
138    /// (partial): a deterministic CID over only
139    /// the data-DAG portion of the commit -- the three Prolly tree
140    /// roots (nodes, edges, schema), the optional indexes root, and
141    /// the parents list. Excludes time, change_id, author, message,
142    /// agent_id, task_id, signature, and extra.
143    ///
144    /// Two ingest runs against byte-identical input on different
145    /// machines (or at different times) MUST produce the same
146    /// `content_cid`. The standard `commit_cid` continues to embed
147    /// wall-clock + UUIDv7 metadata for audit-trail purposes; that
148    /// CID is intentionally time-varying.
149    ///
150    /// # Errors
151    /// Propagates encoding failures from
152    /// [`crate::codec::dagcbor::hash_to_cid`].
153    ///
154    /// # Migration note
155    /// Wire format is unchanged: `content_cid` is computed from
156    /// existing fields, so older blockstores stay readable. A
157    /// follow-up may persist `content_cid` alongside `commit_cid` in
158    /// the operation log for cheap lookup.
159    pub fn content_cid(&self) -> Result<Cid, crate::error::CodecError> {
160        // Sort parents to make the hash insensitive to merge order
161        // (a future merge that swaps the parent list order would
162        // otherwise produce a different content_cid for an identical
163        // resulting graph). Parent order is not semantically meaningful
164        // for content-addressing.
165        let mut parents = self.parents.clone();
166        parents.sort_by(|a, b| a.to_string().cmp(&b.to_string()));
167
168        let payload = ContentCidPayload {
169            schema_version: 1,
170            nodes: self.nodes.clone(),
171            edges: self.edges.clone(),
172            schema: self.schema.clone(),
173            indexes: self.indexes.clone(),
174            parents,
175        };
176        let (_bytes, cid) = crate::codec::dagcbor::hash_to_cid(&payload)?;
177        Ok(cid)
178    }
179}
180
181/// Stable wire shape for `Commit::content_cid()`. The struct is
182/// intentionally NOT exposed publicly: `content_cid` is purely a
183/// derived value, and the on-disk Commit format does not change.
184/// Schema version 1 is the post-audit baseline; any future
185/// content_cid layout change MUST bump `schema_version` so that two
186/// versions of the codebase agree on whether they would compare equal.
187#[derive(Serialize)]
188struct ContentCidPayload {
189    schema_version: u8,
190    nodes: Cid,
191    edges: Cid,
192    schema: Cid,
193    #[serde(skip_serializing_if = "Option::is_none")]
194    indexes: Option<Cid>,
195    parents: Vec<Cid>,
196}
197
// ---------------- Serde ----------------
199
200#[derive(Serialize, Deserialize)]
201struct CommitWire {
202    #[serde(rename = "_kind")]
203    kind: String,
204    change_id: ChangeId,
205    parents: Vec<Cid>,
206    nodes: Cid,
207    edges: Cid,
208    schema: Cid,
209    #[serde(default, skip_serializing_if = "Option::is_none")]
210    delta: Option<Cid>,
211    #[serde(default, skip_serializing_if = "Option::is_none")]
212    indexes: Option<Cid>,
213    /// `skip_serializing_if` keeps absence-on-encode so commits without
214    /// an embedding sidecar round-trip byte-identically.
215    #[serde(default, skip_serializing_if = "Option::is_none")]
216    embeddings: Option<Cid>,
217    author: String,
218    #[serde(default, skip_serializing_if = "Option::is_none")]
219    agent_id: Option<String>,
220    #[serde(default, skip_serializing_if = "Option::is_none")]
221    task_id: Option<String>,
222    time: u64,
223    message: String,
224    #[serde(default, skip_serializing_if = "Option::is_none")]
225    signature: Option<Signature>,
226    #[serde(flatten, default, skip_serializing_if = "BTreeMap::is_empty")]
227    extra: BTreeMap<String, Ipld>,
228}
229
230impl Serialize for Commit {
231    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
232        CommitWire {
233            kind: Self::KIND.into(),
234            change_id: self.change_id,
235            parents: self.parents.clone(),
236            nodes: self.nodes.clone(),
237            edges: self.edges.clone(),
238            schema: self.schema.clone(),
239            delta: self.delta.clone(),
240            indexes: self.indexes.clone(),
241            embeddings: self.embeddings.clone(),
242            author: self.author.clone(),
243            agent_id: self.agent_id.clone(),
244            task_id: self.task_id.clone(),
245            time: self.time,
246            message: self.message.clone(),
247            signature: self.signature.clone(),
248            extra: self.extra.clone(),
249        }
250        .serialize(serializer)
251    }
252}
253
254impl<'de> Deserialize<'de> for Commit {
255    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
256        let w = CommitWire::deserialize(deserializer)?;
257        if w.kind != Self::KIND {
258            return Err(serde::de::Error::custom(format!(
259                "expected _kind='{}', got '{}'",
260                Self::KIND,
261                w.kind
262            )));
263        }
264        Ok(Self {
265            change_id: w.change_id,
266            parents: w.parents,
267            nodes: w.nodes,
268            edges: w.edges,
269            schema: w.schema,
270            delta: w.delta,
271            indexes: w.indexes,
272            embeddings: w.embeddings,
273            author: w.author,
274            agent_id: w.agent_id,
275            task_id: w.task_id,
276            time: w.time,
277            message: w.message,
278            signature: w.signature,
279            extra: w.extra,
280        })
281    }
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codec::{from_canonical_bytes, to_canonical_bytes};
    use crate::id::{CODEC_RAW, Multihash};

    /// Distinct raw-codec CID derived from `n`; used as a stand-in root.
    fn raw(n: u32) -> Cid {
        Cid::new(CODEC_RAW, Multihash::sha2_256(&n.to_be_bytes()))
    }

    /// Baseline commit with agent and task set, no parents, no optional roots.
    fn sample() -> Commit {
        Commit::new(
            ChangeId::from_bytes_raw([1u8; 16]),
            raw(1),
            raw(2),
            raw(3),
            "alice@example.org",
            1_700_000_000_000_000,
            "init",
        )
        .with_agent("agent:claude")
        .with_task("task:001")
    }

    #[test]
    fn commit_round_trip_byte_identity() {
        let original = sample();
        let bytes = to_canonical_bytes(&original).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(original, decoded);
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(bytes, bytes2);
    }

    /// Two commits with byte-identical data roots but different
    /// timestamps, change_ids, authors, and messages MUST share
    /// `content_cid` while their `commit_cid` differs.
    #[test]
    fn content_cid_is_stable_across_metadata() {
        let mut a = Commit::new(
            ChangeId::from_bytes_raw([1u8; 16]),
            raw(10),
            raw(20),
            raw(30),
            "alice@example.org",
            1_700_000_000_000_000,
            "init",
        );
        a.indexes = Some(raw(40));

        let mut b = Commit::new(
            // Different change_id (UUIDv7 typically embeds a timestamp).
            ChangeId::from_bytes_raw([2u8; 16]),
            // SAME data roots:
            raw(10),
            raw(20),
            raw(30),
            // Different author + time + message:
            "bob@example.org",
            1_777_000_000_000_000,
            "different message entirely",
        );
        b.indexes = Some(raw(40));

        assert_eq!(
            a.content_cid().unwrap(),
            b.content_cid().unwrap(),
            "content_cid must ignore metadata (time, change_id, author, message)"
        );

        // Only the CIDs matter here; the encoded bytes are ignored.
        let (_, a_commit_cid) = crate::codec::dagcbor::hash_to_cid(&a).unwrap();
        let (_, b_commit_cid) = crate::codec::dagcbor::hash_to_cid(&b).unwrap();
        assert_ne!(
            a_commit_cid, b_commit_cid,
            "commit_cid SHOULD differ when metadata differs (audit-trail invariant)"
        );
    }

    /// content_cid MUST change when any data root changes.
    #[test]
    fn content_cid_distinguishes_data_roots() {
        let cid_for = |nodes: u32, edges: u32, schema: u32| {
            Commit::new(
                ChangeId::from_bytes_raw([1u8; 16]),
                raw(nodes),
                raw(edges),
                raw(schema),
                "alice",
                1,
                "msg",
            )
            .content_cid()
            .unwrap()
        };

        let base = cid_for(10, 20, 30);
        assert_ne!(base, cid_for(11, 20, 30), "nodes root must affect content_cid");
        assert_ne!(base, cid_for(10, 21, 30), "edges root must affect content_cid");
        assert_ne!(base, cid_for(10, 20, 31), "schema root must affect content_cid");
    }

    /// The indexes root is part of the content payload, so its presence
    /// and its value must both be reflected in `content_cid`.
    #[test]
    fn content_cid_distinguishes_indexes_root() {
        let without = sample();
        let mut with_a = sample();
        with_a.indexes = Some(raw(40));
        let mut with_b = sample();
        with_b.indexes = Some(raw(41));

        assert_ne!(
            without.content_cid().unwrap(),
            with_a.content_cid().unwrap(),
            "adding an indexes root must change content_cid"
        );
        assert_ne!(
            with_a.content_cid().unwrap(),
            with_b.content_cid().unwrap(),
            "different indexes roots must produce different content_cids"
        );
    }

    /// Parent order is canonicalised before hashing: the same set of
    /// parents appended in a different order must yield the same
    /// `content_cid`, even though the commits themselves differ.
    #[test]
    fn content_cid_ignores_parent_order() {
        let forward = sample().with_parent(raw(100)).with_parent(raw(101));
        let reversed = sample().with_parent(raw(101)).with_parent(raw(100));

        assert_ne!(forward, reversed, "fixtures must differ in parent order");
        assert_eq!(
            forward.content_cid().unwrap(),
            reversed.content_cid().unwrap(),
            "content_cid must not depend on parent order"
        );
        // Computing the digest must not reorder the commit's own list.
        assert_eq!(forward.parents, vec![raw(100), raw(101)]);
    }

    /// The parent set itself is part of the content payload.
    #[test]
    fn content_cid_depends_on_parent_set() {
        let root = sample();
        let child = sample().with_parent(raw(100));
        let other_child = sample().with_parent(raw(101));

        assert_ne!(root.content_cid().unwrap(), child.content_cid().unwrap());
        assert_ne!(
            child.content_cid().unwrap(),
            other_child.content_cid().unwrap()
        );
    }

    /// Load-bearing invariant: two commits with byte-identical data
    /// roots but DIFFERENT `embeddings` sidecar Cids MUST share
    /// `content_cid`. If this fails, a future change re-coupled
    /// `ContentCidPayload` to the embedding sidecar - exactly the
    /// architectural error this design exists to prevent. Federated
    /// dedup (two machines indexing the same source produce the same
    /// content_cid) would silently break.
    #[test]
    fn content_cid_ignores_embeddings_field() {
        let mut a = sample();
        a.embeddings = Some(raw(100));
        let mut b = sample();
        b.embeddings = Some(raw(200)); // different embedding sidecar
        assert_eq!(
            a.content_cid().unwrap(),
            b.content_cid().unwrap(),
            "content_cid MUST ignore the embeddings sidecar - that is the G16 contract"
        );

        // Also: a commit with `embeddings = None` and a commit with
        // `embeddings = Some(_)` must share the same content_cid when
        // every other data root matches.
        let mut c = sample();
        c.embeddings = None;
        let mut d = sample();
        d.embeddings = Some(raw(300));
        assert_eq!(
            c.content_cid().unwrap(),
            d.content_cid().unwrap(),
            "absence of embeddings must not change content_cid either"
        );
    }

    /// `Commit.embeddings: Some(cid)` survives encode → decode →
    /// re-encode byte-identically. Pins the wire-form contract for
    /// the new G16 field.
    #[test]
    fn commit_with_embeddings_some_round_trips() {
        let mut original = sample();
        original.embeddings = Some(raw(42));
        let bytes = to_canonical_bytes(&original).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(original, decoded);
        assert_eq!(decoded.embeddings, Some(raw(42)));
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(
            bytes, bytes2,
            "round-trip must be byte-identical - wire form is contract-bound"
        );
    }

    /// Backwards-compat: a CBOR commit written without the
    /// `embeddings` key must decode cleanly with `embeddings = None`
    /// and re-encode byte-identically. The wire emitter omits the
    /// key when `None`, so legacy bytes round-trip.
    #[test]
    fn commit_legacy_no_embeddings_key_round_trips() {
        // Construct a commit with `embeddings = None` (wire form
        // omits the key entirely under `skip_serializing_if`).
        let original = sample();
        assert_eq!(original.embeddings, None);
        let bytes = to_canonical_bytes(&original).unwrap();

        // Verify the wire form does NOT contain the `embeddings` key.
        // The literal byte string "embeddings" cannot appear by chance
        // in a Cid digest, so this is a robust negative probe.
        assert!(
            !bytes
                .windows(b"embeddings".len())
                .any(|w| w == b"embeddings"),
            "wire form must omit the `embeddings` key when None"
        );

        // Decode back; field defaults to `None`.
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(decoded.embeddings, None);
        assert_eq!(decoded, original);

        // Re-encode; bytes must match exactly.
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(bytes, bytes2, "legacy CBOR must re-encode byte-identically");
    }

    /// A well-formed wire object whose `_kind` is not "commit" must be
    /// rejected, and the error must name the offending key.
    #[test]
    fn commit_kind_rejection() {
        let wire = CommitWire {
            kind: "node".into(),
            change_id: ChangeId::from_bytes_raw([1u8; 16]),
            parents: vec![],
            nodes: raw(1),
            edges: raw(2),
            schema: raw(3),
            delta: None,
            indexes: None,
            embeddings: None,
            author: "x".into(),
            agent_id: None,
            task_id: None,
            time: 0,
            message: String::new(),
            signature: None,
            extra: BTreeMap::new(),
        };
        let bytes = serde_ipld_dagcbor::to_vec(&wire).unwrap();
        let err = serde_ipld_dagcbor::from_slice::<Commit>(&bytes).unwrap_err();
        assert!(err.to_string().contains("_kind"));
    }

    #[test]
    fn commit_with_parents_round_trip() {
        let c = sample().with_parent(raw(100)).with_parent(raw(101));
        let bytes = to_canonical_bytes(&c).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(c, decoded);
        assert_eq!(decoded.parents.len(), 2);
    }

    #[test]
    fn commit_with_signature_round_trip() {
        let mut c = sample();
        c.signature = Some(Signature {
            algo: "ed25519".into(),
            public_key: Bytes::from(vec![0xAAu8; 32]),
            sig: Bytes::from(vec![0xBBu8; 64]),
        });
        let bytes = to_canonical_bytes(&c).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(c, decoded);
        assert_eq!(decoded.signature.as_ref().unwrap().algo, "ed25519");
    }

    /// Unknown top-level keys must survive decode and re-encode unchanged.
    #[test]
    fn commit_extra_fields_preserved() {
        let mut c = sample();
        c.extra
            .insert("x-future-field".into(), Ipld::String("v9".into()));
        let bytes = to_canonical_bytes(&c).unwrap();
        let decoded: Commit = from_canonical_bytes(&bytes).unwrap();
        assert_eq!(c, decoded);
        let bytes2 = to_canonical_bytes(&decoded).unwrap();
        assert_eq!(bytes, bytes2);
    }
}