Skip to main content

sim_kernel/
datum.rs

1//! The [`Datum`] contract: the content-addressable data form of the substrate.
2//!
3//! The kernel defines the datum shape and its content-hash algorithm
4//! identity; libraries supply the stores that intern and resolve data.
5
6use std::collections::BTreeSet;
7
8use crate::{
9    error::{Error, Result},
10    expr::{Expr, NumberLiteral},
11    id::Symbol,
12    ref_id::ContentId,
13};
14
15/// Namespace of the kernel datum content-hash algorithm symbol.
16pub const DATUM_CONTENT_ALGORITHM_NAMESPACE: &str = "core";
17/// Name of the kernel datum content-hash algorithm symbol.
18pub const DATUM_CONTENT_ALGORITHM_NAME: &str = "sha256-datum-v1";
19
20/// Content-addressable data form of the substrate.
21///
22/// A `Datum` is the canonical, hashable projection of an [`Expr`]: the same
23/// value always hashes to the same [`ContentId`], independent of how it was
24/// written. The kernel fixes the datum shape and its content-hash algorithm;
25/// libraries supply the [`DatumStore`](crate::datum_store::DatumStore)s that
26/// intern and resolve data.
27///
28/// # Examples
29///
30/// ```
31/// # use sim_kernel::Datum;
32/// let datum = Datum::String("hello".to_owned());
33/// let id = datum.content_id().unwrap();
34/// // The same datum always hashes to the same content id.
35/// assert_eq!(id, Datum::String("hello".to_owned()).content_id().unwrap());
36/// ```
37#[derive(Clone, Debug, PartialEq, Eq, Hash)]
38pub enum Datum {
39    /// The nil datum.
40    Nil,
41    /// A boolean datum.
42    Bool(bool),
43    /// A number literal in some domain.
44    Number(NumberLiteral),
45    /// A symbol datum.
46    Symbol(Symbol),
47    /// A UTF-8 string datum.
48    String(String),
49    /// A byte-string datum.
50    Bytes(Vec<u8>),
51    /// An ordered list datum.
52    List(Vec<Datum>),
53    /// An ordered vector datum.
54    Vector(Vec<Datum>),
55    /// A map datum as ordered key/value pairs; duplicate keys are rejected when
56    /// hashing.
57    Map(Vec<(Datum, Datum)>),
58    /// A set datum; duplicate entries are rejected when hashing.
59    Set(Vec<Datum>),
60    /// A tagged node datum with named fields, the datum form of an
61    /// [`Expr::Extension`].
62    Node {
63        /// Node tag symbol.
64        tag: Symbol,
65        /// Named field bindings in declaration order.
66        fields: Vec<(Symbol, Datum)>,
67    },
68}
69
70impl Datum {
71    /// Returns the canonical byte encoding used to compute the content id.
72    ///
73    /// The encoding is deterministic: equal data produces equal bytes, with map
74    /// and set entries ordered by their hashes so member order does not affect
75    /// the result. Returns an error when the datum cannot be canonicalized
76    /// (for example, a map with duplicate keys).
77    pub fn canonical_bytes(&self) -> Result<Vec<u8>> {
78        let mut out = Vec::new();
79        write_bytes(&mut out, b"sim6:datum:v1");
80        self.write_canonical(&mut out)?;
81        Ok(out)
82    }
83
84    /// Returns the content id of this datum under the kernel datum algorithm.
85    pub fn content_id(&self) -> Result<ContentId> {
86        Ok(ContentId::from_bytes(
87            datum_content_algorithm(),
88            sha256(&self.canonical_bytes()?),
89        ))
90    }
91
92    fn write_canonical(&self, out: &mut Vec<u8>) -> Result<()> {
93        match self {
94            Self::Nil => write_tag(out, "sim6:datum:nil"),
95            Self::Bool(value) => {
96                write_tag(out, "sim6:datum:bool");
97                out.push(u8::from(*value));
98            }
99            Self::Number(value) => {
100                write_tag(out, "sim6:datum:number");
101                write_symbol(out, &value.domain);
102                write_string(out, &value.canonical);
103            }
104            Self::Symbol(symbol) => {
105                write_tag(out, "sim6:datum:symbol");
106                write_symbol(out, symbol);
107            }
108            Self::String(value) => {
109                write_tag(out, "sim6:datum:string");
110                write_string(out, value);
111            }
112            Self::Bytes(value) => {
113                write_tag(out, "sim6:datum:bytes");
114                write_bytes(out, value);
115            }
116            Self::List(items) => {
117                write_tag(out, "sim6:datum:list");
118                write_records(out, ordered_child_records(items)?)?;
119            }
120            Self::Vector(items) => {
121                write_tag(out, "sim6:datum:vector");
122                write_records(out, ordered_child_records(items)?)?;
123            }
124            Self::Map(entries) => {
125                write_tag(out, "sim6:datum:map");
126                write_records(out, map_records(entries)?)?;
127            }
128            Self::Set(items) => {
129                write_tag(out, "sim6:datum:set");
130                write_records(out, set_records(items)?)?;
131            }
132            Self::Node { tag, fields } => {
133                write_tag(out, "sim6:datum:node");
134                write_symbol(out, tag);
135                write_records(out, node_field_records(fields)?)?;
136            }
137        }
138        Ok(())
139    }
140}
141
142/// Returns the kernel datum content-hash algorithm symbol (`core/sha256-datum-v1`).
143pub fn datum_content_algorithm() -> Symbol {
144    Symbol::qualified(
145        DATUM_CONTENT_ALGORITHM_NAMESPACE,
146        DATUM_CONTENT_ALGORITHM_NAME,
147    )
148}
149
150impl TryFrom<Expr> for Datum {
151    type Error = Error;
152
153    fn try_from(expr: Expr) -> Result<Self> {
154        match expr {
155            Expr::Nil => Ok(Self::Nil),
156            Expr::Bool(value) => Ok(Self::Bool(value)),
157            Expr::Number(value) => Ok(Self::Number(value)),
158            Expr::Symbol(value) => Ok(Self::Symbol(value)),
159            Expr::String(value) => Ok(Self::String(value)),
160            Expr::Bytes(value) => Ok(Self::Bytes(value)),
161            Expr::List(items) => items
162                .into_iter()
163                .map(Self::try_from)
164                .collect::<Result<Vec<_>>>()
165                .map(Self::List),
166            Expr::Vector(items) => items
167                .into_iter()
168                .map(Self::try_from)
169                .collect::<Result<Vec<_>>>()
170                .map(Self::Vector),
171            Expr::Map(entries) => entries
172                .into_iter()
173                .map(|(key, value)| Ok((Self::try_from(key)?, Self::try_from(value)?)))
174                .collect::<Result<Vec<_>>>()
175                .map(Self::Map),
176            Expr::Set(items) => items
177                .into_iter()
178                .map(Self::try_from)
179                .collect::<Result<Vec<_>>>()
180                .map(Self::Set),
181            Expr::Extension { tag, payload } => extension_to_node(tag, *payload),
182            other => Err(Error::TypeMismatch {
183                expected: "datum expression",
184                found: expr_variant_name(&other),
185            }),
186        }
187    }
188}
189
190impl From<Datum> for Expr {
191    fn from(datum: Datum) -> Self {
192        match datum {
193            Datum::Nil => Self::Nil,
194            Datum::Bool(value) => Self::Bool(value),
195            Datum::Number(value) => Self::Number(value),
196            Datum::Symbol(value) => Self::Symbol(value),
197            Datum::String(value) => Self::String(value),
198            Datum::Bytes(value) => Self::Bytes(value),
199            Datum::List(items) => Self::List(items.into_iter().map(Self::from).collect()),
200            Datum::Vector(items) => Self::Vector(items.into_iter().map(Self::from).collect()),
201            Datum::Map(entries) => Self::Map(
202                entries
203                    .into_iter()
204                    .map(|(key, value)| (Self::from(key), Self::from(value)))
205                    .collect(),
206            ),
207            Datum::Set(items) => Self::Set(items.into_iter().map(Self::from).collect()),
208            Datum::Node { tag, fields } => Self::Extension {
209                tag,
210                payload: Box::new(Self::Map(
211                    fields
212                        .into_iter()
213                        .map(|(field, value)| (Self::Symbol(field), Self::from(value)))
214                        .collect(),
215                )),
216            },
217        }
218    }
219}
220
221fn extension_to_node(tag: Symbol, payload: Expr) -> Result<Datum> {
222    let Expr::Map(entries) = payload else {
223        return Err(Error::TypeMismatch {
224            expected: "datum node field map",
225            found: expr_variant_name(&payload),
226        });
227    };
228
229    let fields = entries
230        .into_iter()
231        .map(|(key, value)| {
232            let Expr::Symbol(field) = key else {
233                return Err(Error::TypeMismatch {
234                    expected: "datum node field symbol",
235                    found: expr_variant_name(&key),
236                });
237            };
238            Ok((field, Datum::try_from(value)?))
239        })
240        .collect::<Result<Vec<_>>>()?;
241    Ok(Datum::Node { tag, fields })
242}
243
244fn expr_variant_name(expr: &Expr) -> &'static str {
245    match expr {
246        Expr::Nil => "nil expression",
247        Expr::Bool(_) => "bool expression",
248        Expr::Number(_) => "number expression",
249        Expr::Symbol(_) => "symbol expression",
250        Expr::Local(_) => "local expression",
251        Expr::String(_) => "string expression",
252        Expr::Bytes(_) => "bytes expression",
253        Expr::List(_) => "list expression",
254        Expr::Vector(_) => "vector expression",
255        Expr::Map(_) => "map expression",
256        Expr::Set(_) => "set expression",
257        Expr::Call { .. } => "call expression",
258        Expr::Infix { .. } => "infix expression",
259        Expr::Prefix { .. } => "prefix expression",
260        Expr::Postfix { .. } => "postfix expression",
261        Expr::Block(_) => "block expression",
262        Expr::Quote { .. } => "quote expression",
263        Expr::Annotated { .. } => "annotated expression",
264        Expr::Extension { .. } => "extension expression",
265    }
266}
267
268fn ordered_child_records(items: &[Datum]) -> Result<Vec<Vec<u8>>> {
269    items.iter().map(Datum::canonical_bytes).collect()
270}
271
272fn map_records(entries: &[(Datum, Datum)]) -> Result<Vec<Vec<u8>>> {
273    let mut keys = BTreeSet::new();
274    let mut records = Vec::with_capacity(entries.len());
275    for (key, value) in entries {
276        let key_bytes = key.canonical_bytes()?;
277        if !keys.insert(key_bytes.clone()) {
278            return Err(Error::Eval("duplicate datum map key".to_owned()));
279        }
280        let value_bytes = value.canonical_bytes()?;
281        let mut record = Vec::new();
282        write_tag(&mut record, "sim6:datum:map-entry");
283        write_bytes(&mut record, &key_bytes);
284        write_bytes(&mut record, &value_bytes);
285        records.push(record);
286    }
287    sort_records(&mut records);
288    Ok(records)
289}
290
291fn set_records(items: &[Datum]) -> Result<Vec<Vec<u8>>> {
292    let mut seen = BTreeSet::new();
293    let mut records = Vec::with_capacity(items.len());
294    for item in items {
295        let bytes = item.canonical_bytes()?;
296        if !seen.insert(bytes.clone()) {
297            return Err(Error::Eval("duplicate datum set entry".to_owned()));
298        }
299        records.push(bytes);
300    }
301    sort_records(&mut records);
302    Ok(records)
303}
304
305fn node_field_records(fields: &[(Symbol, Datum)]) -> Result<Vec<Vec<u8>>> {
306    let mut names = BTreeSet::new();
307    let mut records = Vec::with_capacity(fields.len());
308    for (name, value) in fields {
309        let mut name_bytes = Vec::new();
310        write_symbol(&mut name_bytes, name);
311        if !names.insert(name_bytes.clone()) {
312            return Err(Error::Eval("duplicate datum node field".to_owned()));
313        }
314        let value_bytes = value.canonical_bytes()?;
315        let mut record = Vec::new();
316        write_tag(&mut record, "sim6:datum:node-field");
317        write_bytes(&mut record, &name_bytes);
318        write_bytes(&mut record, &value_bytes);
319        records.push(record);
320    }
321    sort_records(&mut records);
322    Ok(records)
323}
324
325fn sort_records(records: &mut [Vec<u8>]) {
326    records.sort_by(|left, right| {
327        sha256(left)
328            .cmp(&sha256(right))
329            .then_with(|| left.cmp(right))
330    });
331}
332
333fn write_records(out: &mut Vec<u8>, records: Vec<Vec<u8>>) -> Result<()> {
334    write_len(out, records.len())?;
335    for record in records {
336        write_bytes(out, &record);
337    }
338    Ok(())
339}
340
341fn write_tag(out: &mut Vec<u8>, tag: &str) {
342    write_bytes(out, tag.as_bytes());
343}
344
345fn write_symbol(out: &mut Vec<u8>, symbol: &Symbol) {
346    match &symbol.namespace {
347        Some(namespace) => {
348            out.push(1);
349            write_string(out, namespace);
350        }
351        None => out.push(0),
352    }
353    write_string(out, &symbol.name);
354}
355
356fn write_string(out: &mut Vec<u8>, value: &str) {
357    write_bytes(out, value.as_bytes());
358}
359
360fn write_bytes(out: &mut Vec<u8>, value: &[u8]) {
361    write_len(out, value.len()).expect("canonical datum record length exceeded u64");
362    out.extend_from_slice(value);
363}
364
365fn write_len(out: &mut Vec<u8>, len: usize) -> Result<()> {
366    let len = u64::try_from(len).map_err(|_| Error::Eval("datum record too large".to_owned()))?;
367    out.extend_from_slice(&len.to_be_bytes());
368    Ok(())
369}
370
371fn sha256(input: &[u8]) -> [u8; 32] {
372    const H0: [u32; 8] = [
373        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab,
374        0x5be0cd19,
375    ];
376    const K: [u32; 64] = [
377        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4,
378        0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe,
379        0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f,
380        0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
381        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc,
382        0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
383        0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116,
384        0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
385        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7,
386        0xc67178f2,
387    ];
388
389    let mut h = H0;
390    let mut message = input.to_vec();
391    let bit_len = u64::try_from(input.len())
392        .unwrap_or(u64::MAX)
393        .wrapping_mul(8);
394    message.push(0x80);
395    while message.len() % 64 != 56 {
396        message.push(0);
397    }
398    message.extend_from_slice(&bit_len.to_be_bytes());
399
400    for chunk in message.chunks_exact(64) {
401        let mut w = [0_u32; 64];
402        for (word, bytes) in w.iter_mut().take(16).zip(chunk.chunks_exact(4)) {
403            *word = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
404        }
405        for i in 16..64 {
406            let s0 = w[i - 15].rotate_right(7) ^ w[i - 15].rotate_right(18) ^ (w[i - 15] >> 3);
407            let s1 = w[i - 2].rotate_right(17) ^ w[i - 2].rotate_right(19) ^ (w[i - 2] >> 10);
408            w[i] = w[i - 16]
409                .wrapping_add(s0)
410                .wrapping_add(w[i - 7])
411                .wrapping_add(s1);
412        }
413
414        let [mut a, mut b, mut c, mut d, mut e, mut f, mut g, mut hh] = h;
415        for i in 0..64 {
416            let s1 = e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25);
417            let ch = (e & f) ^ ((!e) & g);
418            let temp1 = hh
419                .wrapping_add(s1)
420                .wrapping_add(ch)
421                .wrapping_add(K[i])
422                .wrapping_add(w[i]);
423            let s0 = a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22);
424            let maj = (a & b) ^ (a & c) ^ (b & c);
425            let temp2 = s0.wrapping_add(maj);
426
427            hh = g;
428            g = f;
429            f = e;
430            e = d.wrapping_add(temp1);
431            d = c;
432            c = b;
433            b = a;
434            a = temp1.wrapping_add(temp2);
435        }
436
437        h[0] = h[0].wrapping_add(a);
438        h[1] = h[1].wrapping_add(b);
439        h[2] = h[2].wrapping_add(c);
440        h[3] = h[3].wrapping_add(d);
441        h[4] = h[4].wrapping_add(e);
442        h[5] = h[5].wrapping_add(f);
443        h[6] = h[6].wrapping_add(g);
444        h[7] = h[7].wrapping_add(hh);
445    }
446
447    let mut out = [0_u8; 32];
448    for (slot, word) in out.chunks_exact_mut(4).zip(h) {
449        slot.copy_from_slice(&word.to_be_bytes());
450    }
451    out
452}
453
454#[cfg(test)]
455#[path = "datum_tests.rs"]
456mod tests;