Skip to main content

relon_eval_api/
schema_canonical.rs

1//! Deterministic schema serialisation + sha256 digest.
2//!
3//! The wasm AOT backend embeds a 32-byte schema hash into the
4//! `relon.abi` custom section so host SDKs can detect schema drift at
5//! load time (host writing with an outdated `#main` schema while the
6//! wasm module was compiled against the new one — a mismatch makes
7//! the wasm read garbage out of the binary handshake buffer). The
8//! cryptographic strength is irrelevant; what matters is that host
9//! and codegen compute the **same** hash from the **same** schema.
10//!
11//! Spec: `docs/internal/adr/wasm-srcmap-section-v1-2026-05-16.md`,
12//! "canonical #main schema" section.
13//!
14//! The canonical form is intentionally narrower than the runtime
15//! [`crate::value::SchemaData`] type:
16//!
17//! * Fields are stored in **declaration order**, not alphabetical.
18//!   Field order changes are observable on the wire (layout slot
19//!   offsets shift), so they must invalidate the hash.
20//! * `doc_comment` and decorator metadata are **not** captured — they
21//!   are presentation / lint signals, not ABI-relevant.
//! * Nested schemas are **inlined**, not referenced by location. Two
//!   schemas with the same structural shape and the same declared
//!   names compare equal even when declared in different files — this
//!   is a "behavioural hash", not a "location hash". The declared
//!   schema name is part of the canonical form, so a rename changes it.
26//!
27//! Phase 2.b will plumb real `SchemaDef` lowering into this module;
28//! Phase 2.a only provides the canonical form + hash plumbing so the
29//! `relon.abi` section can be emitted with placeholder zeros until
30//! the codegen pass starts accepting schema input.
31
32use serde::{Deserialize, Serialize};
33use sha2::{Digest, Sha256};
34
/// Logical type description used by canonical serialisation.
///
/// This is **not** the parser's `TypeNode`: the parser shape carries
/// source ranges, doc comments, and parsed-but-unresolved generic
/// argument references, none of which belong in an ABI hash. The
/// canonical form is a structural snapshot that strips presentation
/// metadata and inlines nested schemas.
///
/// Variants mirror the v1 binary layout's leaf-type table. Tuple
/// shapes are represented as tuple schemas (`Schema::is_tuple = true`);
/// future leaf/container layout shapes such as `Bytes` would need a new
/// variant here so the hash distinguishes the new shape.
///
/// The enum is internally tagged: every serialised value is a JSON
/// object whose `"kind"` key holds the variant name, with the variant's
/// own fields (if any) alongside it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum TypeRepr {
    /// Internal unit slot. New source schemas should use `Option<T>`
    /// or `T?` for absence instead of producing this type.
    Unit,
    /// `Bool` — a 0/1 byte.
    Bool,
    /// `Int` — a signed 64-bit integer.
    Int,
    /// `Float` — an IEEE-754 double.
    Float,
    /// `String` — UTF-8 bytes with a u32 length prefix.
    String,
    /// `List<T>` — variable-length sequence over `element`.
    List {
        /// Element type.
        element: Box<TypeRepr>,
    },
    /// `Option<T>` — tag + payload union with two arms.
    Option {
        /// `Some` payload type.
        inner: Box<TypeRepr>,
    },
    /// `Result<T, E>` — tag + payload union with two arms.
    Result {
        /// `Ok` payload type.
        ok: Box<TypeRepr>,
        /// `Err` payload type.
        err: Box<TypeRepr>,
    },
    /// User-defined Rust-like `#enum Name { ... }`. The binary ABI is a
    /// variant record: one tag byte plus an optional payload slot. Struct
    /// and tuple variant payloads are represented as ordinary schemas, with
    /// tuple payloads marked by [`EnumVariant::is_tuple`].
    Enum {
        /// Enum type name.
        name: String,
        /// Variants in declaration/tag order.
        variants: Vec<EnumVariant>,
    },
    /// Nested schema, embedded by value rather than referenced. The
    /// boxed [`Schema`] is serialised in full, so the nested structure
    /// is flattened into the parent's canonical form and the digest
    /// does not depend on where the nested schema was declared.
    ///
    /// Note that [`Schema::name`] is part of that serialised form: two
    /// nested schemas with the same fields but different names produce
    /// different digests.
    Schema {
        /// Recursive canonical form of the nested schema.
        schema: Box<Schema>,
    },
    /// Phase F.2 (W7 closure-as-value boundary) — first-class closure
    /// value. The variant records the closure's user-visible signature
    /// (`params` declaration order, `ret`); the schema digest treats it
    /// as a structural shape so two anonymous closure fields with the
    /// same `(params, ret)` collapse to the same hash regardless of
    /// declaration site.
    ///
    /// The runtime representation is a scratch-heap pointer-indirect
    /// 8-byte handle (`[fn_table_idx: u32 LE][captures_ptr: u32 LE]`);
    /// see `relon_ir::IrType::Closure` for the wasm-side layout. The
    /// canonical form intentionally avoids carrying capture metadata —
    /// captures are an implementation detail of the lambda's closure
    /// conversion, not part of its ABI-visible type.
    ///
    /// Layout integration is **not** wired in this milestone: any
    /// `TypeRepr::Closure` reaching `SchemaLayout::offsets_for` surfaces
    /// as `LayoutError::UnsupportedTypeInLayoutV1` so the cross-boundary
    /// dangle the binary handshake would otherwise see stays guarded.
    /// Closure-typed fields are only valid as in-function intermediate
    /// values (let-bindings, dict-field caches the lowering pass
    /// owns) — never at a host-visible `#main` boundary.
    Closure {
        /// User-visible parameter types in declaration order. Carries
        /// nested `TypeRepr` so a closure-returning closure
        /// (`(Int) => (Int) => Int`) hashes as a distinct shape from a
        /// flat `(Int, Int) => Int`.
        params: Vec<TypeRepr>,
        /// Return type. Single value (no tuples) — matches the wasm
        /// `call_indirect` signature codegen emits today.
        ret: Box<TypeRepr>,
    },
}
128
/// One field in a canonical schema.
///
/// `default` carries the field's compile-time default value when
/// declared; it's serialised as raw JSON so the hash is sensitive to
/// `1` vs `1.0` vs `"1"` distinctions without needing a separate
/// canonical-value encoder.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Field {
    /// Field name as declared in source.
    pub name: String,
    /// Field type in canonical form.
    pub ty: TypeRepr,
    /// Default value, when the field declared one. `serde_json::Value`
    /// rather than `crate::value::Value` so the hash stays insulated
    /// from runtime-only payload variants (`Closure`, `Schema`, ...).
    ///
    /// The key is left out of the serialised form entirely when this is
    /// `None`, so "no default" and "default of JSON `null`" produce
    /// different canonical bytes.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub default: Option<serde_json::Value>,
}
147
/// One variant in a canonical `#enum` description.
///
/// Carried inside [`TypeRepr::Enum`]; the variant's name, tag and
/// payload fields all take part in the serialised form.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EnumVariant {
    /// Variant name as declared in source.
    pub name: String,
    /// Stable ABI tag. Tags are assigned in declaration order and must fit
    /// in one byte for the current variant-record layout.
    pub tag: u8,
    /// Payload fields. Empty means a unit variant.
    pub fields: Vec<Field>,
    /// `true` when the payload came from a tuple variant, e.g.
    /// `Rgb(Int, Int, Int)`. Field names are then synthetic decimal
    /// indices (`"0"`, `"1"`, ...), and host JSON projection emits an
    /// array payload.
    ///
    /// Serialised only when `true` and defaulted to `false` when the
    /// key is missing on deserialisation.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub is_tuple: bool,
}
165
166impl EnumVariant {
167    /// Build the record schema used for this variant's payload. Tuple
168    /// variants deliberately use a normal record schema with numeric field
169    /// names, because `Value::variant_dict` stores tuple payloads as a dict;
170    /// JSON projection recognizes those numeric keys and emits an array.
171    pub fn payload_schema(&self, enum_name: &str) -> Option<Schema> {
172        if self.fields.is_empty() {
173            return None;
174        }
175        Some(Schema {
176            name: format!("{enum_name}.{}", self.name),
177            generics: Vec::new(),
178            fields: self.fields.clone(),
179            is_tuple: false,
180        })
181    }
182}
183
/// Canonical schema description. Field order is preserved exactly as
/// declared; see the module docs for the rationale.
///
/// Every member below is serialised into the canonical form (subject
/// to the `is_tuple` skip rule), so each one influences the ABI hash.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Schema {
    /// Schema name. Anonymous schemas declared inline carry an empty
    /// string so the canonical form remains deterministic even when
    /// hosts forget to pass a name through.
    pub name: String,
    /// Generic type parameters (e.g. `["T"]` for `Page<T>`). Empty
    /// for monomorphic schemas.
    pub generics: Vec<String>,
    /// Fields in **declaration order**. Reordering invalidates the
    /// hash even when each field's name + type are otherwise
    /// identical, because the binary layout's field offsets are
    /// declaration-order dependent.
    pub fields: Vec<Field>,
    /// Wave T2: marks an **anonymous positional record** synthesised for
    /// a `Tuple<...>`. The binary layout, buffer builder, verifier and
    /// codegen treat such a schema exactly like any other record (its
    /// fields carry the synthetic positional names `"0"`, `"1"`, ...),
    /// so the whole record/return ABI is reused unchanged. The only
    /// behavioural fork is the **host decode**: a tuple schema decodes
    /// to a positional `Value::Tuple`, which JSON projection later emits
    /// as an array, rather than to a branded object.
    ///
    /// Serialised only when `true`, so every pre-T2 (non-tuple) schema's
    /// canonical bytes — and therefore its ABI hash — stay unchanged.
    /// A missing key deserialises as `false`.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub is_tuple: bool,
}
214
215impl TypeRepr {
216    /// Return a custom enum variant by ABI tag.
217    pub fn enum_variant_by_tag(&self, tag: u8) -> Option<&EnumVariant> {
218        match self {
219            TypeRepr::Enum { variants, .. } => variants.iter().find(|v| v.tag == tag),
220            _ => None,
221        }
222    }
223
224    /// Return a custom enum variant by source name.
225    pub fn enum_variant_by_name(&self, variant_name: &str) -> Option<&EnumVariant> {
226        match self {
227            TypeRepr::Enum { variants, .. } => variants.iter().find(|v| v.name == variant_name),
228            _ => None,
229        }
230    }
231}
232
233impl Schema {
234    /// Build an empty schema with the given `name`. Convenience ctor
235    /// used by tests and by future codegen-side conversions.
236    pub fn new(name: impl Into<String>) -> Self {
237        Self {
238            name: name.into(),
239            generics: Vec::new(),
240            fields: Vec::new(),
241            is_tuple: false,
242        }
243    }
244
245    /// Wave T2: build an anonymous positional-record schema for a
246    /// `Tuple<...>` from its element types in order. Field names are the
247    /// synthetic decimal indices `"0"`, `"1"`, ... so the existing
248    /// declaration-order layout pass assigns one slot per element; the
249    /// `is_tuple` flag drives positional `Value::Tuple` host decode.
250    pub fn tuple(name: impl Into<String>, elements: Vec<TypeRepr>) -> Self {
251        let fields = elements
252            .into_iter()
253            .enumerate()
254            .map(|(i, ty)| Field {
255                name: i.to_string(),
256                ty,
257                default: None,
258            })
259            .collect();
260        Self {
261            name: name.into(),
262            generics: Vec::new(),
263            fields,
264            is_tuple: true,
265        }
266    }
267}
268
269/// Serialise a [`Schema`] to its canonical byte form.
270///
271/// The output is the schema's JSON projection with:
272///
273/// * sorted object keys at every level (so `serde_json`'s map iteration
274///   order can't poison the hash even when `BTreeMap` / `HashMap`
275///   internals reshuffle between minor releases),
276/// * no whitespace (compact form),
277/// * a top-level `"version": 3` marker so future canonical-form
278///   evolutions can bump and stay distinguishable. Phase F.2 lifted
279///   v1 -> v2 when adding the [`TypeRepr::Closure`] variant; v3 adds
280///   [`TypeRepr::Enum`] so custom `#enum` boundary shapes are explicit
281///   in the hash.
282///
283/// Field order inside [`Schema::fields`] is **not** sorted — the
284/// `Vec<Field>` is serialised in the order callers declared, matching
285/// the binary layout's declaration-order slot assignment.
286pub fn canonical_schema(schema: &Schema) -> Vec<u8> {
287    // We wrap the schema in a stable envelope so the version marker
288    // sits at a predictable spot in the JSON output. `BTreeMap` keeps
289    // keys sorted (`version` then `schema`) — important because
290    // `serde_json::Value::Object` is itself a `BTreeMap`-backed
291    // structure when constructed via `json!`, which gives us the
292    // sorted-keys property without us writing a custom serializer.
293    let value = serde_json::json!({
294        "version": 3,
295        "schema": schema,
296    });
297    // `serde_json::to_vec` on a `serde_json::Value` produces compact
298    // output (no whitespace) and walks `Value::Object` in key order
299    // (BTreeMap). The `Schema` types above are `#[derive(Serialize)]`
300    // and emit their fields in declaration order — exactly what the
301    // canonical form requires. Combined, that means: nested map keys
302    // (e.g. inside `default` values) are sorted; field-level ordering
303    // is preserved.
304    serde_json::to_vec(&value).expect("canonical schema serialisation never fails on owned types")
305}
306
307/// Compute the 32-byte sha256 digest of a schema's canonical form.
308///
309/// The digest is the value embedded in the `relon.abi` custom section
310/// for `main_schema_hash` / `return_schema_hash`. Host SDKs compute
311/// the same digest from their compile-time schema knowledge and
312/// refuse-to-load on mismatch.
313pub fn schema_hash(schema: &Schema) -> [u8; 32] {
314    let canonical = canonical_schema(schema);
315    let mut hasher = Sha256::new();
316    hasher.update(&canonical);
317    hasher.finalize().into()
318}
319
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a [`Field`] from its three parts.
    fn field(name: &str, ty: TypeRepr, default: Option<serde_json::Value>) -> Field {
        Field {
            name: name.to_owned(),
            ty,
            default,
        }
    }

    /// Three-field `User` record shared by every test: `id: Int`,
    /// `name: String`, and `active: Bool` defaulting to `true`.
    fn sample_user_schema() -> Schema {
        let mut user = Schema::new("User");
        user.fields = vec![
            field("id", TypeRepr::Int, None),
            field("name", TypeRepr::String, None),
            field(
                "active",
                TypeRepr::Bool,
                Some(serde_json::Value::Bool(true)),
            ),
        ];
        user
    }

    /// `Wrapper` record whose single `user` field nests `inner` by value.
    fn wrapper_around(inner: Schema) -> Schema {
        let mut wrapper = Schema::new("Wrapper");
        wrapper.fields.push(field(
            "user",
            TypeRepr::Schema {
                schema: Box::new(inner),
            },
            None,
        ));
        wrapper
    }

    #[test]
    fn identical_schemas_produce_identical_hash() {
        // The digest identifies the wire-level shape, not a particular
        // Rust value, so two separately built copies must agree.
        let first = schema_hash(&sample_user_schema());
        let second = schema_hash(&sample_user_schema());
        assert_eq!(first, second);
    }

    #[test]
    fn field_reorder_changes_hash() {
        // Declaration order determines binary-layout offsets, so
        // swapping two fields is wire-visible and has to change the
        // digest even though every (name, type, default) is unchanged.
        let original = sample_user_schema();
        let mut reordered = sample_user_schema();
        reordered.fields.swap(0, 1);
        assert_ne!(schema_hash(&original), schema_hash(&reordered));

        // Sanity: once ordered by name, both hold the same field set.
        let sorted_fields = |schema: &Schema| {
            let mut fields = schema.fields.clone();
            fields.sort_by(|a, b| a.name.cmp(&b.name));
            fields
        };
        assert_eq!(sorted_fields(&original), sorted_fields(&reordered));
    }

    #[test]
    fn doc_comment_and_metadata_absent_means_hash_stable() {
        // Neither `Schema` nor `Field` has a slot for doc comments or
        // decorator metadata, so the canonical bytes should never
        // mention them. If a later change leaks such data into the
        // hash, these substring checks will catch it.
        let text = String::from_utf8(canonical_schema(&sample_user_schema()))
            .expect("canonical form is utf-8 json");
        assert!(
            !text.contains("\"doc"),
            "canonical form must not contain doc-related keys, got: {text}"
        );
        assert!(
            !text.contains("\"meta"),
            "canonical form must not contain decorator metadata keys, got: {text}"
        );
    }

    #[test]
    fn nested_schema_inline_matches_flattened_equivalent() {
        // A nested schema is inlined by value, so wrapping `User` and
        // wrapping a field-by-field reconstruction of `User` must give
        // the same digest: declaration site does not matter.
        //
        // The reconstruction deliberately reuses the nested name. The
        // declared name is part of the canonical form (renaming a type
        // flips the brand string hosts see), so the invariance being
        // exercised is about file-path locality, not about renames.
        let inner = sample_user_schema();
        let rebuilt = Schema {
            name: inner.name.clone(),
            generics: inner.generics.clone(),
            fields: inner.fields.clone(),
            is_tuple: false,
        };
        let outer_with_named = wrapper_around(inner);
        let outer_with_alias = wrapper_around(rebuilt);
        assert_eq!(
            schema_hash(&outer_with_named),
            schema_hash(&outer_with_alias)
        );
    }

    #[test]
    fn different_field_default_changes_hash() {
        // Defaults are compile-time visible to the host, so changing
        // one must move the digest; otherwise the SDK's drift check
        // would miss it.
        let user_with_active_default = |flag: bool| {
            let mut user = sample_user_schema();
            user.fields[2].default = Some(serde_json::Value::Bool(flag));
            user
        };
        assert_ne!(
            schema_hash(&user_with_active_default(true)),
            schema_hash(&user_with_active_default(false))
        );
    }

    #[test]
    fn canonical_form_is_compact_json() {
        // Any whitespace would allow byte-different encodings of the
        // same schema, so pin the compact form explicitly.
        let bytes = canonical_schema(&sample_user_schema());
        assert!(
            !bytes.contains(&b' '),
            "canonical form must contain no spaces"
        );
        assert!(
            !bytes.contains(&b'\n'),
            "canonical form must contain no newlines"
        );
    }
}
485}