relon_eval_api/schema_canonical.rs
1//! Deterministic schema serialisation + sha256 digest.
2//!
3//! The wasm AOT backend embeds a 32-byte schema hash into the
4//! `relon.abi` custom section so host SDKs can detect schema drift at
5//! load time (host writing with an outdated `#main` schema while the
6//! wasm module was compiled against the new one — a mismatch makes
7//! the wasm read garbage out of the binary handshake buffer). The
8//! cryptographic strength is irrelevant; what matters is that host
9//! and codegen compute the **same** hash from the **same** schema.
10//!
11//! Spec: `docs/internal/adr/wasm-srcmap-section-v1-2026-05-16.md`,
12//! "canonical #main schema" section.
13//!
14//! The canonical form is intentionally narrower than the runtime
15//! [`crate::value::SchemaData`] type:
16//!
17//! * Fields are stored in **declaration order**, not alphabetical.
18//! Field order changes are observable on the wire (layout slot
19//! offsets shift), so they must invalidate the hash.
20//! * `doc_comment` and decorator metadata are **not** captured — they
21//! are presentation / lint signals, not ABI-relevant.
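//! * Nested schemas are **inlined**, not referenced by name: the nested
//!   schema's full canonical form (including its declared name) is
//!   embedded at the use site. Two schemas with the same names and the
//!   same structural shape hash equal even when declared in different
//!   files — this is a "behavioural hash", not a "location hash".
//!   Renaming a schema, however, does change the hash.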
26//!
27//! Phase 2.b will plumb real `SchemaDef` lowering into this module;
28//! Phase 2.a only provides the canonical form + hash plumbing so the
29//! `relon.abi` section can be emitted with placeholder zeros until
30//! the codegen pass starts accepting schema input.
31
32use serde::{Deserialize, Serialize};
33use sha2::{Digest, Sha256};
34
/// Logical type description used by canonical serialisation.
///
/// This is **not** the parser's `TypeNode`: the parser shape carries
/// source ranges, doc comments, and parsed-but-unresolved generic
/// argument references, none of which belong in an ABI hash. The
/// canonical form is a structural snapshot that strips presentation
/// metadata and inlines nested schemas.
///
/// Variants mirror the v1 binary layout's leaf-type table. Tuple
/// shapes are represented as tuple schemas (`Schema::is_tuple = true`);
/// future leaf/container layout shapes such as `Bytes` would need a new
/// variant here so the hash distinguishes the new shape.
///
/// The enum is internally tagged (`#[serde(tag = "kind")]`): every
/// variant serialises as a JSON object whose `"kind"` key holds the
/// variant name, with the variant's own fields alongside it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum TypeRepr {
    /// Internal unit slot. New source schemas should use `Option<T>`
    /// or `T?` for absence instead of producing this type.
    Unit,
    /// `Bool` — a 0/1 byte.
    Bool,
    /// `Int` — a signed 64-bit integer.
    Int,
    /// `Float` — an IEEE-754 double.
    Float,
    /// `String` — UTF-8 bytes with a u32 length prefix.
    String,
    /// `List<T>` — variable-length sequence over `element`.
    List {
        /// Element type.
        element: Box<TypeRepr>,
    },
    /// `Option<T>` — tag + payload union with two arms.
    Option {
        /// `Some` payload type.
        inner: Box<TypeRepr>,
    },
    /// `Result<T, E>` — tag + payload union with two arms.
    Result {
        /// `Ok` payload type.
        ok: Box<TypeRepr>,
        /// `Err` payload type.
        err: Box<TypeRepr>,
    },
    /// User-defined Rust-like `#enum Name { ... }`. The binary ABI is a
    /// variant record: one tag byte plus an optional payload slot. Struct
    /// and tuple variant payloads are represented as ordinary schemas, with
    /// tuple payloads marked by [`EnumVariant::is_tuple`].
    Enum {
        /// Enum type name.
        name: String,
        /// Variants in declaration/tag order.
        variants: Vec<EnumVariant>,
    },
    /// Inline embedding of a nested schema. The nested [`Schema`] is
    /// serialised recursively in place — including its `name`,
    /// `generics` and fields — rather than being referenced indirectly.
    /// The digest therefore depends on the nested schema's declared
    /// name and structure, but [`Schema`] carries no file or location
    /// information, so the declaration site does not affect it.
    Schema {
        /// Recursive canonical form of the nested schema.
        schema: Box<Schema>,
    },
    /// Phase F.2 (W7 closure-as-value boundary) — first-class closure
    /// value. The variant records the closure's user-visible signature
    /// (`params` declaration order, `ret`); the schema digest treats it
    /// as a structural shape so two anonymous closure fields with the
    /// same `(params, ret)` collapse to the same hash regardless of
    /// declaration site.
    ///
    /// The runtime representation is a scratch-heap pointer-indirect
    /// 8-byte handle (`[fn_table_idx: u32 LE][captures_ptr: u32 LE]`);
    /// see `relon_ir::IrType::Closure` for the wasm-side layout. The
    /// canonical form intentionally avoids carrying capture metadata —
    /// captures are an implementation detail of the lambda's closure
    /// conversion, not part of its ABI-visible type.
    ///
    /// Layout integration is **not** wired in this milestone: any
    /// `TypeRepr::Closure` reaching `SchemaLayout::offsets_for` surfaces
    /// as `LayoutError::UnsupportedTypeInLayoutV1` so the cross-boundary
    /// dangle the binary handshake would otherwise see stays guarded.
    /// Closure-typed fields are only valid as in-function intermediate
    /// values (let-bindings, dict-field caches the lowering pass
    /// owns) — never at a host-visible `#main` boundary.
    Closure {
        /// User-visible parameter types in declaration order. Carries
        /// nested `TypeRepr` so a closure-returning closure
        /// (`(Int) => (Int) => Int`) hashes as a distinct shape from a
        /// flat `(Int, Int) => Int`.
        params: Vec<TypeRepr>,
        /// Return type. Single value (no tuples) — matches the wasm
        /// `call_indirect` signature codegen emits today.
        ret: Box<TypeRepr>,
    },
}
128
/// One field in a canonical schema.
///
/// `default` carries the field's compile-time default value when
/// declared; it's serialised as raw JSON so the hash is sensitive to
/// `1` vs `1.0` vs `"1"` distinctions without needing a separate
/// canonical-value encoder.
///
/// All three members take part in the serialised form, so a change to
/// the name, the type, or the default changes the canonical bytes.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Field {
    /// Field name as declared in source.
    pub name: String,
    /// Field type in canonical form.
    pub ty: TypeRepr,
    /// Default value, when the field declared one. `serde_json::Value`
    /// rather than `crate::value::Value` so the hash stays insulated
    /// from runtime-only payload variants (`Closure`, `Schema`, ...).
    ///
    /// The key is omitted entirely from the serialised form when this
    /// is `None`, so fields without a default contribute no `default`
    /// entry to the canonical bytes.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub default: Option<serde_json::Value>,
}
147
/// One variant in a canonical `#enum` description.
///
/// Variants are stored inside [`TypeRepr::Enum`] and serialised with
/// their name, tag and payload fields.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EnumVariant {
    /// Variant name as declared in source.
    pub name: String,
    /// Stable ABI tag. Tags are assigned in declaration order and must fit
    /// in one byte for the current variant-record layout.
    pub tag: u8,
    /// Payload fields. Empty means a unit variant.
    pub fields: Vec<Field>,
    /// `true` when the payload came from a tuple variant, e.g.
    /// `Rgb(Int, Int, Int)`. Field names are then synthetic decimal
    /// indices (`"0"`, `"1"`, ...), and host JSON projection emits an
    /// array payload.
    ///
    /// Serialised only when `true` (and read back as `false` when the
    /// key is missing), so non-tuple variants carry no `is_tuple` key
    /// in their canonical bytes.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub is_tuple: bool,
}
165
166impl EnumVariant {
167 /// Build the record schema used for this variant's payload. Tuple
168 /// variants deliberately use a normal record schema with numeric field
169 /// names, because `Value::variant_dict` stores tuple payloads as a dict;
170 /// JSON projection recognizes those numeric keys and emits an array.
171 pub fn payload_schema(&self, enum_name: &str) -> Option<Schema> {
172 if self.fields.is_empty() {
173 return None;
174 }
175 Some(Schema {
176 name: format!("{enum_name}.{}", self.name),
177 generics: Vec::new(),
178 fields: self.fields.clone(),
179 is_tuple: false,
180 })
181 }
182}
183
/// Canonical schema description. Field order is preserved exactly as
/// declared; see the module docs for the rationale.
///
/// `name`, `generics` and `fields` are always serialised, so each of
/// them — including the declared name — is part of the canonical
/// bytes. `is_tuple` is serialised only when `true`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Schema {
    /// Schema name. Anonymous schemas declared inline carry an empty
    /// string so the canonical form remains deterministic even when
    /// hosts forget to pass a name through.
    pub name: String,
    /// Generic type parameters (e.g. `["T"]` for `Page<T>`). Empty
    /// for monomorphic schemas.
    pub generics: Vec<String>,
    /// Fields in **declaration order**. Reordering invalidates the
    /// hash even when each field's name + type are otherwise
    /// identical, because the binary layout's field offsets are
    /// declaration-order dependent.
    pub fields: Vec<Field>,
    /// Wave T2: marks an **anonymous positional record** synthesised for
    /// a `Tuple<...>`. The binary layout, buffer builder, verifier and
    /// codegen treat such a schema exactly like any other record (its
    /// fields carry the synthetic positional names `"0"`, `"1"`, ...),
    /// so the whole record/return ABI is reused unchanged. The only
    /// behavioural fork is the **host decode**: a tuple schema decodes
    /// to a positional `Value::Tuple`, which JSON projection later emits
    /// as an array, rather than to a branded object.
    ///
    /// Serialised only when `true`, so every pre-T2 (non-tuple) schema's
    /// canonical bytes — and therefore its ABI hash — stay unchanged.
    /// A missing key deserialises as `false`.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub is_tuple: bool,
}
214
215impl TypeRepr {
216 /// Return a custom enum variant by ABI tag.
217 pub fn enum_variant_by_tag(&self, tag: u8) -> Option<&EnumVariant> {
218 match self {
219 TypeRepr::Enum { variants, .. } => variants.iter().find(|v| v.tag == tag),
220 _ => None,
221 }
222 }
223
224 /// Return a custom enum variant by source name.
225 pub fn enum_variant_by_name(&self, variant_name: &str) -> Option<&EnumVariant> {
226 match self {
227 TypeRepr::Enum { variants, .. } => variants.iter().find(|v| v.name == variant_name),
228 _ => None,
229 }
230 }
231}
232
233impl Schema {
234 /// Build an empty schema with the given `name`. Convenience ctor
235 /// used by tests and by future codegen-side conversions.
236 pub fn new(name: impl Into<String>) -> Self {
237 Self {
238 name: name.into(),
239 generics: Vec::new(),
240 fields: Vec::new(),
241 is_tuple: false,
242 }
243 }
244
245 /// Wave T2: build an anonymous positional-record schema for a
246 /// `Tuple<...>` from its element types in order. Field names are the
247 /// synthetic decimal indices `"0"`, `"1"`, ... so the existing
248 /// declaration-order layout pass assigns one slot per element; the
249 /// `is_tuple` flag drives positional `Value::Tuple` host decode.
250 pub fn tuple(name: impl Into<String>, elements: Vec<TypeRepr>) -> Self {
251 let fields = elements
252 .into_iter()
253 .enumerate()
254 .map(|(i, ty)| Field {
255 name: i.to_string(),
256 ty,
257 default: None,
258 })
259 .collect();
260 Self {
261 name: name.into(),
262 generics: Vec::new(),
263 fields,
264 is_tuple: true,
265 }
266 }
267}
268
269/// Serialise a [`Schema`] to its canonical byte form.
270///
271/// The output is the schema's JSON projection with:
272///
273/// * sorted object keys at every level (so `serde_json`'s map iteration
274/// order can't poison the hash even when `BTreeMap` / `HashMap`
275/// internals reshuffle between minor releases),
276/// * no whitespace (compact form),
277/// * a top-level `"version": 3` marker so future canonical-form
278/// evolutions can bump and stay distinguishable. Phase F.2 lifted
279/// v1 -> v2 when adding the [`TypeRepr::Closure`] variant; v3 adds
280/// [`TypeRepr::Enum`] so custom `#enum` boundary shapes are explicit
281/// in the hash.
282///
283/// Field order inside [`Schema::fields`] is **not** sorted — the
284/// `Vec<Field>` is serialised in the order callers declared, matching
285/// the binary layout's declaration-order slot assignment.
286pub fn canonical_schema(schema: &Schema) -> Vec<u8> {
287 // We wrap the schema in a stable envelope so the version marker
288 // sits at a predictable spot in the JSON output. `BTreeMap` keeps
289 // keys sorted (`version` then `schema`) — important because
290 // `serde_json::Value::Object` is itself a `BTreeMap`-backed
291 // structure when constructed via `json!`, which gives us the
292 // sorted-keys property without us writing a custom serializer.
293 let value = serde_json::json!({
294 "version": 3,
295 "schema": schema,
296 });
297 // `serde_json::to_vec` on a `serde_json::Value` produces compact
298 // output (no whitespace) and walks `Value::Object` in key order
299 // (BTreeMap). The `Schema` types above are `#[derive(Serialize)]`
300 // and emit their fields in declaration order — exactly what the
301 // canonical form requires. Combined, that means: nested map keys
302 // (e.g. inside `default` values) are sorted; field-level ordering
303 // is preserved.
304 serde_json::to_vec(&value).expect("canonical schema serialisation never fails on owned types")
305}
306
307/// Compute the 32-byte sha256 digest of a schema's canonical form.
308///
309/// The digest is the value embedded in the `relon.abi` custom section
310/// for `main_schema_hash` / `return_schema_hash`. Host SDKs compute
311/// the same digest from their compile-time schema knowledge and
312/// refuse-to-load on mismatch.
313pub fn schema_hash(schema: &Schema) -> [u8; 32] {
314 let canonical = canonical_schema(schema);
315 let mut hasher = Sha256::new();
316 hasher.update(&canonical);
317 hasher.finalize().into()
318}
319
#[cfg(test)]
mod tests {
    use super::*;

    /// Three-field `User` record shared by the tests below: two fields
    /// without defaults and one `Bool` field defaulting to `true`.
    fn sample_user_schema() -> Schema {
        Schema {
            name: "User".into(),
            generics: vec![],
            is_tuple: false,
            fields: vec![
                Field {
                    name: "id".into(),
                    ty: TypeRepr::Int,
                    default: None,
                },
                Field {
                    name: "name".into(),
                    ty: TypeRepr::String,
                    default: None,
                },
                Field {
                    name: "active".into(),
                    ty: TypeRepr::Bool,
                    default: Some(serde_json::Value::Bool(true)),
                },
            ],
        }
    }

    /// Wrap `inner` as the single `user` field of a `Wrapper` record.
    fn wrapper_around(inner: Schema) -> Schema {
        Schema {
            name: "Wrapper".into(),
            generics: vec![],
            is_tuple: false,
            fields: vec![Field {
                name: "user".into(),
                ty: TypeRepr::Schema {
                    schema: Box::new(inner),
                },
                default: None,
            }],
        }
    }

    #[test]
    fn identical_schemas_produce_identical_hash() {
        // Two independently constructed instances of the same schema
        // must hash to the same digest — the hash is the wire-level
        // identity of the shape, not of the Rust value.
        let a = sample_user_schema();
        let b = sample_user_schema();
        assert_eq!(schema_hash(&a), schema_hash(&b));
    }

    #[test]
    fn field_reorder_changes_hash() {
        // Field declaration order maps to binary-layout offsets, so a
        // reorder is observable on the wire and must invalidate the
        // hash even when each field's (name, type, default) is
        // otherwise identical.
        let mut original = sample_user_schema();
        let mut reordered = sample_user_schema();
        reordered.fields.swap(0, 1);
        assert_ne!(schema_hash(&original), schema_hash(&reordered));

        // Sanity: the only difference is order — field set is identical.
        original.fields.sort_by(|a, b| a.name.cmp(&b.name));
        reordered.fields.sort_by(|a, b| a.name.cmp(&b.name));
        assert_eq!(original.fields, reordered.fields);
    }

    #[test]
    fn doc_comment_and_metadata_absent_means_hash_stable() {
        // `Schema` / `Field` don't carry doc comments or decorator
        // metadata fields, so adding such metadata in upstream
        // `SchemaDef` should be a no-op for the canonical form once
        // Phase 2.b plumbs the conversion. We exercise the invariant
        // here by ensuring the canonical bytes don't mention any
        // "doc" key — a regression that quietly leaks docs into the
        // hash would surface as a string match here.
        let schema = sample_user_schema();
        let bytes = canonical_schema(&schema);
        let text = std::str::from_utf8(&bytes).expect("canonical form is utf-8 json");
        assert!(
            !text.contains("\"doc"),
            "canonical form must not contain doc-related keys, got: {text}"
        );
        assert!(
            !text.contains("\"meta"),
            "canonical form must not contain decorator metadata keys, got: {text}"
        );
    }

    #[test]
    fn nested_schema_inline_matches_flattened_equivalent() {
        // A nested schema is inlined recursively, and the canonical
        // form carries no declaration-site information. Two wrappers
        // whose nested schema was assembled independently — one from a
        // clone, one rebuilt member by member — must therefore reach
        // the same canonical bytes as long as name and structure match.
        let inner = sample_user_schema();
        let outer_with_named = wrapper_around(inner.clone());
        let outer_with_alias = wrapper_around(Schema {
            name: inner.name.clone(),
            generics: inner.generics.clone(),
            fields: inner.fields.clone(),
            is_tuple: false,
        });
        assert_eq!(
            schema_hash(&outer_with_named),
            schema_hash(&outer_with_alias)
        );
    }

    #[test]
    fn nested_schema_rename_changes_hash() {
        // The declared name of a nested schema is part of the canonical
        // form: a type rename is a breaking change for host consumers
        // (the brand string flips), so it must shift the digest even
        // when the nested fields are untouched.
        let inner = sample_user_schema();
        let mut renamed = inner.clone();
        renamed.name = "Account".into();
        assert_eq!(inner.fields, renamed.fields);
        assert_ne!(
            schema_hash(&wrapper_around(inner)),
            schema_hash(&wrapper_around(renamed))
        );
    }

    #[test]
    fn different_field_default_changes_hash() {
        // Belt-and-braces: tweaking a default value (compile-time
        // visible to the host) must shift the hash so a schema with a
        // changed default doesn't sneak past the SDK's drift check.
        let mut a = sample_user_schema();
        let mut b = sample_user_schema();
        a.fields[2].default = Some(serde_json::Value::Bool(true));
        b.fields[2].default = Some(serde_json::Value::Bool(false));
        assert_ne!(schema_hash(&a), schema_hash(&b));
    }

    #[test]
    fn canonical_form_is_compact_json() {
        // Whitespace in the canonical form would let an attacker forge
        // two byte-different-but-semantically-equal payloads. Lock
        // the compact form down with an explicit check. (The sample
        // schema contains no string values with spaces, so any space
        // or newline here would come from formatting.)
        let schema = sample_user_schema();
        let bytes = canonical_schema(&schema);
        assert!(
            !bytes.contains(&b' '),
            "canonical form must contain no spaces"
        );
        assert!(
            !bytes.contains(&b'\n'),
            "canonical form must contain no newlines"
        );
    }
}
485}