Skip to main content

pmcp_workbook_runtime/
artifact_model.rs

//! The RUNTIME-safe bundle artifact model + hashing (Phase 11, Plan 05 / Codex
//! HIGH #2 boundary).
//!
//! These shapes describe the EMITTED bundle that the served binary deserializes
//! and integrity-checks at load:
//!
//! - [`CellEntry`]/[`CellMap`] — the manifest-driven I/O map (Codex HIGH #5).
//! - [`ArtifactHashes`]/[`BundleLock`] — the per-artifact + combined SHA-256
//!   hash-of-hashes integrity record (ART-04/D-05).
//!
//! They live HERE (umya/SWC-free) so BOTH sides share ONE definition rather than
//! the served binary re-declaring byte-for-byte serde mirrors:
//!
//! - `workbook-compiler` (the offline EMITTER) re-exports these from
//!   `artifact::{cell_map,bundle_lock}` via a re-export shim (the SAME pattern
//!   `manifest::model` uses), so the emit path keeps compiling unchanged.
//! - the served binary deserializes these types DIRECTLY and recomputes
//!   integrity via the SAME [`build_bundle_lock`] the emitter used.
//!
//! The hashing helpers ([`sha256_hex`], [`update_field`], [`fold_evidence_hash`],
//! [`build_bundle_lock`]) are the SINGLE source the emitter and the server-side
//! integrity check share — both sides MUST reproduce the same bytes, or the
//! integrity check reports tampering on an untouched bundle (a false positive).
23
24use std::collections::BTreeMap;
25
26use serde::{Deserialize, Serialize};
27use sha2::{Digest, Sha256};
28
29use crate::sheet_ir::value::CellValue;
30
31/// One input/output cell entry in a [`CellMap`].
32#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
33pub struct CellEntry {
34    /// The neutral JSON key the caller uses for this cell (the LLM-facing name).
35    pub json_key: String,
36    /// The `CellEnv` seed coordinate — the fully-qualified `sheet!addr` cell key.
37    pub seed_coord: String,
38    /// The declared unit (`m2`/`GBP`/…), when known.
39    pub unit: Option<String>,
40}
41
42/// One served tool — the multi-tool model lift (WBV2-03, §4.1): each output Table in
43/// the source workbook becomes its OWN [`Tool`], owning its `outputs` projection and a
44/// minimal, DAG-derived `input_keys` schema (the subset of the shared [`CellMap::inputs`]
45/// pool transitively reachable upstream of this tool's output cells).
46///
47/// This type crosses the reader-free boundary (it lives HERE, beside [`CellMap`], not
48/// re-declared on the served side): both the offline compiler emitter and the served
49/// binary deserialize ONE definition (artifact_model.rs module doc).
50///
51/// Derive note: `Eq` is DROPPED — `oracle` carries [`CellValue`] (an `f64`-bearing
52/// `Number`), so this type is `PartialEq` but NOT `Eq` (the [`crate::manifest_model`]
53/// `GovernedDatum` precedent).
54#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
55pub struct Tool {
56    /// The tool name — derived from the owning output Table's name (raw; MCP-charset
57    /// sanitization happens in the served emit, Plan 04).
58    pub name: String,
59    /// The tool description — the caption cell above the output Table, when authored.
60    pub description: Option<String>,
61    /// The minimal input schema: the LLM-facing `json_key`s of the [`CellMap::inputs`]
62    /// pool entries transitively reachable upstream of this tool's outputs (constant-only
63    /// paths excluded; shared intermediates yield the union of this tool's own upstream
64    /// leaves). DAG-derived via [`crate::dag::upstream_input_leaves`].
65    pub input_keys: Vec<String>,
66    /// One entry per output cell this tool projects (reuses [`CellEntry`] — the same
67    /// `{json_key, seed_coord, unit}` shape the inputs use).
68    pub outputs: Vec<CellEntry>,
69    /// The per-tool reconcile oracle: `<output json_key>` → the authored expected value
70    /// (the cached `<v>` cell value). Carries a typed [`CellValue`] (the `f64`-bearing
71    /// `Number` that drops `Eq`).
72    pub oracle: BTreeMap<String, CellValue>,
73}
74
75/// The manifest-driven I/O cell map (Codex HIGH #5): the shared inputs pool + the
76/// per-Table [`Tool`]s the served binary fans out into one MCP tool each (WBV2-03 §4.1).
77///
78/// The single-tool `outputs: Vec<CellEntry>` FIELD was lifted to `tools: Vec<Tool>`:
79/// each [`Tool`] owns its own outputs + minimal `input_keys`, so the N=1 (single output
80/// Table) case is just `tools.len() == 1` — never special-cased. `inputs` stays the
81/// shared pool every tool draws its `input_keys` from.
82///
83/// Derive note: `Eq` is DROPPED because [`Tool::oracle`] carries an `f64`-bearing
84/// [`CellValue`]; the map is `PartialEq` but NOT `Eq`.
85#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
86pub struct CellMap {
87    /// One entry per `Role::Input` cell (the shared seedable per-call input pool every
88    /// tool's `input_keys` draws from).
89    pub inputs: Vec<CellEntry>,
90    /// One [`Tool`] per output Table (WBV2-03 §4.1) — the multi-tool fan-out.
91    pub tools: Vec<Tool>,
92}
93
94/// The three per-artifact content hashes recorded in a [`BundleLock`].
95#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
96pub struct ArtifactHashes {
97    /// SHA-256 over `executable.ir.json` bytes (64-char hex).
98    pub executable: String,
99    /// SHA-256 over `manifest.json` bytes (64-char hex).
100    pub manifest: String,
101    /// SHA-256 over the evidence directory's path+length-prefixed content (64-char
102    /// hex; computed by the evidence emitter, which also folds `cell_map.json`).
103    pub evidence: String,
104}
105
106/// The `BUNDLE.lock` record (ART-04/D-05): the bundle identity, the
107/// `workbook_hash` provenance anchor, the three per-artifact content hashes, and
108/// the COMBINED hash-of-hashes that flips on any single-artifact change.
109#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
110pub struct BundleLock {
111    /// The neutral bundle identifier (D-17; e.g. `"tax-calc"`).
112    pub bundle_id: String,
113    /// The semver version (e.g. `"1.0.0"`).
114    pub version: String,
115    /// The canonical source-workbook CONTENT hash (`source_workbook_hash`), the
116    /// provenance anchor binding the bundle to the exact source workbook (D-05).
117    pub workbook_hash: String,
118    /// The per-artifact content hashes.
119    pub artifacts: ArtifactHashes,
120    /// The combined hash-of-hashes over the three per-artifact hashes — flips
121    /// when ANY artifact changes (tampering / partial-rebuild detection, D-05).
122    pub combined: String,
123}
124
125/// `hex::encode(Sha256::digest(bytes))` — the single per-artifact content hash.
126pub fn sha256_hex(bytes: &[u8]) -> String {
127    hex::encode(Sha256::digest(bytes))
128}
129
130/// Feed one length-prefixed field to the digest: the tag, then the u64-LE byte
131/// length, then the bytes. Because the length is encoded out-of-band, the field
132/// bytes can contain ANY byte without creating an ambiguous boundary (T-7-11).
133///
134/// This is the SINGLE canonicalization the evidence-dir hash uses; the server's
135/// integrity recompute and the emitter MUST share it byte-for-byte.
136pub fn update_field(hasher: &mut Sha256, tag: &[u8], data: &[u8]) {
137    hasher.update(tag);
138    hasher.update((data.len() as u64).to_le_bytes());
139    hasher.update(data);
140}
141
142/// Fold the evidence-dir hash over `(relative_path, bytes)` members.
143///
144/// Each member is fed as two length-prefixed fields (`evidence.path`, then
145/// `evidence.body`) via [`update_field`], in SORTED relative-path order — the
146/// sort happens HERE, so callers cannot desync on ordering. This is the SINGLE
147/// evidence fold the emitter, the fixture generator, and the server-side loader
148/// recompute share, byte-for-byte (Pitfall 2).
149pub fn fold_evidence_hash(members: &[(&str, &[u8])]) -> String {
150    let mut sorted: Vec<&(&str, &[u8])> = members.iter().collect();
151    sorted.sort_by_key(|(path, _)| *path);
152    let mut hasher = Sha256::new();
153    for (path, body) in sorted {
154        update_field(&mut hasher, b"evidence.path", path.as_bytes());
155        update_field(&mut hasher, b"evidence.body", body);
156    }
157    hex::encode(hasher.finalize())
158}
159
160/// Build the [`BundleLock`] over the emitted artifact bytes.
161///
162/// Each per-artifact hash is `hex::encode(Sha256::digest(bytes))`; the combined
163/// hash is `Sha256` over the concatenation of the three 64-char hex hashes (a
164/// fixed-width concatenation is unambiguous). `workbook_hash` is the
165/// caller-supplied `source_workbook_hash` content projection — RECORDED, not
166/// recomputed from raw bytes (D-05). A one-byte change to any artifact flips its
167/// per-artifact hash, which flips the combined hash (D-05 tamper detection).
168pub fn build_bundle_lock(
169    bundle_id: &str,
170    version: &str,
171    workbook_hash: String,
172    ir_json: &str,
173    manifest_json: &str,
174    evidence_hash: &str,
175) -> BundleLock {
176    let h_exec = sha256_hex(ir_json.as_bytes());
177    let h_manifest = sha256_hex(manifest_json.as_bytes());
178    // The evidence hash is computed over the evidence DIR (path+length-prefixed,
179    // folding cell_map.json) by the emitter; the lock records it verbatim.
180    let h_evidence = evidence_hash.to_string();
181
182    let combined = sha256_hex(format!("{h_exec}{h_manifest}{h_evidence}").as_bytes());
183
184    BundleLock {
185        bundle_id: bundle_id.to_string(),
186        version: version.to_string(),
187        workbook_hash,
188        artifacts: ArtifactHashes {
189            executable: h_exec,
190            manifest: h_manifest,
191            evidence: h_evidence,
192        },
193        combined,
194    }
195}
196
#[cfg(test)]
mod tests {
    use super::*;

    /// Deterministic stand-in for the caller-supplied `source_workbook_hash`.
    fn workbook_hash() -> String {
        sha256_hex(b"S!A1|10|\nS!B1|0.37|")
    }

    /// Shorthand constructor for a [`CellEntry`].
    fn entry(json_key: &str, seed: &str, unit: Option<&str>) -> CellEntry {
        CellEntry {
            json_key: json_key.to_string(),
            seed_coord: seed.to_string(),
            unit: unit.map(str::to_string),
        }
    }

    /// Build a lock for the fixed `tax-calc` / `1.0.0` identity, varying only the
    /// artifact inputs. The evidence hash is derived as `sha256_hex(evidence_seed)`.
    fn tax_calc_lock(ir_json: &str, manifest_json: &str, evidence_seed: &[u8]) -> BundleLock {
        build_bundle_lock(
            "tax-calc",
            "1.0.0",
            workbook_hash(),
            ir_json,
            manifest_json,
            &sha256_hex(evidence_seed),
        )
    }

    #[test]
    fn artifact_model_tool_round_trips_through_serde() {
        let mut oracle = BTreeMap::new();
        oracle.insert("tax_owed".to_string(), CellValue::Number(18241.0));
        let tool = Tool {
            name: "Calculate_Tax".to_string(),
            description: Some("Compute the tax owed".to_string()),
            input_keys: vec!["income".to_string(), "filing".to_string()],
            outputs: vec![entry("tax_owed", "Calc!B3", Some("USD"))],
            oracle,
        };
        let json = serde_json::to_string(&tool).expect("serialize Tool");
        let back: Tool = serde_json::from_str(&json).expect("deserialize Tool");
        assert_eq!(
            tool, back,
            "Tool must serde round-trip preserving all fields"
        );
        assert_eq!(back.name, "Calculate_Tax");
        assert_eq!(back.description.as_deref(), Some("Compute the tax owed"));
        assert_eq!(back.input_keys, vec!["income", "filing"]);
        assert_eq!(back.outputs.len(), 1);
        assert_eq!(
            back.oracle.get("tax_owed"),
            Some(&CellValue::Number(18241.0))
        );
    }

    #[test]
    fn artifact_model_cell_map_with_tools_round_trips() {
        let map = CellMap {
            inputs: vec![entry("income", "In!B4", Some("USD"))],
            tools: vec![Tool {
                name: "Calculate_Tax".to_string(),
                description: None,
                input_keys: vec!["income".to_string()],
                outputs: vec![entry("tax_owed", "Calc!B3", Some("USD"))],
                oracle: BTreeMap::new(),
            }],
        };
        let json = serde_json::to_string(&map).expect("serialize CellMap");
        let back: CellMap = serde_json::from_str(&json).expect("deserialize CellMap");
        assert_eq!(back.inputs.len(), 1);
        // The N=1 (single output Table) case is just one tool — no special path.
        assert_eq!(
            back.tools.len(),
            1,
            "a one-Table manifest yields exactly one Tool"
        );
        assert_eq!(back.tools[0].name, "Calculate_Tax");
    }

    #[test]
    fn artifact_model_per_tool_outputs_are_independent() {
        // The shim is RETIRED (Plan 04): every consumer iterates `tools[].outputs`
        // per-tool. Two tools own DISJOINT output sets — there is no union accessor.
        let map = CellMap {
            inputs: vec![],
            tools: vec![
                Tool {
                    name: "A".to_string(),
                    description: None,
                    input_keys: vec![],
                    outputs: vec![entry("a1", "S!A1", None), entry("a2", "S!A2", None)],
                    oracle: BTreeMap::new(),
                },
                Tool {
                    name: "B".to_string(),
                    description: None,
                    input_keys: vec![],
                    outputs: vec![entry("b1", "S!B1", None)],
                    oracle: BTreeMap::new(),
                },
            ],
        };
        let tool_a_keys: Vec<&str> = map.tools[0]
            .outputs
            .iter()
            .map(|e| e.json_key.as_str())
            .collect();
        let tool_b_keys: Vec<&str> = map.tools[1]
            .outputs
            .iter()
            .map(|e| e.json_key.as_str())
            .collect();
        assert_eq!(tool_a_keys, vec!["a1", "a2"], "tool A owns its outputs");
        assert_eq!(tool_b_keys, vec!["b1"], "tool B owns its outputs");
        // The per-tool union, computed inline by consumers (no accessor).
        let total: usize = map.tools.iter().map(|t| t.outputs.len()).sum();
        assert_eq!(total, 3, "three output cells across two tools");
    }

    #[test]
    fn bundle_lock_records_three_plus_combined() {
        let lock = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVIDENCE-DIR}");
        for h in [
            &lock.artifacts.executable,
            &lock.artifacts.manifest,
            &lock.artifacts.evidence,
            &lock.combined,
        ] {
            assert_eq!(h.len(), 64, "each hash is a 64-char sha256 hex");
            assert!(
                h.chars().all(|c| c.is_ascii_hexdigit()),
                "each hash contains only hex digits"
            );
        }
        assert_ne!(lock.combined, lock.artifacts.executable);
        assert_ne!(lock.combined, lock.artifacts.manifest);
        assert_ne!(lock.combined, lock.artifacts.evidence);
    }

    #[test]
    fn bundle_lock_hashes_stable_across_runs() {
        let a = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");
        let b = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");
        assert_eq!(a, b, "bundle-lock hashing is stable across runs");
    }

    #[test]
    fn combined_hash_changes_when_any_artifact_changes() {
        let base = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");

        // One extra byte in the manifest.
        let tampered_manifest = tax_calc_lock("{IR}", "{MANIFEST }", b"{EVID}");
        assert_ne!(base.artifacts.manifest, tampered_manifest.artifacts.manifest);
        assert_ne!(base.combined, tampered_manifest.combined);

        // One extra byte in the executable IR.
        let tampered_exec = tax_calc_lock("{IR }", "{MANIFEST}", b"{EVID}");
        assert_ne!(base.artifacts.executable, tampered_exec.artifacts.executable);
        assert_ne!(base.combined, tampered_exec.combined);

        // A different evidence hash: the third artifact must flip `combined` too.
        let tampered_evidence = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID }");
        assert_ne!(base.artifacts.evidence, tampered_evidence.artifacts.evidence);
        assert_ne!(base.combined, tampered_evidence.combined);
    }

    #[test]
    fn workbook_hash_reuses_content_projection() {
        let wh = workbook_hash();
        let lock = build_bundle_lock(
            "tax-calc",
            "1.0.0",
            wh.clone(),
            "{IR}",
            "{MANIFEST}",
            &sha256_hex(b"{EVID}"),
        );
        assert_eq!(lock.workbook_hash, wh);
        assert_ne!(lock.workbook_hash, lock.artifacts.executable);
        assert_ne!(lock.workbook_hash, lock.combined);
    }

    // Despite the historical name, this pins that `bundle_id` and `version` are
    // recorded from the arguments rather than hard-coded.
    #[test]
    fn workflow_and_version_are_parameters_not_hardcoded() {
        let lock = build_bundle_lock(
            "other-bundle",
            "2.3.4",
            workbook_hash(),
            "{IR}",
            "{MANIFEST}",
            &sha256_hex(b"{EVID}"),
        );
        assert_eq!(lock.bundle_id, "other-bundle");
        assert_eq!(lock.version, "2.3.4");
    }

    #[test]
    fn update_field_is_length_prefixed() {
        // Two fields whose concatenation would collide are distinguished by the
        // out-of-band length prefix.
        let mut a = Sha256::new();
        update_field(&mut a, b"t", b"ab");
        update_field(&mut a, b"t", b"c");
        let mut b = Sha256::new();
        update_field(&mut b, b"t", b"a");
        update_field(&mut b, b"t", b"bc");
        assert_ne!(hex::encode(a.finalize()), hex::encode(b.finalize()));
    }

    #[test]
    fn fold_evidence_hash_is_order_independent() {
        // The sort happens inside the fold, so caller ordering must not matter.
        let forward = [
            ("a.json", "one".as_bytes()),
            ("b.json", "two".as_bytes()),
        ];
        let reversed = [
            ("b.json", "two".as_bytes()),
            ("a.json", "one".as_bytes()),
        ];
        assert_eq!(fold_evidence_hash(&forward), fold_evidence_hash(&reversed));
    }

    #[test]
    fn fold_evidence_hash_separates_path_from_body() {
        // Same concatenated bytes, different path/body split: the length prefixes
        // must keep the two apart.
        let left = fold_evidence_hash(&[("ab", "c".as_bytes())]);
        let right = fold_evidence_hash(&[("a", "bc".as_bytes())]);
        assert_ne!(left, right);
    }

    #[test]
    fn fold_evidence_hash_changes_with_member_content() {
        let base = fold_evidence_hash(&[("cell_map.json", "{}".as_bytes())]);
        let body_changed = fold_evidence_hash(&[("cell_map.json", "{ }".as_bytes())]);
        let path_changed = fold_evidence_hash(&[("cell_map2.json", "{}".as_bytes())]);
        assert_eq!(base.len(), 64, "the fold is a 64-char sha256 hex");
        assert_ne!(base, body_changed);
        assert_ne!(base, path_changed);
    }

    #[test]
    fn fold_evidence_hash_of_no_members_is_empty_digest() {
        // With no members nothing is fed to the hasher, so the result is the
        // SHA-256 of the empty input.
        assert_eq!(fold_evidence_hash(&[]), sha256_hex(b""));
    }
}
421}