pmcp_workbook_runtime/artifact_model.rs
1//! The RUNTIME-safe bundle artifact model + hashing (Phase 11, Plan 05 / Codex
2//! HIGH #2 boundary).
3//!
4//! These shapes describe the EMITTED bundle that the served binary deserializes
5//! and integrity-checks at load:
6//!
7//! - [`CellEntry`]/[`CellMap`] — the manifest-driven I/O map (Codex HIGH #5).
8//! - [`ArtifactHashes`]/[`BundleLock`] — the per-artifact + combined SHA-256
9//! hash-of-hashes integrity record (ART-04/D-05).
10//!
11//! They live HERE (umya/SWC-free) so BOTH sides share ONE definition rather than
12//! the served binary re-declaring byte-for-byte serde mirrors:
13//!
14//! - `workbook-compiler` (the offline EMITTER) re-exports these from
15//! `artifact::{cell_map,bundle_lock}` via a re-export shim (the SAME pattern
16//! `manifest::model` uses), so the emit path keeps compiling unchanged.
17//! - the served binary deserializes these types DIRECTLY and recomputes
18//! integrity via the SAME [`build_bundle_lock`] the emitter used.
19//!
20//! The hashing helpers ([`sha256_hex`], [`build_bundle_lock`], [`update_field`])
21//! are the SINGLE source the emitter and the server-side integrity check share —
22//! they MUST byte-reproduce each other or the integrity check false-positives.
23
24use std::collections::BTreeMap;
25
26use serde::{Deserialize, Serialize};
27use sha2::{Digest, Sha256};
28
29use crate::sheet_ir::value::CellValue;
30
/// One input/output cell entry in a [`CellMap`].
// NOTE(review): the `///` text on this type and its fields is left verbatim on purpose.
// The `schemars::JsonSchema` derive normally copies doc comments into the generated
// schema's `description`, so rewording them may change schema output — confirm whether
// the schema is published or snapshot-tested before editing the wording.
// NOTE(review): field declaration order is the order the serde derive writes the keys in;
// if serialized entries end up in hashed artifact bytes, reordering fields changes those
// bytes — verify against the emitter before reordering.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct CellEntry {
    /// The neutral JSON key the caller uses for this cell (the LLM-facing name).
    pub json_key: String,
    /// The `CellEnv` seed coordinate — the fully-qualified `sheet!addr` cell key.
    pub seed_coord: String,
    /// The declared unit (`m2`/`GBP`/…), when known.
    // No serde attribute is set here, so an unknown unit is carried as `None`.
    pub unit: Option<String>,
}
41
/// One served tool — the multi-tool model lift (WBV2-03, §4.1): each output Table in
/// the source workbook becomes its OWN [`Tool`], owning its `outputs` projection and a
/// minimal, DAG-derived `input_keys` schema (the subset of the shared [`CellMap::inputs`]
/// pool transitively reachable upstream of this tool's output cells).
///
/// This type crosses the reader-free boundary (it lives HERE, beside [`CellMap`], not
/// re-declared on the served side): both the offline compiler emitter and the served
/// binary deserialize ONE definition (artifact_model.rs module doc).
///
/// Derive note: `Eq` is DROPPED — `oracle` carries [`CellValue`] (an `f64`-bearing
/// `Number`), so this type is `PartialEq` but NOT `Eq` (the [`crate::manifest_model`]
/// `GovernedDatum` precedent).
// NOTE(review): the `///` text above and on the fields is left verbatim on purpose. The
// `schemars::JsonSchema` derive normally copies doc comments into the generated schema's
// `description`, so rewording may change schema output — confirm before editing.
// NOTE(review): field declaration order is the key order the serde derive emits; keep it
// stable if the serialized form feeds any hashed artifact — verify against the emitter.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct Tool {
    /// The tool name — derived from the owning output Table's name (raw; MCP-charset
    /// sanitization happens in the served emit, Plan 04).
    pub name: String,
    /// The tool description — the caption cell above the output Table, when authored.
    pub description: Option<String>,
    /// The minimal input schema: the LLM-facing `json_key`s of the [`CellMap::inputs`]
    /// pool entries transitively reachable upstream of this tool's outputs (constant-only
    /// paths excluded; shared intermediates yield the union of this tool's own upstream
    /// leaves). DAG-derived via [`crate::dag::upstream_input_leaves`].
    // These are keys only; the matching `CellEntry` lives in the shared inputs pool.
    pub input_keys: Vec<String>,
    /// One entry per output cell this tool projects (reuses [`CellEntry`] — the same
    /// `{json_key, seed_coord, unit}` shape the inputs use).
    pub outputs: Vec<CellEntry>,
    /// The per-tool reconcile oracle: `<output json_key>` → the authored expected value
    /// (the cached `<v>` cell value). Carries a typed [`CellValue`] (the `f64`-bearing
    /// `Number` that drops `Eq`).
    // A `BTreeMap` iterates in sorted key order, so the serialized oracle does not depend
    // on insertion order.
    pub oracle: BTreeMap<String, CellValue>,
}
74
/// The manifest-driven I/O cell map (Codex HIGH #5): the shared inputs pool + the
/// per-Table [`Tool`]s the served binary fans out into one MCP tool each (WBV2-03 §4.1).
///
/// The single-tool `outputs: Vec<CellEntry>` FIELD was lifted to `tools: Vec<Tool>`:
/// each [`Tool`] owns its own outputs + minimal `input_keys`, so the N=1 (single output
/// Table) case is just `tools.len() == 1` — never special-cased. `inputs` stays the
/// shared pool every tool draws its `input_keys` from.
///
/// Derive note: `Eq` is DROPPED because [`Tool::oracle`] carries an `f64`-bearing
/// [`CellValue`]; the map is `PartialEq` but NOT `Eq`.
// NOTE(review): the `///` text above and on the fields is left verbatim on purpose. The
// `schemars::JsonSchema` derive normally copies doc comments into the generated schema's
// `description`, so rewording may change schema output — confirm before editing.
// NOTE(review): presumably this is the type serialized as `cell_map.json`, which the
// `ArtifactHashes::evidence` doc says is folded into the evidence hash. If so, field order
// and `Vec` element order both affect the hashed bytes — verify against the emitter.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct CellMap {
    /// One entry per `Role::Input` cell (the shared seedable per-call input pool every
    /// tool's `input_keys` draws from).
    pub inputs: Vec<CellEntry>,
    /// One [`Tool`] per output Table (WBV2-03 §4.1) — the multi-tool fan-out.
    pub tools: Vec<Tool>,
}
93
/// The three per-artifact content hashes recorded in a [`BundleLock`].
// NOTE(review): the `///` text on this type and its fields is left verbatim on purpose.
// The `schemars::JsonSchema` derive normally copies doc comments into the generated
// schema's `description`, so rewording may change schema output — confirm before editing.
// `build_bundle_lock` feeds the combined hash in the order executable, manifest, evidence,
// which matches the declaration order here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct ArtifactHashes {
    /// SHA-256 over `executable.ir.json` bytes (64-char hex).
    pub executable: String,
    /// SHA-256 over `manifest.json` bytes (64-char hex).
    pub manifest: String,
    /// SHA-256 over the evidence directory's path+length-prefixed content (64-char
    /// hex; computed by the evidence emitter, which also folds `cell_map.json`).
    // `build_bundle_lock` stores the caller-supplied string as-is; nothing in this file
    // checks that it is 64 hex characters.
    pub evidence: String,
}
105
/// The `BUNDLE.lock` record (ART-04/D-05): the bundle identity, the
/// `workbook_hash` provenance anchor, the three per-artifact content hashes, and
/// the COMBINED hash-of-hashes that flips on any single-artifact change.
// NOTE(review): the `///` text above and on the fields is left verbatim on purpose. The
// `schemars::JsonSchema` derive normally copies doc comments into the generated schema's
// `description`, so rewording may change schema output — confirm before editing.
// As built by `build_bundle_lock`, `combined` covers ONLY the three `artifacts` hashes:
// `bundle_id`, `version` and `workbook_hash` are recorded next to it but are not hashed
// into it, so changing them alone leaves `combined` unchanged.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct BundleLock {
    /// The neutral bundle identifier (D-17; e.g. `"tax-calc"`).
    pub bundle_id: String,
    /// The semver version (e.g. `"1.0.0"`).
    // Stored as a plain string; nothing in this file parses or validates it as semver.
    pub version: String,
    /// The canonical source-workbook CONTENT hash (`source_workbook_hash`), the
    /// provenance anchor binding the bundle to the exact source workbook (D-05).
    pub workbook_hash: String,
    /// The per-artifact content hashes.
    pub artifacts: ArtifactHashes,
    /// The combined hash-of-hashes over the three per-artifact hashes — flips
    /// when ANY artifact changes (tampering / partial-rebuild detection, D-05).
    pub combined: String,
}
124
125/// `hex::encode(Sha256::digest(bytes))` — the single per-artifact content hash.
126pub fn sha256_hex(bytes: &[u8]) -> String {
127 hex::encode(Sha256::digest(bytes))
128}
129
130/// Feed one length-prefixed field to the digest: the tag, then the u64-LE byte
131/// length, then the bytes. Because the length is encoded out-of-band, the field
132/// bytes can contain ANY byte without creating an ambiguous boundary (T-7-11).
133///
134/// This is the SINGLE canonicalization the evidence-dir hash uses; the server's
135/// integrity recompute and the emitter MUST share it byte-for-byte.
136pub fn update_field(hasher: &mut Sha256, tag: &[u8], data: &[u8]) {
137 hasher.update(tag);
138 hasher.update((data.len() as u64).to_le_bytes());
139 hasher.update(data);
140}
141
142/// Fold the evidence-dir hash over `(relative_path, bytes)` members.
143///
144/// Each member is fed as two length-prefixed fields (`evidence.path`, then
145/// `evidence.body`) via [`update_field`], in SORTED relative-path order — the
146/// sort happens HERE, so callers cannot desync on ordering. This is the SINGLE
147/// evidence fold the emitter, the fixture generator, and the server-side loader
148/// recompute share, byte-for-byte (Pitfall 2).
149pub fn fold_evidence_hash(members: &[(&str, &[u8])]) -> String {
150 let mut sorted: Vec<&(&str, &[u8])> = members.iter().collect();
151 sorted.sort_by_key(|(path, _)| *path);
152 let mut hasher = Sha256::new();
153 for (path, body) in sorted {
154 update_field(&mut hasher, b"evidence.path", path.as_bytes());
155 update_field(&mut hasher, b"evidence.body", body);
156 }
157 hex::encode(hasher.finalize())
158}
159
160/// Build the [`BundleLock`] over the emitted artifact bytes.
161///
162/// Each per-artifact hash is `hex::encode(Sha256::digest(bytes))`; the combined
163/// hash is `Sha256` over the concatenation of the three 64-char hex hashes (a
164/// fixed-width concatenation is unambiguous). `workbook_hash` is the
165/// caller-supplied `source_workbook_hash` content projection — RECORDED, not
166/// recomputed from raw bytes (D-05). A one-byte change to any artifact flips its
167/// per-artifact hash, which flips the combined hash (D-05 tamper detection).
168pub fn build_bundle_lock(
169 bundle_id: &str,
170 version: &str,
171 workbook_hash: String,
172 ir_json: &str,
173 manifest_json: &str,
174 evidence_hash: &str,
175) -> BundleLock {
176 let h_exec = sha256_hex(ir_json.as_bytes());
177 let h_manifest = sha256_hex(manifest_json.as_bytes());
178 // The evidence hash is computed over the evidence DIR (path+length-prefixed,
179 // folding cell_map.json) by the emitter; the lock records it verbatim.
180 let h_evidence = evidence_hash.to_string();
181
182 let combined = sha256_hex(format!("{h_exec}{h_manifest}{h_evidence}").as_bytes());
183
184 BundleLock {
185 bundle_id: bundle_id.to_string(),
186 version: version.to_string(),
187 workbook_hash,
188 artifacts: ArtifactHashes {
189 executable: h_exec,
190 manifest: h_manifest,
191 evidence: h_evidence,
192 },
193 combined,
194 }
195}
196
#[cfg(test)]
mod tests {
    use super::*;

    /// SHA-256 of the empty input (published test vector).
    const EMPTY_SHA256: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
    /// SHA-256 of the ASCII bytes `abc` (published test vector).
    const ABC_SHA256: &str = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad";

    /// A stand-in `source_workbook_hash`: any stable 64-char hex string works here.
    fn workbook_hash() -> String {
        sha256_hex(b"S!A1|10|\nS!B1|0.37|")
    }

    /// Build a [`CellEntry`] from borrowed parts.
    fn entry(json_key: &str, seed: &str, unit: Option<&str>) -> CellEntry {
        CellEntry {
            json_key: json_key.to_string(),
            seed_coord: seed.to_string(),
            unit: unit.map(str::to_string),
        }
    }

    /// Build a lock for the fixed `tax-calc` / `1.0.0` identity. The evidence hash is
    /// derived from `evidence_seed`, so a test can vary exactly one artifact at a time.
    fn tax_calc_lock(ir_json: &str, manifest_json: &str, evidence_seed: &[u8]) -> BundleLock {
        build_bundle_lock(
            "tax-calc",
            "1.0.0",
            workbook_hash(),
            ir_json,
            manifest_json,
            &sha256_hex(evidence_seed),
        )
    }

    #[test]
    fn artifact_model_tool_round_trips_through_serde() {
        let mut oracle = BTreeMap::new();
        oracle.insert("tax_owed".to_string(), CellValue::Number(18241.0));
        let tool = Tool {
            name: "Calculate_Tax".to_string(),
            description: Some("Compute the tax owed".to_string()),
            input_keys: vec!["income".to_string(), "filing".to_string()],
            outputs: vec![entry("tax_owed", "Calc!B3", Some("USD"))],
            oracle,
        };
        let json = serde_json::to_string(&tool).expect("serialize Tool");
        let back: Tool = serde_json::from_str(&json).expect("deserialize Tool");
        assert_eq!(
            tool, back,
            "Tool must serde round-trip preserving all fields"
        );
        assert_eq!(back.name, "Calculate_Tax");
        assert_eq!(back.description.as_deref(), Some("Compute the tax owed"));
        assert_eq!(back.input_keys, vec!["income", "filing"]);
        assert_eq!(back.outputs.len(), 1);
        assert_eq!(
            back.oracle.get("tax_owed"),
            Some(&CellValue::Number(18241.0))
        );
    }

    #[test]
    fn artifact_model_cell_map_with_tools_round_trips() {
        let map = CellMap {
            inputs: vec![entry("income", "In!B4", Some("USD"))],
            tools: vec![Tool {
                name: "Calculate_Tax".to_string(),
                description: None,
                input_keys: vec!["income".to_string()],
                outputs: vec![entry("tax_owed", "Calc!B3", Some("USD"))],
                oracle: BTreeMap::new(),
            }],
        };
        let json = serde_json::to_string(&map).expect("serialize CellMap");
        let back: CellMap = serde_json::from_str(&json).expect("deserialize CellMap");
        assert_eq!(back.inputs.len(), 1);
        // The N=1 (single output Table) case is just one tool — no special path.
        assert_eq!(
            back.tools.len(),
            1,
            "a one-Table manifest yields exactly one Tool"
        );
        assert_eq!(back.tools[0].name, "Calculate_Tax");
    }

    #[test]
    fn artifact_model_per_tool_outputs_are_independent() {
        // The shim is RETIRED (Plan 04): every consumer iterates `tools[].outputs`
        // per-tool. Two tools own DISJOINT output sets — there is no union accessor.
        let map = CellMap {
            inputs: vec![],
            tools: vec![
                Tool {
                    name: "A".to_string(),
                    description: None,
                    input_keys: vec![],
                    outputs: vec![entry("a1", "S!A1", None), entry("a2", "S!A2", None)],
                    oracle: BTreeMap::new(),
                },
                Tool {
                    name: "B".to_string(),
                    description: None,
                    input_keys: vec![],
                    outputs: vec![entry("b1", "S!B1", None)],
                    oracle: BTreeMap::new(),
                },
            ],
        };
        let tool_a_keys: Vec<&str> = map.tools[0]
            .outputs
            .iter()
            .map(|e| e.json_key.as_str())
            .collect();
        let tool_b_keys: Vec<&str> = map.tools[1]
            .outputs
            .iter()
            .map(|e| e.json_key.as_str())
            .collect();
        assert_eq!(tool_a_keys, vec!["a1", "a2"], "tool A owns its outputs");
        assert_eq!(tool_b_keys, vec!["b1"], "tool B owns its outputs");
        // The per-tool union, computed inline by consumers (no accessor).
        let total: usize = map.tools.iter().map(|t| t.outputs.len()).sum();
        assert_eq!(total, 3, "three output cells across two tools");
    }

    #[test]
    fn sha256_hex_matches_known_vectors() {
        // Pins the exact digest and hex encoding, so the emitter and the server cannot
        // drift apart unnoticed.
        assert_eq!(sha256_hex(b""), EMPTY_SHA256);
        assert_eq!(sha256_hex(b"abc"), ABC_SHA256);
    }

    #[test]
    fn bundle_lock_records_three_plus_combined() {
        let lock = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVIDENCE-DIR}");
        for h in [
            &lock.artifacts.executable,
            &lock.artifacts.manifest,
            &lock.artifacts.evidence,
            &lock.combined,
        ] {
            assert_eq!(h.len(), 64, "each hash is a 64-char sha256 hex");
        }
        assert_ne!(lock.combined, lock.artifacts.executable);
        assert_ne!(lock.combined, lock.artifacts.manifest);
        assert_ne!(lock.combined, lock.artifacts.evidence);
    }

    #[test]
    fn bundle_lock_combined_is_hash_of_concatenated_hex() {
        // Pins the hash-of-hashes layout: executable, manifest, evidence, end to end.
        let lock = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");
        let joined = format!(
            "{}{}{}",
            lock.artifacts.executable, lock.artifacts.manifest, lock.artifacts.evidence
        );
        assert_eq!(lock.combined, sha256_hex(joined.as_bytes()));
        assert_eq!(lock.artifacts.executable, sha256_hex(b"{IR}"));
        assert_eq!(lock.artifacts.manifest, sha256_hex(b"{MANIFEST}"));
        assert_eq!(
            lock.artifacts.evidence,
            sha256_hex(b"{EVID}"),
            "the evidence hash is recorded verbatim"
        );
    }

    #[test]
    fn bundle_lock_hashes_stable_across_runs() {
        let a = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");
        let b = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");
        assert_eq!(a, b, "bundle-lock hashing is stable across runs");
    }

    #[test]
    fn combined_hash_changes_when_any_artifact_changes() {
        let base = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID}");

        // One extra byte in the manifest.
        let tampered_manifest = tax_calc_lock("{IR}", "{MANIFEST }", b"{EVID}");
        assert_ne!(base.artifacts.manifest, tampered_manifest.artifacts.manifest);
        assert_ne!(base.combined, tampered_manifest.combined);

        // One extra byte in the executable IR.
        let tampered_exec = tax_calc_lock("{IR }", "{MANIFEST}", b"{EVID}");
        assert_ne!(base.artifacts.executable, tampered_exec.artifacts.executable);
        assert_ne!(base.combined, tampered_exec.combined);

        // A different evidence hash — the third artifact named by "any artifact".
        let tampered_evidence = tax_calc_lock("{IR}", "{MANIFEST}", b"{EVID }");
        assert_ne!(base.artifacts.evidence, tampered_evidence.artifacts.evidence);
        assert_ne!(base.combined, tampered_evidence.combined);
    }

    #[test]
    fn workbook_hash_reuses_content_projection() {
        let wh = workbook_hash();
        let lock = build_bundle_lock(
            "tax-calc",
            "1.0.0",
            wh.clone(),
            "{IR}",
            "{MANIFEST}",
            &sha256_hex(b"{EVID}"),
        );
        assert_eq!(lock.workbook_hash, wh);
        assert_ne!(lock.workbook_hash, lock.artifacts.executable);
        assert_ne!(lock.workbook_hash, lock.combined);
    }

    // The name says "workflow" but the value under test is `bundle_id`; the name is
    // kept so existing test filters keep matching.
    #[test]
    fn workflow_and_version_are_parameters_not_hardcoded() {
        let lock = build_bundle_lock(
            "other-bundle",
            "2.3.4",
            workbook_hash(),
            "{IR}",
            "{MANIFEST}",
            &sha256_hex(b"{EVID}"),
        );
        assert_eq!(lock.bundle_id, "other-bundle");
        assert_eq!(lock.version, "2.3.4");
    }

    #[test]
    fn update_field_is_length_prefixed() {
        // Two fields whose concatenation would collide are distinguished by the
        // out-of-band length prefix.
        let mut a = Sha256::new();
        update_field(&mut a, b"t", b"ab");
        update_field(&mut a, b"t", b"c");
        let mut b = Sha256::new();
        update_field(&mut b, b"t", b"a");
        update_field(&mut b, b"t", b"bc");
        assert_ne!(hex::encode(a.finalize()), hex::encode(b.finalize()));
    }

    #[test]
    fn update_field_writes_tag_then_le_length_then_data() {
        // Pins the exact byte layout: tag, u64 little-endian length, payload.
        let mut via_helper = Sha256::new();
        update_field(&mut via_helper, b"tag", b"xyz");

        let mut manual = Vec::new();
        manual.extend_from_slice(b"tag");
        manual.extend_from_slice(&3u64.to_le_bytes());
        manual.extend_from_slice(b"xyz");

        assert_eq!(hex::encode(via_helper.finalize()), sha256_hex(&manual));
    }

    #[test]
    fn fold_evidence_hash_of_no_members_is_the_empty_digest() {
        // With no members nothing is fed to the hasher.
        assert_eq!(fold_evidence_hash(&[]), EMPTY_SHA256);
    }

    #[test]
    fn fold_evidence_hash_is_independent_of_input_order() {
        let cell_map: (&str, &[u8]) = ("cell_map.json", &b"{\"inputs\":[]}"[..]);
        let report: (&str, &[u8]) = ("report/summary.txt", &b"ok"[..]);
        let forward = fold_evidence_hash(&[cell_map, report]);
        let reversed = fold_evidence_hash(&[report, cell_map]);
        assert_eq!(forward, reversed, "members are sorted inside the fold");
        assert_eq!(forward.len(), 64);
    }

    #[test]
    fn fold_evidence_hash_reflects_path_and_body() {
        let base = fold_evidence_hash(&[("a.txt", &b"body"[..])]);
        let renamed = fold_evidence_hash(&[("b.txt", &b"body"[..])]);
        let edited = fold_evidence_hash(&[("a.txt", &b"body!"[..])]);
        assert_ne!(base, renamed, "the relative path is part of the hash");
        assert_ne!(base, edited, "the member bytes are part of the hash");

        // Shifting a byte across the path/body boundary must not collide.
        let left = fold_evidence_hash(&[("ab", &b"c"[..])]);
        let right = fold_evidence_hash(&[("a", &b"bc"[..])]);
        assert_ne!(left, right);
    }

    #[test]
    fn fold_evidence_hash_matches_manual_update_field_stream() {
        // The fold is exactly `evidence.path` then `evidence.body` per member, in
        // sorted path order.
        let mut manual = Sha256::new();
        update_field(&mut manual, b"evidence.path", b"a");
        update_field(&mut manual, b"evidence.body", b"1");
        update_field(&mut manual, b"evidence.path", b"b");
        update_field(&mut manual, b"evidence.body", b"2");

        let folded = fold_evidence_hash(&[("b", &b"2"[..]), ("a", &b"1"[..])]);
        assert_eq!(folded, hex::encode(manual.finalize()));
    }
}