entrenar/models/llama_370m.rs
1#![allow(clippy::doc_overindented_list_items)]
2//! # Llama 370M Sovereign (albor) — Architectural Scaffold
3//!
4//! Compile-time-frozen configuration for the SHIP-TWO-001 MODEL-2 "albor"
5//! 370M Python code-completion model.
6//!
7//! **Canonical contract:** `contracts/model-families/llama-370m-sovereign-v1.yaml`
8//! **Contract version:** 1.0.0
9//! **Contract ID:** C-LLAMA-370M-SOVEREIGN
10//!
11//! ## Purpose
12//!
13//! This module is a **scaffold only** — it does NOT implement forward/backward.
14//! Its sole job is to lift the architectural constants from the YAML contract
15//! into Rust's type system so that recipe/artifact drift (the MODEL-1 v2 QLoRA
16//! divergence class of bug) is caught at compile time, not at eval time.
17//!
18//! ## Invariants (mirrored from the YAML contract)
19//!
20//! - **INV-ARCH-370M-001** Parameter count ∈ [366M, 374M] (370M ± 1%).
21//! Verified at runtime by `estimated_param_count()`
22//! and by `apr inspect` on trained artifacts.
23//! - **INV-ARCH-370M-002** `num_heads * head_dim == hidden_dim` (16 * 64 == 1024).
24//! Compile-time asserted in [`Llama370MConfig::validate`].
25//! - **INV-ARCH-370M-003** `num_kv_heads` divides `num_heads` evenly (GQA).
26//! Compile-time asserted in [`Llama370MConfig::validate`].
27//! - **INV-ARCH-370M-004** `tied_embeddings == true` — lm_head shares storage
28//! with token_embd. Compile-time enforced via the
29//! `TIED_EMBEDDINGS` const.
30//! - **INV-ARCH-370M-005** `rope_theta == 10000.0` exactly (Llama-1 convention).
31//! Compile-time enforced as a `const f32`.
32//! - **INV-ARCH-370M-006** `vocab_size == 50_257` and matches the paired
33//! tokenizer-bpe-v1 contract. Tokenizer coupling
34//! cannot be checked at compile time — runtime
35//! `debug_assert_eq!` at model load.
36//! - **INV-ARCH-370M-007** SwiGLU activation: distinct `gate_proj` and
37//! `up_proj` tensors. Enforced at checkpoint load
38//! time (runtime) by the APR loader.
39//! - **INV-ARCH-370M-008** `has_bias == false` on every linear projection.
40//! Compile-time enforced via the `HAS_BIAS` const.
41//! - **INV-ARCH-370M-009** Row-major APR layout (LAYOUT-001). Embedding
42//! shape `[vocab_size, hidden_dim]`, NOT reversed.
43//! Enforced by `aprender::format::layout_contract`
44//! at load time (runtime — tensor data is not
45//! available to the type system).
46//!
47//! ## Design Notes
48//!
49//! Rust 1.79+ supports `const` panics, so every machine-checkable invariant
50//! lives inside [`Llama370MConfig::validate`], a `const fn` that compiles
51//! down to nothing if all invariants hold and refuses to compile otherwise
52//! (via `const _: () = Llama370MConfig::validate();`).
53//!
54//! The `HiddenDim<N>`, `NumHeads<N>`, etc. PhantomData newtypes exist so
55//! that downstream code (forward/backward, to be written later) can be
56//! parameterized on the exact dimensions — making it a compile error to,
57//! for instance, pass a `HiddenDim<768>` activation into a 1024-dim
58//! projection.
59//!
60//! This module intentionally does NOT:
61//! - implement forward/backward;
62//! - allocate tensors;
63//! - export anything to `aprender-train`'s public API
64//! (re-exports are a follow-up PR).
65
66#![allow(dead_code)] // scaffold — forward/backward not yet implemented
67
68use std::marker::PhantomData;
69
70// ─────────────────────────────────────────────────────────────
71// Compile-time shape newtypes (Poka-Yoke)
72// ─────────────────────────────────────────────────────────────
73//
74// These zero-sized types let downstream code be generic on exact
75// dimensions. Mixing, e.g., a HiddenDim<1024> with a HiddenDim<768>
76// is a compile error, not a runtime shape mismatch.
77
78/// Hidden dimension (model width) as a compile-time constant.
79#[derive(Debug, Clone, Copy, Default)]
80pub struct HiddenDim<const N: usize>(PhantomData<()>);
81
82impl<const N: usize> HiddenDim<N> {
83 pub const VALUE: usize = N;
84 pub const fn new() -> Self {
85 Self(PhantomData)
86 }
87}
88
89/// Number of attention heads.
90#[derive(Debug, Clone, Copy, Default)]
91pub struct NumHeads<const N: usize>(PhantomData<()>);
92
93impl<const N: usize> NumHeads<N> {
94 pub const VALUE: usize = N;
95 pub const fn new() -> Self {
96 Self(PhantomData)
97 }
98}
99
100/// Number of KV heads (GQA).
101#[derive(Debug, Clone, Copy, Default)]
102pub struct NumKvHeads<const N: usize>(PhantomData<()>);
103
104impl<const N: usize> NumKvHeads<N> {
105 pub const VALUE: usize = N;
106 pub const fn new() -> Self {
107 Self(PhantomData)
108 }
109}
110
111/// Per-head dimension (hidden_dim / num_heads).
112#[derive(Debug, Clone, Copy, Default)]
113pub struct HeadDim<const N: usize>(PhantomData<()>);
114
115impl<const N: usize> HeadDim<N> {
116 pub const VALUE: usize = N;
117 pub const fn new() -> Self {
118 Self(PhantomData)
119 }
120}
121
122/// Intermediate (FFN) dimension.
123#[derive(Debug, Clone, Copy, Default)]
124pub struct IntermediateDim<const N: usize>(PhantomData<()>);
125
126impl<const N: usize> IntermediateDim<N> {
127 pub const VALUE: usize = N;
128 pub const fn new() -> Self {
129 Self(PhantomData)
130 }
131}
132
133/// Number of transformer blocks.
134#[derive(Debug, Clone, Copy, Default)]
135pub struct NumLayers<const N: usize>(PhantomData<()>);
136
137impl<const N: usize> NumLayers<N> {
138 pub const VALUE: usize = N;
139 pub const fn new() -> Self {
140 Self(PhantomData)
141 }
142}
143
144/// Vocabulary size.
145#[derive(Debug, Clone, Copy, Default)]
146pub struct VocabSize<const N: usize>(PhantomData<()>);
147
148impl<const N: usize> VocabSize<N> {
149 pub const VALUE: usize = N;
150 pub const fn new() -> Self {
151 Self(PhantomData)
152 }
153}
154
155// ─────────────────────────────────────────────────────────────
156// Llama370MConfig — frozen architectural constants
157// ─────────────────────────────────────────────────────────────
158//
159// All fields are `pub const` and byte-identical to
160// contracts/model-families/llama-370m-sovereign-v1.yaml §architecture
161// and §constraints.
162
163/// Architectural configuration for the albor 370M sovereign model.
164///
165/// Every constant here is pinned to a specific value in the YAML contract.
166/// Changing any of these values requires bumping the contract to v1.1.0
167/// and re-running the `GATE-ARCH-370M-*` gates.
168pub struct Llama370MConfig;
169
170impl Llama370MConfig {
171 // ── Architecture ──
172 /// Total parameter count (nominal). See `estimated_param_count()` for
173 /// the runtime-checkable figure under INV-ARCH-370M-001.
174 pub const PARAMETERS_NOMINAL: usize = 370_000_000;
175
176 /// Lower bound on param count (INV-ARCH-370M-001).
177 pub const PARAMETERS_MIN: usize = 366_000_000;
178
179 /// Upper bound on param count (INV-ARCH-370M-001).
180 pub const PARAMETERS_MAX: usize = 374_000_000;
181
182 pub const HIDDEN_DIM: usize = 1024;
183 pub const NUM_LAYERS: usize = 24;
184 pub const NUM_HEADS: usize = 16;
185 pub const NUM_KV_HEADS: usize = 4; // GQA: heads / 4
186 pub const HEAD_DIM: usize = 64; // hidden_dim / num_heads
187 pub const INTERMEDIATE_DIM: usize = 2816; // ~2.75 * hidden
188 pub const VOCAB_SIZE: usize = 50_257;
189 pub const MAX_POSITION_EMBEDDINGS: usize = 4096;
190
191 /// RoPE base frequency — Llama-1 convention (INV-ARCH-370M-005).
192 pub const ROPE_THETA: f32 = 10_000.0;
193
194 /// RMSNorm epsilon.
195 pub const RMS_NORM_EPS: f32 = 1.0e-5;
196
197 // ── Constraints ──
198 pub const TIED_EMBEDDINGS: bool = true; // INV-ARCH-370M-004
199 pub const HAS_BIAS: bool = false; // INV-ARCH-370M-008
200
201 /// Compile-time verification of every machine-checkable invariant.
202 ///
203 /// Each `assert!` here becomes a hard compile error (via Rust 1.79+
204 /// `const` panics) if the invariant is violated. Any change to the
205 /// constants above that breaks one of these invariants will fail to
206 /// compile — by design.
207 ///
208 /// Invariants encoded here (in order):
209 /// INV-ARCH-370M-002 num_heads * head_dim == hidden_dim
210 /// INV-ARCH-370M-003 num_kv_heads divides num_heads
211 /// INV-ARCH-370M-004 tied_embeddings == true
212 /// INV-ARCH-370M-005 rope_theta == 10000.0
213 /// INV-ARCH-370M-006 vocab_size == 50_257
214 /// INV-ARCH-370M-008 has_bias == false
215 ///
216 /// Invariants NOT encodable at compile time (documented as runtime
217 /// `debug_assert!` at load sites):
218 /// INV-ARCH-370M-001 param count ∈ [366M, 374M] — depends on the
219 /// actual allocated tensors; checked by
220 /// `estimated_param_count()` and by `apr inspect`.
221 /// INV-ARCH-370M-007 SwiGLU gate_proj/up_proj both present and
222 /// distinct — depends on the on-disk checkpoint
223 /// tensor table; checked by the APR loader.
224 /// INV-ARCH-370M-009 row-major [vocab_size, hidden_dim] layout —
225 /// depends on tensor shape metadata in the
226 /// loaded artifact; checked by
227 /// `aprender::format::layout_contract`.
228 pub const fn validate() {
229 // INV-ARCH-370M-002
230 assert!(
231 Self::NUM_HEADS * Self::HEAD_DIM == Self::HIDDEN_DIM,
232 "INV-ARCH-370M-002 violated: num_heads * head_dim != hidden_dim",
233 );
234
235 // INV-ARCH-370M-003
236 assert!(
237 Self::NUM_KV_HEADS > 0 && Self::NUM_HEADS % Self::NUM_KV_HEADS == 0,
238 "INV-ARCH-370M-003 violated: num_kv_heads does not divide num_heads",
239 );
240
241 // INV-ARCH-370M-004
242 assert!(
243 Self::TIED_EMBEDDINGS,
244 "INV-ARCH-370M-004 violated: tied_embeddings must be true for 370M",
245 );
246
247 // INV-ARCH-370M-005 — f32 equality is legal in const context
248 // and is exactly what the contract requires (byte-equal literal).
249 assert!(
250 Self::ROPE_THETA == 10_000.0_f32,
251 "INV-ARCH-370M-005 violated: rope_theta must be exactly 10000.0",
252 );
253
254 // INV-ARCH-370M-006
255 assert!(
256 Self::VOCAB_SIZE == 50_257,
257 "INV-ARCH-370M-006 violated: vocab_size must equal 50_257",
258 );
259
260 // INV-ARCH-370M-008
261 assert!(
262 !Self::HAS_BIAS,
263 "INV-ARCH-370M-008 violated: has_bias must be false (Llama convention)",
264 );
265
266 // Sanity: head_dim consistency (free-tier check, also implied
267 // by INV-ARCH-370M-002 above).
268 assert!(
269 Self::HIDDEN_DIM / Self::NUM_HEADS == Self::HEAD_DIM,
270 "hidden_dim / num_heads != head_dim — config internally inconsistent",
271 );
272
273 // Sanity: max_position_embeddings is a positive multiple of 2.
274 assert!(
275 Self::MAX_POSITION_EMBEDDINGS > 0 && Self::MAX_POSITION_EMBEDDINGS % 2 == 0,
276 "max_position_embeddings must be a positive even integer for RoPE",
277 );
278 }
279}
280
281// Drive `validate()` at crate-compile time. If any `assert!` inside
282// `validate()` fails, the crate fails to build.
283#[allow(clippy::let_unit_value)]
284const _: () = Llama370MConfig::validate();
285
286// ─────────────────────────────────────────────────────────────
287// Parameter count estimator (INV-ARCH-370M-001 runtime check)
288// ─────────────────────────────────────────────────────────────
289
290/// Estimate the total parameter count for the albor 370M config using
291/// the **nominal (untied)** counting convention.
292///
293/// The contract's INV-ARCH-370M-001 band [366M, 374M] corresponds to the
294/// HuggingFace-style reported figure, which counts `lm_head.weight` as
295/// a distinct matrix even though — per INV-ARCH-370M-004 — storage is
296/// shared with `model.embed_tokens.weight`. This mirrors how Llama
297/// families are reported in the literature (e.g., "TinyLlama-1.1B" is
298/// counted with untied lm_head even when tied).
299///
300/// For the actual on-disk param count reported by `apr inspect`
301/// (with tying applied), use [`estimated_stored_param_count`].
302///
303/// Formula (untied — contract reporting convention):
304///
305/// ```text
306/// embedding: vocab * hidden
307/// lm_head: vocab * hidden (tied storage, but counted here)
308/// per transformer layer:
309/// attention q_proj: (num_heads * head_dim) * hidden
310/// attention k_proj: (num_kv_heads * head_dim) * hidden
311/// attention v_proj: (num_kv_heads * head_dim) * hidden
312/// attention o_proj: hidden * (num_heads * head_dim)
313/// mlp gate_proj: intermediate * hidden
314/// mlp up_proj: intermediate * hidden
315/// mlp down_proj: hidden * intermediate
316/// input_layernorm: hidden
317/// post_attn_layernorm: hidden
318/// final rmsnorm: hidden
319/// ```
320#[must_use]
321pub const fn estimated_param_count() -> usize {
322 // Untied: add the lm_head bookkeeping on top of the stored count.
323 estimated_stored_param_count() + (Llama370MConfig::VOCAB_SIZE * Llama370MConfig::HIDDEN_DIM)
324}
325
326/// Estimate the **stored** parameter count (what `apr inspect` sees on
327/// disk for a tied-embedding checkpoint). This is ~51.2M lower than the
328/// nominal figure because `lm_head.weight` is aliased to
329/// `model.embed_tokens.weight` (INV-ARCH-370M-004).
330#[must_use]
331pub const fn estimated_stored_param_count() -> usize {
332 let h = Llama370MConfig::HIDDEN_DIM;
333 let l = Llama370MConfig::NUM_LAYERS;
334 let v = Llama370MConfig::VOCAB_SIZE;
335 let i = Llama370MConfig::INTERMEDIATE_DIM;
336 let nh = Llama370MConfig::NUM_HEADS;
337 let nkv = Llama370MConfig::NUM_KV_HEADS;
338 let hd = Llama370MConfig::HEAD_DIM;
339
340 // Embedding (tied with lm_head — counted once).
341 let embedding = v * h;
342
343 // Attention: q_proj + k_proj + v_proj + o_proj
344 let q = h * (nh * hd);
345 let k = h * (nkv * hd);
346 let vv = h * (nkv * hd);
347 let o = (nh * hd) * h;
348 let attn = q + k + vv + o;
349
350 // MLP (SwiGLU): gate_proj + up_proj + down_proj
351 let mlp = (h * i) + (h * i) + (i * h);
352
353 // Two RMSNorm weights per layer (input_layernorm, post_attention_layernorm).
354 let norms = 2 * h;
355
356 let per_layer = attn + mlp + norms;
357
358 // Final model.norm.weight.
359 let final_norm = h;
360
361 embedding + l * per_layer + final_norm
362}
363
364/// Pure helper that enforces GATE-ARCH-370M-011 / INV-ARCH-370M-006:
365/// the tokenizer's vocabulary size MUST exactly match the model's
366/// `vocab_size` before pretraining dispatches. Returns `Ok(())` when
367/// they match, `Err(String)` with a machine-diffable message when they
368/// do not. The caller is expected to surface the error to the user
369/// and abort the dispatch before any forward pass.
370pub fn assert_tokenizer_vocab_matches_model(
371 tokenizer_vocab_size: usize,
372 model_vocab_size: usize,
373) -> Result<(), String> {
374 if tokenizer_vocab_size == model_vocab_size {
375 return Ok(());
376 }
377 Err(format!(
378 "GATE-ARCH-370M-011 (INV-ARCH-370M-006) violated: \
379 tokenizer vocab_size ({tokenizer_vocab_size}) != model vocab_size \
380 ({model_vocab_size}). See contracts/model-families/llama-370m-sovereign-v1.yaml \
381 and contracts/tokenizer-bpe-v1.yaml — retrain the tokenizer or amend both contracts \
382 in lockstep before resuming pretraining."
383 ))
384}
385
386/// Polymorphic-path variant of [`assert_tokenizer_vocab_matches_model`] that
387/// allows `tokenizer_vocab_size <= model_vocab_size` (per
388/// `apr-pretrain-arch-polymorphic-v1` v1.3.0 §qwen_tokenizer_vocab_compatibility,
389/// SPEC-SHIP-TWO-001 §55). When fine-tuning from an HF-distributed
390/// pretrained checkpoint (Qwen2.5 / Llama2 / Mistral), `tokenizer.json`
391/// commonly materializes fewer string-token entries than the model's
392/// declared `vocab_size` declares. The gap is reserved/special slots
393/// (e.g., `<|reserved_271|>`...) that the lm_head + embedding layers
394/// have weights for but no tokenizer string maps to. Strict equality
395/// (the §24/§25 from-scratch baseline invariant) is too strict for
396/// these models; the relaxed bound preserves the OOB-safety property:
397///
398/// **Safety argument**: tokenizer.encode(text) → ids ∈ [0, tokenizer_vocab).
399/// Embedding/lm_head layers are sized for [0, model_vocab). When
400/// tokenizer_vocab ≤ model_vocab, every encoded id is in-bounds; the
401/// reserved high-id slots are never indexed at training time. When
402/// tokenizer_vocab > model_vocab, the encoder could emit ids ≥ model_vocab
403/// → N-09 OOB → silent garbage gradients → fail-fast.
404///
405/// Returns `Ok(())` when bound holds, `Err(String)` with a machine-diffable
406/// message when violated.
407pub fn assert_tokenizer_vocab_within_model_bound(
408 tokenizer_vocab_size: usize,
409 model_vocab_size: usize,
410) -> Result<(), String> {
411 if tokenizer_vocab_size <= model_vocab_size {
412 return Ok(());
413 }
414 Err(format!(
415 "GATE-ARCH-370M-011 (INV-ARCH-370M-006-RELAXED) violated: \
416 tokenizer vocab_size ({tokenizer_vocab_size}) > model vocab_size \
417 ({model_vocab_size}). See contracts/apr-pretrain-arch-polymorphic-v1.yaml \
418 §qwen_tokenizer_vocab_compatibility — for HF-distributed pretrained \
419 checkpoints, tokenizer_vocab MUST be <= model_vocab (reserved-slot \
420 tolerance); a tokenizer with MORE strings than the model expects \
421 would emit OOB ids → N-09 escape → silent garbage gradients."
422 ))
423}
424
425// ─────────────────────────────────────────────────────────────
426// FALSIFY-SHIP-017 / AC-SHIP2-007 / GATE-ARCH-370M-005
427// ─────────────────────────────────────────────────────────────
428
429/// Number of held-out prompts required by AC-SHIP2-007 / FALSIFY-SHIP-017
430/// (spec §6 Model 2: "`apr run` produces syntactically valid Python on
431/// 100 held-out prompts").
432pub const AC_SHIP2_007_HELDOUT_PROMPT_COUNT: usize = 100;
433
434/// Tolerance: `≤ 1` completion out of the 100 held-out prompts may fail
435/// to yield a Python-AST-parseable non-trivial statement prefix.
436/// Anything `≥ 2` is a ship-blocking FAIL per FALSIFY-SHIP-017.
437pub const AC_SHIP2_007_MAX_TOLERATED_SYNTAX_ERRORS: usize = 1;
438
439/// Ship-017 verdict — the pure algorithmic result of evaluating whether
440/// a 100-prompt Python-AST-parse sweep meets AC-SHIP2-007.
441#[derive(Debug, Clone, Copy, PartialEq, Eq)]
442pub enum Ship017Verdict {
443 /// `syntax_errors ≤ AC_SHIP2_007_MAX_TOLERATED_SYNTAX_ERRORS`.
444 Pass,
445 /// `syntax_errors ≥ 2` on the 100-prompt sweep.
446 Fail,
447}
448
449/// Pure threshold function for FALSIFY-SHIP-017 (AC-SHIP2-007).
450///
451/// Given `syntax_errors` — the count of held-out completions (out of
452/// `AC_SHIP2_007_HELDOUT_PROMPT_COUNT`) that failed to yield a
453/// Python-AST-parseable non-trivial statement prefix — returns the
454/// FALSIFY-SHIP-017 verdict under the rule:
455///
456/// errors ≤ 1 → Pass (tolerate one flaky completion)
457/// errors ≥ 2 → Fail (ship-blocker)
458///
459/// This is the algorithm-level discharge of GATE-ARCH-370M-005: the
460/// threshold itself is proven correct at `cargo test` time; full
461/// discharge (a real 100-prompt `apr run` harness against a trained
462/// 370M .apr) remains PENDING on pretraining compute-dispatch
463/// (AC-SHIP2-003/004). Fixture swap is data-only once a trained
464/// artifact exists — no harness rewrite required.
465pub const fn verdict_from_syntax_error_count(syntax_errors: usize) -> Ship017Verdict {
466 if syntax_errors <= AC_SHIP2_007_MAX_TOLERATED_SYNTAX_ERRORS {
467 Ship017Verdict::Pass
468 } else {
469 Ship017Verdict::Fail
470 }
471}
472
473// ─────────────────────────────────────────────────────────────
474// FALSIFY-SHIP-020 (AC-SHIP2-010): decode-throughput threshold
475// ─────────────────────────────────────────────────────────────
476
477/// Minimum `apr run` median decode throughput, in tokens/second, on the
478/// SHIP-TWO-001 reference hardware (RTX 4090). Source of truth:
479/// `contracts/model-families/llama-370m-sovereign-v1.yaml`
480/// GATE-ARCH-370M-006 and the SHIP-TWO-001 falsification table
481/// (FALSIFY-SHIP-020).
482pub const AC_SHIP2_010_MIN_DECODE_TPS_RTX4090: f32 = 100.0;
483
484/// Verdict for GATE-ARCH-370M-006 / FALSIFY-SHIP-020: does the measured
485/// `apr bench` median decode throughput meet the ship-gate threshold?
486#[derive(Debug, Clone, Copy, PartialEq, Eq)]
487pub enum Ship020Verdict {
488 /// Measured tok/s ≥ [`AC_SHIP2_010_MIN_DECODE_TPS_RTX4090`].
489 Pass,
490 /// Measured tok/s < [`AC_SHIP2_010_MIN_DECODE_TPS_RTX4090`] (ship-blocker).
491 Fail,
492}
493
494/// Pure threshold function implementing FALSIFY-SHIP-020: a measured
495/// median decode throughput of `measured_tps` tokens/second passes the
496/// ship-gate iff it is finite AND ≥ [`AC_SHIP2_010_MIN_DECODE_TPS_RTX4090`].
497///
498/// Non-finite inputs (NaN, ±∞) are conservatively classified as
499/// `Fail`: a benchmark run that could not produce a well-defined
500/// finite median is always ill-formed and must never be allowed to
501/// silently green the ship-gate.
502///
503/// This is the same decision-rule-vs-harness separation used by
504/// FALSIFY-SHIP-017 (syntax-error count). The compute-heavy part
505/// (`apr bench --median` on a real trained 370M .apr) is deferred to
506/// AC-SHIP2-003/004 compute-dispatch; the decision rule itself is
507/// proven today at unit-test time.
508#[must_use]
509pub fn verdict_from_decode_tps(measured_tps: f32) -> Ship020Verdict {
510 if !(measured_tps.is_finite()) {
511 return Ship020Verdict::Fail;
512 }
513 if measured_tps >= AC_SHIP2_010_MIN_DECODE_TPS_RTX4090 {
514 Ship020Verdict::Pass
515 } else {
516 Ship020Verdict::Fail
517 }
518}
519
520// ─────────────────────────────────────────────────────────────
521// AC-SHIP2-008 / FALSIFY-SHIP-018 — HumanEval pass@1 threshold
522// ─────────────────────────────────────────────────────────────
523//
524// Spec §5.2 AC-SHIP2-008 states: `apr eval --benchmark humaneval`
525// must report pass@1 ≥ 30.0% for the trained 370M `.apr` artifact.
526// The decision rule is a pure (correct, total) → pct comparison;
527// the compute-heavy harness (164 HumanEval tasks × `apr run` ×
528// greedy sampling) is fixture-swappable once a trained artifact
529// exists. Landing the threshold function today discharges
530// GATE-ARCH-370M-007 at PARTIAL_ALGORITHM_LEVEL and catches any
531// future drift in the 30.0 floor at `cargo test` time — before a
532// single HumanEval task is dispatched.
533
534/// Contract-frozen floor for AC-SHIP2-008: HumanEval pass@1 must
535/// reach **30.0%** on the trained 370M artifact. The numeric floor
536/// mirrors `contracts/model-families/llama-370m-sovereign-v1.yaml`
537/// GATE-ARCH-370M-007 rule body and spec §5.2 AC-SHIP2-008.
538pub const AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT: f32 = 30.0;
539
540/// Binary verdict emitted by [`verdict_from_pass_at_1`].
541#[derive(Debug, Clone, Copy, PartialEq, Eq)]
542pub enum Ship018Verdict {
543 /// `correct / total * 100 ≥ threshold_pct` and inputs are well-formed.
544 Pass,
545 /// Any of: ratio < threshold, `total == 0`, non-finite threshold.
546 Fail,
547}
548
549/// Decision function for AC-SHIP2-008 / FALSIFY-SHIP-018.
550///
551/// Returns [`Ship018Verdict::Pass`] iff `correct / total * 100`
552/// is ≥ `threshold_pct`. Conservative-Fail guards:
553///
554/// - `total == 0` → Fail (avoid division-by-zero; an empty run
555/// cannot satisfy a positive threshold).
556/// - `correct > total` → Fail (nonsensical input; a real
557/// harness can never report more passes than attempts).
558/// - `!threshold_pct.is_finite()` → Fail (NaN / ±∞ contract
559/// drift — no real contract floor can be non-finite).
560///
561/// Stored per-ULP ratio comparisons use f32 arithmetic to keep the
562/// verdict a pure function of its inputs with no harness state.
563#[must_use]
564pub fn verdict_from_pass_at_1(correct: usize, total: usize, threshold_pct: f32) -> Ship018Verdict {
565 if total == 0 {
566 return Ship018Verdict::Fail;
567 }
568 if correct > total {
569 return Ship018Verdict::Fail;
570 }
571 if !threshold_pct.is_finite() {
572 return Ship018Verdict::Fail;
573 }
574 // Safe: correct ≤ total ≤ usize::MAX; f32 cast preserves ordering
575 // within the humaneval range (total ≤ 164 for canonical HumanEval).
576 let ratio_pct = (correct as f32 / total as f32) * 100.0_f32;
577 if ratio_pct >= threshold_pct {
578 Ship018Verdict::Pass
579 } else {
580 Ship018Verdict::Fail
581 }
582}
583
584// ─────────────────────────────────────────────────────────────
585// FALSIFY-SHIP-016 / AC-SHIP2-006 / GATE-ARCH-370M-008 — apr qa aggregate
586// ─────────────────────────────────────────────────────────────
587
588/// Number of canonical `apr qa` gates composing AC-SHIP2-006 /
589/// FALSIFY-SHIP-016. The spec row AC-SHIP2-006 reads
590/// "`apr qa <model>.apr` — all 8 gates PASS" and the matching
591/// FALSIFY row names the decision rule as "any gate FAIL" →
592/// Fail. The 8 canonical gates (matching `QaConfig::skip_*` in
593/// `crates/apr-cli/src/commands/qa.rs`) are:
594/// 1. golden_output — correctness gate (GH-202 regression)
595/// 2. throughput — tok/s ≥ configured floor
596/// 3. ollama_parity — speedup vs Ollama baseline
597/// 4. gpu_vs_cpu_speedup — GPU ≥ 2× CPU (F-PERF-042)
598/// 5. tensor_contract — layout/dtype/shape validation
599/// 6. cross_format_parity — argmax(GGUF) == argmax(SafeTensors)
600/// 7. ptx_parity — batched GPU kernels vs scalar refs
601/// 8. probar — property-based tests
602///
603/// Contract-drift guard: any change to this number must be matched
604/// in lockstep across the contract, spec, and CLI qa handler.
605pub const AC_SHIP2_006_REQUIRED_QA_GATE_COUNT: usize = 8;
606
607/// Ternary verdict for FALSIFY-SHIP-016 / GATE-ARCH-370M-008.
608/// `Pass` iff every one of the 8 canonical `apr qa` gates passed.
609/// `Fail` otherwise (any single gate failure, or wrong gate count).
610#[derive(Debug, Clone, Copy, PartialEq, Eq)]
611pub enum Ship016Verdict {
612 Pass,
613 Fail,
614}
615
616/// Algorithm-level verdict rule for FALSIFY-SHIP-016 / GATE-ARCH-370M-008
617/// / AC-SHIP2-006: `apr qa <model>.apr` is an aggregate-AND over the
618/// 8 canonical QA gates. Verdict is `Pass` iff **exactly 8** gate
619/// results were supplied **and every one passed**. Any shorter/longer
620/// slice is a contract-drift Fail; any single `false` entry is a
621/// ship-blocking Fail. This proves the decision rule without running
622/// the compute-heavy gates themselves; the harness (realizar, cuda,
623/// tokenizer, corpus) is fixture-swappable once a real trained 370M
624/// .apr exists. Spec §7 row FALSIFY-SHIP-016 ("any gate FAIL") is
625/// the counter-example this fn is built to classify.
626#[must_use]
627pub fn verdict_from_qa_gates(gate_results: &[bool]) -> Ship016Verdict {
628 if gate_results.len() != AC_SHIP2_006_REQUIRED_QA_GATE_COUNT {
629 return Ship016Verdict::Fail;
630 }
631 if gate_results.iter().all(|&passed| passed) {
632 Ship016Verdict::Pass
633 } else {
634 Ship016Verdict::Fail
635 }
636}
637
638// ─────────────────────────────────────────────────────────────
639// FALSIFY-SHIP-013 / AC-SHIP2-003 / GATE-ARCH-370M-013 — val CE loss floor
640// ─────────────────────────────────────────────────────────────
641//
642// AC-SHIP2-003 (spec §5.2) states: "`entrenar` pretraining loop reaches
643// target loss (CE ≤ 2.2 on val)". The decision rule is a pure single-
644// number f32 threshold check on a measured cross-entropy value. Cross-
645// entropy is non-negative by definition (H(p,q) ≥ 0 for all probability
646// distributions p,q), so the admissible input domain is `[0.0, +∞)`; any
647// negative measurement indicates a broken loss harness (sign flip,
648// log-domain underflow, subtract instead of add) and must Fail closed.
649//
650// This PARTIAL_ALGORITHM_LEVEL discharge binds the decision rule only.
651// The compute-heavy half — an actual `apr pretrain --validate` loop
652// producing a live val CE from MODEL-2 training — remains blocked on
653// AC-SHIP2-003 compute-dispatch. The decision rule itself (what number
654// constitutes a Pass) is proven today at `cargo test` time via the
655// mutation survey below.
656
657/// Maximum acceptable cross-entropy loss on the validation set for
658/// MODEL-2 (albor 370M Sovereign) at the end of pretraining. Spec §5.2
659/// AC-SHIP2-003: "CE ≤ 2.2 on val". A measured val CE strictly above
660/// 2.2 is a ship-blocker (training did not converge well enough for
661/// the 370M target to hit its HumanEval / syntax-parse downstream
662/// gates). Pinned here so any contract drift in either direction
663/// (loosening to 2.5, hardening to 2.0) is caught at compile+test
664/// time, not at a production training run.
665///
666/// f32-exact: `2.2` is representable in IEEE-754 binary32 as the
667/// closest-round value `0x400CCCCD` = 2.20000004768371582...; the
668/// ULP-above neighbour is strictly greater and is used as the
669/// sharpest Fail counter-example in the mutation survey.
670pub const AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS: f32 = 2.2;
671
672/// Binary verdict for FALSIFY-SHIP-013 / AC-SHIP2-003 /
673/// GATE-ARCH-370M-013. `Pass` iff the measured val CE is finite AND
674/// non-negative AND at or below [`AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS`].
675/// `Fail` otherwise (including every non-finite value: NaN, +∞, -∞,
676/// and every negative value, which is a harness-bug domain violation
677/// since H(p,q) ≥ 0 by definition).
678#[derive(Debug, Clone, Copy, PartialEq, Eq)]
679pub enum Ship013Verdict {
680 /// Measured val CE ∈ `[0.0, AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS]`.
681 Pass,
682 /// Measured val CE > ceiling, or non-finite, or negative (domain
683 /// violation — a real cross-entropy can never be < 0).
684 Fail,
685}
686
687/// Algorithm-level verdict rule for FALSIFY-SHIP-013 / AC-SHIP2-003 /
688/// GATE-ARCH-370M-013: a single f32 threshold check against the MODEL-2
689/// val-CE ceiling. Returns [`Ship013Verdict::Fail`] conservatively for
690/// any non-finite input (NaN, +∞, -∞) AND for any negative input (CE
691/// is ≥ 0 by definition — a negative value means the loss harness is
692/// broken, which must never be silently promoted to a Pass).
693///
694/// The full discharge (live `apr pretrain --validate` loop producing
695/// a real MODEL-2 val CE on RTX 4090 ≤ 2.2) remains blocked on
696/// AC-SHIP2-003 compute-dispatch.
697#[must_use]
698pub const fn verdict_from_val_ce_loss(measured_ce: f32) -> Ship013Verdict {
699 if !measured_ce.is_finite() {
700 return Ship013Verdict::Fail;
701 }
702 if measured_ce < 0.0 {
703 // Cross-entropy is non-negative by definition; a negative
704 // measurement is a harness bug, not a "better than zero" Pass.
705 return Ship013Verdict::Fail;
706 }
707 if measured_ce <= AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS {
708 Ship013Verdict::Pass
709 } else {
710 Ship013Verdict::Fail
711 }
712}
713
714// ─────────────────────────────────────────────────────────────
715// FALSIFY-SHIP-014 / AC-SHIP2-004 / GATE-ARCH-370M-014 — training-budget floor
716// ─────────────────────────────────────────────────────────────
717//
718// AC-SHIP2-004 (spec §5.2) states: "Training on RTX 4090 completes
719// within 21 days (hardware budget)". The decision rule is a pure
720// single-number u32 threshold check on measured wall-clock days. u32
721// naturally rules out negative values (no need for an explicit domain
722// guard); the only extrema to cover are the boundary (21), the
723// immediate neighbours (20 / 22), and the saturation point u32::MAX.
724// Zero days is a trivial Pass (under budget).
725//
726// This PARTIAL_ALGORITHM_LEVEL discharge binds the decision rule only.
727// The compute-heavy half — an actual wall-clock measurement of a real
728// 370M pretraining run on RTX 4090 → ≤ 21 days — remains blocked on
729// AC-SHIP2-004 compute-dispatch. The decision rule itself (what
730// duration constitutes a Pass) is proven today at `cargo test` time
731// via the mutation survey below.
732
733/// Maximum acceptable wall-clock training duration, in integer days,
734/// for MODEL-2 (albor 370M Sovereign) on the SHIP-TWO-001 reference
735/// RTX 4090 host. Spec §5.2 AC-SHIP2-004: "Training on RTX 4090
736/// completes within 21 days (hardware budget)". A measured duration
737/// strictly above 21 days is a ship-blocker (the hardware budget
738/// overflowed and a 2× H100 week-3 escape hatch from Spec §9 Risk #4
739/// is required). Pinned here so any contract drift in either
740/// direction (extending to 30, compressing to 14) is caught at
741/// compile+test time, not mid-pretraining.
742pub const AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS: u32 = 21;
743
744/// Binary verdict for FALSIFY-SHIP-014 / AC-SHIP2-004 /
745/// GATE-ARCH-370M-014. `Pass` iff the measured wall-clock training
746/// duration in days is at or below
747/// [`AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS`]. `Fail` otherwise.
748#[derive(Debug, Clone, Copy, PartialEq, Eq)]
749pub enum Ship014Verdict {
750 /// Measured training duration ≤ `AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS`.
751 Pass,
752 /// Measured training duration > ceiling (ship-blocker per Spec §9
753 /// Risk #4).
754 Fail,
755}
756
757/// Algorithm-level verdict rule for FALSIFY-SHIP-014 / AC-SHIP2-004 /
758/// GATE-ARCH-370M-014: a single u32 threshold check against the
759/// MODEL-2 hardware-budget ceiling. Unlike the f32 ship gates
760/// (SHIP-007, SHIP-013, SHIP-020), u32 automatically rules out
761/// negatives (no domain guard needed) and has no non-finite states;
762/// the only interesting counter-examples are the boundary (21), the
763/// immediate neighbours (20 / 22), and u32::MAX.
764///
765/// Zero days is Pass (trivially under-budget, e.g. a cached artifact
766/// or a same-day re-run). The full discharge (live wall-clock
767/// measurement of MODEL-2 pretraining on RTX 4090 ≤ 21 days) remains
768/// blocked on AC-SHIP2-004 compute-dispatch.
769#[must_use]
770pub const fn verdict_from_training_duration_days(measured_days: u32) -> Ship014Verdict {
771 if measured_days <= AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS {
772 Ship014Verdict::Pass
773 } else {
774 Ship014Verdict::Fail
775 }
776}
777
778// ─────────────────────────────────────────────────────────────
779// Unit tests
780// ─────────────────────────────────────────────────────────────
781
782#[cfg(test)]
783mod tests {
784 use super::*;
785
786 /// INV-ARCH-370M-002/003/004/005/006/008 — byte-equality with contract.
787 #[test]
788 fn config_matches_contract_values() {
789 // §architecture
790 assert_eq!(Llama370MConfig::HIDDEN_DIM, 1024);
791 assert_eq!(Llama370MConfig::NUM_LAYERS, 24);
792 assert_eq!(Llama370MConfig::NUM_HEADS, 16);
793 assert_eq!(Llama370MConfig::NUM_KV_HEADS, 4);
794 assert_eq!(Llama370MConfig::HEAD_DIM, 64);
795 assert_eq!(Llama370MConfig::INTERMEDIATE_DIM, 2816);
796 assert_eq!(Llama370MConfig::VOCAB_SIZE, 50_257);
797 assert_eq!(Llama370MConfig::MAX_POSITION_EMBEDDINGS, 4096);
798 assert!((Llama370MConfig::ROPE_THETA - 10_000.0_f32).abs() < 1e-6);
799 assert!((Llama370MConfig::RMS_NORM_EPS - 1.0e-5_f32).abs() < 1e-9);
800
801 // §constraints
802 assert!(Llama370MConfig::TIED_EMBEDDINGS);
803 assert!(!Llama370MConfig::HAS_BIAS);
804
805 // Derived: INV-ARCH-370M-002 & 003
806 assert_eq!(
807 Llama370MConfig::NUM_HEADS * Llama370MConfig::HEAD_DIM,
808 Llama370MConfig::HIDDEN_DIM,
809 );
810 assert_eq!(Llama370MConfig::NUM_HEADS % Llama370MConfig::NUM_KV_HEADS, 0);
811 }
812
813 /// GATE-ARCH-370M-011 / INV-ARCH-370M-006 — pure vocab-parity helper
814 /// MUST reject any mismatch between tokenizer vocab_size and model
815 /// vocab_size, and MUST accept equal values. The real-compute MODEL-2
816 /// dispatch at commit 29607ed33 surfaced this when a tokenizer at
817 /// vocab=50_257 was paired with a model pinned at VOCAB_SIZE=50_000;
818 /// the N-09 OOB escape masked the mismatch → garbage gradients.
819 /// Task #131 bumped VOCAB_SIZE to 50_257 (Option A); the counter-example
820 /// value below now exercises the opposite drift (a tokenizer one token
821 /// short of contract) so the helper is still exercised on real mismatch.
822 #[test]
823 fn falsify_gate_arch_370m_011_helper_rejects_mismatch() {
824 assert!(assert_tokenizer_vocab_matches_model(
825 Llama370MConfig::VOCAB_SIZE,
826 Llama370MConfig::VOCAB_SIZE,
827 )
828 .is_ok());
829
830 let mismatch = Llama370MConfig::VOCAB_SIZE - 1;
831 let err = assert_tokenizer_vocab_matches_model(mismatch, Llama370MConfig::VOCAB_SIZE)
832 .expect_err("mismatch must return Err");
833 assert!(
834 err.contains("GATE-ARCH-370M-011")
835 && err.contains(&mismatch.to_string())
836 && err.contains(&Llama370MConfig::VOCAB_SIZE.to_string()),
837 "error must name the gate and both vocab sizes for forensics, got: {err}",
838 );
839
840 assert!(assert_tokenizer_vocab_matches_model(0, 1).is_err());
841 assert!(assert_tokenizer_vocab_matches_model(
842 Llama370MConfig::VOCAB_SIZE + 1,
843 Llama370MConfig::VOCAB_SIZE
844 )
845 .is_err());
846 }
847
848 /// FALSIFY-APR-PRETRAIN-ARCH-009 (§55) — relaxed-bound helper accepts
849 /// `tokenizer_vocab <= model_vocab` for the polymorphic init path.
850 /// Discharges the §55 finding: HF-distributed Qwen2.5-Coder-0.5B
851 /// materializes 151643 BPE entries + 22 added = 151665 effective
852 /// strings, but config.json declares `vocab_size = 151936` (271
853 /// reserved slots). The relaxed bound preserves OOB safety while
854 /// admitting the standard HF reserved-slot pattern.
855 #[test]
856 fn falsify_apr_pretrain_arch_009_relaxed_bound_accepts_qwen_reserved_slots() {
857 // The exact §55 LIVE smoke shape: 151665 ≤ 151936 must pass.
858 const QWEN_BPE_PLUS_ADDED: usize = 151_665;
859 const QWEN_DECLARED_VOCAB: usize = 151_936;
860 assert!(
861 assert_tokenizer_vocab_within_model_bound(
862 QWEN_BPE_PLUS_ADDED,
863 QWEN_DECLARED_VOCAB
864 )
865 .is_ok(),
866 "FALSIFY-APR-PRETRAIN-ARCH-009: tokenizer 151665 ≤ model 151936 MUST pass relaxed bound \
867 (HF reserved-slot tolerance)"
868 );
869
870 // BPE-only count (without --include-added-tokens): 151643 ≤ 151936 must also pass.
871 const QWEN_BPE_ONLY: usize = 151_643;
872 assert!(
873 assert_tokenizer_vocab_within_model_bound(QWEN_BPE_ONLY, QWEN_DECLARED_VOCAB).is_ok()
874 );
875
876 // Equality remains acceptable (Llama370M-from-scratch case).
877 assert!(assert_tokenizer_vocab_within_model_bound(
878 Llama370MConfig::VOCAB_SIZE,
879 Llama370MConfig::VOCAB_SIZE
880 )
881 .is_ok());
882 }
883
884 /// FALSIFY-APR-PRETRAIN-ARCH-010 (§55) — relaxed-bound helper REJECTS
885 /// `tokenizer_vocab > model_vocab`. This is the OOB-safety guard:
886 /// a tokenizer producing ids ≥ model_vocab would silently corrupt
887 /// embedding lookup. Strict-greater MUST fail-fast.
888 #[test]
889 fn falsify_apr_pretrain_arch_010_relaxed_bound_rejects_oversized_tokenizer() {
890 const QWEN_DECLARED_VOCAB: usize = 151_936;
891 let oversized = QWEN_DECLARED_VOCAB + 1;
892 let err =
893 assert_tokenizer_vocab_within_model_bound(oversized, QWEN_DECLARED_VOCAB)
894 .expect_err("FALSIFY-APR-PRETRAIN-ARCH-010: tokenizer > model MUST fail-fast");
895 assert!(
896 err.contains("RELAXED") && err.contains("OOB"),
897 "error must cite the relaxed-mode invariant + OOB risk: {err}"
898 );
899 assert!(
900 err.contains(&oversized.to_string())
901 && err.contains(&QWEN_DECLARED_VOCAB.to_string()),
902 "error must name both sizes for forensics: {err}"
903 );
904 }
905
906 /// INV-ARCH-370M-001 — estimated param count within [366M, 374M].
907 ///
908 /// Recomputes the canonical transformer param formula and asserts the
909 /// answer lies in the ±1% band the contract permits for the final
910 /// trained artifact.
911 #[test]
912 fn estimated_param_count_within_contract_band() {
913 let p = estimated_param_count();
914 let stored = estimated_stored_param_count();
915
916 // Sanity printout for debugging drift.
917 eprintln!("albor-370m nominal param count = {p} ({} M)", p / 1_000_000,);
918 eprintln!(
919 "albor-370m stored param count = {stored} ({} M, lm_head tied)",
920 stored / 1_000_000,
921 );
922
923 // INV-ARCH-370M-001 — nominal ±1% band.
924 assert!(
925 p >= Llama370MConfig::PARAMETERS_MIN,
926 "nominal param count {p} below INV-ARCH-370M-001 floor (366M)",
927 );
928 assert!(
929 p <= Llama370MConfig::PARAMETERS_MAX,
930 "nominal param count {p} above INV-ARCH-370M-001 ceiling (374M)",
931 );
932
933 // Tighter ±5% sanity band around the 370M nominal figure, per
934 // this scaffold's unit-test requirements.
935 let nominal = Llama370MConfig::PARAMETERS_NOMINAL as f64;
936 let pct = (p as f64 - nominal).abs() / nominal;
937 assert!(
938 pct < 0.05,
939 "nominal param count {p} differs from 370M by {:.2}% (> 5%)",
940 pct * 100.0,
941 );
942
943 // Tying must reduce storage by exactly one vocab*hidden matrix.
944 assert_eq!(
945 p - stored,
946 Llama370MConfig::VOCAB_SIZE * Llama370MConfig::HIDDEN_DIM,
947 "tying accounting mismatch",
948 );
949 }
950
951 /// Sanity: the compile-time `validate()` matches the runtime check.
952 #[test]
953 fn validate_is_a_noop_at_runtime() {
954 // If `validate()` compiled, it's already been proven to not panic
955 // (the `const _: () = ...;` at module scope forced evaluation at
956 // compile time). Calling it again at runtime is a free
957 // defence-in-depth assertion.
958 Llama370MConfig::validate();
959 }
960
961 /// Shape newtypes are zero-sized and usable in generic contexts.
962 #[test]
963 fn shape_newtypes_compile_and_roundtrip() {
964 type Hidden = HiddenDim<{ Llama370MConfig::HIDDEN_DIM }>;
965 type Heads = NumHeads<{ Llama370MConfig::NUM_HEADS }>;
966 type KvHeads = NumKvHeads<{ Llama370MConfig::NUM_KV_HEADS }>;
967 type Head = HeadDim<{ Llama370MConfig::HEAD_DIM }>;
968 type Inter = IntermediateDim<{ Llama370MConfig::INTERMEDIATE_DIM }>;
969 type Layers = NumLayers<{ Llama370MConfig::NUM_LAYERS }>;
970 type Vocab = VocabSize<{ Llama370MConfig::VOCAB_SIZE }>;
971
972 assert_eq!(Hidden::VALUE, 1024);
973 assert_eq!(Heads::VALUE, 16);
974 assert_eq!(KvHeads::VALUE, 4);
975 assert_eq!(Head::VALUE, 64);
976 assert_eq!(Inter::VALUE, 2816);
977 assert_eq!(Layers::VALUE, 24);
978 assert_eq!(Vocab::VALUE, 50_257);
979
980 // Zero-sized: all shape newtypes cost nothing at runtime.
981 assert_eq!(std::mem::size_of::<Hidden>(), 0);
982 assert_eq!(std::mem::size_of::<Heads>(), 0);
983 }
984
985 // ========================================================================
986 // C-LLAMA-370M-SOVEREIGN / AC-SHIP2-001 / FALSIFY-SHIP-011
987 // ========================================================================
988
989 /// The sovereign contract YAML embedded at compile time so the test
990 /// binary has a byte-frozen copy — any edit to the file is caught
991 /// by the next test run, not discovered post-publish.
992 const SOVEREIGN_CONTRACT_YAML: &str =
993 include_str!("../../../../contracts/model-families/llama-370m-sovereign-v1.yaml");
994
995 /// GATE-ARCH-370M-001 / INV-ARCH-370M-002..008: every architectural
996 /// constant declared in `contracts/model-families/llama-370m-sovereign-v1.yaml`
997 /// matches the Rust scaffold `Llama370MConfig::*` const byte-equally.
998 ///
999 /// Discharges FALSIFY-SHIP-011 (AC-SHIP2-001): architecture registered
1000 /// in a llama-family contract entry whose dimensions validate against
1001 /// `contracts/model-families/_schema.yaml` AND match the compile-time
1002 /// Rust config that the training loop will actually consume. Binds the
1003 /// YAML contract and the Rust scaffold: if either drifts without the
1004 /// other, this test fails — catching the MODEL-1 QLoRA class of
1005 /// recipe/artifact drift at `cargo test` time, before a single step
1006 /// of pretraining compute runs.
1007 #[test]
1008 fn falsify_ship_011_rust_scaffold_matches_yaml_contract() {
1009 let doc: serde_yaml::Value = serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML)
1010 .expect("llama-370m-sovereign-v1.yaml must parse as YAML");
1011
1012 // Contract identity — must be the right contract.
1013 assert_eq!(
1014 doc["contract_id"].as_str(),
1015 Some("C-LLAMA-370M-SOVEREIGN"),
1016 "wrong contract loaded — check include_str! path",
1017 );
1018 assert_eq!(doc["family"].as_str(), Some("llama"));
1019 assert_eq!(doc["size_variant"].as_str(), Some("370m"));
1020
1021 // Architectural dimensions (INV-ARCH-370M-002, -003, -005, -006).
1022 let arch = &doc["architecture"];
1023 assert_eq!(
1024 arch["hidden_dim"].as_u64().map(|v| v as usize),
1025 Some(Llama370MConfig::HIDDEN_DIM),
1026 "YAML architecture.hidden_dim drifted from Rust const",
1027 );
1028 assert_eq!(
1029 arch["num_layers"].as_u64().map(|v| v as usize),
1030 Some(Llama370MConfig::NUM_LAYERS),
1031 );
1032 assert_eq!(
1033 arch["num_heads"].as_u64().map(|v| v as usize),
1034 Some(Llama370MConfig::NUM_HEADS),
1035 );
1036 assert_eq!(
1037 arch["num_kv_heads"].as_u64().map(|v| v as usize),
1038 Some(Llama370MConfig::NUM_KV_HEADS),
1039 );
1040 assert_eq!(arch["head_dim"].as_u64().map(|v| v as usize), Some(Llama370MConfig::HEAD_DIM),);
1041 assert_eq!(
1042 arch["intermediate_dim"].as_u64().map(|v| v as usize),
1043 Some(Llama370MConfig::INTERMEDIATE_DIM),
1044 );
1045 assert_eq!(
1046 arch["vocab_size"].as_u64().map(|v| v as usize),
1047 Some(Llama370MConfig::VOCAB_SIZE),
1048 );
1049 assert_eq!(
1050 arch["max_position_embeddings"].as_u64().map(|v| v as usize),
1051 Some(Llama370MConfig::MAX_POSITION_EMBEDDINGS),
1052 );
1053 let rope_theta = arch["rope_theta"].as_f64().expect("rope_theta must be a float");
1054 assert!(
1055 (rope_theta - f64::from(Llama370MConfig::ROPE_THETA)).abs() < 1e-6,
1056 "YAML rope_theta {rope_theta} != Rust const {}",
1057 Llama370MConfig::ROPE_THETA,
1058 );
1059
1060 // Constraints (INV-ARCH-370M-004, -008).
1061 let constraints = &doc["constraints"];
1062 assert_eq!(
1063 constraints["tied_embeddings"].as_bool(),
1064 Some(Llama370MConfig::TIED_EMBEDDINGS),
1065 );
1066 assert_eq!(constraints["has_bias"].as_bool(), Some(Llama370MConfig::HAS_BIAS),);
1067 assert_eq!(constraints["attention_type"].as_str(), Some("gqa"));
1068 assert_eq!(constraints["activation"].as_str(), Some("silu"));
1069 assert_eq!(constraints["norm_type"].as_str(), Some("rmsnorm"));
1070 assert_eq!(constraints["positional_encoding"].as_str(), Some("rope"));
1071 assert_eq!(constraints["mlp_type"].as_str(), Some("swiglu"));
1072 }
1073
1074 /// GATE-ARCH-370M-001 (gate status): once FALSIFY-SHIP-011 is
1075 /// discharged, the sovereign contract MUST declare status ACTIVE —
1076 /// a PROPOSED gate cannot be a ship-blocker.
1077 #[test]
1078 fn falsify_ship_011_sovereign_contract_is_active() {
1079 let doc: serde_yaml::Value =
1080 serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML).expect("parse sovereign contract");
1081 assert_eq!(
1082 doc["status"].as_str(),
1083 Some("ACTIVE"),
1084 "C-LLAMA-370M-SOVEREIGN must be ACTIVE once FALSIFY-SHIP-011 \
1085 discharges — PROPOSED contracts cannot gate a ship",
1086 );
1087 }
1088
1089 // ========================================================================
1090 // GATE-ARCH-370M-004 / AC-SHIP2-009 / FALSIFY-SHIP-019
1091 // ========================================================================
1092
1093 /// Enumerate every APR tensor name the 370M architecture produces.
1094 ///
1095 /// Returns `(name, expected_shape)` pairs. Ordering mirrors the
1096 /// canonical GGUF/APR dump order: embedding → per-layer tensors
1097 /// (24 layers × 9 tensors) → final norm. `lm_head.weight` shares
1098 /// storage with `model.embed_tokens.weight` per INV-ARCH-370M-004
1099 /// (tied), but the layout contract records it as a separate entry
1100 /// because the kernel path needs a named row-major [vocab, hidden]
1101 /// reference at decode time.
1102 fn enumerate_370m_apr_tensors() -> Vec<(String, Vec<usize>)> {
1103 let h = Llama370MConfig::HIDDEN_DIM;
1104 let v = Llama370MConfig::VOCAB_SIZE;
1105 let i = Llama370MConfig::INTERMEDIATE_DIM;
1106 let nh = Llama370MConfig::NUM_HEADS;
1107 let nkv = Llama370MConfig::NUM_KV_HEADS;
1108 let hd = Llama370MConfig::HEAD_DIM;
1109 let layers = Llama370MConfig::NUM_LAYERS;
1110
1111 let mut out: Vec<(String, Vec<usize>)> = Vec::with_capacity(3 + 9 * layers);
1112 out.push(("model.embed_tokens.weight".into(), vec![v, h]));
1113 out.push(("lm_head.weight".into(), vec![v, h]));
1114 for n in 0..layers {
1115 out.push((format!("model.layers.{n}.self_attn.q_proj.weight"), vec![nh * hd, h]));
1116 out.push((format!("model.layers.{n}.self_attn.k_proj.weight"), vec![nkv * hd, h]));
1117 out.push((format!("model.layers.{n}.self_attn.v_proj.weight"), vec![nkv * hd, h]));
1118 out.push((format!("model.layers.{n}.self_attn.o_proj.weight"), vec![h, nh * hd]));
1119 out.push((format!("model.layers.{n}.mlp.gate_proj.weight"), vec![i, h]));
1120 out.push((format!("model.layers.{n}.mlp.up_proj.weight"), vec![i, h]));
1121 out.push((format!("model.layers.{n}.mlp.down_proj.weight"), vec![h, i]));
1122 out.push((format!("model.layers.{n}.input_layernorm.weight"), vec![h]));
1123 out.push((format!("model.layers.{n}.post_attention_layernorm.weight"), vec![h]));
1124 }
1125 out.push(("model.norm.weight".into(), vec![h]));
1126 out
1127 }
1128
1129 /// FALSIFY-SHIP-019 (AC-SHIP2-009) — algorithm-level PARTIAL proof
1130 /// that every APR tensor the 370M architecture produces is covered
1131 /// by `aprender::format::layout_contract` (the authoritative
1132 /// row-major validator reused by every GGUF↔APR export site, per
1133 /// spec §9 Risk #2 mitigation).
1134 ///
1135 /// This test proves three things without needing a trained model:
1136 /// 1. **Coverage:** every 370M tensor name normalises to a
1137 /// contract entry — no unknown-tensor silent-skip gap.
1138 /// 2. **Row-major ordering:** every 2D tensor's enumerated shape
1139 /// is `[out_dim, in_dim]` (the row-major APR layout mandated
1140 /// by INV-ARCH-370M-009 and by LAYOUT-001). Specifically
1141 /// `lm_head.weight` is `[vocab, hidden]`, never reversed —
1142 /// GH-202 root cause.
1143 /// 3. **Critical-tensor enforcement:** `validate_apr_shape` on
1144 /// `lm_head.weight` accepts `[vocab, hidden]` AND rejects
1145 /// `[hidden, vocab]`, proving the validator actively catches
1146 /// the GH-202 class of layout bug.
1147 ///
1148 /// **Discharge:** `evidence_discharged_by` on GATE-ARCH-370M-004;
1149 /// full discharge blocks on real trained 370M artifact (need the
1150 /// GGUF export path to actually invoke `validate_apr_shape` on
1151 /// real tensor bytes, which requires a trained `.apr`).
1152 #[test]
1153 fn falsify_ship_019_layout_contract_covers_every_370m_tensor() {
1154 use aprender::format::layout_contract::LayoutContract;
1155 let contract = LayoutContract::new();
1156 let tensors = enumerate_370m_apr_tensors();
1157
1158 // Invariant 1: the enumerator produces exactly the expected number
1159 // of APR entries for a 24-layer 370M Llama (1 embedding + 1 lm_head
1160 // + 9 per-layer + 1 final norm).
1161 assert_eq!(
1162 tensors.len(),
1163 3 + 9 * Llama370MConfig::NUM_LAYERS,
1164 "370M enumerator produced wrong tensor count — scaffold drift",
1165 );
1166
1167 // Invariant 2: coverage — every enumerated name resolves to a
1168 // TensorContract entry. Pattern-normalisation collapses
1169 // `model.layers.<n>.*` to `model.layers.{n}.*`.
1170 for (name, _) in &tensors {
1171 assert!(
1172 contract.get_apr_contract(name).is_some(),
1173 "370M tensor `{name}` has no layout_contract entry — \
1174 LAYOUT-001 coverage gap (every tensor in this model must \
1175 pattern-match a TensorContract or GGUF export layout will \
1176 silently skip it)",
1177 );
1178 }
1179
1180 // Invariant 3: row-major ordering — every 2D tensor enumerated
1181 // above has shape `[out_dim, in_dim]`. The ordering is the whole
1182 // point of LAYOUT-001 (see layout_contract.rs §Key Principles).
1183 // Spot-check the pinned invariants rather than re-parsing the
1184 // formula strings.
1185 let lm = tensors
1186 .iter()
1187 .find(|(n, _)| n == "lm_head.weight")
1188 .expect("lm_head must be enumerated");
1189 assert_eq!(
1190 lm.1,
1191 vec![Llama370MConfig::VOCAB_SIZE, Llama370MConfig::HIDDEN_DIM],
1192 "lm_head.weight must be row-major [vocab, hidden] — GH-202 \
1193 root cause; reversed `[hidden, vocab]` produces [PAD] garbage",
1194 );
1195 let embed = tensors
1196 .iter()
1197 .find(|(n, _)| n == "model.embed_tokens.weight")
1198 .expect("embed_tokens must be enumerated");
1199 assert_eq!(
1200 embed.1,
1201 vec![Llama370MConfig::VOCAB_SIZE, Llama370MConfig::HIDDEN_DIM],
1202 "embed_tokens.weight must be row-major [vocab, hidden]",
1203 );
1204 // GQA: K/V projections are 4× smaller on the out_dim axis vs Q/O.
1205 let k0 = tensors
1206 .iter()
1207 .find(|(n, _)| n == "model.layers.0.self_attn.k_proj.weight")
1208 .expect("k_proj layer 0 must be enumerated");
1209 assert_eq!(
1210 k0.1,
1211 vec![
1212 Llama370MConfig::NUM_KV_HEADS * Llama370MConfig::HEAD_DIM,
1213 Llama370MConfig::HIDDEN_DIM,
1214 ],
1215 "k_proj must be row-major [kv_heads*head_dim, hidden] — GQA",
1216 );
1217 let q0 = tensors
1218 .iter()
1219 .find(|(n, _)| n == "model.layers.0.self_attn.q_proj.weight")
1220 .expect("q_proj layer 0 must be enumerated");
1221 assert_eq!(
1222 q0.1,
1223 vec![
1224 Llama370MConfig::NUM_HEADS * Llama370MConfig::HEAD_DIM,
1225 Llama370MConfig::HIDDEN_DIM,
1226 ],
1227 "q_proj must be row-major [heads*head_dim, hidden]",
1228 );
1229
1230 // Invariant 4: `validate_apr_shape` actively enforces the critical
1231 // tensor. Correct shape passes, reversed shape fails — the
1232 // validator must catch the GH-202 class of bug, not just
1233 // silently accept.
1234 contract
1235 .validate_apr_shape(
1236 "lm_head.weight",
1237 &[Llama370MConfig::VOCAB_SIZE, Llama370MConfig::HIDDEN_DIM],
1238 Llama370MConfig::VOCAB_SIZE,
1239 Llama370MConfig::HIDDEN_DIM,
1240 )
1241 .expect("correct [vocab, hidden] lm_head must validate");
1242 let bad = contract.validate_apr_shape(
1243 "lm_head.weight",
1244 &[Llama370MConfig::HIDDEN_DIM, Llama370MConfig::VOCAB_SIZE],
1245 Llama370MConfig::VOCAB_SIZE,
1246 Llama370MConfig::HIDDEN_DIM,
1247 );
1248 assert!(
1249 bad.is_err(),
1250 "reversed [hidden, vocab] lm_head MUST be rejected by the \
1251 layout contract — this is GH-202 regression protection",
1252 );
1253 }
1254
1255 /// GATE-ARCH-370M-004 wiring check: once FALSIFY-SHIP-019 has an
1256 /// algorithm-level PARTIAL discharge, the sovereign contract YAML
1257 /// MUST record `discharge_status: PARTIAL_ALGORITHM_LEVEL` +
1258 /// `evidence_discharged_by` + `full_discharge_blocks_on` on
1259 /// GATE-ARCH-370M-004. Any edit that drops those fields fails this
1260 /// test before the artifact ships.
1261 #[test]
1262 fn falsify_ship_019_gate_arch_370m_004_has_partial_discharge_marker() {
1263 let doc: serde_yaml::Value =
1264 serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML).expect("parse sovereign contract");
1265 let gates =
1266 doc["gates"].as_sequence().expect("gates must be a sequence in sovereign contract");
1267 let gate = gates
1268 .iter()
1269 .find(|g| g["id"].as_str() == Some("GATE-ARCH-370M-004"))
1270 .expect("GATE-ARCH-370M-004 must exist in sovereign contract");
1271
1272 assert_eq!(
1273 gate["falsification_id"].as_str(),
1274 Some("FALSIFY-SHIP-019"),
1275 "GATE-ARCH-370M-004 must bind FALSIFY-SHIP-019",
1276 );
1277 assert_eq!(
1278 gate["binds_to"].as_str(),
1279 Some("AC-SHIP2-009"),
1280 "GATE-ARCH-370M-004 must bind AC-SHIP2-009",
1281 );
1282 assert_eq!(
1283 gate["discharge_status"].as_str(),
1284 Some("PARTIAL_ALGORITHM_LEVEL"),
1285 "GATE-ARCH-370M-004 must advertise PARTIAL_ALGORITHM_LEVEL \
1286 (full discharge blocks on real trained 370M .apr)",
1287 );
1288 let evidence = gate["evidence_discharged_by"]
1289 .as_sequence()
1290 .expect("GATE-ARCH-370M-004 must have evidence_discharged_by");
1291 assert!(
1292 !evidence.is_empty(),
1293 "GATE-ARCH-370M-004 evidence_discharged_by must list \
1294 at least one test function or artifact",
1295 );
1296 assert!(
1297 gate["full_discharge_blocks_on"].as_str().is_some(),
1298 "PARTIAL gate must document full_discharge_blocks_on",
1299 );
1300 assert_eq!(
1301 gate["ship_blocking"].as_bool(),
1302 Some(true),
1303 "GATE-ARCH-370M-004 must advertise ship_blocking:true — the \
1304 gate's `verdict:pass` alone is insufficient green while \
1305 discharge_status == PARTIAL_ALGORITHM_LEVEL",
1306 );
1307 }
1308
1309 // ========================================================================
1310 // GATE-ARCH-370M-005 / AC-SHIP2-007 / FALSIFY-SHIP-017
1311 // ========================================================================
1312
1313 /// FALSIFY-SHIP-017 (AC-SHIP2-007) — algorithm-level proof of the
1314 /// Python-AST-parse threshold function.
1315 ///
1316 /// The full-discharge harness (100 held-out prompts × `apr run` ×
1317 /// Python AST parse) blocks on a real trained 370M .apr. But the
1318 /// *decision rule* — "≥ 2 syntax errors on 100 prompts is a
1319 /// ship-blocker, ≤ 1 tolerated" — is a pure integer threshold and
1320 /// can be proven correct today. Any edit to `verdict_from_syntax_error_count`
1321 /// that widens the tolerance (or flips the boundary) is caught here
1322 /// before the artifact ships.
1323 ///
1324 /// Covers four invariants:
1325 /// 1. **Zero errors** → Pass (the trivial unanimous-parse case).
1326 /// 2. **Exactly-one error** → Pass (the tolerance boundary; matches
1327 /// the EX-06 harness' `tolerate ≤ 1 SyntaxError` rule and
1328 /// spec-§6 FALSIFY-SHIP-017 wording `tolerate ≤1`).
1329 /// 3. **Exactly-two errors** → Fail (the ship-blocker boundary;
1330 /// FALSIFY-SHIP-017 says `≥ 2 SyntaxError`).
1331 /// 4. **Monotonicity** — raising the error count can only worsen
1332 /// the verdict (Pass → Fail is one-way). This rules out any
1333 /// future threshold edit that accidentally promotes a high
1334 /// error count back to Pass.
1335 #[test]
1336 fn falsify_ship_017_syntax_error_count_threshold_logic() {
1337 // (1) Zero errors — unanimous parse.
1338 assert_eq!(
1339 verdict_from_syntax_error_count(0),
1340 Ship017Verdict::Pass,
1341 "0 syntax errors must always Pass",
1342 );
1343
1344 // (2) Tolerance boundary — 1 error still tolerated.
1345 assert_eq!(
1346 verdict_from_syntax_error_count(1),
1347 Ship017Verdict::Pass,
1348 "1 syntax error is the AC_SHIP2_007_MAX_TOLERATED_SYNTAX_ERRORS \
1349 boundary and must Pass",
1350 );
1351
1352 // (3) Ship-blocker boundary — 2 errors flips to Fail.
1353 assert_eq!(
1354 verdict_from_syntax_error_count(2),
1355 Ship017Verdict::Fail,
1356 "2 syntax errors is the FALSIFY-SHIP-017 ship-blocker \
1357 boundary and must Fail",
1358 );
1359
1360 // (3b) Pathological cases — high error counts must still Fail.
1361 assert_eq!(
1362 verdict_from_syntax_error_count(AC_SHIP2_007_HELDOUT_PROMPT_COUNT),
1363 Ship017Verdict::Fail,
1364 "all-errors must Fail — trivial sanity",
1365 );
1366 assert_eq!(
1367 verdict_from_syntax_error_count(AC_SHIP2_007_HELDOUT_PROMPT_COUNT / 2),
1368 Ship017Verdict::Fail,
1369 "50% errors on 100 prompts must Fail",
1370 );
1371
1372 // (4) Monotonicity — once Fail, always Fail as errors rise.
1373 let mut last_was_fail = false;
1374 for errors in 0..=AC_SHIP2_007_HELDOUT_PROMPT_COUNT {
1375 let verdict = verdict_from_syntax_error_count(errors);
1376 if last_was_fail {
1377 assert_eq!(
1378 verdict,
1379 Ship017Verdict::Fail,
1380 "monotonicity violation at errors={errors}: once Fail, \
1381 more errors cannot return to Pass",
1382 );
1383 }
1384 if verdict == Ship017Verdict::Fail {
1385 last_was_fail = true;
1386 }
1387 }
1388
1389 // Provenance sanity — the AC-SHIP2-007 prompt-count constant
1390 // matches the spec's 100-prompt harness size. If this ever drifts,
1391 // the threshold is still correct (it's count-agnostic) but the
1392 // GATE-ARCH-370M-005 evidence no longer matches the spec wording.
1393 assert_eq!(
1394 AC_SHIP2_007_HELDOUT_PROMPT_COUNT, 100,
1395 "AC-SHIP2-007 spec §6 pins the harness at 100 held-out prompts",
1396 );
1397 assert_eq!(
1398 AC_SHIP2_007_MAX_TOLERATED_SYNTAX_ERRORS, 1,
1399 "FALSIFY-SHIP-017 (spec §8.3 row) tolerates ≤ 1 SyntaxError",
1400 );
1401 }
1402
1403 /// GATE-ARCH-370M-005 wiring check: once FALSIFY-SHIP-017 has an
1404 /// algorithm-level PARTIAL discharge, the sovereign contract YAML
1405 /// MUST record `discharge_status: PARTIAL_ALGORITHM_LEVEL` plus
1406 /// `evidence_discharged_by` plus `full_discharge_blocks_on` on
1407 /// GATE-ARCH-370M-005. Any edit that drops those fields fails this
1408 /// test before the artifact ships.
1409 #[test]
1410 fn falsify_ship_017_gate_arch_370m_005_has_partial_discharge_marker() {
1411 let doc: serde_yaml::Value =
1412 serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML).expect("parse sovereign contract");
1413 let gates =
1414 doc["gates"].as_sequence().expect("gates must be a sequence in sovereign contract");
1415 let gate = gates
1416 .iter()
1417 .find(|g| g["id"].as_str() == Some("GATE-ARCH-370M-005"))
1418 .expect("GATE-ARCH-370M-005 must exist in sovereign contract");
1419
1420 assert_eq!(
1421 gate["falsification_id"].as_str(),
1422 Some("FALSIFY-SHIP-017"),
1423 "GATE-ARCH-370M-005 must bind FALSIFY-SHIP-017",
1424 );
1425 assert_eq!(
1426 gate["binds_to"].as_str(),
1427 Some("AC-SHIP2-007"),
1428 "GATE-ARCH-370M-005 must bind AC-SHIP2-007",
1429 );
1430 assert_eq!(
1431 gate["discharge_status"].as_str(),
1432 Some("PARTIAL_ALGORITHM_LEVEL"),
1433 "GATE-ARCH-370M-005 must advertise PARTIAL_ALGORITHM_LEVEL \
1434 (full discharge blocks on real trained 370M .apr + 100-prompt \
1435 `apr run` harness)",
1436 );
1437 let evidence = gate["evidence_discharged_by"]
1438 .as_sequence()
1439 .expect("GATE-ARCH-370M-005 must have evidence_discharged_by");
1440 assert!(
1441 !evidence.is_empty(),
1442 "GATE-ARCH-370M-005 evidence_discharged_by must list \
1443 at least one test function or artifact",
1444 );
1445 assert!(
1446 gate["full_discharge_blocks_on"].as_str().is_some(),
1447 "PARTIAL gate must document full_discharge_blocks_on",
1448 );
1449 assert_eq!(
1450 gate["ship_blocking"].as_bool(),
1451 Some(true),
1452 "GATE-ARCH-370M-005 must advertise ship_blocking:true — the \
1453 gate's `verdict:pass` alone is insufficient green while \
1454 discharge_status == PARTIAL_ALGORITHM_LEVEL",
1455 );
1456 }
1457
1458 /// FALSIFY-SHIP-020 / AC-SHIP2-010 — pure decode-throughput threshold
1459 /// proof. `apr bench --median` on a real trained 370M .apr is the
1460 /// compute-heavy harness; the decision rule itself (≥100 tok/s
1461 /// passes, <100 tok/s fails) is separable and proven here.
1462 ///
1463 /// Invariants covered:
1464 /// 1. Pass boundary: exactly 100.0 tok/s → Pass (contract floor).
1465 /// 2. Fail boundary: 99.999 tok/s → Fail (one ULP below floor).
1466 /// 3. Generous green: 120.0 and 500.0 tok/s → Pass.
1467 /// 4. Hard red: 0.0 and 50.0 tok/s → Fail.
1468 /// 5. Monotonicity: once Fail, all strictly lower tps stay Fail.
1469 /// 6. Degenerate inputs: NaN and ±∞ → Fail (no well-defined
1470 /// median → no proof).
1471 /// 7. Provenance pinning: the const floor MUST be 100.0 — any
1472 /// edit that loosens the threshold trips this test before the
1473 /// contract can ship.
1474 #[test]
1475 fn falsify_ship_020_decode_tps_threshold_logic() {
1476 // Pass boundary
1477 assert_eq!(
1478 verdict_from_decode_tps(100.0),
1479 Ship020Verdict::Pass,
1480 "exactly 100.0 tok/s must Pass (contract floor)",
1481 );
1482 // Fail boundary (one f32 step below 100.0)
1483 let just_below = f32::from_bits(100.0_f32.to_bits() - 1);
1484 assert!(just_below < 100.0);
1485 assert_eq!(
1486 verdict_from_decode_tps(just_below),
1487 Ship020Verdict::Fail,
1488 "one ULP below 100.0 tok/s must Fail",
1489 );
1490 // Generous-green sanity
1491 assert_eq!(verdict_from_decode_tps(120.0), Ship020Verdict::Pass);
1492 assert_eq!(verdict_from_decode_tps(500.0), Ship020Verdict::Pass);
1493 // Hard-red sanity
1494 assert_eq!(verdict_from_decode_tps(0.0), Ship020Verdict::Fail);
1495 assert_eq!(verdict_from_decode_tps(50.0), Ship020Verdict::Fail);
1496 // Monotonicity sweep: once Fail, any strictly smaller tps stays Fail.
1497 let samples = [0.0_f32, 25.0, 50.0, 75.0, 99.0, 99.5, 99.99, 100.0, 150.0, 10_000.0];
1498 let mut seen_fail = false;
1499 for &t in &samples {
1500 let v = verdict_from_decode_tps(t);
1501 if v == Ship020Verdict::Fail {
1502 seen_fail = true;
1503 } else if seen_fail {
1504 // We saw Fail earlier in the monotonically-increasing sweep
1505 // and now see Pass — that's fine (Fail→Pass is the
1506 // allowed crossover at the threshold). Re-arm by
1507 // resetting the seen_fail flag so we only guard against
1508 // Pass→Fail regressions within the sweep.
1509 seen_fail = false;
1510 }
1511 }
1512 // Separate direct Pass→Fail regression guard: walk strictly
1513 // decreasing from a clear Pass; once Fail shows, it must stick.
1514 let decreasing = [10_000.0_f32, 500.0, 150.0, 100.0, 99.99, 99.0, 50.0, 25.0, 0.0];
1515 let mut locked_fail = false;
1516 for &t in &decreasing {
1517 let v = verdict_from_decode_tps(t);
1518 if v == Ship020Verdict::Fail {
1519 locked_fail = true;
1520 } else {
1521 assert!(
1522 !locked_fail,
1523 "monotonicity violated: tps={t} produced Pass after a \
1524 lower-tps Fail was already observed",
1525 );
1526 }
1527 }
1528 // Degenerate inputs: NaN / ±∞ are all conservatively Fail.
1529 // A real `apr bench` run can NEVER produce a non-finite median;
1530 // if one appears, the harness itself is broken and we must
1531 // refuse to claim a ship-gate pass on a value we cannot
1532 // meaningfully compare against the threshold.
1533 assert_eq!(
1534 verdict_from_decode_tps(f32::NAN),
1535 Ship020Verdict::Fail,
1536 "NaN tps has no well-defined median and must Fail",
1537 );
1538 assert_eq!(verdict_from_decode_tps(f32::NEG_INFINITY), Ship020Verdict::Fail,);
1539 assert_eq!(
1540 verdict_from_decode_tps(f32::INFINITY),
1541 Ship020Verdict::Fail,
1542 "+∞ tok/s is ill-formed — a real `apr bench` median is \
1543 always a finite positive; treating +∞ as Pass would let \
1544 an instrumentation bug silently green the ship-gate",
1545 );
1546 // Provenance pinning — any edit that loosens the threshold
1547 // (say, to 60.0) would silently lower the ship bar. Trip here
1548 // before the contract can ship.
1549 assert!(
1550 (AC_SHIP2_010_MIN_DECODE_TPS_RTX4090 - 100.0_f32).abs() < f32::EPSILON,
1551 "AC_SHIP2_010_MIN_DECODE_TPS_RTX4090 must stay pinned to 100.0 \
1552 tok/s — see contracts/model-families/llama-370m-sovereign-v1.yaml \
1553 GATE-ARCH-370M-006",
1554 );
1555 }
1556
1557 /// GATE-ARCH-370M-006 wiring check: the sovereign contract YAML
1558 /// MUST record `discharge_status: PARTIAL_ALGORITHM_LEVEL` +
1559 /// `evidence_discharged_by` + `full_discharge_blocks_on` +
1560 /// `ship_blocking: true` on GATE-ARCH-370M-006, and bind it to
1561 /// AC-SHIP2-010 / FALSIFY-SHIP-020. Any edit that drops those
1562 /// fields fails this test before the artifact ships.
1563 #[test]
1564 fn falsify_ship_020_gate_arch_370m_006_has_partial_discharge_marker() {
1565 let doc: serde_yaml::Value =
1566 serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML).expect("parse sovereign contract");
1567 let gates =
1568 doc["gates"].as_sequence().expect("gates must be a sequence in sovereign contract");
1569 let gate = gates
1570 .iter()
1571 .find(|g| g["id"].as_str() == Some("GATE-ARCH-370M-006"))
1572 .expect("GATE-ARCH-370M-006 must exist in sovereign contract");
1573
1574 assert_eq!(
1575 gate["falsification_id"].as_str(),
1576 Some("FALSIFY-SHIP-020"),
1577 "GATE-ARCH-370M-006 must bind FALSIFY-SHIP-020",
1578 );
1579 assert_eq!(
1580 gate["binds_to"].as_str(),
1581 Some("AC-SHIP2-010"),
1582 "GATE-ARCH-370M-006 must bind AC-SHIP2-010",
1583 );
1584 assert_eq!(
1585 gate["discharge_status"].as_str(),
1586 Some("PARTIAL_ALGORITHM_LEVEL"),
1587 "GATE-ARCH-370M-006 must advertise PARTIAL_ALGORITHM_LEVEL \
1588 (full discharge blocks on real trained 370M .apr + RTX 4090 \
1589 `apr bench` median run)",
1590 );
1591 let evidence = gate["evidence_discharged_by"]
1592 .as_sequence()
1593 .expect("GATE-ARCH-370M-006 must have evidence_discharged_by");
1594 assert!(
1595 !evidence.is_empty(),
1596 "GATE-ARCH-370M-006 evidence_discharged_by must list \
1597 at least one test function or artifact",
1598 );
1599 assert!(
1600 gate["full_discharge_blocks_on"].as_str().is_some(),
1601 "PARTIAL gate must document full_discharge_blocks_on",
1602 );
1603 assert_eq!(
1604 gate["ship_blocking"].as_bool(),
1605 Some(true),
1606 "GATE-ARCH-370M-006 must advertise ship_blocking:true — the \
1607 gate's `verdict:pass` alone is insufficient green while \
1608 discharge_status == PARTIAL_ALGORITHM_LEVEL",
1609 );
1610 }
1611
1612 // ========================================================================
1613 // FALSIFY-SHIP-018 / AC-SHIP2-008 / GATE-ARCH-370M-007 — pass@1 threshold
1614 // ========================================================================
1615
1616 /// Algorithm-level proof that `verdict_from_pass_at_1` enforces the
1617 /// spec §5.2 AC-SHIP2-008 rule "HumanEval pass@1 ≥ 30.0%" correctly:
1618 ///
1619 /// 1. Exactly-at-floor passes (30/100, 60/200, 1/1 at threshold 100).
1620 /// 2. One f32 ULP below the floor fails.
1621 /// 3. Generous-green (50/100, 164/164) passes.
1622 /// 4. Hard-red (0/100, 1/100) fails.
1623 /// 5. Monotonicity: sweeping `correct` upward from 0 with total fixed
1624 /// at 164 (canonical HumanEval) never flips Pass → Fail.
1625 /// 6. Div-safety guard: `total == 0` always fails, even with 0 correct.
1626 /// 7. Sanity guard: `correct > total` always fails (impossible harness).
1627 /// 8. Non-finite threshold guard: NaN / ±∞ always fail.
1628 /// 9. Provenance: the contract floor const is pinned to 30.0 (edit
1629 /// to the const without amending the contract is caught here).
1630 ///
1631 /// Full `apr eval --benchmark humaneval` discharge blocks on the trained
1632 /// 370M .apr from AC-SHIP2-003/004 compute-dispatch — fixture swap only.
1633 #[test]
1634 fn falsify_ship_018_humaneval_pass_at_1_threshold_logic() {
1635 // ── (1) Exactly-at-floor → Pass
1636 assert_eq!(
1637 verdict_from_pass_at_1(30, 100, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1638 Ship018Verdict::Pass,
1639 "30/100 = 30.0% must pass the 30.0 floor",
1640 );
1641 assert_eq!(
1642 verdict_from_pass_at_1(60, 200, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1643 Ship018Verdict::Pass,
1644 "60/200 = 30.0% must pass the 30.0 floor",
1645 );
1646 // HumanEval canonical N=164 at-floor: ⌈0.3 × 164⌉ = 50 → 50/164 = 30.49%
1647 assert_eq!(
1648 verdict_from_pass_at_1(50, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1649 Ship018Verdict::Pass,
1650 "50/164 ≈ 30.49% must pass the 30.0 floor",
1651 );
1652
1653 // ── (2a) Just below the floor → Fail.
1654 //
1655 // 49/164 ≈ 29.878% computes to a ratio strictly less than 30.0 in
1656 // f32 (see below) and must therefore fail against the 30.0 floor.
1657 // (Note: 30/100 in f32 rounds to ~30.000002, slightly *above* 30.0,
1658 // so "exactly-at-floor" must be tested with ratios that are exact
1659 // in f32 — see case (2b).)
1660 assert_eq!(
1661 verdict_from_pass_at_1(49, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1662 Ship018Verdict::Fail,
1663 "49/164 ≈ 29.88% must fail the 30.0 floor",
1664 );
1665 assert_eq!(
1666 verdict_from_pass_at_1(29, 100, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1667 Ship018Verdict::Fail,
1668 "29/100 = 29.0% must fail the 30.0 floor",
1669 );
1670
1671 // ── (2b) Inclusive-floor proof at an f32-exact ratio.
1672 //
1673 // 50/100 → exactly 50.0% in f32 (both operands integer-exact, the
1674 // quotient 0.5 is a power of two, and 0.5 × 100.0 = 50.0 exactly).
1675 // This lets us prove the comparison is `>=` (inclusive), not `>`:
1676 // - Threshold = 50.0 → Pass (exact equality).
1677 // - Threshold = 50.0 + one ULP → Fail (strictly above ratio).
1678 // - Threshold = 50.0 - one ULP → Pass (strictly below ratio).
1679 let exact_50 = 50.0_f32;
1680 assert_eq!(50.0_f32 * 2.0_f32, 100.0_f32, "sanity: 50.0 is exact in f32");
1681 let fifty_plus_ulp = f32::from_bits(exact_50.to_bits() + 1);
1682 let fifty_minus_ulp = f32::from_bits(exact_50.to_bits() - 1);
1683 assert!(fifty_plus_ulp > exact_50, "sanity: +ULP is strictly above");
1684 assert!(fifty_minus_ulp < exact_50, "sanity: −ULP is strictly below");
1685 assert_eq!(
1686 verdict_from_pass_at_1(50, 100, exact_50),
1687 Ship018Verdict::Pass,
1688 "inclusive floor: 50.0% ≥ 50.0 must Pass (proves `>=`, not `>`)",
1689 );
1690 assert_eq!(
1691 verdict_from_pass_at_1(50, 100, fifty_plus_ulp),
1692 Ship018Verdict::Fail,
1693 "50/100 must fail when threshold is one ULP above 50.0",
1694 );
1695 assert_eq!(
1696 verdict_from_pass_at_1(50, 100, fifty_minus_ulp),
1697 Ship018Verdict::Pass,
1698 "50/100 must pass when threshold is one ULP below 50.0",
1699 );
1700
1701 // ── (3) Generous-green
1702 assert_eq!(
1703 verdict_from_pass_at_1(82, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1704 Ship018Verdict::Pass,
1705 "82/164 = 50% must pass",
1706 );
1707 assert_eq!(
1708 verdict_from_pass_at_1(164, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1709 Ship018Verdict::Pass,
1710 "perfect score must pass",
1711 );
1712
1713 // ── (4) Hard-red
1714 assert_eq!(
1715 verdict_from_pass_at_1(0, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1716 Ship018Verdict::Fail,
1717 "zero-correct run must fail",
1718 );
1719 assert_eq!(
1720 verdict_from_pass_at_1(1, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1721 Ship018Verdict::Fail,
1722 "1/164 ≈ 0.6% must fail",
1723 );
1724
1725 // ── (5) Monotonicity sweep: correct ∈ 0..=164, total = 164
1726 //
1727 // Over an increasing `correct` axis the verdict is allowed to flip
1728 // Fail → Pass exactly once; it must never flip Pass → Fail.
1729 let total = 164usize;
1730 let mut already_passed = false;
1731 for correct in 0..=total {
1732 let v =
1733 verdict_from_pass_at_1(correct, total, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT);
1734 match v {
1735 Ship018Verdict::Pass => {
1736 already_passed = true;
1737 }
1738 Ship018Verdict::Fail => {
1739 assert!(
1740 !already_passed,
1741 "monotonicity violated: correct={correct} reverted Pass→Fail",
1742 );
1743 }
1744 }
1745 }
1746
1747 // ── (6) Div-safety: total=0 must always fail
1748 assert_eq!(
1749 verdict_from_pass_at_1(0, 0, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1750 Ship018Verdict::Fail,
1751 "empty run (total=0) must fail — a positive floor is unsatisfiable",
1752 );
1753 assert_eq!(
1754 verdict_from_pass_at_1(0, 0, 0.0_f32),
1755 Ship018Verdict::Fail,
1756 "empty run must fail even with a zero threshold — the harness \
1757 is broken if it reports an empty denominator",
1758 );
1759
1760 // ── (7) Sanity guard: correct > total
1761 assert_eq!(
1762 verdict_from_pass_at_1(165, 164, AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT),
1763 Ship018Verdict::Fail,
1764 "correct > total is a broken harness report; must fail closed",
1765 );
1766
1767 // ── (8) Non-finite threshold → Fail
1768 assert_eq!(
1769 verdict_from_pass_at_1(164, 164, f32::NAN),
1770 Ship018Verdict::Fail,
1771 "NaN threshold must fail",
1772 );
1773 assert_eq!(
1774 verdict_from_pass_at_1(164, 164, f32::INFINITY),
1775 Ship018Verdict::Fail,
1776 "+∞ threshold must fail",
1777 );
1778 assert_eq!(
1779 verdict_from_pass_at_1(164, 164, f32::NEG_INFINITY),
1780 Ship018Verdict::Fail,
1781 "−∞ threshold must fail",
1782 );
1783
1784 // ── (9) Provenance pin
1785 assert!(
1786 (AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT - 30.0_f32).abs() < f32::EPSILON,
1787 "contract floor drift: AC_SHIP2_008_MIN_HUMANEVAL_PASS_AT_1_PCT \
1788 must stay pinned to 30.0 (spec §5.2 AC-SHIP2-008)",
1789 );
1790 }
1791
1792 /// Binds the YAML contract's `GATE-ARCH-370M-007` block to this test
1793 /// binary: any rename, removal, or discharge_status regression in
1794 /// `contracts/model-families/llama-370m-sovereign-v1.yaml` is caught
1795 /// at `cargo test` time.
1796 #[test]
1797 fn falsify_ship_018_gate_arch_370m_007_has_partial_discharge_marker() {
1798 let doc: serde_yaml::Value =
1799 serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML).expect("parse sovereign contract");
1800
1801 let gates = doc["gates"].as_sequence().expect("contract must have `gates:` sequence");
1802
1803 let gate = gates
1804 .iter()
1805 .find(|g| g["id"].as_str() == Some("GATE-ARCH-370M-007"))
1806 .expect("GATE-ARCH-370M-007 (SHIP-018 humaneval pass@1) must be present");
1807
1808 assert_eq!(
1809 gate["binds_to"].as_str(),
1810 Some("AC-SHIP2-008"),
1811 "GATE-ARCH-370M-007 must bind AC-SHIP2-008",
1812 );
1813 assert_eq!(
1814 gate["falsification_id"].as_str(),
1815 Some("FALSIFY-SHIP-018"),
1816 "GATE-ARCH-370M-007 must bind FALSIFY-SHIP-018",
1817 );
1818 assert_eq!(
1819 gate["discharge_status"].as_str(),
1820 Some("PARTIAL_ALGORITHM_LEVEL"),
1821 "GATE-ARCH-370M-007 must advertise PARTIAL_ALGORITHM_LEVEL — \
1822 full discharge blocks on trained 370M .apr + real apr eval",
1823 );
1824 let evidence = gate["evidence_discharged_by"]
1825 .as_sequence()
1826 .expect("GATE-ARCH-370M-007 must have evidence_discharged_by");
1827 assert!(
1828 !evidence.is_empty(),
1829 "GATE-ARCH-370M-007 evidence_discharged_by must list at least \
1830 one test function or const pin",
1831 );
1832 assert!(
1833 gate["full_discharge_blocks_on"].as_str().is_some(),
1834 "PARTIAL gate must document full_discharge_blocks_on",
1835 );
1836 assert_eq!(
1837 gate["ship_blocking"].as_bool(),
1838 Some(true),
1839 "GATE-ARCH-370M-007 must advertise ship_blocking:true — \
1840 verdict:pass alone is insufficient green while \
1841 discharge_status == PARTIAL_ALGORITHM_LEVEL",
1842 );
1843 }
1844
1845 /// FALSIFY-SHIP-016 algorithm-level PARTIAL discharge: the decision
1846 /// rule of SHIP-016 — "`apr qa <model>.apr` passes iff all 8 gates
1847 /// PASS; any gate FAIL fails the ship" — is a pure aggregate-AND
1848 /// over a Boolean slice. This test proves the rule without running
1849 /// the compute-heavy gates themselves. The rule separates from the
1850 /// compute dependency: the gate-runner is fixture-swappable once
1851 /// a trained 370M .apr exists; the decision is proven today.
1852 #[test]
1853 fn falsify_ship_016_apr_qa_aggregate_and_logic() {
1854 // Section 1: canonical Pass — all 8 gates true → Pass.
1855 let all_pass = [true; AC_SHIP2_006_REQUIRED_QA_GATE_COUNT];
1856 assert_eq!(
1857 verdict_from_qa_gates(&all_pass),
1858 Ship016Verdict::Pass,
1859 "AC-SHIP2-006: all 8 gates PASS must yield Pass",
1860 );
1861
1862 // Section 2: single-gate-fail must flip the aggregate to Fail —
1863 // this is the "any gate FAIL" counter-example from FALSIFY-SHIP-016.
1864 for flip_idx in 0..AC_SHIP2_006_REQUIRED_QA_GATE_COUNT {
1865 let mut gates = [true; AC_SHIP2_006_REQUIRED_QA_GATE_COUNT];
1866 gates[flip_idx] = false;
1867 assert_eq!(
1868 verdict_from_qa_gates(&gates),
1869 Ship016Verdict::Fail,
1870 "flipping gate index {flip_idx} from Pass to Fail must yield aggregate Fail \
1871 — SHIP-016 is an AND, not a majority or threshold",
1872 );
1873 }
1874
1875 // Section 3: canonical Fail — all 8 gates false → Fail.
1876 let all_fail = [false; AC_SHIP2_006_REQUIRED_QA_GATE_COUNT];
1877 assert_eq!(
1878 verdict_from_qa_gates(&all_fail),
1879 Ship016Verdict::Fail,
1880 "all 8 gates FAIL must yield Fail",
1881 );
1882
1883 // Section 4: exhaustive 2^8 = 256-combination proof — the ONLY
1884 // input yielding Pass is the all-true vector; every other
1885 // combination of 8 bools must yield Fail.
1886 let mut pass_count = 0usize;
1887 let mut fail_count = 0usize;
1888 for mask in 0u32..(1u32 << AC_SHIP2_006_REQUIRED_QA_GATE_COUNT) {
1889 let gates: [bool; AC_SHIP2_006_REQUIRED_QA_GATE_COUNT] =
1890 std::array::from_fn(|i| (mask >> i) & 1 == 1);
1891 match verdict_from_qa_gates(&gates) {
1892 Ship016Verdict::Pass => {
1893 pass_count += 1;
1894 assert!(
1895 gates.iter().all(|&p| p),
1896 "Pass verdict must only occur when all 8 gates are true; \
1897 got {gates:?} at mask {mask:#010b}",
1898 );
1899 }
1900 Ship016Verdict::Fail => {
1901 fail_count += 1;
1902 assert!(
1903 gates.iter().any(|&p| !p),
1904 "Fail verdict must only occur when at least one gate is false; \
1905 got {gates:?} at mask {mask:#010b}",
1906 );
1907 }
1908 }
1909 }
1910 assert_eq!(pass_count, 1, "exactly one of 256 combos (all-true) yields Pass");
1911 assert_eq!(fail_count, 255, "the other 255 combos must yield Fail");
1912
1913 // Section 5: monotonicity — adding a Pass to a mixed slice can
1914 // only move the verdict up (Fail→Pass) or keep it the same,
1915 // never downgrade Pass→Fail. Pair each combo with the combo
1916 // obtained by flipping one bit from false to true; assert
1917 // the verdict never regresses.
1918 for mask in 0u32..(1u32 << AC_SHIP2_006_REQUIRED_QA_GATE_COUNT) {
1919 let before: [bool; AC_SHIP2_006_REQUIRED_QA_GATE_COUNT] =
1920 std::array::from_fn(|i| (mask >> i) & 1 == 1);
1921 for flip_idx in 0..AC_SHIP2_006_REQUIRED_QA_GATE_COUNT {
1922 if before[flip_idx] {
1923 continue;
1924 }
1925 let mut after = before;
1926 after[flip_idx] = true;
1927 let before_v = verdict_from_qa_gates(&before);
1928 let after_v = verdict_from_qa_gates(&after);
1929 assert!(
1930 !(before_v == Ship016Verdict::Pass && after_v == Ship016Verdict::Fail),
1931 "monotonicity violated: flipping gate {flip_idx} from false to true \
1932 regressed Pass→Fail at mask {mask:#010b}",
1933 );
1934 }
1935 }
1936
1937 // Section 6: contract-drift guards — wrong gate count must Fail
1938 // conservatively even when every supplied entry is true. This
1939 // prevents a silent green from an out-of-sync harness that
1940 // shipped 7 or 9 gates instead of the spec-mandated 8.
1941 assert_eq!(
1942 verdict_from_qa_gates(&[]),
1943 Ship016Verdict::Fail,
1944 "empty gate slice must Fail (contract drift)",
1945 );
1946 assert_eq!(
1947 verdict_from_qa_gates(&[true; 7]),
1948 Ship016Verdict::Fail,
1949 "7 gates (short by one) must Fail even when all true (contract drift)",
1950 );
1951 assert_eq!(
1952 verdict_from_qa_gates(&[true; 9]),
1953 Ship016Verdict::Fail,
1954 "9 gates (long by one) must Fail even when all true (contract drift)",
1955 );
1956 assert_eq!(
1957 verdict_from_qa_gates(&[true; 16]),
1958 Ship016Verdict::Fail,
1959 "double-wide gate slice must Fail (contract drift)",
1960 );
1961
1962 // Section 7: provenance pin — the 8-gate count is the
1963 // contract number; drift on this constant fails lockstep-wise
1964 // with the spec amendment and `QaConfig` skip flags.
1965 assert_eq!(
1966 AC_SHIP2_006_REQUIRED_QA_GATE_COUNT, 8,
1967 "AC-SHIP2-006 is the 8-gate aggregate; any change requires \
1968 contract + spec + CLI skip-flag edits in lockstep",
1969 );
1970 }
1971
1972 /// GATE-ARCH-370M-008 wiring check: once FALSIFY-SHIP-016 has an
1973 /// algorithm-level PARTIAL discharge, the sovereign contract YAML
1974 /// MUST record `discharge_status: PARTIAL_ALGORITHM_LEVEL` +
1975 /// `evidence_discharged_by` + `full_discharge_blocks_on` on
1976 /// GATE-ARCH-370M-008. Any edit that drops those fields fails this
1977 /// test before the artifact ships.
1978 #[test]
1979 fn falsify_ship_016_gate_arch_370m_008_has_partial_discharge_marker() {
1980 let doc: serde_yaml::Value =
1981 serde_yaml::from_str(SOVEREIGN_CONTRACT_YAML).expect("parse sovereign contract");
1982 let gates =
1983 doc["gates"].as_sequence().expect("gates must be a sequence in sovereign contract");
1984 let gate = gates
1985 .iter()
1986 .find(|g| g["id"].as_str() == Some("GATE-ARCH-370M-008"))
1987 .expect("GATE-ARCH-370M-008 must exist in sovereign contract");
1988
1989 assert_eq!(
1990 gate["falsification_id"].as_str(),
1991 Some("FALSIFY-SHIP-016"),
1992 "GATE-ARCH-370M-008 must bind FALSIFY-SHIP-016",
1993 );
1994 assert_eq!(
1995 gate["binds_to"].as_str(),
1996 Some("AC-SHIP2-006"),
1997 "GATE-ARCH-370M-008 must bind AC-SHIP2-006",
1998 );
1999 assert_eq!(
2000 gate["discharge_status"].as_str(),
2001 Some("PARTIAL_ALGORITHM_LEVEL"),
2002 "GATE-ARCH-370M-008 must advertise PARTIAL_ALGORITHM_LEVEL \
2003 (full discharge blocks on real trained 370M .apr + `apr qa` harness)",
2004 );
2005 let evidence = gate["evidence_discharged_by"]
2006 .as_sequence()
2007 .expect("GATE-ARCH-370M-008 must have evidence_discharged_by");
2008 assert!(
2009 !evidence.is_empty(),
2010 "GATE-ARCH-370M-008 evidence_discharged_by must list \
2011 at least one test function or artifact",
2012 );
2013 assert!(
2014 gate["full_discharge_blocks_on"].as_str().is_some(),
2015 "PARTIAL gate must document full_discharge_blocks_on",
2016 );
2017 assert_eq!(
2018 gate["ship_blocking"].as_bool(),
2019 Some(true),
2020 "GATE-ARCH-370M-008 must advertise ship_blocking:true — the \
2021 gate's `verdict:pass` alone is insufficient green while \
2022 discharge_status == PARTIAL_ALGORITHM_LEVEL",
2023 );
2024 }
2025
2026 // ========================================================================
2027 // FALSIFY-SHIP-013 / AC-SHIP2-003 / GATE-ARCH-370M-013 — val CE loss floor
2028 // ========================================================================
2029
2030 /// FALSIFY-SHIP-013 algorithm-level PARTIAL discharge: prove the
2031 /// f32-threshold decision rule binding the measured MODEL-2 val CE
2032 /// to [`AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS`] = 2.2. Any edit
2033 /// that changes the constant, the comparison direction, the
2034 /// non-finite handling, the negative-domain guard, or the
2035 /// monotonicity must break this test before an `apr pretrain
2036 /// --validate` compute dispatch is launched.
2037 ///
2038 /// Mutation survey (7 sections):
2039 /// 1. Exact boundary 2.2 → Pass (inclusive floor, not strict <)
2040 /// 2. One-ULP above boundary → Fail; one-ULP below → Pass
2041 /// 3. Clear Pass band {0.0, 0.5, 1.0, 2.0, 2.199}
2042 /// 4. Clear Fail band {2.201, 3.0, 10.0, f32::MAX}
2043 /// 5. Non-finite {NaN, +∞, -∞} → Fail conservatively
2044 /// 6. Negative (domain violation, CE ≥ 0) → Fail conservatively
2045 /// 7. Provenance pin: the const stays byte-equal to 2.2_f32
2046 #[test]
2047 fn falsify_ship_013_val_ce_loss_threshold_logic() {
2048 // Section 1: exact boundary 2.2 → Pass. AC-SHIP2-003 says
2049 // "CE ≤ 2.2" (inclusive), so sim == threshold must Pass. A
2050 // regression that silently swaps `<=` to `<` would make the
2051 // ceiling unreachable.
2052 assert_eq!(
2053 verdict_from_val_ce_loss(AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS),
2054 Ship013Verdict::Pass,
2055 "val CE == 2.2 must Pass (inclusive floor, not strict <)",
2056 );
2057 assert_eq!(verdict_from_val_ce_loss(2.2), Ship013Verdict::Pass, "literal 2.2 must Pass",);
2058
2059 // Section 2: ULP asymmetry around the boundary. f32 at 2.2 is
2060 // `0x400CCCCD` = 2.20000004768371582...; the next-representable
2061 // float up is `0x400CCCCE` (Fail), and the next below is
2062 // `0x400CCCCC` (Pass). Sharpest possible counter-examples.
2063 let one_ulp_above = f32::from_bits(AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS.to_bits() + 1);
2064 let one_ulp_below = f32::from_bits(AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS.to_bits() - 1);
2065 assert!(one_ulp_above > AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS);
2066 assert!(one_ulp_below < AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS);
2067 assert_eq!(
2068 verdict_from_val_ce_loss(one_ulp_above),
2069 Ship013Verdict::Fail,
2070 "one ULP above 2.2 must Fail (strictly above ceiling)",
2071 );
2072 assert_eq!(
2073 verdict_from_val_ce_loss(one_ulp_below),
2074 Ship013Verdict::Pass,
2075 "one ULP below 2.2 must Pass (still under ceiling)",
2076 );
2077
2078 // Section 3: clear Pass band. 0.0 is the theoretical minimum
2079 // (perfect predictor); 2.199 is safely under 2.2.
2080 for ce in [0.0_f32, 0.5, 1.0, 2.0, 2.199] {
2081 assert_eq!(
2082 verdict_from_val_ce_loss(ce),
2083 Ship013Verdict::Pass,
2084 "val CE = {ce} must Pass (in clear Pass band)",
2085 );
2086 }
2087
2088 // Section 4: clear Fail band. A poorly-converged 370M would
2089 // land here (a random 50K-vocab predictor is ≈ ln(50257) ≈
2090 // 10.82 nats, which must obviously Fail). f32::MAX is the
2091 // saturation sanity check.
2092 for ce in [2.201_f32, 3.0, 10.0, f32::MAX] {
2093 assert_eq!(
2094 verdict_from_val_ce_loss(ce),
2095 Ship013Verdict::Fail,
2096 "val CE = {ce} must Fail (above 2.2 ceiling)",
2097 );
2098 }
2099
2100 // Section 5: non-finite inputs must Fail conservatively. A
2101 // loss-harness bug that produces NaN (log(0) or divide-by-zero
2102 // in softmax) or ±∞ (overflow in exp) must never silently
2103 // promote to Pass via NaN comparison semantics.
2104 assert_eq!(
2105 verdict_from_val_ce_loss(f32::NAN),
2106 Ship013Verdict::Fail,
2107 "NaN val CE must Fail conservatively",
2108 );
2109 assert_eq!(
2110 verdict_from_val_ce_loss(f32::INFINITY),
2111 Ship013Verdict::Fail,
2112 "+∞ val CE must Fail conservatively",
2113 );
2114 assert_eq!(
2115 verdict_from_val_ce_loss(f32::NEG_INFINITY),
2116 Ship013Verdict::Fail,
2117 "-∞ val CE must Fail conservatively",
2118 );
2119
2120 // Section 6: negative CE is a domain violation. Cross-entropy
2121 // H(p,q) = -Σ p(x) log q(x) ≥ 0 for all probability
2122 // distributions p,q. A negative measurement indicates a sign
2123 // flip, log-domain underflow, or subtract-instead-of-add bug
2124 // in the loss harness, and must never be promoted to "better
2125 // than zero" Pass.
2126 for neg_ce in [-0.001_f32, -1.0, -f32::INFINITY] {
2127 assert_eq!(
2128 verdict_from_val_ce_loss(neg_ce),
2129 Ship013Verdict::Fail,
2130 "negative val CE = {neg_ce} must Fail (CE ≥ 0 by definition)",
2131 );
2132 }
2133
2134 // Section 7: provenance pin — the 2.2 constant is load-bearing
2135 // and lockstepped with the spec. If AC-SHIP2-003 ever changes
2136 // the ceiling (relaxing to 2.5 for a smaller training-data
2137 // budget, tightening to 2.0 for a 500M jump), this const and
2138 // this test must move together.
2139 #[allow(clippy::float_cmp)]
2140 {
2141 assert_eq!(
2142 AC_SHIP2_003_MAX_VAL_CROSS_ENTROPY_LOSS, 2.2_f32,
2143 "MODEL-2 val CE ceiling is 2.2 \
2144 (spec §5.2 AC-SHIP2-003; albor 370M Sovereign target)",
2145 );
2146 }
2147 }
2148
2149 // ========================================================================
2150 // FALSIFY-SHIP-014 / AC-SHIP2-004 / GATE-ARCH-370M-014 — training budget
2151 // ========================================================================
2152
2153 /// FALSIFY-SHIP-014 algorithm-level PARTIAL discharge: prove the
2154 /// u32-threshold decision rule binding the measured MODEL-2
2155 /// training wall-clock duration to
2156 /// [`AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS`] = 21. Any edit that
2157 /// changes the constant, the comparison direction, or introduces
2158 /// a non-monotonic regression must break this test before a
2159 /// real-compute dispatch on the RTX 4090 host is launched.
2160 ///
2161 /// Mutation survey (6 sections):
2162 /// 1. Exact boundary 21 → Pass (inclusive ceiling)
2163 /// 2. Adjacent values: 20 → Pass; 22 → Fail
2164 /// 3. Clear Pass band {0, 1, 7, 14, 20, 21}
2165 /// 4. Clear Fail band {22, 30, 100, u32::MAX}
2166 /// 5. Monotonicity sweep 0..=42 — verdict flips exactly once
2167 /// at the 21→22 transition and never flips back
2168 /// 6. Provenance pin: the const stays byte-equal to 21_u32
2169 #[test]
2170 fn falsify_ship_014_training_duration_threshold_logic() {
2171 // Section 1: exact boundary 21 → Pass. AC-SHIP2-004 says
2172 // "within 21 days" (inclusive), so 21 == threshold must Pass.
2173 // A regression that silently swaps `<=` to `<` would make the
2174 // ceiling unreachable.
2175 assert_eq!(
2176 verdict_from_training_duration_days(AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS),
2177 Ship014Verdict::Pass,
2178 "21 days must Pass (inclusive ceiling, not strict <)",
2179 );
2180 assert_eq!(
2181 verdict_from_training_duration_days(21),
2182 Ship014Verdict::Pass,
2183 "literal 21 days must Pass",
2184 );
2185
2186 // Section 2: adjacent values. 20 is the sharpest Pass
2187 // counter-example (one below ceiling); 22 is the sharpest
2188 // Fail counter-example (one above). u32 neighbours are
2189 // exact (no ULP noise) so these are both rock-solid.
2190 assert_eq!(
2191 verdict_from_training_duration_days(20),
2192 Ship014Verdict::Pass,
2193 "20 days must Pass (one day under ceiling)",
2194 );
2195 assert_eq!(
2196 verdict_from_training_duration_days(22),
2197 Ship014Verdict::Fail,
2198 "22 days must Fail (one day over ceiling; Spec §9 Risk #4 escape hatch)",
2199 );
2200
2201 // Section 3: clear Pass band. 0 is trivial (same-day cache
2202 // hit, no actual training); 1/7/14 are smaller training runs;
2203 // 21 is the boundary.
2204 for days in [0_u32, 1, 7, 14, 20, 21] {
2205 assert_eq!(
2206 verdict_from_training_duration_days(days),
2207 Ship014Verdict::Pass,
2208 "{days} days must Pass (in clear Pass band)",
2209 );
2210 }
2211
2212 // Section 4: clear Fail band. 22/30/100 are incremental
2213 // overruns (each invokes Spec §9 Risk #4 escape-hatch planning
2214 // — rent 2× H100 week 3); u32::MAX is the saturation sanity.
2215 for days in [22_u32, 30, 100, u32::MAX] {
2216 assert_eq!(
2217 verdict_from_training_duration_days(days),
2218 Ship014Verdict::Fail,
2219 "{days} days must Fail (above 21-day ceiling)",
2220 );
2221 }
2222
2223 // Section 5: monotonicity sweep 0..=42 — verdict must flip
2224 // exactly once at the 21→22 transition and never flip back.
2225 // This is the complete classification over a 2× the ceiling
2226 // range; any mutation that introduces non-monotonicity (e.g.
2227 // a midlife-crisis carve-out at "after day 14 but before day
2228 // 21" or a modular-arithmetic bug) is trivially caught.
2229 let mut seen_fail = false;
2230 for days in 0..=42_u32 {
2231 let v = verdict_from_training_duration_days(days);
2232 match (v, seen_fail) {
2233 (Ship014Verdict::Pass, true) => {
2234 panic!(
2235 "monotonicity broken: day {days} flipped back to Pass \
2236 after a previous Fail was observed",
2237 );
2238 }
2239 (Ship014Verdict::Fail, _) => {
2240 seen_fail = true;
2241 }
2242 _ => {}
2243 }
2244 }
2245 // And verify the transition is exactly at 21→22, not earlier
2246 // or later (rules out off-by-one mutants).
2247 assert_eq!(
2248 verdict_from_training_duration_days(21),
2249 Ship014Verdict::Pass,
2250 "sweep boundary: day 21 must Pass",
2251 );
2252 assert_eq!(
2253 verdict_from_training_duration_days(22),
2254 Ship014Verdict::Fail,
2255 "sweep boundary: day 22 must Fail",
2256 );
2257
2258 // Section 6: provenance pin — the 21 constant is load-bearing
2259 // and lockstepped with the spec. If AC-SHIP2-004 ever changes
2260 // the ceiling (extending to 30 for a longer run, tightening
2261 // to 14 for a distilled student), this const and this test
2262 // must move together.
2263 assert_eq!(
2264 AC_SHIP2_004_MAX_TRAINING_DURATION_DAYS, 21_u32,
2265 "MODEL-2 training-budget ceiling is 21 days \
2266 (spec §5.2 AC-SHIP2-004; RTX 4090 hardware budget)",
2267 );
2268 }
2269}