entrenar/train/gputrain_007.rs
1//! FALSIFY-GPUTRAIN-007 / INV-GPUTRAIN-007 / GATE-GPUTRAIN-006 — algorithm-level
2//! PARTIAL discharge.
3//!
4//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
5//! (task #132 CUDA training backend gap).
6//!
7//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
8//! binds INV-GPUTRAIN-007 at PARTIAL_ALGORITHM_LEVEL via two pure functions:
9//!
10//! 1. `verdict_from_version_json_keys(present_keys) -> Gputrain007Verdict`
11//! — schema gate: Pass iff every key in
12//! `AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS` appears in the
13//! presented-keys slice (extra unknown keys are tolerated; missing
14//! required keys are not).
15//!
16//! 2. `verdict_from_version_json_fields(&VersionJsonCudaFields) -> Verdict`
17//! — field-shape gate: Pass iff `visible_devices.len() <= 16` (matches
18//! INV-GPUTRAIN-001's `cuda:0..cuda:15` grammar, 16 max) AND NOT
19//! `(cuda_feature && !cuda_runtime_available)` (claiming CUDA was
20//! compiled in while the runtime is absent is exactly the
21//! FM-GPUTRAIN-STALE-BUILD failure mode — the binary must either not
22//! advertise the feature or confirm the runtime).
23//!
24//! INV-GPUTRAIN-007 states: "`apr --version --json` reports
25//! `{cuda_feature, cuda_runtime_available, visible_devices[]}`." Operators
26//! must be able to tell "compiled without cuda" apart from "compiled with
27//! cuda but no GPU visible" without reading a stack trace.
28//!
29//! The compute-heavy portion (spawning `apr --version --json` as a
30//! subprocess and parsing its stdout) is intentionally out of scope here;
31//! the schema+shape rules are what the live gate calls after the JSON has
32//! been deserialized, so this binding is JSON-library agnostic (we take a
33//! `&[&str]` of keys, not a raw string).
34
35/// Required top-level keys in `apr --version --json` output. Order is
36/// irrelevant; extra keys (e.g. future `tensorrt_feature`) are tolerated,
37/// but these three are load-bearing for operator triage.
38pub const AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS: &[&str] =
39 &["cuda_feature", "cuda_runtime_available", "visible_devices"];
40
41/// Deserialized shape of the CUDA-related block of `apr --version --json`.
42/// Library-agnostic: caller is responsible for parsing JSON into this
43/// shape (serde_json, hand-rolled, whatever) and then running the field-
44/// shape gate.
45#[derive(Debug, Clone)]
46pub struct VersionJsonCudaFields {
47 /// Whether the binary was built with `--features cuda`.
48 pub cuda_feature: bool,
49 /// Whether, at startup, `cudaGetDeviceCount` returned a non-zero
50 /// device count AND the runtime didn't error.
51 pub cuda_runtime_available: bool,
52 /// Human-readable name/index pairs for each visible device, e.g.
53 /// `["0:RTX 4090", "1:RTX 4090"]`. Upper bound 16 matches
54 /// INV-GPUTRAIN-001 grammar's `:0..:15` range.
55 pub visible_devices: Vec<String>,
56}
57
58/// Binary verdict for FALSIFY-GPUTRAIN-007 / GATE-GPUTRAIN-006.
59#[derive(Debug, Clone, Copy, PartialEq, Eq)]
60pub enum Gputrain007Verdict {
61 /// JSON contains all required keys (schema gate) OR field values
62 /// pass every sanity rail (shape gate).
63 Pass,
64 /// A required key is missing, device-count exceeds the grammar
65 /// limit, or the (cuda_feature, cuda_runtime_available) pair is in
66 /// the advertised-but-missing inconsistent state.
67 Fail,
68}
69
70/// Schema gate: given the list of top-level keys actually present in
71/// `apr --version --json`, Pass iff every one of
72/// `AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS` is represented.
73/// Unknown / extra keys are silently tolerated (forward-compatible).
74#[must_use]
75pub fn verdict_from_version_json_keys(present_keys: &[&str]) -> Gputrain007Verdict {
76 for required in AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS {
77 if !present_keys.contains(required) {
78 return Gputrain007Verdict::Fail;
79 }
80 }
81 Gputrain007Verdict::Pass
82}
83
84/// Field-shape gate: given the parsed shape, Pass iff
85/// 1. `visible_devices.len() <= 16` (matches INV-GPUTRAIN-001 grammar
86/// `cuda:0..cuda:15`, 16 max), and
87/// 2. NOT `(cuda_feature && !cuda_runtime_available)` — the advertised-
88/// but-missing inconsistent state from FM-GPUTRAIN-STALE-BUILD.
89///
90/// The other three (cuda_feature, cuda_runtime_available) combinations
91/// are all operationally valid:
92/// - `(false, false)`: CPU-only build. Fine.
93/// - `(false, true)`: CUDA runtime present but build didn't enable it.
94/// Also fine; operator just needs a cuda-feature build to use it.
95/// - `(true, true)`: CUDA fully wired. Baseline.
96#[must_use]
97pub fn verdict_from_version_json_fields(fields: &VersionJsonCudaFields) -> Gputrain007Verdict {
98 // INV-GPUTRAIN-001 grammar allows cuda:0..cuda:15 — 16 indices max.
99 if fields.visible_devices.len() > 16 {
100 return Gputrain007Verdict::Fail;
101 }
102 // FM-GPUTRAIN-STALE-BUILD: advertised feature but missing runtime is
103 // the footgun that cost 14 minutes on lambda-labs. Fail-closed.
104 if fields.cuda_feature && !fields.cuda_runtime_available {
105 return Gputrain007Verdict::Fail;
106 }
107 Gputrain007Verdict::Pass
108}
109
110// ─────────────────────────────────────────────────────────────
111// Unit tests — FALSIFY-GPUTRAIN-007 algorithm-level proof
112// ─────────────────────────────────────────────────────────────
113
114#[cfg(test)]
115mod tests {
116 use super::*;
117
118 /// FALSIFY-GPUTRAIN-007 algorithm-level PARTIAL discharge: prove the
119 /// version-JSON schema + field-shape invariants. Any mutation that
120 /// allows a missing required key, silently accepts too many visible
121 /// devices, or lets the stale-build state Pass must break this test
122 /// before `apr --version --json` is ever shelled out.
123 #[test]
124 fn falsify_gputrain_007_version_json_schema_and_shape() {
125 // Section 1: all required keys present — baseline Pass for the
126 // schema gate. Mirrors the minimum JSON output on any shipping
127 // build (CUDA or not).
128 assert_eq!(
129 verdict_from_version_json_keys(&[
130 "cuda_feature",
131 "cuda_runtime_available",
132 "visible_devices",
133 ]),
134 Gputrain007Verdict::Pass,
135 "all three required keys present must Pass",
136 );
137 // Extra unknown keys (future-proof): tolerated per spec
138 // comment about forward compatibility.
139 assert_eq!(
140 verdict_from_version_json_keys(&[
141 "cuda_feature",
142 "cuda_runtime_available",
143 "visible_devices",
144 "version",
145 "sha",
146 "tensorrt_feature",
147 ]),
148 Gputrain007Verdict::Pass,
149 "extra unknown keys must not Fail (forward-compat)",
150 );
151
152 // Section 2: each required key missing in turn. Three separate
153 // sub-mutations of the output schema; each must Fail. Catches
154 // any mutation to the required-keys list (e.g. removing one by
155 // accident during refactor).
156 assert_eq!(
157 verdict_from_version_json_keys(&["cuda_runtime_available", "visible_devices",]),
158 Gputrain007Verdict::Fail,
159 "missing `cuda_feature` must Fail",
160 );
161 assert_eq!(
162 verdict_from_version_json_keys(&["cuda_feature", "visible_devices"]),
163 Gputrain007Verdict::Fail,
164 "missing `cuda_runtime_available` must Fail",
165 );
166 assert_eq!(
167 verdict_from_version_json_keys(&["cuda_feature", "cuda_runtime_available",]),
168 Gputrain007Verdict::Fail,
169 "missing `visible_devices` must Fail",
170 );
171 // All three missing (minimal empty JSON).
172 assert_eq!(
173 verdict_from_version_json_keys(&[]),
174 Gputrain007Verdict::Fail,
175 "empty present-keys slice must Fail",
176 );
177 // Only completely unrelated keys present.
178 assert_eq!(
179 verdict_from_version_json_keys(&["version", "sha"]),
180 Gputrain007Verdict::Fail,
181 "only unrelated keys present must Fail",
182 );
183
184 // Section 3: field-shape happy paths. Three of four
185 // (cuda_feature, cuda_runtime_available) combinations are all
186 // operationally valid.
187 // (false, false) — pure CPU build.
188 // (false, true) — CUDA runtime present but build didn't
189 // enable; not a bug, just a rebuild needed.
190 // (true, true) — fully wired.
191 for (feat, runtime) in [(false, false), (false, true), (true, true)] {
192 let fields = VersionJsonCudaFields {
193 cuda_feature: feat,
194 cuda_runtime_available: runtime,
195 visible_devices: vec!["0:RTX 4090".to_string()],
196 };
197 assert_eq!(
198 verdict_from_version_json_fields(&fields),
199 Gputrain007Verdict::Pass,
200 "consistent (cuda_feature={feat}, runtime_available={runtime}) must Pass",
201 );
202 }
203 // Zero visible devices is Pass on a CPU-only build; grammar
204 // allows up to 16. Lower bound is 0.
205 let fields = VersionJsonCudaFields {
206 cuda_feature: false,
207 cuda_runtime_available: false,
208 visible_devices: vec![],
209 };
210 assert_eq!(
211 verdict_from_version_json_fields(&fields),
212 Gputrain007Verdict::Pass,
213 "empty visible_devices on CPU-only build must Pass",
214 );
215
216 // Section 4: claims-feature-without-runtime — the stale-build
217 // footgun. Must Fail. THIS IS THE NOVEL SANITY RAIL — if a
218 // binary advertises `cuda_feature: true` it must also prove
219 // `cuda_runtime_available: true`, or operators risk another
220 // task #126.
221 let stale = VersionJsonCudaFields {
222 cuda_feature: true,
223 cuda_runtime_available: false,
224 visible_devices: vec![],
225 };
226 assert_eq!(
227 verdict_from_version_json_fields(&stale),
228 Gputrain007Verdict::Fail,
229 "cuda_feature=true + runtime_available=false must Fail \
230 (FM-GPUTRAIN-STALE-BUILD: advertised but missing)",
231 );
232 // Same inconsistency even if visible_devices accidentally got
233 // populated (e.g. cached from a previous build).
234 let stale_with_stale_devices = VersionJsonCudaFields {
235 cuda_feature: true,
236 cuda_runtime_available: false,
237 visible_devices: vec!["0:RTX 4090".to_string()],
238 };
239 assert_eq!(
240 verdict_from_version_json_fields(&stale_with_stale_devices),
241 Gputrain007Verdict::Fail,
242 "advertised feature + missing runtime must Fail regardless of visible_devices",
243 );
244
245 // Section 5: too-many visible devices. INV-GPUTRAIN-001 grammar
246 // allows indices 0..=15 (16 max). Boundary: 16 Pass, 17 Fail.
247 let sixteen = VersionJsonCudaFields {
248 cuda_feature: true,
249 cuda_runtime_available: true,
250 visible_devices: (0..16).map(|i| format!("{i}:device")).collect(),
251 };
252 assert_eq!(
253 verdict_from_version_json_fields(&sixteen),
254 Gputrain007Verdict::Pass,
255 "exactly 16 visible devices must Pass (grammar max)",
256 );
257 let seventeen = VersionJsonCudaFields {
258 cuda_feature: true,
259 cuda_runtime_available: true,
260 visible_devices: (0..17).map(|i| format!("{i}:device")).collect(),
261 };
262 assert_eq!(
263 verdict_from_version_json_fields(&seventeen),
264 Gputrain007Verdict::Fail,
265 "17 visible devices must Fail (exceeds cuda:0..cuda:15 grammar)",
266 );
267 let many = VersionJsonCudaFields {
268 cuda_feature: true,
269 cuda_runtime_available: true,
270 visible_devices: (0..100).map(|i| format!("{i}:device")).collect(),
271 };
272 assert_eq!(
273 verdict_from_version_json_fields(&many),
274 Gputrain007Verdict::Fail,
275 "100 visible devices must Fail (well past grammar)",
276 );
277
278 // Section 6: combined happy-path shape — the three required
279 // keys present AND the field values consistent. Catches a
280 // refactor that split the two gates into separate codepaths
281 // and forgot to call one of them.
282 let happy_fields = VersionJsonCudaFields {
283 cuda_feature: true,
284 cuda_runtime_available: true,
285 visible_devices: vec!["0:RTX 4090".to_string(), "1:RTX 4090".to_string()],
286 };
287 assert_eq!(
288 verdict_from_version_json_fields(&happy_fields),
289 Gputrain007Verdict::Pass,
290 "consistent 2-device CUDA build must Pass field-shape gate",
291 );
292 assert_eq!(
293 verdict_from_version_json_keys(&[
294 "cuda_feature",
295 "cuda_runtime_available",
296 "visible_devices",
297 ]),
298 Gputrain007Verdict::Pass,
299 "matching 3-key schema must Pass schema gate",
300 );
301
302 // Section 7: provenance pin — the required-keys slice is
303 // load-bearing. If a future spec amendment adds a 4th key
304 // (e.g. `rocm_feature`), this slice and the YAML rule must
305 // move together. The byte-literal shape prevents accidental
306 // reordering from changing behaviour.
307 assert_eq!(
308 AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.len(),
309 3,
310 "required key count is 3 \
311 (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-007)",
312 );
313 assert!(
314 AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.contains(&"cuda_feature"),
315 "`cuda_feature` is a required key",
316 );
317 assert!(
318 AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.contains(&"cuda_runtime_available"),
319 "`cuda_runtime_available` is a required key",
320 );
321 assert!(
322 AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.contains(&"visible_devices"),
323 "`visible_devices` is a required key",
324 );
325 }
326}