Skip to main content

entrenar/train/
gputrain_007.rs

1//! FALSIFY-GPUTRAIN-007 / INV-GPUTRAIN-007 / GATE-GPUTRAIN-006 — algorithm-level
2//! PARTIAL discharge.
3//!
4//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
5//! (task #132 CUDA training backend gap).
6//!
7//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
8//! binds INV-GPUTRAIN-007 at PARTIAL_ALGORITHM_LEVEL via two pure functions:
9//!
10//!   1. `verdict_from_version_json_keys(present_keys) -> Gputrain007Verdict`
11//!      — schema gate: Pass iff every key in
12//!      `AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS` appears in the
13//!      presented-keys slice (extra unknown keys are tolerated; missing
14//!      required keys are not).
15//!
16//!   2. `verdict_from_version_json_fields(&VersionJsonCudaFields) -> Verdict`
17//!      — field-shape gate: Pass iff `visible_devices.len() <= 16` (matches
18//!      INV-GPUTRAIN-001's `cuda:0..cuda:15` grammar, 16 max) AND NOT
19//!      `(cuda_feature && !cuda_runtime_available)` (claiming CUDA was
20//!      compiled in while the runtime is absent is exactly the
21//!      FM-GPUTRAIN-STALE-BUILD failure mode — the binary must either not
22//!      advertise the feature or confirm the runtime).
23//!
24//! INV-GPUTRAIN-007 states: "`apr --version --json` reports
25//! `{cuda_feature, cuda_runtime_available, visible_devices[]}`." Operators
26//! must be able to tell "compiled without cuda" apart from "compiled with
27//! cuda but no GPU visible" without reading a stack trace.
28//!
29//! The compute-heavy portion (spawning `apr --version --json` as a
30//! subprocess and parsing its stdout) is intentionally out of scope here;
31//! the schema+shape rules are what the live gate calls after the JSON has
32//! been deserialized, so this binding is JSON-library agnostic (we take a
33//! `&[&str]` of keys, not a raw string).
34
35/// Required top-level keys in `apr --version --json` output. Order is
36/// irrelevant; extra keys (e.g. future `tensorrt_feature`) are tolerated,
37/// but these three are load-bearing for operator triage.
38pub const AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS: &[&str] =
39    &["cuda_feature", "cuda_runtime_available", "visible_devices"];
40
41/// Deserialized shape of the CUDA-related block of `apr --version --json`.
42/// Library-agnostic: caller is responsible for parsing JSON into this
43/// shape (serde_json, hand-rolled, whatever) and then running the field-
44/// shape gate.
45#[derive(Debug, Clone)]
46pub struct VersionJsonCudaFields {
47    /// Whether the binary was built with `--features cuda`.
48    pub cuda_feature: bool,
49    /// Whether, at startup, `cudaGetDeviceCount` returned a non-zero
50    /// device count AND the runtime didn't error.
51    pub cuda_runtime_available: bool,
52    /// Human-readable name/index pairs for each visible device, e.g.
53    /// `["0:RTX 4090", "1:RTX 4090"]`. Upper bound 16 matches
54    /// INV-GPUTRAIN-001 grammar's `:0..:15` range.
55    pub visible_devices: Vec<String>,
56}
57
58/// Binary verdict for FALSIFY-GPUTRAIN-007 / GATE-GPUTRAIN-006.
59#[derive(Debug, Clone, Copy, PartialEq, Eq)]
60pub enum Gputrain007Verdict {
61    /// JSON contains all required keys (schema gate) OR field values
62    /// pass every sanity rail (shape gate).
63    Pass,
64    /// A required key is missing, device-count exceeds the grammar
65    /// limit, or the (cuda_feature, cuda_runtime_available) pair is in
66    /// the advertised-but-missing inconsistent state.
67    Fail,
68}
69
70/// Schema gate: given the list of top-level keys actually present in
71/// `apr --version --json`, Pass iff every one of
72/// `AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS` is represented.
73/// Unknown / extra keys are silently tolerated (forward-compatible).
74#[must_use]
75pub fn verdict_from_version_json_keys(present_keys: &[&str]) -> Gputrain007Verdict {
76    for required in AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS {
77        if !present_keys.contains(required) {
78            return Gputrain007Verdict::Fail;
79        }
80    }
81    Gputrain007Verdict::Pass
82}
83
84/// Field-shape gate: given the parsed shape, Pass iff
85///   1. `visible_devices.len() <= 16` (matches INV-GPUTRAIN-001 grammar
86///      `cuda:0..cuda:15`, 16 max), and
87///   2. NOT `(cuda_feature && !cuda_runtime_available)` — the advertised-
88///      but-missing inconsistent state from FM-GPUTRAIN-STALE-BUILD.
89///
90/// The other three (cuda_feature, cuda_runtime_available) combinations
91/// are all operationally valid:
92///   - `(false, false)`: CPU-only build. Fine.
93///   - `(false, true)`: CUDA runtime present but build didn't enable it.
94///     Also fine; operator just needs a cuda-feature build to use it.
95///   - `(true, true)`: CUDA fully wired. Baseline.
96#[must_use]
97pub fn verdict_from_version_json_fields(fields: &VersionJsonCudaFields) -> Gputrain007Verdict {
98    // INV-GPUTRAIN-001 grammar allows cuda:0..cuda:15 — 16 indices max.
99    if fields.visible_devices.len() > 16 {
100        return Gputrain007Verdict::Fail;
101    }
102    // FM-GPUTRAIN-STALE-BUILD: advertised feature but missing runtime is
103    // the footgun that cost 14 minutes on lambda-labs. Fail-closed.
104    if fields.cuda_feature && !fields.cuda_runtime_available {
105        return Gputrain007Verdict::Fail;
106    }
107    Gputrain007Verdict::Pass
108}
109
110// ─────────────────────────────────────────────────────────────
111// Unit tests — FALSIFY-GPUTRAIN-007 algorithm-level proof
112// ─────────────────────────────────────────────────────────────
113
114#[cfg(test)]
115mod tests {
116    use super::*;
117
118    /// FALSIFY-GPUTRAIN-007 algorithm-level PARTIAL discharge: prove the
119    /// version-JSON schema + field-shape invariants. Any mutation that
120    /// allows a missing required key, silently accepts too many visible
121    /// devices, or lets the stale-build state Pass must break this test
122    /// before `apr --version --json` is ever shelled out.
123    #[test]
124    fn falsify_gputrain_007_version_json_schema_and_shape() {
125        // Section 1: all required keys present — baseline Pass for the
126        // schema gate. Mirrors the minimum JSON output on any shipping
127        // build (CUDA or not).
128        assert_eq!(
129            verdict_from_version_json_keys(&[
130                "cuda_feature",
131                "cuda_runtime_available",
132                "visible_devices",
133            ]),
134            Gputrain007Verdict::Pass,
135            "all three required keys present must Pass",
136        );
137        // Extra unknown keys (future-proof): tolerated per spec
138        // comment about forward compatibility.
139        assert_eq!(
140            verdict_from_version_json_keys(&[
141                "cuda_feature",
142                "cuda_runtime_available",
143                "visible_devices",
144                "version",
145                "sha",
146                "tensorrt_feature",
147            ]),
148            Gputrain007Verdict::Pass,
149            "extra unknown keys must not Fail (forward-compat)",
150        );
151
152        // Section 2: each required key missing in turn. Three separate
153        // sub-mutations of the output schema; each must Fail. Catches
154        // any mutation to the required-keys list (e.g. removing one by
155        // accident during refactor).
156        assert_eq!(
157            verdict_from_version_json_keys(&["cuda_runtime_available", "visible_devices",]),
158            Gputrain007Verdict::Fail,
159            "missing `cuda_feature` must Fail",
160        );
161        assert_eq!(
162            verdict_from_version_json_keys(&["cuda_feature", "visible_devices"]),
163            Gputrain007Verdict::Fail,
164            "missing `cuda_runtime_available` must Fail",
165        );
166        assert_eq!(
167            verdict_from_version_json_keys(&["cuda_feature", "cuda_runtime_available",]),
168            Gputrain007Verdict::Fail,
169            "missing `visible_devices` must Fail",
170        );
171        // All three missing (minimal empty JSON).
172        assert_eq!(
173            verdict_from_version_json_keys(&[]),
174            Gputrain007Verdict::Fail,
175            "empty present-keys slice must Fail",
176        );
177        // Only completely unrelated keys present.
178        assert_eq!(
179            verdict_from_version_json_keys(&["version", "sha"]),
180            Gputrain007Verdict::Fail,
181            "only unrelated keys present must Fail",
182        );
183
184        // Section 3: field-shape happy paths. Three of four
185        // (cuda_feature, cuda_runtime_available) combinations are all
186        // operationally valid.
187        //   (false, false) — pure CPU build.
188        //   (false, true)  — CUDA runtime present but build didn't
189        //                    enable; not a bug, just a rebuild needed.
190        //   (true, true)   — fully wired.
191        for (feat, runtime) in [(false, false), (false, true), (true, true)] {
192            let fields = VersionJsonCudaFields {
193                cuda_feature: feat,
194                cuda_runtime_available: runtime,
195                visible_devices: vec!["0:RTX 4090".to_string()],
196            };
197            assert_eq!(
198                verdict_from_version_json_fields(&fields),
199                Gputrain007Verdict::Pass,
200                "consistent (cuda_feature={feat}, runtime_available={runtime}) must Pass",
201            );
202        }
203        // Zero visible devices is Pass on a CPU-only build; grammar
204        // allows up to 16. Lower bound is 0.
205        let fields = VersionJsonCudaFields {
206            cuda_feature: false,
207            cuda_runtime_available: false,
208            visible_devices: vec![],
209        };
210        assert_eq!(
211            verdict_from_version_json_fields(&fields),
212            Gputrain007Verdict::Pass,
213            "empty visible_devices on CPU-only build must Pass",
214        );
215
216        // Section 4: claims-feature-without-runtime — the stale-build
217        // footgun. Must Fail. THIS IS THE NOVEL SANITY RAIL — if a
218        // binary advertises `cuda_feature: true` it must also prove
219        // `cuda_runtime_available: true`, or operators risk another
220        // task #126.
221        let stale = VersionJsonCudaFields {
222            cuda_feature: true,
223            cuda_runtime_available: false,
224            visible_devices: vec![],
225        };
226        assert_eq!(
227            verdict_from_version_json_fields(&stale),
228            Gputrain007Verdict::Fail,
229            "cuda_feature=true + runtime_available=false must Fail \
230             (FM-GPUTRAIN-STALE-BUILD: advertised but missing)",
231        );
232        // Same inconsistency even if visible_devices accidentally got
233        // populated (e.g. cached from a previous build).
234        let stale_with_stale_devices = VersionJsonCudaFields {
235            cuda_feature: true,
236            cuda_runtime_available: false,
237            visible_devices: vec!["0:RTX 4090".to_string()],
238        };
239        assert_eq!(
240            verdict_from_version_json_fields(&stale_with_stale_devices),
241            Gputrain007Verdict::Fail,
242            "advertised feature + missing runtime must Fail regardless of visible_devices",
243        );
244
245        // Section 5: too-many visible devices. INV-GPUTRAIN-001 grammar
246        // allows indices 0..=15 (16 max). Boundary: 16 Pass, 17 Fail.
247        let sixteen = VersionJsonCudaFields {
248            cuda_feature: true,
249            cuda_runtime_available: true,
250            visible_devices: (0..16).map(|i| format!("{i}:device")).collect(),
251        };
252        assert_eq!(
253            verdict_from_version_json_fields(&sixteen),
254            Gputrain007Verdict::Pass,
255            "exactly 16 visible devices must Pass (grammar max)",
256        );
257        let seventeen = VersionJsonCudaFields {
258            cuda_feature: true,
259            cuda_runtime_available: true,
260            visible_devices: (0..17).map(|i| format!("{i}:device")).collect(),
261        };
262        assert_eq!(
263            verdict_from_version_json_fields(&seventeen),
264            Gputrain007Verdict::Fail,
265            "17 visible devices must Fail (exceeds cuda:0..cuda:15 grammar)",
266        );
267        let many = VersionJsonCudaFields {
268            cuda_feature: true,
269            cuda_runtime_available: true,
270            visible_devices: (0..100).map(|i| format!("{i}:device")).collect(),
271        };
272        assert_eq!(
273            verdict_from_version_json_fields(&many),
274            Gputrain007Verdict::Fail,
275            "100 visible devices must Fail (well past grammar)",
276        );
277
278        // Section 6: combined happy-path shape — the three required
279        // keys present AND the field values consistent. Catches a
280        // refactor that split the two gates into separate codepaths
281        // and forgot to call one of them.
282        let happy_fields = VersionJsonCudaFields {
283            cuda_feature: true,
284            cuda_runtime_available: true,
285            visible_devices: vec!["0:RTX 4090".to_string(), "1:RTX 4090".to_string()],
286        };
287        assert_eq!(
288            verdict_from_version_json_fields(&happy_fields),
289            Gputrain007Verdict::Pass,
290            "consistent 2-device CUDA build must Pass field-shape gate",
291        );
292        assert_eq!(
293            verdict_from_version_json_keys(&[
294                "cuda_feature",
295                "cuda_runtime_available",
296                "visible_devices",
297            ]),
298            Gputrain007Verdict::Pass,
299            "matching 3-key schema must Pass schema gate",
300        );
301
302        // Section 7: provenance pin — the required-keys slice is
303        // load-bearing. If a future spec amendment adds a 4th key
304        // (e.g. `rocm_feature`), this slice and the YAML rule must
305        // move together. The byte-literal shape prevents accidental
306        // reordering from changing behaviour.
307        assert_eq!(
308            AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.len(),
309            3,
310            "required key count is 3 \
311             (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-007)",
312        );
313        assert!(
314            AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.contains(&"cuda_feature"),
315            "`cuda_feature` is a required key",
316        );
317        assert!(
318            AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.contains(&"cuda_runtime_available"),
319            "`cuda_runtime_available` is a required key",
320        );
321        assert!(
322            AC_GPUTRAIN_007_REQUIRED_VERSION_JSON_KEYS.contains(&"visible_devices"),
323            "`visible_devices` is a required key",
324        );
325    }
326}