entrenar/train/gputrain_003.rs
1//! FALSIFY-GPUTRAIN-003 / INV-GPUTRAIN-003 / GATE-GPUTRAIN-003 — algorithm-level
2//! PARTIAL discharge.
3//!
4//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
5//! (task #132 CUDA training backend gap).
6//!
7//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
8//! adds `GATE-GPUTRAIN-003` PARTIAL-discharge evidence binding the pure
9//! decision rule for the `nvidia-smi` residency probe.
10//!
11//! INV-GPUTRAIN-003 states: "When the resolved backend is CUDA, a GPU process
12//! MUST be visible via `nvidia-smi --query-compute-apps`. A post-init probe
13//! runs within 5 seconds of step 0 and asserts `pid == training_pid AND
14//! used_memory_mib > 0`." This file discharges the *decision rule* at
15//! `PARTIAL_ALGORITHM_LEVEL`:
16//!
17//! 1. `parse_nvidia_smi_compute_apps(output) -> Result<Vec<…>, ()>`
18//! — strict parser for `nvidia-smi --query-compute-apps=pid,used_memory
19//! --format=csv,noheader,nounits` output. Each line is `<pid>, <used_mib>`
20//! (comma-space). Both fields must be non-negative integers; empty lines
21//! are skipped; any parse failure is a conservative `Err(())`.
22//!
23//! 2. `verdict_from_residency(training_pid, apps) -> Gputrain003Verdict`
24//! — aggregate rule: `Pass` iff any `app.pid == training_pid AND
25//! app.used_memory_mib >= AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB`. An empty
26//! slice is `Fail` (no active processes ⇒ no GPU residency proof).
27//!
28//! The compute-heavy portion of the AC (`spawn nvidia-smi` + 5-second poll
29//! window + capture stdout) is intentionally out of scope here; the threshold
30//! rule is what the live probe must call, and changing either constant
31//! (the 5-second poll window, the 1-MiB floor) breaks this test before any
32//! subprocess is launched.
33
34/// Maximum seconds between step 0 and the first `nvidia-smi` probe call.
35/// Pinned to the INV-GPUTRAIN-003 rule text so that any drift in the
36/// probe-window timing (e.g. tightening to 1 s or relaxing to 30 s) must
37/// move this constant in lockstep.
38pub const AC_GPUTRAIN_003_NVIDIA_SMI_POLL_WINDOW_SECONDS: u32 = 5;
39
40/// Minimum `used_memory_mib` that counts as "GPU actually allocated
41/// something on behalf of this pid." Zero mem means the pid registered
42/// with the driver but never cudaMalloc'd — the exact symptom task #126
43/// caught.
44pub const AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB: u64 = 1;
45
46/// A single row from `nvidia-smi --query-compute-apps=pid,used_memory
47/// --format=csv,noheader,nounits`.
48#[derive(Debug, Clone, Copy, PartialEq, Eq)]
49pub struct NvidiaSmiComputeApp {
50 /// Process ID the CUDA driver reports as owning the allocation.
51 pub pid: u32,
52 /// MiB of GPU memory currently resident for that pid.
53 pub used_memory_mib: u64,
54}
55
56/// Binary verdict for FALSIFY-GPUTRAIN-003 / GATE-GPUTRAIN-003.
57#[derive(Debug, Clone, Copy, PartialEq, Eq)]
58pub enum Gputrain003Verdict {
59 /// Residency proof held: some app in the probe output matches our pid
60 /// AND is holding at least the minimum amount of GPU memory.
61 Pass,
62 /// Residency proof failed: no match, zero-mem match, or empty output.
63 Fail,
64}
65
66/// Strict parser for `nvidia-smi --query-compute-apps=pid,used_memory
67/// --format=csv,noheader,nounits` output.
68///
69/// Format (documented by NVIDIA): one line per compute app, `<pid>, <mib>`
70/// with a literal comma-space separator and no header row. Blank lines
71/// between records are tolerated (some `nvidia-smi` versions emit a
72/// trailing newline).
73///
74/// # Errors
75///
76/// Returns `Err(())` on any line that is not `<digits>, <digits>` —
77/// wrong separator, non-digit chars, missing field, or integer overflow.
78/// The error is intentionally empty-payload so callers treat it as a
79/// single opaque "parse failed" signal and conservatively map to Fail.
80#[allow(clippy::result_unit_err)]
81pub fn parse_nvidia_smi_compute_apps(output: &str) -> Result<Vec<NvidiaSmiComputeApp>, ()> {
82 let mut apps = Vec::new();
83 for line in output.lines() {
84 if line.is_empty() {
85 continue;
86 }
87 // Split on the exact ", " separator — any other whitespace is
88 // rejected because nvidia-smi emits comma-space canonically.
89 let (pid_s, mem_s) = line.split_once(", ").ok_or(())?;
90 let pid: u32 = pid_s.parse().map_err(|_| ())?;
91 let used_memory_mib: u64 = mem_s.parse().map_err(|_| ())?;
92 apps.push(NvidiaSmiComputeApp { pid, used_memory_mib });
93 }
94 Ok(apps)
95}
96
97/// Algorithm-level verdict rule for INV-GPUTRAIN-003: given our training
98/// pid and the parsed nvidia-smi compute-app list, Pass iff some app has
99/// `app.pid == training_pid AND app.used_memory_mib >=
100/// AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB`.
101///
102/// Returns [`Gputrain003Verdict::Fail`] conservatively for every negative
103/// case: empty slice, no matching pid, or matching pid with zero memory.
104#[must_use]
105pub fn verdict_from_residency(
106 training_pid: u32,
107 apps: &[NvidiaSmiComputeApp],
108) -> Gputrain003Verdict {
109 for app in apps {
110 if app.pid == training_pid && app.used_memory_mib >= AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB {
111 return Gputrain003Verdict::Pass;
112 }
113 }
114 Gputrain003Verdict::Fail
115}
116
117// ─────────────────────────────────────────────────────────────
118// Unit tests — FALSIFY-GPUTRAIN-003 algorithm-level proof
119// ─────────────────────────────────────────────────────────────
120
121#[cfg(test)]
122mod tests {
123 use super::*;
124
125 /// FALSIFY-GPUTRAIN-003 algorithm-level PARTIAL discharge: prove the
126 /// residency-probe parse + aggregate rule binding `nvidia-smi` output
127 /// to the training pid. Any mutation that relaxes the zero-memory
128 /// rejection, accepts a non-matching pid, or silently treats empty
129 /// input as Pass must break this test before any subprocess is
130 /// launched.
131 #[test]
132 fn falsify_gputrain_003_residency_proof_logic() {
133 // Section 1: happy path — our pid, 5000 MiB allocated. Baseline
134 // Pass case. Catches any mutation that swaps comparison operators
135 // (== vs !=, >= vs <) because any such flip breaks this.
136 let training_pid = 12345;
137 let output_happy = "12345, 5000\n";
138 let apps = parse_nvidia_smi_compute_apps(output_happy).expect("canonical line parses");
139 assert_eq!(apps, vec![NvidiaSmiComputeApp { pid: 12345, used_memory_mib: 5000 }],);
140 assert_eq!(
141 verdict_from_residency(training_pid, &apps),
142 Gputrain003Verdict::Pass,
143 "matching pid with non-zero mem must Pass",
144 );
145
146 // Section 2: our pid but zero memory. Exactly the task #126
147 // defect: process registered with the CUDA driver but never
148 // cudaMalloc'd a weight tensor. Must Fail — otherwise a silent
149 // CPU training run would pass the residency gate.
150 let output_zero = "12345, 0\n";
151 let apps = parse_nvidia_smi_compute_apps(output_zero).expect("zero-mem line parses");
152 assert_eq!(
153 verdict_from_residency(training_pid, &apps),
154 Gputrain003Verdict::Fail,
155 "matching pid with 0 MiB must Fail (GPU allocated nothing)",
156 );
157
158 // Section 3: different pid owns all the memory. Catches the
159 // "wrong pid captured" class where the probe read pid from a
160 // stale env var or a different child process. Must Fail.
161 let output_other = "99999, 5000\n";
162 let apps = parse_nvidia_smi_compute_apps(output_other).expect("other-pid line parses");
163 assert_eq!(
164 verdict_from_residency(training_pid, &apps),
165 Gputrain003Verdict::Fail,
166 "non-matching pid must Fail even if it holds lots of memory",
167 );
168
169 // Section 4: empty output. Catches the CUDA-present-but-no-
170 // active-compute-apps case where nvidia-smi returns an empty
171 // record set. Must Fail — we need POSITIVE proof that our
172 // process is resident, not absence of negative evidence.
173 let apps_empty: Vec<NvidiaSmiComputeApp> =
174 parse_nvidia_smi_compute_apps("").expect("empty input parses as zero-length slice");
175 assert!(apps_empty.is_empty());
176 assert_eq!(
177 verdict_from_residency(training_pid, &apps_empty),
178 Gputrain003Verdict::Fail,
179 "empty compute-app list must Fail",
180 );
181
182 // Section 5: multi-process output. Catches the case where our
183 // training pid is one of many on a shared GPU host (e.g.
184 // jupyter kernel + `apr pretrain` + `nvidia-settings` probe).
185 // Must Pass because our row has non-zero mem; must NOT
186 // short-circuit on the first row if that row is a sibling
187 // process. Tests both orderings.
188 let output_multi_ours_first = "12345, 2000\n99999, 1500\n";
189 let apps =
190 parse_nvidia_smi_compute_apps(output_multi_ours_first).expect("two-line output parses");
191 assert_eq!(apps.len(), 2);
192 assert_eq!(
193 verdict_from_residency(training_pid, &apps),
194 Gputrain003Verdict::Pass,
195 "multi-process output with our pid first must Pass",
196 );
197 let output_multi_ours_last = "99999, 1500\n12345, 2000\n";
198 let apps =
199 parse_nvidia_smi_compute_apps(output_multi_ours_last).expect("two-line output parses");
200 assert_eq!(
201 verdict_from_residency(training_pid, &apps),
202 Gputrain003Verdict::Pass,
203 "multi-process output with our pid last must Pass \
204 (loop must not short-circuit on non-matching rows)",
205 );
206
207 // Section 6: malformed lines. Each of these is a parse failure
208 // that must surface as Err(()) at the parser boundary. On Err
209 // the caller conservatively treats apps as empty, which then
210 // falls through to verdict Fail per Section 4.
211 let no_comma = "12345 5000\n";
212 assert_eq!(parse_nvidia_smi_compute_apps(no_comma), Err(()));
213 let wrong_separator = "12345,5000\n"; // no space after comma
214 assert_eq!(parse_nvidia_smi_compute_apps(wrong_separator), Err(()));
215 let extra_whitespace = "12345, 5000\n"; // two spaces
216 assert_eq!(parse_nvidia_smi_compute_apps(extra_whitespace), Err(()));
217 let non_digit_pid = "abc, 5000\n";
218 assert_eq!(parse_nvidia_smi_compute_apps(non_digit_pid), Err(()));
219 let non_digit_mem = "12345, xyz\n";
220 assert_eq!(parse_nvidia_smi_compute_apps(non_digit_mem), Err(()));
221 let missing_field = "12345,\n"; // no mem after comma
222 assert_eq!(parse_nvidia_smi_compute_apps(missing_field), Err(()));
223 // Conservative fallback: on parse Err, verdict of empty slice is Fail.
224 assert_eq!(
225 verdict_from_residency(training_pid, &[]),
226 Gputrain003Verdict::Fail,
227 "conservative Fail when parse errored and caller passed empty slice",
228 );
229
230 // Section 7: u32::MAX / u64::MAX boundary sanity. Catches any
231 // mutation that replaces unsigned comparison with a signed-cast
232 // helper or that overflows on extreme inputs. MAX memory must
233 // Pass; zero memory at MAX pid must Fail.
234 let max_pid_max_mem =
235 vec![NvidiaSmiComputeApp { pid: u32::MAX, used_memory_mib: u64::MAX }];
236 assert_eq!(
237 verdict_from_residency(u32::MAX, &max_pid_max_mem),
238 Gputrain003Verdict::Pass,
239 "u32::MAX pid + u64::MAX mem must Pass",
240 );
241 let max_pid_zero_mem = vec![NvidiaSmiComputeApp { pid: u32::MAX, used_memory_mib: 0 }];
242 assert_eq!(
243 verdict_from_residency(u32::MAX, &max_pid_zero_mem),
244 Gputrain003Verdict::Fail,
245 "u32::MAX pid + 0 MiB must Fail (zero-mem rule is exceptionless)",
246 );
247
248 // Provenance pin — both constants are load-bearing and
249 // lockstep with the YAML contract. Any spec drift (tightening
250 // the 5-s window or relaxing the 1-MiB floor) must move these
251 // constants together with the YAML rule text.
252 assert_eq!(
253 AC_GPUTRAIN_003_NVIDIA_SMI_POLL_WINDOW_SECONDS, 5,
254 "INV-GPUTRAIN-003 poll window is 5 seconds \
255 (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-003)",
256 );
257 assert_eq!(
258 AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB, 1,
259 "INV-GPUTRAIN-003 min-mem floor is 1 MiB \
260 (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-003)",
261 );
262 }
263}