Skip to main content

entrenar/train/
gputrain_003.rs

1//! FALSIFY-GPUTRAIN-003 / INV-GPUTRAIN-003 / GATE-GPUTRAIN-003 — algorithm-level
2//! PARTIAL discharge.
3//!
4//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
5//! (task #132 CUDA training backend gap).
6//!
7//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
8//! adds `GATE-GPUTRAIN-003` PARTIAL-discharge evidence binding the pure
9//! decision rule for the `nvidia-smi` residency probe.
10//!
11//! INV-GPUTRAIN-003 states: "When the resolved backend is CUDA, a GPU process
12//! MUST be visible via `nvidia-smi --query-compute-apps`. A post-init probe
13//! runs within 5 seconds of step 0 and asserts `pid == training_pid AND
14//! used_memory_mib > 0`." This file discharges the *decision rule* at
15//! `PARTIAL_ALGORITHM_LEVEL`:
16//!
17//!   1. `parse_nvidia_smi_compute_apps(output) -> Result<Vec<…>, ()>`
18//!      — strict parser for `nvidia-smi --query-compute-apps=pid,used_memory
19//!      --format=csv,noheader,nounits` output. Each line is `<pid>, <used_mib>`
20//!      (comma-space). Both fields must be non-negative integers; empty lines
21//!      are skipped; any parse failure is a conservative `Err(())`.
22//!
23//!   2. `verdict_from_residency(training_pid, apps) -> Gputrain003Verdict`
24//!      — aggregate rule: `Pass` iff any `app.pid == training_pid AND
25//!      app.used_memory_mib >= AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB`. An empty
26//!      slice is `Fail` (no active processes ⇒ no GPU residency proof).
27//!
28//! The compute-heavy portion of the AC (`spawn nvidia-smi` + 5-second poll
29//! window + capture stdout) is intentionally out of scope here; the threshold
30//! rule is what the live probe must call, and changing either constant
31//! (the 5-second poll window, the 1-MiB floor) breaks this test before any
32//! subprocess is launched.
33
34/// Maximum seconds between step 0 and the first `nvidia-smi` probe call.
35/// Pinned to the INV-GPUTRAIN-003 rule text so that any drift in the
36/// probe-window timing (e.g. tightening to 1 s or relaxing to 30 s) must
37/// move this constant in lockstep.
38pub const AC_GPUTRAIN_003_NVIDIA_SMI_POLL_WINDOW_SECONDS: u32 = 5;
39
40/// Minimum `used_memory_mib` that counts as "GPU actually allocated
41/// something on behalf of this pid." Zero mem means the pid registered
42/// with the driver but never cudaMalloc'd — the exact symptom task #126
43/// caught.
44pub const AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB: u64 = 1;
45
46/// A single row from `nvidia-smi --query-compute-apps=pid,used_memory
47/// --format=csv,noheader,nounits`.
48#[derive(Debug, Clone, Copy, PartialEq, Eq)]
49pub struct NvidiaSmiComputeApp {
50    /// Process ID the CUDA driver reports as owning the allocation.
51    pub pid: u32,
52    /// MiB of GPU memory currently resident for that pid.
53    pub used_memory_mib: u64,
54}
55
56/// Binary verdict for FALSIFY-GPUTRAIN-003 / GATE-GPUTRAIN-003.
57#[derive(Debug, Clone, Copy, PartialEq, Eq)]
58pub enum Gputrain003Verdict {
59    /// Residency proof held: some app in the probe output matches our pid
60    /// AND is holding at least the minimum amount of GPU memory.
61    Pass,
62    /// Residency proof failed: no match, zero-mem match, or empty output.
63    Fail,
64}
65
66/// Strict parser for `nvidia-smi --query-compute-apps=pid,used_memory
67/// --format=csv,noheader,nounits` output.
68///
69/// Format (documented by NVIDIA): one line per compute app, `<pid>, <mib>`
70/// with a literal comma-space separator and no header row. Blank lines
71/// between records are tolerated (some `nvidia-smi` versions emit a
72/// trailing newline).
73///
74/// # Errors
75///
76/// Returns `Err(())` on any line that is not `<digits>, <digits>` —
77/// wrong separator, non-digit chars, missing field, or integer overflow.
78/// The error is intentionally empty-payload so callers treat it as a
79/// single opaque "parse failed" signal and conservatively map to Fail.
80#[allow(clippy::result_unit_err)]
81pub fn parse_nvidia_smi_compute_apps(output: &str) -> Result<Vec<NvidiaSmiComputeApp>, ()> {
82    let mut apps = Vec::new();
83    for line in output.lines() {
84        if line.is_empty() {
85            continue;
86        }
87        // Split on the exact ", " separator — any other whitespace is
88        // rejected because nvidia-smi emits comma-space canonically.
89        let (pid_s, mem_s) = line.split_once(", ").ok_or(())?;
90        let pid: u32 = pid_s.parse().map_err(|_| ())?;
91        let used_memory_mib: u64 = mem_s.parse().map_err(|_| ())?;
92        apps.push(NvidiaSmiComputeApp { pid, used_memory_mib });
93    }
94    Ok(apps)
95}
96
97/// Algorithm-level verdict rule for INV-GPUTRAIN-003: given our training
98/// pid and the parsed nvidia-smi compute-app list, Pass iff some app has
99/// `app.pid == training_pid AND app.used_memory_mib >=
100/// AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB`.
101///
102/// Returns [`Gputrain003Verdict::Fail`] conservatively for every negative
103/// case: empty slice, no matching pid, or matching pid with zero memory.
104#[must_use]
105pub fn verdict_from_residency(
106    training_pid: u32,
107    apps: &[NvidiaSmiComputeApp],
108) -> Gputrain003Verdict {
109    for app in apps {
110        if app.pid == training_pid && app.used_memory_mib >= AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB {
111            return Gputrain003Verdict::Pass;
112        }
113    }
114    Gputrain003Verdict::Fail
115}
116
117// ─────────────────────────────────────────────────────────────
118// Unit tests — FALSIFY-GPUTRAIN-003 algorithm-level proof
119// ─────────────────────────────────────────────────────────────
120
121#[cfg(test)]
122mod tests {
123    use super::*;
124
125    /// FALSIFY-GPUTRAIN-003 algorithm-level PARTIAL discharge: prove the
126    /// residency-probe parse + aggregate rule binding `nvidia-smi` output
127    /// to the training pid. Any mutation that relaxes the zero-memory
128    /// rejection, accepts a non-matching pid, or silently treats empty
129    /// input as Pass must break this test before any subprocess is
130    /// launched.
131    #[test]
132    fn falsify_gputrain_003_residency_proof_logic() {
133        // Section 1: happy path — our pid, 5000 MiB allocated. Baseline
134        // Pass case. Catches any mutation that swaps comparison operators
135        // (== vs !=, >= vs <) because any such flip breaks this.
136        let training_pid = 12345;
137        let output_happy = "12345, 5000\n";
138        let apps = parse_nvidia_smi_compute_apps(output_happy).expect("canonical line parses");
139        assert_eq!(apps, vec![NvidiaSmiComputeApp { pid: 12345, used_memory_mib: 5000 }],);
140        assert_eq!(
141            verdict_from_residency(training_pid, &apps),
142            Gputrain003Verdict::Pass,
143            "matching pid with non-zero mem must Pass",
144        );
145
146        // Section 2: our pid but zero memory. Exactly the task #126
147        // defect: process registered with the CUDA driver but never
148        // cudaMalloc'd a weight tensor. Must Fail — otherwise a silent
149        // CPU training run would pass the residency gate.
150        let output_zero = "12345, 0\n";
151        let apps = parse_nvidia_smi_compute_apps(output_zero).expect("zero-mem line parses");
152        assert_eq!(
153            verdict_from_residency(training_pid, &apps),
154            Gputrain003Verdict::Fail,
155            "matching pid with 0 MiB must Fail (GPU allocated nothing)",
156        );
157
158        // Section 3: different pid owns all the memory. Catches the
159        // "wrong pid captured" class where the probe read pid from a
160        // stale env var or a different child process. Must Fail.
161        let output_other = "99999, 5000\n";
162        let apps = parse_nvidia_smi_compute_apps(output_other).expect("other-pid line parses");
163        assert_eq!(
164            verdict_from_residency(training_pid, &apps),
165            Gputrain003Verdict::Fail,
166            "non-matching pid must Fail even if it holds lots of memory",
167        );
168
169        // Section 4: empty output. Catches the CUDA-present-but-no-
170        // active-compute-apps case where nvidia-smi returns an empty
171        // record set. Must Fail — we need POSITIVE proof that our
172        // process is resident, not absence of negative evidence.
173        let apps_empty: Vec<NvidiaSmiComputeApp> =
174            parse_nvidia_smi_compute_apps("").expect("empty input parses as zero-length slice");
175        assert!(apps_empty.is_empty());
176        assert_eq!(
177            verdict_from_residency(training_pid, &apps_empty),
178            Gputrain003Verdict::Fail,
179            "empty compute-app list must Fail",
180        );
181
182        // Section 5: multi-process output. Catches the case where our
183        // training pid is one of many on a shared GPU host (e.g.
184        // jupyter kernel + `apr pretrain` + `nvidia-settings` probe).
185        // Must Pass because our row has non-zero mem; must NOT
186        // short-circuit on the first row if that row is a sibling
187        // process. Tests both orderings.
188        let output_multi_ours_first = "12345, 2000\n99999, 1500\n";
189        let apps =
190            parse_nvidia_smi_compute_apps(output_multi_ours_first).expect("two-line output parses");
191        assert_eq!(apps.len(), 2);
192        assert_eq!(
193            verdict_from_residency(training_pid, &apps),
194            Gputrain003Verdict::Pass,
195            "multi-process output with our pid first must Pass",
196        );
197        let output_multi_ours_last = "99999, 1500\n12345, 2000\n";
198        let apps =
199            parse_nvidia_smi_compute_apps(output_multi_ours_last).expect("two-line output parses");
200        assert_eq!(
201            verdict_from_residency(training_pid, &apps),
202            Gputrain003Verdict::Pass,
203            "multi-process output with our pid last must Pass \
204             (loop must not short-circuit on non-matching rows)",
205        );
206
207        // Section 6: malformed lines. Each of these is a parse failure
208        // that must surface as Err(()) at the parser boundary. On Err
209        // the caller conservatively treats apps as empty, which then
210        // falls through to verdict Fail per Section 4.
211        let no_comma = "12345 5000\n";
212        assert_eq!(parse_nvidia_smi_compute_apps(no_comma), Err(()));
213        let wrong_separator = "12345,5000\n"; // no space after comma
214        assert_eq!(parse_nvidia_smi_compute_apps(wrong_separator), Err(()));
215        let extra_whitespace = "12345,  5000\n"; // two spaces
216        assert_eq!(parse_nvidia_smi_compute_apps(extra_whitespace), Err(()));
217        let non_digit_pid = "abc, 5000\n";
218        assert_eq!(parse_nvidia_smi_compute_apps(non_digit_pid), Err(()));
219        let non_digit_mem = "12345, xyz\n";
220        assert_eq!(parse_nvidia_smi_compute_apps(non_digit_mem), Err(()));
221        let missing_field = "12345,\n"; // no mem after comma
222        assert_eq!(parse_nvidia_smi_compute_apps(missing_field), Err(()));
223        // Conservative fallback: on parse Err, verdict of empty slice is Fail.
224        assert_eq!(
225            verdict_from_residency(training_pid, &[]),
226            Gputrain003Verdict::Fail,
227            "conservative Fail when parse errored and caller passed empty slice",
228        );
229
230        // Section 7: u32::MAX / u64::MAX boundary sanity. Catches any
231        // mutation that replaces unsigned comparison with a signed-cast
232        // helper or that overflows on extreme inputs. MAX memory must
233        // Pass; zero memory at MAX pid must Fail.
234        let max_pid_max_mem =
235            vec![NvidiaSmiComputeApp { pid: u32::MAX, used_memory_mib: u64::MAX }];
236        assert_eq!(
237            verdict_from_residency(u32::MAX, &max_pid_max_mem),
238            Gputrain003Verdict::Pass,
239            "u32::MAX pid + u64::MAX mem must Pass",
240        );
241        let max_pid_zero_mem = vec![NvidiaSmiComputeApp { pid: u32::MAX, used_memory_mib: 0 }];
242        assert_eq!(
243            verdict_from_residency(u32::MAX, &max_pid_zero_mem),
244            Gputrain003Verdict::Fail,
245            "u32::MAX pid + 0 MiB must Fail (zero-mem rule is exceptionless)",
246        );
247
248        // Provenance pin — both constants are load-bearing and
249        // lockstep with the YAML contract. Any spec drift (tightening
250        // the 5-s window or relaxing the 1-MiB floor) must move these
251        // constants together with the YAML rule text.
252        assert_eq!(
253            AC_GPUTRAIN_003_NVIDIA_SMI_POLL_WINDOW_SECONDS, 5,
254            "INV-GPUTRAIN-003 poll window is 5 seconds \
255             (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-003)",
256        );
257        assert_eq!(
258            AC_GPUTRAIN_003_MIN_USED_MEMORY_MIB, 1,
259            "INV-GPUTRAIN-003 min-mem floor is 1 MiB \
260             (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-003)",
261        );
262    }
263}