entrenar/train/pretrain_real_cuda.rs
1//! CUDA-backend `StepFn` / `ValFn` / `CheckpointFn` for the 370M pretrain
2//! loop (task #132 Phase 2, contract `gpu-training-backend-v1`).
3//!
4//! Mirrors `pretrain_real.rs` but swaps `TransformerTrainer`
5//! (CPU + trueno SIMD) for `CudaTransformerTrainer` (GPU-resident
6//! AdamW + fused CE). The entire module is gated on
//! `#[cfg(feature = "cuda")]` because `CudaTransformerTrainer`'s
//! `train_batch` / `eval_batch` / `save_apr` methods only exist in the
9//! cuda build — the non-cuda stub returns an error from `new()` and
10//! exposes no step/eval/save methods.
11//!
12//! Contract obligations discharged / strengthened vs the CPU path:
13//! - INV-ARCH-370M-001 (param count ∈ [366M, 374M]) via `debug_assert`
14//! on `CudaTransformerTrainer::model().parameters()`, matching
15//! the CPU guard.
16//! - INV-TRAIN-007 (no NaN/Inf): `train_batch` / `eval_batch` return
17//! finite loss by construction; non-finite outputs abort via
18//! `PretrainLoop`'s guards.
19//! - INV-TRAIN-008 (grad_norm ≥ 0): `last_grad_norm()` returns the
20//! real LM-head L2 norm. Strictly stronger than the CPU path's
21//! `1.0` placeholder.
22//!
23//! Deferred to a follow-up:
24//! - INV-TRAIN-003 (AdamW-state sha256). `CudaTransformerTrainer`
25//! keeps (m, v, t) on the GPU; discharging this cleanly needs a
26//! D2H sync that `save_apr` already pays for but `StepFn` does
27//! not want to pay per-step. Until that sync is factored out,
28//! the trait default `optimizer_state_sha256 -> None` is used,
29//! and GATE-TRAIN-006 runs only on the CPU path.
30
31#![cfg(feature = "cuda")]
32
33use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
34use crate::train::pretrain_real::{
35 build_transformer_config, llama_370m_train_config, load_init_tensors_from_apr,
36 populate_trainer_from_init_tensors, validate_pretrain_init_arch_compatible,
37};
38use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch, TransformerTrainConfig};
39use crate::transformer::{Transformer, TransformerConfig};
40use std::cell::RefCell;
41use std::path::Path;
42use std::rc::Rc;
43
/// Shared mutable ownership of a GPU-resident trainer.
///
/// `CudaRealStepFn` (train steps), `CudaRealValFn` (eval) and
/// `CudaAprCheckpointFn` (checkpoint writes) each hold one of these
/// `Rc` handles, so the three hooks see the same GPU memory. Every
/// hook takes a `borrow_mut()` for the duration of a single call.
pub type SharedCudaTrainer = Rc<RefCell<CudaTransformerTrainer>>;
48
49/// Allocate a `CudaTransformerTrainer` with MODEL-2 v2-remedy defaults
50/// and verify INV-ARCH-370M-001 in debug builds.
51///
52/// Returns a `crate::Result` because `CudaTransformerTrainer::new`
53/// can fail on missing CUDA runtime, kernel pre-warm failure, or
54/// block upload failure — the CLI surfaces this as a
55/// GATE-GPUTRAIN-002 error so the operator knows to check their
56/// `--features cuda` build or their GPU.
57pub fn build_shared_cuda_trainer(
58 lr: f32,
59 seq_length: usize,
60 seed: u64,
61) -> crate::Result<SharedCudaTrainer> {
62 let cfg = llama_370m_train_config(lr, seq_length, seed);
63 let trainer = CudaTransformerTrainer::new(cfg)?;
64 #[cfg(debug_assertions)]
65 {
66 let param_count: usize = trainer.model().parameters().iter().map(|t| t.len()).sum();
67 debug_assert!(
68 (366_000_000..=374_000_000).contains(¶m_count),
69 "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
70 );
71 }
72 Ok(Rc::new(RefCell::new(trainer)))
73}
74
75/// Polymorphic CUDA trainer builder for `apr pretrain --init --device cuda`
76/// (§50.4 step 5f.5 — symmetric to the CPU `build_shared_trainer_with_init`).
77///
78/// Composes the same §50.4 step-5f machinery as the CPU path, but runs
79/// it against `CudaTransformerTrainer::with_model` so the populated
80/// init weights flow through GPU upload (transformer blocks via
81/// `upload_blocks`, final RMSNorm via `from_host`, lm_head /
82/// embed_tokens.weight via `from_host`):
83/// - 5c: `build_transformer_config(init_arch)` — polymorphic dispatch
84/// - 5f.1: `validate_pretrain_init_arch_compatible(init_arch)` — encoder rejection
85/// - 5f.2: `load_init_tensors_from_apr(path)` — read APR weights
86/// - 5f.3: `populate_trainer_from_init_tensors(transformer, &tensors)` — populate CPU model
87/// - 5f.5: `CudaTransformerTrainer::with_model(populated_model, train_cfg)` — GPU upload
88///
89/// Behaviour:
90/// init = None → identical to `build_shared_cuda_trainer` (Llama370M
91/// from-scratch baseline with INV-ARCH-370M-001 enforced).
92/// init = Some → builds a CUDA trainer whose GPU weights derive from
93/// the populated CPU model (the populated `Transformer`
94/// is moved into `with_model` which uploads its blocks /
95/// norm / lm_head to GPU). INV-ARCH-370M-001 is NOT
96/// enforced — arch is whatever the init APR has.
97///
98/// Spec: SPEC-SHIP-TWO-001 §52.4 (CPU 5f.4 wireup) + §54-§56 (Qwen
99/// 5g.0/5g.1 prerequisites) + this §50.4 step 5f.5 (CUDA wireup).
100///
101/// # Errors
102///
103/// Returns Err when:
104/// - `init_arch.is_some() != init_path.is_some()` (caller bug — same
105/// diagnostic as the CPU path's `build_shared_trainer_with_init`).
106/// - `init_arch` is `Some` with `architecture = Encoder`
107/// (FALSIFY-APR-PRETRAIN-ARCH-007 / FALSIFY-APR-PRETRAIN-INIT-001).
108/// - `load_init_tensors_from_apr` fails (FALSIFY-APR-PRETRAIN-INIT-006).
109/// - `populate_trainer_from_init_tensors` fails (FALSIFY-APR-PRETRAIN-INIT-007).
110/// - `CudaTransformerTrainer::with_model` fails (CUDA init / kernel
111/// pre-warm / block upload — surfaces as GATE-GPUTRAIN-002).
112///
113/// # Caller Contract
114///
115/// The caller MUST have built the binary with `--features cuda`. This
116/// function is gated on `#[cfg(feature = "cuda")]` so a non-cuda build
117/// will not see this symbol; the apr-cli dispatch layer routes
118/// `--device cuda` to `drive_real_cuda` which calls this builder, and
119/// the non-cuda stub for `drive_real_cuda` already returns the
120/// rebuild-with-cuda error per `feedback_cuda_feature_footgun.md`.
121pub fn build_shared_cuda_trainer_with_init(
122 lr: f32,
123 seq_length: usize,
124 seed: u64,
125 init_arch: Option<&TransformerConfig>,
126 init_path: Option<&Path>,
127) -> crate::Result<SharedCudaTrainer> {
128 if init_arch.is_some() != init_path.is_some() {
129 return Err(crate::error::Error::ConfigError(format!(
130 "build_shared_cuda_trainer_with_init: init_arch and init_path must both be Some \
131 or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
132 init_arch.is_some(),
133 init_path.is_some()
134 )));
135 }
136
137 if let Some(cfg) = init_arch {
138 validate_pretrain_init_arch_compatible(cfg).map_err(crate::error::Error::ConfigError)?;
139 }
140
141 let model_cfg = build_transformer_config(init_arch);
142 let mut train_cfg = TransformerTrainConfig::new(model_cfg);
143 train_cfg.lr = lr;
144 train_cfg.max_seq_len = seq_length;
145 train_cfg.seed = seed;
146
147 // Build the CPU model first; populate init weights into it; then
148 // hand it to CudaTransformerTrainer::with_model which uploads the
149 // populated blocks, final RMSNorm, and lm_head/embed_tokens to GPU.
150 // This is the symmetric path to CPU's build_shared_trainer_with_init,
151 // exercising the SAME populate_trainer_from_init_tensors helper so
152 // the population semantics are identical between backends.
153 let mut transformer = Transformer::new(&train_cfg.model_config);
154
155 if let Some(path) = init_path {
156 let tensors = load_init_tensors_from_apr(path).map_err(crate::error::Error::ConfigError)?;
157 populate_trainer_from_init_tensors(&mut transformer, &tensors)
158 .map_err(crate::error::Error::ConfigError)?;
159 } else {
160 // From-scratch CUDA path with init=None: enforce the
161 // INV-ARCH-370M-001 param-count band. Mirrors the CPU
162 // `build_shared_trainer` invariant exactly.
163 #[cfg(debug_assertions)]
164 {
165 let param_count: usize = transformer.parameters().iter().map(|t| t.len()).sum();
166 debug_assert!(
167 (366_000_000..=374_000_000).contains(¶m_count),
168 "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band \
169 (from-scratch CUDA path with init=None)",
170 );
171 }
172 }
173
174 let trainer = CudaTransformerTrainer::with_model(transformer, train_cfg)?;
175 Ok(Rc::new(RefCell::new(trainer)))
176}
177
178/// CUDA `StepFn` — pulls one `LMBatch` from the shard iterator and
179/// runs a real GPU forward + backward + AdamW step.
180pub struct CudaRealStepFn {
181 trainer: SharedCudaTrainer,
182 batches: Box<dyn Iterator<Item = LMBatch>>,
183}
184
185impl CudaRealStepFn {
186 pub fn new(trainer: SharedCudaTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
187 Self { trainer, batches }
188 }
189}
190
191impl StepFn for CudaRealStepFn {
192 fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
193 // Exhausted shard stream: emit a finite placeholder so the
194 // NaN/Inf guard (INV-TRAIN-007) doesn't mis-fire and the
195 // divergence guard (GATE-TRAIN-005) correctly does not abort.
196 let Some(batch) = self.batches.next() else {
197 return (1.0, 1.0);
198 };
199 let mut trainer = self.trainer.borrow_mut();
200 let loss = trainer.train_batch(&batch);
201 // Real LM-head L2 norm — strictly more informative than the
202 // CPU path's `1.0` placeholder for GATE-TRAIN-008 monitoring.
203 let grad_norm = trainer.last_grad_norm();
204 (loss, grad_norm)
205 }
206
207 // INV-TRAIN-003 intentionally deferred for the GPU path — see
208 // module docs. Uses trait default `-> None`, so the CPU gate
209 // (`--device cpu`) is the one that exercises AdamW-state parity.
210}
211
212/// CUDA `ValFn` — forward-only eval across pre-loaded held-out
213/// batches. Uses `eval_batch` (fused GPU cross-entropy, no logits
214/// D2H) and averages across batches.
215pub struct CudaRealValFn {
216 trainer: SharedCudaTrainer,
217 held_out: Vec<LMBatch>,
218}
219
220impl CudaRealValFn {
221 pub fn new(trainer: SharedCudaTrainer, held_out: Vec<LMBatch>) -> Self {
222 Self { trainer, held_out }
223 }
224}
225
226impl ValFn for CudaRealValFn {
227 fn validate(&mut self, _epoch: usize) -> f32 {
228 if self.held_out.is_empty() {
229 return f32::NAN;
230 }
231 let mut trainer = self.trainer.borrow_mut();
232 let mut total_loss = 0.0_f32;
233 let mut count = 0_usize;
234 for batch in &self.held_out {
235 if batch.batch_size == 0 {
236 continue;
237 }
238 total_loss += trainer.eval_batch(batch);
239 count += 1;
240 }
241 if count == 0 {
242 f32::NAN
243 } else {
244 total_loss / count as f32
245 }
246 }
247}
248
249/// CUDA `CheckpointFn` — writes the 370M weights to
250/// `artifact.checkpoint_path` in APR format. `save_apr` takes
251/// `&mut self` on the CUDA path because it syncs GPU→CPU before
252/// writing, which is why this holds the `SharedCudaTrainer` instead
253/// of cloning the trainer out.
254pub struct CudaAprCheckpointFn {
255 trainer: SharedCudaTrainer,
256 model_name: String,
257 architecture: String,
258}
259
260impl CudaAprCheckpointFn {
261 pub fn new(
262 trainer: SharedCudaTrainer,
263 model_name: impl Into<String>,
264 architecture: impl Into<String>,
265 ) -> Self {
266 Self { trainer, model_name: model_name.into(), architecture: architecture.into() }
267 }
268}
269
270impl CheckpointFn for CudaAprCheckpointFn {
271 fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
272 let mut trainer = self.trainer.borrow_mut();
273 trainer
274 .save_apr(&artifact.checkpoint_path, &self.model_name, &self.architecture)
275 .map_err(|e| format!("save_apr (cuda) failed: {e}"))
276 }
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Init path used by the argument-validation tests. The builder is
    /// expected to reject the call before this path is ever opened.
    const MISSING_INIT_APR: &str = "/tmp/does-not-exist.apr";

    /// Build a CUDA trainer for `model_cfg` with default train config.
    ///
    /// Returns `None` (after logging a skip notice tagged with
    /// `test_name`) when `CudaTransformerTrainer::new` fails, so
    /// host-dependent falsifiers skip instead of panicking on machines
    /// without a usable CUDA device.
    fn tiny_cuda_trainer_or_skip(
        test_name: &str,
        model_cfg: &TransformerConfig,
    ) -> Option<CudaTransformerTrainer> {
        let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
        match CudaTransformerTrainer::new(train_cfg) {
            Ok(trainer) => Some(trainer),
            Err(e) => {
                eprintln!(
                    "[{test_name}] skipping: \
                     CudaTransformerTrainer::new failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                None
            }
        }
    }

    /// Advance the 64-bit LCG in `state` and map its high 32 bits into
    /// `0..vocab_size`. Deterministic, so the synthetic batches are
    /// reproducible across runs.
    fn next_token(state: &mut u64, vocab_size: u32) -> u32 {
        *state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
        ((*state >> 32) as u32) % vocab_size
    }

    /// Draw `batch_size` sequences of `seq_len + 1` LCG tokens each
    /// (the extra token is the target shift) and pack them into an
    /// `LMBatch`. Tokens are drawn row by row, continuing from `state`.
    fn synthetic_batch(
        state: &mut u64,
        vocab_size: u32,
        batch_size: usize,
        seq_len: usize,
    ) -> LMBatch {
        let sequences: Vec<Vec<u32>> = (0..batch_size)
            .map(|_| (0..=seq_len).map(|_| next_token(state, vocab_size)).collect())
            .collect();
        LMBatch::from_sequences(&sequences, 0, 0)
    }

    /// Unwrap the message of an `Err(ConfigError(..))`, panicking with
    /// a description of what was actually returned otherwise. The
    /// trainer inside `Ok` is not `Debug`, so `unwrap_err()` is not
    /// available here.
    fn expect_config_error(result: crate::Result<SharedCudaTrainer>, case: &str) -> String {
        match result {
            Err(crate::error::Error::ConfigError(msg)) => msg,
            Err(other) => panic!("{case}: expected ConfigError, got: {other:?}"),
            Ok(_) => panic!("{case}: expected ConfigError, got Ok(trainer)"),
        }
    }

    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-002 (paired-args invariant):
    /// `build_shared_cuda_trainer_with_init` MUST reject the
    /// (Some, None) and (None, Some) caller-bug states identically
    /// to the CPU `build_shared_trainer_with_init`. The two fields
    /// are paired by construction — separately optional fields are
    /// a defect class because they let a caller pass an arch
    /// without weights (silent random-init at the GPU boundary) or
    /// weights without an arch (silently fall back to Llama370M).
    ///
    /// This test fires WITHOUT a CUDA device — the args check
    /// happens before any GPU allocation — so it runs on any host
    /// even when CUDA runtime is unavailable.
    #[test]
    fn build_shared_cuda_trainer_with_init_rejects_unpaired_args() {
        // Arch without path — Err. Use Qwen 0.5B as a concrete
        // non-Llama370M decoder config to prove the paired-args
        // gate fires before any architectural inspection.
        let cfg = TransformerConfig::qwen2_0_5b();
        let result_arch_only =
            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&cfg), None);
        assert!(result_arch_only.is_err(), "(Some(arch), None) MUST Err — caller-bug guard");

        // Path without arch — Err.
        let dummy = Path::new(MISSING_INIT_APR);
        let result_path_only =
            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, None, Some(dummy));
        assert!(result_path_only.is_err(), "(None, Some(path)) MUST Err — caller-bug guard");

        // Both Err messages name the function so callers can grep
        // back to the offending invocation.
        let err_arch = expect_config_error(result_arch_only, "(Some(arch), None)");
        let err_path = expect_config_error(result_path_only, "(None, Some(path))");
        assert!(
            err_arch.contains("build_shared_cuda_trainer_with_init"),
            "Err MUST name the function for grep-ability: {err_arch}"
        );
        assert!(
            err_path.contains("build_shared_cuda_trainer_with_init"),
            "Err MUST name the function for grep-ability: {err_path}"
        );
    }

    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-003 (encoder family rejection):
    /// passing an Encoder-architecture init config to
    /// `build_shared_cuda_trainer_with_init` MUST Err — same semantic
    /// as the CPU path's `validate_pretrain_init_arch_compatible`.
    /// This proves the symmetric builder threads the §50.4 step 5f.1
    /// encoder rejection through the CUDA backend.
    ///
    /// Fires WITHOUT a CUDA device — the encoder check happens
    /// before any GPU allocation.
    #[test]
    fn build_shared_cuda_trainer_with_init_rejects_encoder_family() {
        use crate::transformer::ModelArchitecture;

        let mut encoder_cfg = TransformerConfig::qwen2_0_5b();
        encoder_cfg.architecture = ModelArchitecture::Encoder;
        let dummy = Path::new(MISSING_INIT_APR);
        let result =
            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&encoder_cfg), Some(dummy));
        assert!(result.is_err(), "Encoder-family init MUST Err under §50.4 step 5f.1");
    }

    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 sanity bound):
    /// `CudaTransformerTrainer::eval_batch` on a fresh-init trainer
    /// (random weights) over a synthetic batch with random uniform
    /// tokens MUST return a loss in a sensible range.
    ///
    /// Theoretical expectation: a random-init model over
    /// uniformly-distributed targets produces average cross-entropy
    /// near `ln(vocab_size)` (e.g. `ln(1000) = 6.91`). The accepted
    /// band is `[0.5, 1.5 × ln(vocab_size)]` — note the lower bound is
    /// the absolute value 0.5, not a fraction of `ln(vocab_size)`.
    ///
    /// LIVE EVIDENCE motivating this test (this branch's parent):
    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
    /// recorded a 1500× train/eval discrepancy at the same model
    /// state (epoch 0: train_loss=1.20 vs val_loss=0.00081). The
    /// gap survived PR #1579's H2 (populate-coverage) fix, confirming
    /// H1 (eval_batch degenerate) is independent of H2.
    ///
    /// This test reproduces the bug at unit-test level: if H1 is
    /// real, eval_batch on a tiny random-init model returns ~0
    /// instead of ~ln(vocab_size). The test is gated on
    /// `--features cuda` so CI without that flag does not see it;
    /// `cargo test -p aprender-train --features cuda --lib
    /// falsify_eval_batch_h1_sanity_bound` reproduces.
    ///
    /// Spec: SPEC-SHIP-TWO-001 §60 (forthcoming) H1 root-cause cascade.
    #[test]
    fn falsify_eval_batch_h1_sanity_bound() {
        // Tiny model so the test runs in a few seconds on RTX 4090.
        let model_cfg = TransformerConfig::tiny();

        // Skip (rather than panic) if CUDA is unavailable on the host —
        // the falsifier is host-dependent.
        let Some(mut trainer) =
            tiny_cuda_trainer_or_skip("falsify_eval_batch_h1_sanity_bound", &model_cfg)
        else {
            return;
        };

        // Synthetic batch: 4 sequences × 16 tokens each, drawn from a
        // deterministic LCG so the test is reproducible.
        // `as u32`: token ids are `u32`; assumes the tiny vocab fits.
        let vocab_size = model_cfg.vocab_size as u32;
        let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
        let batch = synthetic_batch(&mut state, vocab_size, 4, 16);

        // Sanity band: [0.5, 1.5 × ln(vocab)]. If H1 is real,
        // eval_batch returns ~0 (degenerate).
        let loss = trainer.eval_batch(&batch);
        let ln_vocab = (vocab_size as f32).ln();
        let lower_bound = 0.5_f32;
        let upper_bound = 1.5_f32 * ln_vocab;

        // Check finiteness first: a NaN fails every comparison, and
        // would otherwise be reported as the H1 lower-bound defect.
        assert!(loss.is_finite(), "eval_batch returned non-finite loss = {loss}");
        assert!(
            loss >= lower_bound,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 lower bound): \
             eval_batch on random-init {}-vocab tiny model returned \
             loss = {loss}, expected ≥ {lower_bound} (random-init theoretical \
             ≈ ln({vocab_size}) = {ln_vocab:.3}). Loss < 0.5 indicates \
             eval pipeline is degenerate (cross-entropy collapsing to 0); \
             see evidence/section-60-5g-2-redispatch-2026-05-09/ for the \
             1500× train/eval discrepancy that motivated this falsifier.",
            vocab_size
        );
        assert!(
            loss <= upper_bound,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 upper bound): \
             eval_batch returned loss = {loss}, expected ≤ {upper_bound:.3} \
             (1.5 × ln(vocab)). Loss > upper_bound suggests numerical \
             explosion (NaN coercion or gradient overflow), a separate \
             defect class from the lower-bound H1.",
        );
    }

    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1 hypothesis A —
    /// train→eval state pollution): the val_loss anomaly observed in
    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
    /// fired at EPOCH 0 — i.e., AFTER 100 train_batch calls, not on
    /// a fresh trainer. This test exercises that ordering directly:
    /// eval_batch BEFORE training (loss_a, sanity), then train_batch,
    /// then eval_batch on the same evaluation batch (loss_b). The
    /// two losses should differ by AT MOST the effect of a single
    /// optimizer step on one mini-batch.
    ///
    /// If H1 hypothesis A (logits_buf state contamination) is real,
    /// loss_b will be much smaller than loss_a even though the model
    /// only changed by one optimizer step. The 1500× train/val
    /// discrepancy in §59/§60 evidence implies loss_b/loss_a ~ 1/1500.
    #[test]
    fn falsify_eval_batch_h1_train_pollution() {
        let model_cfg = TransformerConfig::tiny();
        let Some(mut trainer) =
            tiny_cuda_trainer_or_skip("falsify_eval_batch_h1_train_pollution", &model_cfg)
        else {
            return;
        };

        // `as u32`: token ids are `u32`; assumes the tiny vocab fits.
        let vocab_size = model_cfg.vocab_size as u32;
        let mut state: u64 = 0xCAFE_BABE_DEAD_BEEF;
        // Both batches come from one LCG stream (train first), so they
        // hold different tokens: 4 sequences × 16 tokens each.
        let train_batch_data = synthetic_batch(&mut state, vocab_size, 4, 16);
        let eval_batch_data = synthetic_batch(&mut state, vocab_size, 4, 16);

        // Phase 1: eval BEFORE any training — establishes baseline.
        let loss_a = trainer.eval_batch(&eval_batch_data);
        assert!(
            loss_a.is_finite() && loss_a >= 0.5,
            "Phase 1 baseline: eval before any train must be sensible \
             (got {loss_a}); test setup precondition failed before \
             we can probe H1A. See test 001 for the same lower bound."
        );

        // Phase 2: train on a DIFFERENT batch — mutates logits_buf
        // (KAIZEN-052 in-place gradient writeback) and runs optimizer_step.
        let _train_loss = trainer.train_batch(&train_batch_data);

        // Phase 3: eval on the SAME eval batch — same model state up
        // to one optimizer step. loss_b should be close to loss_a.
        let loss_b = trainer.eval_batch(&eval_batch_data);

        // One optimizer step on ONE mini-batch, at whatever learning
        // rate `TransformerTrainConfig::new` defaults to, may shift the
        // loss somewhat. The bound is one-sided and generous: only a
        // DROP of 95% or more fails (an increase clamps to 0 below).
        // A drop to ~0 (factor of 1500× as observed in §60) would break
        // this bound by orders of magnitude.
        // NOTE(review): earlier comments cited both lr=5e-5 and
        // lr=0.001 for this config — confirm the actual default.
        let rel_drop = (loss_a - loss_b).max(0.0) / loss_a;
        assert!(
            loss_b.is_finite(),
            "eval_batch after train returned non-finite loss = {loss_b}; \
             possible NaN propagation from train_batch's in-place gradient \
             writeback contaminating subsequent eval forward."
        );
        assert!(
            rel_drop < 0.95,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1A train→eval \
             state pollution): eval_batch loss dropped from {loss_a} to \
             {loss_b} ({:.4}× relative drop) after a single train_batch \
             on a DIFFERENT batch. A single optimizer step at typical \
             learning rates cannot legitimately move loss by ≥95%. \
             This indicates train_batch contaminates state that eval_batch \
             reads (most likely the gpu_training.logits_buf via KAIZEN-052 \
             in-place gradient writeback overlapping with the next \
             gpu_forward GEMM). See \
             evidence/section-60-5g-2-redispatch-2026-05-09/README.md \
             for the 1500× train/val discrepancy this falsifier reproduces.",
            rel_drop
        );
    }

    /// FALSIFY-CUDA-FORWARD-PARITY-001 (the load-bearing H4D bisect):
    /// On a populated Qwen 0.5B, `CudaTransformerTrainer::eval_batch`
    /// MUST produce a finite, non-degenerate val_loss in the same
    /// regime as the CPU `Transformer::forward` — i.e., in the
    /// industry-baseline range for Qwen 0.5B on Python (~1.5–3.0).
    ///
    /// Concrete bound: when CPU forward produces logits with
    /// peak-to-mean > 5 (PR #1602 evidence on populated Qwen,
    /// argmax=9370), the corresponding CUDA path MUST produce
    /// val_loss < 0.7 × `ln(vocab_size)` (natural log, computed from
    /// `cfg.vocab_size` at run time). A val_loss approaching or
    /// exceeding `ln(vocab_size)` indicates the CUDA path is
    /// anti-aligned (sub-random predictions).
    ///
    /// NOTE(review): an earlier revision of this comment quoted
    /// `ln(vocab)` = 17.21 and a bound of ~12.0. Those figures look
    /// like log2 of Qwen2's published 151,936-entry vocabulary; the
    /// natural log the code uses would give ≈ 11.93 and a bound of
    /// ≈ 8.35 — confirm against `cfg.vocab_size`.
    ///
    /// CONTEXT: SHIP-TWO §61 evidence (PR #1600) recorded
    /// val_loss=18.55 at step 1. The bug is in the CUDA forward
    /// path's missing bias-add operation:
    /// `cuda_block.rs::CudaTransformerBlock` has no `b_q`/`b_k`/
    /// `b_v` fields and `forward()` does pure gemms (lines 719-747)
    /// without adding the trained Qwen Q/K/V biases.
    ///
    /// Pre-fix: this test fails (CUDA path drops biases → sub-random
    /// predictions, val_loss above the bound).
    /// Post-fix: passes with val_loss in the expected range.
    ///
    /// Host-gated: requires the canonical Qwen 0.5B init APR + the
    /// 5g.1-v2 corpus on the lambda-vector RTX 4090 host.
    #[test]
    fn falsify_cuda_forward_parity_qwen_val_loss_below_ln_vocab() {
        let init_path = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
        if !init_path.exists() {
            eprintln!(
                "[falsify-cuda-forward-parity-001] skipping: host lacks {}",
                init_path.display()
            );
            return;
        }
        let cfg = TransformerConfig::qwen2_0_5b();
        let trainer_rc = match build_shared_cuda_trainer_with_init(
            5.0e-5,
            32,
            42,
            Some(&cfg),
            Some(init_path),
        ) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify-cuda-forward-parity-001] skipping: \
                     build_shared_cuda_trainer_with_init failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // Build a tiny synthetic batch: 1 sequence × 16 tokens.
        // Choose tokens deterministically; correctness doesn't
        // depend on which Python tokens — just that the batch is
        // valid and exercises the forward path end-to-end.
        let seq = vec![100_u32; 17]; // 16 input + 1 target shift
        let batch = LMBatch::from_sequences(&[seq], 0, 0);

        let val_loss = trainer_rc.borrow_mut().eval_batch(&batch);
        let ln_vocab = (cfg.vocab_size as f32).ln();
        let upper_bound = ln_vocab * 0.7;
        eprintln!(
            "[falsify-cuda-forward-parity-001] val_loss={val_loss} ln(vocab)={ln_vocab} \
             upper_bound (0.7×ln_vocab)={upper_bound}"
        );

        assert!(val_loss.is_finite(), "val_loss must be finite, got {val_loss}");
        // The DOMINANT assertion: val_loss MUST be below 0.7×ln(vocab).
        // CPU forward produces peak-to-mean=5.68 (PR #1602) → cross-
        // entropy on a single deterministic token should be
        // O(ln_vocab) at most for a clearly-confident model. The
        // pre-fix CUDA path produces val_loss > ln_vocab because it
        // drops Qwen's Q/K/V biases (cuda_block.rs lines 103-135 has
        // no bias fields; lines 719-747 do bare gemms).
        assert!(
            val_loss < upper_bound,
            "FALSIFY-CUDA-FORWARD-PARITY-001 (H4D): CUDA val_loss={val_loss} >= \
             0.7×ln(vocab)={upper_bound}. Same Qwen weights produce \
             peak-to-mean=5.68 on CPU forward (PR #1602 falsify_h4_cpu_forward_*) \
             but CUDA produces sub-random predictions. Root cause: \
             CudaTransformerBlock drops Qwen Q/K/V biases — struct has no bias \
             fields (cuda_block.rs lines 103-135), forward does bare gemms \
             (lines 719-747) without `cuda_add(q, b_q)` after each projection. \
             See evidence/section-60-5g-2-redispatch-2026-05-09/ + this contract \
             apr-pretrain-cuda-forward-parity-v1.yaml. Fix scope: add b_q/b_k/b_v \
             fields, thread through with_model upload, apply bias-add after each \
             Q/K/V gemm in forward."
        );
    }
}