entrenar/train/pretrain_real_cuda.rs
1//! CUDA-backend `StepFn` / `ValFn` / `CheckpointFn` for the 370M pretrain
2//! loop (task #132 Phase 2, contract `gpu-training-backend-v1`).
3//!
4//! Mirrors `pretrain_real.rs` but swaps `TransformerTrainer`
5//! (CPU + trueno SIMD) for `CudaTransformerTrainer` (GPU-resident
6//! AdamW + fused CE). The entire module is gated on
7//! `#[cfg(feature = "cuda")]` because `CudaTransformerTrainer::new`
8//! / `train_batch` / `eval_batch` / `save_apr` only exist in the
9//! cuda build — the non-cuda stub returns an error from `new()` and
10//! exposes no step/eval/save methods.
11//!
12//! Contract obligations discharged / strengthened vs the CPU path:
13//! - INV-ARCH-370M-001 (param count ∈ [366M, 374M]) via `debug_assert`
14//! on `CudaTransformerTrainer::model().parameters()`, matching
15//! the CPU guard.
16//! - INV-TRAIN-007 (no NaN/Inf): `train_batch` / `eval_batch` return
17//! finite loss by construction; non-finite outputs abort via
18//! `PretrainLoop`'s guards.
19//! - INV-TRAIN-008 (grad_norm ≥ 0): `last_grad_norm()` returns the
20//! real LM-head L2 norm. Strictly stronger than the CPU path's
21//! `1.0` placeholder.
22//!
23//! Deferred to a follow-up:
24//! - INV-TRAIN-003 (AdamW-state sha256). `CudaTransformerTrainer`
25//! keeps (m, v, t) on the GPU; discharging this cleanly needs a
26//! D2H sync that `save_apr` already pays for but `StepFn` does
27//! not want to pay per-step. Until that sync is factored out,
28//! the trait default `optimizer_state_sha256 -> None` is used,
29//! and GATE-TRAIN-006 runs only on the CPU path.
30
31#![cfg(feature = "cuda")]
32
33use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
34use crate::train::pretrain_real::{
35 build_transformer_config, llama_370m_train_config, load_init_tensors_from_apr,
36 populate_trainer_from_init_tensors, validate_pretrain_init_arch_compatible,
37};
38use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch, TransformerTrainConfig};
39use crate::transformer::{Transformer, TransformerConfig};
40use std::cell::RefCell;
41use std::path::Path;
42use std::rc::Rc;
43
/// Shared mutable ownership of a GPU-resident trainer.
///
/// The three hooks in this module — `CudaRealStepFn` (train steps),
/// `CudaRealValFn` (eval) and `CudaAprCheckpointFn` (checkpoint
/// writes) — each store a `SharedCudaTrainer`. Cloning one `Rc` into
/// all three makes them operate on the same trainer, and therefore on
/// the same GPU memory. Each hook takes `borrow_mut()` for the duration
/// of a single call.
pub type SharedCudaTrainer = Rc<RefCell<CudaTransformerTrainer>>;
48
49/// Allocate a `CudaTransformerTrainer` with MODEL-2 v2-remedy defaults
50/// and verify INV-ARCH-370M-001 in debug builds.
51///
52/// Returns a `crate::Result` because `CudaTransformerTrainer::new`
53/// can fail on missing CUDA runtime, kernel pre-warm failure, or
54/// block upload failure — the CLI surfaces this as a
55/// GATE-GPUTRAIN-002 error so the operator knows to check their
56/// `--features cuda` build or their GPU.
57pub fn build_shared_cuda_trainer(
58 lr: f32,
59 seq_length: usize,
60 seed: u64,
61) -> crate::Result<SharedCudaTrainer> {
62 let cfg = llama_370m_train_config(lr, seq_length, seed);
63 let trainer = CudaTransformerTrainer::new(cfg)?;
64 #[cfg(debug_assertions)]
65 {
66 let param_count: usize = trainer.model().parameters().iter().map(|t| t.len()).sum();
67 debug_assert!(
68 (366_000_000..=374_000_000).contains(¶m_count),
69 "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
70 );
71 }
72 Ok(Rc::new(RefCell::new(trainer)))
73}
74
75/// Polymorphic CUDA trainer builder for `apr pretrain --init --device cuda`
76/// (§50.4 step 5f.5 — symmetric to the CPU `build_shared_trainer_with_init`).
77///
78/// Composes the same §50.4 step-5f machinery as the CPU path, but runs
79/// it against `CudaTransformerTrainer::with_model` so the populated
80/// init weights flow through GPU upload (transformer blocks via
81/// `upload_blocks`, final RMSNorm via `from_host`, lm_head /
82/// embed_tokens.weight via `from_host`):
83/// - 5c: `build_transformer_config(init_arch)` — polymorphic dispatch
84/// - 5f.1: `validate_pretrain_init_arch_compatible(init_arch)` — encoder rejection
85/// - 5f.2: `load_init_tensors_from_apr(path)` — read APR weights
86/// - 5f.3: `populate_trainer_from_init_tensors(transformer, &tensors)` — populate CPU model
87/// - 5f.5: `CudaTransformerTrainer::with_model(populated_model, train_cfg)` — GPU upload
88///
89/// Behaviour:
90/// init = None → identical to `build_shared_cuda_trainer` (Llama370M
91/// from-scratch baseline with INV-ARCH-370M-001 enforced).
92/// init = Some → builds a CUDA trainer whose GPU weights derive from
93/// the populated CPU model (the populated `Transformer`
94/// is moved into `with_model` which uploads its blocks /
95/// norm / lm_head to GPU). INV-ARCH-370M-001 is NOT
96/// enforced — arch is whatever the init APR has.
97///
98/// Spec: SPEC-SHIP-TWO-001 §52.4 (CPU 5f.4 wireup) + §54-§56 (Qwen
99/// 5g.0/5g.1 prerequisites) + this §50.4 step 5f.5 (CUDA wireup).
100///
101/// # Errors
102///
103/// Returns Err when:
104/// - `init_arch.is_some() != init_path.is_some()` (caller bug — same
105/// diagnostic as the CPU path's `build_shared_trainer_with_init`).
106/// - `init_arch` is `Some` with `architecture = Encoder`
107/// (FALSIFY-APR-PRETRAIN-ARCH-007 / FALSIFY-APR-PRETRAIN-INIT-001).
108/// - `load_init_tensors_from_apr` fails (FALSIFY-APR-PRETRAIN-INIT-006).
109/// - `populate_trainer_from_init_tensors` fails (FALSIFY-APR-PRETRAIN-INIT-007).
110/// - `CudaTransformerTrainer::with_model` fails (CUDA init / kernel
111/// pre-warm / block upload — surfaces as GATE-GPUTRAIN-002).
112///
113/// # Caller Contract
114///
115/// The caller MUST have built the binary with `--features cuda`. This
116/// function is gated on `#[cfg(feature = "cuda")]` so a non-cuda build
117/// will not see this symbol; the apr-cli dispatch layer routes
118/// `--device cuda` to `drive_real_cuda` which calls this builder, and
119/// the non-cuda stub for `drive_real_cuda` already returns the
120/// rebuild-with-cuda error per `feedback_cuda_feature_footgun.md`.
121pub fn build_shared_cuda_trainer_with_init(
122 lr: f32,
123 seq_length: usize,
124 seed: u64,
125 init_arch: Option<&TransformerConfig>,
126 init_path: Option<&Path>,
127) -> crate::Result<SharedCudaTrainer> {
128 if init_arch.is_some() != init_path.is_some() {
129 return Err(crate::error::Error::ConfigError(format!(
130 "build_shared_cuda_trainer_with_init: init_arch and init_path must both be Some \
131 or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
132 init_arch.is_some(),
133 init_path.is_some()
134 )));
135 }
136
137 if let Some(cfg) = init_arch {
138 validate_pretrain_init_arch_compatible(cfg).map_err(crate::error::Error::ConfigError)?;
139 }
140
141 let model_cfg = build_transformer_config(init_arch);
142 let mut train_cfg = TransformerTrainConfig::new(model_cfg);
143 train_cfg.lr = lr;
144 train_cfg.max_seq_len = seq_length;
145 train_cfg.seed = seed;
146
147 // Build the CPU model first; populate init weights into it; then
148 // hand it to CudaTransformerTrainer::with_model which uploads the
149 // populated blocks, final RMSNorm, and lm_head/embed_tokens to GPU.
150 // This is the symmetric path to CPU's build_shared_trainer_with_init,
151 // exercising the SAME populate_trainer_from_init_tensors helper so
152 // the population semantics are identical between backends.
153 let mut transformer = Transformer::new(&train_cfg.model_config);
154
155 if let Some(path) = init_path {
156 let tensors = load_init_tensors_from_apr(path).map_err(crate::error::Error::ConfigError)?;
157 populate_trainer_from_init_tensors(&mut transformer, &tensors)
158 .map_err(crate::error::Error::ConfigError)?;
159 } else {
160 // From-scratch CUDA path with init=None: enforce the
161 // INV-ARCH-370M-001 param-count band. Mirrors the CPU
162 // `build_shared_trainer` invariant exactly.
163 #[cfg(debug_assertions)]
164 {
165 let param_count: usize = transformer.parameters().iter().map(|t| t.len()).sum();
166 debug_assert!(
167 (366_000_000..=374_000_000).contains(¶m_count),
168 "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band \
169 (from-scratch CUDA path with init=None)",
170 );
171 }
172 }
173
174 let trainer = CudaTransformerTrainer::with_model(transformer, train_cfg)?;
175 Ok(Rc::new(RefCell::new(trainer)))
176}
177
178/// CUDA `StepFn` — pulls one `LMBatch` from the shard iterator and
179/// runs a real GPU forward + backward + AdamW step.
180pub struct CudaRealStepFn {
181 trainer: SharedCudaTrainer,
182 batches: Box<dyn Iterator<Item = LMBatch>>,
183}
184
185impl CudaRealStepFn {
186 pub fn new(trainer: SharedCudaTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
187 Self { trainer, batches }
188 }
189}
190
191impl StepFn for CudaRealStepFn {
192 fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
193 // Exhausted shard stream: emit a finite placeholder so the
194 // NaN/Inf guard (INV-TRAIN-007) doesn't mis-fire and the
195 // divergence guard (GATE-TRAIN-005) correctly does not abort.
196 let Some(batch) = self.batches.next() else {
197 return (1.0, 1.0);
198 };
199 let mut trainer = self.trainer.borrow_mut();
200 let loss = trainer.train_batch(&batch);
201 // Real LM-head L2 norm — strictly more informative than the
202 // CPU path's `1.0` placeholder for GATE-TRAIN-008 monitoring.
203 let grad_norm = trainer.last_grad_norm();
204 (loss, grad_norm)
205 }
206
207 // INV-TRAIN-003 intentionally deferred for the GPU path — see
208 // module docs. Uses trait default `-> None`, so the CPU gate
209 // (`--device cpu`) is the one that exercises AdamW-state parity.
210}
211
212/// CUDA `ValFn` — forward-only eval across pre-loaded held-out
213/// batches. Uses `eval_batch` (fused GPU cross-entropy, no logits
214/// D2H) and averages across batches.
215pub struct CudaRealValFn {
216 trainer: SharedCudaTrainer,
217 held_out: Vec<LMBatch>,
218}
219
220impl CudaRealValFn {
221 pub fn new(trainer: SharedCudaTrainer, held_out: Vec<LMBatch>) -> Self {
222 Self { trainer, held_out }
223 }
224}
225
226impl ValFn for CudaRealValFn {
227 fn validate(&mut self, _epoch: usize) -> f32 {
228 if self.held_out.is_empty() {
229 return f32::NAN;
230 }
231 let mut trainer = self.trainer.borrow_mut();
232 let mut total_loss = 0.0_f32;
233 let mut count = 0_usize;
234 for batch in &self.held_out {
235 if batch.batch_size == 0 {
236 continue;
237 }
238 total_loss += trainer.eval_batch(batch);
239 count += 1;
240 }
241 if count == 0 {
242 f32::NAN
243 } else {
244 total_loss / count as f32
245 }
246 }
247}
248
249/// CUDA `CheckpointFn` — writes the 370M weights to
250/// `artifact.checkpoint_path` in APR format. `save_apr` takes
251/// `&mut self` on the CUDA path because it syncs GPU→CPU before
252/// writing, which is why this holds the `SharedCudaTrainer` instead
253/// of cloning the trainer out.
254pub struct CudaAprCheckpointFn {
255 trainer: SharedCudaTrainer,
256 model_name: String,
257 architecture: String,
258 /// SPEC-SHIP-TWO-001 §81 P0-D: optional tokenizer directory whose
259 /// tokenizer.json is embedded into every checkpoint via
260 /// `tokenizer.vocabulary` + `tokenizer.merges` metadata keys.
261 /// When None, checkpoints are written without an embedded tokenizer
262 /// (legacy behavior; `apr qa` will fail with C-03/embedded-tokenizer
263 /// gate per §81 — left as caller's responsibility).
264 tokenizer_dir: Option<std::path::PathBuf>,
265}
266
267impl CudaAprCheckpointFn {
268 pub fn new(
269 trainer: SharedCudaTrainer,
270 model_name: impl Into<String>,
271 architecture: impl Into<String>,
272 ) -> Self {
273 Self {
274 trainer,
275 model_name: model_name.into(),
276 architecture: architecture.into(),
277 tokenizer_dir: None,
278 }
279 }
280
281 /// SPEC-SHIP-TWO-001 §81 P0-D: builder for embedding the tokenizer
282 /// in every checkpoint write. Pass `--tokenizer <DIR>` through here
283 /// so `apr qa <epoch-N.apr>` can run inference without an external
284 /// tokenizer file.
285 pub fn with_tokenizer_dir(mut self, dir: impl Into<std::path::PathBuf>) -> Self {
286 self.tokenizer_dir = Some(dir.into());
287 self
288 }
289}
290
291impl CheckpointFn for CudaAprCheckpointFn {
292 fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
293 let mut trainer = self.trainer.borrow_mut();
294 trainer
295 .save_apr_with_tokenizer(
296 &artifact.checkpoint_path,
297 &self.model_name,
298 &self.architecture,
299 self.tokenizer_dir.as_deref(),
300 )
301 .map_err(|e| format!("save_apr (cuda) failed: {e}"))
302 }
303}
304
305#[cfg(test)]
306mod tests {
307 use super::*;
308
309 /// FALSIFY-APR-PRETRAIN-INIT-CUDA-002 (paired-args invariant):
310 /// `build_shared_cuda_trainer_with_init` MUST reject the
311 /// (Some, None) and (None, Some) caller-bug states identically
312 /// to the CPU `build_shared_trainer_with_init`. The two fields
313 /// are paired by construction — separately optional fields are
314 /// a defect class because they let a caller pass an arch
315 /// without weights (silent random-init at the GPU boundary) or
316 /// weights without an arch (silently fall back to Llama370M).
317 ///
318 /// This test fires WITHOUT a CUDA device — the args check
319 /// happens before any GPU allocation — so it runs on any host
320 /// even when CUDA runtime is unavailable.
321 #[test]
322 fn build_shared_cuda_trainer_with_init_rejects_unpaired_args() {
323 use std::path::PathBuf;
324 // Arch without path — Err. Use Qwen 0.5B as a concrete
325 // non-Llama370M decoder config to prove the paired-args
326 // gate fires before any architectural inspection.
327 let cfg = TransformerConfig::qwen2_0_5b();
328 let result_arch_only =
329 build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&cfg), None);
330 assert!(
331 matches!(result_arch_only, Err(_)),
332 "(Some(arch), None) MUST Err — caller-bug guard"
333 );
334
335 // Path without arch — Err.
336 let dummy = PathBuf::from("/tmp/does-not-exist.apr");
337 let result_path_only =
338 build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, None, Some(&dummy));
339 assert!(
340 matches!(result_path_only, Err(_)),
341 "(None, Some(path)) MUST Err — caller-bug guard"
342 );
343
344 // Both Err messages name the function so callers can grep
345 // back to the offending invocation. We extract the message
346 // by destructuring (CudaTransformerTrainer is not Debug, so
347 // unwrap_err() doesn't compile) — the err is a ConfigError.
348 let err_arch = match result_arch_only {
349 Err(crate::error::Error::ConfigError(s)) => s,
350 other => panic!("expected ConfigError, got: {:?}", other.is_ok()),
351 };
352 let err_path = match result_path_only {
353 Err(crate::error::Error::ConfigError(s)) => s,
354 other => panic!("expected ConfigError, got: {:?}", other.is_ok()),
355 };
356 assert!(
357 err_arch.contains("build_shared_cuda_trainer_with_init"),
358 "Err MUST name the function for grep-ability: {err_arch}"
359 );
360 assert!(
361 err_path.contains("build_shared_cuda_trainer_with_init"),
362 "Err MUST name the function for grep-ability: {err_path}"
363 );
364 }
365
366 /// FALSIFY-APR-PRETRAIN-INIT-CUDA-003 (encoder family rejection):
367 /// passing an Encoder-architecture init config to
368 /// `build_shared_cuda_trainer_with_init` MUST Err — same semantic
369 /// as the CPU path's `validate_pretrain_init_arch_compatible`.
370 /// This proves the symmetric builder threads the §50.4 step 5f.1
371 /// encoder rejection through the CUDA backend.
372 ///
373 /// Fires WITHOUT a CUDA device — the encoder check happens
374 /// before any GPU allocation.
375 #[test]
376 fn build_shared_cuda_trainer_with_init_rejects_encoder_family() {
377 use crate::transformer::ModelArchitecture;
378 use std::path::PathBuf;
379 let mut encoder_cfg = TransformerConfig::qwen2_0_5b();
380 encoder_cfg.architecture = ModelArchitecture::Encoder;
381 let dummy = PathBuf::from("/tmp/does-not-exist.apr");
382 let result =
383 build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&encoder_cfg), Some(&dummy));
384 assert!(matches!(result, Err(_)), "Encoder-family init MUST Err under §50.4 step 5f.1");
385 }
386
387 /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 sanity bound):
388 /// `CudaTransformerTrainer::eval_batch` on a fresh-init trainer
389 /// (random weights) over a synthetic batch with random uniform
390 /// tokens MUST return a loss in a sensible range.
391 ///
392 /// Theoretical bound: random-init Llama-style 2-layer transformer
393 /// over uniformly-distributed targets in vocab=1000 produces
394 /// average cross-entropy near `ln(1000) = 6.91`. Any non-trivially-
395 /// trained model with finite weights produces loss in
396 /// `[0.5 × ln(vocab), 1.5 × ln(vocab)]` modulo float noise.
397 ///
398 /// LIVE EVIDENCE motivating this test (this branch's parent):
399 /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
400 /// recorded a 1500× train/eval discrepancy at the same model
401 /// state (epoch 0: train_loss=1.20 vs val_loss=0.00081). The
402 /// gap survived PR #1579's H2 (populate-coverage) fix, confirming
403 /// H1 (eval_batch degenerate) is independent of H2.
404 ///
405 /// This test reproduces the bug at unit-test level: if H1 is
406 /// real, eval_batch on a tiny random-init model returns ~0
407 /// instead of ~ln(vocab_size). The test is gated on
408 /// `--features cuda` so CI without that flag does not see it;
409 /// `cargo test -p aprender-train --features cuda --lib
410 /// falsify_eval_batch_h1_sanity_bound` reproduces.
411 ///
412 /// Spec: SPEC-SHIP-TWO-001 §60 (forthcoming) H1 root-cause cascade.
413 #[test]
414 fn falsify_eval_batch_h1_sanity_bound() {
415 use crate::train::transformer_trainer::TransformerTrainConfig;
416 use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch};
417
418 // Tiny model so the test runs in a few seconds on RTX 4090.
419 let model_cfg = TransformerConfig::tiny();
420 let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
421
422 // Build trainer with random init. Skip the test (rather than
423 // panic) if CUDA is unavailable on the host — the falsifier is
424 // host-dependent.
425 let trainer = match CudaTransformerTrainer::new(train_cfg) {
426 Ok(t) => t,
427 Err(e) => {
428 eprintln!(
429 "[falsify_eval_batch_h1_sanity_bound] skipping: \
430 CudaTransformerTrainer::new failed: {e:?} \
431 (test requires --features cuda + a CUDA host)"
432 );
433 return;
434 }
435 };
436 let mut trainer = trainer;
437
438 // Build a synthetic batch: 4 sequences × 16 tokens each, drawn
439 // from a deterministic LCG so the test is reproducible.
440 let vocab_size = model_cfg.vocab_size as u32;
441 let seq_len = 16;
442 let batch_size = 4;
443 let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
444 let lcg = |s: &mut u64| -> u32 {
445 *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
446 ((*s >> 32) as u32) % vocab_size
447 };
448 let mut sequences = Vec::with_capacity(batch_size);
449 for _ in 0..batch_size {
450 let mut seq = Vec::with_capacity(seq_len + 1);
451 for _ in 0..(seq_len + 1) {
452 seq.push(lcg(&mut state));
453 }
454 sequences.push(seq);
455 }
456 let batch = LMBatch::from_sequences(&sequences, 0, 0);
457
458 // Sanity bound: random-init eval loss should be ≈ ln(1000) = 6.91.
459 // We accept anything in [0.5, 1.5 × ln(vocab)] = [0.5, ~10.4].
460 // If H1 is real, eval_batch returns ~0 (degenerate).
461 let loss = trainer.eval_batch(&batch);
462 let ln_vocab = (vocab_size as f32).ln();
463 let lower_bound = 0.5_f32;
464 let upper_bound = 1.5_f32 * ln_vocab;
465
466 assert!(
467 loss >= lower_bound,
468 "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 lower bound): \
469 eval_batch on random-init {}-vocab tiny model returned \
470 loss = {loss}, expected ≥ {lower_bound} (random-init theoretical \
471 ≈ ln({vocab_size}) = {ln_vocab:.3}). Loss < 0.5 indicates \
472 eval pipeline is degenerate (cross-entropy collapsing to 0); \
473 see evidence/section-60-5g-2-redispatch-2026-05-09/ for the \
474 1500× train/eval discrepancy that motivated this falsifier.",
475 vocab_size
476 );
477 assert!(
478 loss <= upper_bound,
479 "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 upper bound): \
480 eval_batch returned loss = {loss}, expected ≤ {upper_bound:.3} \
481 (1.5 × ln(vocab)). Loss > upper_bound suggests numerical \
482 explosion (NaN coercion or gradient overflow), a separate \
483 defect class from the lower-bound H1.",
484 );
485 assert!(loss.is_finite(), "eval_batch returned non-finite loss = {loss}");
486 }
487
488 /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1 hypothesis A —
489 /// train→eval state pollution): the val_loss anomaly observed in
490 /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
491 /// fired at EPOCH 0 — i.e., AFTER 100 train_batch calls, not on
492 /// a fresh trainer. This test exercises that ordering directly:
493 /// eval_batch BEFORE training (loss_a, sanity), then train_batch,
494 /// then eval_batch on the same evaluation batch (loss_b). The
495 /// two losses should differ by AT MOST the optimizer-step effect
496 /// (a few percent at lr=5e-5 on one mini-batch).
497 ///
498 /// If H1 hypothesis A (logits_buf state contamination) is real,
499 /// loss_b will be much smaller than loss_a even though the model
500 /// only changed by one optimizer step. The 1500× train/val
501 /// discrepancy in §59/§60 evidence implies loss_b/loss_a ~ 1/1500.
502 #[test]
503 fn falsify_eval_batch_h1_train_pollution() {
504 use crate::train::transformer_trainer::TransformerTrainConfig;
505 use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch};
506
507 let model_cfg = TransformerConfig::tiny();
508 let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
509
510 let trainer = match CudaTransformerTrainer::new(train_cfg) {
511 Ok(t) => t,
512 Err(e) => {
513 eprintln!(
514 "[falsify_eval_batch_h1_train_pollution] skipping: \
515 CudaTransformerTrainer::new failed: {e:?} \
516 (test requires --features cuda + a CUDA host)"
517 );
518 return;
519 }
520 };
521 let mut trainer = trainer;
522
523 let vocab_size = model_cfg.vocab_size as u32;
524 let seq_len = 16;
525 let batch_size = 4;
526 let mut state: u64 = 0xCAFE_BABE_DEAD_BEEF;
527 let lcg = |s: &mut u64| -> u32 {
528 *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
529 ((*s >> 32) as u32) % vocab_size
530 };
531 let make_batch = |state: &mut u64, lcg: &dyn Fn(&mut u64) -> u32| -> LMBatch {
532 let mut sequences = Vec::with_capacity(batch_size);
533 for _ in 0..batch_size {
534 let mut seq = Vec::with_capacity(seq_len + 1);
535 for _ in 0..(seq_len + 1) {
536 seq.push(lcg(state));
537 }
538 sequences.push(seq);
539 }
540 LMBatch::from_sequences(&sequences, 0, 0)
541 };
542
543 let train_batch_data = make_batch(&mut state, &lcg);
544 let eval_batch_data = make_batch(&mut state, &lcg);
545
546 // Phase 1: eval BEFORE any training — establishes baseline.
547 let loss_a = trainer.eval_batch(&eval_batch_data);
548 assert!(
549 loss_a.is_finite() && loss_a >= 0.5,
550 "Phase 1 baseline: eval before any train must be sensible \
551 (got {loss_a}); test setup precondition failed before \
552 we can probe H1A. See test 001 for the same lower bound."
553 );
554
555 // Phase 2: train on a DIFFERENT batch — mutates logits_buf
556 // (KAIZEN-052 in-place gradient writeback) and runs optimizer_step.
557 let _train_loss = trainer.train_batch(&train_batch_data);
558
559 // Phase 3: eval on the SAME eval batch — same model state up
560 // to one optimizer step. loss_b should be close to loss_a.
561 let loss_b = trainer.eval_batch(&eval_batch_data);
562
563 // The optimizer step at lr=5e-5 (default finetune mode but our
564 // train_cfg uses lr=0.001 from TrainConfig::default) on ONE
565 // mini-batch can shift loss by maybe 5-30%. We accept any
566 // |loss_b - loss_a| / loss_a < 0.95 (i.e., loss_b doesn't drop
567 // by more than 95%) — generous to allow normal training
568 // dynamics. A drop to ~0 (factor of 1500× as observed in §60)
569 // would break this bound by orders of magnitude.
570 let rel_drop = (loss_a - loss_b).max(0.0) / loss_a;
571 assert!(
572 loss_b.is_finite(),
573 "eval_batch after train returned non-finite loss = {loss_b}; \
574 possible NaN propagation from train_batch's in-place gradient \
575 writeback contaminating subsequent eval forward."
576 );
577 assert!(
578 rel_drop < 0.95,
579 "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1A train→eval \
580 state pollution): eval_batch loss dropped from {loss_a} to \
581 {loss_b} ({:.4}× relative drop) after a single train_batch \
582 on a DIFFERENT batch. A single optimizer step at typical \
583 learning rates cannot legitimately move loss by ≥95%. \
584 This indicates train_batch contaminates state that eval_batch \
585 reads (most likely the gpu_training.logits_buf via KAIZEN-052 \
586 in-place gradient writeback overlapping with the next \
587 gpu_forward GEMM). See \
588 evidence/section-60-5g-2-redispatch-2026-05-09/README.md \
589 for the 1500× train/val discrepancy this falsifier reproduces.",
590 rel_drop
591 );
592 }
593
594 /// FALSIFY-CUDA-FORWARD-PARITY-001 (the load-bearing H4D bisect):
595 /// On a populated Qwen 0.5B, `CudaTransformerTrainer::eval_batch`
596 /// MUST produce a finite, non-degenerate val_loss in the same
597 /// regime as the CPU `Transformer::forward` — i.e., in the
598 /// industry-baseline range for Qwen 0.5B on Python (~1.5–3.0).
599 ///
600 /// Concrete bound: when CPU forward produces logits with
601 /// peak-to-mean > 5 (PR #1602 evidence on populated Qwen,
602 /// argmax=9370), the corresponding CUDA path MUST produce
603 /// val_loss < `ln(vocab_size)` × 0.7 = ~12.0. A val_loss
604 /// approaching or exceeding `ln(vocab)` = 17.21 indicates
605 /// the CUDA path is anti-aligned (sub-random predictions).
606 ///
607 /// CONTEXT: SHIP-TWO §61 evidence (PR #1600) recorded
608 /// val_loss=18.55 at step 1 — *above* `ln(vocab)`. The bug
609 /// is in the CUDA forward path's missing bias-add operation:
610 /// `cuda_block.rs::CudaTransformerBlock` has no `b_q`/`b_k`/
611 /// `b_v` fields and `forward()` does pure gemms (lines 719-747)
612 /// without adding the trained Qwen Q/K/V biases.
613 ///
614 /// Pre-fix: this test fails with val_loss > 12 (CUDA path
615 /// drops biases → sub-random predictions).
616 /// Post-fix: passes with val_loss in the expected range.
617 ///
618 /// Host-gated: requires the canonical Qwen 0.5B init APR + the
619 /// 5g.1-v2 corpus on the lambda-vector RTX 4090 host.
620 #[test]
621 fn falsify_cuda_forward_parity_qwen_val_loss_below_ln_vocab() {
622 let init_path = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
623 if !init_path.exists() {
624 eprintln!(
625 "[falsify-cuda-forward-parity-001] skipping: host lacks {}",
626 init_path.display()
627 );
628 return;
629 }
630 let cfg = TransformerConfig::qwen2_0_5b();
631 let trainer_rc = match build_shared_cuda_trainer_with_init(
632 5.0e-5,
633 32,
634 42,
635 Some(&cfg),
636 Some(init_path),
637 ) {
638 Ok(t) => t,
639 Err(e) => {
640 eprintln!(
641 "[falsify-cuda-forward-parity-001] skipping: \
642 build_shared_cuda_trainer_with_init failed: {e:?} \
643 (test requires --features cuda + a CUDA host)"
644 );
645 return;
646 }
647 };
648
649 // Build a tiny synthetic batch: 1 sequence × 16 tokens.
650 // Choose tokens deterministically; correctness doesn't
651 // depend on which Python tokens — just that the batch is
652 // valid and exercises the forward path end-to-end.
653 let seq = vec![100_u32; 17]; // 16 input + 1 target shift
654 let batch = LMBatch::from_sequences(&[seq], 0, 0);
655
656 let val_loss = trainer_rc.borrow_mut().eval_batch(&batch);
657 let ln_vocab = (cfg.vocab_size as f32).ln();
658 let upper_bound = ln_vocab * 0.7;
659 eprintln!(
660 "[falsify-cuda-forward-parity-001] val_loss={val_loss} ln(vocab)={ln_vocab} \
661 upper_bound (0.7×ln_vocab)={upper_bound}"
662 );
663
664 assert!(val_loss.is_finite(), "val_loss must be finite, got {val_loss}");
665 // The DOMINANT assertion: val_loss MUST be below 0.7×ln(vocab).
666 // CPU forward produces peak-to-mean=5.68 (PR #1602) → cross-
667 // entropy on a single deterministic token should be
668 // O(ln_vocab) at most for a clearly-confident model. The
669 // pre-fix CUDA path produces val_loss > ln_vocab because it
670 // drops Qwen's Q/K/V biases (cuda_block.rs lines 103-135 has
671 // no bias fields; lines 719-747 do bare gemms).
672 assert!(
673 val_loss < upper_bound,
674 "FALSIFY-CUDA-FORWARD-PARITY-001 (H4D): CUDA val_loss={val_loss} >= \
675 0.7×ln(vocab)={upper_bound}. Same Qwen weights produce \
676 peak-to-mean=5.68 on CPU forward (PR #1602 falsify_h4_cpu_forward_*) \
677 but CUDA produces sub-random predictions. Root cause: \
678 CudaTransformerBlock drops Qwen Q/K/V biases — struct has no bias \
679 fields (cuda_block.rs lines 103-135), forward does bare gemms \
680 (lines 719-747) without `cuda_add(q, b_q)` after each projection. \
681 See evidence/section-60-5g-2-redispatch-2026-05-09/ + this contract \
682 apr-pretrain-cuda-forward-parity-v1.yaml. Fix scope: add b_q/b_k/b_v \
683 fields, thread through with_model upload, apply bias-add after each \
684 Q/K/V gemm in forward."
685 );
686 }
687}