entrenar/train/
pretrain_real_cuda.rs

1//! CUDA-backend `StepFn` / `ValFn` / `CheckpointFn` for the 370M pretrain
2//! loop (task #132 Phase 2, contract `gpu-training-backend-v1`).
3//!
4//! Mirrors `pretrain_real.rs` but swaps `TransformerTrainer`
5//! (CPU + trueno SIMD) for `CudaTransformerTrainer` (GPU-resident
6//! AdamW + fused CE). The entire module is gated on
7//! `#[cfg(feature = "cuda")]` because `CudaTransformerTrainer::new`
8//! / `train_batch` / `eval_batch` / `save_apr` only exist in the
9//! cuda build — the non-cuda stub returns an error from `new()` and
10//! exposes no step/eval/save methods.
11//!
12//! Contract obligations discharged / strengthened vs the CPU path:
13//! - INV-ARCH-370M-001 (param count ∈ [366M, 374M]) via `debug_assert`
14//!   on `CudaTransformerTrainer::model().parameters()`, matching
15//!   the CPU guard.
//! - INV-TRAIN-007 (no NaN/Inf): `train_batch` / `eval_batch` are
//!   expected to return a finite loss; if one ever does not, the run
//!   aborts via `PretrainLoop`'s guards.
19//! - INV-TRAIN-008 (grad_norm ≥ 0): `last_grad_norm()` returns the
20//!   real LM-head L2 norm. Strictly stronger than the CPU path's
21//!   `1.0` placeholder.
22//!
23//! Deferred to a follow-up:
24//! - INV-TRAIN-003 (AdamW-state sha256). `CudaTransformerTrainer`
25//!   keeps (m, v, t) on the GPU; discharging this cleanly needs a
26//!   D2H sync that `save_apr` already pays for but `StepFn` does
27//!   not want to pay per-step. Until that sync is factored out,
28//!   the trait default `optimizer_state_sha256 -> None` is used,
29//!   and GATE-TRAIN-006 runs only on the CPU path.
30
31#![cfg(feature = "cuda")]
32
33use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
34use crate::train::pretrain_real::{
35    build_transformer_config, llama_370m_train_config, load_init_tensors_from_apr,
36    populate_trainer_from_init_tensors, validate_pretrain_init_arch_compatible,
37};
38use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch, TransformerTrainConfig};
39use crate::transformer::{Transformer, TransformerConfig};
40use std::cell::RefCell;
41use std::path::Path;
42use std::rc::Rc;
43
/// Shared mutable ownership of a GPU-resident trainer. All three
/// hooks in this module — `CudaRealStepFn` (train steps),
/// `CudaRealValFn` (eval) and `CudaAprCheckpointFn` (checkpoint
/// save) — hold a handle of this type, so they operate on the same
/// underlying `CudaTransformerTrainer`.
///
/// `Rc<RefCell<..>>` is single-threaded interior mutability: each hook
/// takes `borrow_mut()` for the duration of one call, which panics if
/// another borrow of the trainer is still live at that moment.
pub type SharedCudaTrainer = Rc<RefCell<CudaTransformerTrainer>>;
48
49/// Allocate a `CudaTransformerTrainer` with MODEL-2 v2-remedy defaults
50/// and verify INV-ARCH-370M-001 in debug builds.
51///
52/// Returns a `crate::Result` because `CudaTransformerTrainer::new`
53/// can fail on missing CUDA runtime, kernel pre-warm failure, or
54/// block upload failure — the CLI surfaces this as a
55/// GATE-GPUTRAIN-002 error so the operator knows to check their
56/// `--features cuda` build or their GPU.
57pub fn build_shared_cuda_trainer(
58    lr: f32,
59    seq_length: usize,
60    seed: u64,
61) -> crate::Result<SharedCudaTrainer> {
62    let cfg = llama_370m_train_config(lr, seq_length, seed);
63    let trainer = CudaTransformerTrainer::new(cfg)?;
64    #[cfg(debug_assertions)]
65    {
66        let param_count: usize = trainer.model().parameters().iter().map(|t| t.len()).sum();
67        debug_assert!(
68            (366_000_000..=374_000_000).contains(&param_count),
69            "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
70        );
71    }
72    Ok(Rc::new(RefCell::new(trainer)))
73}
74
75/// Polymorphic CUDA trainer builder for `apr pretrain --init --device cuda`
76/// (§50.4 step 5f.5 — symmetric to the CPU `build_shared_trainer_with_init`).
77///
78/// Composes the same §50.4 step-5f machinery as the CPU path, but runs
79/// it against `CudaTransformerTrainer::with_model` so the populated
80/// init weights flow through GPU upload (transformer blocks via
81/// `upload_blocks`, final RMSNorm via `from_host`, lm_head /
82/// embed_tokens.weight via `from_host`):
83///   - 5c: `build_transformer_config(init_arch)` — polymorphic dispatch
84///   - 5f.1: `validate_pretrain_init_arch_compatible(init_arch)` — encoder rejection
85///   - 5f.2: `load_init_tensors_from_apr(path)` — read APR weights
86///   - 5f.3: `populate_trainer_from_init_tensors(transformer, &tensors)` — populate CPU model
87///   - 5f.5: `CudaTransformerTrainer::with_model(populated_model, train_cfg)` — GPU upload
88///
89/// Behaviour:
90///   init = None  → identical to `build_shared_cuda_trainer` (Llama370M
91///                  from-scratch baseline with INV-ARCH-370M-001 enforced).
92///   init = Some  → builds a CUDA trainer whose GPU weights derive from
93///                  the populated CPU model (the populated `Transformer`
94///                  is moved into `with_model` which uploads its blocks /
95///                  norm / lm_head to GPU). INV-ARCH-370M-001 is NOT
96///                  enforced — arch is whatever the init APR has.
97///
98/// Spec: SPEC-SHIP-TWO-001 §52.4 (CPU 5f.4 wireup) + §54-§56 (Qwen
99/// 5g.0/5g.1 prerequisites) + this §50.4 step 5f.5 (CUDA wireup).
100///
101/// # Errors
102///
103/// Returns Err when:
104/// - `init_arch.is_some() != init_path.is_some()` (caller bug — same
105///   diagnostic as the CPU path's `build_shared_trainer_with_init`).
106/// - `init_arch` is `Some` with `architecture = Encoder`
107///   (FALSIFY-APR-PRETRAIN-ARCH-007 / FALSIFY-APR-PRETRAIN-INIT-001).
108/// - `load_init_tensors_from_apr` fails (FALSIFY-APR-PRETRAIN-INIT-006).
109/// - `populate_trainer_from_init_tensors` fails (FALSIFY-APR-PRETRAIN-INIT-007).
110/// - `CudaTransformerTrainer::with_model` fails (CUDA init / kernel
111///   pre-warm / block upload — surfaces as GATE-GPUTRAIN-002).
112///
113/// # Caller Contract
114///
115/// The caller MUST have built the binary with `--features cuda`. This
116/// function is gated on `#[cfg(feature = "cuda")]` so a non-cuda build
117/// will not see this symbol; the apr-cli dispatch layer routes
118/// `--device cuda` to `drive_real_cuda` which calls this builder, and
119/// the non-cuda stub for `drive_real_cuda` already returns the
120/// rebuild-with-cuda error per `feedback_cuda_feature_footgun.md`.
121pub fn build_shared_cuda_trainer_with_init(
122    lr: f32,
123    seq_length: usize,
124    seed: u64,
125    init_arch: Option<&TransformerConfig>,
126    init_path: Option<&Path>,
127) -> crate::Result<SharedCudaTrainer> {
128    if init_arch.is_some() != init_path.is_some() {
129        return Err(crate::error::Error::ConfigError(format!(
130            "build_shared_cuda_trainer_with_init: init_arch and init_path must both be Some \
131             or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
132            init_arch.is_some(),
133            init_path.is_some()
134        )));
135    }
136
137    if let Some(cfg) = init_arch {
138        validate_pretrain_init_arch_compatible(cfg).map_err(crate::error::Error::ConfigError)?;
139    }
140
141    let model_cfg = build_transformer_config(init_arch);
142    let mut train_cfg = TransformerTrainConfig::new(model_cfg);
143    train_cfg.lr = lr;
144    train_cfg.max_seq_len = seq_length;
145    train_cfg.seed = seed;
146
147    // Build the CPU model first; populate init weights into it; then
148    // hand it to CudaTransformerTrainer::with_model which uploads the
149    // populated blocks, final RMSNorm, and lm_head/embed_tokens to GPU.
150    // This is the symmetric path to CPU's build_shared_trainer_with_init,
151    // exercising the SAME populate_trainer_from_init_tensors helper so
152    // the population semantics are identical between backends.
153    let mut transformer = Transformer::new(&train_cfg.model_config);
154
155    if let Some(path) = init_path {
156        let tensors = load_init_tensors_from_apr(path).map_err(crate::error::Error::ConfigError)?;
157        populate_trainer_from_init_tensors(&mut transformer, &tensors)
158            .map_err(crate::error::Error::ConfigError)?;
159    } else {
160        // From-scratch CUDA path with init=None: enforce the
161        // INV-ARCH-370M-001 param-count band. Mirrors the CPU
162        // `build_shared_trainer` invariant exactly.
163        #[cfg(debug_assertions)]
164        {
165            let param_count: usize = transformer.parameters().iter().map(|t| t.len()).sum();
166            debug_assert!(
167                (366_000_000..=374_000_000).contains(&param_count),
168                "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band \
169                 (from-scratch CUDA path with init=None)",
170            );
171        }
172    }
173
174    let trainer = CudaTransformerTrainer::with_model(transformer, train_cfg)?;
175    Ok(Rc::new(RefCell::new(trainer)))
176}
177
178/// CUDA `StepFn` — pulls one `LMBatch` from the shard iterator and
179/// runs a real GPU forward + backward + AdamW step.
180pub struct CudaRealStepFn {
181    trainer: SharedCudaTrainer,
182    batches: Box<dyn Iterator<Item = LMBatch>>,
183}
184
185impl CudaRealStepFn {
186    pub fn new(trainer: SharedCudaTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
187        Self { trainer, batches }
188    }
189}
190
191impl StepFn for CudaRealStepFn {
192    fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
193        // Exhausted shard stream: emit a finite placeholder so the
194        // NaN/Inf guard (INV-TRAIN-007) doesn't mis-fire and the
195        // divergence guard (GATE-TRAIN-005) correctly does not abort.
196        let Some(batch) = self.batches.next() else {
197            return (1.0, 1.0);
198        };
199        let mut trainer = self.trainer.borrow_mut();
200        let loss = trainer.train_batch(&batch);
201        // Real LM-head L2 norm — strictly more informative than the
202        // CPU path's `1.0` placeholder for GATE-TRAIN-008 monitoring.
203        let grad_norm = trainer.last_grad_norm();
204        (loss, grad_norm)
205    }
206
207    // INV-TRAIN-003 intentionally deferred for the GPU path — see
208    // module docs. Uses trait default `-> None`, so the CPU gate
209    // (`--device cpu`) is the one that exercises AdamW-state parity.
210}
211
212/// CUDA `ValFn` — forward-only eval across pre-loaded held-out
213/// batches. Uses `eval_batch` (fused GPU cross-entropy, no logits
214/// D2H) and averages across batches.
215pub struct CudaRealValFn {
216    trainer: SharedCudaTrainer,
217    held_out: Vec<LMBatch>,
218}
219
220impl CudaRealValFn {
221    pub fn new(trainer: SharedCudaTrainer, held_out: Vec<LMBatch>) -> Self {
222        Self { trainer, held_out }
223    }
224}
225
226impl ValFn for CudaRealValFn {
227    fn validate(&mut self, _epoch: usize) -> f32 {
228        if self.held_out.is_empty() {
229            return f32::NAN;
230        }
231        let mut trainer = self.trainer.borrow_mut();
232        let mut total_loss = 0.0_f32;
233        let mut count = 0_usize;
234        for batch in &self.held_out {
235            if batch.batch_size == 0 {
236                continue;
237            }
238            total_loss += trainer.eval_batch(batch);
239            count += 1;
240        }
241        if count == 0 {
242            f32::NAN
243        } else {
244            total_loss / count as f32
245        }
246    }
247}
248
249/// CUDA `CheckpointFn` — writes the 370M weights to
250/// `artifact.checkpoint_path` in APR format. `save_apr` takes
251/// `&mut self` on the CUDA path because it syncs GPU→CPU before
252/// writing, which is why this holds the `SharedCudaTrainer` instead
253/// of cloning the trainer out.
254pub struct CudaAprCheckpointFn {
255    trainer: SharedCudaTrainer,
256    model_name: String,
257    architecture: String,
258}
259
260impl CudaAprCheckpointFn {
261    pub fn new(
262        trainer: SharedCudaTrainer,
263        model_name: impl Into<String>,
264        architecture: impl Into<String>,
265    ) -> Self {
266        Self { trainer, model_name: model_name.into(), architecture: architecture.into() }
267    }
268}
269
270impl CheckpointFn for CudaAprCheckpointFn {
271    fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
272        let mut trainer = self.trainer.borrow_mut();
273        trainer
274            .save_apr(&artifact.checkpoint_path, &self.model_name, &self.architecture)
275            .map_err(|e| format!("save_apr (cuda) failed: {e}"))
276    }
277}
278
279#[cfg(test)]
280mod tests {
281    use super::*;
282
283    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-002 (paired-args invariant):
284    /// `build_shared_cuda_trainer_with_init` MUST reject the
285    /// (Some, None) and (None, Some) caller-bug states identically
286    /// to the CPU `build_shared_trainer_with_init`. The two fields
287    /// are paired by construction — separately optional fields are
288    /// a defect class because they let a caller pass an arch
289    /// without weights (silent random-init at the GPU boundary) or
290    /// weights without an arch (silently fall back to Llama370M).
291    ///
292    /// This test fires WITHOUT a CUDA device — the args check
293    /// happens before any GPU allocation — so it runs on any host
294    /// even when CUDA runtime is unavailable.
295    #[test]
296    fn build_shared_cuda_trainer_with_init_rejects_unpaired_args() {
297        use std::path::PathBuf;
298        // Arch without path — Err. Use Qwen 0.5B as a concrete
299        // non-Llama370M decoder config to prove the paired-args
300        // gate fires before any architectural inspection.
301        let cfg = TransformerConfig::qwen2_0_5b();
302        let result_arch_only =
303            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&cfg), None);
304        assert!(
305            matches!(result_arch_only, Err(_)),
306            "(Some(arch), None) MUST Err — caller-bug guard"
307        );
308
309        // Path without arch — Err.
310        let dummy = PathBuf::from("/tmp/does-not-exist.apr");
311        let result_path_only =
312            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, None, Some(&dummy));
313        assert!(
314            matches!(result_path_only, Err(_)),
315            "(None, Some(path)) MUST Err — caller-bug guard"
316        );
317
318        // Both Err messages name the function so callers can grep
319        // back to the offending invocation. We extract the message
320        // by destructuring (CudaTransformerTrainer is not Debug, so
321        // unwrap_err() doesn't compile) — the err is a ConfigError.
322        let err_arch = match result_arch_only {
323            Err(crate::error::Error::ConfigError(s)) => s,
324            other => panic!("expected ConfigError, got: {:?}", other.is_ok()),
325        };
326        let err_path = match result_path_only {
327            Err(crate::error::Error::ConfigError(s)) => s,
328            other => panic!("expected ConfigError, got: {:?}", other.is_ok()),
329        };
330        assert!(
331            err_arch.contains("build_shared_cuda_trainer_with_init"),
332            "Err MUST name the function for grep-ability: {err_arch}"
333        );
334        assert!(
335            err_path.contains("build_shared_cuda_trainer_with_init"),
336            "Err MUST name the function for grep-ability: {err_path}"
337        );
338    }
339
340    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-003 (encoder family rejection):
341    /// passing an Encoder-architecture init config to
342    /// `build_shared_cuda_trainer_with_init` MUST Err — same semantic
343    /// as the CPU path's `validate_pretrain_init_arch_compatible`.
344    /// This proves the symmetric builder threads the §50.4 step 5f.1
345    /// encoder rejection through the CUDA backend.
346    ///
347    /// Fires WITHOUT a CUDA device — the encoder check happens
348    /// before any GPU allocation.
349    #[test]
350    fn build_shared_cuda_trainer_with_init_rejects_encoder_family() {
351        use crate::transformer::ModelArchitecture;
352        use std::path::PathBuf;
353        let mut encoder_cfg = TransformerConfig::qwen2_0_5b();
354        encoder_cfg.architecture = ModelArchitecture::Encoder;
355        let dummy = PathBuf::from("/tmp/does-not-exist.apr");
356        let result =
357            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&encoder_cfg), Some(&dummy));
358        assert!(matches!(result, Err(_)), "Encoder-family init MUST Err under §50.4 step 5f.1");
359    }
360
361    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 sanity bound):
362    /// `CudaTransformerTrainer::eval_batch` on a fresh-init trainer
363    /// (random weights) over a synthetic batch with random uniform
364    /// tokens MUST return a loss in a sensible range.
365    ///
366    /// Theoretical bound: random-init Llama-style 2-layer transformer
367    /// over uniformly-distributed targets in vocab=1000 produces
368    /// average cross-entropy near `ln(1000) = 6.91`. Any non-trivially-
369    /// trained model with finite weights produces loss in
370    /// `[0.5 × ln(vocab), 1.5 × ln(vocab)]` modulo float noise.
371    ///
372    /// LIVE EVIDENCE motivating this test (this branch's parent):
373    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
374    /// recorded a 1500× train/eval discrepancy at the same model
375    /// state (epoch 0: train_loss=1.20 vs val_loss=0.00081). The
376    /// gap survived PR #1579's H2 (populate-coverage) fix, confirming
377    /// H1 (eval_batch degenerate) is independent of H2.
378    ///
379    /// This test reproduces the bug at unit-test level: if H1 is
380    /// real, eval_batch on a tiny random-init model returns ~0
381    /// instead of ~ln(vocab_size). The test is gated on
382    /// `--features cuda` so CI without that flag does not see it;
383    /// `cargo test -p aprender-train --features cuda --lib
384    /// falsify_eval_batch_h1_sanity_bound` reproduces.
385    ///
386    /// Spec: SPEC-SHIP-TWO-001 §60 (forthcoming) H1 root-cause cascade.
387    #[test]
388    fn falsify_eval_batch_h1_sanity_bound() {
389        use crate::train::transformer_trainer::TransformerTrainConfig;
390        use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch};
391
392        // Tiny model so the test runs in a few seconds on RTX 4090.
393        let model_cfg = TransformerConfig::tiny();
394        let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
395
396        // Build trainer with random init. Skip the test (rather than
397        // panic) if CUDA is unavailable on the host — the falsifier is
398        // host-dependent.
399        let trainer = match CudaTransformerTrainer::new(train_cfg) {
400            Ok(t) => t,
401            Err(e) => {
402                eprintln!(
403                    "[falsify_eval_batch_h1_sanity_bound] skipping: \
404                     CudaTransformerTrainer::new failed: {e:?} \
405                     (test requires --features cuda + a CUDA host)"
406                );
407                return;
408            }
409        };
410        let mut trainer = trainer;
411
412        // Build a synthetic batch: 4 sequences × 16 tokens each, drawn
413        // from a deterministic LCG so the test is reproducible.
414        let vocab_size = model_cfg.vocab_size as u32;
415        let seq_len = 16;
416        let batch_size = 4;
417        let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
418        let lcg = |s: &mut u64| -> u32 {
419            *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
420            ((*s >> 32) as u32) % vocab_size
421        };
422        let mut sequences = Vec::with_capacity(batch_size);
423        for _ in 0..batch_size {
424            let mut seq = Vec::with_capacity(seq_len + 1);
425            for _ in 0..(seq_len + 1) {
426                seq.push(lcg(&mut state));
427            }
428            sequences.push(seq);
429        }
430        let batch = LMBatch::from_sequences(&sequences, 0, 0);
431
432        // Sanity bound: random-init eval loss should be ≈ ln(1000) = 6.91.
433        // We accept anything in [0.5, 1.5 × ln(vocab)] = [0.5, ~10.4].
434        // If H1 is real, eval_batch returns ~0 (degenerate).
435        let loss = trainer.eval_batch(&batch);
436        let ln_vocab = (vocab_size as f32).ln();
437        let lower_bound = 0.5_f32;
438        let upper_bound = 1.5_f32 * ln_vocab;
439
440        assert!(
441            loss >= lower_bound,
442            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 lower bound): \
443             eval_batch on random-init {}-vocab tiny model returned \
444             loss = {loss}, expected ≥ {lower_bound} (random-init theoretical \
445             ≈ ln({vocab_size}) = {ln_vocab:.3}). Loss < 0.5 indicates \
446             eval pipeline is degenerate (cross-entropy collapsing to 0); \
447             see evidence/section-60-5g-2-redispatch-2026-05-09/ for the \
448             1500× train/eval discrepancy that motivated this falsifier.",
449            vocab_size
450        );
451        assert!(
452            loss <= upper_bound,
453            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 upper bound): \
454             eval_batch returned loss = {loss}, expected ≤ {upper_bound:.3} \
455             (1.5 × ln(vocab)). Loss > upper_bound suggests numerical \
456             explosion (NaN coercion or gradient overflow), a separate \
457             defect class from the lower-bound H1.",
458        );
459        assert!(loss.is_finite(), "eval_batch returned non-finite loss = {loss}");
460    }
461
462    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1 hypothesis A —
463    /// train→eval state pollution): the val_loss anomaly observed in
464    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
465    /// fired at EPOCH 0 — i.e., AFTER 100 train_batch calls, not on
466    /// a fresh trainer. This test exercises that ordering directly:
467    /// eval_batch BEFORE training (loss_a, sanity), then train_batch,
468    /// then eval_batch on the same evaluation batch (loss_b). The
469    /// two losses should differ by AT MOST the optimizer-step effect
470    /// (a few percent at lr=5e-5 on one mini-batch).
471    ///
472    /// If H1 hypothesis A (logits_buf state contamination) is real,
473    /// loss_b will be much smaller than loss_a even though the model
474    /// only changed by one optimizer step. The 1500× train/val
475    /// discrepancy in §59/§60 evidence implies loss_b/loss_a ~ 1/1500.
476    #[test]
477    fn falsify_eval_batch_h1_train_pollution() {
478        use crate::train::transformer_trainer::TransformerTrainConfig;
479        use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch};
480
481        let model_cfg = TransformerConfig::tiny();
482        let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
483
484        let trainer = match CudaTransformerTrainer::new(train_cfg) {
485            Ok(t) => t,
486            Err(e) => {
487                eprintln!(
488                    "[falsify_eval_batch_h1_train_pollution] skipping: \
489                     CudaTransformerTrainer::new failed: {e:?} \
490                     (test requires --features cuda + a CUDA host)"
491                );
492                return;
493            }
494        };
495        let mut trainer = trainer;
496
497        let vocab_size = model_cfg.vocab_size as u32;
498        let seq_len = 16;
499        let batch_size = 4;
500        let mut state: u64 = 0xCAFE_BABE_DEAD_BEEF;
501        let lcg = |s: &mut u64| -> u32 {
502            *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
503            ((*s >> 32) as u32) % vocab_size
504        };
505        let make_batch = |state: &mut u64, lcg: &dyn Fn(&mut u64) -> u32| -> LMBatch {
506            let mut sequences = Vec::with_capacity(batch_size);
507            for _ in 0..batch_size {
508                let mut seq = Vec::with_capacity(seq_len + 1);
509                for _ in 0..(seq_len + 1) {
510                    seq.push(lcg(state));
511                }
512                sequences.push(seq);
513            }
514            LMBatch::from_sequences(&sequences, 0, 0)
515        };
516
517        let train_batch_data = make_batch(&mut state, &lcg);
518        let eval_batch_data = make_batch(&mut state, &lcg);
519
520        // Phase 1: eval BEFORE any training — establishes baseline.
521        let loss_a = trainer.eval_batch(&eval_batch_data);
522        assert!(
523            loss_a.is_finite() && loss_a >= 0.5,
524            "Phase 1 baseline: eval before any train must be sensible \
525             (got {loss_a}); test setup precondition failed before \
526             we can probe H1A. See test 001 for the same lower bound."
527        );
528
529        // Phase 2: train on a DIFFERENT batch — mutates logits_buf
530        // (KAIZEN-052 in-place gradient writeback) and runs optimizer_step.
531        let _train_loss = trainer.train_batch(&train_batch_data);
532
533        // Phase 3: eval on the SAME eval batch — same model state up
534        // to one optimizer step. loss_b should be close to loss_a.
535        let loss_b = trainer.eval_batch(&eval_batch_data);
536
537        // The optimizer step at lr=5e-5 (default finetune mode but our
538        // train_cfg uses lr=0.001 from TrainConfig::default) on ONE
539        // mini-batch can shift loss by maybe 5-30%. We accept any
540        // |loss_b - loss_a| / loss_a < 0.95 (i.e., loss_b doesn't drop
541        // by more than 95%) — generous to allow normal training
542        // dynamics. A drop to ~0 (factor of 1500× as observed in §60)
543        // would break this bound by orders of magnitude.
544        let rel_drop = (loss_a - loss_b).max(0.0) / loss_a;
545        assert!(
546            loss_b.is_finite(),
547            "eval_batch after train returned non-finite loss = {loss_b}; \
548             possible NaN propagation from train_batch's in-place gradient \
549             writeback contaminating subsequent eval forward."
550        );
551        assert!(
552            rel_drop < 0.95,
553            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1A train→eval \
554             state pollution): eval_batch loss dropped from {loss_a} to \
555             {loss_b} ({:.4}× relative drop) after a single train_batch \
556             on a DIFFERENT batch. A single optimizer step at typical \
557             learning rates cannot legitimately move loss by ≥95%. \
558             This indicates train_batch contaminates state that eval_batch \
559             reads (most likely the gpu_training.logits_buf via KAIZEN-052 \
560             in-place gradient writeback overlapping with the next \
561             gpu_forward GEMM). See \
562             evidence/section-60-5g-2-redispatch-2026-05-09/README.md \
563             for the 1500× train/val discrepancy this falsifier reproduces.",
564            rel_drop
565        );
566    }
567
568    /// FALSIFY-CUDA-FORWARD-PARITY-001 (the load-bearing H4D bisect):
569    /// On a populated Qwen 0.5B, `CudaTransformerTrainer::eval_batch`
570    /// MUST produce a finite, non-degenerate val_loss in the same
571    /// regime as the CPU `Transformer::forward` — i.e., in the
572    /// industry-baseline range for Qwen 0.5B on Python (~1.5–3.0).
573    ///
574    /// Concrete bound: when CPU forward produces logits with
575    /// peak-to-mean > 5 (PR #1602 evidence on populated Qwen,
576    /// argmax=9370), the corresponding CUDA path MUST produce
577    /// val_loss < `ln(vocab_size)` × 0.7 = ~12.0. A val_loss
578    /// approaching or exceeding `ln(vocab)` = 17.21 indicates
579    /// the CUDA path is anti-aligned (sub-random predictions).
580    ///
581    /// CONTEXT: SHIP-TWO §61 evidence (PR #1600) recorded
582    /// val_loss=18.55 at step 1 — *above* `ln(vocab)`. The bug
583    /// is in the CUDA forward path's missing bias-add operation:
584    /// `cuda_block.rs::CudaTransformerBlock` has no `b_q`/`b_k`/
585    /// `b_v` fields and `forward()` does pure gemms (lines 719-747)
586    /// without adding the trained Qwen Q/K/V biases.
587    ///
588    /// Pre-fix: this test fails with val_loss > 12 (CUDA path
589    /// drops biases → sub-random predictions).
590    /// Post-fix: passes with val_loss in the expected range.
591    ///
592    /// Host-gated: requires the canonical Qwen 0.5B init APR + the
593    /// 5g.1-v2 corpus on the lambda-vector RTX 4090 host.
594    #[test]
595    fn falsify_cuda_forward_parity_qwen_val_loss_below_ln_vocab() {
596        let init_path = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
597        if !init_path.exists() {
598            eprintln!(
599                "[falsify-cuda-forward-parity-001] skipping: host lacks {}",
600                init_path.display()
601            );
602            return;
603        }
604        let cfg = TransformerConfig::qwen2_0_5b();
605        let trainer_rc = match build_shared_cuda_trainer_with_init(
606            5.0e-5,
607            32,
608            42,
609            Some(&cfg),
610            Some(init_path),
611        ) {
612            Ok(t) => t,
613            Err(e) => {
614                eprintln!(
615                    "[falsify-cuda-forward-parity-001] skipping: \
616                     build_shared_cuda_trainer_with_init failed: {e:?} \
617                     (test requires --features cuda + a CUDA host)"
618                );
619                return;
620            }
621        };
622
623        // Build a tiny synthetic batch: 1 sequence × 16 tokens.
624        // Choose tokens deterministically; correctness doesn't
625        // depend on which Python tokens — just that the batch is
626        // valid and exercises the forward path end-to-end.
627        let seq = vec![100_u32; 17]; // 16 input + 1 target shift
628        let batch = LMBatch::from_sequences(&[seq], 0, 0);
629
630        let val_loss = trainer_rc.borrow_mut().eval_batch(&batch);
631        let ln_vocab = (cfg.vocab_size as f32).ln();
632        let upper_bound = ln_vocab * 0.7;
633        eprintln!(
634            "[falsify-cuda-forward-parity-001] val_loss={val_loss} ln(vocab)={ln_vocab} \
635             upper_bound (0.7×ln_vocab)={upper_bound}"
636        );
637
638        assert!(val_loss.is_finite(), "val_loss must be finite, got {val_loss}");
639        // The DOMINANT assertion: val_loss MUST be below 0.7×ln(vocab).
640        // CPU forward produces peak-to-mean=5.68 (PR #1602) → cross-
641        // entropy on a single deterministic token should be
642        // O(ln_vocab) at most for a clearly-confident model. The
643        // pre-fix CUDA path produces val_loss > ln_vocab because it
644        // drops Qwen's Q/K/V biases (cuda_block.rs lines 103-135 has
645        // no bias fields; lines 719-747 do bare gemms).
646        assert!(
647            val_loss < upper_bound,
648            "FALSIFY-CUDA-FORWARD-PARITY-001 (H4D): CUDA val_loss={val_loss} >= \
649             0.7×ln(vocab)={upper_bound}. Same Qwen weights produce \
650             peak-to-mean=5.68 on CPU forward (PR #1602 falsify_h4_cpu_forward_*) \
651             but CUDA produces sub-random predictions. Root cause: \
652             CudaTransformerBlock drops Qwen Q/K/V biases — struct has no bias \
653             fields (cuda_block.rs lines 103-135), forward does bare gemms \
654             (lines 719-747) without `cuda_add(q, b_q)` after each projection. \
655             See evidence/section-60-5g-2-redispatch-2026-05-09/ + this contract \
656             apr-pretrain-cuda-forward-parity-v1.yaml. Fix scope: add b_q/b_k/b_v \
657             fields, thread through with_model upload, apply bias-add after each \
658             Q/K/V gemm in forward."
659        );
660    }
661}
entrenar/train/pretrain_real_cuda.rs

entrenar/train/
pretrain_real_cuda.rs