entrenar/train/
pretrain_real_cuda.rs

1//! CUDA-backend `StepFn` / `ValFn` / `CheckpointFn` for the 370M pretrain
2//! loop (task #132 Phase 2, contract `gpu-training-backend-v1`).
3//!
4//! Mirrors `pretrain_real.rs` but swaps `TransformerTrainer`
5//! (CPU + trueno SIMD) for `CudaTransformerTrainer` (GPU-resident
6//! AdamW + fused CE). The entire module is gated on
7//! `#[cfg(feature = "cuda")]` because `CudaTransformerTrainer::new`
8//! / `train_batch` / `eval_batch` / `save_apr` only exist in the
9//! cuda build — the non-cuda stub returns an error from `new()` and
10//! exposes no step/eval/save methods.
11//!
12//! Contract obligations discharged / strengthened vs the CPU path:
13//! - INV-ARCH-370M-001 (param count ∈ [366M, 374M]) via `debug_assert`
14//!   on `CudaTransformerTrainer::model().parameters()`, matching
15//!   the CPU guard.
16//! - INV-TRAIN-007 (no NaN/Inf): `train_batch` / `eval_batch` return
17//!   finite loss by construction; non-finite outputs abort via
18//!   `PretrainLoop`'s guards.
19//! - INV-TRAIN-008 (grad_norm ≥ 0): `last_grad_norm()` returns the
20//!   real LM-head L2 norm. Strictly stronger than the CPU path's
21//!   `1.0` placeholder.
22//!
23//! Deferred to a follow-up:
24//! - INV-TRAIN-003 (AdamW-state sha256). `CudaTransformerTrainer`
25//!   keeps (m, v, t) on the GPU; discharging this cleanly needs a
26//!   D2H sync that `save_apr` already pays for but `StepFn` does
27//!   not want to pay per-step. Until that sync is factored out,
28//!   the trait default `optimizer_state_sha256 -> None` is used,
29//!   and GATE-TRAIN-006 runs only on the CPU path.
30
31#![cfg(feature = "cuda")]
32
33use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
34use crate::train::pretrain_real::{
35    build_transformer_config, llama_370m_train_config, load_init_tensors_from_apr,
36    populate_trainer_from_init_tensors, validate_pretrain_init_arch_compatible,
37};
38use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch, TransformerTrainConfig};
39use crate::transformer::{Transformer, TransformerConfig};
40use std::cell::RefCell;
41use std::path::Path;
42use std::rc::Rc;
43
/// Shared mutable ownership of a GPU-resident trainer. All three
/// hooks in this module — `CudaRealStepFn` (train steps),
/// `CudaRealValFn` (eval) and `CudaAprCheckpointFn` (checkpoint
/// writes) — hold a handle of this type, so they operate on the same
/// trainer instance. `Rc<RefCell<_>>` is single-threaded shared
/// ownership: overlapping `borrow_mut()` calls panic at runtime.
pub type SharedCudaTrainer = Rc<RefCell<CudaTransformerTrainer>>;
48
49/// Allocate a `CudaTransformerTrainer` with MODEL-2 v2-remedy defaults
50/// and verify INV-ARCH-370M-001 in debug builds.
51///
52/// Returns a `crate::Result` because `CudaTransformerTrainer::new`
53/// can fail on missing CUDA runtime, kernel pre-warm failure, or
54/// block upload failure — the CLI surfaces this as a
55/// GATE-GPUTRAIN-002 error so the operator knows to check their
56/// `--features cuda` build or their GPU.
57pub fn build_shared_cuda_trainer(
58    lr: f32,
59    seq_length: usize,
60    seed: u64,
61) -> crate::Result<SharedCudaTrainer> {
62    let cfg = llama_370m_train_config(lr, seq_length, seed);
63    let trainer = CudaTransformerTrainer::new(cfg)?;
64    #[cfg(debug_assertions)]
65    {
66        let param_count: usize = trainer.model().parameters().iter().map(|t| t.len()).sum();
67        debug_assert!(
68            (366_000_000..=374_000_000).contains(&param_count),
69            "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
70        );
71    }
72    Ok(Rc::new(RefCell::new(trainer)))
73}
74
75/// Polymorphic CUDA trainer builder for `apr pretrain --init --device cuda`
76/// (§50.4 step 5f.5 — symmetric to the CPU `build_shared_trainer_with_init`).
77///
78/// Composes the same §50.4 step-5f machinery as the CPU path, but runs
79/// it against `CudaTransformerTrainer::with_model` so the populated
80/// init weights flow through GPU upload (transformer blocks via
81/// `upload_blocks`, final RMSNorm via `from_host`, lm_head /
82/// embed_tokens.weight via `from_host`):
83///   - 5c: `build_transformer_config(init_arch)` — polymorphic dispatch
84///   - 5f.1: `validate_pretrain_init_arch_compatible(init_arch)` — encoder rejection
85///   - 5f.2: `load_init_tensors_from_apr(path)` — read APR weights
86///   - 5f.3: `populate_trainer_from_init_tensors(transformer, &tensors)` — populate CPU model
87///   - 5f.5: `CudaTransformerTrainer::with_model(populated_model, train_cfg)` — GPU upload
88///
89/// Behaviour:
90///   init = None  → identical to `build_shared_cuda_trainer` (Llama370M
91///                  from-scratch baseline with INV-ARCH-370M-001 enforced).
92///   init = Some  → builds a CUDA trainer whose GPU weights derive from
93///                  the populated CPU model (the populated `Transformer`
94///                  is moved into `with_model` which uploads its blocks /
95///                  norm / lm_head to GPU). INV-ARCH-370M-001 is NOT
96///                  enforced — arch is whatever the init APR has.
97///
98/// Spec: SPEC-SHIP-TWO-001 §52.4 (CPU 5f.4 wireup) + §54-§56 (Qwen
99/// 5g.0/5g.1 prerequisites) + this §50.4 step 5f.5 (CUDA wireup).
100///
101/// # Errors
102///
103/// Returns Err when:
104/// - `init_arch.is_some() != init_path.is_some()` (caller bug — same
105///   diagnostic as the CPU path's `build_shared_trainer_with_init`).
106/// - `init_arch` is `Some` with `architecture = Encoder`
107///   (FALSIFY-APR-PRETRAIN-ARCH-007 / FALSIFY-APR-PRETRAIN-INIT-001).
108/// - `load_init_tensors_from_apr` fails (FALSIFY-APR-PRETRAIN-INIT-006).
109/// - `populate_trainer_from_init_tensors` fails (FALSIFY-APR-PRETRAIN-INIT-007).
110/// - `CudaTransformerTrainer::with_model` fails (CUDA init / kernel
111///   pre-warm / block upload — surfaces as GATE-GPUTRAIN-002).
112///
113/// # Caller Contract
114///
115/// The caller MUST have built the binary with `--features cuda`. This
116/// function is gated on `#[cfg(feature = "cuda")]` so a non-cuda build
117/// will not see this symbol; the apr-cli dispatch layer routes
118/// `--device cuda` to `drive_real_cuda` which calls this builder, and
119/// the non-cuda stub for `drive_real_cuda` already returns the
120/// rebuild-with-cuda error per `feedback_cuda_feature_footgun.md`.
121pub fn build_shared_cuda_trainer_with_init(
122    lr: f32,
123    seq_length: usize,
124    seed: u64,
125    init_arch: Option<&TransformerConfig>,
126    init_path: Option<&Path>,
127) -> crate::Result<SharedCudaTrainer> {
128    if init_arch.is_some() != init_path.is_some() {
129        return Err(crate::error::Error::ConfigError(format!(
130            "build_shared_cuda_trainer_with_init: init_arch and init_path must both be Some \
131             or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
132            init_arch.is_some(),
133            init_path.is_some()
134        )));
135    }
136
137    if let Some(cfg) = init_arch {
138        validate_pretrain_init_arch_compatible(cfg).map_err(crate::error::Error::ConfigError)?;
139    }
140
141    let model_cfg = build_transformer_config(init_arch);
142    let mut train_cfg = TransformerTrainConfig::new(model_cfg);
143    train_cfg.lr = lr;
144    train_cfg.max_seq_len = seq_length;
145    train_cfg.seed = seed;
146
147    // Build the CPU model first; populate init weights into it; then
148    // hand it to CudaTransformerTrainer::with_model which uploads the
149    // populated blocks, final RMSNorm, and lm_head/embed_tokens to GPU.
150    // This is the symmetric path to CPU's build_shared_trainer_with_init,
151    // exercising the SAME populate_trainer_from_init_tensors helper so
152    // the population semantics are identical between backends.
153    let mut transformer = Transformer::new(&train_cfg.model_config);
154
155    if let Some(path) = init_path {
156        let tensors = load_init_tensors_from_apr(path).map_err(crate::error::Error::ConfigError)?;
157        populate_trainer_from_init_tensors(&mut transformer, &tensors)
158            .map_err(crate::error::Error::ConfigError)?;
159    } else {
160        // From-scratch CUDA path with init=None: enforce the
161        // INV-ARCH-370M-001 param-count band. Mirrors the CPU
162        // `build_shared_trainer` invariant exactly.
163        #[cfg(debug_assertions)]
164        {
165            let param_count: usize = transformer.parameters().iter().map(|t| t.len()).sum();
166            debug_assert!(
167                (366_000_000..=374_000_000).contains(&param_count),
168                "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band \
169                 (from-scratch CUDA path with init=None)",
170            );
171        }
172    }
173
174    let trainer = CudaTransformerTrainer::with_model(transformer, train_cfg)?;
175    Ok(Rc::new(RefCell::new(trainer)))
176}
177
178/// CUDA `StepFn` — pulls one `LMBatch` from the shard iterator and
179/// runs a real GPU forward + backward + AdamW step.
180pub struct CudaRealStepFn {
181    trainer: SharedCudaTrainer,
182    batches: Box<dyn Iterator<Item = LMBatch>>,
183}
184
185impl CudaRealStepFn {
186    pub fn new(trainer: SharedCudaTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
187        Self { trainer, batches }
188    }
189}
190
191impl StepFn for CudaRealStepFn {
192    fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
193        // Exhausted shard stream: emit a finite placeholder so the
194        // NaN/Inf guard (INV-TRAIN-007) doesn't mis-fire and the
195        // divergence guard (GATE-TRAIN-005) correctly does not abort.
196        let Some(batch) = self.batches.next() else {
197            return (1.0, 1.0);
198        };
199        let mut trainer = self.trainer.borrow_mut();
200        let loss = trainer.train_batch(&batch);
201        // Real LM-head L2 norm — strictly more informative than the
202        // CPU path's `1.0` placeholder for GATE-TRAIN-008 monitoring.
203        let grad_norm = trainer.last_grad_norm();
204        (loss, grad_norm)
205    }
206
207    // INV-TRAIN-003 intentionally deferred for the GPU path — see
208    // module docs. Uses trait default `-> None`, so the CPU gate
209    // (`--device cpu`) is the one that exercises AdamW-state parity.
210}
211
212/// CUDA `ValFn` — forward-only eval across pre-loaded held-out
213/// batches. Uses `eval_batch` (fused GPU cross-entropy, no logits
214/// D2H) and averages across batches.
215pub struct CudaRealValFn {
216    trainer: SharedCudaTrainer,
217    held_out: Vec<LMBatch>,
218}
219
220impl CudaRealValFn {
221    pub fn new(trainer: SharedCudaTrainer, held_out: Vec<LMBatch>) -> Self {
222        Self { trainer, held_out }
223    }
224}
225
226impl ValFn for CudaRealValFn {
227    fn validate(&mut self, _epoch: usize) -> f32 {
228        if self.held_out.is_empty() {
229            return f32::NAN;
230        }
231        let mut trainer = self.trainer.borrow_mut();
232        let mut total_loss = 0.0_f32;
233        let mut count = 0_usize;
234        for batch in &self.held_out {
235            if batch.batch_size == 0 {
236                continue;
237            }
238            total_loss += trainer.eval_batch(batch);
239            count += 1;
240        }
241        if count == 0 {
242            f32::NAN
243        } else {
244            total_loss / count as f32
245        }
246    }
247}
248
249/// CUDA `CheckpointFn` — writes the 370M weights to
250/// `artifact.checkpoint_path` in APR format. `save_apr` takes
251/// `&mut self` on the CUDA path because it syncs GPU→CPU before
252/// writing, which is why this holds the `SharedCudaTrainer` instead
253/// of cloning the trainer out.
254pub struct CudaAprCheckpointFn {
255    trainer: SharedCudaTrainer,
256    model_name: String,
257    architecture: String,
258    /// SPEC-SHIP-TWO-001 §81 P0-D: optional tokenizer directory whose
259    /// tokenizer.json is embedded into every checkpoint via
260    /// `tokenizer.vocabulary` + `tokenizer.merges` metadata keys.
261    /// When None, checkpoints are written without an embedded tokenizer
262    /// (legacy behavior; `apr qa` will fail with C-03/embedded-tokenizer
263    /// gate per §81 — left as caller's responsibility).
264    tokenizer_dir: Option<std::path::PathBuf>,
265}
266
267impl CudaAprCheckpointFn {
268    pub fn new(
269        trainer: SharedCudaTrainer,
270        model_name: impl Into<String>,
271        architecture: impl Into<String>,
272    ) -> Self {
273        Self {
274            trainer,
275            model_name: model_name.into(),
276            architecture: architecture.into(),
277            tokenizer_dir: None,
278        }
279    }
280
281    /// SPEC-SHIP-TWO-001 §81 P0-D: builder for embedding the tokenizer
282    /// in every checkpoint write. Pass `--tokenizer <DIR>` through here
283    /// so `apr qa <epoch-N.apr>` can run inference without an external
284    /// tokenizer file.
285    pub fn with_tokenizer_dir(mut self, dir: impl Into<std::path::PathBuf>) -> Self {
286        self.tokenizer_dir = Some(dir.into());
287        self
288    }
289}
290
291impl CheckpointFn for CudaAprCheckpointFn {
292    fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
293        let mut trainer = self.trainer.borrow_mut();
294        trainer
295            .save_apr_with_tokenizer(
296                &artifact.checkpoint_path,
297                &self.model_name,
298                &self.architecture,
299                self.tokenizer_dir.as_deref(),
300            )
301            .map_err(|e| format!("save_apr (cuda) failed: {e}"))
302    }
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Multiplier of the 64-bit LCG used for synthetic tokens (the
    /// constants are those of Knuth's MMIX generator).
    const LCG_MULTIPLIER: u64 = 6364136223846793005;
    /// Increment of the same LCG.
    const LCG_INCREMENT: u64 = 1442695040888963407;

    /// Advance the LCG `state` by one step and map its high 32 bits to a
    /// token id in `0..vocab_size`. Panics if `vocab_size == 0`.
    fn next_token(state: &mut u64, vocab_size: u32) -> u32 {
        *state = state.wrapping_mul(LCG_MULTIPLIER).wrapping_add(LCG_INCREMENT);
        ((*state >> 32) as u32) % vocab_size
    }

    /// Build a reproducible batch of `batch_size` sequences, each holding
    /// `seq_len + 1` tokens (inputs plus the one-token target shift),
    /// drawn from the LCG. `state` is advanced in place, so consecutive
    /// calls with the same `state` yield different batches.
    fn synthetic_batch(
        state: &mut u64,
        vocab_size: u32,
        batch_size: usize,
        seq_len: usize,
    ) -> LMBatch {
        let mut sequences: Vec<Vec<u32>> = Vec::with_capacity(batch_size);
        for _ in 0..batch_size {
            let seq: Vec<u32> = (0..=seq_len).map(|_| next_token(state, vocab_size)).collect();
            sequences.push(seq);
        }
        LMBatch::from_sequences(&sequences, 0, 0)
    }

    /// Unwrap the message of a `ConfigError`, panicking with the actual
    /// outcome otherwise. The `Ok` payload is never formatted because
    /// `CudaTransformerTrainer` is not `Debug` (which is also why
    /// `unwrap_err()` cannot be used on these results).
    fn expect_config_error(result: crate::Result<SharedCudaTrainer>) -> String {
        match result {
            Err(crate::error::Error::ConfigError(message)) => message,
            Err(other) => panic!("expected ConfigError, got a different error: {other:?}"),
            Ok(_) => panic!("expected ConfigError, got Ok(trainer)"),
        }
    }

    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-002 (paired-args invariant):
    /// `build_shared_cuda_trainer_with_init` MUST reject the
    /// (Some, None) and (None, Some) caller-bug states identically
    /// to the CPU `build_shared_trainer_with_init`. The two fields
    /// are paired by construction — separately optional fields are
    /// a defect class because they let a caller pass an arch
    /// without weights (silent random-init at the GPU boundary) or
    /// weights without an arch (silently fall back to Llama370M).
    ///
    /// This test fires WITHOUT a CUDA device — the args check
    /// happens before any GPU allocation — so it runs on any host
    /// even when the CUDA runtime is unavailable.
    #[test]
    fn build_shared_cuda_trainer_with_init_rejects_unpaired_args() {
        // Arch without path — Err. Qwen 0.5B is a concrete
        // non-Llama370M decoder config, which shows the paired-args
        // gate fires before any architectural inspection.
        let cfg = TransformerConfig::qwen2_0_5b();
        let arch_only = build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&cfg), None);
        assert!(arch_only.is_err(), "(Some(arch), None) MUST Err — caller-bug guard");

        // Path without arch — Err. The path is never opened.
        let dummy = Path::new("/tmp/does-not-exist.apr");
        let path_only = build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, None, Some(dummy));
        assert!(path_only.is_err(), "(None, Some(path)) MUST Err — caller-bug guard");

        // Both messages must name the function so callers can grep
        // back to the offending invocation.
        for message in [expect_config_error(arch_only), expect_config_error(path_only)] {
            assert!(
                message.contains("build_shared_cuda_trainer_with_init"),
                "Err MUST name the function for grep-ability: {message}"
            );
        }
    }

    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-003 (encoder family rejection):
    /// passing an Encoder-architecture init config to
    /// `build_shared_cuda_trainer_with_init` MUST Err — same semantic
    /// as the CPU path's `validate_pretrain_init_arch_compatible`.
    /// This proves the symmetric builder threads the §50.4 step 5f.1
    /// encoder rejection through the CUDA backend.
    ///
    /// Fires WITHOUT a CUDA device — the encoder check happens
    /// before any GPU allocation.
    #[test]
    fn build_shared_cuda_trainer_with_init_rejects_encoder_family() {
        use crate::transformer::ModelArchitecture;

        let mut encoder_cfg = TransformerConfig::qwen2_0_5b();
        encoder_cfg.architecture = ModelArchitecture::Encoder;
        let dummy = Path::new("/tmp/does-not-exist.apr");
        let result =
            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&encoder_cfg), Some(dummy));
        assert!(result.is_err(), "Encoder-family init MUST Err under §50.4 step 5f.1");
    }

    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 sanity bound):
    /// `CudaTransformerTrainer::eval_batch` on a fresh-init trainer
    /// (random weights) over a synthetic batch of LCG-drawn tokens
    /// MUST return a loss in a sensible range.
    ///
    /// Theoretical expectation: a random-init model over roughly
    /// uniform targets produces average cross-entropy near
    /// `ln(vocab_size)` (≈ 6.91 if the tiny config's vocab is 1000 —
    /// TODO confirm). The accepted window is deliberately wide:
    /// `[0.5, 1.5 × ln(vocab)]`, where the lower bound is an ABSOLUTE
    /// 0.5 nats, not a multiple of `ln(vocab)`.
    ///
    /// LIVE EVIDENCE motivating this test (this branch's parent):
    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
    /// recorded a 1500× train/eval discrepancy at the same model
    /// state (epoch 0: train_loss=1.20 vs val_loss=0.00081). The
    /// gap survived PR #1579's H2 (populate-coverage) fix, confirming
    /// H1 (eval_batch degenerate) is independent of H2.
    ///
    /// This test reproduces the bug at unit-test level: if H1 is
    /// real, eval_batch on a tiny random-init model returns ~0
    /// instead of ~ln(vocab_size). The test is gated on
    /// `--features cuda` so CI without that flag does not see it;
    /// `cargo test -p aprender-train --features cuda --lib
    /// falsify_eval_batch_h1_sanity_bound` reproduces.
    ///
    /// Spec: SPEC-SHIP-TWO-001 §60 (forthcoming) H1 root-cause cascade.
    #[test]
    fn falsify_eval_batch_h1_sanity_bound() {
        // Tiny model so the test runs in a few seconds on RTX 4090.
        let model_cfg = TransformerConfig::tiny();
        let vocab_size = model_cfg.vocab_size as u32;
        let train_cfg = TransformerTrainConfig::new(model_cfg);

        // Build trainer with random init. Skip the test (rather than
        // panic) if CUDA is unavailable on the host — the falsifier is
        // host-dependent.
        let mut trainer = match CudaTransformerTrainer::new(train_cfg) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify_eval_batch_h1_sanity_bound] skipping: \
                     CudaTransformerTrainer::new failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // Synthetic batch: 4 sequences × 16 tokens each (+1 target
        // shift), drawn from a deterministic LCG so the test is
        // reproducible.
        let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
        let batch = synthetic_batch(&mut state, vocab_size, 4, 16);

        let loss = trainer.eval_batch(&batch);
        let ln_vocab = (vocab_size as f32).ln();
        let lower_bound = 0.5_f32;
        let upper_bound = 1.5_f32 * ln_vocab;

        // Finite-ness is checked FIRST: every comparison against NaN is
        // false, so a NaN loss would otherwise trip the lower-bound
        // assert below and be misreported as a degenerate (near-zero)
        // eval.
        assert!(loss.is_finite(), "eval_batch returned non-finite loss = {loss}");
        assert!(
            loss >= lower_bound,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 lower bound): \
             eval_batch on random-init {vocab_size}-vocab tiny model returned \
             loss = {loss}, expected ≥ {lower_bound} (random-init theoretical \
             ≈ ln({vocab_size}) = {ln_vocab:.3}). Loss < 0.5 indicates \
             eval pipeline is degenerate (cross-entropy collapsing to 0); \
             see evidence/section-60-5g-2-redispatch-2026-05-09/ for the \
             1500× train/eval discrepancy that motivated this falsifier."
        );
        assert!(
            loss <= upper_bound,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 upper bound): \
             eval_batch returned loss = {loss}, expected ≤ {upper_bound:.3} \
             (1.5 × ln(vocab)). Loss > upper_bound suggests numerical \
             explosion, a separate defect class from the lower-bound H1."
        );
    }

    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1 hypothesis A —
    /// train→eval state pollution): the val_loss anomaly observed in
    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
    /// fired at EPOCH 0 — i.e., AFTER 100 train_batch calls, not on
    /// a fresh trainer. This test exercises that ordering directly:
    /// eval_batch BEFORE training (loss_a, sanity), then train_batch,
    /// then eval_batch on the same evaluation batch (loss_b). The
    /// two losses should differ by AT MOST the effect of one
    /// optimizer step on one mini-batch.
    ///
    /// If H1 hypothesis A (logits_buf state contamination) is real,
    /// loss_b will be much smaller than loss_a even though the model
    /// only changed by one optimizer step. The 1500× train/val
    /// discrepancy in §59/§60 evidence implies loss_b/loss_a ~ 1/1500.
    #[test]
    fn falsify_eval_batch_h1_train_pollution() {
        let model_cfg = TransformerConfig::tiny();
        let vocab_size = model_cfg.vocab_size as u32;
        let train_cfg = TransformerTrainConfig::new(model_cfg);

        let mut trainer = match CudaTransformerTrainer::new(train_cfg) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify_eval_batch_h1_train_pollution] skipping: \
                     CudaTransformerTrainer::new failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // Two DIFFERENT batches from one running LCG state: the first
        // draw is trained on, the second is evaluated.
        let mut state: u64 = 0xCAFE_BABE_DEAD_BEEF;
        let train_batch_data = synthetic_batch(&mut state, vocab_size, 4, 16);
        let eval_batch_data = synthetic_batch(&mut state, vocab_size, 4, 16);

        // Phase 1: eval BEFORE any training — establishes baseline.
        let loss_a = trainer.eval_batch(&eval_batch_data);
        assert!(
            loss_a.is_finite() && loss_a >= 0.5,
            "Phase 1 baseline: eval before any train must be sensible \
             (got {loss_a}); test setup precondition failed before \
             we can probe H1A. See test 001 for the same lower bound."
        );

        // Phase 2: train on a DIFFERENT batch — per the original
        // investigation notes this mutates logits_buf (KAIZEN-052
        // in-place gradient writeback) and runs the optimizer step.
        let _train_loss = trainer.train_batch(&train_batch_data);

        // Phase 3: eval on the SAME eval batch — same model state up
        // to one optimizer step. loss_b should be close to loss_a.
        let loss_b = trainer.eval_batch(&eval_batch_data);
        assert!(
            loss_b.is_finite(),
            "eval_batch after train returned non-finite loss = {loss_b}; \
             possible NaN propagation from train_batch's in-place gradient \
             writeback contaminating subsequent eval forward."
        );

        // One optimizer step on ONE mini-batch, at whatever learning
        // rate `TransformerTrainConfig::new` defaults to (the original
        // note says 0.001 — TODO confirm), can plausibly shift loss by
        // some tens of percent. The bound is generous: loss_b must not
        // fall by 95% or more. A collapse to ~0 (the 1500× factor
        // observed in §60) breaks it by orders of magnitude. Division
        // is safe because Phase 1 asserted loss_a >= 0.5.
        let rel_drop = (loss_a - loss_b).max(0.0) / loss_a;
        assert!(
            rel_drop < 0.95,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1A train→eval \
             state pollution): eval_batch loss dropped from {loss_a} to \
             {loss_b} (relative drop {rel_drop:.4}) after a single train_batch \
             on a DIFFERENT batch. A single optimizer step at typical \
             learning rates cannot legitimately move loss by ≥95%. \
             This indicates train_batch contaminates state that eval_batch \
             reads (most likely the gpu_training.logits_buf via KAIZEN-052 \
             in-place gradient writeback overlapping with the next \
             gpu_forward GEMM). See \
             evidence/section-60-5g-2-redispatch-2026-05-09/README.md \
             for the 1500× train/val discrepancy this falsifier reproduces."
        );
    }

    /// FALSIFY-CUDA-FORWARD-PARITY-001 (the load-bearing H4D bisect):
    /// On a populated Qwen 0.5B, `CudaTransformerTrainer::eval_batch`
    /// MUST produce a finite, non-degenerate val_loss in the same
    /// regime as the CPU `Transformer::forward`.
    ///
    /// Concrete bound enforced below: val_loss < 0.7 × `ln(vocab_size)`.
    /// NOTE(review): for Qwen2's 151,936-token vocabulary
    /// `ln(vocab)` ≈ 11.93, so the bound is ≈ 8.35; the figure 17.21
    /// quoted in earlier notes is `log2(vocab)`, not `ln(vocab)` —
    /// confirm against `TransformerConfig::qwen2_0_5b().vocab_size`.
    /// A val_loss approaching or exceeding `ln(vocab)` indicates the
    /// CUDA path is anti-aligned (sub-random predictions).
    ///
    /// CONTEXT: SHIP-TWO §61 evidence (PR #1600) recorded
    /// val_loss=18.55 at step 1 — *above* `ln(vocab)`. The recorded
    /// root cause is a missing bias-add in the CUDA forward path:
    /// `cuda_block.rs::CudaTransformerBlock` has no `b_q`/`b_k`/
    /// `b_v` fields and `forward()` does pure gemms (lines 719-747)
    /// without adding the trained Qwen Q/K/V biases.
    ///
    /// Pre-fix: this test fails with val_loss above the bound (CUDA
    /// path drops biases → sub-random predictions).
    /// Post-fix: passes with val_loss in the expected range.
    ///
    /// Host-gated: requires the canonical Qwen 0.5B init APR on the
    /// lambda-vector RTX 4090 host; the test skips when the file or a
    /// working CUDA trainer is unavailable.
    #[test]
    fn falsify_cuda_forward_parity_qwen_val_loss_below_ln_vocab() {
        let init_path = Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
        if !init_path.exists() {
            eprintln!(
                "[falsify-cuda-forward-parity-001] skipping: host lacks {}",
                init_path.display()
            );
            return;
        }
        let cfg = TransformerConfig::qwen2_0_5b();
        let trainer_rc = match build_shared_cuda_trainer_with_init(
            5.0e-5,
            32,
            42,
            Some(&cfg),
            Some(init_path),
        ) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify-cuda-forward-parity-001] skipping: \
                     build_shared_cuda_trainer_with_init failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // Tiny synthetic batch: 1 sequence × 16 tokens. The token
        // choice is arbitrary but deterministic; the batch only has to
        // be valid and exercise the forward path end-to-end.
        let seq = vec![100_u32; 17]; // 16 input + 1 target shift
        let batch = LMBatch::from_sequences(&[seq], 0, 0);

        let val_loss = trainer_rc.borrow_mut().eval_batch(&batch);
        let ln_vocab = (cfg.vocab_size as f32).ln();
        let upper_bound = ln_vocab * 0.7;
        eprintln!(
            "[falsify-cuda-forward-parity-001] val_loss={val_loss} ln(vocab)={ln_vocab} \
             upper_bound (0.7×ln_vocab)={upper_bound}"
        );

        assert!(val_loss.is_finite(), "val_loss must be finite, got {val_loss}");
        // The DOMINANT assertion: val_loss MUST be below 0.7×ln(vocab).
        // CPU forward produces peak-to-mean=5.68 (PR #1602), so a
        // clearly-confident model should stay well under ln_vocab on a
        // single deterministic token. The pre-fix CUDA path produces
        // val_loss > ln_vocab because it drops Qwen's Q/K/V biases
        // (cuda_block.rs lines 103-135 has no bias fields; lines
        // 719-747 do bare gemms).
        assert!(
            val_loss < upper_bound,
            "FALSIFY-CUDA-FORWARD-PARITY-001 (H4D): CUDA val_loss={val_loss} >= \
             0.7×ln(vocab)={upper_bound}. Same Qwen weights produce \
             peak-to-mean=5.68 on CPU forward (PR #1602 falsify_h4_cpu_forward_*) \
             but CUDA produces sub-random predictions. Root cause: \
             CudaTransformerBlock drops Qwen Q/K/V biases — struct has no bias \
             fields (cuda_block.rs lines 103-135), forward does bare gemms \
             (lines 719-747) without `cuda_add(q, b_q)` after each projection. \
             See evidence/section-60-5g-2-redispatch-2026-05-09/ + this contract \
             apr-pretrain-cuda-forward-parity-v1.yaml. Fix scope: add b_q/b_k/b_v \
             fields, thread through with_model upload, apply bias-add after each \
             Q/K/V gemm in forward."
        );
    }
}
entrenar/train/pretrain_real_cuda.rs

entrenar/train/
pretrain_real_cuda.rs