aprender-train 0.38.0

Training & Optimization library with autograd, LoRA, quantization, and model merging
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
//! CUDA-backend `StepFn` / `ValFn` / `CheckpointFn` for the 370M pretrain
//! loop (task #132 Phase 2, contract `gpu-training-backend-v1`).
//!
//! Mirrors `pretrain_real.rs` but swaps `TransformerTrainer`
//! (CPU + trueno SIMD) for `CudaTransformerTrainer` (GPU-resident
//! AdamW + fused CE). The entire module is gated on
//! `#[cfg(feature = "cuda")]` because `CudaTransformerTrainer::new`
//! / `train_batch` / `eval_batch` / `save_apr` only exist in the
//! cuda build — the non-cuda stub returns an error from `new()` and
//! exposes no step/eval/save methods.
//!
//! Contract obligations discharged / strengthened vs the CPU path:
//! - INV-ARCH-370M-001 (param count ∈ [366M, 374M]) via `debug_assert`
//!   on `CudaTransformerTrainer::model().parameters()`, matching
//!   the CPU guard.
//! - INV-TRAIN-007 (no NaN/Inf): `train_batch` / `eval_batch` return
//!   finite loss by construction; non-finite outputs abort via
//!   `PretrainLoop`'s guards.
//! - INV-TRAIN-008 (grad_norm ≥ 0): `last_grad_norm()` returns the
//!   real LM-head L2 norm. Strictly stronger than the CPU path's
//!   `1.0` placeholder.
//!
//! Deferred to a follow-up:
//! - INV-TRAIN-003 (AdamW-state sha256). `CudaTransformerTrainer`
//!   keeps (m, v, t) on the GPU; discharging this cleanly needs a
//!   D2H sync that `save_apr` already pays for but `StepFn` does
//!   not want to pay per-step. Until that sync is factored out,
//!   the trait default `optimizer_state_sha256 -> None` is used,
//!   and GATE-TRAIN-006 runs only on the CPU path.

#![cfg(feature = "cuda")]

use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
use crate::train::pretrain_real::{
    build_transformer_config, llama_370m_train_config, load_init_tensors_from_apr,
    populate_trainer_from_init_tensors, validate_pretrain_init_arch_compatible,
};
use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch, TransformerTrainConfig};
use crate::transformer::{Transformer, TransformerConfig};
use std::cell::RefCell;
use std::path::Path;
use std::rc::Rc;

/// Shared, interior-mutable handle to a single GPU-resident trainer.
///
/// `CudaRealStepFn` (train steps), `CudaRealValFn` (eval) and
/// `CudaAprCheckpointFn` (checkpoint writes) are each constructed with
/// one of these handles. Passing clones of the same `Rc` to all three
/// hooks makes them operate on the same trainer instance, and therefore
/// the same GPU memory. Each hook takes `borrow_mut()` inside its own
/// call, so the hooks must not be invoked re-entrantly.
pub type SharedCudaTrainer = Rc<RefCell<CudaTransformerTrainer>>;

/// Build the from-scratch 370M trainer on the GPU and wrap it in a
/// [`SharedCudaTrainer`] handle.
///
/// The training configuration comes from
/// `llama_370m_train_config(lr, seq_length, seed)` (the MODEL-2
/// v2-remedy defaults). In debug builds the freshly built model is
/// checked against INV-ARCH-370M-001: its total parameter count must
/// fall inside the inclusive band [366M, 374M].
///
/// # Errors
///
/// Propagates whatever `CudaTransformerTrainer::new` reports — per the
/// surrounding design notes that covers a missing CUDA runtime, a
/// kernel pre-warm failure, or a block upload failure. The CLI surfaces
/// this as a GATE-GPUTRAIN-002 error so the operator knows to check
/// their `--features cuda` build or their GPU.
pub fn build_shared_cuda_trainer(
    lr: f32,
    seq_length: usize,
    seed: u64,
) -> crate::Result<SharedCudaTrainer> {
    let trainer = CudaTransformerTrainer::new(llama_370m_train_config(lr, seq_length, seed))?;

    // Architecture sanity check; compiled out of release builds.
    #[cfg(debug_assertions)]
    {
        const PARAM_BAND: std::ops::RangeInclusive<usize> = 366_000_000..=374_000_000;
        let param_count = trainer
            .model()
            .parameters()
            .iter()
            .map(|tensor| tensor.len())
            .sum::<usize>();
        debug_assert!(
            PARAM_BAND.contains(&param_count),
            "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
        );
    }

    let shared = Rc::new(RefCell::new(trainer));
    Ok(shared)
}

/// Polymorphic CUDA trainer builder for `apr pretrain --init --device cuda`
/// (§50.4 step 5f.5 — symmetric to the CPU `build_shared_trainer_with_init`).
///
/// Composes the same §50.4 step-5f machinery as the CPU path, but runs
/// it against `CudaTransformerTrainer::with_model` so the populated
/// init weights flow through GPU upload (transformer blocks via
/// `upload_blocks`, final RMSNorm via `from_host`, lm_head /
/// embed_tokens.weight via `from_host`):
///   - 5c: `build_transformer_config(init_arch)` — polymorphic dispatch
///   - 5f.1: `validate_pretrain_init_arch_compatible(init_arch)` — encoder rejection
///   - 5f.2: `load_init_tensors_from_apr(path)` — read APR weights
///   - 5f.3: `populate_trainer_from_init_tensors(transformer, &tensors)` — populate CPU model
///   - 5f.5: `CudaTransformerTrainer::with_model(populated_model, train_cfg)` — GPU upload
///
/// Behaviour:
///   init = None  → identical to `build_shared_cuda_trainer` (Llama370M
///                  from-scratch baseline with INV-ARCH-370M-001 enforced).
///   init = Some  → builds a CUDA trainer whose GPU weights derive from
///                  the populated CPU model (the populated `Transformer`
///                  is moved into `with_model` which uploads its blocks /
///                  norm / lm_head to GPU). INV-ARCH-370M-001 is NOT
///                  enforced — arch is whatever the init APR has.
///
/// Spec: SPEC-SHIP-TWO-001 §52.4 (CPU 5f.4 wireup) + §54-§56 (Qwen
/// 5g.0/5g.1 prerequisites) + this §50.4 step 5f.5 (CUDA wireup).
///
/// # Errors
///
/// Returns Err when:
/// - `init_arch.is_some() != init_path.is_some()` (caller bug — same
///   diagnostic as the CPU path's `build_shared_trainer_with_init`).
/// - `init_arch` is `Some` with `architecture = Encoder`
///   (FALSIFY-APR-PRETRAIN-ARCH-007 / FALSIFY-APR-PRETRAIN-INIT-001).
/// - `load_init_tensors_from_apr` fails (FALSIFY-APR-PRETRAIN-INIT-006).
/// - `populate_trainer_from_init_tensors` fails (FALSIFY-APR-PRETRAIN-INIT-007).
/// - `CudaTransformerTrainer::with_model` fails (CUDA init / kernel
///   pre-warm / block upload — surfaces as GATE-GPUTRAIN-002).
///
/// # Caller Contract
///
/// The caller MUST have built the binary with `--features cuda`. This
/// function is gated on `#[cfg(feature = "cuda")]` so a non-cuda build
/// will not see this symbol; the apr-cli dispatch layer routes
/// `--device cuda` to `drive_real_cuda` which calls this builder, and
/// the non-cuda stub for `drive_real_cuda` already returns the
/// rebuild-with-cuda error per `feedback_cuda_feature_footgun.md`.
pub fn build_shared_cuda_trainer_with_init(
    lr: f32,
    seq_length: usize,
    seed: u64,
    init_arch: Option<&TransformerConfig>,
    init_path: Option<&Path>,
) -> crate::Result<SharedCudaTrainer> {
    if init_arch.is_some() != init_path.is_some() {
        return Err(crate::error::Error::ConfigError(format!(
            "build_shared_cuda_trainer_with_init: init_arch and init_path must both be Some \
             or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
            init_arch.is_some(),
            init_path.is_some()
        )));
    }

    if let Some(cfg) = init_arch {
        validate_pretrain_init_arch_compatible(cfg).map_err(crate::error::Error::ConfigError)?;
    }

    let model_cfg = build_transformer_config(init_arch);
    let mut train_cfg = TransformerTrainConfig::new(model_cfg);
    train_cfg.lr = lr;
    train_cfg.max_seq_len = seq_length;
    train_cfg.seed = seed;

    // Build the CPU model first; populate init weights into it; then
    // hand it to CudaTransformerTrainer::with_model which uploads the
    // populated blocks, final RMSNorm, and lm_head/embed_tokens to GPU.
    // This is the symmetric path to CPU's build_shared_trainer_with_init,
    // exercising the SAME populate_trainer_from_init_tensors helper so
    // the population semantics are identical between backends.
    let mut transformer = Transformer::new(&train_cfg.model_config);

    if let Some(path) = init_path {
        let tensors = load_init_tensors_from_apr(path).map_err(crate::error::Error::ConfigError)?;
        populate_trainer_from_init_tensors(&mut transformer, &tensors)
            .map_err(crate::error::Error::ConfigError)?;
    } else {
        // From-scratch CUDA path with init=None: enforce the
        // INV-ARCH-370M-001 param-count band. Mirrors the CPU
        // `build_shared_trainer` invariant exactly.
        #[cfg(debug_assertions)]
        {
            let param_count: usize = transformer.parameters().iter().map(|t| t.len()).sum();
            debug_assert!(
                (366_000_000..=374_000_000).contains(&param_count),
                "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band \
                 (from-scratch CUDA path with init=None)",
            );
        }
    }

    let trainer = CudaTransformerTrainer::with_model(transformer, train_cfg)?;
    Ok(Rc::new(RefCell::new(trainer)))
}

/// CUDA `StepFn` — pulls one `LMBatch` from the shard iterator and
/// runs a real GPU forward + backward + AdamW step.
pub struct CudaRealStepFn {
    /// Shared trainer handle; mutably borrowed for the duration of each
    /// `step` call that has a batch to process.
    trainer: SharedCudaTrainer,
    /// Source of training batches. Exactly one item is pulled per `step`
    /// call; once it is exhausted `step` returns a fixed placeholder.
    batches: Box<dyn Iterator<Item = LMBatch>>,
}

impl CudaRealStepFn {
    /// Create a step hook that trains `trainer` on the items of `batches`.
    ///
    /// * `trainer` — shared trainer handle; pass a clone of the same `Rc`
    ///   that the validation and checkpoint hooks receive so all hooks
    ///   act on one trainer.
    /// * `batches` — training batch stream; one item is consumed per
    ///   `StepFn::step` call.
    ///
    /// Construction performs no GPU work. The returned hook does nothing
    /// until it is driven, hence `#[must_use]`.
    #[must_use]
    pub fn new(trainer: SharedCudaTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
        Self { trainer, batches }
    }
}

impl StepFn for CudaRealStepFn {
    /// Run one optimizer step on the next batch and report
    /// `(loss, grad_norm)`.
    ///
    /// When the batch stream is exhausted the hook returns the finite
    /// placeholder `(1.0, 1.0)` instead of touching the trainer, so the
    /// NaN/Inf guard (INV-TRAIN-007) does not mis-fire and the divergence
    /// guard (GATE-TRAIN-005) does not abort the run.
    ///
    /// NOTE(review): the `_step`, `_lr` and `_batch_tokens` arguments are
    /// not used here — in particular the per-step `_lr` is not forwarded
    /// to the trainer. Confirm the trainer's own configured learning rate
    /// is the intended one.
    fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
        match self.batches.next() {
            None => (1.0, 1.0),
            Some(batch) => {
                let mut gpu = self.trainer.borrow_mut();
                let loss = gpu.train_batch(&batch);
                // Read after `train_batch` so the norm belongs to this step.
                // Per the module notes this is the real LM-head L2 norm, more
                // informative than the CPU path's `1.0` placeholder for
                // GATE-TRAIN-008 monitoring.
                let grad_norm = gpu.last_grad_norm();
                (loss, grad_norm)
            }
        }
    }

    // `optimizer_state_sha256` is deliberately not overridden: INV-TRAIN-003
    // is deferred for the GPU path (see module docs), so the trait default
    // (`None`) applies and the CPU gate (`--device cpu`) is the one that
    // exercises AdamW-state parity.
}

/// CUDA `ValFn` — forward-only eval across pre-loaded held-out
/// batches. Uses `eval_batch` (fused GPU cross-entropy, no logits
/// D2H) and averages across batches.
pub struct CudaRealValFn {
    /// Shared trainer handle; mutably borrowed while a validation pass
    /// is running.
    trainer: SharedCudaTrainer,
    /// Held-out batches evaluated on every `validate` call. Batches whose
    /// `batch_size` is 0 are skipped and do not count towards the mean.
    held_out: Vec<LMBatch>,
}

impl CudaRealValFn {
    /// Create a validation hook that scores `trainer` on `held_out`.
    ///
    /// * `trainer` — shared trainer handle; pass a clone of the same `Rc`
    ///   given to the step hook so validation sees the weights being
    ///   trained.
    /// * `held_out` — batches evaluated, in order, on every
    ///   `ValFn::validate` call. An empty vector makes `validate` return
    ///   NaN.
    ///
    /// Construction performs no GPU work. The returned hook does nothing
    /// until it is driven, hence `#[must_use]`.
    #[must_use]
    pub fn new(trainer: SharedCudaTrainer, held_out: Vec<LMBatch>) -> Self {
        Self { trainer, held_out }
    }
}

impl ValFn for CudaRealValFn {
    /// Mean `eval_batch` loss over the held-out set.
    ///
    /// Batches with `batch_size == 0` are ignored. Returns `f32::NAN`
    /// when there is nothing to score — either the held-out set is empty
    /// (checked before the trainer is borrowed) or every batch was
    /// skipped. `_epoch` is unused.
    fn validate(&mut self, _epoch: usize) -> f32 {
        if self.held_out.is_empty() {
            return f32::NAN;
        }

        let mut gpu = self.trainer.borrow_mut();
        // Sequential left-to-right accumulation starting from 0.0, so the
        // floating-point summation order is fixed by the batch order.
        let (loss_sum, scored) = self
            .held_out
            .iter()
            .filter(|batch| batch.batch_size != 0)
            .fold((0.0_f32, 0_usize), |(sum, n), batch| {
                (sum + gpu.eval_batch(batch), n + 1)
            });

        match scored {
            0 => f32::NAN,
            n => loss_sum / n as f32,
        }
    }
}

/// CUDA `CheckpointFn` — writes the trainer's weights to
/// `artifact.checkpoint_path` in APR format via
/// `save_apr_with_tokenizer`. The save call is made through
/// `borrow_mut()` (per the original design note, the CUDA save path
/// takes `&mut self` because it syncs GPU→CPU before writing), which is
/// why this holds the `SharedCudaTrainer` instead of cloning the
/// trainer out.
pub struct CudaAprCheckpointFn {
    /// Shared trainer handle; mutably borrowed for the duration of each save.
    trainer: SharedCudaTrainer,
    /// Model name passed through to `save_apr_with_tokenizer` on every save.
    model_name: String,
    /// Architecture label passed through to `save_apr_with_tokenizer`
    /// on every save.
    architecture: String,
    /// SPEC-SHIP-TWO-001 §81 P0-D: optional tokenizer directory whose
    /// tokenizer.json is embedded into every checkpoint via
    /// `tokenizer.vocabulary` + `tokenizer.merges` metadata keys.
    /// When None, checkpoints are written without an embedded tokenizer
    /// (legacy behavior; `apr qa` will fail with C-03/embedded-tokenizer
    /// gate per §81 — left as caller's responsibility).
    tokenizer_dir: Option<std::path::PathBuf>,
}

impl CudaAprCheckpointFn {
    /// Create a checkpoint hook for `trainer`.
    ///
    /// * `trainer` — shared trainer handle whose weights are written out.
    /// * `model_name` — name recorded with every checkpoint.
    /// * `architecture` — architecture label recorded with every
    ///   checkpoint.
    ///
    /// The hook starts without a tokenizer directory; chain
    /// [`with_tokenizer_dir`](Self::with_tokenizer_dir) to embed one.
    #[must_use]
    pub fn new(
        trainer: SharedCudaTrainer,
        model_name: impl Into<String>,
        architecture: impl Into<String>,
    ) -> Self {
        Self {
            trainer,
            model_name: model_name.into(),
            architecture: architecture.into(),
            tokenizer_dir: None,
        }
    }

    /// SPEC-SHIP-TWO-001 §81 P0-D: builder for embedding the tokenizer
    /// in every checkpoint write. Pass `--tokenizer <DIR>` through here
    /// so `apr qa <epoch-N.apr>` can run inference without an external
    /// tokenizer file.
    ///
    /// This consumes `self` and returns the updated hook. Discarding the
    /// return value silently loses the setting, hence `#[must_use]`.
    #[must_use]
    pub fn with_tokenizer_dir(mut self, dir: impl Into<std::path::PathBuf>) -> Self {
        self.tokenizer_dir = Some(dir.into());
        self
    }
}

impl CheckpointFn for CudaAprCheckpointFn {
    /// Write the trainer's current weights to `artifact.checkpoint_path`.
    ///
    /// The configured model name, architecture label and optional
    /// tokenizer directory are forwarded to `save_apr_with_tokenizer`.
    /// `_epoch` is unused; the destination comes solely from `artifact`.
    ///
    /// # Errors
    ///
    /// Any failure from the underlying save is rendered into a `String`
    /// prefixed with `save_apr (cuda) failed: `.
    fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
        let mut gpu = self.trainer.borrow_mut();
        let tokenizer_dir = self.tokenizer_dir.as_deref();
        let outcome = gpu.save_apr_with_tokenizer(
            &artifact.checkpoint_path,
            &self.model_name,
            &self.architecture,
            tokenizer_dir,
        );
        outcome.map_err(|e| format!("save_apr (cuda) failed: {e}"))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-002 (paired-args invariant):
    /// `build_shared_cuda_trainer_with_init` MUST reject the
    /// (Some, None) and (None, Some) caller-bug states identically
    /// to the CPU `build_shared_trainer_with_init`. The two fields
    /// are paired by construction — separately optional fields are
    /// a defect class because they let a caller pass an arch
    /// without weights (silent random-init at the GPU boundary) or
    /// weights without an arch (silently fall back to Llama370M).
    ///
    /// This test fires WITHOUT a CUDA device — the args check
    /// happens before any GPU allocation — so it runs on any host
    /// even when CUDA runtime is unavailable.
    #[test]
    fn build_shared_cuda_trainer_with_init_rejects_unpaired_args() {
        /// Pull the message out of an `Err(ConfigError(_))`; anything else
        /// fails the test. Matching by hand (instead of `unwrap_err()`)
        /// is needed because the `Ok` payload is not `Debug`.
        fn config_error_message(outcome: crate::Result<SharedCudaTrainer>) -> String {
            match outcome {
                Err(crate::error::Error::ConfigError(s)) => s,
                other => panic!("expected ConfigError, got: {:?}", other.is_ok()),
            }
        }

        // Arch without path. Qwen 0.5B is used as a concrete
        // non-Llama370M decoder config to show the pairing gate fires
        // before any architectural inspection.
        let arch = TransformerConfig::qwen2_0_5b();
        let arch_only = build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&arch), None);
        assert!(arch_only.is_err(), "(Some(arch), None) MUST Err — caller-bug guard");

        // Path without arch. The file is never opened: the pairing gate
        // rejects the call first.
        let missing_apr = std::path::Path::new("/tmp/does-not-exist.apr");
        let path_only =
            build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, None, Some(missing_apr));
        assert!(path_only.is_err(), "(None, Some(path)) MUST Err — caller-bug guard");

        // Both messages must name the function so callers can grep back
        // to the offending invocation.
        let err_arch = config_error_message(arch_only);
        let err_path = config_error_message(path_only);
        assert!(
            err_arch.contains("build_shared_cuda_trainer_with_init"),
            "Err MUST name the function for grep-ability: {err_arch}"
        );
        assert!(
            err_path.contains("build_shared_cuda_trainer_with_init"),
            "Err MUST name the function for grep-ability: {err_path}"
        );
    }

    /// FALSIFY-APR-PRETRAIN-INIT-CUDA-003 (encoder family rejection):
    /// passing an Encoder-architecture init config to
    /// `build_shared_cuda_trainer_with_init` MUST Err — same semantic
    /// as the CPU path's `validate_pretrain_init_arch_compatible`.
    /// This proves the symmetric builder threads the §50.4 step 5f.1
    /// encoder rejection through the CUDA backend.
    ///
    /// Fires WITHOUT a CUDA device — the encoder check happens
    /// before any GPU allocation.
    #[test]
    fn build_shared_cuda_trainer_with_init_rejects_encoder_family() {
        use crate::transformer::ModelArchitecture;

        // Take a decoder preset and flip only the family tag.
        let encoder_cfg = {
            let mut cfg = TransformerConfig::qwen2_0_5b();
            cfg.architecture = ModelArchitecture::Encoder;
            cfg
        };
        let missing_apr = std::path::Path::new("/tmp/does-not-exist.apr");

        let outcome = build_shared_cuda_trainer_with_init(
            1.0e-4,
            128,
            42,
            Some(&encoder_cfg),
            Some(missing_apr),
        );

        // NOTE(review): this only checks that *some* error came back. The
        // init path does not exist, so the call would presumably also fail
        // at `load_init_tensors_from_apr` if the encoder check were removed
        // — consider asserting on the encoder-specific diagnostic once its
        // wording is pinned down.
        assert!(outcome.is_err(), "Encoder-family init MUST Err under §50.4 step 5f.1");
    }

    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 sanity bound):
    /// `CudaTransformerTrainer::eval_batch` on a fresh-init trainer
    /// (random weights) over a synthetic batch with random uniform
    /// tokens MUST return a loss in a sensible range.
    ///
    /// Theoretical bound: random-init Llama-style 2-layer transformer
    /// over uniformly-distributed targets in vocab=1000 produces
    /// average cross-entropy near `ln(1000) = 6.91`. The test accepts
    /// any finite loss in `[0.5, 1.5 × ln(vocab)]`: the absolute floor
    /// of 0.5 catches a cross-entropy that has collapsed towards 0,
    /// and the ceiling catches numerical blow-up.
    ///
    /// LIVE EVIDENCE motivating this test (this branch's parent):
    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
    /// recorded a 1500× train/eval discrepancy at the same model
    /// state (epoch 0: train_loss=1.20 vs val_loss=0.00081). The
    /// gap survived PR #1579's H2 (populate-coverage) fix, confirming
    /// H1 (eval_batch degenerate) is independent of H2.
    ///
    /// This test reproduces the bug at unit-test level: if H1 is
    /// real, eval_batch on a tiny random-init model returns ~0
    /// instead of ~ln(vocab_size). The test is gated on
    /// `--features cuda` so CI without that flag does not see it;
    /// `cargo test -p aprender-train --features cuda --lib
    /// falsify_eval_batch_h1_sanity_bound` reproduces.
    ///
    /// Spec: SPEC-SHIP-TWO-001 §60 (forthcoming) H1 root-cause cascade.
    #[test]
    fn falsify_eval_batch_h1_sanity_bound() {
        use crate::train::transformer_trainer::{
            CudaTransformerTrainer, LMBatch, TransformerTrainConfig,
        };

        // Tiny model so the test runs in a few seconds on RTX 4090.
        let model_cfg = TransformerConfig::tiny();
        let train_cfg = TransformerTrainConfig::new(model_cfg.clone());

        // Build trainer with random init. Skip the test (rather than
        // panic) if CUDA is unavailable on the host — the falsifier is
        // host-dependent.
        let mut trainer = match CudaTransformerTrainer::new(train_cfg) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify_eval_batch_h1_sanity_bound] skipping: \
                     CudaTransformerTrainer::new failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // Build a synthetic batch: 4 sequences × 16 tokens each (plus one
        // extra token per sequence for the target shift), drawn from a
        // deterministic LCG so the test is reproducible.
        let vocab_size = model_cfg.vocab_size as u32;
        let seq_len = 16;
        let batch_size = 4;
        let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
        let lcg = |s: &mut u64| -> u32 {
            *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
            // Use the high 32 bits (better mixed than the low bits of an
            // LCG) and fold them into the vocabulary range.
            ((*s >> 32) as u32) % vocab_size
        };
        let mut sequences = Vec::with_capacity(batch_size);
        for _ in 0..batch_size {
            let mut seq = Vec::with_capacity(seq_len + 1);
            for _ in 0..(seq_len + 1) {
                seq.push(lcg(&mut state));
            }
            sequences.push(seq);
        }
        let batch = LMBatch::from_sequences(&sequences, 0, 0);

        // Sanity bound: random-init eval loss should be ≈ ln(vocab).
        // We accept anything in [0.5, 1.5 × ln(vocab)].
        // If H1 is real, eval_batch returns ~0 (degenerate).
        let loss = trainer.eval_batch(&batch);
        let ln_vocab = (vocab_size as f32).ln();
        let lower_bound = 0.5_f32;
        let upper_bound = 1.5_f32 * ln_vocab;

        // Finiteness is checked FIRST. Every comparison with NaN is false,
        // so a NaN loss would otherwise trip the lower-bound assert and be
        // misreported as "cross-entropy collapsing to 0"; likewise +Inf
        // would be reported through the upper-bound message. Checked last,
        // this assert could never fire.
        assert!(loss.is_finite(), "eval_batch returned non-finite loss = {loss}");
        assert!(
            loss >= lower_bound,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 lower bound): \
             eval_batch on random-init {vocab_size}-vocab tiny model returned \
             loss = {loss}, expected ≥ {lower_bound} (random-init theoretical \
             ≈ ln({vocab_size}) = {ln_vocab:.3}). Loss < 0.5 indicates \
             eval pipeline is degenerate (cross-entropy collapsing to 0); \
             see evidence/section-60-5g-2-redispatch-2026-05-09/ for the \
             1500× train/eval discrepancy that motivated this falsifier.",
        );
        assert!(
            loss <= upper_bound,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 upper bound): \
             eval_batch returned loss = {loss}, expected ≤ {upper_bound:.3} \
             (1.5 × ln(vocab)). Loss > upper_bound suggests numerical \
             explosion (NaN coercion or gradient overflow), a separate \
             defect class from the lower-bound H1.",
        );
    }

    /// FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1 hypothesis A —
    /// train→eval state pollution): the val_loss anomaly observed in
    /// `evidence/section-60-5g-2-redispatch-2026-05-09/README.md`
    /// fired at EPOCH 0 — i.e., AFTER 100 train_batch calls, not on
    /// a fresh trainer. This test exercises that ordering directly:
    /// eval_batch BEFORE training (loss_a, sanity), then train_batch,
    /// then eval_batch on the same evaluation batch (loss_b). The
    /// two losses should differ by AT MOST the optimizer-step effect
    /// (a few percent at lr=5e-5 on one mini-batch).
    ///
    /// If H1 hypothesis A (logits_buf state contamination) is real,
    /// loss_b will be much smaller than loss_a even though the model
    /// only changed by one optimizer step. The 1500× train/val
    /// discrepancy in §59/§60 evidence implies loss_b/loss_a ~ 1/1500.
    #[test]
    fn falsify_eval_batch_h1_train_pollution() {
        use crate::train::transformer_trainer::{
            CudaTransformerTrainer, LMBatch, TransformerTrainConfig,
        };

        let model_cfg = TransformerConfig::tiny();
        let vocab_size = model_cfg.vocab_size as u32;

        // Host-dependent falsifier: bail out quietly when no CUDA trainer
        // can be constructed.
        let mut trainer = match CudaTransformerTrainer::new(TransformerTrainConfig::new(
            model_cfg.clone(),
        )) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify_eval_batch_h1_train_pollution] skipping: \
                     CudaTransformerTrainer::new failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // Deterministic token source (64-bit LCG, high 32 bits folded into
        // the vocabulary range).
        let seq_len: usize = 16;
        let batch_size: usize = 4;
        let mut state: u64 = 0xCAFE_BABE_DEAD_BEEF;
        let lcg = |s: &mut u64| -> u32 {
            *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
            ((*s >> 32) as u32) % vocab_size
        };
        // `batch_size` sequences of `seq_len + 1` tokens each, generated
        // row by row so the draw order from the LCG is fixed.
        let make_batch = |state: &mut u64, lcg: &dyn Fn(&mut u64) -> u32| -> LMBatch {
            let sequences: Vec<Vec<u32>> = (0..batch_size)
                .map(|_| (0..=seq_len).map(|_| lcg(state)).collect())
                .collect();
            LMBatch::from_sequences(&sequences, 0, 0)
        };

        // The train batch is drawn first, then the eval batch, so the two
        // contain different tokens.
        let train_batch_data = make_batch(&mut state, &lcg);
        let eval_batch_data = make_batch(&mut state, &lcg);

        // Phase 1: baseline eval on the untouched trainer.
        let loss_a = trainer.eval_batch(&eval_batch_data);
        assert!(
            loss_a.is_finite() && loss_a >= 0.5,
            "Phase 1 baseline: eval before any train must be sensible \
             (got {loss_a}); test setup precondition failed before \
             we can probe H1A. See test 001 for the same lower bound."
        );

        // Phase 2: one training step on a DIFFERENT batch. Per the H1A
        // hypothesis this is what would dirty `logits_buf` (KAIZEN-052
        // in-place gradient writeback) in addition to running the
        // optimizer step. The returned loss itself is not inspected.
        let _train_loss = trainer.train_batch(&train_batch_data);

        // Phase 3: re-evaluate the SAME eval batch. The model differs from
        // phase 1 by a single optimizer step, so the loss should be close.
        let loss_b = trainer.eval_batch(&eval_batch_data);

        // A single optimizer step on one mini-batch is expected to move
        // the loss by a modest fraction (the learning rate is whatever
        // `TransformerTrainConfig::new` defaults to — NOTE(review): the
        // original comment cites 0.001; confirm). The bound is generous:
        // only a relative DROP of 95% or more fails, and increases are
        // clamped to zero. A collapse to ~0 (the 1500× factor seen in §60)
        // breaks it by orders of magnitude.
        let rel_drop = (loss_a - loss_b).max(0.0) / loss_a;
        assert!(
            loss_b.is_finite(),
            "eval_batch after train returned non-finite loss = {loss_b}; \
             possible NaN propagation from train_batch's in-place gradient \
             writeback contaminating subsequent eval forward."
        );
        assert!(
            rel_drop < 0.95,
            "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1A train→eval \
             state pollution): eval_batch loss dropped from {loss_a} to \
             {loss_b} ({:.4}× relative drop) after a single train_batch \
             on a DIFFERENT batch. A single optimizer step at typical \
             learning rates cannot legitimately move loss by ≥95%. \
             This indicates train_batch contaminates state that eval_batch \
             reads (most likely the gpu_training.logits_buf via KAIZEN-052 \
             in-place gradient writeback overlapping with the next \
             gpu_forward GEMM). See \
             evidence/section-60-5g-2-redispatch-2026-05-09/README.md \
             for the 1500× train/val discrepancy this falsifier reproduces.",
            rel_drop
        );
    }

    /// FALSIFY-CUDA-FORWARD-PARITY-001 (the load-bearing H4D bisect):
    /// On a populated Qwen 0.5B, `CudaTransformerTrainer::eval_batch`
    /// MUST produce a finite, non-degenerate val_loss in the same
    /// regime as the CPU `Transformer::forward` — i.e., in the
    /// industry-baseline range for Qwen 0.5B on Python (~1.5–3.0).
    ///
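    /// Concrete bound: when CPU forward produces logits with
    /// peak-to-mean > 5 (PR #1602 evidence on populated Qwen,
    /// argmax=9370), the corresponding CUDA path MUST produce
    /// val_loss < 0.7 × `ln(vocab_size)`, which is the bound the test
    /// body computes from `cfg.vocab_size`. Assuming Qwen2's published
    /// 151,936-token vocabulary, `ln(vocab)` ≈ 11.93 nats, so the
    /// bound is ≈ 8.35 (17.21 is `log2(vocab)`, i.e. bits, not nats).
    /// A val_loss approaching or exceeding `ln(vocab)` indicates
    /// the CUDA path is anti-aligned (sub-random predictions).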
    ///
    /// CONTEXT: SHIP-TWO §61 evidence (PR #1600) recorded
    /// val_loss=18.55 at step 1 — *above* `ln(vocab)`. The bug
    /// is in the CUDA forward path's missing bias-add operation:
    /// `cuda_block.rs::CudaTransformerBlock` has no `b_q`/`b_k`/
    /// `b_v` fields and `forward()` does pure gemms (lines 719-747)
    /// without adding the trained Qwen Q/K/V biases.
    ///
    /// Pre-fix: this test fails with val_loss > 12 (CUDA path
    /// drops biases → sub-random predictions).
    /// Post-fix: passes with val_loss in the expected range.
    ///
    /// Host-gated: requires the canonical Qwen 0.5B init APR + the
    /// 5g.1-v2 corpus on the lambda-vector RTX 4090 host.
    #[test]
    fn falsify_cuda_forward_parity_qwen_val_loss_below_ln_vocab() {
        // Host gate 1: the canonical Qwen 0.5B init APR must be present.
        let init_path = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
        if !init_path.exists() {
            eprintln!(
                "[falsify-cuda-forward-parity-001] skipping: host lacks {}",
                init_path.display()
            );
            return;
        }

        // Host gate 2: the CUDA trainer must actually come up.
        let cfg = TransformerConfig::qwen2_0_5b();
        let built =
            build_shared_cuda_trainer_with_init(5.0e-5, 32, 42, Some(&cfg), Some(init_path));
        let trainer_rc = match built {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "[falsify-cuda-forward-parity-001] skipping: \
                     build_shared_cuda_trainer_with_init failed: {e:?} \
                     (test requires --features cuda + a CUDA host)"
                );
                return;
            }
        };

        // One synthetic sequence: 16 inputs + 1 extra token for the target
        // shift, all the same token id. Which tokens are used does not
        // matter here — the batch only has to be valid and drive the
        // forward path end-to-end.
        let seq: Vec<u32> = std::iter::repeat(100_u32).take(17).collect();
        let batch = LMBatch::from_sequences(&[seq], 0, 0);

        let val_loss = {
            let mut gpu = trainer_rc.borrow_mut();
            gpu.eval_batch(&batch)
        };
        let ln_vocab = (cfg.vocab_size as f32).ln();
        let upper_bound = ln_vocab * 0.7;
        eprintln!(
            "[falsify-cuda-forward-parity-001] val_loss={val_loss} ln(vocab)={ln_vocab} \
             upper_bound (0.7×ln_vocab)={upper_bound}"
        );

        assert!(val_loss.is_finite(), "val_loss must be finite, got {val_loss}");
        // The load-bearing assertion: val_loss MUST stay below
        // 0.7×ln(vocab). The CPU forward on the same weights is clearly
        // confident (peak-to-mean=5.68, PR #1602), so a loss at or above
        // the uniform-guess level on the CUDA side means the two forwards
        // disagree. The recorded root cause is the CUDA block dropping
        // Qwen's Q/K/V biases (cuda_block.rs lines 103-135 has no bias
        // fields; lines 719-747 do bare gemms).
        assert!(
            val_loss < upper_bound,
            "FALSIFY-CUDA-FORWARD-PARITY-001 (H4D): CUDA val_loss={val_loss} >= \
             0.7×ln(vocab)={upper_bound}. Same Qwen weights produce \
             peak-to-mean=5.68 on CPU forward (PR #1602 falsify_h4_cpu_forward_*) \
             but CUDA produces sub-random predictions. Root cause: \
             CudaTransformerBlock drops Qwen Q/K/V biases — struct has no bias \
             fields (cuda_block.rs lines 103-135), forward does bare gemms \
             (lines 719-747) without `cuda_add(q, b_q)` after each projection. \
             See evidence/section-60-5g-2-redispatch-2026-05-09/ + this contract \
             apr-pretrain-cuda-forward-parity-v1.yaml. Fix scope: add b_q/b_k/b_v \
             fields, thread through with_model upload, apply bias-add after each \
             Q/K/V gemm in forward."
        );
    }
}