#![cfg(feature = "cuda")]
use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
use crate::train::pretrain_real::{
build_transformer_config, llama_370m_train_config, load_init_tensors_from_apr,
populate_trainer_from_init_tensors, validate_pretrain_init_arch_compatible,
};
use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch, TransformerTrainConfig};
use crate::transformer::{Transformer, TransformerConfig};
use std::cell::RefCell;
use std::path::Path;
use std::rc::Rc;
/// Handle to a single [`CudaTransformerTrainer`] that the step, validation and
/// checkpoint callbacks in this file hold in common.
///
/// `Rc<RefCell<..>>` provides single-threaded shared ownership with
/// runtime-checked mutable borrows; each callback calls `borrow_mut()` inside
/// its own method body, so the borrow lasts for one callback invocation.
pub type SharedCudaTrainer = Rc<RefCell<CudaTransformerTrainer>>;
/// Builds a CUDA trainer from `llama_370m_train_config(lr, seq_length, seed)` and
/// wraps it in a [`SharedCudaTrainer`] handle.
///
/// # Errors
///
/// Propagates any error returned by `CudaTransformerTrainer::new`.
///
/// # Panics
///
/// In debug builds only, panics when the model's total parameter count falls
/// outside the `[366M, 374M]` band (invariant `INV-ARCH-370M-001`).
pub fn build_shared_cuda_trainer(
    lr: f32,
    seq_length: usize,
    seed: u64,
) -> crate::Result<SharedCudaTrainer> {
    let cfg = llama_370m_train_config(lr, seq_length, seed);
    let trainer = CudaTransformerTrainer::new(cfg)?;
    // The parameter walk is only needed by the debug assertion, so compile the
    // whole block out of release builds.
    #[cfg(debug_assertions)]
    {
        let param_count: usize = trainer.model().parameters().iter().map(|t| t.len()).sum();
        debug_assert!(
            (366_000_000..=374_000_000).contains(&param_count),
            "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
        );
    }
    Ok(Rc::new(RefCell::new(trainer)))
}
/// Builds a shared CUDA trainer, optionally initialised from an `.apr` checkpoint.
///
/// `init_arch` and `init_path` must be supplied together or not at all:
///
/// * both `Some`: the architecture is checked with
///   `validate_pretrain_init_arch_compatible`, tensors are loaded from `init_path`
///   and copied into a freshly constructed [`Transformer`];
/// * both `None`: the freshly constructed [`Transformer`] is used as-is.
///
/// In either case the model config comes from `build_transformer_config(init_arch)`,
/// and `lr`, `seq_length` (stored as `max_seq_len`) and `seed` are written into the
/// training config before the trainer is created.
///
/// # Errors
///
/// Returns `Error::ConfigError` when exactly one of `init_arch` / `init_path` is
/// `Some`, when the architecture check fails, or when loading / populating the
/// init tensors fails. Errors from `CudaTransformerTrainer::with_model` are
/// propagated unchanged.
///
/// # Panics
///
/// In debug builds only, and only on the `init == None` path, panics when the
/// model's total parameter count falls outside the `[366M, 374M]` band
/// (invariant `INV-ARCH-370M-001`).
pub fn build_shared_cuda_trainer_with_init(
    lr: f32,
    seq_length: usize,
    seed: u64,
    init_arch: Option<&TransformerConfig>,
    init_path: Option<&Path>,
) -> crate::Result<SharedCudaTrainer> {
    // The two init arguments only make sense as a pair; a lone one is a caller bug.
    if init_arch.is_some() != init_path.is_some() {
        return Err(crate::error::Error::ConfigError(format!(
            "build_shared_cuda_trainer_with_init: init_arch and init_path must both be Some \
             or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
            init_arch.is_some(),
            init_path.is_some()
        )));
    }
    // Reject incompatible architectures before any model is constructed.
    if let Some(cfg) = init_arch {
        validate_pretrain_init_arch_compatible(cfg).map_err(crate::error::Error::ConfigError)?;
    }

    let model_cfg = build_transformer_config(init_arch);
    let mut train_cfg = TransformerTrainConfig::new(model_cfg);
    train_cfg.lr = lr;
    train_cfg.max_seq_len = seq_length;
    train_cfg.seed = seed;

    let mut transformer = Transformer::new(&train_cfg.model_config);
    if let Some(path) = init_path {
        let tensors = load_init_tensors_from_apr(path).map_err(crate::error::Error::ConfigError)?;
        populate_trainer_from_init_tensors(&mut transformer, &tensors)
            .map_err(crate::error::Error::ConfigError)?;
    } else {
        // From-scratch path: enforce the same parameter-count band as
        // `build_shared_cuda_trainer`, in debug builds only.
        #[cfg(debug_assertions)]
        {
            let param_count: usize = transformer.parameters().iter().map(|t| t.len()).sum();
            debug_assert!(
                (366_000_000..=374_000_000).contains(&param_count),
                "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band \
                 (from-scratch CUDA path with init=None)",
            );
        }
    }

    let trainer = CudaTransformerTrainer::with_model(transformer, train_cfg)?;
    Ok(Rc::new(RefCell::new(trainer)))
}
/// Training-step callback state: a shared CUDA trainer plus the batch stream
/// it consumes.
pub struct CudaRealStepFn {
    /// Trainer handle shared with the other callbacks.
    trainer: SharedCudaTrainer,
    /// Source of training batches; one item is pulled per step.
    batches: Box<dyn Iterator<Item = LMBatch>>,
}

impl CudaRealStepFn {
    /// Pairs `trainer` with the `batches` stream it will be trained on.
    pub fn new(trainer: SharedCudaTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
        CudaRealStepFn { trainer, batches }
    }
}
impl StepFn for CudaRealStepFn {
    /// Pulls the next batch, trains on it, and returns `(loss, grad_norm)`.
    ///
    /// `_step`, `_lr` and `_batch_tokens` are not used by this implementation.
    /// When the batch stream is exhausted the fixed pair `(1.0, 1.0)` is
    /// returned and the trainer is not borrowed.
    fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
        match self.batches.next() {
            None => (1.0, 1.0),
            Some(batch) => {
                let mut guard = self.trainer.borrow_mut();
                let loss = guard.train_batch(&batch);
                // Read the gradient norm only after the training call has run.
                (loss, guard.last_grad_norm())
            }
        }
    }
}
/// Validation callback state: a shared CUDA trainer plus a fixed set of
/// held-out batches.
pub struct CudaRealValFn {
    /// Trainer handle shared with the other callbacks.
    trainer: SharedCudaTrainer,
    /// Batches evaluated on every validation pass.
    held_out: Vec<LMBatch>,
}

impl CudaRealValFn {
    /// Pairs `trainer` with the `held_out` batches used for validation.
    pub fn new(trainer: SharedCudaTrainer, held_out: Vec<LMBatch>) -> Self {
        CudaRealValFn { trainer, held_out }
    }
}
impl ValFn for CudaRealValFn {
    /// Returns the mean `eval_batch` loss over the held-out batches whose
    /// `batch_size` is non-zero.
    ///
    /// `_epoch` is not used. The result is `NaN` when there are no held-out
    /// batches, or when every batch has `batch_size == 0`.
    fn validate(&mut self, _epoch: usize) -> f32 {
        // Nothing to evaluate: return before borrowing the trainer.
        if self.held_out.is_empty() {
            return f32::NAN;
        }

        let mut trainer = self.trainer.borrow_mut();
        // Accumulate in list order so the f32 sum is formed the same way every time.
        let (loss_sum, evaluated) = self
            .held_out
            .iter()
            .filter(|batch| batch.batch_size != 0)
            .fold((0.0_f32, 0_usize), |(sum, seen), batch| {
                (sum + trainer.eval_batch(batch), seen + 1)
            });

        match evaluated {
            0 => f32::NAN,
            n => loss_sum / n as f32,
        }
    }
}
/// Checkpoint callback state: a shared CUDA trainer plus the metadata passed
/// along on every save.
pub struct CudaAprCheckpointFn {
    /// Trainer handle shared with the other callbacks.
    trainer: SharedCudaTrainer,
    /// Model name forwarded to the save call.
    model_name: String,
    /// Architecture label forwarded to the save call.
    architecture: String,
    /// Optional tokenizer directory forwarded to the save call; `None` until
    /// set via [`CudaAprCheckpointFn::with_tokenizer_dir`].
    tokenizer_dir: Option<std::path::PathBuf>,
}

impl CudaAprCheckpointFn {
    /// Creates a checkpoint callback with no tokenizer directory configured.
    pub fn new(
        trainer: SharedCudaTrainer,
        model_name: impl Into<String>,
        architecture: impl Into<String>,
    ) -> Self {
        let model_name: String = model_name.into();
        let architecture: String = architecture.into();
        CudaAprCheckpointFn {
            trainer,
            model_name,
            architecture,
            tokenizer_dir: None,
        }
    }

    /// Builder-style setter: records `dir` as the tokenizer directory and
    /// returns the updated value.
    pub fn with_tokenizer_dir(mut self, dir: impl Into<std::path::PathBuf>) -> Self {
        let tokenizer_dir: std::path::PathBuf = dir.into();
        self.tokenizer_dir = Some(tokenizer_dir);
        self
    }
}
impl CheckpointFn for CudaAprCheckpointFn {
    /// Saves the trainer to `artifact.checkpoint_path` through
    /// `save_apr_with_tokenizer`, passing the stored model name, architecture
    /// and optional tokenizer directory.
    ///
    /// `_epoch` is not used. On failure the underlying error is rendered into
    /// a `String` prefixed with `save_apr (cuda) failed: `.
    fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
        let mut guard = self.trainer.borrow_mut();
        let tokenizer_dir = self.tokenizer_dir.as_deref();
        let outcome = guard.save_apr_with_tokenizer(
            &artifact.checkpoint_path,
            &self.model_name,
            &self.architecture,
            tokenizer_dir,
        );
        outcome.map_err(|e| format!("save_apr (cuda) failed: {e}"))
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn build_shared_cuda_trainer_with_init_rejects_unpaired_args() {
    use std::path::PathBuf;

    // Case 1: architecture supplied without a checkpoint path.
    let arch = TransformerConfig::qwen2_0_5b();
    let result_arch_only =
        build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, Some(&arch), None);
    assert!(
        result_arch_only.is_err(),
        "(Some(arch), None) MUST Err — caller-bug guard"
    );

    // Case 2: checkpoint path supplied without an architecture.
    let missing_file = PathBuf::from("/tmp/does-not-exist.apr");
    let result_path_only =
        build_shared_cuda_trainer_with_init(1.0e-4, 128, 42, None, Some(&missing_file));
    assert!(
        result_path_only.is_err(),
        "(None, Some(path)) MUST Err — caller-bug guard"
    );

    // Both failures must be `ConfigError`; pull out the message text.
    let config_error_text = |outcome: crate::Result<SharedCudaTrainer>| -> String {
        match outcome {
            Err(crate::error::Error::ConfigError(text)) => text,
            other => panic!("expected ConfigError, got: {:?}", other.is_ok()),
        }
    };
    let err_arch = config_error_text(result_arch_only);
    let err_path = config_error_text(result_path_only);

    assert!(
        err_arch.contains("build_shared_cuda_trainer_with_init"),
        "Err MUST name the function for grep-ability: {err_arch}"
    );
    assert!(
        err_path.contains("build_shared_cuda_trainer_with_init"),
        "Err MUST name the function for grep-ability: {err_path}"
    );
}
#[test]
fn build_shared_cuda_trainer_with_init_rejects_encoder_family() {
    use crate::transformer::ModelArchitecture;
    use std::path::PathBuf;

    // Start from the Qwen2 0.5B preset and flip only the architecture family.
    let mut encoder_cfg = TransformerConfig::qwen2_0_5b();
    encoder_cfg.architecture = ModelArchitecture::Encoder;

    // Both init arguments are supplied, so the pairing guard does not fire.
    let missing_file = PathBuf::from("/tmp/does-not-exist.apr");
    let outcome = build_shared_cuda_trainer_with_init(
        1.0e-4,
        128,
        42,
        Some(&encoder_cfg),
        Some(&missing_file),
    );
    assert!(outcome.is_err(), "Encoder-family init MUST Err under §50.4 step 5f.1");
}
/// Falsifier: `eval_batch` on a random-init tiny model must return a finite
/// loss inside `[0.5, 1.5 × ln(vocab)]`.
///
/// Skips (returns early with a note on stderr) when no CUDA trainer can be
/// constructed on this host.
#[test]
fn falsify_eval_batch_h1_sanity_bound() {
    use crate::train::transformer_trainer::TransformerTrainConfig;
    use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch};

    let model_cfg = TransformerConfig::tiny();
    let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
    let mut trainer = match CudaTransformerTrainer::new(train_cfg) {
        Ok(t) => t,
        Err(e) => {
            eprintln!(
                "[falsify_eval_batch_h1_sanity_bound] skipping: \
                 CudaTransformerTrainer::new failed: {e:?} \
                 (test requires --features cuda + a CUDA host)"
            );
            return;
        }
    };

    let vocab_size = model_cfg.vocab_size as u32;
    let seq_len: usize = 16;
    let batch_size: usize = 4;

    // Deterministic 64-bit LCG; the high 32 bits are reduced into the vocab range.
    let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
    let lcg = |s: &mut u64| -> u32 {
        *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
        ((*s >> 32) as u32) % vocab_size
    };

    // `batch_size` sequences of `seq_len + 1` tokens each, drawn in order from the LCG.
    let sequences: Vec<Vec<u32>> = (0..batch_size)
        .map(|_| (0..=seq_len).map(|_| lcg(&mut state)).collect())
        .collect();
    let batch = LMBatch::from_sequences(&sequences, 0, 0);

    let loss = trainer.eval_batch(&batch);
    let ln_vocab = (vocab_size as f32).ln();
    let lower_bound = 0.5_f32;
    let upper_bound = 1.5_f32 * ln_vocab;

    // Check finiteness first: NaN fails every comparison and +inf exceeds any
    // bound, so running this after the bound checks would make it unreachable
    // and report non-finite losses as a bound violation instead.
    assert!(loss.is_finite(), "eval_batch returned non-finite loss = {loss}");
    assert!(
        loss >= lower_bound,
        "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 lower bound): \
         eval_batch on random-init {}-vocab tiny model returned \
         loss = {loss}, expected ≥ {lower_bound} (random-init theoretical \
         ≈ ln({vocab_size}) = {ln_vocab:.3}). Loss < 0.5 indicates \
         eval pipeline is degenerate (cross-entropy collapsing to 0); \
         see evidence/section-60-5g-2-redispatch-2026-05-09/ for the \
         1500× train/eval discrepancy that motivated this falsifier.",
        vocab_size
    );
    assert!(
        loss <= upper_bound,
        "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-001 (H1 upper bound): \
         eval_batch returned loss = {loss}, expected ≤ {upper_bound:.3} \
         (1.5 × ln(vocab)). Loss > upper_bound suggests numerical \
         explosion (NaN coercion or gradient overflow), a separate \
         defect class from the lower-bound H1.",
    );
}
/// Falsifier: one `train_batch` call on a different batch must not collapse
/// the `eval_batch` loss on a fixed evaluation batch.
///
/// Skips (returns early with a note on stderr) when no CUDA trainer can be
/// constructed on this host.
#[test]
fn falsify_eval_batch_h1_train_pollution() {
    use crate::train::transformer_trainer::TransformerTrainConfig;
    use crate::train::transformer_trainer::{CudaTransformerTrainer, LMBatch};

    const SEQ_LEN: usize = 16;
    const BATCH_SIZE: usize = 4;

    let model_cfg = TransformerConfig::tiny();
    let train_cfg = TransformerTrainConfig::new(model_cfg.clone());
    let mut trainer = match CudaTransformerTrainer::new(train_cfg) {
        Err(e) => {
            eprintln!(
                "[falsify_eval_batch_h1_train_pollution] skipping: \
                 CudaTransformerTrainer::new failed: {e:?} \
                 (test requires --features cuda + a CUDA host)"
            );
            return;
        }
        Ok(built) => built,
    };
    let vocab_size = model_cfg.vocab_size as u32;

    // Deterministic 64-bit LCG; the high 32 bits are reduced into the vocab range.
    let mut rng_state: u64 = 0xCAFE_BABE_DEAD_BEEF;
    let mut next_token = || -> u32 {
        rng_state = rng_state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        ((rng_state >> 32) as u32) % vocab_size
    };
    // Each call draws BATCH_SIZE sequences of SEQ_LEN + 1 tokens, in order,
    // from the shared generator.
    let mut make_batch = || -> LMBatch {
        let sequences: Vec<Vec<u32>> = (0..BATCH_SIZE)
            .map(|_| (0..=SEQ_LEN).map(|_| next_token()).collect())
            .collect();
        LMBatch::from_sequences(&sequences, 0, 0)
    };
    // The training batch is drawn first, then the evaluation batch.
    let train_batch_data = make_batch();
    let eval_batch_data = make_batch();

    // Phase 1: baseline eval before any training.
    let loss_a = trainer.eval_batch(&eval_batch_data);
    assert!(
        loss_a.is_finite() && loss_a >= 0.5,
        "Phase 1 baseline: eval before any train must be sensible \
         (got {loss_a}); test setup precondition failed before \
         we can probe H1A. See test 001 for the same lower bound."
    );

    // Phase 2: one training step on the other batch, then re-evaluate.
    let _train_loss = trainer.train_batch(&train_batch_data);
    let loss_b = trainer.eval_batch(&eval_batch_data);
    assert!(
        loss_b.is_finite(),
        "eval_batch after train returned non-finite loss = {loss_b}; \
         possible NaN propagation from train_batch's in-place gradient \
         writeback contaminating subsequent eval forward."
    );

    // Only a decrease counts as a drop; an increase clamps to zero.
    let decrease = (loss_a - loss_b).max(0.0);
    let rel_drop = decrease / loss_a;
    assert!(
        rel_drop < 0.95,
        "FALSIFY-APR-PRETRAIN-EVAL-METHODOLOGY-002 (H1A train→eval \
         state pollution): eval_batch loss dropped from {loss_a} to \
         {loss_b} ({:.4}× relative drop) after a single train_batch \
         on a DIFFERENT batch. A single optimizer step at typical \
         learning rates cannot legitimately move loss by ≥95%. \
         This indicates train_batch contaminates state that eval_batch \
         reads (most likely the gpu_training.logits_buf via KAIZEN-052 \
         in-place gradient writeback overlapping with the next \
         gpu_forward GEMM). See \
         evidence/section-60-5g-2-redispatch-2026-05-09/README.md \
         for the 1500× train/val discrepancy this falsifier reproduces.",
        rel_drop
    );
}
/// Falsifier: with Qwen weights loaded from a host-local `.apr` file, the
/// CUDA `eval_batch` loss on a constant-token sequence must be finite and
/// below `0.7 × ln(vocab)`.
///
/// Skips (returns early with a note on stderr) when the weights file is
/// absent or the trainer cannot be built.
#[test]
fn falsify_cuda_forward_parity_qwen_val_loss_below_ln_vocab() {
    let init_path = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
    if !init_path.exists() {
        eprintln!(
            "[falsify-cuda-forward-parity-001] skipping: host lacks {}",
            init_path.display()
        );
        return;
    }

    let cfg = TransformerConfig::qwen2_0_5b();
    let build_outcome =
        build_shared_cuda_trainer_with_init(5.0e-5, 32, 42, Some(&cfg), Some(init_path));
    let trainer_rc = match build_outcome {
        Err(e) => {
            eprintln!(
                "[falsify-cuda-forward-parity-001] skipping: \
                 build_shared_cuda_trainer_with_init failed: {e:?} \
                 (test requires --features cuda + a CUDA host)"
            );
            return;
        }
        Ok(shared) => shared,
    };

    // A single sequence of 17 copies of token id 100.
    let repeated_token = vec![100_u32; 17];
    let batch = LMBatch::from_sequences(&[repeated_token], 0, 0);
    let val_loss = trainer_rc.borrow_mut().eval_batch(&batch);

    let ln_vocab = (cfg.vocab_size as f32).ln();
    let upper_bound = ln_vocab * 0.7;
    // Log the numbers before asserting so they are visible on failure.
    eprintln!(
        "[falsify-cuda-forward-parity-001] val_loss={val_loss} ln(vocab)={ln_vocab} \
         upper_bound (0.7×ln_vocab)={upper_bound}"
    );

    assert!(val_loss.is_finite(), "val_loss must be finite, got {val_loss}");
    assert!(
        val_loss < upper_bound,
        "FALSIFY-CUDA-FORWARD-PARITY-001 (H4D): CUDA val_loss={val_loss} >= \
         0.7×ln(vocab)={upper_bound}. Same Qwen weights produce \
         peak-to-mean=5.68 on CPU forward (PR #1602 falsify_h4_cpu_forward_*) \
         but CUDA produces sub-random predictions. Root cause: \
         CudaTransformerBlock drops Qwen Q/K/V biases — struct has no bias \
         fields (cuda_block.rs lines 103-135), forward does bare gemms \
         (lines 719-747) without `cuda_add(q, b_q)` after each projection. \
         See evidence/section-60-5g-2-redispatch-2026-05-09/ + this contract \
         apr-pretrain-cuda-forward-parity-v1.yaml. Fix scope: add b_q/b_k/b_v \
         fields, thread through with_model upload, apply bias-add after each \
         Q/K/V gemm in forward."
    );
}
}