Skip to main content

entrenar/finetune/
gpu_backward_fallback.rs

1//! CPU fallback for lm_head backward GEMM (PMAT-471).
2//!
3//! On VRAM-constrained GPUs (yoga 8GB), GPU embeddings don't fit after NF4 blocks.
4//! Without GPU embeddings, lm_head backward GEMM silently fails and backward through
5//! blocks is never called — the model cannot train.
6//!
7//! This module provides a CPU fallback: download grad_logits from GPU, multiply with
8//! CPU embedding weights, upload grad_hidden to GPU. Slower but functional.
9
10#[cfg(feature = "cuda")]
11use crate::autograd::cuda_training::CudaTrainer;
12#[cfg(feature = "cuda")]
13use trueno_gpu::driver::{CudaStream, GpuBuffer};
14
15/// CPU fallback for lm_head backward: grad_hidden = grad_logits @ embed.
16///
17/// Downloads grad_logits[seq_len, vocab_size] from GPU, multiplies with CPU
18/// embedding weights[vocab_size, hidden_size], uploads result to GPU.
19///
20/// # Cost
21/// - D2H: seq_len × vocab_size × 4 bytes (~311 MB for seq=512, vocab=151936)
22/// - CPU matmul: O(seq × vocab × hidden)
23/// - H2D: seq_len × hidden_size × 4 bytes (~3 MB for seq=512, hidden=1536)
24#[cfg(feature = "cuda")]
25pub fn cpu_lmhead_backward(
26    trainer: &CudaTrainer,
27    logits_buf: &GpuBuffer<f32>,
28    grad_hidden_buf: &mut GpuBuffer<f32>,
29    embed_weights: &[f32],
30    seq_len: usize,
31    vocab_size: usize,
32    hidden_size: usize,
33    stream: &CudaStream,
34) -> Option<()> {
35    stream.synchronize().ok()?;
36
37    let grad_logits = trainer.download(logits_buf).ok()?;
38
39    // CPU matmul: grad_hidden[s,h] = Σ_v grad_logits[s,v] × embed[v,h]
40    let mut grad_hidden = vec![0.0f32; seq_len * hidden_size];
41    for s in 0..seq_len {
42        for v in 0..vocab_size {
43            let g = grad_logits[s * vocab_size + v];
44            if g == 0.0 {
45                continue;
46            }
47            let embed_row = &embed_weights[v * hidden_size..(v + 1) * hidden_size];
48            let out_row = &mut grad_hidden[s * hidden_size..(s + 1) * hidden_size];
49            for h in 0..hidden_size {
50                out_row[h] += g * embed_row[h];
51            }
52        }
53    }
54
55    let gpu_grad = trainer.upload(&grad_hidden).ok()?;
56    grad_hidden_buf.copy_from_buffer(&gpu_grad).ok()?;
57    stream.synchronize().ok()?;
58
59    eprintln!(
60        "[CUDA] lm_head backward via CPU fallback (PMAT-471): \
61         {:.1}MB D2H + CPU matmul + {:.1}MB H2D",
62        (seq_len * vocab_size * 4) as f64 / 1e6,
63        (seq_len * hidden_size * 4) as f64 / 1e6,
64    );
65    Some(())
66}
67
68/// Initialize FP16 weights for all NF4 blocks when FP16_GEMM=1 (PMAT-470).
69///
70/// Casts fp32 dequantized weights to fp16 on GPU using CastF32ToF16Kernel.
71/// One-time cost at model initialization, amortized over all training steps.
72#[cfg(feature = "cuda")]
73pub fn init_fp16_weights(
74    blocks: &mut [crate::transformer::CudaBlock],
75    stream: &CudaStream,
76) -> usize {
77    use crate::transformer::CudaBlock;
78
79    let mut ok = 0usize;
80    for (i, block) in blocks.iter_mut().enumerate() {
81        if let CudaBlock::Nf4(ref mut nf4) = block {
82            match nf4.set_fp16_weights(stream) {
83                Ok(()) => ok += 1,
84                Err(e) => {
85                    eprintln!("[FP16] Layer {i} cast failed: {e} — fp32 fallback");
86                    break;
87                }
88            }
89        }
90    }
91    if ok == blocks.len() {
92        eprintln!("[FP16] All {ok} layers cast to fp16 — tensor core GEMM enabled");
93    }
94    ok
95}
96
97/// Pre-allocate cuBLAS workspace for CUDA graph capture (PMAT-063).
98///
99/// During graph capture, cuBLAS cannot allocate workspace dynamically.
100/// Must be called BEFORE `stream.begin_capture()`. Returns the workspace
101/// buffer that must be kept alive for the duration of graph use.
102#[cfg(feature = "cuda")]
103pub fn preallocate_cublas_workspace(trainer: &CudaTrainer) -> Option<GpuBuffer<f32>> {
104    const WORKSPACE_BYTES: usize = 32 * 1024 * 1024; // 32 MB
105    const WORKSPACE_ELEMS: usize = WORKSPACE_BYTES / 4;
106
107    let ws = trainer.zeros(WORKSPACE_ELEMS).ok()?;
108    let ws_ptr = ws.as_ptr();
109
110    if let Err(e) = crate::autograd::cuda_forward::set_cublas_workspace(ws_ptr, WORKSPACE_BYTES) {
111        eprintln!("[CUDA] cuBLAS workspace set failed: {e}");
112        return None;
113    }
114    eprintln!("[CUDA] cuBLAS workspace pre-allocated: 32 MB");
115    Some(ws)
116}