entrenar/finetune/
gpu_backward_fallback.rs1#[cfg(feature = "cuda")]
11use crate::autograd::cuda_training::CudaTrainer;
12#[cfg(feature = "cuda")]
13use trueno_gpu::driver::{CudaStream, GpuBuffer};
14
15#[cfg(feature = "cuda")]
25pub fn cpu_lmhead_backward(
26 trainer: &CudaTrainer,
27 logits_buf: &GpuBuffer<f32>,
28 grad_hidden_buf: &mut GpuBuffer<f32>,
29 embed_weights: &[f32],
30 seq_len: usize,
31 vocab_size: usize,
32 hidden_size: usize,
33 stream: &CudaStream,
34) -> Option<()> {
35 stream.synchronize().ok()?;
36
37 let grad_logits = trainer.download(logits_buf).ok()?;
38
39 let mut grad_hidden = vec![0.0f32; seq_len * hidden_size];
41 for s in 0..seq_len {
42 for v in 0..vocab_size {
43 let g = grad_logits[s * vocab_size + v];
44 if g == 0.0 {
45 continue;
46 }
47 let embed_row = &embed_weights[v * hidden_size..(v + 1) * hidden_size];
48 let out_row = &mut grad_hidden[s * hidden_size..(s + 1) * hidden_size];
49 for h in 0..hidden_size {
50 out_row[h] += g * embed_row[h];
51 }
52 }
53 }
54
55 let gpu_grad = trainer.upload(&grad_hidden).ok()?;
56 grad_hidden_buf.copy_from_buffer(&gpu_grad).ok()?;
57 stream.synchronize().ok()?;
58
59 eprintln!(
60 "[CUDA] lm_head backward via CPU fallback (PMAT-471): \
61 {:.1}MB D2H + CPU matmul + {:.1}MB H2D",
62 (seq_len * vocab_size * 4) as f64 / 1e6,
63 (seq_len * hidden_size * 4) as f64 / 1e6,
64 );
65 Some(())
66}
67
68#[cfg(feature = "cuda")]
73pub fn init_fp16_weights(
74 blocks: &mut [crate::transformer::CudaBlock],
75 stream: &CudaStream,
76) -> usize {
77 use crate::transformer::CudaBlock;
78
79 let mut ok = 0usize;
80 for (i, block) in blocks.iter_mut().enumerate() {
81 if let CudaBlock::Nf4(ref mut nf4) = block {
82 match nf4.set_fp16_weights(stream) {
83 Ok(()) => ok += 1,
84 Err(e) => {
85 eprintln!("[FP16] Layer {i} cast failed: {e} — fp32 fallback");
86 break;
87 }
88 }
89 }
90 }
91 if ok == blocks.len() {
92 eprintln!("[FP16] All {ok} layers cast to fp16 — tensor core GEMM enabled");
93 }
94 ok
95}
96
97#[cfg(feature = "cuda")]
103pub fn preallocate_cublas_workspace(trainer: &CudaTrainer) -> Option<GpuBuffer<f32>> {
104 const WORKSPACE_BYTES: usize = 32 * 1024 * 1024; const WORKSPACE_ELEMS: usize = WORKSPACE_BYTES / 4;
106
107 let ws = trainer.zeros(WORKSPACE_ELEMS).ok()?;
108 let ws_ptr = ws.as_ptr();
109
110 if let Err(e) = crate::autograd::cuda_forward::set_cublas_workspace(ws_ptr, WORKSPACE_BYTES) {
111 eprintln!("[CUDA] cuBLAS workspace set failed: {e}");
112 return None;
113 }
114 eprintln!("[CUDA] cuBLAS workspace pre-allocated: 32 MB");
115 Some(ws)
116}