entrenar/finetune/instruct_pipeline/
cuda_init.rs1#[cfg(feature = "cuda")]
2use super::{
3 CudaTrainer, InstructConfig, InstructGpuTrainingState, InstructPipeline, LoRALayer,
4 Transformer, TransformerConfig, VramGuard,
5};
6
7#[cfg(feature = "cuda")]
8use crate::autograd::cuda_backward::pre_warm_lora_backward_kernels as pre_warm_backward_cache_kernels;
9#[cfg(feature = "cuda")]
10use crate::autograd::cuda_forward::{pre_warm_forward_kernels, pre_warm_lora_backward_kernels};
11#[cfg(feature = "cuda")]
12use crate::autograd::cuda_optim::pre_warm_lora_adamw_kernels;
13#[cfg(feature = "cuda")]
14use crate::autograd::cuda_training::cuda_training_available;
15#[cfg(feature = "cuda")]
16use crate::transformer::{
17 CudaBlock, CudaBlockScratch, CudaLoraGradWorkspace, CudaTransformerBlock, GpuLoraOptimizerState,
18};
19#[cfg(feature = "cuda")]
20use std::sync::Arc;
21
22#[cfg(feature = "cuda")]
23impl InstructPipeline {
24 pub(super) fn init_cuda(&mut self, model_config: &TransformerConfig) {
27 let budget_mb = Self::estimate_vram_mb(model_config, &self.config);
29 let task_label = if self.config.quantize_nf4 { "instruct-qlora" } else { "instruct-lora" };
30 match VramGuard::acquire(budget_mb, task_label) {
31 Ok(guard) => {
32 eprintln!(
33 "[GPU-SHARE] VRAM reserved: {budget_mb} MB for {task_label} (gpu: {})",
34 guard.gpu_uuid()
35 );
36 self.vram_guard = Some(guard);
37 }
38 Err(e) => {
39 eprintln!("[GPU-SHARE] VRAM guard denied: {e} — falling back to CPU");
40 return;
41 }
42 }
43
44 let (trainer, blocks, scratch) =
45 Self::try_init_cuda(&self.model, model_config, &self.config, &self.lora_layers);
46
47 if trainer.is_none() {
48 self.vram_guard = None;
50 return;
51 }
52
53 self.cuda_trainer = trainer;
54 self.cuda_blocks = blocks;
55 self.shared_scratch = scratch;
56
57 self.gpu_training = Self::try_init_gpu_training(
59 &self.model,
60 model_config,
61 self.config.max_seq_len,
62 self.cuda_trainer.as_ref(),
63 self.cuda_blocks.as_ref(),
64 );
65
66 if self.config.quantize_nf4 {
67 let (grad_ws, opt_states) = Self::try_init_nf4_lora_training(
68 self.cuda_trainer.as_ref(),
69 self.cuda_blocks.as_ref(),
70 model_config,
71 &self.config,
72 );
73 if let (Some(ws), Some(t)) = (&grad_ws, &self.cuda_trainer) {
74 self.lora_fused_clip =
75 super::super::fused_lora_clip::init_lora_fused_clip(ws, t.context());
76 }
77 self.cuda_lora_grad_workspace = grad_ws;
78 self.cuda_lora_optimizer_states = opt_states;
79 }
80
81 if let Some(ref mut guard) = self.vram_guard {
83 let _ = guard.update_actual(budget_mb);
84 }
85 }
86
87 fn estimate_vram_mb(model_config: &TransformerConfig, config: &InstructConfig) -> usize {
89 if config.quantize_nf4 {
90 let weight_elements =
91 model_config.per_layer_weight_elements() * model_config.num_hidden_layers;
92 let weight_mb = weight_elements / (2 * 1024 * 1024);
93 let scratch_mb =
94 (config.max_seq_len * model_config.hidden_size * 4 * 10) / (1024 * 1024);
95 weight_mb + scratch_mb + 512
96 } else {
97 model_config.total_training_vram_bytes_shared(config.max_seq_len) / (1024 * 1024) + 256
98 }
99 }
100
101 fn try_init_cuda(
104 model: &Transformer,
105 model_config: &TransformerConfig,
106 config: &InstructConfig,
107 lora_layers: &[LoRALayer],
108 ) -> (Option<CudaTrainer>, Option<Vec<CudaBlock>>, Option<CudaBlockScratch>) {
109 if !cuda_training_available() {
110 eprintln!("[CUDA] No CUDA runtime detected — using CPU");
111 return (None, None, None);
112 }
113
114 let trainer = match CudaTrainer::new() {
115 Ok(t) => {
116 eprintln!(
117 "[CUDA] Initialized: {} ({:.1} GB)",
118 t.device_name(),
119 t.total_memory() as f64 / 1e9
120 );
121 t
122 }
123 Err(e) => {
124 eprintln!("[CUDA] Failed to create trainer: {e} — using CPU");
125 return (None, None, None);
126 }
127 };
128
129 let ctx = Arc::clone(trainer.context());
130 let max_seq_len = config.max_seq_len;
131
132 if let Err(e) = pre_warm_forward_kernels(
134 model_config.hidden_size,
135 model_config.intermediate_size,
136 model_config.num_attention_heads,
137 model_config.num_kv_heads,
138 model_config.head_dim(),
139 max_seq_len,
140 ) {
141 eprintln!("[CUDA] Failed to pre-warm forward kernels: {e} — using CPU");
142 return (None, None, None);
143 }
144
145 let quantize_nf4 = config.quantize_nf4;
146 if quantize_nf4 {
147 eprintln!(
148 "[CUDA] NF4 quantization enabled — frozen weights will be 4-bit (~8x compression)"
149 );
150 }
151
152 let head_dim = model_config.head_dim();
153 if let Err(e) = pre_warm_lora_backward_kernels(
154 model_config.hidden_size,
155 model_config.num_attention_heads * head_dim,
156 model_config.num_kv_heads * head_dim,
157 max_seq_len,
158 config.lora_rank,
159 ) {
160 eprintln!("[CUDA] Failed to pre-warm LoRA backward kernels: {e} — using CPU");
161 return (None, None, None);
162 }
163
164 if let Err(e) = pre_warm_backward_cache_kernels(
165 model_config.hidden_size,
166 model_config.num_attention_heads * head_dim,
167 model_config.num_kv_heads * head_dim,
168 max_seq_len,
169 config.lora_rank,
170 model_config.intermediate_size,
171 model_config.num_attention_heads,
172 quantize_nf4,
173 ) {
174 eprintln!("[CUDA] Failed to pre-warm backward cache kernels: {e}");
175 eprintln!("[CUDA] STOP THE LINE: backward kernel pre-warming failed.");
176 eprintln!("[CUDA] This is a FATAL error — training will produce loss=0.0 if backward");
177 eprintln!("[CUDA] kernels are compiled during active GPU work (trueno#200).");
178 return (None, None, None);
179 }
180 eprintln!("[CUDA] Backward kernels pre-warmed successfully");
181 if let Err(e) = pre_warm_lora_adamw_kernels(
182 model_config.hidden_size,
183 model_config.num_attention_heads * head_dim,
184 model_config.num_kv_heads * head_dim,
185 config.lora_rank,
186 0, model_config.intermediate_size,
188 quantize_nf4,
189 ) {
190 eprintln!("[CUDA] Failed to pre-warm AdamW kernels: {e} — using CPU");
191 return (None, None, None);
192 }
193
194 let mut blocks = Vec::with_capacity(model.config.num_hidden_layers);
195 for (i, layer) in model.layers.iter().enumerate() {
196 let input_norm = layer.input_norm.weight.data();
197 let input_norm = input_norm.as_slice().expect("contiguous input_norm");
198 let post_attn_norm = layer.post_attn_norm.weight.data();
199 let post_attn_norm = post_attn_norm.as_slice().expect("contiguous post_attn_norm");
200 let w_q = layer.self_attn.w_q.data();
201 let w_q = w_q.as_slice().expect("contiguous w_q");
202 let w_k = layer.self_attn.w_k.data();
203 let w_k = w_k.as_slice().expect("contiguous w_k");
204 let w_v = layer.self_attn.w_v.data();
205 let w_v = w_v.as_slice().expect("contiguous w_v");
206 let w_o = layer.self_attn.w_o.data();
207 let w_o = w_o.as_slice().expect("contiguous w_o");
208 let w_gate = layer.ffn.w_gate.data();
209 let w_gate = w_gate.as_slice().expect("contiguous w_gate");
210 let w_up = layer.ffn.w_up.data();
211 let w_up = w_up.as_slice().expect("contiguous w_up");
212 let w_down = layer.ffn.w_down.data();
213 let w_down = w_down.as_slice().expect("contiguous w_down");
214
215 let result = if quantize_nf4 {
216 let lora_scale = config.lora_alpha / config.lora_rank as f32;
217 let lora_rank = config.lora_rank;
218 let q_lora_idx = i * 2;
219 let v_lora_idx = i * 2 + 1;
220
221 let q_a_data;
223 let q_b_data;
224 let q_lora = if q_lora_idx < lora_layers.len() {
225 q_a_data = lora_layers[q_lora_idx].lora_a().data();
226 q_b_data = lora_layers[q_lora_idx].lora_b().data();
227 Some((
228 q_a_data.as_slice().expect("contiguous lora_a_q"),
229 q_b_data.as_slice().expect("contiguous lora_b_q"),
230 ))
231 } else {
232 None
233 };
234
235 let v_a_data;
237 let v_b_data;
238 let v_lora = if v_lora_idx < lora_layers.len() {
239 v_a_data = lora_layers[v_lora_idx].lora_a().data();
240 v_b_data = lora_layers[v_lora_idx].lora_b().data();
241 Some((
242 v_a_data.as_slice().expect("contiguous lora_a_v"),
243 v_b_data.as_slice().expect("contiguous lora_b_v"),
244 ))
245 } else {
246 None
247 };
248
249 let q_norm_data = layer
251 .self_attn
252 .q_norm
253 .as_ref()
254 .map(|t| t.data().as_slice().expect("contiguous q_norm").to_vec());
255 let k_norm_data = layer
256 .self_attn
257 .k_norm
258 .as_ref()
259 .map(|t| t.data().as_slice().expect("contiguous k_norm").to_vec());
260
261 crate::transformer::CudaNf4TransformerBlock::new(
262 model_config,
263 i,
264 Arc::clone(&ctx),
265 input_norm,
266 post_attn_norm,
267 w_q,
268 w_k,
269 w_v,
270 w_o,
271 w_gate,
272 w_up,
273 w_down,
274 max_seq_len,
275 q_lora,
276 v_lora,
277 lora_scale,
278 lora_rank,
279 q_norm_data.as_deref(),
280 k_norm_data.as_deref(),
281 )
282 .map(CudaBlock::Nf4)
283 } else {
284 CudaTransformerBlock::new(
285 model_config,
286 i,
287 Arc::clone(&ctx),
288 input_norm,
289 post_attn_norm,
290 w_q,
291 w_k,
292 w_v,
293 w_o,
294 w_gate,
295 w_up,
296 w_down,
297 max_seq_len,
298 None, None, None, )
302 .map(CudaBlock::Fp32)
303 };
304
305 match result {
306 Ok(block) => blocks.push(block),
307 Err(e) => {
308 eprintln!(
309 "[CUDA] Failed to upload layer {i} to GPU: {e} — falling back to CPU"
310 );
311 return (None, None, None);
312 }
313 }
314 }
315
316 eprintln!(
317 "[CUDA] Uploaded {} transformer layers to GPU (max_seq_len={})",
318 blocks.len(),
319 max_seq_len
320 );
321
322 assert_eq!(blocks.len(), model.config.num_hidden_layers);
323 if std::env::var("FP16_GEMM").as_deref() == Ok("1") && quantize_nf4 {
325 super::super::gpu_backward_fallback::init_fp16_weights(&mut blocks, trainer.stream());
326 }
327
328 let shared_scratch = if quantize_nf4 {
330 match CudaBlockScratch::new(model_config, max_seq_len, &ctx, config.lora_rank) {
331 Ok(s) => Some(s),
332 Err(e) => {
333 eprintln!("[CUDA] Failed to allocate shared scratch: {e} — using CPU");
334 return (None, None, None);
335 }
336 }
337 } else {
338 None
339 };
340
341 (Some(trainer), Some(blocks), shared_scratch)
342 }
343
344 pub(super) fn try_init_gpu_training(
346 model: &Transformer,
347 model_config: &TransformerConfig,
348 max_seq_len: usize,
349 cuda_trainer: Option<&CudaTrainer>,
350 cuda_blocks: Option<&Vec<CudaBlock>>,
351 ) -> Option<InstructGpuTrainingState> {
352 let trainer = cuda_trainer?;
353 let blocks = cuda_blocks?;
354
355 let hidden_size = model_config.hidden_size;
356 let buf_size = max_seq_len * hidden_size;
357 let num_layers = blocks.len();
358
359 let mut layer_inputs = Vec::with_capacity(num_layers);
361 for _ in 0..num_layers {
362 match trainer.zeros(buf_size) {
363 Ok(buf) => layer_inputs.push(buf),
364 Err(e) => {
365 eprintln!("[CUDA] GPU training init failed (layer input alloc): {e}");
366 return None;
367 }
368 }
369 }
370
371 let norm_data = model.norm.weight.data();
373 let norm_slice = norm_data.as_slice().expect("contiguous final norm weight");
374 let final_norm_weight = match trainer.upload(norm_slice) {
375 Ok(buf) => buf,
376 Err(e) => {
377 eprintln!("[CUDA] GPU training init failed (final norm upload): {e}");
378 return None;
379 }
380 };
381
382 let blocks_output = trainer.zeros(buf_size).ok()?;
384 let grad_buf_a = trainer.zeros(buf_size).ok()?;
385 let grad_buf_b = trainer.zeros(buf_size).ok()?;
386 let grad_final_norm_weight = trainer.zeros(hidden_size).ok()?;
387
388 let vocab_size = model_config.vocab_size;
390 let embed_data = model.embed_tokens.weight.data();
391 let embed_slice = embed_data.as_slice().expect("contiguous embed");
392 let embed_bytes = vocab_size * hidden_size * 4; let vram_available_mb = trainer.free_memory_mb().unwrap_or(0);
394 let embed_mb = embed_bytes / (1024 * 1024);
395 let use_gpu_embed = vram_available_mb > (embed_mb + 256) as u64;
396
397 let (embed_original, embed_transposed) = if use_gpu_embed {
398 eprintln!(
399 "[CUDA] GPU-resident embeddings: {embed_mb}MB (VRAM free: {vram_available_mb}MB)"
400 );
401 let orig = trainer
402 .upload(embed_slice)
403 .map_err(|e| eprintln!("[CUDA] embed_original upload failed: {e}"))
404 .ok()?;
405 let trans = trainer.zeros(1).ok()?;
406 (orig, trans)
407 } else {
408 eprintln!("[CUDA] Skipping GPU embeddings ({embed_mb}MB > {vram_available_mb}MB free)");
409 let orig = trainer.zeros(1).ok()?;
410 let trans = trainer.zeros(1).ok()?;
411 (orig, trans)
412 };
413
414 let logits_buf = trainer
416 .zeros(max_seq_len * vocab_size)
417 .map_err(|e| eprintln!("[CUDA] logits_buf alloc failed: {e}"))
418 .ok()?;
419
420 let grad_hidden_buf = trainer.zeros(buf_size).ok()?;
422
423 eprintln!(
424 "[CUDA] GPU training state initialized: {num_layers} layers, {buf_size} buf_size, \
425 embed=[{vocab_size}x{hidden_size}] on GPU (NF4 QLoRA mode)"
426 );
427
428 let output_scratch = trainer.zeros(buf_size).ok()?;
430 let grad_upload_buf = trainer.zeros(buf_size).ok()?;
431 let fwd_scratch_a = trainer.zeros(buf_size).ok()?;
432 let fwd_scratch_b = trainer.zeros(buf_size).ok()?;
433 let lm_head_hidden_buf = trainer.zeros(buf_size).ok()?;
434
435 let num_layers = layer_inputs.len();
436 Some(InstructGpuTrainingState {
437 layer_inputs,
438 final_norm_weight,
439 blocks_output,
440 grad_buf_a,
441 grad_buf_b,
442 grad_final_norm_weight,
443 embed_transposed,
444 embed_original,
445 logits_buf,
446 grad_hidden_buf,
447 output_scratch,
448 grad_upload_buf,
449 fwd_scratch_a,
450 fwd_scratch_b,
451 lm_head_hidden_buf,
452 forward_graph_exec: None,
453 graph_cached_seq_len: 0,
454 backward_graph_state: None,
455 cublas_workspace: None,
456 profiler_layer_fwd_us: vec![0u64; num_layers],
457 profiler_layer_bwd_us: vec![0u64; num_layers],
458 profiler_layer_start: None,
459 profiler_op_us: [0u64; 16],
460 profiler_op_start: None,
461 })
462 }
463
464 fn try_init_nf4_lora_training(
466 cuda_trainer: Option<&CudaTrainer>,
467 cuda_blocks: Option<&Vec<CudaBlock>>,
468 model_config: &TransformerConfig,
469 config: &InstructConfig,
470 ) -> (Option<CudaLoraGradWorkspace>, Option<Vec<GpuLoraOptimizerState>>) {
471 let trainer = match cuda_trainer {
472 Some(t) => t,
473 None => return (None, None),
474 };
475 let blocks = match cuda_blocks {
476 Some(b) => b,
477 None => return (None, None),
478 };
479
480 let grad_ws =
481 match CudaLoraGradWorkspace::new(trainer.context(), model_config, config.lora_rank) {
482 Ok(ws) => ws,
483 Err(e) => {
484 eprintln!("[CUDA] NF4 LoRA grad workspace alloc failed: {e}");
485 return (None, None);
486 }
487 };
488
489 let mut opt_states = Vec::with_capacity(blocks.len());
490 for (i, block) in blocks.iter().enumerate() {
491 match block.init_lora_optimizer_state() {
492 Ok(state) => opt_states.push(state),
493 Err(e) => {
494 eprintln!("[CUDA] NF4 LoRA optimizer init failed (layer {i}): {e}");
495 return (None, None);
496 }
497 }
498 }
499
500 eprintln!(
501 "[CUDA] NF4 QLoRA training initialized: {} layers, rank={}, scale={:.2}",
502 blocks.len(),
503 config.lora_rank,
504 config.lora_alpha / config.lora_rank as f32,
505 );
506
507 (Some(grad_ws), Some(opt_states))
508 }
509}