entrenar/finetune/instruct_pipeline/
cuda_init.rs

1#[cfg(feature = "cuda")]
2use super::{
3    CudaTrainer, InstructConfig, InstructGpuTrainingState, InstructPipeline, LoRALayer,
4    Transformer, TransformerConfig, VramGuard,
5};
6
7#[cfg(feature = "cuda")]
8use crate::autograd::cuda_backward::pre_warm_lora_backward_kernels as pre_warm_backward_cache_kernels;
9#[cfg(feature = "cuda")]
10use crate::autograd::cuda_forward::{pre_warm_forward_kernels, pre_warm_lora_backward_kernels};
11#[cfg(feature = "cuda")]
12use crate::autograd::cuda_optim::pre_warm_lora_adamw_kernels;
13#[cfg(feature = "cuda")]
14use crate::autograd::cuda_training::cuda_training_available;
15#[cfg(feature = "cuda")]
16use crate::transformer::{
17    CudaBlock, CudaBlockScratch, CudaLoraGradWorkspace, CudaTransformerBlock, GpuLoraOptimizerState,
18};
19#[cfg(feature = "cuda")]
20use std::sync::Arc;
21
22#[cfg(feature = "cuda")]
23impl InstructPipeline {
24    /// Initialize CUDA acceleration: create trainer, upload blocks, init LoRA training.
25    /// GPU-SHARE-002: Acquires VRAM guard; falls back to CPU if denied.
26    pub(super) fn init_cuda(&mut self, model_config: &TransformerConfig) {
27        // GPU-SHARE-002: Acquire VRAM reservation before allocating
28        let budget_mb = Self::estimate_vram_mb(model_config, &self.config);
29        let task_label = if self.config.quantize_nf4 { "instruct-qlora" } else { "instruct-lora" };
30        match VramGuard::acquire(budget_mb, task_label) {
31            Ok(guard) => {
32                eprintln!(
33                    "[GPU-SHARE] VRAM reserved: {budget_mb} MB for {task_label} (gpu: {})",
34                    guard.gpu_uuid()
35                );
36                self.vram_guard = Some(guard);
37            }
38            Err(e) => {
39                eprintln!("[GPU-SHARE] VRAM guard denied: {e} — falling back to CPU");
40                return;
41            }
42        }
43
44        let (trainer, blocks, scratch) =
45            Self::try_init_cuda(&self.model, model_config, &self.config, &self.lora_layers);
46
47        if trainer.is_none() {
48            // CUDA init failed — release the guard
49            self.vram_guard = None;
50            return;
51        }
52
53        self.cuda_trainer = trainer;
54        self.cuda_blocks = blocks;
55        self.shared_scratch = scratch;
56
57        // GPU training state (layer input snapshots for backward)
58        self.gpu_training = Self::try_init_gpu_training(
59            &self.model,
60            model_config,
61            self.config.max_seq_len,
62            self.cuda_trainer.as_ref(),
63            self.cuda_blocks.as_ref(),
64        );
65
66        if self.config.quantize_nf4 {
67            let (grad_ws, opt_states) = Self::try_init_nf4_lora_training(
68                self.cuda_trainer.as_ref(),
69                self.cuda_blocks.as_ref(),
70                model_config,
71                &self.config,
72            );
73            if let (Some(ws), Some(t)) = (&grad_ws, &self.cuda_trainer) {
74                self.lora_fused_clip =
75                    super::super::fused_lora_clip::init_lora_fused_clip(ws, t.context());
76            }
77            self.cuda_lora_grad_workspace = grad_ws;
78            self.cuda_lora_optimizer_states = opt_states;
79        }
80
81        // GPU-SHARE-002: Update actual VRAM usage after all allocations
82        if let Some(ref mut guard) = self.vram_guard {
83            let _ = guard.update_actual(budget_mb);
84        }
85    }
86
87    /// Estimate VRAM usage (MB) for GPU training (GPU-SHARE-002 ledger reservation).
88    fn estimate_vram_mb(model_config: &TransformerConfig, config: &InstructConfig) -> usize {
89        if config.quantize_nf4 {
90            let weight_elements =
91                model_config.per_layer_weight_elements() * model_config.num_hidden_layers;
92            let weight_mb = weight_elements / (2 * 1024 * 1024);
93            let scratch_mb =
94                (config.max_seq_len * model_config.hidden_size * 4 * 10) / (1024 * 1024);
95            weight_mb + scratch_mb + 512
96        } else {
97            model_config.total_training_vram_bytes_shared(config.max_seq_len) / (1024 * 1024) + 256
98        }
99    }
100
101    /// Create `CudaTrainer` and upload all transformer layer weights to GPU.
102    /// Returns `(None, None, None)` if CUDA is unavailable or any step fails.
103    fn try_init_cuda(
104        model: &Transformer,
105        model_config: &TransformerConfig,
106        config: &InstructConfig,
107        lora_layers: &[LoRALayer],
108    ) -> (Option<CudaTrainer>, Option<Vec<CudaBlock>>, Option<CudaBlockScratch>) {
109        if !cuda_training_available() {
110            eprintln!("[CUDA] No CUDA runtime detected — using CPU");
111            return (None, None, None);
112        }
113
114        let trainer = match CudaTrainer::new() {
115            Ok(t) => {
116                eprintln!(
117                    "[CUDA] Initialized: {} ({:.1} GB)",
118                    t.device_name(),
119                    t.total_memory() as f64 / 1e9
120                );
121                t
122            }
123            Err(e) => {
124                eprintln!("[CUDA] Failed to create trainer: {e} — using CPU");
125                return (None, None, None);
126            }
127        };
128
129        let ctx = Arc::clone(trainer.context());
130        let max_seq_len = config.max_seq_len;
131
132        // C-PREWARM-001: JIT-compile forward kernels before block upload
133        if let Err(e) = pre_warm_forward_kernels(
134            model_config.hidden_size,
135            model_config.intermediate_size,
136            model_config.num_attention_heads,
137            model_config.num_kv_heads,
138            model_config.head_dim(),
139            max_seq_len,
140        ) {
141            eprintln!("[CUDA] Failed to pre-warm forward kernels: {e} — using CPU");
142            return (None, None, None);
143        }
144
145        let quantize_nf4 = config.quantize_nf4;
146        if quantize_nf4 {
147            eprintln!(
148                "[CUDA] NF4 quantization enabled — frozen weights will be 4-bit (~8x compression)"
149            );
150        }
151
152        let head_dim = model_config.head_dim();
153        if let Err(e) = pre_warm_lora_backward_kernels(
154            model_config.hidden_size,
155            model_config.num_attention_heads * head_dim,
156            model_config.num_kv_heads * head_dim,
157            max_seq_len,
158            config.lora_rank,
159        ) {
160            eprintln!("[CUDA] Failed to pre-warm LoRA backward kernels: {e} — using CPU");
161            return (None, None, None);
162        }
163
164        if let Err(e) = pre_warm_backward_cache_kernels(
165            model_config.hidden_size,
166            model_config.num_attention_heads * head_dim,
167            model_config.num_kv_heads * head_dim,
168            max_seq_len,
169            config.lora_rank,
170            model_config.intermediate_size,
171            model_config.num_attention_heads,
172            quantize_nf4,
173        ) {
174            eprintln!("[CUDA] Failed to pre-warm backward cache kernels: {e}");
175            eprintln!("[CUDA] STOP THE LINE: backward kernel pre-warming failed.");
176            eprintln!("[CUDA] This is a FATAL error — training will produce loss=0.0 if backward");
177            eprintln!("[CUDA] kernels are compiled during active GPU work (trueno#200).");
178            return (None, None, None);
179        }
180        eprintln!("[CUDA] Backward kernels pre-warmed successfully");
181        if let Err(e) = pre_warm_lora_adamw_kernels(
182            model_config.hidden_size,
183            model_config.num_attention_heads * head_dim,
184            model_config.num_kv_heads * head_dim,
185            config.lora_rank,
186            0, // instruct has no classifier head
187            model_config.intermediate_size,
188            quantize_nf4,
189        ) {
190            eprintln!("[CUDA] Failed to pre-warm AdamW kernels: {e} — using CPU");
191            return (None, None, None);
192        }
193
194        let mut blocks = Vec::with_capacity(model.config.num_hidden_layers);
195        for (i, layer) in model.layers.iter().enumerate() {
196            let input_norm = layer.input_norm.weight.data();
197            let input_norm = input_norm.as_slice().expect("contiguous input_norm");
198            let post_attn_norm = layer.post_attn_norm.weight.data();
199            let post_attn_norm = post_attn_norm.as_slice().expect("contiguous post_attn_norm");
200            let w_q = layer.self_attn.w_q.data();
201            let w_q = w_q.as_slice().expect("contiguous w_q");
202            let w_k = layer.self_attn.w_k.data();
203            let w_k = w_k.as_slice().expect("contiguous w_k");
204            let w_v = layer.self_attn.w_v.data();
205            let w_v = w_v.as_slice().expect("contiguous w_v");
206            let w_o = layer.self_attn.w_o.data();
207            let w_o = w_o.as_slice().expect("contiguous w_o");
208            let w_gate = layer.ffn.w_gate.data();
209            let w_gate = w_gate.as_slice().expect("contiguous w_gate");
210            let w_up = layer.ffn.w_up.data();
211            let w_up = w_up.as_slice().expect("contiguous w_up");
212            let w_down = layer.ffn.w_down.data();
213            let w_down = w_down.as_slice().expect("contiguous w_down");
214
215            let result = if quantize_nf4 {
216                let lora_scale = config.lora_alpha / config.lora_rank as f32;
217                let lora_rank = config.lora_rank;
218                let q_lora_idx = i * 2;
219                let v_lora_idx = i * 2 + 1;
220
221                // Q LoRA
222                let q_a_data;
223                let q_b_data;
224                let q_lora = if q_lora_idx < lora_layers.len() {
225                    q_a_data = lora_layers[q_lora_idx].lora_a().data();
226                    q_b_data = lora_layers[q_lora_idx].lora_b().data();
227                    Some((
228                        q_a_data.as_slice().expect("contiguous lora_a_q"),
229                        q_b_data.as_slice().expect("contiguous lora_b_q"),
230                    ))
231                } else {
232                    None
233                };
234
235                // V LoRA
236                let v_a_data;
237                let v_b_data;
238                let v_lora = if v_lora_idx < lora_layers.len() {
239                    v_a_data = lora_layers[v_lora_idx].lora_a().data();
240                    v_b_data = lora_layers[v_lora_idx].lora_b().data();
241                    Some((
242                        v_a_data.as_slice().expect("contiguous lora_a_v"),
243                        v_b_data.as_slice().expect("contiguous lora_b_v"),
244                    ))
245                } else {
246                    None
247                };
248
249                // ENT-270: Extract QK-norm weights if present
250                let q_norm_data = layer
251                    .self_attn
252                    .q_norm
253                    .as_ref()
254                    .map(|t| t.data().as_slice().expect("contiguous q_norm").to_vec());
255                let k_norm_data = layer
256                    .self_attn
257                    .k_norm
258                    .as_ref()
259                    .map(|t| t.data().as_slice().expect("contiguous k_norm").to_vec());
260
261                crate::transformer::CudaNf4TransformerBlock::new(
262                    model_config,
263                    i,
264                    Arc::clone(&ctx),
265                    input_norm,
266                    post_attn_norm,
267                    w_q,
268                    w_k,
269                    w_v,
270                    w_o,
271                    w_gate,
272                    w_up,
273                    w_down,
274                    max_seq_len,
275                    q_lora,
276                    v_lora,
277                    lora_scale,
278                    lora_rank,
279                    q_norm_data.as_deref(),
280                    k_norm_data.as_deref(),
281                )
282                .map(CudaBlock::Nf4)
283            } else {
284                CudaTransformerBlock::new(
285                    model_config,
286                    i,
287                    Arc::clone(&ctx),
288                    input_norm,
289                    post_attn_norm,
290                    w_q,
291                    w_k,
292                    w_v,
293                    w_o,
294                    w_gate,
295                    w_up,
296                    w_down,
297                    max_seq_len,
298                    None, // b_q (instruct pipeline doesn't surface biases yet)
299                    None, // b_k
300                    None, // b_v
301                )
302                .map(CudaBlock::Fp32)
303            };
304
305            match result {
306                Ok(block) => blocks.push(block),
307                Err(e) => {
308                    eprintln!(
309                        "[CUDA] Failed to upload layer {i} to GPU: {e} — falling back to CPU"
310                    );
311                    return (None, None, None);
312                }
313            }
314        }
315
316        eprintln!(
317            "[CUDA] Uploaded {} transformer layers to GPU (max_seq_len={})",
318            blocks.len(),
319            max_seq_len
320        );
321
322        assert_eq!(blocks.len(), model.config.num_hidden_layers);
323        // PMAT-470: FP16 weight cast for tensor core GEMM
324        if std::env::var("FP16_GEMM").as_deref() == Ok("1") && quantize_nf4 {
325            super::super::gpu_backward_fallback::init_fp16_weights(&mut blocks, trainer.stream());
326        }
327
328        // C-SCRATCH-001: Shared scratch for NF4
329        let shared_scratch = if quantize_nf4 {
330            match CudaBlockScratch::new(model_config, max_seq_len, &ctx, config.lora_rank) {
331                Ok(s) => Some(s),
332                Err(e) => {
333                    eprintln!("[CUDA] Failed to allocate shared scratch: {e} — using CPU");
334                    return (None, None, None);
335                }
336            }
337        } else {
338            None
339        };
340
341        (Some(trainer), Some(blocks), shared_scratch)
342    }
343
344    /// Initialize GPU training state for NF4 QLoRA backward pass.
345    pub(super) fn try_init_gpu_training(
346        model: &Transformer,
347        model_config: &TransformerConfig,
348        max_seq_len: usize,
349        cuda_trainer: Option<&CudaTrainer>,
350        cuda_blocks: Option<&Vec<CudaBlock>>,
351    ) -> Option<InstructGpuTrainingState> {
352        let trainer = cuda_trainer?;
353        let blocks = cuda_blocks?;
354
355        let hidden_size = model_config.hidden_size;
356        let buf_size = max_seq_len * hidden_size;
357        let num_layers = blocks.len();
358
359        // Allocate layer-input snapshot buffers
360        let mut layer_inputs = Vec::with_capacity(num_layers);
361        for _ in 0..num_layers {
362            match trainer.zeros(buf_size) {
363                Ok(buf) => layer_inputs.push(buf),
364                Err(e) => {
365                    eprintln!("[CUDA] GPU training init failed (layer input alloc): {e}");
366                    return None;
367                }
368            }
369        }
370
371        // Upload final RMSNorm weight
372        let norm_data = model.norm.weight.data();
373        let norm_slice = norm_data.as_slice().expect("contiguous final norm weight");
374        let final_norm_weight = match trainer.upload(norm_slice) {
375            Ok(buf) => buf,
376            Err(e) => {
377                eprintln!("[CUDA] GPU training init failed (final norm upload): {e}");
378                return None;
379            }
380        };
381
382        // Allocate gradient scratch buffers
383        let blocks_output = trainer.zeros(buf_size).ok()?;
384        let grad_buf_a = trainer.zeros(buf_size).ok()?;
385        let grad_buf_b = trainer.zeros(buf_size).ok()?;
386        let grad_final_norm_weight = trainer.zeros(hidden_size).ok()?;
387
388        // Upload embeddings for GPU lm_head (KAIZEN-068). PMAT-420: skip on <=16GB.
389        let vocab_size = model_config.vocab_size;
390        let embed_data = model.embed_tokens.weight.data();
391        let embed_slice = embed_data.as_slice().expect("contiguous embed");
392        let embed_bytes = vocab_size * hidden_size * 4; // entrenar#317: single layout
393        let vram_available_mb = trainer.free_memory_mb().unwrap_or(0);
394        let embed_mb = embed_bytes / (1024 * 1024);
395        let use_gpu_embed = vram_available_mb > (embed_mb + 256) as u64;
396
397        let (embed_original, embed_transposed) = if use_gpu_embed {
398            eprintln!(
399                "[CUDA] GPU-resident embeddings: {embed_mb}MB (VRAM free: {vram_available_mb}MB)"
400            );
401            let orig = trainer
402                .upload(embed_slice)
403                .map_err(|e| eprintln!("[CUDA] embed_original upload failed: {e}"))
404                .ok()?;
405            let trans = trainer.zeros(1).ok()?;
406            (orig, trans)
407        } else {
408            eprintln!("[CUDA] Skipping GPU embeddings ({embed_mb}MB > {vram_available_mb}MB free)");
409            let orig = trainer.zeros(1).ok()?;
410            let trans = trainer.zeros(1).ok()?;
411            (orig, trans)
412        };
413
414        // Logits scratch: [max_seq_len, vocab_size]
415        let logits_buf = trainer
416            .zeros(max_seq_len * vocab_size)
417            .map_err(|e| eprintln!("[CUDA] logits_buf alloc failed: {e}"))
418            .ok()?;
419
420        // Grad-hidden scratch: [max_seq_len, hidden_size]
421        let grad_hidden_buf = trainer.zeros(buf_size).ok()?;
422
423        eprintln!(
424            "[CUDA] GPU training state initialized: {num_layers} layers, {buf_size} buf_size, \
425             embed=[{vocab_size}x{hidden_size}] on GPU (NF4 QLoRA mode)"
426        );
427
428        // KAIZEN-045/062: Pre-allocate backward + forward scratch buffers
429        let output_scratch = trainer.zeros(buf_size).ok()?;
430        let grad_upload_buf = trainer.zeros(buf_size).ok()?;
431        let fwd_scratch_a = trainer.zeros(buf_size).ok()?;
432        let fwd_scratch_b = trainer.zeros(buf_size).ok()?;
433        let lm_head_hidden_buf = trainer.zeros(buf_size).ok()?;
434
435        let num_layers = layer_inputs.len();
436        Some(InstructGpuTrainingState {
437            layer_inputs,
438            final_norm_weight,
439            blocks_output,
440            grad_buf_a,
441            grad_buf_b,
442            grad_final_norm_weight,
443            embed_transposed,
444            embed_original,
445            logits_buf,
446            grad_hidden_buf,
447            output_scratch,
448            grad_upload_buf,
449            fwd_scratch_a,
450            fwd_scratch_b,
451            lm_head_hidden_buf,
452            forward_graph_exec: None,
453            graph_cached_seq_len: 0,
454            backward_graph_state: None,
455            cublas_workspace: None,
456            profiler_layer_fwd_us: vec![0u64; num_layers],
457            profiler_layer_bwd_us: vec![0u64; num_layers],
458            profiler_layer_start: None,
459            profiler_op_us: [0u64; 16],
460            profiler_op_start: None,
461        })
462    }
463
464    /// Initialize NF4 LoRA training state: gradient workspace + per-layer optimizer states.
465    fn try_init_nf4_lora_training(
466        cuda_trainer: Option<&CudaTrainer>,
467        cuda_blocks: Option<&Vec<CudaBlock>>,
468        model_config: &TransformerConfig,
469        config: &InstructConfig,
470    ) -> (Option<CudaLoraGradWorkspace>, Option<Vec<GpuLoraOptimizerState>>) {
471        let trainer = match cuda_trainer {
472            Some(t) => t,
473            None => return (None, None),
474        };
475        let blocks = match cuda_blocks {
476            Some(b) => b,
477            None => return (None, None),
478        };
479
480        let grad_ws =
481            match CudaLoraGradWorkspace::new(trainer.context(), model_config, config.lora_rank) {
482                Ok(ws) => ws,
483                Err(e) => {
484                    eprintln!("[CUDA] NF4 LoRA grad workspace alloc failed: {e}");
485                    return (None, None);
486                }
487            };
488
489        let mut opt_states = Vec::with_capacity(blocks.len());
490        for (i, block) in blocks.iter().enumerate() {
491            match block.init_lora_optimizer_state() {
492                Ok(state) => opt_states.push(state),
493                Err(e) => {
494                    eprintln!("[CUDA] NF4 LoRA optimizer init failed (layer {i}): {e}");
495                    return (None, None);
496                }
497            }
498        }
499
500        eprintln!(
501            "[CUDA] NF4 QLoRA training initialized: {} layers, rank={}, scale={:.2}",
502            blocks.len(),
503            config.lora_rank,
504            config.lora_alpha / config.lora_rank as f32,
505        );
506
507        (Some(grad_ws), Some(opt_states))
508    }
509}
entrenar/finetune/instruct_pipeline/cuda_init.rs

entrenar/finetune/instruct_pipeline/
cuda_init.rs