Skip to main content

entrenar/finetune/instruct_pipeline/
accessors.rs

//! Small accessor and utility methods on `InstructPipeline`:
//! `tokenize`, `has_tokenizer`, `num_trainable_parameters`, `set_learning_rate`,
//! `learning_rate`, `set_model_path`, `sync_lora_to_cpu`, `is_cuda`, `gpu_name`,
//! `gpu_total_memory`, `summary`, `tokenizer`, `enable_profiler`.
5
6#[allow(clippy::wildcard_imports)]
7use super::*;
8
9#[cfg(feature = "cuda")]
10use crate::autograd::cuda_training::CudaTrainer;
11
12impl InstructPipeline {
13    /// Tokenize text without truncation.
14    ///
15    /// Returns the full token sequence. Callers (e.g., `train_step`) are
16    /// responsible for budget allocation and truncation of the concatenated
17    /// prompt+response sequence.
18    ///
19    /// Falls back to byte-level encoding (each UTF-8 byte as a u32 token ID)
20    /// when no BPE tokenizer is loaded.
21    pub fn tokenize(&self, text: &str) -> Vec<u32> {
22        match self.tokenizer.as_ref() {
23            Some(tok) => tok.encode(text),
24            None => {
25                // Byte-level fallback when no BPE tokenizer is loaded
26                text.bytes().map(u32::from).collect()
27            }
28        }
29    }
30
31    /// Returns `true` if a BPE tokenizer is loaded.
32    #[must_use]
33    pub fn has_tokenizer(&self) -> bool {
34        self.tokenizer.is_some()
35    }
36
37    /// Number of trainable LoRA parameters.
38    #[must_use]
39    pub fn num_trainable_parameters(&self) -> usize {
40        // LoRA layers store weight + lora_a + lora_b; we count lora_a + lora_b
41        self.lora_layers.len()
42            * 2
43            * self.config.lora_rank
44            * (self.lora_layers.first().map_or(0, |_| {
45                // Approximate: each LoRA pair has rank * (rows + cols) params
46                // This is a rough estimate since layers may differ in size
47                1
48            }))
49    }
50
51    /// Update learning rate (for LR scheduling).
52    pub fn set_learning_rate(&mut self, lr: f32) {
53        self.optimizer.set_lr(lr);
54    }
55
56    /// Get current learning rate.
57    #[must_use]
58    pub fn learning_rate(&self) -> f32 {
59        contract_pre_learning_rate_scaling!();
60        self.optimizer.lr()
61    }
62
63    /// Set model path for checkpoint provenance.
64    pub fn set_model_path(&mut self, path: &Path) {
65        self.model_dir = Some(path.to_path_buf());
66    }
67
68    /// Synchronize GPU LoRA weights back to CPU LoRA layers (NF4 QLoRA).
69    ///
70    /// Required for checkpointing after NF4 QLoRA training. Downloads A_q, B_q,
71    /// A_v, B_v from each NF4 block and updates the corresponding CPU LoRA layers.
72    ///
73    /// # Contract (C-QLORA-CKPT-001)
74    ///
75    /// - **Precondition**: NF4 QLoRA training completed (optimizer steps applied)
76    /// - **Postcondition**: CPU LoRA layers match GPU-trained LoRA weights
77    #[cfg(feature = "cuda")]
78    pub fn sync_lora_to_cpu(&mut self) {
79        let blocks = match self.cuda_blocks.as_ref() {
80            Some(b) => b,
81            None => return,
82        };
83
84        let lora_scale = self.config.lora_alpha / self.config.lora_rank.max(1) as f32;
85        let inv_scale = if lora_scale.abs() > 1e-10 { 1.0 / lora_scale } else { 1.0 };
86
87        for (layer_idx, block) in blocks.iter().enumerate() {
88            if let Ok((a_q, b_q, a_v, b_v)) = block.download_lora_weights() {
89                let q_lora_idx = layer_idx * 2;
90                let v_lora_idx = layer_idx * 2 + 1;
91
92                // Un-scale B matrices (GPU stores B * lora_scale)
93                let b_q_unscaled: Vec<f32> = b_q.iter().map(|&v| v * inv_scale).collect();
94                let b_v_unscaled: Vec<f32> = b_v.iter().map(|&v| v * inv_scale).collect();
95
96                if q_lora_idx < self.lora_layers.len() {
97                    *self.lora_layers[q_lora_idx].lora_a_mut() = crate::Tensor::from_vec(a_q, true);
98                    *self.lora_layers[q_lora_idx].lora_b_mut() =
99                        crate::Tensor::from_vec(b_q_unscaled, true);
100                }
101                if v_lora_idx < self.lora_layers.len() {
102                    *self.lora_layers[v_lora_idx].lora_a_mut() = crate::Tensor::from_vec(a_v, true);
103                    *self.lora_layers[v_lora_idx].lora_b_mut() =
104                        crate::Tensor::from_vec(b_v_unscaled, true);
105                }
106            }
107        }
108    }
109
110    /// Check if this pipeline is using CUDA acceleration.
111    #[must_use]
112    pub fn is_cuda(&self) -> bool {
113        #[cfg(feature = "cuda")]
114        {
115            self.cuda_blocks.is_some()
116        }
117        #[cfg(not(feature = "cuda"))]
118        {
119            false
120        }
121    }
122
123    /// Get GPU device name, or `None` if not using CUDA.
124    #[must_use]
125    pub fn gpu_name(&self) -> Option<String> {
126        #[cfg(feature = "cuda")]
127        {
128            self.cuda_trainer.as_ref().map(CudaTrainer::device_name)
129        }
130        #[cfg(not(feature = "cuda"))]
131        {
132            None
133        }
134    }
135
136    /// Get total GPU memory in bytes, or `None` if not using CUDA.
137    #[must_use]
138    pub fn gpu_total_memory(&self) -> Option<usize> {
139        #[cfg(feature = "cuda")]
140        {
141            self.cuda_trainer.as_ref().map(CudaTrainer::total_memory)
142        }
143        #[cfg(not(feature = "cuda"))]
144        {
145            None
146        }
147    }
148
149    /// Summary of pipeline configuration.
150    #[must_use]
151    pub fn summary(&self) -> String {
152        format!(
153            "InstructPipeline: {} LoRA layers, rank={}, alpha={:.1}{}",
154            self.lora_layers.len(),
155            self.config.lora_rank,
156            self.config.lora_alpha,
157            if self.config.quantize_nf4 { ", NF4 QLoRA" } else { "" },
158        )
159    }
160
161    /// Get a reference to the tokenizer, if loaded.
162    #[must_use]
163    pub fn tokenizer(&self) -> Option<&HfTokenizer> {
164        self.tokenizer.as_ref()
165    }
166
167    /// PMAT-483: Enable the per-step profiler with the given report interval.
168    /// When enabled, profiler measures per-phase and per-layer timing.
169    /// Call `profiler.print_report()` or `profiler.print_json_report()` to get results.
170    pub fn enable_profiler(&mut self, report_interval: usize) {
171        self.profiler = StepProfiler::new(true, report_interval);
172    }
173}