Skip to main content

trueno/tuner/features/
builder.rs

1#![allow(missing_docs)]
2//! Builder for `TunerFeatures` with automatic normalization.
3
4use crate::hardware::HardwareCapability;
5
6use crate::tuner::types::{KernelType, QuantType};
7
8use super::TunerFeatures;
9
10// ============================================================================
11// TunerFeaturesBuilder
12// ============================================================================
13
14/// Builder for TunerFeatures with automatic normalization.
15#[derive(Default)]
16pub struct TunerFeaturesBuilder {
17    model_params_b: Option<f32>,
18    hidden_dim: Option<u32>,
19    num_layers: Option<u32>,
20    num_heads: Option<u32>,
21    head_dim: Option<u32>,
22    vocab_size: Option<u32>,
23    batch_size: Option<u32>,
24    seq_len: Option<u32>,
25    cuda_graphs: bool,
26    kv_caches: Option<u32>,
27    is_prefill: bool,
28    quant_type: Option<QuantType>,
29    kernel_type: Option<KernelType>,
30    gpu_mem_bw_gbs: Option<f32>,
31    gpu_compute_tflops: Option<f32>,
32    gpu_sm_count: Option<u32>,
33    gpu_l2_cache_mb: Option<f32>, // v1.1.0
34    is_zero_copy: bool,           // v1.1.0
35    measured_tps: Option<f32>,
36}
37
38impl TunerFeaturesBuilder {
39    /// Set model size in billions of parameters
40    pub fn model_params_b(mut self, params: f32) -> Self {
41        self.model_params_b = Some(params);
42        self
43    }
44
45    /// Set hidden dimension
46    pub fn hidden_dim(mut self, dim: u32) -> Self {
47        self.hidden_dim = Some(dim);
48        self
49    }
50
51    /// Set number of layers
52    pub fn num_layers(mut self, layers: u32) -> Self {
53        self.num_layers = Some(layers);
54        self
55    }
56
57    /// Set number of attention heads
58    pub fn num_heads(mut self, heads: u32) -> Self {
59        self.num_heads = Some(heads);
60        self
61    }
62
63    /// Set head dimension
64    pub fn head_dim(mut self, dim: u32) -> Self {
65        self.head_dim = Some(dim);
66        self
67    }
68
69    /// Set vocabulary size
70    pub fn vocab_size(mut self, size: u32) -> Self {
71        self.vocab_size = Some(size);
72        self
73    }
74
75    /// Set batch size (M)
76    pub fn batch_size(mut self, m: u32) -> Self {
77        self.batch_size = Some(m);
78        self
79    }
80
81    /// Set sequence length
82    pub fn seq_len(mut self, len: u32) -> Self {
83        self.seq_len = Some(len);
84        self
85    }
86
87    /// Enable CUDA graphs
88    pub fn cuda_graphs(mut self, enabled: bool) -> Self {
89        self.cuda_graphs = enabled;
90        self
91    }
92
93    /// Set number of KV caches
94    pub fn kv_caches(mut self, count: u32) -> Self {
95        self.kv_caches = Some(count);
96        self
97    }
98
99    /// Set prefill mode
100    pub fn is_prefill(mut self, prefill: bool) -> Self {
101        self.is_prefill = prefill;
102        self
103    }
104
105    /// Set quantization type
106    pub fn quant_type(mut self, qt: QuantType) -> Self {
107        self.quant_type = Some(qt);
108        self
109    }
110
111    /// Set kernel type
112    pub fn kernel_type(mut self, kt: KernelType) -> Self {
113        self.kernel_type = Some(kt);
114        self
115    }
116
117    /// Set GPU memory bandwidth in GB/s
118    pub fn gpu_mem_bw_gbs(mut self, bw: f32) -> Self {
119        self.gpu_mem_bw_gbs = Some(bw);
120        self
121    }
122
123    /// Set GPU compute in TFLOPS
124    pub fn gpu_compute_tflops(mut self, tflops: f32) -> Self {
125        self.gpu_compute_tflops = Some(tflops);
126        self
127    }
128
129    /// Set GPU SM count
130    pub fn gpu_sm_count(mut self, count: u32) -> Self {
131        self.gpu_sm_count = Some(count);
132        self
133    }
134
135    /// Set measured throughput (for training data)
136    pub fn measured_tps(mut self, tps: f32) -> Self {
137        self.measured_tps = Some(tps);
138        self
139    }
140
141    /// Set L2 cache size in MB (v1.1.0)
142    pub fn gpu_l2_cache_mb(mut self, l2_mb: f32) -> Self {
143        self.gpu_l2_cache_mb = Some(l2_mb);
144        self
145    }
146
147    /// Set zero-copy memory path enabled (v1.1.0)
148    pub fn is_zero_copy(mut self, enabled: bool) -> Self {
149        self.is_zero_copy = enabled;
150        self
151    }
152
153    /// Set hardware capability (auto-fills GPU features)
154    #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
155    // SAFETY: GPU bandwidth/TFLOPS values fit in f32 (practical max ~10K); f64→f32 truncation negligible.
156    pub fn hardware(mut self, hw: &HardwareCapability) -> Self {
157        if let Some(gpu) = &hw.gpu {
158            self.gpu_mem_bw_gbs = Some(gpu.memory_bw_gbps as f32);
159            self.gpu_compute_tflops = Some(gpu.peak_tflops_fp32 as f32);
160            // SM count not directly available; estimate from compute capability
161            self.gpu_sm_count = None;
162        }
163        self
164    }
165
166    /// Build the feature vector with normalization
167    #[allow(clippy::cast_precision_loss)]
168    // SAFETY: All u32 values are model hyperparams (hidden_dim ≤ 16384, layers ≤ 128,
169    //         heads ≤ 128, vocab ≤ 1M, batch ≤ 64, seq_len ≤ 32768) — well within f32 mantissa.
170    /// Build features, panicking if raw inputs violate physical constraints.
171    ///
172    /// Use `try_build()` for fallible construction.
173    pub fn build(self) -> TunerFeatures {
174        self.try_build().expect("TunerFeatures: invalid raw input (see try_build for details)")
175    }
176
177    /// Build features, returning Err if raw inputs violate physical constraints.
178    ///
179    /// F024: model_params_b, gpu_mem_bw, gpu_compute must be non-negative.
180    /// F024: Pre-normalization validation — physical quantities must be non-negative.
181    fn validate_non_negative(&self) -> Result<(), crate::tuner::TunerError> {
182        if let Some(p) = self.model_params_b {
183            if p < 0.0 {
184                return Err(crate::tuner::TunerError::InvalidFeature(format!(
185                    "model_params_b must be non-negative, got {p}"
186                )));
187            }
188        }
189        if let Some(bw) = self.gpu_mem_bw_gbs {
190            if bw < 0.0 {
191                return Err(crate::tuner::TunerError::InvalidFeature(format!(
192                    "gpu_mem_bw_gbs must be non-negative, got {bw}"
193                )));
194            }
195        }
196        if let Some(tf) = self.gpu_compute_tflops {
197            if tf < 0.0 {
198                return Err(crate::tuner::TunerError::InvalidFeature(format!(
199                    "gpu_compute_tflops must be non-negative, got {tf}"
200                )));
201            }
202        }
203        Ok(())
204    }
205
206    pub fn try_build(self) -> Result<TunerFeatures, crate::tuner::TunerError> {
207        self.validate_non_negative()?;
208
209        let batch_size = self.batch_size.unwrap_or(1);
210        let kv_caches = self.kv_caches.unwrap_or(batch_size);
211
212        // Create one-hot encodings
213        let mut quant_onehot = [0.0f32; 8];
214        if let Some(qt) = self.quant_type {
215            quant_onehot[qt.to_index()] = 1.0;
216        }
217
218        let mut kernel_onehot = [0.0f32; 16];
219        if let Some(kt) = self.kernel_type {
220            kernel_onehot[kt.to_index()] = 1.0;
221        }
222
223        // Calculate derived features
224        // C-14 (Meyer DbC): 0 = unknown, no architecture-specific magic number.
225        let hidden_dim = self.hidden_dim.unwrap_or(0) as f32;
226        let batch_size_f = batch_size as f32;
227        let quant_bytes = self.quant_type.map(|q| q.bytes_per_param()).unwrap_or(0.5625);
228
229        // Arithmetic intensity for GEMV: 2*N*K FLOPs / (N*K*bytes + K + N) bytes
230        // Simplified: ~2 / bytes_per_param for memory-bound inference
231        let arithmetic_intensity = (2.0 / quant_bytes).min(10.0) / 10.0;
232
233        // Theoretical efficiency starts at 0 (unknown until measured)
234        let theoretical_efficiency = 0.0;
235
236        Ok(TunerFeatures {
237            // Normalized static features
238            model_params_b: self
239                .model_params_b
240                .map(|p| (p.max(f32::EPSILON).log10() + 1.0) / 3.0) // log10(0.1)=-1, log10(100)=2 → [0, 1]
241                .unwrap_or(0.0)
242                .clamp(0.0, 1.0),
243            hidden_dim_norm: (hidden_dim / 16384.0).clamp(0.0, 1.0),
244            // N-07 (Meyer DbC): 0 when unknown — no model-specific defaults.
245            num_layers_norm: (self.num_layers.unwrap_or(0) as f32 / 128.0).clamp(0.0, 1.0),
246            num_heads_norm: (self.num_heads.unwrap_or(0) as f32 / 128.0).clamp(0.0, 1.0),
247            head_dim_norm: (self.head_dim.unwrap_or(0) as f32 / 256.0).clamp(0.0, 1.0),
248            vocab_size_log: self
249                .vocab_size
250                .map(|v| (v.max(1) as f32).log10() / 6.0) // log10(1M)=6
251                .unwrap_or(0.0)
252                .clamp(0.0, 1.0),
253            batch_size_norm: (batch_size_f / 64.0).clamp(0.0, 1.0),
254            seq_len_log: self
255                .seq_len
256                .map(|s| (s.max(1) as f32).log2() / 15.0) // log2(32K)≈15
257                .unwrap_or(0.0)
258                .clamp(0.0, 1.0),
259            cuda_graphs: if self.cuda_graphs { 1.0 } else { 0.0 },
260            kv_cache_ratio: (kv_caches as f32 / batch_size_f).clamp(0.0, 1.0),
261            is_prefill: if self.is_prefill { 1.0 } else { 0.0 },
262
263            // One-hot encodings
264            quant_type_onehot: quant_onehot,
265            kernel_type_onehot: kernel_onehot,
266
267            // Hardware features (5) [v1.1.0]
268            gpu_mem_bw_norm: (self.gpu_mem_bw_gbs.unwrap_or(1000.0) / 3000.0).clamp(0.0, 1.0),
269            gpu_compute_norm: (self.gpu_compute_tflops.unwrap_or(100.0) / 500.0).clamp(0.0, 1.0),
270            gpu_sm_norm: (self.gpu_sm_count.unwrap_or(128) as f32 / 200.0).clamp(0.0, 1.0),
271            gpu_l2_cache_norm: (self.gpu_l2_cache_mb.unwrap_or(48.0) / 128.0).clamp(0.0, 1.0), // v1.1.0
272            is_zero_copy: if self.is_zero_copy { 1.0 } else { 0.0 }, // v1.1.0
273
274            // Derived features
275            arithmetic_intensity,
276            theoretical_efficiency,
277
278            // Labels
279            measured_tps: self.measured_tps,
280            best_kernel_id: None,
281            bottleneck_class: None,
282        })
283    }
284}