// trueno/tuner/features/builder.rs
#![allow(missing_docs)]
2use crate::hardware::HardwareCapability;
5
6use crate::tuner::types::{KernelType, QuantType};
7
8use super::TunerFeatures;
9
10#[derive(Default)]
16pub struct TunerFeaturesBuilder {
17 model_params_b: Option<f32>,
18 hidden_dim: Option<u32>,
19 num_layers: Option<u32>,
20 num_heads: Option<u32>,
21 head_dim: Option<u32>,
22 vocab_size: Option<u32>,
23 batch_size: Option<u32>,
24 seq_len: Option<u32>,
25 cuda_graphs: bool,
26 kv_caches: Option<u32>,
27 is_prefill: bool,
28 quant_type: Option<QuantType>,
29 kernel_type: Option<KernelType>,
30 gpu_mem_bw_gbs: Option<f32>,
31 gpu_compute_tflops: Option<f32>,
32 gpu_sm_count: Option<u32>,
33 gpu_l2_cache_mb: Option<f32>, is_zero_copy: bool, measured_tps: Option<f32>,
36}
37
38impl TunerFeaturesBuilder {
39 pub fn model_params_b(mut self, params: f32) -> Self {
41 self.model_params_b = Some(params);
42 self
43 }
44
45 pub fn hidden_dim(mut self, dim: u32) -> Self {
47 self.hidden_dim = Some(dim);
48 self
49 }
50
51 pub fn num_layers(mut self, layers: u32) -> Self {
53 self.num_layers = Some(layers);
54 self
55 }
56
57 pub fn num_heads(mut self, heads: u32) -> Self {
59 self.num_heads = Some(heads);
60 self
61 }
62
63 pub fn head_dim(mut self, dim: u32) -> Self {
65 self.head_dim = Some(dim);
66 self
67 }
68
69 pub fn vocab_size(mut self, size: u32) -> Self {
71 self.vocab_size = Some(size);
72 self
73 }
74
75 pub fn batch_size(mut self, m: u32) -> Self {
77 self.batch_size = Some(m);
78 self
79 }
80
81 pub fn seq_len(mut self, len: u32) -> Self {
83 self.seq_len = Some(len);
84 self
85 }
86
87 pub fn cuda_graphs(mut self, enabled: bool) -> Self {
89 self.cuda_graphs = enabled;
90 self
91 }
92
93 pub fn kv_caches(mut self, count: u32) -> Self {
95 self.kv_caches = Some(count);
96 self
97 }
98
99 pub fn is_prefill(mut self, prefill: bool) -> Self {
101 self.is_prefill = prefill;
102 self
103 }
104
105 pub fn quant_type(mut self, qt: QuantType) -> Self {
107 self.quant_type = Some(qt);
108 self
109 }
110
111 pub fn kernel_type(mut self, kt: KernelType) -> Self {
113 self.kernel_type = Some(kt);
114 self
115 }
116
117 pub fn gpu_mem_bw_gbs(mut self, bw: f32) -> Self {
119 self.gpu_mem_bw_gbs = Some(bw);
120 self
121 }
122
123 pub fn gpu_compute_tflops(mut self, tflops: f32) -> Self {
125 self.gpu_compute_tflops = Some(tflops);
126 self
127 }
128
129 pub fn gpu_sm_count(mut self, count: u32) -> Self {
131 self.gpu_sm_count = Some(count);
132 self
133 }
134
135 pub fn measured_tps(mut self, tps: f32) -> Self {
137 self.measured_tps = Some(tps);
138 self
139 }
140
141 pub fn gpu_l2_cache_mb(mut self, l2_mb: f32) -> Self {
143 self.gpu_l2_cache_mb = Some(l2_mb);
144 self
145 }
146
147 pub fn is_zero_copy(mut self, enabled: bool) -> Self {
149 self.is_zero_copy = enabled;
150 self
151 }
152
153 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
155 pub fn hardware(mut self, hw: &HardwareCapability) -> Self {
157 if let Some(gpu) = &hw.gpu {
158 self.gpu_mem_bw_gbs = Some(gpu.memory_bw_gbps as f32);
159 self.gpu_compute_tflops = Some(gpu.peak_tflops_fp32 as f32);
160 self.gpu_sm_count = None;
162 }
163 self
164 }
165
166 #[allow(clippy::cast_precision_loss)]
168 pub fn build(self) -> TunerFeatures {
174 self.try_build().expect("TunerFeatures: invalid raw input (see try_build for details)")
175 }
176
177 fn validate_non_negative(&self) -> Result<(), crate::tuner::TunerError> {
182 if let Some(p) = self.model_params_b {
183 if p < 0.0 {
184 return Err(crate::tuner::TunerError::InvalidFeature(format!(
185 "model_params_b must be non-negative, got {p}"
186 )));
187 }
188 }
189 if let Some(bw) = self.gpu_mem_bw_gbs {
190 if bw < 0.0 {
191 return Err(crate::tuner::TunerError::InvalidFeature(format!(
192 "gpu_mem_bw_gbs must be non-negative, got {bw}"
193 )));
194 }
195 }
196 if let Some(tf) = self.gpu_compute_tflops {
197 if tf < 0.0 {
198 return Err(crate::tuner::TunerError::InvalidFeature(format!(
199 "gpu_compute_tflops must be non-negative, got {tf}"
200 )));
201 }
202 }
203 Ok(())
204 }
205
206 pub fn try_build(self) -> Result<TunerFeatures, crate::tuner::TunerError> {
207 self.validate_non_negative()?;
208
209 let batch_size = self.batch_size.unwrap_or(1);
210 let kv_caches = self.kv_caches.unwrap_or(batch_size);
211
212 let mut quant_onehot = [0.0f32; 8];
214 if let Some(qt) = self.quant_type {
215 quant_onehot[qt.to_index()] = 1.0;
216 }
217
218 let mut kernel_onehot = [0.0f32; 16];
219 if let Some(kt) = self.kernel_type {
220 kernel_onehot[kt.to_index()] = 1.0;
221 }
222
223 let hidden_dim = self.hidden_dim.unwrap_or(0) as f32;
226 let batch_size_f = batch_size as f32;
227 let quant_bytes = self.quant_type.map(|q| q.bytes_per_param()).unwrap_or(0.5625);
228
229 let arithmetic_intensity = (2.0 / quant_bytes).min(10.0) / 10.0;
232
233 let theoretical_efficiency = 0.0;
235
236 Ok(TunerFeatures {
237 model_params_b: self
239 .model_params_b
240 .map(|p| (p.max(f32::EPSILON).log10() + 1.0) / 3.0) .unwrap_or(0.0)
242 .clamp(0.0, 1.0),
243 hidden_dim_norm: (hidden_dim / 16384.0).clamp(0.0, 1.0),
244 num_layers_norm: (self.num_layers.unwrap_or(0) as f32 / 128.0).clamp(0.0, 1.0),
246 num_heads_norm: (self.num_heads.unwrap_or(0) as f32 / 128.0).clamp(0.0, 1.0),
247 head_dim_norm: (self.head_dim.unwrap_or(0) as f32 / 256.0).clamp(0.0, 1.0),
248 vocab_size_log: self
249 .vocab_size
250 .map(|v| (v.max(1) as f32).log10() / 6.0) .unwrap_or(0.0)
252 .clamp(0.0, 1.0),
253 batch_size_norm: (batch_size_f / 64.0).clamp(0.0, 1.0),
254 seq_len_log: self
255 .seq_len
256 .map(|s| (s.max(1) as f32).log2() / 15.0) .unwrap_or(0.0)
258 .clamp(0.0, 1.0),
259 cuda_graphs: if self.cuda_graphs { 1.0 } else { 0.0 },
260 kv_cache_ratio: (kv_caches as f32 / batch_size_f).clamp(0.0, 1.0),
261 is_prefill: if self.is_prefill { 1.0 } else { 0.0 },
262
263 quant_type_onehot: quant_onehot,
265 kernel_type_onehot: kernel_onehot,
266
267 gpu_mem_bw_norm: (self.gpu_mem_bw_gbs.unwrap_or(1000.0) / 3000.0).clamp(0.0, 1.0),
269 gpu_compute_norm: (self.gpu_compute_tflops.unwrap_or(100.0) / 500.0).clamp(0.0, 1.0),
270 gpu_sm_norm: (self.gpu_sm_count.unwrap_or(128) as f32 / 200.0).clamp(0.0, 1.0),
271 gpu_l2_cache_norm: (self.gpu_l2_cache_mb.unwrap_or(48.0) / 128.0).clamp(0.0, 1.0), is_zero_copy: if self.is_zero_copy { 1.0 } else { 0.0 }, arithmetic_intensity,
276 theoretical_efficiency,
277
278 measured_tps: self.measured_tps,
280 best_kernel_id: None,
281 bottleneck_class: None,
282 })
283 }
284}