trueno/tuner/features/
extractor.rs1#![allow(missing_docs)]
2use crate::brick::{BrickCategory, BrickProfiler};
7use crate::hardware::HardwareCapability;
8use serde::{Deserialize, Serialize};
9
10use crate::tuner::types::{BottleneckClass, KernelType, QuantType};
11
12use super::TunerFeatures;
13
/// Turns profiler measurements and a [`RunConfig`] into [`TunerFeatures`].
///
/// The extractor optionally carries a hardware description. When present it
/// is forwarded to the feature builder and used for the efficiency estimate.
#[derive(Debug)]
pub struct FeatureExtractor {
    /// Hardware description; `None` when the extractor was built via `new()`.
    pub(crate) hardware: Option<HardwareCapability>,
}
24
25impl Default for FeatureExtractor {
26 fn default() -> Self {
27 Self::new()
28 }
29}
30
31impl FeatureExtractor {
32 pub fn new() -> Self {
34 Self { hardware: None }
35 }
36
37 pub fn with_hardware(hardware: HardwareCapability) -> Self {
39 Self { hardware: Some(hardware) }
40 }
41
42 pub fn extract(&self, profiler: &BrickProfiler, config: &RunConfig) -> TunerFeatures {
44 let mut builder = TunerFeatures::builder()
45 .model_params_b(config.model_params_b)
46 .hidden_dim(config.hidden_dim)
47 .num_layers(config.num_layers)
48 .num_heads(config.num_heads)
49 .batch_size(config.batch_size)
50 .seq_len(config.seq_len)
51 .cuda_graphs(config.cuda_graphs)
52 .quant_type(config.quant_type)
53 .kernel_type(config.kernel_type);
54
55 if let Some(hw) = &self.hardware {
57 builder = builder.hardware(hw);
58 }
59
60 if let Some(tps) = profiler.tokens_per_sec() {
62 builder = builder.measured_tps(tps);
63 }
64
65 let mut features = builder.build();
66
67 if let Some(efficiency) = self.calculate_efficiency(profiler, config) {
69 features.theoretical_efficiency = efficiency;
70 }
71
72 features.bottleneck_class = Some(self.classify_bottleneck(profiler));
74
75 features
76 }
77
78 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
80 pub fn calculate_efficiency(
82 &self,
83 profiler: &BrickProfiler,
84 config: &RunConfig,
85 ) -> Option<f32> {
86 let measured_tps = profiler.tokens_per_sec()?;
87 let hw = self.hardware.as_ref()?;
88 let gpu = hw.gpu.as_ref()?;
89
90 let bytes_per_token = config.model_params_b * 1e9 * config.quant_type.bytes_per_param();
92 let theoretical_tps = (gpu.memory_bw_gbps as f32) * 1e9 / bytes_per_token;
93
94 Some((measured_tps / theoretical_tps).clamp(0.0, 1.0))
95 }
96
97 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
101 pub fn classify_bottleneck(&self, profiler: &BrickProfiler) -> BottleneckClass {
103 let cats = profiler.category_stats();
104 let total_ns = profiler.total_ns();
105
106 if total_ns == 0 {
107 return BottleneckClass::Unknown;
108 }
109
110 let attention_pct =
112 cats[BrickCategory::Attention as usize].percentage(total_ns) as f32 / 100.0;
113 let ffn_pct = cats[BrickCategory::Ffn as usize].percentage(total_ns) as f32 / 100.0;
114 let norm_pct = cats[BrickCategory::Norm as usize].percentage(total_ns) as f32 / 100.0;
115
116 if attention_pct > 0.35 {
118 BottleneckClass::AttentionBound
119 } else if ffn_pct > 0.50 {
120 BottleneckClass::MemoryBound
122 } else if norm_pct > 0.20 {
123 BottleneckClass::LaunchBound
125 } else {
126 BottleneckClass::MemoryBound }
128 }
129}
130
/// Static description of a run, supplied alongside profiler data to
/// `FeatureExtractor::extract`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunConfig {
    /// Model size. The efficiency estimate multiplies this by `1e9`, i.e. it
    /// is treated as billions of parameters.
    pub model_params_b: f32,
    /// Hidden dimension, forwarded to the feature builder.
    pub hidden_dim: u32,
    /// Layer count, forwarded to the feature builder.
    pub num_layers: u32,
    /// Head count, forwarded to the feature builder.
    pub num_heads: u32,
    /// Batch size, forwarded to the feature builder.
    pub batch_size: u32,
    /// Sequence length, forwarded to the feature builder.
    pub seq_len: u32,
    /// CUDA-graphs flag, forwarded to the feature builder.
    // NOTE(review): presumably "CUDA graphs enabled for this run" — confirm.
    pub cuda_graphs: bool,
    /// Quantization type; its `bytes_per_param()` feeds the efficiency estimate.
    pub quant_type: QuantType,
    /// Kernel type, forwarded to the feature builder.
    pub kernel_type: KernelType,
}
148
/// Hidden dimension used by `RunConfig::default()`.
const DEFAULT_HIDDEN_DIM: u32 = 1536;
151
152impl Default for RunConfig {
153 fn default() -> Self {
154 Self {
155 model_params_b: 1.5,
156 hidden_dim: DEFAULT_HIDDEN_DIM,
157 num_layers: 28,
158 num_heads: 12,
159 batch_size: 1,
160 seq_len: 1,
161 cuda_graphs: false,
162 quant_type: QuantType::Q4K,
163 kernel_type: KernelType::VectorizedQ4K,
164 }
165 }
166}