// trueno/tuner/features/mod.rs
1#![allow(missing_docs)]
2//! Feature Extraction for ML Tuning
3//!
4//! Implements TunerFeatures, TunerFeaturesBuilder, FeatureExtractor, and RunConfig.
5
6mod builder;
7mod extractor;
8
9pub use builder::TunerFeaturesBuilder;
10pub use extractor::{FeatureExtractor, RunConfig};
11
12use serde::{Deserialize, Serialize};
13
14use super::error::TunerError;
15use super::types::BottleneckClass;
16
17// ============================================================================
18// TunerFeatures
19// ============================================================================
20
21/// Feature vector for ML-based kernel tuning.
22///
23/// All fields normalized to [0, 1] for model input.
24/// Total dimension: 42 features.
25///
26/// # Feature Categories
27///
28/// - **Static (11)**: Known before execution (model size, batch size, etc.)
29/// - **Quant one-hot (8)**: Quantization type encoding
30/// - **Kernel one-hot (16)**: Kernel type encoding
31/// - **Hardware (5)**: GPU capabilities
32/// - **Derived (2)**: Computed features (arithmetic intensity, efficiency)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TunerFeatures {
    // === Static features (11) ===
    /// Model size in billions (log10 normalized)
    pub model_params_b: f32,
    /// Hidden dimension / 16384
    pub hidden_dim_norm: f32,
    /// Number of layers / 128
    pub num_layers_norm: f32,
    /// Number of attention heads / 128
    pub num_heads_norm: f32,
    /// Head dimension / 256
    pub head_dim_norm: f32,
    /// Vocabulary size (log10 normalized)
    pub vocab_size_log: f32,
    /// Batch size M / 64
    pub batch_size_norm: f32,
    /// Sequence length (log2 / 15)
    pub seq_len_log: f32,
    /// CUDA graphs enabled (0 or 1)
    pub cuda_graphs: f32,
    /// Number of KV caches / batch_size (for multi-cache detection)
    pub kv_cache_ratio: f32,
    /// Prefill vs decode (0=decode, 1=prefill)
    pub is_prefill: f32,

    // === Quantization one-hot (8) ===
    /// One-hot encoding of the quantization type. `validate` accepts a group
    /// that sums to ~1.0, or an all-zero group (treated as "unset").
    pub quant_type_onehot: [f32; 8],

    // === Kernel one-hot (16) ===
    /// One-hot encoding of the kernel type. Same contract as
    /// `quant_type_onehot`: sums to ~1.0, or all-zero when unset.
    pub kernel_type_onehot: [f32; 16],

    // === Hardware features (5) === [v1.1.0: added L2 cache + zero-copy]
    /// Memory bandwidth / 3000 GB/s
    pub gpu_mem_bw_norm: f32,
    /// Compute TFLOPS / 500
    pub gpu_compute_norm: f32,
    /// SM count / 200
    pub gpu_sm_norm: f32,
    /// L2 cache size / 128 MB (v1.1.0: critical for occupancy)
    pub gpu_l2_cache_norm: f32,
    /// Zero-copy memory path enabled (0 or 1) (v1.1.0: pinned memory)
    pub is_zero_copy: f32,

    // === Derived features (2) ===
    /// Arithmetic intensity (FLOP/byte), normalized
    pub arithmetic_intensity: f32,
    /// Theoretical efficiency (measured / roofline)
    pub theoretical_efficiency: f32,

    // === Labels (for training) ===
    // Labels are optional and omitted from serialized output when None,
    // so inference-time feature records carry no label fields.
    /// Measured throughput (tokens/second) - training label
    #[serde(skip_serializing_if = "Option::is_none")]
    pub measured_tps: Option<f32>,
    /// Best kernel ID - classification label
    #[serde(skip_serializing_if = "Option::is_none")]
    pub best_kernel_id: Option<u8>,
    /// Bottleneck class - classification label
    #[serde(skip_serializing_if = "Option::is_none")]
    pub bottleneck_class: Option<BottleneckClass>,
}
94
95impl Default for TunerFeatures {
96    fn default() -> Self {
97        Self {
98            model_params_b: 0.0,
99            hidden_dim_norm: 0.0,
100            num_layers_norm: 0.0,
101            num_heads_norm: 0.0,
102            head_dim_norm: 0.0,
103            vocab_size_log: 0.0,
104            batch_size_norm: 0.0,
105            seq_len_log: 0.0,
106            cuda_graphs: 0.0,
107            kv_cache_ratio: 1.0,
108            is_prefill: 0.0,
109            quant_type_onehot: [0.0; 8],
110            kernel_type_onehot: [0.0; 16],
111            gpu_mem_bw_norm: 0.0,
112            gpu_compute_norm: 0.0,
113            gpu_sm_norm: 0.0,
114            gpu_l2_cache_norm: 0.0,
115            is_zero_copy: 0.0,
116            arithmetic_intensity: 0.0,
117            theoretical_efficiency: 0.0,
118            measured_tps: None,
119            best_kernel_id: None,
120            bottleneck_class: None,
121        }
122    }
123}
124
125impl TunerFeatures {
126    /// Total feature dimension (excluding labels)
127    /// v1.1.0: 11 static + 8 quant + 16 kernel + 5 hardware + 2 derived = 42
128    pub const DIM: usize = 11 + 8 + 16 + 5 + 2; // 42 features (v1.1.0)
129
130    /// Create a new feature builder
131    pub fn builder() -> TunerFeaturesBuilder {
132        TunerFeaturesBuilder::default()
133    }
134
135    /// ALB-099: Convert to fixed-size stack array — zero heap allocation.
136    /// dhat profiling showed 140K Vec allocations from to_vector() in tests.
137    pub fn to_array(&self) -> [f32; Self::DIM] {
138        let mut a = [0.0f32; Self::DIM];
139        let mut i = 0;
140
141        // Static features (11)
142        a[i] = self.model_params_b;
143        i += 1;
144        a[i] = self.hidden_dim_norm;
145        i += 1;
146        a[i] = self.num_layers_norm;
147        i += 1;
148        a[i] = self.num_heads_norm;
149        i += 1;
150        a[i] = self.head_dim_norm;
151        i += 1;
152        a[i] = self.vocab_size_log;
153        i += 1;
154        a[i] = self.batch_size_norm;
155        i += 1;
156        a[i] = self.seq_len_log;
157        i += 1;
158        a[i] = self.cuda_graphs;
159        i += 1;
160        a[i] = self.kv_cache_ratio;
161        i += 1;
162        a[i] = self.is_prefill;
163        i += 1;
164
165        // One-hot encodings (8 + 16)
166        a[i..i + 8].copy_from_slice(&self.quant_type_onehot);
167        i += 8;
168        a[i..i + 16].copy_from_slice(&self.kernel_type_onehot);
169        i += 16;
170
171        // Hardware features (5)
172        a[i] = self.gpu_mem_bw_norm;
173        i += 1;
174        a[i] = self.gpu_compute_norm;
175        i += 1;
176        a[i] = self.gpu_sm_norm;
177        i += 1;
178        a[i] = self.gpu_l2_cache_norm;
179        i += 1;
180        a[i] = self.is_zero_copy;
181        i += 1;
182
183        // Derived features (2)
184        a[i] = self.arithmetic_intensity;
185        i += 1;
186        a[i] = self.theoretical_efficiency;
187
188        a
189    }
190
191    /// Convert to flat vector for model input
192    pub fn to_vector(&self) -> Vec<f32> {
193        self.to_array().to_vec()
194    }
195
196    /// Validate features (F021-F030 falsification criteria)
197    pub fn validate(&self) -> Result<(), TunerError> {
198        let v = self.to_array();
199
200        // F021: No NaN features
201        if v.iter().any(|x| x.is_nan()) {
202            return Err(TunerError::InvalidFeature("NaN value in features".into()));
203        }
204
205        // F022: No infinite features
206        if v.iter().any(|x| x.is_infinite()) {
207            return Err(TunerError::InvalidFeature("Infinite value in features".into()));
208        }
209
210        // F023: All features in [0, 1] (with small tolerance for floating point)
211        if v.iter().any(|x| *x < -0.001 || *x > 1.001) {
212            return Err(TunerError::InvalidFeature("Feature value outside [0, 1]".into()));
213        }
214
215        // F029: One-hot sums = 1
216        let quant_sum: f32 = self.quant_type_onehot.iter().sum();
217        if (quant_sum - 1.0).abs() > 0.001 && quant_sum > 0.001 {
218            return Err(TunerError::InvalidFeature("Quant one-hot does not sum to 1".into()));
219        }
220
221        let kernel_sum: f32 = self.kernel_type_onehot.iter().sum();
222        if (kernel_sum - 1.0).abs() > 0.001 && kernel_sum > 0.001 {
223            return Err(TunerError::InvalidFeature("Kernel one-hot does not sum to 1".into()));
224        }
225
226        Ok(())
227    }
228}