#![allow(missing_docs)]
mod builder;
mod extractor;
pub use builder::TunerFeaturesBuilder;
pub use extractor::{FeatureExtractor, RunConfig};
use serde::{Deserialize, Serialize};
use super::error::TunerError;
use super::types::BottleneckClass;
/// Flattened feature representation of one tuning scenario (model shape,
/// run configuration, and GPU hardware), consumed by the tuner as a
/// fixed-size numeric vector (see `to_array`).
///
/// `validate()` rejects any encoded value outside [0, 1] (with a 0.001
/// tolerance), so `_norm`/`_log` fields are assumed to be pre-scaled by
/// the producer — presumably the `FeatureExtractor`; confirm there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TunerFeatures {
    // --- Model shape (scaled upstream) ---
    pub model_params_b: f32,
    pub hidden_dim_norm: f32,
    pub num_layers_norm: f32,
    pub num_heads_norm: f32,
    pub head_dim_norm: f32,
    pub vocab_size_log: f32,
    // --- Run configuration ---
    pub batch_size_norm: f32,
    pub seq_len_log: f32,
    // NOTE(review): looks like a boolean flag encoded as 0.0/1.0 — confirm.
    pub cuda_graphs: f32,
    // Defaults to 1.0 rather than 0.0 (see the `Default` impl).
    pub kv_cache_ratio: f32,
    // NOTE(review): looks like a boolean flag encoded as 0.0/1.0 — confirm.
    pub is_prefill: f32,
    // One-hot encodings; `validate()` accepts all-zero or a sum within
    // 0.001 of 1.0.
    pub quant_type_onehot: [f32; 8],
    pub kernel_type_onehot: [f32; 16],
    // --- GPU hardware ---
    pub gpu_mem_bw_norm: f32,
    pub gpu_compute_norm: f32,
    pub gpu_sm_norm: f32,
    pub gpu_l2_cache_norm: f32,
    pub is_zero_copy: f32,
    // --- Derived workload descriptors ---
    pub arithmetic_intensity: f32,
    pub theoretical_efficiency: f32,
    // Optional measurement labels: excluded from `to_array()` and omitted
    // from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub measured_tps: Option<f32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub best_kernel_id: Option<u8>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub bottleneck_class: Option<BottleneckClass>,
}
impl Default for TunerFeatures {
    /// Returns a feature vector with every field zeroed, except
    /// `kv_cache_ratio`, which starts at 1.0.
    fn default() -> Self {
        Self {
            // The single non-zero default in the whole struct.
            kv_cache_ratio: 1.0,
            // All remaining scalars start at zero until an extractor
            // populates them.
            model_params_b: 0.0,
            hidden_dim_norm: 0.0,
            num_layers_norm: 0.0,
            num_heads_norm: 0.0,
            head_dim_norm: 0.0,
            vocab_size_log: 0.0,
            batch_size_norm: 0.0,
            seq_len_log: 0.0,
            cuda_graphs: 0.0,
            is_prefill: 0.0,
            gpu_mem_bw_norm: 0.0,
            gpu_compute_norm: 0.0,
            gpu_sm_norm: 0.0,
            gpu_l2_cache_norm: 0.0,
            is_zero_copy: 0.0,
            arithmetic_intensity: 0.0,
            theoretical_efficiency: 0.0,
            // One-hot groups start fully unset (all zeros).
            quant_type_onehot: [0.0; 8],
            kernel_type_onehot: [0.0; 16],
            // Measurement labels are absent until a run is recorded.
            measured_tps: None,
            best_kernel_id: None,
            bottleneck_class: None,
        }
    }
}
impl TunerFeatures {
    /// Total length of the flattened vector produced by `to_array`:
    /// 11 model/run scalars + 8-wide quant one-hot + 16-wide kernel
    /// one-hot + 5 GPU scalars + 2 derived scalars.
    pub const DIM: usize = 11 + 8 + 16 + 5 + 2;

    /// Entry point for constructing features fluently.
    pub fn builder() -> TunerFeaturesBuilder {
        TunerFeaturesBuilder::default()
    }

    /// Flattens the features into a fixed-size array.
    ///
    /// Layout (in order): 11 model/run scalars, the quant one-hot, the
    /// kernel one-hot, 5 GPU scalars, then the 2 derived scalars. The
    /// optional label fields are intentionally not included.
    pub fn to_array(&self) -> [f32; Self::DIM] {
        let leading = [
            self.model_params_b,
            self.hidden_dim_norm,
            self.num_layers_norm,
            self.num_heads_norm,
            self.head_dim_norm,
            self.vocab_size_log,
            self.batch_size_norm,
            self.seq_len_log,
            self.cuda_graphs,
            self.kv_cache_ratio,
            self.is_prefill,
        ];
        let trailing = [
            self.gpu_mem_bw_norm,
            self.gpu_compute_norm,
            self.gpu_sm_norm,
            self.gpu_l2_cache_norm,
            self.is_zero_copy,
            self.arithmetic_intensity,
            self.theoretical_efficiency,
        ];
        let mut out = [0.0f32; Self::DIM];
        let mut cursor = 0;
        // Copy each segment in declaration order; the segments sum to
        // exactly DIM elements.
        for segment in [
            &leading[..],
            &self.quant_type_onehot[..],
            &self.kernel_type_onehot[..],
            &trailing[..],
        ] {
            out[cursor..cursor + segment.len()].copy_from_slice(segment);
            cursor += segment.len();
        }
        out
    }

    /// Same flattening as `to_array`, as an owned `Vec`.
    pub fn to_vector(&self) -> Vec<f32> {
        Vec::from(self.to_array())
    }

    /// Checks that the flattened vector is finite, roughly within [0, 1],
    /// and that each one-hot group is either fully unset or sums to ~1.
    ///
    /// # Errors
    /// Returns `TunerError::InvalidFeature` describing the first violated
    /// rule (NaN, then infinity, then range, then each one-hot group).
    pub fn validate(&self) -> Result<(), TunerError> {
        let values = self.to_array();
        if values.iter().copied().any(f32::is_nan) {
            return Err(TunerError::InvalidFeature("NaN value in features".into()));
        }
        if values.iter().copied().any(f32::is_infinite) {
            return Err(TunerError::InvalidFeature("Infinite value in features".into()));
        }
        // 0.001 tolerance on both ends of the unit interval.
        if values.iter().any(|&x| !(-0.001..=1.001).contains(&x)) {
            return Err(TunerError::InvalidFeature("Feature value outside [0, 1]".into()));
        }
        // A one-hot group is valid when it is all zeros (unset) or its
        // entries sum to 1 within tolerance.
        let check_onehot = |sum: f32, msg: &str| -> Result<(), TunerError> {
            if sum > 0.001 && (sum - 1.0).abs() > 0.001 {
                Err(TunerError::InvalidFeature(msg.into()))
            } else {
                Ok(())
            }
        };
        check_onehot(
            self.quant_type_onehot.iter().sum(),
            "Quant one-hot does not sum to 1",
        )?;
        check_onehot(
            self.kernel_type_onehot.iter().sum(),
            "Kernel one-hot does not sum to 1",
        )?;
        Ok(())
    }
}