#![allow(missing_docs)]
use crate::brick::{BrickCategory, BrickProfiler};
use crate::hardware::HardwareCapability;
use serde::{Deserialize, Serialize};
use crate::tuner::types::{BottleneckClass, KernelType, QuantType};
use super::TunerFeatures;
/// Turns profiler measurements and a [`RunConfig`] into [`TunerFeatures`].
///
/// Hardware information is optional: without it the extractor cannot compute
/// a theoretical efficiency and no hardware data is passed to the feature
/// builder.
#[derive(Debug)]
pub struct FeatureExtractor {
    /// Hardware description consulted during extraction; `None` when the
    /// extractor was created without one.
    pub(crate) hardware: Option<HardwareCapability>,
}
impl Default for FeatureExtractor {
fn default() -> Self {
Self::new()
}
}
impl FeatureExtractor {
pub fn new() -> Self {
Self { hardware: None }
}
pub fn with_hardware(hardware: HardwareCapability) -> Self {
Self { hardware: Some(hardware) }
}
pub fn extract(&self, profiler: &BrickProfiler, config: &RunConfig) -> TunerFeatures {
let mut builder = TunerFeatures::builder()
.model_params_b(config.model_params_b)
.hidden_dim(config.hidden_dim)
.num_layers(config.num_layers)
.num_heads(config.num_heads)
.batch_size(config.batch_size)
.seq_len(config.seq_len)
.cuda_graphs(config.cuda_graphs)
.quant_type(config.quant_type)
.kernel_type(config.kernel_type);
if let Some(hw) = &self.hardware {
builder = builder.hardware(hw);
}
if let Some(tps) = profiler.tokens_per_sec() {
builder = builder.measured_tps(tps);
}
let mut features = builder.build();
if let Some(efficiency) = self.calculate_efficiency(profiler, config) {
features.theoretical_efficiency = efficiency;
}
features.bottleneck_class = Some(self.classify_bottleneck(profiler));
features
}
#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
pub fn calculate_efficiency(
&self,
profiler: &BrickProfiler,
config: &RunConfig,
) -> Option<f32> {
let measured_tps = profiler.tokens_per_sec()?;
let hw = self.hardware.as_ref()?;
let gpu = hw.gpu.as_ref()?;
let bytes_per_token = config.model_params_b * 1e9 * config.quant_type.bytes_per_param();
let theoretical_tps = (gpu.memory_bw_gbps as f32) * 1e9 / bytes_per_token;
Some((measured_tps / theoretical_tps).clamp(0.0, 1.0))
}
#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
pub fn classify_bottleneck(&self, profiler: &BrickProfiler) -> BottleneckClass {
let cats = profiler.category_stats();
let total_ns = profiler.total_ns();
if total_ns == 0 {
return BottleneckClass::Unknown;
}
let attention_pct =
cats[BrickCategory::Attention as usize].percentage(total_ns) as f32 / 100.0;
let ffn_pct = cats[BrickCategory::Ffn as usize].percentage(total_ns) as f32 / 100.0;
let norm_pct = cats[BrickCategory::Norm as usize].percentage(total_ns) as f32 / 100.0;
if attention_pct > 0.35 {
BottleneckClass::AttentionBound
} else if ffn_pct > 0.50 {
BottleneckClass::MemoryBound
} else if norm_pct > 0.20 {
BottleneckClass::LaunchBound
} else {
BottleneckClass::MemoryBound }
}
}
/// Static description of a run, consumed by [`FeatureExtractor::extract`].
///
/// Every field is copied into the feature builder; `model_params_b` and
/// `quant_type` are additionally used to estimate bytes moved per token.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct RunConfig {
    /// Model parameter count in billions (scaled by `1e9` when estimating
    /// bytes per token).
    pub model_params_b: f32,
    /// Hidden dimension of the model.
    pub hidden_dim: u32,
    /// Number of layers in the model.
    pub num_layers: u32,
    /// Number of attention heads (presumably; confirm against the model definition).
    pub num_heads: u32,
    /// Batch size of the run.
    pub batch_size: u32,
    /// Sequence length of the run.
    pub seq_len: u32,
    /// CUDA graphs flag (presumably "enabled for this run"; confirm against callers).
    pub cuda_graphs: bool,
    /// Quantization type; supplies the bytes-per-parameter figure.
    pub quant_type: QuantType,
    /// Kernel variant recorded for the run.
    pub kernel_type: KernelType,
}
const DEFAULT_HIDDEN_DIM: u32 = 1536;
impl Default for RunConfig {
fn default() -> Self {
Self {
model_params_b: 1.5,
hidden_dim: DEFAULT_HIDDEN_DIM,
num_layers: 28,
num_heads: 12,
batch_size: 1,
seq_len: 1,
cuda_graphs: false,
quant_type: QuantType::Q4K,
kernel_type: KernelType::VectorizedQ4K,
}
}
}