#![allow(missing_docs)]
mod kernel;
mod throughput;
pub use kernel::KernelClassifier;
pub use throughput::ThroughputRegressor;
use serde::{Deserialize, Serialize};
use super::features::TunerFeatures;
use super::types::{BottleneckClass, KernelType};
/// Predicted throughput for a candidate configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputPrediction {
    /// Predicted throughput in tokens per second.
    pub predicted_tps: f32,
    /// Confidence in the prediction, expected in [0, 1].
    pub confidence: f32,
    /// Feature names with their contribution scores, most influential first.
    // NOTE(review): ordering/score semantics are set by the producer
    // (ThroughputRegressor) — confirm there before relying on them.
    pub top_features: Vec<(String, f32)>,
}
/// Kernel choice recommended by the classifier.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KernelRecommendation {
    /// The highest-ranked kernel for the workload.
    pub top_kernel: KernelType,
    /// Confidence in the top choice, expected in [0, 1].
    pub confidence: f32,
    /// Runner-up kernels paired with their scores.
    pub alternatives: Vec<(KernelType, f32)>,
}
/// Result of bottleneck classification for a workload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BottleneckPrediction {
    /// The predicted bottleneck class.
    pub class: BottleneckClass,
    /// Confidence in the classification, expected in [0, 1].
    pub confidence: f32,
    /// Human-readable rationale for the classification.
    pub explanation: String,
    /// Suggested mitigation, derived from `BottleneckClass::recommended_action`.
    pub recommended_action: String,
}
/// Heuristic classifier that predicts the dominant inference bottleneck.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BottleneckClassifier {
    // Estimated accuracy of this classifier; informational only — it is
    // not read by `predict` in this module.
    accuracy: f32,
}

// The derived `Default` produced `accuracy: 0.0`, silently disagreeing
// with `BottleneckClassifier::new()` (0.90). Implement it manually so
// `default()` and `new()` construct identical values.
impl Default for BottleneckClassifier {
    fn default() -> Self {
        Self { accuracy: 0.90 }
    }
}
impl BottleneckClassifier {
pub fn new() -> Self {
Self { accuracy: 0.90 }
}
pub fn predict(&self, features: &TunerFeatures) -> BottleneckPrediction {
if let Some(class) = features.bottleneck_class {
return BottleneckPrediction {
class,
confidence: 0.95,
explanation: format!("Bottleneck classified from profiler data: {}", class),
recommended_action: class.recommended_action().to_string(),
};
}
let batch_size = (features.batch_size_norm * 64.0).round() as u32;
let seq_len = (2.0_f32.powf(features.seq_len_log * 15.0)).round() as u32;
let (class, confidence, explanation) = if batch_size == 1 && features.cuda_graphs < 0.5 {
(
BottleneckClass::LaunchBound,
0.75,
"Single sequence without CUDA graphs: kernel launch overhead may dominate".into(),
)
} else if seq_len > 512 {
(
BottleneckClass::AttentionBound,
0.80,
format!("Long sequence (len={}) likely makes attention the bottleneck", seq_len),
)
} else {
(
BottleneckClass::MemoryBound,
0.85,
"Q4K GEMV is typically memory-bound for LLM inference".into(),
)
};
BottleneckPrediction {
class,
confidence,
explanation,
recommended_action: class.recommended_action().to_string(),
}
}
}