#![allow(clippy::disallowed_methods)]
use trueno::tuner::{BrickTuner, KernelClassifier, QuantType, ThroughputRegressor, TunerFeatures};
/// Builds the [`TunerFeatures`] instance shared by every section of the demo.
///
/// The configured values are a 1.5B-parameter model quantized as
/// [`QuantType::Q4K`], batch size 4, sequence length 512, a GPU with
/// 1000 GB/s memory bandwidth and 128 SMs, and CUDA graphs enabled.
fn make_demo_features() -> TunerFeatures {
    // Model shape.
    let params_billions = 1.5;
    let hidden = 1536;
    let layers = 28;
    let heads = 12;

    // Workload.
    let batch = 4;
    let sequence = 512;

    // Hardware.
    let mem_bandwidth_gbs = 1000.0;
    let sm_count = 128;

    TunerFeatures::builder()
        .model_params_b(params_billions)
        .hidden_dim(hidden)
        .num_layers(layers)
        .num_heads(heads)
        .batch_size(batch)
        .seq_len(sequence)
        .quant_type(QuantType::Q4K)
        .gpu_mem_bw_gbs(mem_bandwidth_gbs)
        .gpu_sm_count(sm_count)
        .cuda_graphs(true)
        .build()
}
/// Section 1: prints the demo scenario, validates `features`, and shows a few
/// entries of its vector encoding.
///
/// A validation failure is reported on stdout but does not stop the demo.
///
/// The sampled entries are read from indices 0, 6 and 35 of
/// `features.to_vector()` using checked accessors, so a vector shorter than
/// 36 entries yields a diagnostic line instead of an out-of-bounds panic.
fn demo_feature_construction(features: &TunerFeatures) {
    println!("1. TunerFeatures (DIM=42 vector)");
    println!(" -----------------------------");
    println!(" Model: Qwen2.5-Coder-1.5B (Q4_K_M)");
    println!(" GPU: RTX 4090 (1000 GB/s, 128 SMs, 24GB VRAM)");
    println!(" Batch size: M=4");
    println!(" Sequence length: 512");
    println!(" CUDA graphs: enabled");
    println!();

    match features.validate() {
        Ok(()) => println!(" Feature validation: PASSED"),
        Err(e) => println!(" Feature validation: FAILED - {}", e),
    }

    let vec = features.to_vector();
    println!(" Feature vector length: {} (expected: 42)", vec.len());

    // Checked lookups: the length above is only *expected* to be 42, so do not
    // assume the sampled indices exist.
    match (vec.first(), vec.get(6), vec.get(35)) {
        (Some(model_params_b), Some(batch_size_norm), Some(gpu_mem_bw_norm)) => println!(
            " Sample features: model_params_b={:.3}, batch_size_norm={:.3}, gpu_mem_bw_norm={:.3}",
            model_params_b, batch_size_norm, gpu_mem_bw_norm
        ),
        _ => println!(" Sample features: unavailable (feature vector has fewer than 36 entries)"),
    }
    println!();
}
/// Section 2: prints the throughput prediction for `features`, up to three of
/// its top contributing features, and a comparison across batch sizes
/// 1, 2, 4 and 8.
fn demo_throughput_prediction(features: &TunerFeatures) {
    println!("2. Throughput Prediction");
    println!(" ----------------------");

    let model = ThroughputRegressor::new();

    // Prediction for the caller-supplied configuration.
    let baseline = model.predict(features);
    let baseline_confidence_pct = baseline.confidence * 100.0;
    println!(" Predicted throughput: {:.1} tok/s", baseline.predicted_tps);
    println!(" Confidence: {:.1}%", baseline_confidence_pct);

    println!(" Top contributing features:");
    let top_three = baseline.top_features.iter().take(3);
    for (feature_name, weight) in top_three {
        println!(" - {}: {:.1}%", feature_name, weight * 100.0);
    }
    println!();

    // Same model and GPU bandwidth for every row; only the batch size varies.
    println!(" Batch size comparison:");
    for m in [1, 2, 4, 8] {
        let estimate = model.predict(
            &TunerFeatures::builder()
                .model_params_b(1.5)
                .hidden_dim(1536)
                .batch_size(m)
                .quant_type(QuantType::Q4K)
                .gpu_mem_bw_gbs(1000.0)
                .cuda_graphs(true)
                .build(),
        );
        let confidence_pct = estimate.confidence * 100.0;
        println!(
            " M={}: {:.1} tok/s (conf: {:.0}%)",
            m, estimate.predicted_tps, confidence_pct
        );
    }
    println!();
}
/// Section 3: prints the kernel recommended for `features`, up to three
/// alternatives, and the recommended kernel for batch sizes 1, 2, 4 and 8.
fn demo_kernel_selection(features: &TunerFeatures) {
    println!("3. Kernel Selection");
    println!(" -----------------");

    let selector = KernelClassifier::new();

    // Recommendation for the caller-supplied configuration.
    let choice = selector.predict(features);
    let choice_confidence_pct = choice.confidence * 100.0;
    println!(" Recommended kernel: {:?}", choice.top_kernel);
    println!(" Confidence: {:.1}%", choice_confidence_pct);

    println!(" Alternatives:");
    let runners_up = choice.alternatives.iter().take(3);
    for (candidate, score) in runners_up {
        println!(" - {:?}: {:.1}%", candidate, score * 100.0);
    }
    println!();

    println!(" Kernel selection by batch size:");
    for m in [1, 2, 4, 8] {
        // CUDA graphs are switched on only for the single-request row.
        let graphs_enabled = m == 1;
        let per_batch = selector.predict(
            &TunerFeatures::builder()
                .model_params_b(1.5)
                .batch_size(m)
                .quant_type(QuantType::Q4K)
                .cuda_graphs(graphs_enabled)
                .build(),
        );
        println!(" M={}: {:?}", m, per_batch.top_kernel);
    }
    println!();
}
/// Section 4: prints the predicted throughput for a table of model sizes and
/// quantization types at batch size 4 and 1000 GB/s memory bandwidth.
///
/// The output labels each figure as "roofline-clamped".
fn demo_roofline_model() {
    println!("4. Roofline Model (Physical Limits)");
    println!(" ---------------------------------");

    // (display label, parameter count in billions, quantization)
    let scenarios = [
        ("0.5B Q4_K", 0.5, QuantType::Q4K),
        ("1.5B Q4_K", 1.5, QuantType::Q4K),
        ("7B Q4_K", 7.0, QuantType::Q4K),
        ("7B Q6_K", 7.0, QuantType::Q6K),
        ("32B Q4_K", 32.0, QuantType::Q4K),
    ];

    let model = ThroughputRegressor::new();
    println!(" Theoretical max throughput (RTX 4090, M=4):");
    for (label, billions, quantization) in scenarios {
        let scenario_features = TunerFeatures::builder()
            .model_params_b(billions)
            .batch_size(4)
            .quant_type(quantization)
            .gpu_mem_bw_gbs(1000.0)
            .build();
        let estimate = model.predict(&scenario_features);
        println!(" {}: {:.0} tok/s (roofline-clamped)", label, estimate.predicted_tps);
    }
    println!();
}
/// Section 5: runs [`BrickTuner::recommend`] on `features` and prints the
/// predicted throughput, the best kernel, and up to three suggested
/// experiments.
fn demo_full_tuner(features: &TunerFeatures) {
    println!("5. Full Tuner Recommendations");
    println!(" ---------------------------");

    let report = BrickTuner::new().recommend(features);

    println!(" Throughput: {:.1} tok/s", report.throughput.predicted_tps);
    println!(" Best kernel: {:?}", report.kernel.top_kernel);

    println!(" Experiment suggestions:");
    report
        .suggested_experiments
        .iter()
        .take(3)
        .for_each(|idea| println!(" - {}", idea));
    println!();
}
/// Section 6: trains and queries the RandomForest-backed models.
///
/// With the `ml-tuner` feature enabled this:
/// 1. builds a 100-tree throughput regressor, trains it on 100 synthetic
///    samples (batch sizes cycling through 1..=8) and prints its prediction
///    for `features`;
/// 2. builds a 50-tree kernel classifier, trains it on 100 synthetic labelled
///    samples and prints the confidence of its prediction for `features`.
///
/// Training failures are reported on stdout and do not abort the demo.
///
/// Without the feature, only a notice explaining how to enable it is printed.
fn demo_random_forest(features: &TunerFeatures) {
    #[cfg(feature = "ml-tuner")]
    {
        println!("6. RandomForest Models (ml-tuner feature)");
        println!(" --------------------------------------");

        // --- Throughput regressor ---------------------------------------
        let mut rf_regressor = ThroughputRegressor::with_random_forest(100);
        println!(" Created RandomForestRegressor with 100 trees");

        // Synthetic data: throughput = 200 + 80 * batch + 0.5 * sample index.
        let training_data: Vec<(TunerFeatures, f32)> = (0..100)
            .map(|i| {
                let batch = 1 + (i % 8) as u32;
                let f = TunerFeatures::builder()
                    .model_params_b(1.5)
                    .batch_size(batch)
                    .quant_type(QuantType::Q4K)
                    .gpu_mem_bw_gbs(1000.0)
                    .cuda_graphs(batch == 1)
                    .build();
                let throughput = 200.0 + (batch as f32) * 80.0 + (i as f32 * 0.5);
                (f, throughput)
            })
            .collect();
        println!(" Generated {} training samples", training_data.len());

        match rf_regressor.train_random_forest(&training_data) {
            Ok(()) => {
                println!(" Training: SUCCESS");
                let pred = rf_regressor.predict(features);
                println!(" RF prediction for M=4: {:.1} tok/s", pred.predicted_tps);
            }
            Err(e) => println!(" Training: FAILED - {}", e),
        }

        // --- Kernel classifier ------------------------------------------
        let mut rf_classifier = KernelClassifier::with_random_forest(50);
        println!(" Created RandomForestClassifier with 50 trees");

        let class_data: Vec<(TunerFeatures, u32)> = (0..100)
            .map(|i| {
                let batch = 1 + (i % 8) as u32;
                let f = TunerFeatures::builder()
                    .model_params_b(1.5)
                    .batch_size(batch)
                    .quant_type(QuantType::Q4K)
                    .build();
                // NOTE(review): 3 and 2 are presumably the class ids for the
                // batched and vectorized Q4K kernels respectively - confirm
                // against the classifier's label mapping.
                let label = if batch >= 4 { 3 } else { 2 };
                (f, label)
            })
            .collect();

        match rf_classifier.train(&class_data) {
            Ok(()) => {
                println!(" Classifier training: SUCCESS");
                // This is the confidence of a single prediction, not an
                // accuracy measured over a dataset, so label it accordingly.
                println!(
                    " Prediction confidence: {:.1}%",
                    rf_classifier.predict(features).confidence * 100.0
                );
            }
            Err(e) => println!(" Classifier training: FAILED - {}", e),
        }
        println!();
    }
    #[cfg(not(feature = "ml-tuner"))]
    {
        // `features` is only read in the `ml-tuner` build; mark it as used here.
        let _ = features;
        println!("6. RandomForest Models");
        println!(" --------------------");
        println!(" [Disabled - enable with: --features ml-tuner]");
        println!();
    }
}
/// Entry point: prints the banner, runs the six demo sections in order against
/// one shared feature set, then prints the takeaways and suggested next steps.
fn main() {
    println!("=== ML Tuner Demo ===\n");
    println!("ComputeBrick kernel selection and throughput prediction");
    println!("Reference: SHOWCASE-BRICK-001, Section 12\n");

    // One feature set is built up front and borrowed by each section.
    let demo_features = make_demo_features();
    demo_feature_construction(&demo_features);
    demo_throughput_prediction(&demo_features);
    demo_kernel_selection(&demo_features);
    demo_roofline_model();
    demo_full_tuner(&demo_features);
    demo_random_forest(&demo_features);

    println!("=== Demo Complete ===\n");

    let takeaways = [
        " - TunerFeatures: 42-dimension vector for ML models",
        " - Throughput prediction with roofline clamping (v1.1.0)",
        " - Kernel selection: BatchedQ4K for M>=4, VectorizedQ4K otherwise",
        " - RandomForest available with --features ml-tuner",
    ];
    println!("Key takeaways:");
    for line in takeaways {
        println!("{}", line);
    }
    println!();

    let next_steps = [
        " cargo run --example quickstart # Basic trueno usage",
        " cargo run --example performance_demo # SIMD benchmarks",
        " cargo run --features gpu --example gpu_batch_demo # GPU operations",
    ];
    println!("Next steps:");
    for line in next_steps {
        println!("{}", line);
    }
}