use realizar::bench_viz::{BenchMeasurement, BenchmarkGrid, BenchmarkRunner, ProfilingHotspot};
use std::time::Duration;
/// Entry point: builds a demo benchmark grid with canned measurements and
/// profiling hotspots, then renders it in one of three modes selected by
/// CLI flags (`--compact`, `--log`, or the default full view).
fn main() {
    // Scan argv once for the two recognized output-mode flags.
    let cli_args: Vec<String> = std::env::args().collect();
    let wants_log = cli_args.iter().any(|flag| flag == "--log");
    let wants_compact = cli_args.iter().any(|flag| flag == "--compact");

    // Grid describing the model / hardware under test.
    let mut bench_grid = BenchmarkGrid::new()
        .with_model("Qwen2.5-Coder-0.5B-Instruct", "0.5B params", "Q4_K_M")
        .with_gpu("NVIDIA RTX 4090", 24.0);

    // Row comparing runtimes on the same GGUF model file.
    bench_grid.set_gguf_row(
        BenchMeasurement::new("APR", "GGUF")
            .with_throughput(500.0)
            .with_ttft(7.0)
            .with_gpu(95.0, 2048.0),
        BenchMeasurement::new("Ollama", "GGUF")
            .with_throughput(318.0)
            .with_ttft(50.0)
            .with_gpu(92.0, 1800.0),
        BenchMeasurement::new("llama.cpp", "GGUF")
            .with_throughput(200.0)
            .with_ttft(30.0)
            .with_gpu(90.0, 1600.0),
    );

    // Row comparing the native .apr format against GGUF baselines.
    bench_grid.set_apr_row(
        BenchMeasurement::new("APR", ".apr")
            .with_throughput(600.0)
            .with_ttft(5.0)
            .with_gpu(96.0, 1900.0),
        BenchMeasurement::new("APR", "GGUF")
            .with_throughput(500.0)
            .with_ttft(7.0)
            .with_gpu(95.0, 2048.0),
        BenchMeasurement::new("Ollama", "GGUF")
            .with_throughput(318.0)
            .with_ttft(50.0)
            .with_gpu(92.0, 1800.0),
    );

    // Canned profiling hotspots. Call counts are written as products
    // (layers * tokens, etc.) to show where the totals come from.
    bench_grid.add_hotspot(ProfilingHotspot {
        component: "Q4K_GEMV".to_string(),
        time: Duration::from_millis(150),
        percentage: 42.5,
        call_count: 28 * 128,
        avg_per_call: Duration::from_micros(42),
        explanation: "Matrix ops dominate (42.5%) - expected for transformer inference".to_string(),
        is_expected: true,
    });
    bench_grid.add_hotspot(ProfilingHotspot {
        component: "Attention".to_string(),
        time: Duration::from_millis(80),
        percentage: 22.7,
        call_count: 28 * 128,
        avg_per_call: Duration::from_micros(22),
        explanation: "Attention at 22.7% - normal for autoregressive decoding".to_string(),
        is_expected: true,
    });
    bench_grid.add_hotspot(ProfilingHotspot {
        component: "RMSNorm".to_string(),
        time: Duration::from_millis(30),
        percentage: 8.5,
        call_count: 28 * 2 * 128,
        avg_per_call: Duration::from_micros(4),
        explanation: "Normalization within normal range".to_string(),
        is_expected: true,
    });
    bench_grid.add_hotspot(ProfilingHotspot {
        component: "KernelLaunch".to_string(),
        time: Duration::from_millis(25),
        percentage: 7.1,
        call_count: 28 * 10 * 128,
        avg_per_call: Duration::from_nanos(700),
        explanation: "Kernel launch overhead - consider CUDA graphs or megakernels".to_string(),
        is_expected: false,
    });

    // Output mode: --compact wins over --log; default prints both views.
    match (wants_compact, wants_log) {
        (true, _) => println!("{}", bench_grid.render_compact()),
        (false, true) => println!("{}", bench_grid.render_profiling_log()),
        (false, false) => {
            println!("{}", bench_grid.render_ascii());
            println!();
            println!("{}", bench_grid.render_profiling_log());
        }
    }
}
#[allow(dead_code)]
/// Alternative demo showing the `BenchmarkRunner` flow: record per-component
/// timings, finalize, attach model/GPU metadata, and print the profiling log.
fn example_with_runner() {
    let mut bench = BenchmarkRunner::new();
    bench.start();

    // Component timing table: (name, total millis, call count).
    let components = [
        ("Q4K_GEMV", 150, 3584),
        ("Attention", 80, 3584),
        ("RMSNorm", 30, 7168),
        ("Softmax", 15, 3584),
        ("KernelLaunch", 25, 35840),
        ("Embedding", 5, 128),
        ("Sampling", 10, 128),
    ];
    for (name, millis, calls) in components {
        bench.record_component(name, Duration::from_millis(millis), calls);
    }
    bench.finalize();

    // Attach model / hardware metadata after finalizing.
    bench.grid = bench
        .grid
        .with_model("Qwen2.5-Coder-0.5B", "0.5B", "Q4_K_M")
        .with_gpu("RTX 4090", 24.0);

    // Supply a GGUF measurement derived from token count + wall time.
    bench.grid.gguf_apr = Some(
        BenchMeasurement::new("APR", "GGUF")
            .with_tokens(128, Duration::from_millis(315))
            .with_ttft(7.0),
    );

    println!("{}", bench.grid.render_profiling_log());
}