use candle_core::{Device, Tensor};
use cortex_rust::kernels::cpu::BitLinearCpu;
use cortex_rust::kernels::packing::PackedTensor;
use std::time::Instant;
/// Benchmark one (M, K, N) BitLinear matmul configuration and print a single
/// markdown table row: average latency, throughput, and packed-weight size.
///
/// * `name` - label printed in the first table column.
/// * `m` / `k` / `n` - GEMM dimensions: activations are (m, k), weights (n, k).
/// * `iterations` - number of timed forward passes to average over.
///
/// # Errors
/// Propagates any failure from tensor construction, weight packing, or the
/// forward pass.
fn bench_size(name: &str, m: usize, k: usize, n: usize, iterations: usize) -> anyhow::Result<()> {
    // The packed layout stores 4 weights per byte, so `n * k / 4` would
    // silently truncate for non-multiple-of-4 sizes; reject those up front.
    anyhow::ensure!(
        (n * k) % 4 == 0,
        "n*k ({}) must be a multiple of 4 for packing",
        n * k
    );
    let device = Device::Cpu;

    // Constant activations: only shapes matter for timing, not values.
    let x = Tensor::from_vec(vec![0.5f32; m * k], (m, k), &device)?;

    // Synthetic packed weights: 4 packed weights per byte -> n*k/4 bytes.
    let packed_len = n * k / 4;
    let w_data: Vec<u8> = (0..packed_len).map(|i| (i % 255) as u8).collect();
    let w_shape = candle_core::Shape::from((n, k));
    let packed_weights = PackedTensor::new(w_data, w_shape, 1.0, &device)?;

    // Warm-up passes so caches and allocators reach steady state before timing.
    for _ in 0..5 {
        let _ = BitLinearCpu::forward(&x, &packed_weights)?;
    }

    let start = Instant::now();
    for _ in 0..iterations {
        let out = BitLinearCpu::forward(&x, &packed_weights)?;
        // Materialize the output so any lazy evaluation can't skip the work.
        let _vec = out.flatten_all()?.to_vec1::<f32>()?;
    }
    let duration = start.elapsed();

    let avg_ms = duration.as_secs_f64() * 1000.0 / iterations as f64;
    // One multiply-accumulate per (m, n, k) triple; reported as G(MAC)ops/s.
    let macs = (m as f64) * (n as f64) * (k as f64);
    let gops = macs / (avg_ms / 1000.0) / 1e9;
    let weight_mb = packed_len as f64 / 1024.0 / 1024.0;
    println!(
        "| {:12} | {:>5} | {:>5} | {:>5} | {:>7.2} | {:>8.2} | {:>6.2} |",
        name, m, k, n, avg_ms, gops, weight_mb
    );
    Ok(())
}
/// Entry point: prints a markdown table of BitLinearCpu throughput across a
/// range of model-sized GEMM configurations.
fn main() -> anyhow::Result<()> {
    println!("=== BitLinearCpu Multi-Size Benchmark ===\n");
    println!("| Model | M | K | N | ms/op | GOps/s | W(MB) |");
    println!("|--------------|-------|-------|-------|---------|----------|--------|");

    // (label, m, k, n, timed iterations) — bigger shapes get fewer iterations.
    let cases: [(&str, usize, usize, usize, usize); 7] = [
        ("TinyLlama", 1, 2048, 2048, 100),
        ("TinyLlama-4", 4, 2048, 2048, 100),
        ("Llama-7B", 1, 4096, 4096, 100),
        ("Llama-7B-4", 4, 4096, 4096, 50),
        ("Llama-13B", 1, 5120, 5120, 50),
        ("Llama-70B", 1, 8192, 8192, 50),
        ("Llama-70B-4", 4, 8192, 8192, 20),
    ];
    for (label, m, k, n, iters) in cases {
        bench_size(label, m, k, n, iters)?;
    }

    println!("\n✅ Benchmark complete!");
    Ok(())
}