use clap::{Parser, Subcommand};
use xn::Result;
type Tensor = xn::Tensor<f32, xn::CpuDevice>;
trait Benchmark {
type PreProcessData;
type RunResult;
fn preprocess() -> Result<Self::PreProcessData>;
fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>;
const ITERS: usize;
}
struct MatMul;
impl Benchmark for MatMul {
type PreProcessData = (Tensor, Tensor);
type RunResult = Tensor;
fn preprocess() -> Result<Self::PreProcessData> {
let lhs = Tensor::zeros((125, 4096), &xn::CPU)?;
let rhs = Tensor::zeros((4096, 1024), &xn::CPU)?;
Ok((lhs, rhs))
}
fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
d.0.matmul(&d.1)
}
const ITERS: usize = 5;
}
struct MatVec;
impl Benchmark for MatVec {
type PreProcessData = (Tensor, Tensor);
type RunResult = Tensor;
fn preprocess() -> Result<Self::PreProcessData> {
let lhs = Tensor::zeros((1024 * 4, 1024 * 4), &xn::CPU)?;
let rhs = Tensor::zeros((1024 * 4, 1), &xn::CPU)?;
Ok((lhs, rhs))
}
fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
d.0.matmul(&d.1)
}
const ITERS: usize = 100;
}
fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
use std::hint::black_box;
let iters = iters.unwrap_or(B::ITERS);
let d = B::preprocess()?;
let start = std::time::Instant::now();
for _iter in 0..iters {
let _res = black_box(B::run_one(black_box(&d))?);
}
println!("{:?}", start.elapsed() / iters as u32);
Ok(())
}
#[derive(Subcommand, Debug, Clone)]
enum Task {
Matmul,
Matvec,
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Args {
#[command(subcommand)]
task: Task,
#[arg(long)]
iters: Option<usize>,
}
fn main() -> Result<()> {
let args = Args::parse();
match args.task {
Task::Matmul => {
for _ in 0..20 {
run::<MatMul>(args.iters)?
}
}
Task::Matvec => {
for _ in 0..20 {
run::<MatVec>(args.iters)?
}
}
}
Ok(())
}