use constensor_core::{BestDevice, CompiledGraph, DType, Graph, GraphTensor, R3};
use std::time::Instant;
fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usize>(
type_name: &str,
alpha: T,
beta: T,
) {
let iterations = 1;
let mut total = std::time::Duration::new(0, 0);
let mut graph = Graph::empty();
let a = GraphTensor::<R3<B, M, K>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(1.));
let b = GraphTensor::<R3<B, N, K>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(2.)).t();
let o = GraphTensor::<R3<B, M, N>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(3.));
let _c = a.matmul_axpby(b, o, alpha, beta);
graph.optimize();
let compiled: CompiledGraph<R3<B, M, N>, T, BestDevice<0>> = graph.compile().unwrap();
for _ in 0..iterations {
let start = Instant::now();
let tensor = std::hint::black_box(compiled.run().unwrap());
dbg!(tensor.data().unwrap());
total += start.elapsed();
}
let avg = total / (iterations as u32);
println!("Average execution time for {type_name} over {iterations} iterations: {avg:?}");
}
fn main() {
const B: usize = 1;
const M: usize = 2;
const N: usize = 2;
const K: usize = 2;
bench::<f32, B, M, K, N>("f32", 1.0, 1.0);
}