// matmul/main.rs — benchmark example for a strided batched matmul.

use constensor_core::{BestDevice, CompiledGraph, DType, Graph, GraphTensor, R3};
use std::time::Instant;

4fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usize>(
5    type_name: &str,
6    alpha: T,
7    beta: T,
8) {
9    // Number of times to run the matmul for averaging
10    let iterations = 1;
11    let mut total = std::time::Duration::new(0, 0);
12
13    let mut graph = Graph::empty();
14    let a = GraphTensor::<R3<B, M, K>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(1.));
15    // Strided matmuls works on all devices.
16    let b = GraphTensor::<R3<B, N, K>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(2.)).t();
17    // let b = GraphTensor::<R3<B, K, N>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(2.));
18    let o = GraphTensor::<R3<B, M, N>, T, BestDevice<0>>::fill(&mut graph, T::from_f64(3.));
19    let _c = a.matmul_axpby(b, o, alpha, beta);
20
21    graph.optimize();
22    let compiled: CompiledGraph<R3<B, M, N>, T, BestDevice<0>> = graph.compile().unwrap();
23
24    for _ in 0..iterations {
25        let start = Instant::now();
26
27        let tensor = std::hint::black_box(compiled.run().unwrap());
28        dbg!(tensor.data().unwrap());
29
30        total += start.elapsed();
31    }
32
33    let avg = total / (iterations as u32);
34    println!("Average execution time for {type_name} over {iterations} iterations: {avg:?}");
35}
36
37fn main() {
38    const B: usize = 1;
39    const M: usize = 2;
40    const N: usize = 2;
41    const K: usize = 2;
42
43    bench::<f32, B, M, K, N>("f32", 1.0, 1.0);
44    // bench::<i32, B, M, K, N>("i32", 1, 1);
45}