use numr::prelude::*;
/// Backend-generic pipeline: matrix product, then softmax, then a sum reduction.
///
/// The function is written once against the `Runtime` abstraction so that the
/// same code can be executed by any backend whose client implements
/// `MatmulOps`, `ActivationOps` and `ReduceOps`.
///
/// Steps, in order:
/// 1. `client.matmul(a, b)`
/// 2. `client.softmax(.., -1)` (the `-1` presumably selects the last axis;
///    confirm against the `ActivationOps` documentation)
/// 3. `client.sum(.., &[1], false)` (reduces over axis `1`; the trailing
///    `false` is presumably the keep-dimension flag; confirm against `ReduceOps`)
///
/// # Errors
/// Returns early with the error of whichever of the three operations fails first.
fn compute<R: Runtime>(a: &Tensor<R>, b: &Tensor<R>, client: &R::Client) -> Result<Tensor<R>>
where
    R::Client: MatmulOps<R> + ActivationOps<R> + ReduceOps<R>,
{
    // Step 1: a x b.
    let scores = client.matmul(a, b)?;

    // Step 2: normalise the product with softmax.
    let probabilities = client.softmax(&scores, -1)?;

    // Step 3: collapse axis 1 by summation.
    let reduced = client.sum(&probabilities, &[1], false)?;

    Ok(reduced)
}
/// Runs the `compute` pipeline on the CPU backend and, when the crate is built
/// with the `wgpu` feature, on the WebGPU backend as well, then checks that the
/// two backends agree.
///
/// # Errors
/// Propagates any error returned by `compute`.
///
/// # Panics
/// With the `wgpu` feature enabled, panics if the two backends return a
/// different number of elements, if any element-wise difference is NaN, or if
/// the largest absolute difference is not below `1e-4`.
fn main() -> Result<()> {
    // --- CPU backend -------------------------------------------------------
    let cpu_device = CpuDevice::new();
    let cpu_client = CpuRuntime::default_client(&cpu_device);

    // 2x3 and 3x2 inputs; the same values are reused for the WebGPU run below.
    let a_cpu =
        Tensor::<CpuRuntime>::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3], &cpu_device);
    let b_cpu =
        Tensor::<CpuRuntime>::from_slice(&[0.1f32, 0.2, 0.3, 0.4, 0.5, 0.6], &[3, 2], &cpu_device);

    let cpu_result = compute(&a_cpu, &b_cpu, &cpu_client)?;
    let cpu_vec: Vec<f32> = cpu_result.to_vec();
    println!("CPU result: {cpu_vec:?}");

    // --- WebGPU backend (optional) -----------------------------------------
    #[cfg(feature = "wgpu")]
    {
        let wgpu_device = WgpuDevice::new(0);
        let wgpu_client = WgpuRuntime::default_client(&wgpu_device);

        let a_wgpu = Tensor::<WgpuRuntime>::from_slice(
            &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0],
            &[2, 3],
            &wgpu_device,
        );
        let b_wgpu = Tensor::<WgpuRuntime>::from_slice(
            &[0.1f32, 0.2, 0.3, 0.4, 0.5, 0.6],
            &[3, 2],
            &wgpu_device,
        );

        let wgpu_result = compute(&a_wgpu, &b_wgpu, &wgpu_client)?;
        let wgpu_vec: Vec<f32> = wgpu_result.to_vec();
        println!("WGPU result: {wgpu_vec:?}");

        // `zip` stops at the shorter input, so a length mismatch would otherwise
        // be compared only partially (or not at all) and go unnoticed.
        assert_eq!(
            cpu_vec.len(),
            wgpu_vec.len(),
            "CPU and WebGPU results should have the same number of elements"
        );

        // `f32::max` discards NaN operands, which would let a NaN produced by
        // either backend pass the tolerance check. Propagate NaN explicitly so
        // the `<` comparison below fails in that case.
        let max_diff: f32 = cpu_vec
            .iter()
            .zip(wgpu_vec.iter())
            .map(|(a, b)| (a - b).abs())
            .fold(0.0f32, |acc, diff| {
                if acc.is_nan() || diff.is_nan() {
                    f32::NAN
                } else {
                    acc.max(diff)
                }
            });
        println!("Max CPU–WGPU difference: {max_diff:.2e}");

        assert!(
            max_diff < 1e-4,
            "CPU and WebGPU results should match within FP tolerance"
        );
    }

    #[cfg(not(feature = "wgpu"))]
    {
        println!("\n(WebGPU comparison skipped — enable with --features wgpu)");
    }

    println!("\nBackend switch example completed successfully!");
    Ok(())
}