pub fn matmul_webgpu(a: &Array, b: &Array) -> Array
Performs matrix multiplication of `a` and `b` on WebGPU and returns the result as a new `Array` (public API for dispatch).