use cuda_rust_wasm::runtime::{Grid, Block, thread, block, grid};
use cuda_rust_wasm::memory::{DeviceBuffer, SharedMemory};
use cuda_rust_wasm::kernel::launch_kernel;
/// Element-wise vector addition: writes `a[i] + b[i]` into `c[i]`.
///
/// Each invocation handles at most one element. The element index is
/// computed from the x components of the thread index, block index and
/// block dimension; invocations whose index falls outside the valid range
/// do nothing.
///
/// # Parameters
/// * `a`, `b` - input operands.
/// * `c` - output buffer receiving the sums.
/// * `n` - requested number of elements to process. Values `<= 0` make the
///   kernel a no-op, and the effective count is clamped to the shortest of
///   the three slices so no invocation ever indexes out of bounds.
///
/// The non-snake-case name is part of the public interface and is left
/// unchanged.
#[kernel]
pub fn vectorAdd(a: &[f32], b: &[f32], c: &mut [f32], n: i32) {
    // A negative `n` cast to an unsigned type wraps to a huge value, which
    // would let every invocation through the bounds guard below.
    if n <= 0 {
        return;
    }

    // `n` is known to be positive here, so the widening cast is lossless.
    // Clamp to the actual buffer lengths in case the caller's `n` overstates
    // the size of any slice.
    let limit = (n as usize).min(a.len()).min(b.len()).min(c.len());

    let idx = (thread::index().x + block::index().x * block::dim().x) as usize;
    if idx < limit {
        c[idx] = a[idx] + b[idx];
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use cuda_rust_wasm::runtime::CudaRuntime;

    /// Launches `vectorAdd` over 1024 elements and checks every output
    /// element against the sum computed on the host.
    #[test]
    fn test_vector_add() {
        // Bound for the whole test; it is never referenced directly below.
        let _runtime = CudaRuntime::new().unwrap();

        let n = 1024;

        // Host inputs: a[i] = i, b[i] = 2 * i; the output starts zeroed.
        let host_a: Vec<f32> = (0..n).map(|i| i as f32).collect();
        let host_b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
        let mut host_c = vec![0.0f32; n];

        // Device-side buffers for the two inputs and the output.
        let dev_a = DeviceBuffer::from_slice(&host_a).unwrap();
        let dev_b = DeviceBuffer::from_slice(&host_b).unwrap();
        let mut dev_c = DeviceBuffer::new(n).unwrap();

        // Ceiling division: enough blocks of `block_size` to cover `n`.
        let block_size = 256;
        let grid_size = (n + block_size - 1) / block_size;

        launch_kernel!(
            vectorAdd<<<grid_size, block_size>>>(
                dev_a.as_slice(),
                dev_b.as_slice(),
                dev_c.as_mut_slice(),
                n as i32
            )
        );

        dev_c.copy_to_host(&mut host_c).unwrap();

        // Every output element must equal the sum of its two inputs.
        for ((sum, x), y) in host_c.iter().zip(&host_a).zip(&host_b) {
            assert_eq!(*sum, x + y);
        }
    }
}