/// CUDA kernel source code for sparse matrix-vector multiplication (y = A*x)
/// with A in CSR format (`indptr`, `indices`, `data`), compiled at runtime
/// (e.g. via NVRTC). Both kernels map one thread per row; launch with
/// `blocks = ceil(rows / blockDim.x)`.
pub const CUDA_SPMV_KERNEL_SOURCE: &str = r#"
// One thread per row, plain scalar accumulation.
// Launch: 1-D grid, blocks = ceil(rows / blockDim.x). No shared memory needed.
extern "C" __global__ void spmv_csr_kernel(
int rows,
const int* __restrict__ indptr,
const int* __restrict__ indices,
const float* __restrict__ data,
const float* __restrict__ x,
float* __restrict__ y
) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= rows) return;
float sum = 0.0f;
int start = indptr[row];
int end = indptr[row + 1];
// Sequential walk over the row's nonzeros; x accesses are gathered
// through indices[] and are generally uncoalesced.
for (int j = start; j < end; j++) {
sum += data[j] * x[indices[j]];
}
y[row] = sum;
}
// One thread per row, accumulating into dynamic shared memory.
// Launch: 1-D grid, blocks = ceil(rows / blockDim.x), with
// blockDim.x * sizeof(float) bytes of dynamic shared memory.
// NOTE: each thread touches only its own sdata[tid] slot, so this
// produces the same result as spmv_csr_kernel.
extern "C" __global__ void spmv_csr_vectorized_kernel(
int rows,
const int* __restrict__ indptr,
const int* __restrict__ indices,
const float* __restrict__ data,
const float* __restrict__ x,
float* __restrict__ y
) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ float sdata[];
int tid = threadIdx.x;
sdata[tid] = 0.0f;
// All threads in the block must reach every __syncthreads(). Do NOT
// early-return out-of-range threads before a barrier: when rows is not
// a multiple of blockDim.x that makes the barrier divergent, which is
// undefined behavior. Guard the per-row work instead.
__syncthreads();
if (row < rows) {
int start = indptr[row];
int end = indptr[row + 1];
for (int j = start; j < end; j++) {
sdata[tid] += data[j] * x[indices[j]];
}
}
__syncthreads();
if (row < rows) {
y[row] = sdata[tid];
}
}
"#;