CUDA_SPMV_KERNEL_SOURCE

Constant CUDA_SPMV_KERNEL_SOURCE 

Source
pub const CUDA_SPMV_KERNEL_SOURCE: &str = r#"
// Scalar CSR sparse matrix-vector product: y = A * x.
//
// Each thread handles exactly one matrix row and walks that row's stored
// entries sequentially, accumulating the dot product in a register.
//
// Parameters:
//   rows    - number of matrix rows; threads whose index is >= rows do nothing.
//   indptr  - row offsets; entries [indptr[r], indptr[r + 1]) belong to row r,
//             so the array is read up to index `rows`.
//   indices - per-entry index used to gather from x.
//   data    - per-entry matrix value.
//   x       - input vector, gathered through `indices`.
//   y       - output vector; y[r] is overwritten with the row result
//             (0.0f for a row with no stored entries).
//
// Launch layout: 1-D grid and 1-D blocks. There is no striding over rows, so
// gridDim.x * blockDim.x must be at least `rows` for every row to be written.
// No shared memory is used.
extern "C" __global__ void spmv_csr_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    const int r = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the grid rarely divides the row count evenly.
    if (r < rows) {
        const int first = indptr[r];
        const int last = indptr[r + 1];

        // Plain sequential walk over this row's stored entries.
        float acc = 0.0f;
        int k = first;
        while (k < last) {
            acc += data[k] * x[indices[k]];
            ++k;
        }

        y[r] = acc;
    }
}

// CSR sparse matrix-vector product: y = A * x, one thread per row.
//
// Despite the name, this kernel processes each row with a single thread and a
// sequential loop; it does not cooperate across threads. The partial sum is
// kept in a register.
//
// An earlier revision accumulated into a dynamic shared-memory slot
// (sdata[threadIdx.x]) bracketed by __syncthreads(). That was removed because:
//   * no thread ever read another thread's slot, so shared memory and the
//     barriers bought nothing;
//   * the barriers sat after the early `return` for out-of-range rows, so in
//     any block where rows is not a multiple of blockDim.x some threads
//     exited before the barrier (undefined behaviour for __syncthreads());
//   * it required blockDim.x * sizeof(float) bytes of dynamic shared memory
//     at launch and indexed out of bounds when none was provided.
//
// Parameters:
//   rows    - number of matrix rows; threads whose index is >= rows do nothing.
//   indptr  - row offsets; entries [indptr[r], indptr[r + 1]) belong to row r,
//             so the array is read up to index `rows`.
//   indices - per-entry index used to gather from x.
//   data    - per-entry matrix value.
//   x       - input vector, gathered through `indices`.
//   y       - output vector; y[r] is overwritten with the row result
//             (0.0f for a row with no stored entries).
//
// Launch layout: 1-D grid and 1-D blocks with gridDim.x * blockDim.x >= rows.
// Dynamic shared memory is no longer needed; passing a non-zero size at
// launch is harmless, so existing launch code keeps working.
extern "C" __global__ void spmv_csr_vectorized_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    // Safe to exit early: nothing below is a block-wide barrier.
    if (row >= rows) return;

    const int start = indptr[row];
    const int end = indptr[row + 1];

    // Register accumulator; same operation order as the former shared-memory
    // version, so results are numerically unchanged.
    float sum = 0.0f;
    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}
"#;
Expand description

CUDA kernel source code for sparse matrix-vector multiplication (SpMV, y = A·x) on CSR-format matrices with single-precision values.