//! CUDA-backed sparse matrix-vector multiplication (SpMV) for CSR arrays.
//!
//! Contains the CUDA kernel sources (scalar, shared-memory, and warp-level variants),
//! the [`CudaSpMatVec`] executor with a CPU fallback, and memory-management stubs.

use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::{Float, SparseElement};
use std::fmt::Debug;
#[cfg(feature = "gpu")]
use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};
#[cfg(feature = "gpu")]
pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};
#[cfg(feature = "gpu")]
pub use scirs2_core::GpuError;
/// CUDA kernel source for CSR SpMV: a scalar one-thread-per-row kernel and a
/// shared-memory variant of the same row-per-thread scheme.
pub const CUDA_SPMV_KERNEL_SOURCE: &str = r#"
extern "C" __global__ void spmv_csr_kernel(
int rows,
const int* __restrict__ indptr,
const int* __restrict__ indices,
const float* __restrict__ data,
const float* __restrict__ x,
float* __restrict__ y
) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= rows) return;
float sum = 0.0f;
int start = indptr[row];
int end = indptr[row + 1];
    // Each thread walks its row's nonzeros sequentially and accumulates the dot product
for (int j = start; j < end; j++) {
sum += data[j] * x[indices[j]];
}
y[row] = sum;
}
extern "C" __global__ void spmv_csr_vectorized_kernel(
int rows,
const int* __restrict__ indptr,
const int* __restrict__ indices,
const float* __restrict__ data,
const float* __restrict__ x,
float* __restrict__ y
) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= rows) return;
float sum = 0.0f;
int start = indptr[row];
int end = indptr[row + 1];
    // Stage the per-thread sum in dynamically allocated shared memory;
    // the launch must provide at least blockDim.x * sizeof(float) bytes for sdata
extern __shared__ float sdata[];
int tid = threadIdx.x;
sdata[tid] = 0.0f;
__syncthreads();
for (int j = start; j < end; j++) {
sdata[tid] += data[j] * x[indices[j]];
}
__syncthreads();
y[row] = sdata[tid];
}
"#;
/// CUDA kernel source for warp-level CSR SpMV: each row is processed by one
/// 32-thread warp, followed by a shuffle-based warp reduction.
pub const CUDA_WARP_SPMV_KERNEL_SOURCE: &str = r#"
extern "C" __global__ void spmv_csr_warp_kernel(
int rows,
const int* __restrict__ indptr,
const int* __restrict__ indices,
const float* __restrict__ data,
const float* __restrict__ x,
float* __restrict__ y
) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
    int lane_id = thread_id % 32;  // lane within the warp (blockDim.x must be a multiple of 32)
    int row = thread_id / 32;      // one 32-thread warp per row
if (row >= rows) return;
int start = indptr[row];
int end = indptr[row + 1];
float sum = 0.0f;
// Warp-level parallelization
for (int j = start + lane_id; j < end; j += 32) {
sum += data[j] * x[indices[j]];
}
// Warp reduction
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2) {
sum += __shfl_down_sync(0xffffffff, sum, offset);
}
if (lane_id == 0) {
y[row] = sum;
}
}
"#;
/// CUDA-accelerated CSR sparse matrix-vector multiplication, with a CPU fallback
/// when no CUDA context or compiled kernel is available.
pub struct CudaSpMatVec {
context: Option<scirs2_core::gpu::GpuContext>,
kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
warp_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
}
impl CudaSpMatVec {
    /// Create a new handler, acquiring a CUDA context and compiling the kernels when
    /// possible; otherwise returns a context-less handler that falls back to the CPU.
    pub fn new() -> SparseResult<Self> {
#[cfg(feature = "gpu")]
        let context =
            scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::Cuda).ok();
#[cfg(not(feature = "gpu"))]
let context = None;
let mut handler = Self {
context,
kernel_handle: None,
vectorized_kernel: None,
warp_kernel: None,
};
#[cfg(feature = "gpu")]
if handler.context.is_some() {
let _ = handler.compile_kernels();
}
Ok(handler)
}
    /// Compile the SpMV kernel sources for the current CUDA context.
    #[cfg(feature = "gpu")]
    pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
if let Some(ref context) = self.context {
self.kernel_handle =
context.execute(|compiler| compiler.compile(CUDA_SPMV_KERNEL_SOURCE).ok());
            // Both the scalar and shared-memory kernels live in CUDA_SPMV_KERNEL_SOURCE,
            // so the same source is compiled for both handles.
            self.vectorized_kernel =
                context.execute(|compiler| compiler.compile(CUDA_SPMV_KERNEL_SOURCE).ok());
self.warp_kernel =
context.execute(|compiler| compiler.compile(CUDA_WARP_SPMV_KERNEL_SOURCE).ok());
if self.kernel_handle.is_some() {
Ok(())
} else {
Err(scirs2_core::gpu::GpuError::KernelCompilationError(
"Failed to compile CUDA kernels".to_string(),
))
}
} else {
Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
"CUDA".to_string(),
))
}
}
    /// Execute SpMV on the GPU using the compiled scalar kernel; falls back to
    /// `CsrArray::dot_vector` on the CPU when no CUDA context is available.
    #[cfg(feature = "gpu")]
    pub fn execute_spmv<T>(
&self,
matrix: &CsrArray<T>,
vector: &ArrayView1<T>,
_device: &super::GpuDevice,
) -> SparseResult<Array1<T>>
where
T: Float + SparseElement + Debug + Copy + scirs2_core::gpu::GpuDataType,
{
let (rows, cols) = matrix.shape();
if cols != vector.len() {
return Err(SparseError::DimensionMismatch {
expected: cols,
found: vector.len(),
});
}
if let Some(ref context) = self.context {
if let Some(ref kernel) = self.kernel_handle {
let indptr_buffer = context.create_buffer_from_slice(
matrix.get_indptr().as_slice().expect("Operation failed"),
);
let indices_buffer = context.create_buffer_from_slice(
matrix.get_indices().as_slice().expect("Operation failed"),
);
let data_buffer = context.create_buffer_from_slice(
matrix.get_data().as_slice().expect("Operation failed"),
);
let vector_buffer =
context.create_buffer_from_slice(vector.as_slice().expect("Operation failed"));
let result_buffer = context.create_buffer::<T>(rows);
kernel.set_buffer("indptr", &indptr_buffer);
kernel.set_buffer("indices", &indices_buffer);
kernel.set_buffer("data", &data_buffer);
kernel.set_buffer("vector", &vector_buffer);
kernel.set_buffer("result", &result_buffer);
kernel.set_u32("rows", rows as u32);
let grid_size = ((rows + 255) / 256, 1, 1);
let block_size = (256, 1, 1);
let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];
context
.launch_kernel("spmv_csr_kernel", grid_size, block_size, &args)
.map_err(|e| {
SparseError::ComputationError(format!(
"CUDA kernel execution failed: {:?}",
e
))
})?;
let mut result_vec = vec![T::sparse_zero(); rows];
result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
SparseError::ComputationError(format!(
"Failed to copy result from GPU: {:?}",
e
))
})?;
Ok(Array1::from_vec(result_vec))
} else {
Err(SparseError::ComputationError(
"CUDA kernel not compiled".to_string(),
))
}
} else {
matrix.dot_vector(vector)
}
}
    /// Execute SpMV with the kernel variant selected by `optimization_level`.
    #[cfg(feature = "gpu")]
    pub fn execute_optimized_spmv<T>(
&self,
matrix: &CsrArray<T>,
vector: &ArrayView1<T>,
device: &super::GpuDevice,
optimization_level: CudaOptimizationLevel,
) -> SparseResult<Array1<T>>
where
T: Float + SparseElement + Debug + Copy + super::GpuDataType,
{
let (rows, cols) = matrix.shape();
if cols != vector.len() {
return Err(SparseError::DimensionMismatch {
expected: cols,
found: vector.len(),
});
}
let kernel = match optimization_level {
CudaOptimizationLevel::Basic => &self.kernel_handle,
CudaOptimizationLevel::Vectorized => &self.vectorized_kernel,
CudaOptimizationLevel::WarpLevel => &self.warp_kernel,
};
if let Some(ref k) = kernel {
self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
} else {
Err(SparseError::ComputationError(
"CUDA kernel not available for requested optimization level".to_string(),
))
}
}
#[cfg(feature = "gpu")]
fn execute_kernel_with_optimization<T>(
&self,
_matrix: &CsrArray<T>,
_vector: &ArrayView1<T>,
_device: &super::GpuDevice,
_kernel: &super::GpuKernelHandle,
_optimization_level: CudaOptimizationLevel,
) -> SparseResult<Array1<T>>
where
T: Float + SparseElement + Debug + Copy + super::GpuDataType,
{
Err(SparseError::ComputationError(
"CUDA optimized execution not yet implemented".to_string(),
))
}
    /// CPU fallback used when the crate is built without the `gpu` feature.
    #[cfg(not(feature = "gpu"))]
    pub fn execute_spmv_cpu<T>(
&self,
matrix: &CsrArray<T>,
vector: &ArrayView1<T>,
) -> SparseResult<Array1<T>>
where
T: Float + SparseElement + Debug + Copy + std::iter::Sum,
{
matrix.dot_vector(vector)
}
}
impl Default for CudaSpMatVec {
fn default() -> Self {
Self::new().unwrap_or_else(|_| Self {
context: None,
kernel_handle: None,
vectorized_kernel: None,
warp_kernel: None,
})
}
}
/// Selects which CUDA SpMV kernel variant `execute_optimized_spmv` launches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CudaOptimizationLevel {
    /// Scalar kernel: one thread per row.
    #[default]
    Basic,
    /// Shared-memory variant of the one-thread-per-row kernel.
    Vectorized,
    /// Warp-level kernel: one 32-thread warp per row with a shuffle reduction.
    WarpLevel,
}
/// Tracks GPU buffer allocations for sparse matrices (allocation is not yet implemented).
pub struct CudaMemoryManager {
#[allow(dead_code)]
allocated_buffers: Vec<String>,
}
impl CudaMemoryManager {
pub fn new() -> Self {
Self {
allocated_buffers: Vec::new(),
}
}
    /// Allocate GPU buffers for a CSR matrix. Not yet implemented for the CUDA backend.
    #[cfg(feature = "gpu")]
    pub fn allocate_sparse_matrix<T>(
&mut self,
_matrix: &CsrArray<T>,
_device: &super::GpuDevice,
) -> Result<CudaMatrixBuffers<T>, super::GpuError>
where
T: super::GpuDataType + Copy + Float + SparseElement + Debug,
{
Err(super::GpuError::BackendNotImplemented(
super::GpuBackend::Cuda,
))
}
    /// Allocate a GPU buffer tuned for the given access pattern. Not yet implemented.
    #[cfg(feature = "gpu")]
    pub fn allocate_optimized<T>(
&mut self,
_data: &[T],
_device: &super::GpuDevice,
_access_pattern: MemoryAccessPattern,
) -> Result<super::GpuBuffer<T>, super::GpuError>
where
T: super::GpuDataType + Copy + Float + SparseElement + Debug,
{
Err(super::GpuError::BackendNotImplemented(
super::GpuBackend::Cuda,
))
}
}
impl Default for CudaMemoryManager {
fn default() -> Self {
Self::new()
}
}
/// GPU-resident CSR buffers: row pointers, column indices, and nonzero values.
#[cfg(feature = "gpu")]
pub struct CudaMatrixBuffers<T: super::GpuDataType> {
pub indptr: super::GpuBuffer<usize>,
pub indices: super::GpuBuffer<usize>,
pub data: super::GpuBuffer<T>,
}
/// Expected memory access pattern, used to guide buffer allocation.
#[derive(Debug, Clone, Copy)]
pub enum MemoryAccessPattern {
    /// Contiguous, in-order accesses.
    Sequential,
    /// Scattered, unpredictable accesses.
    Random,
    /// Fixed-stride accesses.
    Strided,
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array1;
#[test]
fn test_cuda_spmv_creation() {
let cuda_spmv = CudaSpMatVec::new();
assert!(cuda_spmv.is_ok());
}
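
    #[test]
    fn test_cuda_spmv_default() {
        // Default construction must not panic, even on machines without a CUDA device;
        // it simply yields a handler with no context and no compiled kernels.
        let _cuda_spmv = CudaSpMatVec::default();
    }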
#[test]
fn test_cuda_optimization_levels() {
let basic = CudaOptimizationLevel::Basic;
let vectorized = CudaOptimizationLevel::Vectorized;
let warp = CudaOptimizationLevel::WarpLevel;
assert_ne!(basic, vectorized);
assert_ne!(vectorized, warp);
assert_eq!(
CudaOptimizationLevel::default(),
CudaOptimizationLevel::Basic
);
}
#[test]
fn test_cuda_memory_manager() {
let manager = CudaMemoryManager::new();
assert_eq!(manager.allocated_buffers.len(), 0);
}
#[test]
fn test_memory_access_patterns() {
let patterns = [
MemoryAccessPattern::Sequential,
MemoryAccessPattern::Random,
MemoryAccessPattern::Strided,
];
for pattern in &patterns {
match pattern {
MemoryAccessPattern::Sequential => (),
MemoryAccessPattern::Random => (),
MemoryAccessPattern::Strided => (),
}
}
}
#[test]
#[allow(clippy::const_is_empty)]
fn test_kernel_sources() {
assert!(!CUDA_SPMV_KERNEL_SOURCE.is_empty());
assert!(!CUDA_WARP_SPMV_KERNEL_SOURCE.is_empty());
assert!(CUDA_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
assert!(CUDA_SPMV_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
assert!(CUDA_WARP_SPMV_KERNEL_SOURCE.contains("spmv_csr_warp_kernel"));
}
}