use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::{Float, SparseElement};
use std::fmt::Debug;

#[cfg(feature = "gpu")]
use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};

#[cfg(feature = "gpu")]
pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};

#[cfg(feature = "gpu")]
pub use scirs2_core::GpuError;
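
/// CUDA C source for the row-per-thread CSR SpMV kernels.
///
/// The module contains `spmv_csr_kernel` (one thread accumulates one row of
/// `A * x`) and `spmv_csr_vectorized_kernel` (the same row-per-thread scheme,
/// but accumulating through a per-thread slot in dynamic shared memory).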
pub const CUDA_SPMV_KERNEL_SOURCE: &str = r#"
extern "C" __global__ void spmv_csr_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) return;

    float sum = 0.0f;
    int start = indptr[row];
    int end = indptr[row + 1];

    // One thread per row: accumulate the dot product of this row with x
    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}

extern "C" __global__ void spmv_csr_vectorized_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];

    // Accumulate into a per-thread slot in dynamic shared memory; the launch
    // must request blockDim.x * sizeof(float) bytes of shared memory.
    extern __shared__ float sdata[];
    int tid = threadIdx.x;

    sdata[tid] = 0.0f;
    __syncthreads();

    for (int j = start; j < end; j++) {
        sdata[tid] += data[j] * x[indices[j]];
    }

    __syncthreads();
    y[row] = sdata[tid];
}
"#;
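
/// CUDA C source for the warp-per-row CSR SpMV kernel.
///
/// `spmv_csr_warp_kernel` assigns one warp (32 lanes) to each row: the lanes
/// stride the row's nonzeros in steps of 32, the partial sums are combined
/// with a `__shfl_down_sync` tree reduction, and lane 0 writes the result.
/// Launching it therefore requires 32 threads per row, so 256-thread blocks
/// cover eight rows each.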
pub const CUDA_WARP_SPMV_KERNEL_SOURCE: &str = r#"
extern "C" __global__ void spmv_csr_warp_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lane_id = threadIdx.x % 32;
    int row = thread_id / 32;  // one warp (32 lanes) per row

    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];
    float sum = 0.0f;

    // Each lane handles every 32nd nonzero of the row
    for (int j = start + lane_id; j < end; j += 32) {
        sum += data[j] * x[indices[j]];
    }

    // Warp-level reduction of the 32 partial sums
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }

    if (lane_id == 0) {
        y[row] = sum;
    }
}
"#;
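
/// CUDA-accelerated sparse matrix-vector product (SpMV) for `CsrArray`
/// matrices.
///
/// When the crate is built with the `gpu` feature, construction tries to
/// acquire a CUDA context and compile the kernels; if either step fails the
/// handler still works through the CPU fallback paths.
///
/// A minimal usage sketch (marked `ignore` because it assumes a CUDA-capable
/// build and an already constructed `GpuDevice`, which this module does not
/// create):
///
/// ```ignore
/// let spmv = CudaSpMatVec::new()?;
/// // `matrix` is a CsrArray<f32>, `x` an Array1<f32>, `device` a GpuDevice.
/// let y = spmv.execute_spmv(&matrix, &x.view(), &device)?;
/// ```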
pub struct CudaSpMatVec {
    context: Option<scirs2_core::gpu::GpuContext>,
    kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
    vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    warp_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
}

impl CudaSpMatVec {
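    /// Creates a new handler.
    ///
    /// With the `gpu` feature enabled this attempts to create a CUDA context
    /// and compile the kernels; any failure is swallowed and the handler is
    /// returned without a GPU context so callers can still use the CPU paths.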
    pub fn new() -> SparseResult<Self> {
        #[cfg(feature = "gpu")]
        let context = match scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::Cuda) {
            Ok(ctx) => Some(ctx),
            Err(_) => None,
        };

        #[cfg(not(feature = "gpu"))]
        let context = None;

        let mut handler = Self {
            context,
            kernel_handle: None,
            vectorized_kernel: None,
            warp_kernel: None,
        };

        #[cfg(feature = "gpu")]
        if handler.context.is_some() {
            let _ = handler.compile_kernels();
        }

        Ok(handler)
    }
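
    /// Compiles the CUDA SpMV kernels on the current context.
    ///
    /// Returns `GpuError::BackendNotAvailable` when no CUDA context exists and
    /// `GpuError::KernelCompilationError` when the basic kernel fails to
    /// compile.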
    #[cfg(feature = "gpu")]
    pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
        if let Some(ref context) = self.context {
            // Both the basic and the vectorized kernel live in
            // CUDA_SPMV_KERNEL_SOURCE, so the same source module is compiled
            // for both handles.
            self.kernel_handle =
                context.execute(|compiler| compiler.compile(CUDA_SPMV_KERNEL_SOURCE).ok());

            self.vectorized_kernel =
                context.execute(|compiler| compiler.compile(CUDA_SPMV_KERNEL_SOURCE).ok());

            self.warp_kernel =
                context.execute(|compiler| compiler.compile(CUDA_WARP_SPMV_KERNEL_SOURCE).ok());

            if self.kernel_handle.is_some() {
                Ok(())
            } else {
                Err(scirs2_core::gpu::GpuError::KernelCompilationError(
                    "Failed to compile CUDA kernels".to_string(),
                ))
            }
        } else {
            Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
                "CUDA".to_string(),
            ))
        }
    }
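
    /// Executes CSR SpMV (`y = A * x`) on the GPU, falling back to the CPU
    /// implementation when no CUDA context is available.
    ///
    /// Returns `SparseError::DimensionMismatch` if the vector length does not
    /// match the number of matrix columns, and `SparseError::ComputationError`
    /// if the kernel has not been compiled or the launch or copy-back fails.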
    #[cfg(feature = "gpu")]
    pub fn execute_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + scirs2_core::gpu::GpuDataType,
    {
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        if let Some(ref context) = self.context {
            if let Some(ref kernel) = self.kernel_handle {
                // Upload the CSR arrays and the input vector to the device.
                let indptr_buffer = context.create_buffer_from_slice(
                    matrix.get_indptr().as_slice().expect("Operation failed"),
                );
                let indices_buffer = context.create_buffer_from_slice(
                    matrix.get_indices().as_slice().expect("Operation failed"),
                );
                let data_buffer = context.create_buffer_from_slice(
                    matrix.get_data().as_slice().expect("Operation failed"),
                );
                let vector_buffer =
                    context.create_buffer_from_slice(vector.as_slice().expect("Operation failed"));
                let result_buffer = context.create_buffer::<T>(rows);

                // Bind the buffers under the kernel's parameter names.
                kernel.set_buffer("indptr", &indptr_buffer);
                kernel.set_buffer("indices", &indices_buffer);
                kernel.set_buffer("data", &data_buffer);
                kernel.set_buffer("x", &vector_buffer);
                kernel.set_buffer("y", &result_buffer);
                kernel.set_u32("rows", rows as u32);

                // One thread per row, 256 threads per block.
                let grid_size = ((rows + 255) / 256, 1, 1);
                let block_size = (256, 1, 1);

                let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

                context
                    .launch_kernel("spmv_csr_kernel", grid_size, block_size, &args)
                    .map_err(|e| {
                        SparseError::ComputationError(format!(
                            "CUDA kernel execution failed: {:?}",
                            e
                        ))
                    })?;

                let mut result_vec = vec![T::sparse_zero(); rows];
                result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
                    SparseError::ComputationError(format!(
                        "Failed to copy result from GPU: {:?}",
                        e
                    ))
                })?;
                Ok(Array1::from_vec(result_vec))
            } else {
                Err(SparseError::ComputationError(
                    "CUDA kernel not compiled".to_string(),
                ))
            }
        } else {
            // No GPU context available: fall back to the CPU implementation.
            matrix.dot_vector(vector)
        }
    }
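
    /// Executes CSR SpMV with the kernel selected by `optimization_level`.
    ///
    /// Fails with `SparseError::ComputationError` when the requested kernel
    /// has not been compiled.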
    #[cfg(feature = "gpu")]
    pub fn execute_optimized_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        device: &super::GpuDevice,
        optimization_level: CudaOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + super::GpuDataType,
    {
        let (_, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        let kernel = match optimization_level {
            CudaOptimizationLevel::Basic => &self.kernel_handle,
            CudaOptimizationLevel::Vectorized => &self.vectorized_kernel,
            CudaOptimizationLevel::WarpLevel => &self.warp_kernel,
        };

        if let Some(ref k) = kernel {
            self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
        } else {
            Err(SparseError::ComputationError(
                "CUDA kernel not available for requested optimization level".to_string(),
            ))
        }
    }

    #[cfg(feature = "gpu")]
    fn execute_kernel_with_optimization<T>(
        &self,
        _matrix: &CsrArray<T>,
        _vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
        _kernel: &super::GpuKernelHandle,
        _optimization_level: CudaOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + super::GpuDataType,
    {
        // Placeholder: the optimized launch paths have not been implemented yet.
        Err(SparseError::ComputationError(
            "CUDA optimized execution not yet implemented".to_string(),
        ))
    }
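
    /// CPU-only SpMV used when the crate is built without the `gpu` feature;
    /// simply delegates to `dot_vector` on the matrix.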
    #[cfg(not(feature = "gpu"))]
    pub fn execute_spmv_cpu<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + std::iter::Sum,
    {
        matrix.dot_vector(vector)
    }
}

impl Default for CudaSpMatVec {
    fn default() -> Self {
        Self::new().unwrap_or_else(|_| Self {
            context: None,
            kernel_handle: None,
            vectorized_kernel: None,
            warp_kernel: None,
        })
    }
}
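
/// Selects which compiled CUDA SpMV kernel `execute_optimized_spmv` launches.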
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CudaOptimizationLevel {
    /// Launch the plain row-per-thread kernel (`spmv_csr_kernel`).
    #[default]
    Basic,
    /// Launch the shared-memory variant (`spmv_csr_vectorized_kernel`).
    Vectorized,
    /// Launch the warp-per-row kernel (`spmv_csr_warp_kernel`).
    WarpLevel,
}
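
/// Tracks device buffer allocations for sparse matrices.
///
/// Currently a placeholder: both allocation methods return
/// `GpuError::BackendNotImplemented` until the CUDA memory paths are wired up.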
pub struct CudaMemoryManager {
    #[allow(dead_code)]
    allocated_buffers: Vec<String>,
}

impl CudaMemoryManager {
    pub fn new() -> Self {
        Self {
            allocated_buffers: Vec::new(),
        }
    }

    /// Allocates device buffers for the CSR arrays of `_matrix`.
    ///
    /// Not yet implemented: always returns `GpuError::BackendNotImplemented`.
    #[cfg(feature = "gpu")]
    pub fn allocate_sparse_matrix<T>(
        &mut self,
        _matrix: &CsrArray<T>,
        _device: &super::GpuDevice,
    ) -> Result<CudaMatrixBuffers<T>, super::GpuError>
    where
        T: super::GpuDataType + Copy + Float + SparseElement + Debug,
    {
        Err(super::GpuError::BackendNotImplemented(
            super::GpuBackend::Cuda,
        ))
    }

    /// Allocates a device buffer laid out for the given access pattern.
    ///
    /// Not yet implemented: always returns `GpuError::BackendNotImplemented`.
    #[cfg(feature = "gpu")]
    pub fn allocate_optimized<T>(
        &mut self,
        _data: &[T],
        _device: &super::GpuDevice,
        _access_pattern: MemoryAccessPattern,
    ) -> Result<super::GpuBuffer<T>, super::GpuError>
    where
        T: super::GpuDataType + Copy + Float + SparseElement + Debug,
    {
        Err(super::GpuError::BackendNotImplemented(
            super::GpuBackend::Cuda,
        ))
    }
}

impl Default for CudaMemoryManager {
    fn default() -> Self {
        Self::new()
    }
}
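
/// Device-resident buffers holding the three CSR arrays of a sparse matrix.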
#[cfg(feature = "gpu")]
pub struct CudaMatrixBuffers<T: super::GpuDataType> {
    pub indptr: super::GpuBuffer<usize>,
    pub indices: super::GpuBuffer<usize>,
    pub data: super::GpuBuffer<T>,
}
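
/// Expected access pattern of a buffer, passed to
/// `CudaMemoryManager::allocate_optimized` as an allocation hint.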
#[derive(Debug, Clone, Copy)]
pub enum MemoryAccessPattern {
    /// Contiguous, streaming reads.
    Sequential,
    /// Scattered, unpredictable reads.
    Random,
    /// Reads at a fixed stride.
    Strided,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cuda_spmv_creation() {
        let cuda_spmv = CudaSpMatVec::new();
        assert!(cuda_spmv.is_ok());
    }
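
    #[test]
    fn test_cuda_spmv_default() {
        // `Default` falls back to a GPU-less handler when construction fails,
        // so this must not panic even on machines without CUDA.
        let _cuda_spmv = CudaSpMatVec::default();
    }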

    #[test]
    fn test_cuda_optimization_levels() {
        let basic = CudaOptimizationLevel::Basic;
        let vectorized = CudaOptimizationLevel::Vectorized;
        let warp = CudaOptimizationLevel::WarpLevel;

        assert_ne!(basic, vectorized);
        assert_ne!(vectorized, warp);
        assert_eq!(
            CudaOptimizationLevel::default(),
            CudaOptimizationLevel::Basic
        );
    }

    #[test]
    fn test_cuda_memory_manager() {
        let manager = CudaMemoryManager::new();
        assert_eq!(manager.allocated_buffers.len(), 0);
    }

    #[test]
    fn test_memory_access_patterns() {
        let patterns = [
            MemoryAccessPattern::Sequential,
            MemoryAccessPattern::Random,
            MemoryAccessPattern::Strided,
        ];

        for pattern in &patterns {
            match pattern {
                MemoryAccessPattern::Sequential => (),
                MemoryAccessPattern::Random => (),
                MemoryAccessPattern::Strided => (),
            }
        }
    }

    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_kernel_sources() {
        assert!(!CUDA_SPMV_KERNEL_SOURCE.is_empty());
        assert!(!CUDA_WARP_SPMV_KERNEL_SOURCE.is_empty());

        assert!(CUDA_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
        assert!(CUDA_SPMV_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
        assert!(CUDA_WARP_SPMV_KERNEL_SOURCE.contains("spmv_csr_warp_kernel"));
    }
}