use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::Float;
use std::fmt::Debug;

#[cfg(feature = "gpu")]
use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};

#[cfg(feature = "gpu")]
pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};

#[cfg(feature = "gpu")]
pub use scirs2_core::GpuError;

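/// CUDA source for the scalar (one-thread-per-row) and shared-memory CSR SpMV kernels.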
pub const CUDA_SPMV_KERNEL_SOURCE: &str = r#"
// Basic CSR SpMV: one thread computes the dot product of one matrix row with x.
extern "C" __global__ void spmv_csr_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) return;

    float sum = 0.0f;
    int start = indptr[row];
    int end = indptr[row + 1];

    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}

// Variant that accumulates per-thread partial sums in dynamic shared memory.
extern "C" __global__ void spmv_csr_vectorized_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    // Every thread in the block (including out-of-range ones) must reach the
    // barriers below, so bounds checks guard the row work instead of returning early.
    extern __shared__ float sdata[];
    sdata[tid] = 0.0f;
    __syncthreads();

    if (row < rows) {
        int start = indptr[row];
        int end = indptr[row + 1];
        for (int j = start; j < end; j++) {
            sdata[tid] += data[j] * x[indices[j]];
        }
    }

    __syncthreads();

    if (row < rows) {
        y[row] = sdata[tid];
    }
}
"#;

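/// CUDA source for the warp-per-row CSR SpMV kernel with a shuffle-based reduction.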
pub const CUDA_WARP_SPMV_KERNEL_SOURCE: &str = r#"
extern "C" __global__ void spmv_csr_warp_kernel(
    int rows,
    const int* __restrict__ indptr,
    const int* __restrict__ indices,
    const float* __restrict__ data,
    const float* __restrict__ x,
    float* __restrict__ y
) {
    int warp_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lane_id = threadIdx.x % 32;
    int row = warp_id / 32;

    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];
    float sum = 0.0f;

    // Warp-level parallelization
    for (int j = start + lane_id; j < end; j += 32) {
        sum += data[j] * x[indices[j]];
    }

    // Warp reduction
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }

    if (lane_id == 0) {
        y[row] = sum;
    }
}
"#;

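/// CUDA-backed sparse matrix-vector multiplication (SpMV) for `CsrArray`.
///
/// If no CUDA context can be created (or the `gpu` feature is disabled), the
/// handler is constructed without a context and SpMV falls back to the CPU path.
///
/// Minimal usage sketch (assumes the `gpu` feature, a CUDA device, and a
/// `GpuDevice` handle named `device`; `matrix` and `x` are hypothetical
/// placeholders):
///
/// ```ignore
/// let spmv = CudaSpMatVec::new()?;
/// let y = spmv.execute_spmv(&matrix, &x.view(), &device)?;
/// ```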
pub struct CudaSpMatVec {
    context: Option<scirs2_core::gpu::GpuContext>,
    kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
    vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    warp_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
}

impl CudaSpMatVec {
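    /// Creates a handler, attempting to acquire a CUDA context and eagerly compile
    /// the SpMV kernels; compilation failures are ignored and leave the kernel
    /// handles unset so callers can fall back to the CPU path.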
    pub fn new() -> SparseResult<Self> {
        #[cfg(feature = "gpu")]
        let context = match scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::Cuda) {
            Ok(ctx) => Some(ctx),
            Err(_) => None,
        };
        #[cfg(not(feature = "gpu"))]
        let context = None;

        let mut handler = Self {
            context,
            kernel_handle: None,
            vectorized_kernel: None,
            warp_kernel: None,
        };

        #[cfg(feature = "gpu")]
        if handler.context.is_some() {
            let _ = handler.compile_kernels();
        }

        Ok(handler)
    }

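    /// Compiles the CSR SpMV kernel sources against the current CUDA context.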
    #[cfg(feature = "gpu")]
    pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
        if let Some(ref context) = self.context {
            // The basic and shared-memory ("vectorized") kernels live in the same
            // source string, so both handles are compiled from CUDA_SPMV_KERNEL_SOURCE.
            self.kernel_handle =
                context.execute(|compiler| compiler.compile(CUDA_SPMV_KERNEL_SOURCE).ok());

            self.vectorized_kernel =
                context.execute(|compiler| compiler.compile(CUDA_SPMV_KERNEL_SOURCE).ok());

            self.warp_kernel =
                context.execute(|compiler| compiler.compile(CUDA_WARP_SPMV_KERNEL_SOURCE).ok());

            if self.kernel_handle.is_some() {
                Ok(())
            } else {
                Err(scirs2_core::gpu::GpuError::KernelCompilationError(
                    "Failed to compile CUDA kernels".to_string(),
                ))
            }
        } else {
            Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
                "CUDA".to_string(),
            ))
        }
    }

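    /// Runs CSR SpMV (`y = A * x`) on the GPU using the basic one-thread-per-row
    /// kernel; falls back to `CsrArray::dot_vector` when no CUDA context is available.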
    #[cfg(feature = "gpu")]
    pub fn execute_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + scirs2_core::gpu::GpuDataType,
    {
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        if let Some(ref context) = self.context {
            if let Some(ref kernel) = self.kernel_handle {
                // Upload the CSR arrays and the dense input vector to the device.
                let indptr_buffer =
                    context.create_buffer_from_slice(matrix.get_indptr().as_slice().unwrap());
                let indices_buffer =
                    context.create_buffer_from_slice(matrix.get_indices().as_slice().unwrap());
                let data_buffer =
                    context.create_buffer_from_slice(matrix.get_data().as_slice().unwrap());
                let vector_buffer = context.create_buffer_from_slice(vector.as_slice().unwrap());
                let result_buffer = context.create_buffer::<T>(rows);

                // Bind kernel arguments.
                kernel.set_buffer("indptr", &indptr_buffer);
                kernel.set_buffer("indices", &indices_buffer);
                kernel.set_buffer("data", &data_buffer);
                kernel.set_buffer("vector", &vector_buffer);
                kernel.set_buffer("result", &result_buffer);
                kernel.set_u32("rows", rows as u32);

                // One thread per row, 256 threads per block.
                let grid_size = ((rows + 255) / 256, 1, 1);
                let block_size = (256, 1, 1);

                let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

                context
                    .launch_kernel("spmv_csr_kernel", grid_size, block_size, &args)
                    .map_err(|e| {
                        SparseError::ComputationError(format!(
                            "CUDA kernel execution failed: {:?}",
                            e
                        ))
                    })?;

                // Copy the result vector back to host memory.
                let mut result_vec = vec![T::zero(); rows];
                result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
                    SparseError::ComputationError(format!(
                        "Failed to copy result from GPU: {:?}",
                        e
                    ))
                })?;
                Ok(Array1::from_vec(result_vec))
            } else {
                Err(SparseError::ComputationError(
                    "CUDA kernel not compiled".to_string(),
                ))
            }
        } else {
            // No CUDA context: fall back to the CPU implementation.
            matrix.dot_vector(vector)
        }
    }

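    /// Runs CSR SpMV using the kernel selected by `optimization_level`.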
    #[cfg(feature = "gpu")]
    pub fn execute_optimized_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        device: &super::GpuDevice,
        optimization_level: CudaOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + super::GpuDataType,
    {
        let (_rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        // Pick the compiled kernel that matches the requested optimization level.
        let kernel = match optimization_level {
            CudaOptimizationLevel::Basic => &self.kernel_handle,
            CudaOptimizationLevel::Vectorized => &self.vectorized_kernel,
            CudaOptimizationLevel::WarpLevel => &self.warp_kernel,
        };

        if let Some(ref k) = kernel {
            self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
        } else {
            Err(SparseError::ComputationError(
                "CUDA kernel not available for requested optimization level".to_string(),
            ))
        }
    }

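    /// Launches the selected kernel for `execute_optimized_spmv`; currently a stub
    /// that reports the optimized path as unimplemented.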
    #[cfg(feature = "gpu")]
    fn execute_kernel_with_optimization<T>(
        &self,
        _matrix: &CsrArray<T>,
        _vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
        _kernel: &super::GpuKernelHandle,
        _optimization_level: CudaOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + super::GpuDataType,
    {
        Err(SparseError::ComputationError(
            "CUDA optimized execution not yet implemented".to_string(),
        ))
    }

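    /// CPU fallback used when the crate is built without the `gpu` feature.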
    #[cfg(not(feature = "gpu"))]
    pub fn execute_spmv_cpu<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + std::iter::Sum,
    {
        matrix.dot_vector(vector)
    }
}

impl Default for CudaSpMatVec {
    fn default() -> Self {
        Self::new().unwrap_or_else(|_| Self {
            context: None,
            kernel_handle: None,
            vectorized_kernel: None,
            warp_kernel: None,
        })
    }
}

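/// Kernel variant selected by `execute_optimized_spmv`.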
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaOptimizationLevel {
    /// One thread per row (`spmv_csr_kernel`).
    Basic,
    /// Shared-memory accumulation (`spmv_csr_vectorized_kernel`).
    Vectorized,
    /// One warp per row with shuffle reduction (`spmv_csr_warp_kernel`).
    WarpLevel,
}

impl Default for CudaOptimizationLevel {
    fn default() -> Self {
        Self::Basic
    }
}

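/// Tracks device-side buffer allocations for sparse matrices; allocation itself is
/// not yet implemented for the CUDA backend.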
pub struct CudaMemoryManager {
    #[allow(dead_code)]
    allocated_buffers: Vec<String>,
}

impl CudaMemoryManager {
    pub fn new() -> Self {
        Self {
            allocated_buffers: Vec::new(),
        }
    }

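    /// Uploads a CSR matrix into device buffers; not yet implemented for the CUDA backend.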
    #[cfg(feature = "gpu")]
    pub fn allocate_sparse_matrix<T>(
        &mut self,
        _matrix: &CsrArray<T>,
        _device: &super::GpuDevice,
    ) -> Result<CudaMatrixBuffers<T>, super::GpuError>
    where
        T: super::GpuDataType + Copy + Float + Debug,
    {
        Err(super::GpuError::BackendNotImplemented(
            super::GpuBackend::Cuda,
        ))
    }

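    /// Allocates a device buffer laid out for the given access pattern; not yet
    /// implemented for the CUDA backend.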
    #[cfg(feature = "gpu")]
    pub fn allocate_optimized<T>(
        &mut self,
        _data: &[T],
        _device: &super::GpuDevice,
        _access_pattern: MemoryAccessPattern,
    ) -> Result<super::GpuBuffer<T>, super::GpuError>
    where
        T: super::GpuDataType + Copy + Float + Debug,
    {
        Err(super::GpuError::BackendNotImplemented(
            super::GpuBackend::Cuda,
        ))
    }
}

impl Default for CudaMemoryManager {
    fn default() -> Self {
        Self::new()
    }
}

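/// Device-side buffers holding a CSR matrix: row pointers, column indices, and nonzero values.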
#[cfg(feature = "gpu")]
pub struct CudaMatrixBuffers<T: super::GpuDataType> {
    pub indptr: super::GpuBuffer<usize>,
    pub indices: super::GpuBuffer<usize>,
    pub data: super::GpuBuffer<T>,
}

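/// Hint describing how a buffer will be accessed; passed to
/// `CudaMemoryManager::allocate_optimized`.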
#[derive(Debug, Clone, Copy)]
pub enum MemoryAccessPattern {
    /// Contiguous, predictable reads (e.g. streaming through CSR `data`).
    Sequential,
    /// Scattered reads (e.g. gathering `x[indices[j]]` during SpMV).
    Random,
    /// Fixed-stride reads.
    Strided,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cuda_spmv_creation() {
        let cuda_spmv = CudaSpMatVec::new();
        assert!(cuda_spmv.is_ok());
    }
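
    #[test]
    fn test_cuda_spmv_default() {
        // Default construction must not panic even without a GPU: it falls back
        // to a handler with no context and no compiled kernels.
        let _spmv = CudaSpMatVec::default();
    }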

    #[test]
    fn test_cuda_optimization_levels() {
        let basic = CudaOptimizationLevel::Basic;
        let vectorized = CudaOptimizationLevel::Vectorized;
        let warp = CudaOptimizationLevel::WarpLevel;

        assert_ne!(basic, vectorized);
        assert_ne!(vectorized, warp);
        assert_eq!(
            CudaOptimizationLevel::default(),
            CudaOptimizationLevel::Basic
        );
    }

    #[test]
    fn test_cuda_memory_manager() {
        let manager = CudaMemoryManager::new();
        assert_eq!(manager.allocated_buffers.len(), 0);
    }

    #[test]
    fn test_memory_access_patterns() {
        let patterns = [
            MemoryAccessPattern::Sequential,
            MemoryAccessPattern::Random,
            MemoryAccessPattern::Strided,
        ];

        for pattern in &patterns {
            // Exhaustive match: fails to compile if a new variant is added unhandled.
            match pattern {
                MemoryAccessPattern::Sequential => (),
                MemoryAccessPattern::Random => (),
                MemoryAccessPattern::Strided => (),
            }
        }
    }

    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_kernel_sources() {
        assert!(!CUDA_SPMV_KERNEL_SOURCE.is_empty());
        assert!(!CUDA_WARP_SPMV_KERNEL_SOURCE.is_empty());

        assert!(CUDA_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
        assert!(CUDA_SPMV_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
        assert!(CUDA_WARP_SPMV_KERNEL_SOURCE.contains("spmv_csr_warp_kernel"));
    }
}