scirs2_sparse/gpu_spmv_implementation.rs

//! Enhanced GPU SpMV Implementation for scirs2-sparse
//!
//! This module provides production-ready GPU-accelerated sparse matrix-vector multiplication
//! with proper error handling, memory management, and multi-backend support.
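//!
//! # Example
//!
//! A minimal usage sketch (illustrative only; the exact module path and the
//! availability of at least the CPU fallback backend are assumptions, so the
//! block is marked `ignore`):
//!
//! ```ignore
//! use scirs2_sparse::gpu_spmv_implementation::GpuSpMV;
//!
//! // CSR form of the 2x2 matrix [[1, 2], [0, 3]], multiplied by x = [1, 1]
//! let spmv = GpuSpMV::new().expect("no compute backend available");
//! let indptr = vec![0, 2, 3];
//! let indices = vec![0, 1, 1];
//! let data = vec![1.0f64, 2.0, 3.0];
//! let x = vec![1.0, 1.0];
//! let y = spmv.spmv(2, 2, &indptr, &indices, &data, &x).unwrap();
//! assert_eq!(y, vec![3.0, 3.0]);
//! ```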

use crate::error::{SparseError, SparseResult};
use scirs2_core::gpu::{GpuBackend, GpuContext, GpuDataType};
use scirs2_core::numeric::{Float, NumAssign};
use scirs2_core::simd_ops::SimdUnifiedOps;
use std::fmt::Debug;

/// Enhanced GPU-accelerated Sparse Matrix-Vector multiplication implementation
pub struct GpuSpMV {
    #[allow(dead_code)]
    context: GpuContext,
    backend: GpuBackend,
}

impl GpuSpMV {
    /// Create a new GPU SpMV instance with automatic backend detection
    pub fn new() -> SparseResult<Self> {
        // Try to initialize GPU backends in order of preference
        let (context, backend) = Self::initialize_best_backend()?;

        Ok(Self { context, backend })
    }

    /// Create a new GPU SpMV instance with the specified backend
    pub fn with_backend(backend: GpuBackend) -> SparseResult<Self> {
        let context = GpuContext::new(backend).map_err(|e| {
            SparseError::ComputationError(format!("Failed to initialize GPU context: {e}"))
        })?;

        Ok(Self { context, backend })
    }

    /// Initialize the best available GPU backend
    fn initialize_best_backend() -> SparseResult<(GpuContext, GpuBackend)> {
        // Try backends in order of performance preference
        let backends_to_try = [
            GpuBackend::Cuda,   // Best performance on NVIDIA GPUs
            GpuBackend::Metal,  // Best performance on Apple Silicon
            GpuBackend::OpenCL, // Good cross-platform compatibility
            GpuBackend::Cpu,    // Fallback option
        ];

        for &backend in &backends_to_try {
            if let Ok(context) = GpuContext::new(backend) {
                return Ok((context, backend));
            }
        }

        Err(SparseError::ComputationError(
            "No GPU backend available".to_string(),
        ))
    }

    /// Execute sparse matrix-vector multiplication: y = A * x
    #[allow(clippy::too_many_arguments)]
    pub fn spmv<T>(
        &self,
        rows: usize,
        cols: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        // Validate input dimensions
        self.validate_spmv_inputs(rows, cols, indptr, indices, data, x)?;

        // Execute GPU-accelerated SpMV based on backend
        match self.backend {
            GpuBackend::Cuda => self.spmv_cuda(rows, indptr, indices, data, x),
            GpuBackend::OpenCL => self.spmv_opencl(rows, indptr, indices, data, x),
            GpuBackend::Metal => self.spmv_metal(rows, indptr, indices, data, x),
            GpuBackend::Cpu => self.spmv_cpu_optimized(rows, indptr, indices, data, x),
            GpuBackend::Rocm | GpuBackend::Wgpu => {
                // For now, use CPU fallback for Rocm and Wgpu until implemented
                self.spmv_cpu_optimized(rows, indptr, indices, data, x)
            }
        }
    }

    /// Validate SpMV input parameters
    fn validate_spmv_inputs<T>(
        &self,
        rows: usize,
        cols: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<()>
    where
        T: Float + Debug,
    {
        if indptr.len() != rows + 1 {
            return Err(SparseError::InvalidFormat(format!(
                "indptr length {} does not match rows + 1 = {}",
                indptr.len(),
                rows + 1
            )));
        }

        if indices.len() != data.len() {
            return Err(SparseError::InvalidFormat(format!(
                "indices length {} does not match data length {}",
                indices.len(),
                data.len()
            )));
        }

        if x.len() != cols {
            return Err(SparseError::InvalidFormat(format!(
                "x length {} does not match cols {}",
                x.len(),
                cols
            )));
        }

        // Validate that all indices are within bounds
        for &idx in indices {
            if idx >= cols {
                return Err(SparseError::InvalidFormat(format!(
                    "Column index {idx} exceeds cols {cols}"
                )));
            }
        }

        Ok(())
    }
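
    // CSR layout sketch for the checks above (illustrative values only):
    //
    //   matrix  = [[5, 0, 0],
    //              [0, 8, 3],
    //              [0, 0, 6]]
    //
    //   indptr  = [0, 1, 3, 4]   // rows + 1 entries; row i spans indptr[i]..indptr[i + 1]
    //   indices = [0, 1, 2, 2]   // one column index per stored value, each < cols
    //   data    = [5, 8, 3, 6]   // same length as indices
    //   x.len() == cols == 3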

    /// CUDA-accelerated SpMV implementation
    fn spmv_cuda<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        #[cfg(feature = "gpu")]
        {
            // Use the unified GPU interface: wrap the raw CSR components in a
            // CsrArray and dispatch through GpuSpMatVec for this backend.
            use crate::csr_array::CsrArray;
            use crate::gpu::GpuSpMatVec;

            let csr_matrix = CsrArray::new(
                data.to_vec().into(),
                indices.to_vec().into(),
                indptr.to_vec().into(),
                (rows, x.len()),
            )?;

            let gpu_handler = GpuSpMatVec::with_backend(self.backend)?;
            let result = gpu_handler.spmv(
                &csr_matrix,
                &scirs2_core::ndarray::ArrayView1::from(x),
                None,
            )?;
            Ok(result.to_vec())
        }

        #[cfg(not(feature = "gpu"))]
        {
            // Fall back to the optimized CPU implementation when the GPU feature is not enabled
            self.spmv_cpu_optimized(rows, indptr, indices, data, x)
        }
    }

    /// OpenCL-accelerated SpMV implementation
    fn spmv_opencl<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        #[cfg(feature = "gpu")]
        {
            // Use the unified GPU interface instead of low-level OpenCL calls:
            // wrap the raw CSR components in a CsrArray and dispatch through GpuSpMatVec.
            use crate::csr_array::CsrArray;
            use crate::gpu::GpuSpMatVec;

            let csr_matrix = CsrArray::new(
                data.to_vec().into(),
                indices.to_vec().into(),
                indptr.to_vec().into(),
                (rows, x.len()),
            )?;

            let gpu_handler = GpuSpMatVec::with_backend(self.backend)?;
            let result = gpu_handler.spmv(
                &csr_matrix,
                &scirs2_core::ndarray::ArrayView1::from(x),
                None,
            )?;
            Ok(result.to_vec())
        }

        #[cfg(not(feature = "gpu"))]
        {
            // Fall back to the optimized CPU implementation when the GPU feature is not enabled
            self.spmv_cpu_optimized(rows, indptr, indices, data, x)
        }
    }

    /// Metal-accelerated SpMV implementation
    fn spmv_metal<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        #[cfg(feature = "gpu")]
        {
            // Use the unified GPU interface instead of a low-level Metal kernel:
            // wrap the raw CSR components in a CsrArray and dispatch through GpuSpMatVec.
            use crate::csr_array::CsrArray;
            use crate::gpu::GpuSpMatVec;

            let csr_matrix = CsrArray::new(
                data.to_vec().into(),
                indices.to_vec().into(),
                indptr.to_vec().into(),
                (rows, x.len()),
            )?;

            let gpu_handler = GpuSpMatVec::with_backend(self.backend)?;
            let result = gpu_handler.spmv(
                &csr_matrix,
                &scirs2_core::ndarray::ArrayView1::from(x),
                None,
            )?;
            Ok(result.to_vec())
        }

        #[cfg(not(feature = "gpu"))]
        {
            // Fall back to the optimized CPU implementation when the GPU feature is not enabled
            self.spmv_cpu_optimized(rows, indptr, indices, data, x)
        }
    }

    /// Optimized CPU fallback implementation
    fn spmv_cpu_optimized<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float + Debug + Copy + Default + Send + Sync + NumAssign + SimdUnifiedOps,
    {
        let mut y = vec![T::zero(); rows];

        // Use parallel processing for CPU implementation
        #[cfg(feature = "parallel")]
        {
            use crate::parallel_vector_ops::parallel_sparse_matvec_csr;
            parallel_sparse_matvec_csr(&mut y, rows, indptr, indices, data, x, None);
        }

        #[cfg(not(feature = "parallel"))]
        {
            for row in 0..rows {
                let mut sum = T::zero();
                let start = indptr[row];
                let end = indptr[row + 1];

                for idx in start..end {
                    let col = indices[idx];
                    sum = sum + data[idx] * x[col];
                }
                y[row] = sum;
            }
        }

        Ok(y)
    }

    /// Get CUDA kernel source code
    #[allow(dead_code)]
    fn get_cuda_spmv_kernel_source(&self) -> String {
        r#"
        extern "C" __global__ void spmv_csr_kernel(
            int rows,
            const int* __restrict__ indptr,
            const int* __restrict__ indices,
            const float* __restrict__ data,
            const float* __restrict__ x,
            float* __restrict__ y
        ) {
            int row = blockIdx.x * blockDim.x + threadIdx.x;
            if (row >= rows) return;

            float sum = 0.0f;
            int start = indptr[row];
            int end = indptr[row + 1];

            // One thread per row: accumulate this row's nonzero contributions
            for (int j = start; j < end; j++) {
                sum += data[j] * x[indices[j]];
            }

            y[row] = sum;
        }
        "#
        .to_string()
    }
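
    // Launch-configuration sketch for the CUDA kernel above (host-side pseudocode,
    // not wired into this module): the kernel assigns one thread per row, so a
    // typical launch rounds the grid up to cover all rows.
    //
    //   int block_size = 256;
    //   int grid_size = (rows + block_size - 1) / block_size;  // ceil(rows / block_size)
    //   spmv_csr_kernel<<<grid_size, block_size>>>(rows, indptr, indices, data, x, y);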

    /// Get OpenCL kernel source code
    #[allow(dead_code)]
    fn get_opencl_spmv_kernel_source(&self) -> String {
        r#"
        __kernel void spmv_csr_kernel(
            const int rows,
            __global const int* restrict indptr,
            __global const int* restrict indices,
            __global const float* restrict data,
            __global const float* restrict x,
            __global float* restrict y
        ) {
            int row = get_global_id(0);
            if (row >= rows) return;

            float sum = 0.0f;
            int start = indptr[row];
            int end = indptr[row + 1];

            // One work-item per row: accumulate this row's nonzero contributions
            for (int j = start; j < end; j++) {
                sum += data[j] * x[indices[j]];
            }

            y[row] = sum;
        }
        "#
        .to_string()
    }
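
    // Enqueue sketch for the OpenCL kernel above (host-side pseudocode, not wired
    // into this module; `queue` and `spmv_kernel` are placeholder handles): one
    // work-item per row, with the global size rounded up to a multiple of the
    // work-group size.
    //
    //   size_t local = 256;
    //   size_t global = ((rows + local - 1) / local) * local;
    //   clEnqueueNDRangeKernel(queue, spmv_kernel, 1, NULL, &global, &local, 0, NULL, NULL);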

    /// Get Metal kernel source code
    #[allow(dead_code)]
    fn get_metal_spmv_kernel_source(&self) -> String {
        r#"
        #include <metal_stdlib>
        using namespace metal;

        kernel void spmv_csr_kernel(
            constant int& rows [[buffer(0)]],
            constant int* indptr [[buffer(1)]],
            constant int* indices [[buffer(2)]],
            constant float* data [[buffer(3)]],
            constant float* x [[buffer(4)]],
            device float* y [[buffer(5)]],
            uint row [[thread_position_in_grid]]
        ) {
            if (row >= uint(rows)) return;

            float sum = 0.0f;
            int start = indptr[row];
            int end = indptr[row + 1];

            // One thread per row: accumulate this row's nonzero contributions
            for (int j = start; j < end; j++) {
                sum += data[j] * x[indices[j]];
            }

            y[row] = sum;
        }
        "#
        .to_string()
    }

    /// Get information about the current GPU backend
    pub fn backend_info(&self) -> (GpuBackend, String) {
        let backend_name = match self.backend {
            GpuBackend::Cuda => "NVIDIA CUDA",
            GpuBackend::OpenCL => "OpenCL",
            GpuBackend::Metal => "Apple Metal",
            GpuBackend::Cpu => "CPU Fallback",
            GpuBackend::Rocm => "AMD ROCm",
            GpuBackend::Wgpu => "WebGPU",
        };

        (self.backend, backend_name.to_string())
    }
}

impl Default for GpuSpMV {
    fn default() -> Self {
        Self::new().unwrap_or_else(|_| {
            // If GPU initialization fails, create CPU-only version
            Self {
                context: GpuContext::new(GpuBackend::Cpu).unwrap(),
                backend: GpuBackend::Cpu,
            }
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_spmv_creation() {
        let gpu_spmv = GpuSpMV::new();
        assert!(
            gpu_spmv.is_ok(),
            "Should be able to create GPU SpMV instance"
        );
    }

    #[test]
    fn test_cpu_fallback_spmv() {
        let gpu_spmv = GpuSpMV::with_backend(GpuBackend::Cpu).unwrap();

        // Simple test matrix: [[1, 2], [0, 3]]
        let indptr = vec![0, 2, 3];
        let indices = vec![0, 1, 1];
        let data = vec![1.0, 2.0, 3.0];
        let x = vec![1.0, 1.0];

        let result = gpu_spmv.spmv(2, 2, &indptr, &indices, &data, &x).unwrap();
        assert_eq!(result, vec![3.0, 3.0]); // [1*1 + 2*1, 3*1] = [3, 3]
    }

    #[test]
    fn test_backend_info() {
        let gpu_spmv = GpuSpMV::default();
        let (_backend, name) = gpu_spmv.backend_info();
        assert!(!name.is_empty(), "Backend name should not be empty");
    }
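
    // A minimal validation sketch: an x shorter than cols should surface as an
    // error from validate_spmv_inputs rather than a panic.
    #[test]
    fn test_invalid_input_dimensions() {
        let gpu_spmv = GpuSpMV::with_backend(GpuBackend::Cpu).unwrap();

        let indptr = vec![0, 2, 3];
        let indices = vec![0, 1, 1];
        let data = vec![1.0, 2.0, 3.0];
        let x = vec![1.0]; // wrong length: cols = 2

        assert!(gpu_spmv.spmv(2, 2, &indptr, &indices, &data, &x).is_err());
    }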
}