scirs2_sparse/gpu/opencl.rs

//! OpenCL backend for sparse matrix GPU operations
//!
//! This module provides OpenCL-specific implementations for sparse matrix operations.

use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::{Float, SparseElement};
use std::fmt::Debug;

#[cfg(feature = "gpu")]
use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};

#[cfg(feature = "gpu")]
pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};

#[cfg(feature = "gpu")]
pub use scirs2_core::GpuError;
/// OpenCL kernel source for CSR sparse matrix-vector multiplication (basic and workgroup variants)
pub const OPENCL_SPMV_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    float sum = 0.0f;
    int start = indptr[row];
    int end = indptr[row + 1];

    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}

__kernel void spmv_csr_workgroup_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y,
    __local float* local_mem
) {
    int gid = get_global_id(0);
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    if (gid >= rows) return;

    int start = indptr[gid];
    int end = indptr[gid + 1];

    // Each work item accumulates its own row sum in its local-memory slot;
    // no cross-item reduction is performed before the write-back.
    local_mem[lid] = 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int j = start; j < end; j++) {
        local_mem[lid] += data[j] * x[indices[j]];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    y[gid] = local_mem[lid];
}
"#;

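// A plain-Rust reference for the CSR SpMV access pattern that the kernels above
// implement: y[row] = sum over the row's non-zeros of data[j] * x[indices[j]].
// This is a documentation sketch only; it is not part of the GPU execution path
// and is not used elsewhere in this module.
#[allow(dead_code)]
fn csr_spmv_reference(indptr: &[usize], indices: &[usize], data: &[f32], x: &[f32], y: &mut [f32]) {
    for (row, out) in y.iter_mut().enumerate() {
        let start = indptr[row];
        let end = indptr[row + 1];
        *out = indices[start..end]
            .iter()
            .zip(&data[start..end])
            .map(|(&col, &v)| v * x[col])
            .sum();
    }
}
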
/// OpenCL vectorized kernel for better performance
pub const OPENCL_VECTORIZED_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_vectorized_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];
    int nnz = end - start;

    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Process 4 elements at a time when possible
    int j;
    for (j = start; j + 3 < end; j += 4) {
        float4 data_vec = (float4)(data[j], data[j+1], data[j+2], data[j+3]);
        float4 x_vec = (float4)(
            x[indices[j]],
            x[indices[j+1]],
            x[indices[j+2]],
            x[indices[j+3]]
        );
        sum += data_vec * x_vec;
    }

    float scalar_sum = sum.x + sum.y + sum.z + sum.w;

    // Handle remaining elements
    for (; j < end; j++) {
        scalar_sum += data[j] * x[indices[j]];
    }

    y[row] = scalar_sum;
}
"#;

/// OpenCL sparse matrix operations
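///
/// # Example
///
/// A minimal usage sketch (marked `ignore`: the import path is assumed from this
/// file's location, and no OpenCL runtime is required because construction falls
/// back to a CPU-only handler when the context cannot be created):
///
/// ```ignore
/// use scirs2_sparse::gpu::opencl::OpenCLSpMatVec;
///
/// // Probes for an OpenCL context; on failure the handler is still created
/// // and later calls use the CPU fallback path.
/// let spmv = OpenCLSpMatVec::new().expect("construction does not require a GPU");
/// ```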
pub struct OpenCLSpMatVec {
    context: Option<scirs2_core::gpu::GpuContext>,
    kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
    workgroup_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    platform_info: OpenCLPlatformInfo,
}

impl OpenCLSpMatVec {
    /// Create a new OpenCL sparse matrix-vector multiplication handler
    pub fn new() -> SparseResult<Self> {
        // Try to create OpenCL context
        #[cfg(feature = "gpu")]
        let context = match scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::OpenCL)
        {
            Ok(ctx) => Some(ctx),
            Err(_) => None, // OpenCL not available, will use CPU fallback
        };
        #[cfg(not(feature = "gpu"))]
        let context = None;

        let mut handler = Self {
            context,
            kernel_handle: None,
            workgroup_kernel: None,
            vectorized_kernel: None,
            platform_info: OpenCLPlatformInfo::detect(),
        };

        // Compile kernels if context is available
        #[cfg(feature = "gpu")]
        if handler.context.is_some() {
            let _ = handler.compile_kernels();
        }

        Ok(handler)
    }

    /// Compile OpenCL kernels for sparse matrix operations
    #[cfg(feature = "gpu")]
    pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
        if let Some(ref context) = self.context {
            // Compile basic kernel
            self.kernel_handle =
                context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());

            // The workgroup-optimized kernel lives in the same program source as
            // the basic kernel, so the same source is compiled again and the
            // variant is selected by name at launch time.
            self.workgroup_kernel =
                context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());

            // Compile vectorized kernel
            self.vectorized_kernel =
                context.execute(|compiler| compiler.compile(OPENCL_VECTORIZED_KERNEL_SOURCE).ok());

            if self.kernel_handle.is_some() {
                Ok(())
            } else {
                Err(scirs2_core::gpu::GpuError::KernelCompilationError(
                    "Failed to compile OpenCL kernels".to_string(),
                ))
            }
        } else {
            Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
                "OpenCL".to_string(),
            ))
        }
    }

    /// Execute OpenCL sparse matrix-vector multiplication
    #[cfg(feature = "gpu")]
    pub fn execute_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + scirs2_core::gpu::GpuDataType,
    {
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        if let Some(ref context) = self.context {
            if let Some(ref kernel) = self.kernel_handle {
                // Upload data to GPU
                let indptr_buffer = context.create_buffer_from_slice(
                    matrix.get_indptr().as_slice().expect("Operation failed"),
                );
                let indices_buffer = context.create_buffer_from_slice(
                    matrix.get_indices().as_slice().expect("Operation failed"),
                );
                let data_buffer = context.create_buffer_from_slice(
                    matrix.get_data().as_slice().expect("Operation failed"),
                );
                let vector_buffer =
                    context.create_buffer_from_slice(vector.as_slice().expect("Operation failed"));
                let result_buffer = context.create_buffer::<T>(rows);

                // Set kernel parameters; buffer names mirror the parameter names
                // in `spmv_csr_kernel` above.
                kernel.set_buffer("indptr", &indptr_buffer);
                kernel.set_buffer("indices", &indices_buffer);
                kernel.set_buffer("data", &data_buffer);
                kernel.set_buffer("x", &vector_buffer);
                kernel.set_buffer("y", &result_buffer);
                kernel.set_u32("rows", rows as u32);

                // Configure work sizes based on the platform, using the same
                // convention as `execute_kernel_with_optimization`: a global work
                // size rounded up to a multiple of the work group size.
                let work_group_size = self.platform_info.max_work_group_size.min(256);
                let global_work_size =
                    ((rows + work_group_size - 1) / work_group_size) * work_group_size;

                // Execute kernel
                let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

                context
                    .launch_kernel(
                        "spmv_csr_kernel",
                        (global_work_size, 1, 1),
                        (work_group_size, 1, 1),
                        &args,
                    )
                    .map_err(|e| {
                        SparseError::ComputationError(format!(
                            "OpenCL kernel execution failed: {:?}",
                            e
                        ))
                    })?;

                // Read result back
                let mut result_vec = vec![T::sparse_zero(); rows];
                result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
                    SparseError::ComputationError(format!(
                        "Failed to copy result from GPU: {:?}",
                        e
                    ))
                })?;
                Ok(Array1::from_vec(result_vec))
            } else {
                Err(SparseError::ComputationError(
                    "OpenCL kernel not compiled".to_string(),
                ))
            }
        } else {
            // Fallback to CPU implementation
            matrix.dot_vector(vector)
        }
    }

    /// Execute optimized OpenCL sparse matrix-vector multiplication
    #[cfg(feature = "gpu")]
    pub fn execute_optimized_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        device: &super::GpuDevice,
        optimization_level: OpenCLOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + super::GpuDataType,
    {
        let (_, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        // Choose kernel based on optimization level
        let kernel = match optimization_level {
            OpenCLOptimizationLevel::Basic => &self.kernel_handle,
            OpenCLOptimizationLevel::Workgroup => &self.workgroup_kernel,
            OpenCLOptimizationLevel::Vectorized => &self.vectorized_kernel,
        };

        if let Some(ref k) = kernel {
            self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
        } else {
            Err(SparseError::ComputationError(
                "OpenCL kernel not available for requested optimization level".to_string(),
            ))
        }
    }

    #[cfg(feature = "gpu")]
    fn execute_kernel_with_optimization<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
        kernel: &super::GpuKernelHandle,
        optimization_level: OpenCLOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + super::GpuDataType,
    {
        let (rows, _) = matrix.shape();

        if let Some(ref context) = self.context {
            // Upload data to GPU using context
            let indptr_gpu = context.create_buffer_from_slice(
                matrix.get_indptr().as_slice().expect("Operation failed"),
            );
            let indices_gpu = context.create_buffer_from_slice(
                matrix.get_indices().as_slice().expect("Operation failed"),
            );
            let data_gpu = context
                .create_buffer_from_slice(matrix.get_data().as_slice().expect("Operation failed"));
            let vector_gpu =
                context.create_buffer_from_slice(vector.as_slice().expect("Operation failed"));
            let result_gpu = context.create_buffer::<T>(rows);

            // Bind buffers on the selected kernel handle, mirroring `execute_spmv`;
            // the names follow the parameter lists of the kernel sources above.
            kernel.set_buffer("indptr", &indptr_gpu);
            kernel.set_buffer("indices", &indices_gpu);
            kernel.set_buffer("data", &data_gpu);
            kernel.set_buffer("x", &vector_gpu);
            kernel.set_buffer("y", &result_gpu);
            kernel.set_u32("rows", rows as u32);

            // Configure work group parameters based on optimization level
            let (work_group_size, _local_memory_size) = match optimization_level {
                OpenCLOptimizationLevel::Basic => {
                    (self.platform_info.max_work_group_size.min(64), 0)
                }
                OpenCLOptimizationLevel::Workgroup => {
                    let wg_size = self.platform_info.max_work_group_size.min(128);
                    (wg_size, wg_size * std::mem::size_of::<f32>())
                }
                OpenCLOptimizationLevel::Vectorized => {
                    (self.platform_info.max_work_group_size.min(256), 0)
                }
            };

            let global_work_size =
                ((rows + work_group_size - 1) / work_group_size) * work_group_size;

            // Launch kernel using context
            let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

            // Use appropriate kernel based on optimization level
            let kernel_name = match optimization_level {
                OpenCLOptimizationLevel::Basic => "spmv_csr_kernel",
                OpenCLOptimizationLevel::Workgroup => "spmv_csr_workgroup_kernel",
                OpenCLOptimizationLevel::Vectorized => "spmv_csr_vectorized_kernel",
            };

            context
                .launch_kernel(
                    kernel_name,
                    (global_work_size, 1, 1),
                    (work_group_size, 1, 1),
                    &args,
                )
                .map_err(|e| {
                    SparseError::ComputationError(format!(
                        "OpenCL kernel execution failed: {:?}",
                        e
                    ))
                })?;

            // Download result
            let mut result_vec = vec![T::sparse_zero(); rows];
            result_gpu.copy_to_host(&mut result_vec).map_err(|e| {
                SparseError::ComputationError(format!("Failed to copy result from GPU: {:?}", e))
            })?;
            Ok(Array1::from_vec(result_vec))
        } else {
            // Fallback to CPU implementation
            matrix.dot_vector(vector)
        }
    }

    /// Select optimal kernel based on matrix characteristics.
    ///
    /// Not yet wired into the execution paths; retained for future dispatch logic.
    #[cfg(feature = "gpu")]
    #[allow(dead_code)]
    fn select_optimal_kernel<T>(
        &self,
        rows: usize,
        matrix: &CsrArray<T>,
    ) -> SparseResult<super::GpuKernelHandle>
    where
        T: Float + SparseElement + Debug + Copy,
    {
        // Calculate average non-zeros per row
        let avg_nnz_per_row = matrix.get_data().len() as f64 / rows as f64;

        // Select kernel based on sparsity pattern and platform capabilities
        if avg_nnz_per_row < 5.0 && self.platform_info.supports_vectorization {
            // Sparse matrix, use vectorized kernel if available
            if let Some(ref kernel) = self.vectorized_kernel {
                Ok(kernel.clone())
            } else if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        } else if avg_nnz_per_row > 20.0 && self.platform_info.max_work_group_size >= 128 {
            // Dense-ish matrix, use workgroup kernel
            if let Some(ref kernel) = self.workgroup_kernel {
                Ok(kernel.clone())
            } else if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        } else {
            // Default to basic kernel
            if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        }
    }

    /// CPU fallback implementation
    #[cfg(not(feature = "gpu"))]
    pub fn execute_spmv_cpu<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + std::iter::Sum,
    {
        matrix.dot_vector(vector)
    }
}

impl Default for OpenCLSpMatVec {
    fn default() -> Self {
        Self::new().unwrap_or_else(|_| Self {
            context: None,
            kernel_handle: None,
            workgroup_kernel: None,
            vectorized_kernel: None,
            platform_info: OpenCLPlatformInfo::default(),
        })
    }
}

/// OpenCL optimization levels for sparse matrix operations
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OpenCLOptimizationLevel {
    /// Basic thread-per-row implementation
    #[default]
    Basic,
    /// Workgroup-optimized implementation with local memory
    Workgroup,
    /// Vectorized implementation using float4 operations
    Vectorized,
}

/// OpenCL platform information for optimization
#[derive(Debug)]
pub struct OpenCLPlatformInfo {
    pub max_work_group_size: usize,
    pub local_memory_size: usize,
    pub supports_vectorization: bool,
    pub compute_units: usize,
    pub device_type: OpenCLDeviceType,
}

impl OpenCLPlatformInfo {
    /// Detect OpenCL platform capabilities
    pub fn detect() -> Self {
        // In a real implementation, this would query the OpenCL runtime
        // For now, return sensible defaults
        Self {
            max_work_group_size: 256,
            local_memory_size: 32768, // 32KB
            supports_vectorization: true,
            compute_units: 8,
            device_type: OpenCLDeviceType::GPU,
        }
    }
}

impl Default for OpenCLPlatformInfo {
    fn default() -> Self {
        Self::detect()
    }
}

/// OpenCL device types
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLDeviceType {
    /// CPU device
    CPU,
    /// GPU device
    GPU,
    /// Accelerator device
    Accelerator,
}

/// OpenCL memory management for sparse matrices
pub struct OpenCLMemoryManager {
    platform_info: OpenCLPlatformInfo,
    #[allow(dead_code)]
    allocated_buffers: Vec<String>,
}

impl OpenCLMemoryManager {
    /// Create a new OpenCL memory manager
    pub fn new() -> Self {
        Self {
            platform_info: OpenCLPlatformInfo::detect(),
            allocated_buffers: Vec::new(),
        }
    }

    /// Allocate GPU memory for sparse matrix data with optimal layout
    #[cfg(feature = "gpu")]
    pub fn allocate_sparse_matrix<T>(
        &mut self,
        _matrix: &CsrArray<T>,
        _device: &super::GpuDevice,
    ) -> Result<OpenCLMatrixBuffers<T>, super::GpuError>
    where
        T: super::GpuDataType + Copy + Float + SparseElement + Debug,
    {
        // This functionality should use GpuContext instead of GpuDevice
        // For now, return an error indicating this needs proper implementation
        Err(super::GpuError::BackendNotImplemented(
            super::GpuBackend::OpenCL,
        ))
    }

    /// Get optimal work group size for the current platform
    pub fn optimal_work_group_size(&self, problem_size: usize) -> usize {
        let max_wg_size = self.platform_info.max_work_group_size;

        // For small problems, use smaller work groups
        if problem_size < 1000 {
            max_wg_size.min(64)
        } else if problem_size < 10000 {
            max_wg_size.min(128)
        } else {
            max_wg_size
        }
    }

    /// Check if vectorization is beneficial for the given matrix
    pub fn should_use_vectorization<T>(&self, matrix: &CsrArray<T>) -> bool
    where
        T: Float + SparseElement + Debug + Copy,
    {
        if !self.platform_info.supports_vectorization {
            return false;
        }

        let avg_nnz_per_row = matrix.nnz() as f64 / matrix.shape().0 as f64;

        // Vectorization is beneficial for sparse matrices with moderate sparsity
        (4.0..=32.0).contains(&avg_nnz_per_row)
    }
}

impl Default for OpenCLMemoryManager {
    fn default() -> Self {
        Self::new()
    }
}

/// GPU memory buffers for OpenCL sparse matrix data
#[cfg(feature = "gpu")]
pub struct OpenCLMatrixBuffers<T: super::GpuDataType> {
    pub indptr: super::GpuBuffer<usize>,
    pub indices: super::GpuBuffer<usize>,
    pub data: super::GpuBuffer<T>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_opencl_spmv_creation() {
        let opencl_spmv = OpenCLSpMatVec::new();
        assert!(opencl_spmv.is_ok());
    }
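
    #[test]
    fn test_opencl_spmv_default() {
        // Default construction must not panic even when no OpenCL runtime is
        // present; it falls back to a handler with no compiled kernels.
        let handler = OpenCLSpMatVec::default();
        assert!(handler.platform_info.max_work_group_size > 0);
    }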

    #[test]
    fn test_opencl_optimization_levels() {
        let basic = OpenCLOptimizationLevel::Basic;
        let workgroup = OpenCLOptimizationLevel::Workgroup;
        let vectorized = OpenCLOptimizationLevel::Vectorized;

        assert_ne!(basic, workgroup);
        assert_ne!(workgroup, vectorized);
        assert_eq!(
            OpenCLOptimizationLevel::default(),
            OpenCLOptimizationLevel::Basic
        );
    }

    #[test]
    fn test_opencl_platform_info() {
        let info = OpenCLPlatformInfo::detect();
        assert!(info.max_work_group_size > 0);
        assert!(info.local_memory_size > 0);
        assert!(info.compute_units > 0);
    }

    #[test]
    fn test_opencl_device_types() {
        let device_types = [
            OpenCLDeviceType::CPU,
            OpenCLDeviceType::GPU,
            OpenCLDeviceType::Accelerator,
        ];

        for device_type in &device_types {
            match device_type {
                OpenCLDeviceType::CPU => (),
                OpenCLDeviceType::GPU => (),
                OpenCLDeviceType::Accelerator => (),
            }
        }
    }

    #[test]
    fn test_opencl_memory_manager() {
        let manager = OpenCLMemoryManager::new();
        assert_eq!(manager.allocated_buffers.len(), 0);
        assert!(manager.platform_info.max_work_group_size > 0);

        // Test work group size selection
        let wg_size_small = manager.optimal_work_group_size(500);
        let wg_size_large = manager.optimal_work_group_size(50000);
        assert!(wg_size_small <= wg_size_large);
    }
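
    #[test]
    fn test_work_group_size_tiers() {
        // A sketch of the sizing tiers in `optimal_work_group_size`, relying on
        // the hard-coded defaults currently returned by `OpenCLPlatformInfo::detect()`
        // (max_work_group_size = 256).
        let manager = OpenCLMemoryManager::new();
        assert_eq!(manager.optimal_work_group_size(500), 64);
        assert_eq!(manager.optimal_work_group_size(5_000), 128);
        assert_eq!(manager.optimal_work_group_size(50_000), 256);
    }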

    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_kernel_sources() {
        assert!(!OPENCL_SPMV_KERNEL_SOURCE.is_empty());
        assert!(!OPENCL_VECTORIZED_KERNEL_SOURCE.is_empty());

        // Check that kernels contain expected function names
        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_workgroup_kernel"));
        assert!(OPENCL_VECTORIZED_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
    }
}