scirs2_sparse/gpu/
opencl.rs

1//! OpenCL backend for sparse matrix GPU operations
2//!
3//! This module provides OpenCL-specific implementations for sparse matrix operations.
4
5use crate::csr_array::CsrArray;
6use crate::error::{SparseError, SparseResult};
7use crate::sparray::SparseArray;
8use scirs2_core::ndarray::{Array1, ArrayView1};
9use scirs2_core::numeric::Float;
10use std::fmt::Debug;
11
12#[cfg(feature = "gpu")]
13use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};
14
15#[cfg(feature = "gpu")]
16pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};
17
18#[cfg(feature = "gpu")]
19pub use scirs2_core::GpuError;
20
/// OpenCL kernel source code for sparse matrix-vector multiplication
///
/// Contains two CSR SpMV kernels:
/// * `spmv_csr_kernel` — one work-item per row, no local memory.
/// * `spmv_csr_workgroup_kernel` — variant staging per-item partial sums in
///   local memory. Note: out-of-range work-items must NOT return before the
///   barrier — all items in a work-group are required to execute every
///   `barrier()`, otherwise behavior is undefined when `rows` is not a
///   multiple of the work-group size. They therefore skip the work instead.
pub const OPENCL_SPMV_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    float sum = 0.0f;
    int start = indptr[row];
    int end = indptr[row + 1];

    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}

__kernel void spmv_csr_workgroup_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y,
    __local float* local_mem
) {
    int gid = get_global_id(0);
    int lid = get_local_id(0);

    /* Every work-item in the work-group must reach the barrier below, so
       out-of-range items may not return early; they contribute 0 instead. */
    float sum = 0.0f;
    if (gid < rows) {
        int start = indptr[gid];
        int end = indptr[gid + 1];
        for (int j = start; j < end; j++) {
            sum += data[j] * x[indices[j]];
        }
    }

    local_mem[lid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (gid < rows) {
        y[gid] = local_mem[lid];
    }
}
"#;
74
/// OpenCL vectorized kernel for better performance
///
/// Processes four non-zeros at a time with `float4` arithmetic, then handles
/// the (up to three) remaining non-zeros with a scalar tail loop. The unused
/// `nnz` local present in an earlier revision has been removed.
pub const OPENCL_VECTORIZED_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_vectorized_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];

    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Process 4 elements at a time when possible
    int j;
    for (j = start; j + 3 < end; j += 4) {
        float4 data_vec = (float4)(data[j], data[j+1], data[j+2], data[j+3]);
        float4 x_vec = (float4)(
            x[indices[j]],
            x[indices[j+1]],
            x[indices[j+2]],
            x[indices[j+3]]
        );
        sum += data_vec * x_vec;
    }

    float scalar_sum = sum.x + sum.y + sum.z + sum.w;

    // Handle remaining elements
    for (; j < end; j++) {
        scalar_sum += data[j] * x[indices[j]];
    }

    y[row] = scalar_sum;
}
"#;
117
/// OpenCL sparse matrix operations
///
/// Holds an optional GPU context and compiled kernel handles. All handles are
/// `None` when no OpenCL runtime is available (or the `gpu` feature is off),
/// in which case every operation falls back to the CPU implementation.
pub struct OpenCLSpMatVec {
    // GPU context; `None` means "use CPU fallback".
    context: Option<scirs2_core::gpu::GpuContext>,
    // Basic one-work-item-per-row SpMV kernel.
    kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
    // Local-memory (workgroup) variant of the SpMV kernel.
    workgroup_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    // float4-vectorized variant of the SpMV kernel.
    vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    // Platform capabilities used for work-group sizing (currently stub defaults).
    platform_info: OpenCLPlatformInfo,
}
126
impl OpenCLSpMatVec {
    /// Create a new OpenCL sparse matrix-vector multiplication handler
    ///
    /// This constructor never fails in practice: if the OpenCL runtime is
    /// unavailable the handler is still returned with `context = None`, and
    /// all operations take the CPU fallback path.
    pub fn new() -> SparseResult<Self> {
        // Try to create OpenCL context
        #[cfg(feature = "gpu")]
        let context = match scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::OpenCL)
        {
            Ok(ctx) => Some(ctx),
            Err(_) => None, // OpenCL not available, will use CPU fallback
        };
        #[cfg(not(feature = "gpu"))]
        let context = None;

        let mut handler = Self {
            context,
            kernel_handle: None,
            workgroup_kernel: None,
            vectorized_kernel: None,
            platform_info: OpenCLPlatformInfo::detect(),
        };

        // Compile kernels if context is available.
        // Best effort: a compilation failure is deliberately swallowed so the
        // handler still works via the CPU fallback.
        #[cfg(feature = "gpu")]
        if handler.context.is_some() {
            let _ = handler.compile_kernels();
        }

        Ok(handler)
    }

    /// Compile OpenCL kernels for sparse matrix operations
    ///
    /// Compiles three kernel handles. Note that both `kernel_handle` and
    /// `workgroup_kernel` are compiled from `OPENCL_SPMV_KERNEL_SOURCE`,
    /// which contains both the basic and the workgroup kernel functions;
    /// only the vectorized handle uses a separate source string.
    ///
    /// # Errors
    /// Returns `KernelCompilationError` if the basic kernel failed to
    /// compile (the other two are allowed to silently fail), or
    /// `BackendNotAvailable` when there is no OpenCL context.
    #[cfg(feature = "gpu")]
    pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
        if let Some(ref context) = self.context {
            // Compile basic kernel
            self.kernel_handle =
                context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());

            // Compile workgroup-optimized kernel
            // (same source as above; it contains spmv_csr_workgroup_kernel too)
            self.workgroup_kernel =
                context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());

            // Compile vectorized kernel
            self.vectorized_kernel =
                context.execute(|compiler| compiler.compile(OPENCL_VECTORIZED_KERNEL_SOURCE).ok());

            // Only the basic handle is required for success.
            if self.kernel_handle.is_some() {
                Ok(())
            } else {
                Err(scirs2_core::gpu::GpuError::KernelCompilationError(
                    "Failed to compile OpenCL kernels".to_string(),
                ))
            }
        } else {
            Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
                "OpenCL".to_string(),
            ))
        }
    }

    /// Execute OpenCL sparse matrix-vector multiplication
    ///
    /// Computes `y = A * x` for a CSR matrix `A` using the basic kernel,
    /// falling back to `matrix.dot_vector` when no GPU context exists.
    ///
    /// # Errors
    /// * `DimensionMismatch` if `vector.len() != cols`.
    /// * `ComputationError` if the kernel is missing, launch fails, or the
    ///   result cannot be copied back from the device.
    ///
    /// # Panics
    /// Panics if any of the matrix arrays or `vector` is non-contiguous
    /// (`as_slice().unwrap()`).
    #[cfg(feature = "gpu")]
    pub fn execute_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + scirs2_core::gpu::GpuDataType,
    {
        // Validate dimensions before touching the device.
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        if let Some(ref context) = self.context {
            if let Some(ref kernel) = self.kernel_handle {
                // Upload data to GPU
                let indptr_buffer =
                    context.create_buffer_from_slice(matrix.get_indptr().as_slice().unwrap());
                let indices_buffer =
                    context.create_buffer_from_slice(matrix.get_indices().as_slice().unwrap());
                let data_buffer =
                    context.create_buffer_from_slice(matrix.get_data().as_slice().unwrap());
                let vector_buffer = context.create_buffer_from_slice(vector.as_slice().unwrap());
                let result_buffer = context.create_buffer::<T>(rows);

                // Set kernel parameters.
                // NOTE(review): the names "vector"/"result" do not match the
                // kernel's parameter names ("x"/"y") — confirm whether
                // set_buffer binds by name; if so these two bindings are wrong.
                kernel.set_buffer("indptr", &indptr_buffer);
                kernel.set_buffer("indices", &indices_buffer);
                kernel.set_buffer("data", &data_buffer);
                kernel.set_buffer("vector", &vector_buffer);
                kernel.set_buffer("result", &result_buffer);
                kernel.set_u32("rows", rows as u32);

                // Configure work group size based on platform
                let work_group_size = self.platform_info.max_work_group_size.min(256);
                // Round up so every row gets a work-item.
                let grid_size = ((rows + work_group_size - 1) / work_group_size, 1, 1);
                let block_size = (work_group_size, 1, 1);

                // Execute kernel
                // NOTE(review): only `rows` is passed as a dynamic arg; the
                // buffers are presumably attached via set_buffer above — verify
                // against the scirs2_core launch_kernel contract.
                let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

                context
                    .launch_kernel("spmv_csr_kernel", grid_size, block_size, &args)
                    .map_err(|e| {
                        SparseError::ComputationError(format!(
                            "OpenCL kernel execution failed: {:?}",
                            e
                        ))
                    })?;

                // Read result back
                let mut result_vec = vec![T::zero(); rows];
                result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
                    SparseError::ComputationError(format!(
                        "Failed to copy result from GPU: {:?}",
                        e
                    ))
                })?;
                Ok(Array1::from_vec(result_vec))
            } else {
                Err(SparseError::ComputationError(
                    "OpenCL kernel not compiled".to_string(),
                ))
            }
        } else {
            // Fallback to CPU implementation
            matrix.dot_vector(vector)
        }
    }

    /// Execute optimized OpenCL sparse matrix-vector multiplication
    ///
    /// Like [`Self::execute_spmv`], but lets the caller choose which compiled
    /// kernel variant to use via `optimization_level`.
    ///
    /// # Errors
    /// * `DimensionMismatch` if `vector.len() != cols`.
    /// * `ComputationError` if the requested kernel variant was not compiled.
    #[cfg(feature = "gpu")]
    pub fn execute_optimized_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        device: &super::GpuDevice,
        optimization_level: OpenCLOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + super::GpuDataType,
    {
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        // Choose kernel based on optimization level
        let kernel = match optimization_level {
            OpenCLOptimizationLevel::Basic => &self.kernel_handle,
            OpenCLOptimizationLevel::Workgroup => &self.workgroup_kernel,
            OpenCLOptimizationLevel::Vectorized => &self.vectorized_kernel,
        };

        if let Some(ref k) = kernel {
            self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
        } else {
            Err(SparseError::ComputationError(
                "OpenCL kernel not available for requested optimization level".to_string(),
            ))
        }
    }

    // Launch the SpMV kernel selected by `optimization_level`, using
    // level-specific work-group sizing; falls back to CPU when there is no
    // context. `_device` and `_kernel` are currently unused: the launch goes
    // through the context by kernel *name* rather than through the handle.
    #[cfg(feature = "gpu")]
    fn execute_kernel_with_optimization<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
        _kernel: &super::GpuKernelHandle,
        optimization_level: OpenCLOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + super::GpuDataType,
    {
        let (rows, _) = matrix.shape();

        if let Some(ref context) = self.context {
            // Upload data to GPU using context
            // NOTE(review): these buffers are created but never bound to the
            // kernel before launch (args below carries only `rows`) — confirm
            // the context tracks/attaches them implicitly, else the launch
            // reads unbound parameters.
            let indptr_gpu =
                context.create_buffer_from_slice(matrix.get_indptr().as_slice().unwrap());
            let indices_gpu =
                context.create_buffer_from_slice(matrix.get_indices().as_slice().unwrap());
            let data_gpu = context.create_buffer_from_slice(matrix.get_data().as_slice().unwrap());
            let vector_gpu = context.create_buffer_from_slice(vector.as_slice().unwrap());
            let result_gpu = context.create_buffer::<T>(rows);

            // Configure work group parameters based on optimization level.
            // The local-memory size is computed for Workgroup but not yet
            // forwarded to the launch (hence the leading underscore).
            let (work_group_size, _local_memory_size) = match optimization_level {
                OpenCLOptimizationLevel::Basic => {
                    (self.platform_info.max_work_group_size.min(64), 0)
                }
                OpenCLOptimizationLevel::Workgroup => {
                    let wg_size = self.platform_info.max_work_group_size.min(128);
                    (wg_size, wg_size * std::mem::size_of::<f32>())
                }
                OpenCLOptimizationLevel::Vectorized => {
                    (self.platform_info.max_work_group_size.min(256), 0)
                }
            };

            // Global size rounded up to a whole number of work-groups.
            let global_work_size =
                ((rows + work_group_size - 1) / work_group_size) * work_group_size;

            // Launch kernel using context
            let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

            // Use appropriate kernel based on optimization level
            let kernel_name = match optimization_level {
                OpenCLOptimizationLevel::Basic => "spmv_csr_kernel",
                OpenCLOptimizationLevel::Workgroup => "spmv_csr_workgroup_kernel",
                OpenCLOptimizationLevel::Vectorized => "spmv_csr_vectorized_kernel",
            };

            context
                .launch_kernel(
                    kernel_name,
                    (global_work_size, 1, 1),
                    (work_group_size, 1, 1),
                    &args,
                )
                .map_err(|e| {
                    SparseError::ComputationError(format!(
                        "OpenCL kernel execution failed: {:?}",
                        e
                    ))
                })?;

            // Download result
            let mut result_vec = vec![T::zero(); rows];
            result_gpu.copy_to_host(&mut result_vec).map_err(|e| {
                SparseError::ComputationError(format!("Failed to copy result from GPU: {:?}", e))
            })?;
            Ok(Array1::from_vec(result_vec))
        } else {
            // Fallback to CPU implementation
            matrix.dot_vector(vector)
        }
    }

    /// Select optimal kernel based on matrix characteristics
    ///
    /// Heuristic: few non-zeros per row → vectorized kernel; many non-zeros
    /// per row on capable hardware → workgroup kernel; otherwise basic.
    /// Each branch falls back to the basic kernel when the preferred variant
    /// was not compiled. Currently not called by the execute paths above.
    #[cfg(feature = "gpu")]
    fn select_optimal_kernel<T>(
        &self,
        rows: usize,
        matrix: &CsrArray<T>,
    ) -> SparseResult<super::GpuKernelHandle>
    where
        T: Float + Debug + Copy,
    {
        // Calculate average non-zeros per row
        let avg_nnz_per_row = matrix.get_data().len() as f64 / rows as f64;

        // Select kernel based on sparsity pattern and platform capabilities
        if avg_nnz_per_row < 5.0 && self.platform_info.supports_vectorization {
            // Sparse matrix, use vectorized kernel if available
            if let Some(ref kernel) = self.vectorized_kernel {
                Ok(kernel.clone())
            } else if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        } else if avg_nnz_per_row > 20.0 && self.platform_info.max_work_group_size >= 128 {
            // Dense-ish matrix, use workgroup kernel
            if let Some(ref kernel) = self.workgroup_kernel {
                Ok(kernel.clone())
            } else if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        } else {
            // Default to basic kernel
            if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        }
    }

    /// CPU fallback implementation
    ///
    /// Used when the crate is built without the `gpu` feature; simply
    /// delegates to the CSR dot-product.
    #[cfg(not(feature = "gpu"))]
    pub fn execute_spmv_cpu<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + std::iter::Sum,
    {
        matrix.dot_vector(vector)
    }
}
437
438impl Default for OpenCLSpMatVec {
439    fn default() -> Self {
440        Self::new().unwrap_or_else(|_| Self {
441            context: None,
442            kernel_handle: None,
443            workgroup_kernel: None,
444            vectorized_kernel: None,
445            platform_info: OpenCLPlatformInfo::default(),
446        })
447    }
448}
449
/// OpenCL optimization levels for sparse matrix operations
///
/// Selects which compiled kernel variant `execute_optimized_spmv` launches.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLOptimizationLevel {
    /// Basic thread-per-row implementation (`spmv_csr_kernel`)
    Basic,
    /// Workgroup-optimized implementation with local memory
    /// (`spmv_csr_workgroup_kernel`)
    Workgroup,
    /// Vectorized implementation using float4 operations
    /// (`spmv_csr_vectorized_kernel`)
    Vectorized,
}
460
461impl Default for OpenCLOptimizationLevel {
462    fn default() -> Self {
463        Self::Basic
464    }
465}
466
/// OpenCL platform information for optimization
///
/// Capability figures used to size work-groups and choose kernel variants.
/// Currently populated with stub defaults by [`OpenCLPlatformInfo::detect`]
/// rather than a real runtime query.
#[derive(Debug)]
pub struct OpenCLPlatformInfo {
    /// Maximum work-group size the device accepts.
    pub max_work_group_size: usize,
    /// Local (shared) memory available per work-group, in bytes.
    pub local_memory_size: usize,
    /// Whether float4-vectorized kernels are expected to pay off.
    pub supports_vectorization: bool,
    /// Number of compute units on the device.
    pub compute_units: usize,
    /// Kind of OpenCL device (CPU/GPU/accelerator).
    pub device_type: OpenCLDeviceType,
}
476
477impl OpenCLPlatformInfo {
478    /// Detect OpenCL platform capabilities
479    pub fn detect() -> Self {
480        // In a real implementation, this would query the OpenCL runtime
481        // For now, return sensible defaults
482        Self {
483            max_work_group_size: 256,
484            local_memory_size: 32768, // 32KB
485            supports_vectorization: true,
486            compute_units: 8,
487            device_type: OpenCLDeviceType::GPU,
488        }
489    }
490}
491
492impl Default for OpenCLPlatformInfo {
493    fn default() -> Self {
494        Self::detect()
495    }
496}
497
/// OpenCL device types
///
/// Mirrors the standard OpenCL device categories relevant to kernel tuning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLDeviceType {
    /// CPU device
    CPU,
    /// GPU device
    GPU,
    /// Accelerator device (e.g. dedicated compute hardware)
    Accelerator,
}
508
/// OpenCL memory management for sparse matrices
///
/// Provides work-group sizing and vectorization heuristics; buffer
/// allocation itself is not implemented yet.
pub struct OpenCLMemoryManager {
    // Capabilities used by the sizing heuristics below.
    platform_info: OpenCLPlatformInfo,
    // Bookkeeping for allocated buffers; unused until allocation is implemented.
    #[allow(dead_code)]
    allocated_buffers: Vec<String>,
}
515
516impl OpenCLMemoryManager {
517    /// Create a new OpenCL memory manager
518    pub fn new() -> Self {
519        Self {
520            platform_info: OpenCLPlatformInfo::detect(),
521            allocated_buffers: Vec::new(),
522        }
523    }
524
525    /// Allocate GPU memory for sparse matrix data with optimal layout
526    #[cfg(feature = "gpu")]
527    pub fn allocate_sparse_matrix<T>(
528        &mut self,
529        _matrix: &CsrArray<T>,
530        _device: &super::GpuDevice,
531    ) -> Result<OpenCLMatrixBuffers<T>, super::GpuError>
532    where
533        T: super::GpuDataType + Copy + Float + Debug,
534    {
535        // This functionality should use GpuContext instead of GpuDevice
536        // For now, return an error indicating this needs proper implementation
537        Err(super::GpuError::BackendNotImplemented(
538            super::GpuBackend::OpenCL,
539        ))
540    }
541
542    /// Get optimal work group size for the current platform
543    pub fn optimal_work_group_size(&self, problem_size: usize) -> usize {
544        let max_wg_size = self.platform_info.max_work_group_size;
545
546        // For small problems, use smaller work groups
547        if problem_size < 1000 {
548            max_wg_size.min(64)
549        } else if problem_size < 10000 {
550            max_wg_size.min(128)
551        } else {
552            max_wg_size
553        }
554    }
555
556    /// Check if vectorization is beneficial for the given matrix
557    pub fn should_use_vectorization<T>(&self, matrix: &CsrArray<T>) -> bool
558    where
559        T: Float + Debug + Copy,
560    {
561        if !self.platform_info.supports_vectorization {
562            return false;
563        }
564
565        let avg_nnz_per_row = matrix.nnz() as f64 / matrix.shape().0 as f64;
566
567        // Vectorization is beneficial for sparse matrices with moderate sparsity
568        (4.0..=32.0).contains(&avg_nnz_per_row)
569    }
570}
571
572impl Default for OpenCLMemoryManager {
573    fn default() -> Self {
574        Self::new()
575    }
576}
577
/// GPU memory buffers for OpenCL sparse matrix data
///
/// Device-side copies of a CSR matrix's three arrays.
/// NOTE(review): `indptr`/`indices` are `usize` buffers while the kernels
/// declare `int*` parameters — confirm the width conversion happens at
/// upload, or these will disagree on 64-bit hosts.
#[cfg(feature = "gpu")]
pub struct OpenCLMatrixBuffers<T: super::GpuDataType> {
    /// Row-pointer array (length rows + 1).
    pub indptr: super::GpuBuffer<usize>,
    /// Column indices of the non-zero values.
    pub indices: super::GpuBuffer<usize>,
    /// Non-zero values.
    pub data: super::GpuBuffer<T>,
}
585
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_opencl_spmv_creation() {
        // Construction succeeds even with no OpenCL runtime present — the
        // handler falls back to a CPU-only configuration.
        let opencl_spmv = OpenCLSpMatVec::new();
        assert!(opencl_spmv.is_ok());
    }

    #[test]
    fn test_opencl_optimization_levels() {
        let basic = OpenCLOptimizationLevel::Basic;
        let workgroup = OpenCLOptimizationLevel::Workgroup;
        let vectorized = OpenCLOptimizationLevel::Vectorized;

        assert_ne!(basic, workgroup);
        assert_ne!(workgroup, vectorized);
        assert_eq!(
            OpenCLOptimizationLevel::default(),
            OpenCLOptimizationLevel::Basic
        );
    }

    #[test]
    fn test_opencl_platform_info() {
        // Stub detection must report plausible (non-zero) capabilities.
        let info = OpenCLPlatformInfo::detect();
        assert!(info.max_work_group_size > 0);
        assert!(info.local_memory_size > 0);
        assert!(info.compute_units > 0);
    }

    #[test]
    fn test_opencl_device_types() {
        // Variants are distinct and comparable (Copy + Eq).
        assert_ne!(OpenCLDeviceType::CPU, OpenCLDeviceType::GPU);
        assert_ne!(OpenCLDeviceType::GPU, OpenCLDeviceType::Accelerator);
        assert_ne!(OpenCLDeviceType::CPU, OpenCLDeviceType::Accelerator);
        assert_eq!(OpenCLDeviceType::GPU, OpenCLDeviceType::GPU);
    }

    #[test]
    fn test_opencl_memory_manager() {
        let manager = OpenCLMemoryManager::new();
        assert_eq!(manager.allocated_buffers.len(), 0);
        assert!(manager.platform_info.max_work_group_size > 0);

        // Work-group size must be monotone in problem size.
        let wg_size_small = manager.optimal_work_group_size(500);
        let wg_size_large = manager.optimal_work_group_size(50000);
        assert!(wg_size_small <= wg_size_large);
    }

    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_kernel_sources() {
        assert!(!OPENCL_SPMV_KERNEL_SOURCE.is_empty());
        assert!(!OPENCL_VECTORIZED_KERNEL_SOURCE.is_empty());

        // Check that kernels contain expected function names
        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_workgroup_kernel"));
        assert!(OPENCL_VECTORIZED_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
    }
}