1use crate::csr_array::CsrArray;
6use crate::error::{SparseError, SparseResult};
7use crate::sparray::SparseArray;
8use scirs2_core::ndarray::{Array1, ArrayView1};
9use scirs2_core::numeric::{Float, SparseElement};
10use std::fmt::Debug;
11
12#[cfg(feature = "gpu")]
13use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};
14
15#[cfg(feature = "gpu")]
16pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};
17
18#[cfg(feature = "gpu")]
19pub use scirs2_core::GpuError;
20
/// OpenCL C source for CSR sparse matrix-vector multiplication (SpMV).
///
/// Defines two entry points compiled together:
/// * `spmv_csr_kernel` — one work-item per row, scalar accumulation.
/// * `spmv_csr_workgroup_kernel` — same row-per-work-item mapping, but
///   accumulating through local memory.
///
/// Fix: the workgroup kernel previously executed `return` for out-of-range
/// work-items *before* its `barrier(CLK_LOCAL_MEM_FENCE)` calls. Launches
/// round the global size up to a whole number of work-groups, so padding
/// work-items exist whenever `rows` is not a multiple of the work-group
/// size — and the OpenCL spec requires every work-item in a work-group to
/// reach each barrier. Out-of-range items now participate in the barriers
/// and simply accumulate nothing.
pub const OPENCL_SPMV_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    float sum = 0.0f;
    int start = indptr[row];
    int end = indptr[row + 1];

    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}

__kernel void spmv_csr_workgroup_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y,
    __local float* local_mem
) {
    int gid = get_global_id(0);
    int lid = get_local_id(0);

    /* All work-items -- including padding beyond `rows` -- must reach the
       barriers below, so gate the work instead of returning early. */
    int start = 0;
    int end = 0;
    if (gid < rows) {
        start = indptr[gid];
        end = indptr[gid + 1];
    }

    local_mem[lid] = 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int j = start; j < end; j++) {
        local_mem[lid] += data[j] * x[indices[j]];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (gid < rows) {
        y[gid] = local_mem[lid];
    }
}
"#;
74
/// OpenCL C source for a vectorized CSR SpMV kernel.
///
/// `spmv_csr_vectorized_kernel` processes four non-zeros per iteration
/// using `float4` arithmetic (the gathers from `x` stay element-wise,
/// since CSR column indices are irregular), then finishes the remaining
/// 0-3 elements with a scalar tail loop before writing `y[row]`.
///
/// NOTE(review): the kernel computes `nnz = end - start` but never uses
/// it — candidate for removal in a future kernel revision.
pub const OPENCL_VECTORIZED_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_vectorized_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];
    int nnz = end - start;

    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Process 4 elements at a time when possible
    int j;
    for (j = start; j + 3 < end; j += 4) {
        float4 data_vec = (float4)(data[j], data[j+1], data[j+2], data[j+3]);
        float4 x_vec = (float4)(
            x[indices[j]],
            x[indices[j+1]],
            x[indices[j+2]],
            x[indices[j+3]]
        );
        sum += data_vec * x_vec;
    }

    float scalar_sum = sum.x + sum.y + sum.z + sum.w;

    // Handle remaining elements
    for (; j < end; j++) {
        scalar_sum += data[j] * x[indices[j]];
    }

    y[row] = scalar_sum;
}
"#;
117
/// Sparse matrix-vector multiplication driver backed by OpenCL, with a
/// transparent CPU fallback when no device or compiled kernel is available.
pub struct OpenCLSpMatVec {
    // `None` when no OpenCL device could be initialized; execution paths
    // then fall back to `CsrArray::dot_vector`.
    // NOTE(review): these field types reference `scirs2_core::gpu`
    // unconditionally, while the re-exports above are gated on
    // `feature = "gpu"` — confirm scirs2_core exposes these types without
    // that feature, otherwise non-gpu builds will not compile.
    context: Option<scirs2_core::gpu::GpuContext>,
    // Handle for the basic one-row-per-work-item kernel.
    kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
    // Handle for the local-memory workgroup kernel.
    workgroup_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    // Handle for the float4 vectorized kernel.
    vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    // Device limits used to size kernel launches.
    platform_info: OpenCLPlatformInfo,
}
126
127impl OpenCLSpMatVec {
128 pub fn new() -> SparseResult<Self> {
130 #[cfg(feature = "gpu")]
132 let context = match scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::OpenCL)
133 {
134 Ok(ctx) => Some(ctx),
135 Err(_) => None, };
137 #[cfg(not(feature = "gpu"))]
138 let context = None;
139
140 let mut handler = Self {
141 context,
142 kernel_handle: None,
143 workgroup_kernel: None,
144 vectorized_kernel: None,
145 platform_info: OpenCLPlatformInfo::detect(),
146 };
147
148 #[cfg(feature = "gpu")]
150 if handler.context.is_some() {
151 let _ = handler.compile_kernels();
152 }
153
154 Ok(handler)
155 }
156
157 #[cfg(feature = "gpu")]
159 pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
160 if let Some(ref context) = self.context {
161 self.kernel_handle =
163 context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());
164
165 self.workgroup_kernel =
167 context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());
168
169 self.vectorized_kernel =
171 context.execute(|compiler| compiler.compile(OPENCL_VECTORIZED_KERNEL_SOURCE).ok());
172
173 if self.kernel_handle.is_some() {
174 Ok(())
175 } else {
176 Err(scirs2_core::gpu::GpuError::KernelCompilationError(
177 "Failed to compile OpenCL kernels".to_string(),
178 ))
179 }
180 } else {
181 Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
182 "OpenCL".to_string(),
183 ))
184 }
185 }
186
187 #[cfg(feature = "gpu")]
189 pub fn execute_spmv<T>(
190 &self,
191 matrix: &CsrArray<T>,
192 vector: &ArrayView1<T>,
193 _device: &super::GpuDevice,
194 ) -> SparseResult<Array1<T>>
195 where
196 T: Float + SparseElement + Debug + Copy + scirs2_core::gpu::GpuDataType,
197 {
198 let (rows, cols) = matrix.shape();
199 if cols != vector.len() {
200 return Err(SparseError::DimensionMismatch {
201 expected: cols,
202 found: vector.len(),
203 });
204 }
205
206 if let Some(ref context) = self.context {
207 if let Some(ref kernel) = self.kernel_handle {
208 let indptr_buffer = context.create_buffer_from_slice(
210 matrix.get_indptr().as_slice().expect("Operation failed"),
211 );
212 let indices_buffer = context.create_buffer_from_slice(
213 matrix.get_indices().as_slice().expect("Operation failed"),
214 );
215 let data_buffer = context.create_buffer_from_slice(
216 matrix.get_data().as_slice().expect("Operation failed"),
217 );
218 let vector_buffer =
219 context.create_buffer_from_slice(vector.as_slice().expect("Operation failed"));
220 let result_buffer = context.create_buffer::<T>(rows);
221
222 kernel.set_buffer("indptr", &indptr_buffer);
224 kernel.set_buffer("indices", &indices_buffer);
225 kernel.set_buffer("data", &data_buffer);
226 kernel.set_buffer("vector", &vector_buffer);
227 kernel.set_buffer("result", &result_buffer);
228 kernel.set_u32("rows", rows as u32);
229
230 let work_group_size = self.platform_info.max_work_group_size.min(256);
232 let grid_size = ((rows + work_group_size - 1) / work_group_size, 1, 1);
233 let block_size = (work_group_size, 1, 1);
234
235 let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];
237
238 context
239 .launch_kernel("spmv_csr_kernel", grid_size, block_size, &args)
240 .map_err(|e| {
241 SparseError::ComputationError(format!(
242 "OpenCL kernel execution failed: {:?}",
243 e
244 ))
245 })?;
246
247 let mut result_vec = vec![T::sparse_zero(); rows];
249 result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
250 SparseError::ComputationError(format!(
251 "Failed to copy result from GPU: {:?}",
252 e
253 ))
254 })?;
255 Ok(Array1::from_vec(result_vec))
256 } else {
257 Err(SparseError::ComputationError(
258 "OpenCL kernel not compiled".to_string(),
259 ))
260 }
261 } else {
262 matrix.dot_vector(vector)
264 }
265 }
266
267 #[cfg(feature = "gpu")]
269 pub fn execute_optimized_spmv<T>(
270 &self,
271 matrix: &CsrArray<T>,
272 vector: &ArrayView1<T>,
273 device: &super::GpuDevice,
274 optimization_level: OpenCLOptimizationLevel,
275 ) -> SparseResult<Array1<T>>
276 where
277 T: Float + SparseElement + Debug + Copy + super::GpuDataType,
278 {
279 let (rows, cols) = matrix.shape();
280 if cols != vector.len() {
281 return Err(SparseError::DimensionMismatch {
282 expected: cols,
283 found: vector.len(),
284 });
285 }
286
287 let kernel = match optimization_level {
289 OpenCLOptimizationLevel::Basic => &self.kernel_handle,
290 OpenCLOptimizationLevel::Workgroup => &self.workgroup_kernel,
291 OpenCLOptimizationLevel::Vectorized => &self.vectorized_kernel,
292 };
293
294 if let Some(ref k) = kernel {
295 self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
296 } else {
297 Err(SparseError::ComputationError(
298 "OpenCL kernel not available for requested optimization level".to_string(),
299 ))
300 }
301 }
302
303 #[cfg(feature = "gpu")]
304 fn execute_kernel_with_optimization<T>(
305 &self,
306 matrix: &CsrArray<T>,
307 vector: &ArrayView1<T>,
308 _device: &super::GpuDevice,
309 _kernel: &super::GpuKernelHandle,
310 optimization_level: OpenCLOptimizationLevel,
311 ) -> SparseResult<Array1<T>>
312 where
313 T: Float + SparseElement + Debug + Copy + super::GpuDataType,
314 {
315 let (rows, _) = matrix.shape();
316
317 if let Some(ref context) = self.context {
318 let indptr_gpu = context.create_buffer_from_slice(
320 matrix.get_indptr().as_slice().expect("Operation failed"),
321 );
322 let indices_gpu = context.create_buffer_from_slice(
323 matrix.get_indices().as_slice().expect("Operation failed"),
324 );
325 let data_gpu = context
326 .create_buffer_from_slice(matrix.get_data().as_slice().expect("Operation failed"));
327 let vector_gpu =
328 context.create_buffer_from_slice(vector.as_slice().expect("Operation failed"));
329 let result_gpu = context.create_buffer::<T>(rows);
330
331 let (work_group_size, _local_memory_size) = match optimization_level {
333 OpenCLOptimizationLevel::Basic => {
334 (self.platform_info.max_work_group_size.min(64), 0)
335 }
336 OpenCLOptimizationLevel::Workgroup => {
337 let wg_size = self.platform_info.max_work_group_size.min(128);
338 (wg_size, wg_size * std::mem::size_of::<f32>())
339 }
340 OpenCLOptimizationLevel::Vectorized => {
341 (self.platform_info.max_work_group_size.min(256), 0)
342 }
343 };
344
345 let global_work_size =
346 ((rows + work_group_size - 1) / work_group_size) * work_group_size;
347
348 let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];
350
351 let kernel_name = match optimization_level {
353 OpenCLOptimizationLevel::Basic => "spmv_csr_kernel",
354 OpenCLOptimizationLevel::Workgroup => "spmv_csr_workgroup_kernel",
355 OpenCLOptimizationLevel::Vectorized => "spmv_csr_vectorized_kernel",
356 };
357
358 context
359 .launch_kernel(
360 kernel_name,
361 (global_work_size, 1, 1),
362 (work_group_size, 1, 1),
363 &args,
364 )
365 .map_err(|e| {
366 SparseError::ComputationError(format!(
367 "OpenCL kernel execution failed: {:?}",
368 e
369 ))
370 })?;
371
372 let mut result_vec = vec![T::sparse_zero(); rows];
374 result_gpu.copy_to_host(&mut result_vec).map_err(|e| {
375 SparseError::ComputationError(format!("Failed to copy result from GPU: {:?}", e))
376 })?;
377 Ok(Array1::from_vec(result_vec))
378 } else {
379 matrix.dot_vector(vector)
381 }
382 }
383
384 #[cfg(feature = "gpu")]
386 fn select_optimal_kernel<T>(
387 &self,
388 rows: usize,
389 matrix: &CsrArray<T>,
390 ) -> SparseResult<super::GpuKernelHandle>
391 where
392 T: Float + SparseElement + Debug + Copy,
393 {
394 let avg_nnz_per_row = matrix.get_data().len() as f64 / rows as f64;
396
397 if avg_nnz_per_row < 5.0 && self.platform_info.supports_vectorization {
399 if let Some(ref kernel) = self.vectorized_kernel {
401 Ok(kernel.clone())
402 } else if let Some(ref kernel) = self.kernel_handle {
403 Ok(kernel.clone())
404 } else {
405 Err(SparseError::ComputationError(
406 "No OpenCL kernels available".to_string(),
407 ))
408 }
409 } else if avg_nnz_per_row > 20.0 && self.platform_info.max_work_group_size >= 128 {
410 if let Some(ref kernel) = self.workgroup_kernel {
412 Ok(kernel.clone())
413 } else if let Some(ref kernel) = self.kernel_handle {
414 Ok(kernel.clone())
415 } else {
416 Err(SparseError::ComputationError(
417 "No OpenCL kernels available".to_string(),
418 ))
419 }
420 } else {
421 if let Some(ref kernel) = self.kernel_handle {
423 Ok(kernel.clone())
424 } else {
425 Err(SparseError::ComputationError(
426 "No OpenCL kernels available".to_string(),
427 ))
428 }
429 }
430 }
431
432 #[cfg(not(feature = "gpu"))]
434 pub fn execute_spmv_cpu<T>(
435 &self,
436 matrix: &CsrArray<T>,
437 vector: &ArrayView1<T>,
438 ) -> SparseResult<Array1<T>>
439 where
440 T: Float + SparseElement + Debug + Copy + std::iter::Sum,
441 {
442 matrix.dot_vector(vector)
443 }
444}
445
446impl Default for OpenCLSpMatVec {
447 fn default() -> Self {
448 Self::new().unwrap_or_else(|_| Self {
449 context: None,
450 kernel_handle: None,
451 workgroup_kernel: None,
452 vectorized_kernel: None,
453 platform_info: OpenCLPlatformInfo::default(),
454 })
455 }
456}
457
/// Optimization strategies for the OpenCL SpMV kernels.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OpenCLOptimizationLevel {
    /// One work-item per row with scalar accumulation (the default).
    #[default]
    Basic,
    /// Row-per-work-item accumulation staged through local memory.
    Workgroup,
    /// Processes four non-zeros at a time using `float4` arithmetic.
    Vectorized,
}
469
/// Capabilities of the OpenCL platform, used to size kernel launches.
#[derive(Debug)]
pub struct OpenCLPlatformInfo {
    /// Maximum number of work-items per work-group.
    pub max_work_group_size: usize,
    /// Local (on-chip) memory available per work-group, in bytes.
    pub local_memory_size: usize,
    /// Whether vector types such as `float4` are expected to pay off.
    pub supports_vectorization: bool,
    /// Number of parallel compute units on the device.
    pub compute_units: usize,
    /// Kind of OpenCL device backing the platform.
    pub device_type: OpenCLDeviceType,
}
479
impl OpenCLPlatformInfo {
    /// Returns the platform characteristics used for launch sizing.
    ///
    /// NOTE(review): no actual OpenCL query is performed — these are
    /// conservative hard-coded defaults (256 work-items, 32 KiB local
    /// memory, 8 compute units, GPU device type). Replace with real
    /// device introspection when the backend exposes it.
    pub fn detect() -> Self {
        Self {
            max_work_group_size: 256,
            local_memory_size: 32768,
            supports_vectorization: true,
            compute_units: 8,
            device_type: OpenCLDeviceType::GPU,
        }
    }
}
494
impl Default for OpenCLPlatformInfo {
    /// Equivalent to [`OpenCLPlatformInfo::detect`].
    fn default() -> Self {
        Self::detect()
    }
}
500
/// Category of OpenCL device backing the platform.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLDeviceType {
    /// Host CPU exposed as an OpenCL device.
    CPU,
    /// Discrete or integrated GPU.
    GPU,
    /// Dedicated accelerator hardware (e.g. FPGA or DSP).
    Accelerator,
}
511
/// Sizing heuristics and (eventual) GPU buffer bookkeeping for sparse
/// matrices on OpenCL devices.
pub struct OpenCLMemoryManager {
    // Device limits consulted by the work-group-size heuristics.
    platform_info: OpenCLPlatformInfo,
    // Identifiers of live allocations; bookkeeping only, not yet consulted.
    #[allow(dead_code)]
    allocated_buffers: Vec<String>,
}
518
519impl OpenCLMemoryManager {
520 pub fn new() -> Self {
522 Self {
523 platform_info: OpenCLPlatformInfo::detect(),
524 allocated_buffers: Vec::new(),
525 }
526 }
527
528 #[cfg(feature = "gpu")]
530 pub fn allocate_sparse_matrix<T>(
531 &mut self,
532 _matrix: &CsrArray<T>,
533 _device: &super::GpuDevice,
534 ) -> Result<OpenCLMatrixBuffers<T>, super::GpuError>
535 where
536 T: super::GpuDataType + Copy + Float + SparseElement + Debug,
537 {
538 Err(super::GpuError::BackendNotImplemented(
541 super::GpuBackend::OpenCL,
542 ))
543 }
544
545 pub fn optimal_work_group_size(&self, problem_size: usize) -> usize {
547 let max_wg_size = self.platform_info.max_work_group_size;
548
549 if problem_size < 1000 {
551 max_wg_size.min(64)
552 } else if problem_size < 10000 {
553 max_wg_size.min(128)
554 } else {
555 max_wg_size
556 }
557 }
558
559 pub fn should_use_vectorization<T>(&self, matrix: &CsrArray<T>) -> bool
561 where
562 T: Float + SparseElement + Debug + Copy,
563 {
564 if !self.platform_info.supports_vectorization {
565 return false;
566 }
567
568 let avg_nnz_per_row = matrix.nnz() as f64 / matrix.shape().0 as f64;
569
570 (4.0..=32.0).contains(&avg_nnz_per_row)
572 }
573}
574
impl Default for OpenCLMemoryManager {
    /// Equivalent to [`OpenCLMemoryManager::new`].
    fn default() -> Self {
        Self::new()
    }
}
580
/// GPU-resident buffers holding one CSR matrix.
///
/// NOTE(review): the OpenCL kernel sources declare `int` (32-bit) index
/// arrays, while these buffers are `usize` (64-bit on typical hosts) —
/// confirm a narrowing conversion happens before upload.
#[cfg(feature = "gpu")]
pub struct OpenCLMatrixBuffers<T: super::GpuDataType> {
    /// Row-pointer array (length = rows + 1).
    pub indptr: super::GpuBuffer<usize>,
    /// Column index of each stored element.
    pub indices: super::GpuBuffer<usize>,
    /// Stored non-zero values.
    pub data: super::GpuBuffer<T>,
}
588
#[cfg(test)]
mod tests {
    // Fix: the previous `use scirs2_core::ndarray::Array1;` was unused and
    // triggered a compiler warning; nothing in these tests needs it.
    use super::*;

    /// Construction must succeed even without a working OpenCL runtime
    /// (the handler degrades to CPU fallback instead of erroring).
    #[test]
    fn test_opencl_spmv_creation() {
        let opencl_spmv = OpenCLSpMatVec::new();
        assert!(opencl_spmv.is_ok());
    }

    /// The optimization levels are distinct and `Basic` is the default.
    #[test]
    fn test_opencl_optimization_levels() {
        let basic = OpenCLOptimizationLevel::Basic;
        let workgroup = OpenCLOptimizationLevel::Workgroup;
        let vectorized = OpenCLOptimizationLevel::Vectorized;

        assert_ne!(basic, workgroup);
        assert_ne!(workgroup, vectorized);
        assert_eq!(
            OpenCLOptimizationLevel::default(),
            OpenCLOptimizationLevel::Basic
        );
    }

    /// Detected platform limits must be usable (non-zero) for launch sizing.
    #[test]
    fn test_opencl_platform_info() {
        let info = OpenCLPlatformInfo::detect();
        assert!(info.max_work_group_size > 0);
        assert!(info.local_memory_size > 0);
        assert!(info.compute_units > 0);
    }

    /// Exhaustive match: adding a variant to `OpenCLDeviceType` forces this
    /// test (and thus downstream match sites) to be revisited.
    #[test]
    fn test_opencl_device_types() {
        let device_types = [
            OpenCLDeviceType::CPU,
            OpenCLDeviceType::GPU,
            OpenCLDeviceType::Accelerator,
        ];

        for device_type in &device_types {
            match device_type {
                OpenCLDeviceType::CPU => (),
                OpenCLDeviceType::GPU => (),
                OpenCLDeviceType::Accelerator => (),
            }
        }
    }

    /// Work-group size should be monotone non-decreasing in problem size.
    #[test]
    fn test_opencl_memory_manager() {
        let manager = OpenCLMemoryManager::new();
        assert_eq!(manager.allocated_buffers.len(), 0);
        assert!(manager.platform_info.max_work_group_size > 0);

        let wg_size_small = manager.optimal_work_group_size(500);
        let wg_size_large = manager.optimal_work_group_size(50000);
        assert!(wg_size_small <= wg_size_large);
    }

    /// Every exported kernel entry point must be present in its source.
    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_kernel_sources() {
        assert!(!OPENCL_SPMV_KERNEL_SOURCE.is_empty());
        assert!(!OPENCL_VECTORIZED_KERNEL_SOURCE.is_empty());

        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_workgroup_kernel"));
        assert!(OPENCL_VECTORIZED_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
    }
}