1use crate::csr_array::CsrArray;
6use crate::error::{SparseError, SparseResult};
7use crate::sparray::SparseArray;
8use scirs2_core::ndarray::{Array1, ArrayView1};
9use scirs2_core::numeric::Float;
10use std::fmt::Debug;
11
12#[cfg(feature = "gpu")]
13use crate::gpu_kernel_execution::{GpuKernelConfig, MemoryStrategy};
14
15#[cfg(feature = "gpu")]
16pub use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext, GpuDataType, GpuKernelHandle};
17
18#[cfg(feature = "gpu")]
19pub use scirs2_core::GpuError;
20
/// OpenCL C source containing the baseline CSR SpMV kernels.
///
/// Two entry points are defined in this one program source:
/// - `spmv_csr_kernel`: one work-item per matrix row; each work-item walks
///   its row's `indptr[row]..indptr[row+1]` slice and accumulates into `y[row]`.
/// - `spmv_csr_workgroup_kernel`: same row-per-work-item mapping, but the
///   partial sum lives in `__local` memory.
///   NOTE(review): each work-item only ever reads/writes its own
///   `local_mem[lid]` slot and no cross-lane reduction is performed, so the
///   local memory and barriers add no benefit over the basic kernel —
///   confirm whether a per-row workgroup reduction was intended.
pub const OPENCL_SPMV_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    float sum = 0.0f;
    int start = indptr[row];
    int end = indptr[row + 1];

    for (int j = start; j < end; j++) {
        sum += data[j] * x[indices[j]];
    }

    y[row] = sum;
}

__kernel void spmv_csr_workgroup_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y,
    __local float* local_mem
) {
    int gid = get_global_id(0);
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    if (gid >= rows) return;

    int start = indptr[gid];
    int end = indptr[gid + 1];

    local_mem[lid] = 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int j = start; j < end; j++) {
        local_mem[lid] += data[j] * x[indices[j]];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    y[gid] = local_mem[lid];
}
"#;
74
/// OpenCL C source for the float4-vectorized CSR SpMV kernel.
///
/// One work-item computes one output row: the main loop consumes four
/// nonzeros per iteration into a `float4` accumulator, then a scalar tail
/// loop handles the remaining `(end - start) % 4` entries.
///
/// Fix: the original kernel declared `int nnz = end - start;` and never
/// used it, which triggers an unused-variable warning on every kernel
/// compilation; the dead declaration has been removed.
pub const OPENCL_VECTORIZED_KERNEL_SOURCE: &str = r#"
__kernel void spmv_csr_vectorized_kernel(
    int rows,
    __global const int* restrict indptr,
    __global const int* restrict indices,
    __global const float* restrict data,
    __global const float* restrict x,
    __global float* restrict y
) {
    int row = get_global_id(0);
    if (row >= rows) return;

    int start = indptr[row];
    int end = indptr[row + 1];

    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Process 4 elements at a time when possible
    int j;
    for (j = start; j + 3 < end; j += 4) {
        float4 data_vec = (float4)(data[j], data[j+1], data[j+2], data[j+3]);
        float4 x_vec = (float4)(
            x[indices[j]],
            x[indices[j+1]],
            x[indices[j+2]],
            x[indices[j+3]]
        );
        sum += data_vec * x_vec;
    }

    float scalar_sum = sum.x + sum.y + sum.z + sum.w;

    // Handle remaining elements
    for (; j < end; j++) {
        scalar_sum += data[j] * x[indices[j]];
    }

    y[row] = scalar_sum;
}
"#;
117
/// OpenCL-backed sparse matrix–vector multiplication executor.
///
/// All fields are `None` when no OpenCL runtime is available (or the `gpu`
/// feature is disabled); execution then falls back to the CPU path.
pub struct OpenCLSpMatVec {
    /// GPU context for the OpenCL backend, if one could be created.
    context: Option<scirs2_core::gpu::GpuContext>,
    /// Handle for the basic one-row-per-work-item kernel.
    kernel_handle: Option<scirs2_core::gpu::GpuKernelHandle>,
    /// Handle for the local-memory workgroup kernel variant.
    workgroup_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    /// Handle for the float4-vectorized kernel variant.
    vectorized_kernel: Option<scirs2_core::gpu::GpuKernelHandle>,
    /// Device limits used to size work groups and local memory.
    platform_info: OpenCLPlatformInfo,
}
126
impl OpenCLSpMatVec {
    /// Creates a new SpMV handler.
    ///
    /// When the `gpu` feature is enabled, this tries to create an OpenCL
    /// context; a missing or broken runtime is not an error — the handler
    /// is still constructed with `context = None` and later calls fall
    /// back to CPU execution. Kernel compilation failures during
    /// construction are deliberately swallowed (`let _ =`) for the same
    /// reason, so `new()` currently always returns `Ok`.
    pub fn new() -> SparseResult<Self> {
        #[cfg(feature = "gpu")]
        let context = match scirs2_core::gpu::GpuContext::new(scirs2_core::gpu::GpuBackend::OpenCL)
        {
            Ok(ctx) => Some(ctx),
            // Degrade to CPU execution instead of propagating the error.
            Err(_) => None,
        };
        #[cfg(not(feature = "gpu"))]
        let context = None;

        let mut handler = Self {
            context,
            kernel_handle: None,
            workgroup_kernel: None,
            vectorized_kernel: None,
            platform_info: OpenCLPlatformInfo::detect(),
        };

        #[cfg(feature = "gpu")]
        if handler.context.is_some() {
            // Best effort: on failure the kernel handles simply stay `None`.
            let _ = handler.compile_kernels();
        }

        Ok(handler)
    }

    /// Compiles the OpenCL SpMV kernel sources and stores their handles.
    ///
    /// Only the basic kernel is required for success; the workgroup and
    /// vectorized variants are optional extras.
    ///
    /// # Errors
    /// `KernelCompilationError` if the basic kernel failed to compile, or
    /// `BackendNotAvailable` when no OpenCL context exists.
    #[cfg(feature = "gpu")]
    pub fn compile_kernels(&mut self) -> Result<(), scirs2_core::gpu::GpuError> {
        if let Some(ref context) = self.context {
            self.kernel_handle =
                context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());

            // NOTE(review): this compiles OPENCL_SPMV_KERNEL_SOURCE a second
            // time. Both the basic and workgroup kernels live in that single
            // source string, so this looks redundant — confirm whether the
            // compiler returns a handle scoped to one entry point or to the
            // whole program.
            self.workgroup_kernel =
                context.execute(|compiler| compiler.compile(OPENCL_SPMV_KERNEL_SOURCE).ok());

            self.vectorized_kernel =
                context.execute(|compiler| compiler.compile(OPENCL_VECTORIZED_KERNEL_SOURCE).ok());

            if self.kernel_handle.is_some() {
                Ok(())
            } else {
                Err(scirs2_core::gpu::GpuError::KernelCompilationError(
                    "Failed to compile OpenCL kernels".to_string(),
                ))
            }
        } else {
            Err(scirs2_core::gpu::GpuError::BackendNotAvailable(
                "OpenCL".to_string(),
            ))
        }
    }

    /// Executes `y = matrix * vector` with the basic OpenCL kernel.
    ///
    /// Falls back to `matrix.dot_vector` on the CPU when no context is
    /// available. With a context but no compiled kernel, returns a
    /// `ComputationError` instead of falling back.
    ///
    /// # Errors
    /// `DimensionMismatch` when `vector.len() != cols`; `ComputationError`
    /// on kernel launch or device-to-host copy failure.
    #[cfg(feature = "gpu")]
    pub fn execute_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + scirs2_core::gpu::GpuDataType,
    {
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        if let Some(ref context) = self.context {
            if let Some(ref kernel) = self.kernel_handle {
                // Upload CSR arrays and the dense input vector.
                // NOTE(review): `as_slice().unwrap()` panics if any of these
                // arrays is non-contiguous — TODO confirm CsrArray storage
                // is always contiguous, or add a contiguous-copy fallback.
                let indptr_buffer =
                    context.create_buffer_from_slice(matrix.get_indptr().as_slice().unwrap());
                let indices_buffer =
                    context.create_buffer_from_slice(matrix.get_indices().as_slice().unwrap());
                let data_buffer =
                    context.create_buffer_from_slice(matrix.get_data().as_slice().unwrap());
                let vector_buffer = context.create_buffer_from_slice(vector.as_slice().unwrap());
                let result_buffer = context.create_buffer::<T>(rows);

                // Bind buffers by name on the kernel handle.
                // NOTE(review): the names "vector"/"result" do not match the
                // kernel's parameter names "x"/"y" — confirm how the binding
                // layer resolves names before relying on this path.
                kernel.set_buffer("indptr", &indptr_buffer);
                kernel.set_buffer("indices", &indices_buffer);
                kernel.set_buffer("data", &data_buffer);
                kernel.set_buffer("vector", &vector_buffer);
                kernel.set_buffer("result", &result_buffer);
                kernel.set_u32("rows", rows as u32);

                // One work-item per row, rounded up to whole work groups.
                let work_group_size = self.platform_info.max_work_group_size.min(256);
                let grid_size = ((rows + work_group_size - 1) / work_group_size, 1, 1);
                let block_size = (work_group_size, 1, 1);

                // Only `rows` travels through `args`; buffers were bound
                // above via set_buffer.
                let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

                context
                    .launch_kernel("spmv_csr_kernel", grid_size, block_size, &args)
                    .map_err(|e| {
                        SparseError::ComputationError(format!(
                            "OpenCL kernel execution failed: {:?}",
                            e
                        ))
                    })?;

                // Copy the result back to host memory.
                let mut result_vec = vec![T::zero(); rows];
                result_buffer.copy_to_host(&mut result_vec).map_err(|e| {
                    SparseError::ComputationError(format!(
                        "Failed to copy result from GPU: {:?}",
                        e
                    ))
                })?;
                Ok(Array1::from_vec(result_vec))
            } else {
                Err(SparseError::ComputationError(
                    "OpenCL kernel not compiled".to_string(),
                ))
            }
        } else {
            // No OpenCL context: compute on the CPU instead.
            matrix.dot_vector(vector)
        }
    }

    /// Executes SpMV with an explicitly chosen kernel variant.
    ///
    /// # Errors
    /// `DimensionMismatch` on shape mismatch; `ComputationError` if the
    /// kernel for the requested optimization level was not compiled.
    #[cfg(feature = "gpu")]
    pub fn execute_optimized_spmv<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        device: &super::GpuDevice,
        optimization_level: OpenCLOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + super::GpuDataType,
    {
        let (rows, cols) = matrix.shape();
        if cols != vector.len() {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: vector.len(),
            });
        }

        // Map the requested level to the corresponding compiled handle.
        let kernel = match optimization_level {
            OpenCLOptimizationLevel::Basic => &self.kernel_handle,
            OpenCLOptimizationLevel::Workgroup => &self.workgroup_kernel,
            OpenCLOptimizationLevel::Vectorized => &self.vectorized_kernel,
        };

        if let Some(ref k) = kernel {
            self.execute_kernel_with_optimization(matrix, vector, device, k, optimization_level)
        } else {
            Err(SparseError::ComputationError(
                "OpenCL kernel not available for requested optimization level".to_string(),
            ))
        }
    }

    /// Launches the named kernel variant with launch parameters tuned to
    /// `optimization_level`, falling back to CPU when no context exists.
    #[cfg(feature = "gpu")]
    fn execute_kernel_with_optimization<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
        _device: &super::GpuDevice,
        _kernel: &super::GpuKernelHandle,
        optimization_level: OpenCLOptimizationLevel,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + super::GpuDataType,
    {
        let (rows, _) = matrix.shape();

        if let Some(ref context) = self.context {
            // Upload operands (same contiguity caveat as execute_spmv).
            let indptr_gpu =
                context.create_buffer_from_slice(matrix.get_indptr().as_slice().unwrap());
            let indices_gpu =
                context.create_buffer_from_slice(matrix.get_indices().as_slice().unwrap());
            let data_gpu = context.create_buffer_from_slice(matrix.get_data().as_slice().unwrap());
            let vector_gpu = context.create_buffer_from_slice(vector.as_slice().unwrap());
            let result_gpu = context.create_buffer::<T>(rows);

            // Per-level launch tuning; only the workgroup variant requests
            // local memory (one f32 slot per work-item).
            // NOTE(review): _local_memory_size is computed but never passed
            // to the launch — confirm how local memory is requested.
            let (work_group_size, _local_memory_size) = match optimization_level {
                OpenCLOptimizationLevel::Basic => {
                    (self.platform_info.max_work_group_size.min(64), 0)
                }
                OpenCLOptimizationLevel::Workgroup => {
                    let wg_size = self.platform_info.max_work_group_size.min(128);
                    (wg_size, wg_size * std::mem::size_of::<f32>())
                }
                OpenCLOptimizationLevel::Vectorized => {
                    (self.platform_info.max_work_group_size.min(256), 0)
                }
            };

            // Round the global size up to a whole number of work groups.
            let global_work_size =
                ((rows + work_group_size - 1) / work_group_size) * work_group_size;

            // NOTE(review): only `rows` is passed here and — unlike
            // execute_spmv — no set_buffer calls bind the uploaded buffers
            // (indptr_gpu etc. are otherwise unused). Confirm the launch
            // actually sees the operands.
            let args = vec![scirs2_core::gpu::DynamicKernelArg::U32(rows as u32)];

            let kernel_name = match optimization_level {
                OpenCLOptimizationLevel::Basic => "spmv_csr_kernel",
                OpenCLOptimizationLevel::Workgroup => "spmv_csr_workgroup_kernel",
                OpenCLOptimizationLevel::Vectorized => "spmv_csr_vectorized_kernel",
            };

            context
                .launch_kernel(
                    kernel_name,
                    (global_work_size, 1, 1),
                    (work_group_size, 1, 1),
                    &args,
                )
                .map_err(|e| {
                    SparseError::ComputationError(format!(
                        "OpenCL kernel execution failed: {:?}",
                        e
                    ))
                })?;

            let mut result_vec = vec![T::zero(); rows];
            result_gpu.copy_to_host(&mut result_vec).map_err(|e| {
                SparseError::ComputationError(format!("Failed to copy result from GPU: {:?}", e))
            })?;
            Ok(Array1::from_vec(result_vec))
        } else {
            // No OpenCL context: compute on the CPU instead.
            matrix.dot_vector(vector)
        }
    }

    /// Picks a kernel based on average nonzeros per row: sparse rows
    /// (< 5 nnz) prefer the vectorized kernel, dense rows (> 20 nnz) the
    /// workgroup kernel, everything else the basic kernel. Each branch
    /// falls back to the basic kernel when the preferred one is missing.
    ///
    /// NOTE(review): not called from any code visible in this file.
    #[cfg(feature = "gpu")]
    fn select_optimal_kernel<T>(
        &self,
        rows: usize,
        matrix: &CsrArray<T>,
    ) -> SparseResult<super::GpuKernelHandle>
    where
        T: Float + Debug + Copy,
    {
        let avg_nnz_per_row = matrix.get_data().len() as f64 / rows as f64;

        if avg_nnz_per_row < 5.0 && self.platform_info.supports_vectorization {
            if let Some(ref kernel) = self.vectorized_kernel {
                Ok(kernel.clone())
            } else if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        } else if avg_nnz_per_row > 20.0 && self.platform_info.max_work_group_size >= 128 {
            if let Some(ref kernel) = self.workgroup_kernel {
                Ok(kernel.clone())
            } else if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        } else {
            if let Some(ref kernel) = self.kernel_handle {
                Ok(kernel.clone())
            } else {
                Err(SparseError::ComputationError(
                    "No OpenCL kernels available".to_string(),
                ))
            }
        }
    }

    /// CPU-only replacement used when the crate is built without the
    /// `gpu` feature: delegates directly to `matrix.dot_vector`.
    #[cfg(not(feature = "gpu"))]
    pub fn execute_spmv_cpu<T>(
        &self,
        matrix: &CsrArray<T>,
        vector: &ArrayView1<T>,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + Debug + Copy + std::iter::Sum,
    {
        matrix.dot_vector(vector)
    }
}
437
438impl Default for OpenCLSpMatVec {
439 fn default() -> Self {
440 Self::new().unwrap_or_else(|_| Self {
441 context: None,
442 kernel_handle: None,
443 workgroup_kernel: None,
444 vectorized_kernel: None,
445 platform_info: OpenCLPlatformInfo::default(),
446 })
447 }
448}
449
/// Strategy used to pick which compiled OpenCL kernel variant is launched.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLOptimizationLevel {
    /// One work-item per row; no local memory or vector types.
    Basic,
    /// Variant using work-group `__local` memory (`spmv_csr_workgroup_kernel`).
    Workgroup,
    /// float4-vectorized inner loop (`spmv_csr_vectorized_kernel`).
    Vectorized,
}
460
461impl Default for OpenCLOptimizationLevel {
462 fn default() -> Self {
463 Self::Basic
464 }
465}
466
/// Capabilities of the OpenCL platform, consulted when tuning kernel launches.
#[derive(Debug)]
pub struct OpenCLPlatformInfo {
    /// Maximum work-group size supported by the device.
    pub max_work_group_size: usize,
    /// Local (on-chip) memory available per work-group, in bytes.
    pub local_memory_size: usize,
    /// Whether vector types such as `float4` are expected to pay off.
    pub supports_vectorization: bool,
    /// Number of compute units on the device.
    pub compute_units: usize,
    /// Broad device category (CPU / GPU / accelerator).
    pub device_type: OpenCLDeviceType,
}
476
477impl OpenCLPlatformInfo {
478 pub fn detect() -> Self {
480 Self {
483 max_work_group_size: 256,
484 local_memory_size: 32768, supports_vectorization: true,
486 compute_units: 8,
487 device_type: OpenCLDeviceType::GPU,
488 }
489 }
490}
491
492impl Default for OpenCLPlatformInfo {
493 fn default() -> Self {
494 Self::detect()
495 }
496}
497
/// Broad OpenCL device categories (mirrors the `CL_DEVICE_TYPE_*` classes).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLDeviceType {
    /// Host CPU exposed as an OpenCL device.
    CPU,
    /// Discrete or integrated GPU.
    GPU,
    /// Dedicated accelerator hardware.
    Accelerator,
}
508
/// Provides launch-size heuristics and (eventually) OpenCL buffer tracking.
pub struct OpenCLMemoryManager {
    /// Platform limits consulted by the work-group-size heuristics.
    platform_info: OpenCLPlatformInfo,
    /// Identifiers of buffers handed out; currently never populated.
    #[allow(dead_code)]
    allocated_buffers: Vec<String>,
}
515
516impl OpenCLMemoryManager {
517 pub fn new() -> Self {
519 Self {
520 platform_info: OpenCLPlatformInfo::detect(),
521 allocated_buffers: Vec::new(),
522 }
523 }
524
525 #[cfg(feature = "gpu")]
527 pub fn allocate_sparse_matrix<T>(
528 &mut self,
529 _matrix: &CsrArray<T>,
530 _device: &super::GpuDevice,
531 ) -> Result<OpenCLMatrixBuffers<T>, super::GpuError>
532 where
533 T: super::GpuDataType + Copy + Float + Debug,
534 {
535 Err(super::GpuError::BackendNotImplemented(
538 super::GpuBackend::OpenCL,
539 ))
540 }
541
542 pub fn optimal_work_group_size(&self, problem_size: usize) -> usize {
544 let max_wg_size = self.platform_info.max_work_group_size;
545
546 if problem_size < 1000 {
548 max_wg_size.min(64)
549 } else if problem_size < 10000 {
550 max_wg_size.min(128)
551 } else {
552 max_wg_size
553 }
554 }
555
556 pub fn should_use_vectorization<T>(&self, matrix: &CsrArray<T>) -> bool
558 where
559 T: Float + Debug + Copy,
560 {
561 if !self.platform_info.supports_vectorization {
562 return false;
563 }
564
565 let avg_nnz_per_row = matrix.nnz() as f64 / matrix.shape().0 as f64;
566
567 (4.0..=32.0).contains(&avg_nnz_per_row)
569 }
570}
571
572impl Default for OpenCLMemoryManager {
573 fn default() -> Self {
574 Self::new()
575 }
576}
577
/// Device-resident buffers holding one CSR matrix.
#[cfg(feature = "gpu")]
pub struct OpenCLMatrixBuffers<T: super::GpuDataType> {
    /// Row-pointer array (length rows + 1).
    /// NOTE(review): stored as `usize` while the kernels declare `int`
    /// index arguments — confirm the intended element width before wiring
    /// these buffers to a launch.
    pub indptr: super::GpuBuffer<usize>,
    /// Column index of each stored nonzero.
    pub indices: super::GpuBuffer<usize>,
    /// Nonzero values.
    pub data: super::GpuBuffer<T>,
}
585
#[cfg(test)]
mod tests {
    use super::*;
    // Fix: removed `use scirs2_core::ndarray::Array1;` — no test in this
    // module references Array1, so it only produced an unused-import
    // warning in test builds.

    /// Construction must succeed even on machines without OpenCL: `new()`
    /// degrades to a CPU-only handler instead of returning an error.
    #[test]
    fn test_opencl_spmv_creation() {
        let opencl_spmv = OpenCLSpMatVec::new();
        assert!(opencl_spmv.is_ok());
    }

    /// Optimization levels are distinct and default to `Basic`.
    #[test]
    fn test_opencl_optimization_levels() {
        let basic = OpenCLOptimizationLevel::Basic;
        let workgroup = OpenCLOptimizationLevel::Workgroup;
        let vectorized = OpenCLOptimizationLevel::Vectorized;

        assert_ne!(basic, workgroup);
        assert_ne!(workgroup, vectorized);
        assert_eq!(
            OpenCLOptimizationLevel::default(),
            OpenCLOptimizationLevel::Basic
        );
    }

    /// The (currently hardcoded) platform info reports sane positive limits.
    #[test]
    fn test_opencl_platform_info() {
        let info = OpenCLPlatformInfo::detect();
        assert!(info.max_work_group_size > 0);
        assert!(info.local_memory_size > 0);
        assert!(info.compute_units > 0);
    }

    /// Exhaustive match over every device type: adding a variant without
    /// updating this test becomes a compile error.
    #[test]
    fn test_opencl_device_types() {
        let device_types = [
            OpenCLDeviceType::CPU,
            OpenCLDeviceType::GPU,
            OpenCLDeviceType::Accelerator,
        ];

        for device_type in &device_types {
            match device_type {
                OpenCLDeviceType::CPU => (),
                OpenCLDeviceType::GPU => (),
                OpenCLDeviceType::Accelerator => (),
            }
        }
    }

    /// Work-group sizing is monotone: bigger problems never get a smaller
    /// work group than smaller problems.
    #[test]
    fn test_opencl_memory_manager() {
        let manager = OpenCLMemoryManager::new();
        assert_eq!(manager.allocated_buffers.len(), 0);
        assert!(manager.platform_info.max_work_group_size > 0);

        let wg_size_small = manager.optimal_work_group_size(500);
        let wg_size_large = manager.optimal_work_group_size(50000);
        assert!(wg_size_small <= wg_size_large);
    }

    /// The embedded kernel sources are non-empty and contain every entry
    /// point name the host code launches by name.
    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_kernel_sources() {
        assert!(!OPENCL_SPMV_KERNEL_SOURCE.is_empty());
        assert!(!OPENCL_VECTORIZED_KERNEL_SOURCE.is_empty());

        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_kernel"));
        assert!(OPENCL_SPMV_KERNEL_SOURCE.contains("spmv_csr_workgroup_kernel"));
        assert!(OPENCL_VECTORIZED_KERNEL_SOURCE.contains("spmv_csr_vectorized_kernel"));
    }
}