use crate::error::{SparseError, SparseResult};
use scirs2_core::gpu::{GpuBackend, GpuContext, GpuDataType};
use scirs2_core::numeric::{Float, NumAssign};
use scirs2_core::simd_ops::SimdUnifiedOps;
use std::fmt::Debug;

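/// GPU-accelerated sparse matrix-vector multiplication (SpMV) for matrices
/// in CSR format, with automatic backend selection and a CPU fallback.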
pub struct GpuSpMV {
    #[allow(dead_code)]
    context: GpuContext,
    backend: GpuBackend,
}

impl GpuSpMV {
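    /// Creates a handler on the best available backend, probing CUDA,
    /// Metal, OpenCL, and finally the CPU fallback in that order.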
    pub fn new() -> SparseResult<Self> {
        let (context, backend) = Self::initialize_best_backend()?;

        Ok(Self { context, backend })
    }

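    /// Creates a handler pinned to the given backend, failing if that
    /// backend's context cannot be initialized.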
    pub fn with_backend(backend: GpuBackend) -> SparseResult<Self> {
        let context = GpuContext::new(backend).map_err(|e| {
            SparseError::ComputationError(format!("Failed to initialize GPU context: {e}"))
        })?;

        Ok(Self { context, backend })
    }

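    /// Probes the supported backends in preference order and returns the
    /// first context that initializes successfully.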
    fn initialize_best_backend() -> SparseResult<(GpuContext, GpuBackend)> {
        let backends_to_try = [
            GpuBackend::Cuda,
            GpuBackend::Metal,
            GpuBackend::OpenCL,
            GpuBackend::Cpu,
        ];

        for &backend in &backends_to_try {
            if let Ok(context) = GpuContext::new(backend) {
                return Ok((context, backend));
            }
        }

        Err(SparseError::ComputationError(
            "No compute backend available (GPU or CPU fallback)".to_string(),
        ))
    }

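    /// Computes `y = A * x` for a `rows x cols` matrix `A` given in CSR form
    /// (`indptr`, `indices`, `data`), dispatching on the selected backend.
    /// A sketch of the call shape (crate import path elided):
    ///
    /// ```ignore
    /// let spmv = GpuSpMV::new()?;
    /// // 2x2 matrix [[1.0, 2.0], [0.0, 3.0]] in CSR form
    /// let y = spmv.spmv(2, 2, &[0, 2, 3], &[0, 1, 1], &[1.0, 2.0, 3.0], &[1.0, 1.0])?;
    /// assert_eq!(y, vec![3.0, 3.0]);
    /// ```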
    #[allow(clippy::too_many_arguments)]
    pub fn spmv<T>(
        &self,
        rows: usize,
        cols: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        self.validate_spmv_inputs(rows, cols, indptr, indices, data, x)?;

        match self.backend {
            GpuBackend::Cuda => self.spmv_cuda(rows, indptr, indices, data, x),
            GpuBackend::OpenCL => self.spmv_opencl(rows, indptr, indices, data, x),
            GpuBackend::Metal => self.spmv_metal(rows, indptr, indices, data, x),
            GpuBackend::Cpu => self.spmv_cpu_optimized(rows, indptr, indices, data, x),
            // No dedicated kernels for these backends yet; use the CPU path.
            GpuBackend::Rocm | GpuBackend::Wgpu => {
                self.spmv_cpu_optimized(rows, indptr, indices, data, x)
            }
        }
    }

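    /// Checks that the CSR arrays are mutually consistent (`indptr` length,
    /// matching `indices`/`data` lengths, `x` length) and that every column
    /// index is in bounds, before any backend work is done.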
    fn validate_spmv_inputs<T>(
        &self,
        rows: usize,
        cols: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<()>
    where
        T: Float + Debug,
    {
        if indptr.len() != rows + 1 {
            return Err(SparseError::InvalidFormat(format!(
                "indptr length {} does not match rows + 1 = {}",
                indptr.len(),
                rows + 1
            )));
        }

        if indices.len() != data.len() {
            return Err(SparseError::InvalidFormat(format!(
                "indices length {} does not match data length {}",
                indices.len(),
                data.len()
            )));
        }

        if x.len() != cols {
            return Err(SparseError::InvalidFormat(format!(
                "x length {} does not match cols {}",
                x.len(),
                cols
            )));
        }

        for &idx in indices {
            if idx >= cols {
                return Err(SparseError::InvalidFormat(format!(
                    "Column index {idx} exceeds cols {cols}"
                )));
            }
        }

        Ok(())
    }

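    /// CUDA SpMV path. With the `gpu` feature enabled, the raw CSR slices
    /// are repacked into a `CsrArray` and dispatched through the shared
    /// `GpuSpMatVec` handler; without it, the CPU path is used.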
    fn spmv_cuda<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        #[cfg(feature = "gpu")]
        {
            use crate::csr_array::CsrArray;
            use crate::gpu::GpuSpMatVec;

            // Rebuild a CSR matrix from the raw slices and hand it to the
            // shared GPU handler, which owns buffer staging and kernel launch.
            let csr_matrix = CsrArray::new(
                data.to_vec().into(),
                indices.to_vec().into(),
                indptr.to_vec().into(),
                (rows, x.len()),
            )?;

            let gpu_handler = GpuSpMatVec::with_backend(self.backend)?;
            let result = gpu_handler.spmv(
                &csr_matrix,
                &scirs2_core::ndarray::ArrayView1::from(x),
                None,
            )?;
            Ok(result.to_vec())
        }

        #[cfg(not(feature = "gpu"))]
        {
            self.spmv_cpu_optimized(rows, indptr, indices, data, x)
        }
    }

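    /// OpenCL SpMV path; same delegation strategy as the CUDA path, with a
    /// CPU fallback when the `gpu` feature is disabled.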
    fn spmv_opencl<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        #[cfg(feature = "gpu")]
        {
            use crate::csr_array::CsrArray;
            use crate::gpu::GpuSpMatVec;

            let csr_matrix = CsrArray::new(
                data.to_vec().into(),
                indices.to_vec().into(),
                indptr.to_vec().into(),
                (rows, x.len()),
            )?;

            let gpu_handler = GpuSpMatVec::with_backend(self.backend)?;
            let result = gpu_handler.spmv(
                &csr_matrix,
                &scirs2_core::ndarray::ArrayView1::from(x),
                None,
            )?;
            Ok(result.to_vec())
        }

        #[cfg(not(feature = "gpu"))]
        {
            self.spmv_cpu_optimized(rows, indptr, indices, data, x)
        }
    }

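    /// Metal SpMV path; same delegation strategy as the CUDA path, with a
    /// CPU fallback when the `gpu` feature is disabled.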
    fn spmv_metal<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float
            + Debug
            + Copy
            + Default
            + GpuDataType
            + Send
            + Sync
            + 'static
            + NumAssign
            + SimdUnifiedOps
            + std::iter::Sum,
    {
        #[cfg(feature = "gpu")]
        {
            use crate::csr_array::CsrArray;
            use crate::gpu::GpuSpMatVec;

            let csr_matrix = CsrArray::new(
                data.to_vec().into(),
                indices.to_vec().into(),
                indptr.to_vec().into(),
                (rows, x.len()),
            )?;

            let gpu_handler = GpuSpMatVec::with_backend(self.backend)?;
            let result = gpu_handler.spmv(
                &csr_matrix,
                &scirs2_core::ndarray::ArrayView1::from(x),
                None,
            )?;
            Ok(result.to_vec())
        }

        #[cfg(not(feature = "gpu"))]
        {
            self.spmv_cpu_optimized(rows, indptr, indices, data, x)
        }
    }

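    /// Optimized CPU fallback. With the `parallel` feature this uses the
    /// parallel CSR kernel; otherwise it runs the serial row loop
    /// `y[row] = sum(data[j] * x[indices[j]])` for `j` in
    /// `indptr[row]..indptr[row + 1]`.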
    fn spmv_cpu_optimized<T>(
        &self,
        rows: usize,
        indptr: &[usize],
        indices: &[usize],
        data: &[T],
        x: &[T],
    ) -> SparseResult<Vec<T>>
    where
        T: Float + Debug + Copy + Default + Send + Sync + NumAssign + SimdUnifiedOps,
    {
        let mut y = vec![T::zero(); rows];

        #[cfg(feature = "parallel")]
        {
            use crate::parallel_vector_ops::parallel_sparse_matvec_csr;
            parallel_sparse_matvec_csr(&mut y, rows, indptr, indices, data, x, None);
        }

        #[cfg(not(feature = "parallel"))]
        {
            for row in 0..rows {
                let mut sum = T::zero();
                let start = indptr[row];
                let end = indptr[row + 1];

                for idx in start..end {
                    let col = indices[idx];
                    sum += data[idx] * x[col];
                }
                y[row] = sum;
            }
        }

        Ok(y)
    }

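    /// Reference CUDA kernel source for CSR SpMV (one thread per row).
    /// Like the OpenCL and Metal variants below, this is currently unused;
    /// retained, presumably as a template for a future direct kernel-launch
    /// path.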
    #[allow(dead_code)]
    fn get_cuda_spmv_kernel_source(&self) -> String {
        r#"
        extern "C" __global__ void spmv_csr_kernel(
            int rows,
            const int* __restrict__ indptr,
            const int* __restrict__ indices,
            const float* __restrict__ data,
            const float* __restrict__ x,
            float* __restrict__ y
        ) {
            int row = blockIdx.x * blockDim.x + threadIdx.x;
            if (row >= rows) return;

            float sum = 0.0f;
            int start = indptr[row];
            int end = indptr[row + 1];

            // One thread per row; data/indices reads are contiguous per row
            for (int j = start; j < end; j++) {
                sum += data[j] * x[indices[j]];
            }

            y[row] = sum;
        }
        "#
        .to_string()
    }

    #[allow(dead_code)]
    fn get_opencl_spmv_kernel_source(&self) -> String {
        r#"
        __kernel void spmv_csr_kernel(
            const int rows,
            __global const int* restrict indptr,
            __global const int* restrict indices,
            __global const float* restrict data,
            __global const float* restrict x,
            __global float* restrict y
        ) {
            int row = get_global_id(0);
            if (row >= rows) return;

            float sum = 0.0f;
            int start = indptr[row];
            int end = indptr[row + 1];

            // One work-item per row
            for (int j = start; j < end; j++) {
                sum += data[j] * x[indices[j]];
            }

            y[row] = sum;
        }
        "#
        .to_string()
    }

    #[allow(dead_code)]
    fn get_metal_spmv_kernel_source(&self) -> String {
        r#"
        #include <metal_stdlib>
        using namespace metal;

        kernel void spmv_csr_kernel(
            constant int& rows [[buffer(0)]],
            constant int* indptr [[buffer(1)]],
            constant int* indices [[buffer(2)]],
            constant float* data [[buffer(3)]],
            constant float* x [[buffer(4)]],
            device float* y [[buffer(5)]],
            uint row [[thread_position_in_grid]]
        ) {
            if (row >= uint(rows)) return;

            float sum = 0.0f;
            int start = indptr[row];
            int end = indptr[row + 1];

            // One thread per row
            for (int j = start; j < end; j++) {
                sum += data[j] * x[indices[j]];
            }

            y[row] = sum;
        }
        "#
        .to_string()
    }

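    /// Returns the active backend together with a human-readable name.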
    pub fn backend_info(&self) -> (GpuBackend, String) {
        let backend_name = match self.backend {
            GpuBackend::Cuda => "NVIDIA CUDA",
            GpuBackend::OpenCL => "OpenCL",
            GpuBackend::Metal => "Apple Metal",
            GpuBackend::Cpu => "CPU Fallback",
            GpuBackend::Rocm => "AMD ROCm",
            GpuBackend::Wgpu => "WebGPU",
        };

        (self.backend, backend_name.to_string())
    }
}

impl Default for GpuSpMV {
    fn default() -> Self {
        Self::new().unwrap_or_else(|_| Self {
            context: GpuContext::new(GpuBackend::Cpu)
                .expect("CPU fallback backend should always initialize"),
            backend: GpuBackend::Cpu,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_spmv_creation() {
        let gpu_spmv = GpuSpMV::new();
        assert!(
            gpu_spmv.is_ok(),
            "Should be able to create GPU SpMV instance"
        );
    }

    #[test]
    fn test_cpu_fallback_spmv() {
        let gpu_spmv = GpuSpMV::with_backend(GpuBackend::Cpu).unwrap();

        // 2x2 CSR matrix [[1, 2], [0, 3]]
        let indptr = vec![0, 2, 3];
        let indices = vec![0, 1, 1];
        let data = vec![1.0, 2.0, 3.0];
        let x = vec![1.0, 1.0];

        let result = gpu_spmv.spmv(2, 2, &indptr, &indices, &data, &x).unwrap();
        // Row 0: 1*1 + 2*1 = 3; row 1: 3*1 = 3
        assert_eq!(result, vec![3.0, 3.0]);
    }

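    // A minimal extra check: the validator should reject inconsistent CSR
    // inputs (here, x longer than cols) before any backend work happens.
    #[test]
    fn test_input_validation() {
        let gpu_spmv = GpuSpMV::with_backend(GpuBackend::Cpu).unwrap();

        let indptr = vec![0, 2, 3];
        let indices = vec![0, 1, 1];
        let data = vec![1.0, 2.0, 3.0];
        let x = vec![1.0, 1.0, 1.0]; // length 3, but cols = 2

        let result = gpu_spmv.spmv(2, 2, &indptr, &indices, &data, &x);
        assert!(result.is_err(), "Mismatched x length should be rejected");
    }
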
    #[test]
    fn test_backend_info() {
        let gpu_spmv = GpuSpMV::default();
        let (_backend, name) = gpu_spmv.backend_info();
        assert!(!name.is_empty(), "Backend name should not be empty");
    }
}