//! GPU kernel execution utilities and memory management for sparse matrix
//! operations (SpMV, symmetric SpMV, and sparse triangular solves).

#![allow(dead_code)]

#[allow(unused_imports)]
use crate::gpu_ops::{GpuBackend, GpuBuffer, GpuBufferExt, GpuDevice, GpuError, GpuKernelHandle};
#[cfg(feature = "gpu")]
use scirs2_core::gpu::GpuContext;
use scirs2_core::numeric::Float;
use scirs2_core::GpuDataType;
use std::fmt::Debug;

/// Configuration for launching GPU kernels.
#[derive(Debug, Clone)]
pub struct GpuKernelConfig {
    /// Workgroup (thread block) dimensions.
    pub workgroup_size: [u32; 3],
    /// Compute unit count hint (0 = unspecified).
    pub compute_units: u32,
    /// Whether vectorized memory accesses are enabled.
    pub vectorization: bool,
    /// Memory access strategy used by the kernel.
    pub memory_strategy: MemoryStrategy,
}

impl Default for GpuKernelConfig {
    fn default() -> Self {
        Self {
            workgroup_size: [256, 1, 1],
            compute_units: 0,
            vectorization: true,
            memory_strategy: MemoryStrategy::Coalesced,
        }
    }
}

/// Memory access strategies available to the GPU kernels.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryStrategy {
    Standard,
    Coalesced,
    SharedMemory,
    TextureMemory,
}

/// Execute a CSR sparse matrix-vector multiply (`y = A * x`) on the selected
/// GPU backend, falling back to a CPU implementation where no GPU path exists.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );

    match device.backend() {
        GpuBackend::Cuda => execute_cuda_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::OpenCL => execute_opencl_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Metal => execute_metal_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Cpu => execute_cpu_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
        // No specialized ROCm/WebGPU path yet; use the CPU fallback.
        GpuBackend::Rocm | GpuBackend::Wgpu => execute_cpu_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
    }
}

/// Execute a symmetric CSR sparse matrix-vector multiply (`y = A * x`) on the
/// selected GPU backend, falling back to a CPU implementation where no GPU
/// path exists.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );

    match device.backend() {
        GpuBackend::Cuda => execute_cuda_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::OpenCL => execute_opencl_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Metal => execute_metal_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Cpu => execute_cpu_symmetric_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
        // No specialized ROCm/WebGPU path yet; use the CPU fallback.
        GpuBackend::Rocm | GpuBackend::Wgpu => execute_cpu_symmetric_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
    }
}

/// Calculate global and local work sizes tuned to the backend's preferred
/// execution width (warps, wavefronts, or SIMD groups).
#[allow(dead_code)]
fn calculate_optimal_dimensions(
    backend: GpuBackend,
    problem_size: usize,
    workgroup_size: [u32; 3],
    _compute_units: u32,
) -> (Vec<usize>, Vec<usize>) {
    let optimal_workgroup = match backend {
        GpuBackend::Cuda => {
            // Align to the 32-thread warp, capped at 1024 threads per block.
            let warp_aligned = workgroup_size[0].div_ceil(32) * 32;
            [warp_aligned.min(1024), 1, 1]
        }
        GpuBackend::OpenCL => [
            workgroup_size[0].min(256),
            workgroup_size[1],
            workgroup_size[2],
        ],
        GpuBackend::Metal => {
            // Align to the 32-thread SIMD group.
            let simd_aligned = workgroup_size[0].div_ceil(32) * 32;
            [simd_aligned.min(1024), 1, 1]
        }
        GpuBackend::Cpu => workgroup_size,
        GpuBackend::Rocm => {
            // Align to the 64-thread wavefront.
            let wave_aligned = workgroup_size[0].div_ceil(64) * 64;
            [wave_aligned.min(1024), 1, 1]
        }
        GpuBackend::Wgpu => [workgroup_size[0].min(256), 1, 1],
    };

    let global_size =
        vec![problem_size.div_ceil(optimal_workgroup[0] as usize) * optimal_workgroup[0] as usize];
    let local_size = vec![optimal_workgroup[0] as usize];

    (global_size, local_size)
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let _warp_size = 32;
    let block_size = local_size[0].min(1024);
    let grid_size = rows.div_ceil(block_size);

    let shared_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * block_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };

    let cuda_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(block_size as u32) as Box<dyn std::any::Any>,
        Box::new(shared_memory_size as u32) as Box<dyn std::any::Any>,
    ];

    let cuda_global_size = &[grid_size, 1, 1];
    let cuda_local_size = &[block_size, 1, 1];

    // The actual CUDA kernel launch is not wired up yet.
    Err(GpuError::BackendNotImplemented(GpuBackend::Cuda))
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let optimal_local_size = local_size[0].min(256);
    let aligned_global_size = rows.div_ceil(optimal_local_size) * optimal_local_size;

    let local_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * optimal_local_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };

    let opencl_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(local_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(config.vectorization as u32) as Box<dyn std::any::Any>,
    ];

    let opencl_global_size = &[aligned_global_size, 1, 1];
    let opencl_local_size = &[optimal_local_size, 1, 1];

    // The actual OpenCL kernel launch is not wired up yet.
    Err(GpuError::BackendNotImplemented(GpuBackend::OpenCL))
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    // Align the threadgroup size to Metal's 32-thread SIMD groups.
    let simdgroup_size = 32;
    let optimal_threadgroup_size = local_size[0].min(1024);
    let aligned_threadgroup_size =
        optimal_threadgroup_size.div_ceil(simdgroup_size) * simdgroup_size;

    let num_threadgroups = rows.div_ceil(aligned_threadgroup_size);

    let threadgroup_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * aligned_threadgroup_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };

    let metal_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(aligned_threadgroup_size as u32) as Box<dyn std::any::Any>,
        Box::new(threadgroup_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(simdgroup_size as u32) as Box<dyn std::any::Any>,
    ];

    let metal_global_size = &[num_threadgroups * aligned_threadgroup_size, 1, 1];
    let metal_local_size = &[aligned_threadgroup_size, 1, 1];

    // The actual Metal kernel launch is not wired up yet.
    Err(GpuError::BackendNotImplemented(GpuBackend::Metal))
}

/// CPU fallback for CSR SpMV: computes `y = A * x` on host-side copies of the
/// buffers.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;

    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::zero();

        for j in start..end {
            sum = sum + data[j] * x[indices[j] as usize];
        }

        y[i] = sum;
    }

    Ok(())
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    Err(GpuError::BackendNotImplemented(GpuBackend::Cuda))
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    Err(GpuError::BackendNotImplemented(GpuBackend::OpenCL))
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    Err(GpuError::BackendNotImplemented(GpuBackend::Metal))
}

/// CPU fallback for symmetric CSR SpMV on host-side copies of the buffers.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_symmetric_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;

    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::zero();

        for j in start..end {
            let col_idx = indices[j] as usize;
            // Diagonal and off-diagonal entries are currently handled
            // identically; a half-stored symmetric format would additionally
            // need the transposed contribution.
            sum = sum + data[j] * x[col_idx];
        }

        y[i] = sum;
    }

    Ok(())
}

/// Execute a sparse triangular solve on the selected backend. Only the CPU
/// fallback is implemented at present; GPU backends report
/// `BackendNotImplemented`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    // Placeholder launch dimensions per backend; unused until the GPU paths
    // are implemented.
    let (_global_size, _local_size) = match device.backend() {
        GpuBackend::Cuda => ([32], [32]),
        GpuBackend::OpenCL => ([64], [64]),
        GpuBackend::Metal => ([32], [32]),
        GpuBackend::Cpu => ([1], [1]),
        GpuBackend::Rocm | GpuBackend::Wgpu => ([64], [64]),
    };

    match device.backend() {
        GpuBackend::Cpu => execute_cpu_triangular_solve_fallback(
            n,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            b_buffer,
            x_buffer,
        ),
        _ => Err(GpuError::BackendNotImplemented(device.backend())),
    }
}

/// CPU fallback for a sparse triangular solve using forward substitution.
/// Entries strictly above the diagonal are ignored, so the matrix is treated
/// as lower triangular.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_triangular_solve_fallback<T>(
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let b = b_buffer.to_host()?;
    let mut x = x_buffer.to_host()?;

    for i in 0..n {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::zero();
        let mut diag_val = T::zero();

        for j in start..end {
            let col_idx = indices[j] as usize;
            let val = data[j];

            match col_idx.cmp(&i) {
                std::cmp::Ordering::Equal => {
                    diag_val = val;
                }
                std::cmp::Ordering::Less => {
                    sum = sum + val * x[col_idx];
                }
                // Entries above the diagonal are ignored.
                std::cmp::Ordering::Greater => {}
            }
        }

        if diag_val != T::zero() {
            x[i] = (b[i] - sum) / diag_val;
        } else {
            #[cfg(feature = "gpu")]
            return Err(GpuError::InvalidParameter(
                "Singular matrix in triangular solve".to_string(),
            ));
            #[cfg(not(feature = "gpu"))]
            return Err(GpuError::invalid_parameter(
                "Singular matrix in triangular solve".to_string(),
            ));
        }
    }

    Ok(())
}

/// Manages GPU buffer allocation, pooling, and host-to-device transfers.
pub struct GpuMemoryManager {
    device: GpuDevice,
    #[cfg(feature = "gpu")]
    context: Option<GpuContext>,
    buffer_pool: std::collections::HashMap<(usize, std::any::TypeId), Vec<Box<dyn std::any::Any>>>,
    memory_stats: GpuMemoryStats,
    transfer_queue: std::collections::VecDeque<TransferRequest>,
    alignment_preference: usize,
    max_pool_size: usize,
}

/// Running statistics about GPU memory usage and transfers.
#[derive(Debug, Clone, Default)]
pub struct GpuMemoryStats {
    pub total_allocated: usize,
    pub peak_usage: usize,
    pub allocation_count: u64,
    pub pool_hits: u64,
    pub pool_misses: u64,
    pub avg_transfer_bandwidth: f64,
}

#[derive(Debug)]
struct TransferRequest {
    size: usize,
    priority: TransferPriority,
    timestamp: std::time::Instant,
}

/// Priority of a queued host-to-device transfer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TransferPriority {
    Low,
    Normal,
    High,
    Critical,
}

/// Requested memory layout for a GPU buffer.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryLayout {
    Standard,
    Coalesced,
    Strided { stride: usize },
}

impl GpuMemoryManager {
    pub fn new(backend: GpuBackend) -> Result<Self, GpuError> {
        #[cfg(feature = "gpu")]
        let device = GpuDevice::new(backend, 0);
        #[cfg(not(feature = "gpu"))]
        let device = GpuDevice::new(backend)?;

        // Preferred allocation alignment (in bytes) per backend.
        let alignment_preference = match backend {
            GpuBackend::Cuda => 128,
            GpuBackend::OpenCL => 64,
            GpuBackend::Metal => 16,
            GpuBackend::Cpu => 8,
            GpuBackend::Rocm => 64,
            GpuBackend::Wgpu => 32,
        };

        #[cfg(feature = "gpu")]
        let context = if device.backend() != GpuBackend::Cpu {
            GpuContext::new(device.backend()).ok()
        } else {
            None
        };

        Ok(Self {
            device,
            #[cfg(feature = "gpu")]
            context,
            buffer_pool: std::collections::HashMap::new(),
            memory_stats: GpuMemoryStats::default(),
            transfer_queue: std::collections::VecDeque::new(),
            alignment_preference,
            max_pool_size: 20,
        })
    }

    /// Get a buffer of at least `size` elements, reusing a pooled buffer when
    /// one of matching size and type is available.
    pub fn get_buffer<T>(&mut self, size: usize) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Default + 'static,
    {
        let aligned_size =
            self.align_size(size * std::mem::size_of::<T>()) / std::mem::size_of::<T>();
        let key = (aligned_size, std::any::TypeId::of::<T>());

        // Try to reuse a pooled buffer first.
        if let Some(pool) = self.buffer_pool.get_mut(&key) {
            if let Some(buffer) = pool.pop() {
                if let Ok(typed_buffer) = buffer.downcast::<GpuBuffer<T>>() {
                    self.memory_stats.pool_hits += 1;
                    return Ok(*typed_buffer);
                }
            }
        }

        self.memory_stats.pool_misses += 1;
        self.memory_stats.allocation_count += 1;

        #[cfg(feature = "gpu")]
        let buffer = if let Some(ref context) = self.context {
            let buffer = context.create_buffer::<T>(aligned_size);
            // Zero-initialize the freshly allocated buffer.
            let zeros = vec![T::default(); aligned_size];
            buffer.copy_from_host(&zeros)?;
            buffer
        } else {
            return Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ));
        };

        #[cfg(not(feature = "gpu"))]
        let buffer = GpuBuffer::from_vec(vec![T::default(); aligned_size]);

        let allocation_size = aligned_size * std::mem::size_of::<T>();
        self.memory_stats.total_allocated += allocation_size;
        if self.memory_stats.total_allocated > self.memory_stats.peak_usage {
            self.memory_stats.peak_usage = self.memory_stats.total_allocated;
        }

        Ok(buffer)
    }

    /// Get a buffer laid out according to `layout`.
    pub fn get_buffer_with_layout<T>(
        &mut self,
        size: usize,
        layout: MemoryLayout,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Default + 'static,
    {
        match layout {
            MemoryLayout::Coalesced => {
                let coalesced_size = self.calculate_coalesced_size(size);
                self.get_buffer::<T>(coalesced_size)
            }
            MemoryLayout::Strided { stride } => {
                // Pad the allocation up to a whole number of strides.
                let strided_size = if stride > 0 {
                    size.div_ceil(stride) * stride
                } else {
                    size
                };
                self.get_buffer::<T>(strided_size)
            }
            MemoryLayout::Standard => self.get_buffer::<T>(size),
        }
    }

    /// Return a buffer to the pool so it can be reused by later allocations.
    pub fn return_buffer<T>(&mut self, buffer: GpuBuffer<T>)
    where
        T: GpuDataType + 'static,
    {
        let size = buffer.len();
        let allocation_size = size * std::mem::size_of::<T>();
        let key = (size, std::any::TypeId::of::<T>());

        let pool = self.buffer_pool.entry(key).or_default();

        // Only pool buffers larger than 1 KiB, and cap the per-key pool size.
        if pool.len() < self.max_pool_size && allocation_size > 1024 {
            pool.push(Box::new(buffer));
        }

        self.memory_stats.total_allocated = self
            .memory_stats
            .total_allocated
            .saturating_sub(allocation_size);

        self.cleanup_old_buffers_if_needed();
    }

    /// Shrink the pools when current usage approaches the historical peak.
    fn cleanup_old_buffers_if_needed(&mut self) {
        if self.memory_stats.total_allocated > self.memory_stats.peak_usage * 3 / 4 {
            for (_, pool) in self.buffer_pool.iter_mut() {
                // Drop the older half of each pool.
                let remove_count = pool.len() / 2;
                pool.drain(0..remove_count);
            }
        }
    }

    /// Transfer host data to the device, choosing a backend-specific path and
    /// recording the achieved bandwidth.
    pub fn transfer_data_optimized<T>(
        &mut self,
        host_data: &[T],
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        let transfer_size = std::mem::size_of_val(host_data);
        let start_time = std::time::Instant::now();

        let buffer = match self.device.backend() {
            #[cfg(feature = "gpu")]
            GpuBackend::Cuda => {
                self.transfer_data_cuda_optimized(host_data, transfer_size, _priority)
            }
            #[cfg(feature = "gpu")]
            GpuBackend::OpenCL => {
                self.transfer_data_opencl_optimized(host_data, transfer_size, _priority)
            }
            #[cfg(feature = "gpu")]
            GpuBackend::Metal => {
                self.transfer_data_metal_optimized(host_data, transfer_size, _priority)
            }
            _ => {
                #[cfg(feature = "gpu")]
                {
                    if let Some(ref context) = self.context {
                        let buffer = context.create_buffer_from_slice(host_data);
                        Ok(buffer)
                    } else {
                        Err(GpuError::BackendNotAvailable(
                            "No GPU context available".to_string(),
                        ))
                    }
                }
                #[cfg(not(feature = "gpu"))]
                Ok(GpuBuffer::from_vec(host_data.to_vec()))
            }
        }?;

        // Track the effective host-to-device bandwidth.
        let elapsed = start_time.elapsed().as_secs_f64();
        if elapsed > 0.0 {
            let bandwidth = transfer_size as f64 / elapsed;
            self.update_bandwidth_stats(bandwidth);
        }

        Ok(buffer)
    }

    #[cfg(feature = "gpu")]
    fn transfer_data_cuda_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        if let Some(ref context) = self.context {
            // All size/priority classes currently share the same transfer path;
            // the branches are kept as hooks for future specialization.
            if matches!(
                priority,
                TransferPriority::High | TransferPriority::Critical
            ) && transfer_size > 4 * 1024 * 1024
            {
                Ok(context.create_buffer_from_slice(host_data))
            } else if transfer_size > 64 * 1024 {
                Ok(context.create_buffer_from_slice(host_data))
            } else {
                Ok(context.create_buffer_from_slice(host_data))
            }
        } else {
            Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ))
        }
    }

    #[cfg(feature = "gpu")]
    fn transfer_data_opencl_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        if let Some(ref context) = self.context {
            // Large and small transfers currently share the same path.
            if transfer_size > 1024 * 1024 {
                Ok(context.create_buffer_from_slice(host_data))
            } else {
                Ok(context.create_buffer_from_slice(host_data))
            }
        } else {
            Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ))
        }
    }

    #[cfg(feature = "gpu")]
    fn transfer_data_metal_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        if let Some(ref context) = self.context {
            // Large and small transfers currently share the same path.
            if transfer_size > 2 * 1024 * 1024 {
                Ok(context.create_buffer_from_slice(host_data))
            } else {
                Ok(context.create_buffer_from_slice(host_data))
            }
        } else {
            Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ))
        }
    }

    /// Run a group of GPU operations as a batch and flush pending transfers
    /// afterwards. Batches taking longer than 100 ms emit a warning.
    pub fn batch_operations<F, R>(&mut self, operations: F) -> Result<R, GpuError>
    where
        F: FnOnce(&mut Self) -> Result<R, GpuError>,
    {
        let batch_start = std::time::Instant::now();

        let result = operations(self)?;

        self.process_pending_transfers()?;

        let batch_duration = batch_start.elapsed();
        if batch_duration.as_millis() > 100 {
            eprintln!(
                "Warning: GPU batch operation took {batch_duration_ms}ms",
                batch_duration_ms = batch_duration.as_millis()
            );
        }

        Ok(result)
    }

    /// Drain the transfer queue in priority order; lower-priority requests are
    /// re-queued for a later batch.
    fn process_pending_transfers(&mut self) -> Result<(), GpuError> {
        let mut transfers: Vec<_> = self.transfer_queue.drain(..).collect();
        transfers.sort_by_key(|t| (t.priority, t.timestamp));

        for transfer in transfers {
            // High and critical priority transfers are handled in this batch;
            // the rest go back into the queue.
            if transfer.priority < TransferPriority::High {
                self.transfer_queue.push_back(transfer);
            }
        }

        Ok(())
    }

    /// Update the exponential moving average of the observed transfer bandwidth.
    fn update_bandwidth_stats(&mut self, bandwidth: f64) {
        // Smoothing factor for the exponential moving average.
        let alpha = 0.1;
        if self.memory_stats.avg_transfer_bandwidth == 0.0 {
            self.memory_stats.avg_transfer_bandwidth = bandwidth;
        } else {
            self.memory_stats.avg_transfer_bandwidth =
                alpha * bandwidth + (1.0 - alpha) * self.memory_stats.avg_transfer_bandwidth;
        }
    }

    /// Current memory usage statistics.
    pub fn get_memory_stats(&self) -> &GpuMemoryStats {
        &self.memory_stats
    }

    /// Fraction of buffer requests that were served from the pool.
    pub fn get_pool_efficiency(&self) -> f64 {
        let total_requests = self.memory_stats.pool_hits + self.memory_stats.pool_misses;
        if total_requests == 0 {
            0.0
        } else {
            self.memory_stats.pool_hits as f64 / total_requests as f64
        }
    }

    /// Round `size` up to the backend's preferred alignment (a power of two).
    fn align_size(&self, size: usize) -> usize {
        (size + self.alignment_preference - 1) & !(self.alignment_preference - 1)
    }

    /// Round an element count up so that accesses can be coalesced on the
    /// current backend.
    fn calculate_coalesced_size(&self, size: usize) -> usize {
        match self.device.backend() {
            GpuBackend::Cuda => {
                let alignment = 128 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::OpenCL => {
                let alignment = 64 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Metal => {
                let alignment = 16 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Cpu => size,
            GpuBackend::Rocm => {
                let alignment = 64 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Wgpu => {
                let alignment = 32 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
        }
    }
}

/// Prefetch matrix data to the GPU with a transfer priority derived from the
/// expected access pattern.
#[allow(dead_code)]
pub fn prefetch_matrix_data<T>(
    memory_manager: &mut GpuMemoryManager,
    matrix_data: &[T],
    access_pattern: AccessPattern,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    let priority = match access_pattern {
        AccessPattern::Sequential => TransferPriority::Normal,
        AccessPattern::Random => TransferPriority::High,
        AccessPattern::Strided { .. } => TransferPriority::High,
        AccessPattern::Blocked => TransferPriority::Normal,
    };

    memory_manager.transfer_data_optimized(matrix_data, priority)
}

/// Expected access pattern for matrix data on the device.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
    Sequential,
    Random,
    Strided { stride: usize },
    Blocked,
}

/// Pick a memory strategy based on backend, data size, and access pattern.
#[allow(dead_code)]
pub fn optimize_memory_bandwidth(
    backend: GpuBackend,
    data_size: usize,
    access_pattern: AccessPattern,
) -> MemoryStrategy {
    match (backend, access_pattern, data_size) {
        (GpuBackend::Cuda, AccessPattern::Sequential, size) if size > 1024 * 1024 => {
            MemoryStrategy::Coalesced
        }
        (GpuBackend::Cuda, AccessPattern::Random, _) => MemoryStrategy::TextureMemory,
        (GpuBackend::OpenCL, AccessPattern::Blocked, _) => MemoryStrategy::SharedMemory,
        (GpuBackend::Metal, _, size) if size > 512 * 1024 => MemoryStrategy::SharedMemory,
        _ => MemoryStrategy::Standard,
    }
}

/// Derive a kernel configuration from the matrix shape (rows and non-zeros)
/// and the amount of available device memory.
#[allow(dead_code)]
pub fn calculate_adaptive_workgroup_size(
    backend: GpuBackend,
    matrix_rows: usize,
    matrix_nnz: usize,
    available_memory: usize,
) -> GpuKernelConfig {
    let avg_nnz_per_row = if matrix_rows > 0 {
        matrix_nnz / matrix_rows
    } else {
        0
    };

    // Denser rows benefit from larger workgroups.
    let workgroup_size = match backend {
        GpuBackend::Cuda => {
            if avg_nnz_per_row < 10 {
                [128, 1, 1]
            } else if avg_nnz_per_row < 50 {
                [256, 1, 1]
            } else {
                [512, 1, 1]
            }
        }
        GpuBackend::OpenCL => {
            if avg_nnz_per_row < 20 {
                [64, 1, 1]
            } else {
                [128, 1, 1]
            }
        }
        GpuBackend::Metal => {
            if avg_nnz_per_row < 15 {
                [128, 1, 1]
            } else {
                [256, 1, 1]
            }
        }
        GpuBackend::Cpu => [1, 1, 1],
        GpuBackend::Rocm => {
            if avg_nnz_per_row < 10 {
                [64, 1, 1]
            } else if avg_nnz_per_row < 50 {
                [128, 1, 1]
            } else {
                [256, 1, 1]
            }
        }
        GpuBackend::Wgpu => {
            if avg_nnz_per_row < 20 {
                [32, 1, 1]
            } else {
                [64, 1, 1]
            }
        }
    };

    // Use shared memory only when plenty of device memory is available.
    let memory_strategy = if available_memory > 512 * 1024 * 1024 {
        MemoryStrategy::SharedMemory
    } else {
        MemoryStrategy::Coalesced
    };

    GpuKernelConfig {
        workgroup_size,
        compute_units: 0,
        vectorization: avg_nnz_per_row > 4,
        memory_strategy,
    }
}

/// CPU-only implementation of the SpMV entry point, used when the `gpu`
/// feature is disabled.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_spmv_fallback(
        rows,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        x_buffer,
        y_buffer,
    )
}

/// CPU-only implementation of the symmetric SpMV entry point, used when the
/// `gpu` feature is disabled.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_symmetric_spmv_fallback(
        rows,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        x_buffer,
        y_buffer,
    )
}

/// CPU-only implementation of the triangular solve entry point, used when the
/// `gpu` feature is disabled.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_triangular_solve_fallback(
        n,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        b_buffer,
        x_buffer,
    )
}

/// Collects per-operation timing data and produces tuning recommendations.
pub struct GpuPerformanceProfiler {
    backend: GpuBackend,
    timing_data: std::collections::HashMap<String, Vec<f64>>,
}

impl GpuPerformanceProfiler {
    pub fn new(backend: GpuBackend) -> Self {
        Self {
            backend,
            timing_data: std::collections::HashMap::new(),
        }
    }

    /// Time a single operation and record its duration in milliseconds.
    pub fn profile_operation<F, R>(
        &mut self,
        operationname: &str,
        operation: F,
    ) -> Result<R, GpuError>
    where
        F: FnOnce() -> Result<R, GpuError>,
    {
        let start_time = std::time::Instant::now();
        let result = operation()?;
        let elapsed = start_time.elapsed().as_secs_f64() * 1000.0;

        let timings = self
            .timing_data
            .entry(operationname.to_string())
            .or_default();
        timings.push(elapsed);

        // Keep only the most recent 100 samples per operation.
        if timings.len() > 100 {
            timings.remove(0);
        }

        Ok(result)
    }

    /// Average recorded time (in milliseconds) for the given operation.
    pub fn get_average_time(&self, operationname: &str) -> Option<f64> {
        if let Some(timings) = self.timing_data.get(operationname) {
            if !timings.is_empty() {
                Some(timings.iter().sum::<f64>() / timings.len() as f64)
            } else {
                None
            }
        } else {
            None
        }
    }

1474
1475 pub fn get_recommendations(&self) -> Vec<String> {
1477 let mut recommendations = Vec::new();
1478
1479 for (operation, timings) in &self.timing_data {
1480 if let Some(avg_time) = self.get_average_time(operation) {
1481 let variance = if timings.len() > 1 {
1483 let mean = avg_time;
1484 let var = timings.iter().map(|&t| (t - mean).powi(2)).sum::<f64>()
1485 / timings.len() as f64;
1486 var.sqrt()
1487 } else {
1488 0.0
1489 };
1490
1491 match self.backend {
1493 GpuBackend::Cuda => {
1494 if avg_time > 10.0 && operation.contains("spmv") {
1495 recommendations.push(format!(
1496 "Consider using larger workgroup sizes for {operation} (current avg: {avg_time:.2}ms, variance: {variance:.2})"
1497 ));
1498 }
1499 if variance > avg_time * 0.5 {
1500 recommendations.push(format!(
1501 "High timing variance for {operation} suggests memory bandwidth bottleneck"
1502 ));
1503 }
1504 }
1505 GpuBackend::OpenCL => {
1506 if avg_time > 15.0 {
1507 recommendations.push(format!(
1508 "OpenCL performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
1509 ));
1510 }
1511 if variance > 5.0 {
1512 recommendations.push(format!(
1513 "Consider using local memory optimization for {operation} to reduce timing variance"
1514 ));
1515 }
1516 }
1517 GpuBackend::Metal => {
1518 if avg_time > 8.0 && operation.contains("triangular") {
1519 recommendations.push(format!(
1520 "Metal triangular solve {operation} may benefit from simdgroup optimization (current avg: {avg_time:.2}ms)"
1521 ));
1522 }
1523 if operation.contains("spmv") && avg_time > 5.0 {
1524 recommendations.push(format!(
1525 "Consider using Metal's unified memory architecture for {operation} optimization"
1526 ));
1527 }
1528 }
1529 GpuBackend::Cpu => {
1530 if avg_time > 50.0 {
1531 recommendations.push(format!(
1532 "Consider enabling GPU acceleration for {operation} (CPU avg: {avg_time:.2}ms)"
1533 ));
1534 }
1535 if variance > 20.0 {
1536 recommendations.push(format!(
1537 "High CPU timing variance for {operation} suggests CPU scheduling issues"
1538 ));
1539 }
1540 }
1541 GpuBackend::Rocm => {
1542 if avg_time > 12.0 {
1543 recommendations.push(format!(
1544 "ROCm performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
1545 ));
1546 }
1547 }
1548 GpuBackend::Wgpu => {
1549 if avg_time > 15.0 {
1550 recommendations.push(format!(
1551 "WebGPU performance for {operation} could be improved with buffer optimization (current avg: {avg_time:.2}ms)"
1552 ));
1553 }
1554 }
1555 }
1556
1557 if avg_time > 100.0 {
1559 recommendations.push(format!(
1560 "Operation {operation} is taking very long ({avg_time:.2}ms) - consider algorithm optimization"
1561 ));
1562 }
1563 }
1564 }
1565
1566 recommendations
1567 }
1568
    /// Detailed metrics for a single profiled operation.
    pub fn get_operation_metrics(&self, operationname: &str) -> Option<OperationMetrics> {
        if let Some(timings) = self.timing_data.get(operationname) {
            if timings.is_empty() {
                return None;
            }

            let count = timings.len();
            let total_time: f64 = timings.iter().sum();
            let avg_time = total_time / count as f64;
            let min_time = timings.iter().copied().fold(f64::INFINITY, f64::min);
            let max_time = timings.iter().copied().fold(f64::NEG_INFINITY, f64::max);

            let variance = if count > 1 {
                timings.iter().map(|&t| (t - avg_time).powi(2)).sum::<f64>() / count as f64
            } else {
                0.0
            };

            Some(OperationMetrics {
                operationname: operationname.to_string(),
                call_count: count,
                total_time,
                avg_time,
                min_time,
                max_time,
                variance: variance.sqrt(),
                // Operations per second, assuming timings are in milliseconds.
                throughput: 1000.0 / avg_time,
            })
        } else {
            None
        }
    }

    /// Clear all recorded timing data.
    pub fn reset_metrics(&mut self) {
        self.timing_data.clear();
    }

    /// Export metrics for every profiled operation.
    pub fn export_metrics(&self) -> Vec<OperationMetrics> {
        self.timing_data
            .keys()
            .filter_map(|op| self.get_operation_metrics(op))
            .collect()
    }
}

/// Aggregated timing metrics for one profiled operation.
#[derive(Debug, Clone)]
pub struct OperationMetrics {
    pub operationname: String,
    pub call_count: usize,
    pub total_time: f64,
    pub avg_time: f64,
    pub min_time: f64,
    pub max_time: f64,
    pub variance: f64,
    pub throughput: f64,
}

#[cfg(test)]
mod tests {
    use super::*;

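    // Additional smoke test (a sketch not present in the original suite):
    // exercises the GpuMemoryManager statistics on the CPU backend.
    // Construction is wrapped in `if let Ok(..)` because backend availability
    // depends on how the crate is built.
    #[test]
    fn test_memory_manager_initial_stats() {
        if let Ok(manager) = GpuMemoryManager::new(GpuBackend::Cpu) {
            let stats = manager.get_memory_stats();
            assert_eq!(stats.allocation_count, 0);
            assert_eq!(stats.total_allocated, 0);
            // No requests yet, so pool efficiency is reported as 0.0.
            assert_eq!(manager.get_pool_efficiency(), 0.0);
        }
    }
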
    #[test]
    fn test_calculate_optimal_dimensions() {
        let (global, local) =
            calculate_optimal_dimensions(GpuBackend::Cuda, 1000, [256, 1, 1], 0);

        assert_eq!(local[0], 256);
        assert!(global[0] >= 1000);
        assert_eq!(global[0] % local[0], 0);
    }

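    // Additional test (a sketch not present in the original suite): checks
    // that CUDA dimensions are rounded up to a multiple of the 32-thread warp
    // when the requested workgroup size is not warp-aligned.
    #[test]
    fn test_calculate_optimal_dimensions_warp_alignment() {
        let (global, local) =
            calculate_optimal_dimensions(GpuBackend::Cuda, 1000, [100, 1, 1], 0);

        // 100 is rounded up to the next warp multiple, 128.
        assert_eq!(local[0], 128);
        assert_eq!(local[0] % 32, 0);
        assert!(global[0] >= 1000);
        assert_eq!(global[0] % local[0], 0);
    }
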
    #[test]
    fn test_adaptive_workgroup_config() {
        let config = calculate_adaptive_workgroup_size(
            GpuBackend::Cuda,
            10000,
            50000,
            1024 * 1024 * 1024,
        );

        assert!(config.workgroup_size[0] > 0);
        assert!(config.vectorization);
        assert_eq!(config.memory_strategy, MemoryStrategy::SharedMemory);
    }

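    // Additional test (not in the original suite): spot-checks the strategy
    // selection in `optimize_memory_bandwidth` for a few backend/pattern/size
    // combinations taken directly from its match arms.
    #[test]
    fn test_optimize_memory_bandwidth_selection() {
        // Large sequential CUDA transfers prefer coalesced access.
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cuda, 2 * 1024 * 1024, AccessPattern::Sequential),
            MemoryStrategy::Coalesced
        );
        // Random CUDA access maps to texture memory.
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cuda, 1024, AccessPattern::Random),
            MemoryStrategy::TextureMemory
        );
        // Anything not covered by a specific arm falls back to Standard.
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cpu, 1024, AccessPattern::Sequential),
            MemoryStrategy::Standard
        );
    }
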
    #[test]
    fn test_gpu_kernel_config_default() {
        let config = GpuKernelConfig::default();
        assert_eq!(config.workgroup_size, [256, 1, 1]);
        assert_eq!(config.compute_units, 0);
        assert!(config.vectorization);
        assert_eq!(config.memory_strategy, MemoryStrategy::Coalesced);
    }
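
    // Additional test (not in the original suite): exercises the
    // GpuPerformanceProfiler bookkeeping with a trivial closure standing in
    // for a real GPU operation.
    #[test]
    fn test_performance_profiler_records_timings() {
        let mut profiler = GpuPerformanceProfiler::new(GpuBackend::Cpu);

        let value = profiler
            .profile_operation("dummy_op", || Ok::<_, GpuError>(42))
            .ok();
        assert_eq!(value, Some(42));

        // One call has been recorded for the operation.
        assert!(profiler.get_average_time("dummy_op").is_some());
        let metrics = profiler.get_operation_metrics("dummy_op").unwrap();
        assert_eq!(metrics.call_count, 1);

        profiler.reset_metrics();
        assert!(profiler.get_average_time("dummy_op").is_none());
    }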
}