#![allow(dead_code)]

#[allow(unused_imports)]
use crate::gpu_ops::{
    GpuBackend, GpuBuffer, GpuBufferExt, GpuDataType, GpuDevice, GpuError, GpuKernelHandle,
};
use num_traits::Float;
use std::fmt::Debug;

/// Configuration for launching GPU kernels.
#[derive(Debug, Clone)]
pub struct GpuKernelConfig {
    /// Requested workgroup (thread block) size per dimension.
    pub workgroup_size: [u32; 3],
    /// Number of compute units to target; 0 leaves the choice to the backend.
    pub compute_units: u32,
    /// Whether vectorized memory accesses should be used.
    pub vectorization: bool,
    /// Memory access strategy for the kernel.
    pub memory_strategy: MemoryStrategy,
}

impl Default for GpuKernelConfig {
    fn default() -> Self {
        Self {
            workgroup_size: [256, 1, 1],
            compute_units: 0,
            vectorization: true,
            memory_strategy: MemoryStrategy::Coalesced,
        }
    }
}

/// Memory access strategies available to the GPU kernels.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryStrategy {
    /// Plain global-memory accesses.
    Standard,
    /// Coalesced global-memory accesses.
    Coalesced,
    /// Stage data through shared/local memory.
    SharedMemory,
    /// Read through texture memory.
    TextureMemory,
}

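/// Executes a CSR sparse matrix-vector product on the selected backend,
/// dispatching to a backend-specific launch path and falling back to a CPU
/// implementation where no native path exists.
///
/// A minimal calling sketch (illustrative only; it assumes a device, a
/// compiled SpMV kernel handle, and CSR/vector buffers created through that
/// device are already in hand):
///
/// ```ignore
/// let config = GpuKernelConfig::default();
/// execute_spmv_kernel(
///     &device,
///     &spmv_kernel,
///     rows,
///     &indptr_buf,
///     &indices_buf,
///     &data_buf,
///     &x_buf,
///     &y_buf,
///     &config,
/// )?;
/// ```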
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );

    match device.backend() {
        GpuBackend::Cuda => execute_cuda_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::OpenCL => execute_opencl_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Metal => execute_metal_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Cpu => execute_cpu_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
        GpuBackend::Rocm | GpuBackend::Wgpu => execute_cpu_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
    }
}

/// Executes a symmetric sparse matrix-vector product on the selected backend,
/// falling back to a CPU implementation where no native path exists.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );

    match device.backend() {
        GpuBackend::Cuda => execute_cuda_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::OpenCL => execute_opencl_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Metal => execute_metal_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Cpu => execute_cpu_symmetric_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
        GpuBackend::Rocm | GpuBackend::Wgpu => execute_cpu_symmetric_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
    }
}

/// Chooses global/local work sizes for a 1-D launch, aligning the workgroup
/// size to the backend's preferred execution width.
#[allow(dead_code)]
fn calculate_optimal_dimensions(
    backend: GpuBackend,
    problem_size: usize,
    workgroup_size: [u32; 3],
    _compute_units: u32,
) -> (Vec<usize>, Vec<usize>) {
    let optimal_workgroup = match backend {
        GpuBackend::Cuda => {
            // Align to the 32-thread warp size.
            let warp_aligned = workgroup_size[0].div_ceil(32) * 32;
            [warp_aligned.min(1024), 1, 1]
        }
        GpuBackend::OpenCL => [
            workgroup_size[0].min(256),
            workgroup_size[1],
            workgroup_size[2],
        ],
        GpuBackend::Metal => {
            // Align to the 32-lane SIMD-group size.
            let simd_aligned = workgroup_size[0].div_ceil(32) * 32;
            [simd_aligned.min(1024), 1, 1]
        }
        GpuBackend::Cpu => workgroup_size,
        GpuBackend::Rocm => {
            // Align to the 64-lane wavefront size.
            let wave_aligned = workgroup_size[0].div_ceil(64) * 64;
            [wave_aligned.min(1024), 1, 1]
        }
        GpuBackend::Wgpu => [workgroup_size[0].min(256), 1, 1],
    };

    let global_size =
        vec![problem_size.div_ceil(optimal_workgroup[0] as usize) * optimal_workgroup[0] as usize];
    let local_size = vec![optimal_workgroup[0] as usize];

    (global_size, local_size)
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let _warp_size = 32;
    let block_size = local_size[0].min(1024);
    let grid_size = rows.div_ceil(block_size);

    // Only the shared-memory strategy requests dynamic shared memory
    // (other strategies are assumed to need none).
    let shared_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * block_size,
        _ => 0,
    };

    let cuda_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(block_size as u32) as Box<dyn std::any::Any>,
        Box::new(shared_memory_size as u32) as Box<dyn std::any::Any>,
    ];

    let cuda_global_size = &[grid_size, 1, 1];
    let cuda_local_size = &[block_size, 1, 1];

    device.execute_kernel_with_args(kernel, cuda_global_size, cuda_local_size, cuda_args)?;

    // In debug builds, synchronize so kernel errors surface immediately.
    if cfg!(debug_assertions) {
        device.synchronize()?;
    }

    Ok(())
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let max_work_group_size = device.get_max_work_group_size().unwrap_or(256);
    let optimal_local_size = local_size[0].min(max_work_group_size);

    let aligned_global_size = rows.div_ceil(optimal_local_size) * optimal_local_size;

    // Only the shared-memory strategy requests local memory (assumed 0 otherwise).
    let local_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * optimal_local_size,
        _ => 0,
    };

    let opencl_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(local_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(config.vectorization as u32) as Box<dyn std::any::Any>,
    ];

    let opencl_global_size = &[aligned_global_size, 1, 1];
    let opencl_local_size = &[optimal_local_size, 1, 1];

    device.execute_kernel_with_args(kernel, opencl_global_size, opencl_local_size, opencl_args)?;

    device.finish_queue()?;

    Ok(())
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let simdgroup_size = 32;
    let max_threads_per_group = device.get_max_threads_per_threadgroup().unwrap_or(1024);
    let optimal_threadgroup_size = local_size[0].min(max_threads_per_group);

    // Round the threadgroup size up to a multiple of the SIMD-group width.
    let aligned_threadgroup_size =
        optimal_threadgroup_size.div_ceil(simdgroup_size) * simdgroup_size;

    let num_threadgroups = rows.div_ceil(aligned_threadgroup_size);

    // Only the shared-memory strategy requests threadgroup memory (assumed 0 otherwise).
    let threadgroup_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * aligned_threadgroup_size,
        _ => 0,
    };

    let metal_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(aligned_threadgroup_size as u32) as Box<dyn std::any::Any>,
        Box::new(threadgroup_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(simdgroup_size as u32) as Box<dyn std::any::Any>,
    ];

    let metal_global_size = &[num_threadgroups * aligned_threadgroup_size, 1, 1];
    let metal_local_size = &[aligned_threadgroup_size, 1, 1];

    device.execute_kernel_with_args(kernel, metal_global_size, metal_local_size, metal_args)?;

    device.commit_and_wait()?;

    Ok(())
}

#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;

    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::zero();

        for j in start..end {
            sum = sum + data[j] * x[indices[j] as usize];
        }

        y[i] = sum;
    }

    Ok(())
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    device.execute_kernel_with_args(
        kernel,
        global_size,
        local_size,
        &[
            Box::new(rows as u32) as Box<dyn std::any::Any>,
            Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
            Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        ],
    )
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    device.execute_kernel_with_args(
        kernel,
        global_size,
        local_size,
        &[
            Box::new(rows as u32) as Box<dyn std::any::Any>,
            Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
            Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        ],
    )
}

#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    device.execute_kernel_with_args(
        kernel,
        global_size,
        local_size,
        &[
            Box::new(rows as u32) as Box<dyn std::any::Any>,
            Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
            Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
            Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        ],
    )
}

#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_symmetric_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;

    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::zero();

        for j in start..end {
            let col_idx = indices[j] as usize;
            let val = data[j];

            // Diagonal and off-diagonal entries contribute identically here;
            // the stored entries are accumulated as a plain CSR product.
            sum = sum + val * x[col_idx];
        }

        y[i] = sum;
    }

    Ok(())
}

/// Executes a sparse triangular solve on the selected backend, falling back
/// to a sequential CPU implementation where no native path exists.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    // Small, backend-specific launch sizes; the substitution is largely sequential.
    let (global_size, local_size) = match device.backend() {
        GpuBackend::Cuda => ([32], [32]),
        GpuBackend::OpenCL => ([64], [64]),
        GpuBackend::Metal => ([32], [32]),
        GpuBackend::Cpu => ([1], [1]),
        GpuBackend::Rocm | GpuBackend::Wgpu => ([64], [64]),
    };

    match device.backend() {
        GpuBackend::Cpu => execute_cpu_triangular_solve_fallback(
            n,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            b_buffer,
            x_buffer,
        ),
        _ => device.execute_kernel_with_args(
            kernel,
            &global_size,
            &local_size,
            &[
                Box::new(n as u32) as Box<dyn std::any::Any>,
                Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
                Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
                Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
                Box::new(&raw const *b_buffer) as Box<dyn std::any::Any>,
                Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
            ],
        ),
    }
}

#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_triangular_solve_fallback<T>(
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let b = b_buffer.to_host()?;
    let mut x = x_buffer.to_host()?;

    // Forward substitution for a lower-triangular CSR matrix.
    for i in 0..n {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::zero();
        let mut diag_val = T::zero();

        for j in start..end {
            let col_idx = indices[j] as usize;
            let val = data[j];

            match col_idx.cmp(&i) {
                std::cmp::Ordering::Equal => {
                    diag_val = val;
                }
                std::cmp::Ordering::Less => {
                    sum = sum + val * x[col_idx];
                }
                std::cmp::Ordering::Greater => {}
            }
        }

        if diag_val != T::zero() {
            x[i] = (b[i] - sum) / diag_val;
        } else {
            return Err(GpuError::invalid_parameter(
                "Singular matrix in triangular solve".to_string(),
            ));
        }
    }

    Ok(())
}

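/// Pooled GPU memory manager with simple buffer reuse, transfer bookkeeping,
/// and backend-aware alignment.
///
/// A minimal usage sketch (illustrative only; it assumes a device for the
/// requested backend can be created at runtime and that `f32` implements
/// `GpuDataType`):
///
/// ```ignore
/// let mut manager = GpuMemoryManager::new(GpuBackend::Cpu)?;
/// let buffer = manager.get_buffer::<f32>(1024)?;
/// // ... use the buffer ...
/// manager.return_buffer(buffer);
/// println!("pool efficiency: {:.2}", manager.get_pool_efficiency());
/// ```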
pub struct GpuMemoryManager {
    device: GpuDevice,
    /// Reusable buffers keyed by (element count, element type).
    buffer_pool: std::collections::HashMap<(usize, std::any::TypeId), Vec<Box<dyn std::any::Any>>>,
    /// Running allocation and transfer statistics.
    memory_stats: GpuMemoryStats,
    /// Pending transfer requests awaiting batching.
    transfer_queue: std::collections::VecDeque<TransferRequest>,
    /// Preferred allocation alignment (bytes) for the active backend.
    alignment_preference: usize,
    /// Maximum number of pooled buffers kept per (size, type) key.
    max_pool_size: usize,
}

/// Statistics tracked by [`GpuMemoryManager`].
#[derive(Debug, Clone, Default)]
pub struct GpuMemoryStats {
    /// Bytes currently allocated through the manager.
    pub total_allocated: usize,
    /// Highest observed value of `total_allocated`.
    pub peak_usage: usize,
    /// Number of fresh device allocations performed.
    pub allocation_count: u64,
    /// Buffer requests served from the pool.
    pub pool_hits: u64,
    /// Buffer requests that required a new allocation.
    pub pool_misses: u64,
    /// Exponentially smoothed transfer bandwidth in bytes per second.
    pub avg_transfer_bandwidth: f64,
}

#[derive(Debug)]
struct TransferRequest {
    size: usize,
    priority: TransferPriority,
    timestamp: std::time::Instant,
}

/// Priority levels for host-to-device transfers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TransferPriority {
    Low,
    Normal,
    High,
    Critical,
}

/// Memory layouts that can be requested when allocating buffers.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryLayout {
    /// Contiguous layout with no padding.
    Standard,
    /// Size padded for coalesced access.
    Coalesced,
    /// Layout with an explicit stride.
    Strided { stride: usize },
}

impl GpuMemoryManager {
    /// Creates a manager for the default device of the given backend.
    pub fn new(backend: GpuBackend) -> Result<Self, GpuError> {
        let device = GpuDevice::get_default(backend)?;

        // Preferred allocation alignment in bytes for each backend.
        let alignment_preference = match backend {
            GpuBackend::Cuda => 128,
            GpuBackend::OpenCL => 64,
            GpuBackend::Metal => 16,
            GpuBackend::Cpu => 8,
            GpuBackend::Rocm => 64,
            GpuBackend::Wgpu => 32,
        };

        Ok(Self {
            device,
            buffer_pool: std::collections::HashMap::new(),
            memory_stats: GpuMemoryStats::default(),
            transfer_queue: std::collections::VecDeque::new(),
            alignment_preference,
            max_pool_size: 20,
        })
    }

    /// Returns a buffer with at least `size` elements, reusing a pooled buffer
    /// of matching size and type when one is available.
    pub fn get_buffer<T>(&mut self, size: usize) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Default + 'static,
    {
        let aligned_size =
            self.align_size(size * std::mem::size_of::<T>()) / std::mem::size_of::<T>();
        let key = (aligned_size, std::any::TypeId::of::<T>());

        if let Some(pool) = self.buffer_pool.get_mut(&key) {
            if let Some(buffer) = pool.pop() {
                if let Ok(typed_buffer) = buffer.downcast::<GpuBuffer<T>>() {
                    self.memory_stats.pool_hits += 1;
                    return Ok(*typed_buffer);
                }
            }
        }

        self.memory_stats.pool_misses += 1;
        self.memory_stats.allocation_count += 1;

        let buffer = self.device.create_buffer_zeros::<T>(aligned_size)?;

        let allocation_size = aligned_size * std::mem::size_of::<T>();
        self.memory_stats.total_allocated += allocation_size;
        if self.memory_stats.total_allocated > self.memory_stats.peak_usage {
            self.memory_stats.peak_usage = self.memory_stats.total_allocated;
        }

        Ok(buffer)
    }

    /// Returns a buffer sized for the requested memory layout.
    pub fn get_buffer_with_layout<T>(
        &mut self,
        size: usize,
        layout: MemoryLayout,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Default + 'static,
    {
        match layout {
            MemoryLayout::Coalesced => {
                let coalesced_size = self.calculate_coalesced_size(size);
                self.get_buffer::<T>(coalesced_size)
            }
            MemoryLayout::Strided { stride } => {
                let strided_size = size + (size % stride);
                self.get_buffer::<T>(strided_size)
            }
            MemoryLayout::Standard => self.get_buffer::<T>(size),
        }
    }

    /// Returns a buffer to the pool (small buffers are dropped rather than
    /// pooled) and updates the allocation statistics.
    pub fn return_buffer<T>(&mut self, buffer: GpuBuffer<T>)
    where
        T: GpuDataType + 'static,
    {
        let size = buffer.len();
        let allocation_size = size * std::mem::size_of::<T>();
        let key = (size, std::any::TypeId::of::<T>());

        let pool = self.buffer_pool.entry(key).or_default();

        if pool.len() < self.max_pool_size && allocation_size > 1024 {
            pool.push(Box::new(buffer));
        }

        self.memory_stats.total_allocated = self
            .memory_stats
            .total_allocated
            .saturating_sub(allocation_size);

        self.cleanup_old_buffers_if_needed();
    }

    /// Evicts half of each pool when allocation pressure approaches the peak.
    fn cleanup_old_buffers_if_needed(&mut self) {
        if self.memory_stats.total_allocated > self.memory_stats.peak_usage * 3 / 4 {
            for (_, pool) in self.buffer_pool.iter_mut() {
                let remove_count = pool.len() / 2;
                for _ in 0..remove_count {
                    pool.remove(0);
                }
            }
        }
    }

    /// Uploads `host_data` to the device, choosing a backend-specific transfer
    /// path and updating the running bandwidth estimate.
    pub fn transfer_data_optimized<T>(
        &mut self,
        host_data: &[T],
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        let transfer_size = std::mem::size_of_val(host_data);
        let start_time = std::time::Instant::now();

        let buffer = match self.device.backend() {
            #[cfg(feature = "gpu")]
            GpuBackend::Cuda => {
                self.transfer_data_cuda_optimized(host_data, transfer_size, _priority)
            }
            #[cfg(feature = "gpu")]
            GpuBackend::OpenCL => {
                self.transfer_data_opencl_optimized(host_data, transfer_size, _priority)
            }
            #[cfg(feature = "gpu")]
            GpuBackend::Metal => {
                self.transfer_data_metal_optimized(host_data, transfer_size, _priority)
            }
            _ => self.device.create_buffer(host_data),
        }?;

        let elapsed = start_time.elapsed().as_secs_f64();
        if elapsed > 0.0 {
            let bandwidth = transfer_size as f64 / elapsed;
            self.update_bandwidth_stats(bandwidth);
        }

        Ok(buffer)
    }

    #[cfg(feature = "gpu")]
    fn transfer_data_cuda_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        // All size/priority tiers currently use the same plain copy path.
        match (transfer_size, _priority) {
            (size, TransferPriority::High | TransferPriority::Critical)
                if size > 4 * 1024 * 1024 =>
            {
                self.device.create_buffer(host_data)
            }
            (size, _) if size > 64 * 1024 => self.device.create_buffer(host_data),
            _ => self.device.create_buffer(host_data),
        }
    }

    #[cfg(feature = "gpu")]
    fn transfer_data_opencl_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        // Large and small transfers currently use the same plain copy path.
        if transfer_size > 1024 * 1024 {
            self.device.create_buffer(host_data)
        } else {
            self.device.create_buffer(host_data)
        }
    }

    #[cfg(feature = "gpu")]
    fn transfer_data_metal_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        // Large and small transfers currently use the same plain copy path.
        if transfer_size > 2 * 1024 * 1024 {
            self.device.create_buffer(host_data)
        } else {
            self.device.create_buffer(host_data)
        }
    }

    /// Runs a closure of GPU operations as a batch, then flushes pending
    /// transfers and warns when the batch takes unusually long.
    pub fn batch_operations<F, R>(&mut self, operations: F) -> Result<R, GpuError>
    where
        F: FnOnce(&mut Self) -> Result<R, GpuError>,
    {
        let batch_start = std::time::Instant::now();

        let result = operations(self)?;

        self.process_pending_transfers()?;

        let batch_duration = batch_start.elapsed();
        if batch_duration.as_millis() > 100 {
            eprintln!(
                "Warning: GPU batch operation took {batch_duration_ms}ms",
                batch_duration_ms = batch_duration.as_millis()
            );
        }

        Ok(result)
    }

    /// Drains the transfer queue in priority order, re-queueing lower-priority
    /// requests for a later batch.
    fn process_pending_transfers(&mut self) -> Result<(), GpuError> {
        let mut transfers: Vec<_> = self.transfer_queue.drain(..).collect();
        transfers.sort_by_key(|t| (t.priority, t.timestamp));

        for transfer in transfers {
            // High-priority transfers are treated as handled; the rest wait.
            if transfer.priority < TransferPriority::High {
                self.transfer_queue.push_back(transfer);
            }
        }

        Ok(())
    }

    /// Updates the exponentially weighted moving average of transfer bandwidth.
    fn update_bandwidth_stats(&mut self, bandwidth: f64) {
        let alpha = 0.1;
        if self.memory_stats.avg_transfer_bandwidth == 0.0 {
            self.memory_stats.avg_transfer_bandwidth = bandwidth;
        } else {
            self.memory_stats.avg_transfer_bandwidth =
                alpha * bandwidth + (1.0 - alpha) * self.memory_stats.avg_transfer_bandwidth;
        }
    }

    /// Returns the current memory statistics.
    pub fn get_memory_stats(&self) -> &GpuMemoryStats {
        &self.memory_stats
    }

    /// Fraction of buffer requests served from the pool (0.0 before any request).
    pub fn get_pool_efficiency(&self) -> f64 {
        let total_requests = self.memory_stats.pool_hits + self.memory_stats.pool_misses;
        if total_requests == 0 {
            0.0
        } else {
            self.memory_stats.pool_hits as f64 / total_requests as f64
        }
    }

    /// Rounds `size` (in bytes) up to the backend's preferred alignment.
    fn align_size(&self, size: usize) -> usize {
        (size + self.alignment_preference - 1) & !(self.alignment_preference - 1)
    }

    /// Rounds an element count up to the backend's coalescing granularity.
    fn calculate_coalesced_size(&self, size: usize) -> usize {
        match self.device.backend() {
            GpuBackend::Cuda => {
                let alignment = 128 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::OpenCL => {
                let alignment = 64 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Metal => {
                let alignment = 16 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Cpu => size,
            GpuBackend::Rocm => {
                let alignment = 64 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Wgpu => {
                let alignment = 32 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
        }
    }
}

/// Prefetches matrix data to the device, mapping the expected access pattern
/// to a transfer priority.
#[allow(dead_code)]
pub fn prefetch_matrix_data<T>(
    memory_manager: &mut GpuMemoryManager,
    matrix_data: &[T],
    access_pattern: AccessPattern,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    let priority = match access_pattern {
        AccessPattern::Sequential => TransferPriority::Normal,
        AccessPattern::Random => TransferPriority::High,
        AccessPattern::Strided { .. } => TransferPriority::High,
        AccessPattern::Blocked => TransferPriority::Normal,
    };

    memory_manager.transfer_data_optimized(matrix_data, priority)
}

/// Expected access pattern of matrix data on the device.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
    Sequential,
    Random,
    Strided { stride: usize },
    Blocked,
}

/// Picks a memory strategy from the backend, data size, and access pattern.
#[allow(dead_code)]
pub fn optimize_memory_bandwidth(
    backend: GpuBackend,
    data_size: usize,
    access_pattern: AccessPattern,
) -> MemoryStrategy {
    match (backend, access_pattern, data_size) {
        (GpuBackend::Cuda, AccessPattern::Sequential, size) if size > 1024 * 1024 => {
            MemoryStrategy::Coalesced
        }
        (GpuBackend::Cuda, AccessPattern::Random, _) => MemoryStrategy::TextureMemory,
        (GpuBackend::OpenCL, AccessPattern::Blocked, _) => MemoryStrategy::SharedMemory,
        (GpuBackend::Metal, _, size) if size > 512 * 1024 => MemoryStrategy::SharedMemory,
        _ => MemoryStrategy::Standard,
    }
}

/// Derives a kernel configuration from the matrix shape, sparsity, and
/// available device memory.
#[allow(dead_code)]
pub fn calculate_adaptive_workgroup_size(
    backend: GpuBackend,
    matrix_rows: usize,
    matrix_nnz: usize,
    available_memory: usize,
) -> GpuKernelConfig {
    let avg_nnz_per_row = if matrix_rows > 0 {
        matrix_nnz / matrix_rows
    } else {
        0
    };

    // Workgroup sizes scale with the average row density.
    let workgroup_size = match backend {
        GpuBackend::Cuda => {
            if avg_nnz_per_row < 10 {
                [128, 1, 1]
            } else if avg_nnz_per_row < 50 {
                [256, 1, 1]
            } else {
                [512, 1, 1]
            }
        }
        GpuBackend::OpenCL => {
            if avg_nnz_per_row < 20 {
                [64, 1, 1]
            } else {
                [128, 1, 1]
            }
        }
        GpuBackend::Metal => {
            if avg_nnz_per_row < 15 {
                [128, 1, 1]
            } else {
                [256, 1, 1]
            }
        }
        GpuBackend::Cpu => [1, 1, 1],
        GpuBackend::Rocm => {
            if avg_nnz_per_row < 10 {
                [64, 1, 1]
            } else if avg_nnz_per_row < 50 {
                [128, 1, 1]
            } else {
                [256, 1, 1]
            }
        }
        GpuBackend::Wgpu => {
            if avg_nnz_per_row < 20 {
                [32, 1, 1]
            } else {
                [64, 1, 1]
            }
        }
    };

    let memory_strategy = if available_memory > 512 * 1024 * 1024 {
        MemoryStrategy::SharedMemory
    } else {
        MemoryStrategy::Coalesced
    };

    GpuKernelConfig {
        workgroup_size,
        compute_units: 0,
        vectorization: avg_nnz_per_row > 4,
        memory_strategy,
    }
}

#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_spmv_fallback(
        rows,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        x_buffer,
        y_buffer,
    )
}

#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_symmetric_spmv_fallback(
        rows,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        x_buffer,
        y_buffer,
    )
}

#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_triangular_solve_fallback(
        n,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        b_buffer,
        x_buffer,
    )
}

/// Lightweight per-operation timing profiler for GPU work.
pub struct GpuPerformanceProfiler {
    backend: GpuBackend,
    timing_data: std::collections::HashMap<String, Vec<f64>>,
}

impl GpuPerformanceProfiler {
    pub fn new(backend: GpuBackend) -> Self {
        Self {
            backend,
            timing_data: std::collections::HashMap::new(),
        }
    }

    /// Runs `operation`, recording its wall-clock time in milliseconds under
    /// `operationname`. Only the most recent 100 samples are kept.
    pub fn profile_operation<F, R>(
        &mut self,
        operationname: &str,
        operation: F,
    ) -> Result<R, GpuError>
    where
        F: FnOnce() -> Result<R, GpuError>,
    {
        let start_time = std::time::Instant::now();
        let result = operation()?;
        let elapsed = start_time.elapsed().as_secs_f64() * 1000.0;

        let timings = self
            .timing_data
            .entry(operationname.to_string())
            .or_default();
        timings.push(elapsed);

        if timings.len() > 100 {
            timings.remove(0);
        }

        Ok(result)
    }

    /// Average recorded time for an operation, in milliseconds.
    pub fn get_average_time(&self, operationname: &str) -> Option<f64> {
        if let Some(timings) = self.timing_data.get(operationname) {
            if !timings.is_empty() {
                Some(timings.iter().sum::<f64>() / timings.len() as f64)
            } else {
                None
            }
        } else {
            None
        }
    }

    /// Produces backend-specific tuning suggestions from the recorded timings.
    pub fn get_recommendations(&self) -> Vec<String> {
        let mut recommendations = Vec::new();

        for (operation, timings) in &self.timing_data {
            if let Some(avg_time) = self.get_average_time(operation) {
                let variance = if timings.len() > 1 {
                    let mean = avg_time;
                    let var = timings.iter().map(|&t| (t - mean).powi(2)).sum::<f64>()
                        / timings.len() as f64;
                    var.sqrt()
                } else {
                    0.0
                };

                match self.backend {
                    GpuBackend::Cuda => {
                        if avg_time > 10.0 && operation.contains("spmv") {
                            recommendations.push(format!(
                                "Consider using larger workgroup sizes for {operation} (current avg: {avg_time:.2}ms, variance: {variance:.2})"
                            ));
                        }
                        if variance > avg_time * 0.5 {
                            recommendations.push(format!(
                                "High timing variance for {operation} suggests memory bandwidth bottleneck"
                            ));
                        }
                    }
                    GpuBackend::OpenCL => {
                        if avg_time > 15.0 {
                            recommendations.push(format!(
                                "OpenCL performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                        if variance > 5.0 {
                            recommendations.push(format!(
                                "Consider using local memory optimization for {operation} to reduce timing variance"
                            ));
                        }
                    }
                    GpuBackend::Metal => {
                        if avg_time > 8.0 && operation.contains("triangular") {
                            recommendations.push(format!(
                                "Metal triangular solve {operation} may benefit from simdgroup optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                        if operation.contains("spmv") && avg_time > 5.0 {
                            recommendations.push(format!(
                                "Consider using Metal's unified memory architecture for {operation} optimization"
                            ));
                        }
                    }
                    GpuBackend::Cpu => {
                        if avg_time > 50.0 {
                            recommendations.push(format!(
                                "Consider enabling GPU acceleration for {operation} (CPU avg: {avg_time:.2}ms)"
                            ));
                        }
                        if variance > 20.0 {
                            recommendations.push(format!(
                                "High CPU timing variance for {operation} suggests CPU scheduling issues"
                            ));
                        }
                    }
                    GpuBackend::Rocm => {
                        if avg_time > 12.0 {
                            recommendations.push(format!(
                                "ROCm performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                    }
                    GpuBackend::Wgpu => {
                        if avg_time > 15.0 {
                            recommendations.push(format!(
                                "WebGPU performance for {operation} could be improved with buffer optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                    }
                }

                if avg_time > 100.0 {
                    recommendations.push(format!(
                        "Operation {operation} is taking very long ({avg_time:.2}ms) - consider algorithm optimization"
                    ));
                }
            }
        }

        recommendations
    }

    /// Aggregated statistics for a profiled operation, if any timings exist.
    pub fn get_operation_metrics(&self, operationname: &str) -> Option<OperationMetrics> {
        if let Some(timings) = self.timing_data.get(operationname) {
            if timings.is_empty() {
                return None;
            }

            let count = timings.len();
            let total_time: f64 = timings.iter().sum();
            let avg_time = total_time / count as f64;
            let min_time = timings.iter().copied().fold(f64::INFINITY, f64::min);
            let max_time = timings.iter().copied().fold(f64::NEG_INFINITY, f64::max);

            let variance = if count > 1 {
                timings.iter().map(|&t| (t - avg_time).powi(2)).sum::<f64>() / count as f64
            } else {
                0.0
            };

            Some(OperationMetrics {
                operationname: operationname.to_string(),
                call_count: count,
                total_time,
                avg_time,
                min_time,
                max_time,
                variance: variance.sqrt(),
                // Operations per second, assuming serial execution.
                throughput: 1000.0 / avg_time,
            })
        } else {
            None
        }
    }

    /// Clears all recorded timings.
    pub fn reset_metrics(&mut self) {
        self.timing_data.clear();
    }

    /// Returns metrics for every profiled operation.
    pub fn export_metrics(&self) -> Vec<OperationMetrics> {
        self.timing_data
            .keys()
            .filter_map(|op| self.get_operation_metrics(op))
            .collect()
    }
}

/// Summary statistics for a profiled operation (times in milliseconds).
#[derive(Debug, Clone)]
pub struct OperationMetrics {
    pub operationname: String,
    pub call_count: usize,
    pub total_time: f64,
    pub avg_time: f64,
    pub min_time: f64,
    pub max_time: f64,
    /// Standard deviation of the recorded times.
    pub variance: f64,
    /// Estimated operations per second (1000 / avg_time).
    pub throughput: f64,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_calculate_optimal_dimensions() {
        let (global, local) =
            calculate_optimal_dimensions(GpuBackend::Cuda, 1000, [256, 1, 1], 0);

        assert_eq!(local[0], 256);
        assert!(global[0] >= 1000);
        assert_eq!(global[0] % local[0], 0);
    }
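
    // Additional check, mirroring the alignment rules in
    // `calculate_optimal_dimensions`: ROCm rounds the workgroup size up to the
    // 64-lane wavefront width.
    #[test]
    fn test_calculate_optimal_dimensions_rocm_wave_alignment() {
        let (global, local) =
            calculate_optimal_dimensions(GpuBackend::Rocm, 1000, [100, 1, 1], 0);

        // 100 rounded up to a multiple of 64 is 128.
        assert_eq!(local[0], 128);
        assert!(global[0] >= 1000);
        assert_eq!(global[0] % local[0], 0);
    }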

    #[test]
    fn test_adaptive_workgroup_config() {
        let config = calculate_adaptive_workgroup_size(
            GpuBackend::Cuda,
            10000,
            50000,
            1024 * 1024 * 1024,
        );

        assert!(config.workgroup_size[0] > 0);
        assert!(config.vectorization);
        assert_eq!(config.memory_strategy, MemoryStrategy::SharedMemory);
    }
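
    // A small check of the bandwidth heuristics; the expected strategies mirror
    // the match arms in `optimize_memory_bandwidth`.
    #[test]
    fn test_optimize_memory_bandwidth_strategies() {
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cuda, 2 * 1024 * 1024, AccessPattern::Sequential),
            MemoryStrategy::Coalesced
        );
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cuda, 1024, AccessPattern::Random),
            MemoryStrategy::TextureMemory
        );
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cpu, 1024, AccessPattern::Sequential),
            MemoryStrategy::Standard
        );
    }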

    #[test]
    fn test_gpu_kernel_config_default() {
        let config = GpuKernelConfig::default();
        assert_eq!(config.workgroup_size, [256, 1, 1]);
        assert_eq!(config.compute_units, 0);
        assert!(config.vectorization);
        assert_eq!(config.memory_strategy, MemoryStrategy::Coalesced);
    }
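
    // Exercises the profiler bookkeeping with a trivial closure; no GPU work is
    // involved, so only the timing statistics themselves are checked.
    #[test]
    fn test_performance_profiler_metrics() {
        let mut profiler = GpuPerformanceProfiler::new(GpuBackend::Cpu);

        for _ in 0..3 {
            let result = profiler
                .profile_operation("noop", || Ok::<_, GpuError>(42))
                .ok();
            assert_eq!(result, Some(42));
        }

        let metrics = profiler
            .get_operation_metrics("noop")
            .expect("metrics should be recorded");
        assert_eq!(metrics.call_count, 3);
        assert!(metrics.min_time <= metrics.max_time);
        assert!(profiler.get_average_time("noop").is_some());

        profiler.reset_metrics();
        assert!(profiler.get_average_time("noop").is_none());
    }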
}