#![allow(dead_code)]

#[allow(unused_imports)]
use crate::gpu_ops::{GpuBackend, GpuBuffer, GpuBufferExt, GpuDevice, GpuError, GpuKernelHandle};
#[cfg(feature = "gpu")]
use scirs2_core::gpu::GpuContext;
use scirs2_core::numeric::{Float, SparseElement};
use scirs2_core::GpuDataType;
use std::fmt::Debug;

/// Configuration options for GPU kernel launches.
#[derive(Debug, Clone)]
pub struct GpuKernelConfig {
    /// Workgroup (thread block) dimensions.
    pub workgroup_size: [u32; 3],
    /// Requested number of compute units (currently unused by the dimension heuristics).
    pub compute_units: u32,
    /// Whether vectorized memory accesses should be used.
    pub vectorization: bool,
    /// Memory access strategy for the kernel.
    pub memory_strategy: MemoryStrategy,
}

impl Default for GpuKernelConfig {
    fn default() -> Self {
        Self {
            workgroup_size: [256, 1, 1],
            compute_units: 0,
            vectorization: true,
            memory_strategy: MemoryStrategy::Coalesced,
        }
    }
}

/// Memory access strategies for sparse GPU kernels.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryStrategy {
    /// Plain global-memory accesses.
    Standard,
    /// Coalesced global-memory accesses.
    Coalesced,
    /// Stage data through shared (local) memory.
    SharedMemory,
    /// Route reads through texture/read-only memory.
    TextureMemory,
}

/// Execute a CSR sparse matrix-vector multiply (SpMV) kernel on the selected backend.
///
/// Dispatches to a backend-specific launch path and falls back to a CPU
/// implementation for backends without a dedicated path.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );

    match device.backend() {
        GpuBackend::Cuda => execute_cuda_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::OpenCL => execute_opencl_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Metal => execute_metal_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Cpu => execute_cpu_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
        GpuBackend::Rocm | GpuBackend::Wgpu => {
            // No dedicated launch path yet; fall back to the CPU implementation.
            execute_cpu_spmv_fallback(
                rows,
                indptr_buffer,
                indices_buffer,
                data_buffer,
                x_buffer,
                y_buffer,
            )
        }
    }
}

/// Execute a symmetric CSR SpMV kernel on the selected backend.
///
/// Mirrors [`execute_spmv_kernel`] but dispatches to the symmetric variants.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );

    match device.backend() {
        GpuBackend::Cuda => execute_cuda_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::OpenCL => execute_opencl_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Metal => execute_metal_symmetric_spmv(
            device,
            kernel,
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
            &global_size,
            &local_size,
            config,
        ),
        GpuBackend::Cpu => execute_cpu_symmetric_spmv_fallback(
            rows,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            x_buffer,
            y_buffer,
        ),
        GpuBackend::Rocm | GpuBackend::Wgpu => {
            // No dedicated launch path yet; fall back to the CPU implementation.
            execute_cpu_symmetric_spmv_fallback(
                rows,
                indptr_buffer,
                indices_buffer,
                data_buffer,
                x_buffer,
                y_buffer,
            )
        }
    }
}

/// Compute backend-appropriate global and local work sizes for a 1-D problem.
#[allow(dead_code)]
fn calculate_optimal_dimensions(
    backend: GpuBackend,
    problem_size: usize,
    workgroup_size: [u32; 3],
    _compute_units: u32,
) -> (Vec<usize>, Vec<usize>) {
    let optimal_workgroup = match backend {
        GpuBackend::Cuda => {
            // Round up to a multiple of the CUDA warp size (32).
            let warp_aligned = workgroup_size[0].div_ceil(32) * 32;
            [warp_aligned.min(1024), 1, 1]
        }
        GpuBackend::OpenCL => [
            workgroup_size[0].min(256),
            workgroup_size[1],
            workgroup_size[2],
        ],
        GpuBackend::Metal => {
            // Round up to a multiple of the Metal SIMD-group size (32).
            let simd_aligned = workgroup_size[0].div_ceil(32) * 32;
            [simd_aligned.min(1024), 1, 1]
        }
        GpuBackend::Cpu => workgroup_size,
        GpuBackend::Rocm => {
            // Round up to a multiple of the ROCm wavefront size (64).
            let wave_aligned = workgroup_size[0].div_ceil(64) * 64;
            [wave_aligned.min(1024), 1, 1]
        }
        GpuBackend::Wgpu => [workgroup_size[0].min(256), 1, 1],
        #[cfg(not(feature = "gpu"))]
        GpuBackend::Vulkan => [workgroup_size[0].min(256), 1, 1],
    };

    let global_size =
        vec![problem_size.div_ceil(optimal_workgroup[0] as usize) * optimal_workgroup[0] as usize];
    let local_size = vec![optimal_workgroup[0] as usize];

    (global_size, local_size)
}

/// CUDA launch path for the CSR SpMV kernel.
///
/// Computes launch dimensions and marshals kernel arguments; the actual
/// dispatch is not wired up yet, so this currently returns
/// `GpuError::BackendNotImplemented`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let _warp_size = 32;
    let block_size = local_size[0].min(1024);
    let grid_size = rows.div_ceil(block_size);

    let shared_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * block_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };

    let cuda_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(block_size as u32) as Box<dyn std::any::Any>,
        Box::new(shared_memory_size as u32) as Box<dyn std::any::Any>,
    ];

    let cuda_global_size = &[grid_size, 1, 1];
    let cuda_local_size = &[block_size, 1, 1];

    Err(GpuError::BackendNotImplemented(GpuBackend::Cuda))
}

/// OpenCL launch path for the CSR SpMV kernel.
///
/// Like the CUDA path, this prepares launch parameters but does not yet
/// dispatch the kernel and returns `GpuError::BackendNotImplemented`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let optimal_local_size = local_size[0].min(256);
    let aligned_global_size = rows.div_ceil(optimal_local_size) * optimal_local_size;

    let local_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * optimal_local_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };

    let opencl_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(local_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(config.vectorization as u32) as Box<dyn std::any::Any>,
    ];

    let opencl_global_size = &[aligned_global_size, 1, 1];
    let opencl_local_size = &[optimal_local_size, 1, 1];

    Err(GpuError::BackendNotImplemented(GpuBackend::OpenCL))
}

/// Metal launch path for the CSR SpMV kernel.
///
/// Prepares threadgroup sizes aligned to the SIMD-group width; the actual
/// dispatch is not wired up yet and `GpuError::BackendNotImplemented` is returned.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let simdgroup_size = 32;
    let optimal_threadgroup_size = local_size[0].min(1024);
    let aligned_threadgroup_size =
        optimal_threadgroup_size.div_ceil(simdgroup_size) * simdgroup_size;

    let num_threadgroups = rows.div_ceil(aligned_threadgroup_size);

    let threadgroup_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * aligned_threadgroup_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };

    let metal_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(aligned_threadgroup_size as u32) as Box<dyn std::any::Any>,
        Box::new(threadgroup_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(simdgroup_size as u32) as Box<dyn std::any::Any>,
    ];

    let metal_global_size = &[num_threadgroups * aligned_threadgroup_size, 1, 1];
    let metal_local_size = &[aligned_threadgroup_size, 1, 1];

    Err(GpuError::BackendNotImplemented(GpuBackend::Metal))
}

/// CPU fallback: straightforward CSR SpMV computed on host-side copies of the buffers.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;

    // y[i] = sum over the nonzeros of row i of data[j] * x[indices[j]].
    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::sparse_zero();

        for j in start..end {
            sum = sum + data[j] * x[indices[j] as usize];
        }

        y[i] = sum;
    }

    Ok(())
}

/// CUDA path for symmetric SpMV (not yet implemented).
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    Err(GpuError::BackendNotImplemented(GpuBackend::Cuda))
}

/// OpenCL path for symmetric SpMV (not yet implemented).
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    Err(GpuError::BackendNotImplemented(GpuBackend::OpenCL))
}

/// Metal path for symmetric SpMV (not yet implemented).
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_symmetric_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    Err(GpuError::BackendNotImplemented(GpuBackend::Metal))
}

/// CPU fallback for symmetric SpMV, computed on host-side copies of the buffers.
///
/// Note: stored entries are accumulated exactly as in the general CSR fallback;
/// no mirrored contribution is added for off-diagonal entries, so the result is
/// only complete when both triangles of the symmetric matrix are stored.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_symmetric_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;

    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::sparse_zero();

        for j in start..end {
            let col_idx = indices[j] as usize;
            let val = data[j];
            // Diagonal and off-diagonal entries contribute identically to row i.
            sum = sum + val * x[col_idx];
        }

        y[i] = sum;
    }

    Ok(())
}

/// Execute a sparse triangular solve kernel on the selected backend.
///
/// Only the CPU fallback is currently implemented; GPU backends return
/// `GpuError::BackendNotImplemented`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    // Launch dimensions per backend; unused until the GPU paths are implemented.
    let (_global_size, _local_size) = match device.backend() {
        GpuBackend::Cuda => ([32], [32]),
        GpuBackend::OpenCL => ([64], [64]),
        GpuBackend::Metal => ([32], [32]),
        GpuBackend::Cpu => ([1], [1]),
        GpuBackend::Rocm | GpuBackend::Wgpu => ([64], [64]),
    };

    match device.backend() {
        GpuBackend::Cpu => execute_cpu_triangular_solve_fallback(
            n,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            b_buffer,
            x_buffer,
        ),
        _ => Err(GpuError::BackendNotImplemented(device.backend())),
    }
}

/// CPU fallback: forward-substitution solve of a sparse lower-triangular system.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_triangular_solve_fallback<T>(
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let b = b_buffer.to_host()?;
    let mut x = x_buffer.to_host()?;

    for i in 0..n {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::sparse_zero();
        let mut diag_val = T::sparse_zero();

        for j in start..end {
            let col_idx = indices[j] as usize;
            let val = data[j];

            match col_idx.cmp(&i) {
                std::cmp::Ordering::Equal => {
                    diag_val = val;
                }
                std::cmp::Ordering::Less => {
                    sum = sum + val * x[col_idx];
                }
                // Entries above the diagonal are ignored.
                std::cmp::Ordering::Greater => {}
            }
        }

        if diag_val != T::sparse_zero() {
            x[i] = (b[i] - sum) / diag_val;
        } else {
            #[cfg(feature = "gpu")]
            return Err(GpuError::InvalidParameter(
                "Singular matrix in triangular solve".to_string(),
            ));
            #[cfg(not(feature = "gpu"))]
            return Err(GpuError::invalid_parameter(
                "Singular matrix in triangular solve".to_string(),
            ));
        }
    }

    Ok(())
}

/// Pooled GPU memory manager with simple transfer scheduling and statistics.
pub struct GpuMemoryManager {
    device: GpuDevice,
    #[cfg(feature = "gpu")]
    context: Option<GpuContext>,
    /// Buffer pool keyed by (element count, element type).
    buffer_pool: std::collections::HashMap<(usize, std::any::TypeId), Vec<Box<dyn std::any::Any>>>,
    memory_stats: GpuMemoryStats,
    transfer_queue: std::collections::VecDeque<TransferRequest>,
    /// Preferred allocation alignment in bytes for the active backend.
    alignment_preference: usize,
    /// Maximum number of pooled buffers kept per (size, type) key.
    max_pool_size: usize,
}

/// Allocation and transfer statistics collected by [`GpuMemoryManager`].
#[derive(Debug, Clone, Default)]
pub struct GpuMemoryStats {
    /// Bytes currently allocated to callers (decreases when buffers are returned).
    pub total_allocated: usize,
    /// Highest observed value of `total_allocated`.
    pub peak_usage: usize,
    /// Number of fresh allocations performed.
    pub allocation_count: u64,
    /// Buffer requests served from the pool.
    pub pool_hits: u64,
    /// Buffer requests that required a new allocation.
    pub pool_misses: u64,
    /// Exponentially smoothed transfer bandwidth in bytes per second.
    pub avg_transfer_bandwidth: f64,
}

#[derive(Debug)]
struct TransferRequest {
    size: usize,
    priority: TransferPriority,
    timestamp: std::time::Instant,
}

/// Priority levels for host-to-device transfers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TransferPriority {
    Low,
    Normal,
    High,
    Critical,
}

/// Requested memory layout for buffers returned by the memory manager.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryLayout {
    Standard,
    Coalesced,
    Strided { stride: usize },
}

impl GpuMemoryManager {
    /// Create a memory manager for the given backend.
    pub fn new(backend: GpuBackend) -> Result<Self, GpuError> {
        #[cfg(feature = "gpu")]
        let device = GpuDevice::new(backend, 0);
        #[cfg(not(feature = "gpu"))]
        let device = GpuDevice::new(backend)?;

        // Preferred allocation alignment (in bytes) per backend.
        let alignment_preference = match backend {
            GpuBackend::Cuda => 128,
            GpuBackend::OpenCL => 64,
            GpuBackend::Metal => 16,
            GpuBackend::Cpu => 8,
            GpuBackend::Rocm => 64,
            GpuBackend::Wgpu => 32,
            #[cfg(not(feature = "gpu"))]
            GpuBackend::Vulkan => 32,
        };

        #[cfg(feature = "gpu")]
        let context = if device.backend() != GpuBackend::Cpu {
            GpuContext::new(device.backend()).ok()
        } else {
            None
        };

        Ok(Self {
            device,
            #[cfg(feature = "gpu")]
            context,
            buffer_pool: std::collections::HashMap::new(),
            memory_stats: GpuMemoryStats::default(),
            transfer_queue: std::collections::VecDeque::new(),
            alignment_preference,
            max_pool_size: 20,
        })
    }

    /// Get a zero-initialized buffer of at least `size` elements, reusing pooled
    /// buffers when possible.
    pub fn get_buffer<T>(&mut self, size: usize) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Default + 'static,
    {
        let aligned_size =
            self.align_size(size * std::mem::size_of::<T>()) / std::mem::size_of::<T>();
        let key = (aligned_size, std::any::TypeId::of::<T>());

        // Try to serve the request from the pool first.
        if let Some(pool) = self.buffer_pool.get_mut(&key) {
            if let Some(buffer) = pool.pop() {
                if let Ok(typed_buffer) = buffer.downcast::<GpuBuffer<T>>() {
                    self.memory_stats.pool_hits += 1;
                    return Ok(*typed_buffer);
                }
            }
        }

        self.memory_stats.pool_misses += 1;
        self.memory_stats.allocation_count += 1;

        #[cfg(feature = "gpu")]
        let buffer = if let Some(ref context) = self.context {
            let buffer = context.create_buffer::<T>(aligned_size);
            let zeros = vec![T::default(); aligned_size];
            buffer.copy_from_host(&zeros)?;
            buffer
        } else {
            return Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ));
        };

        #[cfg(not(feature = "gpu"))]
        let buffer = GpuBuffer::from_vec(vec![T::default(); aligned_size]);

        let allocation_size = aligned_size * std::mem::size_of::<T>();
        self.memory_stats.total_allocated += allocation_size;
        if self.memory_stats.total_allocated > self.memory_stats.peak_usage {
            self.memory_stats.peak_usage = self.memory_stats.total_allocated;
        }

        Ok(buffer)
    }

    /// Get a buffer sized for the requested memory layout.
    pub fn get_buffer_with_layout<T>(
        &mut self,
        size: usize,
        layout: MemoryLayout,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Default + 'static,
    {
        match layout {
            MemoryLayout::Coalesced => {
                let coalesced_size = self.calculate_coalesced_size(size);
                self.get_buffer::<T>(coalesced_size)
            }
            MemoryLayout::Strided { stride } => {
                let strided_size = size + (size % stride);
                self.get_buffer::<T>(strided_size)
            }
            MemoryLayout::Standard => self.get_buffer::<T>(size),
        }
    }

    /// Return a buffer to the pool so it can be reused by later requests.
    pub fn return_buffer<T>(&mut self, buffer: GpuBuffer<T>)
    where
        T: GpuDataType + 'static,
    {
        let size = buffer.len();
        let allocation_size = size * std::mem::size_of::<T>();
        let key = (size, std::any::TypeId::of::<T>());

        let pool = self.buffer_pool.entry(key).or_default();

        // Only pool buffers above 1 KiB and while the pool has room.
        if pool.len() < self.max_pool_size && allocation_size > 1024 {
            pool.push(Box::new(buffer));
        }

        self.memory_stats.total_allocated = self
            .memory_stats
            .total_allocated
            .saturating_sub(allocation_size);

        self.cleanup_old_buffers_if_needed();
    }

    fn cleanup_old_buffers_if_needed(&mut self) {
        // When usage approaches the previous peak, drop half of each pool.
        if self.memory_stats.total_allocated > self.memory_stats.peak_usage * 3 / 4 {
            for (_, pool) in self.buffer_pool.iter_mut() {
                let remove_count = pool.len() / 2;
                for _ in 0..remove_count {
                    pool.remove(0);
                }
            }
        }
    }

    /// Transfer host data to a device buffer, recording observed bandwidth.
    pub fn transfer_data_optimized<T>(
        &mut self,
        host_data: &[T],
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        let transfer_size = std::mem::size_of_val(host_data);
        let start_time = std::time::Instant::now();

        let buffer = match self.device.backend() {
            #[cfg(feature = "gpu")]
            GpuBackend::Cuda => {
                self.transfer_data_cuda_optimized(host_data, transfer_size, _priority)
            }
            #[cfg(feature = "gpu")]
            GpuBackend::OpenCL => {
                self.transfer_data_opencl_optimized(host_data, transfer_size, _priority)
            }
            #[cfg(feature = "gpu")]
            GpuBackend::Metal => {
                self.transfer_data_metal_optimized(host_data, transfer_size, _priority)
            }
            _ => {
                #[cfg(feature = "gpu")]
                {
                    if let Some(ref context) = self.context {
                        let buffer = context.create_buffer_from_slice(host_data);
                        Ok(buffer)
                    } else {
                        Err(GpuError::BackendNotAvailable(
                            "No GPU context available".to_string(),
                        ))
                    }
                }
                #[cfg(not(feature = "gpu"))]
                Ok(GpuBuffer::from_vec(host_data.to_vec()))
            }
        }?;

        let elapsed = start_time.elapsed().as_secs_f64();
        if elapsed > 0.0 {
            let bandwidth = transfer_size as f64 / elapsed;
            self.update_bandwidth_stats(bandwidth);
        }

        Ok(buffer)
    }

    /// CUDA transfer path. The branches distinguish large high-priority,
    /// medium, and small transfers, but all currently use the same
    /// `create_buffer_from_slice` call and serve as placeholders for
    /// size- and priority-specific copy strategies.
    #[cfg(feature = "gpu")]
    fn transfer_data_cuda_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        if let Some(ref context) = self.context {
            if matches!(
                priority,
                TransferPriority::High | TransferPriority::Critical
            ) && transfer_size > 4 * 1024 * 1024
            {
                Ok(context.create_buffer_from_slice(host_data))
            } else if transfer_size > 64 * 1024 {
                Ok(context.create_buffer_from_slice(host_data))
            } else {
                Ok(context.create_buffer_from_slice(host_data))
            }
        } else {
            Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ))
        }
    }

    /// OpenCL transfer path; like the CUDA path, both branches currently share
    /// the same copy call.
    #[cfg(feature = "gpu")]
    fn transfer_data_opencl_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        if let Some(ref context) = self.context {
            if transfer_size > 1024 * 1024 {
                Ok(context.create_buffer_from_slice(host_data))
            } else {
                Ok(context.create_buffer_from_slice(host_data))
            }
        } else {
            Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ))
        }
    }

    /// Metal transfer path; both branches currently share the same copy call.
    #[cfg(feature = "gpu")]
    fn transfer_data_metal_optimized<T>(
        &self,
        host_data: &[T],
        transfer_size: usize,
        _priority: TransferPriority,
    ) -> Result<GpuBuffer<T>, GpuError>
    where
        T: GpuDataType + Copy,
    {
        if let Some(ref context) = self.context {
            if transfer_size > 2 * 1024 * 1024 {
                Ok(context.create_buffer_from_slice(host_data))
            } else {
                Ok(context.create_buffer_from_slice(host_data))
            }
        } else {
            Err(GpuError::BackendNotAvailable(
                "No GPU context available".to_string(),
            ))
        }
    }

    /// Run a group of operations, then flush pending transfers and warn if the
    /// batch took unusually long (> 100 ms).
    pub fn batch_operations<F, R>(&mut self, operations: F) -> Result<R, GpuError>
    where
        F: FnOnce(&mut Self) -> Result<R, GpuError>,
    {
        let batch_start = std::time::Instant::now();

        let result = operations(self)?;

        self.process_pending_transfers()?;

        let batch_duration = batch_start.elapsed();
        if batch_duration.as_millis() > 100 {
            eprintln!(
                "Warning: GPU batch operation took {batch_duration_ms}ms",
                batch_duration_ms = batch_duration.as_millis()
            );
        }

        Ok(result)
    }

    fn process_pending_transfers(&mut self) -> Result<(), GpuError> {
        let mut transfers: Vec<_> = self.transfer_queue.drain(..).collect();
        transfers.sort_by_key(|t| (t.priority, t.timestamp));

        for transfer in transfers {
            if transfer.priority >= TransferPriority::High {
                // High-priority transfers are treated as serviced and dropped here.
            } else {
                // Lower-priority transfers are deferred to a later batch.
                self.transfer_queue.push_back(transfer);
            }
        }

        Ok(())
    }

    fn update_bandwidth_stats(&mut self, bandwidth: f64) {
        // Exponential moving average with smoothing factor 0.1.
        let alpha = 0.1;
        if self.memory_stats.avg_transfer_bandwidth == 0.0 {
            self.memory_stats.avg_transfer_bandwidth = bandwidth;
        } else {
            self.memory_stats.avg_transfer_bandwidth =
                alpha * bandwidth + (1.0 - alpha) * self.memory_stats.avg_transfer_bandwidth;
        }
    }

    /// Current allocation and transfer statistics.
    pub fn get_memory_stats(&self) -> &GpuMemoryStats {
        &self.memory_stats
    }

    /// Fraction of buffer requests served from the pool (0.0 when no requests yet).
    pub fn get_pool_efficiency(&self) -> f64 {
        let total_requests = self.memory_stats.pool_hits + self.memory_stats.pool_misses;
        if total_requests == 0 {
            0.0
        } else {
            self.memory_stats.pool_hits as f64 / total_requests as f64
        }
    }

    /// Round `size` (in bytes) up to the backend's preferred alignment,
    /// assumed to be a power of two.
    fn align_size(&self, size: usize) -> usize {
        (size + self.alignment_preference - 1) & !(self.alignment_preference - 1)
    }

    /// Round an element count up so the allocation matches the backend's
    /// coalescing granularity.
    fn calculate_coalesced_size(&self, size: usize) -> usize {
        match self.device.backend() {
            GpuBackend::Cuda => {
                let alignment = 128 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::OpenCL => {
                let alignment = 64 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Metal => {
                let alignment = 16 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Cpu => size,
            GpuBackend::Rocm => {
                let alignment = 64 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            GpuBackend::Wgpu => {
                let alignment = 32 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
            #[cfg(not(feature = "gpu"))]
            GpuBackend::Vulkan => {
                let alignment = 32 / std::mem::size_of::<usize>();
                size.div_ceil(alignment) * alignment
            }
        }
    }
}

/// Prefetch matrix data to the device, choosing a transfer priority from the
/// expected access pattern.
#[allow(dead_code)]
pub fn prefetch_matrix_data<T>(
    memory_manager: &mut GpuMemoryManager,
    matrix_data: &[T],
    access_pattern: AccessPattern,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    let priority = match access_pattern {
        AccessPattern::Sequential => TransferPriority::Normal,
        AccessPattern::Random => TransferPriority::High,
        AccessPattern::Strided { .. } => TransferPriority::High,
        AccessPattern::Blocked => TransferPriority::Normal,
    };

    memory_manager.transfer_data_optimized(matrix_data, priority)
}

/// Expected access pattern for data being moved to the device.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
    Sequential,
    Random,
    Strided { stride: usize },
    Blocked,
}

/// Choose a memory strategy based on backend, data size, and access pattern.
#[allow(dead_code)]
pub fn optimize_memory_bandwidth(
    backend: GpuBackend,
    data_size: usize,
    access_pattern: AccessPattern,
) -> MemoryStrategy {
    match (backend, access_pattern, data_size) {
        (GpuBackend::Cuda, AccessPattern::Sequential, size) if size > 1024 * 1024 => {
            MemoryStrategy::Coalesced
        }
        (GpuBackend::Cuda, AccessPattern::Random, _) => MemoryStrategy::TextureMemory,
        (GpuBackend::OpenCL, AccessPattern::Blocked, _) => MemoryStrategy::SharedMemory,
        (GpuBackend::Metal, _, size) if size > 512 * 1024 => MemoryStrategy::SharedMemory,
        _ => MemoryStrategy::Standard,
    }
}

/// Derive a kernel configuration from matrix shape, sparsity, and available memory.
#[allow(dead_code)]
pub fn calculate_adaptive_workgroup_size(
    backend: GpuBackend,
    matrix_rows: usize,
    matrix_nnz: usize,
    available_memory: usize,
) -> GpuKernelConfig {
    let avg_nnz_per_row = if matrix_rows > 0 {
        matrix_nnz / matrix_rows
    } else {
        0
    };

    let workgroup_size = match backend {
        GpuBackend::Cuda => {
            if avg_nnz_per_row < 10 {
                [128, 1, 1]
            } else if avg_nnz_per_row < 50 {
                [256, 1, 1]
            } else {
                [512, 1, 1]
            }
        }
        GpuBackend::OpenCL => {
            if avg_nnz_per_row < 20 {
                [64, 1, 1]
            } else {
                [128, 1, 1]
            }
        }
        GpuBackend::Metal => {
            if avg_nnz_per_row < 15 {
                [128, 1, 1]
            } else {
                [256, 1, 1]
            }
        }
        GpuBackend::Cpu => [1, 1, 1],
        GpuBackend::Rocm => {
            if avg_nnz_per_row < 10 {
                [64, 1, 1]
            } else if avg_nnz_per_row < 50 {
                [128, 1, 1]
            } else {
                [256, 1, 1]
            }
        }
        GpuBackend::Wgpu => {
            if avg_nnz_per_row < 20 {
                [32, 1, 1]
            } else {
                [64, 1, 1]
            }
        }
        #[cfg(not(feature = "gpu"))]
        GpuBackend::Vulkan => {
            if avg_nnz_per_row < 20 {
                [32, 1, 1]
            } else {
                [64, 1, 1]
            }
        }
    };

    let memory_strategy = if available_memory > 512 * 1024 * 1024 {
        MemoryStrategy::SharedMemory
    } else {
        MemoryStrategy::Coalesced
    };

    GpuKernelConfig {
        workgroup_size,
        compute_units: 0,
        vectorization: avg_nnz_per_row > 4,
        memory_strategy,
    }
}

/// Non-GPU build: SpMV always runs the CPU fallback.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_spmv_fallback(
        rows,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        x_buffer,
        y_buffer,
    )
}

/// Non-GPU build: symmetric SpMV always runs the CPU fallback.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_symmetric_spmv_fallback(
        rows,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        x_buffer,
        y_buffer,
    )
}

/// Non-GPU build: triangular solve always runs the CPU fallback.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    _device: &GpuDevice,
    _kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    execute_cpu_triangular_solve_fallback(
        n,
        indptr_buffer,
        indices_buffer,
        data_buffer,
        b_buffer,
        x_buffer,
    )
}

/// Simple wall-clock profiler for GPU operations, keyed by operation name.
pub struct GpuPerformanceProfiler {
    backend: GpuBackend,
    timing_data: std::collections::HashMap<String, Vec<f64>>,
}

impl GpuPerformanceProfiler {
    pub fn new(backend: GpuBackend) -> Self {
        Self {
            backend,
            timing_data: std::collections::HashMap::new(),
        }
    }

    /// Run `operation`, record its wall-clock time in milliseconds under
    /// `operationname`, and return its result. Only the most recent 100
    /// samples per operation are kept.
    pub fn profile_operation<F, R>(
        &mut self,
        operationname: &str,
        operation: F,
    ) -> Result<R, GpuError>
    where
        F: FnOnce() -> Result<R, GpuError>,
    {
        let start_time = std::time::Instant::now();
        let result = operation()?;
        let elapsed = start_time.elapsed().as_secs_f64() * 1000.0;

        let timings = self
            .timing_data
            .entry(operationname.to_string())
            .or_default();
        timings.push(elapsed);

        if timings.len() > 100 {
            timings.remove(0);
        }

        Ok(result)
    }

    /// Average recorded time (in ms) for an operation, if any samples exist.
    pub fn get_average_time(&self, operationname: &str) -> Option<f64> {
        if let Some(timings) = self.timing_data.get(operationname) {
            if !timings.is_empty() {
                Some(timings.iter().sum::<f64>() / timings.len() as f64)
            } else {
                None
            }
        } else {
            None
        }
    }

    /// Generate backend-specific tuning recommendations from the recorded timings.
    pub fn get_recommendations(&self) -> Vec<String> {
        let mut recommendations = Vec::new();

        for (operation, timings) in &self.timing_data {
            if let Some(avg_time) = self.get_average_time(operation) {
                // Sample standard deviation (reported as "variance" in the messages).
                let variance = if timings.len() > 1 {
                    let mean = avg_time;
                    let var = timings.iter().map(|&t| (t - mean).powi(2)).sum::<f64>()
                        / timings.len() as f64;
                    var.sqrt()
                } else {
                    0.0
                };

                match self.backend {
                    GpuBackend::Cuda => {
                        if avg_time > 10.0 && operation.contains("spmv") {
                            recommendations.push(format!(
                                "Consider using larger workgroup sizes for {operation} (current avg: {avg_time:.2}ms, variance: {variance:.2})"
                            ));
                        }
                        if variance > avg_time * 0.5 {
                            recommendations.push(format!(
                                "High timing variance for {operation} suggests memory bandwidth bottleneck"
                            ));
                        }
                    }
                    GpuBackend::OpenCL => {
                        if avg_time > 15.0 {
                            recommendations.push(format!(
                                "OpenCL performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                        if variance > 5.0 {
                            recommendations.push(format!(
                                "Consider using local memory optimization for {operation} to reduce timing variance"
                            ));
                        }
                    }
                    GpuBackend::Metal => {
                        if avg_time > 8.0 && operation.contains("triangular") {
                            recommendations.push(format!(
                                "Metal triangular solve {operation} may benefit from simdgroup optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                        if operation.contains("spmv") && avg_time > 5.0 {
                            recommendations.push(format!(
                                "Consider using Metal's unified memory architecture for {operation} optimization"
                            ));
                        }
                    }
                    GpuBackend::Cpu => {
                        if avg_time > 50.0 {
                            recommendations.push(format!(
                                "Consider enabling GPU acceleration for {operation} (CPU avg: {avg_time:.2}ms)"
                            ));
                        }
                        if variance > 20.0 {
                            recommendations.push(format!(
                                "High CPU timing variance for {operation} suggests CPU scheduling issues"
                            ));
                        }
                    }
                    GpuBackend::Rocm => {
                        if avg_time > 12.0 {
                            recommendations.push(format!(
                                "ROCm performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                    }
                    GpuBackend::Wgpu => {
                        if avg_time > 15.0 {
                            recommendations.push(format!(
                                "WebGPU performance for {operation} could be improved with buffer optimization (current avg: {avg_time:.2}ms)"
                            ));
                        } else if avg_time > 12.0 {
                            recommendations.push(format!(
                                "WebGPU performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                    }
                    #[cfg(not(feature = "gpu"))]
                    GpuBackend::Vulkan => {
                        if avg_time > 15.0 {
                            recommendations.push(format!(
                                "Vulkan performance for {operation} could be improved with buffer optimization (current avg: {avg_time:.2}ms)"
                            ));
                        } else if avg_time > 12.0 {
                            recommendations.push(format!(
                                "Vulkan performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
                            ));
                        }
                    }
                }

                if avg_time > 100.0 {
                    recommendations.push(format!(
                        "Operation {operation} is taking very long ({avg_time:.2}ms) - consider algorithm optimization"
                    ));
                }
            }
        }

        recommendations
    }

    /// Detailed metrics (count, total, average, min/max, spread, throughput) for one operation.
    pub fn get_operation_metrics(&self, operationname: &str) -> Option<OperationMetrics> {
        if let Some(timings) = self.timing_data.get(operationname) {
            if timings.is_empty() {
                return None;
            }

            let count = timings.len();
            let total_time: f64 = timings.iter().sum();
            let avg_time = total_time / count as f64;
            let min_time = timings.iter().copied().fold(f64::INFINITY, f64::min);
            let max_time = timings.iter().copied().fold(f64::NEG_INFINITY, f64::max);

            let variance = if count > 1 {
                timings.iter().map(|&t| (t - avg_time).powi(2)).sum::<f64>() / count as f64
            } else {
                0.0
            };

            Some(OperationMetrics {
                operationname: operationname.to_string(),
                call_count: count,
                total_time,
                avg_time,
                min_time,
                max_time,
                variance: variance.sqrt(),
                // Calls per second, assuming times are recorded in milliseconds.
                throughput: 1000.0 / avg_time,
            })
        } else {
            None
        }
    }

    /// Discard all recorded timings.
    pub fn reset_metrics(&mut self) {
        self.timing_data.clear();
    }

    /// Export metrics for every recorded operation.
    pub fn export_metrics(&self) -> Vec<OperationMetrics> {
        self.timing_data
            .keys()
            .filter_map(|op| self.get_operation_metrics(op))
            .collect()
    }
}

/// Aggregated timing metrics for a single profiled operation.
#[derive(Debug, Clone)]
pub struct OperationMetrics {
    pub operationname: String,
    pub call_count: usize,
    /// Total recorded time in milliseconds.
    pub total_time: f64,
    /// Average time per call in milliseconds.
    pub avg_time: f64,
    pub min_time: f64,
    pub max_time: f64,
    /// Spread of the recorded times (standard deviation).
    pub variance: f64,
    /// Estimated calls per second (1000 / avg_time).
    pub throughput: f64,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_calculate_optimal_dimensions() {
        let (global, local) = calculate_optimal_dimensions(
            GpuBackend::Cuda,
            1000,
            [256, 1, 1],
            0,
        );

        assert_eq!(local[0], 256);
        assert!(global[0] >= 1000);
        assert_eq!(global[0] % local[0], 0);
    }
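
    // Added sketch of a regression test for the ROCm path of
    // calculate_optimal_dimensions: a request of 100 threads should be rounded
    // up to the 64-wide wavefront (i.e. 128) and the global size padded to a
    // multiple of it. Based solely on the arithmetic in the function above;
    // adjust if the ROCm heuristic changes.
    #[test]
    fn test_calculate_optimal_dimensions_rocm_wavefront_alignment() {
        let (global, local) =
            calculate_optimal_dimensions(GpuBackend::Rocm, 1000, [100, 1, 1], 0);

        // 100 rounded up to a multiple of 64 is 128.
        assert_eq!(local[0], 128);
        // 1000 rounded up to a multiple of 128 is 1024.
        assert_eq!(global[0], 1024);
    }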

    #[test]
    fn test_adaptive_workgroup_config() {
        let config = calculate_adaptive_workgroup_size(
            GpuBackend::Cuda,
            10000,
            50000,
            1024 * 1024 * 1024,
        );

        assert!(config.workgroup_size[0] > 0);
        assert!(config.vectorization);
        assert_eq!(config.memory_strategy, MemoryStrategy::SharedMemory);
    }
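
    // Added sketch of a test for optimize_memory_bandwidth; the expected values
    // simply restate the match arms of the function and should be updated
    // together with them.
    #[test]
    fn test_optimize_memory_bandwidth() {
        // Large sequential CUDA transfers prefer coalesced access.
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cuda, 2 * 1024 * 1024, AccessPattern::Sequential),
            MemoryStrategy::Coalesced
        );
        // Random CUDA access selects the texture/read-only path.
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cuda, 0, AccessPattern::Random),
            MemoryStrategy::TextureMemory
        );
        // Anything unmatched falls back to the standard strategy.
        assert_eq!(
            optimize_memory_bandwidth(GpuBackend::Cpu, 100, AccessPattern::Sequential),
            MemoryStrategy::Standard
        );
    }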

    #[test]
    fn test_gpu_kernel_config_default() {
        let config = GpuKernelConfig::default();
        assert_eq!(config.workgroup_size, [256, 1, 1]);
        assert_eq!(config.compute_units, 0);
        assert!(config.vectorization);
        assert_eq!(config.memory_strategy, MemoryStrategy::Coalesced);
    }
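
    // Added sketch of a test exercising GpuPerformanceProfiler's bookkeeping.
    // It relies only on the profiler API defined in this module; the "spmv"
    // operation name is arbitrary.
    #[test]
    fn test_performance_profiler_records_timings() {
        let mut profiler = GpuPerformanceProfiler::new(GpuBackend::Cpu);

        let value = profiler
            .profile_operation("spmv", || Ok(42u32))
            .expect("profiled operation should succeed");
        assert_eq!(value, 42);

        // One sample recorded under "spmv", none under an unknown name.
        assert!(profiler.get_average_time("spmv").is_some());
        assert!(profiler.get_average_time("unknown_op").is_none());

        let metrics = profiler
            .get_operation_metrics("spmv")
            .expect("metrics should exist after one call");
        assert_eq!(metrics.call_count, 1);

        profiler.reset_metrics();
        assert!(profiler.get_average_time("spmv").is_none());
    }

    // Added sketch of a low-memory case for calculate_adaptive_workgroup_size:
    // with far less than 512 MiB available the heuristic above falls back to
    // the coalesced strategy, and 5 nnz/row picks the small OpenCL workgroup.
    #[test]
    fn test_adaptive_workgroup_config_low_memory() {
        let config = calculate_adaptive_workgroup_size(GpuBackend::OpenCL, 1000, 5000, 1024);

        assert_eq!(config.workgroup_size, [64, 1, 1]);
        assert_eq!(config.memory_strategy, MemoryStrategy::Coalesced);
        // avg_nnz_per_row = 5 > 4, so vectorization is enabled.
        assert!(config.vectorization);
    }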
}