scirs2_metrics/optimization/gpu_kernels/
config.rs1#![allow(clippy::too_many_arguments)]
7#![allow(dead_code)]
8
9use std::time::Duration;
10
/// Per-launch kernel configuration: block/grid geometry plus execution flags.
///
/// All fields are plain scalar types, so the struct derives `PartialEq`/`Eq`
/// to allow configs to be compared and deduplicated.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KernelConfig {
    /// Thread block dimensions (x, y, z).
    pub block_size: (u32, u32, u32),
    /// Grid dimensions (x, y, z).
    pub grid_size: (u32, u32, u32),
    /// Shared memory to request per block, in bytes.
    pub shared_memory_size: u32,
    /// Launch kernels asynchronously when true.
    pub async_execution: bool,
    /// Use page-locked (pinned) host memory for transfers when true.
    pub use_pinned_memory: bool,
    /// Optimization level knob (default is 2; exact scale not shown here —
    /// TODO confirm against the kernel compiler).
    pub optimization_level: u8,
}
27
/// Top-level configuration for GPU-accelerated computation, grouping API
/// selection, memory, kernel, batching, and error-handling policies.
#[derive(Debug, Clone)]
pub struct GpuComputeConfig {
    /// Preferred GPU compute API (see [`GpuApi`]).
    pub preferred_api: GpuApi,
    /// Device memory allocation strategy (see [`MemoryStrategy`]).
    pub memory_strategy: MemoryStrategy,
    /// Kernel compilation/tuning options.
    pub kernel_optimization: KernelOptimization,
    /// Batch sizing and streaming behavior.
    pub batch_settings: BatchSettings,
    /// Policy applied when a GPU operation fails.
    pub error_handling: ErrorHandling,
}
42
/// GPU compute APIs that can back execution.
///
/// Derives `PartialEq`/`Eq`/`Hash` so the selected API can be compared and
/// used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuApi {
    /// Select an API automatically at runtime.
    Auto,
    /// NVIDIA CUDA.
    Cuda,
    /// OpenCL.
    OpenCl,
    /// Apple Metal.
    Metal,
    /// Vulkan compute.
    Vulkan,
}
52
/// Device memory allocation strategies.
///
/// Derives `PartialEq`/`Eq` (all payloads are `usize`) so strategies can be
/// compared in configuration logic and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MemoryStrategy {
    /// Pre-allocated memory pool with a configured growth limit.
    Pool {
        /// Initial pool size, in bytes (inferred from the 256 MiB default —
        /// TODO confirm).
        initial_size: usize,
        /// Maximum pool size, in bytes.
        max_size: usize,
    },
    /// Allocate memory on demand for each operation.
    OnDemand,
    /// Unified host/device memory — TODO confirm backend semantics.
    Unified,
    /// Memory-mapped host memory — TODO confirm backend semantics.
    Mapped,
}
68
69#[derive(Debug, Clone)]
71pub struct KernelOptimization {
72 pub fast_math: bool,
74 pub vectorization: VectorizationLevel,
76 pub optimize_occupancy: bool,
78 pub use_shared_memory: bool,
80 pub memory_coalescing: bool,
82}
83
/// SIMD vector width used by generated kernels.
///
/// Derives `PartialEq`/`Eq`/`Hash` so levels can be compared and used as
/// map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VectorizationLevel {
    /// Scalar code, no vectorization.
    None,
    /// 2-wide float vectors (e.g. `float2`).
    Float2,
    /// 4-wide float vectors (e.g. `float4`).
    Float4,
    /// 8-wide float vectors (e.g. `float8`).
    Float8,
    /// Choose the vector width automatically.
    Auto,
}
93
/// Controls how work is batched and streamed to the device.
///
/// All fields are `usize`/`bool`, so the struct derives `PartialEq`/`Eq`
/// for comparison in configuration logic and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BatchSettings {
    /// Largest batch processed in a single submission.
    pub max_batch_size: usize,
    /// Smallest batch worth sending to the GPU — TODO confirm whether
    /// smaller inputs take a CPU path.
    pub min_batch_size: usize,
    /// Use multiple concurrent streams when true.
    pub multi_stream: bool,
    /// Number of streams to create when `multi_stream` is enabled.
    pub stream_count: usize,
    /// Overlap computation with data transfers when true.
    pub overlap_computation: bool,
}
108
/// Policy applied when a GPU operation fails.
///
/// Derives `PartialEq`/`Eq`/`Hash` so policies can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorHandling {
    /// Abort immediately on the first error.
    FailFast,
    /// Retry the operation and fall back on repeated failure.
    RetryFallback,
    /// Fall back silently (presumably to a CPU path) — TODO confirm whether
    /// the error is surfaced anywhere.
    GracefulFallback,
}
119
/// Cumulative performance statistics for GPU operations.
///
/// `Default` yields all-zero counters and zero durations. Derives
/// `PartialEq` (no `Eq`: contains `f64` fields) so snapshots can be compared.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct GpuPerformanceStats {
    /// Total number of GPU operations executed.
    pub total_operations: u64,
    /// Accumulated GPU execution time across all operations.
    pub total_gpu_time: Duration,
    /// Number of host/device memory transfers performed.
    pub memory_transfers: u64,
    /// Total bytes moved across all memory transfers.
    pub total_memory_transferred: usize,
    /// Number of kernel launches issued.
    pub kernel_launches: u64,
    /// Average time per kernel launch.
    pub avg_kernel_time: Duration,
    /// Cache hit rate — presumably a fraction in [0, 1]; confirm against the
    /// code that populates it.
    pub cache_hit_rate: f64,
    /// Memory bandwidth utilization — presumably a fraction in [0, 1];
    /// confirm against the producer.
    pub memory_bandwidth_utilization: f64,
}
140
/// Outcome of a GPU computation: the result payload plus timing and memory
/// telemetry for the run.
#[derive(Debug)]
pub struct GpuComputeResults<T> {
    /// The computed result payload.
    pub results: T,
    /// End-to-end execution time for the operation.
    pub execution_time: Duration,
    /// Device memory used, in bytes — TODO confirm peak vs. total.
    pub memory_used: usize,
    /// Per-kernel timing and utilization metrics.
    pub kernel_metrics: KernelMetrics,
    /// Host/device transfer metrics.
    pub transfer_metrics: TransferMetrics,
}
155
/// Timing and utilization metrics for a single kernel launch.
///
/// All fields are stdlib scalar types, so the struct derives `Clone` and
/// `PartialEq` (no `Eq`: contains floats) for aggregation and comparison.
#[derive(Debug, Clone, PartialEq)]
pub struct KernelMetrics {
    /// Host-side time spent issuing the launch.
    pub launch_time: Duration,
    /// Time the kernel spent executing on the device.
    pub execution_time: Duration,
    /// Achieved occupancy — presumably a fraction in [0, 1]; confirm.
    pub occupancy: f32,
    /// Achieved memory bandwidth — units not shown here (bytes/s? GB/s);
    /// confirm against the producer.
    pub memory_bandwidth: f64,
    /// Achieved floating-point throughput (FLOPS).
    pub flops: f64,
}
170
/// Host/device data transfer metrics for one operation.
///
/// Derives `Clone` and `PartialEq` (no `Eq`: contains `f64`) so transfer
/// stats can be copied and compared alongside [`KernelMetrics`].
#[derive(Debug, Clone, PartialEq)]
pub struct TransferMetrics {
    /// Time spent on host-to-device transfers.
    pub h2d_time: Duration,
    /// Time spent on device-to-host transfers.
    pub d2h_time: Duration,
    /// Bytes transferred host-to-device.
    pub h2d_bytes: usize,
    /// Bytes transferred device-to-host.
    pub d2h_bytes: usize,
    /// Achieved transfer bandwidth — units not shown here; confirm against
    /// the producer.
    pub bandwidth: f64,
}
185
/// Backend actually selected for a computation.
///
/// Derives `PartialEq`/`Eq`/`Hash` so the chosen strategy can be compared
/// and used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ComputeStrategy {
    /// Execute via CUDA.
    Cuda,
    /// Execute via OpenCL.
    OpenCl,
    /// Fallback path (presumably CPU) when no GPU backend is usable —
    /// TODO confirm.
    Fallback,
}
193
194impl Default for GpuComputeConfig {
195 fn default() -> Self {
196 Self {
197 preferred_api: GpuApi::Auto,
198 memory_strategy: MemoryStrategy::Pool {
199 initial_size: 256 * 1024 * 1024, max_size: 2 * 1024 * 1024 * 1024, },
202 kernel_optimization: KernelOptimization {
203 fast_math: true,
204 vectorization: VectorizationLevel::Auto,
205 optimize_occupancy: true,
206 use_shared_memory: true,
207 memory_coalescing: true,
208 },
209 batch_settings: BatchSettings {
210 max_batch_size: 1024 * 1024,
211 min_batch_size: 1000,
212 multi_stream: true,
213 stream_count: 4,
214 overlap_computation: true,
215 },
216 error_handling: ErrorHandling::RetryFallback,
217 }
218 }
219}
220
221impl Default for KernelConfig {
222 fn default() -> Self {
223 Self {
224 block_size: (256, 1, 1),
225 grid_size: (1, 1, 1),
226 shared_memory_size: 0,
227 async_execution: true,
228 use_pinned_memory: true,
229 optimization_level: 2,
230 }
231 }
232}