scirs2_metrics/optimization/gpu_kernels/
config.rs

1//! GPU kernel configuration and performance metrics
2//!
3//! This module provides configuration structures and performance tracking
4//! for GPU kernel execution across different backends.
5
6#![allow(clippy::too_many_arguments)]
7#![allow(dead_code)]
8
9use std::time::Duration;
10
/// GPU kernel configuration
///
/// Launch parameters for a single kernel dispatch. Terminology maps across
/// backends: `block_size`/`grid_size` are CUDA names; OpenCL calls these the
/// work-group size and global work size. All fields are plain `Copy` data,
/// so the config derives `Copy`/`PartialEq` for cheap passing and comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct KernelConfig {
    /// Block size for CUDA / Work group size for OpenCL
    pub block_size: (u32, u32, u32),
    /// Grid size for CUDA / Global work size for OpenCL
    pub grid_size: (u32, u32, u32),
    /// Shared memory size (bytes requested per block)
    pub shared_memory_size: u32,
    /// Use asynchronous execution
    pub async_execution: bool,
    /// Memory transfer optimization via pinned (page-locked) host buffers
    pub use_pinned_memory: bool,
    /// Kernel optimization level (0 = none; higher = more aggressive)
    pub optimization_level: u8,
}
27
/// GPU compute configuration
///
/// Top-level bundle of policies for dispatching work to the GPU: which API
/// to prefer, how device memory is allocated, how kernels are tuned, how
/// work is batched, and what happens on failure. A tuned default is
/// provided by this module's `Default` impl.
#[derive(Debug, Clone)]
pub struct GpuComputeConfig {
    /// Preferred API (CUDA, OpenCL, Auto)
    pub preferred_api: GpuApi,
    /// Memory allocation strategy
    pub memory_strategy: MemoryStrategy,
    /// Kernel optimization settings
    pub kernel_optimization: KernelOptimization,
    /// Batch processing settings
    pub batch_settings: BatchSettings,
    /// Error handling strategy
    pub error_handling: ErrorHandling,
}
42
/// GPU API preference
///
/// Selects which compute backend the dispatcher should try first.
/// Derives `PartialEq`/`Eq`/`Hash` so preferences can be compared and used
/// as map keys when selecting a backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuApi {
    /// Let the runtime pick the best available backend
    Auto,
    /// NVIDIA CUDA
    Cuda,
    /// OpenCL
    OpenCl,
    /// Metal — for macOS support
    Metal,
    /// Vulkan — for advanced compute
    Vulkan,
}
52
/// Memory allocation strategy
///
/// Controls how device memory is obtained for GPU work. All variant
/// payloads are `Copy`, so the enum derives `Copy`/`PartialEq` for easy
/// configuration comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MemoryStrategy {
    /// Pool of pre-allocated blocks
    Pool {
        /// Bytes reserved when the pool is created
        initial_size: usize,
        /// Upper bound (bytes) the pool may grow to
        max_size: usize,
    },
    /// Allocate on demand
    OnDemand,
    /// Use unified memory (CUDA)
    Unified,
    /// Memory mapping
    Mapped,
}
68
69/// Kernel optimization settings
70#[derive(Debug, Clone)]
71pub struct KernelOptimization {
72    /// Use fast math operations
73    pub fast_math: bool,
74    /// Vectorization level
75    pub vectorization: VectorizationLevel,
76    /// Occupancy optimization
77    pub optimize_occupancy: bool,
78    /// Use shared memory optimizations
79    pub use_shared_memory: bool,
80    /// Memory coalescing optimization
81    pub memory_coalescing: bool,
82}
83
/// Vectorization level
///
/// Width of the vector types used when generating/launching kernels.
/// Derives `PartialEq`/`Eq`/`Hash` so levels can be compared and keyed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VectorizationLevel {
    /// No explicit vectorization
    None,
    /// 2-wide float vectors
    Float2,
    /// 4-wide float vectors
    Float4,
    /// 8-wide float vectors
    Float8,
    /// Let the backend choose a width
    Auto,
}
93
/// Batch processing settings
///
/// Governs how work is grouped before being sent to the GPU. All fields
/// are plain `Copy` data, so the struct derives `Copy`/`PartialEq` for
/// cheap passing and comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BatchSettings {
    /// Maximum batch size
    pub max_batch_size: usize,
    /// Minimum batch size for GPU usage (smaller workloads stay on CPU)
    pub min_batch_size: usize,
    /// Use multi-stream processing
    pub multi_stream: bool,
    /// Stream count (only meaningful when `multi_stream` is set)
    pub stream_count: usize,
    /// Overlap computation and memory transfer
    pub overlap_computation: bool,
}
108
/// Error handling strategy
///
/// What the dispatcher does when a GPU operation fails. Derives
/// `PartialEq`/`Eq`/`Hash` so strategies can be compared in control flow
/// and tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorHandling {
    /// Fail fast on any error
    FailFast,
    /// Retry with fallback
    RetryFallback,
    /// Graceful degradation
    GracefulFallback,
}
119
/// GPU performance statistics
///
/// Cumulative counters and derived rates for GPU activity. All fields are
/// `Copy` (`Duration`, integers, floats), so the struct derives
/// `Copy`/`PartialEq`; `Eq` is not derivable because of the `f64` fields.
#[derive(Debug, Default, Clone, Copy, PartialEq)]
pub struct GpuPerformanceStats {
    /// Total GPU operations performed
    pub total_operations: u64,
    /// Total GPU time
    pub total_gpu_time: Duration,
    /// Memory transfers performed
    pub memory_transfers: u64,
    /// Total memory transferred (bytes)
    pub total_memory_transferred: usize,
    /// Kernel launch count
    pub kernel_launches: u64,
    /// Average kernel execution time
    pub avg_kernel_time: Duration,
    /// Cache hit rate (presumably a fraction in [0, 1] — confirm with producer)
    pub cache_hit_rate: f64,
    /// Memory bandwidth utilization (presumably a fraction in [0, 1] — confirm)
    pub memory_bandwidth_utilization: f64,
}
140
/// GPU computation results with detailed metrics
///
/// Wraps the computed payload `T` together with the timing, memory,
/// kernel, and transfer statistics recorded for that run.
#[derive(Debug)]
pub struct GpuComputeResults<T> {
    /// Computation results
    pub results: T,
    /// Execution time for the whole operation
    pub execution_time: Duration,
    /// Memory usage in bytes (presumably peak device usage — confirm with producer)
    pub memory_used: usize,
    /// Kernel performance metrics
    pub kernel_metrics: KernelMetrics,
    /// Transfer metrics
    pub transfer_metrics: TransferMetrics,
}
155
/// Kernel execution metrics
///
/// Per-launch performance measurements. The original derived only `Debug`,
/// which made the metrics impossible to copy out of a result or compare;
/// all fields are `Copy`, so `Clone`/`Copy`/`PartialEq` are derived.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct KernelMetrics {
    /// Kernel launch time
    pub launch_time: Duration,
    /// Kernel execution time
    pub execution_time: Duration,
    /// Occupancy achieved (presumably a fraction in [0, 1] — confirm with producer)
    pub occupancy: f32,
    /// Memory bandwidth achieved
    pub memory_bandwidth: f64,
    /// FLOPS achieved
    pub flops: f64,
}
170
/// Memory transfer metrics
///
/// Host<->device transfer measurements for one operation. The original
/// derived only `Debug`, preventing copy/compare; all fields are `Copy`,
/// so `Clone`/`Copy`/`PartialEq` are derived.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TransferMetrics {
    /// Host to device transfer time
    pub h2d_time: Duration,
    /// Device to host transfer time
    pub d2h_time: Duration,
    /// Bytes transferred H2D
    pub h2d_bytes: usize,
    /// Bytes transferred D2H
    pub d2h_bytes: usize,
    /// Transfer bandwidth achieved
    pub bandwidth: f64,
}
185
/// Compute strategy selection
///
/// The backend actually chosen for a computation (as opposed to the
/// *preference* expressed by `GpuApi`). Derives `PartialEq`/`Eq`/`Hash`
/// so the selected strategy can be compared and branched on.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ComputeStrategy {
    /// Execute via CUDA
    Cuda,
    /// Execute via OpenCL
    OpenCl,
    /// CPU fallback path
    Fallback,
}
193
194impl Default for GpuComputeConfig {
195    fn default() -> Self {
196        Self {
197            preferred_api: GpuApi::Auto,
198            memory_strategy: MemoryStrategy::Pool {
199                initial_size: 256 * 1024 * 1024,  // 256MB
200                max_size: 2 * 1024 * 1024 * 1024, // 2GB
201            },
202            kernel_optimization: KernelOptimization {
203                fast_math: true,
204                vectorization: VectorizationLevel::Auto,
205                optimize_occupancy: true,
206                use_shared_memory: true,
207                memory_coalescing: true,
208            },
209            batch_settings: BatchSettings {
210                max_batch_size: 1024 * 1024,
211                min_batch_size: 1000,
212                multi_stream: true,
213                stream_count: 4,
214                overlap_computation: true,
215            },
216            error_handling: ErrorHandling::RetryFallback,
217        }
218    }
219}
220
221impl Default for KernelConfig {
222    fn default() -> Self {
223        Self {
224            block_size: (256, 1, 1),
225            grid_size: (1, 1, 1),
226            shared_memory_size: 0,
227            async_execution: true,
228            use_pinned_memory: true,
229            optimization_level: 2,
230        }
231    }
232}