scirs2_series/gpu_acceleration/
config.rs

1//! GPU acceleration configuration and capabilities
2//!
3//! This module defines the configuration structures and enums for GPU acceleration,
4//! including device capabilities, tensor cores configuration, and optimization settings.
5
6use std::fmt::Debug;
7
/// GPU device configuration
///
/// Controls device selection, memory management, precision, and
/// optimization behavior for GPU-accelerated operations. Construct via
/// [`Default`] and override individual fields as needed.
#[derive(Debug, Clone)]
pub struct GpuConfig {
    /// Device ID to use (index of the GPU on the host; 0 selects the first device)
    pub device_id: usize,
    /// Memory pool size in bytes; `None` leaves sizing to the backend
    pub memory_pool_size: Option<usize>,
    /// Enable memory optimization
    pub enable_memory_optimization: bool,
    /// Batch size for GPU operations
    pub batch_size: usize,
    /// Use half precision (FP16) for faster computation
    pub use_half_precision: bool,
    /// Enable asynchronous execution
    pub enable_async: bool,
    /// Tensor cores configuration (see [`TensorCoresConfig`])
    pub tensor_cores: TensorCoresConfig,
    /// Memory allocation strategy (see [`MemoryStrategy`])
    pub memory_strategy: MemoryStrategy,
    /// Enable dynamic batch sizing
    pub dynamic_batching: bool,
    /// Graph optimization level (see [`GraphOptimizationLevel`])
    pub graph_optimization: GraphOptimizationLevel,
}
32
/// Graph optimization levels for GPU computation
///
/// Variants are declared from least to most aggressive, so the derived
/// [`Ord`] supports comparisons such as
/// `level >= GraphOptimizationLevel::Extended`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum GraphOptimizationLevel {
    /// No optimization
    None,
    /// Basic optimization
    Basic,
    /// Extended optimization
    Extended,
    /// Maximum optimization (may increase compile time)
    Maximum,
}
45
46impl Default for GpuConfig {
47    fn default() -> Self {
48        Self {
49            device_id: 0,
50            memory_pool_size: None,
51            enable_memory_optimization: true,
52            batch_size: 1024,
53            use_half_precision: false,
54            enable_async: true,
55            tensor_cores: TensorCoresConfig::default(),
56            memory_strategy: MemoryStrategy::OnDemand,
57            dynamic_batching: true,
58            graph_optimization: GraphOptimizationLevel::Extended,
59        }
60    }
61}
62
/// GPU memory management strategy
///
/// Derives `PartialEq`/`Eq` so strategies can be compared directly,
/// consistent with [`GpuBackend`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MemoryStrategy {
    /// Allocate memory on-demand
    OnDemand,
    /// Pre-allocate memory pool
    PreAllocated {
        /// Size of the memory pool in bytes
        pool_size: usize,
    },
    /// Use unified memory (if available)
    Unified,
    /// Use pinned host memory for transfers
    Pinned,
}
78
/// GPU computation backend
///
/// Fieldless enum, so it is `Copy` and hashable — usable directly as a
/// map/set key when tracking per-backend state.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuBackend {
    /// CUDA backend for NVIDIA GPUs
    Cuda,
    /// ROCm backend for AMD GPUs
    Rocm,
    /// OpenCL backend for cross-platform support
    OpenCL,
    /// Metal backend for Apple Silicon
    Metal,
    /// CPU fallback (no GPU acceleration)
    CpuFallback,
}
93
/// GPU acceleration capabilities
///
/// Describes what a detected device/backend supports, so callers can
/// decide whether features such as FP16 or tensor cores may be enabled.
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
    /// Available backend
    pub backend: GpuBackend,
    /// Compute capability as (major, minor) — only meaningful for CUDA
    pub compute_capability: Option<(u32, u32)>,
    /// Available memory in bytes
    pub memory: usize,
    /// Number of multiprocessors
    pub multiprocessors: usize,
    /// Supports half precision (FP16)
    pub supports_fp16: bool,
    /// Supports tensor cores
    pub supports_tensor_cores: bool,
    /// Maximum threads per block
    pub max_threads_per_block: usize,
    /// Tensor cores generation; presumably `None` when the device has no
    /// tensor cores — confirm at the detection site
    pub tensor_cores_generation: Option<TensorCoresGeneration>,
    /// Memory bandwidth (GB/s)
    pub memory_bandwidth: f64,
    /// Peak tensor performance (TOPS); `None` when unknown/not applicable
    pub tensor_performance: Option<f64>,
}
118
/// Tensor cores generation and capabilities
///
/// Variants are declared oldest-to-newest, so the derived [`Ord`] supports
/// feature gating such as `generation >= TensorCoresGeneration::V3`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TensorCoresGeneration {
    /// First generation (V100)
    V1,
    /// Second generation (T4, RTX 20xx)
    V2,
    /// Third generation (A100, RTX 30xx)
    V3,
    /// Fourth generation (H100, RTX 40xx)
    V4,
}
131
132impl TensorCoresGeneration {
133    /// Get supported data types for this generation
134    pub fn supported_data_types(&self) -> Vec<TensorDataType> {
135        match self {
136            TensorCoresGeneration::V1 => vec![TensorDataType::FP16],
137            TensorCoresGeneration::V2 => vec![TensorDataType::FP16, TensorDataType::INT8],
138            TensorCoresGeneration::V3 => vec![
139                TensorDataType::FP16,
140                TensorDataType::BF16,
141                TensorDataType::INT8,
142                TensorDataType::INT4,
143                TensorDataType::FP64,
144            ],
145            TensorCoresGeneration::V4 => vec![
146                TensorDataType::FP16,
147                TensorDataType::BF16,
148                TensorDataType::INT8,
149                TensorDataType::INT4,
150                TensorDataType::FP8,
151                TensorDataType::FP64,
152            ],
153        }
154    }
155
156    /// Get matrix dimensions supported by tensor cores
157    pub fn supported_matrix_dimensions(&self) -> Vec<(usize, usize, usize)> {
158        match self {
159            TensorCoresGeneration::V1 => vec![(16, 16, 16)],
160            TensorCoresGeneration::V2 => vec![(16, 16, 16), (8, 32, 16), (32, 8, 16)],
161            TensorCoresGeneration::V3 | TensorCoresGeneration::V4 => vec![
162                (16, 16, 16),
163                (8, 32, 16),
164                (32, 8, 16),
165                (16, 8, 8),
166                (8, 8, 4),
167            ],
168        }
169    }
170}
171
/// Tensor data types supported by tensor cores
///
/// Fieldless enum; `Eq`/`Hash` accompany the existing `PartialEq` so the
/// type can be used in sets/maps and compared totally.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TensorDataType {
    /// 16-bit floating point
    FP16,
    /// 16-bit brain floating point
    BF16,
    /// 8-bit floating point (FP8)
    FP8,
    /// 64-bit floating point
    FP64,
    /// 8-bit integer
    INT8,
    /// 4-bit integer
    INT4,
}
188
189/// Tensor cores optimization configuration
190#[derive(Debug, Clone)]
191pub struct TensorCoresConfig {
192    /// Enable tensor cores acceleration
193    pub enabled: bool,
194    /// Preferred data type for computation
195    pub data_type: TensorDataType,
196    /// Matrix dimensions to use for tiling
197    pub tile_size: (usize, usize, usize),
198    /// Enable mixed precision training
199    pub mixed_precision: bool,
200    /// Loss scaling for mixed precision
201    pub loss_scale: f32,
202    /// Enable automatic mixed precision
203    pub auto_mixed_precision: bool,
204    /// Minimum matrix size to use tensor cores
205    pub min_matrix_size: usize,
206}
207
208impl Default for TensorCoresConfig {
209    fn default() -> Self {
210        Self {
211            enabled: true,
212            data_type: TensorDataType::FP16,
213            tile_size: (16, 16, 16),
214            mixed_precision: true,
215            loss_scale: 65536.0,
216            auto_mixed_precision: true,
217            min_matrix_size: 512,
218        }
219    }
220}