scirs2_series/gpu_acceleration/
config.rs1use std::fmt::Debug;
7
/// Configuration options for GPU-accelerated computation.
///
/// Defaults are provided via [`Default`]; see that impl for the concrete values.
#[derive(Debug, Clone)]
pub struct GpuConfig {
    /// Index of the GPU device to use (defaults to 0, the first device).
    pub device_id: usize,
    /// Optional fixed memory-pool size; `None` means no explicit pool.
    /// Units are not shown here — presumably bytes; confirm against the allocator.
    pub memory_pool_size: Option<usize>,
    /// Whether memory-usage optimizations are enabled (defaults to `true`).
    pub enable_memory_optimization: bool,
    /// Batch size used when processing data (defaults to 1024).
    pub batch_size: usize,
    /// Use 16-bit (half-precision) floats where possible (defaults to `false`).
    pub use_half_precision: bool,
    /// Allow asynchronous (non-blocking) GPU execution (defaults to `true`).
    pub enable_async: bool,
    /// Tensor-core specific settings.
    pub tensor_cores: TensorCoresConfig,
    /// How GPU memory is allocated and managed (defaults to `OnDemand`).
    pub memory_strategy: MemoryStrategy,
    /// Adjust batch sizes dynamically at runtime (defaults to `true`).
    pub dynamic_batching: bool,
    /// Level of computation-graph optimization to apply (defaults to `Extended`).
    pub graph_optimization: GraphOptimizationLevel,
}
32
/// How aggressively the GPU computation graph is optimized.
#[derive(Debug, Clone, Copy)]
pub enum GraphOptimizationLevel {
    /// No graph optimization.
    None,
    /// Basic optimizations only.
    Basic,
    /// Extended optimizations (the default used by [`GpuConfig`]).
    Extended,
    /// All available optimizations.
    Maximum,
}
45
46impl Default for GpuConfig {
47 fn default() -> Self {
48 Self {
49 device_id: 0,
50 memory_pool_size: None,
51 enable_memory_optimization: true,
52 batch_size: 1024,
53 use_half_precision: false,
54 enable_async: true,
55 tensor_cores: TensorCoresConfig::default(),
56 memory_strategy: MemoryStrategy::OnDemand,
57 dynamic_batching: true,
58 graph_optimization: GraphOptimizationLevel::Extended,
59 }
60 }
61}
62
/// Strategy used for GPU memory management.
///
/// All payloads are `Copy`, so `Copy`/`PartialEq`/`Eq` are derived for
/// cheap comparison, matching the derive set of the sibling [`GpuBackend`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryStrategy {
    /// Allocate device memory on demand as operations request it.
    OnDemand,
    /// Pre-allocate a fixed pool up front.
    PreAllocated {
        /// Size of the pre-allocated pool.
        /// Units are not shown here — presumably bytes; confirm against the allocator.
        pool_size: usize,
    },
    /// Unified (host/device shared) memory.
    /// NOTE(review): exact semantics depend on the backend — confirm.
    Unified,
    /// Pinned (page-locked) host memory.
    /// NOTE(review): exact semantics depend on the backend — confirm.
    Pinned,
}
78
/// GPU compute backend.
///
/// Field-less enum, so `Copy`, `Eq`, and `Hash` are derived in addition to
/// the original `Clone`/`PartialEq` — this is purely additive and lets
/// callers compare backends cheaply and use them as map/set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuBackend {
    /// NVIDIA CUDA.
    Cuda,
    /// AMD ROCm.
    Rocm,
    /// OpenCL (vendor-neutral).
    OpenCL,
    /// Apple Metal.
    Metal,
    /// No GPU available; computation falls back to the CPU.
    CpuFallback,
}
93
/// Capabilities reported for a detected GPU device.
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
    /// Backend driving this device.
    pub backend: GpuBackend,
    /// Compute capability as `(major, minor)` when the backend reports one;
    /// `None` otherwise.
    pub compute_capability: Option<(u32, u32)>,
    /// Total device memory.
    /// Units are not shown here — presumably bytes; confirm with the probing code.
    pub memory: usize,
    /// Number of multiprocessors / compute units.
    pub multiprocessors: usize,
    /// Whether the device supports 16-bit floating point.
    pub supports_fp16: bool,
    /// Whether the device has tensor cores.
    pub supports_tensor_cores: bool,
    /// Maximum number of threads per block.
    pub max_threads_per_block: usize,
    /// Tensor-core generation, if tensor cores are present.
    pub tensor_cores_generation: Option<TensorCoresGeneration>,
    /// Memory bandwidth.
    /// Units are not shown here — presumably GB/s; confirm with the probing code.
    pub memory_bandwidth: f64,
    /// Peak tensor-core throughput, if known.
    /// Units are not shown here — presumably TFLOPS; confirm with the probing code.
    pub tensor_performance: Option<f64>,
}
118
/// Generation of tensor-core hardware, ordered oldest (`V1`) to newest (`V4`).
///
/// Later generations support strictly more data types and tile shapes; see
/// [`TensorCoresGeneration::supported_data_types`] and
/// [`TensorCoresGeneration::supported_matrix_dimensions`].
#[derive(Debug, Clone, Copy)]
pub enum TensorCoresGeneration {
    /// First generation: FP16 only, 16x16x16 tiles.
    V1,
    /// Second generation: adds INT8 and extra tile shapes.
    V2,
    /// Third generation: adds BF16, INT4 and FP64.
    V3,
    /// Fourth generation: additionally adds FP8.
    V4,
}
131
132impl TensorCoresGeneration {
133 pub fn supported_data_types(&self) -> Vec<TensorDataType> {
135 match self {
136 TensorCoresGeneration::V1 => vec![TensorDataType::FP16],
137 TensorCoresGeneration::V2 => vec![TensorDataType::FP16, TensorDataType::INT8],
138 TensorCoresGeneration::V3 => vec![
139 TensorDataType::FP16,
140 TensorDataType::BF16,
141 TensorDataType::INT8,
142 TensorDataType::INT4,
143 TensorDataType::FP64,
144 ],
145 TensorCoresGeneration::V4 => vec![
146 TensorDataType::FP16,
147 TensorDataType::BF16,
148 TensorDataType::INT8,
149 TensorDataType::INT4,
150 TensorDataType::FP8,
151 TensorDataType::FP64,
152 ],
153 }
154 }
155
156 pub fn supported_matrix_dimensions(&self) -> Vec<(usize, usize, usize)> {
158 match self {
159 TensorCoresGeneration::V1 => vec![(16, 16, 16)],
160 TensorCoresGeneration::V2 => vec![(16, 16, 16), (8, 32, 16), (32, 8, 16)],
161 TensorCoresGeneration::V3 | TensorCoresGeneration::V4 => vec![
162 (16, 16, 16),
163 (8, 32, 16),
164 (32, 8, 16),
165 (16, 8, 8),
166 (8, 8, 4),
167 ],
168 }
169 }
170}
171
/// Numeric data types usable with tensor cores.
///
/// `Eq` and `Hash` are derived in addition to the original
/// `Copy`/`PartialEq` — purely additive, and lets the type be used as a
/// set/map key (e.g. for capability lookups).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TensorDataType {
    /// 16-bit IEEE-754 half precision.
    FP16,
    /// 16-bit bfloat16.
    BF16,
    /// 8-bit floating point.
    FP8,
    /// 64-bit double precision.
    FP64,
    /// 8-bit integer.
    INT8,
    /// 4-bit integer.
    INT4,
}
188
/// Configuration for tensor-core usage.
///
/// Defaults are provided via [`Default`]; see that impl for the concrete values.
#[derive(Debug, Clone)]
pub struct TensorCoresConfig {
    /// Whether tensor cores should be used at all (defaults to `true`).
    pub enabled: bool,
    /// Data type used for tensor-core math (defaults to `FP16`).
    pub data_type: TensorDataType,
    /// Matrix tile size triple (defaults to `(16, 16, 16)`);
    /// presumably `(m, n, k)` — confirm against the kernel code.
    pub tile_size: (usize, usize, usize),
    /// Enable mixed-precision computation (defaults to `true`).
    pub mixed_precision: bool,
    /// Loss-scaling factor for mixed precision (defaults to 65536.0).
    pub loss_scale: f32,
    /// Automatically select precision per operation (defaults to `true`).
    pub auto_mixed_precision: bool,
    /// Minimum matrix size before tensor cores are engaged (defaults to 512).
    pub min_matrix_size: usize,
}
207
208impl Default for TensorCoresConfig {
209 fn default() -> Self {
210 Self {
211 enabled: true,
212 data_type: TensorDataType::FP16,
213 tile_size: (16, 16, 16),
214 mixed_precision: true,
215 loss_scale: 65536.0,
216 auto_mixed_precision: true,
217 min_matrix_size: 512,
218 }
219 }
220}