use std::fmt::Debug;
/// Tunable settings controlling how work is dispatched to a GPU device.
#[derive(Debug, Clone)]
pub struct GpuConfig {
    /// Index of the device to run on (0 is the first device).
    pub device_id: usize,
    /// Explicit memory-pool size (presumably bytes — confirm against the
    /// allocator); `None` leaves pool sizing to the backend.
    pub memory_pool_size: Option<usize>,
    /// Whether memory-usage optimizations are applied.
    pub enable_memory_optimization: bool,
    /// Number of items processed per batch.
    pub batch_size: usize,
    /// Run computations in FP16 instead of full precision.
    pub use_half_precision: bool,
    /// Allow asynchronous (non-blocking) execution.
    pub enable_async: bool,
    /// Tensor-core specific settings (see [`TensorCoresConfig`]).
    pub tensor_cores: TensorCoresConfig,
    /// How device memory is allocated (see [`MemoryStrategy`]).
    pub memory_strategy: MemoryStrategy,
    /// Allow batch sizes to be adjusted at runtime.
    pub dynamic_batching: bool,
    /// How aggressively the computation graph is optimized.
    pub graph_optimization: GraphOptimizationLevel,
}
/// How aggressively the computation graph is optimized before execution.
///
/// Exact semantics of each tier are backend-defined (not shown in this file);
/// `Extended` is the level used by `GpuConfig::default()`.
///
/// `PartialEq`/`Eq` are derived so callers can compare configured levels —
/// the other fieldless enums in this module are already comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GraphOptimizationLevel {
    /// No graph optimization.
    None,
    /// Basic optimization tier.
    Basic,
    /// Extended optimization tier (default for `GpuConfig`).
    Extended,
    /// Maximum optimization tier.
    Maximum,
}
impl Default for GpuConfig {
fn default() -> Self {
Self {
device_id: 0,
memory_pool_size: None,
enable_memory_optimization: true,
batch_size: 1024,
use_half_precision: false,
enable_async: true,
tensor_cores: TensorCoresConfig::default(),
memory_strategy: MemoryStrategy::OnDemand,
dynamic_batching: true,
graph_optimization: GraphOptimizationLevel::Extended,
}
}
}
/// How device memory is obtained for GPU workloads.
///
/// `PartialEq`/`Eq` are derived (the only payload is a `usize`, so both are
/// sound) for consistency with `GpuBackend`, which is already comparable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MemoryStrategy {
    /// Allocate buffers as they are needed.
    OnDemand,
    /// Reserve a fixed-size pool up front.
    PreAllocated {
        /// Total pool size (presumably bytes — confirm against the allocator).
        pool_size: usize,
    },
    /// Unified (host/device shared) memory.
    Unified,
    /// Pinned (page-locked) host memory.
    Pinned,
}
/// Compute backend used to execute GPU work.
///
/// Fieldless, so `Copy` is derived (avoids needless clones), along with
/// `Eq`/`Hash` so a backend can key a `HashMap`/`HashSet` — e.g. a
/// per-backend capability cache.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuBackend {
    /// NVIDIA CUDA.
    Cuda,
    /// AMD ROCm.
    Rocm,
    /// OpenCL.
    OpenCL,
    /// Apple Metal.
    Metal,
    /// No GPU available; execute on the CPU instead.
    CpuFallback,
}
/// Hardware capabilities reported for a GPU device.
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
    /// Which compute backend this device uses.
    pub backend: GpuBackend,
    /// Compute capability as (major, minor); `None` when the backend has no
    /// such notion.
    pub compute_capability: Option<(u32, u32)>,
    /// Device memory (presumably total bytes — confirm against the probe
    /// that fills this in).
    pub memory: usize,
    /// Number of multiprocessors / compute units.
    pub multiprocessors: usize,
    /// Whether FP16 arithmetic is supported.
    pub supports_fp16: bool,
    /// Whether tensor cores are available.
    pub supports_tensor_cores: bool,
    /// Maximum threads allowed per block.
    pub max_threads_per_block: usize,
    /// Tensor-core generation, if tensor cores exist.
    pub tensor_cores_generation: Option<TensorCoresGeneration>,
    /// Memory bandwidth (units not specified here — likely GB/s; verify at
    /// the producer).
    pub memory_bandwidth: f64,
    /// Peak tensor-core throughput, if known (units not specified here).
    pub tensor_performance: Option<f64>,
}
/// Tensor-core hardware generation, ordered oldest (`V1`) to newest (`V4`).
///
/// Later generations support more data types and tile shapes (see
/// `supported_data_types` / `supported_matrix_dimensions`).
///
/// `PartialEq`/`Eq` are derived so generations can be compared directly,
/// matching the file's other fieldless enums.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TensorCoresGeneration {
    /// First generation.
    V1,
    /// Second generation.
    V2,
    /// Third generation.
    V3,
    /// Fourth generation.
    V4,
}
impl TensorCoresGeneration {
pub fn supported_data_types(&self) -> Vec<TensorDataType> {
match self {
TensorCoresGeneration::V1 => vec![TensorDataType::FP16],
TensorCoresGeneration::V2 => vec![TensorDataType::FP16, TensorDataType::INT8],
TensorCoresGeneration::V3 => vec![
TensorDataType::FP16,
TensorDataType::BF16,
TensorDataType::INT8,
TensorDataType::INT4,
TensorDataType::FP64,
],
TensorCoresGeneration::V4 => vec![
TensorDataType::FP16,
TensorDataType::BF16,
TensorDataType::INT8,
TensorDataType::INT4,
TensorDataType::FP8,
TensorDataType::FP64,
],
}
}
pub fn supported_matrix_dimensions(&self) -> Vec<(usize, usize, usize)> {
match self {
TensorCoresGeneration::V1 => vec![(16, 16, 16)],
TensorCoresGeneration::V2 => vec![(16, 16, 16), (8, 32, 16), (32, 8, 16)],
TensorCoresGeneration::V3 | TensorCoresGeneration::V4 => vec![
(16, 16, 16),
(8, 32, 16),
(32, 8, 16),
(16, 8, 8),
(8, 8, 4),
],
}
}
}
/// Numeric format tag for tensor-core computation.
///
/// This is a fieldless tag (no float *values* are stored), so `Eq` and `Hash`
/// are sound; they are derived alongside the existing `PartialEq` so the type
/// can key a `HashMap`/`HashSet` (e.g. per-dtype kernel caches).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TensorDataType {
    /// 16-bit half precision float.
    FP16,
    /// bfloat16.
    BF16,
    /// 8-bit float.
    FP8,
    /// 64-bit double precision float.
    FP64,
    /// 8-bit integer.
    INT8,
    /// 4-bit integer.
    INT4,
}
/// Settings governing tensor-core usage.
#[derive(Debug, Clone)]
pub struct TensorCoresConfig {
    /// Whether tensor cores are used at all.
    pub enabled: bool,
    /// Data type used for tensor-core math.
    pub data_type: TensorDataType,
    /// Tile dimensions as a `(usize, usize, usize)` triple.
    pub tile_size: (usize, usize, usize),
    /// Enable mixed-precision computation.
    pub mixed_precision: bool,
    /// Loss-scaling factor used with mixed precision.
    pub loss_scale: f32,
    /// Enable automatic mixed precision.
    pub auto_mixed_precision: bool,
    /// Minimum matrix size before tensor cores are engaged (units not shown
    /// here — presumably a dimension threshold; verify at the call site).
    pub min_matrix_size: usize,
}
impl Default for TensorCoresConfig {
fn default() -> Self {
Self {
enabled: true,
data_type: TensorDataType::FP16,
tile_size: (16, 16, 16),
mixed_precision: true,
loss_scale: 65536.0,
auto_mixed_precision: true,
min_matrix_size: 512,
}
}
}