#![allow(clippy::too_many_arguments)]
#![allow(dead_code)]
use std::time::Duration;
/// Launch parameters for a single kernel invocation.
///
/// Baseline values are provided by the `Default` implementation for this type.
#[derive(Debug, Clone)]
pub struct KernelConfig {
    /// Block dimensions as a 3-tuple (presumably x, y, z — confirm with the launch code).
    pub block_size: (u32, u32, u32),
    /// Grid dimensions as a 3-tuple (presumably x, y, z — confirm with the launch code).
    pub grid_size: (u32, u32, u32),
    /// Requested shared memory size (unit is not stated here; presumably bytes).
    pub shared_memory_size: u32,
    /// Flag requesting asynchronous execution.
    pub async_execution: bool,
    /// Flag requesting pinned memory.
    pub use_pinned_memory: bool,
    /// Optimization level; the valid range is not defined in this file.
    pub optimization_level: u8,
}
/// Top-level configuration bundle for GPU compute.
///
/// Groups the API preference, memory strategy, kernel optimization flags,
/// batching settings and error-handling policy into one cloneable value.
#[derive(Debug, Clone)]
pub struct GpuComputeConfig {
    /// Which GPU API to prefer.
    pub preferred_api: GpuApi,
    /// How memory should be managed.
    pub memory_strategy: MemoryStrategy,
    /// Kernel-level optimization switches.
    pub kernel_optimization: KernelOptimization,
    /// Batch sizing and stream settings.
    pub batch_settings: BatchSettings,
    /// Policy to apply when an error occurs.
    pub error_handling: ErrorHandling,
}
/// GPU API that a configuration can express a preference for.
///
/// Derives `PartialEq`/`Eq` so a value can be compared with `==`/`!=`
/// (and used in `assert_eq!`) instead of requiring a `match` for every check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuApi {
    /// No explicit preference (presumably the implementation picks one —
    /// confirm against the selection logic).
    Auto,
    /// CUDA.
    Cuda,
    /// OpenCL.
    OpenCl,
    /// Metal.
    Metal,
    /// Vulkan.
    Vulkan,
}
/// Memory management strategy selectable through the compute configuration.
#[derive(Debug, Clone)]
pub enum MemoryStrategy {
    /// Pooled memory bounded by an initial and a maximum size.
    Pool {
        /// Starting size of the pool (unit not stated here; presumably bytes).
        initial_size: usize,
        /// Upper bound for the pool (same unit as `initial_size`).
        max_size: usize,
    },
    /// Presumably allocates when needed rather than up front — confirm with the allocator.
    OnDemand,
    /// Presumably unified (host/device shared) memory — confirm with the backend.
    Unified,
    /// Presumably mapped memory — confirm with the backend.
    Mapped,
}
/// Switches controlling how kernels are optimized.
#[derive(Debug, Clone)]
pub struct KernelOptimization {
    /// Flag enabling fast-math (exact effect depends on the consumer of this flag).
    pub fast_math: bool,
    /// Requested vectorization level.
    pub vectorization: VectorizationLevel,
    /// Flag requesting occupancy optimization.
    pub optimize_occupancy: bool,
    /// Flag requesting use of shared memory.
    pub use_shared_memory: bool,
    /// Flag requesting memory coalescing.
    pub memory_coalescing: bool,
}
/// Vectorization level requested for kernels.
///
/// Derives `PartialEq`/`Eq` so levels can be compared with `==`/`!=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VectorizationLevel {
    /// No vectorization requested.
    None,
    /// Presumably 2-wide float vectors — confirm with the kernel generator.
    Float2,
    /// Presumably 4-wide float vectors — confirm with the kernel generator.
    Float4,
    /// Presumably 8-wide float vectors — confirm with the kernel generator.
    Float8,
    /// No explicit level (presumably chosen by the implementation — confirm).
    Auto,
}
/// Batch sizing and stream settings.
#[derive(Debug, Clone)]
pub struct BatchSettings {
    /// Largest batch size allowed (unit of "size" is not defined in this file).
    pub max_batch_size: usize,
    /// Smallest batch size (presumably the threshold below which batching is
    /// not worthwhile — confirm with the dispatcher).
    pub min_batch_size: usize,
    /// Flag enabling multiple streams.
    pub multi_stream: bool,
    /// Number of streams.
    pub stream_count: usize,
    /// Flag requesting overlapped computation (presumably with transfers — confirm).
    pub overlap_computation: bool,
}
/// Policy describing how errors should be handled.
///
/// Derives `PartialEq`/`Eq` so policies can be compared with `==`/`!=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Presumably stops at the first error — confirm with the code that reads this policy.
    FailFast,
    /// Presumably retries and then falls back — confirm with the code that reads this policy.
    RetryFallback,
    /// Presumably falls back without surfacing the error — confirm with the code that reads this policy.
    GracefulFallback,
}
/// Aggregate performance counters.
///
/// `Default` yields all-zero counters, zero durations and `0.0` ratios.
#[derive(Debug, Default, Clone)]
pub struct GpuPerformanceStats {
    /// Count of operations.
    pub total_operations: u64,
    /// Accumulated GPU time.
    pub total_gpu_time: Duration,
    /// Count of memory transfers.
    pub memory_transfers: u64,
    /// Total amount of memory transferred (unit not stated here; presumably bytes).
    pub total_memory_transferred: usize,
    /// Count of kernel launches.
    pub kernel_launches: u64,
    /// Average time per kernel.
    pub avg_kernel_time: Duration,
    /// Cache hit rate (scale not stated here; presumably a 0.0–1.0 fraction — confirm).
    pub cache_hit_rate: f64,
    /// Memory bandwidth utilization (scale not stated here — confirm).
    pub memory_bandwidth_utilization: f64,
}
/// Output of a compute call together with its measurements.
///
/// `T` is the payload type and is not constrained by this definition.
#[derive(Debug)]
pub struct GpuComputeResults<T> {
    /// The computed payload.
    pub results: T,
    /// Execution time recorded for the call (exact scope of the measurement
    /// is not defined in this file).
    pub execution_time: Duration,
    /// Amount of memory used (unit not stated here; presumably bytes).
    pub memory_used: usize,
    /// Kernel-side measurements.
    pub kernel_metrics: KernelMetrics,
    /// Transfer-side measurements.
    pub transfer_metrics: TransferMetrics,
}
/// Measurements for a kernel run.
///
/// Derives `Default` and `Clone` in line with `GpuPerformanceStats`, so a
/// zeroed value can be created with `KernelMetrics::default()` and metrics
/// can be copied out of a result.
#[derive(Debug, Default, Clone)]
pub struct KernelMetrics {
    /// Time attributed to launching the kernel.
    pub launch_time: Duration,
    /// Time attributed to executing the kernel.
    pub execution_time: Duration,
    /// Occupancy (scale not stated here; presumably a 0.0–1.0 fraction — confirm).
    pub occupancy: f32,
    /// Memory bandwidth (unit not stated here — confirm with the producer).
    pub memory_bandwidth: f64,
    /// Floating-point throughput (unit not stated here — confirm with the producer).
    pub flops: f64,
}
/// Measurements for memory transfers in both directions.
///
/// Derives `Default` and `Clone` in line with `GpuPerformanceStats`, so a
/// zeroed value can be created with `TransferMetrics::default()` and metrics
/// can be copied out of a result.
#[derive(Debug, Default, Clone)]
pub struct TransferMetrics {
    /// Time spent on "h2d" transfers (presumably host-to-device — confirm).
    pub h2d_time: Duration,
    /// Time spent on "d2h" transfers (presumably device-to-host — confirm).
    pub d2h_time: Duration,
    /// Bytes moved in the "h2d" direction.
    pub h2d_bytes: usize,
    /// Bytes moved in the "d2h" direction.
    pub d2h_bytes: usize,
    /// Transfer bandwidth (unit not stated here — confirm with the producer).
    pub bandwidth: f64,
}
/// Execution path chosen for a computation.
///
/// Derives `PartialEq`/`Eq` so the chosen strategy can be compared with
/// `==`/`!=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComputeStrategy {
    /// CUDA path.
    Cuda,
    /// OpenCL path.
    OpenCl,
    /// Fallback path (presumably a non-GPU implementation — confirm with the dispatcher).
    Fallback,
}
impl Default for GpuComputeConfig {
fn default() -> Self {
Self {
preferred_api: GpuApi::Auto,
memory_strategy: MemoryStrategy::Pool {
initial_size: 256 * 1024 * 1024, max_size: 2 * 1024 * 1024 * 1024, },
kernel_optimization: KernelOptimization {
fast_math: true,
vectorization: VectorizationLevel::Auto,
optimize_occupancy: true,
use_shared_memory: true,
memory_coalescing: true,
},
batch_settings: BatchSettings {
max_batch_size: 1024 * 1024,
min_batch_size: 1000,
multi_stream: true,
stream_count: 4,
overlap_computation: true,
},
error_handling: ErrorHandling::RetryFallback,
}
}
}
impl Default for KernelConfig {
fn default() -> Self {
Self {
block_size: (256, 1, 1),
grid_size: (1, 1, 1),
shared_memory_size: 0,
async_execution: true,
use_pinned_memory: true,
optimization_level: 2,
}
}
}