use super::{
cpu::CpuInfo, gpu::GpuInfo, memory::MemoryInfo, network::NetworkInfo, storage::StorageInfo,
};
use crate::error::CoreResult;
#[derive(Debug, Clone)]
pub struct OptimizationParams {
pub thread_count: usize,
pub chunk_size: usize,
pub enable_simd: bool,
pub enable_gpu: bool,
pub enable_prefetch: bool,
pub numa_aware: bool,
pub cache_params: CacheParams,
pub io_params: IoParams,
pub gpu_params: Option<GpuParams>,
}
impl Default for OptimizationParams {
fn default() -> Self {
Self {
thread_count: std::thread::available_parallelism()
.map(|n| n.get())
.unwrap_or(4),
chunk_size: 64 * 1024, enable_simd: false,
enable_gpu: false,
enable_prefetch: true,
numa_aware: false,
cache_params: CacheParams::default(),
io_params: IoParams::default(),
gpu_params: None,
}
}
}
impl OptimizationParams {
pub fn generate(
cpu: &CpuInfo,
memory: &MemoryInfo,
gpu: Option<&GpuInfo>,
network: &NetworkInfo,
storage: &StorageInfo,
) -> CoreResult<Self> {
let thread_count = Self::calculate_optimal_thread_count(cpu, memory);
let chunk_size = Self::calculate_optimal_chunk_size(cpu, memory, storage);
let enable_simd = Self::should_enable_simd(cpu);
let enable_gpu = Self::should_enable_gpu(gpu);
let enable_prefetch = Self::should_enable_prefetch(memory, storage);
let numa_aware = memory.numa_nodes > 1;
let cache_params = CacheParams::from_cpu(cpu);
let io_params = IoParams::from_resources(network, storage);
let gpu_params = gpu.map(GpuParams::from_gpu);
Ok(Self {
thread_count,
chunk_size,
enable_simd,
enable_gpu,
enable_prefetch,
numa_aware,
cache_params,
io_params,
gpu_params,
})
}
fn calculate_optimal_thread_count(cpu: &CpuInfo, memory: &MemoryInfo) -> usize {
let base_threads = cpu.physical_cores;
let ht_benefit = if cpu.logical_cores > cpu.physical_cores {
(cpu.logical_cores - cpu.physical_cores) / 2
} else {
0
};
let memory_factor = if memory.is_under_pressure() {
0.75 } else {
1.0
};
let optimal = ((base_threads + ht_benefit) as f64 * memory_factor) as usize;
optimal.max(1).min(cpu.logical_cores)
}
fn calculate_optimal_chunk_size(
cpu: &CpuInfo,
memory: &MemoryInfo,
storage: &StorageInfo,
) -> usize {
let cachebased = cpu.cache_l3_kb * 1024 / 4;
let memorybased = memory.optimal_chunk_size();
let storagebased = storage.optimal_io_size;
let geometric_mean = ((cachebased as f64 * memorybased as f64 * storagebased as f64)
.powf(1.0 / 3.0)) as usize;
geometric_mean.clamp(4 * 1024, 64 * 1024 * 1024)
}
fn should_enable_simd(cpu: &CpuInfo) -> bool {
cpu.simd_capabilities.sse4_2 || cpu.simd_capabilities.avx2 || cpu.simd_capabilities.neon
}
fn should_enable_gpu(gpuinfo: Option<&GpuInfo>) -> bool {
gpuinfo.map(|g| g.is_compute_capable()).unwrap_or(false)
}
fn should_enable_prefetch(memory: &MemoryInfo, storage: &StorageInfo) -> bool {
!memory.is_under_pressure() && storage.supports_async_io()
}
pub fn get_scaling_factor(problemsize: usize) -> f64 {
let base_size = 1024 * 1024; if problemsize <= base_size {
1.0
} else {
let ratio = problemsize as f64 / base_size as f64;
ratio.sqrt()
}
}
pub fn scaling_factor(&self, problemsize: usize) -> f64 {
Self::get_scaling_factor(problemsize)
}
pub fn adjust_for_workload(&mut self, workload: WorkloadType) {
match workload {
WorkloadType::CpuIntensive => {
self.thread_count = self.thread_count.max(
std::thread::available_parallelism()
.map(|n| n.get())
.unwrap_or(4),
);
self.chunk_size = self.chunk_size.max(1024 * 1024); }
WorkloadType::MemoryIntensive => {
self.enable_prefetch = true;
self.chunk_size = self.chunk_size.min(256 * 1024); }
WorkloadType::IoIntensive => {
self.thread_count = (self.thread_count * 2).min(16); self.chunk_size = self.io_params.optimal_buffersize;
}
WorkloadType::GpuIntensive => {
if self.enable_gpu {
self.thread_count = self.thread_count.min(4); }
}
}
}
}
#[derive(Debug, Clone)]
pub struct CacheParams {
pub cache_line_size: usize,
pub alignment: usize,
pub prefetch_distance: usize,
pub tile_size: usize,
}
impl Default for CacheParams {
fn default() -> Self {
Self {
cache_line_size: 64,
alignment: 64,
prefetch_distance: 64,
tile_size: 64,
}
}
}
impl CacheParams {
pub fn from_cpu(cpu: &CpuInfo) -> Self {
let cache_line_size = 64; let alignment = cache_line_size;
let prefetch_distance = (cpu.cache_l1_kb * 1024 / 16).clamp(64, 1024);
let tile_size = (cpu.cache_l1_kb * 1024 / 8).clamp(64, 4096);
Self {
cache_line_size,
alignment,
prefetch_distance,
tile_size,
}
}
}
#[derive(Debug, Clone)]
pub struct IoParams {
pub optimal_buffersize: usize,
pub concurrent_operations: usize,
pub enable_async_io: bool,
pub enable_io_cache: bool,
}
impl Default for IoParams {
fn default() -> Self {
Self {
optimal_buffersize: 64 * 1024, concurrent_operations: 4,
enable_async_io: true,
enable_io_cache: true,
}
}
}
impl IoParams {
pub fn from_network(network: &NetworkInfo, storage: &StorageInfo) -> Self {
let optimal_buffersize = storage.optimal_io_size.max(network.mtu);
let concurrent_operations = storage.queue_depth.min(16);
let enable_async_io = storage.supports_async_io();
let enable_io_cache =
!storage.is_ssd() || storage.capacity > (512u64 * 1024 * 1024 * 1024) as usize;
Self {
optimal_buffersize,
concurrent_operations,
enable_async_io,
enable_io_cache,
}
}
pub fn from_resources(network: &NetworkInfo, storage: &StorageInfo) -> Self {
Self::from_network(network, storage)
}
}
#[derive(Debug, Clone)]
pub struct GpuParams {
pub workgroup_size: usize,
pub workgroup_count: usize,
pub shared_memory_size: usize,
pub use_unified_memory: bool,
pub transfer_strategy: GpuTransferStrategy,
}
impl GpuParams {
pub fn from_gpu(gpu: &GpuInfo) -> Self {
let workgroup_size = gpu.optimal_workgroup_size();
let workgroup_count = (gpu.compute_units * 4).min(65535); let shared_memory_size = 16 * 1024; let use_unified_memory = gpu.features.unified_memory;
let transfer_strategy = if gpu.memorybandwidth_gbps > 500.0 {
GpuTransferStrategy::HighBandwidth
} else if use_unified_memory {
GpuTransferStrategy::Unified
} else {
GpuTransferStrategy::Standard
};
Self {
workgroup_size,
workgroup_count,
shared_memory_size,
use_unified_memory,
transfer_strategy,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuTransferStrategy {
Standard,
HighBandwidth,
Unified,
ZeroCopy,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WorkloadType {
CpuIntensive,
MemoryIntensive,
IoIntensive,
GpuIntensive,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_optimization_params_generation() {
let cpu = CpuInfo::default();
let memory = MemoryInfo::default();
let gpu = Some(GpuInfo::default());
let network = NetworkInfo::default();
let storage = StorageInfo::default();
let params = OptimizationParams::generate(&cpu, &memory, gpu.as_ref(), &network, &storage);
assert!(params.is_ok());
let params = params.expect("Operation failed");
assert!(params.thread_count > 0);
assert!(params.chunk_size > 0);
}
#[test]
fn test_thread_count_calculation() {
let cpu = CpuInfo {
physical_cores: 8,
logical_cores: 16,
..Default::default()
};
let memory = MemoryInfo::default();
let thread_count = OptimizationParams::calculate_optimal_thread_count(&cpu, &memory);
assert!(thread_count >= 8);
assert!(thread_count <= 16);
}
#[test]
fn test_chunk_size_calculation() {
let cpu = CpuInfo {
cache_l3_kb: 8192, ..Default::default()
};
let memory = MemoryInfo::default();
let storage = StorageInfo::default();
let chunk_size = OptimizationParams::calculate_optimal_chunk_size(&cpu, &memory, &storage);
assert!(chunk_size >= 4 * 1024); assert!(chunk_size <= 64 * 1024 * 1024); }
#[test]
fn test_workload_adjustment() {
let mut params = OptimizationParams::default();
let original_thread_count = params.thread_count;
params.adjust_for_workload(WorkloadType::CpuIntensive);
assert!(params.thread_count >= original_thread_count);
params.adjust_for_workload(WorkloadType::MemoryIntensive);
assert!(params.enable_prefetch);
}
#[test]
fn test_cache_params() {
let cpu = CpuInfo {
cache_l1_kb: 32,
..Default::default()
};
let cache_params = CacheParams::from_cpu(&cpu);
assert_eq!(cache_params.cache_line_size, 64);
assert!(cache_params.tile_size > 0);
}
#[test]
fn test_gpu_params() {
let gpu = GpuInfo {
vendor: super::super::gpu::GpuVendor::Nvidia,
compute_units: 2048,
features: super::super::gpu::GpuFeatures {
unified_memory: true,
..Default::default()
},
..Default::default()
};
let gpu_params = GpuParams::from_gpu(&gpu);
assert_eq!(gpu_params.workgroup_size, 256); assert!(gpu_params.use_unified_memory);
assert_eq!(gpu_params.transfer_strategy, GpuTransferStrategy::Unified);
}
#[test]
fn test_scaling_factor() {
let params = OptimizationParams::default();
assert_eq!(params.scaling_factor(1024), 1.0); assert!(params.scaling_factor(1024 * 1024 * 4) > 1.0); }
}