use super::{
    cpu::CpuInfo, gpu::GpuInfo, memory::MemoryInfo, network::NetworkInfo, storage::StorageInfo,
};
use crate::error::CoreResult;

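/// Tunable performance parameters derived from the detected hardware resources.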
#[derive(Debug, Clone)]
pub struct OptimizationParams {
    /// Number of worker threads to use.
    pub thread_count: usize,
    /// Preferred processing chunk size in bytes.
    pub chunk_size: usize,
    /// Whether SIMD code paths should be enabled.
    pub enable_simd: bool,
    /// Whether GPU offloading should be enabled.
    pub enable_gpu: bool,
    /// Whether data prefetching should be enabled.
    pub enable_prefetch: bool,
    /// Whether NUMA-aware scheduling should be enabled.
    pub numa_aware: bool,
    /// CPU cache tuning parameters.
    pub cache_params: CacheParams,
    /// I/O tuning parameters.
    pub io_params: IoParams,
    /// GPU tuning parameters, if a compute-capable GPU is present.
    pub gpu_params: Option<GpuParams>,
}

impl Default for OptimizationParams {
    fn default() -> Self {
        Self {
            thread_count: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(4),
            chunk_size: 64 * 1024,
            enable_simd: false,
            enable_gpu: false,
            enable_prefetch: true,
            numa_aware: false,
            cache_params: CacheParams::default(),
            io_params: IoParams::default(),
            gpu_params: None,
        }
    }
}

impl OptimizationParams {
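    /// Generates optimization parameters tuned to the detected CPU, memory,
    /// GPU, network, and storage resources.
    ///
    /// Example (illustrative; mirrors the defaults exercised in the tests
    /// below):
    ///
    /// ```ignore
    /// let params = OptimizationParams::generate(
    ///     &CpuInfo::default(),
    ///     &MemoryInfo::default(),
    ///     None,
    ///     &NetworkInfo::default(),
    ///     &StorageInfo::default(),
    /// )?;
    /// assert!(params.thread_count > 0);
    /// ```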
    pub fn generate(
        cpu: &CpuInfo,
        memory: &MemoryInfo,
        gpu: Option<&GpuInfo>,
        network: &NetworkInfo,
        storage: &StorageInfo,
    ) -> CoreResult<Self> {
        let thread_count = Self::calculate_optimal_thread_count(cpu, memory);
        let chunk_size = Self::calculate_optimal_chunk_size(cpu, memory, storage);
        let enable_simd = Self::should_enable_simd(cpu);
        let enable_gpu = Self::should_enable_gpu(gpu);
        let enable_prefetch = Self::should_enable_prefetch(memory, storage);
        let numa_aware = memory.numa_nodes > 1;

        let cache_params = CacheParams::from_cpu(cpu);
        let io_params = IoParams::from_resources(network, storage);
        let gpu_params = gpu.map(GpuParams::from_gpu);

        Ok(Self {
            thread_count,
            chunk_size,
            enable_simd,
            enable_gpu,
            enable_prefetch,
            numa_aware,
            cache_params,
            io_params,
            gpu_params,
        })
    }

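    /// Picks a thread count from the physical core count plus half of the
    /// SMT (hyper-threading) siblings, scaled back by 25% under memory
    /// pressure and clamped to `[1, logical_cores]`.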
    fn calculate_optimal_thread_count(cpu: &CpuInfo, memory: &MemoryInfo) -> usize {
        let base_threads = cpu.physical_cores;

        // SMT siblings typically contribute less than a full core of
        // throughput, so count only half of the extra logical cores.
        let ht_benefit = if cpu.logical_cores > cpu.physical_cores {
            (cpu.logical_cores - cpu.physical_cores) / 2
        } else {
            0
        };

        // Back off when memory is constrained to limit per-thread footprint.
        let memory_factor = if memory.is_under_pressure() {
            0.75
        } else {
            1.0
        };

        let optimal = ((base_threads + ht_benefit) as f64 * memory_factor) as usize;
        optimal.max(1).min(cpu.logical_cores)
    }

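    /// Picks a chunk size as the geometric mean of cache-, memory-, and
    /// storage-derived candidates, clamped to `[4 KiB, 64 MiB]`.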
    fn calculate_optimal_chunk_size(
        cpu: &CpuInfo,
        memory: &MemoryInfo,
        storage: &StorageInfo,
    ) -> usize {
        // Candidate sizes derived from the L3 cache, memory, and storage.
        let cache_based = cpu.cache_l3_kb * 1024 / 4;
        let memory_based = memory.optimal_chunk_size();
        let storage_based = storage.optimal_io_size;

        // Balance the three candidates via their geometric mean.
        let geometric_mean = ((cache_based as f64 * memory_based as f64 * storage_based as f64)
            .powf(1.0 / 3.0)) as usize;

        geometric_mean.clamp(4 * 1024, 64 * 1024 * 1024)
    }

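    /// SIMD is worthwhile if the CPU reports SSE4.2 or AVX2 (x86) or NEON (ARM).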
    fn should_enable_simd(cpu: &CpuInfo) -> bool {
        cpu.simd_capabilities.sse4_2 || cpu.simd_capabilities.avx2 || cpu.simd_capabilities.neon
    }

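    /// GPU offloading is enabled only when a compute-capable GPU is present.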
    fn should_enable_gpu(gpu_info: Option<&GpuInfo>) -> bool {
        gpu_info.map(|g| g.is_compute_capable()).unwrap_or(false)
    }

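    /// Prefetching is enabled only when memory is not under pressure and the
    /// storage layer supports asynchronous I/O.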
    fn should_enable_prefetch(memory: &MemoryInfo, storage: &StorageInfo) -> bool {
        !memory.is_under_pressure() && storage.supports_async_io()
    }

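    /// Returns a resource scaling factor for a problem size: `1.0` up to a
    /// 1 MiB baseline, then `sqrt(size / 1 MiB)` beyond it, so scaling grows
    /// sublinearly with problem size.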
    pub fn get_scaling_factor(problem_size: usize) -> f64 {
        let base_size = 1024 * 1024;
        if problem_size <= base_size {
            1.0
        } else {
            let ratio = problem_size as f64 / base_size as f64;
            ratio.sqrt()
        }
    }

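    /// Instance-method convenience wrapper around [`Self::get_scaling_factor`].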
    pub fn scaling_factor(&self, problem_size: usize) -> f64 {
        Self::get_scaling_factor(problem_size)
    }

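    /// Rebalances the parameters for a particular workload type.
    ///
    /// Example (illustrative):
    ///
    /// ```ignore
    /// let mut params = OptimizationParams::default();
    /// params.adjust_for_workload(WorkloadType::IoIntensive);
    /// assert!(params.thread_count <= 16);
    /// ```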
    pub fn adjust_for_workload(&mut self, workload: WorkloadType) {
        match workload {
            WorkloadType::CpuIntensive => {
                // Use every available core and larger chunks to cut overhead.
                self.thread_count = self.thread_count.max(
                    std::thread::available_parallelism()
                        .map(|n| n.get())
                        .unwrap_or(4),
                );
                self.chunk_size = self.chunk_size.max(1024 * 1024);
            }
            WorkloadType::MemoryIntensive => {
                // Prefer prefetching and smaller chunks to ease memory pressure.
                self.enable_prefetch = true;
                self.chunk_size = self.chunk_size.min(256 * 1024);
            }
            WorkloadType::IoIntensive => {
                // Oversubscribe threads (I/O blocks) and match the I/O buffer size.
                self.thread_count = (self.thread_count * 2).min(16);
                self.chunk_size = self.io_params.optimal_buffersize;
            }
            WorkloadType::GpuIntensive => {
                // The GPU does the heavy lifting; keep only a few host threads.
                if self.enable_gpu {
                    self.thread_count = self.thread_count.min(4);
                }
            }
        }
    }
}

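/// CPU cache tuning parameters.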
#[derive(Debug, Clone)]
pub struct CacheParams {
    /// Cache line size in bytes.
    pub cache_line_size: usize,
    /// Preferred memory alignment in bytes.
    pub alignment: usize,
    /// Prefetch distance in bytes.
    pub prefetch_distance: usize,
    /// Tile size in bytes for cache-blocked algorithms.
    pub tile_size: usize,
}

impl Default for CacheParams {
    fn default() -> Self {
        Self {
            cache_line_size: 64,
            alignment: 64,
            prefetch_distance: 64,
            tile_size: 64,
        }
    }
}

impl CacheParams {
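    /// Derives cache parameters from the CPU's reported cache sizes.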
    pub fn from_cpu(cpu: &CpuInfo) -> Self {
        // 64 bytes is the cache line size on virtually all modern CPUs.
        let cache_line_size = 64;
        let alignment = cache_line_size;

        let prefetch_distance = (cpu.cache_l1_kb * 1024 / 16).clamp(64, 1024);

        let tile_size = (cpu.cache_l1_kb * 1024 / 8).clamp(64, 4096);

        Self {
            cache_line_size,
            alignment,
            prefetch_distance,
            tile_size,
        }
    }
}

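/// I/O tuning parameters.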
#[derive(Debug, Clone)]
pub struct IoParams {
    /// Preferred I/O buffer size in bytes.
    pub optimal_buffersize: usize,
    /// Number of I/O operations to run concurrently.
    pub concurrent_operations: usize,
    /// Whether asynchronous I/O should be used.
    pub enable_async_io: bool,
    /// Whether an I/O cache should be used.
    pub enable_io_cache: bool,
}

impl Default for IoParams {
    fn default() -> Self {
        Self {
            optimal_buffersize: 64 * 1024,
            concurrent_operations: 4,
            enable_async_io: true,
            enable_io_cache: true,
        }
    }
}

impl IoParams {
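    /// Derives I/O parameters from network and storage characteristics.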
    pub fn from_network(network: &NetworkInfo, storage: &StorageInfo) -> Self {
        let optimal_buffersize = storage.optimal_io_size.max(network.mtu);
        let concurrent_operations = storage.queue_depth.min(16);
        let enable_async_io = storage.supports_async_io();
        // Caching helps spinning disks and very large (> 512 GiB) drives.
        let enable_io_cache =
            !storage.is_ssd() || storage.capacity > (512u64 * 1024 * 1024 * 1024) as usize;

        Self {
            optimal_buffersize,
            concurrent_operations,
            enable_async_io,
            enable_io_cache,
        }
    }

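    /// Alias for [`Self::from_network`].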
    pub fn from_resources(network: &NetworkInfo, storage: &StorageInfo) -> Self {
        Self::from_network(network, storage)
    }
}

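/// GPU tuning parameters.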
#[derive(Debug, Clone)]
pub struct GpuParams {
    /// Threads per workgroup.
    pub workgroup_size: usize,
    /// Number of workgroups to dispatch.
    pub workgroup_count: usize,
    /// Shared (local) memory budget per workgroup, in bytes.
    pub shared_memory_size: usize,
    /// Whether to allocate in unified (host-visible) memory.
    pub use_unified_memory: bool,
    /// Strategy for host-device data transfers.
    pub transfer_strategy: GpuTransferStrategy,
}

impl GpuParams {
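    /// Derives GPU parameters from the detected GPU's capabilities.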
    pub fn from_gpu(gpu: &GpuInfo) -> Self {
        let workgroup_size = gpu.optimal_workgroup_size();
        // Oversubscribe compute units 4x, capped at a common dispatch limit.
        let workgroup_count = (gpu.compute_units * 4).min(65535);
        // Conservative shared-memory budget supported by most GPUs.
        let shared_memory_size = 16 * 1024;
        let use_unified_memory = gpu.features.unified_memory;

        let transfer_strategy = if gpu.memorybandwidth_gbps > 500.0 {
            GpuTransferStrategy::HighBandwidth
        } else if use_unified_memory {
            GpuTransferStrategy::Unified
        } else {
            GpuTransferStrategy::Standard
        };

        Self {
            workgroup_size,
            workgroup_count,
            shared_memory_size,
            use_unified_memory,
            transfer_strategy,
        }
    }
}

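/// Strategy for moving data between host and GPU memory.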
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuTransferStrategy {
    /// Standard staged host-to-device copies.
    Standard,
    /// Transfers tuned for high-bandwidth memory interfaces.
    HighBandwidth,
    /// Shared allocations on unified-memory systems.
    Unified,
    /// Direct device access to host memory without copies.
    ZeroCopy,
}

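/// Broad workload classification used by [`OptimizationParams::adjust_for_workload`].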
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WorkloadType {
    /// Dominated by computation.
    CpuIntensive,
    /// Dominated by memory bandwidth or footprint.
    MemoryIntensive,
    /// Dominated by disk or network I/O.
    IoIntensive,
    /// Dominated by GPU work.
    GpuIntensive,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimization_params_generation() {
        let cpu = CpuInfo::default();
        let memory = MemoryInfo::default();
        let gpu = Some(GpuInfo::default());
        let network = NetworkInfo::default();
        let storage = StorageInfo::default();

        let params = OptimizationParams::generate(&cpu, &memory, gpu.as_ref(), &network, &storage);
        assert!(params.is_ok());

        let params = params.expect("parameter generation should succeed");
        assert!(params.thread_count > 0);
        assert!(params.chunk_size > 0);
    }

    #[test]
    fn test_thread_count_calculation() {
        let cpu = CpuInfo {
            physical_cores: 8,
            logical_cores: 16,
            ..Default::default()
        };
        let memory = MemoryInfo::default();

        let thread_count = OptimizationParams::calculate_optimal_thread_count(&cpu, &memory);
        assert!(thread_count >= 8);
        assert!(thread_count <= 16);
    }

    #[test]
    fn test_chunk_size_calculation() {
        let cpu = CpuInfo {
            cache_l3_kb: 8192,
            ..Default::default()
        };
        let memory = MemoryInfo::default();
        let storage = StorageInfo::default();

        let chunk_size = OptimizationParams::calculate_optimal_chunk_size(&cpu, &memory, &storage);
        assert!(chunk_size >= 4 * 1024);
        assert!(chunk_size <= 64 * 1024 * 1024);
    }

    #[test]
    fn test_workload_adjustment() {
        let mut params = OptimizationParams::default();
        let original_thread_count = params.thread_count;

        params.adjust_for_workload(WorkloadType::CpuIntensive);
        assert!(params.thread_count >= original_thread_count);

        params.adjust_for_workload(WorkloadType::MemoryIntensive);
        assert!(params.enable_prefetch);
    }

    #[test]
    fn test_cache_params() {
        let cpu = CpuInfo {
            cache_l1_kb: 32,
            ..Default::default()
        };

        let cache_params = CacheParams::from_cpu(&cpu);
        assert_eq!(cache_params.cache_line_size, 64);
        assert!(cache_params.tile_size > 0);
    }

    #[test]
    fn test_gpu_params() {
        let gpu = GpuInfo {
            vendor: super::super::gpu::GpuVendor::Nvidia,
            compute_units: 2048,
            features: super::super::gpu::GpuFeatures {
                unified_memory: true,
                ..Default::default()
            },
            ..Default::default()
        };

        let gpu_params = GpuParams::from_gpu(&gpu);
        assert_eq!(gpu_params.workgroup_size, 256);
        assert!(gpu_params.use_unified_memory);
        assert_eq!(gpu_params.transfer_strategy, GpuTransferStrategy::Unified);
    }

    #[test]
    fn test_scaling_factor() {
        let params = OptimizationParams::default();

        // At or below the 1 MiB baseline the factor is 1.0; above it,
        // the factor grows as the square root of the size ratio.
        assert_eq!(params.scaling_factor(1024), 1.0);
        assert!(params.scaling_factor(1024 * 1024 * 4) > 1.0);
    }
}