// scirs2_core/resource/optimization.rs

1//! # Optimization Parameters Generation
2//!
3//! This module generates optimal parameters for various operations
4//! based on detected system resources.
5
6use super::{
7    cpu::CpuInfo, gpu::GpuInfo, memory::MemoryInfo, network::NetworkInfo, storage::StorageInfo,
8};
9use crate::error::CoreResult;
10
/// Optimization parameters for system operations.
///
/// Built from detected system resources via [`OptimizationParams::generate`],
/// or with conservative fallbacks via [`Default`].
#[derive(Debug, Clone)]
pub struct OptimizationParams {
    /// Recommended thread count for parallel operations
    pub thread_count: usize,
    /// Recommended chunk size for memory operations (bytes)
    pub chunk_size: usize,
    /// Enable SIMD operations
    pub enable_simd: bool,
    /// Enable GPU acceleration
    pub enable_gpu: bool,
    /// Enable memory prefetching
    pub enable_prefetch: bool,
    /// NUMA-aware memory allocation (set when more than one NUMA node exists)
    pub numa_aware: bool,
    /// Cache-friendly parameters (line size, alignment, prefetch, tiling)
    pub cache_params: CacheParams,
    /// I/O optimization parameters (buffer size, concurrency, async/caching)
    pub io_params: IoParams,
    /// GPU-specific parameters; `None` when no GPU was detected
    pub gpu_params: Option<GpuParams>,
}
33
34impl Default for OptimizationParams {
35    fn default() -> Self {
36        Self {
37            thread_count: std::thread::available_parallelism()
38                .map(|n| n.get())
39                .unwrap_or(4),
40            chunk_size: 64 * 1024, // 64KB default
41            enable_simd: false,
42            enable_gpu: false,
43            enable_prefetch: true,
44            numa_aware: false,
45            cache_params: CacheParams::default(),
46            io_params: IoParams::default(),
47            gpu_params: None,
48        }
49    }
50}
51
52impl OptimizationParams {
53    /// Generate optimization parameters from system resources
54    pub fn generate(
55        cpu: &CpuInfo,
56        memory: &MemoryInfo,
57        gpu: Option<&GpuInfo>,
58        network: &NetworkInfo,
59        storage: &StorageInfo,
60    ) -> CoreResult<Self> {
61        let thread_count = Self::calculate_optimal_thread_count(cpu, memory);
62        let chunk_size = Self::calculate_optimal_chunk_size(cpu, memory, storage);
63        let enable_simd = Self::should_enable_simd(cpu);
64        let enable_gpu = Self::should_enable_gpu(gpu);
65        let enable_prefetch = Self::should_enable_prefetch(memory, storage);
66        let numa_aware = memory.numa_nodes > 1;
67
68        let cache_params = CacheParams::from_cpu(cpu);
69        let io_params = IoParams::from_resources(network, storage);
70        let gpu_params = gpu.map(GpuParams::from_gpu);
71
72        Ok(Self {
73            thread_count,
74            chunk_size,
75            enable_simd,
76            enable_gpu,
77            enable_prefetch,
78            numa_aware,
79            cache_params,
80            io_params,
81            gpu_params,
82        })
83    }
84
85    /// Calculate optimal thread count
86    fn calculate_optimal_thread_count(cpu: &CpuInfo, memory: &MemoryInfo) -> usize {
87        let base_threads = cpu.physical_cores;
88
89        // Add hyperthreading benefit for certain workloads
90        let ht_benefit = if cpu.logical_cores > cpu.physical_cores {
91            (cpu.logical_cores - cpu.physical_cores) / 2
92        } else {
93            0
94        };
95
96        // Consider memory pressure
97        let memory_factor = if memory.is_under_pressure() {
98            0.75 // Reduce threads under memory pressure
99        } else {
100            1.0
101        };
102
103        let optimal = ((base_threads + ht_benefit) as f64 * memory_factor) as usize;
104        optimal.max(1).min(cpu.logical_cores)
105    }
106
107    /// Calculate optimal chunk size
108    fn calculate_optimal_chunk_size(
109        cpu: &CpuInfo,
110        memory: &MemoryInfo,
111        storage: &StorageInfo,
112    ) -> usize {
113        // Base on CPU cache size
114        let cachebased = cpu.cache_l3_kb * 1024 / 4; // Use 1/4 of L3 cache
115
116        // Base on memory bandwidth
117        let memorybased = memory.optimal_chunk_size();
118
119        // Base on storage characteristics
120        let storagebased = storage.optimal_io_size;
121
122        // Take the geometric mean to balance all factors
123        let geometric_mean = ((cachebased as f64 * memorybased as f64 * storagebased as f64)
124            .powf(1.0 / 3.0)) as usize;
125
126        // Ensure it's a reasonable size (between 4KB and 64MB)
127        geometric_mean.clamp(4 * 1024, 64 * 1024 * 1024)
128    }
129
130    /// Determine if SIMD should be enabled
131    fn should_enable_simd(cpu: &CpuInfo) -> bool {
132        cpu.simd_capabilities.sse4_2 || cpu.simd_capabilities.avx2 || cpu.simd_capabilities.neon
133    }
134
135    /// Determine if GPU should be enabled
136    fn should_enable_gpu(gpuinfo: Option<&GpuInfo>) -> bool {
137        gpuinfo.map(|g| g.is_compute_capable()).unwrap_or(false)
138    }
139
140    /// Determine if prefetching should be enabled
141    fn should_enable_prefetch(memory: &MemoryInfo, storage: &StorageInfo) -> bool {
142        // Enable prefetch if we have sufficient memory and storage supports it
143        !memory.is_under_pressure() && storage.supports_async_io()
144    }
145
146    /// Get scaling factor for different problem sizes
147    pub fn get_scaling_factor(problemsize: usize) -> f64 {
148        let base_size = 1024 * 1024; // 1MB base
149        if problemsize <= base_size {
150            1.0
151        } else {
152            let ratio = problemsize as f64 / base_size as f64;
153            // Use square root scaling to avoid excessive resource usage
154            ratio.sqrt()
155        }
156    }
157
158    /// Instance method to get scaling factor
159    pub fn scaling_factor(&self, problemsize: usize) -> f64 {
160        Self::get_scaling_factor(problemsize)
161    }
162
163    /// Adjust parameters for specific workload type
164    pub fn adjust_for_workload(&mut self, workload: WorkloadType) {
165        match workload {
166            WorkloadType::CpuIntensive => {
167                // Maximize CPU utilization
168                self.thread_count = self.thread_count.max(
169                    std::thread::available_parallelism()
170                        .map(|n| n.get())
171                        .unwrap_or(4),
172                );
173                self.chunk_size = self.chunk_size.max(1024 * 1024); // Larger chunks
174            }
175            WorkloadType::MemoryIntensive => {
176                // Optimize for memory bandwidth
177                self.enable_prefetch = true;
178                self.chunk_size = self.chunk_size.min(256 * 1024); // Smaller chunks
179            }
180            WorkloadType::IoIntensive => {
181                // Optimize for I/O throughput
182                self.thread_count = (self.thread_count * 2).min(16); // More threads for I/O
183                self.chunk_size = self.io_params.optimal_buffersize;
184            }
185            WorkloadType::GpuIntensive => {
186                // Favor GPU over CPU
187                if self.enable_gpu {
188                    self.thread_count = self.thread_count.min(4); // Fewer CPU threads
189                }
190            }
191        }
192    }
193}
194
/// Cache optimization parameters.
///
/// Derived from CPU cache sizes via [`CacheParams::from_cpu`].
#[derive(Debug, Clone)]
pub struct CacheParams {
    /// L1 cache line size (bytes)
    pub cache_line_size: usize,
    /// Optimal data alignment (bytes; matches the cache line size)
    pub alignment: usize,
    /// Prefetch distance (bytes)
    pub prefetch_distance: usize,
    /// Cache-friendly loop tiling size (bytes, sized to fit within L1)
    pub tile_size: usize,
}
207
208impl Default for CacheParams {
209    fn default() -> Self {
210        Self {
211            cache_line_size: 64,
212            alignment: 64,
213            prefetch_distance: 64,
214            tile_size: 64,
215        }
216    }
217}
218
219impl CacheParams {
220    /// Generate cache parameters from CPU info
221    pub fn from_cpu(cpu: &CpuInfo) -> Self {
222        let cache_line_size = 64; // Most modern CPUs use 64-byte cache lines
223        let alignment = cache_line_size;
224
225        // Prefetch distance based on cache size
226        let prefetch_distance = (cpu.cache_l1_kb * 1024 / 16).clamp(64, 1024);
227
228        // Tile size based on L1 cache
229        let tile_size = (cpu.cache_l1_kb * 1024 / 8).clamp(64, 4096);
230
231        Self {
232            cache_line_size,
233            alignment,
234            prefetch_distance,
235            tile_size,
236        }
237    }
238}
239
/// I/O optimization parameters.
///
/// Derived from network and storage characteristics via
/// [`IoParams::from_resources`].
#[derive(Debug, Clone)]
pub struct IoParams {
    /// Optimal buffer size for I/O operations (bytes)
    pub optimal_buffersize: usize,
    /// Number of concurrent I/O operations (bounded by device queue depth)
    pub concurrent_operations: usize,
    /// Enable asynchronous I/O
    pub enable_async_io: bool,
    /// Enable I/O caching
    pub enable_io_cache: bool,
}
252
253impl Default for IoParams {
254    fn default() -> Self {
255        Self {
256            optimal_buffersize: 64 * 1024, // 64KB
257            concurrent_operations: 4,
258            enable_async_io: true,
259            enable_io_cache: true,
260        }
261    }
262}
263
264impl IoParams {
265    /// Generate I/O parameters from network and storage info
266    pub fn from_network(network: &NetworkInfo, storage: &StorageInfo) -> Self {
267        let optimal_buffersize = storage.optimal_io_size.max(network.mtu);
268        let concurrent_operations = storage.queue_depth.min(16);
269        let enable_async_io = storage.supports_async_io();
270        let enable_io_cache = !storage.is_ssd() || storage.capacity > 512 * 1024 * 1024 * 1024; // Cache for HDD or large SSDs
271
272        Self {
273            optimal_buffersize,
274            concurrent_operations,
275            enable_async_io,
276            enable_io_cache,
277        }
278    }
279
280    /// Generate I/O parameters from resources (alias for from_network)
281    pub fn from_resources(network: &NetworkInfo, storage: &StorageInfo) -> Self {
282        Self::from_network(network, storage)
283    }
284}
285
/// GPU optimization parameters.
///
/// Derived from GPU capabilities via [`GpuParams::from_gpu`].
#[derive(Debug, Clone)]
pub struct GpuParams {
    /// Optimal workgroup/block size
    pub workgroup_size: usize,
    /// Number of workgroups to launch (4 per compute unit, capped at 65535)
    pub workgroup_count: usize,
    /// Shared memory usage per workgroup (bytes)
    pub shared_memory_size: usize,
    /// Enable unified memory (mirrors the device's unified-memory feature)
    pub use_unified_memory: bool,
    /// Optimal data transfer strategy
    pub transfer_strategy: GpuTransferStrategy,
}
300
301impl GpuParams {
302    /// Generate GPU parameters from GPU info
303    pub fn from_gpu(gpu: &GpuInfo) -> Self {
304        let workgroup_size = gpu.optimal_workgroup_size();
305        let workgroup_count = (gpu.compute_units * 4).min(65535); // 4 workgroups per compute unit, capped
306        let shared_memory_size = 16 * 1024; // 16KB default shared memory
307        let use_unified_memory = gpu.features.unified_memory;
308
309        let transfer_strategy = if gpu.memorybandwidth_gbps > 500.0 {
310            GpuTransferStrategy::HighBandwidth
311        } else if use_unified_memory {
312            GpuTransferStrategy::Unified
313        } else {
314            GpuTransferStrategy::Standard
315        };
316
317        Self {
318            workgroup_size,
319            workgroup_count,
320            shared_memory_size,
321            use_unified_memory,
322            transfer_strategy,
323        }
324    }
325}
326
/// GPU data transfer strategies.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuTransferStrategy {
    /// Standard host-device transfers
    Standard,
    /// High bandwidth optimized transfers (selected for devices reporting
    /// more than 500 GB/s of memory bandwidth)
    HighBandwidth,
    /// Unified memory
    Unified,
    /// Zero-copy transfers
    // NOTE(review): this variant is never produced by `GpuParams::from_gpu`;
    // confirm whether callers construct it directly.
    ZeroCopy,
}
339
/// Workload type classifications.
///
/// Passed to [`OptimizationParams::adjust_for_workload`] to bias the
/// generated parameters toward a particular resource bottleneck.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WorkloadType {
    /// CPU-intensive computations
    CpuIntensive,
    /// Memory-intensive operations
    MemoryIntensive,
    /// I/O-intensive operations
    IoIntensive,
    /// GPU-intensive computations
    GpuIntensive,
}
352
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end: generation from default resource info should succeed and
    // yield non-zero thread count and chunk size.
    #[test]
    fn test_optimization_params_generation() {
        let cpu = CpuInfo::default();
        let memory = MemoryInfo::default();
        let gpu = Some(GpuInfo::default());
        let network = NetworkInfo::default();
        let storage = StorageInfo::default();

        let params = OptimizationParams::generate(&cpu, &memory, gpu.as_ref(), &network, &storage);
        assert!(params.is_ok());

        let params = params.unwrap();
        assert!(params.thread_count > 0);
        assert!(params.chunk_size > 0);
    }

    // With 8 physical / 16 logical cores, the thread count should land
    // between the physical and logical core counts.
    #[test]
    fn test_thread_count_calculation() {
        let cpu = CpuInfo {
            physical_cores: 8,
            logical_cores: 16,
            ..Default::default()
        };
        let memory = MemoryInfo::default();

        let thread_count = OptimizationParams::calculate_optimal_thread_count(&cpu, &memory);
        assert!(thread_count >= 8);
        assert!(thread_count <= 16);
    }

    // The chunk size must stay within the documented 4 KiB .. 64 MiB bounds.
    #[test]
    fn test_chunk_size_calculation() {
        let cpu = CpuInfo {
            cache_l3_kb: 8192, // 8MB L3 cache
            ..Default::default()
        };
        let memory = MemoryInfo::default();
        let storage = StorageInfo::default();

        let chunk_size = OptimizationParams::calculate_optimal_chunk_size(&cpu, &memory, &storage);
        assert!(chunk_size >= 4 * 1024); // At least 4KB
        assert!(chunk_size <= 64 * 1024 * 1024); // At most 64MB
    }

    // Workload adjustments: CPU-intensive never lowers the thread count;
    // memory-intensive forces prefetching on.
    #[test]
    fn test_workload_adjustment() {
        let mut params = OptimizationParams::default();
        let original_thread_count = params.thread_count;

        params.adjust_for_workload(WorkloadType::CpuIntensive);
        assert!(params.thread_count >= original_thread_count);

        params.adjust_for_workload(WorkloadType::MemoryIntensive);
        assert!(params.enable_prefetch);
    }

    // Cache parameters use the fixed 64-byte line and a positive tile size.
    #[test]
    fn test_cache_params() {
        let cpu = CpuInfo {
            cache_l1_kb: 32,
            ..Default::default()
        };

        let cache_params = CacheParams::from_cpu(&cpu);
        assert_eq!(cache_params.cache_line_size, 64);
        assert!(cache_params.tile_size > 0);
    }

    // A unified-memory GPU should select the Unified transfer strategy
    // (bandwidth below the 500 GB/s high-bandwidth threshold).
    #[test]
    fn test_gpu_params() {
        let gpu = GpuInfo {
            vendor: super::super::gpu::GpuVendor::Nvidia,
            compute_units: 2048,
            features: super::super::gpu::GpuFeatures {
                unified_memory: true,
                ..Default::default()
            },
            ..Default::default()
        };

        let gpu_params = GpuParams::from_gpu(&gpu);
        assert_eq!(gpu_params.workgroup_size, 256); // NVIDIA typical
        assert!(gpu_params.use_unified_memory);
        assert_eq!(gpu_params.transfer_strategy, GpuTransferStrategy::Unified);
    }

    // Scaling factor is 1.0 at or below the 1 MiB base, > 1.0 above it.
    #[test]
    fn test_scaling_factor() {
        let params = OptimizationParams::default();

        assert_eq!(params.scaling_factor(1024), 1.0); // Small problem
        assert!(params.scaling_factor(1024 * 1024 * 4) > 1.0); // Larger problem
    }
}
451
452// Import statements are already handled above