scirs2_core/resource/optimization.rs

//! # Optimization Parameter Generation
//!
//! This module generates optimal parameters for various operations
//! based on detected system resources.
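//!
//! A minimal usage sketch (the `use` path is an assumption based on this
//! file's location; marked `ignore` since the detection types live in
//! sibling modules):
//!
//! ```ignore
//! use scirs2_core::resource::optimization::{OptimizationParams, WorkloadType};
//!
//! let mut params = OptimizationParams::default();
//! params.adjust_for_workload(WorkloadType::CpuIntensive);
//! println!("threads = {}, chunk = {} bytes", params.thread_count, params.chunk_size);
//! ```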

use super::{
    cpu::CpuInfo, gpu::GpuInfo, memory::MemoryInfo, network::NetworkInfo, storage::StorageInfo,
};
use crate::error::CoreResult;

/// Optimization parameters for system operations
#[derive(Debug, Clone)]
pub struct OptimizationParams {
    /// Recommended thread count for parallel operations
    pub thread_count: usize,
    /// Recommended chunk size for memory operations (bytes)
    pub chunk_size: usize,
    /// Enable SIMD operations
    pub enable_simd: bool,
    /// Enable GPU acceleration
    pub enable_gpu: bool,
    /// Enable memory prefetching
    pub enable_prefetch: bool,
    /// NUMA-aware memory allocation
    pub numa_aware: bool,
    /// Cache-friendly parameters
    pub cache_params: CacheParams,
    /// I/O optimization parameters
    pub io_params: IoParams,
    /// GPU-specific parameters
    pub gpu_params: Option<GpuParams>,
}

impl Default for OptimizationParams {
    fn default() -> Self {
        Self {
            thread_count: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(4),
            chunk_size: 64 * 1024, // 64 KiB default
            enable_simd: false,
            enable_gpu: false,
            enable_prefetch: true,
            numa_aware: false,
            cache_params: CacheParams::default(),
            io_params: IoParams::default(),
            gpu_params: None,
        }
    }
}

impl OptimizationParams {
    /// Generate optimization parameters from system resources
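    ///
    /// A sketch of the intended call pattern (assuming the info values come
    /// from this crate's detection routines, as in the tests below):
    ///
    /// ```ignore
    /// let params = OptimizationParams::generate(&cpu, &memory, Some(&gpu), &network, &storage)?;
    /// ```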
    pub fn generate(
        cpu: &CpuInfo,
        memory: &MemoryInfo,
        gpu: Option<&GpuInfo>,
        network: &NetworkInfo,
        storage: &StorageInfo,
    ) -> CoreResult<Self> {
        let thread_count = Self::calculate_optimal_thread_count(cpu, memory);
        let chunk_size = Self::calculate_optimal_chunk_size(cpu, memory, storage);
        let enable_simd = Self::should_enable_simd(cpu);
        let enable_gpu = Self::should_enable_gpu(gpu);
        let enable_prefetch = Self::should_enable_prefetch(memory, storage);
        let numa_aware = memory.numa_nodes > 1;

        let cache_params = CacheParams::from_cpu(cpu);
        let io_params = IoParams::from_resources(network, storage);
        let gpu_params = gpu.map(GpuParams::from_gpu);

        Ok(Self {
            thread_count,
            chunk_size,
            enable_simd,
            enable_gpu,
            enable_prefetch,
            numa_aware,
            cache_params,
            io_params,
            gpu_params,
        })
    }

    /// Calculate optimal thread count
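    ///
    /// Worked example (illustrative numbers): 8 physical / 16 logical cores
    /// with no memory pressure give `(8 + (16 - 8) / 2) * 1.0 = 12` threads,
    /// clamped to `1..=16`.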
    fn calculate_optimal_thread_count(cpu: &CpuInfo, memory: &MemoryInfo) -> usize {
        let base_threads = cpu.physical_cores;

        // Add hyperthreading benefit for certain workloads
        let ht_benefit = if cpu.logical_cores > cpu.physical_cores {
            (cpu.logical_cores - cpu.physical_cores) / 2
        } else {
            0
        };

        // Consider memory pressure
        let memory_factor = if memory.is_under_pressure() {
            0.75 // Reduce threads under memory pressure
        } else {
            1.0
        };

        let optimal = ((base_threads + ht_benefit) as f64 * memory_factor) as usize;
        optimal.max(1).min(cpu.logical_cores)
    }

    /// Calculate optimal chunk size
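    ///
    /// Worked example (illustrative numbers): a cache-based size of 2 MiB
    /// (8 MiB L3 / 4) and memory- and storage-based sizes of 256 KiB each
    /// give cbrt(2^21 * 2^18 * 2^18) = 2^19 bytes = 512 KiB.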
    fn calculate_optimal_chunk_size(
        cpu: &CpuInfo,
        memory: &MemoryInfo,
        storage: &StorageInfo,
    ) -> usize {
        // Base on CPU cache size
        let cache_based = cpu.cache_l3_kb * 1024 / 4; // Use 1/4 of L3 cache

        // Base on memory bandwidth
        let memory_based = memory.optimal_chunk_size();

        // Base on storage characteristics
        let storage_based = storage.optimal_io_size;

        // Take the geometric mean to balance all factors
        let geometric_mean = ((cache_based as f64 * memory_based as f64 * storage_based as f64)
            .powf(1.0 / 3.0)) as usize;

        // Ensure it's a reasonable size (between 4 KiB and 64 MiB)
        geometric_mean.clamp(4 * 1024, 64 * 1024 * 1024)
    }

    /// Determine if SIMD should be enabled
    fn should_enable_simd(cpu: &CpuInfo) -> bool {
        cpu.simd_capabilities.sse4_2 || cpu.simd_capabilities.avx2 || cpu.simd_capabilities.neon
    }

    /// Determine if GPU acceleration should be enabled
    fn should_enable_gpu(gpu_info: Option<&GpuInfo>) -> bool {
        gpu_info.map(|g| g.is_compute_capable()).unwrap_or(false)
    }

    /// Determine if prefetching should be enabled
    fn should_enable_prefetch(memory: &MemoryInfo, storage: &StorageInfo) -> bool {
        // Enable prefetch if we have sufficient memory and storage supports async I/O
        !memory.is_under_pressure() && storage.supports_async_io()
    }

    /// Get the scaling factor for a given problem size
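    ///
    /// For example, a 4 MiB problem against the 1 MiB base gives a ratio of
    /// 4.0 and a scaling factor of sqrt(4.0) = 2.0.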
    pub fn get_scaling_factor(problem_size: usize) -> f64 {
        let base_size = 1024 * 1024; // 1 MiB base
        if problem_size <= base_size {
            1.0
        } else {
            let ratio = problem_size as f64 / base_size as f64;
            // Use square-root scaling to avoid excessive resource usage
            ratio.sqrt()
        }
    }

    /// Instance convenience wrapper around [`Self::get_scaling_factor`]
    pub fn scaling_factor(&self, problem_size: usize) -> f64 {
        Self::get_scaling_factor(problem_size)
    }

    /// Adjust parameters for a specific workload type
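    ///
    /// A minimal usage sketch (marked `ignore`; uses only items defined in
    /// this module):
    ///
    /// ```ignore
    /// let mut params = OptimizationParams::default();
    /// params.adjust_for_workload(WorkloadType::IoIntensive);
    /// assert_eq!(params.chunk_size, params.io_params.optimal_buffersize);
    /// ```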
    pub fn adjust_for_workload(&mut self, workload: WorkloadType) {
        match workload {
            WorkloadType::CpuIntensive => {
                // Maximize CPU utilization
                self.thread_count = self.thread_count.max(
                    std::thread::available_parallelism()
                        .map(|n| n.get())
                        .unwrap_or(4),
                );
                self.chunk_size = self.chunk_size.max(1024 * 1024); // Larger chunks
            }
            WorkloadType::MemoryIntensive => {
                // Optimize for memory bandwidth
                self.enable_prefetch = true;
                self.chunk_size = self.chunk_size.min(256 * 1024); // Smaller chunks
            }
            WorkloadType::IoIntensive => {
                // Optimize for I/O throughput
                self.thread_count = (self.thread_count * 2).min(16); // More threads for I/O
                self.chunk_size = self.io_params.optimal_buffersize;
            }
            WorkloadType::GpuIntensive => {
                // Favor GPU over CPU
                if self.enable_gpu {
                    self.thread_count = self.thread_count.min(4); // Fewer CPU threads
                }
            }
        }
    }
}

/// Cache optimization parameters
#[derive(Debug, Clone)]
pub struct CacheParams {
    /// Cache line size in bytes
    pub cache_line_size: usize,
    /// Optimal data alignment
    pub alignment: usize,
    /// Prefetch distance
    pub prefetch_distance: usize,
    /// Cache-friendly loop tiling size
    pub tile_size: usize,
}

impl Default for CacheParams {
    fn default() -> Self {
        Self {
            cache_line_size: 64,
            alignment: 64,
            prefetch_distance: 64,
            tile_size: 64,
        }
    }
}

impl CacheParams {
    /// Generate cache parameters from CPU info
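    ///
    /// Worked example: a 32 KiB L1 cache gives a prefetch distance of
    /// `32768 / 16 = 2048`, clamped down to 1024, and a tile size of
    /// `32768 / 8 = 4096` (already within the 64..=4096 clamp).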
    pub fn from_cpu(cpu: &CpuInfo) -> Self {
        let cache_line_size = 64; // Most modern CPUs use 64-byte cache lines
        let alignment = cache_line_size;

        // Prefetch distance based on L1 cache size
        let prefetch_distance = (cpu.cache_l1_kb * 1024 / 16).clamp(64, 1024);

        // Tile size based on L1 cache
        let tile_size = (cpu.cache_l1_kb * 1024 / 8).clamp(64, 4096);

        Self {
            cache_line_size,
            alignment,
            prefetch_distance,
            tile_size,
        }
    }
}

/// I/O optimization parameters
#[derive(Debug, Clone)]
pub struct IoParams {
    /// Optimal buffer size for I/O operations (bytes)
    pub optimal_buffersize: usize,
    /// Number of concurrent I/O operations
    pub concurrent_operations: usize,
    /// Enable asynchronous I/O
    pub enable_async_io: bool,
    /// Enable I/O caching
    pub enable_io_cache: bool,
}

impl Default for IoParams {
    fn default() -> Self {
        Self {
            optimal_buffersize: 64 * 1024, // 64 KiB
            concurrent_operations: 4,
            enable_async_io: true,
            enable_io_cache: true,
        }
    }
}

impl IoParams {
    /// Generate I/O parameters from network and storage info
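    ///
    /// The buffer size is the larger of the storage's optimal I/O size and
    /// the network MTU; e.g., a 64 KiB I/O size with a 1500-byte MTU yields
    /// a 64 KiB buffer.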
    pub fn from_network(network: &NetworkInfo, storage: &StorageInfo) -> Self {
        let optimal_buffersize = storage.optimal_io_size.max(network.mtu);
        let concurrent_operations = storage.queue_depth.min(16);
        let enable_async_io = storage.supports_async_io();
        let enable_io_cache =
            !storage.is_ssd() || storage.capacity > (512u64 * 1024 * 1024 * 1024) as usize; // Cache for HDDs or large SSDs

        Self {
            optimal_buffersize,
            concurrent_operations,
            enable_async_io,
            enable_io_cache,
        }
    }

    /// Generate I/O parameters from resources (alias for [`Self::from_network`])
    pub fn from_resources(network: &NetworkInfo, storage: &StorageInfo) -> Self {
        Self::from_network(network, storage)
    }
}

/// GPU optimization parameters
#[derive(Debug, Clone)]
pub struct GpuParams {
    /// Optimal workgroup/block size
    pub workgroup_size: usize,
    /// Number of workgroups to launch
    pub workgroup_count: usize,
    /// Shared memory usage per workgroup (bytes)
    pub shared_memory_size: usize,
    /// Enable unified memory
    pub use_unified_memory: bool,
    /// Optimal data transfer strategy
    pub transfer_strategy: GpuTransferStrategy,
}

impl GpuParams {
    /// Generate GPU parameters from GPU info
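    ///
    /// Worked example (illustrative numbers): 32 compute units give
    /// `32 * 4 = 128` workgroups; a `memorybandwidth_gbps` above 500.0
    /// selects `HighBandwidth`, otherwise unified memory (when available)
    /// selects `Unified`, falling back to `Standard`.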
    pub fn from_gpu(gpu: &GpuInfo) -> Self {
        let workgroup_size = gpu.optimal_workgroup_size();
        let workgroup_count = (gpu.compute_units * 4).min(65535); // 4 workgroups per compute unit, capped
        let shared_memory_size = 16 * 1024; // 16 KiB default shared memory
        let use_unified_memory = gpu.features.unified_memory;

        let transfer_strategy = if gpu.memorybandwidth_gbps > 500.0 {
            GpuTransferStrategy::HighBandwidth
        } else if use_unified_memory {
            GpuTransferStrategy::Unified
        } else {
            GpuTransferStrategy::Standard
        };

        Self {
            workgroup_size,
            workgroup_count,
            shared_memory_size,
            use_unified_memory,
            transfer_strategy,
        }
    }
}

/// GPU data transfer strategies
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuTransferStrategy {
    /// Standard host-device transfers
    Standard,
    /// High-bandwidth optimized transfers
    HighBandwidth,
    /// Unified memory
    Unified,
    /// Zero-copy transfers
    ZeroCopy,
}

/// Workload type classifications
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WorkloadType {
    /// CPU-intensive computations
    CpuIntensive,
    /// Memory-intensive operations
    MemoryIntensive,
    /// I/O-intensive operations
    IoIntensive,
    /// GPU-intensive computations
    GpuIntensive,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimization_params_generation() {
        let cpu = CpuInfo::default();
        let memory = MemoryInfo::default();
        let gpu = Some(GpuInfo::default());
        let network = NetworkInfo::default();
        let storage = StorageInfo::default();

        let params = OptimizationParams::generate(&cpu, &memory, gpu.as_ref(), &network, &storage);
        assert!(params.is_ok());

        let params = params.expect("parameter generation should succeed");
        assert!(params.thread_count > 0);
        assert!(params.chunk_size > 0);
    }

    #[test]
    fn test_thread_count_calculation() {
        let cpu = CpuInfo {
            physical_cores: 8,
            logical_cores: 16,
            ..Default::default()
        };
        let memory = MemoryInfo::default();

        let thread_count = OptimizationParams::calculate_optimal_thread_count(&cpu, &memory);
        assert!(thread_count >= 8);
        assert!(thread_count <= 16);
    }

    #[test]
    fn test_chunk_size_calculation() {
        let cpu = CpuInfo {
            cache_l3_kb: 8192, // 8 MiB L3 cache
            ..Default::default()
        };
        let memory = MemoryInfo::default();
        let storage = StorageInfo::default();

        let chunk_size = OptimizationParams::calculate_optimal_chunk_size(&cpu, &memory, &storage);
        assert!(chunk_size >= 4 * 1024); // At least 4 KiB
        assert!(chunk_size <= 64 * 1024 * 1024); // At most 64 MiB
    }

    #[test]
    fn test_workload_adjustment() {
        let mut params = OptimizationParams::default();
        let original_thread_count = params.thread_count;

        params.adjust_for_workload(WorkloadType::CpuIntensive);
        assert!(params.thread_count >= original_thread_count);

        params.adjust_for_workload(WorkloadType::MemoryIntensive);
        assert!(params.enable_prefetch);
    }

    #[test]
    fn test_cache_params() {
        let cpu = CpuInfo {
            cache_l1_kb: 32,
            ..Default::default()
        };

        let cache_params = CacheParams::from_cpu(&cpu);
        assert_eq!(cache_params.cache_line_size, 64);
        assert!(cache_params.tile_size > 0);
    }

    #[test]
    fn test_gpu_params() {
        let gpu = GpuInfo {
            vendor: super::super::gpu::GpuVendor::Nvidia,
            compute_units: 2048,
            features: super::super::gpu::GpuFeatures {
                unified_memory: true,
                ..Default::default()
            },
            ..Default::default()
        };

        let gpu_params = GpuParams::from_gpu(&gpu);
        assert_eq!(gpu_params.workgroup_size, 256); // Typical for NVIDIA GPUs
        assert!(gpu_params.use_unified_memory);
        assert_eq!(gpu_params.transfer_strategy, GpuTransferStrategy::Unified);
    }

    #[test]
    fn test_scaling_factor() {
        let params = OptimizationParams::default();

        assert_eq!(params.scaling_factor(1024), 1.0); // Small problem
        assert!(params.scaling_factor(1024 * 1024 * 4) > 1.0); // Larger problem
    }
}