Skip to main content

cbtop/optimize/
cpu_detect.rs

//! CPU detection for accurate theoretical peak calculation.
2
/// Detected CPU capabilities used to estimate theoretical peak throughput.
///
/// All fields are public, so a value can also be built by hand (for example
/// in tests) instead of going through runtime detection.
#[derive(Debug, Clone, PartialEq)]
pub struct CpuCapabilities {
    /// Number of CPUs used for peak estimates.
    ///
    /// `detect` fills this from `std::thread::available_parallelism`, which
    /// counts logical CPUs (hardware threads), not physical cores.
    pub cores: usize,
    /// Maximum clock frequency in MHz.
    pub max_freq_mhz: u32,
    /// Whether AVX-512 (foundation, `avx512f`) is supported.
    pub has_avx512: bool,
    /// Whether AVX2 is supported.
    pub has_avx2: bool,
    /// L1 data cache size in bytes.
    pub l1d_cache: usize,
    /// L2 cache size in bytes.
    pub l2_cache: usize,
    /// L3 cache size in bytes.
    pub l3_cache: usize,
    /// Estimated memory bandwidth in GB/s.
    pub mem_bandwidth_gbs: f64,
}
23
24impl Default for CpuCapabilities {
25    fn default() -> Self {
26        Self::detect()
27    }
28}
29
30impl CpuCapabilities {
31    /// Detect CPU capabilities at runtime
32    pub fn detect() -> Self {
33        let cores = std::thread::available_parallelism()
34            .map(|n| n.get())
35            .unwrap_or(1);
36
37        // Use CPUID to detect features
38        #[cfg(target_arch = "x86_64")]
39        let (has_avx512, has_avx2) = {
40            (
41                is_x86_feature_detected!("avx512f"),
42                is_x86_feature_detected!("avx2"),
43            )
44        };
45
46        #[cfg(not(target_arch = "x86_64"))]
47        let (has_avx512, has_avx2) = (false, false);
48
49        // Estimate max frequency (conservative default, can be improved with sysfs)
50        let max_freq_mhz = Self::detect_max_freq().unwrap_or(3500);
51
52        // Estimate cache sizes (conservative defaults for desktop CPUs)
53        // These could be read from /sys/devices/system/cpu/cpu0/cache on Linux
54        let (l1d_cache, l2_cache, l3_cache) = Self::detect_cache_sizes();
55
56        // Estimate memory bandwidth based on core count
57        // Conservative: ~4 GB/s per core for DDR4, ~6 GB/s for DDR5
58        let mem_bandwidth_gbs = (cores as f64) * 4.0;
59
60        Self {
61            cores,
62            max_freq_mhz,
63            has_avx512,
64            has_avx2,
65            l1d_cache,
66            l2_cache,
67            l3_cache,
68            mem_bandwidth_gbs,
69        }
70    }
71
72    /// Detect maximum CPU frequency from sysfs
73    fn detect_max_freq() -> Option<u32> {
74        #[cfg(target_os = "linux")]
75        {
76            if let Ok(content) =
77                std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq")
78            {
79                // cpuinfo_max_freq is in kHz
80                return content.trim().parse::<u32>().ok().map(|khz| khz / 1000);
81            }
82        }
83        None
84    }
85
86    /// Detect cache sizes from sysfs
87    fn detect_cache_sizes() -> (usize, usize, usize) {
88        #[cfg(target_os = "linux")]
89        {
90            let l1d = Self::read_cache_size(0, 0).unwrap_or(32 * 1024); // 32 KB default
91            let l2 = Self::read_cache_size(0, 2).unwrap_or(512 * 1024); // 512 KB default
92            let l3 = Self::read_cache_size(0, 3).unwrap_or(32 * 1024 * 1024); // 32 MB default
93            (l1d, l2, l3)
94        }
95
96        #[cfg(not(target_os = "linux"))]
97        {
98            (32 * 1024, 512 * 1024, 32 * 1024 * 1024)
99        }
100    }
101
102    #[cfg(target_os = "linux")]
103    fn read_cache_size(cpu: u32, index: u32) -> Option<usize> {
104        let path = format!(
105            "/sys/devices/system/cpu/cpu{}/cache/index{}/size",
106            cpu, index
107        );
108        if let Ok(content) = std::fs::read_to_string(&path) {
109            let s = content.trim();
110            if let Some(kb_str) = s.strip_suffix('K') {
111                return kb_str.parse::<usize>().ok().map(|kb| kb * 1024);
112            } else if let Some(mb_str) = s.strip_suffix('M') {
113                return mb_str.parse::<usize>().ok().map(|mb| mb * 1024 * 1024);
114            }
115        }
116        None
117    }
118
119    /// Calculate theoretical peak GFLOP/s for compute-bound operations
120    pub fn compute_peak_gflops(&self) -> f64 {
121        let freq_ghz = self.max_freq_mhz as f64 / 1000.0;
122
123        // f32 FLOPs per cycle per core
124        let flops_per_cycle = if self.has_avx512 {
125            // AVX-512: 2 × 512-bit FMA units = 2 × 16 × 2 = 64 FLOPs/cycle (theoretical)
126            // Most CPUs have 2 AVX-512 units, but frequency drops, so use conservative 32
127            32.0
128        } else if self.has_avx2 {
129            // AVX2: 2 × 256-bit FMA units = 2 × 8 × 2 = 32 FLOPs/cycle (theoretical)
130            // Conservative: single FMA port = 16
131            16.0
132        } else {
133            // SSE: 4 FLOPs/cycle
134            4.0
135        };
136
137        self.cores as f64 * freq_ghz * flops_per_cycle
138    }
139
140    /// Calculate theoretical peak GFLOP/s for memory-bound operations
141    /// bytes_per_flop: number of bytes that must be transferred per FLOP
142    pub fn memory_peak_gflops(&self, bytes_per_flop: f64) -> f64 {
143        self.mem_bandwidth_gbs / bytes_per_flop
144    }
145
146    /// Calculate theoretical peak for a given size (cache vs memory bound)
147    /// Uses bytes_per_flop to estimate total working set (includes all arrays)
148    pub fn theoretical_peak_for_size(
149        &self,
150        size: usize,
151        _bytes_per_element: usize,
152        bytes_per_flop: f64,
153    ) -> f64 {
154        // Calculate working set using bytes_per_flop which accounts for all arrays
155        // e.g., elementwise_mul: 12 bytes/FLOP = 3 arrays × 4 bytes
156        // This gives accurate cache behavior estimation
157        let working_set_bytes = (size as f64 * bytes_per_flop) as usize;
158
159        // Determine which cache level (if any) the data fits in
160        // Use 80% of cache as threshold to account for other data
161        if working_set_bytes < (self.l1d_cache * 80 / 100) {
162            // L1 cache: effectively compute-bound
163            self.compute_peak_gflops()
164        } else if working_set_bytes < (self.l2_cache * 80 / 100) {
165            // L2 cache: ~50% of compute peak
166            self.compute_peak_gflops() * 0.5
167        } else if working_set_bytes < (self.l3_cache * 80 / 100) {
168            // L3 cache: ~25% of compute peak
169            self.compute_peak_gflops() * 0.25
170        } else {
171            // Main memory: memory-bound
172            self.memory_peak_gflops(bytes_per_flop)
173        }
174    }
175}