// cbtop/optimize/cpu_detect.rs
#[derive(Debug, Clone)]
/// Snapshot of the host CPU characteristics used for performance estimates.
///
/// Values are normally produced by `CpuCapabilities::detect`, which falls back
/// to fixed defaults for anything it cannot read from the system.
pub struct CpuCapabilities {
    /// Degree of parallelism available to the process (at least 1).
    pub cores: usize,
    /// Maximum CPU clock frequency, in MHz.
    pub max_freq_mhz: u32,
    /// Whether the AVX-512 Foundation (`avx512f`) instruction set is available.
    pub has_avx512: bool,
    /// Whether the AVX2 instruction set is available.
    pub has_avx2: bool,
    /// L1 data cache size, in bytes.
    pub l1d_cache: usize,
    /// L2 cache size, in bytes.
    pub l2_cache: usize,
    /// L3 cache size, in bytes.
    pub l3_cache: usize,
    /// Estimated memory bandwidth, in GB/s.
    pub mem_bandwidth_gbs: f64,
}
23
24impl Default for CpuCapabilities {
25 fn default() -> Self {
26 Self::detect()
27 }
28}
29
30impl CpuCapabilities {
31 pub fn detect() -> Self {
33 let cores = std::thread::available_parallelism()
34 .map(|n| n.get())
35 .unwrap_or(1);
36
37 #[cfg(target_arch = "x86_64")]
39 let (has_avx512, has_avx2) = {
40 (
41 is_x86_feature_detected!("avx512f"),
42 is_x86_feature_detected!("avx2"),
43 )
44 };
45
46 #[cfg(not(target_arch = "x86_64"))]
47 let (has_avx512, has_avx2) = (false, false);
48
49 let max_freq_mhz = Self::detect_max_freq().unwrap_or(3500);
51
52 let (l1d_cache, l2_cache, l3_cache) = Self::detect_cache_sizes();
55
56 let mem_bandwidth_gbs = (cores as f64) * 4.0;
59
60 Self {
61 cores,
62 max_freq_mhz,
63 has_avx512,
64 has_avx2,
65 l1d_cache,
66 l2_cache,
67 l3_cache,
68 mem_bandwidth_gbs,
69 }
70 }
71
72 fn detect_max_freq() -> Option<u32> {
74 #[cfg(target_os = "linux")]
75 {
76 if let Ok(content) =
77 std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq")
78 {
79 return content.trim().parse::<u32>().ok().map(|khz| khz / 1000);
81 }
82 }
83 None
84 }
85
86 fn detect_cache_sizes() -> (usize, usize, usize) {
88 #[cfg(target_os = "linux")]
89 {
90 let l1d = Self::read_cache_size(0, 0).unwrap_or(32 * 1024); let l2 = Self::read_cache_size(0, 2).unwrap_or(512 * 1024); let l3 = Self::read_cache_size(0, 3).unwrap_or(32 * 1024 * 1024); (l1d, l2, l3)
94 }
95
96 #[cfg(not(target_os = "linux"))]
97 {
98 (32 * 1024, 512 * 1024, 32 * 1024 * 1024)
99 }
100 }
101
102 #[cfg(target_os = "linux")]
103 fn read_cache_size(cpu: u32, index: u32) -> Option<usize> {
104 let path = format!(
105 "/sys/devices/system/cpu/cpu{}/cache/index{}/size",
106 cpu, index
107 );
108 if let Ok(content) = std::fs::read_to_string(&path) {
109 let s = content.trim();
110 if let Some(kb_str) = s.strip_suffix('K') {
111 return kb_str.parse::<usize>().ok().map(|kb| kb * 1024);
112 } else if let Some(mb_str) = s.strip_suffix('M') {
113 return mb_str.parse::<usize>().ok().map(|mb| mb * 1024 * 1024);
114 }
115 }
116 None
117 }
118
119 pub fn compute_peak_gflops(&self) -> f64 {
121 let freq_ghz = self.max_freq_mhz as f64 / 1000.0;
122
123 let flops_per_cycle = if self.has_avx512 {
125 32.0
128 } else if self.has_avx2 {
129 16.0
132 } else {
133 4.0
135 };
136
137 self.cores as f64 * freq_ghz * flops_per_cycle
138 }
139
140 pub fn memory_peak_gflops(&self, bytes_per_flop: f64) -> f64 {
143 self.mem_bandwidth_gbs / bytes_per_flop
144 }
145
146 pub fn theoretical_peak_for_size(
149 &self,
150 size: usize,
151 _bytes_per_element: usize,
152 bytes_per_flop: f64,
153 ) -> f64 {
154 let working_set_bytes = (size as f64 * bytes_per_flop) as usize;
158
159 if working_set_bytes < (self.l1d_cache * 80 / 100) {
162 self.compute_peak_gflops()
164 } else if working_set_bytes < (self.l2_cache * 80 / 100) {
165 self.compute_peak_gflops() * 0.5
167 } else if working_set_bytes < (self.l3_cache * 80 / 100) {
168 self.compute_peak_gflops() * 0.25
170 } else {
171 self.memory_peak_gflops(bytes_per_flop)
173 }
174 }
175}