1use crate::config::{ComputeBackend, WorkloadType};
4use crate::error::CbtopError;
5use crate::headless::Benchmark;
6use serde::{Deserialize, Serialize};
7use std::path::PathBuf;
8use std::time::Duration;
9
10use super::cpu_detect::CpuCapabilities;
11
/// Describes one benchmark workload plus the constants needed to estimate
/// its theoretical peak throughput for a given problem size.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WorkloadConfig {
    /// Which kernel to run (e.g. `Gemm`, `Elementwise`, `Reduction`, `Bandwidth`).
    pub workload: WorkloadType,
    /// Human-readable name recorded in each `BaselineEntry`.
    pub name: String,
    /// Nominal peak throughput in GFLOP/s.
    /// NOTE(review): not read by `theoretical_peak_for_size` in this file,
    /// which delegates to the CPU model instead — confirm intended use.
    pub theoretical_peak_gflops: f64,
    /// True when the workload is limited by memory bandwidth rather than compute.
    pub memory_bound: bool,
    /// Arithmetic-intensity denominator: bytes moved per floating-point op.
    /// Missing values in serialized input default to 8.0 (see `default_bytes_per_flop`).
    #[serde(default = "default_bytes_per_flop")]
    pub bytes_per_flop: f64,
}
30
/// Serde default for `WorkloadConfig::bytes_per_flop`: 8 bytes moved per FLOP.
fn default_bytes_per_flop() -> f64 {
    8.0
}
34
35impl WorkloadConfig {
36 pub fn theoretical_peak_for_size(&self, size: usize, cpu: &CpuCapabilities) -> f64 {
38 let bytes_per_element = 4;
40 cpu.theoretical_peak_for_size(size, bytes_per_element, self.bytes_per_flop)
41 }
42}
43
/// One measured result: a single (workload, size, backend) combination.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineEntry {
    /// Workload name, copied from `WorkloadConfig::name`.
    pub workload: String,
    /// Problem size in elements.
    pub size: usize,
    /// Backend identifier, rendered via `Debug` formatting.
    pub backend: String,
    /// Measured throughput in GFLOP/s.
    pub gflops: f64,
    /// Measured throughput divided by theoretical peak, clamped to at most 1.0.
    pub efficiency: f64,
    /// Coefficient of variation of latency samples, in percent.
    pub cv_percent: f64,
    /// Overall benchmark score (taken from `Benchmark` results; scale defined there).
    pub score: u8,
}
62
/// A collection of baseline measurements plus run metadata, persisted as JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineReport {
    /// Crate version that produced the report (`CARGO_PKG_VERSION`).
    pub version: String,
    /// RFC 3339 UTC timestamp of when the baseline was collected.
    pub timestamp: String,
    /// One entry per (workload, size, backend) combination measured.
    pub entries: Vec<BaselineEntry>,
    /// One-line human-readable description of the host CPU.
    pub system: String,
}
75
76impl BaselineReport {
77 pub fn save(&self, path: &std::path::Path) -> Result<(), CbtopError> {
79 let json = serde_json::to_string_pretty(self)
80 .map_err(|e| CbtopError::Config(format!("JSON serialization failed: {}", e)))?;
81 std::fs::write(path, json)
82 .map_err(|e| CbtopError::Config(format!("Failed to write file: {}", e)))?;
83 Ok(())
84 }
85
86 pub fn load(path: &std::path::Path) -> Result<Self, CbtopError> {
88 let json = std::fs::read_to_string(path)
89 .map_err(|e| CbtopError::Config(format!("Failed to read file: {}", e)))?;
90 serde_json::from_str(&json)
91 .map_err(|e| CbtopError::Config(format!("JSON parsing failed: {}", e)))
92 }
93}
94
/// Configuration for a baseline-collection run: which workloads, backends,
/// and problem sizes to benchmark, how long each measurement runs, and where
/// the resulting report is meant to be stored.
pub struct OptimizationSuite {
    /// Workloads to benchmark.
    pub workloads: Vec<WorkloadConfig>,
    /// Compute backends to run each workload on.
    pub backends: Vec<ComputeBackend>,
    /// Problem sizes (element counts) to sweep per workload.
    pub sizes: Vec<usize>,
    /// Measurement duration for each individual benchmark run.
    pub duration: Duration,
    /// Default path for the baseline JSON report.
    pub baseline_file: PathBuf,
}
108
109impl Default for OptimizationSuite {
110 fn default() -> Self {
111 Self::standard()
112 }
113}
114
115impl OptimizationSuite {
116 pub fn standard() -> Self {
118 Self {
119 workloads: vec![
120 WorkloadConfig {
121 workload: WorkloadType::Gemm,
122 name: "dot_product".to_string(),
123 theoretical_peak_gflops: 100.0, memory_bound: false,
125 bytes_per_flop: 4.0,
127 },
128 WorkloadConfig {
129 workload: WorkloadType::Elementwise,
130 name: "elementwise_mul".to_string(),
131 theoretical_peak_gflops: 50.0, memory_bound: true,
133 bytes_per_flop: 12.0,
135 },
136 WorkloadConfig {
137 workload: WorkloadType::Reduction,
138 name: "sum_reduction".to_string(),
139 theoretical_peak_gflops: 50.0, memory_bound: true,
141 bytes_per_flop: 4.0,
143 },
144 WorkloadConfig {
145 workload: WorkloadType::Bandwidth,
146 name: "memory_bandwidth".to_string(),
147 theoretical_peak_gflops: 30.0, memory_bound: true,
149 bytes_per_flop: 8.0,
151 },
152 ],
153 backends: vec![ComputeBackend::Simd],
154 sizes: vec![
155 1_000, 10_000, 100_000, 1_000_000, 4_000_000, 16_000_000, ],
162 duration: Duration::from_secs(3),
163 baseline_file: PathBuf::from("benchmarks/baseline.json"),
164 }
165 }
166
167 pub fn quick() -> Self {
169 Self {
170 workloads: vec![
171 WorkloadConfig {
172 workload: WorkloadType::Gemm,
173 name: "dot_product".to_string(),
174 theoretical_peak_gflops: 100.0,
175 memory_bound: false,
176 bytes_per_flop: 4.0,
177 },
178 WorkloadConfig {
179 workload: WorkloadType::Elementwise,
180 name: "elementwise_mul".to_string(),
181 theoretical_peak_gflops: 50.0,
182 memory_bound: true,
183 bytes_per_flop: 12.0,
184 },
185 ],
186 backends: vec![ComputeBackend::Simd],
187 sizes: vec![10_000, 1_000_000],
188 duration: Duration::from_secs(1),
189 baseline_file: PathBuf::from("benchmarks/baseline-quick.json"),
190 }
191 }
192
193 pub fn collect_baseline(&self) -> Result<BaselineReport, CbtopError> {
195 let mut entries = Vec::new();
196 let cpu = CpuCapabilities::detect();
197
198 let mut prev_working_set_mb: usize = 0;
199
200 for workload in &self.workloads {
201 for &size in &self.sizes {
202 for &backend in &self.backends {
203 if !entries.is_empty() {
207 let cooldown_ms = 100 + (prev_working_set_mb * 10).min(400);
208 std::thread::sleep(Duration::from_millis(cooldown_ms as u64));
209
210 std::sync::atomic::fence(std::sync::atomic::Ordering::SeqCst);
213 }
214
215 prev_working_set_mb =
218 ((size as f64 * workload.bytes_per_flop) / (1024.0 * 1024.0)) as usize;
219
220 let result = Benchmark::builder()
221 .workload_type(workload.workload)
222 .size(size)
223 .backend(backend)
224 .duration(self.duration)
225 .build()?
226 .run()?;
227
228 let theoretical_peak = workload.theoretical_peak_for_size(size, &cpu);
230 let efficiency = if theoretical_peak > 0.0 {
231 (result.results.gflops / theoretical_peak).min(1.0)
234 } else {
235 0.0
236 };
237
238 entries.push(BaselineEntry {
239 workload: workload.name.clone(),
240 size,
241 backend: format!("{:?}", backend),
242 gflops: result.results.gflops,
243 efficiency,
244 cv_percent: result.results.latency_ms.cv_percent,
245 score: result.score.total,
246 });
247 }
248 }
249 }
250
251 let timestamp = chrono::Utc::now().to_rfc3339();
252
253 Ok(BaselineReport {
254 version: env!("CARGO_PKG_VERSION").to_string(),
255 timestamp,
256 entries,
257 system: Self::get_system_info(&cpu),
258 })
259 }
260
261 pub(crate) fn get_system_info(cpu: &CpuCapabilities) -> String {
262 format!(
263 "{} cores @ {} MHz, AVX2={}, AVX512={}, L3={}MB, mem_bw={:.0} GB/s",
264 cpu.cores,
265 cpu.max_freq_mhz,
266 cpu.has_avx2,
267 cpu.has_avx512,
268 cpu.l3_cache / (1024 * 1024),
269 cpu.mem_bandwidth_gbs
270 )
271 }
272}