1mod types;
16pub use types::*;
17
18use crate::brick::Scorable;
19use crate::bricks::generators::SimdLoadBrick;
20use crate::config::{ComputeBackend, WorkloadType};
21use crate::error::CbtopError;
22use std::time::{Duration, Instant};
23
/// Headless (no-UI) benchmark runner: configuration is fixed at construction
/// and `run()` produces a complete `BenchmarkResult`.
pub struct HeadlessBenchmark {
    // Echoed into result metadata only; NOTE(review): the visible run() path
    // always drives a SimdLoadBrick regardless of this value — confirm that
    // backend dispatch happens elsewhere.
    backend: ComputeBackend,
    workload: WorkloadType, // workload the brick executes
    size: usize,            // problem size passed to SimdLoadBrick::new
    duration: Duration,     // target duration of the measurement phase
}
31
impl HeadlessBenchmark {
    /// Creates a benchmark with an explicit backend, workload, problem size,
    /// and target measurement duration.
    pub fn new(
        backend: ComputeBackend,
        workload: WorkloadType,
        size: usize,
        duration: Duration,
    ) -> Self {
        Self {
            backend,
            workload,
            size,
            duration,
        }
    }

    /// Runs the full benchmark: warmup, timed measurement, then statistics.
    ///
    /// Phases:
    /// 1. Warmup on a throwaway brick for ~10% of the requested duration
    ///    (floored at 100 ms; doubled for sizes under 100k).
    /// 2. Measurement on a fresh brick until BOTH the requested duration has
    ///    elapsed and a size-dependent minimum iteration count is reached,
    ///    hard-capped at 100_000 iterations.
    /// 3. Result assembly: outlier-aware latency statistics, GFLOPS,
    ///    throughput, score, plus readiness / frequency-drop warnings.
    ///
    /// # Errors
    /// No error path is visible in this body; the `Result` return keeps the
    /// signature stable for fallible backends.
    ///
    /// NOTE(review): `self.backend` only appears in the result metadata —
    /// execution always uses `SimdLoadBrick`. Confirm this is intended.
    pub fn run(&self) -> Result<BenchmarkResult, CbtopError> {
        let system = SystemInfo::detect();
        let start_time = Instant::now();

        // Warmup brick: exercised and then discarded so warmup iterations do
        // not pollute the measured brick's latency history.
        let mut brick = SimdLoadBrick::new(self.size);
        brick.set_workload(self.workload);
        brick.set_intensity(1.0);
        brick.start();

        // Warmup budget: 10% of the requested duration, floored at 100 ms.
        let base_warmup_ms = (self.duration.as_millis() / 10).max(100) as u64;
        // Small workloads get double warmup (per-iteration noise is larger).
        let warmup_duration = if self.size < 100_000 {
            Duration::from_millis(base_warmup_ms * 2)
        } else {
            Duration::from_millis(base_warmup_ms)
        };
        let warmup_start = Instant::now();
        while warmup_start.elapsed() < warmup_duration {
            brick.run_iteration();
        }

        // Fresh brick for the measured phase (shadows the warmup brick).
        let mut brick = SimdLoadBrick::new(self.size);
        brick.set_workload(self.workload);
        brick.set_intensity(1.0);
        brick.start();

        // Sampled before and after measurement to detect throttling.
        let start_freq_mhz = Self::sample_cpu_freq();

        // Smaller problems need more iterations for stable statistics.
        let min_iterations: u64 = if self.size < 10_000 {
            5000
        } else if self.size < 100_000 {
            1000
        } else if self.size < 1_000_000 {
            100
        } else {
            10
        };

        let mut iterations = 0u64;
        let measure_start = Instant::now();

        // The `||` keeps looping while EITHER condition is unmet, i.e. run
        // for at least `duration` AND at least `min_iterations`.
        while measure_start.elapsed() < self.duration || iterations < min_iterations {
            brick.run_iteration();
            iterations += 1;

            // Safety valve: never exceed 100k iterations regardless of time.
            if iterations >= 100_000 {
                break;
            }
        }

        let total_duration = start_time.elapsed();
        brick.stop();

        let end_freq_mhz = Self::sample_cpu_freq();

        let latencies = brick.latency_history_slice();
        let latency_stats = Self::calculate_latency_stats(&latencies);
        let gflops = brick.gflops();
        // Mean latency is treated as milliseconds, so ops/sec = 1000 / mean.
        let throughput = if latency_stats.mean > 0.0 {
            1000.0 / latency_stats.mean
        } else {
            0.0
        };

        let score = brick.score();

        // Start from environment warnings reported by the system probe, then
        // append our own frequency-drop warning below.
        let mut warnings = system.check_benchmark_readiness();

        // Flag probable thermal throttling: >5% frequency drop over the run.
        if let (Some(start), Some(end)) = (start_freq_mhz, end_freq_mhz) {
            if start > 0 {
                let drop_percent = ((start as f64 - end as f64) / start as f64) * 100.0;
                if drop_percent > 5.0 {
                    warnings.push(format!(
                        "CPU frequency dropped {}MHz -> {}MHz ({:.1}% drop) during benchmark. \
                        Possible thermal throttling.",
                        start, end, drop_percent
                    ));
                }
            }
        }

        Ok(BenchmarkResult {
            version: env!("CARGO_PKG_VERSION").to_string(),
            timestamp: chrono::Utc::now().to_rfc3339(),
            duration_secs: total_duration.as_secs_f64(),
            system,
            benchmark: BenchmarkConfig {
                backend: format!("{:?}", self.backend),
                workload: format!("{:?}", self.workload),
                size: self.size,
                iterations,
            },
            results: BenchmarkResults {
                gflops,
                throughput_ops_sec: throughput,
                latency_ms: latency_stats,
            },
            score: score.into(),
            warnings,
        })
    }

    /// Reads the current CPU frequency in MHz.
    ///
    /// Linux only: reads sysfs `scaling_cur_freq` for cpu0 (reported in kHz)
    /// and converts to MHz. Returns `None` on other platforms or when the
    /// file cannot be read or parsed.
    fn sample_cpu_freq() -> Option<u32> {
        #[cfg(target_os = "linux")]
        {
            let path = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq";
            if let Ok(content) = std::fs::read_to_string(path) {
                // sysfs reports kHz; convert to MHz.
                return content.trim().parse::<u32>().ok().map(|khz| khz / 1000);
            }
        }
        None
    }

    /// Computes latency statistics from the raw latency history.
    ///
    /// Mean/min/max/CV are computed on IQR-filtered data when at least 10
    /// samples survive filtering (otherwise the raw data is used), while the
    /// percentiles are taken from the UNFILTERED samples so p95/p99 still
    /// reflect tail behavior. NOTE(review): confirm this filtered/unfiltered
    /// asymmetry is intentional.
    ///
    /// Returns all-zero stats for an empty input.
    ///
    /// # Panics
    /// Panics if any sample is NaN (the sort comparator `expect`s total
    /// comparability).
    fn calculate_latency_stats(latencies: &[f64]) -> LatencyStats {
        if latencies.is_empty() {
            return LatencyStats {
                mean: 0.0,
                min: 0.0,
                max: 0.0,
                p50: 0.0,
                p95: 0.0,
                p99: 0.0,
                cv_percent: 0.0,
            };
        }

        // Use the IQR-filtered set only when enough samples survive;
        // otherwise fall back to the raw data to avoid stats on a remnant.
        let filtered = Self::filter_outliers_iqr(latencies);
        let data = if filtered.len() >= 10 {
            &filtered
        } else {
            latencies
        };

        let n = data.len() as f64;
        let mean = data.iter().sum::<f64>() / n;
        let min = data.iter().cloned().fold(f64::INFINITY, f64::min);
        let max = data.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        // Population variance; CV (coefficient of variation) as a percent of
        // the mean expresses run-to-run stability.
        let variance = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
        let std_dev = variance.sqrt();
        let cv_percent = if mean > 0.0 {
            (std_dev / mean) * 100.0
        } else {
            0.0
        };

        // Percentiles over the raw samples (see doc comment above).
        let mut sorted = latencies.to_vec();
        sorted.sort_by(|a, b| {
            a.partial_cmp(b)
                .expect("latency values MUST be comparable (no NaN)")
        });

        // Nearest-rank percentile via rounded fractional index.
        let percentile = |p: f64| -> f64 {
            let idx = (p * (sorted.len() - 1) as f64).round() as usize;
            sorted[idx.min(sorted.len() - 1)]
        };

        LatencyStats {
            mean,
            min,
            max,
            p50: percentile(0.50),
            p95: percentile(0.95),
            p99: percentile(0.99),
            cv_percent,
        }
    }

    /// Removes outliers using Tukey's IQR fences: keeps values within
    /// `[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`, preserving the original sample order.
    /// Inputs with fewer than 4 samples are returned unchanged (quartiles
    /// are meaningless there).
    ///
    /// # Panics
    /// Panics if any value is NaN (sort comparator `expect`s comparability).
    fn filter_outliers_iqr(data: &[f64]) -> Vec<f64> {
        if data.len() < 4 {
            return data.to_vec();
        }

        let mut sorted = data.to_vec();
        sorted.sort_by(|a, b| {
            a.partial_cmp(b)
                .expect("data values MUST be comparable (no NaN)")
        });

        // Approximate quartiles by index (n/4 and 3n/4) — no interpolation.
        let n = sorted.len();
        let q1_idx = n / 4;
        let q3_idx = (3 * n) / 4;

        let q1 = sorted[q1_idx];
        let q3 = sorted[q3_idx];
        let iqr = q3 - q1;

        let lower_bound = q1 - 1.5 * iqr;
        let upper_bound = q3 + 1.5 * iqr;

        // Filter the ORIGINAL (unsorted) data so ordering is preserved.
        data.iter()
            .cloned()
            .filter(|&x| x >= lower_bound && x <= upper_bound)
            .collect()
    }
}
270
/// Fluent builder for `Benchmark`. Every field is optional; unset fields
/// receive defaults in `build` (SIMD backend, GEMM workload, 1M elements,
/// 5-second duration).
#[derive(Default)]
pub struct BenchmarkBuilder {
    backend: Option<ComputeBackend>,
    workload: Option<WorkloadType>,
    size: Option<usize>,
    duration: Option<Duration>,
}
303
304impl BenchmarkBuilder {
305 pub fn new() -> Self {
307 Self::default()
308 }
309
310 pub fn backend(mut self, backend: ComputeBackend) -> Self {
312 self.backend = Some(backend);
313 self
314 }
315
316 pub fn backend_str(mut self, backend: &str) -> Self {
318 self.backend = Some(match backend.to_lowercase().as_str() {
319 "cuda" => ComputeBackend::Cuda,
320 "wgpu" => ComputeBackend::Wgpu,
321 "simd" => ComputeBackend::Simd,
322 _ => ComputeBackend::Simd, });
324 self
325 }
326
327 pub fn workload_type(mut self, workload: WorkloadType) -> Self {
329 self.workload = Some(workload);
330 self
331 }
332
333 pub fn workload(mut self, workload: &str) -> Self {
335 self.workload = Some(match workload.to_lowercase().as_str() {
336 "dot" | "dotproduct" | "dot_product" => WorkloadType::Gemm,
337 "elementwise" | "element_wise" => WorkloadType::Elementwise,
338 "reduction" | "reduce" => WorkloadType::Reduction,
339 "bandwidth" | "memcpy" => WorkloadType::Bandwidth,
340 "conv2d" | "conv" | "convolution" => WorkloadType::Conv2d,
341 "attention" | "attn" => WorkloadType::Attention,
342 "all" => WorkloadType::All,
343 _ => WorkloadType::Gemm, });
345 self
346 }
347
348 pub fn size(mut self, size: usize) -> Self {
350 self.size = Some(size);
351 self
352 }
353
354 pub fn duration(mut self, duration: Duration) -> Self {
356 self.duration = Some(duration);
357 self
358 }
359
360 pub fn duration_secs(mut self, secs: u64) -> Self {
362 self.duration = Some(Duration::from_secs(secs));
363 self
364 }
365
366 pub fn build(self) -> Result<Benchmark, CbtopError> {
368 Ok(Benchmark {
369 inner: HeadlessBenchmark::new(
370 self.backend.unwrap_or(ComputeBackend::Simd),
371 self.workload.unwrap_or(WorkloadType::Gemm),
372 self.size.unwrap_or(1_000_000),
373 self.duration.unwrap_or(Duration::from_secs(5)),
374 ),
375 })
376 }
377}
378
/// Public facade over `HeadlessBenchmark`; construct one via
/// `Benchmark::builder()`.
pub struct Benchmark {
    inner: HeadlessBenchmark, // fully-configured headless runner
}
385
386impl Benchmark {
387 pub fn builder() -> BenchmarkBuilder {
389 BenchmarkBuilder::new()
390 }
391
392 pub fn run(&self) -> Result<BenchmarkResult, CbtopError> {
394 self.inner.run()
395 }
396
397 pub fn run_with_baseline(
399 &self,
400 baseline: &BenchmarkResult,
401 threshold: f64,
402 ) -> Result<(BenchmarkResult, RegressionResult), CbtopError> {
403 let result = self.inner.run()?;
404 let regression = result.check_regression(baseline, threshold);
405 Ok((result, regression))
406 }
407}
408
409#[cfg(test)]
410mod tests;