Skip to main content

cbtop/headless/
mod.rs

1//! Headless benchmark mode for CI/CD and AI agent integration
2//!
3//! Enables cbtop to run without a TTY, outputting machine-readable results.
4//!
5//! # Example
6//!
7//! ```bash
8//! # Run headless benchmark
9//! cbtop --headless --format json --duration 5
10//!
11//! # Use bench subcommand
12//! cbtop bench --backend simd --workload gemm --duration 5
13//! ```
14
15mod types;
16pub use types::*;
17
18use crate::brick::Scorable;
19use crate::bricks::generators::SimdLoadBrick;
20use crate::config::{ComputeBackend, WorkloadType};
21use crate::error::CbtopError;
22use std::time::{Duration, Instant};
23
24/// Headless benchmark runner
25pub struct HeadlessBenchmark {
26    backend: ComputeBackend,
27    workload: WorkloadType,
28    size: usize,
29    duration: Duration,
30}
31
32impl HeadlessBenchmark {
33    /// Create a new headless benchmark
34    pub fn new(
35        backend: ComputeBackend,
36        workload: WorkloadType,
37        size: usize,
38        duration: Duration,
39    ) -> Self {
40        Self {
41            backend,
42            workload,
43            size,
44            duration,
45        }
46    }
47
48    /// Run the benchmark and return results
49    pub fn run(&self) -> Result<BenchmarkResult, CbtopError> {
50        let system = SystemInfo::detect();
51        let start_time = Instant::now();
52
53        // Create and configure the load brick
54        let mut brick = SimdLoadBrick::new(self.size);
55        brick.set_workload(self.workload);
56        brick.set_intensity(1.0); // Full intensity for benchmarking
57        brick.start();
58
59        // OPT-013: Warmup phase with scaled duration
60        // Longer warmup for small sizes to ensure stable cache/branch predictor state
61        // Small sizes complete quickly, need more warmup time to reach steady state
62        let base_warmup_ms = (self.duration.as_millis() / 10).max(100) as u64;
63        let warmup_duration = if self.size < 100_000 {
64            Duration::from_millis(base_warmup_ms * 2) // 2x warmup for small sizes
65        } else {
66            Duration::from_millis(base_warmup_ms)
67        };
68        let warmup_start = Instant::now();
69        while warmup_start.elapsed() < warmup_duration {
70            brick.run_iteration();
71        }
72
73        // Reset metrics after warmup
74        let mut brick = SimdLoadBrick::new(self.size);
75        brick.set_workload(self.workload);
76        brick.set_intensity(1.0);
77        brick.start();
78
79        // OPT-014: Sample CPU frequency at start of measurement
80        let start_freq_mhz = Self::sample_cpu_freq();
81
82        // OPT-008: Calculate minimum iterations for statistical stability
83        // Small workloads complete too quickly, causing high variance (CV > 600%)
84        // Require more iterations for smaller sizes to get stable measurements
85        let min_iterations: u64 = if self.size < 10_000 {
86            5000 // Very small: need many iterations
87        } else if self.size < 100_000 {
88            1000 // Small: need moderate iterations
89        } else if self.size < 1_000_000 {
90            100 // Medium: fewer iterations needed
91        } else {
92            10 // Large: minimal iterations (each takes significant time)
93        };
94
95        // Measurement phase
96        let mut iterations = 0u64;
97        let measure_start = Instant::now();
98
99        // OPT-008: Run until both duration AND minimum iterations are satisfied
100        while measure_start.elapsed() < self.duration || iterations < min_iterations {
101            brick.run_iteration();
102            iterations += 1;
103
104            // Safety: cap at 100K iterations to prevent runaway benchmarks
105            if iterations >= 100_000 {
106                break;
107            }
108        }
109
110        let total_duration = start_time.elapsed();
111        brick.stop();
112
113        // OPT-014: Sample CPU frequency at end and detect throttling
114        let end_freq_mhz = Self::sample_cpu_freq();
115
116        // Calculate statistics using brick's internal latency history (PERF-002)
117        // This ensures CV calculation matches what score() uses
118        let latencies = brick.latency_history_slice();
119        let latency_stats = Self::calculate_latency_stats(&latencies);
120        let gflops = brick.gflops();
121        let throughput = if latency_stats.mean > 0.0 {
122            1000.0 / latency_stats.mean
123        } else {
124            0.0
125        };
126
127        // Get score
128        let score = brick.score();
129
130        // PERF-003: Check for benchmark environment warnings
131        let mut warnings = system.check_benchmark_readiness();
132
133        // OPT-014: Detect frequency throttling during benchmark
134        if let (Some(start), Some(end)) = (start_freq_mhz, end_freq_mhz) {
135            if start > 0 {
136                let drop_percent = ((start as f64 - end as f64) / start as f64) * 100.0;
137                if drop_percent > 5.0 {
138                    warnings.push(format!(
139                        "CPU frequency dropped {}MHz -> {}MHz ({:.1}% drop) during benchmark. \
140                         Possible thermal throttling.",
141                        start, end, drop_percent
142                    ));
143                }
144            }
145        }
146
147        Ok(BenchmarkResult {
148            version: env!("CARGO_PKG_VERSION").to_string(),
149            timestamp: chrono::Utc::now().to_rfc3339(),
150            duration_secs: total_duration.as_secs_f64(),
151            system,
152            benchmark: BenchmarkConfig {
153                backend: format!("{:?}", self.backend),
154                workload: format!("{:?}", self.workload),
155                size: self.size,
156                iterations,
157            },
158            results: BenchmarkResults {
159                gflops,
160                throughput_ops_sec: throughput,
161                latency_ms: latency_stats,
162            },
163            score: score.into(),
164            warnings,
165        })
166    }
167
168    /// OPT-014: Sample current CPU frequency for throttling detection
169    fn sample_cpu_freq() -> Option<u32> {
170        #[cfg(target_os = "linux")]
171        {
172            let path = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq";
173            if let Ok(content) = std::fs::read_to_string(path) {
174                return content.trim().parse::<u32>().ok().map(|khz| khz / 1000);
175            }
176        }
177        None
178    }
179
180    fn calculate_latency_stats(latencies: &[f64]) -> LatencyStats {
181        if latencies.is_empty() {
182            return LatencyStats {
183                mean: 0.0,
184                min: 0.0,
185                max: 0.0,
186                p50: 0.0,
187                p95: 0.0,
188                p99: 0.0,
189                cv_percent: 0.0,
190            };
191        }
192
193        // OPT-015: Filter outliers using IQR method before calculating CV
194        // This reduces measurement noise from system interrupts, GC pauses, etc.
195        let filtered = Self::filter_outliers_iqr(latencies);
196        let data = if filtered.len() >= 10 {
197            &filtered
198        } else {
199            latencies
200        };
201
202        let n = data.len() as f64;
203        let mean = data.iter().sum::<f64>() / n;
204        let min = data.iter().cloned().fold(f64::INFINITY, f64::min);
205        let max = data.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
206
207        // Calculate standard deviation on filtered data
208        let variance = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
209        let std_dev = variance.sqrt();
210        let cv_percent = if mean > 0.0 {
211            (std_dev / mean) * 100.0
212        } else {
213            0.0
214        };
215
216        // Calculate percentiles on original data (for accurate p95/p99)
217        let mut sorted = latencies.to_vec();
218        sorted.sort_by(|a, b| {
219            a.partial_cmp(b)
220                .expect("latency values MUST be comparable (no NaN)")
221        });
222
223        let percentile = |p: f64| -> f64 {
224            let idx = (p * (sorted.len() - 1) as f64).round() as usize;
225            sorted[idx.min(sorted.len() - 1)]
226        };
227
228        LatencyStats {
229            mean,
230            min,
231            max,
232            p50: percentile(0.50),
233            p95: percentile(0.95),
234            p99: percentile(0.99),
235            cv_percent,
236        }
237    }
238
239    /// OPT-015: Filter outliers using IQR (Interquartile Range) method
240    /// Removes values outside Q1 - 1.5*IQR and Q3 + 1.5*IQR
241    fn filter_outliers_iqr(data: &[f64]) -> Vec<f64> {
242        if data.len() < 4 {
243            return data.to_vec();
244        }
245
246        let mut sorted = data.to_vec();
247        sorted.sort_by(|a, b| {
248            a.partial_cmp(b)
249                .expect("data values MUST be comparable (no NaN)")
250        });
251
252        let n = sorted.len();
253        let q1_idx = n / 4;
254        let q3_idx = (3 * n) / 4;
255
256        let q1 = sorted[q1_idx];
257        let q3 = sorted[q3_idx];
258        let iqr = q3 - q1;
259
260        // Use 1.5*IQR rule (standard for outlier detection)
261        let lower_bound = q1 - 1.5 * iqr;
262        let upper_bound = q3 + 1.5 * iqr;
263
264        data.iter()
265            .cloned()
266            .filter(|&x| x >= lower_bound && x <= upper_bound)
267            .collect()
268    }
269}
270
271// ============================================================================
272// Library API for Programmatic Access (HL-007)
273// ============================================================================
274
275/// Builder for creating benchmarks programmatically
276///
277/// This provides an ergonomic API for running cbtop benchmarks from Rust code.
278///
279/// # Example
280///
281/// ```rust,no_run
282/// use cbtop::{Benchmark, BenchmarkResult};
283/// use std::time::Duration;
284///
285/// let result: BenchmarkResult = Benchmark::builder()
286///     .workload("gemm")
287///     .size(1_000_000)
288///     .duration(Duration::from_secs(5))
289///     .build()
290///     .unwrap()
291///     .run()
292///     .unwrap();
293///
294/// println!("GFLOP/s: {}", result.results.gflops);
295/// ```
296#[derive(Default)]
297pub struct BenchmarkBuilder {
298    backend: Option<ComputeBackend>,
299    workload: Option<WorkloadType>,
300    size: Option<usize>,
301    duration: Option<Duration>,
302}
303
304impl BenchmarkBuilder {
305    /// Create a new benchmark builder with defaults
306    pub fn new() -> Self {
307        Self::default()
308    }
309
310    /// Set the compute backend (default: Auto/Simd)
311    pub fn backend(mut self, backend: ComputeBackend) -> Self {
312        self.backend = Some(backend);
313        self
314    }
315
316    /// Set the compute backend from string (e.g., "simd", "cuda", "auto")
317    pub fn backend_str(mut self, backend: &str) -> Self {
318        self.backend = Some(match backend.to_lowercase().as_str() {
319            "cuda" => ComputeBackend::Cuda,
320            "wgpu" => ComputeBackend::Wgpu,
321            "simd" => ComputeBackend::Simd,
322            _ => ComputeBackend::Simd, // Default to SIMD
323        });
324        self
325    }
326
327    /// Set the workload type (default: Gemm)
328    pub fn workload_type(mut self, workload: WorkloadType) -> Self {
329        self.workload = Some(workload);
330        self
331    }
332
333    /// Set the workload type from string (e.g., "gemm", "dot", "elementwise")
334    pub fn workload(mut self, workload: &str) -> Self {
335        self.workload = Some(match workload.to_lowercase().as_str() {
336            "dot" | "dotproduct" | "dot_product" => WorkloadType::Gemm,
337            "elementwise" | "element_wise" => WorkloadType::Elementwise,
338            "reduction" | "reduce" => WorkloadType::Reduction,
339            "bandwidth" | "memcpy" => WorkloadType::Bandwidth,
340            "conv2d" | "conv" | "convolution" => WorkloadType::Conv2d,
341            "attention" | "attn" => WorkloadType::Attention,
342            "all" => WorkloadType::All,
343            _ => WorkloadType::Gemm, // Default to GEMM
344        });
345        self
346    }
347
348    /// Set the problem size (default: 1_000_000)
349    pub fn size(mut self, size: usize) -> Self {
350        self.size = Some(size);
351        self
352    }
353
354    /// Set the benchmark duration (default: 5 seconds)
355    pub fn duration(mut self, duration: Duration) -> Self {
356        self.duration = Some(duration);
357        self
358    }
359
360    /// Set the benchmark duration in seconds
361    pub fn duration_secs(mut self, secs: u64) -> Self {
362        self.duration = Some(Duration::from_secs(secs));
363        self
364    }
365
366    /// Build the benchmark with the configured parameters
367    pub fn build(self) -> Result<Benchmark, CbtopError> {
368        Ok(Benchmark {
369            inner: HeadlessBenchmark::new(
370                self.backend.unwrap_or(ComputeBackend::Simd),
371                self.workload.unwrap_or(WorkloadType::Gemm),
372                self.size.unwrap_or(1_000_000),
373                self.duration.unwrap_or(Duration::from_secs(5)),
374            ),
375        })
376    }
377}
378
/// Benchmark runner for programmatic access
///
/// Created via [`Benchmark::builder()`]. This is a thin wrapper: every run is
/// delegated to the wrapped [`HeadlessBenchmark`].
pub struct Benchmark {
    // Fully configured runner produced by `BenchmarkBuilder::build`
    inner: HeadlessBenchmark,
}
385
386impl Benchmark {
387    /// Create a new benchmark builder
388    pub fn builder() -> BenchmarkBuilder {
389        BenchmarkBuilder::new()
390    }
391
392    /// Run the benchmark and return results
393    pub fn run(&self) -> Result<BenchmarkResult, CbtopError> {
394        self.inner.run()
395    }
396
397    /// Run the benchmark and compare against a baseline
398    pub fn run_with_baseline(
399        &self,
400        baseline: &BenchmarkResult,
401        threshold: f64,
402    ) -> Result<(BenchmarkResult, RegressionResult), CbtopError> {
403        let result = self.inner.run()?;
404        let regression = result.check_regression(baseline, threshold);
405        Ok((result, regression))
406    }
407}
408
409#[cfg(test)]
410mod tests;