// cbtop/optimize/suite.rs
1//! Optimization suite for baseline collection (OPT-001).
2
3use crate::config::{ComputeBackend, WorkloadType};
4use crate::error::CbtopError;
5use crate::headless::Benchmark;
6use serde::{Deserialize, Serialize};
7use std::path::PathBuf;
8use std::time::Duration;
9
10use super::cpu_detect::CpuCapabilities;
11
/// Configuration for a specific workload in the benchmark suite.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WorkloadConfig {
    /// Workload type to execute (passed through to the `Benchmark` builder).
    pub workload: WorkloadType,
    /// Human-readable name; used as the workload key in baseline reports.
    pub name: String,
    /// Legacy: static theoretical peak GFLOP/s (deprecated, use `bytes_per_flop` instead)
    pub theoretical_peak_gflops: f64,
    /// Whether this workload is memory-bound (bandwidth-limited rather than compute-limited).
    pub memory_bound: bool,
    /// Bytes transferred per FLOP (for memory-bound analysis):
    /// - dot_product: read 2 floats (8 bytes) per 2 FLOPs (mul+add) = 4 bytes/FLOP
    /// - elementwise: read 2, write 1 float (12 bytes) per 1 FLOP = 12 bytes/FLOP
    /// - reduction: read 1 float (4 bytes) per 1 FLOP = 4 bytes/FLOP
    ///
    /// Defaults to 8.0 when missing from serialized data (see `default_bytes_per_flop`).
    #[serde(default = "default_bytes_per_flop")]
    pub bytes_per_flop: f64,
}
30
/// Serde fallback for `WorkloadConfig::bytes_per_flop` when the field is
/// absent from serialized data.
fn default_bytes_per_flop() -> f64 {
    // Conservative middle ground between the 4 and 12 bytes/FLOP workloads.
    8.0
}
34
35impl WorkloadConfig {
36    /// Calculate size-aware theoretical peak using detected CPU capabilities
37    pub fn theoretical_peak_for_size(&self, size: usize, cpu: &CpuCapabilities) -> f64 {
38        // Each element is 4 bytes (f32)
39        let bytes_per_element = 4;
40        cpu.theoretical_peak_for_size(size, bytes_per_element, self.bytes_per_flop)
41    }
42}
43
/// Entry in the baseline report: one measured (workload, size, backend) combination.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineEntry {
    /// Workload name (from `WorkloadConfig::name`).
    pub workload: String,
    /// Problem size in elements.
    pub size: usize,
    /// Backend used, rendered via `Debug` formatting.
    pub backend: String,
    /// Achieved GFLOP/s.
    pub gflops: f64,
    /// Efficiency (achieved / theoretical), capped at 1.0 during collection.
    pub efficiency: f64,
    /// Coefficient of variation of the latency measurements (%).
    pub cv_percent: f64,
    /// Quality score (0-100).
    pub score: u8,
}
62
/// Complete baseline report with all measurements, serializable to/from JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineReport {
    /// Version of cbtop that generated this report (from `CARGO_PKG_VERSION`).
    pub version: String,
    /// RFC 3339 timestamp of when the baseline was collected.
    pub timestamp: String,
    /// All baseline entries, one per (workload, size, backend) combination.
    pub entries: Vec<BaselineEntry>,
    /// One-line human-readable system description (cores, frequency, SIMD, caches).
    pub system: String,
}
75
76impl BaselineReport {
77    /// Save baseline to JSON file
78    pub fn save(&self, path: &std::path::Path) -> Result<(), CbtopError> {
79        let json = serde_json::to_string_pretty(self)
80            .map_err(|e| CbtopError::Config(format!("JSON serialization failed: {}", e)))?;
81        std::fs::write(path, json)
82            .map_err(|e| CbtopError::Config(format!("Failed to write file: {}", e)))?;
83        Ok(())
84    }
85
86    /// Load baseline from JSON file
87    pub fn load(path: &std::path::Path) -> Result<Self, CbtopError> {
88        let json = std::fs::read_to_string(path)
89            .map_err(|e| CbtopError::Config(format!("Failed to read file: {}", e)))?;
90        serde_json::from_str(&json)
91            .map_err(|e| CbtopError::Config(format!("JSON parsing failed: {}", e)))
92    }
93}
94
/// Comprehensive benchmark suite for optimization identification.
///
/// Runs the cartesian product of `workloads` x `sizes` x `backends`,
/// each for `duration`, and writes results to `baseline_file`.
pub struct OptimizationSuite {
    /// Workloads to benchmark.
    pub workloads: Vec<WorkloadConfig>,
    /// Backends to test.
    pub backends: Vec<ComputeBackend>,
    /// Problem sizes to test (element counts).
    pub sizes: Vec<usize>,
    /// Measurement duration per benchmark configuration.
    pub duration: Duration,
    /// Output file for the collected baseline (JSON).
    pub baseline_file: PathBuf,
}
108
109impl Default for OptimizationSuite {
110    fn default() -> Self {
111        Self::standard()
112    }
113}
114
115impl OptimizationSuite {
116    /// Create standard optimization suite with recommended configurations
117    pub fn standard() -> Self {
118        Self {
119            workloads: vec![
120                WorkloadConfig {
121                    workload: WorkloadType::Gemm,
122                    name: "dot_product".to_string(),
123                    theoretical_peak_gflops: 100.0, // Legacy, use bytes_per_flop
124                    memory_bound: false,
125                    // dot_product: read 2 floats per 2 FLOPs = 4 bytes/FLOP
126                    bytes_per_flop: 4.0,
127                },
128                WorkloadConfig {
129                    workload: WorkloadType::Elementwise,
130                    name: "elementwise_mul".to_string(),
131                    theoretical_peak_gflops: 50.0, // Legacy
132                    memory_bound: true,
133                    // elementwise: read 2, write 1 float per 1 FLOP = 12 bytes/FLOP
134                    bytes_per_flop: 12.0,
135                },
136                WorkloadConfig {
137                    workload: WorkloadType::Reduction,
138                    name: "sum_reduction".to_string(),
139                    theoretical_peak_gflops: 50.0, // Legacy
140                    memory_bound: true,
141                    // reduction: read 1 float per 1 FLOP = 4 bytes/FLOP
142                    bytes_per_flop: 4.0,
143                },
144                WorkloadConfig {
145                    workload: WorkloadType::Bandwidth,
146                    name: "memory_bandwidth".to_string(),
147                    theoretical_peak_gflops: 30.0, // Legacy
148                    memory_bound: true,
149                    // bandwidth: read + write = 8 bytes per "FLOP" (copy)
150                    bytes_per_flop: 8.0,
151                },
152            ],
153            backends: vec![ComputeBackend::Simd],
154            sizes: vec![
155                1_000,      // L1 cache (~4 KB for 1000 f32)
156                10_000,     // L2 cache (~40 KB)
157                100_000,    // L3 cache (~400 KB)
158                1_000_000,  // Main memory (~4 MB)
159                4_000_000,  // Large (tiling threshold, ~16 MB)
160                16_000_000, // Very large (~64 MB)
161            ],
162            duration: Duration::from_secs(3),
163            baseline_file: PathBuf::from("benchmarks/baseline.json"),
164        }
165    }
166
167    /// Create a quick suite for CI (fewer configurations, shorter duration)
168    pub fn quick() -> Self {
169        Self {
170            workloads: vec![
171                WorkloadConfig {
172                    workload: WorkloadType::Gemm,
173                    name: "dot_product".to_string(),
174                    theoretical_peak_gflops: 100.0,
175                    memory_bound: false,
176                    bytes_per_flop: 4.0,
177                },
178                WorkloadConfig {
179                    workload: WorkloadType::Elementwise,
180                    name: "elementwise_mul".to_string(),
181                    theoretical_peak_gflops: 50.0,
182                    memory_bound: true,
183                    bytes_per_flop: 12.0,
184                },
185            ],
186            backends: vec![ComputeBackend::Simd],
187            sizes: vec![10_000, 1_000_000],
188            duration: Duration::from_secs(1),
189            baseline_file: PathBuf::from("benchmarks/baseline-quick.json"),
190        }
191    }
192
193    /// Collect baseline measurements for all configurations
194    pub fn collect_baseline(&self) -> Result<BaselineReport, CbtopError> {
195        let mut entries = Vec::new();
196        let cpu = CpuCapabilities::detect();
197
198        let mut prev_working_set_mb: usize = 0;
199
200        for workload in &self.workloads {
201            for &size in &self.sizes {
202                for &backend in &self.backends {
203                    // OPT-011: Adaptive cooldown based on working set size
204                    // Scale cooldown: 100ms base + 10ms per MB of previous working set (max 500ms)
205                    // This allows memory subsystem to stabilize for large workloads
206                    if !entries.is_empty() {
207                        let cooldown_ms = 100 + (prev_working_set_mb * 10).min(400);
208                        std::thread::sleep(Duration::from_millis(cooldown_ms as u64));
209
210                        // OPT-012: Memory barrier to ensure previous benchmark's
211                        // writes are visible and memory allocator state is stable
212                        std::sync::atomic::fence(std::sync::atomic::Ordering::SeqCst);
213                    }
214
215                    // Calculate working set for this benchmark (used for next cooldown)
216                    // Working set = size * bytes_per_flop (accounts for all arrays)
217                    prev_working_set_mb =
218                        ((size as f64 * workload.bytes_per_flop) / (1024.0 * 1024.0)) as usize;
219
220                    let result = Benchmark::builder()
221                        .workload_type(workload.workload)
222                        .size(size)
223                        .backend(backend)
224                        .duration(self.duration)
225                        .build()?
226                        .run()?;
227
228                    // Use size-aware theoretical peak
229                    let theoretical_peak = workload.theoretical_peak_for_size(size, &cpu);
230                    let efficiency = if theoretical_peak > 0.0 {
231                        // Cap efficiency at 1.0 (100%) - values > 100% indicate
232                        // measurement noise or overly conservative theoretical peak
233                        (result.results.gflops / theoretical_peak).min(1.0)
234                    } else {
235                        0.0
236                    };
237
238                    entries.push(BaselineEntry {
239                        workload: workload.name.clone(),
240                        size,
241                        backend: format!("{:?}", backend),
242                        gflops: result.results.gflops,
243                        efficiency,
244                        cv_percent: result.results.latency_ms.cv_percent,
245                        score: result.score.total,
246                    });
247                }
248            }
249        }
250
251        let timestamp = chrono::Utc::now().to_rfc3339();
252
253        Ok(BaselineReport {
254            version: env!("CARGO_PKG_VERSION").to_string(),
255            timestamp,
256            entries,
257            system: Self::get_system_info(&cpu),
258        })
259    }
260
261    pub(crate) fn get_system_info(cpu: &CpuCapabilities) -> String {
262        format!(
263            "{} cores @ {} MHz, AVX2={}, AVX512={}, L3={}MB, mem_bw={:.0} GB/s",
264            cpu.cores,
265            cpu.max_freq_mhz,
266            cpu.has_avx2,
267            cpu.has_avx512,
268            cpu.l3_cache / (1024 * 1024),
269            cpu.mem_bandwidth_gbs
270        )
271    }
272}