Skip to main content

cbtop/variance_analysis/
mod.rs

//! Variance Source Analysis Module (PMAT-027)
//!
//! Analyzes sources of performance variance to identify and mitigate
//! benchmark instability per PERF-003 (observed CV of 5-8% against a target of <5%).
//!
//! # Motivation
//!
//! F605 (Results reproducible) is PARTIAL with a CV of 5-8%. Systematic
//! variance attribution is needed to identify and mitigate the sources.
//!
//! # Components
//!
//! | Component | Detection Method | Mitigation |
//! |-----------|------------------|------------|
//! | Frequency Variance | abs(corr(CPU MHz, latency)) × CV(CPU MHz samples) | Pin frequency |
//! | Thermal Drift | corr(temp, latency) × CV(temp), counted only when corr > 0.3 | Cooldown periods |
//! | Cache Noise | CV(warmup samples) − CV(post-warmup samples), floored at 0 | Warmup iterations |
//! | System Noise | Residual after above | Isolation/shielding |
19
/// Source of performance variance.
///
/// A fieldless enum naming each category that run-to-run latency variance is
/// attributed to. It is `Copy`, comparable, and hashable, so it can be used
/// directly as a key in `HashMap`/`HashSet` (e.g. to tally dominant sources
/// across several analyses).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VarianceSource {
    /// CPU frequency scaling (turbo boost variance)
    FrequencyScaling,
    /// Thermal throttling effects
    ThermalThrottling,
    /// Cache state variance (cold vs warm)
    CacheState,
    /// Background system activity
    SystemNoise,
    /// Unknown or unattributed
    Unknown,
}
34
35impl VarianceSource {
36    /// Get human-readable name
37    pub fn name(&self) -> &'static str {
38        match self {
39            VarianceSource::FrequencyScaling => "CPU frequency scaling",
40            VarianceSource::ThermalThrottling => "thermal throttling",
41            VarianceSource::CacheState => "cache state variance",
42            VarianceSource::SystemNoise => "system noise",
43            VarianceSource::Unknown => "unknown",
44        }
45    }
46
47    /// Get mitigation recommendation
48    pub fn mitigation(&self) -> &'static str {
49        match self {
50            VarianceSource::FrequencyScaling => {
51                "Pin CPU frequency with: cpupower frequency-set -g performance"
52            }
53            VarianceSource::ThermalThrottling => {
54                "Add cooldown periods between runs or improve cooling"
55            }
56            VarianceSource::CacheState => "Increase warmup iterations before measurement",
57            VarianceSource::SystemNoise => {
58                "Run with CPU isolation (isolcpus) or reduce background tasks"
59            }
60            VarianceSource::Unknown => "Profile with renacer for deeper analysis",
61        }
62    }
63}
64
65/// Variance analysis result
66#[derive(Debug, Clone)]
67pub struct VarianceAnalysis {
68    /// Total coefficient of variation (%)
69    pub total_cv_percent: f64,
70    /// Estimated frequency scaling contribution (%)
71    pub frequency_contribution: f64,
72    /// Estimated thermal contribution (%)
73    pub thermal_contribution: f64,
74    /// Estimated cache state contribution (%)
75    pub cache_contribution: f64,
76    /// Residual unexplained noise (%)
77    pub residual_noise: f64,
78    /// Dominant source of variance
79    pub dominant_source: VarianceSource,
80    /// Mitigation recommendations
81    pub recommendations: Vec<String>,
82    /// Whether variance budget is met (CV < 5%)
83    pub budget_met: bool,
84    /// Sample statistics
85    pub sample_count: usize,
86    /// Warmup effect ratio (cold/warm performance)
87    pub warmup_effect: f64,
88    /// Trend coefficient (positive = increasing latency)
89    pub trend_coefficient: f64,
90}
91
/// Input data for variance analysis.
///
/// Only `latencies` is required for a meaningful analysis; the telemetry
/// series are optional. The type implements `Default` (no samples, no
/// telemetry, zero warmup iterations) so callers can fill in just the fields
/// they have:
///
/// ```ignore
/// let input = VarianceInput { latencies, ..Default::default() };
/// ```
///
/// `PartialEq` is derived so inputs can be compared directly in tests.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct VarianceInput {
    /// Latency samples (µs), in measurement order
    pub latencies: Vec<f64>,
    /// CPU frequency samples (MHz), if available
    pub frequencies: Option<Vec<f64>>,
    /// Temperature samples (°C), if available
    pub temperatures: Option<Vec<f64>>,
    /// Number of warmup iterations; the first `warmup_count` latency samples
    /// are treated as the cold-cache portion of the run
    pub warmup_count: usize,
}
104
105impl VarianceAnalysis {
106    /// Analyze variance sources from input data
107    pub fn analyze(input: &VarianceInput) -> Option<Self> {
108        if input.latencies.is_empty() {
109            return None;
110        }
111
112        let n = input.latencies.len();
113        let mean = input.latencies.iter().sum::<f64>() / n as f64;
114
115        // Calculate total CV
116        let variance = if n > 1 {
117            input
118                .latencies
119                .iter()
120                .map(|x| (x - mean).powi(2))
121                .sum::<f64>()
122                / (n - 1) as f64
123        } else {
124            0.0
125        };
126        let std_dev = variance.sqrt();
127        let total_cv_percent = if mean > 0.0 {
128            (std_dev / mean) * 100.0
129        } else {
130            0.0
131        };
132
133        // Estimate frequency contribution
134        let frequency_contribution = if let Some(ref freqs) = input.frequencies {
135            estimate_frequency_contribution(freqs, &input.latencies)
136        } else {
137            0.0
138        };
139
140        // Estimate thermal contribution
141        let thermal_contribution = if let Some(ref temps) = input.temperatures {
142            estimate_thermal_contribution(temps, &input.latencies)
143        } else {
144            0.0
145        };
146
147        // Estimate cache state contribution
148        let (cache_contribution, warmup_effect) =
149            estimate_cache_contribution(&input.latencies, input.warmup_count);
150
151        // Calculate residual
152        let attributed = frequency_contribution + thermal_contribution + cache_contribution;
153        let residual_noise = (total_cv_percent - attributed).max(0.0);
154
155        // Identify dominant source
156        let dominant_source = identify_dominant_source(
157            frequency_contribution,
158            thermal_contribution,
159            cache_contribution,
160            residual_noise,
161        );
162
163        // Generate recommendations
164        let recommendations = generate_recommendations(
165            total_cv_percent,
166            frequency_contribution,
167            thermal_contribution,
168            cache_contribution,
169            residual_noise,
170        );
171
172        // Calculate trend
173        let trend_coefficient = calculate_trend(&input.latencies);
174
175        let budget_met = total_cv_percent < 5.0;
176
177        Some(Self {
178            total_cv_percent,
179            frequency_contribution,
180            thermal_contribution,
181            cache_contribution,
182            residual_noise,
183            dominant_source,
184            recommendations,
185            budget_met,
186            sample_count: n,
187            warmup_effect,
188            trend_coefficient,
189        })
190    }
191
192    /// Get summary string
193    pub fn summary(&self) -> String {
194        format!(
195            "CV={:.1}% (freq={:.1}% therm={:.1}% cache={:.1}% noise={:.1}%) dominant={}",
196            self.total_cv_percent,
197            self.frequency_contribution,
198            self.thermal_contribution,
199            self.cache_contribution,
200            self.residual_noise,
201            self.dominant_source.name()
202        )
203    }
204
205    /// Check if any single source dominates (>50% of variance)
206    pub fn has_dominant_source(&self) -> bool {
207        let max = self
208            .frequency_contribution
209            .max(self.thermal_contribution)
210            .max(self.cache_contribution)
211            .max(self.residual_noise);
212        max > self.total_cv_percent * 0.5
213    }
214}
215
216/// Estimate frequency scaling contribution to variance
217fn estimate_frequency_contribution(frequencies: &[f64], latencies: &[f64]) -> f64 {
218    if frequencies.len() < 2 || latencies.len() < 2 {
219        return 0.0;
220    }
221
222    // Calculate correlation between frequency and latency
223    let correlation = calculate_correlation(frequencies, latencies);
224
225    // Frequency variance
226    let freq_mean = frequencies.iter().sum::<f64>() / frequencies.len() as f64;
227    let freq_variance = frequencies
228        .iter()
229        .map(|f| (f - freq_mean).powi(2))
230        .sum::<f64>()
231        / (frequencies.len() - 1) as f64;
232    let freq_cv = if freq_mean > 0.0 {
233        freq_variance.sqrt() / freq_mean * 100.0
234    } else {
235        0.0
236    };
237
238    // Contribution is correlation × frequency CV (simplified model)
239    correlation.abs() * freq_cv
240}
241
242/// Estimate thermal throttling contribution
243fn estimate_thermal_contribution(temperatures: &[f64], latencies: &[f64]) -> f64 {
244    if temperatures.len() < 2 || latencies.len() < 2 {
245        return 0.0;
246    }
247
248    // Calculate correlation between temperature and latency
249    let correlation = calculate_correlation(temperatures, latencies);
250
251    // Temperature variance
252    let temp_mean = temperatures.iter().sum::<f64>() / temperatures.len() as f64;
253    let temp_variance = temperatures
254        .iter()
255        .map(|t| (t - temp_mean).powi(2))
256        .sum::<f64>()
257        / (temperatures.len() - 1) as f64;
258    let temp_cv = if temp_mean > 0.0 {
259        temp_variance.sqrt() / temp_mean * 100.0
260    } else {
261        0.0
262    };
263
264    // Positive correlation threshold: higher temperature correlates with higher latency
265    if correlation > 0.3 {
266        correlation * temp_cv
267    } else {
268        0.0
269    }
270}
271
272/// Estimate cache state contribution
273fn estimate_cache_contribution(latencies: &[f64], warmup_count: usize) -> (f64, f64) {
274    if latencies.len() <= warmup_count || warmup_count == 0 {
275        return (0.0, 1.0);
276    }
277
278    // Split into cold (early) and warm (later) samples
279    let cold_samples: Vec<f64> = latencies.iter().take(warmup_count).cloned().collect();
280    let warm_samples: Vec<f64> = latencies.iter().skip(warmup_count).cloned().collect();
281
282    if cold_samples.is_empty() || warm_samples.is_empty() {
283        return (0.0, 1.0);
284    }
285
286    let cold_mean = cold_samples.iter().sum::<f64>() / cold_samples.len() as f64;
287    let warm_mean = warm_samples.iter().sum::<f64>() / warm_samples.len() as f64;
288
289    // Warmup effect ratio
290    let warmup_effect = if warm_mean > 0.0 {
291        cold_mean / warm_mean
292    } else {
293        1.0
294    };
295
296    // Cache contribution is the difference between cold and warm CV
297    let cold_cv = calculate_cv(&cold_samples);
298    let warm_cv = calculate_cv(&warm_samples);
299
300    let cache_contribution = (cold_cv - warm_cv).max(0.0);
301
302    (cache_contribution, warmup_effect)
303}
304
/// Coefficient of variation of `samples`, in percent.
///
/// Uses the sample standard deviation (n - 1 denominator) divided by the mean.
/// Returns 0 for fewer than two samples or when the mean is exactly zero.
fn calculate_cv(samples: &[f64]) -> f64 {
    let count = samples.len();
    if count < 2 {
        return 0.0;
    }

    let mean = samples.iter().sum::<f64>() / count as f64;
    if mean == 0.0 {
        // Avoid dividing by zero below.
        return 0.0;
    }

    let squared_deviations = samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>();
    let variance = squared_deviations / (count - 1) as f64;
    let std_dev = variance.sqrt();

    (std_dev / mean) * 100.0
}
321
/// Pearson correlation coefficient of two series.
///
/// Only the first `min(x.len(), y.len())` elements of each series are paired.
/// Returns 0 when fewer than two pairs exist or when either series has no
/// spread (which would make the denominator zero).
fn calculate_correlation(x: &[f64], y: &[f64]) -> f64 {
    let len = x.len().min(y.len());
    if len < 2 {
        return 0.0;
    }

    // Truncate both series to the common length so they pair up one-to-one.
    let (xs, ys) = (&x[..len], &y[..len]);
    let count = len as f64;
    let x_mean = xs.iter().sum::<f64>() / count;
    let y_mean = ys.iter().sum::<f64>() / count;

    // Accumulate the cross-product and both sums of squares in a single pass.
    let (cross, x_sq, y_sq) = xs.iter().zip(ys).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(cross, x_sq, y_sq), (&xv, &yv)| {
            let dx = xv - x_mean;
            let dy = yv - y_mean;
            (cross + dx * dy, x_sq + dx * dx, y_sq + dy * dy)
        },
    );

    let scale = (x_sq * y_sq).sqrt();
    if scale > 0.0 {
        cross / scale
    } else {
        0.0
    }
}
351
/// Slope of the least-squares line through `samples` plotted against their
/// indices 0, 1, 2, ...
///
/// Returns 0 for fewer than two samples.
fn calculate_trend(samples: &[f64]) -> f64 {
    if samples.len() < 2 {
        return 0.0;
    }

    let count = samples.len() as f64;
    // The indices 0..n-1 average to (n - 1) / 2.
    let index_mean = (count - 1.0) / 2.0;
    let value_mean = samples.iter().sum::<f64>() / count;

    let (cross, index_sq) = samples.iter().enumerate().fold(
        (0.0_f64, 0.0_f64),
        |(cross, index_sq), (i, &value)| {
            let dx = i as f64 - index_mean;
            (cross + dx * (value - value_mean), index_sq + dx.powi(2))
        },
    );

    if index_sq > 0.0 {
        cross / index_sq
    } else {
        0.0
    }
}
377
378/// Identify the dominant source of variance
379fn identify_dominant_source(freq: f64, thermal: f64, cache: f64, residual: f64) -> VarianceSource {
380    let max = freq.max(thermal).max(cache).max(residual);
381
382    if max < 0.5 {
383        VarianceSource::Unknown
384    } else if max == freq {
385        VarianceSource::FrequencyScaling
386    } else if max == thermal {
387        VarianceSource::ThermalThrottling
388    } else if max == cache {
389        VarianceSource::CacheState
390    } else {
391        VarianceSource::SystemNoise
392    }
393}
394
395/// Generate mitigation recommendations based on variance sources
396fn generate_recommendations(
397    total_cv: f64,
398    freq: f64,
399    thermal: f64,
400    cache: f64,
401    residual: f64,
402) -> Vec<String> {
403    let mut recs = Vec::new();
404
405    if total_cv >= 5.0 {
406        recs.push(format!(
407            "CV {:.1}% exceeds 5% target. Mitigation needed.",
408            total_cv
409        ));
410    }
411
412    if freq > 1.0 {
413        recs.push(format!(
414            "Frequency variance ({:.1}%): {}",
415            freq,
416            VarianceSource::FrequencyScaling.mitigation()
417        ));
418    }
419
420    if thermal > 1.0 {
421        recs.push(format!(
422            "Thermal variance ({:.1}%): {}",
423            thermal,
424            VarianceSource::ThermalThrottling.mitigation()
425        ));
426    }
427
428    if cache > 1.0 {
429        recs.push(format!(
430            "Cache variance ({:.1}%): {}",
431            cache,
432            VarianceSource::CacheState.mitigation()
433        ));
434    }
435
436    if residual > 2.0 {
437        recs.push(format!(
438            "Residual noise ({:.1}%): {}",
439            residual,
440            VarianceSource::SystemNoise.mitigation()
441        ));
442    }
443
444    if recs.is_empty() {
445        recs.push("Variance within acceptable limits.".to_string());
446    }
447
448    recs
449}
450
451#[cfg(test)]
452mod tests;