Skip to main content

fluxbench_stats/
summary.rs

1//! Summary Statistics
2//!
3//! Computes comprehensive summary statistics following the critical design decision:
4//! - Mean, median, stddev computed from CLEANED data (outliers removed)
5//! - Min, max, percentiles computed from ALL data (outliers preserved)
6
7use crate::outliers::{OutlierAnalysis, OutlierMethod, detect_outliers};
8use crate::percentiles::compute_percentile;
9
10/// Comprehensive summary statistics
11#[derive(Debug, Clone)]
12pub struct SummaryStatistics {
13    /// Mean (computed from cleaned data with outliers removed)
14    pub mean: f64,
15    /// Median (computed from cleaned data with outliers removed)
16    pub median: f64,
17    /// Standard deviation (computed from cleaned data with outliers removed)
18    pub std_dev: f64,
19
20    /// Minimum value (from all data, preserving outliers)
21    pub min: f64,
22    /// Maximum value (from all data, preserving outliers)
23    pub max: f64,
24
25    /// 50th percentile/median (from all data, preserving outliers)
26    pub p50: f64,
27    /// 90th percentile (from all data, preserving outliers)
28    pub p90: f64,
29    /// 95th percentile (from all data, preserving outliers)
30    pub p95: f64,
31    /// 99th percentile (from all data, preserving outliers)
32    pub p99: f64,
33    /// 99.9th percentile (from all data, preserving outliers)
34    pub p999: f64,
35
36    /// Skewness of the distribution (from cleaned data)
37    pub skewness: f64,
38    /// Kurtosis of the distribution (from cleaned data)
39    pub kurtosis: f64,
40
41    /// Total number of samples (before outlier removal)
42    pub sample_count: usize,
43    /// Number of outliers detected
44    pub outlier_count: usize,
45    /// Complete outlier analysis including detection bounds and indices
46    pub outlier_analysis: OutlierAnalysis,
47}
48
49/// CPU cycles statistics (computed alongside time stats)
50#[derive(Debug, Clone, Default)]
51pub struct CyclesStatistics {
52    /// Mean CPU cycles per iteration
53    pub mean_cycles: f64,
54    /// Median CPU cycles per iteration
55    pub median_cycles: f64,
56    /// Standard deviation of cycles
57    pub std_dev_cycles: f64,
58    /// Minimum cycles observed
59    pub min_cycles: u64,
60    /// Maximum cycles observed
61    pub max_cycles: u64,
62    /// Cycles per nanosecond (approximates CPU frequency in GHz)
63    pub cycles_per_ns: f64,
64}
65
66/// Compute summary statistics with proper separation of cleaned vs raw data
67///
68/// Mean/median/stddev are computed from cleaned data (outliers removed), while
69/// percentiles and extremes preserve outliers as they represent important tail behavior.
70///
71/// # Examples
72///
73/// ```ignore
74/// # use fluxbench_stats::{compute_summary, OutlierMethod};
75/// let samples = vec![100.0, 102.0, 98.0, 101.0, 99.0];
76/// let stats = compute_summary(&samples, OutlierMethod::default());
77/// println!("Mean: {:.2}", stats.mean);
78/// println!("P99: {:.2}", stats.p99);
79/// println!("Outliers detected: {}", stats.outlier_count);
80/// ```
81pub fn compute_summary(samples: &[f64], outlier_method: OutlierMethod) -> SummaryStatistics {
82    if samples.is_empty() {
83        return SummaryStatistics {
84            mean: 0.0,
85            median: 0.0,
86            std_dev: 0.0,
87            min: 0.0,
88            max: 0.0,
89            p50: 0.0,
90            p90: 0.0,
91            p95: 0.0,
92            p99: 0.0,
93            p999: 0.0,
94            skewness: 0.0,
95            kurtosis: 0.0,
96            sample_count: 0,
97            outlier_count: 0,
98            outlier_analysis: detect_outliers(samples, outlier_method),
99        };
100    }
101
102    // Detect outliers
103    let analysis = detect_outliers(samples, outlier_method);
104    let cleaned = &analysis.cleaned_samples;
105    let all = &analysis.all_samples;
106
107    // Central tendency from CLEANED data
108    let mean = if cleaned.is_empty() {
109        0.0
110    } else {
111        cleaned.iter().sum::<f64>() / cleaned.len() as f64
112    };
113
114    let median = if cleaned.is_empty() {
115        0.0
116    } else {
117        compute_percentile(cleaned, 50.0)
118    };
119
120    let std_dev = if cleaned.len() < 2 {
121        0.0
122    } else {
123        let variance =
124            cleaned.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (cleaned.len() - 1) as f64;
125        variance.sqrt()
126    };
127
128    // Extremes from ALL data
129    let min = all
130        .iter()
131        .cloned()
132        .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
133        .unwrap_or(0.0);
134    let max = all
135        .iter()
136        .cloned()
137        .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
138        .unwrap_or(0.0);
139
140    // Percentiles from ALL data (outliers ARE the tail signal)
141    let p50 = compute_percentile(all, 50.0);
142    let p90 = compute_percentile(all, 90.0);
143    let p95 = compute_percentile(all, 95.0);
144    let p99 = compute_percentile(all, 99.0);
145    let p999 = compute_percentile(all, 99.9);
146
147    // Skewness and kurtosis from CLEANED data (Fisher's definitions)
148    let (skewness, kurtosis) = if cleaned.len() < 3 || std_dev < f64::EPSILON {
149        (0.0, 0.0)
150    } else {
151        let n = cleaned.len() as f64;
152        let mut m3 = 0.0;
153        let mut m4 = 0.0;
154        for &x in cleaned.iter() {
155            let d = (x - mean) / std_dev;
156            let d2 = d * d;
157            m3 += d2 * d;
158            m4 += d2 * d2;
159        }
160        (m3 / n, m4 / n - 3.0)
161    };
162
163    SummaryStatistics {
164        mean,
165        median,
166        std_dev,
167        min,
168        max,
169        p50,
170        p90,
171        p95,
172        p99,
173        p999,
174        skewness,
175        kurtosis,
176        sample_count: all.len(),
177        outlier_count: analysis.outlier_indices.len(),
178        outlier_analysis: analysis,
179    }
180}
181
182impl SummaryStatistics {
183    /// Coefficient of variation (relative stddev)
184    pub fn coefficient_of_variation(&self) -> f64 {
185        if self.mean == 0.0 {
186            0.0
187        } else {
188            (self.std_dev / self.mean) * 100.0
189        }
190    }
191
192    /// Interquartile range (from all data)
193    pub fn iqr(&self) -> f64 {
194        let q1 = compute_percentile(&self.outlier_analysis.all_samples, 25.0);
195        let q3 = compute_percentile(&self.outlier_analysis.all_samples, 75.0);
196        q3 - q1
197    }
198
199    /// Check if distribution appears stable (low CV)
200    pub fn is_stable(&self, cv_threshold: f64) -> bool {
201        self.coefficient_of_variation() < cv_threshold
202    }
203
204    /// Classify the distribution shape based on skewness and kurtosis.
205    ///
206    /// Returns a string like "symmetric, normal-tailed" or "right-skewed, heavy-tailed".
207    pub fn distribution_shape(&self) -> String {
208        let skew_label = if self.skewness.abs() < 0.5 {
209            "symmetric"
210        } else if self.skewness > 0.0 {
211            "right-skewed"
212        } else {
213            "left-skewed"
214        };
215
216        let kurt_label = if self.kurtosis.abs() < 1.0 {
217            "normal-tailed"
218        } else if self.kurtosis > 0.0 {
219            "heavy-tailed"
220        } else {
221            "light-tailed"
222        };
223
224        format!("{}, {}", skew_label, kurt_label)
225    }
226}
227
228/// Compute CPU cycles statistics from raw cycle counts
229///
230/// Takes parallel arrays of cycles and nanos to compute cycles_per_ns ratio.
231pub fn compute_cycles_stats(cycles: &[u64], nanos: &[f64]) -> CyclesStatistics {
232    if cycles.is_empty() {
233        return CyclesStatistics::default();
234    }
235
236    // Convert to f64 for statistical calculations
237    let cycles_f64: Vec<f64> = cycles.iter().map(|&c| c as f64).collect();
238
239    // Mean cycles
240    let mean_cycles = cycles_f64.iter().sum::<f64>() / cycles_f64.len() as f64;
241
242    // Median cycles
243    let mut sorted = cycles_f64.clone();
244    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
245    let median_cycles = if sorted.len() % 2 == 0 {
246        let mid = sorted.len() / 2;
247        (sorted[mid - 1] + sorted[mid]) / 2.0
248    } else {
249        sorted[sorted.len() / 2]
250    };
251
252    // Std dev
253    let std_dev_cycles = if cycles.len() < 2 {
254        0.0
255    } else {
256        let variance = cycles_f64
257            .iter()
258            .map(|x| (x - mean_cycles).powi(2))
259            .sum::<f64>()
260            / (cycles_f64.len() - 1) as f64;
261        variance.sqrt()
262    };
263
264    // Min/max
265    let min_cycles = *cycles.iter().min().unwrap_or(&0);
266    let max_cycles = *cycles.iter().max().unwrap_or(&0);
267
268    // Cycles per nanosecond (CPU frequency approximation)
269    let cycles_per_ns = if !nanos.is_empty() {
270        let total_nanos: f64 = nanos.iter().sum();
271        let total_cycles: f64 = cycles_f64.iter().sum();
272        if total_nanos > 0.0 {
273            total_cycles / total_nanos
274        } else {
275            0.0
276        }
277    } else {
278        0.0
279    };
280
281    CyclesStatistics {
282        mean_cycles,
283        median_cycles,
284        std_dev_cycles,
285        min_cycles,
286        max_cycles,
287        cycles_per_ns,
288    }
289}
290
291#[cfg(test)]
292mod tests {
293    use super::*;
294
295    #[test]
296    fn test_basic_summary() {
297        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0];
298        let summary = compute_summary(&samples, OutlierMethod::default());
299
300        assert!((summary.mean - 3.0).abs() < 0.01);
301        assert!((summary.median - 3.0).abs() < 0.01);
302        assert_eq!(summary.min, 1.0);
303        assert_eq!(summary.max, 5.0);
304        assert_eq!(summary.sample_count, 5);
305    }
306
307    #[test]
308    fn test_outlier_handling() {
309        // 100.0 is an outlier
310        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
311        let summary = compute_summary(&samples, OutlierMethod::default());
312
313        // Mean should be from cleaned data (exclude 100.0)
314        assert!(summary.mean < 10.0);
315
316        // Max should be from ALL data (include 100.0)
317        assert_eq!(summary.max, 100.0);
318
319        // p99 should include the outlier
320        assert!(summary.p99 > 50.0);
321
322        assert_eq!(summary.outlier_count, 1);
323    }
324
325    #[test]
326    fn test_coefficient_of_variation() {
327        let samples = vec![100.0, 100.0, 100.0, 100.0, 100.0];
328        let summary = compute_summary(&samples, OutlierMethod::None);
329
330        // Zero variance = zero CV
331        assert!((summary.coefficient_of_variation() - 0.0).abs() < f64::EPSILON);
332    }
333
334    #[test]
335    fn test_empty_samples() {
336        let samples: Vec<f64> = Vec::new();
337        let summary = compute_summary(&samples, OutlierMethod::default());
338
339        assert_eq!(summary.sample_count, 0);
340        assert!((summary.mean - 0.0).abs() < f64::EPSILON);
341    }
342
343    #[test]
344    fn test_skewness_symmetric() {
345        // Symmetric data should have skewness ≈ 0
346        let samples: Vec<f64> = (1..=100).map(|x| x as f64).collect();
347        let summary = compute_summary(&samples, OutlierMethod::None);
348        assert!(
349            summary.skewness.abs() < 0.1,
350            "skewness={}",
351            summary.skewness
352        );
353        // Uniform distribution has excess kurtosis ≈ -1.2 (light-tailed)
354        assert!(summary.kurtosis < 0.0, "kurtosis={}", summary.kurtosis);
355        assert!(summary.distribution_shape().contains("symmetric"));
356    }
357
358    #[test]
359    fn test_skewness_right_skewed() {
360        // Right-skewed data (exponential-like)
361        let samples: Vec<f64> = (0..200).map(|x| (x as f64 * 0.05).exp()).collect();
362        let summary = compute_summary(&samples, OutlierMethod::None);
363        assert!(summary.skewness > 0.5, "skewness={}", summary.skewness);
364        assert!(summary.distribution_shape().contains("right-skewed"));
365    }
366
367    #[test]
368    fn test_skewness_left_skewed() {
369        // Left-skewed data (mirror of exponential)
370        let samples: Vec<f64> = (0..200).map(|x| 200.0 - (x as f64 * 0.05).exp()).collect();
371        let summary = compute_summary(&samples, OutlierMethod::None);
372        assert!(summary.skewness < -0.5, "skewness={}", summary.skewness);
373        assert!(summary.distribution_shape().contains("left-skewed"));
374    }
375
376    #[test]
377    fn test_skewness_constant_values() {
378        let samples = vec![5.0, 5.0, 5.0, 5.0, 5.0];
379        let summary = compute_summary(&samples, OutlierMethod::None);
380        assert_eq!(summary.skewness, 0.0);
381        assert_eq!(summary.kurtosis, 0.0);
382    }
383
384    #[test]
385    fn test_skewness_two_samples() {
386        let samples = vec![1.0, 2.0];
387        let summary = compute_summary(&samples, OutlierMethod::None);
388        // < 3 samples: returns (0.0, 0.0)
389        assert_eq!(summary.skewness, 0.0);
390        assert_eq!(summary.kurtosis, 0.0);
391    }
392
393    #[test]
394    fn test_kurtosis_normal() {
395        // Uniform distribution has negative excess kurtosis (light-tailed)
396        let samples: Vec<f64> = (0..1000).map(|x| x as f64).collect();
397        let summary = compute_summary(&samples, OutlierMethod::None);
398        // Uniform: kurtosis = -1.2
399        assert!(summary.kurtosis < 0.0, "kurtosis={}", summary.kurtosis);
400    }
401
402    #[test]
403    fn test_cycles_stats() {
404        let cycles = vec![3000u64, 3100, 2900, 3050, 2950];
405        let nanos = vec![1000.0, 1033.0, 967.0, 1017.0, 983.0];
406        let stats = compute_cycles_stats(&cycles, &nanos);
407
408        // Mean should be ~3000
409        assert!((stats.mean_cycles - 3000.0).abs() < 50.0);
410        assert_eq!(stats.min_cycles, 2900);
411        assert_eq!(stats.max_cycles, 3100);
412        // ~3 cycles per ns
413        assert!((stats.cycles_per_ns - 3.0).abs() < 0.5);
414    }
415
416    #[test]
417    fn test_cycles_stats_empty() {
418        let cycles: Vec<u64> = vec![];
419        let nanos: Vec<f64> = vec![];
420        let stats = compute_cycles_stats(&cycles, &nanos);
421
422        assert!((stats.mean_cycles - 0.0).abs() < f64::EPSILON);
423        assert_eq!(stats.min_cycles, 0);
424        assert_eq!(stats.max_cycles, 0);
425    }
426}