tester/
stats.rs

1#![allow(missing_docs)]
2#![allow(deprecated)] // Float
3
4use std::cmp::Ordering::{self, Equal, Greater, Less};
5use std::mem;
6
7#[cfg(test)]
8mod tests;
9
10fn local_cmp(x: f64, y: f64) -> Ordering {
11    // arbitrarily decide that NaNs are larger than everything.
12    if y.is_nan() {
13        Less
14    } else if x.is_nan() {
15        Greater
16    } else if x < y {
17        Less
18    } else if x == y {
19        Equal
20    } else {
21        Greater
22    }
23}
24
25fn local_sort(v: &mut [f64]) {
26    v.sort_by(|x: &f64, y: &f64| local_cmp(*x, *y));
27}
28
29/// Trait that provides simple descriptive statistics on a univariate set of numeric samples.
30pub trait Stats {
31    /// Sum of the samples.
32    ///
33    /// Note: this method sacrifices performance at the altar of accuracy
34    /// Depends on IEEE-754 arithmetic guarantees. See proof of correctness at:
35    /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric
36    /// Predicates"][paper]
37    ///
38    /// [paper]: http://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps
39    fn sum(&self) -> f64;
40
41    /// Minimum value of the samples.
42    fn min(&self) -> f64;
43
44    /// Maximum value of the samples.
45    fn max(&self) -> f64;
46
47    /// Arithmetic mean (average) of the samples: sum divided by sample-count.
48    ///
49    /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean>
50    fn mean(&self) -> f64;
51
52    /// Median of the samples: value separating the lower half of the samples from the higher half.
53    /// Equal to `self.percentile(50.0)`.
54    ///
55    /// See: <https://en.wikipedia.org/wiki/Median>
56    fn median(&self) -> f64;
57
58    /// Variance of the samples: bias-corrected mean of the squares of the differences of each
59    /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the
60    /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n`
61    /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather
62    /// than `n`.
63    ///
64    /// See: <https://en.wikipedia.org/wiki/Variance>
65    fn var(&self) -> f64;
66
67    /// Standard deviation: the square root of the sample variance.
68    ///
69    /// Note: this is not a robust statistic for non-normal distributions. Prefer the
70    /// `median_abs_dev` for unknown distributions.
71    ///
72    /// See: <https://en.wikipedia.org/wiki/Standard_deviation>
73    fn std_dev(&self) -> f64;
74
75    /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`.
76    ///
77    /// Note: this is not a robust statistic for non-normal distributions. Prefer the
78    /// `median_abs_dev_pct` for unknown distributions.
79    fn std_dev_pct(&self) -> f64;
80
81    /// Scaled median of the absolute deviations of each sample from the sample median. This is a
82    /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to
83    /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled
84    /// by the constant `1.4826` to allow its use as a consistent estimator for the standard
85    /// deviation.
86    ///
87    /// See: <https://en.wikipedia.org/wiki/Median_absolute_deviation>
88    fn median_abs_dev(&self) -> f64;
89
90    /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`.
91    fn median_abs_dev_pct(&self) -> f64;
92
93    /// Percentile: the value below which `pct` percent of the values in `self` fall. For example,
94    /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self`
95    /// satisfy `s <= v`.
96    ///
97    /// Calculated by linear interpolation between closest ranks.
98    ///
99    /// See: <https://en.wikipedia.org/wiki/Percentile>
100    fn percentile(&self, pct: f64) -> f64;
101
102    /// Quartiles of the sample: three values that divide the sample into four equal groups, each
103    /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This
104    /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but
105    /// is otherwise equivalent.
106    ///
107    /// See also: <https://en.wikipedia.org/wiki/Quartile>
108    fn quartiles(&self) -> (f64, f64, f64);
109
110    /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th
111    /// percentile (3rd quartile). See `quartiles`.
112    ///
113    /// See also: <https://en.wikipedia.org/wiki/Interquartile_range>
114    fn iqr(&self) -> f64;
115}
116
117/// Extracted collection of all the summary statistics of a sample set.
118#[derive(Debug, Clone, PartialEq, Copy)]
119#[allow(missing_docs)]
120pub struct Summary {
121    pub sum: f64,
122    pub min: f64,
123    pub max: f64,
124    pub mean: f64,
125    pub median: f64,
126    pub var: f64,
127    pub std_dev: f64,
128    pub std_dev_pct: f64,
129    pub median_abs_dev: f64,
130    pub median_abs_dev_pct: f64,
131    pub quartiles: (f64, f64, f64),
132    pub iqr: f64,
133}
134
135impl Summary {
136    /// Construct a new summary of a sample set.
137    pub fn new(samples: &[f64]) -> Summary {
138        Summary {
139            sum: samples.sum(),
140            min: samples.min(),
141            max: samples.max(),
142            mean: samples.mean(),
143            median: samples.median(),
144            var: samples.var(),
145            std_dev: samples.std_dev(),
146            std_dev_pct: samples.std_dev_pct(),
147            median_abs_dev: samples.median_abs_dev(),
148            median_abs_dev_pct: samples.median_abs_dev_pct(),
149            quartiles: samples.quartiles(),
150            iqr: samples.iqr(),
151        }
152    }
153}
154
155impl Stats for [f64] {
156    // FIXME #11059 handle NaN, inf and overflow
157    fn sum(&self) -> f64 {
158        let mut partials = vec![];
159
160        for &x in self {
161            let mut x = x;
162            let mut j = 0;
163            // This inner loop applies `hi`/`lo` summation to each
164            // partial so that the list of partial sums remains exact.
165            for i in 0..partials.len() {
166                let mut y: f64 = partials[i];
167                if x.abs() < y.abs() {
168                    mem::swap(&mut x, &mut y);
169                }
170                // Rounded `x+y` is stored in `hi` with round-off stored in
171                // `lo`. Together `hi+lo` are exactly equal to `x+y`.
172                let hi = x + y;
173                let lo = y - (hi - x);
174                if lo != 0.0 {
175                    partials[j] = lo;
176                    j += 1;
177                }
178                x = hi;
179            }
180            if j >= partials.len() {
181                partials.push(x);
182            } else {
183                partials[j] = x;
184                partials.truncate(j + 1);
185            }
186        }
187        let zero: f64 = 0.0;
188        partials.iter().fold(zero, |p, q| p + *q)
189    }
190
191    fn min(&self) -> f64 {
192        assert!(!self.is_empty());
193        self.iter().fold(self[0], |p, q| p.min(*q))
194    }
195
196    fn max(&self) -> f64 {
197        assert!(!self.is_empty());
198        self.iter().fold(self[0], |p, q| p.max(*q))
199    }
200
201    fn mean(&self) -> f64 {
202        assert!(!self.is_empty());
203        self.sum() / (self.len() as f64)
204    }
205
206    fn median(&self) -> f64 {
207        self.percentile(50_f64)
208    }
209
210    fn var(&self) -> f64 {
211        if self.len() < 2 {
212            0.0
213        } else {
214            let mean = self.mean();
215            let mut v: f64 = 0.0;
216            for s in self {
217                let x = *s - mean;
218                v += x * x;
219            }
220            // N.B., this is _supposed to be_ len-1, not len. If you
221            // change it back to len, you will be calculating a
222            // population variance, not a sample variance.
223            let denom = (self.len() - 1) as f64;
224            v / denom
225        }
226    }
227
228    fn std_dev(&self) -> f64 {
229        self.var().sqrt()
230    }
231
232    fn std_dev_pct(&self) -> f64 {
233        let hundred = 100_f64;
234        (self.std_dev() / self.mean()) * hundred
235    }
236
237    fn median_abs_dev(&self) -> f64 {
238        let med = self.median();
239        let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect();
240        // This constant is derived by smarter statistics brains than me, but it is
241        // consistent with how R and other packages treat the MAD.
242        let number = 1.4826;
243        abs_devs.median() * number
244    }
245
246    fn median_abs_dev_pct(&self) -> f64 {
247        let hundred = 100_f64;
248        (self.median_abs_dev() / self.median()) * hundred
249    }
250
251    fn percentile(&self, pct: f64) -> f64 {
252        let mut tmp = self.to_vec();
253        local_sort(&mut tmp);
254        percentile_of_sorted(&tmp, pct)
255    }
256
257    fn quartiles(&self) -> (f64, f64, f64) {
258        let mut tmp = self.to_vec();
259        local_sort(&mut tmp);
260        let first = 25_f64;
261        let a = percentile_of_sorted(&tmp, first);
262        let second = 50_f64;
263        let b = percentile_of_sorted(&tmp, second);
264        let third = 75_f64;
265        let c = percentile_of_sorted(&tmp, third);
266        (a, b, c)
267    }
268
269    fn iqr(&self) -> f64 {
270        let (a, _, c) = self.quartiles();
271        c - a
272    }
273}
274
275// Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using
276// linear interpolation. If samples are not sorted, return nonsensical value.
277fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 {
278    assert!(!sorted_samples.is_empty());
279    if sorted_samples.len() == 1 {
280        return sorted_samples[0];
281    }
282    let zero: f64 = 0.0;
283    assert!(zero <= pct);
284    let hundred = 100_f64;
285    assert!(pct <= hundred);
286    if pct == hundred {
287        return sorted_samples[sorted_samples.len() - 1];
288    }
289    let length = (sorted_samples.len() - 1) as f64;
290    let rank = (pct / hundred) * length;
291    let lrank = rank.floor();
292    let d = rank - lrank;
293    let n = lrank as usize;
294    let lo = sorted_samples[n];
295    let hi = sorted_samples[n + 1];
296    lo + (hi - lo) * d
297}
298
299/// Winsorize a set of samples, replacing values above the `100-pct` percentile
300/// and below the `pct` percentile with those percentiles themselves. This is a
301/// way of minimizing the effect of outliers, at the cost of biasing the sample.
302/// It differs from trimming in that it does not change the number of samples,
303/// just changes the values of those that are outliers.
304///
305/// See: <https://en.wikipedia.org/wiki/Winsorising>
306pub fn winsorize(samples: &mut [f64], pct: f64) {
307    let mut tmp = samples.to_vec();
308    local_sort(&mut tmp);
309    let lo = percentile_of_sorted(&tmp, pct);
310    let hundred = 100_f64;
311    let hi = percentile_of_sorted(&tmp, hundred - pct);
312    for samp in samples {
313        if *samp > hi {
314            *samp = hi
315        } else if *samp < lo {
316            *samp = lo
317        }
318    }
319}