std_dev/
lib.rs

1#![cfg_attr(docsrs, feature(doc_auto_cfg))]
2use std::collections::HashMap;
3use std::ops::{Deref, DerefMut};
4use std::{hash, ops};
5
6#[cfg(feature = "regression")]
7#[path = "regression.rs"]
8pub mod regression;
9
10pub mod percentile;
11
12#[cfg(feature = "percentile-rand")]
13pub use percentile::percentile_rand;
14pub use percentile::{median, percentile, Fraction};
15#[cfg(feature = "ols")]
16pub use regression::best_fit_ols as regression_best_fit;
17#[cfg(feature = "regression")]
18pub use regression::{Determination, Predictive};
19
20use self::percentile::cluster;
21
22/// > As all algorithms are executed in linear time now, this is not as useful, but nevertheless an interesting feature.
23/// > If you already have clustered data, this feature is great.
24///
25/// When using this, calculations are done per _unique_ value.
26/// Say you have a dataset of infant height, in centimeters.
27/// That's probably only going to be some 40 different values, but potentially millions of entries.
28/// Using clusters, all that data is only processed as `O(40)`, not `O(millions)`. (I know that notation isn't right, but you get my point).
29pub type Cluster = (f64, usize);
30
31/// Owned variant of [`ClusterList`].
32/// Use [`Self::borrow`] to get a [`ClusterList`].
33/// The inner slice is accessible through the [`Deref`] and [`DerefMut`], which means you can use
34/// this as a mutable slice.
35#[derive(Debug)]
36pub struct OwnedClusterList {
37    list: Vec<Cluster>,
38    len: usize,
39}
40impl OwnedClusterList {
41    /// The float is the value. The integer is the count.
42    pub fn new(list: Vec<Cluster>) -> Self {
43        let len = ClusterList::size(&list);
44        Self { list, len }
45    }
46    pub fn borrow(&self) -> ClusterList {
47        ClusterList {
48            list: &self.list,
49            len: self.len,
50        }
51    }
52}
53impl Deref for OwnedClusterList {
54    type Target = [Cluster];
55    fn deref(&self) -> &Self::Target {
56        &self.list
57    }
58}
59impl DerefMut for OwnedClusterList {
60    fn deref_mut(&mut self) -> &mut Self::Target {
61        &mut self.list
62    }
63}
64
65/// F64 wrapper that implements [`Ord`] and [`Hash`].
66///
67/// When [`PartialOrd`] returns [`None`], we return [`std::cmp::Ordering::Equal`].
68///
69/// You should probably not be using this unless you know what you're doing.
70#[derive(Debug, Copy, Clone)]
71#[repr(transparent)]
72pub struct F64OrdHash(pub f64);
73impl F64OrdHash {
74    #[inline(always)]
75    fn key(&self) -> u64 {
76        self.0.to_bits()
77    }
78
79    /// Compares two `f64`s using our ordering.
80    #[inline(always)]
81    pub fn f64_cmp(a: f64, b: f64) -> std::cmp::Ordering {
82        Self(a).cmp(&Self(b))
83    }
84}
85impl hash::Hash for F64OrdHash {
86    #[inline]
87    fn hash<H>(&self, state: &mut H)
88    where
89        H: hash::Hasher,
90    {
91        self.key().hash(state)
92    }
93}
94impl PartialEq for F64OrdHash {
95    #[inline]
96    fn eq(&self, other: &F64OrdHash) -> bool {
97        self.key() == other.key()
98    }
99}
100impl Eq for F64OrdHash {}
101impl PartialOrd for F64OrdHash {
102    #[inline]
103    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
104        Some(self.cmp(other))
105    }
106}
107impl Ord for F64OrdHash {
108    #[inline]
109    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
110        self.0
111            .partial_cmp(&other.0)
112            .unwrap_or_else(|| match (self.0.is_nan(), other.0.is_nan()) {
113                (true, true) | (false, false) => std::cmp::Ordering::Equal,
114                (true, false) => std::cmp::Ordering::Less,
115                (false, true) => std::cmp::Ordering::Greater,
116            })
117    }
118}
119
120/// A list of clusters.
121///
122/// A cluster is a value and the count.
123///
124/// `m` in `O(m)` means the count of clusters.
125#[derive(Debug)]
126pub struct ClusterList<'a> {
127    list: &'a [Cluster],
128    len: usize,
129}
130impl<'a> ClusterList<'a> {
131    /// The float is the value. The integer is the count.
132    pub fn new(list: &'a [Cluster]) -> Self {
133        let len = Self::size(list);
134        Self { list, len }
135    }
136
137    fn size(list: &[Cluster]) -> usize {
138        list.iter().map(|(_, count)| *count).sum()
139    }
140
141    /// O(1)
142    pub fn len(&self) -> usize {
143        self.len
144    }
145    /// O(1)
146    pub fn is_empty(&self) -> bool {
147        self.list.is_empty()
148    }
149    /// O(m)
150    pub fn sum(&self) -> f64 {
151        let mut sum = 0.0;
152        for (v, count) in self.list.iter() {
153            sum += v * *count as f64;
154        }
155        sum
156    }
157    fn sum_squared_diff(&self, base: f64) -> f64 {
158        let mut sum = 0.0;
159        for (v, count) in self.list.iter() {
160            sum += (v - base).powi(2) * *count as f64;
161        }
162        sum
163    }
164    /// Can be used in [`Self::new`].
165    pub fn split_start(&self, len: usize) -> OwnedClusterList {
166        let mut sum = 0;
167        let mut list = Vec::new();
168        for (v, count) in self.list {
169            sum += count;
170            if sum >= len {
171                list.push((*v, *count - (sum - len)));
172                break;
173            } else {
174                list.push((*v, *count));
175            }
176        }
177        debug_assert_eq!(len, Self::size(&list));
178        OwnedClusterList { list, len }
179    }
180    /// Can be used in [`Self::new`].
181    pub fn split_end(&self, len: usize) -> OwnedClusterList {
182        let mut sum = 0;
183        let mut list = Vec::new();
184        for (v, count) in self.list.iter().rev() {
185            sum += count;
186            if sum >= len {
187                list.insert(0, (*v, *count - (len - sum)));
188                break;
189            } else {
190                list.insert(0, (*v, *count))
191            }
192        }
193        debug_assert_eq!(len, Self::size(&list));
194        OwnedClusterList { list, len }
195    }
196    /// Returns the value at `idx`. This iterates the clusters to get the value.
197    ///
198    /// # Panics
199    ///
200    /// Panics if [`Self::is_empty`] or if `idx >= self.len()`.
201    #[inline]
202    #[allow(clippy::should_implement_trait)] // `TODO`
203    pub fn index(&self, mut idx: usize) -> &f64 {
204        for (v, c) in self.list {
205            let c = *c;
206            if idx < c {
207                return v;
208            }
209            idx -= c;
210        }
211        &self.list.last().unwrap().0
212    }
213
214    /// Groups [`Cluster`]s with the same value together, by adding their count.
215    ///
216    /// This speeds up calculations enormously.
217    ///
218    /// O(n)
219    pub fn optimize_values(self) -> OwnedClusterList {
220        let mut collected = HashMap::with_capacity(16);
221        for (v, count) in self.list {
222            let c = collected.entry(F64OrdHash(*v)).or_insert(0);
223            *c += count;
224        }
225        let list = collected.into_iter().map(|(f, c)| (f.0, c)).collect();
226        OwnedClusterList {
227            list,
228            len: self.len,
229        }
230    }
231}
232
233/// Returned from [`standard_deviation`] and similar functions.
234#[derive(Debug, PartialEq, Eq, Clone, Copy)]
235pub struct StandardDeviationOutput<T> {
236    pub standard_deviation: T,
237    pub mean: T,
238}
239/// Returned from [`percentiles_cluster`] and similar functions.
240#[derive(Debug, PartialEq, Clone, Copy)]
241pub struct PercentilesOutput {
242    pub median: f64,
243    pub lower_quadrille: Option<f64>,
244    pub higher_quadrille: Option<f64>,
245}
246
247/// Helper-trait for types used by [`mean`].
248///
249/// This is implemented generically when the feature `generic-impl` is enabled.
250pub trait Mean<'a, D>: std::iter::Sum<&'a Self> + ops::Div<Output = D>
251where
252    Self: 'a,
253{
254    fn from_usize(n: usize) -> Self;
255}
256#[cfg(feature = "generic-impls")]
257impl<'a, T: std::iter::Sum<&'a Self> + ops::Div + num_traits::FromPrimitive> Mean<'a, T::Output>
258    for T
259where
260    T: 'a,
261{
262    fn from_usize(n: usize) -> Self {
263        Self::from_usize(n).expect("Value can not be converted from usize. Check your type in the call to standard_deviation/mean.")
264    }
265}
266#[cfg(not(feature = "generic-impls"))]
267macro_rules! impl_mean {
268    ($($t:ty, )+) => {
269        $(
270        impl<'a> Mean<'a, <$t as ops::Div>::Output> for $t {
271            fn from_usize(n: usize) -> Self {
272                n as _
273            }
274        }
275        )+
276    };
277}
278#[cfg(not(feature = "generic-impls"))]
279impl_mean!(f32, f64, u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize,);
280
281/// Helper-trait for types used by [`standard_deviation`].
282///
283/// This is implemented generically when the feature `generic-impl` is enabled.
284pub trait StandardDeviation<'a>:
285    Copy
286    + Mean<'a, Self>
287    + std::iter::Sum<&'a Self>
288    + std::iter::Sum
289    + ops::Div<Output = Self>
290    + ops::Sub<Output = Self>
291    + ops::Mul<Output = Self>
292where
293    Self: 'a,
294{
295    fn one() -> Self;
296    fn sqrt(self) -> Self;
297    fn max(self, other: Self) -> Self;
298}
299#[cfg(feature = "generic-impls")]
300impl<
301        'a,
302        T: Copy
303            + Mean<'a, Self>
304            + PartialOrd
305            + std::iter::Sum<&'a Self>
306            + std::iter::Sum
307            + ops::Div<Output = Self>
308            + ops::Sub<Output = Self>
309            + ops::Mul<Output = Self>
310            + num_traits::identities::One
311            + num_traits::real::Real,
312    > StandardDeviation<'a> for T
313where
314    T: 'a,
315{
316    fn one() -> Self {
317        Self::one()
318    }
319    fn sqrt(self) -> Self {
320        self.sqrt()
321    }
322    fn max(self, other: Self) -> Self {
323        if self < other {
324            other
325        } else {
326            self
327        }
328    }
329}
330#[cfg(not(feature = "generic-impls"))]
331macro_rules! impl_std_dev {
332    ($($t:ty, )+) => {
333        $(
334        impl<'a> StandardDeviation<'a> for $t {
335            fn one() -> Self {
336                1.0
337            }
338            fn sqrt(self) -> Self {
339                <$t>::sqrt(self)
340            }
341            fn max(self, other: Self) -> Self {
342                if self < other {
343                    other
344                } else {
345                    self
346                }
347            }
348        }
349        )+
350    };
351}
352#[cfg(not(feature = "generic-impls"))]
353impl_std_dev!(f32, f64,);
354
355/// Mean of clustered `values`.
356pub fn mean_cluster(values: &ClusterList) -> f64 {
357    values.sum() / values.len() as f64
358}
359/// Mean of `values`.
360pub fn mean<'a, D, T: Mean<'a, D>>(values: &'a [T]) -> D {
361    values.iter().sum::<T>() / T::from_usize(values.len())
362}
363
364/// Get the standard deviation of `values`.
365/// The mean is also returned from this, because it's required to compute the standard deviation.
366///
367/// O(m), where m is the number of [`Cluster`]s.
368pub fn standard_deviation_cluster(values: &ClusterList) -> StandardDeviationOutput<f64> {
369    let m = mean_cluster(values);
370    let squared_deviations = values.sum_squared_diff(m);
371    let variance: f64 = squared_deviations / (values.len() - 1).max(1) as f64;
372    StandardDeviationOutput {
373        standard_deviation: variance.sqrt(),
374        mean: m,
375    }
376}
377/// Get the standard deviation of `values`.
378/// The mean is also returned from this, because it's required to compute the standard deviation.
379///
380/// O(n)
381pub fn standard_deviation<'a, T: StandardDeviation<'a>>(
382    values: &'a [T],
383) -> StandardDeviationOutput<T> {
384    let m = mean(values);
385    let squared_deviations: T = values
386        .iter()
387        .map(|t| {
388            let diff = *t - m;
389
390            diff * diff
391        })
392        .sum();
393    let len_minus_one = T::from_usize(values.len()) - T::one();
394    // So we don't get an NaN if 1 value is supplied.
395    let denominator = len_minus_one.max(T::one());
396    let variance: T = squared_deviations / denominator;
397    let std_dev = variance.sqrt();
398
399    StandardDeviationOutput {
400        standard_deviation: std_dev,
401        mean: m,
402    }
403}
404
405/// Get a collection of percentiles from `values`.
406pub fn percentiles_cluster(values: &mut OwnedClusterList) -> PercentilesOutput {
407    fn percentile(
408        values: &mut OwnedClusterList,
409        target: impl percentile::OrderedListIndex,
410    ) -> percentile::MeanValue<f64> {
411        #[cfg(feature = "percentile-rand")]
412        {
413            cluster::percentile_rand(values, target)
414        }
415        #[cfg(not(feature = "percentile-rand"))]
416        {
417            cluster::percentile(values, target, &mut cluster::pivot_fn::middle())
418        }
419    }
420    let lower = if values.borrow().len() >= 4 {
421        Some(percentile(values, Fraction::new(1, 4)).resolve())
422    } else {
423        None
424    };
425    let higher = if values.borrow().len() >= 4 {
426        Some(percentile(values, Fraction::new(3, 4)).resolve())
427    } else {
428        None
429    };
430    PercentilesOutput {
431        median: cluster::median(values).resolve(),
432        lower_quadrille: lower,
433        higher_quadrille: higher,
434    }
435}