ndarray_stats/histogram/
strategies.rs

1//! Strategies used by [`GridBuilder`] to infer optimal parameters from data for building [`Bins`]
2//! and [`Grid`] instances.
3//!
4//! The docs for each strategy have been taken almost verbatim from [`NumPy`].
5//!
6//! Each strategy specifies how to compute the optimal number of [`Bins`] or the optimal bin width.
7//! For those strategies that prescribe the optimal number of [`Bins`], the optimal bin width is
8//! computed by `bin_width = (max - min)/n`.
9//!
10//! Since all bins are left-closed and right-open, it is guaranteed to add an extra bin to include
11//! the maximum value from the given data when necessary, so that no data is discarded.
12//!
13//! # Strategies
14//!
15//! Currently, the following strategies are implemented:
16//!
17//! - [`Auto`]: Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all
18//!   around performance.
19//! - [`FreedmanDiaconis`]: Robust (resilient to outliers) strategy that takes into account data
20//!   variability and data size.
21//! - [`Rice`]: A strategy that does not take variability into account, only data size. Commonly
22//!   overestimates number of bins required.
23//! - [`Sqrt`]: Square root (of data size) strategy, used by Excel and other programs
24//!   for its speed and simplicity.
25//! - [`Sturges`]: R’s default strategy, only accounts for data size. Only optimal for gaussian data
26//!   and underestimates number of bins for large non-gaussian datasets.
27//!
28//! # Notes
29//!
30//! In general, successful infererence on optimal bin width and number of bins relies on
31//! **variability** of data. In other word, the provided ovservations should not be empty or
32//! constant.
33//!
34//! In addition, [`Auto`] and [`FreedmanDiaconis`] requires the [`interquartile range (IQR)`][iqr],
35//! i.e. the difference between upper and lower quartiles, to be positive.
36//!
37//! [`GridBuilder`]: ../struct.GridBuilder.html
38//! [`Bins`]: ../struct.Bins.html
39//! [`Grid`]: ../struct.Grid.html
40//! [`NumPy`]: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
41//! [`Auto`]: struct.Auto.html
42//! [`Sturges`]: struct.Sturges.html
43//! [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
44//! [`Rice`]: struct.Rice.html
45//! [`Sqrt`]: struct.Sqrt.html
46//! [iqr]: https://www.wikiwand.com/en/Interquartile_range
47#![warn(missing_docs, clippy::all, clippy::pedantic)]
48
49use crate::{
50    histogram::{errors::BinsBuildError, Bins, Edges},
51    quantile::{interpolate::Nearest, Quantile1dExt, QuantileExt},
52};
53use ndarray::{prelude::*, Data};
54use noisy_float::types::n64;
55use num_traits::{FromPrimitive, NumOps, Zero};
56
57/// A trait implemented by all strategies to build [`Bins`] with parameters inferred from
58/// observations.
59///
60/// This is required by [`GridBuilder`] to know how to build a [`Grid`]'s projections on the
61/// coordinate axes.
62///
63/// [`Bins`]: ../struct.Bins.html
64/// [`GridBuilder`]: ../struct.GridBuilder.html
65/// [`Grid`]: ../struct.Grid.html
66pub trait BinsBuildingStrategy {
67    #[allow(missing_docs)]
68    type Elem: Ord;
69    /// Returns a strategy that has learnt the required parameter fo building [`Bins`] for given
70    /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
71    /// with the given data and specified strategy.
72    ///
73    /// # Errors
74    ///
75    /// See each of the struct-level documentation for details on errors an implementor may return.
76    ///
77    /// [`Bins`]: ../struct.Bins.html
78    fn from_array<S>(array: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
79    where
80        S: Data<Elem = Self::Elem>,
81        Self: std::marker::Sized;
82
83    /// Returns a [`Bins`] instance, according to parameters inferred from observations.
84    ///
85    /// [`Bins`]: ../struct.Bins.html
86    fn build(&self) -> Bins<Self::Elem>;
87
88    /// Returns the optimal number of bins, according to parameters inferred from observations.
89    fn n_bins(&self) -> usize;
90}
91
92#[derive(Debug)]
93struct EquiSpaced<T> {
94    bin_width: T,
95    min: T,
96    max: T,
97}
98
99/// Square root (of data size) strategy, used by Excel and other programs for its speed and
100/// simplicity.
101///
102/// Let `n` be the number of observations. Then
103///
104/// `n_bins` = `sqrt(n)`
105///
106/// # Notes
107///
108/// This strategy requires the data
109///
110/// - not being empty
111/// - not being constant
112#[derive(Debug)]
113pub struct Sqrt<T> {
114    builder: EquiSpaced<T>,
115}
116
117/// A strategy that does not take variability into account, only data size. Commonly
118/// overestimates number of bins required.
119///
120/// Let `n` be the number of observations and `n_bins` be the number of bins.
121///
122/// `n_bins` = 2`n`<sup>1/3</sup>
123///
124/// `n_bins` is only proportional to cube root of `n`. It tends to overestimate
125/// the `n_bins` and it does not take into account data variability.
126///
127/// # Notes
128///
129/// This strategy requires the data
130///
131/// - not being empty
132/// - not being constant
133#[derive(Debug)]
134pub struct Rice<T> {
135    builder: EquiSpaced<T>,
136}
137
138/// R’s default strategy, only accounts for data size. Only optimal for gaussian data and
139/// underestimates number of bins for large non-gaussian datasets.
140///
141/// Let `n` be the number of observations.
142/// The number of bins is 1 plus the base 2 log of `n`. This estimator assumes normality of data and
143/// is too conservative for larger, non-normal datasets.
144///
145/// This is the default method in R’s hist method.
146///
147/// # Notes
148///
149/// This strategy requires the data
150///
151/// - not being empty
152/// - not being constant
153#[derive(Debug)]
154pub struct Sturges<T> {
155    builder: EquiSpaced<T>,
156}
157
158/// Robust (resilient to outliers) strategy that takes into account data variability and data size.
159///
160/// Let `n` be the number of observations.
161///
162/// `bin_width` = 2 × `IQR` × `n`<sup>−1/3</sup>
163///
164/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to
165/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large
166/// datasets.
167///
168/// The [`IQR`] is very robust to outliers.
169///
170/// # Notes
171///
172/// This strategy requires the data
173///
174/// - not being empty
175/// - not being constant
176/// - having positive [`IQR`]
177///
178/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
179#[derive(Debug)]
180pub struct FreedmanDiaconis<T> {
181    builder: EquiSpaced<T>,
182}
183
184#[derive(Debug)]
185enum SturgesOrFD<T> {
186    Sturges(Sturges<T>),
187    FreedmanDiaconis(FreedmanDiaconis<T>),
188}
189
190/// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all around
191/// performance.
192///
193/// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be
194/// chosen, while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
195/// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for small and large datasets
196/// respectively.
197///
198/// # Notes
199///
200/// This strategy requires the data
201///
202/// - not being empty
203/// - not being constant
204/// - having positive [`IQR`]
205///
206/// [`Sturges`]: struct.Sturges.html
207/// [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
208/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
209#[derive(Debug)]
210pub struct Auto<T> {
211    builder: SturgesOrFD<T>,
212}
213
214impl<T> EquiSpaced<T>
215where
216    T: Ord + Clone + FromPrimitive + NumOps + Zero,
217{
218    /// Returns `Err(BinsBuildError::Strategy)` if `bin_width<=0` or `min` >= `max`.
219    /// Returns `Ok(Self)` otherwise.
220    fn new(bin_width: T, min: T, max: T) -> Result<Self, BinsBuildError> {
221        if (bin_width <= T::zero()) || (min >= max) {
222            Err(BinsBuildError::Strategy)
223        } else {
224            Ok(Self {
225                bin_width,
226                min,
227                max,
228            })
229        }
230    }
231
232    fn build(&self) -> Bins<T> {
233        let n_bins = self.n_bins();
234        let mut edges: Vec<T> = vec![];
235        for i in 0..=n_bins {
236            let edge = self.min.clone() + T::from_usize(i).unwrap() * self.bin_width.clone();
237            edges.push(edge);
238        }
239        Bins::new(Edges::from(edges))
240    }
241
242    fn n_bins(&self) -> usize {
243        let mut max_edge = self.min.clone();
244        let mut n_bins = 0;
245        while max_edge <= self.max {
246            max_edge = max_edge + self.bin_width.clone();
247            n_bins += 1;
248        }
249        n_bins
250    }
251
252    fn bin_width(&self) -> T {
253        self.bin_width.clone()
254    }
255}
256
257impl<T> BinsBuildingStrategy for Sqrt<T>
258where
259    T: Ord + Clone + FromPrimitive + NumOps + Zero,
260{
261    type Elem = T;
262
263    /// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
264    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
265    /// Returns `Ok(Self)` otherwise.
266    fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
267    where
268        S: Data<Elem = Self::Elem>,
269    {
270        let n_elems = a.len();
271        // casting `n_elems: usize` to `f64` may casus off-by-one error here if `n_elems` > 2 ^ 53,
272        // but it's not relevant here
273        #[allow(clippy::cast_precision_loss)]
274        // casting the rounded square root from `f64` to `usize` is safe
275        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
276        let n_bins = (n_elems as f64).sqrt().round() as usize;
277        let min = a.min()?;
278        let max = a.max()?;
279        let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
280        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
281        Ok(Self { builder })
282    }
283
284    fn build(&self) -> Bins<T> {
285        self.builder.build()
286    }
287
288    fn n_bins(&self) -> usize {
289        self.builder.n_bins()
290    }
291}
292
293impl<T> Sqrt<T>
294where
295    T: Ord + Clone + FromPrimitive + NumOps + Zero,
296{
297    /// The bin width (or bin length) according to the fitted strategy.
298    pub fn bin_width(&self) -> T {
299        self.builder.bin_width()
300    }
301}
302
303impl<T> BinsBuildingStrategy for Rice<T>
304where
305    T: Ord + Clone + FromPrimitive + NumOps + Zero,
306{
307    type Elem = T;
308
309    /// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
310    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
311    /// Returns `Ok(Self)` otherwise.
312    fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
313    where
314        S: Data<Elem = Self::Elem>,
315    {
316        let n_elems = a.len();
317        // casting `n_elems: usize` to `f64` may casus off-by-one error here if `n_elems` > 2 ^ 53,
318        // but it's not relevant here
319        #[allow(clippy::cast_precision_loss)]
320        // casting the rounded cube root from `f64` to `usize` is safe
321        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
322        let n_bins = (2. * (n_elems as f64).powf(1. / 3.)).round() as usize;
323        let min = a.min()?;
324        let max = a.max()?;
325        let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
326        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
327        Ok(Self { builder })
328    }
329
330    fn build(&self) -> Bins<T> {
331        self.builder.build()
332    }
333
334    fn n_bins(&self) -> usize {
335        self.builder.n_bins()
336    }
337}
338
339impl<T> Rice<T>
340where
341    T: Ord + Clone + FromPrimitive + NumOps + Zero,
342{
343    /// The bin width (or bin length) according to the fitted strategy.
344    pub fn bin_width(&self) -> T {
345        self.builder.bin_width()
346    }
347}
348
349impl<T> BinsBuildingStrategy for Sturges<T>
350where
351    T: Ord + Clone + FromPrimitive + NumOps + Zero,
352{
353    type Elem = T;
354
355    /// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
356    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
357    /// Returns `Ok(Self)` otherwise.
358    fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
359    where
360        S: Data<Elem = Self::Elem>,
361    {
362        let n_elems = a.len();
363        // casting `n_elems: usize` to `f64` may casus off-by-one error here if `n_elems` > 2 ^ 53,
364        // but it's not relevant here
365        #[allow(clippy::cast_precision_loss)]
366        // casting the rounded base-2 log from `f64` to `usize` is safe
367        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
368        let n_bins = (n_elems as f64).log2().round() as usize + 1;
369        let min = a.min()?;
370        let max = a.max()?;
371        let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
372        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
373        Ok(Self { builder })
374    }
375
376    fn build(&self) -> Bins<T> {
377        self.builder.build()
378    }
379
380    fn n_bins(&self) -> usize {
381        self.builder.n_bins()
382    }
383}
384
385impl<T> Sturges<T>
386where
387    T: Ord + Clone + FromPrimitive + NumOps + Zero,
388{
389    /// The bin width (or bin length) according to the fitted strategy.
390    pub fn bin_width(&self) -> T {
391        self.builder.bin_width()
392    }
393}
394
395impl<T> BinsBuildingStrategy for FreedmanDiaconis<T>
396where
397    T: Ord + Clone + FromPrimitive + NumOps + Zero,
398{
399    type Elem = T;
400
401    /// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`.
402    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
403    /// Returns `Ok(Self)` otherwise.
404    fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
405    where
406        S: Data<Elem = Self::Elem>,
407    {
408        let n_points = a.len();
409        if n_points == 0 {
410            return Err(BinsBuildError::EmptyInput);
411        }
412
413        let mut a_copy = a.to_owned();
414        let first_quartile = a_copy.quantile_mut(n64(0.25), &Nearest).unwrap();
415        let third_quartile = a_copy.quantile_mut(n64(0.75), &Nearest).unwrap();
416        let iqr = third_quartile - first_quartile;
417
418        let bin_width = FreedmanDiaconis::compute_bin_width(n_points, iqr);
419        let min = a.min()?;
420        let max = a.max()?;
421        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
422        Ok(Self { builder })
423    }
424
425    fn build(&self) -> Bins<T> {
426        self.builder.build()
427    }
428
429    fn n_bins(&self) -> usize {
430        self.builder.n_bins()
431    }
432}
433
434impl<T> FreedmanDiaconis<T>
435where
436    T: Ord + Clone + FromPrimitive + NumOps + Zero,
437{
438    fn compute_bin_width(n_bins: usize, iqr: T) -> T {
439        // casting `n_bins: usize` to `f64` may casus off-by-one error here if `n_bins` > 2 ^ 53,
440        // but it's not relevant here
441        #[allow(clippy::cast_precision_loss)]
442        let denominator = (n_bins as f64).powf(1. / 3.);
443        T::from_usize(2).unwrap() * iqr / T::from_f64(denominator).unwrap()
444    }
445
446    /// The bin width (or bin length) according to the fitted strategy.
447    pub fn bin_width(&self) -> T {
448        self.builder.bin_width()
449    }
450}
451
452impl<T> BinsBuildingStrategy for Auto<T>
453where
454    T: Ord + Clone + FromPrimitive + NumOps + Zero,
455{
456    type Elem = T;
457
458    /// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`.
459    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
460    /// Returns `Ok(Self)` otherwise.
461    fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
462    where
463        S: Data<Elem = Self::Elem>,
464    {
465        let fd_builder = FreedmanDiaconis::from_array(&a);
466        let sturges_builder = Sturges::from_array(&a);
467        match (fd_builder, sturges_builder) {
468            (Err(_), Ok(sturges_builder)) => {
469                let builder = SturgesOrFD::Sturges(sturges_builder);
470                Ok(Self { builder })
471            }
472            (Ok(fd_builder), Err(_)) => {
473                let builder = SturgesOrFD::FreedmanDiaconis(fd_builder);
474                Ok(Self { builder })
475            }
476            (Ok(fd_builder), Ok(sturges_builder)) => {
477                let builder = if fd_builder.bin_width() > sturges_builder.bin_width() {
478                    SturgesOrFD::Sturges(sturges_builder)
479                } else {
480                    SturgesOrFD::FreedmanDiaconis(fd_builder)
481                };
482                Ok(Self { builder })
483            }
484            (Err(err), Err(_)) => Err(err),
485        }
486    }
487
488    fn build(&self) -> Bins<T> {
489        // Ugly
490        match &self.builder {
491            SturgesOrFD::FreedmanDiaconis(b) => b.build(),
492            SturgesOrFD::Sturges(b) => b.build(),
493        }
494    }
495
496    fn n_bins(&self) -> usize {
497        // Ugly
498        match &self.builder {
499            SturgesOrFD::FreedmanDiaconis(b) => b.n_bins(),
500            SturgesOrFD::Sturges(b) => b.n_bins(),
501        }
502    }
503}
504
505impl<T> Auto<T>
506where
507    T: Ord + Clone + FromPrimitive + NumOps + Zero,
508{
509    /// The bin width (or bin length) according to the fitted strategy.
510    pub fn bin_width(&self) -> T {
511        // Ugly
512        match &self.builder {
513            SturgesOrFD::FreedmanDiaconis(b) => b.bin_width(),
514            SturgesOrFD::Sturges(b) => b.bin_width(),
515        }
516    }
517}
518
519/// Returns the `bin_width`, given the two end points of a range (`max`, `min`), and the number of
520/// bins, consuming endpoints
521///
522/// `bin_width = (max - min)/n`
523///
524/// **Panics** if `n_bins == 0` and division by 0 panics for `T`.
525fn compute_bin_width<T>(min: T, max: T, n_bins: usize) -> T
526where
527    T: Ord + Clone + FromPrimitive + NumOps + Zero,
528{
529    let range = max - min;
530    range / T::from_usize(n_bins).unwrap()
531}
532
533#[cfg(test)]
534mod equispaced_tests {
535    use super::EquiSpaced;
536
537    #[test]
538    fn bin_width_has_to_be_positive() {
539        assert!(EquiSpaced::new(0, 0, 200).is_err());
540    }
541
542    #[test]
543    fn min_has_to_be_strictly_smaller_than_max() {
544        assert!(EquiSpaced::new(10, 0, 0).is_err());
545    }
546}
547
548#[cfg(test)]
549mod sqrt_tests {
550    use super::{BinsBuildingStrategy, Sqrt};
551    use ndarray::array;
552
553    #[test]
554    fn constant_array_are_bad() {
555        assert!(Sqrt::from_array(&array![1, 1, 1, 1, 1, 1, 1])
556            .unwrap_err()
557            .is_strategy());
558    }
559
560    #[test]
561    fn empty_arrays_are_bad() {
562        assert!(Sqrt::<usize>::from_array(&array![])
563            .unwrap_err()
564            .is_empty_input());
565    }
566}
567
568#[cfg(test)]
569mod rice_tests {
570    use super::{BinsBuildingStrategy, Rice};
571    use ndarray::array;
572
573    #[test]
574    fn constant_array_are_bad() {
575        assert!(Rice::from_array(&array![1, 1, 1, 1, 1, 1, 1])
576            .unwrap_err()
577            .is_strategy());
578    }
579
580    #[test]
581    fn empty_arrays_are_bad() {
582        assert!(Rice::<usize>::from_array(&array![])
583            .unwrap_err()
584            .is_empty_input());
585    }
586}
587
588#[cfg(test)]
589mod sturges_tests {
590    use super::{BinsBuildingStrategy, Sturges};
591    use ndarray::array;
592
593    #[test]
594    fn constant_array_are_bad() {
595        assert!(Sturges::from_array(&array![1, 1, 1, 1, 1, 1, 1])
596            .unwrap_err()
597            .is_strategy());
598    }
599
600    #[test]
601    fn empty_arrays_are_bad() {
602        assert!(Sturges::<usize>::from_array(&array![])
603            .unwrap_err()
604            .is_empty_input());
605    }
606}
607
608#[cfg(test)]
609mod fd_tests {
610    use super::{BinsBuildingStrategy, FreedmanDiaconis};
611    use ndarray::array;
612
613    #[test]
614    fn constant_array_are_bad() {
615        assert!(FreedmanDiaconis::from_array(&array![1, 1, 1, 1, 1, 1, 1])
616            .unwrap_err()
617            .is_strategy());
618    }
619
620    #[test]
621    fn zero_iqr_is_bad() {
622        assert!(
623            FreedmanDiaconis::from_array(&array![-20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20])
624                .unwrap_err()
625                .is_strategy()
626        );
627    }
628
629    #[test]
630    fn empty_arrays_are_bad() {
631        assert!(FreedmanDiaconis::<usize>::from_array(&array![])
632            .unwrap_err()
633            .is_empty_input());
634    }
635}
636
637#[cfg(test)]
638mod auto_tests {
639    use super::{Auto, BinsBuildingStrategy};
640    use ndarray::array;
641
642    #[test]
643    fn constant_array_are_bad() {
644        assert!(Auto::from_array(&array![1, 1, 1, 1, 1, 1, 1])
645            .unwrap_err()
646            .is_strategy());
647    }
648
649    #[test]
650    fn zero_iqr_is_handled_by_sturged() {
651        assert!(Auto::from_array(&array![-20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20]).is_ok());
652    }
653
654    #[test]
655    fn empty_arrays_are_bad() {
656        assert!(Auto::<usize>::from_array(&array![])
657            .unwrap_err()
658            .is_empty_input());
659    }
660}