ndarray_stats/histogram/
strategies.rs

1//! Strategies used by [`GridBuilder`] to infer optimal parameters from data for building [`Bins`]
2//! and [`Grid`] instances.
3//!
4//! The docs for each strategy have been taken almost verbatim from [`NumPy`].
5//!
6//! Each strategy specifies how to compute the optimal number of [`Bins`] or the optimal bin width.
7//! For those strategies that prescribe the optimal number of [`Bins`], the optimal bin width is
8//! computed by `bin_width = (max - min)/n`.
9//!
10//! Since all bins are left-closed and right-open, it is guaranteed to add an extra bin to include
11//! the maximum value from the given data when necessary, so that no data is discarded.
12//!
13//! # Strategies
14//!
15//! Currently, the following strategies are implemented:
16//!
17//! - [`Auto`]: Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all
18//!   around performance.
19//! - [`FreedmanDiaconis`]: Robust (resilient to outliers) strategy that takes into account data
20//!   variability and data size.
21//! - [`Rice`]: A strategy that does not take variability into account, only data size. Commonly
22//!   overestimates number of bins required.
23//! - [`Sqrt`]: Square root (of data size) strategy, used by Excel and other programs
24//!   for its speed and simplicity.
25//! - [`Sturges`]: R’s default strategy, only accounts for data size. Only optimal for gaussian data
26//!   and underestimates number of bins for large non-gaussian datasets.
27//!
28//! # Notes
29//!
30//! In general, successful infererence on optimal bin width and number of bins relies on
31//! **variability** of data. In other word, the provided ovservations should not be empty or
32//! constant.
33//!
34//! In addition, [`Auto`] and [`FreedmanDiaconis`] requires the [`interquartile range (IQR)`][iqr],
35//! i.e. the difference between upper and lower quartiles, to be positive.
36//!
37//! [`GridBuilder`]: ../struct.GridBuilder.html
38//! [`Bins`]: ../struct.Bins.html
39//! [`Grid`]: ../struct.Grid.html
40//! [`NumPy`]: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
41//! [`Auto`]: struct.Auto.html
42//! [`Sturges`]: struct.Sturges.html
43//! [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
44//! [`Rice`]: struct.Rice.html
45//! [`Sqrt`]: struct.Sqrt.html
46//! [iqr]: https://www.wikiwand.com/en/Interquartile_range
47#![warn(missing_docs, clippy::all, clippy::pedantic)]
48
49use crate::{
50    histogram::{errors::BinsBuildError, Bins, Edges},
51    quantile::{interpolate::Nearest, Quantile1dExt, QuantileExt},
52};
53use ndarray::prelude::*;
54use noisy_float::types::n64;
55use num_traits::{FromPrimitive, NumOps, Zero};
56
57/// A trait implemented by all strategies to build [`Bins`] with parameters inferred from
58/// observations.
59///
60/// This is required by [`GridBuilder`] to know how to build a [`Grid`]'s projections on the
61/// coordinate axes.
62///
63/// [`Bins`]: ../struct.Bins.html
64/// [`GridBuilder`]: ../struct.GridBuilder.html
65/// [`Grid`]: ../struct.Grid.html
66pub trait BinsBuildingStrategy {
67    #[allow(missing_docs)]
68    type Elem: Ord;
69    /// Returns a strategy that has learnt the required parameter fo building [`Bins`] for given
70    /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
71    /// with the given data and specified strategy.
72    ///
73    /// # Errors
74    ///
75    /// See each of the struct-level documentation for details on errors an implementor may return.
76    ///
77    /// [`Bins`]: ../struct.Bins.html
78    fn from_array(array: &ArrayRef<Self::Elem, Ix1>) -> Result<Self, BinsBuildError>
79    where
80        Self: std::marker::Sized;
81
82    /// Returns a [`Bins`] instance, according to parameters inferred from observations.
83    ///
84    /// [`Bins`]: ../struct.Bins.html
85    fn build(&self) -> Bins<Self::Elem>;
86
87    /// Returns the optimal number of bins, according to parameters inferred from observations.
88    fn n_bins(&self) -> usize;
89}
90
91#[derive(Debug)]
92struct EquiSpaced<T> {
93    bin_width: T,
94    min: T,
95    max: T,
96}
97
98/// Square root (of data size) strategy, used by Excel and other programs for its speed and
99/// simplicity.
100///
101/// Let `n` be the number of observations. Then
102///
103/// `n_bins` = `sqrt(n)`
104///
105/// # Notes
106///
107/// This strategy requires the data
108///
109/// - not being empty
110/// - not being constant
111#[derive(Debug)]
112pub struct Sqrt<T> {
113    builder: EquiSpaced<T>,
114}
115
116/// A strategy that does not take variability into account, only data size. Commonly
117/// overestimates number of bins required.
118///
119/// Let `n` be the number of observations and `n_bins` be the number of bins.
120///
121/// `n_bins` = 2`n`<sup>1/3</sup>
122///
123/// `n_bins` is only proportional to cube root of `n`. It tends to overestimate
124/// the `n_bins` and it does not take into account data variability.
125///
126/// # Notes
127///
128/// This strategy requires the data
129///
130/// - not being empty
131/// - not being constant
132#[derive(Debug)]
133pub struct Rice<T> {
134    builder: EquiSpaced<T>,
135}
136
137/// R’s default strategy, only accounts for data size. Only optimal for gaussian data and
138/// underestimates number of bins for large non-gaussian datasets.
139///
140/// Let `n` be the number of observations.
141/// The number of bins is 1 plus the base 2 log of `n`. This estimator assumes normality of data and
142/// is too conservative for larger, non-normal datasets.
143///
144/// This is the default method in R’s hist method.
145///
146/// # Notes
147///
148/// This strategy requires the data
149///
150/// - not being empty
151/// - not being constant
152#[derive(Debug)]
153pub struct Sturges<T> {
154    builder: EquiSpaced<T>,
155}
156
157/// Robust (resilient to outliers) strategy that takes into account data variability and data size.
158///
159/// Let `n` be the number of observations.
160///
161/// `bin_width` = 2 × `IQR` × `n`<sup>−1/3</sup>
162///
163/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to
164/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large
165/// datasets.
166///
167/// The [`IQR`] is very robust to outliers.
168///
169/// # Notes
170///
171/// This strategy requires the data
172///
173/// - not being empty
174/// - not being constant
175/// - having positive [`IQR`]
176///
177/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
178#[derive(Debug)]
179pub struct FreedmanDiaconis<T> {
180    builder: EquiSpaced<T>,
181}
182
183#[derive(Debug)]
184enum SturgesOrFD<T> {
185    Sturges(Sturges<T>),
186    FreedmanDiaconis(FreedmanDiaconis<T>),
187}
188
189/// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all around
190/// performance.
191///
192/// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be
193/// chosen, while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
194/// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for small and large datasets
195/// respectively.
196///
197/// # Notes
198///
199/// This strategy requires the data
200///
201/// - not being empty
202/// - not being constant
203/// - having positive [`IQR`]
204///
205/// [`Sturges`]: struct.Sturges.html
206/// [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
207/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
208#[derive(Debug)]
209pub struct Auto<T> {
210    builder: SturgesOrFD<T>,
211}
212
213impl<T> EquiSpaced<T>
214where
215    T: Ord + Clone + FromPrimitive + NumOps + Zero,
216{
217    /// Returns `Err(BinsBuildError::Strategy)` if `bin_width<=0` or `min` >= `max`.
218    /// Returns `Ok(Self)` otherwise.
219    fn new(bin_width: T, min: T, max: T) -> Result<Self, BinsBuildError> {
220        if (bin_width <= T::zero()) || (min >= max) {
221            Err(BinsBuildError::Strategy)
222        } else {
223            Ok(Self {
224                bin_width,
225                min,
226                max,
227            })
228        }
229    }
230
231    fn build(&self) -> Bins<T> {
232        let n_bins = self.n_bins();
233        let mut edges: Vec<T> = vec![];
234        for i in 0..=n_bins {
235            let edge = self.min.clone() + T::from_usize(i).unwrap() * self.bin_width.clone();
236            edges.push(edge);
237        }
238        Bins::new(Edges::from(edges))
239    }
240
241    fn n_bins(&self) -> usize {
242        let mut max_edge = self.min.clone();
243        let mut n_bins = 0;
244        while max_edge <= self.max {
245            max_edge = max_edge + self.bin_width.clone();
246            n_bins += 1;
247        }
248        n_bins
249    }
250
251    fn bin_width(&self) -> T {
252        self.bin_width.clone()
253    }
254}
255
256impl<T> BinsBuildingStrategy for Sqrt<T>
257where
258    T: Ord + Clone + FromPrimitive + NumOps + Zero,
259{
260    type Elem = T;
261
262    /// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
263    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
264    /// Returns `Ok(Self)` otherwise.
265    fn from_array(a: &ArrayRef<T, Ix1>) -> Result<Self, BinsBuildError> {
266        let n_elems = a.len();
267        // casting `n_elems: usize` to `f64` may casus off-by-one error here if `n_elems` > 2 ^ 53,
268        // but it's not relevant here
269        #[allow(clippy::cast_precision_loss)]
270        // casting the rounded square root from `f64` to `usize` is safe
271        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
272        let n_bins = (n_elems as f64).sqrt().round() as usize;
273        let min = a.min()?;
274        let max = a.max()?;
275        let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
276        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
277        Ok(Self { builder })
278    }
279
280    fn build(&self) -> Bins<T> {
281        self.builder.build()
282    }
283
284    fn n_bins(&self) -> usize {
285        self.builder.n_bins()
286    }
287}
288
289impl<T> Sqrt<T>
290where
291    T: Ord + Clone + FromPrimitive + NumOps + Zero,
292{
293    /// The bin width (or bin length) according to the fitted strategy.
294    pub fn bin_width(&self) -> T {
295        self.builder.bin_width()
296    }
297}
298
299impl<T> BinsBuildingStrategy for Rice<T>
300where
301    T: Ord + Clone + FromPrimitive + NumOps + Zero,
302{
303    type Elem = T;
304
305    /// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
306    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
307    /// Returns `Ok(Self)` otherwise.
308    fn from_array(a: &ArrayRef<T, Ix1>) -> Result<Self, BinsBuildError> {
309        let n_elems = a.len();
310        // casting `n_elems: usize` to `f64` may casus off-by-one error here if `n_elems` > 2 ^ 53,
311        // but it's not relevant here
312        #[allow(clippy::cast_precision_loss)]
313        // casting the rounded cube root from `f64` to `usize` is safe
314        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
315        let n_bins = (2. * (n_elems as f64).powf(1. / 3.)).round() as usize;
316        let min = a.min()?;
317        let max = a.max()?;
318        let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
319        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
320        Ok(Self { builder })
321    }
322
323    fn build(&self) -> Bins<T> {
324        self.builder.build()
325    }
326
327    fn n_bins(&self) -> usize {
328        self.builder.n_bins()
329    }
330}
331
332impl<T> Rice<T>
333where
334    T: Ord + Clone + FromPrimitive + NumOps + Zero,
335{
336    /// The bin width (or bin length) according to the fitted strategy.
337    pub fn bin_width(&self) -> T {
338        self.builder.bin_width()
339    }
340}
341
342impl<T> BinsBuildingStrategy for Sturges<T>
343where
344    T: Ord + Clone + FromPrimitive + NumOps + Zero,
345{
346    type Elem = T;
347
348    /// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
349    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
350    /// Returns `Ok(Self)` otherwise.
351    fn from_array(a: &ArrayRef<T, Ix1>) -> Result<Self, BinsBuildError> {
352        let n_elems = a.len();
353        // casting `n_elems: usize` to `f64` may casus off-by-one error here if `n_elems` > 2 ^ 53,
354        // but it's not relevant here
355        #[allow(clippy::cast_precision_loss)]
356        // casting the rounded base-2 log from `f64` to `usize` is safe
357        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
358        let n_bins = (n_elems as f64).log2().round() as usize + 1;
359        let min = a.min()?;
360        let max = a.max()?;
361        let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
362        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
363        Ok(Self { builder })
364    }
365
366    fn build(&self) -> Bins<T> {
367        self.builder.build()
368    }
369
370    fn n_bins(&self) -> usize {
371        self.builder.n_bins()
372    }
373}
374
375impl<T> Sturges<T>
376where
377    T: Ord + Clone + FromPrimitive + NumOps + Zero,
378{
379    /// The bin width (or bin length) according to the fitted strategy.
380    pub fn bin_width(&self) -> T {
381        self.builder.bin_width()
382    }
383}
384
385impl<T> BinsBuildingStrategy for FreedmanDiaconis<T>
386where
387    T: Ord + Clone + FromPrimitive + NumOps + Zero,
388{
389    type Elem = T;
390
391    /// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`.
392    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
393    /// Returns `Ok(Self)` otherwise.
394    fn from_array(a: &ArrayRef<T, Ix1>) -> Result<Self, BinsBuildError> {
395        let n_points = a.len();
396        if n_points == 0 {
397            return Err(BinsBuildError::EmptyInput);
398        }
399
400        let mut a_copy = a.to_owned();
401        let first_quartile = a_copy.quantile_mut(n64(0.25), &Nearest).unwrap();
402        let third_quartile = a_copy.quantile_mut(n64(0.75), &Nearest).unwrap();
403        let iqr = third_quartile - first_quartile;
404
405        let bin_width = FreedmanDiaconis::compute_bin_width(n_points, iqr);
406        let min = a.min()?;
407        let max = a.max()?;
408        let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
409        Ok(Self { builder })
410    }
411
412    fn build(&self) -> Bins<T> {
413        self.builder.build()
414    }
415
416    fn n_bins(&self) -> usize {
417        self.builder.n_bins()
418    }
419}
420
421impl<T> FreedmanDiaconis<T>
422where
423    T: Ord + Clone + FromPrimitive + NumOps + Zero,
424{
425    fn compute_bin_width(n_bins: usize, iqr: T) -> T {
426        // casting `n_bins: usize` to `f64` may casus off-by-one error here if `n_bins` > 2 ^ 53,
427        // but it's not relevant here
428        #[allow(clippy::cast_precision_loss)]
429        let denominator = (n_bins as f64).powf(1. / 3.);
430        T::from_usize(2).unwrap() * iqr / T::from_f64(denominator).unwrap()
431    }
432
433    /// The bin width (or bin length) according to the fitted strategy.
434    pub fn bin_width(&self) -> T {
435        self.builder.bin_width()
436    }
437}
438
439impl<T> BinsBuildingStrategy for Auto<T>
440where
441    T: Ord + Clone + FromPrimitive + NumOps + Zero,
442{
443    type Elem = T;
444
445    /// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`.
446    /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
447    /// Returns `Ok(Self)` otherwise.
448    fn from_array(a: &ArrayRef<T, Ix1>) -> Result<Self, BinsBuildError> {
449        let fd_builder = FreedmanDiaconis::from_array(&a);
450        let sturges_builder = Sturges::from_array(&a);
451        match (fd_builder, sturges_builder) {
452            (Err(_), Ok(sturges_builder)) => {
453                let builder = SturgesOrFD::Sturges(sturges_builder);
454                Ok(Self { builder })
455            }
456            (Ok(fd_builder), Err(_)) => {
457                let builder = SturgesOrFD::FreedmanDiaconis(fd_builder);
458                Ok(Self { builder })
459            }
460            (Ok(fd_builder), Ok(sturges_builder)) => {
461                let builder = if fd_builder.bin_width() > sturges_builder.bin_width() {
462                    SturgesOrFD::Sturges(sturges_builder)
463                } else {
464                    SturgesOrFD::FreedmanDiaconis(fd_builder)
465                };
466                Ok(Self { builder })
467            }
468            (Err(err), Err(_)) => Err(err),
469        }
470    }
471
472    fn build(&self) -> Bins<T> {
473        // Ugly
474        match &self.builder {
475            SturgesOrFD::FreedmanDiaconis(b) => b.build(),
476            SturgesOrFD::Sturges(b) => b.build(),
477        }
478    }
479
480    fn n_bins(&self) -> usize {
481        // Ugly
482        match &self.builder {
483            SturgesOrFD::FreedmanDiaconis(b) => b.n_bins(),
484            SturgesOrFD::Sturges(b) => b.n_bins(),
485        }
486    }
487}
488
489impl<T> Auto<T>
490where
491    T: Ord + Clone + FromPrimitive + NumOps + Zero,
492{
493    /// The bin width (or bin length) according to the fitted strategy.
494    pub fn bin_width(&self) -> T {
495        // Ugly
496        match &self.builder {
497            SturgesOrFD::FreedmanDiaconis(b) => b.bin_width(),
498            SturgesOrFD::Sturges(b) => b.bin_width(),
499        }
500    }
501}
502
503/// Returns the `bin_width`, given the two end points of a range (`max`, `min`), and the number of
504/// bins, consuming endpoints
505///
506/// `bin_width = (max - min)/n`
507///
508/// **Panics** if `n_bins == 0` and division by 0 panics for `T`.
509fn compute_bin_width<T>(min: T, max: T, n_bins: usize) -> T
510where
511    T: Ord + Clone + FromPrimitive + NumOps + Zero,
512{
513    let range = max - min;
514    range / T::from_usize(n_bins).unwrap()
515}
516
517#[cfg(test)]
518mod equispaced_tests {
519    use super::EquiSpaced;
520
521    #[test]
522    fn bin_width_has_to_be_positive() {
523        assert!(EquiSpaced::new(0, 0, 200).is_err());
524    }
525
526    #[test]
527    fn min_has_to_be_strictly_smaller_than_max() {
528        assert!(EquiSpaced::new(10, 0, 0).is_err());
529    }
530}
531
532#[cfg(test)]
533mod sqrt_tests {
534    use super::{BinsBuildingStrategy, Sqrt};
535    use ndarray::array;
536
537    #[test]
538    fn constant_array_are_bad() {
539        assert!(Sqrt::from_array(&array![1, 1, 1, 1, 1, 1, 1])
540            .unwrap_err()
541            .is_strategy());
542    }
543
544    #[test]
545    fn empty_arrays_are_bad() {
546        assert!(Sqrt::<usize>::from_array(&array![])
547            .unwrap_err()
548            .is_empty_input());
549    }
550}
551
552#[cfg(test)]
553mod rice_tests {
554    use super::{BinsBuildingStrategy, Rice};
555    use ndarray::array;
556
557    #[test]
558    fn constant_array_are_bad() {
559        assert!(Rice::from_array(&array![1, 1, 1, 1, 1, 1, 1])
560            .unwrap_err()
561            .is_strategy());
562    }
563
564    #[test]
565    fn empty_arrays_are_bad() {
566        assert!(Rice::<usize>::from_array(&array![])
567            .unwrap_err()
568            .is_empty_input());
569    }
570}
571
572#[cfg(test)]
573mod sturges_tests {
574    use super::{BinsBuildingStrategy, Sturges};
575    use ndarray::array;
576
577    #[test]
578    fn constant_array_are_bad() {
579        assert!(Sturges::from_array(&array![1, 1, 1, 1, 1, 1, 1])
580            .unwrap_err()
581            .is_strategy());
582    }
583
584    #[test]
585    fn empty_arrays_are_bad() {
586        assert!(Sturges::<usize>::from_array(&array![])
587            .unwrap_err()
588            .is_empty_input());
589    }
590}
591
592#[cfg(test)]
593mod fd_tests {
594    use super::{BinsBuildingStrategy, FreedmanDiaconis};
595    use ndarray::array;
596
597    #[test]
598    fn constant_array_are_bad() {
599        assert!(FreedmanDiaconis::from_array(&array![1, 1, 1, 1, 1, 1, 1])
600            .unwrap_err()
601            .is_strategy());
602    }
603
604    #[test]
605    fn zero_iqr_is_bad() {
606        assert!(
607            FreedmanDiaconis::from_array(&array![-20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20])
608                .unwrap_err()
609                .is_strategy()
610        );
611    }
612
613    #[test]
614    fn empty_arrays_are_bad() {
615        assert!(FreedmanDiaconis::<usize>::from_array(&array![])
616            .unwrap_err()
617            .is_empty_input());
618    }
619}
620
621#[cfg(test)]
622mod auto_tests {
623    use super::{Auto, BinsBuildingStrategy};
624    use ndarray::array;
625
626    #[test]
627    fn constant_array_are_bad() {
628        assert!(Auto::from_array(&array![1, 1, 1, 1, 1, 1, 1])
629            .unwrap_err()
630            .is_strategy());
631    }
632
633    #[test]
634    fn zero_iqr_is_handled_by_sturged() {
635        assert!(Auto::from_array(&array![-20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20]).is_ok());
636    }
637
638    #[test]
639    fn empty_arrays_are_bad() {
640        assert!(Auto::<usize>::from_array(&array![])
641            .unwrap_err()
642            .is_empty_input());
643    }
644}