openhistogram/
analysis.rs

1use super::bin::Bin;
2use super::Histogram;
3use std::cmp::Ordering;
4use probability::prelude::{Binomial, Distribution};
5use rand::prelude::*;
6
7
8/// The QuantileType represents various methodologies for calculating quantiles.
9pub enum QuantileType {
10    /// corresponds to Type=1 quantiles in the Hyndman-Fan list (Statistical Computing, 1996).
11    Type1,
12    /// corresponds to discretized Type=7 quantiles in the Hyndman-Fan list (Statistical Computing, 1996).
13    Type7,
14}
15
16fn binomial_reduce_random(n: u64, pr: f64, tgt: f64) -> u64 {
17    if pr <= 0.0 { 0 }
18    else if pr >= 1.0 { n }
19    else {
20        let cumbin = Binomial::new(n as usize, pr);
21        let (mut left, mut right) = (0, n);
22        let mut i = (n as f64 * pr) as u64;
23        let mut bias = match (n as f64 * 0.005) as u64 {
24                x if x < 2 => 2,
25                x => x,
26        };
27        while left < right {
28            let s = i as f64;
29            let cum = cumbin.distribution(s);
30            if i == right {
31                if right - left  == 1 {
32                    if tgt <= cum {
33                        return i;
34                    }
35                    else {
36                        return i-1;
37                    }
38                }
39            }
40            if tgt <= cum {
41                if i == 0 {
42                    return 0;
43                }
44                right = i;
45            }
46            else {
47                if i == n {
48                    return n;
49                }
50                else {
51                    left = i;
52                }
53            }
54
55            let mut skip = (right - left) / bias;
56            if bias != 2 && skip < 1 {
57                skip = (right - left) / 2;
58            }
59            if skip < 1 {
60                skip = 1;
61            }
62
63            if tgt <= cum {
64                i = right - skip;
65            }
66            else {
67                i = left + skip;
68            }
69            bias = 2;
70        }
71        0
72    }
73}
74pub(crate) fn binomial_reduce(n: u64, pr: f64) -> u64 {
75    let mut rng = rand::rng();
76    let tgt = rng.random::<f64>();
77    binomial_reduce_random(n, pr, tgt)
78}
79
80impl Histogram {
81    fn accum_moments<const N: usize>(&self, ks: &[i32; N]) -> [f64; N] {
82        self.bvs.iter().fold([0.0; N], |v, (bin, count)| {
83            let midpoint = bin.midpoint();
84            let cardinality = *count as f64;
85            let mut newv = [0.0; N];
86            for i in 0..N {
87                newv[i] = v[i] + midpoint.powi(ks[i]) * cardinality;
88            }
89            newv
90        })
91    }
92    /// Calculate an approximate mean across all samples.
93    pub fn mean(&self) -> f64 {
94        let r = self.accum_moments(&[0, 1]);
95        if r[0] == 0.0 {
96            f64::NAN
97        } else {
98            r[1] / r[0]
99        }
100    }
101    /// Calculate an approximate sum across all samples.
102    pub fn sum(&self) -> f64 {
103        self.accum_moments(&[1])[0]
104    }
105    /// Calculate an approximate k-th moment across all samples.
106    pub fn moment(&self, k: i32) -> f64 {
107        let r = self.accum_moments(&[0, k]);
108        if r[0] == 0.0 {
109            f64::NAN
110        } else {
111            r[1] / r[0].powi(k)
112        }
113    }
114    /// Calculate the approximate standard deviation across all samples.
115    pub fn stddev(&self) -> f64 {
116        let r = self.accum_moments(&[0, 1, 2]);
117        if r[0] == 0.0 {
118            f64::NAN
119        } else {
120            (r[2] / r[0] - (r[1] / r[0]).powi(2)).sqrt()
121        }
122    }
123
124    /// Calculate an approximate number of samples relative to a `pivot` [Bin]
125    pub fn count_relative(&self, pivot: &Bin, ord: Ordering, inclusive: bool) -> u128 {
126        assert_ne!(ord, Ordering::Equal);
127        self.bvs.iter().fold(0u128, |v, (bin, count)| {
128            let rel = bin.cmp(pivot);
129            if rel == Ordering::Equal {
130                if inclusive {
131                    v + (*count as u128)
132                } else {
133                    v
134                }
135            } else if rel == ord {
136                v + *count as u128
137            } else {
138                v
139            }
140        })
141    }
142    /// Count all samples in all [Bin]s less than or equal to the [Bin] containing `v`.
143    pub fn count_below_inclusive(&self, v: f64) -> u128 {
144        self.count_relative(&v.into(), Ordering::Less, true)
145    }
146    /// Count all samples in all [Bin]s less than the [Bin] containing `v`.
147    pub fn count_below_exclusive(&self, v: f64) -> u128 {
148        self.count_relative(&v.into(), Ordering::Less, false)
149    }
150    /// Count all samples in all [Bin]s greater than or equal to the [Bin] containing `v`.
151    pub fn count_above_inclusive(&self, v: f64) -> u128 {
152        self.count_relative(&v.into(), Ordering::Greater, true)
153    }
154    /// Count all samples in all [Bin]s greater than the [Bin] containing `v`.
155    pub fn count_above_exclusive(&self, v: f64) -> u128 {
156        self.count_relative(&v.into(), Ordering::Greater, false)
157    }
158    /// Count the samples in the [Bin] containing `v`
159    pub fn count_nearby(&self, v: f64) -> u64 {
160        self.bvs
161            .get(&v.into())
162            .and_then(|x| Some(*x))
163            .or(Some(0u64))
164            .unwrap()
165    }
166
167    /// Calculate a set of approximate quantiles from the samples.
168    pub fn quantile_ex<const N: usize>(
169        &self,
170        qs: &[f64; N],
171        qt: QuantileType,
172    ) -> Result<[f64; N], super::Error> {
173        let mut qout = [f64::NAN; N];
174        if N == 0 {
175            return Ok([0.0; N]);
176        }
177        for i in 1..N {
178            if qs[i - 1] > qs[i] {
179                return Err(super::Error::QuantilesOutOfOrder);
180            }
181        }
182        if qs[0] < 0.0 || qs[N - 1] > 1.0 {
183            return Err(super::Error::QuantileOutOfBounds);
184        }
185
186        match self.count() {
187            0 => Ok(qout),
188            cu128 => {
189                let total_count = cu128 as f64;
190                /* We use q_out as temporary space to hold the count-normalized quantiles */
191                match qt {
192                    QuantileType::Type1 => {
193                        for i in 0..N {
194                            qout[i] = total_count * qs[i]
195                        }
196                    }
197                    QuantileType::Type7 => {
198                        for i in 0..N {
199                            qout[i] = ((total_count - 1.0) * qs[i] + 1.0).floor()
200                        }
201                    }
202                };
203
204                /* this is to track */
205                struct Track {
206                    bin_width: f64,
207                    bin_left: f64,
208                    bin_count: f64,
209                    lower_cnt: f64,
210                    upper_cnt: f64,
211                }
212                impl Track {
213                    fn update(&mut self, bin: &Bin, count: u64) {
214                        self.bin_width = bin.width_signed();
215                        self.bin_left = bin.left();
216                        self.bin_count = count as f64;
217                        self.lower_cnt = self.upper_cnt;
218                        self.upper_cnt = self.lower_cnt + count as f64;
219                    }
220                }
221                let mut track = Track {
222                    bin_width: 0.0,
223                    bin_left: 0.0,
224                    bin_count: 0.0,
225                    lower_cnt: 0.0,
226                    upper_cnt: 0.0,
227                };
228                // iterate through the tree and stop to act once we've place
229                let mut bv_iter = self.bvs.iter();
230                if let Some((bin, count)) = bv_iter.next() {
231                    track.update(bin, *count);
232                }
233
234                for i_q in 0..N {
235                    while track.upper_cnt < qout[i_q] {
236                        match bv_iter.next() {
237                            Some((bin,count)) => track.update(bin, *count),
238                            _ => break
239                        }
240                    }
241                    if track.bin_width == 0.0 {
242                        qout[i_q] = track.bin_left;
243                    }
244                    else {
245                        match qt {
246                            // For `QuantileType::Type1` quantiles:`
247                            // A q-quantile for the bucket, will be represented by the sample number:
248                            //
249                            // (q = 0)  k = 1
250                            // (q > 0)  k = ceil(q*n)
251                            // 
252                            // so that q=0 => k=1 and q=1 => k=n. This corresponds to Type=1 quantiles
253                            // in the Hyndman-Fan list (Statistical Computing, 1996).
254                            QuantileType::Type1 => {
255                                let qn = qout[i_q];
256                                assert!(qn >= track.lower_cnt);
257                                assert!(qn <= track.upper_cnt);
258                                let k = match (qn - track.lower_cnt).ceil() {
259                                    0.0 => 1.0,
260                                    k => k,
261                                };
262                                qout[i_q] = track.bin_left + k / (track.bin_count + 1.0) * track.bin_width;
263                            },
264                            // For Type 7 Quantiles, we consider samples at indices:
265                            // 
266                            // k = floor( q*(n-1) + 1 )
267                            // 
268                            // This corresponds to discretized Type=7 quantiles in
269                            // the Hyndman-Fan list (Statistical Computing, 1996).
270                            QuantileType::Type7 => {
271                                let k = qout[i_q] - track.lower_cnt;
272                                qout[i_q] = track.bin_left + k / (track.bin_count + 1.0) * track.bin_width;
273                            },
274                        }
275                    }
276                }
277                Ok(qout)
278            }
279        }
280    }
281    /// Calculate a set of Type=1 quantiles from the sample set.
282    pub fn quantile1<const N: usize>(
283        &self,
284        qs: &[f64; N],
285    ) -> Result<[f64; N], super::Error> {
286        self.quantile_ex(qs, QuantileType::Type1)
287    }
288    /// Calculate a set of Type=7 quantiles from the sample set.
289    pub fn quantile7<const N: usize>(
290        &self,
291        qs: &[f64; N],
292    ) -> Result<[f64; N], super::Error> {
293        self.quantile_ex(qs, QuantileType::Type7)
294    }
295    /// Calculate a set of inverse quantiles from the sample set.
296    pub fn inverse_quantile<const N: usize>(
297        &self,
298        vs: &[f64; N],
299    ) -> Result<[f64; N], super::Error> {
300        let mut out = [f64::NAN; N];
301        if N == 0 {
302            return Ok(out);
303        }
304        for i in 1..N {
305            if vs[i] < vs[i-1] {
306                return Err(super::Error::QuantilesOutOfOrder);
307            }
308        }
309        let total_count = match self.count() {
310            0 => { return Ok(out); },
311            x => { x as f64 },
312        };
313        let mut vs_idx = 0usize;
314        let mut threshold = vs[vs_idx];
315        let mut count_below = 0u128;
316        for (bin, count) in self.bvs.iter() {
317            let (bin_lower, bin_upper, bin_width) = match bin.val {
318                v if v < 0i8 => {
319                    let (b, w) = (bin.absolute_min(), bin.width());
320                    (b-w, b, w)
321                },
322                v if v > 0i8 => {
323                    let (b, w) = (bin.absolute_min(), bin.width());
324                    (b, b+w, w)
325                },
326                _ => (-1e-128,1e-128,0.0),
327            };
328
329            while threshold < bin_lower {
330                out[vs_idx] = count_below as f64 / total_count as f64;
331                vs_idx += 1;
332                if vs_idx >= N {
333                    return Ok(out);
334                }
335                threshold = vs[vs_idx];
336            }
337            while threshold < bin_upper {
338                out[vs_idx] = if bin_width > 0.0 {
339                        let pr = (threshold - bin_lower) / bin_width;
340                        (count_below as f64 + pr * (*count) as f64) / total_count as f64
341                    } else {
342                        count_below as f64 / total_count as f64
343                    };
344                vs_idx += 1;
345                if vs_idx >= N {
346                    return Ok(out);
347                }
348                threshold = vs[vs_idx];
349            }
350            count_below += *count as u128;
351        }
352        while vs_idx < N {
353            out[vs_idx] = 1.0;
354            vs_idx += 1;
355        }
356        Ok(out)
357    }
358}
359
360#[cfg(test)]
361mod tests {
362    use super::*;
363
364    #[test]
365    fn counts() {
366        let h = hist![
367            (-10, 2),
368            (-7, 3),
369            (-0.1, 5),
370            (0, 1),
371            (3),
372            (7, 14),
373            (f64::NAN)
374        ];
375        assert_eq!(h.count_including_nan(), 27);
376        assert_eq!(h.count(), 26);
377        assert_eq!(h.count_below_inclusive(-0.1), 10);
378        assert_eq!(h.count_above_inclusive(-0.1), 21);
379        assert_eq!(h.count_below_exclusive(-0.1), 5);
380        assert_eq!(h.count_above_exclusive(-0.1), 16);
381        assert_eq!(h.count_nearby(-0.1), 5);
382        assert_eq!(h.count_nearby(0.0), 1);
383    }
384    #[test]
385    fn quantiles() -> Result<(), super::super::Error> {
386        let h = hist![0.123, 0, 0.43, 0.41, 0.415, 0.2201, 0.3201, 0.125, 0.13];
387        let qs = [0.0,0.95,0.99,1.0];
388        assert_eq!(h.quantile1(&qs)?, [0.0, 0.435, 0.435, 0.435]);
389        assert_eq!(h.quantile7(&qs)?, [0.0, 0.41666666666666663, 0.41666666666666663, 0.435]);
390        assert_eq!(h.mean(), 0.2443387586038558);
391        assert_eq!(h.sum(), 2.199048827434702);
392
393        let h = hist![1,1];
394        let a = 1.0+0.1*1.0/3.0;
395        let b = 1.0+0.1*2.0/3.0;
396        let qs = [ 0.0, 0.25, 0.5, 0.75, 1.0 ];
397        assert_eq!(h.quantile_ex(&qs, QuantileType::Type1)?, [a,a,a,b,b]);
398        assert_eq!(h.quantile_ex(&qs, QuantileType::Type7)?, [a,a,a,a,b]);
399
400        let h = hist![1];
401        let qs = [ 0.0, 0.25, 0.5, 1.0 ];
402        assert_eq!(h.quantile1(&qs)?, [1.05, 1.05, 1.05, 1.05]);
403        assert_eq!(h.quantile7(&qs)?, [1.05, 1.05, 1.05, 1.05]);
404
405        let h = hist![1,1,1];
406        let qs = [ 0.0, 0.5, 1.0 ];
407        assert_eq!(h.quantile1(&qs)?, [1.025, 1.05, 1.075]);
408        assert_eq!(h.quantile7(&qs)?, [1.025, 1.05, 1.075]);
409
410        let h = hist![1.0, 2.0];
411        assert_eq!(h.quantile1(&[0.5])?, [1.05]);
412        assert_eq!(h.quantile7(&[0.5])?, [1.05]);
413
414        let h = hist![1.0, 1e200];
415        assert_eq!(h.quantile1(&[0.0,1.0])?, [1.05,1.05]);
416        assert_eq!(h.quantile7(&[0.0,1.0])?, [1.05,1.05]);
417        assert_eq!(h.mean(), 1.0476190476190477);
418
419        let h = hist![(1e200,3),(0.0,2),(1e-20,3),1e-10];
420        assert_eq!(h.quantile1(&[0.0,1.0])?, [0.0, 1.05e-10]);
421        assert_eq!(h.quantile7(&[0.0,1.0])?, [0.0, 1.05e-10]);
422
423        let h = hist![0,1];
424        let qs = [0.0,0.1,0.499,0.501,0.9,1.0];
425        assert_eq!(h.quantile1(&qs)?, [0.0,0.0,0.0,1.05,1.05,1.05]);
426        assert_eq!(h.quantile7(&qs)?, [0.0,0.0,0.0,0.0,0.0,1.05]);
427
428        let h = hist![10,100];
429        let qs = [0.0,0.1,0.499,0.501,0.9,1.0];
430        assert_eq!(h.quantile1(&qs)?, [10.5,10.5,10.5,105.0,105.0,105.0]);
431        assert_eq!(h.quantile7(&qs)?, [10.5,10.5,10.5,10.5,10.5,105.0]);
432        Ok(())
433    }
434
435    #[test]
436    fn inverse_quantiles() -> Result<(), super::super::Error> {
437        use approx::*;
438
439        let vs = [-200.0,-100.0,0.0,1.0,1.001,1.1,1.2,2.0,3.0,4.0];
440        let h = hist![];
441        let rq = h.inverse_quantile(&vs)?;
442        assert!(rq.iter().fold(true, |v, x| v && x.is_nan()));
443
444        let h = hist![(-100)];
445        assert_eq!([0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0], h.inverse_quantile(&vs)?);
446
447        let h = hist![0];
448        assert_eq!([0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0], h.inverse_quantile(&vs)?);
449
450        let h = hist![1,2,3];
451        [0.0,0.0,0.0,0.0,(1.0/3.0)/100.0,1.0/3.0,1.0/3.0,1.0/3.0,2.0/3.0,1.0]
452            .iter().zip(h.inverse_quantile(&vs)?
453            .iter()).for_each(|x| {
454                assert_relative_eq!(x.0, x.1, epsilon = 0.00000001);
455            });
456        Ok(())
457    }
458}