quantogram/lib.rs
1use std::cell::RefCell;
2use std::ops::Bound::{Included,Excluded};
3use std::fmt::{Formatter,Debug,Result};
4use float_extras::f64::frexp;
5use std::convert::Into;
6use skiplist::SkipMap;
7mod half_sample_mode;
8use half_sample_mode::{HalfSampleModeCache};
9
10
/// A bin to be used inside a Histogram with a midpoint value and a weight.
#[derive(Copy, Clone, Debug)]
struct HistogramBin {
    /// Midpoint value for the bin.
    pub sample: f64,

    /// Sample weight, which must be positive.
    /// If zero, it indicates a removed bin.
    pub weight: f64
}

impl HistogramBin {
    /// Construct a new HistogramBin from a midpoint value and a weight.
    pub fn new(sample: f64, weight: f64) -> Self {
        HistogramBin { sample, weight }
    }

    /// Panic unless the weight is a non-negative, finite number.
    /// (NANs fail both halves of the check and therefore also panic.)
    fn validate(&self) {
        let weight_is_valid = self.weight >= 0.0 && self.weight.is_finite();
        if !weight_is_valid {
            panic!("HistogramBin weight must be a non-negative, finite number.");
        }
    }

    /// Overwrite this bin's sample and weight with those of another bin,
    /// then validate the new weight.
    pub fn set(&mut self, copy_from: Self) {
        self.sample = copy_from.sample;
        self.weight = copy_from.weight;
        self.validate();
    }
}
45
/// Categorize numbers as negative, zero, positive, NAN or an infinity
enum SampleCategory {
    Negative,
    Zero,
    Positive,
    NegativeInfinity,
    PositiveInfinity,
    NotANumber
}

impl SampleCategory {
    /// Decide which category of number a sample is.
    pub fn categorize(sample: f64) -> Self {
        // Branches are ordered so the most common cases (finite positives,
        // then finite negatives) are decided first, which helps performance.
        if sample > 0.0 {
            if sample == f64::INFINITY {
                SampleCategory::PositiveInfinity
            } else {
                SampleCategory::Positive
            }
        } else if sample < 0.0 {
            if sample == f64::NEG_INFINITY {
                SampleCategory::NegativeInfinity
            } else {
                SampleCategory::Negative
            }
        } else if sample == 0.0 {
            // Both +0.0 and -0.0 compare equal to 0.0.
            SampleCategory::Zero
        } else {
            // Only NAN fails every comparison above.
            SampleCategory::NotANumber
        }
    }
}
70
71/// Provides a weighted Histogram of f64 values for computing approximate quantiles.
72/// This guarantees a configurable maximum absolute relative error
73/// and uses sparse storage to reduce memory usage.
74///
75/// Worst case accuracy defaults to one percent (0.01) absolute relative error.
76/// The error is unbiased, uniform for the entire range of numbers.
77/// The error for quantiles 0 and 1 (the minimum and maximum, respectively) is guaranteed to
78/// be zero, except if either of those values is removed.
79///
80/// If all inserted values are given a weight of one,
81/// this behaves as an unweighted (normal) histogram.
82///
83/// Samples may be added or removed.
84/// However, removing a sample that equals the minimum or maximum will cause
85/// those values to be replaced by the center value of the appropriate extreme histogram bucket.
86///
87/// The full valid range of floats is divided into two levels of buckets.
88/// For the default case with 1% error, here is what that means:
89///
90/// Top. The top level divides the number range into buckets for each power of two and between
91/// positive and negative numbers.
92/// It has a maximum of 508 buckets in two sparse lists.
93/// f64 values can range from 2^-126 to 2^127, thus there are a maximum of
94/// 254 buckets for positive numbers and 254 buckets for negatives.
95/// Bottom. The second level of buckets are in a single sparse list spanning the full range
96/// from the smallest negative number to the largest positive number.
97/// Each power of two range is broken into 35 buckets whose size varies exponentially.
98/// Each bucket is larger than the previous by a factor of 1.02
99/// (or smaller by 1.02, over the range of negative values).
100/// The value of 1.02 was chosen because 1.02^35 = 1.999889553, which differs
101/// from 2 by 0.00011. That means that the 35th bucket for each power of two is slightly larger than
102/// the rest, but not by enough to wreck the error guarantee.
103/// There are a maximum of 1 + 508*35 buckets in this list, or 17,781.
104/// (The "1" is for the zero bucket.)
105///
106/// A bucket is not created until at least one value is added to it.
107/// Removing the last item in a bucket will not cause its memory to be freed;
108/// its weight will be set to zero.
109///
110/// (If you are familiar with the Rule of 70 used to estimate the doubling period
111/// for a given interest rate, it was probably chosen because of this nice property of 1.02.)
112///
113/// The error rate of 0.01 and the bin scale factor of 1.02 are related in this way:
114/// ```text
115///
116/// (1 + error) (1 + 1/101) 102
117/// scale = ------------- = ------------- = ------- = 1.02
118/// (1 - error) (1 - 1/101) 100
119/// ```
120/// So technically, the worst case error is 1/101, or 0.99%, not 1%, but the math is easier
121/// when using 1.02 instead of 1.020202020202 and the error on the last bucket is ideal.
122/// (A smaller error in the last bucket is available for error = 1/176, but the memory requirements
123/// are much higher.)
124///
125/// Usage:
126///
127/// Typical usage (unweighted samples with the default accuracy of 1%):
128///
129/// use quantogram::Quantogram;
130/// let mut q = Quantogram::new();
131/// q.add(10.0);
132/// q.add(40.0);
133/// q.add(20.0);
134/// q.add(30.0);
135/// q.add(50.0);
136///
137/// assert_eq!(q.min().unwrap(), 10.0);
138/// assert_eq!(q.max().unwrap(), 50.0);
139/// assert_eq!(q.mean().unwrap(), 30.0);
140/// assert_eq!(q.median().unwrap(), 30.0);
141/// assert_eq!(q.quantile(0.75).unwrap(), 40.0);
142///
143/// q.remove(10.0);
144/// q.remove(20.0);
145/// assert_eq!(q.mean().unwrap(), 40.0);
146///
147///
148/// Notes:
149/// 1. Coarse bins are for powers of two not some other value because
150/// getting the largest power of two less than or equal to a number calls a fast intrinsic function.
151/// This makes assigning a number to a bin very fast.
152///
153/// 2. When inquiring about a quantile, a value will be returned so long as one
154/// number in range is added. NANs and Infinities will be disregarded.
155/// The NANs and Infinities are available as separate counts.
156///
157/// 3. Unbounded errors are possible in the edge case of a large gap in the middle of the data.
158/// Take the case of the median. If there are an even number of items in the data with a
159/// large gap in the exact middle, then the proper formula for the median is the mean
160/// of the last value below the gap and the first value above the gap.
161/// To correct for this, use the fussy_quantile method.
162/// It will probe for quantiles at Φ + ε and Φ - ε for a small value of ε,
163/// and average the pair that best span the gap.
164/// If the histogram is unweighted (all weights are one) then the value of
165/// ε should be 1/2N, where N is the number of items already added to the Histogram.
166/// If the samples are weighted, not sure what to do.
167///
168/// 4. If one sample is in a bin, the value for the bin will be set to the accurate sample,
169/// not the midpoint of the range for that bin. If a second sample is added, the
170/// bin value will be set to the midpoint. Consequently, bins with one
171/// sample added have no error.
172///
pub struct Quantogram {
    /// Exponential Scaling factor that relates the start value of one fine bin to the next.
    /// value[N+1] = value[N] * growth
    /// The default is 1.02, which yields an absolute relative error of 1%.
    /// This number must be in the range (1,√2].
    /// Smaller values guarantee smaller errors but require more storage space and execution time.
    growth : f64,

    /// Number of bins used to store values per power of two range of numbers.
    /// This must equal log(2, growth), rounded down.
    /// For example, if growth is 1.02, this must be 35.
    /// The larger this value, the more memory is required.
    bins_per_doubling : usize,

    /// Memoize powi(growth, n) for n in [0,bins_per_doubling)
    growth_bin_power_memo : Vec<f64>,

    /// Unweighted count of all insertions minus all removals.
    total_count : usize,

    /// Total weight of all inserted values other than NAN or +/- Infinity.
    total_weight : f64,

    /// Weighted sum of all samples (excluding NANs and infinities).
    /// This permits the exact weighted mean to be calculated.
    weighted_sum : f64,

    /// Running weighted variance accumulator.
    /// NOTE(review): `stddev` computes sqrt(running_variance / total_weight),
    /// which implies this holds the weighted sum of squared deviations (Welford's M2),
    /// not the variance itself — confirm against adjust_aggregates.
    running_variance : f64,

    /// Total weight of all inserted NAN values.
    nan_weight : f64,

    /// Total weight of all inserted +Infinity values.
    /// (Field name carries a historical misspelling of "infinity".)
    plus_ininity_weight : f64,

    /// Total weight of all inserted -Infinity values.
    /// (Field name carries a historical misspelling of "infinity".)
    minus_ininity_weight : f64,

    /// Total weight of all negative values (included in total_weight).
    negative_weight : f64,

    /// Total weight of all zeroes (included in total_weight).
    zero_weight : f64,

    /// Total weight of all positive values (included in total_weight).
    positive_weight : f64,

    /// Maximum added sample that is not +Infinity, initialized to NAN.
    maximum : f64,

    /// Minimum added sample that is not -Infinity, initialized to NAN.
    minimum : f64,

    /// This remains true until a finite number that is not an integer is added.
    /// When true, this causes quantile to round its result.
    only_integers : bool,

    /// To reduce memory usage, force all numbers whose magnitude is
    /// greater than this power of two to be counted as -Infinity (overflow).
    /// Default is -(2^127), the highest supported by IEEE-754 64 bit floats.
    negative_overflow : f64,

    /// To reduce memory usage, force all numbers whose magnitude
    /// is less than this power of two to be counted as zero (underflow).
    /// Default is -(2^-126), the lowest supported by IEEE-754 64 bit floats.
    negative_underflow : f64,

    /// To reduce memory usage, force all numbers whose magnitude is
    /// greater than this power of two to be counted as +Infinity (overflow).
    /// Default is +(2^127), the highest supported by IEEE-754 64 bit floats.
    positive_overflow : f64,

    /// To reduce memory usage, force all numbers whose magnitude
    /// is less than this power of two to be counted as zero (underflow).
    /// Default is +(2^-126), the lowest supported by IEEE-754 64 bit floats.
    positive_underflow : f64,

    /// Bins for powers of two of positive numbers.
    /// The key is the power of two, from -126 to 127.
    ///
    /// Example: If the key is 2, the bin is for all numbers in the range [4,8).
    positive_coarse_bins : SkipMap<isize, HistogramBin>,

    /// Bins for powers of two of negative numbers.
    /// The key is the power of two, from -126 to 127.
    ///
    /// Example: If the key is 3, the bin is for all numbers in the range (-16,-8].
    negative_coarse_bins : SkipMap<isize, HistogramBin>,

    /// Bins for zeroes, positive and negative numbers.
    /// The key is related to the power of two and the position within the 35 bins per
    /// power of two range:
    ///    key = ((power + 126)*35 + bin + 1)*sign
    /// where:
    ///    power = the power of two, from -126 to 127
    ///    bin   = 0 to 34
    ///    sign  = 1 for positive numbers, -1 for negative numbers.
    /// Zeroes are always stored at key = 0.
    fine_bins : SkipMap<isize, HistogramBin>,

    /// Natural Log of the growth factor, precomputed for speed.
    log_of_growth : f64,

    /// Cache of mode candidates for computing the mode.
    mode_cache : ModeCache,

    /// Cache for use with hsm (half-sample mode).
    /// RefCell grants interior mutability so read-only methods can update it.
    hsm_cache : RefCell<HalfSampleModeCache>
}
283
284impl Quantogram {
285
286 // ////////////////////////////
287 // //
288 // Constructors //
289 // //
290 // ////////////////////////////
291
    /// Create a Quantogram with the default error rate of 1%
    /// (growth factor 1.02 with 35 fine bins per power of two).
    pub fn new() -> Self {
        // 1.02^35 ≈ 2, so 35 bins of growth 1.02 tile one power-of-two range.
        let bins = 35;
        let growth = 1.02;
        let mut q = Quantogram {
            growth : growth,
            bins_per_doubling : bins,

            growth_bin_power_memo : Vec::new(),

            total_count : 0,
            total_weight : 0.0,
            weighted_sum : 0.0,
            running_variance : 0.0,
            nan_weight : 0.0,
            plus_ininity_weight : 0.0,
            minus_ininity_weight : 0.0,
            negative_weight : 0.0,
            zero_weight : 0.0,
            positive_weight : 0.0,
            // NAN marks "no finite samples added yet" (see min()/max()).
            maximum : f64::NAN,
            minimum : f64::NAN,

            only_integers : true,

            // Full IEEE-754 range: no underflow/overflow clamping by default.
            negative_underflow : -(2.0_f64.powi(-126)),
            negative_overflow : -(2.0_f64.powi(127)),
            positive_underflow : 2.0_f64.powi(-126),
            positive_overflow : 2.0_f64.powi(127),

            // The SkipMaps do not preallocate bins. This lets it know
            // how many levels to make each skiplist to make it optimal.
            positive_coarse_bins : SkipMap::with_capacity(254),
            negative_coarse_bins : SkipMap::with_capacity(254),
            // NOTE(review): the struct docs state a maximum of 1 + 508*35 = 17,781
            // fine bins, but this capacity hint is 17,641 — confirm which is intended.
            fine_bins : SkipMap::with_capacity(17641),
            log_of_growth : growth.ln(),
            mode_cache : ModeCache::new(),
            hsm_cache : RefCell::new(HalfSampleModeCache::new_default())
        };
        q.memoize_growth_bin_power();
        q
    }
334
335 /// Create a Quantogram where growth and bins are set, and underflow and overflow bounds are set
336 /// to the given powers of two.
337 /// - Inserted values having absolute magnitude below 2^smallest_power will be treated as zeroes.
338 /// - Inserted values having absolute magnitude above 2^largest_power will be treated as infinities.
339 ///
340 /// Note: To ensure that consist values for growth and bins are set, it is advised to use QuantogramBuilder
341 /// instead of directly calling this constructor.
342 pub fn with_configuration(growth: f64, bins: usize, smallest_power: isize, largest_power: isize) -> Self {
343 let valid_power = -126..=127;
344 if !valid_power.contains(&smallest_power) || !valid_power.contains(&largest_power) {
345 panic!("Power of two constraints must be in range [-126,127]");
346 }
347 if smallest_power >= largest_power {
348 panic!("Power of two constraints not given in ascending order");
349 }
350 let mut q = Quantogram {
351 growth : growth,
352 bins_per_doubling : bins,
353
354 growth_bin_power_memo : Vec::new(),
355
356 total_count : 0,
357 total_weight : 0.0,
358 weighted_sum : 0.0,
359 running_variance : 0.0,
360 nan_weight : 0.0,
361 plus_ininity_weight : 0.0,
362 minus_ininity_weight : 0.0,
363 negative_weight : 0.0,
364 zero_weight : 0.0,
365 positive_weight : 0.0,
366 maximum : f64::NAN,
367 minimum : f64::NAN,
368
369 only_integers : true,
370
371 negative_underflow : -(2.0_f64.powi(smallest_power as i32)),
372 negative_overflow : -(2.0_f64.powi(largest_power as i32)),
373 positive_underflow : 2.0_f64.powi(smallest_power as i32),
374 positive_overflow : 2.0_f64.powi(largest_power as i32),
375
376 // The SkipMaps do not preallocate bins. This lets it know
377 // how many levels to make each skiplist to make it optimal.
378 positive_coarse_bins : SkipMap::with_capacity(254),
379 negative_coarse_bins : SkipMap::with_capacity(254),
380 // If the range is constrained, fewer bins will be required,
381 // so the capacity can be reduced.
382 fine_bins : SkipMap::with_capacity(((largest_power - smallest_power + 1)*35 + 1) as usize),
383 log_of_growth : growth.ln(),
384 mode_cache : ModeCache::new(),
385 hsm_cache : RefCell::new(HalfSampleModeCache::new_default())
386 };
387 q.memoize_growth_bin_power();
388 q
389 }
390
391 pub fn replace_hsm_cache(&mut self, new_cache: HalfSampleModeCache) {
392 self.hsm_cache = RefCell::new(new_cache);
393 self.hsm_cache.borrow_mut().clear();
394 }
395
396 // ////////////////////////////
397 // //
398 // Add/Remove Samples //
399 // //
400 // ////////////////////////////
401
    /// Add a sample to the histogram with a weight of one.
    /// (The bin weight returned by add_weighted is deliberately discarded.)
    pub fn add(&mut self, sample: f64) {
        self.add_weighted(sample, 1.0);
    }
406
    /// Remove a sample from the histogram, deducting one from the weight
    /// of the bin it falls into.
    /// If the weight goes negative, the call panics.
    pub fn remove(&mut self, sample: f64) {
        self.add_weighted(sample, -1.0);
    }
412
    /// Add a sample to the histogram with the given weight.
    /// If the weight is positive, the sample is added, otherwise removed.
    /// If the cumulative weight for the bin holding that sample goes negative,
    /// the call panics.
    ///
    /// Returns the updated weight of the fine bin the sample landed in,
    /// or the updated NAN/Infinity total for non-finite samples.
    pub fn add_weighted(&mut self, sample: f64, weight: f64) -> f64 {
        // NOTE(review): the HSM cache records the raw sample even when the weight
        // is negative (a removal) — confirm removals are meant to feed the cache.
        self.hsm_cache.borrow_mut().record(sample);
        // Clamp into the configured range: magnitudes beyond the overflow bound
        // become infinities; below the underflow bound they become zero.
        let bounded_sample = self.over_or_underflow(sample);
        let adjusted_fine_weight;
        // Presumably updates count, total/weighted sums, min/max and variance;
        // its body is outside this view.
        self.adjust_aggregates(bounded_sample, weight);
        match SampleCategory::categorize(bounded_sample) {
            SampleCategory::Positive => {
                self.positive_weight += weight;
                // Fine bin: one of bins_per_doubling bins within this power-of-two range.
                let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
                adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
                self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
                // Coarse bin: keyed by the sample's power of two.
                let (coarse_key, coarse_midpoint) = Self::get_coarse_key_with_midpoint(bounded_sample).unwrap();
                Self::increment_key_weight(&mut self.positive_coarse_bins, coarse_key, coarse_midpoint, sample, weight);
            },
            SampleCategory::Zero => {
                self.zero_weight += weight;
                // Zeroes live only in the fine map (key 0); there is no coarse bin for zero.
                let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
                adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
                self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
            },
            SampleCategory::Negative => {
                self.negative_weight += weight;
                let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
                adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
                self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
                let (coarse_key, coarse_midpoint) = Self::get_coarse_key_with_midpoint(bounded_sample).unwrap();
                Self::increment_key_weight(&mut self.negative_coarse_bins, coarse_key, coarse_midpoint, sample, weight);
            },
            // Non-finite samples are tracked in separate totals,
            // excluded from quantile computations.
            SampleCategory::NotANumber => {
                self.nan_weight += weight;
                adjusted_fine_weight = self.nan_weight;
            },
            SampleCategory::PositiveInfinity => {
                self.plus_ininity_weight += weight;
                adjusted_fine_weight = self.plus_ininity_weight;
            },
            SampleCategory::NegativeInfinity => {
                self.minus_ininity_weight += weight;
                adjusted_fine_weight = self.minus_ininity_weight;
            }
        }
        adjusted_fine_weight
    }
460
461 /// Add many samples to the Quantogram, all having a weight of 1.0.
462 pub fn add_unweighted_samples<'a, S>(&mut self, samples: impl Iterator<Item = &'a S>)
463 where S: 'a + Into<f64> + Copy
464 {
465 for sample in samples {
466 self.add((*sample).into());
467 }
468 }
469
470 // ////////////////////////////
471 // //
472 // Basic Measures //
473 // //
474 // mean min max count //
475 // median variance stddev //
476 // mode hsm //
477 // quantile quantile_at //
478 // fussy_quantile //
479 // finite zero nan //
480 // //
481 // ////////////////////////////
482
483
484 /// Return actual (not estimated) weighted mean of inserted samples (with machine precision),
485 /// or None if no finite values have been inserted.
486 /// Excludes all NANs and Infinities that have been inserted.
487 pub fn mean(&self) -> Option<f64> {
488 if self.total_weight > 0.0 { Some(self.weighted_sum / self.total_weight) }
489 else { None }
490 }
491
492 /// Return actual (not estimated) minimum of inserted samples (with machine precision),
493 /// or None if no finite values have been inserted.
494 /// Excludes all NANs and Infinities that have been inserted.
495 pub fn min(&self) -> Option<f64> {
496 if self.minimum.is_finite() { Some(self.minimum) }
497 else { None }
498 }
499
500 /// Return actual (not estimated) maximum of inserted samples (with machine precision),
501 /// or None if no finite values have been inserted.
502 /// Excludes all NANs and Infinities that have been inserted.
503 pub fn max(&self) -> Option<f64> {
504 if self.maximum.is_finite() { Some(self.maximum) }
505 else { None }
506 }
507
    /// Return count of inserted samples, including NANs and Infinities.
    /// Per the field's definition this is the unweighted tally of all
    /// insertions minus all removals.
    pub fn count(&self) -> usize {
        self.total_count
    }
512
513 /// Return the weighted fraction of values that are finite (not NAN or +/- Infinity)
514 /// as a value in the range [0,1].
515 /// If no values have yet been added, return NAN.
516 pub fn finite(&self) -> f64 {
517 if self.total_count == 0 {
518 f64::NAN
519 }
520 else {
521 let numerator = self.total_weight;
522 let denominator = self.added_weight();
523 numerator / denominator
524 }
525 }
526
527 /// Return the weighted fraction of values that are zero, including NANs and infinities in the basis.
528 /// If no values have yet been added, return NAN.
529 pub fn zero(&self) -> f64 {
530 if self.total_count == 0 {
531 f64::NAN
532 }
533 else {
534 let numerator = self.zero_weight;
535 let denominator = self.added_weight();
536 numerator / denominator
537 }
538 }
539
540 /// Return the weighted fraction of values that are NAN, including infinities in the basis.
541 /// If no values have yet been added, return NAN.
542 pub fn nan(&self) -> f64 {
543 if self.total_count == 0 {
544 f64::NAN
545 }
546 else {
547 let numerator = self.nan_weight;
548 let denominator = self.added_weight();
549 numerator / denominator
550 }
551 }
552
    /// Return an estimate of the median (the 0.5 quantile),
    /// or None if no finite samples have been added.
    pub fn median(&self) -> Option<f64> {
        self.quantile(0.5)
    }
557
    /// Exact weighted variance of all added finite values.
    /// NOTE(review): this returns `running_variance` without dividing by
    /// `total_weight`, whereas `stddev` computes sqrt(running_variance / total_weight).
    /// One of the two is inconsistent — if `running_variance` is Welford's running
    /// sum of squared deviations (M2), this should return M2 / total_weight. Verify
    /// against adjust_aggregates before changing.
    pub fn variance(&self) -> f64 {
        self.running_variance
    }
562
563 /// Exact weighted population standard deviation of all added finite values.
564 pub fn stddev(&self) -> Option<f64> {
565 if self.total_weight <= 0.0 {
566 None
567 }
568 else {
569 Some((self.running_variance / self.total_weight).sqrt())
570 }
571 }
572
573 /// Return an estimate of the mode.
574 /// If no finite samples have been added, returns an empty list.
575 /// If there are multiple modes tieing with the same weight,
576 /// return all of them in a list.
577 /// If all samples in the collection are integers, round the mode.
578 pub fn mode(&self) -> Vec<f64> {
579 let unrounded_mode = self.mode_cache.mode(&self.fine_bins);
580 if self.only_integers {
581 let rounded_mode = unrounded_mode
582 .into_iter()
583 .map(|x| x.round())
584 .collect();
585 rounded_mode
586 }
587 else {
588 unrounded_mode
589 }
590 }
591
592 /// Estimate the half-sample mode.
593 /// More resistant than the mode to outliers and contamination (noise).
594 ///
595 /// Based on the mode estimator by Robertson and Cryer (1974), and an
596 /// algorithm described in "On a Fast, Robust Estimator of the Mode" by David Bickel and Rudolf Fruhwirth.
597 /// That algorithm is applied to raw samples, whereas this is applied
598 /// to already histogrammed values.
599 ///
600 /// Edge case: If fewer than five bins of data are present in the histogram,
601 /// the true mode will be returned, unless the data is multi-moded, in which case
602 /// None will be returned.
603 ///
604 /// NOTE:
605 pub fn hsm(&self) -> Option<f64> {
606 if self.total_weight == 0.0 { None }
607 else {
608 let mut weights: Vec<f64> = Vec::new();
609 let mut samples: Vec<f64> = Vec::new();
610 for bin in self.fine_bins.values() {
611 samples.push(bin.sample);
612 weights.push(bin.weight);
613 }
614
615 // Cache may be updated because of interior mutability.
616 self.hsm_cache.borrow_mut().hsm(&samples, &weights, self.total_weight)
617
618 // If we wanted to bypass the cache, do it this way:
619 // half_sample_mode(&samples, &weights, self.total_weight)
620 }
621 }
622
    /// Estimate the quantile for the inserted data.
    /// For the minimum, use phi = 0.0.
    /// For the median, use phi = 0.5.
    /// For the 90th percentile, phi = 0.9.
    /// For the maximum, use phi = 1.0.
    ///
    /// Returns None if phi is outside [0,1] or there is no positive finite weight.
    ///
    /// Added samples that are NANs or Infinities are excluded from the computation.
    pub fn quantile(&self, phi: f64) -> Option<f64> {
        if phi < 0.0 || phi > 1.0 || self.total_weight <= 0.0 {
            None
        }
        else if phi == 0.0 {
            // Extremes are tracked exactly, so quantiles 0 and 1 have no error.
            self.min()
        }
        else if phi == 1.0 {
            self.max()
        }
        else {
            // Weight that must be accumulated (in ascending sample order) to reach the quantile.
            let mut target_cume_weight = self.total_weight * phi;

            // Narrow down the position of the quantile to negative, zero or positive numbers.
            if target_cume_weight <= self.negative_weight {
                // Quantile falls in the negative range
                // Search two maps: coarse and fine.
                let (coarse_index, remainder) = Self::find_containing_bucket(target_cume_weight, &self.negative_coarse_bins).unwrap();
                // Tricky, because in the negative range we must go from the highest magnitude to the lowest magnitude.
                // Coarse index is therefore the negative of the power of two only for negative numbers.
                let high_fine_index = ((-coarse_index + 126) * (self.bins_per_doubling as isize) + 0 + 1) * -1 + 1; // Exclusive upper bound
                let low_fine_index = high_fine_index - (self.bins_per_doubling as isize) + 1; // Inclusive lower bound
                let (_fine_index, _remainder2, sample_at_quantile) = Self::find_containing_bucket_in_range(low_fine_index, high_fine_index, remainder, &self.fine_bins).unwrap();
                // Previously required a second call to get, which was slower.
                // conditional_round rounds the estimate when all samples so far were integers.
                return Some(self.conditional_round(sample_at_quantile));
            }
            target_cume_weight -= self.negative_weight;
            if target_cume_weight <= self.zero_weight {
                // Quantile falls within the band of zeroes.
                return Some(0.0);
            }
            else {
                target_cume_weight -= self.zero_weight;
            }

            // Quantile falls in the positive range
            let (coarse_index, remainder) = Self::find_containing_bucket(target_cume_weight, &self.positive_coarse_bins).unwrap();
            // For positives, the coarse index IS the power of two; map it to the
            // first fine key of that power's range (see the fine_bins key formula).
            let low_fine_index = ((coarse_index + 126) * (self.bins_per_doubling as isize) + 0 + 1) * 1;
            let high_fine_index = low_fine_index + (self.bins_per_doubling as isize); // Exclusive upper bound
            let (_fine_index, _remainder2, sample_at_quantile)
                = Self::find_containing_bucket_in_range(low_fine_index, high_fine_index, remainder, &self.fine_bins)
                .unwrap();
            return Some(self.conditional_round(sample_at_quantile));
        }
    }
676
    /// Quantile measurement that corrects for issues that occur when there is a large gap in samples
    /// right where the quantile falls.
    ///
    /// For example, the median of { 1,2,3,4,5,95,96,97,98,99 } is 50! This is because there are an even number of items
    /// in the set, so you have to average the middle two values.
    ///
    /// Three quantile calculations will be taken, at Φ - ε, Φ and Φ + ε.
    /// If the difference between the results at the two lower quantiles
    /// differs substantially from the difference between the results at the two higher quantiles,
    /// assume there is a gap and average the middle value and the most extreme value.
    ///
    /// threshold_ratio decides whether a gap exists. If none exists, no averaging occurs.
    /// It is used to compare the deltas between quantiles computed at the desired phi and values of
    /// phi slightly lower or higher. If the deltas are similar, no gap exists.
    /// A value over 2.0 seems sensible, but testing should be done to determine the best value.
    ///
    /// NOTE(review): when neither delta exceeds the threshold (no gap) this returns
    /// None rather than the plain quantile — confirm callers expect that.
    pub fn fussy_quantile(&self, phi: f64, threshold_ratio: f64) -> Option<f64> {
        if phi < 0.0 || phi > 1.0 || self.total_weight <= 0.0 {
            None
        }
        else if phi == 0.0 {
            self.min()
        }
        else if phi == 1.0 {
            self.max()
        }
        else {
            // TODO: This derivation of epsilon is appropriate for unweighted samples (all having weight = 1.0).
            // Unsure what the proper value is for weighted samples.
            let epsilon = 1.0 / (2.0 * self.total_count as f64);

            let q_middle = self.quantile(phi);
            // Too close to either extreme to probe phi ± epsilon; fall back to the plain quantile.
            if phi <= 1.5 * epsilon || phi >= 1.0 - 1.5 * epsilon || q_middle.is_none() {
                q_middle
            }
            else {
                let q_low = self.quantile(phi - epsilon).unwrap();
                let q_high = self.quantile(phi + epsilon).unwrap();
                let q_median = q_middle.unwrap();

                let low_difference = q_median - q_low;
                let high_difference = q_high - q_median;
                if low_difference >= high_difference * threshold_ratio {
                    // Gap is below the quantile: average across it.
                    Some((q_low + q_median)/2.0)
                }
                else if high_difference >= low_difference * threshold_ratio {
                    // Gap is above the quantile: average across it.
                    Some((q_high + q_median)/2.0)
                }
                else {
                    None
                }
            }
        }
    }
730
    /// Get the range of quantiles between which the given value falls,
    /// or None if no finite samples have been added yet or the
    /// queried value is not finite.
    ///
    /// The result is a (low, high) pair of phi values. If the value lands inside
    /// an occupied bin, the two ends differ by that bin's share of the total weight;
    /// if it falls between occupied bins, the two ends are equal.
    pub fn quantile_at(&self, value: f64) -> Option<(f64,f64)> {
        let min_opt = self.min();
        if min_opt.is_none() || !value.is_finite() {
            return None;
        }
        let min = min_opt.unwrap();
        let max = self.max().unwrap();
        match self.count() {
            0 => None,
            // A single sample: min == max, so any other value is strictly outside.
            1 => {
                if value < min { Some((0.0, 0.0)) }
                else if value > min { Some((1.0, 1.0)) }
                else { Some((0.0, 1.0)) }
            },
            _ if value < min => Some((0.0, 0.0)),
            _ if value > max => Some((1.0, 1.0)),
            _ if min == max => Some((0.0, 1.0)),
            _ if value == min => Some((0.0, 0.0)),
            _ if value == max => Some((1.0, 1.0)),
            // Zeroes occupy the band between the negative and positive weights.
            _ if value == 0.0 => {
                let neg = self.negative_weight;
                let sum = self.total_weight;
                let zero = self.zero_weight;
                Some((neg / sum, (neg + zero)/sum))
            },
            _ => {
                // TODO: It is possible to use the coarse skipmaps to narrow in on the location
                // in the fine skipmap faster, hence speedup this calculation.
                // This is a less commonly used method, so optimization is not a priority.
                let (end_key, _fine_low, _fine_midpoint, _fine_high)
                    = self.get_fine_key_with_midpoint(value, self.growth, self.bins_per_doubling).unwrap();
                // For non-negative values, skip the negative and zero bins by starting
                // the cumulative weight at their combined total (positive keys begin at 1).
                let (start_key, start_weight) =
                    if value >= 0.0 { (1_isize, self.negative_weight + self.zero_weight) }
                    else { (isize::MIN , 0.0) };
                let mut cume_weight = start_weight;
                let sum = self.total_weight;
                for (key, bucket) in self.fine_bins.range(Included(&start_key), Included(&isize::MAX)) {
                    let weight = bucket.weight;
                    if *key == end_key {
                        // Value falls within an existing bin, so we must define a range
                        // of phi values.
                        return Some((cume_weight / sum, (cume_weight + weight)/sum));
                    }
                    else if *key > end_key {
                        // Value would go in a bin not yet present in the collection, so we passed by it.
                        // Since value falls between bins, it gets a well defined phi value,
                        // where the bottom and top of range are identical.
                        return Some((cume_weight / sum, cume_weight/sum));
                    }
                    cume_weight += weight;
                }
                // Should never fall through, because the maximum sample always has a bin defined for it,
                // and we already tested if value > max.
                None
            }
        }
    }
791
792 // ////////////////////////////
793 // //
794 // Measures of Dispersion //
795 // //
796 // range q1 q3 iqr //
797 // quartile_deviation //
798 // coeff_of_range //
799 // coeff_of_quartile_dev //
800 // coeff_of_stddev //
801 // coeff_of_variation //
802 // //
803 // ////////////////////////////
804
805 /// Range between the highest and lowest values sampled.
806 pub fn range(&self) -> Option<f64> {
807 match (self.max(), self.min()) {
808 (Some(max), Some(min)) => Some(max - min),
809 _ => None
810 }
811 }
812
    /// Sample at the first quartile (phi = 0.25).
    pub fn q1(&self) -> Option<f64> { self.quantile(0.25) }

    /// Sample at the third quartile (phi = 0.75).
    pub fn q3(&self) -> Option<f64> { self.quantile(0.75) }
818
819 /// Inter quartile range, which is (Q3 - Q1)
820 pub fn iqr(&self) -> Option<f64> {
821 match (self.q3(), self.q1()) {
822 (Some(q3), Some(q1)) => Some(q3 - q1),
823 _ => None
824 }
825 }
826
827 /// Quartile deviation or semi-interquartile range, which is (Q3 - Q1) / 2
828 pub fn quartile_deviation(&self) -> Option<f64> {
829 match (self.q3(), self.q1()) {
830 (Some(q3), Some(q1)) => Some((q3 - q1) / 2.0),
831 _ => None
832 }
833 }
834
835 /// Coefficient of range.
836 ///
837 /// ```text
838 /// max - min
839 /// coeff of range = ------------
840 /// max + min
841 /// ```
842 pub fn coeff_of_range(&self) -> Option<f64> {
843 match (self.max(), self.min()) {
844 (Some(max), Some(min)) => {
845 let sum = max + min;
846 if sum == 0.0 { None }
847 else { Some((max - min) / sum) }
848 },
849 _ => None
850 }
851 }
852
853 /// Coefficient of quartile deviation, which measures the relative spread between
854 /// the first and third quartiles.
855 ///
856 /// ```text
857 /// q3 - q1
858 /// coeff of quartile deviation = ---------
859 /// q3 + q1
860 /// ```
861 pub fn coeff_of_quartile_dev(&self) -> Option<f64> {
862 match (self.q3(), self.q1()) {
863 (Some(q3), Some(q1)) => {
864 let sum = q3 + q1;
865 if sum == 0.0 { None }
866 else { Some((q3 - q1) / sum) }
867 },
868 _ => None
869 }
870 }
871
872 /// Coefficient of standard deviation.
873 ///
874 /// This give the standard deviation divided by the mean.
875 pub fn coeff_of_stddev(&self) -> Option<f64> {
876 match (self.stddev(), self.mean()) {
877 (Some(stddev), Some(mean)) => {
878 if mean == 0.0 { None }
879 else { Some(stddev / mean) }
880 },
881 _ => None
882 }
883 }
884
885 /// Coefficient of variation.
886 ///
887 /// This give the standard deviation divided by the mean expressed as a percentage.
888 pub fn coeff_of_variation(&self) -> Option<f64> {
889 match self.coeff_of_stddev() {
890 Some(coeff) => Some(coeff * 100.0),
891 _ => None
892 }
893 }
894
895 // ////////////////////////////
896 // //
897 // Diagnostics //
898 // //
899 // ////////////////////////////
900
    /// Computes a relative measure of the storage being used by the histogram.
    ///
    /// As bins are dynamically added to the sparse Skipmaps, this increases.
    /// The result counts occupied bins plus a fixed overhead of seven;
    /// it is proportional to (not equal to) bytes consumed.
    pub fn size(&self) -> usize {
        7 // counts of NANs, +/- Infinity, zeroes, three empty Skipmaps are always kept
        + self.positive_coarse_bins.len()
        + self.negative_coarse_bins.len()
        + self.fine_bins.len()
    }
910
911 // ////////////////////////////
912 // //
913 // Utility functions //
914 // //
915 // ////////////////////////////
916
917 /// Compute growth^N for N = 0..bins_per_doubling and store in a Vec
918 /// to memoize the result as an optimization, to avoid having to call powi function.
919 fn memoize_growth_bin_power(&mut self) {
920 for bin in 0..=self.bins_per_doubling {
921 self.growth_bin_power_memo.push(self.growth.powi(bin as i32));
922 }
923 }
924
    /// Approximate the log(x, growth) using a Padé Approximant
    /// and take the floor, yielding an integer.
    /// Good accuracy if x is in the range [1.0,2.0]. (Accuracy actually good up to e = 2.71828.)
    ///
    /// ```text
    /// A good second-order Padé approximant to ln(x) for the region x ∈ [1,e] is:
    ///
    ///               3x² - 3
    ///     ln(x) = -------------
    ///              x² + 4x + 1
    ///
    /// A third-order Padé approximant to ln(x) for the region x ∈ [1,e] is:
    ///
    ///              11x³ + 27x² - 27x - 11
    ///     ln(x) = ------------------------
    ///               3x³ + 27x² + 27x + 3
    /// ```
    /// To transform to use the growth factor as base, we divide by ln(growth).
    ///
    /// See https://math.stackexchange.com/questions/2238880/good-approximation-to-lnx-for-x-in-1-x-e
    /// for how the 2nd order Pade approximant was derived.
    ///
    /// The third-order approximant came from Wolfram Alpha with some algebraic rearrangement
    /// to optimize the number of arithmetic operations:
    /// > PadeApproximant(ln[x], {x, 1, {3, 3}})
    ///
    /// Over the range [1,2], the worst case absolute relative error for the:
    ///    2nd order Padé approximant is 0.12%.
    ///    3rd order Padé approximant is 0.004%. (0.033% for e = 2.71828)
    /// The 3rd order accuracy was needed. In release mode, it is twice as fast as the system log.
    fn pade_approximate_log_floor(&self, x: f64) -> isize {
        let x2 = x * x;
        let x3 = x2 * x;

        // 2nd order:
        // let numerator = 3.0 * (x2 - 1.0);
        // let denominator = self.log_of_growth * (x2 + 4.0*x + 1.0);

        // 3rd order (after dividing all terms by 27 and changing base):
        let numerator = (11.0/27.0) * x3 + x2 - x - (11.0/27.0);
        let denominator = self.log_of_growth * (x3/9.0 + x2 + x + (1.0/9.0));

        let ln_x = numerator / denominator;
        ln_x.floor() as isize
    }
970
971 /// Find the key to the containing bucket by Iterating over buckets in a map
972 /// in increasing order and accumulating the weights until one is reached where the total
973 /// falls within the range belonging to the bucket.
974 /// Also return the amount of weight remaining prior to the final bucket.
975 fn find_containing_bucket(cume_weight: f64, map: &SkipMap<isize, HistogramBin>) -> Option<(isize,f64)> {
976 let mut remaining_weight = cume_weight;
977 for (k, bucket) in map.iter() {
978 let weight = bucket.weight;
979 if weight >= remaining_weight {
980 return Some((*k, remaining_weight));
981 }
982 remaining_weight -= weight;
983 }
984 None
985 }
986
987 /// Find the key to the containing bucket in the given key range
988 /// by Iterating over buckets in a map
989 /// accumulating the weights until one is reached where the total
990 /// falls within the range belonging to the bucket.
991 ///
992 /// Returns a tuple with the key, remaining weight, and the bucket sample value.
993 fn find_containing_bucket_in_range(start_key: isize, end_key: isize, cume_weight: f64, map: &SkipMap<isize, HistogramBin>) -> Option<(isize,f64,f64)> {
994 let mut remaining_weight = cume_weight;
995 for (k, bucket) in map.range(Included(&start_key), Excluded(&end_key)) {
996 let weight = bucket.weight;
997 if weight >= remaining_weight {
998 return Some((*k, remaining_weight, bucket.sample));
999 }
1000 remaining_weight -= weight;
1001 }
1002 None
1003 }
1004
    /// Bookkeeping done when adding a new sample or removing an existing one.
    ///
    /// Updates min/max, the sample count, total weight, weighted sum,
    /// the running variance accumulator, and the all-integers flag.
    fn adjust_aggregates(&mut self, sample: f64, weight: f64) {
        self.adjust_minimum(sample);
        self.adjust_maximum(sample);
        let weight_is_finite = weight.is_finite();
        let sample_is_finite = sample.is_finite();
        if weight < 0.0 && weight_is_finite {
            // Negative weights are interpreted as a removal.
            self.total_count -= 1;
        }
        else {
            self.total_count += 1;
        }
        if sample_is_finite && weight_is_finite {
            // Weighted Welford-style incremental variance: the accumulator grows by
            // weight * (x - mean_before) * (x - mean_after), so the mean must be
            // sampled both before and after the weight/sum update. Order matters here.
            let previous_mean = self.mean().unwrap_or_default();
            self.total_weight += weight;
            self.weighted_sum += weight * sample;
            let current_mean = self.mean().unwrap_or_default();
            self.running_variance += weight * (sample - previous_mean) * (sample - current_mean);
        }

        if sample_is_finite && sample.fract() != 0.0 {
            // Once we add numbers with fractions, assume that the quantile can have a fraction.
            self.only_integers = false;
        }
    }
1031
1032 /// Restrict a value based on preset overflow and underflow values.
1033 /// Values that underflow are set to zero.
1034 /// Values that overflow are set to +/- infinity.
1035 fn over_or_underflow(&self, sample: f64) -> f64 {
1036 if sample > self.negative_underflow && sample < self.positive_underflow {
1037 0.0
1038 }
1039 else if sample > self.positive_overflow {
1040 f64::INFINITY
1041 }
1042 else if sample < self.negative_overflow {
1043 f64::NEG_INFINITY
1044 }
1045 else {
1046 sample
1047 }
1048 }
1049
1050 /// If the new value is finite and less than the minimum
1051 /// (or the minimum is unset, hence a NAN), update the minimum.
1052 fn adjust_minimum(&mut self, sample: f64) {
1053 if self.minimum.is_nan() {
1054 if sample.is_finite() {
1055 self.minimum = sample;
1056 }
1057 }
1058 else if sample < self.minimum && sample.is_finite() {
1059 self.minimum = sample;
1060 }
1061 }
1062
1063 /// If the new value is finite and greater than the maximum
1064 /// (or the maximum is unset, hence a NAN), update the maximum.
1065 fn adjust_maximum(&mut self, sample: f64) {
1066 if self.maximum.is_nan() {
1067 if sample.is_finite() {
1068 self.maximum = sample;
1069 }
1070 }
1071 else if sample > self.maximum && sample.is_finite() {
1072 self.maximum = sample;
1073 }
1074 }
1075
    /// Offer a bin as a candidate for the mode, updating the mode cache
    /// when the bin's weight qualifies it.
    fn adjust_mode(
        &mut self,
        bin_key: isize,
        bin_low_value: f64,
        bin_midpoint: f64,
        bin_high_value: f64,
        bin_weight: f64) {
        // TODO: Add a configuration parameter here and in QuantogramBuilder
        // to enable/disable mode calculations.
        // It is a drag on performance and not all callers need the mode.

        // NOTE: Do not update the mode if the weight is 1.
        // Otherwise if all values are distinct (common in unit tests)
        // then you store a huge list of modes. This kills performance, too.
        // (A NAN weight also fails the > 0.0 test and is skipped.)
        if bin_midpoint.is_finite() && bin_weight > 0.0 && bin_weight != 1.0 {
            self.mode_cache.update(bin_key, bin_low_value, bin_high_value, bin_weight);
        }
    }
1094
1095 /// Update bin weight for given key, or create new bin for key if none exists.
1096 /// Returns the updated weight.
1097 fn increment_key_weight(map: &mut SkipMap<isize, HistogramBin>, key: isize, bucket_sample: f64, accurate_sample: f64, added_weight: f64) -> f64 {
1098 match map.get_mut(&key) {
1099 Some(bucket) => {
1100 // When updating an existing bucket, use the bucket_sample value, which is the midpoint value,
1101 // unless the previous bucket used the same value as the accurate value.
1102 let new_bucket:HistogramBin;
1103 if bucket.sample == accurate_sample {
1104 new_bucket = HistogramBin::new(accurate_sample, bucket.weight + added_weight);
1105 bucket.set(new_bucket);
1106 }
1107 else {
1108 new_bucket = HistogramBin::new(bucket_sample, bucket.weight + added_weight);
1109 bucket.set(new_bucket);
1110 }
1111 new_bucket.weight
1112 },
1113 None => {
1114 // When creating a new bucket, use the accurate_sample.
1115 // This means that bins with a single sample are 100% accurate,
1116 // but when the second sample is added, shift to the midpoint for the bin.
1117 map.insert(key, HistogramBin::new(accurate_sample, added_weight));
1118 added_weight
1119 }
1120 }
1121 }
1122
1123 /// If only integers have been added to the histogram, round the value.
1124 fn conditional_round(&self, sample: f64) -> f64 {
1125 if self.only_integers && sample.is_finite() {
1126 sample.round()
1127 }
1128 else {
1129 sample
1130 }
1131 }
1132
    /// Sum of weights of all samples added, including NAN and infinities.
    ///
    /// total_weight only accumulates finite samples with finite weights
    /// (see adjust_aggregates), so the special-category weights are added back here.
    fn added_weight(&self) -> f64 {
        self.total_weight + self.nan_weight + self.plus_ininity_weight + self.minus_ininity_weight
    }
1137
1138
1139 /// Exponent on the largest power of two less than or equal to the magnitude of the given number,
1140 /// or None if a NAN, Infinity or Zero.
1141 ///
1142 /// ```
1143 /// use quantogram::Quantogram;
1144 /// assert_eq!(Quantogram::power_of_two(5.0).unwrap(), 2);
1145 /// assert_eq!(Quantogram::power_of_two(4.0).unwrap(), 2);
1146 /// assert_eq!(Quantogram::power_of_two(1.5).unwrap(), 0);
1147 /// assert_eq!(Quantogram::power_of_two(-8.5).unwrap(), 3);
1148 /// assert_eq!(Quantogram::power_of_two(0.0), None);
1149 /// assert_eq!(Quantogram::power_of_two(0.25).unwrap(), -2);
1150 /// ```
1151 pub fn power_of_two(sample: f64) -> Option<isize> {
1152 if sample.is_finite() && sample != 0.0 {
1153 let (_mantissa, exponent) = frexp(sample);
1154 Some(exponent - 1)
1155 }
1156 else {
1157 None
1158 }
1159 }
1160
1161 /// Break a sample into three parts:
1162 /// - sign is -1 for negative numbers or +1 for positive numbers
1163 /// - power is exponent on the largest power of two less than or equal to the absolute value of the number.
1164 /// - remainder is the mantissa multiplied by two, which is a number in the semi-open range [1.0,2.0)
1165 ///
1166 /// Returns None if the sample is not finite or is zero.
1167 fn sign_power_remainder(sample: f64) -> Option<(isize, isize, f64)> {
1168 if sample.is_finite() && sample != 0.0 {
1169 let (mantissa, exponent) = frexp(sample);
1170 if sample > 0.0 {
1171 Some((1, exponent - 1, mantissa * 2.0))
1172 }
1173 else {
1174 Some((-1, exponent - 1, mantissa.abs() * 2.0))
1175 }
1176 }
1177 else {
1178 None
1179 }
1180 }
1181
1182 /// Raise two to the given power.
1183 #[inline(always)]
1184 fn two_power(power: isize) -> f64 {
1185 match power {
1186 0..=63 => (1_usize << power) as f64,
1187 64..=127 => (1_u128 << power) as f64,
1188 -63..=-1 => 1.0 / ((1_usize << -power) as f64),
1189 -126..=-64 => 1.0 / ((1_u128 << -power) as f64),
1190 _ => f64::NAN
1191 }
1192 }
1193
1194 /// Get the key that maps a sample to its coarse bin.
1195 /// For positive numbers, this is the floor of the log base 2 of the sample.
1196 /// For negative numbers, negate the power of two, so that negative numbers
1197 /// sort from low to high. (A negative number with a higher power of two is "smaller"
1198 /// or more negative than one with a lower power of two.)
1199 ///
1200 /// Returns a Tuple with the bin key and the midpoint value for the bin.
1201 fn get_coarse_key_with_midpoint(sample: f64) -> Option<(isize,f64)> {
1202 match Self::sign_power_remainder(sample) {
1203 Some((sign, power, _remainder)) => {
1204 let low = Self::two_power(power); // 2.0_f64.powi(power as i32);
1205 let high = low * 2.0;
1206 let bucket_middle = (sign as f64) * (low + high) / 2.0;
1207 // Multiply by the sign because negative numbers with large exponents sort before those with small exponents
1208 Some((sign * power, bucket_middle))
1209 },
1210 _ => None
1211 }
1212 }
1213
1214 /// Get the key that maps a number to its fine histogram bin given the desired exponential growth factor.
1215 /// If the growth factor is 1.02, there will be 35 bins per power of two.
1216 /// Thus bins_per_doubling must match the growth_factor:
1217 ///
1218 /// ```text
1219 /// growth^bins_per_doubling ~ 2.0.
1220 /// ```
1221 ///
1222 /// Bins are defined for zeroes, and finite positive and negative numbers.
1223 /// The key is related to the power of two and the position within the 35 bins per
1224 /// power of two range:
1225 /// ```text
1226 /// key = ((power + 126)*35 + bin + 1)*sign
1227 /// ```
1228 /// where:
1229 /// ```text
1230 /// power = the power of two, from -126 to 127
1231 /// bin = 0 to 34 (for growth = 1.02)
1232 /// sign = 1 for positive numbers, -1 for negative numbers.
1233 /// Zeroes are always stored at key = 0.
1234 /// ```
1235 /// Returns a Tuple with the bin key and the low, mid, and high point values for the bin.
1236 fn get_fine_key_with_midpoint(&self, sample: f64, growth: f64, bins_per_doubling: usize) -> Option<(isize,f64,f64,f64)> {
1237 match Self::sign_power_remainder(sample) {
1238 Some((sign, power, remainder)) => {
1239 // Use of system log is more accurate but slower.
1240 // This inaccuracy adds a slight bias towards zero.
1241 // Rare samples will be put into the next lower bin. Since that bin is slightly narrower,
1242 // this may actually reduce the error slightly as the estimate is pushed towards a closer bin center.
1243 // let bin = remainder.log(growth).floor() as isize;
1244 let bin = self.pade_approximate_log_floor(remainder);
1245
1246 let key = ((power + 126) * (bins_per_doubling as isize) + bin + 1) * sign;
1247 let two_power = Self::two_power(power); // f64::powi(2.0, power as i32);
1248 let bucket_low = two_power * self.growth_bin_power_memo[bin as usize]; // Memo of: f64::powi(growth, bin as i32);
1249 let bucket_high = bucket_low * growth;
1250 let bucket_middle = (sign as f64) * (bucket_low + bucket_high) / 2.0;
1251 Some((key, bucket_low, bucket_middle, bucket_high))
1252 },
1253 _ => {
1254 if sample == 0.0 {
1255 Some((0, 0.0, 0.0, 0.0))
1256 }
1257 else {
1258 None
1259 }
1260 }
1261 }
1262 }
1263
1264}
1265
impl Debug for Quantogram {
    /// Multi-line diagnostic dump of the configuration, aggregate statistics,
    /// and every occupied coarse and fine bin.
    fn fmt(&self, f: &mut Formatter) -> Result {
        // Each bin map is rendered as "key->sample:weight" lines inside parentheses.
        let mut fine = "(".to_owned();
        for (k, bucket) in self.fine_bins.iter() {
            fine = fine + &format!("\n  {}->{:.4}:{}", k, bucket.sample, bucket.weight);
        }
        fine = fine + "\n )";

        let mut coarse_plus = "(".to_owned();
        for (k, bucket) in self.positive_coarse_bins.iter() {
            coarse_plus = coarse_plus + &format!("\n  {}->{:.4}:{}", k, bucket.sample, bucket.weight);
        }
        coarse_plus = coarse_plus + "\n )";

        let mut coarse_minus = "(".to_owned();
        for (k, bucket) in self.negative_coarse_bins.iter() {
            coarse_minus = coarse_minus + &format!("\n  {}->{:.4}:{}", k, bucket.sample, bucket.weight);
        }
        coarse_minus = coarse_minus + "\n )";

        write!(f, "Quantogram.
    growth = {}, bins per doubling = {}
    total count = {}, weight = {}, sum = {}, variance = {:.3},
    NAN = {}, -Inf = {}, +Inf = {}, Integers? = {}
    - = {}, 0 = {}, + = {}
    min = {}, mean = {:?}, max = {}
    under/overflow = {}/{}
    coarse bins = 
      Minus {}
      Plus {}
    fine bins = {}
    ",
            self.growth, self.bins_per_doubling,
            self.total_count, self.total_weight, self.weighted_sum,
            self.running_variance,
            self.nan_weight, self.minus_ininity_weight, self.plus_ininity_weight, self.only_integers,
            self.negative_weight, self.zero_weight, self.positive_weight,
            self.minimum, self.mean(), self.maximum,
            self.positive_underflow.log2(), self.positive_overflow.log2(),
            coarse_minus,
            coarse_plus,
            fine
        )
    }
}
1311
/// A fluent API Builder for Quantograms.
///
/// If error, growth or bins_per_doubling are changed, the other two
/// are modified to be consistent with it. If error or growth imply a non-integral
/// value for bins_per_doubling, take the ceiling of the derived bins_per_ceiling
/// and rederive the other two from that, to satisfy a minimum guarantee.
///
/// No values may cause bins_per_doubling to exceed 1000, hence the error
/// may not be set lower than 0.0347%.
///
/// Example of creating a Quantogram with a maximum of 2% absolute relative error:
///
/// ```
/// use quantogram::QuantogramBuilder;
/// let q = QuantogramBuilder::new().with_error(0.02).build();
/// ```
///
/// This is the number of bins used for various error rates:
/// ```text
///   Bins  Error Rate
///   ----  ----------
///     2    17.2 %
///     3    11.5 %
///     4     8.6 %
///     5     6.9 %
///     7     4.9 %
///    12     2.9 %
///    18     1.9 %
///    35     1.0 %
///    70     0.5 %
///   139     0.25%
///   347     0.10%
///   694     0.05%
///   867     0.04%
///  1000     0.0347%
/// ```
#[derive(Copy, Clone, Debug)]
pub struct QuantogramBuilder {
    /// Desired absolute relative error rate.
    /// The valid range is a fraction in the range [0.00035,0.17].
    /// Defaults to approximately 0.01.
    /// Altering error changes growth and bins_per_doubling to be consistent.
    ///
    /// This is related to the number of bins per doubling (b)
    /// or growth via these formulas:
    /// ```text
    ///            1/b    
    ///           2    - 1      growth - 1
    ///   error = --------- =  ------------
    ///            1/b          growth + 1
    ///           2    + 1
    /// ```
    error : f64,

    /// Number of bins used to store values per power of two range of numbers.
    /// The valid range is [2,1000].
    /// The larger this value, the more memory is required.
    /// Defaults to 35 and must exceed one.
    /// Altering this changes error and growth to be consistent.
    ///
    /// The number of bins (b) is related to the growth factor
    /// via this formula:
    /// ```text
    ///              1/b
    ///   growth = 2
    /// ```
    bins_per_doubling : usize,

    /// Exponential scale factor relating start value of one bin to the next.
    /// The valid range is [1.00069,1.4143].
    /// Defaults to approximately 1.02.
    /// Altering growth changes error and bins_per_doubling to be consistent.
    /// ```text
    ///         b
    ///   growth  = 2
    /// ```
    growth : f64,

    /// A sample whose magnitude is less than this power of two will underflow and be considered a zero.
    /// Must be in the range [-126,127] but also be less than largest_power.
    /// Defaults to -126.
    smallest_power: isize,

    /// A sample whose magnitude is more than this power of two will overflow and be considered +/- Infinity.
    /// Must be in the range [-126,127] but also be more than smallest_power.
    /// Defaults to +127.
    largest_power: isize,

    /// Optional cache for half-sample mode estimation; a cleared clone of the
    /// cache supplied to with_hsm_cache is installed into the built Quantogram.
    hsm_cache: Option<HalfSampleModeCache>
}
1402
1403impl QuantogramBuilder {
1404 /// Create a new builder that defaults to an error rate of 1%
1405 /// with 35 bins per doubling and a growth factor of 1.02.
1406 pub fn new() -> Self {
1407 QuantogramBuilder {
1408 error: 0.01,
1409 bins_per_doubling: 35,
1410 growth: 1.02,
1411 smallest_power: -126,
1412 largest_power: 127,
1413 hsm_cache: None
1414 }
1415 }
1416
1417 /// Build a Quantogram using the collected configuration values.
1418 pub fn build(self) -> Quantogram {
1419 let mut q = Quantogram::with_configuration(self.growth, self.bins_per_doubling, self.smallest_power, self.largest_power);
1420 match self.hsm_cache {
1421 Some(cache) => { q.replace_hsm_cache(cache); },
1422 None => (),
1423 }
1424 q
1425 }
1426
1427 pub fn with_hsm_cache(mut self, hsm_cache: &HalfSampleModeCache)-> Self {
1428 let mut cleared_cache = hsm_cache.clone();
1429 cleared_cache.clear();
1430 self.hsm_cache = Some(cleared_cache);
1431 self
1432 }
1433
1434 /// Configure the underflow of samples.
1435 /// A sample whose magnitude has a power of two less than the given values will be set to zero.
1436 ///
1437 /// Example: If you only need to compute quantiles to two decimal places:
1438 /// ```text
1439 /// set power = -7, since 2^-7 = 1/128 < 0.01.
1440 /// ```
1441 pub fn with_smallest_power(mut self, power: isize)-> Self {
1442 self.smallest_power =
1443 if power < -126 { -126 }
1444 else if power >= 127 { 127 }
1445 else { power };
1446 self
1447 }
1448
1449 /// Configure the overflow of samples.
1450 /// A sample whose magnitude has a power of two greater than the given values will be set to +/- Infinity.
1451 ///
1452 /// Example: If you only need to study numbers below a thousand,
1453 /// set power = 10, since 2^10 = 1024 > 1000.
1454 pub fn with_largest_power(mut self, power: isize)-> Self {
1455 self.largest_power =
1456 if power <= -126 { -125 }
1457 else if power > 127 { 127 }
1458 else { power };
1459 self
1460 }
1461
1462 /// Configure to target the given maximum absolute relative error.
1463 pub fn with_error(mut self, error: f64) -> Self {
1464 (&mut self).set_from_error(error);
1465 self
1466 }
1467
1468 /// Configure to target the given growth factor from one bucket to the next.
1469 /// This will be adjusted to the nearest value that is an Nth root of 2.
1470 pub fn with_growth(mut self, growth: f64) -> Self {
1471 (&mut self).set_from_growth(growth);
1472 self
1473 }
1474
1475 /// Set bins_per_doubling, error, and growth to be consistent with the given value.
1476 /// Memory usage is proportional to this value, so this
1477 /// is the way to directly control memory usage.
1478 /// A bins_per_doubling value of 35 yields 1% error.
1479 pub fn with_bins_per_doubling(mut self, bins: usize) -> Self {
1480 (&mut self).set_from_bins(bins);
1481 self
1482 }
1483
1484 /// Set bins_per_doubling, error, and growth to be consistent with the given error value.
1485 /// This is the way to directly control error rate.
1486 /// A 1% error corresponds to a bins_per_doubling value of 35.
1487 fn set_from_error(&mut self, error: f64) {
1488 let adjusted_error =
1489 if error < 0.00035 { 0.00035 }
1490 else if error > 0.17 { 0.17 }
1491 else { error };
1492 let growth = (1.0 + adjusted_error) / (1.0 - adjusted_error);
1493 let fractional_bins = 1.0 / growth.log2();
1494 // Must make the number of bins an integer, then correct the other values.
1495 self.bins_per_doubling = fractional_bins.ceil() as usize;
1496 self.growth = 2.0_f64.powf(1.0/(self.bins_per_doubling as f64));
1497 self.error = (self.growth - 1.0) / (self.growth + 1.0);
1498 }
1499
1500 fn set_from_growth(&mut self, growth: f64) {
1501 let adjusted_growth =
1502 if growth < 1.00069 { 1.00069 }
1503 else if growth > 1.4143 { 1.4143 }
1504 else { growth };
1505 let fractional_bins = 1.0 / adjusted_growth.log2();
1506 // Must make the number of bins an integer, then correct the other values.
1507 self.bins_per_doubling = fractional_bins.ceil() as usize;
1508 self.growth = 2.0_f64.powf(1.0/(self.bins_per_doubling as f64));
1509 self.error = (self.growth - 1.0) / (self.growth + 1.0);
1510 }
1511
1512 /// Set bins_per_doubling, error, and growth to be consistent with the given bins value.
1513 /// This is the way to directly control memory usage.
1514 /// The maximum possible memory usage is linearly proportional to bins.
1515 /// Actual data usually has much redundancy, autocorrelation, and may obey Zipf's law,
1516 /// which will cause much less than the maximum storage in practice.
1517 ///
1518 /// A bins_per_doubling value of 35 corresponds to 1% error.
1519 ///
1520 /// Bins will be constrained to be in the range [2,1000].
1521 fn set_from_bins(&mut self, bins: usize) {
1522 let adjusted_bins =
1523 if bins < 2 { 2 }
1524 else if bins > 1000 { 1000 }
1525 else { bins };
1526 self.bins_per_doubling = adjusted_bins;
1527 self.growth = 2.0_f64.powf(1.0/(self.bins_per_doubling as f64));
1528 self.error = (self.growth - 1.0) / (self.growth + 1.0);
1529 }
1530}
1531
/// Information about one histogram bin needed to estimate the mode:
/// its key, value range and accumulated weight.
#[derive(Copy, Clone, Debug)]
struct BinInfo {
    /// Key of the bin within its SkipMap.
    pub bin_key : isize,
    /// Low end of the bin's value range.
    pub bin_low_value : f64,
    /// High end of the bin's value range.
    pub bin_high_value : f64,
    /// Total weight accumulated in the bin.
    pub bin_weight : f64
}
1539
1540impl BinInfo {
1541 pub fn bin_width(&self) -> f64 {
1542 self.bin_high_value - self.bin_low_value
1543 }
1544
1545 /// Fetch the weight of the indicated bin, or zero if it does not exist.
1546 pub fn get_key_weight(map: &SkipMap<isize, HistogramBin>, key: isize) -> f64 {
1547 match map.get(&key) {
1548 Some(bucket) => bucket.weight,
1549 None => 0.0
1550 }
1551 }
1552
1553 /// Estimate the mode of grouped data using the weights of
1554 /// the bins immediately below and above.
1555 /// If those bins are empty, it just takes the midpoint of this bin.
1556 ///
1557 /// ```text
1558 /// / f1 - f0 \
1559 /// Mode = L + |-----------------| h
1560 /// \ 2f1 - f0 - f2 /
1561 ///
1562 /// where:
1563 /// L is the lower limit of the modal class
1564 /// h is the size of the class interval
1565 /// f1 is the frequency of the modal class
1566 /// f0 is the frequency of the class preceding the modal class
1567 /// f2 is the frequency of the class succeeding the modal class
1568 /// ```
1569 pub fn mode_of_grouped_data(&self, bins: &SkipMap<isize, HistogramBin>) -> f64 {
1570 let f1 = self.bin_weight;
1571 let f0 = Self::get_key_weight(bins, self.bin_key - 1);
1572 let f2 = Self::get_key_weight(bins, self.bin_key + 1);
1573 let h = self.bin_width();
1574 let l = self.bin_low_value;
1575 let mode = l + h*(f1 - f0)/(2.0 * f1 - f0 - f2);
1576 mode
1577 }
1578}
1579
/// Maintain a cache of information from which one can estimate
/// the current values for Mode.
struct ModeCache {
    /// All bins currently tied for the heaviest weight (the modal classes).
    modal_classes : Vec<BinInfo>,
    /// Weight held by each of the modal classes.
    weight : f64
}
1586
1587impl ModeCache {
1588 pub fn new() -> Self {
1589 ModeCache {
1590 modal_classes: Vec::new(),
1591 weight : 0.0
1592 }
1593 }
1594
1595 /// Attempt to update the ModeCache with a new candidate for Mode.
1596 /// Return false if no change was made to the mode estimate.
1597 pub fn update(&mut self, bin_key : isize,
1598 bin_low_value : f64,
1599 bin_high_value : f64,
1600 bin_weight : f64) -> bool {
1601 let bin = BinInfo {
1602 bin_key : bin_key,
1603 bin_low_value : bin_low_value,
1604 bin_high_value : bin_high_value,
1605 bin_weight : bin_weight
1606 };
1607 if self.weight < bin_weight {
1608 // Replace old mode with a new mode.
1609 self.modal_classes = vec![bin];
1610 self.weight = bin_weight;
1611 true
1612 }
1613 else if self.weight == bin_weight {
1614 // A multi-modal collection of samples.
1615 self.modal_classes.push(bin);
1616 true
1617 }
1618 else {
1619 false
1620 }
1621 }
1622
1623 /// Return a list of all modes among the samples.
1624 /// If no samples have been added, the list is empty.
1625 pub fn mode(&self, bins: &SkipMap<isize, HistogramBin>) -> Vec<f64> {
1626 let mut modes = Vec::new();
1627 for bin in self.modal_classes.iter() {
1628 let mode_estimate = bin.mode_of_grouped_data(bins);
1629 modes.push(mode_estimate);
1630 }
1631 modes
1632 }
1633}
1634
1635#[cfg(test)]
1636mod tests {
1637 // Note this useful idiom: importing names from outer (for mod tests) scope.
1638 use super::*;
1639 use std::time::{Instant};
1640
1641 fn assert_close(x: f64, y: f64, epsilon: f64) {
1642 let delta = (x - y).abs();
1643 assert!(delta <= epsilon);
1644 }
1645
1646 /// Test data with a gap.
1647 fn gapped_quantogram() -> Quantogram {
1648 let mut q = Quantogram::new();
1649 let data = vec![1.0,2.0,3.0,4.0,5.0,95.0,96.0,97.0,98.0,99.0];
1650 q.add_unweighted_samples(data.iter());
1651 q
1652 }
1653
1654 fn randomish(bottom: f64, top:f64, count: usize) -> Vec<f64> {
1655 let mut data : Vec<f64> = Vec::new();
1656 let range = top - bottom;
1657 let phi: f64 = (1.0 + 5.0_f64.sqrt()) / 2.0;
1658 for i in 0..count {
1659 // Pseudo random Weyl sequence based on Golden ratio.
1660 let pseudo_rand = ((i as f64) * phi) % 1.0_f64;
1661 // By squaring pseudo_rand we get a non-uniform distribution,
1662 // which is more interesting.
1663 let sample = bottom + pseudo_rand * pseudo_rand * range;
1664 data.push(sample);
1665 }
1666 data
1667 }
1668
1669 fn assert_median(data: &mut Vec<f64>, accuracy: f64) {
1670 let mut q = Quantogram::new();
1671 q.add_unweighted_samples(data.iter());
1672 let actual_median = q.median().unwrap();
1673 data.sort_by(|a, b| a.partial_cmp(b).unwrap());
1674
1675 let expected_median = data[data.len() / 2]; // Ignore adjustment for data with an even number of elements.
1676 let abs_rel_error = (expected_median - actual_median) / actual_median;
1677 assert!(abs_rel_error <= accuracy);
1678 }
1679
1680 /// Add all data and then query a single median.
1681 fn profile_median(data: &mut Vec<f64>) -> (u128,u128) {
1682 let t1 = Instant::now();
1683 let mut q = Quantogram::new();
1684 q.add_unweighted_samples(data.iter());
1685 let _actual_median = q.median().unwrap();
1686 let quantogram_time = t1.elapsed().as_micros();
1687
1688 let t2 = Instant::now();
1689 let mut data_copy = Vec::new();
1690 for sample in data {
1691 data_copy.push(*sample);
1692 }
1693 data_copy.sort_by(|a, b| a.partial_cmp(b).unwrap());
1694 let _expected_median = data_copy[data_copy.len() / 2]; // Ignore adjustment for data with an even number of elements.
1695 let naive_time = t2.elapsed().as_micros();
1696
1697 (quantogram_time, naive_time)
1698 }
1699
1700 /// Add all data and perform a median after each insertion.
1701 fn profile_many_medians(data: &mut Vec<f64>) -> (u128,u128) {
1702 let t1 = Instant::now();
1703 let mut q = Quantogram::new();
1704 for sample in data.iter() {
1705 q.add(*sample);
1706 let _actual_median = q.median().unwrap();
1707 }
1708 let quantogram_time = t1.elapsed().as_micros();
1709
1710 let t2 = Instant::now();
1711 let mut data_copy = Vec::new();
1712 for sample in data {
1713 data_copy.push(*sample);
1714 data_copy.sort_by(|a, b| a.partial_cmp(b).unwrap());
1715 let _expected_median = data_copy[data_copy.len() / 2]; // Ignore adjustment for data with an even number of elements.
1716 }
1717 let naive_time = t2.elapsed().as_micros();
1718
1719 (quantogram_time, naive_time)
1720 }
1721
/// Relative error `|expected - actual| / |expected|`, always non-negative.
///
/// The previous version divided by the signed `expected`, which produced a
/// negative "absolute" error whenever `expected` was negative, making
/// comparisons like `err <= tolerance` pass vacuously. Taking the absolute
/// value of the denominator fixes that; results for positive `expected`
/// values are unchanged. Note: returns a non-finite value when `expected`
/// is 0.0, as before.
fn absolute_relative_error(expected: f64, actual: f64) -> f64 {
    (expected - actual).abs() / expected.abs()
}
1725
#[test]
fn test_min() {
    // The smallest sample in the gapped fixture is 1.0.
    let q = gapped_quantogram();
    assert_eq!(q.min(), Some(1.0));
}
1728
#[test]
fn test_max() {
    // The largest sample in the gapped fixture is 99.0.
    let q = gapped_quantogram();
    assert_eq!(q.max(), Some(99.0));
}
1731
#[test]
fn test_mean() {
    // The gapped fixture's samples average exactly 50.
    let q = gapped_quantogram();
    assert_eq!(q.mean(), Some(50.0));
}
1734
#[test]
fn test_mode_unimodal() {
    // 4.0 occurs three times, more often than any other sample.
    let samples = [1.0, 2.0, 3.0, 3.0, 4.0, 4.0, 4.0, 5.0, 6.0];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());
    assert_eq!(q.mode(), vec![4.0]);
}
1742
#[test]
fn test_mode_multimodal() {
    // 3.0 and 4.0 each occur three times, so both are modes.
    let samples = [1.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 5.0, 6.0];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());
    assert_eq!(q.mode(), vec![3.0, 4.0]);
}
1750
/// Verify that a false outlier mode at zero is accepted by mode but rejected by hsm.
#[test]
fn test_hsm() {
    // Build the same multiset as (value, repeat-count) pairs: zero occurs
    // most often (7 times) but sits apart from the dense cluster near 10.
    let counts: [(f64, usize); 15] = [
        (0.0, 7), (1.0, 1), (2.0, 2), (3.0, 1), (4.0, 3),
        (5.0, 2), (6.0, 4), (7.0, 3), (8.0, 4), (9.0, 5),
        (10.0, 6), (11.0, 3), (12.0, 3), (13.0, 1), (14.0, 1),
    ];
    let data: Vec<f64> = counts
        .iter()
        .flat_map(|&(value, count)| std::iter::repeat(value).take(count))
        .collect();
    let mut q = Quantogram::new();
    q.add_unweighted_samples(data.iter());
    // The half-sample mode ignores the outlier spike at zero...
    assert_eq!(q.hsm(), Some(10.0));
    // ...while the plain mode is fooled by it.
    assert_eq!(q.mode(), vec![0.0]);
}
1776
#[test]
fn test_count() {
    // An empty histogram holds no samples; the gapped fixture holds ten.
    let empty = Quantogram::new();
    assert_eq!(empty.count(), 0);
    assert_eq!(gapped_quantogram().count(), 10);
}
1782
#[test]
fn test_median() {
    // The middle sample of the gapped fixture is 5.0.
    let median = gapped_quantogram().median();
    assert_eq!(median, Some(5.0));
}
1785
1786
#[test]
fn test_unweighted_standard_deviation() {
    let scores = [85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(scores.iter());

    // Reference value computed independently for this sample set.
    assert_close(q.stddev().unwrap(), 8.698403429493382, 0.00000000001);
}
1798
#[test]
fn test_weighted_standard_deviation() {
    // Same population as the unweighted stddev test, except that 85 and 81
    // are each added once with weight two rather than twice with weight one.
    let mut q = Quantogram::new();
    q.add_unweighted_samples([86, 100, 76, 93, 84, 99, 71, 69, 93, 87, 89].iter());
    q.add_weighted(85.0, 2.0);
    q.add_weighted(81.0, 2.0);

    assert_close(q.stddev().unwrap(), 8.69840342949338, 0.00000000001);
}
1812
/// Removing values may mess up the standard deviation, but this case passes.
#[test]
fn test_unweighted_standard_deviation_with_remove() {
    // Spurious values 123 and 150 go in up front and are then removed,
    // leaving the same population as the plain unweighted stddev test.
    let with_outliers = [123, 150, 85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(with_outliers.iter());
    q.remove(150.0);
    q.remove(123.0);

    assert_close(q.stddev().unwrap(), 8.698403429493382, 0.00000000001);
}
1827
#[test]
fn test_quantile() {
    // Extremes map to min and max; 0.75 lands on the second-highest sample.
    let cases = [(0.0, 1.0), (0.75, 96.0), (1.0, 99.0)];
    for &(phi, expected) in cases.iter() {
        assert_eq!(gapped_quantogram().quantile(phi).unwrap(), expected);
    }
}
1834
#[test]
fn test_quantile_at() {
    // Average the lower and upper quantile bounds reported for a sample.
    let midpoint = |range: Option<(f64, f64)>| {
        let (low, high) = range.unwrap();
        (low + high) / 2.0
    };
    let mut q = Quantogram::new();
    let data: Vec<f64> = (0..=10).map(f64::from).collect();
    q.add_unweighted_samples(data.iter());
    assert_eq!(0.5, midpoint(q.quantile_at(5.0)));
    assert_eq!(17.0 / 22.0, midpoint(q.quantile_at(8.0)));
}
1847
#[test]
fn test_fussy_median() {
    // With a gap-detection threshold of 2.0, the fussy median of the
    // gapped fixture resolves to the center of the gap, 50.0.
    let fussy = gapped_quantogram().fussy_quantile(0.5, 2.0);
    assert_eq!(fussy, Some(50.0));
}
1852
#[test]
fn test_remove() {
    // Dropping the two largest samples shifts the median down to 4.
    let mut q = gapped_quantogram();
    for sample in [99.0, 98.0].iter() {
        q.remove(*sample);
    }
    assert_eq!(q.median(), Some(4.0));
}
1860
#[test]
fn test_quantiles_with_negatives() {
    // A sample set that is symmetric about zero: each positive value is
    // mirrored by its negation, plus a single zero in the middle.
    let magnitudes = [1.0, 2.0, 3.0, 4.0, 5.0, 95.0, 96.0, 97.0, 98.0, 99.0];
    let mut data = vec![0.0];
    for &m in magnitudes.iter() {
        data.push(-m);
        data.push(m);
    }
    let mut q = Quantogram::new();
    q.add_unweighted_samples(data.iter());

    assert_eq!(q.median().unwrap(), 0.0);
    assert_eq!(q.quantile(0.4).unwrap(), -2.0);
    assert_eq!(q.quantile(0.6).unwrap(), 2.0);
}
1871
#[test]
fn test_weighted_quantile() {
    // Six units of weight at zero pull the median down to 2.
    let mut q = Quantogram::new();
    q.add_unweighted_samples([1.0, 2.0, 3.0, 4.0, 5.0, 95.0, 96.0, 97.0, 98.0, 99.0].iter());
    q.add_weighted(0.0, 6.0);

    assert_eq!(q.median(), Some(2.0));
}
1881
#[test]
fn test_coeff_of_range() {
    let samples = [123, 150, 85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());

    // Coefficient of range = (max - min) / (max + min), with max 150, min 69.
    let expected = (150.0 - 69.0) / (150.0 + 69.0);
    assert_close(q.coeff_of_range().unwrap(), expected, 0.00000000001);
}
1892
#[test]
fn test_coeff_of_quartile_deviation() {
    let samples = [1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());

    // Coefficient of quartile deviation = (Q3 - Q1) / (Q3 + Q1),
    // with Q3 = 10 and Q1 = 4 for this sample set.
    let expected = (10.0 - 4.0) / (10.0 + 4.0);
    assert_close(q.coeff_of_quartile_dev().unwrap(), expected, 0.00000000001);
}
1903
#[test]
fn test_quartiles() {
    let mut q = Quantogram::new();
    q.add_unweighted_samples([1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13].iter());

    // First and third quartiles of the sixteen samples.
    let tolerance = 0.00000000001;
    assert_close(q.q1().unwrap(), 4.0, tolerance);
    assert_close(q.q3().unwrap(), 10.0, tolerance);
}
1914
#[test]
fn test_coeff_of_stddev() {
    let mut q = Quantogram::new();
    q.add_unweighted_samples([85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89].iter());

    // Reference statistics computed independently for this sample set.
    let stddev = 8.698403429493382;
    let mean = 85.266666666666667;
    let tolerance = 0.00000000001;

    // coeff_of_stddev is stddev/mean; coeff_of_variation is the same ratio
    // expressed as a percentage.
    assert_close(q.coeff_of_stddev().unwrap(), stddev / mean, tolerance);
    assert_close(q.coeff_of_variation().unwrap(), 100.0 * stddev / mean, tolerance);
}
1936
#[test]
fn test_finite() {
    let mut q = Quantogram::new();

    // With only a NAN present, no sample is finite.
    q.add(f64::NAN);
    assert_eq!(q.finite(), 0.0);

    // After adding four finite samples, 4 of 5 samples are finite.
    q.add_unweighted_samples([1.0, 2.0, 3.0, 4.0].iter());
    assert_eq!(q.finite(), 0.8);
}
1947
#[test]
fn test_nan() {
    let mut q = Quantogram::new();

    // With only a NAN present, the NAN fraction is 1.
    q.add(f64::NAN);
    assert_eq!(q.nan(), 1.0);

    // After adding four finite samples, 1 of 5 samples is a NAN.
    q.add_unweighted_samples([1.0, 2.0, 3.0, 4.0].iter());
    assert_eq!(q.nan(), 0.2);
}
1958
#[test]
fn test_size() {
    let mut q = Quantogram::new();

    // Even when empty, storage never drops below 7 because counts for
    // NANs, zeroes, etc. are always recorded.
    assert_eq!(q.size(), 7);

    // The first sample adds one bin to the coarse SkipMap and one to the fine one.
    q.add(1.0);
    assert_eq!(q.size(), 9);

    // A duplicate sample reuses its existing bins, so storage is unchanged.
    q.add(1.0);
    assert_eq!(q.size(), 9);

    // 1.5 shares a coarse bin with 1.0 but needs its own fine bin,
    // growing storage by one rather than two.
    q.add(1.5);
    assert_eq!(q.size(), 10);
}
1978
/// Verify that storage grows with the logarithm of the number of samples.
#[test]
fn test_sparsity() {
    // A million samples span about 20 powers of two. With the default of
    // 35 fine bins per power of two, expect 7 default bins, 20 coarse bins
    // and at most 700 fine bins. (The measured size is 577.)
    let mut q = Quantogram::new();
    (1..1000000).for_each(|i| q.add(i as f64));
    assert!(q.size() < 600);

    // Despite the compression, the median stays within 1% of the true value.
    assert!(absolute_relative_error(q.median().unwrap(), 500000.0) <= 0.01);
}
1994
#[test]
fn test_with_constrained_range() {
    // Constrain the histogram to 2^-10 ..= 2^10: smaller samples underflow
    // to zero, larger ones overflow to infinity.
    let mut q = QuantogramBuilder::new()
        .with_smallest_power(-10)
        .with_largest_power(10)
        .build();
    let samples = [
        0.0001, 0.00056,           // will underflow and become zeroes
        0.002, 16.0, 1000.0,       // within range
        6543.0, 15000.0, 2400000.0 // will overflow and become Infinity
    ];
    q.add_unweighted_samples(samples.iter());

    assert_eq!(q.min(), Some(0.0));
    assert_eq!(q.finite(), 0.625); // 5 of 8 samples stay finite (zeroes count)
    assert_eq!(q.zero(), 0.25);    // 2 of 8 samples underflowed to zero
    assert_eq!(q.median(), Some(0.002));
}
2012
#[test]
fn test_accuracy() {
    // The median of 100,000 randomish samples must be within 1% relative error.
    let mut samples = randomish(-100.0, 2000000.0, 100000);
    assert_median(&mut samples, 0.01);
}
2018
#[test]
fn test_insert_heavy_speed() {
    // The naive median pays an N log N sort, so as N grows the histogram
    // should gain ground: the quantogram/naive time ratio on the large data
    // set must beat the small set's ratio by a healthy margin.
    let mut small = randomish(-10000.0, 10000.0, 10000);
    let mut large = randomish(-10000.0, 10000.0, 1000000);
    let (small_quant, small_naive) = profile_median(&mut small);
    let (large_quant, large_naive) = profile_median(&mut large);
    let small_ratio = (small_quant as f64) / (small_naive as f64);
    let large_ratio = (large_quant as f64) / (large_naive as f64);
    println!("Short Ratio: {} Long ratio: {}", small_ratio, large_ratio);
    assert!(small_ratio > 1.4 * large_ratio);
}
2036
#[test]
fn test_median_heavy_speed() {
    // Querying the median after every insertion favors the histogram even
    // more strongly as the sample count grows tenfold, since the naive
    // baseline re-sorts on each insertion.
    let mut small = randomish(-10000.0, 10000.0, 2000);
    let mut large = randomish(-10000.0, 10000.0, 20000);
    let (small_quant, small_naive) = profile_many_medians(&mut small);
    let (large_quant, large_naive) = profile_many_medians(&mut large);
    let small_ratio = (small_quant as f64) / (small_naive as f64);
    let large_ratio = (large_quant as f64) / (large_naive as f64);
    println!("Short Ratio: {} Long ratio: {}", small_ratio, large_ratio);
    assert!(small_ratio > 8.0 * large_ratio);
}
2049
#[test]
fn test_basics() {
    // Insert five samples out of order and check the summary statistics.
    let mut q = Quantogram::new();
    for sample in [10.0, 40.0, 20.0, 30.0, 50.0].iter() {
        q.add(*sample);
    }

    assert_eq!(q.count(), 5);
    assert_eq!(q.min(), Some(10.0));
    assert_eq!(q.max(), Some(50.0));
    assert_eq!(q.mean(), Some(30.0));
    assert_eq!(q.median(), Some(30.0));
    assert_eq!(q.quantile(0.75), Some(40.0));

    // Removing the two smallest samples raises the mean to 40.
    q.remove(10.0);
    q.remove(20.0);
    assert_eq!(q.mean(), Some(40.0));
}
2070
#[test]
fn quantile_at_nan() {
    // Asking where NAN ranks among the samples has no meaningful answer.
    let mut q = Quantogram::new();
    q.add(10.0);
    let result = q.quantile_at(f64::NAN);
    assert!(result.is_none());
}
2077
2078}
2079