quantogram/lib.rs
1use std::cell::RefCell;
2use std::ops::Bound::{Included,Excluded};
3use std::fmt::{Formatter,Debug,Result};
4use float_extras::f64::frexp;
5use std::convert::Into;
6use skiplist::SkipMap;
7mod half_sample_mode;
8use half_sample_mode::{HalfSampleModeCache};
9
10
/// A bin to be used inside a Histogram with a midpoint value and a weight.
#[derive(Copy, Clone, Debug)]
struct HistogramBin {
    /// Midpoint value for the bin.
    pub sample: f64,

    /// Sample weight, which must be positive.
    /// If zero, it indicates a removed bin.
    pub weight: f64
}

impl HistogramBin {
    /// Construct a new HistogramBin from a midpoint value and a weight.
    pub fn new(sample: f64, weight: f64) -> Self {
        HistogramBin { sample, weight }
    }

    /// Panic unless the weight is a non-negative, finite number.
    /// (NANs fail both halves of the check and therefore also panic.)
    fn validate(&self) {
        let weight_is_valid = self.weight >= 0.0 && self.weight.is_finite();
        if !weight_is_valid {
            panic!("HistogramBin weight must be a non-negative, finite number.");
        }
    }

    /// Overwrite this bin's sample and weight with those of another bin,
    /// then validate the new weight.
    pub fn set(&mut self, copy_from: Self) {
        self.sample = copy_from.sample;
        self.weight = copy_from.weight;
        self.validate();
    }
}
45
/// Categorize numbers as negative, zero, positive, NAN or an infinity
enum SampleCategory {
    Negative,
    Zero,
    Positive,
    NegativeInfinity,
    PositiveInfinity,
    NotANumber
}

impl SampleCategory {
    /// Decide which category of number a sample is.
    pub fn categorize(sample: f64) -> Self {
        // Branches are ordered so the most common cases (finite positives,
        // then finite negatives) are decided first, which helps performance.
        if sample > 0.0 {
            if sample == f64::INFINITY {
                SampleCategory::PositiveInfinity
            } else {
                SampleCategory::Positive
            }
        } else if sample < 0.0 {
            if sample == f64::NEG_INFINITY {
                SampleCategory::NegativeInfinity
            } else {
                SampleCategory::Negative
            }
        } else if sample == 0.0 {
            // Both +0.0 and -0.0 compare equal to 0.0.
            SampleCategory::Zero
        } else {
            // Only NAN fails every comparison above.
            SampleCategory::NotANumber
        }
    }
}
70
71/// Provides a weighted Histogram of f64 values for computing approximate quantiles.
72/// This guarantees a configurable maximum absolute relative error
73/// and uses sparse storage to reduce memory usage.
74///
75/// Worst case accuracy defaults to one percent (0.01) absolute relative error.
76/// The error is unbiased, uniform for the entire range of numbers.
77/// The error for quantiles 0 and 1 (the minimum and maximum, respectively) is guaranteed to
78/// be zero, except if either of those values is removed.
79///
80/// If all inserted values are given a weight of one,
81/// this behaves as an unweighted (normal) histogram.
82///
83/// Samples may be added or removed.
84/// However, removing a sample that equals the minimum or maximum will cause
85/// those values to be replaced by the center value of the appropriate extreme histogram bucket.
86///
87/// The full valid range of floats is divided into two levels of buckets.
88/// For the default case with 1% error, here is what that means:
89///
90/// Top. The top level divides the number range into buckets for each power of two and between
91/// positive and negative numbers.
92/// It has a maximum of 508 buckets in two sparse lists.
93/// f64 values can range from 2^-126 to 2^127, thus there are a maximum of
94/// 254 buckets for positive numbers and 254 buckets for negatives.
95/// Bottom. The second level of buckets are in a single sparse list spanning the full range
96/// from the smallest negative number to the largest positive number.
97/// Each power of two range is broken into 35 buckets whose size varies exponentially.
98/// Each bucket is larger than the previous by a factor of 1.02
99/// (or smaller by 1.02, over the range of negative values).
100/// The value of 1.02 was chosen because 1.02^35 = 1.999889553, which differs
101/// from 2 by 0.00011. That means that the 35th bucket for each power of two is slightly larger than
102/// the rest, but not by enough to wreck the error guarantee.
103/// There are a maximum of 1 + 508*35 buckets in this list, or 17,781.
104/// (The "1" is for the zero bucket.)
105///
106/// A bucket is not created until at least one value is added to it.
107/// Removing the last item in a bucket will not cause its memory to be freed;
108/// its weight will be set to zero.
109///
110/// (If you are familiar with the Rule of 70 used to estimate the doubling period
111/// for a given interest rate, it was probably chosen because of this nice property of 1.02.)
112///
113/// The error rate of 0.01 and the bin scale factor of 1.02 are related in this way:
114/// ```text
115///
116/// (1 + error) (1 + 1/101) 102
117/// scale = ------------- = ------------- = ------- = 1.02
118/// (1 - error) (1 - 1/101) 100
119/// ```
120/// So technically, the worst case error is 1/101, or 0.99%, not 1%, but the math is easier
121/// when using 1.02 instead of 1.020202020202 and the error on the last bucket is ideal.
122/// (A smaller error in the last bucket is available for error = 1/176, but the memory requirements
123/// are much higher.)
124///
125/// Usage:
126///
127/// Typical usage (unweighted samples with the default accuracy of 1%):
128///
129/// use quantogram::Quantogram;
130/// let mut q = Quantogram::new();
131/// q.add(10.0);
132/// q.add(40.0);
133/// q.add(20.0);
134/// q.add(30.0);
135/// q.add(50.0);
136///
137/// assert_eq!(q.min().unwrap(), 10.0);
138/// assert_eq!(q.max().unwrap(), 50.0);
139/// assert_eq!(q.mean().unwrap(), 30.0);
140/// assert_eq!(q.median().unwrap(), 30.0);
141/// assert_eq!(q.quantile(0.75).unwrap(), 40.0);
142///
143/// q.remove(10.0);
144/// q.remove(20.0);
145/// assert_eq!(q.mean().unwrap(), 40.0);
146///
147///
148/// Notes:
149/// 1. Coarse bins are for powers of two not some other value because
150/// getting the largest power of two less than or equal to a number calls a fast intrinsic function.
151/// This makes assigning a number to a bin very fast.
152///
153/// 2. When inquiring about a quantile, a value will be returned so long as one
154/// number in range is added. NANs and Infinities will be disregarded.
155/// The NANs and Infinities are available as separate counts.
156///
157/// 3. Unbounded errors are possible in the edge case of a large gap in the middle of the data.
158/// Take the case of the median. If there are an even number of items in the data with a
159/// large gap in the exact middle, then the proper formula for the median is the mean
160/// of the last value below the gap and the first value above the gap.
161/// To correct for this, use the fussy_quantile method.
162/// It will probe for quantiles at Φ + ε and Φ - ε for a small value of ε,
163/// and average the pair that best span the gap.
164/// If the histogram is unweighted (all weights are one) then the value of
165/// ε should be 1/2N, where N is the number of items already added to the Histogram.
166/// If the samples are weighted, not sure what to do.
167///
168/// 4. If one sample is in a bin, the value for the bin will be set to the accurate sample,
169/// not the midpoint of the range for that bin. If a second sample is added, the
170/// bin value will be set to the midpoint. Consequently, bins with one
171/// sample added have no error.
172///
pub struct Quantogram {
    /// Exponential Scaling factor that relates the start value of one fine bin to the next.
    /// value[N+1] = value[N] * growth
    /// The default is 1.02, which yields an absolute relative error of 1%.
    /// This number must be in the range (1,√2].
    /// Smaller values guarantee smaller errors but require more storage space and execution time.
    growth : f64,

    /// Number of bins used to store values per power of two range of numbers.
    /// This must equal log(2, growth), rounded down.
    /// For example, if growth is 1.02, this must be 35.
    /// The larger this value, the more memory is required.
    bins_per_doubling : usize,

    /// Memoize powi(growth, n) for n in [0,bins_per_doubling)
    growth_bin_power_memo : Vec<f64>,

    /// Unweighted count of all insertions minus all removals.
    total_count : usize,

    /// Total weight of all inserted values other than NAN or +/- Infinity.
    total_weight : f64,

    /// Weighted sum of all samples (excluding NANs and infinities).
    /// This permits the exact weighted mean to be calculated.
    weighted_sum : f64,

    /// Running weighted variance accumulator.
    /// NOTE(review): `stddev` computes sqrt(running_variance / total_weight),
    /// which implies this holds the weighted sum of squared deviations (Welford's M2),
    /// not the variance itself — confirm against adjust_aggregates.
    running_variance : f64,

    /// Total weight of all inserted NAN values.
    nan_weight : f64,

    /// Total weight of all inserted +Infinity values.
    /// (Field name carries a historical misspelling of "infinity".)
    plus_ininity_weight : f64,

    /// Total weight of all inserted -Infinity values.
    /// (Field name carries a historical misspelling of "infinity".)
    minus_ininity_weight : f64,

    /// Total weight of all negative values (included in total_weight).
    negative_weight : f64,

    /// Total weight of all zeroes (included in total_weight).
    zero_weight : f64,

    /// Total weight of all positive values (included in total_weight).
    positive_weight : f64,

    /// Maximum added sample that is not +Infinity, initialized to NAN.
    maximum : f64,

    /// Minimum added sample that is not -Infinity, initialized to NAN.
    minimum : f64,

    /// This remains true until a finite number that is not an integer is added.
    /// When true, this causes quantile to round its result.
    only_integers : bool,

    /// To reduce memory usage, force all numbers whose magnitude is
    /// greater than this power of two to be counted as -Infinity (overflow).
    /// Default is -(2^127), the highest supported by IEEE-754 64 bit floats.
    negative_overflow : f64,

    /// To reduce memory usage, force all numbers whose magnitude
    /// is less than this power of two to be counted as zero (underflow).
    /// Default is -(2^-126), the lowest supported by IEEE-754 64 bit floats.
    negative_underflow : f64,

    /// To reduce memory usage, force all numbers whose magnitude is
    /// greater than this power of two to be counted as +Infinity (overflow).
    /// Default is +(2^127), the highest supported by IEEE-754 64 bit floats.
    positive_overflow : f64,

    /// To reduce memory usage, force all numbers whose magnitude
    /// is less than this power of two to be counted as zero (underflow).
    /// Default is +(2^-126), the lowest supported by IEEE-754 64 bit floats.
    positive_underflow : f64,

    /// Bins for powers of two of positive numbers.
    /// The key is the power of two, from -126 to 127.
    ///
    /// Example: If the key is 2, the bin is for all numbers in the range [4,8).
    positive_coarse_bins : SkipMap<isize, HistogramBin>,

    /// Bins for powers of two of negative numbers.
    /// The key is the power of two, from -126 to 127.
    ///
    /// Example: If the key is 3, the bin is for all numbers in the range (-16,-8].
    negative_coarse_bins : SkipMap<isize, HistogramBin>,

    /// Bins for zeroes, positive and negative numbers.
    /// The key is related to the power of two and the position within the 35 bins per
    /// power of two range:
    ///    key = ((power + 126)*35 + bin + 1)*sign
    /// where:
    ///    power = the power of two, from -126 to 127
    ///    bin   = 0 to 34
    ///    sign  = 1 for positive numbers, -1 for negative numbers.
    /// Zeroes are always stored at key = 0.
    fine_bins : SkipMap<isize, HistogramBin>,

    /// Natural Log of the growth factor, precomputed for speed.
    log_of_growth : f64,

    /// Cache of mode candidates for computing the mode.
    mode_cache : ModeCache,

    /// Cache for use with hsm (half-sample mode).
    /// RefCell grants interior mutability so read-only methods can update it.
    hsm_cache : RefCell<HalfSampleModeCache>
}
283
284impl Quantogram {
285
286 // ////////////////////////////
287 // //
288 // Constructors //
289 // //
290 // ////////////////////////////
291
    /// Create a Quantogram with the default error rate of 1%
    /// (growth factor 1.02 with 35 fine bins per power of two).
    pub fn new() -> Self {
        // 1.02^35 ≈ 2, so 35 bins of growth 1.02 tile one power-of-two range.
        let bins = 35;
        let growth = 1.02;
        let mut q = Quantogram {
            growth : growth,
            bins_per_doubling : bins,

            growth_bin_power_memo : Vec::new(),

            total_count : 0,
            total_weight : 0.0,
            weighted_sum : 0.0,
            running_variance : 0.0,
            nan_weight : 0.0,
            plus_ininity_weight : 0.0,
            minus_ininity_weight : 0.0,
            negative_weight : 0.0,
            zero_weight : 0.0,
            positive_weight : 0.0,
            // NAN marks "no finite samples added yet" (see min()/max()).
            maximum : f64::NAN,
            minimum : f64::NAN,

            only_integers : true,

            // Full IEEE-754 range: no underflow/overflow clamping by default.
            negative_underflow : -(2.0_f64.powi(-126)),
            negative_overflow : -(2.0_f64.powi(127)),
            positive_underflow : 2.0_f64.powi(-126),
            positive_overflow : 2.0_f64.powi(127),

            // The SkipMaps do not preallocate bins. This lets it know
            // how many levels to make each skiplist to make it optimal.
            positive_coarse_bins : SkipMap::with_capacity(254),
            negative_coarse_bins : SkipMap::with_capacity(254),
            // NOTE(review): the struct docs state a maximum of 1 + 508*35 = 17,781
            // fine bins, but this capacity hint is 17,641 — confirm which is intended.
            fine_bins : SkipMap::with_capacity(17641),
            log_of_growth : growth.ln(),
            mode_cache : ModeCache::new(),
            hsm_cache : RefCell::new(HalfSampleModeCache::new_default())
        };
        q.memoize_growth_bin_power();
        q
    }
334
335 /// Create a Quantogram where growth and bins are set, and underflow and overflow bounds are set
336 /// to the given powers of two.
337 /// - Inserted values having absolute magnitude below 2^smallest_power will be treated as zeroes.
338 /// - Inserted values having absolute magnitude above 2^largest_power will be treated as infinities.
339 ///
340 /// Note: To ensure that consist values for growth and bins are set, it is advised to use QuantogramBuilder
341 /// instead of directly calling this constructor.
342 pub fn with_configuration(growth: f64, bins: usize, smallest_power: isize, largest_power: isize) -> Self {
343 let valid_power = -126..=127;
344 if !valid_power.contains(&smallest_power) || !valid_power.contains(&largest_power) {
345 panic!("Power of two constraints must be in range [-126,127]");
346 }
347 if smallest_power >= largest_power {
348 panic!("Power of two constraints not given in ascending order");
349 }
350 let mut q = Quantogram {
351 growth : growth,
352 bins_per_doubling : bins,
353
354 growth_bin_power_memo : Vec::new(),
355
356 total_count : 0,
357 total_weight : 0.0,
358 weighted_sum : 0.0,
359 running_variance : 0.0,
360 nan_weight : 0.0,
361 plus_ininity_weight : 0.0,
362 minus_ininity_weight : 0.0,
363 negative_weight : 0.0,
364 zero_weight : 0.0,
365 positive_weight : 0.0,
366 maximum : f64::NAN,
367 minimum : f64::NAN,
368
369 only_integers : true,
370
371 negative_underflow : -(2.0_f64.powi(smallest_power as i32)),
372 negative_overflow : -(2.0_f64.powi(largest_power as i32)),
373 positive_underflow : 2.0_f64.powi(smallest_power as i32),
374 positive_overflow : 2.0_f64.powi(largest_power as i32),
375
376 // The SkipMaps do not preallocate bins. This lets it know
377 // how many levels to make each skiplist to make it optimal.
378 positive_coarse_bins : SkipMap::with_capacity(254),
379 negative_coarse_bins : SkipMap::with_capacity(254),
380 // If the range is constrained, fewer bins will be required,
381 // so the capacity can be reduced.
382 fine_bins : SkipMap::with_capacity(((largest_power - smallest_power + 1)*35 + 1) as usize),
383 log_of_growth : growth.ln(),
384 mode_cache : ModeCache::new(),
385 hsm_cache : RefCell::new(HalfSampleModeCache::new_default())
386 };
387 q.memoize_growth_bin_power();
388 q
389 }
390
391 pub fn replace_hsm_cache(&mut self, new_cache: HalfSampleModeCache) {
392 self.hsm_cache = RefCell::new(new_cache);
393 self.hsm_cache.borrow_mut().clear();
394 }
395
396 // ////////////////////////////
397 // //
398 // Add/Remove Samples //
399 // //
400 // ////////////////////////////
401
    /// Add a sample to the histogram with a weight of one.
    /// (The bin weight returned by add_weighted is deliberately discarded.)
    pub fn add(&mut self, sample: f64) {
        self.add_weighted(sample, 1.0);
    }
406
    /// Remove a sample from the histogram, deducting one from the weight
    /// of the bin it falls into.
    /// If the weight goes negative, the call panics.
    pub fn remove(&mut self, sample: f64) {
        self.add_weighted(sample, -1.0);
    }
412
    /// Add a sample to the histogram with the given weight.
    /// If the weight is positive, the sample is added, otherwise removed.
    /// If the cumulative weight for the bin holding that sample goes negative,
    /// the call panics.
    ///
    /// Returns the updated weight of the fine bin the sample landed in,
    /// or the updated NAN/Infinity total for non-finite samples.
    pub fn add_weighted(&mut self, sample: f64, weight: f64) -> f64 {
        // NOTE(review): the HSM cache records the raw sample even when the weight
        // is negative (a removal) — confirm removals are meant to feed the cache.
        self.hsm_cache.borrow_mut().record(sample);
        // Clamp into the configured range: magnitudes beyond the overflow bound
        // become infinities; below the underflow bound they become zero.
        let bounded_sample = self.over_or_underflow(sample);
        let adjusted_fine_weight;
        // Presumably updates count, total/weighted sums, min/max and variance;
        // its body is outside this view.
        self.adjust_aggregates(bounded_sample, weight);
        match SampleCategory::categorize(bounded_sample) {
            SampleCategory::Positive => {
                self.positive_weight += weight;
                // Fine bin: one of bins_per_doubling bins within this power-of-two range.
                let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
                adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
                self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
                // Coarse bin: keyed by the sample's power of two.
                let (coarse_key, coarse_midpoint) = Self::get_coarse_key_with_midpoint(bounded_sample).unwrap();
                Self::increment_key_weight(&mut self.positive_coarse_bins, coarse_key, coarse_midpoint, sample, weight);
            },
            SampleCategory::Zero => {
                self.zero_weight += weight;
                // Zeroes live only in the fine map (key 0); there is no coarse bin for zero.
                let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
                adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
                self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
            },
            SampleCategory::Negative => {
                self.negative_weight += weight;
                let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
                adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
                self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
                let (coarse_key, coarse_midpoint) = Self::get_coarse_key_with_midpoint(bounded_sample).unwrap();
                Self::increment_key_weight(&mut self.negative_coarse_bins, coarse_key, coarse_midpoint, sample, weight);
            },
            // Non-finite samples are tracked in separate totals,
            // excluded from quantile computations.
            SampleCategory::NotANumber => {
                self.nan_weight += weight;
                adjusted_fine_weight = self.nan_weight;
            },
            SampleCategory::PositiveInfinity => {
                self.plus_ininity_weight += weight;
                adjusted_fine_weight = self.plus_ininity_weight;
            },
            SampleCategory::NegativeInfinity => {
                self.minus_ininity_weight += weight;
                adjusted_fine_weight = self.minus_ininity_weight;
            }
        }
        adjusted_fine_weight
    }
460
461 /// Add many samples to the Quantogram, all having a weight of 1.0.
462 pub fn add_unweighted_samples<'a, S>(&mut self, samples: impl Iterator<Item = &'a S>)
463 where S: 'a + Into<f64> + Copy
464 {
465 for sample in samples {
466 self.add((*sample).into());
467 }
468 }
469
470 // ////////////////////////////
471 // //
472 // Basic Measures //
473 // //
474 // mean min max count //
475 // median variance stddev //
476 // mode hsm //
477 // quantile quantile_at //
478 // fussy_quantile //
479 // finite zero nan //
480 // //
481 // ////////////////////////////
482
483
484 /// Return actual (not estimated) weighted mean of inserted samples (with machine precision),
485 /// or None if no finite values have been inserted.
486 /// Excludes all NANs and Infinities that have been inserted.
487 pub fn mean(&self) -> Option<f64> {
488 if self.total_weight > 0.0 { Some(self.weighted_sum / self.total_weight) }
489 else { None }
490 }
491
492 /// Return actual (not estimated) minimum of inserted samples (with machine precision),
493 /// or None if no finite values have been inserted.
494 /// Excludes all NANs and Infinities that have been inserted.
495 pub fn min(&self) -> Option<f64> {
496 if self.minimum.is_finite() { Some(self.minimum) }
497 else { None }
498 }
499
500 /// Return actual (not estimated) maximum of inserted samples (with machine precision),
501 /// or None if no finite values have been inserted.
502 /// Excludes all NANs and Infinities that have been inserted.
503 pub fn max(&self) -> Option<f64> {
504 if self.maximum.is_finite() { Some(self.maximum) }
505 else { None }
506 }
507
    /// Return count of inserted samples, including NANs and Infinities.
    /// Per the field's definition this is the unweighted tally of all
    /// insertions minus all removals.
    pub fn count(&self) -> usize {
        self.total_count
    }
512
513 /// Return the weighted fraction of values that are finite (not NAN or +/- Infinity)
514 /// as a value in the range [0,1].
515 /// If no values have yet been added, return NAN.
516 pub fn finite(&self) -> f64 {
517 if self.total_count == 0 {
518 f64::NAN
519 }
520 else {
521 let numerator = self.total_weight;
522 let denominator = self.added_weight();
523 numerator / denominator
524 }
525 }
526
527 /// Return the weighted fraction of values that are zero, including NANs and infinities in the basis.
528 /// If no values have yet been added, return NAN.
529 pub fn zero(&self) -> f64 {
530 if self.total_count == 0 {
531 f64::NAN
532 }
533 else {
534 let numerator = self.zero_weight;
535 let denominator = self.added_weight();
536 numerator / denominator
537 }
538 }
539
540 /// Return the weighted fraction of values that are NAN, including infinities in the basis.
541 /// If no values have yet been added, return NAN.
542 pub fn nan(&self) -> f64 {
543 if self.total_count == 0 {
544 f64::NAN
545 }
546 else {
547 let numerator = self.nan_weight;
548 let denominator = self.added_weight();
549 numerator / denominator
550 }
551 }
552
    /// Return an estimate of the median (the 0.5 quantile),
    /// or None if no finite samples have been added.
    pub fn median(&self) -> Option<f64> {
        self.quantile(0.5)
    }
557
    /// Exact weighted variance of all added finite values.
    /// NOTE(review): this returns `running_variance` without dividing by
    /// `total_weight`, whereas `stddev` computes sqrt(running_variance / total_weight).
    /// One of the two is inconsistent — if `running_variance` is Welford's running
    /// sum of squared deviations (M2), this should return M2 / total_weight. Verify
    /// against adjust_aggregates before changing.
    pub fn variance(&self) -> f64 {
        self.running_variance
    }
562
563 /// Exact weighted population standard deviation of all added finite values.
564 pub fn stddev(&self) -> Option<f64> {
565 if self.total_weight <= 0.0 {
566 None
567 }
568 else {
569 Some((self.running_variance / self.total_weight).sqrt())
570 }
571 }
572
573 /// Return an estimate of the mode.
574 /// If no finite samples have been added, returns an empty list.
575 /// If there are multiple modes tieing with the same weight,
576 /// return all of them in a list.
577 /// If all samples in the collection are integers, round the mode.
578 pub fn mode(&self) -> Vec<f64> {
579 let unrounded_mode = self.mode_cache.mode(&self.fine_bins);
580 if self.only_integers {
581 let rounded_mode = unrounded_mode
582 .into_iter()
583 .map(|x| x.round())
584 .collect();
585 rounded_mode
586 }
587 else {
588 unrounded_mode
589 }
590 }
591
592 /// Estimate the half-sample mode.
593 /// More resistant than the mode to outliers and contamination (noise).
594 ///
595 /// Based on the mode estimator by Robertson and Cryer (1974), and an
596 /// algorithm described in "On a Fast, Robust Estimator of the Mode" by David Bickel and Rudolf Fruhwirth.
597 /// That algorithm is applied to raw samples, whereas this is applied
598 /// to already histogrammed values.
599 ///
600 /// Edge case: If fewer than five bins of data are present in the histogram,
601 /// the true mode will be returned, unless the data is multi-moded, in which case
602 /// None will be returned.
603 ///
604 /// NOTE:
605 pub fn hsm(&self) -> Option<f64> {
606 if self.total_weight == 0.0 { None }
607 else {
608 let mut weights: Vec<f64> = Vec::new();
609 let mut samples: Vec<f64> = Vec::new();
610 for bin in self.fine_bins.values() {
611 samples.push(bin.sample);
612 weights.push(bin.weight);
613 }
614
615 // Cache may be updated because of interior mutability.
616 self.hsm_cache.borrow_mut().hsm(&samples, &weights, self.total_weight)
617
618 // If we wanted to bypass the cache, do it this way:
619 // half_sample_mode(&samples, &weights, self.total_weight)
620 }
621 }
622
    /// Estimate the quantile for the inserted data.
    /// For the minimum, use phi = 0.0.
    /// For the median, use phi = 0.5.
    /// For the 90th percentile, phi = 0.9.
    /// For the maximum, use phi = 1.0.
    ///
    /// Returns None if phi is outside [0,1] or there is no positive finite weight.
    ///
    /// Added samples that are NANs or Infinities are excluded from the computation.
    pub fn quantile(&self, phi: f64) -> Option<f64> {
        if phi < 0.0 || phi > 1.0 || self.total_weight <= 0.0 {
            None
        }
        else if phi == 0.0 {
            // Extremes are tracked exactly, so quantiles 0 and 1 have no error.
            self.min()
        }
        else if phi == 1.0 {
            self.max()
        }
        else {
            // Weight that must be accumulated (in ascending sample order) to reach the quantile.
            let mut target_cume_weight = self.total_weight * phi;

            // Narrow down the position of the quantile to negative, zero or positive numbers.
            if target_cume_weight <= self.negative_weight {
                // Quantile falls in the negative range
                // Search two maps: coarse and fine.
                let (coarse_index, remainder) = Self::find_containing_bucket(target_cume_weight, &self.negative_coarse_bins).unwrap();
                // Tricky, because in the negative range we must go from the highest magnitude to the lowest magnitude.
                // Coarse index is therefore the negative of the power of two only for negative numbers.
                let high_fine_index = ((-coarse_index + 126) * (self.bins_per_doubling as isize) + 0 + 1) * -1 + 1; // Exclusive upper bound
                let low_fine_index = high_fine_index - (self.bins_per_doubling as isize) + 1; // Inclusive lower bound
                let (_fine_index, _remainder2, sample_at_quantile) = Self::find_containing_bucket_in_range(low_fine_index, high_fine_index, remainder, &self.fine_bins).unwrap();
                // Previously required a second call to get, which was slower.
                // conditional_round rounds the estimate when all samples so far were integers.
                return Some(self.conditional_round(sample_at_quantile));
            }
            target_cume_weight -= self.negative_weight;
            if target_cume_weight <= self.zero_weight {
                // Quantile falls within the band of zeroes.
                return Some(0.0);
            }
            else {
                target_cume_weight -= self.zero_weight;
            }

            // Quantile falls in the positive range
            let (coarse_index, remainder) = Self::find_containing_bucket(target_cume_weight, &self.positive_coarse_bins).unwrap();
            // For positives, the coarse index IS the power of two; map it to the
            // first fine key of that power's range (see the fine_bins key formula).
            let low_fine_index = ((coarse_index + 126) * (self.bins_per_doubling as isize) + 0 + 1) * 1;
            let high_fine_index = low_fine_index + (self.bins_per_doubling as isize); // Exclusive upper bound
            let (_fine_index, _remainder2, sample_at_quantile)
                = Self::find_containing_bucket_in_range(low_fine_index, high_fine_index, remainder, &self.fine_bins)
                .unwrap();
            return Some(self.conditional_round(sample_at_quantile));
        }
    }
676
    /// Quantile measurement that corrects for issues that occur when there is a large gap in samples
    /// right where the quantile falls.
    ///
    /// For example, the median of { 1,2,3,4,5,95,96,97,98,99 } is 50! This is because there are an even number of items
    /// in the set, so you have to average the middle two values.
    ///
    /// Three quantile calculations will be taken, at Φ - ε, Φ and Φ + ε.
    /// If the difference between the results at the two lower quantiles
    /// differs substantially from the difference between the results at the two higher quantiles,
    /// assume there is a gap and average the middle value and the most extreme value.
    ///
    /// threshold_ratio decides whether a gap exists. If none exists, no averaging occurs.
    /// It is used to compare the deltas between quantiles computed at the desired phi and values of
    /// phi slightly lower or higher. If the deltas are similar, no gap exists.
    /// A value over 2.0 seems sensible, but testing should be done to determine the best value.
    ///
    /// NOTE(review): when neither delta exceeds the threshold (no gap) this returns
    /// None rather than the plain quantile — confirm callers expect that.
    pub fn fussy_quantile(&self, phi: f64, threshold_ratio: f64) -> Option<f64> {
        if phi < 0.0 || phi > 1.0 || self.total_weight <= 0.0 {
            None
        }
        else if phi == 0.0 {
            self.min()
        }
        else if phi == 1.0 {
            self.max()
        }
        else {
            // TODO: This derivation of epsilon is appropriate for unweighted samples (all having weight = 1.0).
            // Unsure what the proper value is for weighted samples.
            let epsilon = 1.0 / (2.0 * self.total_count as f64);

            let q_middle = self.quantile(phi);
            // Too close to either extreme to probe phi ± epsilon; fall back to the plain quantile.
            if phi <= 1.5 * epsilon || phi >= 1.0 - 1.5 * epsilon || q_middle.is_none() {
                q_middle
            }
            else {
                let q_low = self.quantile(phi - epsilon).unwrap();
                let q_high = self.quantile(phi + epsilon).unwrap();
                let q_median = q_middle.unwrap();

                let low_difference = q_median - q_low;
                let high_difference = q_high - q_median;
                if low_difference >= high_difference * threshold_ratio {
                    // Gap is below the quantile: average across it.
                    Some((q_low + q_median)/2.0)
                }
                else if high_difference >= low_difference * threshold_ratio {
                    // Gap is above the quantile: average across it.
                    Some((q_high + q_median)/2.0)
                }
                else {
                    None
                }
            }
        }
    }
730
    /// Get the range of quantiles between which the given value falls,
    /// or None if no finite samples have been added yet or the
    /// queried value is not finite.
    ///
    /// The result is a (low, high) pair of phi values. If the value lands inside
    /// an occupied bin, the two ends differ by that bin's share of the total weight;
    /// if it falls between occupied bins, the two ends are equal.
    pub fn quantile_at(&self, value: f64) -> Option<(f64,f64)> {
        let min_opt = self.min();
        if min_opt.is_none() || !value.is_finite() {
            return None;
        }
        let min = min_opt.unwrap();
        let max = self.max().unwrap();
        match self.count() {
            0 => None,
            // A single sample: min == max, so any other value is strictly outside.
            1 => {
                if value < min { Some((0.0, 0.0)) }
                else if value > min { Some((1.0, 1.0)) }
                else { Some((0.0, 1.0)) }
            },
            _ if value < min => Some((0.0, 0.0)),
            _ if value > max => Some((1.0, 1.0)),
            _ if min == max => Some((0.0, 1.0)),
            _ if value == min => Some((0.0, 0.0)),
            _ if value == max => Some((1.0, 1.0)),
            // Zeroes occupy the band between the negative and positive weights.
            _ if value == 0.0 => {
                let neg = self.negative_weight;
                let sum = self.total_weight;
                let zero = self.zero_weight;
                Some((neg / sum, (neg + zero)/sum))
            },
            _ => {
                // TODO: It is possible to use the coarse skipmaps to narrow in on the location
                // in the fine skipmap faster, hence speedup this calculation.
                // This is a less commonly used method, so optimization is not a priority.
                let (end_key, _fine_low, _fine_midpoint, _fine_high)
                    = self.get_fine_key_with_midpoint(value, self.growth, self.bins_per_doubling).unwrap();
                // For non-negative values, skip the negative and zero bins by starting
                // the cumulative weight at their combined total (positive keys begin at 1).
                let (start_key, start_weight) =
                    if value >= 0.0 { (1_isize, self.negative_weight + self.zero_weight) }
                    else { (isize::MIN , 0.0) };
                let mut cume_weight = start_weight;
                let sum = self.total_weight;
                for (key, bucket) in self.fine_bins.range(Included(&start_key), Included(&isize::MAX)) {
                    let weight = bucket.weight;
                    if *key == end_key {
                        // Value falls within an existing bin, so we must define a range
                        // of phi values.
                        return Some((cume_weight / sum, (cume_weight + weight)/sum));
                    }
                    else if *key > end_key {
                        // Value would go in a bin not yet present in the collection, so we passed by it.
                        // Since value falls between bins, it gets a well defined phi value,
                        // where the bottom and top of range are identical.
                        return Some((cume_weight / sum, cume_weight/sum));
                    }
                    cume_weight += weight;
                }
                // Should never fall through, because the maximum sample always has a bin defined for it,
                // and we already tested if value > max.
                None
            }
        }
    }
791
792 // ////////////////////////////
793 // //
794 // Measures of Dispersion //
795 // //
796 // range q1 q3 iqr //
797 // quartile_deviation //
798 // coeff_of_range //
799 // coeff_of_quartile_dev //
800 // coeff_of_stddev //
801 // coeff_of_variation //
802 // //
803 // ////////////////////////////
804
805 /// Range between the highest and lowest values sampled.
806 pub fn range(&self) -> Option<f64> {
807 match (self.max(), self.min()) {
808 (Some(max), Some(min)) => Some(max - min),
809 _ => None
810 }
811 }
812
    /// Sample at the first quartile (phi = 0.25).
    pub fn q1(&self) -> Option<f64> { self.quantile(0.25) }

    /// Sample at the third quartile (phi = 0.75).
    pub fn q3(&self) -> Option<f64> { self.quantile(0.75) }
818
819 /// Inter quartile range, which is (Q3 - Q1)
820 pub fn iqr(&self) -> Option<f64> {
821 match (self.q3(), self.q1()) {
822 (Some(q3), Some(q1)) => Some(q3 - q1),
823 _ => None
824 }
825 }
826
827 /// Quartile deviation or semi-interquartile range, which is (Q3 - Q1) / 2
828 pub fn quartile_deviation(&self) -> Option<f64> {
829 match (self.q3(), self.q1()) {
830 (Some(q3), Some(q1)) => Some((q3 - q1) / 2.0),
831 _ => None
832 }
833 }
834
835 /// Coefficient of range.
836 ///
837 /// ```text
838 /// max - min
839 /// coeff of range = ------------
840 /// max + min
841 /// ```
842 pub fn coeff_of_range(&self) -> Option<f64> {
843 match (self.max(), self.min()) {
844 (Some(max), Some(min)) => {
845 let sum = max + min;
846 if sum == 0.0 { None }
847 else { Some((max - min) / sum) }
848 },
849 _ => None
850 }
851 }
852
853 /// Coefficient of quartile deviation, which measures the relative spread between
854 /// the first and third quartiles.
855 ///
856 /// ```text
857 /// q3 - q1
858 /// coeff of quartile deviation = ---------
859 /// q3 + q1
860 /// ```
861 pub fn coeff_of_quartile_dev(&self) -> Option<f64> {
862 match (self.q3(), self.q1()) {
863 (Some(q3), Some(q1)) => {
864 let sum = q3 + q1;
865 if sum == 0.0 { None }
866 else { Some((q3 - q1) / sum) }
867 },
868 _ => None
869 }
870 }
871
872 /// Coefficient of standard deviation.
873 ///
874 /// This give the standard deviation divided by the mean.
875 pub fn coeff_of_stddev(&self) -> Option<f64> {
876 match (self.stddev(), self.mean()) {
877 (Some(stddev), Some(mean)) => {
878 if mean == 0.0 { None }
879 else { Some(stddev / mean) }
880 },
881 _ => None
882 }
883 }
884
885 /// Coefficient of variation.
886 ///
887 /// This give the standard deviation divided by the mean expressed as a percentage.
888 pub fn coeff_of_variation(&self) -> Option<f64> {
889 match self.coeff_of_stddev() {
890 Some(coeff) => Some(coeff * 100.0),
891 _ => None
892 }
893 }
894
895 // ////////////////////////////
896 // //
897 // Diagnostics //
898 // //
899 // ////////////////////////////
900
    /// Computes a relative measure of the storage being used by the histogram.
    ///
    /// As bins are dynamically added to the sparse Skipmaps, this increases.
    /// The result counts occupied bins plus a fixed overhead of seven;
    /// it is proportional to (not equal to) bytes consumed.
    pub fn size(&self) -> usize {
        7 // counts of NANs, +/- Infinity, zeroes, three empty Skipmaps are always kept
        + self.positive_coarse_bins.len()
        + self.negative_coarse_bins.len()
        + self.fine_bins.len()
    }
910
911 // ////////////////////////////
912 // //
913 // Utility functions //
914 // //
915 // ////////////////////////////
916
917 /// Compute growth^N for N = 0..bins_per_doubling and store in a Vec
918 /// to memoize the result as an optimization, to avoid having to call powi function.
919 fn memoize_growth_bin_power(&mut self) {
920 for bin in 0..=self.bins_per_doubling {
921 self.growth_bin_power_memo.push(self.growth.powi(bin as i32));
922 }
923 }
924
    /// Approximate the log(x, growth) using a Padé Approximant
    /// and take the floor, yielding an integer.
    /// Good accuracy if x is in the range [1.0,2.0]. (Accuracy actually good up to e = 2.71828.)
    ///
    /// ```text
    /// A good second-order Padé approximant to ln(x) for the region x ∈ [1,e] is:
    ///
    ///               3x² - 3
    ///     ln(x) = -------------
    ///              x² + 4x + 1
    ///
    /// A third-order Padé approximant to ln(x) for the region x ∈ [1,e] is:
    ///
    ///              11x³ + 27x² - 27x - 11
    ///     ln(x) = ------------------------
    ///               3x³ + 27x² + 27x + 3
    /// ```
    /// To transform to use the growth factor as base, we divide by ln(growth).
    ///
    /// See https://math.stackexchange.com/questions/2238880/good-approximation-to-lnx-for-x-in-1-x-e
    /// for how the 2nd order Pade approximant was derived.
    ///
    /// The third-order approximant came from Wolfram Alpha with some algebraic rearrangement
    /// to optimize the number of arithmetic operations:
    /// > PadeApproximant(ln[x], {x, 1, {3, 3}})
    ///
    /// Over the range [1,2], the worst case absolute relative error for the:
    ///    2nd order Padé approximant is 0.12%.
    ///    3rd order Padé approximant is 0.004%. (0.033% for e = 2.71828)
    /// The 3rd order accuracy was needed. In release mode, it is twice as fast as the system log.
    fn pade_approximate_log_floor(&self, x: f64) -> isize {
        let x2 = x * x;
        let x3 = x2 * x;

        // 2nd order:
        // let numerator = 3.0 * (x2 - 1.0);
        // let denominator = self.log_of_growth * (x2 + 4.0*x + 1.0);

        // 3rd order (after dividing all terms by 27 and changing base):
        let numerator = (11.0/27.0) * x3 + x2 - x - (11.0/27.0);
        let denominator = self.log_of_growth * (x3/9.0 + x2 + x + (1.0/9.0));

        let ln_x = numerator / denominator;
        ln_x.floor() as isize
    }
970
971 /// Find the key to the containing bucket by Iterating over buckets in a map
972 /// in increasing order and accumulating the weights until one is reached where the total
973 /// falls within the range belonging to the bucket.
974 /// Also return the amount of weight remaining prior to the final bucket.
975 fn find_containing_bucket(cume_weight: f64, map: &SkipMap<isize, HistogramBin>) -> Option<(isize,f64)> {
976 let mut remaining_weight = cume_weight;
977 for (k, bucket) in map.iter() {
978 let weight = bucket.weight;
979 if weight >= remaining_weight {
980 return Some((*k, remaining_weight));
981 }
982 remaining_weight -= weight;
983 }
984 None
985 }
986
987 /// Find the key to the containing bucket in the given key range
988 /// by Iterating over buckets in a map
989 /// accumulating the weights until one is reached where the total
990 /// falls within the range belonging to the bucket.
991 ///
992 /// Returns a tuple with the key, remaining weight, and the bucket sample value.
993 fn find_containing_bucket_in_range(start_key: isize, end_key: isize, cume_weight: f64, map: &SkipMap<isize, HistogramBin>) -> Option<(isize,f64,f64)> {
994 let mut remaining_weight = cume_weight;
995 for (k, bucket) in map.range(Included(&start_key), Excluded(&end_key)) {
996 let weight = bucket.weight;
997 if weight >= remaining_weight {
998 return Some((*k, remaining_weight, bucket.sample));
999 }
1000 remaining_weight -= weight;
1001 }
1002 None
1003 }
1004
    /// Bookkeeping done when adding a new sample or removing an existing one.
    ///
    /// Updates min/max, the sample count, total weight, weighted sum,
    /// the running variance accumulator, and the all-integers flag.
    fn adjust_aggregates(&mut self, sample: f64, weight: f64) {
        self.adjust_minimum(sample);
        self.adjust_maximum(sample);
        let weight_is_finite = weight.is_finite();
        let sample_is_finite = sample.is_finite();
        if weight < 0.0 && weight_is_finite {
            // Negative weights are interpreted as a removal.
            self.total_count -= 1;
        }
        else {
            self.total_count += 1;
        }
        if sample_is_finite && weight_is_finite {
            // Weighted Welford-style incremental variance: the accumulator grows by
            // weight * (x - mean_before) * (x - mean_after), so the mean must be
            // sampled both before and after the weight/sum update. Order matters here.
            let previous_mean = self.mean().unwrap_or_default();
            self.total_weight += weight;
            self.weighted_sum += weight * sample;
            let current_mean = self.mean().unwrap_or_default();
            self.running_variance += weight * (sample - previous_mean) * (sample - current_mean);
        }

        if sample_is_finite && sample.fract() != 0.0 {
            // Once we add numbers with fractions, assume that the quantile can have a fraction.
            self.only_integers = false;
        }
    }
1031
1032 /// Restrict a value based on preset overflow and underflow values.
1033 /// Values that underflow are set to zero.
1034 /// Values that overflow are set to +/- infinity.
1035 fn over_or_underflow(&self, sample: f64) -> f64 {
1036 if sample > self.negative_underflow && sample < self.positive_underflow {
1037 0.0
1038 }
1039 else if sample > self.positive_overflow {
1040 f64::INFINITY
1041 }
1042 else if sample < self.negative_overflow {
1043 f64::NEG_INFINITY
1044 }
1045 else {
1046 sample
1047 }
1048 }
1049
1050 /// If the new value is finite and less than the minimum
1051 /// (or the minimum is unset, hence a NAN), update the minimum.
1052 fn adjust_minimum(&mut self, sample: f64) {
1053 if self.minimum.is_nan() {
1054 if sample.is_finite() {
1055 self.minimum = sample;
1056 }
1057 }
1058 else if sample < self.minimum && sample.is_finite() {
1059 self.minimum = sample;
1060 }
1061 }
1062
1063 /// If the new value is finite and greater than the maximum
1064 /// (or the maximum is unset, hence a NAN), update the maximum.
1065 fn adjust_maximum(&mut self, sample: f64) {
1066 if self.maximum.is_nan() {
1067 if sample.is_finite() {
1068 self.maximum = sample;
1069 }
1070 }
1071 else if sample > self.maximum && sample.is_finite() {
1072 self.maximum = sample;
1073 }
1074 }
1075
    /// Offer a bin as a candidate for the mode, updating the mode cache
    /// when the bin's weight qualifies it.
    fn adjust_mode(
        &mut self,
        bin_key: isize,
        bin_low_value: f64,
        bin_midpoint: f64,
        bin_high_value: f64,
        bin_weight: f64) {
        // TODO: Add a configuration parameter here and in QuantogramBuilder
        // to enable/disable mode calculations.
        // It is a drag on performance and not all callers need the mode.

        // NOTE: Do not update the mode if the weight is 1.
        // Otherwise if all values are distinct (common in unit tests)
        // then you store a huge list of modes. This kills performance, too.
        // (A NAN weight also fails the > 0.0 test and is skipped.)
        if bin_midpoint.is_finite() && bin_weight > 0.0 && bin_weight != 1.0 {
            self.mode_cache.update(bin_key, bin_low_value, bin_high_value, bin_weight);
        }
    }
1094
1095 /// Update bin weight for given key, or create new bin for key if none exists.
1096 /// Returns the updated weight.
1097 fn increment_key_weight(map: &mut SkipMap<isize, HistogramBin>, key: isize, bucket_sample: f64, accurate_sample: f64, added_weight: f64) -> f64 {
1098 match map.get_mut(&key) {
1099 Some(bucket) => {
1100 // When updating an existing bucket, use the bucket_sample value, which is the midpoint value,
1101 // unless the previous bucket used the same value as the accurate value.
1102 let new_bucket:HistogramBin;
1103 if bucket.sample == accurate_sample {
1104 new_bucket = HistogramBin::new(accurate_sample, bucket.weight + added_weight);
1105 bucket.set(new_bucket);
1106 }
1107 else {
1108 new_bucket = HistogramBin::new(bucket_sample, bucket.weight + added_weight);
1109 bucket.set(new_bucket);
1110 }
1111 new_bucket.weight
1112 },
1113 None => {
1114 // When creating a new bucket, use the accurate_sample.
1115 // This means that bins with a single sample are 100% accurate,
1116 // but when the second sample is added, shift to the midpoint for the bin.
1117 map.insert(key, HistogramBin::new(accurate_sample, added_weight));
1118 added_weight
1119 }
1120 }
1121 }
1122
1123 /// If only integers have been added to the histogram, round the value.
1124 fn conditional_round(&self, sample: f64) -> f64 {
1125 if self.only_integers && sample.is_finite() {
1126 sample.round()
1127 }
1128 else {
1129 sample
1130 }
1131 }
1132
    /// Sum of weights of all samples added, including NAN and infinities.
    ///
    /// total_weight only accumulates finite samples with finite weights
    /// (see adjust_aggregates), so the special-category weights are added back here.
    fn added_weight(&self) -> f64 {
        self.total_weight + self.nan_weight + self.plus_ininity_weight + self.minus_ininity_weight
    }
1137
1138
1139 /// Exponent on the largest power of two less than or equal to the magnitude of the given number,
1140 /// or None if a NAN, Infinity or Zero.
1141 ///
1142 /// ```
1143 /// use quantogram::Quantogram;
1144 /// assert_eq!(Quantogram::power_of_two(5.0).unwrap(), 2);
1145 /// assert_eq!(Quantogram::power_of_two(4.0).unwrap(), 2);
1146 /// assert_eq!(Quantogram::power_of_two(1.5).unwrap(), 0);
1147 /// assert_eq!(Quantogram::power_of_two(-8.5).unwrap(), 3);
1148 /// assert_eq!(Quantogram::power_of_two(0.0), None);
1149 /// assert_eq!(Quantogram::power_of_two(0.25).unwrap(), -2);
1150 /// ```
1151 pub fn power_of_two(sample: f64) -> Option<isize> {
1152 if sample.is_finite() && sample != 0.0 {
1153 let (_mantissa, exponent) = frexp(sample);
1154 Some(exponent - 1)
1155 }
1156 else {
1157 None
1158 }
1159 }
1160
1161 /// Break a sample into three parts:
1162 /// - sign is -1 for negative numbers or +1 for positive numbers
1163 /// - power is exponent on the largest power of two less than or equal to the absolute value of the number.
1164 /// - remainder is the mantissa multiplied by two, which is a number in the semi-open range [1.0,2.0)
1165 ///
1166 /// Returns None if the sample is not finite or is zero.
1167 fn sign_power_remainder(sample: f64) -> Option<(isize, isize, f64)> {
1168 if sample.is_finite() && sample != 0.0 {
1169 let (mantissa, exponent) = frexp(sample);
1170 if sample > 0.0 {
1171 Some((1, exponent - 1, mantissa * 2.0))
1172 }
1173 else {
1174 Some((-1, exponent - 1, mantissa.abs() * 2.0))
1175 }
1176 }
1177 else {
1178 None
1179 }
1180 }
1181
1182 /// Raise two to the given power.
1183 #[inline(always)]
1184 fn two_power(power: isize) -> f64 {
1185 match power {
1186 0..=63 => (1_usize << power) as f64,
1187 64..=127 => (1_u128 << power) as f64,
1188 -63..=-1 => 1.0 / ((1_usize << -power) as f64),
1189 -126..=-64 => 1.0 / ((1_u128 << -power) as f64),
1190 _ => f64::NAN
1191 }
1192 }
1193
1194 /// Get the key that maps a sample to its coarse bin.
1195 /// For positive numbers, this is the floor of the log base 2 of the sample.
1196 /// For negative numbers, negate the power of two, so that negative numbers
1197 /// sort from low to high. (A negative number with a higher power of two is "smaller"
1198 /// or more negative than one with a lower power of two.)
1199 ///
1200 /// Returns a Tuple with the bin key and the midpoint value for the bin.
1201 fn get_coarse_key_with_midpoint(sample: f64) -> Option<(isize,f64)> {
1202 match Self::sign_power_remainder(sample) {
1203 Some((sign, power, _remainder)) => {
1204 let low = Self::two_power(power); // 2.0_f64.powi(power as i32);
1205 let high = low * 2.0;
1206 let bucket_middle = (sign as f64) * (low + high) / 2.0;
1207 // Multiply by the sign because negative numbers with large exponents sort before those with small exponents
1208 Some((sign * power, bucket_middle))
1209 },
1210 _ => None
1211 }
1212 }
1213
1214 /// Get the key that maps a number to its fine histogram bin given the desired exponential growth factor.
1215 /// If the growth factor is 1.02, there will be 35 bins per power of two.
1216 /// Thus bins_per_doubling must match the growth_factor:
1217 ///
1218 /// ```text
1219 /// growth^bins_per_doubling ~ 2.0.
1220 /// ```
1221 ///
1222 /// Bins are defined for zeroes, and finite positive and negative numbers.
1223 /// The key is related to the power of two and the position within the 35 bins per
1224 /// power of two range:
1225 /// ```text
1226 /// key = ((power + 126)*35 + bin + 1)*sign
1227 /// ```
1228 /// where:
1229 /// ```text
1230 /// power = the power of two, from -126 to 127
1231 /// bin = 0 to 34 (for growth = 1.02)
1232 /// sign = 1 for positive numbers, -1 for negative numbers.
1233 /// Zeroes are always stored at key = 0.
1234 /// ```
1235 /// Returns a Tuple with the bin key and the low, mid, and high point values for the bin.
1236 fn get_fine_key_with_midpoint(&self, sample: f64, growth: f64, bins_per_doubling: usize) -> Option<(isize,f64,f64,f64)> {
1237 match Self::sign_power_remainder(sample) {
1238 Some((sign, power, remainder)) => {
1239 // Use of system log is more accurate but slower.
1240 // This inaccuracy adds a slight bias towards zero.
1241 // Rare samples will be put into the next lower bin. Since that bin is slightly narrower,
1242 // this may actually reduce the error slightly as the estimate is pushed towards a closer bin center.
1243 // let bin = remainder.log(growth).floor() as isize;
1244 let bin = self.pade_approximate_log_floor(remainder);
1245
1246 let key = ((power + 126) * (bins_per_doubling as isize) + bin + 1) * sign;
1247 let two_power = Self::two_power(power); // f64::powi(2.0, power as i32);
1248 let bucket_low = two_power * self.growth_bin_power_memo[bin as usize]; // Memo of: f64::powi(growth, bin as i32);
1249 let bucket_high = bucket_low * growth;
1250 let bucket_middle = (sign as f64) * (bucket_low + bucket_high) / 2.0;
1251 Some((key, bucket_low, bucket_middle, bucket_high))
1252 },
1253 _ => {
1254 if sample == 0.0 {
1255 Some((0, 0.0, 0.0, 0.0))
1256 }
1257 else {
1258 None
1259 }
1260 }
1261 }
1262 }
1263
1264}
1265
impl Debug for Quantogram {
    /// Multi-line diagnostic dump of the configuration, aggregate statistics,
    /// and every occupied coarse and fine bin.
    fn fmt(&self, f: &mut Formatter) -> Result {
        // Each bin map is rendered as "key->sample:weight" lines inside parentheses.
        let mut fine = "(".to_owned();
        for (k, bucket) in self.fine_bins.iter() {
            fine = fine + &format!("\n  {}->{:.4}:{}", k, bucket.sample, bucket.weight);
        }
        fine = fine + "\n )";

        let mut coarse_plus = "(".to_owned();
        for (k, bucket) in self.positive_coarse_bins.iter() {
            coarse_plus = coarse_plus + &format!("\n  {}->{:.4}:{}", k, bucket.sample, bucket.weight);
        }
        coarse_plus = coarse_plus + "\n )";

        let mut coarse_minus = "(".to_owned();
        for (k, bucket) in self.negative_coarse_bins.iter() {
            coarse_minus = coarse_minus + &format!("\n  {}->{:.4}:{}", k, bucket.sample, bucket.weight);
        }
        coarse_minus = coarse_minus + "\n )";

        write!(f, "Quantogram.
    growth = {}, bins per doubling = {}
    total count = {}, weight = {}, sum = {}, variance = {:.3},
    NAN = {}, -Inf = {}, +Inf = {}, Integers? = {}
    - = {}, 0 = {}, + = {}
    min = {}, mean = {:?}, max = {}
    under/overflow = {}/{}
    coarse bins = 
      Minus {}
      Plus {}
    fine bins = {}
    ",
            self.growth, self.bins_per_doubling,
            self.total_count, self.total_weight, self.weighted_sum,
            self.running_variance,
            self.nan_weight, self.minus_ininity_weight, self.plus_ininity_weight, self.only_integers,
            self.negative_weight, self.zero_weight, self.positive_weight,
            self.minimum, self.mean(), self.maximum,
            self.positive_underflow.log2(), self.positive_overflow.log2(),
            coarse_minus,
            coarse_plus,
            fine
        )
    }
}
1311
/// A fluent API Builder for Quantograms.
///
/// If error, growth or bins_per_doubling are changed, the other two
/// are modified to be consistent with it. If error or growth imply a non-integral
/// value for bins_per_doubling, take the ceiling of the derived bins_per_ceiling
/// and rederive the other two from that, to satisfy a minimum guarantee.
///
/// No values may cause bins_per_doubling to exceed 1000, hence the error
/// may not be set lower than 0.0347%.
///
/// Example of creating a Quantogram with a maximum of 2% absolute relative error:
///
/// ```
/// use quantogram::QuantogramBuilder;
/// let q = QuantogramBuilder::new().with_error(0.02).build();
/// ```
///
/// This is the number of bins used for various error rates:
/// ```text
///   Bins  Error Rate
///   ----  ----------
///     2    17.2 %
///     3    11.5 %
///     4     8.6 %
///     5     6.9 %
///     7     4.9 %
///    12     2.9 %
///    18     1.9 %
///    35     1.0 %
///    70     0.5 %
///   139     0.25%
///   347     0.10%
///   694     0.05%
///   867     0.04%
///  1000     0.0347%
/// ```
#[derive(Copy, Clone, Debug)]
pub struct QuantogramBuilder {
    /// Desired absolute relative error rate.
    /// The valid range is a fraction in the range [0.00035,0.17].
    /// Defaults to approximately 0.01.
    /// Altering error changes growth and bins_per_doubling to be consistent.
    ///
    /// This is related to the number of bins per doubling (b)
    /// or growth via these formulas:
    /// ```text
    ///            1/b    
    ///           2    - 1      growth - 1
    ///   error = --------- =  ------------
    ///            1/b          growth + 1
    ///           2    + 1
    /// ```
    error : f64,

    /// Number of bins used to store values per power of two range of numbers.
    /// The valid range is [2,1000].
    /// The larger this value, the more memory is required.
    /// Defaults to 35 and must exceed one.
    /// Altering this changes error and growth to be consistent.
    ///
    /// The number of bins (b) is related to the growth factor
    /// via this formula:
    /// ```text
    ///              1/b
    ///   growth = 2
    /// ```
    bins_per_doubling : usize,

    /// Exponential scale factor relating start value of one bin to the next.
    /// The valid range is [1.00069,1.4143].
    /// Defaults to approximately 1.02.
    /// Altering growth changes error and bins_per_doubling to be consistent.
    /// ```text
    ///         b
    ///   growth  = 2
    /// ```
    growth : f64,

    /// A sample whose magnitude is less than this power of two will underflow and be considered a zero.
    /// Must be in the range [-126,127] but also be less than largest_power.
    /// Defaults to -126.
    smallest_power: isize,

    /// A sample whose magnitude is more than this power of two will overflow and be considered +/- Infinity.
    /// Must be in the range [-126,127] but also be more than smallest_power.
    /// Defaults to +127.
    largest_power: isize,

    /// Optional cache for half-sample mode estimation; a cleared clone of the
    /// cache supplied to with_hsm_cache is installed into the built Quantogram.
    hsm_cache: Option<HalfSampleModeCache>
}
1402
1403impl QuantogramBuilder {
1404 /// Create a new builder that defaults to an error rate of 1%
1405 /// with 35 bins per doubling and a growth factor of 1.02.
1406 pub fn new() -> Self {
1407 QuantogramBuilder {
1408 error: 0.01,
1409 bins_per_doubling: 35,
1410 growth: 1.02,
1411 smallest_power: -126,
1412 largest_power: 127,
1413 hsm_cache: None
1414 }
1415 }
1416
1417 /// Build a Quantogram using the collected configuration values.
1418 pub fn build(self) -> Quantogram {
1419 let mut q = Quantogram::with_configuration(self.growth, self.bins_per_doubling, self.smallest_power, self.largest_power);
1420 match self.hsm_cache {
1421 Some(cache) => { q.replace_hsm_cache(cache); },
1422 None => (),
1423 }
1424 q
1425 }
1426
1427 pub fn with_hsm_cache(mut self, hsm_cache: &HalfSampleModeCache)-> Self {
1428 let mut cleared_cache = hsm_cache.clone();
1429 cleared_cache.clear();
1430 self.hsm_cache = Some(cleared_cache);
1431 self
1432 }
1433
1434 /// Configure the underflow of samples.
1435 /// A sample whose magnitude has a power of two less than the given values will be set to zero.
1436 ///
1437 /// Example: If you only need to compute quantiles to two decimal places:
1438 /// ```text
1439 /// set power = -7, since 2^-7 = 1/128 < 0.01.
1440 /// ```
1441 pub fn with_smallest_power(mut self, power: isize)-> Self {
1442 self.smallest_power =
1443 if power < -126 { -126 }
1444 else if power >= 127 { 127 }
1445 else { power };
1446 self
1447 }
1448
1449 /// Configure the overflow of samples.
1450 /// A sample whose magnitude has a power of two greater than the given values will be set to +/- Infinity.
1451 ///
1452 /// Example: If you only need to study numbers below a thousand,
1453 /// set power = 10, since 2^10 = 1024 > 1000.
1454 pub fn with_largest_power(mut self, power: isize)-> Self {
1455 self.largest_power =
1456 if power <= -126 { -125 }
1457 else if power > 127 { 127 }
1458 else { power };
1459 self
1460 }
1461
1462 /// Configure to target the given maximum absolute relative error.
1463 pub fn with_error(mut self, error: f64) -> Self {
1464 (&mut self).set_from_error(error);
1465 self
1466 }
1467
1468 /// Configure to target the given growth factor from one bucket to the next.
1469 /// This will be adjusted to the nearest value that is an Nth root of 2.
1470 pub fn with_growth(mut self, growth: f64) -> Self {
1471 (&mut self).set_from_growth(growth);
1472 self
1473 }
1474
1475 /// Set bins_per_doubling, error, and growth to be consistent with the given value.
1476 /// Memory usage is proportional to this value, so this
1477 /// is the way to directly control memory usage.
1478 /// A bins_per_doubling value of 35 yields 1% error.
1479 pub fn with_bins_per_doubling(mut self, bins: usize) -> Self {
1480 (&mut self).set_from_bins(bins);
1481 self
1482 }
1483
1484 /// Set bins_per_doubling, error, and growth to be consistent with the given error value.
1485 /// This is the way to directly control error rate.
1486 /// A 1% error corresponds to a bins_per_doubling value of 35.
1487 fn set_from_error(&mut self, error: f64) {
1488 let adjusted_error =
1489 if error < 0.00035 { 0.00035 }
1490 else if error > 0.17 { 0.17 }
1491 else { error };
1492 let growth = (1.0 + adjusted_error) / (1.0 - adjusted_error);
1493 let fractional_bins = 1.0 / growth.log2();
1494 // Must make the number of bins an integer, then correct the other values.
1495 self.bins_per_doubling = fractional_bins.ceil() as usize;
1496 self.growth = 2.0_f64.powf(1.0/(self.bins_per_doubling as f64));
1497 self.error = (self.growth - 1.0) / (self.growth + 1.0);
1498 }
1499
1500 fn set_from_growth(&mut self, growth: f64) {
1501 let adjusted_growth =
1502 if growth < 1.00069 { 1.00069 }
1503 else if growth > 1.4143 { 1.4143 }
1504 else { growth };
1505 let fractional_bins = 1.0 / adjusted_growth.log2();
1506 // Must make the number of bins an integer, then correct the other values.
1507 self.bins_per_doubling = fractional_bins.ceil() as usize;
1508 self.growth = 2.0_f64.powf(1.0/(self.bins_per_doubling as f64));
1509 self.error = (self.growth - 1.0) / (self.growth + 1.0);
1510 }
1511
1512 /// Set bins_per_doubling, error, and growth to be consistent with the given bins value.
1513 /// This is the way to directly control memory usage.
1514 /// The maximum possible memory usage is linearly proportional to bins.
1515 /// Actual data usually has much redundancy, autocorrelation, and may obey Zipf's law,
1516 /// which will cause much less than the maximum storage in practice.
1517 ///
1518 /// A bins_per_doubling value of 35 corresponds to 1% error.
1519 ///
1520 /// Bins will be constrained to be in the range [2,1000].
1521 fn set_from_bins(&mut self, bins: usize) {
1522 let adjusted_bins =
1523 if bins < 2 { 2 }
1524 else if bins > 1000 { 1000 }
1525 else { bins };
1526 self.bins_per_doubling = adjusted_bins;
1527 self.growth = 2.0_f64.powf(1.0/(self.bins_per_doubling as f64));
1528 self.error = (self.growth - 1.0) / (self.growth + 1.0);
1529 }
1530}
1531
/// Information about one histogram bin needed to estimate the mode:
/// its key, value range and accumulated weight.
#[derive(Copy, Clone, Debug)]
struct BinInfo {
    /// Key of the bin within its SkipMap.
    pub bin_key : isize,
    /// Low end of the bin's value range.
    pub bin_low_value : f64,
    /// High end of the bin's value range.
    pub bin_high_value : f64,
    /// Total weight accumulated in the bin.
    pub bin_weight : f64
}
1539
1540impl BinInfo {
1541 pub fn bin_width(&self) -> f64 {
1542 self.bin_high_value - self.bin_low_value
1543 }
1544
1545 /// Fetch the weight of the indicated bin, or zero if it does not exist.
1546 pub fn get_key_weight(map: &SkipMap<isize, HistogramBin>, key: isize) -> f64 {
1547 match map.get(&key) {
1548 Some(bucket) => bucket.weight,
1549 None => 0.0
1550 }
1551 }
1552
1553 /// Estimate the mode of grouped data using the weights of
1554 /// the bins immediately below and above.
1555 /// If those bins are empty, it just takes the midpoint of this bin.
1556 ///
1557 /// ```text
1558 /// / f1 - f0 \
1559 /// Mode = L + |-----------------| h
1560 /// \ 2f1 - f0 - f2 /
1561 ///
1562 /// where:
1563 /// L is the lower limit of the modal class
1564 /// h is the size of the class interval
1565 /// f1 is the frequency of the modal class
1566 /// f0 is the frequency of the class preceding the modal class
1567 /// f2 is the frequency of the class succeeding the modal class
1568 /// ```
1569 pub fn mode_of_grouped_data(&self, bins: &SkipMap<isize, HistogramBin>) -> f64 {
1570 let f1 = self.bin_weight;
1571 let f0 = Self::get_key_weight(bins, self.bin_key - 1);
1572 let f2 = Self::get_key_weight(bins, self.bin_key + 1);
1573 let h = self.bin_width();
1574 let l = self.bin_low_value;
1575 let mode = l + h*(f1 - f0)/(2.0 * f1 - f0 - f2);
1576 mode
1577 }
1578}
1579
/// Maintain a cache of information from which one can estimate
/// the current values for Mode.
struct ModeCache {
    /// All bins currently tied for the heaviest weight (the modal classes).
    modal_classes : Vec<BinInfo>,
    /// Weight held by each of the modal classes.
    weight : f64
}
1586
1587impl ModeCache {
1588 pub fn new() -> Self {
1589 ModeCache {
1590 modal_classes: Vec::new(),
1591 weight : 0.0
1592 }
1593 }
1594
1595 /// Attempt to update the ModeCache with a new candidate for Mode.
1596 /// Return false if no change was made to the mode estimate.
1597 pub fn update(&mut self, bin_key : isize,
1598 bin_low_value : f64,
1599 bin_high_value : f64,
1600 bin_weight : f64) -> bool {
1601 let bin = BinInfo {
1602 bin_key : bin_key,
1603 bin_low_value : bin_low_value,
1604 bin_high_value : bin_high_value,
1605 bin_weight : bin_weight
1606 };
1607 if self.weight < bin_weight {
1608 // Replace old mode with a new mode.
1609 self.modal_classes = vec![bin];
1610 self.weight = bin_weight;
1611 true
1612 }
1613 else if self.weight == bin_weight {
1614 // A multi-modal collection of samples.
1615 self.modal_classes.push(bin);
1616 true
1617 }
1618 else {
1619 false
1620 }
1621 }
1622
1623 /// Return a list of all modes among the samples.
1624 /// If no samples have been added, the list is empty.
1625 pub fn mode(&self, bins: &SkipMap<isize, HistogramBin>) -> Vec<f64> {
1626 let mut modes = Vec::new();
1627 for bin in self.modal_classes.iter() {
1628 let mode_estimate = bin.mode_of_grouped_data(bins);
1629 modes.push(mode_estimate);
1630 }
1631 modes
1632 }
1633}
1634
1635#[cfg(test)]
1636mod tests {
1637 // Note this useful idiom: importing names from outer (for mod tests) scope.
1638 use super::*;
1639 use std::time::{Instant};
1640
1641 fn assert_close(x: f64, y: f64, epsilon: f64) {
1642 let delta = (x - y).abs();
1643 assert!(delta <= epsilon);
1644 }
1645
1646 /// Test data with a gap.
1647 fn gapped_quantogram() -> Quantogram {
1648 let mut q = Quantogram::new();
1649 let data = vec![1.0,2.0,3.0,4.0,5.0,95.0,96.0,97.0,98.0,99.0];
1650 q.add_unweighted_samples(data.iter());
1651 q
1652 }
1653
1654 fn randomish(bottom: f64, top:f64, count: usize) -> Vec<f64> {
1655 let mut data : Vec<f64> = Vec::new();
1656 let range = top - bottom;
1657 let phi: f64 = (1.0 + 5.0_f64.sqrt()) / 2.0;
1658 for i in 0..count {
1659 // Pseudo random Weyl sequence based on Golden ratio.
1660 let pseudo_rand = ((i as f64) * phi) % 1.0_f64;
1661 // By squaring pseudo_rand we get a non-uniform distribution,
1662 // which is more interesting.
1663 let sample = bottom + pseudo_rand * pseudo_rand * range;
1664 data.push(sample);
1665 }
1666 data
1667 }
1668
1669 fn assert_median(data: &mut Vec<f64>, accuracy: f64) {
1670 let mut q = Quantogram::new();
1671 q.add_unweighted_samples(data.iter());
1672 let actual_median = q.median().unwrap();
1673 data.sort_by(|a, b| a.partial_cmp(b).unwrap());
1674
1675 let expected_median = data[data.len() / 2]; // Ignore adjustment for data with an even number of elements.
1676 let abs_rel_error = (expected_median - actual_median) / actual_median;
1677 assert!(abs_rel_error <= accuracy);
1678 }
1679
1680 /// Add all data and then query a single median.
1681 fn profile_median(data: &mut Vec<f64>) -> (u128,u128) {
1682 let t1 = Instant::now();
1683 let mut q = Quantogram::new();
1684 q.add_unweighted_samples(data.iter());
1685 let _actual_median = q.median().unwrap();
1686 let quantogram_time = t1.elapsed().as_micros();
1687
1688 let t2 = Instant::now();
1689 let mut data_copy = Vec::new();
1690 for sample in data {
1691 data_copy.push(*sample);
1692 }
1693 data_copy.sort_by(|a, b| a.partial_cmp(b).unwrap());
1694 let _expected_median = data_copy[data_copy.len() / 2]; // Ignore adjustment for data with an even number of elements.
1695 let naive_time = t2.elapsed().as_micros();
1696
1697 (quantogram_time, naive_time)
1698 }
1699
1700 /// Add all data and perform a median after each insertion.
1701 fn profile_many_medians(data: &mut Vec<f64>) -> (u128,u128) {
1702 let t1 = Instant::now();
1703 let mut q = Quantogram::new();
1704 for sample in data.iter() {
1705 q.add(*sample);
1706 let _actual_median = q.median().unwrap();
1707 }
1708 let quantogram_time = t1.elapsed().as_micros();
1709
1710 let t2 = Instant::now();
1711 let mut data_copy = Vec::new();
1712 for sample in data {
1713 data_copy.push(*sample);
1714 data_copy.sort_by(|a, b| a.partial_cmp(b).unwrap());
1715 let _expected_median = data_copy[data_copy.len() / 2]; // Ignore adjustment for data with an even number of elements.
1716 }
1717 let naive_time = t2.elapsed().as_micros();
1718
1719 (quantogram_time, naive_time)
1720 }
1721
/// Relative error `|expected - actual| / |expected|`, always non-negative.
///
/// The previous version divided by the signed `expected`, which produced a
/// negative "absolute" error whenever `expected` was negative, making
/// comparisons like `err <= tolerance` pass vacuously. Taking the absolute
/// value of the denominator fixes that; results for positive `expected`
/// values are unchanged. Note: returns a non-finite value when `expected`
/// is 0.0, as before.
fn absolute_relative_error(expected: f64, actual: f64) -> f64 {
    (expected - actual).abs() / expected.abs()
}
1725
#[test]
fn test_min() {
    // The smallest sample in the gapped fixture is 1.0.
    let q = gapped_quantogram();
    assert_eq!(q.min(), Some(1.0));
}
1728
#[test]
fn test_max() {
    // The largest sample in the gapped fixture is 99.0.
    let q = gapped_quantogram();
    assert_eq!(q.max(), Some(99.0));
}
1731
#[test]
fn test_mean() {
    // The gapped fixture's samples average exactly 50.
    let q = gapped_quantogram();
    assert_eq!(q.mean(), Some(50.0));
}
1734
#[test]
fn test_mode_unimodal() {
    // 4.0 occurs three times, more often than any other sample.
    let samples = [1.0, 2.0, 3.0, 3.0, 4.0, 4.0, 4.0, 5.0, 6.0];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());
    assert_eq!(q.mode(), vec![4.0]);
}
1742
#[test]
fn test_mode_multimodal() {
    // 3.0 and 4.0 each occur three times, so both are modes.
    let samples = [1.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 5.0, 6.0];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());
    assert_eq!(q.mode(), vec![3.0, 4.0]);
}
1750
/// Verify that a false outlier mode at zero is accepted by mode but rejected by hsm.
#[test]
fn test_hsm() {
    // Build the same multiset as (value, repeat-count) pairs: zero occurs
    // most often (7 times) but sits apart from the dense cluster near 10.
    let counts: [(f64, usize); 15] = [
        (0.0, 7), (1.0, 1), (2.0, 2), (3.0, 1), (4.0, 3),
        (5.0, 2), (6.0, 4), (7.0, 3), (8.0, 4), (9.0, 5),
        (10.0, 6), (11.0, 3), (12.0, 3), (13.0, 1), (14.0, 1),
    ];
    let data: Vec<f64> = counts
        .iter()
        .flat_map(|&(value, count)| std::iter::repeat(value).take(count))
        .collect();
    let mut q = Quantogram::new();
    q.add_unweighted_samples(data.iter());
    // The half-sample mode ignores the outlier spike at zero...
    assert_eq!(q.hsm(), Some(10.0));
    // ...while the plain mode is fooled by it.
    assert_eq!(q.mode(), vec![0.0]);
}
1776
#[test]
fn test_count() {
    // An empty histogram holds no samples; the gapped fixture holds ten.
    let empty = Quantogram::new();
    assert_eq!(empty.count(), 0);
    assert_eq!(gapped_quantogram().count(), 10);
}
1782
#[test]
fn test_median() {
    // The middle sample of the gapped fixture is 5.0.
    let median = gapped_quantogram().median();
    assert_eq!(median, Some(5.0));
}
1785
1786
#[test]
fn test_unweighted_standard_deviation() {
    let scores = [85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(scores.iter());

    // Reference value computed independently for this sample set.
    assert_close(q.stddev().unwrap(), 8.698403429493382, 0.00000000001);
}
1798
#[test]
fn test_weighted_standard_deviation() {
    // Same population as the unweighted stddev test, except that 85 and 81
    // are each added once with weight two rather than twice with weight one.
    let mut q = Quantogram::new();
    q.add_unweighted_samples([86, 100, 76, 93, 84, 99, 71, 69, 93, 87, 89].iter());
    q.add_weighted(85.0, 2.0);
    q.add_weighted(81.0, 2.0);

    assert_close(q.stddev().unwrap(), 8.69840342949338, 0.00000000001);
}
1812
/// Removing values may mess up the standard deviation, but this case passes.
#[test]
fn test_unweighted_standard_deviation_with_remove() {
    // Spurious values 123 and 150 go in up front and are then removed,
    // leaving the same population as the plain unweighted stddev test.
    let with_outliers = [123, 150, 85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(with_outliers.iter());
    q.remove(150.0);
    q.remove(123.0);

    assert_close(q.stddev().unwrap(), 8.698403429493382, 0.00000000001);
}
1827
#[test]
fn test_quantile() {
    // Extremes map to min and max; 0.75 lands on the second-highest sample.
    let cases = [(0.0, 1.0), (0.75, 96.0), (1.0, 99.0)];
    for &(phi, expected) in cases.iter() {
        assert_eq!(gapped_quantogram().quantile(phi).unwrap(), expected);
    }
}
1834
#[test]
fn test_quantile_at() {
    // Average the lower and upper quantile bounds reported for a sample.
    let midpoint = |range: Option<(f64, f64)>| {
        let (low, high) = range.unwrap();
        (low + high) / 2.0
    };
    let mut q = Quantogram::new();
    let data: Vec<f64> = (0..=10).map(f64::from).collect();
    q.add_unweighted_samples(data.iter());
    assert_eq!(0.5, midpoint(q.quantile_at(5.0)));
    assert_eq!(17.0 / 22.0, midpoint(q.quantile_at(8.0)));
}
1847
#[test]
fn test_fussy_median() {
    // With a gap-detection threshold of 2.0, the fussy median of the
    // gapped fixture resolves to the center of the gap, 50.0.
    let fussy = gapped_quantogram().fussy_quantile(0.5, 2.0);
    assert_eq!(fussy, Some(50.0));
}
1852
#[test]
fn test_remove() {
    // Dropping the two largest samples shifts the median down to 4.
    let mut q = gapped_quantogram();
    for sample in [99.0, 98.0].iter() {
        q.remove(*sample);
    }
    assert_eq!(q.median(), Some(4.0));
}
1860
#[test]
fn test_quantiles_with_negatives() {
    // A sample set that is symmetric about zero: each positive value is
    // mirrored by its negation, plus a single zero in the middle.
    let magnitudes = [1.0, 2.0, 3.0, 4.0, 5.0, 95.0, 96.0, 97.0, 98.0, 99.0];
    let mut data = vec![0.0];
    for &m in magnitudes.iter() {
        data.push(-m);
        data.push(m);
    }
    let mut q = Quantogram::new();
    q.add_unweighted_samples(data.iter());

    assert_eq!(q.median().unwrap(), 0.0);
    assert_eq!(q.quantile(0.4).unwrap(), -2.0);
    assert_eq!(q.quantile(0.6).unwrap(), 2.0);
}
1871
#[test]
fn test_weighted_quantile() {
    // Six units of weight at zero pull the median down to 2.
    let mut q = Quantogram::new();
    q.add_unweighted_samples([1.0, 2.0, 3.0, 4.0, 5.0, 95.0, 96.0, 97.0, 98.0, 99.0].iter());
    q.add_weighted(0.0, 6.0);

    assert_eq!(q.median(), Some(2.0));
}
1881
#[test]
fn test_coeff_of_range() {
    let samples = [123, 150, 85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());

    // Coefficient of range = (max - min) / (max + min), with max 150, min 69.
    let expected = (150.0 - 69.0) / (150.0 + 69.0);
    assert_close(q.coeff_of_range().unwrap(), expected, 0.00000000001);
}
1892
#[test]
fn test_coeff_of_quartile_deviation() {
    let samples = [1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13];
    let mut q = Quantogram::new();
    q.add_unweighted_samples(samples.iter());

    // Coefficient of quartile deviation = (Q3 - Q1) / (Q3 + Q1),
    // with Q3 = 10 and Q1 = 4 for this sample set.
    let expected = (10.0 - 4.0) / (10.0 + 4.0);
    assert_close(q.coeff_of_quartile_dev().unwrap(), expected, 0.00000000001);
}
1903
#[test]
fn test_quartiles() {
    let mut q = Quantogram::new();
    q.add_unweighted_samples([1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13].iter());

    // First and third quartiles of the sixteen samples.
    let tolerance = 0.00000000001;
    assert_close(q.q1().unwrap(), 4.0, tolerance);
    assert_close(q.q3().unwrap(), 10.0, tolerance);
}
1914
#[test]
fn test_coeff_of_stddev() {
    let mut q = Quantogram::new();
    q.add_unweighted_samples([85, 86, 100, 76, 81, 93, 84, 99, 71, 69, 93, 85, 81, 87, 89].iter());

    // Reference statistics computed independently for this sample set.
    let stddev = 8.698403429493382;
    let mean = 85.266666666666667;
    let tolerance = 0.00000000001;

    // coeff_of_stddev is stddev/mean; coeff_of_variation is the same ratio
    // expressed as a percentage.
    assert_close(q.coeff_of_stddev().unwrap(), stddev / mean, tolerance);
    assert_close(q.coeff_of_variation().unwrap(), 100.0 * stddev / mean, tolerance);
}
1936
#[test]
fn test_finite() {
    let mut q = Quantogram::new();

    // With only a NAN present, no sample is finite.
    q.add(f64::NAN);
    assert_eq!(q.finite(), 0.0);

    // After adding four finite samples, 4 of 5 samples are finite.
    q.add_unweighted_samples([1.0, 2.0, 3.0, 4.0].iter());
    assert_eq!(q.finite(), 0.8);
}
1947
#[test]
fn test_nan() {
    let mut q = Quantogram::new();

    // With only a NAN present, the NAN fraction is 1.
    q.add(f64::NAN);
    assert_eq!(q.nan(), 1.0);

    // After adding four finite samples, 1 of 5 samples is a NAN.
    q.add_unweighted_samples([1.0, 2.0, 3.0, 4.0].iter());
    assert_eq!(q.nan(), 0.2);
}
1958
#[test]
fn test_size() {
    let mut q = Quantogram::new();

    // Even when empty, storage never drops below 7 because counts for
    // NANs, zeroes, etc. are always recorded.
    assert_eq!(q.size(), 7);

    // The first sample adds one bin to the coarse SkipMap and one to the fine one.
    q.add(1.0);
    assert_eq!(q.size(), 9);

    // A duplicate sample reuses its existing bins, so storage is unchanged.
    q.add(1.0);
    assert_eq!(q.size(), 9);

    // 1.5 shares a coarse bin with 1.0 but needs its own fine bin,
    // growing storage by one rather than two.
    q.add(1.5);
    assert_eq!(q.size(), 10);
}
1978
/// Verify that storage grows with the logarithm of the number of samples.
#[test]
fn test_sparsity() {
    // A million samples span about 20 powers of two. With the default of
    // 35 fine bins per power of two, expect 7 default bins, 20 coarse bins
    // and at most 700 fine bins. (The measured size is 577.)
    let mut q = Quantogram::new();
    (1..1000000).for_each(|i| q.add(i as f64));
    assert!(q.size() < 600);

    // Despite the compression, the median stays within 1% of the true value.
    assert!(absolute_relative_error(q.median().unwrap(), 500000.0) <= 0.01);
}
1994
#[test]
fn test_with_constrained_range() {
    // Constrain the histogram to 2^-10 ..= 2^10: smaller samples underflow
    // to zero, larger ones overflow to infinity.
    let mut q = QuantogramBuilder::new()
        .with_smallest_power(-10)
        .with_largest_power(10)
        .build();
    let samples = [
        0.0001, 0.00056,           // will underflow and become zeroes
        0.002, 16.0, 1000.0,       // within range
        6543.0, 15000.0, 2400000.0 // will overflow and become Infinity
    ];
    q.add_unweighted_samples(samples.iter());

    assert_eq!(q.min(), Some(0.0));
    assert_eq!(q.finite(), 0.625); // 5 of 8 samples stay finite (zeroes count)
    assert_eq!(q.zero(), 0.25);    // 2 of 8 samples underflowed to zero
    assert_eq!(q.median(), Some(0.002));
}
2012
#[test]
fn test_accuracy() {
    // The median of 100,000 randomish samples must be within 1% relative error.
    let mut samples = randomish(-100.0, 2000000.0, 100000);
    assert_median(&mut samples, 0.01);
}
2018
#[test]
fn test_insert_heavy_speed() {
    // The naive median pays an N log N sort, so as N grows the histogram
    // should gain ground: the quantogram/naive time ratio on the large data
    // set must beat the small set's ratio by a healthy margin.
    let mut small = randomish(-10000.0, 10000.0, 10000);
    let mut large = randomish(-10000.0, 10000.0, 1000000);
    let (small_quant, small_naive) = profile_median(&mut small);
    let (large_quant, large_naive) = profile_median(&mut large);
    let small_ratio = (small_quant as f64) / (small_naive as f64);
    let large_ratio = (large_quant as f64) / (large_naive as f64);
    println!("Short Ratio: {} Long ratio: {}", small_ratio, large_ratio);
    assert!(small_ratio > 1.4 * large_ratio);
}
2036
#[test]
fn test_median_heavy_speed() {
    // Querying the median after every insertion favors the histogram even
    // more strongly as the sample count grows tenfold, since the naive
    // baseline re-sorts on each insertion.
    let mut small = randomish(-10000.0, 10000.0, 2000);
    let mut large = randomish(-10000.0, 10000.0, 20000);
    let (small_quant, small_naive) = profile_many_medians(&mut small);
    let (large_quant, large_naive) = profile_many_medians(&mut large);
    let small_ratio = (small_quant as f64) / (small_naive as f64);
    let large_ratio = (large_quant as f64) / (large_naive as f64);
    println!("Short Ratio: {} Long ratio: {}", small_ratio, large_ratio);
    assert!(small_ratio > 8.0 * large_ratio);
}
2049
#[test]
fn test_basics() {
    // Insert five samples out of order and check the summary statistics.
    let mut q = Quantogram::new();
    for sample in [10.0, 40.0, 20.0, 30.0, 50.0].iter() {
        q.add(*sample);
    }

    assert_eq!(q.count(), 5);
    assert_eq!(q.min(), Some(10.0));
    assert_eq!(q.max(), Some(50.0));
    assert_eq!(q.mean(), Some(30.0));
    assert_eq!(q.median(), Some(30.0));
    assert_eq!(q.quantile(0.75), Some(40.0));

    // Removing the two smallest samples raises the mean to 40.
    q.remove(10.0);
    q.remove(20.0);
    assert_eq!(q.mean(), Some(40.0));
}
2070
#[test]
fn quantile_at_nan() {
    // Asking where NAN ranks among the samples has no meaningful answer.
    let mut q = Quantogram::new();
    q.add(10.0);
    let result = q.quantile_at(f64::NAN);
    assert!(result.is_none());
}
2077
2078}
2079