pdatastructs/tdigest.rs
1//! TDigest implementation.
2use std::cell::RefCell;
3use std::f64;
4use std::fmt::Debug;
5
/// A centroid summarizes a group of samples by their weighted sum and total
/// weight; its position is the weighted mean (`sum / count`).
#[derive(Clone, Debug)]
struct Centroid {
    /// Weighted sum of all samples merged into this centroid.
    sum: f64,
    /// Total weight merged into this centroid.
    count: f64,
}
11
12impl Centroid {
13 fn fuse(&self, other: &Self) -> Self {
14 Self {
15 count: self.count + other.count,
16 sum: self.sum + other.sum,
17 }
18 }
19
20 fn mean(&self) -> f64 {
21 self.sum / self.count
22 }
23}
24
/// Scale function used to compress a T-Digest.
///
/// A scale function maps a quantile `q` (in `[0, 1]`) to a scaled coordinate `k`;
/// during a merge, neighboring centroids are fused as long as they span at most
/// one unit in `k`-space. Steeper regions of the function therefore produce
/// more (and smaller) centroids, i.e. higher resolution.
pub trait ScaleFunction {
    /// Delta / compression factor.
    fn delta(&self) -> f64;

    /// Calculate scaled value `k` for given value `q` and `n` (number of samples).
    ///
    /// `q` will be limited to the range `[0, 1]`.
    fn f(&self, q: f64, n: usize) -> f64;

    /// Calculate unscaled value `q` for given value `k` and `n` (number of samples).
    ///
    /// `k` will be limited to the range `[f(0), f(1)]`.
    fn f_inv(&self, k: f64, n: usize) -> f64;
}
40
41/// k0 as mentioned in the paper, equivalent to `\delta / 2 * q`.
42///
43/// The following plot shows this function for a compression
44/// factor of 10:
45/// ```text
46/// 5 +---------------------------------------------------+
47/// | + + + + **** |
48/// | **** |
49/// 4 |-+ **** +-|
50/// | *** |
51/// | **** |
52/// 3 |-+ **** +-|
53/// | *** |
54/// | *** |
55/// | *** |
56/// 2 |-+ **** +-|
57/// | **** |
58/// | *** |
59/// 1 |-+ **** +-|
60/// | **** |
61/// | **** + + + + |
62/// 0 +---------------------------------------------------+
63/// 0 0.2 0.4 0.6 0.8 1
64/// ```
#[derive(Clone, Copy, Debug)]
pub struct K0 {
    /// Compression factor; larger values allow more (and smaller) centroids.
    delta: f64,
}
69
70impl K0 {
71 /// Create new K0 scale function with given compression factor.
72 pub fn new(delta: f64) -> Self {
73 assert!(
74 (delta > 1.) && delta.is_finite(),
75 "delta ({}) must be greater than 1 and finite",
76 delta
77 );
78 Self { delta }
79 }
80}
81
82impl ScaleFunction for K0 {
83 fn delta(&self) -> f64 {
84 self.delta
85 }
86
87 fn f(&self, q: f64, _n: usize) -> f64 {
88 let q = q.min(1.).max(0.);
89 self.delta / 2. * q
90 }
91
92 fn f_inv(&self, k: f64, _n: usize) -> f64 {
93 let k = k.min(self.delta / 2.).max(0.);
94 k * 2. / self.delta
95 }
96}
97
98/// k1 as mentioned in the paper, equivalent to `\delta / (2 * PI) * \asin(2q - 1)`.
99///
100/// The following plot shows this function for a compression
101/// factor of 10:
102///
103/// ```text
104/// 3 +---------------------------------------------------+
105/// | + + + + |
106/// | *|
107/// 2 |-+ ***|
108/// | **** |
109/// 1 |-+ ***** +-|
110/// | ****** |
111/// | ******* |
112/// 0 |-+ ******* +-|
113/// | ******* |
114/// | ****** |
115/// -1 |-+ ***** +-|
116/// | **** |
117/// -2 |*** +-|
118/// |* |
119/// | + + + + |
120/// -3 +---------------------------------------------------+
121/// 0 0.2 0.4 0.6 0.8 1
122/// ```
#[derive(Clone, Copy, Debug)]
pub struct K1 {
    /// Compression factor; larger values allow more (and smaller) centroids.
    delta: f64,
}
127
128impl K1 {
129 /// Create new K1 scale function with given compression factor.
130 pub fn new(delta: f64) -> Self {
131 assert!(
132 (delta > 1.) && delta.is_finite(),
133 "delta ({}) must be greater than 1 and finite",
134 delta
135 );
136 Self { delta }
137 }
138}
139
140impl ScaleFunction for K1 {
141 fn delta(&self) -> f64 {
142 self.delta
143 }
144
145 fn f(&self, q: f64, _n: usize) -> f64 {
146 let q = q.min(1.).max(0.);
147 self.delta / (2. * f64::consts::PI) * (2. * q - 1.).asin()
148 }
149
150 fn f_inv(&self, k: f64, _n: usize) -> f64 {
151 let range = 0.25 * self.delta;
152 let k = k.min(range).max(-range);
153 ((k * 2. * f64::consts::PI / self.delta).sin() + 1.) / 2.
154 }
155}
156
157/// k2 as mentioned in the paper, equivalent to `\delta / (4 * log(n / \delta) + 24) * log(q / (1 - q))`.
158///
159/// The following plot shows this function for a compression
160/// factor of 10 and 100 centroids:
161///
162/// ```text
163/// 1.5 +-------------------------------------------------+
164/// | + + + + *|
165/// | **|
166/// 1 |-+ **-|
167/// | *** |
168/// 0.5 |-+ **** +-|
169/// | ****** |
170/// | ******** |
171/// 0 |-+ ********* +-|
172/// | ******** |
173/// | ****** |
174/// -0.5 |-+ **** +-|
175/// | *** |
176/// -1 |-** +-|
177/// |** |
178/// |* + + + + |
179/// -1.5 +-------------------------------------------------+
180/// 0 0.2 0.4 0.6 0.8 1
181/// ```
#[derive(Clone, Copy, Debug)]
pub struct K2 {
    /// Compression factor; larger values allow more (and smaller) centroids.
    delta: f64,
}
186
187impl K2 {
188 /// Create new K2 scale function with given compression factor.
189 pub fn new(delta: f64) -> Self {
190 assert!(
191 (delta > 1.) && delta.is_finite(),
192 "delta ({}) must be greater than 1 and finite",
193 delta
194 );
195 Self { delta }
196 }
197
198 fn x(&self, n: usize) -> f64 {
199 self.delta / (4. * ((n as f64) / self.delta).ln() + 24.)
200 }
201
202 fn z(&self, k: f64, n: usize) -> f64 {
203 (k / self.x(n)).exp()
204 }
205}
206
207impl ScaleFunction for K2 {
208 fn delta(&self) -> f64 {
209 self.delta
210 }
211
212 fn f(&self, q: f64, n: usize) -> f64 {
213 let q = q.min(1.).max(0.);
214 self.x(n) * (q / (1. - q)).ln()
215 }
216
217 fn f_inv(&self, k: f64, n: usize) -> f64 {
218 if k.is_infinite() {
219 if k > 0. {
220 1.
221 } else {
222 0.
223 }
224 } else {
225 let z = self.z(k, n);
226 z / (z + 1.)
227 }
228 }
229}
230
231/// k3 as mentioned in the paper, equivalent to `\delta / (4 * log(n / \delta) + 21) * f(q)` with:
232///
233/// - `log(2 * q)` if `q <= 0.5`
234/// - `-log(2 * (1 - q))` if `q > 0.5`
235///
236/// The following plot shows this function for a compression
237/// factor of 10 and 100 centroids:
238///
239/// ```text
240/// 1.5 +-------------------------------------------------+
241/// | + + + + *|
242/// | *|
243/// 1 |-+ *-|
244/// | ** |
245/// 0.5 |-+ *** +-|
246/// | ***** |
247/// | ********* |
248/// 0 |-+ ************* +-|
249/// | ********* |
250/// | ***** |
251/// -0.5 |-+ *** +-|
252/// | ** |
253/// -1 |-* +-|
254/// |* |
255/// |* + + + + |
256/// -1.5 +-------------------------------------------------+
257/// 0 0.2 0.4 0.6 0.8 1
258/// ```
#[derive(Clone, Copy, Debug)]
pub struct K3 {
    /// Compression factor; larger values allow more (and smaller) centroids.
    delta: f64,
}
263
264impl K3 {
265 /// Create new K3 scale function with given compression factor.
266 pub fn new(delta: f64) -> Self {
267 assert!(
268 (delta > 1.) && delta.is_finite(),
269 "delta ({}) must be greater than 1 and finite",
270 delta
271 );
272 Self { delta }
273 }
274
275 fn x(&self, n: usize) -> f64 {
276 self.delta / (4. * ((n as f64) / self.delta).ln() + 21.)
277 }
278}
279
280impl ScaleFunction for K3 {
281 fn delta(&self) -> f64 {
282 self.delta
283 }
284
285 fn f(&self, q: f64, n: usize) -> f64 {
286 let q = q.min(1.).max(0.);
287 let y = if q <= 0.5 {
288 (2. * q).ln()
289 } else {
290 -(2. * (1. - q)).ln()
291 };
292 self.x(n) * y
293 }
294
295 fn f_inv(&self, k: f64, n: usize) -> f64 {
296 if k.is_infinite() {
297 if k > 0. {
298 1.
299 } else {
300 0.
301 }
302 } else {
303 let x = self.x(n);
304 if k <= 0. {
305 (k / x).exp() / 2.
306 } else {
307 1. - (-k / x).exp() / 2.
308 }
309 }
310 }
311}
312
/// Inner data structure for tdigest to enable interior mutability.
#[derive(Clone, Debug)]
struct TDigestInner<S>
where
    S: Clone + Debug + ScaleFunction,
{
    /// Compressed centroids; sorted by mean after every merge.
    centroids: Vec<Centroid>,
    /// Number of `insert_weighted` calls so far; fed as `n` into the scale function.
    n_samples: usize,
    /// Smallest value ever inserted (+inf while empty).
    min: f64,
    /// Largest value ever inserted (-inf while empty).
    max: f64,
    /// Scale function controlling how aggressively centroids are fused.
    scale_function: S,
    /// Centroids not yet merged into `centroids`.
    backlog: Vec<Centroid>,
    /// Backlog length that triggers an automatic merge.
    max_backlog_size: usize,
}
327
328impl<S> TDigestInner<S>
329where
330 S: Clone + Debug + ScaleFunction,
331{
332 fn new(scale_function: S, max_backlog_size: usize) -> Self {
333 Self {
334 centroids: vec![],
335 n_samples: 0,
336 min: f64::INFINITY,
337 max: f64::NEG_INFINITY,
338 scale_function,
339 backlog: vec![],
340 max_backlog_size,
341 }
342 }
343
344 fn is_empty(&self) -> bool {
345 self.centroids.is_empty() && self.backlog.is_empty()
346 }
347
348 fn clear(&mut self) {
349 self.centroids.clear();
350 self.min = f64::INFINITY;
351 self.max = f64::NEG_INFINITY;
352 self.backlog.clear();
353 }
354
355 fn insert_weighted(&mut self, x: f64, w: f64) {
356 self.backlog.push(Centroid {
357 count: w,
358 sum: x * w,
359 });
360 self.n_samples += 1;
361
362 self.min = self.min.min(x);
363 self.max = self.max.max(x);
364
365 if self.backlog.len() > self.max_backlog_size {
366 self.merge();
367 }
368 }
369
370 fn merge(&mut self) {
371 // early return in case nothing to be done
372 if self.backlog.is_empty() {
373 return;
374 }
375
376 // TODO: use sort_by_cached_key once stable
377 let mut x: Vec<(f64, Centroid)> = self
378 .centroids
379 .drain(..)
380 .chain(self.backlog.drain(..))
381 .map(|c| (c.mean(), c))
382 .collect();
383 x.sort_by(|t1, t2| t1.0.partial_cmp(&t2.0).unwrap());
384 let mut x: Vec<Centroid> = x.drain(..).map(|t| t.1).collect();
385
386 let s: f64 = x.iter().map(|c| c.count).sum();
387
388 let mut q_0 = 0.;
389 let mut q_limit = self.scale_function.f_inv(
390 self.scale_function.f(q_0, self.n_samples) + 1.,
391 self.n_samples,
392 );
393
394 let mut result = vec![];
395 let mut current = x[0].clone();
396 for next in x.drain(1..) {
397 let q = q_0 + (current.count + next.count) / s;
398 if q <= q_limit {
399 current = current.fuse(&next);
400 } else {
401 q_0 += current.count / s;
402 q_limit = self.scale_function.f_inv(
403 self.scale_function.f(q_0, self.n_samples) + 1.,
404 self.n_samples,
405 );
406 result.push(current);
407 current = next;
408 }
409 }
410 result.push(current);
411
412 self.centroids = result;
413 }
414
415 #[inline(always)]
416 fn interpolate(a: f64, b: f64, t: f64) -> f64 {
417 debug_assert!((0. ..=1.).contains(&t));
418 debug_assert!(a <= b);
419 t * b + (1. - t) * a
420 }
421
422 fn quantile(&self, q: f64) -> f64 {
423 // empty case
424 if self.centroids.is_empty() {
425 return f64::NAN;
426 }
427
428 let s: f64 = self.count();
429 let limit = s * q;
430
431 // left tail?
432 let c_first = &self.centroids[0];
433 if limit <= c_first.count * 0.5 {
434 let t = limit / (0.5 * c_first.count);
435 return Self::interpolate(self.min, c_first.mean(), t);
436 }
437
438 let mut cum = 0.;
439 for (i, c) in self.centroids.iter().enumerate() {
440 if cum + c.count * 0.5 >= limit {
441 // default case
442 debug_assert!(i > 0);
443 let c_last = &self.centroids[i - 1];
444 cum -= 0.5 * c_last.count;
445 let delta = 0.5 * (c_last.count + c.count);
446 let t = (limit - cum) / delta;
447 return Self::interpolate(c_last.mean(), c.mean(), t);
448 }
449 cum += c.count;
450 }
451
452 // right tail
453 let c_last = &self.centroids[self.centroids.len() - 1];
454 cum -= 0.5 * c_last.count;
455 let delta = s - 0.5 * c_last.count;
456 let t = (limit - cum) / delta;
457 Self::interpolate(c_last.mean(), self.max, t)
458 }
459
460 fn cdf(&self, x: f64) -> f64 {
461 // empty case
462 if self.centroids.is_empty() {
463 return 0.;
464 }
465 if x < self.min {
466 return 0.;
467 }
468
469 let s: f64 = self.count();
470
471 let mut cum = 0.;
472 let mut last_mean = self.min;
473 let mut last_cum = 0.;
474 for c in &self.centroids {
475 let current_cum = cum + 0.5 * c.count;
476 if x < c.mean() {
477 let delta = c.mean() - last_mean;
478 let t = (x - last_mean) / delta;
479 return Self::interpolate(last_cum, current_cum, t) / s;
480 }
481 last_cum = current_cum;
482 cum += c.count;
483 last_mean = c.mean();
484 }
485
486 if x < self.max {
487 let delta = self.max - last_mean;
488 let t = (x - last_mean) / delta;
489 Self::interpolate(last_cum, s, t) / s
490 } else {
491 1.
492 }
493 }
494
495 fn count(&self) -> f64 {
496 self.centroids.iter().map(|c| c.count).sum()
497 }
498
499 fn sum(&self) -> f64 {
500 self.centroids.iter().map(|c| c.sum).sum()
501 }
502}
503
504/// A TDigest is a data structure to capture quantile and CDF (cumulative distribution function)
505/// data for arbitrary distribution w/o the need to store the entire data and w/o requiring the
506/// user to know ranges beforehand. It can handle outliers and data w/ non-uniform densities (like
507/// mixed gaussian data w/ very different deviations).
508///
509/// # Examples
510/// ```
511/// use pdatastructs::tdigest::{K1, TDigest};
512/// use rand::{Rng, SeedableRng};
513/// use rand_distr::StandardNormal;
514/// use rand_chacha::ChaChaRng;
515///
516/// // Set up moderately compressed digest
517/// let compression_factor = 100.;
518/// let max_backlog_size = 10;
519/// let scale_function = K1::new(compression_factor);
520/// let mut digest = TDigest::new(scale_function, max_backlog_size);
521///
522/// // sample data from normal distribution
523/// let mut rng = ChaChaRng::from_seed([0; 32]);
524/// let n = 100_000;
525/// for _ in 0..n {
526/// let x = rng.sample(StandardNormal);
527/// digest.insert(x);
528/// }
529///
530/// // test some known quantiles
531/// assert!((-1.2816 - digest.quantile(0.10)).abs() < 0.01);
532/// assert!((-0.6745 - digest.quantile(0.25)).abs() < 0.01);
533/// assert!((0.0000 - digest.quantile(0.5)).abs() < 0.01);
534/// assert!((0.6745 - digest.quantile(0.75)).abs() < 0.01);
535/// assert!((1.2816 - digest.quantile(0.90)).abs() < 0.01);
536/// ```
537/// # Applications
538/// - capturing distribution information in big data applications
539/// - gathering quantiles for monitoring purposes (e.g. quantiles of web application response times)
540///
541/// # How It Works
542/// Initially, the digest is empty and has no centroids stored.
543///
544/// ## Insertion
545/// If a new element is presented to the digest, a temporary centroid w/ the given weight and
546/// position is created and added to a backlog. If the backlog reaches a certain size, it is merged
547/// into the digest.
548///
549/// ## Merge Procedure
550/// For the merge, all centroids (the existing ones and the ones in the backlog) are added to a
551/// single list and sorted by position (i.e. their mean which is `sum / count`). Then, the list is
552/// processed in linear order. Every centroid is merged w/ the next one if together they would stay
553/// under a certain size limit.
554///
555/// The size limit is determined by the compression factor and the total weight added to the
/// filter. The simplest thing is to use a uniform size limit for all centroids (also known as k0
557/// scale function), which is equivalent to `\delta / 2 * q` (`\delta` being the compression factor
558/// and `q` being the position of the centroid in the sorted list ranging from 0 to 1). The
559/// following plot shows k0 w/ a compression factor of 10:
560///
561/// ```text
562/// 5 +---------------------------------------------------+
563/// | + + + + **** |
564/// | **** |
565/// 4 |-+ **** +-|
566/// | *** |
567/// | **** |
568/// 3 |-+ **** +-|
569/// | *** |
570/// | *** |
571/// | *** |
572/// 2 |-+ **** +-|
573/// | **** |
574/// | *** |
575/// 1 |-+ **** +-|
576/// | **** |
577/// | **** + + + + |
578/// 0 +---------------------------------------------------+
579/// 0 0.2 0.4 0.6 0.8 1
580/// ```
581///
/// The problem here is that outliers can have a high influence on the guessed distribution. To
583/// account for this, there is another scale function k1 which is equivalent to
584/// `\delta / (2 * PI) * \asin(2q - 1)`. The following plot shows this function for a compression
585/// factor of 10:
586///
587/// ```text
588/// 3 +---------------------------------------------------+
589/// | + + + + |
590/// | *|
591/// 2 |-+ ***|
592/// | **** |
593/// 1 |-+ ***** +-|
594/// | ****** |
595/// | ******* |
596/// 0 |-+ ******* +-|
597/// | ******* |
598/// | ****** |
599/// -1 |-+ ***** +-|
600/// | **** |
601/// -2 |*** +-|
602/// |* |
603/// | + + + + |
604/// -3 +---------------------------------------------------+
605/// 0 0.2 0.4 0.6 0.8 1
606/// ```
607///
608/// ## Querying
609/// To query quantiles or CDF values, the sorted list of centroids is scanned until the right
610/// centroid is found. The data between two consecutive centroids is interpolated in a linear
611/// fashion. The same holds for the first and the last centroid where an interpolation with the
612/// measured minimum and maximum of the data takes place.
613///
614/// ## Example
615/// Imagine the following example of mixed gaussian distribution:
616///
617/// - 30% μ=-1.9 σ=0.2
618/// - 70% μ=0.7 σ=0.8
619///
620/// This would lead to the following probability density function (PDF):
621///
622/// ```text
623/// 0.6 +-------------------------------------------------+
624/// | +** + + + + |
625/// | ** |
626/// 0.5 |-+ * * +-|
627/// | * * |
628/// 0.4 |-+ * * +-|
629/// | * * |
630/// | * * ****** |
631/// 0.3 |-+ * * ** ** +-|
632/// | * * ** ** |
633/// | * * ** * |
634/// 0.2 |-+ * * ** * +-|
635/// | * * ** ** |
636/// 0.1 |-+ * * ** ** +-|
637/// | * * *** ** |
638/// | * + ****** + + + *** |
639/// 0 +-------------------------------------------------+
640/// -3 -2 -1 0 1 2 3
641/// ```
642///
643/// and to the following cumulative distribution function (CDF):
644///
645/// ```text
646/// 1 +-------------------------------------------------+
647/// | + + + + ***** |
648/// 0.9 |-+ *** +-|
649/// 0.8 |-+ *** +-|
650/// | ** |
651/// 0.7 |-+ ** +-|
652/// 0.6 |-+ ** +-|
653/// | ** |
654/// 0.5 |-+ *** +-|
655/// | *** |
656/// 0.4 |-+ **** +-|
657/// 0.3 |-+ ********* +-|
658/// | ** |
659/// 0.2 |-+ * +-|
660/// 0.1 |-+ * +-|
661/// | ** + + + + |
662/// 0 +-------------------------------------------------+
663/// -3 -2 -1 0 1 2 3
664/// ```
665///
666/// Extreme compression (factor is 10), resulting in 8 centroids:
667///
668/// ```text
669/// 1 +-------------------------------------------------+
670/// | + + + + *+ |
671/// 0.9 |-+ ****** +-|
672/// 0.8 |-+ * +-|
673/// | * |
674/// 0.7 |-+ ***** +-|
675/// 0.6 |-+ * +-|
676/// | * |
677/// 0.5 |-+ ******* +-|
678/// | * |
679/// 0.4 |-+ * +-|
680/// 0.3 |-+ * +-|
681/// | *********** |
682/// 0.2 |-+ *** +-|
683/// 0.1 |-+ * +-|
684/// | * + + + + |
685/// 0 +-------------------------------------------------+
686/// -3 -2 -1 0 1 2 3
687/// ```
688///
689/// High compression (factor is 20), resulting in 15 centroids:
690///
691/// ```text
692/// 1 +-------------------------------------------------+
693/// | + + + + *** + |
694/// 0.9 |-+ **** +-|
695/// 0.8 |-+ * +-|
696/// | ** |
697/// 0.7 |-+ * +-|
698/// 0.6 |-+ **** +-|
699/// | * |
700/// 0.5 |-+ *** +-|
701/// | * |
702/// 0.4 |-+ ******* +-|
703/// 0.3 |-+ * +-|
704/// | ******* |
705/// 0.2 |-+ ** +-|
706/// 0.1 |-+ * +-|
707/// | * + + + + |
708/// 0 +-------------------------------------------------+
709/// -3 -2 -1 0 1 2 3
710/// ```
711///
712/// Medium compression (factor is 50), resulting in 34 centroids:
713///
714/// ```text
715/// 1 +-------------------------------------------------+
716/// | + + + + ****+ |
717/// 0.9 |-+ *** +-|
718/// 0.8 |-+ * +-|
719/// | ** |
720/// 0.7 |-+ * +-|
721/// 0.6 |-+ *** +-|
722/// | * |
723/// 0.5 |-+ ** +-|
724/// | * |
725/// 0.4 |-+ ****** +-|
726/// 0.3 |-+ ***** +-|
727/// | **** |
728/// 0.2 |-+ * +-|
729/// 0.1 |-+ ** +-|
730/// | * + + + + |
731/// 0 +-------------------------------------------------+
732/// -3 -2 -1 0 1 2 3
733/// ```
734///
735/// Low compression (factor is 200), resulting in 127 centroids:
736///
737/// ```text
738/// 1 +-------------------------------------------------+
739/// | + + + + ***** |
740/// 0.9 |-+ ** +-|
741/// 0.8 |-+ *** +-|
742/// | * |
743/// 0.7 |-+ ** +-|
744/// 0.6 |-+ ** +-|
745/// | ** |
746/// 0.5 |-+ ** +-|
747/// | ** |
748/// 0.4 |-+ **** +-|
749/// 0.3 |-+ ******** +-|
750/// | ** |
751/// 0.2 |-+ * +-|
752/// 0.1 |-+ * +-|
753/// | * + + + + |
754/// 0 +-------------------------------------------------+
755/// -3 -2 -1 0 1 2 3
756/// ```
757///
758/// # See Also
759/// - `pdatastructs::reservoirsampling::ReservoirSampling`: Less complex data structure that (in
760/// case of a high sampling rate) can also be used to capture distribution information
761///
762/// # References
763/// - ["Computing Extremely Accurate Quantiles using t-Digests", T. Dunning, O. Ertl, 2018](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf)
764/// - [Python Implementation, C. Davidson-pilon, MIT License](https://github.com/CamDavidsonPilon/tdigest)
765/// - [Go Implementation, InfluxData, Apache License 2.0](https://github.com/influxdata/tdigest)
#[derive(Clone, Debug)]
pub struct TDigest<S>
where
    S: Clone + Debug + ScaleFunction,
{
    /// Shared mutable state; `RefCell` lets read-style queries trigger a merge.
    inner: RefCell<TDigestInner<S>>,
}
773
impl<S> TDigest<S>
where
    S: Clone + Debug + ScaleFunction,
{
    /// Create a new, empty TDigest w/ the following parameters:
    ///
    /// - `scale_function`: how many centroids (relative to the added weight) should be kept,
    ///   i.e. scale function higher delta (compression factor) mean more precise distribution
    ///   information but less performance and higher memory requirements.
    /// - `max_backlog_size`: maximum number of centroids to keep in a backlog before starting a
    ///   merge, i.e. higher numbers mean higher insertion performance but higher temporary memory
    ///   requirements.
    pub fn new(scale_function: S, max_backlog_size: usize) -> Self {
        Self {
            inner: RefCell::new(TDigestInner::new(scale_function, max_backlog_size)),
        }
    }

    /// Get compression factor of the TDigest.
    pub fn delta(&self) -> f64 {
        self.inner.borrow().scale_function.delta()
    }

    /// Get the maximum number of centroids to be stored in the backlog before starting a merge.
    pub fn max_backlog_size(&self) -> usize {
        self.inner.borrow().max_backlog_size
    }

    /// Check whether the digest has not received any positive weights yet.
    pub fn is_empty(&self) -> bool {
        self.inner.borrow().is_empty()
    }

    /// Clear data from digest as if there was no weight seen.
    pub fn clear(&mut self) {
        self.inner.borrow_mut().clear();
    }

    /// Get number of centroids tracked by the digest.
    pub fn n_centroids(&self) -> usize {
        // first apply compression so the backlog does not inflate the count
        self.inner.borrow_mut().merge();

        self.inner.borrow().centroids.len()
    }

    /// Insert new element into the TDigest.
    ///
    /// The implicit weight of the element is 1.
    ///
    /// Panics if the element is not finite.
    pub fn insert(&mut self, x: f64) {
        self.insert_weighted(x, 1.);
    }

    /// Insert a new element into the TDigest w/ the given weight.
    ///
    /// Panics if the element is not finite or if the weight is negative.
    pub fn insert_weighted(&mut self, x: f64, w: f64) {
        assert!(x.is_finite(), "x ({}) must be finite", x);
        assert!(
            (w >= 0.) && w.is_finite(),
            "w ({}) must be greater or equal than zero and finite",
            w
        );

        // early return for zero-weight; it would not influence the digest anyway
        if w == 0. {
            return;
        }

        self.inner.borrow_mut().insert_weighted(x, w)
    }

    /// Queries the quantile of the TDigest.
    ///
    /// If the digest is empty, NaN will be returned.
    ///
    /// If there are any unmerged centroids in the backlog, this will trigger a merge process.
    ///
    /// Panics if the quantile is not in range `[0, 1]`.
    pub fn quantile(&self, q: f64) -> f64 {
        assert!((0. ..=1.).contains(&q), "q ({}) must be in [0, 1]", q);

        // apply compression if required
        self.inner.borrow_mut().merge();

        // get quantile on immutable state
        self.inner.borrow().quantile(q)
    }

    /// Queries the cumulative distribution function (CDF) of the TDigest.
    ///
    /// If the digest is empty, 0 will be returned.
    ///
    /// If there are any unmerged centroids in the backlog, this will trigger a merge process.
    ///
    /// Panics if the given value is NaN.
    pub fn cdf(&self, x: f64) -> f64 {
        assert!(!x.is_nan(), "x must not be NaN");

        // apply compression if required
        self.inner.borrow_mut().merge();

        // get CDF on immutable state
        self.inner.borrow().cdf(x)
    }

    /// Give the number of elements in the digest.
    ///
    /// If weighted elements were used, this corresponds to the sum of all weights.
    ///
    /// If the digest is empty, 0 will be returned.
    pub fn count(&self) -> f64 {
        // apply compression if required
        self.inner.borrow_mut().merge();

        // get count on immutable state
        self.inner.borrow().count()
    }

    /// Give the (weighted) sum of all elements inserted into the digest.
    ///
    /// If the digest is empty, 0 will be returned.
    pub fn sum(&self) -> f64 {
        // apply compression if required
        self.inner.borrow_mut().merge();

        // get sum on immutable state
        self.inner.borrow().sum()
    }

    /// Give the (weighted) mean of all elements inserted into the digest.
    ///
    /// If the digest is empty, NaN will be returned.
    pub fn mean(&self) -> f64 {
        // apply compression if required
        self.inner.borrow_mut().merge();

        let inner = self.inner.borrow();
        inner.sum() / inner.count()
    }

    /// Give the minimum of all elements inserted into the digest.
    ///
    /// If the digest is empty, +Inf will be returned.
    pub fn min(&self) -> f64 {
        self.inner.borrow().min
    }

    /// Give the maximum of all elements inserted into the digest.
    ///
    /// If the digest is empty, -Inf will be returned.
    pub fn max(&self) -> f64 {
        self.inner.borrow().max
    }
}
931
932#[cfg(test)]
933mod tests {
934 use super::{ScaleFunction, TDigest, K0, K1, K2, K3};
935 use rand::{Rng, SeedableRng};
936 use rand_chacha::ChaChaRng;
937 use rand_distr::StandardNormal;
938 use std::f64;
939
940 #[test]
941 #[should_panic(expected = "delta (1) must be greater than 1 and finite")]
942 fn k0_new_panics_delta_a() {
943 K0::new(1.);
944 }
945
946 #[test]
947 #[should_panic(expected = "delta (inf) must be greater than 1 and finite")]
948 fn k0_new_panics_delta_b() {
949 K0::new(f64::INFINITY);
950 }
951
952 #[test]
953 fn k0_q0_a() {
954 let delta = 1.5;
955 let scale_function = K0::new(delta);
956 let q = 0.;
957 let n_samples = 1;
958
959 let k = scale_function.f(q, n_samples);
960 assert_eq!(k, 0.);
961
962 let q2 = scale_function.f_inv(k, n_samples);
963 assert_eq!(q2, q);
964 }
965
966 #[test]
967 fn k0_q0_b() {
968 let delta = 1.7;
969 let scale_function = K0::new(delta);
970 let q = 0.;
971 let n_samples = 1;
972
973 let k = scale_function.f(q, n_samples);
974 assert_eq!(k, 0.);
975
976 let q2 = scale_function.f_inv(k, n_samples);
977 assert_eq!(q2, q);
978 }
979
980 #[test]
981 fn k0_q1_a() {
982 let delta = 2.;
983 let scale_function = K0::new(delta);
984 let q = 1.;
985 let n_samples = 1;
986
987 let k = scale_function.f(q, n_samples);
988 assert_eq!(k, 1.);
989
990 let q2 = scale_function.f_inv(k, n_samples);
991 assert_eq!(q2, q);
992 }
993
994 #[test]
995 fn k0_q1_b() {
996 let delta = 10.;
997 let scale_function = K0::new(delta);
998 let q = 1.;
999 let n_samples = 1;
1000
1001 let k = scale_function.f(q, n_samples);
1002 assert_eq!(k, 5.);
1003
1004 let q2 = scale_function.f_inv(k, n_samples);
1005 assert_eq!(q2, q);
1006 }
1007
1008 #[test]
1009 fn k0_q_other() {
1010 let delta = 1.7;
1011 let scale_function = K0::new(delta);
1012 let q = 0.7;
1013 let n_samples = 1;
1014
1015 let k = scale_function.f(q, n_samples);
1016 assert_ne!(k, 0.);
1017
1018 let q2 = scale_function.f_inv(k, n_samples);
1019 assert_eq!(q2, q);
1020 }
1021
1022 #[test]
1023 fn k0_q_underflow() {
1024 let delta = 1.5;
1025 let scale_function = K0::new(delta);
1026 let q = -0.1;
1027 let n_samples = 1;
1028
1029 let ka = scale_function.f(q, n_samples);
1030 let kb = scale_function.f(0., n_samples);
1031 assert_eq!(ka, kb);
1032 assert_eq!(ka, 0.);
1033
1034 let q2a = scale_function.f_inv(ka - 0.1, n_samples);
1035 let q2b = scale_function.f_inv(ka, n_samples);
1036 assert_eq!(q2a, q2b);
1037 assert_eq!(q2a, 0.);
1038 }
1039
1040 #[test]
1041 fn k0_q_overflow() {
1042 let delta = 1.5;
1043 let scale_function = K0::new(delta);
1044 let q = 1.1;
1045 let n_samples = 1;
1046
1047 let ka = scale_function.f(q, n_samples);
1048 let kb = scale_function.f(1., n_samples);
1049 assert_eq!(ka, kb);
1050 assert_eq!(ka, delta / 2.);
1051
1052 let q2a = scale_function.f_inv(ka + 0.1, n_samples);
1053 let q2b = scale_function.f_inv(ka, n_samples);
1054 assert_eq!(q2a, q2b);
1055 assert_eq!(q2a, 1.);
1056 }
1057
1058 #[test]
1059 #[should_panic(expected = "delta (1) must be greater than 1 and finite")]
1060 fn k1_new_panics_delta_a() {
1061 K1::new(1.);
1062 }
1063
1064 #[test]
1065 #[should_panic(expected = "delta (inf) must be greater than 1 and finite")]
1066 fn k1_new_panics_delta_b() {
1067 K1::new(f64::INFINITY);
1068 }
1069
1070 #[test]
1071 fn k1_q05_a() {
1072 let delta = 1.5;
1073 let scale_function = K1::new(delta);
1074 let q = 0.5;
1075 let n_samples = 1;
1076
1077 let k = scale_function.f(q, n_samples);
1078 assert_eq!(k, 0.);
1079
1080 let q2 = scale_function.f_inv(k, n_samples);
1081 assert_eq!(q2, q);
1082 }
1083
1084 #[test]
1085 fn k1_q05_b() {
1086 let delta = 1.7;
1087 let scale_function = K1::new(delta);
1088 let q = 0.5;
1089 let n_samples = 1;
1090
1091 let k = scale_function.f(q, n_samples);
1092 assert_eq!(k, 0.);
1093
1094 let q2 = scale_function.f_inv(k, n_samples);
1095 assert_eq!(q2, q);
1096 }
1097
1098 #[test]
1099 fn k1_q_other() {
1100 let delta = 1.7;
1101 let scale_function = K1::new(delta);
1102 let q = 0.7;
1103 let n_samples = 1;
1104
1105 let k = scale_function.f(q, n_samples);
1106 assert_ne!(k, 0.);
1107
1108 let q2 = scale_function.f_inv(k, n_samples);
1109 assert_eq!(q2, q);
1110 }
1111
1112 #[test]
1113 fn k1_q_underflow() {
1114 let delta = 1.5;
1115 let scale_function = K1::new(delta);
1116 let q = -0.1;
1117 let n_samples = 1;
1118
1119 let ka = scale_function.f(q, n_samples);
1120 let kb = scale_function.f(0., n_samples);
1121 assert_eq!(ka, kb);
1122 assert_eq!(ka, -delta / 4.);
1123
1124 let q2a = scale_function.f_inv(ka - 0.1, n_samples);
1125 let q2b = scale_function.f_inv(ka, n_samples);
1126 assert_eq!(q2a, q2b);
1127 assert_eq!(q2a, 0.);
1128 }
1129
1130 #[test]
1131 fn k1_q_overflow() {
1132 let delta = 1.5;
1133 let scale_function = K1::new(delta);
1134 let q = 1.1;
1135 let n_samples = 1;
1136
1137 let ka = scale_function.f(q, n_samples);
1138 let kb = scale_function.f(1., n_samples);
1139 assert_eq!(ka, kb);
1140 assert_eq!(ka, delta / 4.);
1141
1142 let q2a = scale_function.f_inv(ka + 0.1, n_samples);
1143 let q2b = scale_function.f_inv(ka, n_samples);
1144 assert_eq!(q2a, q2b);
1145 assert_eq!(q2a, 1.);
1146 }
1147
1148 #[test]
1149 #[should_panic(expected = "delta (1) must be greater than 1 and finite")]
1150 fn k2_new_panics_delta_a() {
1151 K2::new(1.);
1152 }
1153
1154 #[test]
1155 #[should_panic(expected = "delta (inf) must be greater than 1 and finite")]
1156 fn k2_new_panics_delta_b() {
1157 K2::new(f64::INFINITY);
1158 }
1159
1160 #[test]
1161 fn k2_q05_a() {
1162 let delta = 1.5;
1163 let scale_function = K2::new(delta);
1164 let q = 0.5;
1165 let n_samples = 1;
1166
1167 let k = scale_function.f(q, n_samples);
1168 assert_eq!(k, 0.);
1169
1170 let q2 = scale_function.f_inv(k, n_samples);
1171 assert_eq!(q2, q);
1172 }
1173
1174 #[test]
1175 fn k2_q05_b() {
1176 let delta = 1.7;
1177 let scale_function = K2::new(delta);
1178 let q = 0.5;
1179 let n_samples = 1;
1180
1181 let k = scale_function.f(q, n_samples);
1182 assert_eq!(k, 0.);
1183
1184 let q2 = scale_function.f_inv(k, n_samples);
1185 assert_eq!(q2, q);
1186 }
1187
1188 #[test]
1189 fn k2_q_other() {
1190 let delta = 1.7;
1191 let scale_function = K2::new(delta);
1192 let q = 0.7;
1193 let n_samples = 1;
1194
1195 let k = scale_function.f(q, n_samples);
1196 assert_ne!(k, 0.);
1197
1198 let q2 = scale_function.f_inv(k, n_samples);
1199 assert_eq!(q2, q);
1200 }
1201
1202 #[test]
1203 fn k2_q_underflow() {
1204 let delta = 1.5;
1205 let scale_function = K2::new(delta);
1206 let q = -0.1;
1207 let n_samples = 1;
1208
1209 let ka = scale_function.f(q, n_samples);
1210 let kb = scale_function.f(0., n_samples);
1211 assert_eq!(ka, kb);
1212 assert_eq!(ka, -f64::INFINITY);
1213
1214 let q2a = scale_function.f_inv(ka - 0.1, n_samples);
1215 let q2b = scale_function.f_inv(ka, n_samples);
1216 assert_eq!(q2a, q2b);
1217 assert_eq!(q2a, 0.);
1218 }
1219
1220 #[test]
1221 fn k2_q_overflow() {
1222 let delta = 1.5;
1223 let scale_function = K2::new(delta);
1224 let q = 1.1;
1225 let n_samples = 1;
1226
1227 let ka = scale_function.f(q, n_samples);
1228 let kb = scale_function.f(1., n_samples);
1229 assert_eq!(ka, kb);
1230 assert_eq!(ka, f64::INFINITY);
1231
1232 let q2a = scale_function.f_inv(ka + 0.1, n_samples);
1233 let q2b = scale_function.f_inv(ka, n_samples);
1234 assert_eq!(q2a, q2b);
1235 assert_eq!(q2a, 1.);
1236 }
1237
1238 #[test]
1239 #[should_panic(expected = "delta (1) must be greater than 1 and finite")]
1240 fn k3_new_panics_delta_a() {
1241 K3::new(1.);
1242 }
1243
1244 #[test]
1245 #[should_panic(expected = "delta (inf) must be greater than 1 and finite")]
1246 fn k3_new_panics_delta_b() {
1247 K3::new(f64::INFINITY);
1248 }
1249
1250 #[test]
1251 fn k3_q05_a() {
1252 let delta = 1.5;
1253 let scale_function = K3::new(delta);
1254 let q = 0.5;
1255 let n_samples = 1;
1256
1257 let k = scale_function.f(q, n_samples);
1258 assert_eq!(k, 0.);
1259
1260 let q2 = scale_function.f_inv(k, n_samples);
1261 assert_eq!(q2, q);
1262 }
1263
1264 #[test]
1265 fn k3_q05_b() {
1266 let delta = 1.7;
1267 let scale_function = K3::new(delta);
1268 let q = 0.5;
1269 let n_samples = 1;
1270
1271 let k = scale_function.f(q, n_samples);
1272 assert_eq!(k, 0.);
1273
1274 let q2 = scale_function.f_inv(k, n_samples);
1275 assert_eq!(q2, q);
1276 }
1277
1278 #[test]
1279 fn k3_q_other() {
1280 let delta = 1.7;
1281 let scale_function = K3::new(delta);
1282 let q = 0.7;
1283 let n_samples = 1;
1284
1285 let k = scale_function.f(q, n_samples);
1286 assert_ne!(k, 0.);
1287
1288 let q2 = scale_function.f_inv(k, n_samples);
1289 assert_eq!(q2, q);
1290 }
1291
1292 #[test]
1293 fn k3_q_underflow() {
1294 let delta = 1.5;
1295 let scale_function = K3::new(delta);
1296 let q = -0.1;
1297 let n_samples = 1;
1298
1299 let ka = scale_function.f(q, n_samples);
1300 let kb = scale_function.f(0., n_samples);
1301 assert_eq!(ka, kb);
1302 assert_eq!(ka, -f64::INFINITY);
1303
1304 let q2a = scale_function.f_inv(ka - 0.1, n_samples);
1305 let q2b = scale_function.f_inv(ka, n_samples);
1306 assert_eq!(q2a, q2b);
1307 assert_eq!(q2a, 0.);
1308 }
1309
1310 #[test]
1311 fn k3_q_overflow() {
1312 let delta = 1.5;
1313 let scale_function = K3::new(delta);
1314 let q = 1.1;
1315 let n_samples = 1;
1316
1317 let ka = scale_function.f(q, n_samples);
1318 let kb = scale_function.f(1., n_samples);
1319 assert_eq!(ka, kb);
1320 assert_eq!(ka, f64::INFINITY);
1321
1322 let q2a = scale_function.f_inv(ka + 0.1, n_samples);
1323 let q2b = scale_function.f_inv(ka, n_samples);
1324 assert_eq!(q2a, q2b);
1325 assert_eq!(q2a, 1.);
1326 }
1327
1328 #[test]
1329 fn new_empty() {
1330 let delta = 2.;
1331 let max_backlog_size = 13;
1332 let scale_function = K1::new(delta);
1333 let digest = TDigest::new(scale_function, max_backlog_size);
1334
1335 assert_eq!(digest.delta(), 2.);
1336 assert_eq!(digest.max_backlog_size(), 13);
1337 assert_eq!(digest.n_centroids(), 0);
1338 assert!(digest.is_empty());
1339 assert!(digest.quantile(0.5).is_nan());
1340 assert_eq!(digest.cdf(13.37), 0.);
1341 assert_eq!(digest.count(), 0.);
1342 assert_eq!(digest.sum(), 0.);
1343 assert!(digest.mean().is_nan());
1344 assert!(digest.min().is_infinite() && digest.min().is_sign_positive());
1345 assert!(digest.max().is_infinite() && digest.max().is_sign_negative());
1346 }
1347
1348 #[test]
1349 fn with_normal_distribution() {
1350 let delta = 100.;
1351 let max_backlog_size = 10;
1352 let scale_function = K1::new(delta);
1353 let mut digest = TDigest::new(scale_function, max_backlog_size);
1354
1355 let mut rng = ChaChaRng::from_seed([0; 32]);
1356
1357 let n = 100_000;
1358 let mut s = 0.;
1359 let mut a = f64::INFINITY;
1360 let mut b = f64::NEG_INFINITY;
1361 for _ in 0..n {
1362 let x = rng.sample(StandardNormal);
1363 digest.insert(x);
1364 s += x;
1365 a = a.min(x);
1366 b = b.max(x);
1367 }
1368
1369 // generic tests
1370 assert_eq!(digest.count(), n as f64);
1371 assert!((s - digest.sum()).abs() < 0.0001);
1372 assert!((s / (n as f64) - digest.mean()).abs() < 0.0001);
1373 assert_eq!(digest.min(), a);
1374 assert_eq!(digest.max(), b);
1375
1376 // compression works
1377 assert!(digest.n_centroids() < 100);
1378
1379 // test some known quantiles
1380 assert!((-1.2816 - digest.quantile(0.10)).abs() < 0.01);
1381 assert!((-0.6745 - digest.quantile(0.25)).abs() < 0.01);
1382 assert!((0.0000 - digest.quantile(0.5)).abs() < 0.01);
1383 assert!((0.6745 - digest.quantile(0.75)).abs() < 0.01);
1384 assert!((1.2816 - digest.quantile(0.90)).abs() < 0.01);
1385 }
1386
1387 #[test]
1388 fn with_single() {
1389 let delta = 100.;
1390 let max_backlog_size = 10;
1391 let scale_function = K1::new(delta);
1392 let mut digest = TDigest::new(scale_function, max_backlog_size);
1393
1394 digest.insert(13.37);
1395
1396 // generic tests
1397 assert_eq!(digest.count(), 1.);
1398 assert_eq!(digest.sum(), 13.37);
1399 assert_eq!(digest.mean(), 13.37);
1400 assert_eq!(digest.min(), 13.37);
1401 assert_eq!(digest.max(), 13.37);
1402
1403 // compression works
1404 assert_eq!(digest.n_centroids(), 1);
1405
1406 // test some known quantiles
1407 assert_eq!(digest.quantile(0.), 13.37);
1408 assert_eq!(digest.quantile(0.5), 13.37);
1409 assert_eq!(digest.quantile(1.), 13.37);
1410
1411 // test some known CDF values
1412 assert_eq!(digest.cdf(13.36), 0.);
1413 assert_eq!(digest.cdf(13.37), 1.);
1414 assert_eq!(digest.cdf(13.38), 1.);
1415 }
1416
1417 #[test]
1418 fn with_two_symmetric() {
1419 let delta = 100.;
1420 let max_backlog_size = 10;
1421 let scale_function = K1::new(delta);
1422 let mut digest = TDigest::new(scale_function, max_backlog_size);
1423
1424 digest.insert(10.);
1425 digest.insert(20.);
1426
1427 // generic tests
1428 assert_eq!(digest.count(), 2.);
1429 assert_eq!(digest.sum(), 30.);
1430 assert_eq!(digest.mean(), 15.);
1431 assert_eq!(digest.min(), 10.);
1432 assert_eq!(digest.max(), 20.);
1433
1434 // compression works
1435 assert_eq!(digest.n_centroids(), 2);
1436
1437 // test some known quantiles
1438 assert_eq!(digest.quantile(0.), 10.); // min
1439 assert_eq!(digest.quantile(0.25), 10.); // first centroid
1440 assert_eq!(digest.quantile(0.375), 12.5);
1441 assert_eq!(digest.quantile(0.5), 15.); // center
1442 assert_eq!(digest.quantile(0.625), 17.5);
1443 assert_eq!(digest.quantile(0.75), 20.); // second centroid
1444 assert_eq!(digest.quantile(1.), 20.); // max
1445
1446 // test some known CDFs
1447 assert_eq!(digest.cdf(10.), 0.25); // first centroid
1448 assert_eq!(digest.cdf(12.5), 0.375);
1449 assert_eq!(digest.cdf(15.), 0.5); // center
1450 assert_eq!(digest.cdf(17.5), 0.625);
1451 assert_eq!(digest.cdf(20.), 1.); // max
1452 }
1453
1454 #[test]
1455 fn with_two_assymmetric() {
1456 let delta = 100.;
1457 let max_backlog_size = 10;
1458 let scale_function = K1::new(delta);
1459 let mut digest = TDigest::new(scale_function, max_backlog_size);
1460
1461 digest.insert_weighted(10., 1.);
1462 digest.insert_weighted(20., 9.);
1463
1464 // generic tests
1465 assert_eq!(digest.count(), 10.);
1466 assert_eq!(digest.sum(), 190.);
1467 assert_eq!(digest.mean(), 19.);
1468 assert_eq!(digest.min(), 10.);
1469 assert_eq!(digest.max(), 20.);
1470
1471 // compression works
1472 assert_eq!(digest.n_centroids(), 2);
1473
1474 // test some known quantiles
1475 assert_eq!(digest.quantile(0.), 10.); // min
1476 assert_eq!(digest.quantile(0.05), 10.); // first centroid
1477 assert_eq!(digest.quantile(0.175), 12.5);
1478 assert_eq!(digest.quantile(0.3), 15.); // center
1479 assert_eq!(digest.quantile(0.425), 17.5);
1480 assert_eq!(digest.quantile(0.55), 20.); // second centroid
1481 assert_eq!(digest.quantile(1.), 20.); // max
1482
1483 // test some known CDFs
1484 assert_eq!(digest.cdf(10.), 0.05); // first centroid
1485 assert_eq!(digest.cdf(12.5), 0.175);
1486 assert_eq!(digest.cdf(15.), 0.3); // center
1487 assert_eq!(digest.cdf(17.5), 0.425);
1488 assert_eq!(digest.cdf(20.), 1.); // max
1489 }
1490
1491 #[test]
1492 fn zero_weight() {
1493 let delta = 2.;
1494 let max_backlog_size = 13;
1495 let scale_function = K1::new(delta);
1496 let mut digest = TDigest::new(scale_function, max_backlog_size);
1497
1498 digest.insert_weighted(13.37, 0.);
1499
1500 assert_eq!(digest.delta(), 2.);
1501 assert_eq!(digest.max_backlog_size(), 13);
1502 assert_eq!(digest.n_centroids(), 0);
1503 assert!(digest.is_empty());
1504 assert!(digest.quantile(0.5).is_nan());
1505 assert_eq!(digest.cdf(13.37), 0.);
1506 assert_eq!(digest.count(), 0.);
1507 assert_eq!(digest.sum(), 0.);
1508 assert!(digest.mean().is_nan());
1509 assert!(digest.min().is_infinite() && digest.min().is_sign_positive());
1510 assert!(digest.max().is_infinite() && digest.max().is_sign_negative());
1511 }
1512
1513 #[test]
1514 fn highly_compressed() {
1515 let delta = 2.;
1516 let max_backlog_size = 10;
1517 let scale_function = K1::new(delta);
1518 let mut digest = TDigest::new(scale_function, max_backlog_size);
1519
1520 digest.insert(10.);
1521 digest.insert(20.);
1522 for _ in 0..100 {
1523 digest.insert(15.);
1524 }
1525
1526 // generic tests
1527 assert_eq!(digest.count(), 102.);
1528 assert_eq!(digest.sum(), 1530.);
1529 assert_eq!(digest.mean(), 15.);
1530 assert_eq!(digest.min(), 10.);
1531 assert_eq!(digest.max(), 20.);
1532
1533 // compression works
1534 assert_eq!(digest.n_centroids(), 1);
1535
1536 // test some known quantiles
1537 assert_eq!(digest.quantile(0.), 10.); // min
1538 assert_eq!(digest.quantile(0.125), 11.25); // tail
1539 assert_eq!(digest.quantile(0.25), 12.5); // tail
1540 assert_eq!(digest.quantile(0.5), 15.); // center (single centroid)
1541 assert_eq!(digest.quantile(0.75), 17.5); // tail
1542 assert_eq!(digest.quantile(0.875), 18.75); // tail
1543 assert_eq!(digest.quantile(1.), 20.); // max
1544
1545 // test some known CDFs
1546 assert_eq!(digest.cdf(10.), 0.); // min
1547 assert_eq!(digest.cdf(11.25), 0.125); // tail
1548 assert_eq!(digest.cdf(12.5), 0.25); // tail
1549 assert_eq!(digest.cdf(15.), 0.5); // center (single centroid)
1550 assert_eq!(digest.cdf(17.5), 0.75); // tail
1551 assert_eq!(digest.cdf(18.75), 0.875); // tail
1552 assert_eq!(digest.cdf(20.), 1.); // max
1553 }
1554
1555 #[test]
1556 #[should_panic(expected = "q (-1) must be in [0, 1]")]
1557 fn invalid_q_a() {
1558 let delta = 2.;
1559 let max_backlog_size = 13;
1560 let scale_function = K1::new(delta);
1561 let digest = TDigest::new(scale_function, max_backlog_size);
1562 digest.quantile(-1.);
1563 }
1564
1565 #[test]
1566 #[should_panic(expected = "q (2) must be in [0, 1]")]
1567 fn invalid_q_b() {
1568 let delta = 2.;
1569 let max_backlog_size = 13;
1570 let scale_function = K1::new(delta);
1571 let digest = TDigest::new(scale_function, max_backlog_size);
1572 digest.quantile(2.);
1573 }
1574
1575 #[test]
1576 #[should_panic(expected = "q (NaN) must be in [0, 1]")]
1577 fn invalid_q_c() {
1578 let delta = 2.;
1579 let max_backlog_size = 13;
1580 let scale_function = K1::new(delta);
1581 let digest = TDigest::new(scale_function, max_backlog_size);
1582 digest.quantile(f64::NAN);
1583 }
1584
1585 #[test]
1586 #[should_panic(expected = "q (inf) must be in [0, 1]")]
1587 fn invalid_q_d() {
1588 let delta = 2.;
1589 let max_backlog_size = 13;
1590 let scale_function = K1::new(delta);
1591 let digest = TDigest::new(scale_function, max_backlog_size);
1592 digest.quantile(f64::INFINITY);
1593 }
1594
1595 #[test]
1596 #[should_panic(expected = "w (-1) must be greater or equal than zero and finite")]
1597 fn invalid_w_a() {
1598 let delta = 2.;
1599 let max_backlog_size = 13;
1600 let scale_function = K1::new(delta);
1601 let mut digest = TDigest::new(scale_function, max_backlog_size);
1602 digest.insert_weighted(13.37, -1.);
1603 }
1604
1605 #[test]
1606 #[should_panic(expected = "w (inf) must be greater or equal than zero and finite")]
1607 fn invalid_w_b() {
1608 let delta = 2.;
1609 let max_backlog_size = 13;
1610 let scale_function = K1::new(delta);
1611 let mut digest = TDigest::new(scale_function, max_backlog_size);
1612 digest.insert_weighted(13.37, f64::INFINITY);
1613 }
1614
1615 #[test]
1616 #[should_panic(expected = "x (-inf) must be finite")]
1617 fn invalid_x_a() {
1618 let delta = 2.;
1619 let max_backlog_size = 13;
1620 let scale_function = K1::new(delta);
1621 let mut digest = TDigest::new(scale_function, max_backlog_size);
1622 digest.insert(f64::NEG_INFINITY);
1623 }
1624
1625 #[test]
1626 #[should_panic(expected = "x (inf) must be finite")]
1627 fn invalid_x_b() {
1628 let delta = 2.;
1629 let max_backlog_size = 13;
1630 let scale_function = K1::new(delta);
1631 let mut digest = TDigest::new(scale_function, max_backlog_size);
1632 digest.insert(f64::INFINITY);
1633 }
1634
1635 #[test]
1636 #[should_panic(expected = "x (NaN) must be finite")]
1637 fn invalid_x_c() {
1638 let delta = 2.;
1639 let max_backlog_size = 13;
1640 let scale_function = K1::new(delta);
1641 let mut digest = TDigest::new(scale_function, max_backlog_size);
1642 digest.insert(f64::NAN);
1643 }
1644
1645 #[test]
1646 #[should_panic(expected = "x must not be NaN")]
1647 fn invalid_cdf() {
1648 let delta = 2.;
1649 let max_backlog_size = 13;
1650 let scale_function = K1::new(delta);
1651 let digest = TDigest::new(scale_function, max_backlog_size);
1652 digest.cdf(f64::NAN);
1653 }
1654
1655 #[test]
1656 fn clone() {
1657 let delta = 100.;
1658 let max_backlog_size = 10;
1659 let scale_function = K1::new(delta);
1660 let mut digest1 = TDigest::new(scale_function, max_backlog_size);
1661
1662 digest1.insert(13.37);
1663
1664 let mut digest2 = digest1.clone();
1665 digest2.insert(42.);
1666
1667 assert_eq!(digest1.n_centroids(), 1);
1668 assert_eq!(digest1.min(), 13.37);
1669 assert_eq!(digest1.max(), 13.37);
1670
1671 assert_eq!(digest2.n_centroids(), 2);
1672 assert_eq!(digest2.min(), 13.37);
1673 assert_eq!(digest2.max(), 42.);
1674 }
1675
1676 #[test]
1677 fn regression_instablity() {
1678 // this tests if compression works for very small compression factors
1679 let delta = 1.1;
1680 let max_backlog_size = 10;
1681 let scale_function = K1::new(delta);
1682 let mut digest = TDigest::new(scale_function, max_backlog_size);
1683
1684 let mut rng = ChaChaRng::from_seed([0; 32]);
1685
1686 let n = 10_000;
1687 for _ in 0..n {
1688 let x = rng.sample(StandardNormal);
1689 digest.insert(x);
1690 }
1691
1692 // generic tests
1693 assert_eq!(digest.count(), n as f64);
1694
1695 // compression works
1696 assert_eq!(digest.n_centroids(), 1);
1697 }
1698
1699 #[test]
1700 fn clear() {
1701 let delta = 2.;
1702 let max_backlog_size = 13;
1703 let scale_function = K1::new(delta);
1704 let mut digest = TDigest::new(scale_function, max_backlog_size);
1705
1706 digest.insert(13.37);
1707 digest.clear();
1708
1709 assert_eq!(digest.delta(), 2.);
1710 assert_eq!(digest.max_backlog_size(), 13);
1711 assert_eq!(digest.n_centroids(), 0);
1712 assert!(digest.is_empty());
1713 assert!(digest.quantile(0.5).is_nan());
1714 assert_eq!(digest.cdf(13.37), 0.);
1715 assert_eq!(digest.count(), 0.);
1716 assert_eq!(digest.sum(), 0.);
1717 assert!(digest.mean().is_nan());
1718 assert!(digest.min().is_infinite() && digest.min().is_sign_positive());
1719 assert!(digest.max().is_infinite() && digest.max().is_sign_negative());
1720 }
1721}