metrics_util/storage/
summary.rs

1use sketches_ddsketch::{Config, DDSketch};
2use std::fmt;
3
4/// A quantile sketch with relative-error guarantees.
5///
6/// Based on [DDSketch][ddsketch], `Summary` provides quantiles over an arbitrary distribution of
7/// floating-point numbers, including for negative numbers, using a space-efficient sketch that
8/// provides relative-error guarantees, regardless of the absolute range between the smallest and
9/// larger values.
10///
11/// `Summary` is similar to [HDRHistogram][hdrhistogram] in practice, but supports an arbitrary
12/// range of values, and supports floating-point numbers.
13///
14/// Numbers with an absolute value smaller than given `min_value` will be recognized as zeroes.
15///
16/// Memory usage for `Summary` should be nearly identical to `DDSketch`.
17/// [`Summary::estimated_size`] provides a rough estimate of summary size based on the current
18/// values that have been added to it.
19///
20/// As mentioned above, this sketch provides relative-error guarantees across quantiles falling
21/// within 0 <= q <= 1, but trades some accuracy at the lowest quantiles as part of the collapsing
22/// scheme that allows for automatically handling arbitrary ranges of values, even when the
23/// maximum number of bins has been allocated.  Typically, q=0.05 and below is where this error will
24/// be noticed, if present.
25///
26/// For cases when all values are positive, you can simply use [`Summary::min`] in lieu of checking
27/// these quantiles, as the minimum value will be closer to the true value.  For cases when values
28/// range from negative to positive, the aforementioned collapsing will perturb the estimated true
29/// value for quantiles that conceptually fall within this collapsed band.
30///
31/// For example, for a distribution that spans from -25 to 75, we would intuitively expect q=0 to be
32/// -25, q=0.25 to be 0, q=0.5 to be 25, and so on.  Internally, negative numbers and positive
33/// numbers are handled in two separate containers.  Based on this example, one container would
34/// handle -25 to 0, and another would handle the 0 to 75 range.  As the containers are mapped "back
35/// to back", q=0.25 for this hypothetical summary would actually be q=0 within the negative
36/// container, which may return an estimated true value that exceeds the relative error guarantees.
37///
38/// Of course, as these problems are related to the estimation aspect of this data structure, users
39/// can allow the summary to allocate more bins to compensate for these edge cases, if desired.
40///
41/// [ddsketch]: https://arxiv.org/abs/1908.10693
42/// [hdrhistogram]: https://docs.rs/hdrhistogram
43#[derive(Clone)]
44pub struct Summary {
45    sketch: DDSketch,
46}
47
48impl fmt::Debug for Summary {
49    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
50        // manual implementation because DDSketch does not implement Debug
51        f.debug_struct("Summary").finish_non_exhaustive()
52    }
53}
54
55impl Summary {
56    /// Creates a new [`Summary`].
57    ///
58    /// `alpha` represents the desired relative error for this summary.  If `alpha` was 0.0001, that
59    /// would represent a desired relative error of 0.01%.  For example, if the true value at
60    /// quantile q0 was 1, the estimated value at that quantile would be a value within 0.01% of the
61    /// true value, or a value between 0.9999 and 1.0001.
62    ///
63    /// `max_buckets` controls how many subbuckets are created, which directly influences memory usage.
64    /// Each bucket "costs" eight bytes, so a summary with 2048 buckets would consume a maximum of
65    /// around 16 KiB.  Depending on how many samples have been added to the summary, the number of
66    /// subbuckets allocated may be far below `max_buckets`, and the summary will allocate more as
67    /// needed to fulfill the relative error guarantee.
68    ///
69    /// `min_value` controls the smallest value that will be recognized distinctly from zero.  Said
70    /// another way, any value between `-min_value` and `min_value` will be counted as zero.
71    pub fn new(alpha: f64, max_buckets: u32, min_value: f64) -> Summary {
72        let config = Config::new(alpha, max_buckets, min_value.abs());
73
74        Summary { sketch: DDSketch::new(config) }
75    }
76
77    /// Creates a new [`Summary`] with default values.
78    ///
79    /// `alpha` is 0.0001, `max_buckets` is 32,768, and `min_value` is 1.0e-9.
80    ///
81    /// This will yield a summary that is roughly equivalent in memory usage to an HDRHistogram with
82    /// 3 significant digits, and will support values down to a single nanosecond.
83    ///
84    /// In practice, when using only positive values, maximum memory usage can be expected to hover
85    /// around 200KiB, while usage of negative values can lead to an average maximum size of around
86    /// 400KiB.
87    pub fn with_defaults() -> Summary {
88        Summary::new(0.0001, 32_768, 1.0e-9)
89    }
90
91    /// Adds a sample to the summary.
92    ///
93    /// If the absolute value of `value` is smaller than given `min_value`, it will be added as a zero.
94    pub fn add(&mut self, value: f64) {
95        if value.is_infinite() {
96            return;
97        }
98
99        self.sketch.add(value);
100    }
101
102    /// Gets the estimated value at the given quantile.
103    ///
104    /// If the sketch is empty, or if the quantile is less than 0.0 or greater than 1.0, then the
105    /// result will be `None`.
106    ///
107    /// If the 0.0 or 1.0 quantile is requested, this function will return self.min() or self.max()
108    /// instead of the estimated value.
109    pub fn quantile(&self, q: f64) -> Option<f64> {
110        if !(0.0..=1.0).contains(&q) || self.count() == 0 {
111            return None;
112        }
113
114        self.sketch.quantile(q).expect("quantile should be valid at this point")
115    }
116
117    /// Merge another Summary into this one.
118    ///
119    /// # Errors
120    ///
121    /// This function will return an error if the other Summary was not created with the same
122    /// parameters.
123    pub fn merge(&mut self, other: &Summary) -> Result<(), MergeError> {
124        self.sketch.merge(&other.sketch).map_err(|_| MergeError {})?;
125        Ok(())
126    }
127
128    /// Gets the minimum value this summary has seen so far.
129    pub fn min(&self) -> f64 {
130        self.sketch.min().unwrap_or(f64::INFINITY)
131    }
132
133    /// Gets the maximum value this summary has seen so far.
134    pub fn max(&self) -> f64 {
135        self.sketch.max().unwrap_or(f64::NEG_INFINITY)
136    }
137
138    /// Whether or not this summary is empty.
139    pub fn is_empty(&self) -> bool {
140        self.count() == 0
141    }
142
143    /// Gets the number of samples in this summary.
144    pub fn count(&self) -> usize {
145        self.sketch.count()
146    }
147
148    /// Gets the estimized size of this summary, in bytes.
149    ///
150    /// In practice, this value should be very close to the actual size, but will not be entirely
151    /// precise.
152    pub fn estimated_size(&self) -> usize {
153        std::mem::size_of::<Self>() + (self.sketch.length() * 8)
154    }
155}
156
157#[derive(Copy, Clone, Debug)]
158pub struct MergeError {}
159
160impl fmt::Display for MergeError {
161    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
162        write!(f, "merge error")
163    }
164}
165
166impl std::error::Error for MergeError {}
167
168#[cfg(test)]
169mod tests {
170    use super::Summary;
171
172    use quickcheck_macros::quickcheck;
173
174    // Need this, because without the relative_eq/abs_diff_eq imports, we get weird IDE errors.
175    #[allow(unused_imports)]
176    use approx::{abs_diff_eq, assert_abs_diff_eq, assert_relative_eq, relative_eq};
177
178    use ndarray::{Array1, Axis};
179    use ndarray_stats::{interpolate::Linear, QuantileExt};
180    use noisy_float::types::n64;
181    use ordered_float::NotNan;
182    use rand::distr::{Distribution, Uniform};
183
184    #[test]
185    fn test_basics() {
186        let alpha = 0.0001;
187        let max_bins = 32_768;
188        let min_value = 1.0e-9;
189        let mut summary = Summary::new(alpha, max_bins, min_value);
190        assert!(summary.is_empty());
191
192        // Stretch the legs with a single value.
193        summary.add(-420.42);
194        assert_eq!(summary.count(), 1);
195        assert_relative_eq!(summary.min(), -420.42);
196        assert_relative_eq!(summary.max(), -420.42);
197
198        let test_cases = vec![(0.1, -420.42), (0.5, -420.42), (0.9, -420.42)];
199        for (q, val) in test_cases {
200            assert_relative_eq!(
201                summary.quantile(q).expect("value should exist"),
202                val,
203                max_relative = alpha
204            );
205        }
206
207        summary.add(420.42);
208
209        assert_eq!(summary.count(), 2);
210        assert_relative_eq!(summary.min(), -420.42);
211        assert_relative_eq!(summary.max(), 420.42);
212        assert_relative_eq!(
213            summary.quantile(0.5).expect("value should exist"),
214            -420.42,
215            max_relative = alpha
216        );
217        assert_relative_eq!(
218            summary.quantile(0.51).expect("value should exist"),
219            -420.42,
220            max_relative = alpha
221        );
222
223        summary.add(42.42);
224        assert_eq!(summary.count(), 3);
225        assert_relative_eq!(summary.min(), -420.42);
226        assert_relative_eq!(summary.max(), 420.42);
227
228        let test_cases = vec![
229            (0.333333, -420.42),
230            (0.333334, -420.42),
231            (0.666666, 42.42),
232            (0.666667, 42.42),
233            (0.999999, 42.42),
234        ];
235        for (q, val) in test_cases {
236            assert_relative_eq!(
237                summary.quantile(q).expect("value should exist"),
238                val,
239                max_relative = alpha
240            );
241        }
242    }
243
244    #[test]
245    fn test_positive_uniform() {
246        let alpha = 0.0001;
247        let max_bins = 32_768;
248        let min_value = 1.0e-9;
249
250        let mut rng = rand::rng();
251        let dist = Uniform::new(0.0, 100.0).unwrap();
252
253        let mut summary = Summary::new(alpha, max_bins, min_value);
254        let mut uniform = Vec::new();
255        for _ in 0..100_000 {
256            let value = dist.sample(&mut rng);
257            uniform.push(NotNan::new(value).unwrap());
258            summary.add(value);
259        }
260
261        uniform.sort();
262        let mut true_histogram = Array1::from(uniform);
263
264        let quantiles = &[0.25, 0.5, 0.75, 0.99];
265        for quantile in quantiles {
266            let aval_raw = true_histogram
267                .quantile_axis_mut(Axis(0), n64(*quantile), &Linear)
268                .expect("quantile should be in range");
269            let aval = aval_raw.get(()).expect("quantile value should be present").into_inner();
270            let sval = summary.quantile(*quantile).expect("quantile value should be present");
271
272            // Multiply the true value by α, and double it to account from the -α/α swing.
273            let distance = (aval * alpha) * 2.0;
274
275            assert_relative_eq!(aval, sval, max_relative = distance);
276        }
277    }
278
279    #[test]
280    fn test_negative_positive_uniform() {
281        let alpha = 0.0001;
282        let max_bins = 65_536;
283        let min_value = 1.0e-9;
284
285        let mut rng = rand::rng();
286        let dist = Uniform::new(-100.0, 100.0).unwrap();
287
288        let mut summary = Summary::new(alpha, max_bins, min_value);
289        let mut uniform = Vec::new();
290        for _ in 0..100_000 {
291            let value = dist.sample(&mut rng);
292            uniform.push(NotNan::new(value).unwrap());
293            summary.add(value);
294        }
295
296        uniform.sort();
297        let mut true_histogram = Array1::from(uniform);
298
299        // We explicitly skirt q=0.5 here to avoid the edge case quantiles as best as possible
300        // while asserting tightly to our relative error bound for everything else.
301        let quantiles = &[0.25, 0.47, 0.75, 0.99];
302        for quantile in quantiles {
303            let aval_raw = true_histogram
304                .quantile_axis_mut(Axis(0), n64(*quantile), &Linear)
305                .expect("quantile should be in range");
306            let aval = aval_raw.get(()).expect("quantile value should be present").into_inner();
307            let sval = summary.quantile(*quantile).expect("quantile value should be present");
308
309            // Multiply the true value by α, and quadruple it to account from the -α/α swing,
310            // but also to account for the values sitting at the edge case quantiles.
311            let distance = (aval.abs() * alpha) * 2.0;
312
313            assert_relative_eq!(aval, sval, max_relative = distance);
314        }
315    }
316
317    #[test]
318    fn test_zeroes() {
319        let mut summary = Summary::with_defaults();
320        summary.add(0.0);
321        assert_eq!(summary.quantile(0.5), Some(0.0));
322    }
323
324    #[test]
325    fn test_infinities() {
326        let mut summary = Summary::with_defaults();
327        summary.add(f64::INFINITY);
328        assert_eq!(summary.quantile(0.5), None);
329        summary.add(f64::NEG_INFINITY);
330        assert_eq!(summary.quantile(0.5), None);
331    }
332
333    #[quickcheck]
334    fn quantile_validity(inputs: Vec<f64>) -> bool {
335        let mut had_non_inf = false;
336
337        let mut summary = Summary::with_defaults();
338        for input in &inputs {
339            if !input.is_infinite() {
340                had_non_inf = true;
341            }
342            summary.add(*input);
343        }
344
345        let qs = &[0.0, 0.5, 0.9, 0.95, 0.99, 0.999, 1.0];
346        for q in qs {
347            let result = summary.quantile(*q);
348            if had_non_inf {
349                assert!(result.is_some());
350            } else {
351                assert!(result.is_none());
352            }
353        }
354
355        true
356    }
357}
metrics_util/storage/summary.rs

metrics_util/storage/
summary.rs