1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
use sketches_ddsketch::{Config, DDSketch};

/// A quantile sketch with relative-error guarantees.
///
/// Based on [DDSketch][ddsketch], `Summary` provides quantiles over an arbitrary distribution of
/// floating-point numbers, including for negative numbers, using a space-efficient sketch that
/// provides relative-error guarantees, regardless of the absolute range between the smallest and
/// larger values.
///
/// `Summary` is similiar to [HDRHistogram][hdrhistogram] in practice, but supports an arbitrary
/// range of values, and supports floating-point numbers.
///
/// Numbers with an absolute value smaller than given `min_value` will be recognized as zeroes.
///
/// Memory usage for `Summary` should be nearly identical to `DDSketch`.
/// [`Summary::estimated_size`] provides a rough estimate of summary size based on the current
/// values that have been added to it.
///
/// As mentioned above, this sketch provides relative-error guarantees across quantiles falling
/// within 0 <= q <= 1, but trades some accuracy at the lowest quantiles as part of the collapsing
/// scheme that allows for automatically handling arbitrary ranges of values, even when the
/// maximum number of bins has been allocated.  Typically, q=0.05 and below is where this error will
/// be noticed, if present.
///
/// For cases when all values are positive, you can simply use [`Summary::min`] in lieu of checking
/// these quantiles, as the minimum value will be closer to the true value.  For cases when values
/// range from negative to positive, the aforementioned collapsing will perturb the estimated true
/// value for quantiles that conceptually fall within this collapsed band.
///
/// For example, for a distribution that spans from -25 to 75, we would intuitively expect q=0 to be
/// -25, q=0.25 to be 0, q=0.5 to be 25, and so on.  Internally, negative numbers and positive
/// numbers are handled in two separate containers.  Based on this example, one container would
/// handle -25 to 0, and another would handle the 0 to 75 range.  As the containers are mapped "back
/// to back", q=0.25 for this hypothetical summary would actually be q=0 within the negative
/// container, which may return an estimated true value that exceeds the relative error guarantees.
///
/// Of course, as these problems are related to the estimation aspect of this data structure, users
/// can allow the summary to allocate more bins to compensate for these edge cases, if desired.
///
/// [ddsketch]: https://arxiv.org/abs/1908.10693
/// [hdrhistogram]: https://docs.rs/hdrhistogram
#[derive(Clone)]
pub struct Summary {
    negative: DDSketch,
    positive: DDSketch,
    min_value: f64,
    zeroes: usize,
    min: f64,
    max: f64,
}

impl Summary {
    /// Creates a new [`Summary`].
    ///
    /// `alpha` represents the desired relative error for this summary.  If `alpha` was 0.0001, that
    /// would represent a desired relative error of 0.01%.  For example, if the true value at
    /// quantile q0 was 1, the estimated value at that quantile would be a value within 0.01% of the
    /// true value, or a value between 0.9999 and 1.0001.
    ///
    /// `max_buckets` controls how many subbuckets are created, which directly influences memory usage.
    /// Each bucket "costs" eight bytes, so a summary with 2048 buckets would consume a maximum of
    /// around 16 KiB.  Depending on how many samples have been added to the summary, the number of
    /// subbuckets allocated may be far below `max_buckets`, and the summary will allocate more as
    /// needed to fulfill the relative error guarantee.
    ///
    /// `min_value` controls the smallest value that will be recognized distinctly from zero.  Said
    /// another way, any value between `-min_value` and `min_value` will be counted as zero.
    pub fn new(alpha: f64, max_buckets: u32, min_value: f64) -> Summary {
        let config = Config::new(alpha, max_buckets, min_value.abs());

        Summary {
            negative: DDSketch::new(config),
            positive: DDSketch::new(config),
            min_value: min_value.abs(),
            zeroes: 0,
            min: f64::INFINITY,
            max: f64::NEG_INFINITY,
        }
    }

    /// Creates a new [`Summary`] with default values.
    ///
    /// `alpha` is 0.0001, `max_buckets` is 32,768, and `min_value` is 1.0e-9.
    ///
    /// This will yield a summary that is roughly equivalent in memory usage to an HDRHistogram with
    /// 3 significant digits, and will support values down to a single nanosecond.
    ///
    /// In practice, when using only positive values, maximum memory usage can be expected to hover
    /// around 200KiB, while usage of negative values can lead to an average maximum size of around
    /// 400KiB.
    pub fn with_defaults() -> Summary {
        Summary::new(0.0001, 32_768, 1.0e-9)
    }

    /// Adds a sample to the summary.
    ///
    /// If the absolute value of `value` is smaller than given `min_value`, it will be added as a zero.
    pub fn add(&mut self, value: f64) {
        if value.is_infinite() {
            return;
        }

        if value < self.min {
            self.min = value;
        }

        if value > self.max {
            self.max = value;
        }

        if value > self.min_value {
            self.positive.add(value);
        } else if value < -self.min_value {
            self.negative.add(-value);
        } else {
            self.zeroes += 1;
        }
    }

    /// Gets the estimated value at the given quantile.
    ///
    /// If the sketch is empty, or if the quantile is less than 0.0 or greater than 1.0, then the
    /// result will be `None`.
    ///
    /// While `q` can be either 0.0 or 1.0, callers should prefer to use [`Summary::min`] and
    /// [`Summary::max`] as the values will be the true values, and not an estimation.
    pub fn quantile(&self, q: f64) -> Option<f64> {
        if q < 0.0 || q > 1.0 || self.count() == 0 {
            return None;
        }

        let ncount = self.negative.count();
        let pcount = self.positive.count();
        let zcount = self.zeroes;
        let total = ncount + pcount + zcount;
        let rank = (q * (total - 1) as f64) as usize;

        if rank < ncount {
            // Quantile lands in the negative side.
            let nq = 1.0 - (rank as f64 / ncount as f64);
            self.negative
                .quantile(nq)
                .expect("quantile should be valid at this point")
                .map(|v| -v)
        } else if rank >= ncount && rank < (ncount + zcount) {
            // Quantile lands in the zero band.
            Some(0.0)
        } else {
            // Quantile lands in the positive side.
            let pq = (rank - (ncount + zcount)) as f64 / pcount as f64;
            self.positive
                .quantile(pq)
                .expect("quantile should be valid at this point")
        }
    }

    /// Gets the minimum value this summary has seen so far.
    pub fn min(&self) -> f64 {
        self.min
    }

    /// Gets the maximum value this summary has seen so far.
    pub fn max(&self) -> f64 {
        self.max
    }

    /// Whether or not this summary is empty.
    pub fn is_empty(&self) -> bool {
        self.count() == 0
    }

    /// Gets the number of samples in this summary.
    pub fn count(&self) -> usize {
        self.negative.count() + self.positive.count() + self.zeroes
    }

    /// Gets the number of samples in this summary by zeroes, negative, and positive counts.
    pub fn detailed_count(&self) -> (usize, usize, usize) {
        (self.zeroes, self.negative.count(), self.positive.count())
    }

    /// Gets the estimized size of this summary, in bytes.
    ///
    /// In practice, this value should be very close to the actual size, but will not be entirely
    /// precise.
    pub fn estimated_size(&self) -> usize {
        std::mem::size_of::<Self>() + ((self.positive.length() + self.negative.length()) * 8)
    }
}

#[cfg(test)]
mod tests {
    use super::Summary;

    use quickcheck_macros::quickcheck;

    // Need this, because without the relative_eq/abs_diff_eq imports, we get weird IDE errors.
    #[allow(unused_imports)]
    use approx::{abs_diff_eq, assert_abs_diff_eq, assert_relative_eq, relative_eq};

    use ndarray::{Array1, Axis};
    use ndarray_stats::{interpolate::Linear, QuantileExt};
    use noisy_float::types::n64;
    use ordered_float::NotNan;
    use rand::{distributions::Distribution, thread_rng};
    use rand_distr::Uniform;

    #[test]
    fn test_basics() {
        let mut summary = Summary::with_defaults();
        assert!(summary.is_empty());

        // Stretch the legs with a single value.
        summary.add(-420.42);
        assert_eq!(summary.count(), 1);
        assert_relative_eq!(summary.min(), -420.42);
        assert_relative_eq!(summary.max(), -420.42);
        assert_abs_diff_eq!(summary.quantile(0.1).expect("value should exist"), -420.42);
        assert_abs_diff_eq!(summary.quantile(0.5).expect("value should exist"), -420.42);
        assert_abs_diff_eq!(summary.quantile(0.99).expect("value should exist"), -420.42);

        summary.add(420.42);
        assert_eq!(summary.count(), 2);
        assert_relative_eq!(summary.min(), -420.42);
        assert_relative_eq!(summary.max(), 420.42);
        assert_abs_diff_eq!(summary.quantile(0.49).expect("value should exist"), -420.42);

        summary.add(42.42);
        assert_eq!(summary.count(), 3);
        assert_relative_eq!(summary.min(), -420.42);
        assert_relative_eq!(summary.max(), 420.42);
        assert_abs_diff_eq!(
            summary.quantile(0.4999999999).expect("value should exist"),
            -420.42
        );
        assert_abs_diff_eq!(summary.quantile(0.5).expect("value should exist"), 42.42);
        assert_abs_diff_eq!(
            summary.quantile(0.9999999999).expect("value should exist"),
            42.42
        );
    }

    #[test]
    fn test_positive_uniform() {
        let alpha = 0.0001;
        let max_bins = 32_768;
        let min_value = 1.0e-9;

        let mut rng = thread_rng();
        let dist = Uniform::new(0.0, 100.0);

        let mut summary = Summary::new(alpha, max_bins, min_value);
        let mut uniform = Vec::new();
        for _ in 0..100_000 {
            let value = dist.sample(&mut rng);
            uniform.push(NotNan::new(value).unwrap());
            summary.add(value);
        }

        uniform.sort();
        let mut true_histogram = Array1::from(uniform);

        let quantiles = &[0.25, 0.5, 0.75, 0.99];
        for quantile in quantiles {
            let aval_raw = true_histogram
                .quantile_axis_mut(Axis(0), n64(*quantile), &Linear)
                .expect("quantile should be in range");
            let aval = aval_raw
                .get(())
                .expect("quantile value should be present")
                .into_inner();
            let sval = summary
                .quantile(*quantile)
                .expect("quantile value should be present");

            // Multiply the true value by α, and double it to account from the -α/α swing.
            let distance = (aval * alpha) * 2.0;

            assert_relative_eq!(aval, sval, max_relative = distance);
        }
    }

    #[test]
    fn test_negative_positive_uniform() {
        let alpha = 0.0001;
        let max_bins = 65_536;
        let min_value = 1.0e-9;

        let mut rng = thread_rng();
        let dist = Uniform::new(-100.0, 100.0);

        let mut summary = Summary::new(alpha, max_bins, min_value);
        let mut uniform = Vec::new();
        for _ in 0..100_000 {
            let value = dist.sample(&mut rng);
            uniform.push(NotNan::new(value).unwrap());
            summary.add(value);
        }

        uniform.sort();
        let mut true_histogram = Array1::from(uniform);

        // We explicitly skirt q=0.5 here to avoid the edge case quantiles as best as possible
        // while asserting tightly to our relative error bound for everything else.
        let quantiles = &[0.25, 0.47, 0.75, 0.99];
        for quantile in quantiles {
            let aval_raw = true_histogram
                .quantile_axis_mut(Axis(0), n64(*quantile), &Linear)
                .expect("quantile should be in range");
            let aval = aval_raw
                .get(())
                .expect("quantile value should be present")
                .into_inner();
            let sval = summary
                .quantile(*quantile)
                .expect("quantile value should be present");

            // Multiply the true value by α, and quadruple it to account from the -α/α swing,
            // but also to account for the values sitting at the edge case quantiles.
            let distance = (aval.abs() * alpha) * 2.0;

            assert_relative_eq!(aval, sval, max_relative = distance);
        }
    }

    #[test]
    fn test_zeroes() {
        let mut summary = Summary::with_defaults();
        summary.add(0.0);
        assert_eq!(summary.quantile(0.5), Some(0.0));
    }

    #[test]
    fn test_infinities() {
        let mut summary = Summary::with_defaults();
        summary.add(f64::INFINITY);
        assert_eq!(summary.quantile(0.5), None);
        summary.add(f64::NEG_INFINITY);
        assert_eq!(summary.quantile(0.5), None);
    }

    #[quickcheck]
    fn quantile_validity(inputs: Vec<f64>) -> bool {
        let mut had_non_inf = false;

        let mut summary = Summary::with_defaults();
        for input in &inputs {
            if !input.is_infinite() {
                had_non_inf = true;
            }
            summary.add(*input);
        }

        let qs = &[0.0, 0.5, 0.9, 0.95, 0.99, 0.999, 1.0];
        for q in qs {
            let result = summary.quantile(*q);
            if had_non_inf {
                assert!(result.is_some());
            } else {
                assert!(result.is_none());
            }
        }

        true
    }
}