// elasticsearch_dsl/search/aggregations/metrics/boxplot_aggregation.rs
1use crate::util::*;
2use crate::{Aggregation, Number};
3
4/// A `boxplot` metrics aggregation that computes boxplot of numeric values extracted from the
5/// aggregated documents. These values can be generated from specific numeric or [histogram fields](https://www.elastic.co/guide/en/elasticsearch/reference/current/histogram.html)
6/// in the documents.
7///
8/// The `boxplot` aggregation returns essential information for making a [box plot](https://en.wikipedia.org/wiki/Box_plot):
9/// minimum, maximum median, first quartile (25th percentile) and third quartile (75th percentile) values.
10///
11/// The algorithm used by the `boxplot` metric is called TDigest (introduced by Ted Dunning in
12/// [Computing Accurate Quantiles using T-Digests](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf)).
13///
14/// > Boxplot as other percentile aggregations are also [non-deterministic](https://en.wikipedia.org/wiki/Nondeterministic_algorithm).
15/// > This means you can get slightly different results using the same data.
16///
17/// <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-boxplot-aggregation.html>
18#[derive(Debug, Clone, Serialize, PartialEq)]
19pub struct BoxplotAggregation {
20 boxplot: BoxplotAggregationInner,
21}
22
23#[derive(Debug, Clone, Serialize, PartialEq)]
24struct BoxplotAggregationInner {
25 field: String,
26 #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
27 compression: Option<Number>,
28 #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
29 missing: Option<Number>,
30}
31
32impl Aggregation {
33 /// Creates an instance of [`BoxplotAggregation`]
34 ///
35 /// - `field` - field to aggregate
36 pub fn boxplot<T>(field: T) -> BoxplotAggregation
37 where
38 T: ToString,
39 {
40 BoxplotAggregation {
41 boxplot: BoxplotAggregationInner {
42 field: field.to_string(),
43 compression: None,
44 missing: None,
45 },
46 }
47 }
48}
49
50impl BoxplotAggregation {
51 /// Approximate algorithms must balance memory utilization with estimation accuracy.
52 ///
53 /// The TDigest algorithm uses a number of "nodes" to approximate percentiles —— the more
54 /// nodes available, the higher the accuracy (and large memory footprint) proportional to the
55 /// volume of data. The `compression` parameter limits the maximum number of nodes to 20 * `compression`.
56 ///
57 /// Therefore, by increasing the compression value, you can increase the accuracy of your
58 /// percentiles at the cost of more memory. Larger compression values also make the algorithm
59 /// slower since the underlying tree data structure grows in size, resulting in more expensive
60 /// operations. The default compression value is 100.
61 ///
62 /// A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large amount of
63 /// data which arrives sorted and in-order) the default settings will produce a TDigest roughly
64 /// 64KB in size. In practice data tends to be more random and the TDigest will use less memory.
65 pub fn compression<T>(mut self, compression: T) -> Self
66 where
67 T: Into<Number>,
68 {
69 self.boxplot.compression = Some(compression.into());
70 self
71 }
72
73 /// The `missing` parameter defines how documents that are missing a value should be treated.
74 /// By default they will be ignored but it is also possible to treat them as if they had a value.
75 pub fn missing<T>(mut self, missing: T) -> Self
76 where
77 T: Into<Number>,
78 {
79 self.boxplot.missing = Some(missing.into());
80 self
81 }
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the JSON produced with and without the optional parameters.
    #[test]
    fn serialization() {
        // Only the mandatory field is set: optional parameters must not appear.
        let field_only = Aggregation::boxplot("test_field");
        let field_only_json = json!({ "boxplot": { "field": "test_field" } });
        assert_serialize_aggregation(field_only, field_only_json);

        // Every optional parameter is set and must be serialized next to the field.
        let fully_configured = Aggregation::boxplot("test_field")
            .compression(100)
            .missing(10);
        let fully_configured_json = json!({
            "boxplot": {
                "field": "test_field",
                "compression": 100,
                "missing": 10
            }
        });
        assert_serialize_aggregation(fully_configured, fully_configured_json);
    }
}