// elasticsearch_dsl/search/aggregations/bucket/diversified_sampler_aggregation.rs

1use crate::search::*;
2use crate::util::*;
3
4/// Like the sampler aggregation this is a filtering aggregation used to limit any sub aggregations' processing
5/// to a sample of the top-scoring documents. The diversified_sampler aggregation adds the ability to limit
6/// the number of matches that share a common value such as an "author".
7///
8/// <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-diversified-sampler-aggregation.html>
9#[derive(Debug, Clone, Serialize, PartialEq)]
10pub struct DiversifiedSamplerAggregation {
11    diversified_sampler: DiversifiedSamplerAggregationInner,
12
13    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
14    aggs: Aggregations,
15}
16
17/// `execution_hint` field values.
18#[derive(Debug, Clone, Serialize, PartialEq, Eq, Copy)]
19#[serde(rename_all = "snake_case")]
20pub enum ExecutionHint {
21    /// Hold field values directly
22    Map,
23
24    /// Hold hashes of the field values - with potential for hash collisions
25    BytesHash,
26
27    /// Hold ordinals of the field as determined by the Lucene index
28    GlobalOrdinals,
29}
30
31#[derive(Debug, Clone, Serialize, PartialEq)]
32struct DiversifiedSamplerAggregationInner {
33    field: String,
34
35    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
36    shard_size: Option<u64>,
37
38    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
39    max_docs_per_value: Option<u64>,
40
41    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
42    execution_hint: Option<ExecutionHint>,
43}
44
45impl Aggregation {
46    /// Creates an instance of [`DiversifiedSamplerAggregation`]
47    pub fn diversified_sampler<T>(field: T) -> DiversifiedSamplerAggregation
48    where
49        T: ToString,
50    {
51        DiversifiedSamplerAggregation {
52            diversified_sampler: DiversifiedSamplerAggregationInner {
53                field: field.to_string(),
54                shard_size: None,
55                max_docs_per_value: None,
56                execution_hint: None,
57            },
58            aggs: Aggregations::new(),
59        }
60    }
61}
62
63impl DiversifiedSamplerAggregation {
64    /// The `shard_size` parameter limits how many top-scoring documents are
65    /// collected in the sample processed on each shard. The default value is 100.
66    pub fn shard_size(mut self, shard_size: u64) -> Self {
67        self.diversified_sampler.shard_size = Some(shard_size);
68        self
69    }
70
71    /// The `max_docs_per_value` is an optional parameter and limits how many documents
72    /// are permitted per choice of de-duplicating value. The default setting is "1".
73    pub fn max_docs_per_value(mut self, max_docs_per_value: u64) -> Self {
74        self.diversified_sampler.max_docs_per_value = Some(max_docs_per_value);
75        self
76    }
77
78    /// The optional `execution_hint` setting can influence the management of the values
79    /// used for de-duplication. Each option will hold up to `shard_size` values in memory
80    /// while performing de-duplication but the type of value held can be controlled as follows:
81    /// - hold field values directly (`map`)
82    /// - hold ordinals of the field as determined by the Lucene index (`global_ordinals`)
83    /// - hold hashes of the field values - with potential for hash collisions (`bytes_hash`)
84    pub fn execution_hint(mut self, execution_hint: ExecutionHint) -> Self {
85        self.diversified_sampler.execution_hint = Some(execution_hint);
86        self
87    }
88
89    add_aggregate!();
90}
91
#[cfg(test)]
mod tests {
    use super::*;

    /// Compares the serialized form of a sampler with only `shard_size` set, and of one
    /// with every option plus two sub-aggregations, against the expected JSON.
    #[test]
    fn serialization() {
        // Only `shard_size` is set; the expected JSON lists just `field` and `shard_size`.
        let minimal = Aggregation::diversified_sampler("catalog_id").shard_size(50);
        let minimal_json = json!({
            "diversified_sampler": {
                "field": "catalog_id",
                "shard_size": 50
            }
        });
        assert_serialize_aggregation(minimal, minimal_json);

        // Every optional setting is set and two sub-aggregations are attached.
        let full = Aggregation::diversified_sampler("catalog_id")
            .shard_size(50)
            .max_docs_per_value(2)
            .execution_hint(ExecutionHint::GlobalOrdinals)
            .aggregate("catalog", Aggregation::terms("catalog_id"))
            .aggregate("brand", Aggregation::terms("brand_id"));
        let full_json = json!({
            "diversified_sampler": {
                "field": "catalog_id",
                "shard_size": 50,
                "max_docs_per_value": 2,
                "execution_hint": "global_ordinals"
            },
            "aggs": {
                "catalog": {
                    "terms": {
                        "field": "catalog_id"
                    }
                },
                "brand": {
                    "terms": {
                        "field": "brand_id"
                    }
                }
            }
        });
        assert_serialize_aggregation(full, full_json);
    }
}