datafusion_functions_aggregate/
approx_distinct.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Defines physical expressions that can evaluated at runtime during query execution
19
20use crate::hyperloglog::HyperLogLog;
21use arrow::array::{BinaryArray, StringViewArray};
22use arrow::array::{
23    GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
24};
25use arrow::datatypes::{
26    ArrowPrimitiveType, Date32Type, Date64Type, FieldRef, Int16Type, Int32Type,
27    Int64Type, Int8Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
28    Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
29    TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type,
30    UInt8Type,
31};
32use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
33use datafusion_common::ScalarValue;
34use datafusion_common::{
35    downcast_value, internal_datafusion_err, internal_err, not_impl_err, DataFusionError,
36    Result,
37};
38use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
39use datafusion_expr::utils::format_state_name;
40use datafusion_expr::{
41    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
42};
43use datafusion_macros::user_doc;
44use std::any::Any;
45use std::fmt::{Debug, Formatter};
46use std::hash::Hash;
47use std::marker::PhantomData;
48
49make_udaf_expr_and_func!(
50    ApproxDistinct,
51    approx_distinct,
52    expression,
53    "approximate number of distinct input values",
54    approx_distinct_udaf
55);
56
57impl<T: Hash> From<&HyperLogLog<T>> for ScalarValue {
58    fn from(v: &HyperLogLog<T>) -> ScalarValue {
59        let values = v.as_ref().to_vec();
60        ScalarValue::Binary(Some(values))
61    }
62}
63
64impl<T: Hash> TryFrom<&[u8]> for HyperLogLog<T> {
65    type Error = DataFusionError;
66    fn try_from(v: &[u8]) -> Result<HyperLogLog<T>> {
67        let arr: [u8; 16384] = v.try_into().map_err(|_| {
68            internal_datafusion_err!("Impossibly got invalid binary array from states")
69        })?;
70        Ok(HyperLogLog::<T>::new_with_registers(arr))
71    }
72}
73
74impl<T: Hash> TryFrom<&ScalarValue> for HyperLogLog<T> {
75    type Error = DataFusionError;
76    fn try_from(v: &ScalarValue) -> Result<HyperLogLog<T>> {
77        if let ScalarValue::Binary(Some(slice)) = v {
78            slice.as_slice().try_into()
79        } else {
80            internal_err!(
81                "Impossibly got invalid scalar value while converting to HyperLogLog"
82            )
83        }
84    }
85}
86
87#[derive(Debug)]
88struct NumericHLLAccumulator<T>
89where
90    T: ArrowPrimitiveType,
91    T::Native: Hash,
92{
93    hll: HyperLogLog<T::Native>,
94}
95
96impl<T> NumericHLLAccumulator<T>
97where
98    T: ArrowPrimitiveType,
99    T::Native: Hash,
100{
101    /// new approx_distinct accumulator
102    pub fn new() -> Self {
103        Self {
104            hll: HyperLogLog::new(),
105        }
106    }
107}
108
109#[derive(Debug)]
110struct StringHLLAccumulator<T>
111where
112    T: OffsetSizeTrait,
113{
114    hll: HyperLogLog<String>,
115    phantom_data: PhantomData<T>,
116}
117
118impl<T> StringHLLAccumulator<T>
119where
120    T: OffsetSizeTrait,
121{
122    /// new approx_distinct accumulator
123    pub fn new() -> Self {
124        Self {
125            hll: HyperLogLog::new(),
126            phantom_data: PhantomData,
127        }
128    }
129}
130
131#[derive(Debug)]
132struct StringViewHLLAccumulator<T>
133where
134    T: OffsetSizeTrait,
135{
136    hll: HyperLogLog<String>,
137    phantom_data: PhantomData<T>,
138}
139
140impl<T> StringViewHLLAccumulator<T>
141where
142    T: OffsetSizeTrait,
143{
144    pub fn new() -> Self {
145        Self {
146            hll: HyperLogLog::new(),
147            phantom_data: PhantomData,
148        }
149    }
150}
151
152#[derive(Debug)]
153struct BinaryHLLAccumulator<T>
154where
155    T: OffsetSizeTrait,
156{
157    hll: HyperLogLog<Vec<u8>>,
158    phantom_data: PhantomData<T>,
159}
160
161impl<T> BinaryHLLAccumulator<T>
162where
163    T: OffsetSizeTrait,
164{
165    /// new approx_distinct accumulator
166    pub fn new() -> Self {
167        Self {
168            hll: HyperLogLog::new(),
169            phantom_data: PhantomData,
170        }
171    }
172}
173
174#[derive(Debug)]
175struct NullHLLAccumulator;
176
177macro_rules! default_accumulator_impl {
178    () => {
179        fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
180            assert_eq!(1, states.len(), "expect only 1 element in the states");
181            let binary_array = downcast_value!(states[0], BinaryArray);
182            for v in binary_array.iter() {
183                let v = v.ok_or_else(|| {
184                    internal_datafusion_err!(
185                        "Impossibly got empty binary array from states"
186                    )
187                })?;
188                let other = v.try_into()?;
189                self.hll.merge(&other);
190            }
191            Ok(())
192        }
193
194        fn state(&mut self) -> Result<Vec<ScalarValue>> {
195            let value = ScalarValue::from(&self.hll);
196            Ok(vec![value])
197        }
198
199        fn evaluate(&mut self) -> Result<ScalarValue> {
200            Ok(ScalarValue::UInt64(Some(self.hll.count() as u64)))
201        }
202
203        fn size(&self) -> usize {
204            // HLL has static size
205            std::mem::size_of_val(self)
206        }
207    };
208}
209
210impl<T> Accumulator for BinaryHLLAccumulator<T>
211where
212    T: OffsetSizeTrait,
213{
214    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
215        let array: &GenericBinaryArray<T> =
216            downcast_value!(values[0], GenericBinaryArray, T);
217        // flatten because we would skip nulls
218        self.hll
219            .extend(array.into_iter().flatten().map(|v| v.to_vec()));
220        Ok(())
221    }
222
223    default_accumulator_impl!();
224}
225
226impl<T> Accumulator for StringViewHLLAccumulator<T>
227where
228    T: OffsetSizeTrait,
229{
230    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
231        let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
232        // flatten because we would skip nulls
233        self.hll
234            .extend(array.iter().flatten().map(|s| s.to_string()));
235        Ok(())
236    }
237
238    default_accumulator_impl!();
239}
240
241impl<T> Accumulator for StringHLLAccumulator<T>
242where
243    T: OffsetSizeTrait,
244{
245    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
246        let array: &GenericStringArray<T> =
247            downcast_value!(values[0], GenericStringArray, T);
248        // flatten because we would skip nulls
249        self.hll
250            .extend(array.into_iter().flatten().map(|i| i.to_string()));
251        Ok(())
252    }
253
254    default_accumulator_impl!();
255}
256
257impl<T> Accumulator for NumericHLLAccumulator<T>
258where
259    T: ArrowPrimitiveType + Debug,
260    T::Native: Hash,
261{
262    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
263        let array: &PrimitiveArray<T> = downcast_value!(values[0], PrimitiveArray, T);
264        // flatten because we would skip nulls
265        self.hll.extend(array.into_iter().flatten());
266        Ok(())
267    }
268
269    default_accumulator_impl!();
270}
271
272impl Accumulator for NullHLLAccumulator {
273    fn update_batch(&mut self, _values: &[ArrayRef]) -> Result<()> {
274        // do nothing, all values are null
275        Ok(())
276    }
277
278    fn merge_batch(&mut self, _states: &[ArrayRef]) -> Result<()> {
279        Ok(())
280    }
281
282    fn state(&mut self) -> Result<Vec<ScalarValue>> {
283        Ok(vec![])
284    }
285
286    fn evaluate(&mut self) -> Result<ScalarValue> {
287        Ok(ScalarValue::UInt64(Some(0)))
288    }
289
290    fn size(&self) -> usize {
291        size_of_val(self)
292    }
293}
294
295impl Debug for ApproxDistinct {
296    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
297        f.debug_struct("ApproxDistinct")
298            .field("name", &self.name())
299            .field("signature", &self.signature)
300            .finish()
301    }
302}
303
304impl Default for ApproxDistinct {
305    fn default() -> Self {
306        Self::new()
307    }
308}
309
310#[user_doc(
311    doc_section(label = "Approximate Functions"),
312    description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
313    syntax_example = "approx_distinct(expression)",
314    sql_example = r#"```sql
315> SELECT approx_distinct(column_name) FROM table_name;
316+-----------------------------------+
317| approx_distinct(column_name)      |
318+-----------------------------------+
319| 42                                |
320+-----------------------------------+
321```"#,
322    standard_argument(name = "expression",)
323)]
324#[derive(PartialEq, Eq, Hash)]
325pub struct ApproxDistinct {
326    signature: Signature,
327}
328
329impl ApproxDistinct {
330    pub fn new() -> Self {
331        Self {
332            signature: Signature::any(1, Volatility::Immutable),
333        }
334    }
335}
336
337impl AggregateUDFImpl for ApproxDistinct {
338    fn as_any(&self) -> &dyn Any {
339        self
340    }
341
342    fn name(&self) -> &str {
343        "approx_distinct"
344    }
345
346    fn signature(&self) -> &Signature {
347        &self.signature
348    }
349
350    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
351        Ok(DataType::UInt64)
352    }
353
354    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
355        Ok(vec![Field::new(
356            format_state_name(args.name, "hll_registers"),
357            DataType::Binary,
358            false,
359        )
360        .into()])
361    }
362
363    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
364        let data_type = acc_args.expr_fields[0].data_type();
365
366        let accumulator: Box<dyn Accumulator> = match data_type {
367            // TODO u8, i8, u16, i16 shall really be done using bitmap, not HLL
368            // TODO support for boolean (trivial case)
369            // https://github.com/apache/datafusion/issues/1109
370            DataType::UInt8 => Box::new(NumericHLLAccumulator::<UInt8Type>::new()),
371            DataType::UInt16 => Box::new(NumericHLLAccumulator::<UInt16Type>::new()),
372            DataType::UInt32 => Box::new(NumericHLLAccumulator::<UInt32Type>::new()),
373            DataType::UInt64 => Box::new(NumericHLLAccumulator::<UInt64Type>::new()),
374            DataType::Int8 => Box::new(NumericHLLAccumulator::<Int8Type>::new()),
375            DataType::Int16 => Box::new(NumericHLLAccumulator::<Int16Type>::new()),
376            DataType::Int32 => Box::new(NumericHLLAccumulator::<Int32Type>::new()),
377            DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
378            DataType::Date32 => Box::new(NumericHLLAccumulator::<Date32Type>::new()),
379            DataType::Date64 => Box::new(NumericHLLAccumulator::<Date64Type>::new()),
380            DataType::Time32(TimeUnit::Second) => {
381                Box::new(NumericHLLAccumulator::<Time32SecondType>::new())
382            }
383            DataType::Time32(TimeUnit::Millisecond) => {
384                Box::new(NumericHLLAccumulator::<Time32MillisecondType>::new())
385            }
386            DataType::Time64(TimeUnit::Microsecond) => {
387                Box::new(NumericHLLAccumulator::<Time64MicrosecondType>::new())
388            }
389            DataType::Time64(TimeUnit::Nanosecond) => {
390                Box::new(NumericHLLAccumulator::<Time64NanosecondType>::new())
391            }
392            DataType::Timestamp(TimeUnit::Second, _) => {
393                Box::new(NumericHLLAccumulator::<TimestampSecondType>::new())
394            }
395            DataType::Timestamp(TimeUnit::Millisecond, _) => {
396                Box::new(NumericHLLAccumulator::<TimestampMillisecondType>::new())
397            }
398            DataType::Timestamp(TimeUnit::Microsecond, _) => {
399                Box::new(NumericHLLAccumulator::<TimestampMicrosecondType>::new())
400            }
401            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
402                Box::new(NumericHLLAccumulator::<TimestampNanosecondType>::new())
403            }
404            DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
405            DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
406            DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
407            DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
408            DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
409            DataType::Null => Box::new(NullHLLAccumulator),
410            other => {
411                return not_impl_err!(
412                "Support for 'approx_distinct' for data type {other} is not implemented"
413            )
414            }
415        };
416        Ok(accumulator)
417    }
418
419    fn documentation(&self) -> Option<&Documentation> {
420        self.doc()
421    }
422}