datafusion_functions_aggregate/
approx_distinct.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Defines physical expressions that can evaluated at runtime during query execution
19
20use crate::hyperloglog::HyperLogLog;
21use arrow::array::{BinaryArray, StringViewArray};
22use arrow::array::{
23    GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
24};
25use arrow::datatypes::{
26    ArrowPrimitiveType, Date32Type, Date64Type, FieldRef, Int8Type, Int16Type, Int32Type,
27    Int64Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
28    Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
29    TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type,
30    UInt64Type,
31};
32use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
33use datafusion_common::ScalarValue;
34use datafusion_common::{
35    DataFusionError, Result, downcast_value, internal_datafusion_err, internal_err,
36    not_impl_err,
37};
38use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
39use datafusion_expr::utils::format_state_name;
40use datafusion_expr::{
41    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
42};
43use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator;
44use datafusion_macros::user_doc;
45use std::any::Any;
46use std::fmt::{Debug, Formatter};
47use std::hash::Hash;
48use std::marker::PhantomData;
49
50make_udaf_expr_and_func!(
51    ApproxDistinct,
52    approx_distinct,
53    expression,
54    "approximate number of distinct input values",
55    approx_distinct_udaf
56);
57
58impl<T: Hash> From<&HyperLogLog<T>> for ScalarValue {
59    fn from(v: &HyperLogLog<T>) -> ScalarValue {
60        let values = v.as_ref().to_vec();
61        ScalarValue::Binary(Some(values))
62    }
63}
64
65impl<T: Hash> TryFrom<&[u8]> for HyperLogLog<T> {
66    type Error = DataFusionError;
67    fn try_from(v: &[u8]) -> Result<HyperLogLog<T>> {
68        let arr: [u8; 16384] = v.try_into().map_err(|_| {
69            internal_datafusion_err!("Impossibly got invalid binary array from states")
70        })?;
71        Ok(HyperLogLog::<T>::new_with_registers(arr))
72    }
73}
74
75impl<T: Hash> TryFrom<&ScalarValue> for HyperLogLog<T> {
76    type Error = DataFusionError;
77    fn try_from(v: &ScalarValue) -> Result<HyperLogLog<T>> {
78        if let ScalarValue::Binary(Some(slice)) = v {
79            slice.as_slice().try_into()
80        } else {
81            internal_err!(
82                "Impossibly got invalid scalar value while converting to HyperLogLog"
83            )
84        }
85    }
86}
87
88#[derive(Debug)]
89struct NumericHLLAccumulator<T>
90where
91    T: ArrowPrimitiveType,
92    T::Native: Hash,
93{
94    hll: HyperLogLog<T::Native>,
95}
96
97impl<T> NumericHLLAccumulator<T>
98where
99    T: ArrowPrimitiveType,
100    T::Native: Hash,
101{
102    /// new approx_distinct accumulator
103    pub fn new() -> Self {
104        Self {
105            hll: HyperLogLog::new(),
106        }
107    }
108}
109
110#[derive(Debug)]
111struct StringHLLAccumulator<T>
112where
113    T: OffsetSizeTrait,
114{
115    hll: HyperLogLog<String>,
116    phantom_data: PhantomData<T>,
117}
118
119impl<T> StringHLLAccumulator<T>
120where
121    T: OffsetSizeTrait,
122{
123    /// new approx_distinct accumulator
124    pub fn new() -> Self {
125        Self {
126            hll: HyperLogLog::new(),
127            phantom_data: PhantomData,
128        }
129    }
130}
131
132#[derive(Debug)]
133struct StringViewHLLAccumulator<T>
134where
135    T: OffsetSizeTrait,
136{
137    hll: HyperLogLog<String>,
138    phantom_data: PhantomData<T>,
139}
140
141impl<T> StringViewHLLAccumulator<T>
142where
143    T: OffsetSizeTrait,
144{
145    pub fn new() -> Self {
146        Self {
147            hll: HyperLogLog::new(),
148            phantom_data: PhantomData,
149        }
150    }
151}
152
153#[derive(Debug)]
154struct BinaryHLLAccumulator<T>
155where
156    T: OffsetSizeTrait,
157{
158    hll: HyperLogLog<Vec<u8>>,
159    phantom_data: PhantomData<T>,
160}
161
162impl<T> BinaryHLLAccumulator<T>
163where
164    T: OffsetSizeTrait,
165{
166    /// new approx_distinct accumulator
167    pub fn new() -> Self {
168        Self {
169            hll: HyperLogLog::new(),
170            phantom_data: PhantomData,
171        }
172    }
173}
174
175macro_rules! default_accumulator_impl {
176    () => {
177        fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
178            assert_eq!(1, states.len(), "expect only 1 element in the states");
179            let binary_array = downcast_value!(states[0], BinaryArray);
180            for v in binary_array.iter() {
181                let v = v.ok_or_else(|| {
182                    internal_datafusion_err!(
183                        "Impossibly got empty binary array from states"
184                    )
185                })?;
186                let other = v.try_into()?;
187                self.hll.merge(&other);
188            }
189            Ok(())
190        }
191
192        fn state(&mut self) -> Result<Vec<ScalarValue>> {
193            let value = ScalarValue::from(&self.hll);
194            Ok(vec![value])
195        }
196
197        fn evaluate(&mut self) -> Result<ScalarValue> {
198            Ok(ScalarValue::UInt64(Some(self.hll.count() as u64)))
199        }
200
201        fn size(&self) -> usize {
202            // HLL has static size
203            std::mem::size_of_val(self)
204        }
205    };
206}
207
208impl<T> Accumulator for BinaryHLLAccumulator<T>
209where
210    T: OffsetSizeTrait,
211{
212    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
213        let array: &GenericBinaryArray<T> =
214            downcast_value!(values[0], GenericBinaryArray, T);
215        // flatten because we would skip nulls
216        self.hll
217            .extend(array.into_iter().flatten().map(|v| v.to_vec()));
218        Ok(())
219    }
220
221    default_accumulator_impl!();
222}
223
224impl<T> Accumulator for StringViewHLLAccumulator<T>
225where
226    T: OffsetSizeTrait,
227{
228    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
229        let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
230        // flatten because we would skip nulls
231        self.hll
232            .extend(array.iter().flatten().map(|s| s.to_string()));
233        Ok(())
234    }
235
236    default_accumulator_impl!();
237}
238
239impl<T> Accumulator for StringHLLAccumulator<T>
240where
241    T: OffsetSizeTrait,
242{
243    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
244        let array: &GenericStringArray<T> =
245            downcast_value!(values[0], GenericStringArray, T);
246        // flatten because we would skip nulls
247        self.hll
248            .extend(array.into_iter().flatten().map(|i| i.to_string()));
249        Ok(())
250    }
251
252    default_accumulator_impl!();
253}
254
255impl<T> Accumulator for NumericHLLAccumulator<T>
256where
257    T: ArrowPrimitiveType + Debug,
258    T::Native: Hash,
259{
260    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
261        let array: &PrimitiveArray<T> = downcast_value!(values[0], PrimitiveArray, T);
262        // flatten because we would skip nulls
263        self.hll.extend(array.into_iter().flatten());
264        Ok(())
265    }
266
267    default_accumulator_impl!();
268}
269
270impl Debug for ApproxDistinct {
271    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
272        f.debug_struct("ApproxDistinct")
273            .field("name", &self.name())
274            .field("signature", &self.signature)
275            .finish()
276    }
277}
278
279impl Default for ApproxDistinct {
280    fn default() -> Self {
281        Self::new()
282    }
283}
284
285#[user_doc(
286    doc_section(label = "Approximate Functions"),
287    description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
288    syntax_example = "approx_distinct(expression)",
289    sql_example = r#"```sql
290> SELECT approx_distinct(column_name) FROM table_name;
291+-----------------------------------+
292| approx_distinct(column_name)      |
293+-----------------------------------+
294| 42                                |
295+-----------------------------------+
296```"#,
297    standard_argument(name = "expression",)
298)]
299#[derive(PartialEq, Eq, Hash)]
300pub struct ApproxDistinct {
301    signature: Signature,
302}
303
304impl ApproxDistinct {
305    pub fn new() -> Self {
306        Self {
307            signature: Signature::any(1, Volatility::Immutable),
308        }
309    }
310}
311
312impl AggregateUDFImpl for ApproxDistinct {
313    fn as_any(&self) -> &dyn Any {
314        self
315    }
316
317    fn name(&self) -> &str {
318        "approx_distinct"
319    }
320
321    fn signature(&self) -> &Signature {
322        &self.signature
323    }
324
325    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
326        Ok(DataType::UInt64)
327    }
328
329    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
330        if args.input_fields[0].data_type().is_null() {
331            Ok(vec![
332                Field::new(
333                    format_state_name(args.name, self.name()),
334                    DataType::Null,
335                    true,
336                )
337                .into(),
338            ])
339        } else {
340            Ok(vec![
341                Field::new(
342                    format_state_name(args.name, "hll_registers"),
343                    DataType::Binary,
344                    false,
345                )
346                .into(),
347            ])
348        }
349    }
350
351    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
352        let data_type = acc_args.expr_fields[0].data_type();
353
354        let accumulator: Box<dyn Accumulator> = match data_type {
355            // TODO u8, i8, u16, i16 shall really be done using bitmap, not HLL
356            // TODO support for boolean (trivial case)
357            // https://github.com/apache/datafusion/issues/1109
358            DataType::UInt8 => Box::new(NumericHLLAccumulator::<UInt8Type>::new()),
359            DataType::UInt16 => Box::new(NumericHLLAccumulator::<UInt16Type>::new()),
360            DataType::UInt32 => Box::new(NumericHLLAccumulator::<UInt32Type>::new()),
361            DataType::UInt64 => Box::new(NumericHLLAccumulator::<UInt64Type>::new()),
362            DataType::Int8 => Box::new(NumericHLLAccumulator::<Int8Type>::new()),
363            DataType::Int16 => Box::new(NumericHLLAccumulator::<Int16Type>::new()),
364            DataType::Int32 => Box::new(NumericHLLAccumulator::<Int32Type>::new()),
365            DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
366            DataType::Date32 => Box::new(NumericHLLAccumulator::<Date32Type>::new()),
367            DataType::Date64 => Box::new(NumericHLLAccumulator::<Date64Type>::new()),
368            DataType::Time32(TimeUnit::Second) => {
369                Box::new(NumericHLLAccumulator::<Time32SecondType>::new())
370            }
371            DataType::Time32(TimeUnit::Millisecond) => {
372                Box::new(NumericHLLAccumulator::<Time32MillisecondType>::new())
373            }
374            DataType::Time64(TimeUnit::Microsecond) => {
375                Box::new(NumericHLLAccumulator::<Time64MicrosecondType>::new())
376            }
377            DataType::Time64(TimeUnit::Nanosecond) => {
378                Box::new(NumericHLLAccumulator::<Time64NanosecondType>::new())
379            }
380            DataType::Timestamp(TimeUnit::Second, _) => {
381                Box::new(NumericHLLAccumulator::<TimestampSecondType>::new())
382            }
383            DataType::Timestamp(TimeUnit::Millisecond, _) => {
384                Box::new(NumericHLLAccumulator::<TimestampMillisecondType>::new())
385            }
386            DataType::Timestamp(TimeUnit::Microsecond, _) => {
387                Box::new(NumericHLLAccumulator::<TimestampMicrosecondType>::new())
388            }
389            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
390                Box::new(NumericHLLAccumulator::<TimestampNanosecondType>::new())
391            }
392            DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
393            DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
394            DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
395            DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
396            DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
397            DataType::Null => {
398                Box::new(NoopAccumulator::new(ScalarValue::UInt64(Some(0))))
399            }
400            other => {
401                return not_impl_err!(
402                    "Support for 'approx_distinct' for data type {other} is not implemented"
403                );
404            }
405        };
406        Ok(accumulator)
407    }
408
409    fn documentation(&self) -> Option<&Documentation> {
410        self.doc()
411    }
412}