datafusion_functions_aggregate/
approx_distinct.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Defines physical expressions that can evaluated at runtime during query execution
19
20use crate::hyperloglog::HyperLogLog;
21use arrow::array::{BinaryArray, StringViewArray};
22use arrow::array::{
23    GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
24};
25use arrow::datatypes::{
26    ArrowPrimitiveType, FieldRef, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type,
27    UInt32Type, UInt64Type, UInt8Type,
28};
29use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
30use datafusion_common::ScalarValue;
31use datafusion_common::{
32    downcast_value, internal_err, not_impl_err, DataFusionError, Result,
33};
34use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
35use datafusion_expr::utils::format_state_name;
36use datafusion_expr::{
37    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
38};
39use datafusion_macros::user_doc;
40use std::any::Any;
41use std::fmt::{Debug, Formatter};
42use std::hash::Hash;
43use std::marker::PhantomData;
44
45make_udaf_expr_and_func!(
46    ApproxDistinct,
47    approx_distinct,
48    expression,
49    "approximate number of distinct input values",
50    approx_distinct_udaf
51);
52
53impl<T: Hash> From<&HyperLogLog<T>> for ScalarValue {
54    fn from(v: &HyperLogLog<T>) -> ScalarValue {
55        let values = v.as_ref().to_vec();
56        ScalarValue::Binary(Some(values))
57    }
58}
59
60impl<T: Hash> TryFrom<&[u8]> for HyperLogLog<T> {
61    type Error = DataFusionError;
62    fn try_from(v: &[u8]) -> Result<HyperLogLog<T>> {
63        let arr: [u8; 16384] = v.try_into().map_err(|_| {
64            DataFusionError::Internal(
65                "Impossibly got invalid binary array from states".into(),
66            )
67        })?;
68        Ok(HyperLogLog::<T>::new_with_registers(arr))
69    }
70}
71
72impl<T: Hash> TryFrom<&ScalarValue> for HyperLogLog<T> {
73    type Error = DataFusionError;
74    fn try_from(v: &ScalarValue) -> Result<HyperLogLog<T>> {
75        if let ScalarValue::Binary(Some(slice)) = v {
76            slice.as_slice().try_into()
77        } else {
78            internal_err!(
79                "Impossibly got invalid scalar value while converting to HyperLogLog"
80            )
81        }
82    }
83}
84
85#[derive(Debug)]
86struct NumericHLLAccumulator<T>
87where
88    T: ArrowPrimitiveType,
89    T::Native: Hash,
90{
91    hll: HyperLogLog<T::Native>,
92}
93
94impl<T> NumericHLLAccumulator<T>
95where
96    T: ArrowPrimitiveType,
97    T::Native: Hash,
98{
99    /// new approx_distinct accumulator
100    pub fn new() -> Self {
101        Self {
102            hll: HyperLogLog::new(),
103        }
104    }
105}
106
107#[derive(Debug)]
108struct StringHLLAccumulator<T>
109where
110    T: OffsetSizeTrait,
111{
112    hll: HyperLogLog<String>,
113    phantom_data: PhantomData<T>,
114}
115
116impl<T> StringHLLAccumulator<T>
117where
118    T: OffsetSizeTrait,
119{
120    /// new approx_distinct accumulator
121    pub fn new() -> Self {
122        Self {
123            hll: HyperLogLog::new(),
124            phantom_data: PhantomData,
125        }
126    }
127}
128
129#[derive(Debug)]
130struct StringViewHLLAccumulator<T>
131where
132    T: OffsetSizeTrait,
133{
134    hll: HyperLogLog<String>,
135    phantom_data: PhantomData<T>,
136}
137
138impl<T> StringViewHLLAccumulator<T>
139where
140    T: OffsetSizeTrait,
141{
142    pub fn new() -> Self {
143        Self {
144            hll: HyperLogLog::new(),
145            phantom_data: PhantomData,
146        }
147    }
148}
149
150#[derive(Debug)]
151struct BinaryHLLAccumulator<T>
152where
153    T: OffsetSizeTrait,
154{
155    hll: HyperLogLog<Vec<u8>>,
156    phantom_data: PhantomData<T>,
157}
158
159impl<T> BinaryHLLAccumulator<T>
160where
161    T: OffsetSizeTrait,
162{
163    /// new approx_distinct accumulator
164    pub fn new() -> Self {
165        Self {
166            hll: HyperLogLog::new(),
167            phantom_data: PhantomData,
168        }
169    }
170}
171
172macro_rules! default_accumulator_impl {
173    () => {
174        fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
175            assert_eq!(1, states.len(), "expect only 1 element in the states");
176            let binary_array = downcast_value!(states[0], BinaryArray);
177            for v in binary_array.iter() {
178                let v = v.ok_or_else(|| {
179                    DataFusionError::Internal(
180                        "Impossibly got empty binary array from states".into(),
181                    )
182                })?;
183                let other = v.try_into()?;
184                self.hll.merge(&other);
185            }
186            Ok(())
187        }
188
189        fn state(&mut self) -> Result<Vec<ScalarValue>> {
190            let value = ScalarValue::from(&self.hll);
191            Ok(vec![value])
192        }
193
194        fn evaluate(&mut self) -> Result<ScalarValue> {
195            Ok(ScalarValue::UInt64(Some(self.hll.count() as u64)))
196        }
197
198        fn size(&self) -> usize {
199            // HLL has static size
200            std::mem::size_of_val(self)
201        }
202    };
203}
204
205impl<T> Accumulator for BinaryHLLAccumulator<T>
206where
207    T: OffsetSizeTrait,
208{
209    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
210        let array: &GenericBinaryArray<T> =
211            downcast_value!(values[0], GenericBinaryArray, T);
212        // flatten because we would skip nulls
213        self.hll
214            .extend(array.into_iter().flatten().map(|v| v.to_vec()));
215        Ok(())
216    }
217
218    default_accumulator_impl!();
219}
220
221impl<T> Accumulator for StringViewHLLAccumulator<T>
222where
223    T: OffsetSizeTrait,
224{
225    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
226        let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
227        // flatten because we would skip nulls
228        self.hll
229            .extend(array.iter().flatten().map(|s| s.to_string()));
230        Ok(())
231    }
232
233    default_accumulator_impl!();
234}
235
236impl<T> Accumulator for StringHLLAccumulator<T>
237where
238    T: OffsetSizeTrait,
239{
240    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
241        let array: &GenericStringArray<T> =
242            downcast_value!(values[0], GenericStringArray, T);
243        // flatten because we would skip nulls
244        self.hll
245            .extend(array.into_iter().flatten().map(|i| i.to_string()));
246        Ok(())
247    }
248
249    default_accumulator_impl!();
250}
251
252impl<T> Accumulator for NumericHLLAccumulator<T>
253where
254    T: ArrowPrimitiveType + Debug,
255    T::Native: Hash,
256{
257    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
258        let array: &PrimitiveArray<T> = downcast_value!(values[0], PrimitiveArray, T);
259        // flatten because we would skip nulls
260        self.hll.extend(array.into_iter().flatten());
261        Ok(())
262    }
263
264    default_accumulator_impl!();
265}
266
267impl Debug for ApproxDistinct {
268    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
269        f.debug_struct("ApproxDistinct")
270            .field("name", &self.name())
271            .field("signature", &self.signature)
272            .finish()
273    }
274}
275
276impl Default for ApproxDistinct {
277    fn default() -> Self {
278        Self::new()
279    }
280}
281
282#[user_doc(
283    doc_section(label = "Approximate Functions"),
284    description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
285    syntax_example = "approx_distinct(expression)",
286    sql_example = r#"```sql
287> SELECT approx_distinct(column_name) FROM table_name;
288+-----------------------------------+
289| approx_distinct(column_name)      |
290+-----------------------------------+
291| 42                                |
292+-----------------------------------+
293```"#,
294    standard_argument(name = "expression",)
295)]
296pub struct ApproxDistinct {
297    signature: Signature,
298}
299
300impl ApproxDistinct {
301    pub fn new() -> Self {
302        Self {
303            signature: Signature::any(1, Volatility::Immutable),
304        }
305    }
306}
307
308impl AggregateUDFImpl for ApproxDistinct {
309    fn as_any(&self) -> &dyn Any {
310        self
311    }
312
313    fn name(&self) -> &str {
314        "approx_distinct"
315    }
316
317    fn signature(&self) -> &Signature {
318        &self.signature
319    }
320
321    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
322        Ok(DataType::UInt64)
323    }
324
325    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
326        Ok(vec![Field::new(
327            format_state_name(args.name, "hll_registers"),
328            DataType::Binary,
329            false,
330        )
331        .into()])
332    }
333
334    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
335        let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
336
337        let accumulator: Box<dyn Accumulator> = match data_type {
338            // TODO u8, i8, u16, i16 shall really be done using bitmap, not HLL
339            // TODO support for boolean (trivial case)
340            // https://github.com/apache/datafusion/issues/1109
341            DataType::UInt8 => Box::new(NumericHLLAccumulator::<UInt8Type>::new()),
342            DataType::UInt16 => Box::new(NumericHLLAccumulator::<UInt16Type>::new()),
343            DataType::UInt32 => Box::new(NumericHLLAccumulator::<UInt32Type>::new()),
344            DataType::UInt64 => Box::new(NumericHLLAccumulator::<UInt64Type>::new()),
345            DataType::Int8 => Box::new(NumericHLLAccumulator::<Int8Type>::new()),
346            DataType::Int16 => Box::new(NumericHLLAccumulator::<Int16Type>::new()),
347            DataType::Int32 => Box::new(NumericHLLAccumulator::<Int32Type>::new()),
348            DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
349            DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
350            DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
351            DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
352            DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
353            DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
354            other => {
355                return not_impl_err!(
356                "Support for 'approx_distinct' for data type {other} is not implemented"
357            )
358            }
359        };
360        Ok(accumulator)
361    }
362
363    fn documentation(&self) -> Option<&Documentation> {
364        self.doc()
365    }
366}