datafusion_functions_aggregate/
approx_distinct.rs1use crate::hyperloglog::HyperLogLog;
21use arrow::array::BinaryArray;
22use arrow::array::{
23 GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
24};
25use arrow::datatypes::{
26 ArrowPrimitiveType, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type,
27 UInt32Type, UInt64Type, UInt8Type,
28};
29use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
30use datafusion_common::ScalarValue;
31use datafusion_common::{
32 downcast_value, internal_err, not_impl_err, DataFusionError, Result,
33};
34use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
35use datafusion_expr::utils::format_state_name;
36use datafusion_expr::{
37 Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
38};
39use datafusion_macros::user_doc;
40use std::any::Any;
41use std::fmt::{Debug, Formatter};
42use std::hash::Hash;
43use std::marker::PhantomData;
44
45make_udaf_expr_and_func!(
46 ApproxDistinct,
47 approx_distinct,
48 expression,
49 "approximate number of distinct input values",
50 approx_distinct_udaf
51);
52
53impl<T: Hash> From<&HyperLogLog<T>> for ScalarValue {
54 fn from(v: &HyperLogLog<T>) -> ScalarValue {
55 let values = v.as_ref().to_vec();
56 ScalarValue::Binary(Some(values))
57 }
58}
59
60impl<T: Hash> TryFrom<&[u8]> for HyperLogLog<T> {
61 type Error = DataFusionError;
62 fn try_from(v: &[u8]) -> Result<HyperLogLog<T>> {
63 let arr: [u8; 16384] = v.try_into().map_err(|_| {
64 DataFusionError::Internal(
65 "Impossibly got invalid binary array from states".into(),
66 )
67 })?;
68 Ok(HyperLogLog::<T>::new_with_registers(arr))
69 }
70}
71
72impl<T: Hash> TryFrom<&ScalarValue> for HyperLogLog<T> {
73 type Error = DataFusionError;
74 fn try_from(v: &ScalarValue) -> Result<HyperLogLog<T>> {
75 if let ScalarValue::Binary(Some(slice)) = v {
76 slice.as_slice().try_into()
77 } else {
78 internal_err!(
79 "Impossibly got invalid scalar value while converting to HyperLogLog"
80 )
81 }
82 }
83}
84
85#[derive(Debug)]
86struct NumericHLLAccumulator<T>
87where
88 T: ArrowPrimitiveType,
89 T::Native: Hash,
90{
91 hll: HyperLogLog<T::Native>,
92}
93
94impl<T> NumericHLLAccumulator<T>
95where
96 T: ArrowPrimitiveType,
97 T::Native: Hash,
98{
99 pub fn new() -> Self {
101 Self {
102 hll: HyperLogLog::new(),
103 }
104 }
105}
106
107#[derive(Debug)]
108struct StringHLLAccumulator<T>
109where
110 T: OffsetSizeTrait,
111{
112 hll: HyperLogLog<String>,
113 phantom_data: PhantomData<T>,
114}
115
116impl<T> StringHLLAccumulator<T>
117where
118 T: OffsetSizeTrait,
119{
120 pub fn new() -> Self {
122 Self {
123 hll: HyperLogLog::new(),
124 phantom_data: PhantomData,
125 }
126 }
127}
128
129#[derive(Debug)]
130struct BinaryHLLAccumulator<T>
131where
132 T: OffsetSizeTrait,
133{
134 hll: HyperLogLog<Vec<u8>>,
135 phantom_data: PhantomData<T>,
136}
137
138impl<T> BinaryHLLAccumulator<T>
139where
140 T: OffsetSizeTrait,
141{
142 pub fn new() -> Self {
144 Self {
145 hll: HyperLogLog::new(),
146 phantom_data: PhantomData,
147 }
148 }
149}
150
151macro_rules! default_accumulator_impl {
152 () => {
153 fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
154 assert_eq!(1, states.len(), "expect only 1 element in the states");
155 let binary_array = downcast_value!(states[0], BinaryArray);
156 for v in binary_array.iter() {
157 let v = v.ok_or_else(|| {
158 DataFusionError::Internal(
159 "Impossibly got empty binary array from states".into(),
160 )
161 })?;
162 let other = v.try_into()?;
163 self.hll.merge(&other);
164 }
165 Ok(())
166 }
167
168 fn state(&mut self) -> Result<Vec<ScalarValue>> {
169 let value = ScalarValue::from(&self.hll);
170 Ok(vec![value])
171 }
172
173 fn evaluate(&mut self) -> Result<ScalarValue> {
174 Ok(ScalarValue::UInt64(Some(self.hll.count() as u64)))
175 }
176
177 fn size(&self) -> usize {
178 std::mem::size_of_val(self)
180 }
181 };
182}
183
184impl<T> Accumulator for BinaryHLLAccumulator<T>
185where
186 T: OffsetSizeTrait,
187{
188 fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
189 let array: &GenericBinaryArray<T> =
190 downcast_value!(values[0], GenericBinaryArray, T);
191 self.hll
193 .extend(array.into_iter().flatten().map(|v| v.to_vec()));
194 Ok(())
195 }
196
197 default_accumulator_impl!();
198}
199
200impl<T> Accumulator for StringHLLAccumulator<T>
201where
202 T: OffsetSizeTrait,
203{
204 fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
205 let array: &GenericStringArray<T> =
206 downcast_value!(values[0], GenericStringArray, T);
207 self.hll
209 .extend(array.into_iter().flatten().map(|i| i.to_string()));
210 Ok(())
211 }
212
213 default_accumulator_impl!();
214}
215
216impl<T> Accumulator for NumericHLLAccumulator<T>
217where
218 T: ArrowPrimitiveType + Debug,
219 T::Native: Hash,
220{
221 fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
222 let array: &PrimitiveArray<T> = downcast_value!(values[0], PrimitiveArray, T);
223 self.hll.extend(array.into_iter().flatten());
225 Ok(())
226 }
227
228 default_accumulator_impl!();
229}
230
231impl Debug for ApproxDistinct {
232 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
233 f.debug_struct("ApproxDistinct")
234 .field("name", &self.name())
235 .field("signature", &self.signature)
236 .finish()
237 }
238}
239
240impl Default for ApproxDistinct {
241 fn default() -> Self {
242 Self::new()
243 }
244}
245
246#[user_doc(
247 doc_section(label = "Approximate Functions"),
248 description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
249 syntax_example = "approx_distinct(expression)",
250 sql_example = r#"```sql
251> SELECT approx_distinct(column_name) FROM table_name;
252+-----------------------------------+
253| approx_distinct(column_name) |
254+-----------------------------------+
255| 42 |
256+-----------------------------------+
257```"#,
258 standard_argument(name = "expression",)
259)]
260pub struct ApproxDistinct {
261 signature: Signature,
262}
263
264impl ApproxDistinct {
265 pub fn new() -> Self {
266 Self {
267 signature: Signature::any(1, Volatility::Immutable),
268 }
269 }
270}
271
272impl AggregateUDFImpl for ApproxDistinct {
273 fn as_any(&self) -> &dyn Any {
274 self
275 }
276
277 fn name(&self) -> &str {
278 "approx_distinct"
279 }
280
281 fn signature(&self) -> &Signature {
282 &self.signature
283 }
284
285 fn return_type(&self, _: &[DataType]) -> Result<DataType> {
286 Ok(DataType::UInt64)
287 }
288
289 fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> {
290 Ok(vec![Field::new(
291 format_state_name(args.name, "hll_registers"),
292 DataType::Binary,
293 false,
294 )])
295 }
296
297 fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
298 let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
299
300 let accumulator: Box<dyn Accumulator> = match data_type {
301 DataType::UInt8 => Box::new(NumericHLLAccumulator::<UInt8Type>::new()),
305 DataType::UInt16 => Box::new(NumericHLLAccumulator::<UInt16Type>::new()),
306 DataType::UInt32 => Box::new(NumericHLLAccumulator::<UInt32Type>::new()),
307 DataType::UInt64 => Box::new(NumericHLLAccumulator::<UInt64Type>::new()),
308 DataType::Int8 => Box::new(NumericHLLAccumulator::<Int8Type>::new()),
309 DataType::Int16 => Box::new(NumericHLLAccumulator::<Int16Type>::new()),
310 DataType::Int32 => Box::new(NumericHLLAccumulator::<Int32Type>::new()),
311 DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
312 DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
313 DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
314 DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
315 DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
316 other => {
317 return not_impl_err!(
318 "Support for 'approx_distinct' for data type {other} is not implemented"
319 )
320 }
321 };
322 Ok(accumulator)
323 }
324
325 fn documentation(&self) -> Option<&Documentation> {
326 self.doc()
327 }
328}