1use std::hash::Hash;
2
3use arrow_buffer::BooleanBuffer;
4use num_traits::PrimInt;
5use rustc_hash::FxBuildHasher;
6use vortex_array::aliases::hash_map::HashMap;
7use vortex_array::arrays::PrimitiveArray;
8use vortex_array::variants::PrimitiveArrayTrait;
9use vortex_array::{Array, ToCanonical};
10use vortex_dtype::{NativePType, match_each_integer_ptype};
11use vortex_error::{VortexExpect, VortexUnwrap};
12use vortex_scalar::PValue;
13
14use crate::sample::sample;
15use crate::{CompressorStats, GenerateStatsOptions};
16
17#[derive(Clone, Debug)]
18pub struct TypedStats<T> {
19 pub min: T,
20 pub max: T,
21 pub top_value: T,
22 pub top_count: u32,
23 pub distinct_values: HashMap<T, u32, FxBuildHasher>,
24}
25
26#[derive(Clone, Debug)]
31pub enum ErasedStats {
32 U8(TypedStats<u8>),
33 U16(TypedStats<u16>),
34 U32(TypedStats<u32>),
35 U64(TypedStats<u64>),
36 I8(TypedStats<i8>),
37 I16(TypedStats<i16>),
38 I32(TypedStats<i32>),
39 I64(TypedStats<i64>),
40}
41
42impl ErasedStats {
43 pub fn min_is_zero(&self) -> bool {
44 match &self {
45 ErasedStats::U8(x) => x.min == 0,
46 ErasedStats::U16(x) => x.min == 0,
47 ErasedStats::U32(x) => x.min == 0,
48 ErasedStats::U64(x) => x.min == 0,
49 ErasedStats::I8(x) => x.min == 0,
50 ErasedStats::I16(x) => x.min == 0,
51 ErasedStats::I32(x) => x.min == 0,
52 ErasedStats::I64(x) => x.min == 0,
53 }
54 }
55
56 pub fn min_is_negative(&self) -> bool {
57 match &self {
58 ErasedStats::U8(_)
59 | ErasedStats::U16(_)
60 | ErasedStats::U32(_)
61 | ErasedStats::U64(_) => false,
62 ErasedStats::I8(x) => x.min < 0,
63 ErasedStats::I16(x) => x.min < 0,
64 ErasedStats::I32(x) => x.min < 0,
65 ErasedStats::I64(x) => x.min < 0,
66 }
67 }
68
69 pub fn max_minus_min(&self) -> u64 {
71 match &self {
72 ErasedStats::U8(x) => (x.max - x.min) as u64,
73 ErasedStats::U16(x) => (x.max - x.min) as u64,
74 ErasedStats::U32(x) => (x.max - x.min) as u64,
75 ErasedStats::U64(x) => x.max - x.min,
76 ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
77 ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
78 ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
79 ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
80 .vortex_expect("max minus min result bigger than u64"),
81 }
82 }
83
84 pub fn top_value_and_count(&self) -> (PValue, u32) {
86 match &self {
87 ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
88 ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
89 ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
90 ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
91 ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
92 ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
93 ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
94 ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
95 }
96 }
97}
98
99macro_rules! impl_from_typed {
100 ($T:ty, $variant:path) => {
101 impl From<TypedStats<$T>> for ErasedStats {
102 fn from(typed: TypedStats<$T>) -> Self {
103 $variant(typed)
104 }
105 }
106 };
107}
108
109impl_from_typed!(u8, ErasedStats::U8);
110impl_from_typed!(u16, ErasedStats::U16);
111impl_from_typed!(u32, ErasedStats::U32);
112impl_from_typed!(u64, ErasedStats::U64);
113impl_from_typed!(i8, ErasedStats::I8);
114impl_from_typed!(i16, ErasedStats::I16);
115impl_from_typed!(i32, ErasedStats::I32);
116impl_from_typed!(i64, ErasedStats::I64);
117
118#[derive(Clone, Debug)]
119pub struct IntegerStats {
120 pub(super) src: PrimitiveArray,
121 pub(super) null_count: u32,
123 pub(super) value_count: u32,
125 pub(super) average_run_length: u32,
126 pub(super) distinct_values_count: u32,
127 pub(crate) typed: ErasedStats,
128}
129
130impl CompressorStats for IntegerStats {
131 type ArrayType = PrimitiveArray;
132
133 fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
134 match_each_integer_ptype!(input.ptype(), |$T| {
135 typed_int_stats::<$T>(input, opts.count_distinct_values)
136 })
137 }
138
139 fn source(&self) -> &PrimitiveArray {
140 &self.src
141 }
142
143 fn sample_opts(&self, sample_size: u16, sample_count: u16, opts: GenerateStatsOptions) -> Self {
144 let sampled = sample(self.src.clone(), sample_size, sample_count)
145 .to_primitive()
146 .vortex_expect("primitive");
147
148 Self::generate_opts(&sampled, opts)
149 }
150}
151
152fn typed_int_stats<T: NativePType + Hash + PrimInt>(
153 array: &PrimitiveArray,
154 count_distinct_values: bool,
155) -> IntegerStats
156where
157 TypedStats<T>: Into<ErasedStats>,
158{
159 if array.is_empty() {
161 return IntegerStats {
162 src: array.clone(),
163 null_count: 0,
164 value_count: 0,
165 average_run_length: 0,
166 distinct_values_count: 0,
167 typed: TypedStats {
168 min: T::max_value(),
169 max: T::min_value(),
170 top_value: T::default(),
171 top_count: 0,
172 distinct_values: HashMap::with_hasher(FxBuildHasher),
173 }
174 .into(),
175 };
176 } else if array.all_invalid().vortex_expect("all_invalid") {
177 return IntegerStats {
178 src: array.clone(),
179 null_count: array.len().try_into().vortex_expect("null_count"),
180 value_count: 0,
181 average_run_length: 0,
182 distinct_values_count: 0,
183 typed: TypedStats {
184 min: T::max_value(),
185 max: T::min_value(),
186 top_value: T::default(),
187 top_count: 0,
188 distinct_values: HashMap::with_hasher(FxBuildHasher),
189 }
190 .into(),
191 };
192 }
193
194 let validity = array.validity_mask().vortex_expect("logical_validity");
195 let null_count = validity.false_count();
196 let value_count = validity.true_count();
197
198 let head = array.as_slice::<T>()[0];
200 let mut loop_state = LoopState {
201 min: head,
202 max: head,
203 distinct_values: if count_distinct_values {
204 HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
205 } else {
206 HashMap::with_hasher(FxBuildHasher)
207 },
208 distinct_values_count: if count_distinct_values { 0 } else { u32::MAX },
209 prev: head,
210 runs: 1,
211 };
212
213 let values = array.buffer::<T>();
214 let mask = validity.to_boolean_buffer();
215
216 let mut offset = 0;
217 for chunk in values.as_slice().chunks(64) {
218 let validity = mask.slice(offset, chunk.len());
219 offset += chunk.len();
220
221 if chunk.len() < 64 {
222 inner_loop_naive(chunk, count_distinct_values, &validity, &mut loop_state);
224 break;
225 }
226
227 let set_bits = validity.count_set_bits();
228
229 match set_bits {
230 0 => continue,
232 64 => inner_loop_nonnull(
234 chunk.try_into().vortex_unwrap(),
235 count_distinct_values,
236 &mut loop_state,
237 ),
238 _ => inner_loop_nullable(
240 chunk.try_into().vortex_unwrap(),
241 count_distinct_values,
242 &validity,
243 &mut loop_state,
244 ),
245 }
246 }
247
248 let (top_value, top_count) = if count_distinct_values {
249 let (&top_value, &top_count) = loop_state
250 .distinct_values
251 .iter()
252 .max_by_key(|&(_, &count)| count)
253 .vortex_expect("non-empty");
254 (top_value, top_count)
255 } else {
256 (T::default(), 0)
257 };
258
259 let runs = loop_state.runs;
260 let distinct_values_count = loop_state.distinct_values_count;
261
262 let typed = TypedStats {
263 min: loop_state.min,
264 max: loop_state.max,
265 distinct_values: loop_state.distinct_values,
266 top_value,
267 top_count,
268 };
269
270 let null_count = null_count
271 .try_into()
272 .vortex_expect("null_count must fit in u32");
273 let value_count = value_count
274 .try_into()
275 .vortex_expect("value_count must fit in u32");
276
277 IntegerStats {
278 src: array.clone(),
279 null_count,
280 value_count,
281 average_run_length: value_count / runs,
282 distinct_values_count,
283 typed: typed.into(),
284 }
285}
286
287struct LoopState<T> {
288 min: T,
289 max: T,
290 prev: T,
291 runs: u32,
292 distinct_values_count: u32,
293 distinct_values: HashMap<T, u32, FxBuildHasher>,
294}
295
296#[inline(always)]
297fn inner_loop_nonnull<T: PrimInt + Hash>(
298 values: &[T; 64],
299 count_distinct_values: bool,
300 state: &mut LoopState<T>,
301) {
302 for &value in values {
303 state.min = state.min.min(value);
304 state.max = state.max.max(value);
305
306 if count_distinct_values {
307 *state.distinct_values.entry(value).or_insert(0) += 1;
308 state.distinct_values_count = state.distinct_values.len().try_into().vortex_unwrap();
309 }
310
311 if value != state.prev {
312 state.prev = value;
313 state.runs += 1;
314 }
315 }
316}
317
318#[inline(always)]
319fn inner_loop_nullable<T: PrimInt + Hash>(
320 values: &[T; 64],
321 count_distinct_values: bool,
322 is_valid: &BooleanBuffer,
323 state: &mut LoopState<T>,
324) {
325 for (idx, &value) in values.iter().enumerate() {
326 if is_valid.value(idx) {
327 state.min = state.min.min(value);
328 state.max = state.max.max(value);
329
330 if count_distinct_values {
331 *state.distinct_values.entry(value).or_insert(0) += 1;
332 state.distinct_values_count =
333 state.distinct_values.len().try_into().vortex_unwrap();
334 }
335
336 if value != state.prev {
337 state.prev = value;
338 state.runs += 1;
339 }
340 }
341 }
342}
343
344#[inline(always)]
345fn inner_loop_naive<T: PrimInt + Hash>(
346 values: &[T],
347 count_distinct_values: bool,
348 is_valid: &BooleanBuffer,
349 state: &mut LoopState<T>,
350) {
351 for (idx, &value) in values.iter().enumerate() {
352 if is_valid.value(idx) {
353 state.min = state.min.min(value);
354 state.max = state.max.max(value);
355
356 if count_distinct_values {
357 *state.distinct_values.entry(value).or_insert(0) += 1;
358 state.distinct_values_count =
359 state.distinct_values.len().try_into().vortex_unwrap();
360 }
361
362 if value != state.prev {
363 state.prev = value;
364 state.runs += 1;
365 }
366 }
367 }
368}
369
370#[cfg(test)]
371mod tests {
372 use std::iter;
373
374 use arrow_buffer::BooleanBuffer;
375 use vortex_array::arrays::PrimitiveArray;
376 use vortex_array::validity::Validity;
377 use vortex_buffer::{Buffer, buffer};
378
379 use crate::integer::stats::typed_int_stats;
380
381 #[test]
382 fn test_naive_count_distinct_values() {
383 let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
384 let stats = typed_int_stats::<u8>(&array, true);
385 assert_eq!(stats.distinct_values_count, 2);
386 }
387
388 #[test]
389 fn test_naive_count_distinct_values_nullable() {
390 let array = PrimitiveArray::new(
391 buffer![217u8, 0],
392 Validity::from(BooleanBuffer::from(vec![true, false])),
393 );
394 let stats = typed_int_stats::<u8>(&array, true);
395 assert_eq!(stats.distinct_values_count, 1);
396 }
397
398 #[test]
399 fn test_count_distinct_values() {
400 let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
401 let stats = typed_int_stats::<u8>(&array, true);
402 assert_eq!(stats.distinct_values_count, 128);
403 }
404
405 #[test]
406 fn test_count_distinct_values_nullable() {
407 let array = PrimitiveArray::new(
408 (0..128u8).collect::<Buffer<u8>>(),
409 Validity::from(BooleanBuffer::from_iter(
410 iter::repeat_n(vec![true, false], 64).flatten(),
411 )),
412 );
413 let stats = typed_int_stats::<u8>(&array, true);
414 assert_eq!(stats.distinct_values_count, 64);
415 }
416}