vortex_btrblocks/integer/
stats.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::hash::Hash;
5
6use arrow_buffer::BooleanBuffer;
7use num_traits::PrimInt;
8use rustc_hash::FxBuildHasher;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
11use vortex_array::stats::Stat;
12use vortex_dtype::{NativePType, match_each_integer_ptype};
13use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
14use vortex_mask::AllOr;
15use vortex_scalar::{PValue, ScalarValue};
16use vortex_utils::aliases::hash_map::HashMap;
17
18use crate::sample::sample;
19use crate::{CompressorStats, GenerateStatsOptions};
20
21#[derive(Clone, Debug)]
22pub struct TypedStats<T> {
23    pub min: T,
24    pub max: T,
25    pub top_value: T,
26    pub top_count: u32,
27    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
28}
29
30/// Type-erased container for one of the [TypedStats] variants.
31///
32/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
33/// set of stats. We then perform a variety of access methods on them.
34#[derive(Clone, Debug)]
35pub enum ErasedStats {
36    U8(TypedStats<u8>),
37    U16(TypedStats<u16>),
38    U32(TypedStats<u32>),
39    U64(TypedStats<u64>),
40    I8(TypedStats<i8>),
41    I16(TypedStats<i16>),
42    I32(TypedStats<i32>),
43    I64(TypedStats<i64>),
44}
45
46impl ErasedStats {
47    pub fn min_is_zero(&self) -> bool {
48        match &self {
49            ErasedStats::U8(x) => x.min == 0,
50            ErasedStats::U16(x) => x.min == 0,
51            ErasedStats::U32(x) => x.min == 0,
52            ErasedStats::U64(x) => x.min == 0,
53            ErasedStats::I8(x) => x.min == 0,
54            ErasedStats::I16(x) => x.min == 0,
55            ErasedStats::I32(x) => x.min == 0,
56            ErasedStats::I64(x) => x.min == 0,
57        }
58    }
59
60    pub fn min_is_negative(&self) -> bool {
61        match &self {
62            ErasedStats::U8(_)
63            | ErasedStats::U16(_)
64            | ErasedStats::U32(_)
65            | ErasedStats::U64(_) => false,
66            ErasedStats::I8(x) => x.min < 0,
67            ErasedStats::I16(x) => x.min < 0,
68            ErasedStats::I32(x) => x.min < 0,
69            ErasedStats::I64(x) => x.min < 0,
70        }
71    }
72
73    // Difference between max and min.
74    pub fn max_minus_min(&self) -> u64 {
75        match &self {
76            ErasedStats::U8(x) => (x.max - x.min) as u64,
77            ErasedStats::U16(x) => (x.max - x.min) as u64,
78            ErasedStats::U32(x) => (x.max - x.min) as u64,
79            ErasedStats::U64(x) => x.max - x.min,
80            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
81            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
82            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
83            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
84                .vortex_expect("max minus min result bigger than u64"),
85        }
86    }
87
88    /// Get the most commonly occurring value and its count
89    pub fn top_value_and_count(&self) -> (PValue, u32) {
90        match &self {
91            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
92            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
93            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
94            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
95            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
96            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
97            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
98            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
99        }
100    }
101}
102
103macro_rules! impl_from_typed {
104    ($T:ty, $variant:path) => {
105        impl From<TypedStats<$T>> for ErasedStats {
106            fn from(typed: TypedStats<$T>) -> Self {
107                $variant(typed)
108            }
109        }
110    };
111}
112
113impl_from_typed!(u8, ErasedStats::U8);
114impl_from_typed!(u16, ErasedStats::U16);
115impl_from_typed!(u32, ErasedStats::U32);
116impl_from_typed!(u64, ErasedStats::U64);
117impl_from_typed!(i8, ErasedStats::I8);
118impl_from_typed!(i16, ErasedStats::I16);
119impl_from_typed!(i32, ErasedStats::I32);
120impl_from_typed!(i64, ErasedStats::I64);
121
122#[derive(Clone, Debug)]
123pub struct IntegerStats {
124    pub(super) src: PrimitiveArray,
125    // cache for validity.false_count()
126    pub(super) null_count: u32,
127    // cache for validity.true_count()
128    pub(super) value_count: u32,
129    pub(super) average_run_length: u32,
130    pub(super) distinct_values_count: u32,
131    pub(crate) typed: ErasedStats,
132}
133
134impl CompressorStats for IntegerStats {
135    type ArrayVTable = PrimitiveVTable;
136
137    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
138        match_each_integer_ptype!(input.ptype(), |T| {
139            typed_int_stats::<T>(input, opts.count_distinct_values)
140        })
141    }
142
143    fn source(&self) -> &PrimitiveArray {
144        &self.src
145    }
146
147    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
148        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
149            .to_primitive()
150            .vortex_expect("primitive");
151
152        Self::generate_opts(&sampled, opts)
153    }
154}
155
156fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
157where
158    T: NativePType + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
159    TypedStats<T>: Into<ErasedStats>,
160    NativeValue<T>: Eq + Hash,
161{
162    // Special case: empty array
163    if array.is_empty() {
164        return IntegerStats {
165            src: array.clone(),
166            null_count: 0,
167            value_count: 0,
168            average_run_length: 0,
169            distinct_values_count: 0,
170            typed: TypedStats {
171                min: T::max_value(),
172                max: T::min_value(),
173                top_value: T::default(),
174                top_count: 0,
175                distinct_values: HashMap::with_hasher(FxBuildHasher),
176            }
177            .into(),
178        };
179    } else if array.all_invalid().vortex_expect("all_invalid") {
180        return IntegerStats {
181            src: array.clone(),
182            null_count: array.len().try_into().vortex_expect("null_count"),
183            value_count: 0,
184            average_run_length: 0,
185            distinct_values_count: 0,
186            typed: TypedStats {
187                min: T::max_value(),
188                max: T::min_value(),
189                top_value: T::default(),
190                top_count: 0,
191                distinct_values: HashMap::with_hasher(FxBuildHasher),
192            }
193            .into(),
194        };
195    }
196
197    let validity = array.validity_mask().vortex_expect("logical_validity");
198    let null_count = validity.false_count();
199    let value_count = validity.true_count();
200
201    // Initialize loop state
202    let head_idx = validity
203        .first()
204        .vortex_expect("All null masks have been handled before");
205    let buffer = array.buffer::<T>();
206    let head = buffer[head_idx];
207
208    let mut loop_state = LoopState {
209        distinct_values: if count_distinct_values {
210            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
211        } else {
212            HashMap::with_hasher(FxBuildHasher)
213        },
214        prev: head,
215        runs: 1,
216    };
217
218    let sliced = buffer.slice(head_idx..array.len());
219    let mut chunks = sliced.as_slice().chunks_exact(64);
220    match validity.boolean_buffer() {
221        AllOr::All => {
222            for chunk in &mut chunks {
223                inner_loop_nonnull(
224                    chunk.try_into().vortex_unwrap(),
225                    count_distinct_values,
226                    &mut loop_state,
227                )
228            }
229            let remainder = chunks.remainder();
230            inner_loop_naive(
231                remainder,
232                count_distinct_values,
233                &BooleanBuffer::new_set(remainder.len()),
234                &mut loop_state,
235            );
236        }
237        AllOr::None => unreachable!("All invalid arrays have been handled before"),
238        AllOr::Some(v) => {
239            let mask = v.slice(head_idx, array.len() - head_idx);
240            let mut offset = 0;
241            for chunk in &mut chunks {
242                let validity = mask.slice(offset, 64);
243                offset += 64;
244
245                match validity.count_set_bits() {
246                    // All nulls -> no stats to update
247                    0 => continue,
248                    // Inner loop for when validity check can be elided
249                    64 => inner_loop_nonnull(
250                        chunk.try_into().vortex_unwrap(),
251                        count_distinct_values,
252                        &mut loop_state,
253                    ),
254                    // Inner loop for when we need to check validity
255                    _ => inner_loop_nullable(
256                        chunk.try_into().vortex_unwrap(),
257                        count_distinct_values,
258                        &validity,
259                        &mut loop_state,
260                    ),
261                }
262            }
263            // Final iteration, run naive loop
264            let remainder = chunks.remainder();
265            inner_loop_naive(
266                remainder,
267                count_distinct_values,
268                &mask.slice(offset, remainder.len()),
269                &mut loop_state,
270            );
271        }
272    }
273
274    let (top_value, top_count) = if count_distinct_values {
275        let (&top_value, &top_count) = loop_state
276            .distinct_values
277            .iter()
278            .max_by_key(|&(_, &count)| count)
279            .vortex_expect("non-empty");
280        (top_value.0, top_count)
281    } else {
282        (T::default(), 0)
283    };
284
285    let runs = loop_state.runs;
286    let distinct_values_count = if count_distinct_values {
287        loop_state.distinct_values.len().try_into().vortex_unwrap()
288    } else {
289        u32::MAX
290    };
291
292    let min = array
293        .statistics()
294        .compute_as::<T>(Stat::Min)
295        .vortex_expect("min should be computed");
296
297    let max = array
298        .statistics()
299        .compute_as::<T>(Stat::Max)
300        .vortex_expect("max should be computed");
301
302    let typed = TypedStats {
303        min,
304        max,
305        distinct_values: loop_state.distinct_values,
306        top_value,
307        top_count,
308    };
309
310    let null_count = null_count
311        .try_into()
312        .vortex_expect("null_count must fit in u32");
313    let value_count = value_count
314        .try_into()
315        .vortex_expect("value_count must fit in u32");
316
317    IntegerStats {
318        src: array.clone(),
319        null_count,
320        value_count,
321        average_run_length: value_count / runs,
322        distinct_values_count,
323        typed: typed.into(),
324    }
325}
326
327struct LoopState<T> {
328    prev: T,
329    runs: u32,
330    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
331}
332
333#[inline(always)]
334fn inner_loop_nonnull<T: NativePType>(
335    values: &[T; 64],
336    count_distinct_values: bool,
337    state: &mut LoopState<T>,
338) where
339    NativeValue<T>: Eq + Hash,
340{
341    for &value in values {
342        if count_distinct_values {
343            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
344        }
345
346        if value != state.prev {
347            state.prev = value;
348            state.runs += 1;
349        }
350    }
351}
352
353#[inline(always)]
354fn inner_loop_nullable<T: NativePType>(
355    values: &[T; 64],
356    count_distinct_values: bool,
357    is_valid: &BooleanBuffer,
358    state: &mut LoopState<T>,
359) where
360    NativeValue<T>: Eq + Hash,
361{
362    for (idx, &value) in values.iter().enumerate() {
363        if is_valid.value(idx) {
364            if count_distinct_values {
365                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
366            }
367
368            if value != state.prev {
369                state.prev = value;
370                state.runs += 1;
371            }
372        }
373    }
374}
375
376#[inline(always)]
377fn inner_loop_naive<T: NativePType>(
378    values: &[T],
379    count_distinct_values: bool,
380    is_valid: &BooleanBuffer,
381    state: &mut LoopState<T>,
382) where
383    NativeValue<T>: Eq + Hash,
384{
385    for (idx, &value) in values.iter().enumerate() {
386        if is_valid.value(idx) {
387            if count_distinct_values {
388                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
389            }
390
391            if value != state.prev {
392                state.prev = value;
393                state.runs += 1;
394            }
395        }
396    }
397}
398
399#[cfg(test)]
400mod tests {
401    use std::iter;
402
403    use arrow_buffer::BooleanBuffer;
404    use vortex_array::arrays::PrimitiveArray;
405    use vortex_array::validity::Validity;
406    use vortex_buffer::{Buffer, buffer};
407
408    use crate::CompressorStats;
409    use crate::integer::IntegerStats;
410    use crate::integer::stats::typed_int_stats;
411
412    #[test]
413    fn test_naive_count_distinct_values() {
414        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
415        let stats = typed_int_stats::<u8>(&array, true);
416        assert_eq!(stats.distinct_values_count, 2);
417    }
418
419    #[test]
420    fn test_naive_count_distinct_values_nullable() {
421        let array = PrimitiveArray::new(
422            buffer![217u8, 0],
423            Validity::from(BooleanBuffer::from(vec![true, false])),
424        );
425        let stats = typed_int_stats::<u8>(&array, true);
426        assert_eq!(stats.distinct_values_count, 1);
427    }
428
429    #[test]
430    fn test_count_distinct_values() {
431        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
432        let stats = typed_int_stats::<u8>(&array, true);
433        assert_eq!(stats.distinct_values_count, 128);
434    }
435
436    #[test]
437    fn test_count_distinct_values_nullable() {
438        let array = PrimitiveArray::new(
439            (0..128u8).collect::<Buffer<u8>>(),
440            Validity::from(BooleanBuffer::from_iter(
441                iter::repeat_n(vec![true, false], 64).flatten(),
442            )),
443        );
444        let stats = typed_int_stats::<u8>(&array, true);
445        assert_eq!(stats.distinct_values_count, 64);
446    }
447
448    #[test]
449    fn test_integer_stats_leading_nulls() {
450        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
451
452        let stats = IntegerStats::generate(&ints);
453
454        assert_eq!(stats.value_count, 2);
455        assert_eq!(stats.null_count, 1);
456        assert_eq!(stats.average_run_length, 1);
457        assert_eq!(stats.distinct_values_count, 2);
458    }
459}