vortex_btrblocks/integer/
stats.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::hash::Hash;
5
6use num_traits::PrimInt;
7use rustc_hash::FxBuildHasher;
8use vortex_array::ToCanonical;
9use vortex_array::arrays::NativeValue;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::PrimitiveVTable;
12use vortex_array::expr::stats::Stat;
13use vortex_buffer::BitBuffer;
14use vortex_dtype::IntegerPType;
15use vortex_dtype::match_each_integer_ptype;
16use vortex_error::VortexError;
17use vortex_error::VortexExpect;
18use vortex_error::VortexUnwrap;
19use vortex_mask::AllOr;
20use vortex_scalar::PValue;
21use vortex_scalar::Scalar;
22use vortex_utils::aliases::hash_map::HashMap;
23
24use crate::CompressorStats;
25use crate::GenerateStatsOptions;
26use crate::rle::RLEStats;
27use crate::sample::sample;
28
29#[derive(Clone, Debug)]
30pub struct TypedStats<T> {
31    pub min: T,
32    pub max: T,
33    pub top_value: T,
34    pub top_count: u32,
35    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
36}
37
38/// Type-erased container for one of the [TypedStats] variants.
39///
40/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
41/// set of stats. We then perform a variety of access methods on them.
42#[derive(Clone, Debug)]
43pub enum ErasedStats {
44    U8(TypedStats<u8>),
45    U16(TypedStats<u16>),
46    U32(TypedStats<u32>),
47    U64(TypedStats<u64>),
48    I8(TypedStats<i8>),
49    I16(TypedStats<i16>),
50    I32(TypedStats<i32>),
51    I64(TypedStats<i64>),
52}
53
54impl ErasedStats {
55    pub fn min_is_zero(&self) -> bool {
56        match &self {
57            ErasedStats::U8(x) => x.min == 0,
58            ErasedStats::U16(x) => x.min == 0,
59            ErasedStats::U32(x) => x.min == 0,
60            ErasedStats::U64(x) => x.min == 0,
61            ErasedStats::I8(x) => x.min == 0,
62            ErasedStats::I16(x) => x.min == 0,
63            ErasedStats::I32(x) => x.min == 0,
64            ErasedStats::I64(x) => x.min == 0,
65        }
66    }
67
68    pub fn min_is_negative(&self) -> bool {
69        match &self {
70            ErasedStats::U8(_)
71            | ErasedStats::U16(_)
72            | ErasedStats::U32(_)
73            | ErasedStats::U64(_) => false,
74            ErasedStats::I8(x) => x.min < 0,
75            ErasedStats::I16(x) => x.min < 0,
76            ErasedStats::I32(x) => x.min < 0,
77            ErasedStats::I64(x) => x.min < 0,
78        }
79    }
80
81    // Difference between max and min.
82    pub fn max_minus_min(&self) -> u64 {
83        match &self {
84            ErasedStats::U8(x) => (x.max - x.min) as u64,
85            ErasedStats::U16(x) => (x.max - x.min) as u64,
86            ErasedStats::U32(x) => (x.max - x.min) as u64,
87            ErasedStats::U64(x) => x.max - x.min,
88            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
89            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
90            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
91            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
92                .vortex_expect("max minus min result bigger than u64"),
93        }
94    }
95
96    /// Get the most commonly occurring value and its count
97    pub fn top_value_and_count(&self) -> (PValue, u32) {
98        match &self {
99            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
100            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
101            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
102            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
103            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
104            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
105            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
106            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
107        }
108    }
109}
110
111macro_rules! impl_from_typed {
112    ($T:ty, $variant:path) => {
113        impl From<TypedStats<$T>> for ErasedStats {
114            fn from(typed: TypedStats<$T>) -> Self {
115                $variant(typed)
116            }
117        }
118    };
119}
120
121impl_from_typed!(u8, ErasedStats::U8);
122impl_from_typed!(u16, ErasedStats::U16);
123impl_from_typed!(u32, ErasedStats::U32);
124impl_from_typed!(u64, ErasedStats::U64);
125impl_from_typed!(i8, ErasedStats::I8);
126impl_from_typed!(i16, ErasedStats::I16);
127impl_from_typed!(i32, ErasedStats::I32);
128impl_from_typed!(i64, ErasedStats::I64);
129
130/// Array of integers and relevant stats for compression.
131#[derive(Clone, Debug)]
132pub struct IntegerStats {
133    pub(super) src: PrimitiveArray,
134    // cache for validity.false_count()
135    pub(super) null_count: u32,
136    // cache for validity.true_count()
137    pub(super) value_count: u32,
138    pub(super) average_run_length: u32,
139    pub(super) distinct_values_count: u32,
140    pub(crate) typed: ErasedStats,
141}
142
143impl CompressorStats for IntegerStats {
144    type ArrayVTable = PrimitiveVTable;
145
146    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
147        match_each_integer_ptype!(input.ptype(), |T| {
148            typed_int_stats::<T>(input, opts.count_distinct_values)
149        })
150    }
151
152    fn source(&self) -> &PrimitiveArray {
153        &self.src
154    }
155
156    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
157        let sampled = sample(self.src.as_ref(), sample_size, sample_count).to_primitive();
158
159        Self::generate_opts(&sampled, opts)
160    }
161}
162
163impl RLEStats for IntegerStats {
164    fn value_count(&self) -> u32 {
165        self.value_count
166    }
167
168    fn average_run_length(&self) -> u32 {
169        self.average_run_length
170    }
171
172    fn source(&self) -> &PrimitiveArray {
173        &self.src
174    }
175}
176
177fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
178where
179    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
180    TypedStats<T>: Into<ErasedStats>,
181    NativeValue<T>: Eq + Hash,
182{
183    // Special case: empty array
184    if array.is_empty() {
185        return IntegerStats {
186            src: array.clone(),
187            null_count: 0,
188            value_count: 0,
189            average_run_length: 0,
190            distinct_values_count: 0,
191            typed: TypedStats {
192                min: T::max_value(),
193                max: T::min_value(),
194                top_value: T::default(),
195                top_count: 0,
196                distinct_values: HashMap::with_hasher(FxBuildHasher),
197            }
198            .into(),
199        };
200    } else if array.all_invalid() {
201        return IntegerStats {
202            src: array.clone(),
203            null_count: array.len().try_into().vortex_expect("null_count"),
204            value_count: 0,
205            average_run_length: 0,
206            distinct_values_count: 0,
207            typed: TypedStats {
208                min: T::max_value(),
209                max: T::min_value(),
210                top_value: T::default(),
211                top_count: 0,
212                distinct_values: HashMap::with_hasher(FxBuildHasher),
213            }
214            .into(),
215        };
216    }
217
218    let validity = array.validity_mask();
219    let null_count = validity.false_count();
220    let value_count = validity.true_count();
221
222    // Initialize loop state
223    let head_idx = validity
224        .first()
225        .vortex_expect("All null masks have been handled before");
226    let buffer = array.buffer::<T>();
227    let head = buffer[head_idx];
228
229    let mut loop_state = LoopState {
230        distinct_values: if count_distinct_values {
231            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
232        } else {
233            HashMap::with_hasher(FxBuildHasher)
234        },
235        prev: head,
236        runs: 1,
237    };
238
239    let sliced = buffer.slice(head_idx..array.len());
240    let mut chunks = sliced.as_slice().chunks_exact(64);
241    match validity.bit_buffer() {
242        AllOr::All => {
243            for chunk in &mut chunks {
244                inner_loop_nonnull(
245                    chunk.try_into().vortex_unwrap(),
246                    count_distinct_values,
247                    &mut loop_state,
248                )
249            }
250            let remainder = chunks.remainder();
251            inner_loop_naive(
252                remainder,
253                count_distinct_values,
254                &BitBuffer::new_set(remainder.len()),
255                &mut loop_state,
256            );
257        }
258        AllOr::None => unreachable!("All invalid arrays have been handled before"),
259        AllOr::Some(v) => {
260            let mask = v.slice(head_idx..array.len());
261            let mut offset = 0;
262            for chunk in &mut chunks {
263                let validity = mask.slice(offset..(offset + 64));
264                offset += 64;
265
266                match validity.true_count() {
267                    // All nulls -> no stats to update
268                    0 => continue,
269                    // Inner loop for when validity check can be elided
270                    64 => inner_loop_nonnull(
271                        chunk.try_into().vortex_unwrap(),
272                        count_distinct_values,
273                        &mut loop_state,
274                    ),
275                    // Inner loop for when we need to check validity
276                    _ => inner_loop_nullable(
277                        chunk.try_into().vortex_unwrap(),
278                        count_distinct_values,
279                        &validity,
280                        &mut loop_state,
281                    ),
282                }
283            }
284            // Final iteration, run naive loop
285            let remainder = chunks.remainder();
286            inner_loop_naive(
287                remainder,
288                count_distinct_values,
289                &mask.slice(offset..(offset + remainder.len())),
290                &mut loop_state,
291            );
292        }
293    }
294
295    let (top_value, top_count) = if count_distinct_values {
296        let (&top_value, &top_count) = loop_state
297            .distinct_values
298            .iter()
299            .max_by_key(|&(_, &count)| count)
300            .vortex_expect("non-empty");
301        (top_value.0, top_count)
302    } else {
303        (T::default(), 0)
304    };
305
306    let runs = loop_state.runs;
307    let distinct_values_count = if count_distinct_values {
308        loop_state.distinct_values.len().try_into().vortex_unwrap()
309    } else {
310        u32::MAX
311    };
312
313    let min = array
314        .statistics()
315        .compute_as::<T>(Stat::Min)
316        .vortex_expect("min should be computed");
317
318    let max = array
319        .statistics()
320        .compute_as::<T>(Stat::Max)
321        .vortex_expect("max should be computed");
322
323    let typed = TypedStats {
324        min,
325        max,
326        distinct_values: loop_state.distinct_values,
327        top_value,
328        top_count,
329    };
330
331    let null_count = null_count
332        .try_into()
333        .vortex_expect("null_count must fit in u32");
334    let value_count = value_count
335        .try_into()
336        .vortex_expect("value_count must fit in u32");
337
338    IntegerStats {
339        src: array.clone(),
340        null_count,
341        value_count,
342        average_run_length: value_count / runs,
343        distinct_values_count,
344        typed: typed.into(),
345    }
346}
347
348struct LoopState<T> {
349    prev: T,
350    runs: u32,
351    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
352}
353
354#[inline(always)]
355fn inner_loop_nonnull<T: IntegerPType>(
356    values: &[T; 64],
357    count_distinct_values: bool,
358    state: &mut LoopState<T>,
359) where
360    NativeValue<T>: Eq + Hash,
361{
362    for &value in values {
363        if count_distinct_values {
364            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
365        }
366
367        if value != state.prev {
368            state.prev = value;
369            state.runs += 1;
370        }
371    }
372}
373
374#[inline(always)]
375fn inner_loop_nullable<T: IntegerPType>(
376    values: &[T; 64],
377    count_distinct_values: bool,
378    is_valid: &BitBuffer,
379    state: &mut LoopState<T>,
380) where
381    NativeValue<T>: Eq + Hash,
382{
383    for (idx, &value) in values.iter().enumerate() {
384        if is_valid.value(idx) {
385            if count_distinct_values {
386                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
387            }
388
389            if value != state.prev {
390                state.prev = value;
391                state.runs += 1;
392            }
393        }
394    }
395}
396
397#[inline(always)]
398fn inner_loop_naive<T: IntegerPType>(
399    values: &[T],
400    count_distinct_values: bool,
401    is_valid: &BitBuffer,
402    state: &mut LoopState<T>,
403) where
404    NativeValue<T>: Eq + Hash,
405{
406    for (idx, &value) in values.iter().enumerate() {
407        if is_valid.value(idx) {
408            if count_distinct_values {
409                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
410            }
411
412            if value != state.prev {
413                state.prev = value;
414                state.runs += 1;
415            }
416        }
417    }
418}
419
420#[cfg(test)]
421mod tests {
422    use std::iter;
423
424    use vortex_array::arrays::PrimitiveArray;
425    use vortex_array::validity::Validity;
426    use vortex_buffer::BitBuffer;
427    use vortex_buffer::Buffer;
428    use vortex_buffer::buffer;
429
430    use crate::CompressorStats;
431    use crate::integer::IntegerStats;
432    use crate::integer::stats::typed_int_stats;
433
434    #[test]
435    fn test_naive_count_distinct_values() {
436        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
437        let stats = typed_int_stats::<u8>(&array, true);
438        assert_eq!(stats.distinct_values_count, 2);
439    }
440
441    #[test]
442    fn test_naive_count_distinct_values_nullable() {
443        let array = PrimitiveArray::new(
444            buffer![217u8, 0],
445            Validity::from(BitBuffer::from(vec![true, false])),
446        );
447        let stats = typed_int_stats::<u8>(&array, true);
448        assert_eq!(stats.distinct_values_count, 1);
449    }
450
451    #[test]
452    fn test_count_distinct_values() {
453        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
454        let stats = typed_int_stats::<u8>(&array, true);
455        assert_eq!(stats.distinct_values_count, 128);
456    }
457
458    #[test]
459    fn test_count_distinct_values_nullable() {
460        let array = PrimitiveArray::new(
461            (0..128u8).collect::<Buffer<u8>>(),
462            Validity::from(BitBuffer::from_iter(
463                iter::repeat_n(vec![true, false], 64).flatten(),
464            )),
465        );
466        let stats = typed_int_stats::<u8>(&array, true);
467        assert_eq!(stats.distinct_values_count, 64);
468    }
469
470    #[test]
471    fn test_integer_stats_leading_nulls() {
472        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
473
474        let stats = IntegerStats::generate(&ints);
475
476        assert_eq!(stats.value_count, 2);
477        assert_eq!(stats.null_count, 1);
478        assert_eq!(stats.average_run_length, 1);
479        assert_eq!(stats.distinct_values_count, 2);
480    }
481}